Skip to content

Commit

Permalink
Add reader for SPSS (.sav) files (pandas-dev#26537)
Browse files Browse the repository at this point in the history
  • Loading branch information
cbrnr authored and jreback committed Jun 16, 2019
1 parent 3381c64 commit 21fe224
Show file tree
Hide file tree
Showing 18 changed files with 161 additions and 3 deletions.
2 changes: 2 additions & 0 deletions LICENSES/HAVEN_LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
YEAR: 2013-2016
COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller
32 changes: 32 additions & 0 deletions LICENSES/HAVEN_MIT
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Based on http://opensource.org/licenses/MIT

This is a template. Complete and ship as file LICENSE the following 2
lines (only)

YEAR:
COPYRIGHT HOLDER:

and specify as

License: MIT + file LICENSE

Copyright (c) <YEAR>, <COPYRIGHT HOLDER>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1 change: 1 addition & 0 deletions ci/deps/azure-macos-35.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- xlsxwriter
- xlwt
- pip:
- pyreadstat
# universal
- pytest==4.5.0
- pytest-xdist
Expand Down
1 change: 1 addition & 0 deletions ci/deps/azure-windows-37.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ dependencies:
- pytest-mock
- moto
- hypothesis>=3.58.0
- pyreadstat
1 change: 1 addition & 0 deletions ci/deps/travis-37.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,6 @@ dependencies:
- hypothesis>=3.58.0
- s3fs
- pip
- pyreadstat
- pip:
- moto
1 change: 1 addition & 0 deletions doc/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ pandas-gbq 0.8.0 Google Big Query access
psycopg2 PostgreSQL engine for sqlalchemy
pyarrow 0.9.0 Parquet and feather reading / writing
pymysql MySQL engine for sqlalchemy
pyreadstat SPSS files (.sav) reading
qtpy Clipboard I/O
s3fs 0.0.8 Amazon S3 access
xarray 0.8.2 pandas-like API for N-dimensional data
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ Other Enhancements
- Error message for missing required imports now includes the original import error's text (:issue:`23868`)
- :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`)
- :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)

.. _whatsnew_0250.api_breaking:

Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,5 @@ dependencies:
- xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
- pip:
- pyreadstat # pandas.read_spss
2 changes: 1 addition & 1 deletion pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@

# misc
read_clipboard, read_parquet, read_feather, read_gbq,
read_html, read_json, read_stata, read_sas)
read_html, read_json, read_stata, read_sas, read_spss)

from pandas.util._tester import test
import pandas.testing
Expand Down
1 change: 1 addition & 0 deletions pandas/io/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.pytables import HDFStore, read_hdf
from pandas.io.sas import read_sas
from pandas.io.spss import read_spss
from pandas.io.sql import read_sql, read_sql_query, read_sql_table
from pandas.io.stata import read_stata
41 changes: 41 additions & 0 deletions pandas/io/spss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from pathlib import Path
from typing import Optional, Sequence, Union

from pandas.compat._optional import import_optional_dependency

from pandas.api.types import is_list_like
from pandas.core.api import DataFrame


def read_spss(path: Union[str, Path],
              usecols: Optional[Sequence[str]] = None,
              convert_categoricals: bool = True) -> DataFrame:
    """
    Load an SPSS file from the file path, returning a DataFrame.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    path : str or Path
        File path.
    usecols : list-like, optional
        Return a subset of the columns. If None, return all columns.
    convert_categoricals : bool, default True
        Convert categorical columns into pd.Categorical.

    Returns
    -------
    DataFrame

    Raises
    ------
    TypeError
        If ``usecols`` is given but is not list-like.
    """
    pyreadstat = import_optional_dependency("pyreadstat")

    if usecols is not None:
        if not is_list_like(usecols):
            raise TypeError("usecols must be list-like.")
        usecols = list(usecols)  # pyreadstat requires a list

    if isinstance(path, Path):
        # Hand the reader a plain string so that the documented ``Path``
        # input does not depend on the installed pyreadstat accepting
        # pathlib objects (NOTE(review): support varies by release).
        path = str(path)

    df, _ = pyreadstat.read_sav(path, usecols=usecols,
                                apply_value_formats=convert_categoricals)
    return df
2 changes: 1 addition & 1 deletion pandas/tests/api/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class TestPDApi(Base):
'read_gbq', 'read_hdf', 'read_html', 'read_json',
'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
'read_sql_query', 'read_sql_table', 'read_stata',
'read_table', 'read_feather', 'read_parquet']
'read_table', 'read_feather', 'read_parquet', 'read_spss']

# top-level to_* funcs
funcs_to = ['to_datetime', 'to_msgpack',
Expand Down
Binary file added pandas/tests/io/data/labelled-num-na.sav
Binary file not shown.
Binary file added pandas/tests/io/data/labelled-num.sav
Binary file not shown.
Binary file added pandas/tests/io/data/labelled-str.sav
Binary file not shown.
Binary file added pandas/tests/io/data/umlauts.sav
Binary file not shown.
74 changes: 74 additions & 0 deletions pandas/tests/io/test_spss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import numpy as np
import pytest

import pandas as pd
from pandas.util import testing as tm

pyreadstat = pytest.importorskip("pyreadstat")


def test_spss_labelled_num(datapath):
    """Read a value-labelled numeric column with and without labels."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "labelled-num.sav")
    column = "VAR00002"

    # Labels applied: a single-row categorical column holding the label.
    want = pd.DataFrame({column: "This is one"}, index=[0])
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Labels not applied: the underlying numeric code is returned.
    want = pd.DataFrame({column: 1.0}, index=[0])
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)


def test_spss_labelled_num_na(datapath):
    """Read a value-labelled numeric column that contains a missing value."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "labelled-num-na.sav")
    column = "VAR00002"

    # Labels applied: the missing entry stays missing in the categorical.
    want = pd.DataFrame({column: ["This is one", None]})
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Labels not applied: numeric code followed by NaN.
    want = pd.DataFrame({column: [1.0, np.nan]})
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)


def test_spss_labelled_str(datapath):
    """Read a value-labelled string column with and without labels."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "labelled-str.sav")
    column = "gender"

    # Labels applied: the full labels come back as a categorical column.
    want = pd.DataFrame({column: ["Male", "Female"]})
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Labels not applied: the stored short codes are returned as-is.
    want = pd.DataFrame({column: ["M", "F"]})
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)


def test_spss_umlauts(datapath):
    """Read value labels that contain non-ASCII (umlaut) characters."""
    # test file from the Haven project (https://haven.tidyverse.org/)
    sav_path = datapath("io", "data", "umlauts.sav")
    column = "var1"

    # Labels applied: the umlauts must survive decoding unchanged.
    labels = [
        "the ä umlaut",
        "the ü umlaut",
        "the ä umlaut",
        "the ö umlaut",
    ]
    want = pd.DataFrame({column: labels})
    want[column] = pd.Categorical(want[column])
    got = pd.read_spss(sav_path, convert_categoricals=True)
    tm.assert_frame_equal(got, want)

    # Labels not applied: the numeric codes behind the labels.
    want = pd.DataFrame({column: [1.0, 2.0, 1.0, 3.0]})
    got = pd.read_spss(sav_path, convert_categoricals=False)
    tm.assert_frame_equal(got, want)


def test_spss_usecols(datapath):
    """A bare string passed as ``usecols`` is rejected with a TypeError."""
    # usecols must be list-like
    sav_path = datapath("io", "data", "labelled-num.sav")
    expected_msg = "usecols must be list-like."

    with pytest.raises(TypeError, match=expected_msg):
        pd.read_spss(sav_path, usecols="VAR00002")
3 changes: 2 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,5 @@ sqlalchemy
xarray
xlrd
xlsxwriter
xlwt
xlwt
pyreadstat

0 comments on commit 21fe224

Please sign in to comment.