From 21fe224627a07e9c913d8788026a5bdc32b28b8c Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Sun, 16 Jun 2019 16:30:39 +0200 Subject: [PATCH] Add reader for SPSS (.sav) files (#26537) --- LICENSES/HAVEN_LICENSE | 2 + LICENSES/HAVEN_MIT | 32 ++++++++++ ci/deps/azure-macos-35.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + ci/deps/travis-37.yaml | 1 + doc/source/install.rst | 1 + doc/source/whatsnew/v0.25.0.rst | 1 + environment.yml | 2 + pandas/__init__.py | 2 +- pandas/io/api.py | 1 + pandas/io/spss.py | 41 +++++++++++++ pandas/tests/api/test_api.py | 2 +- pandas/tests/io/data/labelled-num-na.sav | Bin 0 -> 535 bytes pandas/tests/io/data/labelled-num.sav | Bin 0 -> 507 bytes pandas/tests/io/data/labelled-str.sav | Bin 0 -> 525 bytes pandas/tests/io/data/umlauts.sav | Bin 0 -> 567 bytes pandas/tests/io/test_spss.py | 74 +++++++++++++++++++++++ requirements-dev.txt | 3 +- 18 files changed, 161 insertions(+), 3 deletions(-) create mode 100644 LICENSES/HAVEN_LICENSE create mode 100644 LICENSES/HAVEN_MIT create mode 100644 pandas/io/spss.py create mode 100755 pandas/tests/io/data/labelled-num-na.sav create mode 100755 pandas/tests/io/data/labelled-num.sav create mode 100755 pandas/tests/io/data/labelled-str.sav create mode 100755 pandas/tests/io/data/umlauts.sav create mode 100644 pandas/tests/io/test_spss.py diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE new file mode 100644 index 0000000000000..2f444cb44d505 --- /dev/null +++ b/LICENSES/HAVEN_LICENSE @@ -0,0 +1,2 @@ +YEAR: 2013-2016 +COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT new file mode 100644 index 0000000000000..b03d0e640627a --- /dev/null +++ b/LICENSES/HAVEN_MIT @@ -0,0 +1,32 @@ +Based on http://opensource.org/licenses/MIT + +This is a template. Complete and ship as file LICENSE the following 2 +lines (only) + +YEAR: +COPYRIGHT HOLDER: + +and specify as + +License: MIT + file LICENSE + +Copyright (c) , + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 8ed48b46b5b5a..24c753e16d98d 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -23,6 +23,7 @@ dependencies: - xlsxwriter - xlwt - pip: + - pyreadstat # universal - pytest==4.5.0 - pytest-xdist diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 04e4f74f85e4d..5bdc29e0eec80 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -30,3 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - pyreadstat diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 722a35111ab01..c9a8c274fb144 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -19,5 +19,6 @@ dependencies: - hypothesis>=3.58.0 - s3fs - pip + - pyreadstat - pip: - moto diff --git a/doc/source/install.rst b/doc/source/install.rst index db31d75e3013e..1c1f0c1d4cf8e 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -285,6 +285,7 @@ pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing pymysql MySQL engine for sqlalchemy +pyreadstat SPSS files (.sav) reading qtpy Clipboard I/O s3fs 0.0.8 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 207d16afd350f..f7faeea7a646f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -99,6 +99,7 @@ Other Enhancements - Error message for missing required imports now includes the original import error's text (:issue:`23868`) - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) +- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) .. _whatsnew_0250.api_breaking: diff --git a/environment.yml b/environment.yml index 7db2ec72ccb3b..de9bd67dd9f06 100644 --- a/environment.yml +++ b/environment.yml @@ -79,3 +79,5 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - pip: + - pyreadstat # pandas.read_spss diff --git a/pandas/__init__.py b/pandas/__init__.py index a2fa14be83998..b95c312f12eed 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -105,7 +105,7 @@ # misc read_clipboard, read_parquet, read_feather, read_gbq, - read_html, read_json, read_stata, read_sas) + read_html, read_json, read_stata, read_sas, read_spss) from pandas.util._tester import test import pandas.testing diff --git a/pandas/io/api.py b/pandas/io/api.py index 8c8d7cf73b37a..725e82604ca7f 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -16,5 +16,6 @@ from pandas.io.pickle import read_pickle, to_pickle from pandas.io.pytables import HDFStore, read_hdf from pandas.io.sas import read_sas +from pandas.io.spss import read_spss from pandas.io.sql import read_sql, read_sql_query, read_sql_table from pandas.io.stata import read_stata diff --git a/pandas/io/spss.py b/pandas/io/spss.py new file mode 100644 index 0000000000000..b1b92fc2b8439 --- /dev/null +++ b/pandas/io/spss.py @@ -0,0 +1,41 @@ +from pathlib import Path +from typing import Optional, Sequence, Union + +from pandas.compat._optional import import_optional_dependency + +from pandas.api.types import is_list_like +from pandas.core.api import DataFrame + + +def read_spss(path: Union[str, Path], + usecols: Optional[Sequence[str]] = None, + convert_categoricals: bool = True) -> DataFrame: + """ + Load an SPSS file from the file path, returning a DataFrame. + + .. versionadded 0.25.0 + + Parameters + ---------- + path : string or Path + File path + usecols : list-like, optional + Return a subset of the columns. If None, return all columns. + convert_categoricals : bool, default is True + Convert categorical columns into pd.Categorical. + + Returns + ------- + DataFrame + """ + pyreadstat = import_optional_dependency("pyreadstat") + + if usecols is not None: + if not is_list_like(usecols): + raise TypeError("usecols must be list-like.") + else: + usecols = list(usecols) # pyreadstat requires a list + + df, _ = pyreadstat.read_sav(path, usecols=usecols, + apply_value_formats=convert_categoricals) + return df diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index aa42484bf9513..b57c7a0cf0625 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -81,7 +81,7 @@ class TestPDApi(Base): 'read_gbq', 'read_hdf', 'read_html', 'read_json', 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', 'read_sql_query', 'read_sql_table', 'read_stata', - 'read_table', 'read_feather', 'read_parquet'] + 'read_table', 'read_feather', 'read_parquet', 'read_spss'] # top-level to_* funcs funcs_to = ['to_datetime', 'to_msgpack', diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/labelled-num-na.sav new file mode 100755 index 0000000000000000000000000000000000000000..fbe6ee77672406ba5e28289d88621faf68c72ee3 GIT binary patch literal 535 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#XyPy#D{=L2Lm$&x6~vBLsLT&D`OKYBO^4uBy)iV!_0@<$-&CN0K{RAK?Xoz z#0OO4pO;gqke`>TP?nfenyR1xagUM%Gmrt&2LT`KxkEBCixq$fq!tKRfD$ls*@0}3 zdXN|g5Q_mZ3|s&@0;Ue+o|k_a7(nV_YC-z`gTZ?U_5XeSKPVrhjvJ~Lqz(i?=7Y=w z$?-zjfw86p26; W0mF_VG{jBU;yKhUskuNAAOHZej8AF+ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/labelled-num.sav new file mode 100755 index 0000000000000000000000000000000000000000..bfab052089d7e62d2e9747c51434cb3a42156278 GIT binary patch literal 507 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#Xt(o2GI-*kq!oC3T~-M3WlbJCRWDAR>o#%dP(L2O$FJ90ytP17=Sp;F~|T2 zjF^FvAooDP2Yc?2jLc#MAj;261=+^}lz_RH9moc$2Z?b2u^14;zy+X1Fm)h1U;bra z0I7$m1?l?_2Ja!%|M&I(pnQ-zZm3$2IuHPv4>AuV#|vc(0%@4LZD9T$&(7B2Y!Z@L>oIanrSU4s}Z^Pyhq~H&;mv literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/labelled-str.sav new file mode 100755 index 0000000000000000000000000000000000000000..b96a9c00fcec10b33cf35b2d3d87bef396fcd3ad GIT binary patch literal 525 zcmbVIT}uK%6dlV-j3CMfA1^4V55|6&#DcJCt}L_>&PMcDID}vt8Tja5^k0Ne{sOam z+PTnkXE@xsbI-kJW~$Mx7uG8cin|Hvd#y>Q*J-TNxTmSzYQBs=Dbe&eo{naVIeD!M z2a5!IN~xSB2ZcPtQ|S7n%{#em*AF}=xb&szzmW%vpSY+TyE6yv0&F zx95qW#OC<~8Bv~fa_=MFqYq~VW|=8iv7zYTz1}JXy=c+5`^6>;yUp_3=FlBmEp(WJ z`2cDsOq?NR_wT%#>BxMbc*=zM?}M=iP(Nd$`J9<`1=Vmko0xjdsTCWLl&s`{<3k!X RufA{##+Dxe$fw9>{Qxb9P$&QZ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/umlauts.sav new file mode 100755 index 0000000000000000000000000000000000000000..e99cf1267bebebd16bdfe881579e6e319aa10986 GIT binary patch literal 567 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#XyRI1w?>Bq=S*Mf?H~mf}yFQk(G(Dm5CXeUXr;$b3yij0LV@-;9zB70OBym zAVZLOB0#~AjLc#MAWFj2vPtHTZYgOH(iV8Q2SHUQqzHA3;+QaUGV?_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py new file mode 100644 index 0000000000000..b9f58f9bf6cf6 --- /dev/null +++ b/pandas/tests/io/test_spss.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.util import testing as tm + +pyreadstat = pytest.importorskip("pyreadstat") + + +def test_spss_labelled_num(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-num.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": 1.0}, index=[0]) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_num_na(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-num-na.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": ["This is one", None]}) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": [1.0, np.nan]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_str(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-str.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"gender": ["Male", "Female"]}) + expected["gender"] = pd.Categorical(expected["gender"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"gender": ["M", "F"]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_umlauts(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "umlauts.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"var1": ["the ä umlaut", + "the ü umlaut", + "the ä umlaut", + "the ö umlaut"]}) + expected["var1"] = pd.Categorical(expected["var1"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_usecols(datapath): + # usecols must be list-like + fname = datapath("io", "data", "labelled-num.sav") + + with pytest.raises(TypeError, match="usecols must be list-like."): + pd.read_spss(fname, usecols="VAR00002") diff --git a/requirements-dev.txt b/requirements-dev.txt index b40aa86e946b6..169af7da5e037 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,4 +52,5 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt +pyreadstat \ No newline at end of file