diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE new file mode 100644 index 0000000000000..2f444cb44d505 --- /dev/null +++ b/LICENSES/HAVEN_LICENSE @@ -0,0 +1,2 @@ +YEAR: 2013-2016 +COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT new file mode 100644 index 0000000000000..b03d0e640627a --- /dev/null +++ b/LICENSES/HAVEN_MIT @@ -0,0 +1,32 @@ +Based on http://opensource.org/licenses/MIT + +This is a template. Complete and ship as file LICENSE the following 2 +lines (only) + +YEAR: +COPYRIGHT HOLDER: + +and specify as + +License: MIT + file LICENSE + +Copyright (c) , + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 8ed48b46b5b5a..24c753e16d98d 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -23,6 +23,7 @@ dependencies: - xlsxwriter - xlwt - pip: + - pyreadstat # universal - pytest==4.5.0 - pytest-xdist diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 04e4f74f85e4d..5bdc29e0eec80 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -30,3 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - pyreadstat diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 722a35111ab01..c9a8c274fb144 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -19,5 +19,6 @@ dependencies: - hypothesis>=3.58.0 - s3fs - pip + - pyreadstat - pip: - moto diff --git a/doc/source/install.rst b/doc/source/install.rst index db31d75e3013e..1c1f0c1d4cf8e 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -285,6 +285,7 @@ pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing pymysql MySQL engine for sqlalchemy +pyreadstat SPSS files (.sav) reading qtpy Clipboard I/O s3fs 0.0.8 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 207d16afd350f..f7faeea7a646f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -99,6 +99,7 @@ Other Enhancements - Error message for missing required imports now includes the original import error's text (:issue:`23868`) - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) +- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) .. 
_whatsnew_0250.api_breaking: diff --git a/environment.yml b/environment.yml index 7db2ec72ccb3b..de9bd67dd9f06 100644 --- a/environment.yml +++ b/environment.yml @@ -79,3 +79,5 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - pip: + - pyreadstat # pandas.read_spss diff --git a/pandas/__init__.py b/pandas/__init__.py index a2fa14be83998..b95c312f12eed 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -105,7 +105,7 @@ # misc read_clipboard, read_parquet, read_feather, read_gbq, - read_html, read_json, read_stata, read_sas) + read_html, read_json, read_stata, read_sas, read_spss) from pandas.util._tester import test import pandas.testing diff --git a/pandas/io/api.py b/pandas/io/api.py index 8c8d7cf73b37a..725e82604ca7f 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -16,5 +16,6 @@ from pandas.io.pickle import read_pickle, to_pickle from pandas.io.pytables import HDFStore, read_hdf from pandas.io.sas import read_sas +from pandas.io.spss import read_spss from pandas.io.sql import read_sql, read_sql_query, read_sql_table from pandas.io.stata import read_stata diff --git a/pandas/io/spss.py b/pandas/io/spss.py new file mode 100644 index 0000000000000..b1b92fc2b8439 --- /dev/null +++ b/pandas/io/spss.py @@ -0,0 +1,41 @@ +from pathlib import Path +from typing import Optional, Sequence, Union + +from pandas.compat._optional import import_optional_dependency + +from pandas.api.types import is_list_like +from pandas.core.api import DataFrame + + +def read_spss(path: Union[str, Path], + usecols: Optional[Sequence[str]] = None, + convert_categoricals: bool = True) -> DataFrame: + """ + Load an SPSS file from the file path, returning a DataFrame. + + .. 
versionadded:: 0.25.0 + + Parameters + ---------- + path : str or Path + File path + usecols : list-like, optional + Return a subset of the columns. If None, return all columns. + convert_categoricals : bool, default is True + Convert categorical columns into pd.Categorical. + + Returns + ------- + DataFrame + """ + pyreadstat = import_optional_dependency("pyreadstat") + + if usecols is not None: + if not is_list_like(usecols): + raise TypeError("usecols must be list-like.") + else: + usecols = list(usecols) # pyreadstat requires a list + + df, _ = pyreadstat.read_sav(path, usecols=usecols, + apply_value_formats=convert_categoricals) + return df diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index aa42484bf9513..b57c7a0cf0625 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -81,7 +81,7 @@ class TestPDApi(Base): 'read_gbq', 'read_hdf', 'read_html', 'read_json', 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', 'read_sql_query', 'read_sql_table', 'read_stata', - 'read_table', 'read_feather', 'read_parquet'] + 'read_table', 'read_feather', 'read_parquet', 'read_spss'] # top-level to_* funcs funcs_to = ['to_datetime', 'to_msgpack', diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/labelled-num-na.sav new file mode 100755 index 0000000000000..fbe6ee7767240 Binary files /dev/null and b/pandas/tests/io/data/labelled-num-na.sav differ diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/labelled-num.sav new file mode 100755 index 0000000000000..bfab052089d7e Binary files /dev/null and b/pandas/tests/io/data/labelled-num.sav differ diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/labelled-str.sav new file mode 100755 index 0000000000000..b96a9c00fcec1 Binary files /dev/null and b/pandas/tests/io/data/labelled-str.sav differ diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/umlauts.sav new file mode 100755 index 
0000000000000..e99cf1267bebe Binary files /dev/null and b/pandas/tests/io/data/umlauts.sav differ diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py new file mode 100644 index 0000000000000..b9f58f9bf6cf6 --- /dev/null +++ b/pandas/tests/io/test_spss.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.util import testing as tm + +pyreadstat = pytest.importorskip("pyreadstat") + + +def test_spss_labelled_num(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-num.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": 1.0}, index=[0]) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_num_na(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-num-na.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"VAR00002": ["This is one", None]}) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"VAR00002": [1.0, np.nan]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_str(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "labelled-str.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"gender": ["Male", "Female"]}) + expected["gender"] = pd.Categorical(expected["gender"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"gender": ["M", "F"]}) + 
tm.assert_frame_equal(df, expected) + + +def test_spss_umlauts(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) + fname = datapath("io", "data", "umlauts.sav") + + df = pd.read_spss(fname, convert_categoricals=True) + expected = pd.DataFrame({"var1": ["the ä umlaut", + "the ü umlaut", + "the ä umlaut", + "the ö umlaut"]}) + expected["var1"] = pd.Categorical(expected["var1"]) + tm.assert_frame_equal(df, expected) + + df = pd.read_spss(fname, convert_categoricals=False) + expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_usecols(datapath): + # usecols must be list-like + fname = datapath("io", "data", "labelled-num.sav") + + with pytest.raises(TypeError, match="usecols must be list-like."): + pd.read_spss(fname, usecols="VAR00002") diff --git a/requirements-dev.txt b/requirements-dev.txt index b40aa86e946b6..169af7da5e037 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,4 +52,5 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt +pyreadstat \ No newline at end of file