From 21fe224627a07e9c913d8788026a5bdc32b28b8c Mon Sep 17 00:00:00 2001
From: Clemens Brunner <clemens.brunner@gmail.com>
Date: Sun, 16 Jun 2019 16:30:39 +0200
Subject: [PATCH] Add reader for SPSS (.sav) files (#26537)

---
 LICENSES/HAVEN_LICENSE                   |   2 +
 LICENSES/HAVEN_MIT                       |  32 ++++++++++
 ci/deps/azure-macos-35.yaml              |   1 +
 ci/deps/azure-windows-37.yaml            |   1 +
 ci/deps/travis-37.yaml                   |   1 +
 doc/source/install.rst                   |   1 +
 doc/source/whatsnew/v0.25.0.rst          |   1 +
 environment.yml                          |   2 +
 pandas/__init__.py                       |   2 +-
 pandas/io/api.py                         |   1 +
 pandas/io/spss.py                        |  41 +++++++++++++
 pandas/tests/api/test_api.py             |   2 +-
 pandas/tests/io/data/labelled-num-na.sav | Bin 0 -> 535 bytes
 pandas/tests/io/data/labelled-num.sav    | Bin 0 -> 507 bytes
 pandas/tests/io/data/labelled-str.sav    | Bin 0 -> 525 bytes
 pandas/tests/io/data/umlauts.sav         | Bin 0 -> 567 bytes
 pandas/tests/io/test_spss.py             |  74 +++++++++++++++++++++++
 requirements-dev.txt                     |   3 +-
 18 files changed, 161 insertions(+), 3 deletions(-)
 create mode 100644 LICENSES/HAVEN_LICENSE
 create mode 100644 LICENSES/HAVEN_MIT
 create mode 100644 pandas/io/spss.py
 create mode 100755 pandas/tests/io/data/labelled-num-na.sav
 create mode 100755 pandas/tests/io/data/labelled-num.sav
 create mode 100755 pandas/tests/io/data/labelled-str.sav
 create mode 100755 pandas/tests/io/data/umlauts.sav
 create mode 100644 pandas/tests/io/test_spss.py
diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE
new file mode 100644
index 0000000000000..2f444cb44d505
--- /dev/null
+++ b/LICENSES/HAVEN_LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2013-2016
+COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller
diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT
new file mode 100644
index 0000000000000..b03d0e640627a
--- /dev/null
+++ b/LICENSES/HAVEN_MIT
@@ -0,0 +1,32 @@
+Based on http://opensource.org/licenses/MIT
+
+This is a template. Complete and ship as file LICENSE the following 2
+lines (only)
+
+YEAR:
+COPYRIGHT HOLDER:
+
+and specify as
+
+License: MIT + file LICENSE
+
+Copyright (c) <YEAR>, <COPYRIGHT HOLDER>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml
index 8ed48b46b5b5a..24c753e16d98d 100644
--- a/ci/deps/azure-macos-35.yaml
+++ b/ci/deps/azure-macos-35.yaml
@@ -23,6 +23,7 @@ dependencies:
   - xlsxwriter
   - xlwt
   - pip:
+    - pyreadstat
     # universal
     - pytest==4.5.0
     - pytest-xdist
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index 04e4f74f85e4d..5bdc29e0eec80 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -30,3 +30,4 @@ dependencies:
   - pytest-mock
   - moto
   - hypothesis>=3.58.0
+  - pyreadstat
diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
index 722a35111ab01..c9a8c274fb144 100644
--- a/ci/deps/travis-37.yaml
+++ b/ci/deps/travis-37.yaml
@@ -19,5 +19,6 @@ dependencies:
   - hypothesis>=3.58.0
   - s3fs
   - pip
+  - pyreadstat
   - pip:
     - moto
diff --git a/doc/source/install.rst b/doc/source/install.rst
index db31d75e3013e..1c1f0c1d4cf8e 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -285,6 +285,7 @@ pandas-gbq                0.8.0              Google Big Query access
 psycopg2                                     PostgreSQL engine for sqlalchemy
 pyarrow                   0.9.0              Parquet and feather reading / writing
 pymysql                                      MySQL engine for sqlalchemy
+pyreadstat                                   SPSS files (.sav) reading
 qtpy                                         Clipboard I/O
 s3fs                      0.0.8              Amazon S3 access
 xarray                    0.8.2              pandas-like API for N-dimensional data
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 207d16afd350f..f7faeea7a646f 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -99,6 +99,7 @@ Other Enhancements
 - Error message for missing required imports now includes the original import error's text (:issue:`23868`)
 - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`)
 - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`)
+- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
 
 .. _whatsnew_0250.api_breaking:
 
diff --git a/environment.yml b/environment.yml
index 7db2ec72ccb3b..de9bd67dd9f06 100644
--- a/environment.yml
+++ b/environment.yml
@@ -79,3 +79,5 @@ dependencies:
   - xlrd  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
   - xlsxwriter  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
   - xlwt  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
+  - pip:
+    - pyreadstat  # pandas.read_spss
diff --git a/pandas/__init__.py b/pandas/__init__.py
index a2fa14be83998..b95c312f12eed 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -105,7 +105,7 @@
 
     # misc
     read_clipboard, read_parquet, read_feather, read_gbq,
-    read_html, read_json, read_stata, read_sas)
+    read_html, read_json, read_stata, read_sas, read_spss)
 
 from pandas.util._tester import test
 import pandas.testing
diff --git a/pandas/io/api.py b/pandas/io/api.py
index 8c8d7cf73b37a..725e82604ca7f 100644
--- a/pandas/io/api.py
+++ b/pandas/io/api.py
@@ -16,5 +16,6 @@
 from pandas.io.pickle import read_pickle, to_pickle
 from pandas.io.pytables import HDFStore, read_hdf
 from pandas.io.sas import read_sas
+from pandas.io.spss import read_spss
 from pandas.io.sql import read_sql, read_sql_query, read_sql_table
 from pandas.io.stata import read_stata
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
new file mode 100644
index 0000000000000..b1b92fc2b8439
--- /dev/null
+++ b/pandas/io/spss.py
@@ -0,0 +1,41 @@
+from pathlib import Path
+from typing import Optional, Sequence, Union
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.api.types import is_list_like
+from pandas.core.api import DataFrame
+
+
+def read_spss(path: Union[str, Path],
+              usecols: Optional[Sequence[str]] = None,
+              convert_categoricals: bool = True) -> DataFrame:
+    """
+    Load an SPSS file from the file path, returning a DataFrame.
+
+    .. versionadded:: 0.25.0
+
+    Parameters
+    ----------
+    path : str or Path
+        File path
+    usecols : list-like, optional
+        Return a subset of the columns. If None, return all columns.
+    convert_categoricals : bool, default True
+        Convert categorical columns into pd.Categorical.
+
+    Returns
+    -------
+    DataFrame
+    """
+    pyreadstat = import_optional_dependency("pyreadstat")
+
+    if usecols is not None:
+        if not is_list_like(usecols):
+            raise TypeError("usecols must be list-like.")
+        else:
+            usecols = list(usecols)  # pyreadstat requires a list
+
+    df, _ = pyreadstat.read_sav(path, usecols=usecols,
+                                apply_value_formats=convert_categoricals)
+    return df
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index aa42484bf9513..b57c7a0cf0625 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -81,7 +81,7 @@ class TestPDApi(Base):
                   'read_gbq', 'read_hdf', 'read_html', 'read_json',
                   'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
                   'read_sql_query', 'read_sql_table', 'read_stata',
-                  'read_table', 'read_feather', 'read_parquet']
+                  'read_table', 'read_feather', 'read_parquet', 'read_spss']
 
     # top-level to_* funcs
     funcs_to = ['to_datetime', 'to_msgpack',
diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/labelled-num-na.sav
new file mode 100755
index 0000000000000000000000000000000000000000..fbe6ee77672406ba5e28289d88621faf68c72ee3
GIT binary patch
literal 535
zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K
zz`(!=#XyPy#D{=L2Lm$&x6~vBLsLT&D`OKYBO^4uBy)iV!_0@<$-&CN0K{RAK?Xoz
z#0OO4pO;gqke`>TP?nfenyR1xagUM%Gmrt&2LT`KxkEBCixq$fq!tKRfD$ls*@0}3
zdXN|g5Q_mZ3|s&@0;Ue+o|k_a7(nV_YC-z`gTZ?U_5XeSKPVrhjvJ~Lqz(i?=7Y=w
z$?-zjf<PMPCtDZ;q(A_OK^j07!#o2E6p*|SR9qBD!}M9HI0WVAq-v-esB>w86p26;
W0mF_VG{jBU;yKhUskuNAAOHZej8AF+

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/labelled-num.sav
new file mode 100755
index 0000000000000000000000000000000000000000..bfab052089d7e62d2e9747c51434cb3a42156278
GIT binary patch
literal 507
zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K
zz`(!=#Xt(o2GI-*kq!oC3T~-M3WlbJCRWDAR>o#%dP(L2O$FJ90ytP17=Sp;F~|T2
zjF^FvAooDP2Yc?2jLc#MAj;261=+^}lz_RH9moc$2Z?b2u^14;zy+X1Fm)h1U;bra
z0I7$m1?l?_2Ja!%|M&I(pnQ-zZm3$2IuHPv4>AuV#|vc(0%@4LZD9<M0s$ZfX@CVT
zkO}t`NL~oa7X{KVeO4+CLHRkU8tMk>T$&(7B2Y!Z@L>oIanrSU4s}Z^Pyhq~H&;mv

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/labelled-str.sav
new file mode 100755
index 0000000000000000000000000000000000000000..b96a9c00fcec10b33cf35b2d3d87bef396fcd3ad
GIT binary patch
literal 525
zcmbVIT}uK%6dlV-j3CMfA1^4V55|6&#DcJCt}L_>&PMcDID}vt8Tja5^k0Ne{sOam
z+PTnkXE@xsbI-kJW~$Mx7uG8cin|Hvd#y>Q*J-TNxTmSzYQBs=Dbe&eo{naVIeD!M
z2a5!IN~xSB2ZcPtQ|S7n%{#em*AF}=xb&szzmW%vpSY+TyE6y<yPN)yXy0lg>v0&F
zx95qW#OC<~8Bv~fa_=MFqYq~VW|=8iv7zYTz1}JXy=c+5`^6>;yUp_3=FlBmEp(WJ
z`2cDsOq?NR_wT%#>BxMbc*=zM?}M=iP(Nd$`J9<`1=Vmko0xjdsTCWLl&s`{<3k!X
RufA{##+Dxe$fw9>{Qxb9P$&QZ

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/umlauts.sav
new file mode 100755
index 0000000000000000000000000000000000000000..e99cf1267bebebd16bdfe881579e6e319aa10986
GIT binary patch
literal 567
zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K
zz`(!=#XyRI1w?>Bq=S*Mf?H~mf}yFQk(G(Dm5CXeUXr;$b3yij0LV@-;9zB70OBym
zAVZLOB0#~AjLc#MAWF<rIJ`u+G&d))w1gQb4#E)d!JfAyBNZsC0FhNtfJ!((#P^_y
zb3nzn!No!Tg!!8t$Of4W65{}3F(8J43qS|M)Pdao@-G7eNIgs~NZ)@jcn_iezpwuX
z<%865L)F6E4l*BP9!QQC%H{;pAphHzB^DWiBm{sMq!1Q4Kqfq5K=MLB3^GIzh(UU+
pR2+ixb5b?b4b-_bK?*>j2vPtHTZYgOH(iV8Q2SHUQqzHA3;+QaUGV?_

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
new file mode 100644
index 0000000000000..b9f58f9bf6cf6
--- /dev/null
+++ b/pandas/tests/io/test_spss.py
@@ -0,0 +1,74 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.util import testing as tm
+
+pyreadstat = pytest.importorskip("pyreadstat")
+
+
+def test_spss_labelled_num(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "labelled-num.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0])
+    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"VAR00002": 1.0}, index=[0])
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_labelled_num_na(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "labelled-num-na.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"VAR00002": ["This is one", None]})
+    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"VAR00002": [1.0, np.nan]})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_labelled_str(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "labelled-str.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"gender": ["Male", "Female"]})
+    expected["gender"] = pd.Categorical(expected["gender"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"gender": ["M", "F"]})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_umlauts(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "umlauts.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"var1": ["the ä umlaut",
+                                      "the ü umlaut",
+                                      "the ä umlaut",
+                                      "the ö umlaut"]})
+    expected["var1"] = pd.Categorical(expected["var1"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_usecols(datapath):
+    # usecols must be list-like
+    fname = datapath("io", "data", "labelled-num.sav")
+
+    with pytest.raises(TypeError, match="usecols must be list-like."):
+        pd.read_spss(fname, usecols="VAR00002")
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b40aa86e946b6..169af7da5e037 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -52,4 +52,5 @@ sqlalchemy
 xarray
 xlrd
 xlsxwriter
-xlwt
\ No newline at end of file
+xlwt
+pyreadstat
\ No newline at end of file