diff --git a/LICENSES/SAS7BDAT_LICENSE b/LICENSES/SAS7BDAT_LICENSE new file mode 100644 index 0000000000000..8fbf194013e93 --- /dev/null +++ b/LICENSES/SAS7BDAT_LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2015 Jared Hobbs + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py index 12e48295d8d05..74bec72eb05e9 100644 --- a/asv_bench/benchmarks/packers.py +++ b/asv_bench/benchmarks/packers.py @@ -318,6 +318,24 @@ def remove(self, f): pass +class packers_read_sas7bdat(object): + + def setup(self): + self.f = 'data/test1.sas7bdat' + + def time_packers_read_sas7bdat(self): + pd.read_sas(self.f, format='sas7bdat') + + +class packers_read_xport(object): + + def setup(self): + self.f = 'data/paxraw_d_short.xpt' + + def time_packers_read_xport(self): + pd.read_sas(self.f, format='xport') + + class packers_write_csv(object): goal_time = 0.2 @@ -854,4 +872,4 @@ def remove(self, f): try: os.remove(self.f) except: - pass \ No newline at end of file + pass diff --git a/doc/source/io.rst b/doc/source/io.rst index 577d6b34ec719..bbfa711eb4445 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2554,7 +2554,7 @@ both on the writing (serialization), and reading (deserialization). +----------------------+------------------------+ | 0.18 | >= 0.18 | +======================+========================+ - + Reading (files packed by older versions) is backward-compatibile, except for files packed with 0.17 in Python 2, in which case only they can only be unpacked in Python 2. .. ipython:: python @@ -4198,7 +4198,7 @@ Authenticating with user account credentials is as simple as following the promp which will be automatically opened for you. You will be authenticated to the specified ``BigQuery`` account using the product name ``pandas GBQ``. It is only possible on local host. The remote authentication using user account credentials is not currently supported in Pandas. -Additional information on the authentication mechanism can be found +Additional information on the authentication mechanism can be found `here `__. Authentication with service account credentials is possible via the `'private_key'` parameter. This method @@ -4564,24 +4564,25 @@ easy conversion to and from pandas. .. _io.sas_reader: -SAS Format ----------- +SAS Formats +----------- .. versionadded:: 0.17.0 -The top-level function :func:`read_sas` currently can read (but -not write) SAS xport (.XPT) format files. Pandas cannot currently -handle SAS7BDAT files. 
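A minimal usage sketch of the reader added in this patch (the file name below is purely illustrative; per the new ``pandas.io.sas.sasreader.read_sas``, the format is inferred from the extension or can be passed explicitly)::

    import pandas as pd

    # Read the whole file into a DataFrame (format inferred from ".sas7bdat").
    df = pd.read_sas("example.sas7bdat", encoding="utf-8")

    # Read incrementally: iterator=True (or chunksize=...) returns a
    # SAS7BDATReader whose .read(nrows) yields successive blocks of rows,
    # as exercised in the new tests in this patch.
    rdr = pd.read_sas("example.sas7bdat", format="sas7bdat", iterator=True,
                      encoding="utf-8")
    first_rows = rdr.read(5)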
+The top-level function :func:`read_sas` can read (but not write) SAS +`xport` (.XPT) and `SAS7BDAT` (.sas7bdat) format files (v0.18.0). -XPORT files only contain two value types: ASCII text and double -precision numeric values. There is no automatic type conversion to -integers, dates, or categoricals. By default the whole file is read -and returned as a ``DataFrame``. +SAS files only contain two value types: ASCII text and floating point +values (usually 8 bytes but sometimes truncated). For xport files, +there is no automatic type conversion to integers, dates, or +categoricals. For SAS7BDAT files, the format codes may allow date +variables to be automatically converted to dates. By default the +whole file is read and returned as a ``DataFrame``. -Specify a ``chunksize`` or use ``iterator=True`` to obtain an -``XportReader`` object for incrementally reading the file. The -``XportReader`` object also has attributes that contain additional -information about the file and its variables. +Specify a ``chunksize`` or use ``iterator=True`` to obtain reader +objects (``XportReader`` or ``SAS7BDATReader``) for incrementally +reading the file. The reader objects also have attributes that +contain additional information about the file and its variables. Read a SAS XPORT file: @@ -4602,6 +4603,8 @@ web site. .. _specification: https://support.sas.com/techsup/technote/ts140.pdf +No official documentation is available for the SAS7BDAT format. + .. _io.perf: Performance Considerations diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index aab9e6125732a..b3bbc5cf5ef8c 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -24,6 +24,7 @@ Highlights include: since 0.14.0. This will now raise a ``TypeError``, see :ref:`here `. - The ``.to_xarray()`` function has been added for compatibility with the `xarray package `__, see :ref:`here `. +- The ``read_sas`` function has been enhanced to read ``sas7bdat`` files, see :ref:`here `. - Addition of the :ref:`.str.extractall() method `, and API changes to the :ref:`.str.extract() method ` and :ref:`.str.cat() method `. @@ -403,6 +404,13 @@ For example, if you have a jupyter notebook you plan to convert to latex using n Options ``display.latex.escape`` and ``display.latex.longtable`` have also been added to the configuration and are used automatically by the ``to_latex`` method. See the :ref:`options documentation` for more info. +.. _whatsnew_0180.enhancements.sas: + +SAS7BDAT files +^^^^^^^^^^^^^^ + +Pandas can now read SAS7BDAT files, including compressed files. The files can be read in entirety, or incrementally. For full details see :ref:`here `. (issue:`4052`) + .. 
_whatsnew_0180.enhancements.other: Other enhancements diff --git a/pandas/io/api.py b/pandas/io/api.py index 3ac4c670c8466..920ece9c4c3a8 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -11,7 +11,7 @@ from pandas.io.json import read_json from pandas.io.html import read_html from pandas.io.sql import read_sql, read_sql_table, read_sql_query -from pandas.io.sas import read_sas +from pandas.io.sas.sasreader import read_sas from pandas.io.stata import read_stata from pandas.io.pickle import read_pickle, to_pickle from pandas.io.packers import read_msgpack, to_msgpack diff --git a/pandas/io/sas/__init__.py b/pandas/io/sas/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py new file mode 100644 index 0000000000000..e068c51df585d --- /dev/null +++ b/pandas/io/sas/sas7bdat.py @@ -0,0 +1,828 @@ +""" +Read SAS7BDAT files + +Based on code written by Jared Hobbs: + https://bitbucket.org/jaredhobbs/sas7bdat + +See also: + https://github.com/BioStatMatt/sas7bdat + +Partial documentation of the file format: + https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf + +Reference for binary data compression: + http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm +""" + +import pandas as pd +from pandas import compat +from pandas.io.common import get_filepath_or_buffer, BaseIterator +import numpy as np +import struct +from .saslib import (_rle_decompress, _rdc_decompress, + process_byte_array_with_data) + +_magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" + + b"\x00\x00\x00\x00\xc2\xea\x81\x60" + + b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" + + b"\x09\xc7\x31\x8c\x18\x1f\x10\x11") + +_align_1_checker_value = b'3' +_align_1_offset = 32 +_align_1_length = 1 +_align_1_value = 4 +_u64_byte_checker_value = b'3' +_align_2_offset = 35 +_align_2_length = 1 +_align_2_value = 4 +_endianness_offset = 37 +_endianness_length = 1 +_platform_offset = 39 +_platform_length = 1 +_encoding_offset = 70 +_encoding_length = 1 +_dataset_offset = 92 +_dataset_length = 64 +_file_type_offset = 156 +_file_type_length = 8 +_date_created_offset = 164 +_date_created_length = 8 +_date_modified_offset = 172 +_date_modified_length = 8 +_header_size_offset = 196 +_header_size_length = 4 +_page_size_offset = 200 +_page_size_length = 4 +_page_count_offset = 204 +_page_count_length = 4 +_sas_release_offset = 216 +_sas_release_length = 8 +_sas_server_type_offset = 224 +_sas_server_type_length = 16 +_os_version_number_offset = 240 +_os_version_number_length = 16 +_os_maker_offset = 256 +_os_maker_length = 16 +_os_name_offset = 272 +_os_name_length = 16 +_page_bit_offset_x86 = 16 +_page_bit_offset_x64 = 32 +_subheader_pointer_length_x86 = 12 +_subheader_pointer_length_x64 = 24 +_page_type_offset = 0 +_page_type_length = 2 +_block_count_offset = 2 +_block_count_length = 2 +_subheader_count_offset = 4 +_subheader_count_length = 2 +_page_meta_type = 0 +_page_data_type = 256 +_page_amd_type = 1024 +_page_metc_type = 16384 +_page_comp_type = -28672 +_page_mix_types = [512, 640] +_subheader_pointers_offset = 8 +_truncated_subheader_id = 1 +_compressed_subheader_id = 4 +_compressed_subheader_type = 1 +_text_block_size_length = 2 +_row_length_offset_multiplier = 5 +_row_count_offset_multiplier = 6 +_col_count_p1_multiplier = 9 +_col_count_p2_multiplier = 10 +_row_count_on_mix_page_offset_multiplier = 15 +_column_name_pointer_length = 8 +_column_name_text_subheader_offset = 0 +_column_name_text_subheader_length = 
2 +_column_name_offset_offset = 2 +_column_name_offset_length = 2 +_column_name_length_offset = 4 +_column_name_length_length = 2 +_column_data_offset_offset = 8 +_column_data_length_offset = 8 +_column_data_length_length = 4 +_column_type_offset = 14 +_column_type_length = 1 +_column_format_text_subheader_index_offset = 22 +_column_format_text_subheader_index_length = 2 +_column_format_offset_offset = 24 +_column_format_offset_length = 2 +_column_format_length_offset = 26 +_column_format_length_length = 2 +_column_label_text_subheader_index_offset = 28 +_column_label_text_subheader_index_length = 2 +_column_label_offset_offset = 30 +_column_label_offset_length = 2 +_column_label_length_offset = 32 +_column_label_length_length = 2 +_rle_compression = 'SASYZCRL' +_rdc_compression = 'SASYZCR2' + +_compression_literals = [_rle_compression, _rdc_compression] + +# Incomplete list of encodings +_encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2", + 61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"} + +# Should be enum + + +class _index: + rowSizeIndex = 0 + columnSizeIndex = 1 + subheaderCountsIndex = 2 + columnTextIndex = 3 + columnNameIndex = 4 + columnAttributesIndex = 5 + formatAndLabelIndex = 6 + columnListIndex = 7 + dataSubheaderIndex = 8 + + +_subheader_signature_to_index = { + b"\xF7\xF7\xF7\xF7": _index.rowSizeIndex, + b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": _index.rowSizeIndex, + b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": _index.rowSizeIndex, + b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": _index.rowSizeIndex, + b"\xF6\xF6\xF6\xF6": _index.columnSizeIndex, + b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": _index.columnSizeIndex, + b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": _index.columnSizeIndex, + b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": _index.columnSizeIndex, + b"\x00\xFC\xFF\xFF": _index.subheaderCountsIndex, + b"\xFF\xFF\xFC\x00": _index.subheaderCountsIndex, + b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": _index.subheaderCountsIndex, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": _index.subheaderCountsIndex, + b"\xFD\xFF\xFF\xFF": _index.columnTextIndex, + b"\xFF\xFF\xFF\xFD": _index.columnTextIndex, + b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnTextIndex, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": _index.columnTextIndex, + b"\xFF\xFF\xFF\xFF": _index.columnNameIndex, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnNameIndex, + b"\xFC\xFF\xFF\xFF": _index.columnAttributesIndex, + b"\xFF\xFF\xFF\xFC": _index.columnAttributesIndex, + b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnAttributesIndex, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": _index.columnAttributesIndex, + b"\xFE\xFB\xFF\xFF": _index.formatAndLabelIndex, + b"\xFF\xFF\xFB\xFE": _index.formatAndLabelIndex, + b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": _index.formatAndLabelIndex, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": _index.formatAndLabelIndex, + b"\xFE\xFF\xFF\xFF": _index.columnListIndex, + b"\xFF\xFF\xFF\xFE": _index.columnListIndex, + b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnListIndex, + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": _index.columnListIndex} + + +class _subheader_pointer(object): + pass + + +class _column(object): + pass + + +# SAS7BDAT represents a SAS data file in SAS7BDAT format. +class SAS7BDATReader(BaseIterator): + """ + Read SAS files in SAS7BDAT format. + + Parameters + ---------- + path_or_buf : path name or buffer + Name of SAS file or file-like object pointing to SAS file + contents. + index : column identifier, defaults to None + Column to use as index. 
+ convert_dates : boolean, defaults to True + Attempt to convert dates to Pandas datetime values. Note all + SAS date formats are supported. + blank_missing : boolean, defaults to True + Convert empty strings to missing values (SAS uses blanks to + indicate missing character variables). + chunksize : int, defaults to None + Return SAS7BDATReader object for iterations, returns chunks + with given number of lines. + encoding : string, defaults to None + String encoding. If None, text variables are left as raw bytes. + """ + + def __init__(self, path_or_buf, index=None, convert_dates=True, + blank_missing=True, chunksize=None, encoding=None): + + self.index = index + self.convert_dates = convert_dates + self.blank_missing = blank_missing + self.chunksize = chunksize + self.encoding = encoding + + self.compression = "" + self.column_names_strings = [] + self.column_names = [] + self.column_types = [] + self.column_formats = [] + self.columns = [] + + self._current_page_data_subheader_pointers = [] + self._cached_page = None + self._column_data_lengths = [] + self._column_data_offsets = [] + self._current_row_in_file_index = 0 + self._current_row_on_page_index = 0 + self._current_row_in_file_index = 0 + + self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) + if isinstance(self._path_or_buf, compat.string_types): + self._path_or_buf = open(self._path_or_buf, 'rb') + + self._get_properties() + self._parse_metadata() + + def _get_properties(self): + + # Check magic number + self._path_or_buf.seek(0) + self._cached_page = self._path_or_buf.read(288) + if self._cached_page[0:len(_magic)] != _magic: + raise ValueError("magic number mismatch (not a SAS file?)") + + # Get alignment information + align1, align2 = 0, 0 + buf = self._read_bytes(_align_1_offset, _align_1_length) + if buf == _u64_byte_checker_value: + align2 = _align_2_value + self.U64 = True + self._int_length = 8 + self._page_bit_offset = _page_bit_offset_x64 + self._subheader_pointer_length = _subheader_pointer_length_x64 + else: + self.U64 = False + self._page_bit_offset = _page_bit_offset_x86 + self._subheader_pointer_length = _subheader_pointer_length_x86 + self._int_length = 4 + buf = self._read_bytes(_align_2_offset, _align_2_length) + if buf == _align_1_checker_value: + align1 = _align_2_value + total_align = align1 + align2 + + # Get endianness information + buf = self._read_bytes(_endianness_offset, _endianness_length) + if buf == b'\x01': + self.byte_order = "<" + else: + self.byte_order = ">" + + # Get encoding information + buf = self._read_bytes(_encoding_offset, _encoding_length)[0] + if buf in _encoding_names: + self.file_encoding = _encoding_names[buf] + else: + self.file_encoding = "unknown (code=%s)" % str(buf) + + # Get platform information + buf = self._read_bytes(_platform_offset, _platform_length) + if buf == b'1': + self.platform = "unix" + elif buf == b'2': + self.platform = "windows" + else: + self.platform = "unknown" + + buf = self._read_bytes(_dataset_offset, _dataset_length) + self.name = buf.rstrip(b'\x00 ').decode() + + buf = self._read_bytes(_file_type_offset, _file_type_length) + self.file_type = buf.rstrip(b'\x00 ').decode() + + # Timestamp is epoch 01/01/1960 + epoch = pd.datetime(1960, 1, 1) + x = self._read_float(_date_created_offset + align1, + _date_created_length) + self.date_created = epoch + pd.to_timedelta(x, unit='s') + x = self._read_float(_date_modified_offset + align1, + _date_modified_length) + self.date_modified = epoch + pd.to_timedelta(x, unit='s') + + self.header_length = 
self._read_int(_header_size_offset + align1, + _header_size_length) + + # Read the rest of the header into cached_page. + buf = self._path_or_buf.read(self.header_length - 288) + self._cached_page += buf + if len(self._cached_page) != self.header_length: + raise ValueError("The SAS7BDAT file appears to be truncated.") + + self._page_length = self._read_int(_page_size_offset + align1, + _page_size_length) + self._page_count = self._read_int(_page_count_offset + align1, + _page_count_length) + + buf = self._read_bytes(_sas_release_offset + total_align, + _sas_release_length) + self.sas_release = buf.rstrip(b'\x00 ').decode() + + buf = self._read_bytes(_sas_server_type_offset + total_align, + _sas_server_type_length) + self.server_type = buf.rstrip(b'\x00 ').decode() + + buf = self._read_bytes(_os_version_number_offset + total_align, + _os_version_number_length) + self.os_version = buf.rstrip(b'\x00 ').decode() + + buf = self._read_bytes( + _os_name_offset, _os_name_length).rstrip(b'\x00 ') + if len(buf) > 0: + self.os_name = buf.rstrip(b'\x00 ').decode() + else: + buf = self._path_or_buf.read(_os_maker_offset, _os_maker_length) + self.os_name = buf.rstrip(b'\x00 ').decode() + + # Read a single float of the given width (4 or 8). + def _read_float(self, offset, width): + if width not in (4, 8): + raise ValueError("invalid float width") + buf = self._read_bytes(offset, width) + fd = "f" if width == 4 else "d" + return struct.unpack(self.byte_order + fd, buf)[0] + + # Read a single signed integer of the given width (1, 2, 4 or 8). + def _read_int(self, offset, width): + if width not in (1, 2, 4, 8): + raise ValueError("invalid int width") + buf = self._read_bytes(offset, width) + it = {1: "b", 2: "h", 4: "l", 8: "q"}[width] + iv = struct.unpack(self.byte_order + it, buf)[0] + return iv + + def _read_bytes(self, offset, length): + if self._cached_page is None: + self._path_or_buf.seek(offset) + buf = self._path_or_buf.read(length) + if len(buf) < length: + msg = "Unable to read {:d} bytes from file position {:d}." 
+ raise ValueError(msg.format(length, offset)) + return buf + else: + if offset + length > len(self._cached_page): + raise ValueError("The cached page is too small.") + return self._cached_page[offset:offset + length] + + def _parse_metadata(self): + done = False + while not done: + self._cached_page = self._path_or_buf.read(self._page_length) + if len(self._cached_page) <= 0: + break + if len(self._cached_page) != self._page_length: + raise ValueError( + "Failed to read a meta data page from the SAS file.") + done = self._process_page_meta() + + def _process_page_meta(self): + self._read_page_header() + pt = [_page_meta_type, _page_amd_type] + _page_mix_types + if self._current_page_type in pt: + self._process_page_metadata() + return ((self._current_page_type in [256] + _page_mix_types) or + (self._current_page_data_subheader_pointers is not None)) + + def _read_page_header(self): + bit_offset = self._page_bit_offset + tx = _page_type_offset + bit_offset + self._current_page_type = self._read_int(tx, _page_type_length) + tx = _block_count_offset + bit_offset + self._current_page_block_count = self._read_int(tx, + _block_count_length) + tx = _subheader_count_offset + bit_offset + self._current_page_subheaders_count = ( + self._read_int(tx, _subheader_count_length)) + + def _process_page_metadata(self): + bit_offset = self._page_bit_offset + + for i in range(self._current_page_subheaders_count): + pointer = self._process_subheader_pointers( + _subheader_pointers_offset + bit_offset, i) + if pointer.length == 0: + continue + if pointer.compression == _truncated_subheader_id: + continue + subheader_signature = self._read_subheader_signature( + pointer.offset) + subheader_index = ( + self._get_subheader_index(subheader_signature, + pointer.compression, pointer.ptype)) + self._process_subheader(subheader_index, pointer) + + def _get_subheader_index(self, signature, compression, ptype): + index = _subheader_signature_to_index.get(signature) + if index is None: + f1 = ((compression == _compressed_subheader_id) or + (compression == 0)) + f2 = (ptype == _compressed_subheader_type) + if (self.compression != "") and f1 and f2: + index = _index.dataSubheaderIndex + else: + raise ValueError("Unknown subheader signature") + return index + + def _process_subheader_pointers(self, offset, subheader_pointer_index): + + subheader_pointer_length = self._subheader_pointer_length + total_offset = (offset + + subheader_pointer_length * subheader_pointer_index) + + subheader_offset = self._read_int(total_offset, self._int_length) + total_offset += self._int_length + + subheader_length = self._read_int(total_offset, self._int_length) + total_offset += self._int_length + + subheader_compression = self._read_int(total_offset, 1) + total_offset += 1 + + subheader_type = self._read_int(total_offset, 1) + + x = _subheader_pointer() + x.offset = subheader_offset + x.length = subheader_length + x.compression = subheader_compression + x.ptype = subheader_type + + return x + + def _read_subheader_signature(self, offset): + subheader_signature = self._read_bytes(offset, self._int_length) + return subheader_signature + + def _process_subheader(self, subheader_index, pointer): + offset = pointer.offset + length = pointer.length + + if subheader_index == _index.rowSizeIndex: + processor = self._process_rowsize_subheader + elif subheader_index == _index.columnSizeIndex: + processor = self._process_columnsize_subheader + elif subheader_index == _index.columnTextIndex: + processor = self._process_columntext_subheader + elif 
subheader_index == _index.columnNameIndex: + processor = self._process_columnname_subheader + elif subheader_index == _index.columnAttributesIndex: + processor = self._process_columnattributes_subheader + elif subheader_index == _index.formatAndLabelIndex: + processor = self._process_format_subheader + elif subheader_index == _index.columnListIndex: + processor = self._process_columnlist_subheader + elif subheader_index == _index.subheaderCountsIndex: + processor = self._process_subheader_counts + elif subheader_index == _index.dataSubheaderIndex: + self._current_page_data_subheader_pointers.append(pointer) + return + else: + raise ValueError("unknown subheader index") + + processor(offset, length) + + def _process_rowsize_subheader(self, offset, length): + + int_len = self._int_length + lcs_offset = offset + lcp_offset = offset + if self.U64: + lcs_offset += 682 + lcp_offset += 706 + else: + lcs_offset += 354 + lcp_offset += 378 + + self.row_length = self._read_int( + offset + _row_length_offset_multiplier * int_len, int_len) + self.row_count = self._read_int( + offset + _row_count_offset_multiplier * int_len, int_len) + self.col_count_p1 = self._read_int( + offset + _col_count_p1_multiplier * int_len, int_len) + self.col_count_p2 = self._read_int( + offset + _col_count_p2_multiplier * int_len, int_len) + mx = _row_count_on_mix_page_offset_multiplier * int_len + self._mix_page_row_count = self._read_int(offset + mx, int_len) + self._lcs = self._read_int(lcs_offset, 2) + self._lcp = self._read_int(lcp_offset, 2) + + def _process_columnsize_subheader(self, offset, length): + int_len = self._int_length + offset += int_len + self.column_count = self._read_int(offset, int_len) + if (self.col_count_p1 + self.col_count_p2 != + self.column_count): + print("Warning: column count mismatch (%d + %d != %d)\n", + self.col_count_p1, self.col_count_p2, self.column_count) + + # Unknown purpose + def _process_subheader_counts(self, offset, length): + pass + + def _process_columntext_subheader(self, offset, length): + + offset += self._int_length + text_block_size = self._read_int(offset, _text_block_size_length) + + buf = self._read_bytes(offset, text_block_size) + self.column_names_strings.append( + buf[0:text_block_size].rstrip(b"\x00 ").decode()) + + if len(self.column_names_strings) == 1: + column_name = self.column_names_strings[0] + compression_literal = "" + for cl in _compression_literals: + if cl in column_name: + compression_literal = cl + self.compression = compression_literal + offset -= self._int_length + + offset1 = offset + 16 + if self.U64: + offset1 += 4 + + buf = self._read_bytes(offset1, self._lcp) + compression_literal = buf.rstrip(b"\x00") + if compression_literal == "": + self._lcs = 0 + offset1 = offset + 32 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcp) + self.creator_proc = buf[0:self._lcp].decode() + elif compression_literal == _rle_compression: + offset1 = offset + 40 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcp) + self.creator_proc = buf[0:self._lcp].decode() + elif self._lcs > 0: + self._lcp = 0 + offset1 = offset + 16 + if self.U64: + offset1 += 4 + buf = self._read_bytes(offset1, self._lcs) + self.creator_proc = buf[0:self._lcp].decode() + + def _process_columnname_subheader(self, offset, length): + int_len = self._int_length + offset += int_len + column_name_pointers_count = (length - 2 * int_len - 12) // 8 + for i in range(column_name_pointers_count): + text_subheader = offset + _column_name_pointer_length * \ + (i + 
1) + _column_name_text_subheader_offset + col_name_offset = offset + _column_name_pointer_length * \ + (i + 1) + _column_name_offset_offset + col_name_length = offset + _column_name_pointer_length * \ + (i + 1) + _column_name_length_offset + + idx = self._read_int( + text_subheader, _column_name_text_subheader_length) + col_offset = self._read_int( + col_name_offset, _column_name_offset_length) + col_len = self._read_int( + col_name_length, _column_name_length_length) + + name_str = self.column_names_strings[idx] + self.column_names.append(name_str[col_offset:col_offset + col_len]) + + def _process_columnattributes_subheader(self, offset, length): + int_len = self._int_length + column_attributes_vectors_count = ( + length - 2 * int_len - 12) // (int_len + 8) + self.column_types = np.empty( + column_attributes_vectors_count, dtype=np.dtype('S1')) + for i in range(column_attributes_vectors_count): + col_data_offset = (offset + int_len + + _column_data_offset_offset + i * (int_len + 8)) + col_data_len = (offset + 2 * int_len + + _column_data_length_offset + i * (int_len + 8)) + col_types = (offset + 2 * int_len + + _column_type_offset + i * (int_len + 8)) + + self._column_data_offsets.append( + self._read_int(col_data_offset, int_len)) + + x = self._read_int(col_data_len, _column_data_length_length) + self._column_data_lengths.append(x) + + x = self._read_int(col_types, _column_type_length) + if x == 1: + self.column_types[i] = b'd' + else: + self.column_types[i] = b's' + + def _process_columnlist_subheader(self, offset, length): + # unknown purpose + pass + + def _process_format_subheader(self, offset, length): + int_len = self._int_length + text_subheader_format = offset + \ + _column_format_text_subheader_index_offset + 3 * int_len + col_format_offset = offset + _column_format_offset_offset + 3 * int_len + col_format_len = offset + _column_format_length_offset + 3 * int_len + text_subheader_label = offset + \ + _column_label_text_subheader_index_offset + 3 * int_len + col_label_offset = offset + _column_label_offset_offset + 3 * int_len + col_label_len = offset + _column_label_length_offset + 3 * int_len + + x = self._read_int(text_subheader_format, + _column_format_text_subheader_index_length) + format_idx = min(x, len(self.column_names_strings) - 1) + + format_start = self._read_int( + col_format_offset, _column_format_offset_length) + format_len = self._read_int( + col_format_len, _column_format_length_length) + + label_idx = self._read_int( + text_subheader_label, _column_label_text_subheader_index_length) + label_idx = min(label_idx, len(self.column_names_strings) - 1) + + label_start = self._read_int( + col_label_offset, _column_label_offset_length) + label_len = self._read_int(col_label_len, _column_label_length_length) + + label_names = self.column_names_strings[label_idx] + column_label = label_names[label_start: label_start + label_len] + format_names = self.column_names_strings[format_idx] + column_format = format_names[format_start: format_start + format_len] + current_column_number = len(self.columns) + + col = _column() + col.col_id = current_column_number + col.name = self.column_names[current_column_number] + col.label = column_label + col.format = column_format + col.ctype = self.column_types[current_column_number] + col.length = self._column_data_lengths[current_column_number] + + self.column_formats.append(column_format) + self.columns.append(col) + + def read(self, nrows=None): + + if (nrows is None) and (self.chunksize is not None): + nrows = self.chunksize + elif 
nrows is None: + nrows = self.row_count + + if self._current_row_in_file_index >= self.row_count: + return None + + nd = (self.column_types == b'd').sum() + ns = (self.column_types == b's').sum() + + self._string_chunk = np.empty((ns, nrows), dtype=np.object) + self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8) + + self._current_row_in_chunk_index = 0 + for i in range(nrows): + done = self._readline() + if done: + break + + rslt = self._chunk_to_dataframe() + if self.index is not None: + rslt = rslt.set_index(self.index) + + return rslt + + def _readline(self): + + bit_offset = self._page_bit_offset + subheader_pointer_length = self._subheader_pointer_length + + # If there is no page, go to the end of the header and read a page. + if self._cached_page is None: + self._path_or_buf.seek(self.header_length) + done = self._read_next_page() + if done: + return True + + # Loop until a data row is read + while True: + if self._current_page_type == _page_meta_type: + flag = (self._current_row_on_page_index >= + len(self._current_page_data_subheader_pointers)) + if flag: + done = self._read_next_page() + if done: + return True + self._current_row_on_page_index = 0 + continue + current_subheader_pointer = ( + self._current_page_data_subheader_pointers[ + self._current_row_on_page_index]) + process_byte_array_with_data(self, + current_subheader_pointer.offset, + current_subheader_pointer.length, + self._byte_chunk, + self._string_chunk) + return False + elif self._current_page_type in _page_mix_types: + align_correction = (bit_offset + _subheader_pointers_offset + + self._current_page_subheaders_count * + subheader_pointer_length) + align_correction = align_correction % 8 + offset = bit_offset + align_correction + offset += _subheader_pointers_offset + offset += (self._current_page_subheaders_count * + subheader_pointer_length) + offset += self._current_row_on_page_index * self.row_length + process_byte_array_with_data(self, offset, self.row_length, + self._byte_chunk, + self._string_chunk) + mn = min(self.row_count, self._mix_page_row_count) + if self._current_row_on_page_index == mn: + done = self._read_next_page() + if done: + return True + self._current_row_on_page_index = 0 + return False + elif self._current_page_type == _page_data_type: + process_byte_array_with_data(self, + bit_offset + + _subheader_pointers_offset + + self._current_row_on_page_index * + self.row_length, + self.row_length, self._byte_chunk, + self._string_chunk) + flag = (self._current_row_on_page_index == + self._current_page_block_count) + if flag: + done = self._read_next_page() + if done: + return True + self._current_row_on_page_index = 0 + return False + else: + raise ValueError("unknown page type: %s", + self._current_page_type) + + def _read_next_page(self): + self._current_page_data_subheader_pointers = [] + self._cached_page = self._path_or_buf.read(self._page_length) + if len(self._cached_page) <= 0: + return True + elif len(self._cached_page) != self._page_length: + msg = ("failed to read complete page from file " + "(read {:d} of {:d} bytes)") + raise ValueError(msg.format(len(self._cached_page), + self._page_length)) + + self._read_page_header() + if self._current_page_type == _page_meta_type: + self._process_page_metadata() + pt = [_page_meta_type, _page_data_type] + [_page_mix_types] + if self._current_page_type not in pt: + return self._read_next_page() + + return False + + def _decompress(self, row_length, page): + page = np.frombuffer(page, dtype=np.uint8) + if self.compression == 
_rle_compression: + return _rle_decompress(row_length, page) + elif self.compression == _rdc_compression: + return _rdc_decompress(row_length, page) + else: + raise ValueError("unknown SAS compression method: %s" % + self.compression) + + def _chunk_to_dataframe(self): + + n = self._current_row_in_chunk_index + m = self._current_row_in_file_index + ix = range(m - n, m) + rslt = pd.DataFrame(index=ix) + + js, jb = 0, 0 + for j in range(self.column_count): + + name = self.column_names[j] + + if self.column_types[j] == b'd': + rslt[name] = self._byte_chunk[jb, :].view( + dtype=self.byte_order + 'd') + rslt[name] = np.asarray(rslt[name], dtype=np.float64) + if self.convert_dates and (self.column_formats[j] == "MMDDYY"): + epoch = pd.datetime(1960, 1, 1) + rslt[name] = epoch + pd.to_timedelta(rslt[name], unit='d') + jb += 1 + elif self.column_types[j] == b's': + rslt[name] = self._string_chunk[js, :] + rslt[name] = rslt[name].apply(lambda x: x.rstrip(b'\x00 ')) + if self.encoding is not None: + rslt[name] = rslt[name].apply( + lambda x: x.decode(encoding=self.encoding)) + if self.blank_missing: + ii = rslt[name].str.len() == 0 + rslt.loc[ii, name] = np.nan + js += 1 + else: + raise ValueError("unknown column type %s" % + self.column_types[j]) + + return rslt diff --git a/pandas/io/sas.py b/pandas/io/sas/sas_xport.py similarity index 86% rename from pandas/io/sas.py rename to pandas/io/sas/sas_xport.py index 49013a98c77ff..e4ca99fdcb109 100644 --- a/pandas/io/sas.py +++ b/pandas/io/sas/sas_xport.py @@ -15,9 +15,10 @@ import struct import numpy as np from pandas.util.decorators import Appender +import warnings -_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD" - "!!!!!!!000000000000000000000000000000 ") +_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" + "000000000000000000000000000000 ") _correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" "000000000000000001600000000") _correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" @@ -28,6 +29,7 @@ 'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform', 'nifl', 'nifd', 'npos', '_'] + _base_params_doc = """\ Parameters ---------- @@ -112,25 +114,6 @@ """ -@Appender(_read_sas_doc) -def read_sas(filepath_or_buffer, format='xport', index=None, - encoding='ISO-8859-1', chunksize=None, iterator=False): - - format = format.lower() - - if format == 'xport': - reader = XportReader(filepath_or_buffer, index=index, - encoding=encoding, - chunksize=chunksize) - else: - raise ValueError('only xport format is supported') - - if iterator or chunksize: - return reader - - return reader.read() - - def _parse_date(datestr): """ Given a date in xport format, return Python date. 
""" try: @@ -282,8 +265,9 @@ def _read_header(self): raise ValueError("Header record is not an XPORT file.") line2 = self._get_row() - file_info = _split_line(line2, [['prefix', 24], ['version', 8], - ['OS', 8], ['_', 24], ['created', 16]]) + fif = [['prefix', 24], ['version', 8], ['OS', 8], + ['_', 24], ['created', 16]] + file_info = _split_line(line2, fif) if file_info['prefix'] != "SAS SAS SASLIB": raise ValueError("Header record has invalid prefix.") file_info['created'] = _parse_date(file_info['created']) @@ -295,22 +279,19 @@ def _read_header(self): # read member header header1 = self._get_row() header2 = self._get_row() - if (not header1.startswith(_correct_header1) or - not header2 == _correct_header2): - raise ValueError("Member header not found.") - fieldnamelength = int(header1[-5:-2]) # usually 140, could be 135 + headflag1 = header1.startswith(_correct_header1) + headflag2 = (header2 == _correct_header2) + if not (headflag1 and headflag2): + raise ValueError("Member header not found") + # usually 140, could be 135 + fieldnamelength = int(header1[-5:-2]) # member info - member_info = _split_line(self._get_row(), [['prefix', 8], - ['set_name', 8], - ['sasdata', 8], - ['version', 8], - ['OS', 8], ['_', 24], - ['created', 16]]) - member_info.update(_split_line(self._get_row(), - [['modified', 16], - ['_', 16], - ['label', 40], ['type', 8]])) + mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8], + ['version', 8], ['OS', 8], ['_', 24], ['created', 16]] + member_info = _split_line(self._get_row(), mem) + mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]] + member_info.update(_split_line(self._get_row(), mem)) member_info['modified'] = _parse_date(member_info['modified']) member_info['created'] = _parse_date(member_info['created']) self.member_info = member_info @@ -319,15 +300,16 @@ def _read_header(self): types = {1: 'numeric', 2: 'char'} fieldcount = int(self._get_row()[54:58]) datalength = fieldnamelength * fieldcount - if datalength % 80: # round up to nearest 80 + # round up to nearest 80 + if datalength % 80: datalength += 80 - datalength % 80 fielddata = self.filepath_or_buffer.read(datalength) fields = [] obs_length = 0 while len(fielddata) >= fieldnamelength: # pull data for one field - field, fielddata = ( - fielddata[:fieldnamelength], fielddata[fieldnamelength:]) + field, fielddata = (fielddata[:fieldnamelength], + fielddata[fieldnamelength:]) # rest at end gets ignored, so if field is short, pad out # to match struct pattern below @@ -339,8 +321,8 @@ def _read_header(self): field['ntype'] = types[field['ntype']] fl = field['field_length'] if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)): - raise TypeError("Floating point field width %d is not between " - "2 and 8." % fl) + msg = "Floating field width {0} is not between 2 and 8." + raise TypeError(msg.format(fl)) for k, v in field.items(): try: @@ -376,8 +358,8 @@ def _record_count(self): """ Get number of records in file. - This is maybe suboptimal because we have to seek to the end of the - file. + This is maybe suboptimal because we have to seek to the end of + the file. Side effect: returns file position to record_start. 
""" @@ -387,7 +369,6 @@ def _record_count(self): self.record_start) if total_records_length % 80 != 0: - import warnings warnings.warn("xport file may be corrupted") if self.record_length > 80: @@ -461,7 +442,8 @@ def read(self, nrows=None): elif self.fields[j]['ntype'] == 'char': v = [y.rstrip() for y in vec] if compat.PY3: - v = [y.decode(self._encoding) for y in v] + if self._encoding is not None: + v = [y.decode(self._encoding) for y in v] df[x] = v if self._index is None: diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/saslib.pyx new file mode 100644 index 0000000000000..a963bf4fe25d3 --- /dev/null +++ b/pandas/io/sas/saslib.pyx @@ -0,0 +1,237 @@ +import numpy as np +cimport numpy as np +from numpy cimport uint8_t, uint16_t + +# rle_decompress decompresses data using a Run Length Encoding +# algorithm. It is partially documented here: +# +# https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf +def _rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): + + cdef uint8_t control_byte + cdef uint8_t [:] result = np.zeros(result_length, np.uint8) + + cdef int rpos = 0 + cdef int ipos = 0 + cdef int i + cdef int nbytes + cdef uint8_t x + cdef length = len(inbuff) + + while ipos < length: + control_byte = inbuff[ipos] & 0xF0 + end_of_first_byte = int(inbuff[ipos] & 0x0F) + ipos += 1 + + if control_byte == 0x00: + if end_of_first_byte != 0: + print("Unexpected non-zero end_of_first_byte") + nbytes = int(inbuff[ipos]) + 64 + ipos += 1 + for i in range(nbytes): + result[rpos] = inbuff[ipos] + rpos += 1 + ipos += 1 + elif control_byte == 0x40: + # not documented + nbytes = end_of_first_byte * 16 + nbytes += int(inbuff[ipos]) + ipos += 1 + for i in range(nbytes): + result[rpos] = inbuff[ipos] + rpos += 1 + ipos += 1 + elif control_byte == 0x60: + nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17 + ipos += 1 + for i in range(nbytes): + result[rpos] = 0x20 + rpos += 1 + elif control_byte == 0x70: + nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17 + ipos += 1 + for i in range(nbytes): + result[rpos] = 0x00 + rpos += 1 + elif control_byte == 0x80: + nbytes = end_of_first_byte + 1 + for i in range(nbytes): + result[rpos] = inbuff[ipos + i] + rpos += 1 + ipos += nbytes + elif control_byte == 0x90: + nbytes = end_of_first_byte + 17 + for i in range(nbytes): + result[rpos] = inbuff[ipos + i] + rpos += 1 + ipos += nbytes + elif control_byte == 0xA0: + nbytes = end_of_first_byte + 33 + for i in range(nbytes): + result[rpos] = inbuff[ipos + i] + rpos += 1 + ipos += nbytes + elif control_byte == 0xB0: + nbytes = end_of_first_byte + 49 + for i in range(nbytes): + result[rpos] = inbuff[ipos + i] + rpos += 1 + ipos += nbytes + elif control_byte == 0xC0: + nbytes = end_of_first_byte + 3 + x = inbuff[ipos] + ipos += 1 + for i in range(nbytes): + result[rpos] = x + rpos += 1 + elif control_byte == 0xD0: + nbytes = end_of_first_byte + 2 + for i in range(nbytes): + result[rpos] = 0x40 + rpos += 1 + elif control_byte == 0xE0: + nbytes = end_of_first_byte + 2 + for i in range(nbytes): + result[rpos] = 0x20 + rpos += 1 + elif control_byte == 0xF0: + nbytes = end_of_first_byte + 2 + for i in range(nbytes): + result[rpos] = 0x00 + rpos += 1 + else: + raise ValueError("unknown control byte: %v", control_byte) + + if len(result) != result_length: + print("RLE: %v != %v\n", (len(result), result_length)) + + return np.asarray(result).tostring() + + +# rdc_decompress decompresses data using the Ross Data Compression algorithm: +# +# 
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm +def _rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): + + cdef uint8_t cmd + cdef uint16_t ctrl_bits + cdef uint16_t ctrl_mask = 0 + cdef uint16_t ofs + cdef uint16_t cnt + cdef int ipos = 0 + cdef int rpos = 0 + cdef int k + + cdef uint8_t [:] outbuff = np.zeros(result_length, dtype=np.uint8) + + ii = -1 + + while ipos < len(inbuff): + ii += 1 + ctrl_mask = ctrl_mask >> 1 + if ctrl_mask == 0: + ctrl_bits = (inbuff[ipos] << 8) + inbuff[ipos + 1] + ipos += 2 + ctrl_mask = 0x8000 + + if ctrl_bits & ctrl_mask == 0: + outbuff[rpos] = inbuff[ipos] + ipos += 1 + rpos += 1 + continue + + cmd = (inbuff[ipos] >> 4) & 0x0F + cnt = (inbuff[ipos] & 0x0F) + ipos += 1 + + # short RLE + if cmd == 0: + cnt += 3 + for k in range(cnt): + outbuff[rpos + k] = inbuff[ipos] + rpos += cnt + ipos += 1 + + # long RLE + elif cmd == 1: + cnt += inbuff[ipos] << 4 + cnt += 19 + ipos += 1 + for k in range(cnt): + outbuff[rpos + k] = inbuff[ipos] + rpos += cnt + ipos += 1 + + # long pattern + elif cmd == 2: + ofs = cnt + 3 + ofs += inbuff[ipos] << 4 + ipos += 1 + cnt = inbuff[ipos] + ipos += 1 + cnt += 16 + for k in range(cnt): + outbuff[rpos + k] = outbuff[rpos - int(ofs) + k] + rpos += cnt + + # short pattern + elif (cmd >= 3) & (cmd <= 15): + ofs = cnt + 3 + ofs += inbuff[ipos] << 4 + ipos += 1 + for k in range(cmd): + outbuff[rpos + k] = outbuff[rpos - int(ofs) + k] + rpos += cmd + + else: + raise ValueError("unknown RDC command") + + if len(outbuff) != result_length: + raise ValueError("RDC: %v != %v\n", len(outbuff), result_length) + + return np.asarray(outbuff).tostring() + +def process_byte_array_with_data(parser, int offset, int length, np.ndarray[uint8_t, ndim=2] byte_chunk, + np.ndarray[dtype=object, ndim=2] string_chunk): + + cdef int s + cdef int j + cdef int m + cdef int start + cdef int end + cdef bytes source + cdef bytes temp + cdef int jb + cdef int js + + if (parser.compression != "") and (length < parser.row_length): + source = parser._decompress(parser.row_length, parser._cached_page[offset:offset + length]) + else: + source = parser._cached_page[offset:offset + length] + + s = 8 * parser._current_row_in_chunk_index + js = 0 + jb = 0 + for j in range(parser.column_count): + length = parser._column_data_lengths[j] + if length == 0: + break + start = parser._column_data_offsets[j] + end = start + length + temp = source[start:end] + if parser.column_types[j] == b'd': + m = 8 - length + if parser.byte_order == "<": + byte_chunk[jb, s+m:s+8] = np.frombuffer(temp, dtype=np.uint8) + else: + byte_chunk[jb, s:s+length] = np.frombuffer(temp, dtype=np.uint8) + jb += 1 + elif parser.column_types[j] == b's': + string_chunk[js, parser._current_row_in_chunk_index] = bytes(temp) + js += 1 + else: + raise ValueError("unknown column type: %s" % parser.columns[j].ctype) + + parser._current_row_on_page_index += 1 + parser._current_row_in_chunk_index += 1 + parser._current_row_in_file_index += 1 diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py new file mode 100644 index 0000000000000..9a60200c78893 --- /dev/null +++ b/pandas/io/sas/sasreader.py @@ -0,0 +1,61 @@ +""" +Read SAS sas7bdat or xport files. +""" + + +def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, + chunksize=None, iterator=False): + """ + Read SAS files stored as either XPORT or SAS7BDAT format files. 
+ + Parameters + ---------- + filepath_or_buffer : string or file-like object + Path to the SAS file. + format : string {'xport', 'sas7bdat'} or None + If None, file format is inferred. If 'xport' or 'sas7bdat', + uses the corresponding format. + index : identifier of index column, defaults to None + Identifier of column that should be used as index of the DataFrame. + encoding : string, default is None + Encoding for text data. If None, text data are stored as raw bytes. + chunksize : int + Read file `chunksize` lines at a time, returns iterator. + iterator : bool, defaults to False + If True, returns an iterator for reading the file incrementally. + + Returns + ------- + DataFrame if iterator=False and chunksize=None, else SAS7BDATReader + or XportReader + """ + + if format is None: + try: + fname = filepath_or_buffer.lower() + if fname.endswith(".xpt"): + format = "xport" + elif fname.endswith(".sas7bdat"): + format = "sas7bdat" + else: + raise ValueError("unable to infer format of SAS file") + except: + pass + + if format.lower() == 'xport': + from pandas.io.sas.sas_xport import XportReader + reader = XportReader(filepath_or_buffer, index=index, + encoding=encoding, + chunksize=chunksize) + elif format.lower() == 'sas7bdat': + from pandas.io.sas.sas7bdat import SAS7BDATReader + reader = SAS7BDATReader(filepath_or_buffer, index=index, + encoding=encoding, + chunksize=chunksize) + else: + raise ValueError('unknown SAS format') + + if iterator or chunksize: + return reader + + return reader.read() diff --git a/pandas/io/tests/data/DEMO_G.csv b/pandas/io/tests/sas/data/DEMO_G.csv similarity index 100% rename from pandas/io/tests/data/DEMO_G.csv rename to pandas/io/tests/sas/data/DEMO_G.csv diff --git a/pandas/io/tests/data/DEMO_G.xpt b/pandas/io/tests/sas/data/DEMO_G.xpt similarity index 100% rename from pandas/io/tests/data/DEMO_G.xpt rename to pandas/io/tests/sas/data/DEMO_G.xpt diff --git a/pandas/io/tests/data/DRXFCD_G.csv b/pandas/io/tests/sas/data/DRXFCD_G.csv similarity index 100% rename from pandas/io/tests/data/DRXFCD_G.csv rename to pandas/io/tests/sas/data/DRXFCD_G.csv diff --git a/pandas/io/tests/data/DRXFCD_G.xpt b/pandas/io/tests/sas/data/DRXFCD_G.xpt similarity index 100% rename from pandas/io/tests/data/DRXFCD_G.xpt rename to pandas/io/tests/sas/data/DRXFCD_G.xpt diff --git a/pandas/io/tests/data/SSHSV1_A.csv b/pandas/io/tests/sas/data/SSHSV1_A.csv similarity index 100% rename from pandas/io/tests/data/SSHSV1_A.csv rename to pandas/io/tests/sas/data/SSHSV1_A.csv diff --git a/pandas/io/tests/data/SSHSV1_A.xpt b/pandas/io/tests/sas/data/SSHSV1_A.xpt similarity index 100% rename from pandas/io/tests/data/SSHSV1_A.xpt rename to pandas/io/tests/sas/data/SSHSV1_A.xpt diff --git a/pandas/io/tests/data/paxraw_d_short.csv b/pandas/io/tests/sas/data/paxraw_d_short.csv similarity index 100% rename from pandas/io/tests/data/paxraw_d_short.csv rename to pandas/io/tests/sas/data/paxraw_d_short.csv diff --git a/pandas/io/tests/data/paxraw_d_short.xpt b/pandas/io/tests/sas/data/paxraw_d_short.xpt similarity index 100% rename from pandas/io/tests/data/paxraw_d_short.xpt rename to pandas/io/tests/sas/data/paxraw_d_short.xpt diff --git a/pandas/io/tests/sas/data/test1.sas7bdat b/pandas/io/tests/sas/data/test1.sas7bdat new file mode 100644 index 0000000000000..951173ce4d9f9 Binary files /dev/null and b/pandas/io/tests/sas/data/test1.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test10.sas7bdat b/pandas/io/tests/sas/data/test10.sas7bdat new file mode 100644 index 
0000000000000..a5fd43e6cb9ac Binary files /dev/null and b/pandas/io/tests/sas/data/test10.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test11.sas7bdat b/pandas/io/tests/sas/data/test11.sas7bdat new file mode 100644 index 0000000000000..072aa683f66d9 Binary files /dev/null and b/pandas/io/tests/sas/data/test11.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test12.sas7bdat b/pandas/io/tests/sas/data/test12.sas7bdat new file mode 100644 index 0000000000000..e2a9db874948d Binary files /dev/null and b/pandas/io/tests/sas/data/test12.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test13.sas7bdat b/pandas/io/tests/sas/data/test13.sas7bdat new file mode 100644 index 0000000000000..b1dc6f9f8eddc Binary files /dev/null and b/pandas/io/tests/sas/data/test13.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test14.sas7bdat b/pandas/io/tests/sas/data/test14.sas7bdat new file mode 100644 index 0000000000000..5a958df51f0ce Binary files /dev/null and b/pandas/io/tests/sas/data/test14.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test15.sas7bdat b/pandas/io/tests/sas/data/test15.sas7bdat new file mode 100644 index 0000000000000..c028d8041a3d3 Binary files /dev/null and b/pandas/io/tests/sas/data/test15.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test16.sas7bdat b/pandas/io/tests/sas/data/test16.sas7bdat new file mode 100644 index 0000000000000..867c3c51bbddd Binary files /dev/null and b/pandas/io/tests/sas/data/test16.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test2.sas7bdat b/pandas/io/tests/sas/data/test2.sas7bdat new file mode 100644 index 0000000000000..ba0b8e8dcbb91 Binary files /dev/null and b/pandas/io/tests/sas/data/test2.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test3.sas7bdat b/pandas/io/tests/sas/data/test3.sas7bdat new file mode 100644 index 0000000000000..a061b1ddd0d45 Binary files /dev/null and b/pandas/io/tests/sas/data/test3.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test4.sas7bdat b/pandas/io/tests/sas/data/test4.sas7bdat new file mode 100644 index 0000000000000..addd6edf90830 Binary files /dev/null and b/pandas/io/tests/sas/data/test4.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test5.sas7bdat b/pandas/io/tests/sas/data/test5.sas7bdat new file mode 100644 index 0000000000000..ba741d5a635df Binary files /dev/null and b/pandas/io/tests/sas/data/test5.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test6.sas7bdat b/pandas/io/tests/sas/data/test6.sas7bdat new file mode 100644 index 0000000000000..2d9b4b0466047 Binary files /dev/null and b/pandas/io/tests/sas/data/test6.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test7.sas7bdat b/pandas/io/tests/sas/data/test7.sas7bdat new file mode 100644 index 0000000000000..785b12cf175e3 Binary files /dev/null and b/pandas/io/tests/sas/data/test7.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test8.sas7bdat b/pandas/io/tests/sas/data/test8.sas7bdat new file mode 100644 index 0000000000000..67db5a143de07 Binary files /dev/null and b/pandas/io/tests/sas/data/test8.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test9.sas7bdat b/pandas/io/tests/sas/data/test9.sas7bdat new file mode 100644 index 0000000000000..d76a1f28033f4 Binary files /dev/null and b/pandas/io/tests/sas/data/test9.sas7bdat differ diff --git a/pandas/io/tests/sas/data/test_sas7bdat_1.csv b/pandas/io/tests/sas/data/test_sas7bdat_1.csv new file mode 100644 index 0000000000000..3eb23e42448d1 --- /dev/null +++ b/pandas/io/tests/sas/data/test_sas7bdat_1.csv @@ -0,0 +1,11 @@ 
+Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Column21,Column22,Column23,Column24,Column25,Column26,Column27,Column28,Column29,Column30,Column31,Column32,Column33,Column34,Column35,Column36,Column37,Column38,Column39,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48,Column49,Column50,Column51,Column52,Column53,Column54,Column55,Column56,Column57,Column58,Column59,Column60,Column61,Column62,Column63,Column64,Column65,Column66,Column67,Column68,Column69,Column70,Column71,Column72,Column73,Column74,Column75,Column76,Column77,Column78,Column79,Column80,Column81,Column82,Column83,Column84,Column85,Column86,Column87,Column88,Column89,Column90,Column91,Column92,Column93,Column94,Column95,Column96,Column97,Column98,Column99,Column100 +0.636,pear,84,2170,0.103,apple,20,,0.621,apple,,9697,0.047,dog,7,2543,0.728,crocodile,55,2615,0.146,crocodile,10,2832,0.644,crocodile,6,9671,,crocodile,28,9126,0.433,crocodile,22,8117,0.318,dog,61,3363,0.938,pear,58,3700,0.844,dog,26,3989,0.132,crocodile,88,8240,0.325,,9,6102,0.032,apple,34,2987,0.651,crocodile,81,8778,,apple,91,9908,0.897,dog,26,3682,0.274,crocodile,75,1629,0.918,apple,9,7935,0.761,crocodile,,2398,0.914,apple,75,9204,0.946,pear,87,5587,0.940,apple,50,1611,0.480,apple,45,3230 +0.283,dog,49,6275,0.398,pear,50,339,0.561,apple,22,8596,0.661,pear,38,4928,0.709,crocodile,73,6011,0.239,crocodile,93,,0.093,crocodile,23,6198,0.757,dog,61,,0.593,pear,12,9571,,dog,6,892,0.883,pear,81,3363,0.166,pear,,1814,0.454,dog,52,1161,0.630,dog,43,159,0.398,apple,17,8194,0.905,dog,29,4752,0.382,pear,44,9302,0.247,apple,45,4645,0.321,apple,56,,0.415,,,5912,0.580,,60,1918,0.821,dog,68,6847,0.542,apple,83,4465,0.169,pear,53,5820,0.942,dog,76,4904 +0.452,pear,35,8627,0.117,pear,70,5704,0.209,apple,7,5079,0.961,pear,73,,0.414,dog,,9681,,pear,80,4183,0.131,crocodile,53,972,,apple,86,772,0.460,,89,5950,0.293,apple,25,288,0.314,dog,38,106,0.108,pear,49,979,0.481,dog,25,7904,0.270,dog,4,4891,,dog,32,9820,0.517,,64,9053,0.487,dog,78,7238,0.488,apple,54,4349,0.356,pear,73,9991,0.113,,63,5731,0.294,dog,58,3060,,pear,2,,0.004,dog,45,9872,0.024,dog,69,,0.336,pear,9, +0.557,dog,29,5292,0.640,pear,34,2766,0.910,dog,26,2576,0.924,pear,85,3495,0.478,crocodile,43,1586,0.576,apple,79,4403,0.329,crocodile,,3572,0.702,dog,46,1913,0.147,,10,4292,0.368,crocodile,27,7037,0.137,pear,19,4545,0.999,apple,81,,0.095,pear,36,9195,0.494,pear,61,3393,,crocodile,27,8252,,dog,87,7968,0.845,apple,31,3801,0.514,dog,,6542,0.483,dog,58,4688,,pear,71,402,0.275,apple,51,3673,0.642,,82,4443,0.538,apple,10,114,0.155,,17,8170,0.493,pear,89,8566 +0.138,,55,1903,0.583,crocodile,34,4170,0.226,crocodile,11,3985,0.271,pear,,4624,,,43,2539,,dog,50,,0.469,crocodile,72,5999,0.476,apple,71,1184,0.333,crocodile,23,5075,0.120,pear,64,5242,0.561,apple,7,8206,0.103,,45,,0.972,,1,432,0.423,dog,4,1860,0.686,apple,43,9640,0.622,pear,15,91,0.795,apple,33,2149,0.248,dog,,,0.942,dog,85,987,0.492,pear,2,754,0.955,apple,48,1268,0.498,apple,83,6654,0.305,crocodile,81,423,0.013,,32,3217,0.046,,57,894 
+0.948,dog,33,8962,0.691,pear,,2276,0.330,crocodile,27,1104,0.668,,73,2883,0.007,,18,3726,0.301,,85,3621,,apple,39,9646,0.305,pear,48,,0.103,dog,71,8216,0.813,dog,41,6387,0.114,apple,24,4428,0.122,dog,41,,0.558,pear,49,6455,0.119,pear,56,9390,0.482,apple,75,9168,0.437,dog,74,4101,0.557,dog,74,1631,0.159,dog,26,2218,0.274,crocodile,,7653,,dog,,9637,0.345,apple,82,440,0.604,apple,89,6848,0.177,pear,31,2597,0.847,dog,77,,0.700,crocodile,60,6088 +0.162,crocodile,17,8189,0.002,pear,30,5161,0.851,,,7230,0.886,dog,43,5277,0.086,dog,10,6891,0.510,dog,63,2275,0.641,dog,74,9202,0.452,pear,4,6580,0.008,,38,,0.491,apple,11,1469,0.672,dog,89,6182,0.477,apple,4,1877,0.241,dog,61,5083,,apple,78,2526,,,,7547,0.969,dog,22,1578,,dog,86,1211,0.221,apple,1,3252,0.146,,85,9278,0.676,dog,30,1218,0.012,apple,93,3159,0.681,crocodile,45,,0.825,crocodile,0,3749,0.831,,74,7021,0.534,pear,20,6122 +0.148,crocodile,37,1740,0.411,apple,23,8833,0.620,crocodile,5,439,0.372,apple,31,,0.881,dog,84,736,0.347,crocodile,46,224,0.080,crocodile,80,891,0.452,pear,82,3304,0.418,pear,85,2984,0.362,dog,65,,0.167,,65,5265,,apple,89,2101,0.864,apple,92,3636,0.382,crocodile,89,269,0.958,dog,23,1419,0.410,pear,86,8488,0.739,pear,,6576,0.613,dog,37,859,0.426,dog,15,8618,0.554,crocodile,51,4864,0.046,,,4421,0.995,dog,25,622,0.191,pear,84,536,0.405,apple,54,7081,0.575,crocodile,15,2570 +,pear,15,3679,0.102,pear,1,3227,0.157,pear,12,1627,0.955,pear,21,64,0.967,dog,14,2876,0.533,,74,4269,0.881,apple,58,5565,,dog,,6286,0.373,crocodile,46,6525,0.477,crocodile,18,1836,0.067,apple,66,3851,0.224,pear,,7233,0.443,,75,5577,0.392,crocodile,92,4353,0.549,pear,96,4884,0.581,crocodile,10,4058,,crocodile,22,8038,0.411,,94,9236,,pear,40,,0.770,crocodile,36,4591,0.854,dog,32,2902,0.724,apple,57,5499,0.566,pear,19,7577,0.853,,,,,crocodile,84,2709 +0.663,pear,,7735,0.086,apple,80,,,pear,16,9159,0.053,dog,52,3478,0.691,pear,49,9979,0.428,dog,46,5776,0.744,crocodile,3,9549,0.249,dog,90,1192,0.009,dog,68,8678,0.046,apple,62,1873,0.684,,31,7227,0.300,crocodile,83,9881,,,82,8272,0.621,crocodile,40,8171,0.002,crocodile,38,,0.580,pear,31,3094,,,5,5711,0.668,,30,217,0.672,dog,34,3184,0.105,pear,,1521,0.239,dog,28,6896,0.011,apple,,4509,0.691,,97,9247,0.167,pear,74,9824,0.945,crocodile,89, diff --git a/pandas/io/tests/sas/data/test_sas7bdat_2.csv b/pandas/io/tests/sas/data/test_sas7bdat_2.csv new file mode 100644 index 0000000000000..adc0587ae2797 --- /dev/null +++ b/pandas/io/tests/sas/data/test_sas7bdat_2.csv @@ -0,0 +1,11 @@ +Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Column21,Column22,Column23,Column24,Column25,Column26,Column27,Column28,Column29,Column30,Column31,Column32,Column33,Column34,Column35,Column36,Column37,Column38,Column39,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48,Column49,Column50,Column51,Column52,Column53,Column54,Column55,Column56,Column57,Column58,Column59,Column60,Column61,Column62,Column63,Column64,Column65,Column66,Column67,Column68,Column69,Column70,Column71,Column72,Column73,Column74,Column75,Column76,Column77,Column78,Column79,Column80,Column81,Column82,Column83,Column84,Column85,Column86,Column87,Column88,Column89,Column90,Column91,Column92,Column93,Column94,Column95,Column96,Column97,Column98,Column99,Column100 
+0.636,高雄市,84,2170,0.103,부산,20,,0.621,부산,,9697,0.047,Иркутск,7,2543,0.728,鱷魚,55,2615,0.146,鱷魚,10,2832,0.644,鱷魚,6,9671,,鱷魚,28,9126,0.433,鱷魚,22,8117,0.318,Иркутск,61,3363,0.938,高雄市,58,3700,0.844,Иркутск,26,3989,0.132,鱷魚,88,8240,0.325,,9,6102,0.032,부산,34,2987,0.651,鱷魚,81,8778,,부산,91,9908,0.897,Иркутск,26,3682,0.274,鱷魚,75,1629,0.918,부산,9,7935,0.761,鱷魚,,2398,0.914,부산,75,9204,0.946,高雄市,87,5587,0.940,부산,50,1611,0.480,부산,45,3230 +0.283,Иркутск,49,6275,0.398,高雄市,50,339,0.561,부산,22,8596,0.661,高雄市,38,4928,0.709,鱷魚,73,6011,0.239,鱷魚,93,,0.093,鱷魚,23,6198,0.757,Иркутск,61,,0.593,高雄市,12,9571,,Иркутск,6,892,0.883,高雄市,81,3363,0.166,高雄市,,1814,0.454,Иркутск,52,1161,0.630,Иркутск,43,159,0.398,부산,17,8194,0.905,Иркутск,29,4752,0.382,高雄市,44,9302,0.247,부산,45,4645,0.321,부산,56,,0.415,,,5912,0.580,,60,1918,0.821,Иркутск,68,6847,0.542,부산,83,4465,0.169,高雄市,53,5820,0.942,Иркутск,76,4904 +0.452,高雄市,35,8627,0.117,高雄市,70,5704,0.209,부산,7,5079,0.961,高雄市,73,,0.414,Иркутск,,9681,,高雄市,80,4183,0.131,鱷魚,53,972,,부산,86,772,0.460,,89,5950,0.293,부산,25,288,0.314,Иркутск,38,106,0.108,高雄市,49,979,0.481,Иркутск,25,7904,0.270,Иркутск,4,4891,,Иркутск,32,9820,0.517,,64,9053,0.487,Иркутск,78,7238,0.488,부산,54,4349,0.356,高雄市,73,9991,0.113,,63,5731,0.294,Иркутск,58,3060,,高雄市,2,,0.004,Иркутск,45,9872,0.024,Иркутск,69,,0.336,高雄市,9, +0.557,Иркутск,29,5292,0.640,高雄市,34,2766,0.910,Иркутск,26,2576,0.924,高雄市,85,3495,0.478,鱷魚,43,1586,0.576,부산,79,4403,0.329,鱷魚,,3572,0.702,Иркутск,46,1913,0.147,,10,4292,0.368,鱷魚,27,7037,0.137,高雄市,19,4545,0.999,부산,81,,0.095,高雄市,36,9195,0.494,高雄市,61,3393,,鱷魚,27,8252,,Иркутск,87,7968,0.845,부산,31,3801,0.514,Иркутск,,6542,0.483,Иркутск,58,4688,,高雄市,71,402,0.275,부산,51,3673,0.642,,82,4443,0.538,부산,10,114,0.155,,17,8170,0.493,高雄市,89,8566 +0.138,,55,1903,0.583,鱷魚,34,4170,0.226,鱷魚,11,3985,0.271,高雄市,,4624,,,43,2539,,Иркутск,50,,0.469,鱷魚,72,5999,0.476,부산,71,1184,0.333,鱷魚,23,5075,0.120,高雄市,64,5242,0.561,부산,7,8206,0.103,,45,,0.972,,1,432,0.423,Иркутск,4,1860,0.686,부산,43,9640,0.622,高雄市,15,91,0.795,부산,33,2149,0.248,Иркутск,,,0.942,Иркутск,85,987,0.492,高雄市,2,754,0.955,부산,48,1268,0.498,부산,83,6654,0.305,鱷魚,81,423,0.013,,32,3217,0.046,,57,894 +0.948,Иркутск,33,8962,0.691,高雄市,,2276,0.330,鱷魚,27,1104,0.668,,73,2883,0.007,,18,3726,0.301,,85,3621,,부산,39,9646,0.305,高雄市,48,,0.103,Иркутск,71,8216,0.813,Иркутск,41,6387,0.114,부산,24,4428,0.122,Иркутск,41,,0.558,高雄市,49,6455,0.119,高雄市,56,9390,0.482,부산,75,9168,0.437,Иркутск,74,4101,0.557,Иркутск,74,1631,0.159,Иркутск,26,2218,0.274,鱷魚,,7653,,Иркутск,,9637,0.345,부산,82,440,0.604,부산,89,6848,0.177,高雄市,31,2597,0.847,Иркутск,77,,0.700,鱷魚,60,6088 +0.162,鱷魚,17,8189,0.002,高雄市,30,5161,0.851,,,7230,0.886,Иркутск,43,5277,0.086,Иркутск,10,6891,0.510,Иркутск,63,2275,0.641,Иркутск,74,9202,0.452,高雄市,4,6580,0.008,,38,,0.491,부산,11,1469,0.672,Иркутск,89,6182,0.477,부산,4,1877,0.241,Иркутск,61,5083,,부산,78,2526,,,,7547,0.969,Иркутск,22,1578,,Иркутск,86,1211,0.221,부산,1,3252,0.146,,85,9278,0.676,Иркутск,30,1218,0.012,부산,93,3159,0.681,鱷魚,45,,0.825,鱷魚,0,3749,0.831,,74,7021,0.534,高雄市,20,6122 +0.148,鱷魚,37,1740,0.411,부산,23,8833,0.620,鱷魚,5,439,0.372,부산,31,,0.881,Иркутск,84,736,0.347,鱷魚,46,224,0.080,鱷魚,80,891,0.452,高雄市,82,3304,0.418,高雄市,85,2984,0.362,Иркутск,65,,0.167,,65,5265,,부산,89,2101,0.864,부산,92,3636,0.382,鱷魚,89,269,0.958,Иркутск,23,1419,0.410,高雄市,86,8488,0.739,高雄市,,6576,0.613,Иркутск,37,859,0.426,Иркутск,15,8618,0.554,鱷魚,51,4864,0.046,,,4421,0.995,Иркутск,25,622,0.191,高雄市,84,536,0.405,부산,54,7081,0.575,鱷魚,15,2570 
+,高雄市,15,3679,0.102,高雄市,1,3227,0.157,高雄市,12,1627,0.955,高雄市,21,64,0.967,Иркутск,14,2876,0.533,,74,4269,0.881,부산,58,5565,,Иркутск,,6286,0.373,鱷魚,46,6525,0.477,鱷魚,18,1836,0.067,부산,66,3851,0.224,高雄市,,7233,0.443,,75,5577,0.392,鱷魚,92,4353,0.549,高雄市,96,4884,0.581,鱷魚,10,4058,,鱷魚,22,8038,0.411,,94,9236,,高雄市,40,,0.770,鱷魚,36,4591,0.854,Иркутск,32,2902,0.724,부산,57,5499,0.566,高雄市,19,7577,0.853,,,,,鱷魚,84,2709 +0.663,高雄市,,7735,0.086,부산,80,,,高雄市,16,9159,0.053,Иркутск,52,3478,0.691,高雄市,49,9979,0.428,Иркутск,46,5776,0.744,鱷魚,3,9549,0.249,Иркутск,90,1192,0.009,Иркутск,68,8678,0.046,부산,62,1873,0.684,,31,7227,0.300,鱷魚,83,9881,,,82,8272,0.621,鱷魚,40,8171,0.002,鱷魚,38,,0.580,高雄市,31,3094,,,5,5711,0.668,,30,217,0.672,Иркутск,34,3184,0.105,高雄市,,1521,0.239,Иркутск,28,6896,0.011,부산,,4509,0.691,,97,9247,0.167,高雄市,74,9824,0.945,鱷魚,89, diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py new file mode 100644 index 0000000000000..a9e6ea68f3979 --- /dev/null +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -0,0 +1,64 @@ +import pandas as pd +from pandas.compat import PY2 +import pandas.util.testing as tm +import os +import io +import numpy as np + + +class TestSAS7BDAT(tm.TestCase): + + def setUp(self): + self.dirpath = tm.get_data_path() + self.data = [] + self.test_ix = [list(range(1, 16)), [16]] + for j in 1, 2: + fname = os.path.join(self.dirpath, "test_sas7bdat_%d.csv" % j) + df = pd.read_csv(fname) + epoch = pd.datetime(1960, 1, 1) + t1 = pd.to_timedelta(df["Column4"], unit='d') + df["Column4"] = epoch + t1 + t2 = pd.to_timedelta(df["Column12"], unit='d') + df["Column12"] = epoch + t2 + for k in range(df.shape[1]): + col = df.iloc[:, k] + if col.dtype == np.int64: + df.iloc[:, k] = df.iloc[:, k].astype(np.float64) + elif col.dtype == np.dtype('O'): + if PY2: + f = lambda x: (x.decode('utf-8') if + isinstance(x, str) else x) + df.iloc[:, k] = df.iloc[:, k].apply(f) + self.data.append(df) + + def test_from_file(self): + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) + df = pd.read_sas(fname, encoding='utf-8') + tm.assert_frame_equal(df, df0) + + def test_from_buffer(self): + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) + byts = open(fname, 'rb').read() + buf = io.BytesIO(byts) + df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8') + tm.assert_frame_equal(df, df0) + + def test_from_iterator(self): + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) + byts = open(fname, 'rb').read() + buf = io.BytesIO(byts) + rdr = pd.read_sas(buf, format="sas7bdat", + iterator=True, encoding='utf-8') + df = rdr.read(2) + tm.assert_frame_equal(df, df0.iloc[0:2, :]) + df = rdr.read(3) + tm.assert_frame_equal(df, df0.iloc[2:5, :]) diff --git a/pandas/io/tests/test_sas.py b/pandas/io/tests/sas/test_xport.py similarity index 73% rename from pandas/io/tests/test_sas.py rename to pandas/io/tests/sas/test_xport.py index 9b31d9443de3b..ae378c41cd24b 100644 --- a/pandas/io/tests/test_sas.py +++ b/pandas/io/tests/sas/test_xport.py @@ -1,6 +1,6 @@ import pandas as pd import pandas.util.testing as tm -from pandas.io.sas import XportReader, read_sas +from pandas.io.sas.sasreader import read_sas import numpy as np import os @@ -33,16 +33,16 @@ def test1_basic(self): numeric_as_float(data_csv) # Read full file - data = XportReader(self.file01).read() + data = read_sas(self.file01, 
format="xport") tm.assert_frame_equal(data, data_csv) # Test incremental read with `read` method. - reader = XportReader(self.file01) + reader = read_sas(self.file01, format="xport", iterator=True) data = reader.read(10) tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test incremental read with `get_chunk` method. - reader = XportReader(self.file01, chunksize=10) + reader = read_sas(self.file01, format="xport", chunksize=10) data = reader.get_chunk() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) @@ -59,20 +59,22 @@ def test1_index(self): numeric_as_float(data_csv) # Read full file - data = XportReader(self.file01, index="SEQN").read() + data = read_sas(self.file01, index="SEQN", format="xport") tm.assert_frame_equal(data, data_csv, check_index_type=False) # Test incremental read with `read` method. - reader = XportReader(self.file01, index="SEQN") + reader = read_sas(self.file01, index="SEQN", format="xport", + iterator=True) data = reader.read(10) - tm.assert_frame_equal(data, data_csv.iloc[ - 0:10, :], check_index_type=False) + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], + check_index_type=False) # Test incremental read with `get_chunk` method. - reader = XportReader(self.file01, index="SEQN", chunksize=10) + reader = read_sas(self.file01, index="SEQN", format="xport", + chunksize=10) data = reader.get_chunk() - tm.assert_frame_equal(data, data_csv.iloc[ - 0:10, :], check_index_type=False) + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], + check_index_type=False) def test1_incremental(self): # Test with DEMO_G.xpt, reading full file incrementally @@ -81,18 +83,13 @@ def test1_incremental(self): data_csv = data_csv.set_index("SEQN") numeric_as_float(data_csv) - reader = XportReader(self.file01, index="SEQN", chunksize=1000) + reader = read_sas(self.file01, index="SEQN", chunksize=1000) all_data = [x for x in reader] data = pd.concat(all_data, axis=0) tm.assert_frame_equal(data, data_csv, check_index_type=False) - reader = XportReader(self.file01, index="SEQN", chunksize=1000) - data = pd.concat(reader, axis=0) - - tm.assert_frame_equal(data, data_csv, check_index_type=False) - def test2(self): # Test with SSHSV1_A.xpt @@ -100,7 +97,7 @@ def test2(self): data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv")) numeric_as_float(data_csv) - data = XportReader(self.file02).read() + data = read_sas(self.file02) tm.assert_frame_equal(data, data_csv) def test_multiple_types(self): @@ -109,10 +106,7 @@ def test_multiple_types(self): # Compare to this data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv")) - data = XportReader(self.file03).read() - tm.assert_frame_equal(data, data_csv) - - data = read_sas(self.file03) + data = read_sas(self.file03, encoding="utf-8") tm.assert_frame_equal(data, data_csv) def test_truncated_float_support(self): @@ -124,8 +118,5 @@ def test_truncated_float_support(self): data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv")) - data = XportReader(self.file04).read() - tm.assert_frame_equal(data.astype('int64'), data_csv) - - data = read_sas(self.file04) + data = read_sas(self.file04, format="xport") tm.assert_frame_equal(data.astype('int64'), data_csv) diff --git a/setup.py b/setup.py index b5609d213c773..f33b01b24c165 100755 --- a/setup.py +++ b/setup.py @@ -262,7 +262,8 @@ class CheckSDist(sdist_class): 'pandas/parser.pyx', 'pandas/src/period.pyx', 'pandas/src/sparse.pyx', - 'pandas/src/testing.pyx'] + 'pandas/src/testing.pyx', + 'pandas/io/sas/saslib.pyx'] def initialize_options(self): sdist_class.initialize_options(self) @@ 
-418,9 +419,11 @@ def pxd(name): 'pandas/src/parser/io.h', 'pandas/src/numpy_helper.h'], 'sources': ['pandas/src/parser/tokenizer.c', - 'pandas/src/parser/io.c']} + 'pandas/src/parser/io.c']}, ) +ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'} + extensions = [] for name, data in ext_data.items(): @@ -527,6 +530,7 @@ def pxd(name): 'pandas.core', 'pandas.indexes', 'pandas.io', + 'pandas.io.sas', 'pandas.rpy', 'pandas.sandbox', 'pandas.sparse',
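Note: the new test modules above exercise pandas.read_sas in its three reading modes -- a full read, an iterator obtained with iterator=True, and chunked reading via chunksize/get_chunk(). The following is a minimal usage sketch of those same call patterns, assuming hypothetical local files example.sas7bdat and example.xpt (not shipped with this patch):

    import pandas as pd

    # Full read: the format is inferred from the .sas7bdat extension but can
    # be forced with format="sas7bdat"; encoding decodes the text columns.
    df = pd.read_sas('example.sas7bdat', encoding='utf-8')

    # Incremental read: iterator=True returns a reader object (SAS7BDATReader
    # here) whose read(n) method yields the next n rows as a DataFrame.
    rdr = pd.read_sas('example.sas7bdat', format='sas7bdat',
                      iterator=True, encoding='utf-8')
    first_two = rdr.read(2)
    next_three = rdr.read(3)

    # Chunked read of an xport file: chunksize returns a reader whose
    # get_chunk() method yields chunksize rows at a time.
    reader = pd.read_sas('example.xpt', format='xport', chunksize=10)
    chunk = reader.get_chunk()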
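The setup.py hunk registers pandas/io/sas/saslib.pyx through pandas' ext_data dictionary and adds pandas.io.sas to the package list. As a generic, stand-alone sketch (not pandas' actual build machinery) of how such a Cython source is typically declared so the build compiles it:

    from setuptools import setup, Extension
    from Cython.Build import cythonize

    # Import path of the compiled module and the Cython source it is built from.
    ext = Extension(name="pandas.io.sas.saslib",
                    sources=["pandas/io/sas/saslib.pyx"])

    setup(
        name="example-package",            # hypothetical package name
        ext_modules=cythonize([ext]),      # generate C and register the extension
    )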