diff --git a/LICENSES/SAS7BDAT_LICENSE b/LICENSES/SAS7BDAT_LICENSE
new file mode 100644
index 0000000000000..8fbf194013e93
--- /dev/null
+++ b/LICENSES/SAS7BDAT_LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2015 Jared Hobbs
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py
index 12e48295d8d05..74bec72eb05e9 100644
--- a/asv_bench/benchmarks/packers.py
+++ b/asv_bench/benchmarks/packers.py
@@ -318,6 +318,24 @@ def remove(self, f):
pass
+class packers_read_sas7bdat(object):
+
+ def setup(self):
+ self.f = 'data/test1.sas7bdat'
+
+ def time_packers_read_sas7bdat(self):
+ pd.read_sas(self.f, format='sas7bdat')
+
+
+class packers_read_xport(object):
+
+ def setup(self):
+ self.f = 'data/paxraw_d_short.xpt'
+
+ def time_packers_read_xport(self):
+ pd.read_sas(self.f, format='xport')
+
+
class packers_write_csv(object):
goal_time = 0.2
@@ -854,4 +872,4 @@ def remove(self, f):
try:
os.remove(self.f)
except:
- pass
\ No newline at end of file
+ pass
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 577d6b34ec719..bbfa711eb4445 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2554,7 +2554,7 @@ both on the writing (serialization), and reading (deserialization).
+----------------------+------------------------+
| 0.18 | >= 0.18 |
+======================+========================+
-
+
Reading (files packed by older versions) is backward-compatible, except for files packed with 0.17 in Python 2, in which case they can only be unpacked in Python 2.
.. ipython:: python
@@ -4198,7 +4198,7 @@ Authenticating with user account credentials is as simple as following the promp
which will be automatically opened for you. You will be authenticated to the specified
``BigQuery`` account using the product name ``pandas GBQ``. It is only possible on local host.
The remote authentication using user account credentials is not currently supported in Pandas.
-Additional information on the authentication mechanism can be found
+Additional information on the authentication mechanism can be found
`here `__.
Authentication with service account credentials is possible via the `'private_key'` parameter. This method
@@ -4564,24 +4564,25 @@ easy conversion to and from pandas.
.. _io.sas_reader:
-SAS Format
-----------
+SAS Formats
+-----------
.. versionadded:: 0.17.0
-The top-level function :func:`read_sas` currently can read (but
-not write) SAS xport (.XPT) format files. Pandas cannot currently
-handle SAS7BDAT files.
+The top-level function :func:`read_sas` can read (but not write) SAS
+``xport`` (.XPT) and ``SAS7BDAT`` (.sas7bdat) format files (SAS7BDAT
+support was added in v0.18.0).
-XPORT files only contain two value types: ASCII text and double
-precision numeric values. There is no automatic type conversion to
-integers, dates, or categoricals. By default the whole file is read
-and returned as a ``DataFrame``.
+SAS files only contain two value types: ASCII text and floating point
+values (usually 8 bytes but sometimes truncated). For xport files,
+there is no automatic type conversion to integers, dates, or
+categoricals. For SAS7BDAT files, the format codes may allow date
+variables to be automatically converted to dates. By default the
+whole file is read and returned as a ``DataFrame``.
-Specify a ``chunksize`` or use ``iterator=True`` to obtain an
-``XportReader`` object for incrementally reading the file. The
-``XportReader`` object also has attributes that contain additional
-information about the file and its variables.
+Specify a ``chunksize`` or use ``iterator=True`` to obtain reader
+objects (``XportReader`` or ``SAS7BDATReader``) for incrementally
+reading the file. The reader objects also have attributes that
+contain additional information about the file and its variables.
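+
+For example, a SAS7BDAT file could be read in chunks as sketched below
+(``example.sas7bdat`` is a placeholder file name):
+
+.. code-block:: python
+
+    rdr = pd.read_sas('example.sas7bdat', chunksize=100000)
+    chunk = rdr.read()  # next 100,000 rows as a DataFrame, or None at EOF
+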
Read a SAS XPORT file:
@@ -4602,6 +4603,8 @@ web site.
.. _specification: https://support.sas.com/techsup/technote/ts140.pdf
+No official documentation is available for the SAS7BDAT format.
+
.. _io.perf:
Performance Considerations
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index aab9e6125732a..b3bbc5cf5ef8c 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -24,6 +24,7 @@ Highlights include:
since 0.14.0. This will now raise a ``TypeError``, see :ref:`here `.
- The ``.to_xarray()`` function has been added for compatibility with the
`xarray package `__, see :ref:`here `.
+- The ``read_sas`` function has been enhanced to read ``sas7bdat`` files, see :ref:`here `.
- Addition of the :ref:`.str.extractall() method `,
and API changes to the :ref:`.str.extract() method `
and :ref:`.str.cat() method `.
@@ -403,6 +404,13 @@ For example, if you have a jupyter notebook you plan to convert to latex using n
Options ``display.latex.escape`` and ``display.latex.longtable`` have also been added to the configuration and are used automatically by the ``to_latex``
method. See the :ref:`options documentation` for more info.
+.. _whatsnew_0180.enhancements.sas:
+
+SAS7BDAT files
+^^^^^^^^^^^^^^
+
+Pandas can now read SAS7BDAT files, including compressed files. The files can be read in their entirety, or incrementally. For full details see :ref:`here `. (:issue:`4052`)
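+
+A minimal sketch of the new usage (``example.sas7bdat`` is a placeholder
+file name):
+
+.. code-block:: python
+
+    df = pd.read_sas('example.sas7bdat')
+
+    # or incrementally, e.g. 100,000 rows at a time
+    rdr = pd.read_sas('example.sas7bdat', chunksize=100000)
+    chunk = rdr.read()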
+
.. _whatsnew_0180.enhancements.other:
Other enhancements
diff --git a/pandas/io/api.py b/pandas/io/api.py
index 3ac4c670c8466..920ece9c4c3a8 100644
--- a/pandas/io/api.py
+++ b/pandas/io/api.py
@@ -11,7 +11,7 @@
from pandas.io.json import read_json
from pandas.io.html import read_html
from pandas.io.sql import read_sql, read_sql_table, read_sql_query
-from pandas.io.sas import read_sas
+from pandas.io.sas.sasreader import read_sas
from pandas.io.stata import read_stata
from pandas.io.pickle import read_pickle, to_pickle
from pandas.io.packers import read_msgpack, to_msgpack
diff --git a/pandas/io/sas/__init__.py b/pandas/io/sas/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
new file mode 100644
index 0000000000000..e068c51df585d
--- /dev/null
+++ b/pandas/io/sas/sas7bdat.py
@@ -0,0 +1,828 @@
+"""
+Read SAS7BDAT files
+
+Based on code written by Jared Hobbs:
+ https://bitbucket.org/jaredhobbs/sas7bdat
+
+See also:
+ https://github.com/BioStatMatt/sas7bdat
+
+Partial documentation of the file format:
+ https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
+
+Reference for binary data compression:
+ http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
+"""
+
+import pandas as pd
+from pandas import compat
+from pandas.io.common import get_filepath_or_buffer, BaseIterator
+import numpy as np
+import struct
+from .saslib import (_rle_decompress, _rdc_decompress,
+ process_byte_array_with_data)
+
+_magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
+ b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
+ b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
+ b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")
+
+_align_1_checker_value = b'3'
+_align_1_offset = 32
+_align_1_length = 1
+_align_1_value = 4
+_u64_byte_checker_value = b'3'
+_align_2_offset = 35
+_align_2_length = 1
+_align_2_value = 4
+_endianness_offset = 37
+_endianness_length = 1
+_platform_offset = 39
+_platform_length = 1
+_encoding_offset = 70
+_encoding_length = 1
+_dataset_offset = 92
+_dataset_length = 64
+_file_type_offset = 156
+_file_type_length = 8
+_date_created_offset = 164
+_date_created_length = 8
+_date_modified_offset = 172
+_date_modified_length = 8
+_header_size_offset = 196
+_header_size_length = 4
+_page_size_offset = 200
+_page_size_length = 4
+_page_count_offset = 204
+_page_count_length = 4
+_sas_release_offset = 216
+_sas_release_length = 8
+_sas_server_type_offset = 224
+_sas_server_type_length = 16
+_os_version_number_offset = 240
+_os_version_number_length = 16
+_os_maker_offset = 256
+_os_maker_length = 16
+_os_name_offset = 272
+_os_name_length = 16
+_page_bit_offset_x86 = 16
+_page_bit_offset_x64 = 32
+_subheader_pointer_length_x86 = 12
+_subheader_pointer_length_x64 = 24
+_page_type_offset = 0
+_page_type_length = 2
+_block_count_offset = 2
+_block_count_length = 2
+_subheader_count_offset = 4
+_subheader_count_length = 2
+_page_meta_type = 0
+_page_data_type = 256
+_page_amd_type = 1024
+_page_metc_type = 16384
+_page_comp_type = -28672
+_page_mix_types = [512, 640]
+_subheader_pointers_offset = 8
+_truncated_subheader_id = 1
+_compressed_subheader_id = 4
+_compressed_subheader_type = 1
+_text_block_size_length = 2
+_row_length_offset_multiplier = 5
+_row_count_offset_multiplier = 6
+_col_count_p1_multiplier = 9
+_col_count_p2_multiplier = 10
+_row_count_on_mix_page_offset_multiplier = 15
+_column_name_pointer_length = 8
+_column_name_text_subheader_offset = 0
+_column_name_text_subheader_length = 2
+_column_name_offset_offset = 2
+_column_name_offset_length = 2
+_column_name_length_offset = 4
+_column_name_length_length = 2
+_column_data_offset_offset = 8
+_column_data_length_offset = 8
+_column_data_length_length = 4
+_column_type_offset = 14
+_column_type_length = 1
+_column_format_text_subheader_index_offset = 22
+_column_format_text_subheader_index_length = 2
+_column_format_offset_offset = 24
+_column_format_offset_length = 2
+_column_format_length_offset = 26
+_column_format_length_length = 2
+_column_label_text_subheader_index_offset = 28
+_column_label_text_subheader_index_length = 2
+_column_label_offset_offset = 30
+_column_label_offset_length = 2
+_column_label_length_offset = 32
+_column_label_length_length = 2
+_rle_compression = 'SASYZCRL'
+_rdc_compression = 'SASYZCR2'
+
+_compression_literals = [_rle_compression, _rdc_compression]
+
+# Incomplete list of encodings
+_encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
+ 61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}
+
+# Should be enum
+
+
+class _index(object):
+ rowSizeIndex = 0
+ columnSizeIndex = 1
+ subheaderCountsIndex = 2
+ columnTextIndex = 3
+ columnNameIndex = 4
+ columnAttributesIndex = 5
+ formatAndLabelIndex = 6
+ columnListIndex = 7
+ dataSubheaderIndex = 8
+
+
+_subheader_signature_to_index = {
+ b"\xF7\xF7\xF7\xF7": _index.rowSizeIndex,
+ b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": _index.rowSizeIndex,
+ b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": _index.rowSizeIndex,
+ b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": _index.rowSizeIndex,
+ b"\xF6\xF6\xF6\xF6": _index.columnSizeIndex,
+ b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": _index.columnSizeIndex,
+ b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": _index.columnSizeIndex,
+ b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": _index.columnSizeIndex,
+ b"\x00\xFC\xFF\xFF": _index.subheaderCountsIndex,
+ b"\xFF\xFF\xFC\x00": _index.subheaderCountsIndex,
+ b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": _index.subheaderCountsIndex,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": _index.subheaderCountsIndex,
+ b"\xFD\xFF\xFF\xFF": _index.columnTextIndex,
+ b"\xFF\xFF\xFF\xFD": _index.columnTextIndex,
+ b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnTextIndex,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": _index.columnTextIndex,
+ b"\xFF\xFF\xFF\xFF": _index.columnNameIndex,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnNameIndex,
+ b"\xFC\xFF\xFF\xFF": _index.columnAttributesIndex,
+ b"\xFF\xFF\xFF\xFC": _index.columnAttributesIndex,
+ b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnAttributesIndex,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": _index.columnAttributesIndex,
+ b"\xFE\xFB\xFF\xFF": _index.formatAndLabelIndex,
+ b"\xFF\xFF\xFB\xFE": _index.formatAndLabelIndex,
+ b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": _index.formatAndLabelIndex,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": _index.formatAndLabelIndex,
+ b"\xFE\xFF\xFF\xFF": _index.columnListIndex,
+ b"\xFF\xFF\xFF\xFE": _index.columnListIndex,
+ b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": _index.columnListIndex,
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": _index.columnListIndex}
+
+
+class _subheader_pointer(object):
+ pass
+
+
+class _column(object):
+ pass
+
+
+# SAS7BDAT represents a SAS data file in SAS7BDAT format.
+class SAS7BDATReader(BaseIterator):
+ """
+ Read SAS files in SAS7BDAT format.
+
+ Parameters
+ ----------
+ path_or_buf : path name or buffer
+ Name of SAS file or file-like object pointing to SAS file
+ contents.
+ index : column identifier, defaults to None
+ Column to use as index.
+ convert_dates : boolean, defaults to True
+        Attempt to convert dates to Pandas datetime values. Note that
+        not all SAS date formats are supported.
+ blank_missing : boolean, defaults to True
+ Convert empty strings to missing values (SAS uses blanks to
+ indicate missing character variables).
+ chunksize : int, defaults to None
+        Return a SAS7BDATReader object for incremental reading;
+        each call to ``read()`` returns a chunk with the given
+        number of rows.
+ encoding : string, defaults to None
+ String encoding. If None, text variables are left as raw bytes.
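+
+    Examples
+    --------
+    A minimal usage sketch (``example.sas7bdat`` is a placeholder path)::
+
+        rdr = SAS7BDATReader('example.sas7bdat', chunksize=100)
+        chunk = rdr.read()  # next 100 rows as a DataFrame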
+ """
+
+ def __init__(self, path_or_buf, index=None, convert_dates=True,
+ blank_missing=True, chunksize=None, encoding=None):
+
+ self.index = index
+ self.convert_dates = convert_dates
+ self.blank_missing = blank_missing
+ self.chunksize = chunksize
+ self.encoding = encoding
+
+ self.compression = ""
+ self.column_names_strings = []
+ self.column_names = []
+ self.column_types = []
+ self.column_formats = []
+ self.columns = []
+
+ self._current_page_data_subheader_pointers = []
+ self._cached_page = None
+ self._column_data_lengths = []
+ self._column_data_offsets = []
+ self._current_row_in_file_index = 0
+ self._current_row_on_page_index = 0
+
+ self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf)
+ if isinstance(self._path_or_buf, compat.string_types):
+ self._path_or_buf = open(self._path_or_buf, 'rb')
+
+ self._get_properties()
+ self._parse_metadata()
+
+ def _get_properties(self):
+
+ # Check magic number
+ self._path_or_buf.seek(0)
+ self._cached_page = self._path_or_buf.read(288)
+ if self._cached_page[0:len(_magic)] != _magic:
+ raise ValueError("magic number mismatch (not a SAS file?)")
+
+ # Get alignment information
+ align1, align2 = 0, 0
+ buf = self._read_bytes(_align_1_offset, _align_1_length)
+ if buf == _u64_byte_checker_value:
+ align2 = _align_2_value
+ self.U64 = True
+ self._int_length = 8
+ self._page_bit_offset = _page_bit_offset_x64
+ self._subheader_pointer_length = _subheader_pointer_length_x64
+ else:
+ self.U64 = False
+ self._page_bit_offset = _page_bit_offset_x86
+ self._subheader_pointer_length = _subheader_pointer_length_x86
+ self._int_length = 4
+ buf = self._read_bytes(_align_2_offset, _align_2_length)
+ if buf == _align_1_checker_value:
+ align1 = _align_2_value
+ total_align = align1 + align2
+
+ # Get endianness information
+ buf = self._read_bytes(_endianness_offset, _endianness_length)
+ if buf == b'\x01':
+ self.byte_order = "<"
+ else:
+ self.byte_order = ">"
+
+ # Get encoding information
+ buf = self._read_bytes(_encoding_offset, _encoding_length)[0]
+ if buf in _encoding_names:
+ self.file_encoding = _encoding_names[buf]
+ else:
+ self.file_encoding = "unknown (code=%s)" % str(buf)
+
+ # Get platform information
+ buf = self._read_bytes(_platform_offset, _platform_length)
+ if buf == b'1':
+ self.platform = "unix"
+ elif buf == b'2':
+ self.platform = "windows"
+ else:
+ self.platform = "unknown"
+
+ buf = self._read_bytes(_dataset_offset, _dataset_length)
+ self.name = buf.rstrip(b'\x00 ').decode()
+
+ buf = self._read_bytes(_file_type_offset, _file_type_length)
+ self.file_type = buf.rstrip(b'\x00 ').decode()
+
+ # Timestamp is epoch 01/01/1960
+ epoch = pd.datetime(1960, 1, 1)
+ x = self._read_float(_date_created_offset + align1,
+ _date_created_length)
+ self.date_created = epoch + pd.to_timedelta(x, unit='s')
+ x = self._read_float(_date_modified_offset + align1,
+ _date_modified_length)
+ self.date_modified = epoch + pd.to_timedelta(x, unit='s')
+
+ self.header_length = self._read_int(_header_size_offset + align1,
+ _header_size_length)
+
+ # Read the rest of the header into cached_page.
+ buf = self._path_or_buf.read(self.header_length - 288)
+ self._cached_page += buf
+ if len(self._cached_page) != self.header_length:
+ raise ValueError("The SAS7BDAT file appears to be truncated.")
+
+ self._page_length = self._read_int(_page_size_offset + align1,
+ _page_size_length)
+ self._page_count = self._read_int(_page_count_offset + align1,
+ _page_count_length)
+
+ buf = self._read_bytes(_sas_release_offset + total_align,
+ _sas_release_length)
+ self.sas_release = buf.rstrip(b'\x00 ').decode()
+
+ buf = self._read_bytes(_sas_server_type_offset + total_align,
+ _sas_server_type_length)
+ self.server_type = buf.rstrip(b'\x00 ').decode()
+
+ buf = self._read_bytes(_os_version_number_offset + total_align,
+ _os_version_number_length)
+ self.os_version = buf.rstrip(b'\x00 ').decode()
+
+ buf = self._read_bytes(
+ _os_name_offset, _os_name_length).rstrip(b'\x00 ')
+ if len(buf) > 0:
+ self.os_name = buf.rstrip(b'\x00 ').decode()
+ else:
+            buf = self._read_bytes(_os_maker_offset, _os_maker_length)
+ self.os_name = buf.rstrip(b'\x00 ').decode()
+
+ # Read a single float of the given width (4 or 8).
+ def _read_float(self, offset, width):
+ if width not in (4, 8):
+ raise ValueError("invalid float width")
+ buf = self._read_bytes(offset, width)
+ fd = "f" if width == 4 else "d"
+ return struct.unpack(self.byte_order + fd, buf)[0]
+
+ # Read a single signed integer of the given width (1, 2, 4 or 8).
+ def _read_int(self, offset, width):
+ if width not in (1, 2, 4, 8):
+ raise ValueError("invalid int width")
+ buf = self._read_bytes(offset, width)
+ it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
+ iv = struct.unpack(self.byte_order + it, buf)[0]
+ return iv
+
+ def _read_bytes(self, offset, length):
+ if self._cached_page is None:
+ self._path_or_buf.seek(offset)
+ buf = self._path_or_buf.read(length)
+ if len(buf) < length:
+ msg = "Unable to read {:d} bytes from file position {:d}."
+ raise ValueError(msg.format(length, offset))
+ return buf
+ else:
+ if offset + length > len(self._cached_page):
+ raise ValueError("The cached page is too small.")
+ return self._cached_page[offset:offset + length]
+
+ def _parse_metadata(self):
+ done = False
+ while not done:
+ self._cached_page = self._path_or_buf.read(self._page_length)
+ if len(self._cached_page) <= 0:
+ break
+ if len(self._cached_page) != self._page_length:
+ raise ValueError(
+ "Failed to read a meta data page from the SAS file.")
+ done = self._process_page_meta()
+
+ def _process_page_meta(self):
+ self._read_page_header()
+ pt = [_page_meta_type, _page_amd_type] + _page_mix_types
+ if self._current_page_type in pt:
+ self._process_page_metadata()
+ return ((self._current_page_type in [256] + _page_mix_types) or
+ (self._current_page_data_subheader_pointers is not None))
+
+ def _read_page_header(self):
+ bit_offset = self._page_bit_offset
+ tx = _page_type_offset + bit_offset
+ self._current_page_type = self._read_int(tx, _page_type_length)
+ tx = _block_count_offset + bit_offset
+ self._current_page_block_count = self._read_int(tx,
+ _block_count_length)
+ tx = _subheader_count_offset + bit_offset
+ self._current_page_subheaders_count = (
+ self._read_int(tx, _subheader_count_length))
+
+ def _process_page_metadata(self):
+ bit_offset = self._page_bit_offset
+
+ for i in range(self._current_page_subheaders_count):
+ pointer = self._process_subheader_pointers(
+ _subheader_pointers_offset + bit_offset, i)
+ if pointer.length == 0:
+ continue
+ if pointer.compression == _truncated_subheader_id:
+ continue
+ subheader_signature = self._read_subheader_signature(
+ pointer.offset)
+ subheader_index = (
+ self._get_subheader_index(subheader_signature,
+ pointer.compression, pointer.ptype))
+ self._process_subheader(subheader_index, pointer)
+
+ def _get_subheader_index(self, signature, compression, ptype):
+ index = _subheader_signature_to_index.get(signature)
+ if index is None:
+ f1 = ((compression == _compressed_subheader_id) or
+ (compression == 0))
+ f2 = (ptype == _compressed_subheader_type)
+ if (self.compression != "") and f1 and f2:
+ index = _index.dataSubheaderIndex
+ else:
+ raise ValueError("Unknown subheader signature")
+ return index
+
+ def _process_subheader_pointers(self, offset, subheader_pointer_index):
+
+ subheader_pointer_length = self._subheader_pointer_length
+ total_offset = (offset +
+ subheader_pointer_length * subheader_pointer_index)
+
+ subheader_offset = self._read_int(total_offset, self._int_length)
+ total_offset += self._int_length
+
+ subheader_length = self._read_int(total_offset, self._int_length)
+ total_offset += self._int_length
+
+ subheader_compression = self._read_int(total_offset, 1)
+ total_offset += 1
+
+ subheader_type = self._read_int(total_offset, 1)
+
+ x = _subheader_pointer()
+ x.offset = subheader_offset
+ x.length = subheader_length
+ x.compression = subheader_compression
+ x.ptype = subheader_type
+
+ return x
+
+ def _read_subheader_signature(self, offset):
+ subheader_signature = self._read_bytes(offset, self._int_length)
+ return subheader_signature
+
+ def _process_subheader(self, subheader_index, pointer):
+ offset = pointer.offset
+ length = pointer.length
+
+ if subheader_index == _index.rowSizeIndex:
+ processor = self._process_rowsize_subheader
+ elif subheader_index == _index.columnSizeIndex:
+ processor = self._process_columnsize_subheader
+ elif subheader_index == _index.columnTextIndex:
+ processor = self._process_columntext_subheader
+ elif subheader_index == _index.columnNameIndex:
+ processor = self._process_columnname_subheader
+ elif subheader_index == _index.columnAttributesIndex:
+ processor = self._process_columnattributes_subheader
+ elif subheader_index == _index.formatAndLabelIndex:
+ processor = self._process_format_subheader
+ elif subheader_index == _index.columnListIndex:
+ processor = self._process_columnlist_subheader
+ elif subheader_index == _index.subheaderCountsIndex:
+ processor = self._process_subheader_counts
+ elif subheader_index == _index.dataSubheaderIndex:
+ self._current_page_data_subheader_pointers.append(pointer)
+ return
+ else:
+ raise ValueError("unknown subheader index")
+
+ processor(offset, length)
+
+ def _process_rowsize_subheader(self, offset, length):
+
+ int_len = self._int_length
+ lcs_offset = offset
+ lcp_offset = offset
+ if self.U64:
+ lcs_offset += 682
+ lcp_offset += 706
+ else:
+ lcs_offset += 354
+ lcp_offset += 378
+
+ self.row_length = self._read_int(
+ offset + _row_length_offset_multiplier * int_len, int_len)
+ self.row_count = self._read_int(
+ offset + _row_count_offset_multiplier * int_len, int_len)
+ self.col_count_p1 = self._read_int(
+ offset + _col_count_p1_multiplier * int_len, int_len)
+ self.col_count_p2 = self._read_int(
+ offset + _col_count_p2_multiplier * int_len, int_len)
+ mx = _row_count_on_mix_page_offset_multiplier * int_len
+ self._mix_page_row_count = self._read_int(offset + mx, int_len)
+ self._lcs = self._read_int(lcs_offset, 2)
+ self._lcp = self._read_int(lcp_offset, 2)
+
+ def _process_columnsize_subheader(self, offset, length):
+ int_len = self._int_length
+ offset += int_len
+ self.column_count = self._read_int(offset, int_len)
+ if (self.col_count_p1 + self.col_count_p2 !=
+ self.column_count):
+ print("Warning: column count mismatch (%d + %d != %d)\n",
+ self.col_count_p1, self.col_count_p2, self.column_count)
+
+ # Unknown purpose
+ def _process_subheader_counts(self, offset, length):
+ pass
+
+ def _process_columntext_subheader(self, offset, length):
+
+ offset += self._int_length
+ text_block_size = self._read_int(offset, _text_block_size_length)
+
+ buf = self._read_bytes(offset, text_block_size)
+ self.column_names_strings.append(
+ buf[0:text_block_size].rstrip(b"\x00 ").decode())
+
+ if len(self.column_names_strings) == 1:
+ column_name = self.column_names_strings[0]
+ compression_literal = ""
+ for cl in _compression_literals:
+ if cl in column_name:
+ compression_literal = cl
+ self.compression = compression_literal
+ offset -= self._int_length
+
+ offset1 = offset + 16
+ if self.U64:
+ offset1 += 4
+
+ buf = self._read_bytes(offset1, self._lcp)
+ compression_literal = buf.rstrip(b"\x00")
+        if compression_literal == b"":
+ self._lcs = 0
+ offset1 = offset + 32
+ if self.U64:
+ offset1 += 4
+ buf = self._read_bytes(offset1, self._lcp)
+ self.creator_proc = buf[0:self._lcp].decode()
+ elif compression_literal == _rle_compression:
+ offset1 = offset + 40
+ if self.U64:
+ offset1 += 4
+ buf = self._read_bytes(offset1, self._lcp)
+ self.creator_proc = buf[0:self._lcp].decode()
+ elif self._lcs > 0:
+ self._lcp = 0
+ offset1 = offset + 16
+ if self.U64:
+ offset1 += 4
+ buf = self._read_bytes(offset1, self._lcs)
+ self.creator_proc = buf[0:self._lcp].decode()
+
+ def _process_columnname_subheader(self, offset, length):
+ int_len = self._int_length
+ offset += int_len
+ column_name_pointers_count = (length - 2 * int_len - 12) // 8
+ for i in range(column_name_pointers_count):
+ text_subheader = offset + _column_name_pointer_length * \
+ (i + 1) + _column_name_text_subheader_offset
+ col_name_offset = offset + _column_name_pointer_length * \
+ (i + 1) + _column_name_offset_offset
+ col_name_length = offset + _column_name_pointer_length * \
+ (i + 1) + _column_name_length_offset
+
+ idx = self._read_int(
+ text_subheader, _column_name_text_subheader_length)
+ col_offset = self._read_int(
+ col_name_offset, _column_name_offset_length)
+ col_len = self._read_int(
+ col_name_length, _column_name_length_length)
+
+ name_str = self.column_names_strings[idx]
+ self.column_names.append(name_str[col_offset:col_offset + col_len])
+
+ def _process_columnattributes_subheader(self, offset, length):
+ int_len = self._int_length
+ column_attributes_vectors_count = (
+ length - 2 * int_len - 12) // (int_len + 8)
+ self.column_types = np.empty(
+ column_attributes_vectors_count, dtype=np.dtype('S1'))
+ for i in range(column_attributes_vectors_count):
+ col_data_offset = (offset + int_len +
+ _column_data_offset_offset + i * (int_len + 8))
+ col_data_len = (offset + 2 * int_len +
+ _column_data_length_offset + i * (int_len + 8))
+ col_types = (offset + 2 * int_len +
+ _column_type_offset + i * (int_len + 8))
+
+ self._column_data_offsets.append(
+ self._read_int(col_data_offset, int_len))
+
+ x = self._read_int(col_data_len, _column_data_length_length)
+ self._column_data_lengths.append(x)
+
+ x = self._read_int(col_types, _column_type_length)
+ if x == 1:
+ self.column_types[i] = b'd'
+ else:
+ self.column_types[i] = b's'
+
+ def _process_columnlist_subheader(self, offset, length):
+ # unknown purpose
+ pass
+
+ def _process_format_subheader(self, offset, length):
+ int_len = self._int_length
+ text_subheader_format = offset + \
+ _column_format_text_subheader_index_offset + 3 * int_len
+ col_format_offset = offset + _column_format_offset_offset + 3 * int_len
+ col_format_len = offset + _column_format_length_offset + 3 * int_len
+ text_subheader_label = offset + \
+ _column_label_text_subheader_index_offset + 3 * int_len
+ col_label_offset = offset + _column_label_offset_offset + 3 * int_len
+ col_label_len = offset + _column_label_length_offset + 3 * int_len
+
+ x = self._read_int(text_subheader_format,
+ _column_format_text_subheader_index_length)
+ format_idx = min(x, len(self.column_names_strings) - 1)
+
+ format_start = self._read_int(
+ col_format_offset, _column_format_offset_length)
+ format_len = self._read_int(
+ col_format_len, _column_format_length_length)
+
+ label_idx = self._read_int(
+ text_subheader_label, _column_label_text_subheader_index_length)
+ label_idx = min(label_idx, len(self.column_names_strings) - 1)
+
+ label_start = self._read_int(
+ col_label_offset, _column_label_offset_length)
+ label_len = self._read_int(col_label_len, _column_label_length_length)
+
+ label_names = self.column_names_strings[label_idx]
+ column_label = label_names[label_start: label_start + label_len]
+ format_names = self.column_names_strings[format_idx]
+ column_format = format_names[format_start: format_start + format_len]
+ current_column_number = len(self.columns)
+
+ col = _column()
+ col.col_id = current_column_number
+ col.name = self.column_names[current_column_number]
+ col.label = column_label
+ col.format = column_format
+ col.ctype = self.column_types[current_column_number]
+ col.length = self._column_data_lengths[current_column_number]
+
+ self.column_formats.append(column_format)
+ self.columns.append(col)
+
+ def read(self, nrows=None):
+
+ if (nrows is None) and (self.chunksize is not None):
+ nrows = self.chunksize
+ elif nrows is None:
+ nrows = self.row_count
+
+ if self._current_row_in_file_index >= self.row_count:
+ return None
+
+ nd = (self.column_types == b'd').sum()
+ ns = (self.column_types == b's').sum()
+
+ self._string_chunk = np.empty((ns, nrows), dtype=np.object)
+ self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8)
+
+ self._current_row_in_chunk_index = 0
+ for i in range(nrows):
+ done = self._readline()
+ if done:
+ break
+
+ rslt = self._chunk_to_dataframe()
+ if self.index is not None:
+ rslt = rslt.set_index(self.index)
+
+ return rslt
+
+ def _readline(self):
+
+ bit_offset = self._page_bit_offset
+ subheader_pointer_length = self._subheader_pointer_length
+
+ # If there is no page, go to the end of the header and read a page.
+ if self._cached_page is None:
+ self._path_or_buf.seek(self.header_length)
+ done = self._read_next_page()
+ if done:
+ return True
+
+ # Loop until a data row is read
+ while True:
+ if self._current_page_type == _page_meta_type:
+ flag = (self._current_row_on_page_index >=
+ len(self._current_page_data_subheader_pointers))
+ if flag:
+ done = self._read_next_page()
+ if done:
+ return True
+ self._current_row_on_page_index = 0
+ continue
+ current_subheader_pointer = (
+ self._current_page_data_subheader_pointers[
+ self._current_row_on_page_index])
+ process_byte_array_with_data(self,
+ current_subheader_pointer.offset,
+ current_subheader_pointer.length,
+ self._byte_chunk,
+ self._string_chunk)
+ return False
+ elif self._current_page_type in _page_mix_types:
+ align_correction = (bit_offset + _subheader_pointers_offset +
+ self._current_page_subheaders_count *
+ subheader_pointer_length)
+ align_correction = align_correction % 8
+ offset = bit_offset + align_correction
+ offset += _subheader_pointers_offset
+ offset += (self._current_page_subheaders_count *
+ subheader_pointer_length)
+ offset += self._current_row_on_page_index * self.row_length
+ process_byte_array_with_data(self, offset, self.row_length,
+ self._byte_chunk,
+ self._string_chunk)
+ mn = min(self.row_count, self._mix_page_row_count)
+ if self._current_row_on_page_index == mn:
+ done = self._read_next_page()
+ if done:
+ return True
+ self._current_row_on_page_index = 0
+ return False
+ elif self._current_page_type == _page_data_type:
+ process_byte_array_with_data(self,
+ bit_offset +
+ _subheader_pointers_offset +
+ self._current_row_on_page_index *
+ self.row_length,
+ self.row_length, self._byte_chunk,
+ self._string_chunk)
+ flag = (self._current_row_on_page_index ==
+ self._current_page_block_count)
+ if flag:
+ done = self._read_next_page()
+ if done:
+ return True
+ self._current_row_on_page_index = 0
+ return False
+ else:
+ raise ValueError("unknown page type: %s",
+ self._current_page_type)
+
+ def _read_next_page(self):
+ self._current_page_data_subheader_pointers = []
+ self._cached_page = self._path_or_buf.read(self._page_length)
+ if len(self._cached_page) <= 0:
+ return True
+ elif len(self._cached_page) != self._page_length:
+ msg = ("failed to read complete page from file "
+ "(read {:d} of {:d} bytes)")
+ raise ValueError(msg.format(len(self._cached_page),
+ self._page_length))
+
+ self._read_page_header()
+ if self._current_page_type == _page_meta_type:
+ self._process_page_metadata()
+        pt = [_page_meta_type, _page_data_type] + _page_mix_types
+ if self._current_page_type not in pt:
+ return self._read_next_page()
+
+ return False
+
+ def _decompress(self, row_length, page):
+ page = np.frombuffer(page, dtype=np.uint8)
+ if self.compression == _rle_compression:
+ return _rle_decompress(row_length, page)
+ elif self.compression == _rdc_compression:
+ return _rdc_decompress(row_length, page)
+ else:
+ raise ValueError("unknown SAS compression method: %s" %
+ self.compression)
+
+ def _chunk_to_dataframe(self):
+
+ n = self._current_row_in_chunk_index
+ m = self._current_row_in_file_index
+ ix = range(m - n, m)
+ rslt = pd.DataFrame(index=ix)
+
+ js, jb = 0, 0
+ for j in range(self.column_count):
+
+ name = self.column_names[j]
+
+ if self.column_types[j] == b'd':
+ rslt[name] = self._byte_chunk[jb, :].view(
+ dtype=self.byte_order + 'd')
+ rslt[name] = np.asarray(rslt[name], dtype=np.float64)
+ if self.convert_dates and (self.column_formats[j] == "MMDDYY"):
+ epoch = pd.datetime(1960, 1, 1)
+ rslt[name] = epoch + pd.to_timedelta(rslt[name], unit='d')
+ jb += 1
+ elif self.column_types[j] == b's':
+ rslt[name] = self._string_chunk[js, :]
+ rslt[name] = rslt[name].apply(lambda x: x.rstrip(b'\x00 '))
+ if self.encoding is not None:
+ rslt[name] = rslt[name].apply(
+ lambda x: x.decode(encoding=self.encoding))
+ if self.blank_missing:
+ ii = rslt[name].str.len() == 0
+ rslt.loc[ii, name] = np.nan
+ js += 1
+ else:
+ raise ValueError("unknown column type %s" %
+ self.column_types[j])
+
+ return rslt
diff --git a/pandas/io/sas.py b/pandas/io/sas/sas_xport.py
similarity index 86%
rename from pandas/io/sas.py
rename to pandas/io/sas/sas_xport.py
index 49013a98c77ff..e4ca99fdcb109 100644
--- a/pandas/io/sas.py
+++ b/pandas/io/sas/sas_xport.py
@@ -15,9 +15,10 @@
import struct
import numpy as np
from pandas.util.decorators import Appender
+import warnings
-_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD"
- "!!!!!!!000000000000000000000000000000 ")
+_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
+ "000000000000000000000000000000 ")
_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
"000000000000000001600000000")
_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
@@ -28,6 +29,7 @@
'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
'nifl', 'nifd', 'npos', '_']
+
_base_params_doc = """\
Parameters
----------
@@ -112,25 +114,6 @@
"""
-@Appender(_read_sas_doc)
-def read_sas(filepath_or_buffer, format='xport', index=None,
- encoding='ISO-8859-1', chunksize=None, iterator=False):
-
- format = format.lower()
-
- if format == 'xport':
- reader = XportReader(filepath_or_buffer, index=index,
- encoding=encoding,
- chunksize=chunksize)
- else:
- raise ValueError('only xport format is supported')
-
- if iterator or chunksize:
- return reader
-
- return reader.read()
-
-
def _parse_date(datestr):
""" Given a date in xport format, return Python date. """
try:
@@ -282,8 +265,9 @@ def _read_header(self):
raise ValueError("Header record is not an XPORT file.")
line2 = self._get_row()
- file_info = _split_line(line2, [['prefix', 24], ['version', 8],
- ['OS', 8], ['_', 24], ['created', 16]])
+ fif = [['prefix', 24], ['version', 8], ['OS', 8],
+ ['_', 24], ['created', 16]]
+ file_info = _split_line(line2, fif)
if file_info['prefix'] != "SAS SAS SASLIB":
raise ValueError("Header record has invalid prefix.")
file_info['created'] = _parse_date(file_info['created'])
@@ -295,22 +279,19 @@ def _read_header(self):
# read member header
header1 = self._get_row()
header2 = self._get_row()
- if (not header1.startswith(_correct_header1) or
- not header2 == _correct_header2):
- raise ValueError("Member header not found.")
- fieldnamelength = int(header1[-5:-2]) # usually 140, could be 135
+ headflag1 = header1.startswith(_correct_header1)
+ headflag2 = (header2 == _correct_header2)
+ if not (headflag1 and headflag2):
+ raise ValueError("Member header not found")
+ # usually 140, could be 135
+ fieldnamelength = int(header1[-5:-2])
# member info
- member_info = _split_line(self._get_row(), [['prefix', 8],
- ['set_name', 8],
- ['sasdata', 8],
- ['version', 8],
- ['OS', 8], ['_', 24],
- ['created', 16]])
- member_info.update(_split_line(self._get_row(),
- [['modified', 16],
- ['_', 16],
- ['label', 40], ['type', 8]]))
+ mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
+ ['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
+ member_info = _split_line(self._get_row(), mem)
+ mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
+ member_info.update(_split_line(self._get_row(), mem))
member_info['modified'] = _parse_date(member_info['modified'])
member_info['created'] = _parse_date(member_info['created'])
self.member_info = member_info
@@ -319,15 +300,16 @@ def _read_header(self):
types = {1: 'numeric', 2: 'char'}
fieldcount = int(self._get_row()[54:58])
datalength = fieldnamelength * fieldcount
- if datalength % 80: # round up to nearest 80
+ # round up to nearest 80
+ if datalength % 80:
datalength += 80 - datalength % 80
fielddata = self.filepath_or_buffer.read(datalength)
fields = []
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
- field, fielddata = (
- fielddata[:fieldnamelength], fielddata[fieldnamelength:])
+ field, fielddata = (fielddata[:fieldnamelength],
+ fielddata[fieldnamelength:])
# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
@@ -339,8 +321,8 @@ def _read_header(self):
field['ntype'] = types[field['ntype']]
fl = field['field_length']
if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
- raise TypeError("Floating point field width %d is not between "
- "2 and 8." % fl)
+ msg = "Floating field width {0} is not between 2 and 8."
+ raise TypeError(msg.format(fl))
for k, v in field.items():
try:
@@ -376,8 +358,8 @@ def _record_count(self):
"""
Get number of records in file.
- This is maybe suboptimal because we have to seek to the end of the
- file.
+ This is maybe suboptimal because we have to seek to the end of
+ the file.
Side effect: returns file position to record_start.
"""
@@ -387,7 +369,6 @@ def _record_count(self):
self.record_start)
if total_records_length % 80 != 0:
- import warnings
warnings.warn("xport file may be corrupted")
if self.record_length > 80:
@@ -461,7 +442,8 @@ def read(self, nrows=None):
elif self.fields[j]['ntype'] == 'char':
v = [y.rstrip() for y in vec]
if compat.PY3:
- v = [y.decode(self._encoding) for y in v]
+ if self._encoding is not None:
+ v = [y.decode(self._encoding) for y in v]
df[x] = v
if self._index is None:
diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/saslib.pyx
new file mode 100644
index 0000000000000..a963bf4fe25d3
--- /dev/null
+++ b/pandas/io/sas/saslib.pyx
@@ -0,0 +1,237 @@
+import numpy as np
+cimport numpy as np
+from numpy cimport uint8_t, uint16_t
+
+# rle_decompress decompresses data using a Run Length Encoding
+# algorithm. It is partially documented here:
+#
+# https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf
+def _rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
+
+ cdef uint8_t control_byte
+ cdef uint8_t [:] result = np.zeros(result_length, np.uint8)
+
+ cdef int rpos = 0
+ cdef int ipos = 0
+ cdef int i
+ cdef int nbytes
+ cdef uint8_t x
+    cdef int length = len(inbuff)
+
+ while ipos < length:
+ control_byte = inbuff[ipos] & 0xF0
+ end_of_first_byte = int(inbuff[ipos] & 0x0F)
+ ipos += 1
+
+ if control_byte == 0x00:
+ if end_of_first_byte != 0:
+ print("Unexpected non-zero end_of_first_byte")
+ nbytes = int(inbuff[ipos]) + 64
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos]
+ rpos += 1
+ ipos += 1
+ elif control_byte == 0x40:
+ # not documented
+ nbytes = end_of_first_byte * 16
+ nbytes += int(inbuff[ipos])
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos]
+ rpos += 1
+ ipos += 1
+ elif control_byte == 0x60:
+ nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = 0x20
+ rpos += 1
+ elif control_byte == 0x70:
+ nbytes = end_of_first_byte*256 + int(inbuff[ipos]) + 17
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = 0x00
+ rpos += 1
+ elif control_byte == 0x80:
+ nbytes = end_of_first_byte + 1
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0x90:
+ nbytes = end_of_first_byte + 17
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0xA0:
+ nbytes = end_of_first_byte + 33
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0xB0:
+ nbytes = end_of_first_byte + 49
+ for i in range(nbytes):
+ result[rpos] = inbuff[ipos + i]
+ rpos += 1
+ ipos += nbytes
+ elif control_byte == 0xC0:
+ nbytes = end_of_first_byte + 3
+ x = inbuff[ipos]
+ ipos += 1
+ for i in range(nbytes):
+ result[rpos] = x
+ rpos += 1
+ elif control_byte == 0xD0:
+ nbytes = end_of_first_byte + 2
+ for i in range(nbytes):
+ result[rpos] = 0x40
+ rpos += 1
+ elif control_byte == 0xE0:
+ nbytes = end_of_first_byte + 2
+ for i in range(nbytes):
+ result[rpos] = 0x20
+ rpos += 1
+ elif control_byte == 0xF0:
+ nbytes = end_of_first_byte + 2
+ for i in range(nbytes):
+ result[rpos] = 0x00
+ rpos += 1
+ else:
+ raise ValueError("unknown control byte: %v", control_byte)
+
+ if len(result) != result_length:
+ print("RLE: %v != %v\n", (len(result), result_length))
+
+ return np.asarray(result).tostring()
+
+
+# rdc_decompress decompresses data using the Ross Data Compression algorithm:
+#
+# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
+def _rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff):
+
+ cdef uint8_t cmd
+ cdef uint16_t ctrl_bits
+ cdef uint16_t ctrl_mask = 0
+ cdef uint16_t ofs
+ cdef uint16_t cnt
+ cdef int ipos = 0
+ cdef int rpos = 0
+ cdef int k
+
+ cdef uint8_t [:] outbuff = np.zeros(result_length, dtype=np.uint8)
+
+ ii = -1
+
+ while ipos < len(inbuff):
+ ii += 1
+ ctrl_mask = ctrl_mask >> 1
+ if ctrl_mask == 0:
+ ctrl_bits = (inbuff[ipos] << 8) + inbuff[ipos + 1]
+ ipos += 2
+ ctrl_mask = 0x8000
+
+ if ctrl_bits & ctrl_mask == 0:
+ outbuff[rpos] = inbuff[ipos]
+ ipos += 1
+ rpos += 1
+ continue
+
+ cmd = (inbuff[ipos] >> 4) & 0x0F
+ cnt = (inbuff[ipos] & 0x0F)
+ ipos += 1
+
+ # short RLE
+ if cmd == 0:
+ cnt += 3
+ for k in range(cnt):
+ outbuff[rpos + k] = inbuff[ipos]
+ rpos += cnt
+ ipos += 1
+
+ # long RLE
+ elif cmd == 1:
+ cnt += inbuff[ipos] << 4
+ cnt += 19
+ ipos += 1
+ for k in range(cnt):
+ outbuff[rpos + k] = inbuff[ipos]
+ rpos += cnt
+ ipos += 1
+
+ # long pattern
+ elif cmd == 2:
+ ofs = cnt + 3
+ ofs += inbuff[ipos] << 4
+ ipos += 1
+ cnt = inbuff[ipos]
+ ipos += 1
+ cnt += 16
+ for k in range(cnt):
+ outbuff[rpos + k] = outbuff[rpos - int(ofs) + k]
+ rpos += cnt
+
+ # short pattern
+ elif (cmd >= 3) & (cmd <= 15):
+ ofs = cnt + 3
+ ofs += inbuff[ipos] << 4
+ ipos += 1
+ for k in range(cmd):
+ outbuff[rpos + k] = outbuff[rpos - int(ofs) + k]
+ rpos += cmd
+
+ else:
+ raise ValueError("unknown RDC command")
+
+ if len(outbuff) != result_length:
+ raise ValueError("RDC: %v != %v\n", len(outbuff), result_length)
+
+ return np.asarray(outbuff).tostring()
+
+# process_byte_array_with_data copies one row of raw data from the cached
+# page (decompressing it if needed) into the byte_chunk/string_chunk buffers.
+def process_byte_array_with_data(parser, int offset, int length,
+                                 np.ndarray[uint8_t, ndim=2] byte_chunk,
+                                 np.ndarray[dtype=object, ndim=2] string_chunk):
+
+ cdef int s
+ cdef int j
+ cdef int m
+ cdef int start
+ cdef int end
+ cdef bytes source
+ cdef bytes temp
+ cdef int jb
+ cdef int js
+
+ if (parser.compression != "") and (length < parser.row_length):
+ source = parser._decompress(parser.row_length, parser._cached_page[offset:offset + length])
+ else:
+ source = parser._cached_page[offset:offset + length]
+
+ s = 8 * parser._current_row_in_chunk_index
+ js = 0
+ jb = 0
+ for j in range(parser.column_count):
+ length = parser._column_data_lengths[j]
+ if length == 0:
+ break
+ start = parser._column_data_offsets[j]
+ end = start + length
+ temp = source[start:end]
+ if parser.column_types[j] == b'd':
+ m = 8 - length
+ if parser.byte_order == "<":
+ byte_chunk[jb, s+m:s+8] = np.frombuffer(temp, dtype=np.uint8)
+ else:
+ byte_chunk[jb, s:s+length] = np.frombuffer(temp, dtype=np.uint8)
+ jb += 1
+ elif parser.column_types[j] == b's':
+ string_chunk[js, parser._current_row_in_chunk_index] = bytes(temp)
+ js += 1
+ else:
+ raise ValueError("unknown column type: %s" % parser.columns[j].ctype)
+
+ parser._current_row_on_page_index += 1
+ parser._current_row_in_chunk_index += 1
+ parser._current_row_in_file_index += 1
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
new file mode 100644
index 0000000000000..9a60200c78893
--- /dev/null
+++ b/pandas/io/sas/sasreader.py
@@ -0,0 +1,61 @@
+"""
+Read SAS sas7bdat or xport files.
+"""
+
+
+def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
+ chunksize=None, iterator=False):
+ """
+ Read SAS files stored as either XPORT or SAS7BDAT format files.
+
+ Parameters
+ ----------
+ filepath_or_buffer : string or file-like object
+ Path to the SAS file.
+ format : string {'xport', 'sas7bdat'} or None
+ If None, file format is inferred. If 'xport' or 'sas7bdat',
+ uses the corresponding format.
+ index : identifier of index column, defaults to None
+ Identifier of column that should be used as index of the DataFrame.
+ encoding : string, default is None
+ Encoding for text data. If None, text data are stored as raw bytes.
+ chunksize : int
+        Read file ``chunksize`` rows at a time; returns a reader object
+        for incremental reading.
+ iterator : bool, defaults to False
+ If True, returns an iterator for reading the file incrementally.
+
+ Returns
+ -------
+ DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
+ or XportReader
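+
+    Examples
+    --------
+    A minimal usage sketch (``example.sas7bdat`` is a placeholder path)::
+
+        df = read_sas('example.sas7bdat')
+
+        # read incrementally; each read() call returns the next chunk
+        rdr = read_sas('example.sas7bdat', chunksize=100000)
+        chunk = rdr.read()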
+ """
+
+    if format is None:
+        try:
+            fname = filepath_or_buffer.lower()
+        except AttributeError:
+            raise ValueError("unable to infer format of SAS file")
+        if fname.endswith(".xpt"):
+            format = "xport"
+        elif fname.endswith(".sas7bdat"):
+            format = "sas7bdat"
+        else:
+            raise ValueError("unable to infer format of SAS file")
+
+ if format.lower() == 'xport':
+ from pandas.io.sas.sas_xport import XportReader
+ reader = XportReader(filepath_or_buffer, index=index,
+ encoding=encoding,
+ chunksize=chunksize)
+ elif format.lower() == 'sas7bdat':
+ from pandas.io.sas.sas7bdat import SAS7BDATReader
+ reader = SAS7BDATReader(filepath_or_buffer, index=index,
+ encoding=encoding,
+ chunksize=chunksize)
+ else:
+ raise ValueError('unknown SAS format')
+
+ if iterator or chunksize:
+ return reader
+
+ return reader.read()
diff --git a/pandas/io/tests/data/DEMO_G.csv b/pandas/io/tests/sas/data/DEMO_G.csv
similarity index 100%
rename from pandas/io/tests/data/DEMO_G.csv
rename to pandas/io/tests/sas/data/DEMO_G.csv
diff --git a/pandas/io/tests/data/DEMO_G.xpt b/pandas/io/tests/sas/data/DEMO_G.xpt
similarity index 100%
rename from pandas/io/tests/data/DEMO_G.xpt
rename to pandas/io/tests/sas/data/DEMO_G.xpt
diff --git a/pandas/io/tests/data/DRXFCD_G.csv b/pandas/io/tests/sas/data/DRXFCD_G.csv
similarity index 100%
rename from pandas/io/tests/data/DRXFCD_G.csv
rename to pandas/io/tests/sas/data/DRXFCD_G.csv
diff --git a/pandas/io/tests/data/DRXFCD_G.xpt b/pandas/io/tests/sas/data/DRXFCD_G.xpt
similarity index 100%
rename from pandas/io/tests/data/DRXFCD_G.xpt
rename to pandas/io/tests/sas/data/DRXFCD_G.xpt
diff --git a/pandas/io/tests/data/SSHSV1_A.csv b/pandas/io/tests/sas/data/SSHSV1_A.csv
similarity index 100%
rename from pandas/io/tests/data/SSHSV1_A.csv
rename to pandas/io/tests/sas/data/SSHSV1_A.csv
diff --git a/pandas/io/tests/data/SSHSV1_A.xpt b/pandas/io/tests/sas/data/SSHSV1_A.xpt
similarity index 100%
rename from pandas/io/tests/data/SSHSV1_A.xpt
rename to pandas/io/tests/sas/data/SSHSV1_A.xpt
diff --git a/pandas/io/tests/data/paxraw_d_short.csv b/pandas/io/tests/sas/data/paxraw_d_short.csv
similarity index 100%
rename from pandas/io/tests/data/paxraw_d_short.csv
rename to pandas/io/tests/sas/data/paxraw_d_short.csv
diff --git a/pandas/io/tests/data/paxraw_d_short.xpt b/pandas/io/tests/sas/data/paxraw_d_short.xpt
similarity index 100%
rename from pandas/io/tests/data/paxraw_d_short.xpt
rename to pandas/io/tests/sas/data/paxraw_d_short.xpt
diff --git a/pandas/io/tests/sas/data/test1.sas7bdat b/pandas/io/tests/sas/data/test1.sas7bdat
new file mode 100644
index 0000000000000..951173ce4d9f9
Binary files /dev/null and b/pandas/io/tests/sas/data/test1.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test10.sas7bdat b/pandas/io/tests/sas/data/test10.sas7bdat
new file mode 100644
index 0000000000000..a5fd43e6cb9ac
Binary files /dev/null and b/pandas/io/tests/sas/data/test10.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test11.sas7bdat b/pandas/io/tests/sas/data/test11.sas7bdat
new file mode 100644
index 0000000000000..072aa683f66d9
Binary files /dev/null and b/pandas/io/tests/sas/data/test11.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test12.sas7bdat b/pandas/io/tests/sas/data/test12.sas7bdat
new file mode 100644
index 0000000000000..e2a9db874948d
Binary files /dev/null and b/pandas/io/tests/sas/data/test12.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test13.sas7bdat b/pandas/io/tests/sas/data/test13.sas7bdat
new file mode 100644
index 0000000000000..b1dc6f9f8eddc
Binary files /dev/null and b/pandas/io/tests/sas/data/test13.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test14.sas7bdat b/pandas/io/tests/sas/data/test14.sas7bdat
new file mode 100644
index 0000000000000..5a958df51f0ce
Binary files /dev/null and b/pandas/io/tests/sas/data/test14.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test15.sas7bdat b/pandas/io/tests/sas/data/test15.sas7bdat
new file mode 100644
index 0000000000000..c028d8041a3d3
Binary files /dev/null and b/pandas/io/tests/sas/data/test15.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test16.sas7bdat b/pandas/io/tests/sas/data/test16.sas7bdat
new file mode 100644
index 0000000000000..867c3c51bbddd
Binary files /dev/null and b/pandas/io/tests/sas/data/test16.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test2.sas7bdat b/pandas/io/tests/sas/data/test2.sas7bdat
new file mode 100644
index 0000000000000..ba0b8e8dcbb91
Binary files /dev/null and b/pandas/io/tests/sas/data/test2.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test3.sas7bdat b/pandas/io/tests/sas/data/test3.sas7bdat
new file mode 100644
index 0000000000000..a061b1ddd0d45
Binary files /dev/null and b/pandas/io/tests/sas/data/test3.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test4.sas7bdat b/pandas/io/tests/sas/data/test4.sas7bdat
new file mode 100644
index 0000000000000..addd6edf90830
Binary files /dev/null and b/pandas/io/tests/sas/data/test4.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test5.sas7bdat b/pandas/io/tests/sas/data/test5.sas7bdat
new file mode 100644
index 0000000000000..ba741d5a635df
Binary files /dev/null and b/pandas/io/tests/sas/data/test5.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test6.sas7bdat b/pandas/io/tests/sas/data/test6.sas7bdat
new file mode 100644
index 0000000000000..2d9b4b0466047
Binary files /dev/null and b/pandas/io/tests/sas/data/test6.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test7.sas7bdat b/pandas/io/tests/sas/data/test7.sas7bdat
new file mode 100644
index 0000000000000..785b12cf175e3
Binary files /dev/null and b/pandas/io/tests/sas/data/test7.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test8.sas7bdat b/pandas/io/tests/sas/data/test8.sas7bdat
new file mode 100644
index 0000000000000..67db5a143de07
Binary files /dev/null and b/pandas/io/tests/sas/data/test8.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test9.sas7bdat b/pandas/io/tests/sas/data/test9.sas7bdat
new file mode 100644
index 0000000000000..d76a1f28033f4
Binary files /dev/null and b/pandas/io/tests/sas/data/test9.sas7bdat differ
diff --git a/pandas/io/tests/sas/data/test_sas7bdat_1.csv b/pandas/io/tests/sas/data/test_sas7bdat_1.csv
new file mode 100644
index 0000000000000..3eb23e42448d1
--- /dev/null
+++ b/pandas/io/tests/sas/data/test_sas7bdat_1.csv
@@ -0,0 +1,11 @@
+Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Column21,Column22,Column23,Column24,Column25,Column26,Column27,Column28,Column29,Column30,Column31,Column32,Column33,Column34,Column35,Column36,Column37,Column38,Column39,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48,Column49,Column50,Column51,Column52,Column53,Column54,Column55,Column56,Column57,Column58,Column59,Column60,Column61,Column62,Column63,Column64,Column65,Column66,Column67,Column68,Column69,Column70,Column71,Column72,Column73,Column74,Column75,Column76,Column77,Column78,Column79,Column80,Column81,Column82,Column83,Column84,Column85,Column86,Column87,Column88,Column89,Column90,Column91,Column92,Column93,Column94,Column95,Column96,Column97,Column98,Column99,Column100
+0.636,pear,84,2170,0.103,apple,20,,0.621,apple,,9697,0.047,dog,7,2543,0.728,crocodile,55,2615,0.146,crocodile,10,2832,0.644,crocodile,6,9671,,crocodile,28,9126,0.433,crocodile,22,8117,0.318,dog,61,3363,0.938,pear,58,3700,0.844,dog,26,3989,0.132,crocodile,88,8240,0.325,,9,6102,0.032,apple,34,2987,0.651,crocodile,81,8778,,apple,91,9908,0.897,dog,26,3682,0.274,crocodile,75,1629,0.918,apple,9,7935,0.761,crocodile,,2398,0.914,apple,75,9204,0.946,pear,87,5587,0.940,apple,50,1611,0.480,apple,45,3230
+0.283,dog,49,6275,0.398,pear,50,339,0.561,apple,22,8596,0.661,pear,38,4928,0.709,crocodile,73,6011,0.239,crocodile,93,,0.093,crocodile,23,6198,0.757,dog,61,,0.593,pear,12,9571,,dog,6,892,0.883,pear,81,3363,0.166,pear,,1814,0.454,dog,52,1161,0.630,dog,43,159,0.398,apple,17,8194,0.905,dog,29,4752,0.382,pear,44,9302,0.247,apple,45,4645,0.321,apple,56,,0.415,,,5912,0.580,,60,1918,0.821,dog,68,6847,0.542,apple,83,4465,0.169,pear,53,5820,0.942,dog,76,4904
+0.452,pear,35,8627,0.117,pear,70,5704,0.209,apple,7,5079,0.961,pear,73,,0.414,dog,,9681,,pear,80,4183,0.131,crocodile,53,972,,apple,86,772,0.460,,89,5950,0.293,apple,25,288,0.314,dog,38,106,0.108,pear,49,979,0.481,dog,25,7904,0.270,dog,4,4891,,dog,32,9820,0.517,,64,9053,0.487,dog,78,7238,0.488,apple,54,4349,0.356,pear,73,9991,0.113,,63,5731,0.294,dog,58,3060,,pear,2,,0.004,dog,45,9872,0.024,dog,69,,0.336,pear,9,
+0.557,dog,29,5292,0.640,pear,34,2766,0.910,dog,26,2576,0.924,pear,85,3495,0.478,crocodile,43,1586,0.576,apple,79,4403,0.329,crocodile,,3572,0.702,dog,46,1913,0.147,,10,4292,0.368,crocodile,27,7037,0.137,pear,19,4545,0.999,apple,81,,0.095,pear,36,9195,0.494,pear,61,3393,,crocodile,27,8252,,dog,87,7968,0.845,apple,31,3801,0.514,dog,,6542,0.483,dog,58,4688,,pear,71,402,0.275,apple,51,3673,0.642,,82,4443,0.538,apple,10,114,0.155,,17,8170,0.493,pear,89,8566
+0.138,,55,1903,0.583,crocodile,34,4170,0.226,crocodile,11,3985,0.271,pear,,4624,,,43,2539,,dog,50,,0.469,crocodile,72,5999,0.476,apple,71,1184,0.333,crocodile,23,5075,0.120,pear,64,5242,0.561,apple,7,8206,0.103,,45,,0.972,,1,432,0.423,dog,4,1860,0.686,apple,43,9640,0.622,pear,15,91,0.795,apple,33,2149,0.248,dog,,,0.942,dog,85,987,0.492,pear,2,754,0.955,apple,48,1268,0.498,apple,83,6654,0.305,crocodile,81,423,0.013,,32,3217,0.046,,57,894
+0.948,dog,33,8962,0.691,pear,,2276,0.330,crocodile,27,1104,0.668,,73,2883,0.007,,18,3726,0.301,,85,3621,,apple,39,9646,0.305,pear,48,,0.103,dog,71,8216,0.813,dog,41,6387,0.114,apple,24,4428,0.122,dog,41,,0.558,pear,49,6455,0.119,pear,56,9390,0.482,apple,75,9168,0.437,dog,74,4101,0.557,dog,74,1631,0.159,dog,26,2218,0.274,crocodile,,7653,,dog,,9637,0.345,apple,82,440,0.604,apple,89,6848,0.177,pear,31,2597,0.847,dog,77,,0.700,crocodile,60,6088
+0.162,crocodile,17,8189,0.002,pear,30,5161,0.851,,,7230,0.886,dog,43,5277,0.086,dog,10,6891,0.510,dog,63,2275,0.641,dog,74,9202,0.452,pear,4,6580,0.008,,38,,0.491,apple,11,1469,0.672,dog,89,6182,0.477,apple,4,1877,0.241,dog,61,5083,,apple,78,2526,,,,7547,0.969,dog,22,1578,,dog,86,1211,0.221,apple,1,3252,0.146,,85,9278,0.676,dog,30,1218,0.012,apple,93,3159,0.681,crocodile,45,,0.825,crocodile,0,3749,0.831,,74,7021,0.534,pear,20,6122
+0.148,crocodile,37,1740,0.411,apple,23,8833,0.620,crocodile,5,439,0.372,apple,31,,0.881,dog,84,736,0.347,crocodile,46,224,0.080,crocodile,80,891,0.452,pear,82,3304,0.418,pear,85,2984,0.362,dog,65,,0.167,,65,5265,,apple,89,2101,0.864,apple,92,3636,0.382,crocodile,89,269,0.958,dog,23,1419,0.410,pear,86,8488,0.739,pear,,6576,0.613,dog,37,859,0.426,dog,15,8618,0.554,crocodile,51,4864,0.046,,,4421,0.995,dog,25,622,0.191,pear,84,536,0.405,apple,54,7081,0.575,crocodile,15,2570
+,pear,15,3679,0.102,pear,1,3227,0.157,pear,12,1627,0.955,pear,21,64,0.967,dog,14,2876,0.533,,74,4269,0.881,apple,58,5565,,dog,,6286,0.373,crocodile,46,6525,0.477,crocodile,18,1836,0.067,apple,66,3851,0.224,pear,,7233,0.443,,75,5577,0.392,crocodile,92,4353,0.549,pear,96,4884,0.581,crocodile,10,4058,,crocodile,22,8038,0.411,,94,9236,,pear,40,,0.770,crocodile,36,4591,0.854,dog,32,2902,0.724,apple,57,5499,0.566,pear,19,7577,0.853,,,,,crocodile,84,2709
+0.663,pear,,7735,0.086,apple,80,,,pear,16,9159,0.053,dog,52,3478,0.691,pear,49,9979,0.428,dog,46,5776,0.744,crocodile,3,9549,0.249,dog,90,1192,0.009,dog,68,8678,0.046,apple,62,1873,0.684,,31,7227,0.300,crocodile,83,9881,,,82,8272,0.621,crocodile,40,8171,0.002,crocodile,38,,0.580,pear,31,3094,,,5,5711,0.668,,30,217,0.672,dog,34,3184,0.105,pear,,1521,0.239,dog,28,6896,0.011,apple,,4509,0.691,,97,9247,0.167,pear,74,9824,0.945,crocodile,89,
diff --git a/pandas/io/tests/sas/data/test_sas7bdat_2.csv b/pandas/io/tests/sas/data/test_sas7bdat_2.csv
new file mode 100644
index 0000000000000..adc0587ae2797
--- /dev/null
+++ b/pandas/io/tests/sas/data/test_sas7bdat_2.csv
@@ -0,0 +1,11 @@
+Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14,Column15,Column16,Column17,Column18,Column19,Column20,Column21,Column22,Column23,Column24,Column25,Column26,Column27,Column28,Column29,Column30,Column31,Column32,Column33,Column34,Column35,Column36,Column37,Column38,Column39,Column40,Column41,Column42,Column43,Column44,Column45,Column46,Column47,Column48,Column49,Column50,Column51,Column52,Column53,Column54,Column55,Column56,Column57,Column58,Column59,Column60,Column61,Column62,Column63,Column64,Column65,Column66,Column67,Column68,Column69,Column70,Column71,Column72,Column73,Column74,Column75,Column76,Column77,Column78,Column79,Column80,Column81,Column82,Column83,Column84,Column85,Column86,Column87,Column88,Column89,Column90,Column91,Column92,Column93,Column94,Column95,Column96,Column97,Column98,Column99,Column100
+0.636,高雄市,84,2170,0.103,부산,20,,0.621,부산,,9697,0.047,Иркутск,7,2543,0.728,鱷魚,55,2615,0.146,鱷魚,10,2832,0.644,鱷魚,6,9671,,鱷魚,28,9126,0.433,鱷魚,22,8117,0.318,Иркутск,61,3363,0.938,高雄市,58,3700,0.844,Иркутск,26,3989,0.132,鱷魚,88,8240,0.325,,9,6102,0.032,부산,34,2987,0.651,鱷魚,81,8778,,부산,91,9908,0.897,Иркутск,26,3682,0.274,鱷魚,75,1629,0.918,부산,9,7935,0.761,鱷魚,,2398,0.914,부산,75,9204,0.946,高雄市,87,5587,0.940,부산,50,1611,0.480,부산,45,3230
+0.283,Иркутск,49,6275,0.398,高雄市,50,339,0.561,부산,22,8596,0.661,高雄市,38,4928,0.709,鱷魚,73,6011,0.239,鱷魚,93,,0.093,鱷魚,23,6198,0.757,Иркутск,61,,0.593,高雄市,12,9571,,Иркутск,6,892,0.883,高雄市,81,3363,0.166,高雄市,,1814,0.454,Иркутск,52,1161,0.630,Иркутск,43,159,0.398,부산,17,8194,0.905,Иркутск,29,4752,0.382,高雄市,44,9302,0.247,부산,45,4645,0.321,부산,56,,0.415,,,5912,0.580,,60,1918,0.821,Иркутск,68,6847,0.542,부산,83,4465,0.169,高雄市,53,5820,0.942,Иркутск,76,4904
+0.452,高雄市,35,8627,0.117,高雄市,70,5704,0.209,부산,7,5079,0.961,高雄市,73,,0.414,Иркутск,,9681,,高雄市,80,4183,0.131,鱷魚,53,972,,부산,86,772,0.460,,89,5950,0.293,부산,25,288,0.314,Иркутск,38,106,0.108,高雄市,49,979,0.481,Иркутск,25,7904,0.270,Иркутск,4,4891,,Иркутск,32,9820,0.517,,64,9053,0.487,Иркутск,78,7238,0.488,부산,54,4349,0.356,高雄市,73,9991,0.113,,63,5731,0.294,Иркутск,58,3060,,高雄市,2,,0.004,Иркутск,45,9872,0.024,Иркутск,69,,0.336,高雄市,9,
+0.557,Иркутск,29,5292,0.640,高雄市,34,2766,0.910,Иркутск,26,2576,0.924,高雄市,85,3495,0.478,鱷魚,43,1586,0.576,부산,79,4403,0.329,鱷魚,,3572,0.702,Иркутск,46,1913,0.147,,10,4292,0.368,鱷魚,27,7037,0.137,高雄市,19,4545,0.999,부산,81,,0.095,高雄市,36,9195,0.494,高雄市,61,3393,,鱷魚,27,8252,,Иркутск,87,7968,0.845,부산,31,3801,0.514,Иркутск,,6542,0.483,Иркутск,58,4688,,高雄市,71,402,0.275,부산,51,3673,0.642,,82,4443,0.538,부산,10,114,0.155,,17,8170,0.493,高雄市,89,8566
+0.138,,55,1903,0.583,鱷魚,34,4170,0.226,鱷魚,11,3985,0.271,高雄市,,4624,,,43,2539,,Иркутск,50,,0.469,鱷魚,72,5999,0.476,부산,71,1184,0.333,鱷魚,23,5075,0.120,高雄市,64,5242,0.561,부산,7,8206,0.103,,45,,0.972,,1,432,0.423,Иркутск,4,1860,0.686,부산,43,9640,0.622,高雄市,15,91,0.795,부산,33,2149,0.248,Иркутск,,,0.942,Иркутск,85,987,0.492,高雄市,2,754,0.955,부산,48,1268,0.498,부산,83,6654,0.305,鱷魚,81,423,0.013,,32,3217,0.046,,57,894
+0.948,Иркутск,33,8962,0.691,高雄市,,2276,0.330,鱷魚,27,1104,0.668,,73,2883,0.007,,18,3726,0.301,,85,3621,,부산,39,9646,0.305,高雄市,48,,0.103,Иркутск,71,8216,0.813,Иркутск,41,6387,0.114,부산,24,4428,0.122,Иркутск,41,,0.558,高雄市,49,6455,0.119,高雄市,56,9390,0.482,부산,75,9168,0.437,Иркутск,74,4101,0.557,Иркутск,74,1631,0.159,Иркутск,26,2218,0.274,鱷魚,,7653,,Иркутск,,9637,0.345,부산,82,440,0.604,부산,89,6848,0.177,高雄市,31,2597,0.847,Иркутск,77,,0.700,鱷魚,60,6088
+0.162,鱷魚,17,8189,0.002,高雄市,30,5161,0.851,,,7230,0.886,Иркутск,43,5277,0.086,Иркутск,10,6891,0.510,Иркутск,63,2275,0.641,Иркутск,74,9202,0.452,高雄市,4,6580,0.008,,38,,0.491,부산,11,1469,0.672,Иркутск,89,6182,0.477,부산,4,1877,0.241,Иркутск,61,5083,,부산,78,2526,,,,7547,0.969,Иркутск,22,1578,,Иркутск,86,1211,0.221,부산,1,3252,0.146,,85,9278,0.676,Иркутск,30,1218,0.012,부산,93,3159,0.681,鱷魚,45,,0.825,鱷魚,0,3749,0.831,,74,7021,0.534,高雄市,20,6122
+0.148,鱷魚,37,1740,0.411,부산,23,8833,0.620,鱷魚,5,439,0.372,부산,31,,0.881,Иркутск,84,736,0.347,鱷魚,46,224,0.080,鱷魚,80,891,0.452,高雄市,82,3304,0.418,高雄市,85,2984,0.362,Иркутск,65,,0.167,,65,5265,,부산,89,2101,0.864,부산,92,3636,0.382,鱷魚,89,269,0.958,Иркутск,23,1419,0.410,高雄市,86,8488,0.739,高雄市,,6576,0.613,Иркутск,37,859,0.426,Иркутск,15,8618,0.554,鱷魚,51,4864,0.046,,,4421,0.995,Иркутск,25,622,0.191,高雄市,84,536,0.405,부산,54,7081,0.575,鱷魚,15,2570
+,高雄市,15,3679,0.102,高雄市,1,3227,0.157,高雄市,12,1627,0.955,高雄市,21,64,0.967,Иркутск,14,2876,0.533,,74,4269,0.881,부산,58,5565,,Иркутск,,6286,0.373,鱷魚,46,6525,0.477,鱷魚,18,1836,0.067,부산,66,3851,0.224,高雄市,,7233,0.443,,75,5577,0.392,鱷魚,92,4353,0.549,高雄市,96,4884,0.581,鱷魚,10,4058,,鱷魚,22,8038,0.411,,94,9236,,高雄市,40,,0.770,鱷魚,36,4591,0.854,Иркутск,32,2902,0.724,부산,57,5499,0.566,高雄市,19,7577,0.853,,,,,鱷魚,84,2709
+0.663,高雄市,,7735,0.086,부산,80,,,高雄市,16,9159,0.053,Иркутск,52,3478,0.691,高雄市,49,9979,0.428,Иркутск,46,5776,0.744,鱷魚,3,9549,0.249,Иркутск,90,1192,0.009,Иркутск,68,8678,0.046,부산,62,1873,0.684,,31,7227,0.300,鱷魚,83,9881,,,82,8272,0.621,鱷魚,40,8171,0.002,鱷魚,38,,0.580,高雄市,31,3094,,,5,5711,0.668,,30,217,0.672,Иркутск,34,3184,0.105,高雄市,,1521,0.239,Иркутск,28,6896,0.011,부산,,4509,0.691,,97,9247,0.167,高雄市,74,9824,0.945,鱷魚,89,
diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py
new file mode 100644
index 0000000000000..a9e6ea68f3979
--- /dev/null
+++ b/pandas/io/tests/sas/test_sas7bdat.py
@@ -0,0 +1,64 @@
+import pandas as pd
+from pandas.compat import PY2
+import pandas.util.testing as tm
+import os
+import io
+import numpy as np
+
+
+class TestSAS7BDAT(tm.TestCase):
+
+ def setUp(self):
+ self.dirpath = tm.get_data_path()
+ self.data = []
+ self.test_ix = [list(range(1, 16)), [16]]
+ for j in 1, 2:
+ fname = os.path.join(self.dirpath, "test_sas7bdat_%d.csv" % j)
+ df = pd.read_csv(fname)
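+ # Columns 4 and 12 hold SAS dates stored as day counts; convert them
+ # to datetimes relative to the SAS epoch (1960-01-01).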
+ epoch = pd.datetime(1960, 1, 1)
+ t1 = pd.to_timedelta(df["Column4"], unit='d')
+ df["Column4"] = epoch + t1
+ t2 = pd.to_timedelta(df["Column12"], unit='d')
+ df["Column12"] = epoch + t2
+ for k in range(df.shape[1]):
+ col = df.iloc[:, k]
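+ # read_sas returns all numeric columns as floats, so cast the
+ # integer columns from the CSV to float64 for comparison.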
+ if col.dtype == np.int64:
+ df.iloc[:, k] = df.iloc[:, k].astype(np.float64)
+ elif col.dtype == np.dtype('O'):
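+ # Under Python 2 read_csv yields byte strings; decode them to
+ # unicode so they match the utf-8-decoded SAS output.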
+ if PY2:
+ f = lambda x: (x.decode('utf-8') if
+ isinstance(x, str) else x)
+ df.iloc[:, k] = df.iloc[:, k].apply(f)
+ self.data.append(df)
+
+ def test_from_file(self):
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
+ df = pd.read_sas(fname, encoding='utf-8')
+ tm.assert_frame_equal(df, df0)
+
+ def test_from_buffer(self):
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
+ byts = open(fname, 'rb').read()
+ buf = io.BytesIO(byts)
+ df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
+ tm.assert_frame_equal(df, df0)
+
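+ # Reading in chunks through the iterator interface should return
+ # consecutive slices of the full frame.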
+ def test_from_iterator(self):
+ for j in 0, 1:
+ df0 = self.data[j]
+ for k in self.test_ix[j]:
+ fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
+ byts = open(fname, 'rb').read()
+ buf = io.BytesIO(byts)
+ rdr = pd.read_sas(buf, format="sas7bdat",
+ iterator=True, encoding='utf-8')
+ df = rdr.read(2)
+ tm.assert_frame_equal(df, df0.iloc[0:2, :])
+ df = rdr.read(3)
+ tm.assert_frame_equal(df, df0.iloc[2:5, :])
diff --git a/pandas/io/tests/test_sas.py b/pandas/io/tests/sas/test_xport.py
similarity index 73%
rename from pandas/io/tests/test_sas.py
rename to pandas/io/tests/sas/test_xport.py
index 9b31d9443de3b..ae378c41cd24b 100644
--- a/pandas/io/tests/test_sas.py
+++ b/pandas/io/tests/sas/test_xport.py
@@ -1,6 +1,6 @@
import pandas as pd
import pandas.util.testing as tm
-from pandas.io.sas import XportReader, read_sas
+from pandas.io.sas.sasreader import read_sas
import numpy as np
import os
@@ -33,16 +33,16 @@ def test1_basic(self):
numeric_as_float(data_csv)
# Read full file
- data = XportReader(self.file01).read()
+ data = read_sas(self.file01, format="xport")
tm.assert_frame_equal(data, data_csv)
# Test incremental read with `read` method.
- reader = XportReader(self.file01)
+ reader = read_sas(self.file01, format="xport", iterator=True)
data = reader.read(10)
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
# Test incremental read with `get_chunk` method.
- reader = XportReader(self.file01, chunksize=10)
+ reader = read_sas(self.file01, format="xport", chunksize=10)
data = reader.get_chunk()
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
@@ -59,20 +59,22 @@ def test1_index(self):
numeric_as_float(data_csv)
# Read full file
- data = XportReader(self.file01, index="SEQN").read()
+ data = read_sas(self.file01, index="SEQN", format="xport")
tm.assert_frame_equal(data, data_csv, check_index_type=False)
# Test incremental read with `read` method.
- reader = XportReader(self.file01, index="SEQN")
+ reader = read_sas(self.file01, index="SEQN", format="xport",
+ iterator=True)
data = reader.read(10)
- tm.assert_frame_equal(data, data_csv.iloc[
- 0:10, :], check_index_type=False)
+ tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
+ check_index_type=False)
# Test incremental read with `get_chunk` method.
- reader = XportReader(self.file01, index="SEQN", chunksize=10)
+ reader = read_sas(self.file01, index="SEQN", format="xport",
+ chunksize=10)
data = reader.get_chunk()
- tm.assert_frame_equal(data, data_csv.iloc[
- 0:10, :], check_index_type=False)
+ tm.assert_frame_equal(data, data_csv.iloc[0:10, :],
+ check_index_type=False)
def test1_incremental(self):
# Test with DEMO_G.xpt, reading full file incrementally
@@ -81,18 +83,13 @@ def test1_incremental(self):
data_csv = data_csv.set_index("SEQN")
numeric_as_float(data_csv)
- reader = XportReader(self.file01, index="SEQN", chunksize=1000)
+ reader = read_sas(self.file01, index="SEQN", chunksize=1000)
all_data = [x for x in reader]
data = pd.concat(all_data, axis=0)
tm.assert_frame_equal(data, data_csv, check_index_type=False)
- reader = XportReader(self.file01, index="SEQN", chunksize=1000)
- data = pd.concat(reader, axis=0)
-
- tm.assert_frame_equal(data, data_csv, check_index_type=False)
-
def test2(self):
# Test with SSHSV1_A.xpt
@@ -100,7 +97,7 @@ def test2(self):
data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
numeric_as_float(data_csv)
- data = XportReader(self.file02).read()
+ data = read_sas(self.file02)
tm.assert_frame_equal(data, data_csv)
def test_multiple_types(self):
@@ -109,10 +106,7 @@ def test_multiple_types(self):
# Compare to this
data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv"))
- data = XportReader(self.file03).read()
- tm.assert_frame_equal(data, data_csv)
-
- data = read_sas(self.file03)
+ data = read_sas(self.file03, encoding="utf-8")
tm.assert_frame_equal(data, data_csv)
def test_truncated_float_support(self):
@@ -124,8 +118,5 @@ def test_truncated_float_support(self):
data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))
- data = XportReader(self.file04).read()
- tm.assert_frame_equal(data.astype('int64'), data_csv)
-
- data = read_sas(self.file04)
+ data = read_sas(self.file04, format="xport")
tm.assert_frame_equal(data.astype('int64'), data_csv)
diff --git a/setup.py b/setup.py
index b5609d213c773..f33b01b24c165 100755
--- a/setup.py
+++ b/setup.py
@@ -262,7 +262,8 @@ class CheckSDist(sdist_class):
'pandas/parser.pyx',
'pandas/src/period.pyx',
'pandas/src/sparse.pyx',
- 'pandas/src/testing.pyx']
+ 'pandas/src/testing.pyx',
+ 'pandas/io/sas/saslib.pyx']
def initialize_options(self):
sdist_class.initialize_options(self)
@@ -418,9 +419,11 @@ def pxd(name):
'pandas/src/parser/io.h',
'pandas/src/numpy_helper.h'],
'sources': ['pandas/src/parser/tokenizer.c',
- 'pandas/src/parser/io.c']}
+ 'pandas/src/parser/io.c']},
)
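+# Register the Cython-based SAS reader (saslib.pyx) as a build extension.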
+ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'}
+
extensions = []
for name, data in ext_data.items():
@@ -527,6 +530,7 @@ def pxd(name):
'pandas.core',
'pandas.indexes',
'pandas.io',
+ 'pandas.io.sas',
'pandas.rpy',
'pandas.sandbox',
'pandas.sparse',