Merge pull request #21 from alimanfoo/doc-msgpack-pickle

alimanfoo · web-flow · commit 73feb8f3a189 · 2017-02-28T10:59:27.000Z
Review msgpack pickle. Resolves #6, resolves #8.
diff --git a/docs/conf.py b/docs/conf.py
@@ -28,13 +28,13 @@ def __getattr__(cls, name):
         return Mock()
 
 
-MOCK_MODULES = []
+MOCK_MODULES = ['msgpack']
 if PY2:
     MOCK_MODULES.append('lzma')
 
 
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
-                        
+
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
diff --git a/docs/index.rst b/docs/index.rst
@@ -59,6 +59,8 @@ Contents
     packbits
     categorize
     checksum32
+    pickles
+    msgpacks
     release
 
 Acknowledgments
diff --git a/docs/msgpacks.rst b/docs/msgpacks.rst
@@ -0,0 +1,7 @@
+MsgPack
+=======
+.. automodule:: numcodecs.msgpacks
+
+.. autoclass:: MsgPack
+
+    .. autoattribute:: codec_id
diff --git a/docs/pickles.rst b/docs/pickles.rst
@@ -0,0 +1,7 @@
+Pickle
+======
+.. automodule:: numcodecs.pickles
+
+.. autoclass:: Pickle
+
+    .. autoattribute:: codec_id
diff --git a/numcodecs/msgpacks.py b/numcodecs/msgpacks.py
@@ -6,17 +6,12 @@
 
 
 from numcodecs.abc import Codec
-from numcodecs.compat import ndarray_from_buffer, buffer_copy
 import msgpack
 
 
 class MsgPack(Codec):
-    """Codec to encode data as msgpacked bytes. Useful for encoding python
-    strings
-
-    Raises
-    ------
-    encoding a non-object dtyped ndarray will raise ValueError
+    """Codec to encode data as msgpacked bytes. Useful for encoding an array of Python string
+    objects.
 
     Examples
     --------
@@ -27,26 +22,39 @@ class MsgPack(Codec):
     >>> f.decode(f.encode(x))
     array(['foo', 'bar', 'baz'], dtype=object)
 
+    See Also
+    --------
+    :class:`numcodecs.pickles.Pickle`
+
+    Notes
+    -----
+    Requires `msgpack-python <https://pypi.python.org/pypi/msgpack-python>`_ to be installed.
+
     """  # flake8: noqa
 
     codec_id = 'msgpack'
 
+    def __init__(self, encoding='utf-8'):
+        self.encoding = encoding
+
     def encode(self, buf):
-        if hasattr(buf, 'dtype') and buf.dtype != 'object':
-            raise ValueError("cannot encode non-object ndarrays, %s "
-                             "dtype was passed" % buf.dtype)
-        return msgpack.packb(buf.tolist(), encoding='utf-8')
+        buf = np.asarray(buf)
+        l = buf.tolist()
+        l.append(buf.dtype.str)
+        return msgpack.packb(l, encoding=self.encoding)
 
     def decode(self, buf, out=None):
-        dec = np.array(msgpack.unpackb(buf, encoding='utf-8'), dtype='object')
+        l = msgpack.unpackb(buf, encoding=self.encoding)
+        dec = np.array(l[:-1], dtype=l[-1])
         if out is not None:
             np.copyto(out, dec)
             return out
         else:
             return dec
 
     def get_config(self):
-        return dict(id=self.codec_id)
+        return dict(id=self.codec_id,
+                    encoding=self.encoding)
 
     def __repr__(self):
-        return 'MsgPack()'
+        return 'MsgPack(encoding=%r)' % self.encoding
diff --git a/numcodecs/pickles.py b/numcodecs/pickles.py
@@ -14,17 +14,13 @@
 
 
 class Pickle(Codec):
-    """Codec to encode data as as pickled bytes. Useful for encoding python
-    strings.
+    """Codec to encode data as pickled bytes. Useful for encoding an array of Python string
+    objects.
 
     Parameters
     ----------
     protocol : int, defaults to pickle.HIGHEST_PROTOCOL
-        the protocol used to pickle data
-
-    Raises
-    ------
-    encoding a non-object dtyped ndarray will raise ValueError
+        The protocol used to pickle data.
 
     Examples
     --------
@@ -35,6 +31,10 @@ class Pickle(Codec):
     >>> f.decode(f.encode(x))
     array(['foo', 'bar', 'baz'], dtype=object)
 
+    See Also
+    --------
+    :class:`numcodecs.msgpacks.MsgPack`
+
     """  # flake8: noqa
 
     codec_id = 'pickle'
@@ -43,9 +43,6 @@ def __init__(self, protocol=pickle.HIGHEST_PROTOCOL):
         self.protocol = protocol
 
     def encode(self, buf):
-        if hasattr(buf, 'dtype') and buf.dtype != 'object':
-            raise ValueError("cannot encode non-object ndarrays, %s "
-                             "dtype was passed" % buf.dtype)
         return pickle.dumps(buf, protocol=self.protocol)
 
     def decode(self, buf, out=None):
diff --git a/numcodecs/tests/common.py b/numcodecs/tests/common.py
@@ -91,16 +91,13 @@ def compare(res):
     compare(out)
 
 
-def check_encode_decode_objects(arr, codec):
-
-    # this is a more specific test that check_encode_decode
-    # as these require actual objects (and not bytes only)
+def check_encode_decode_array(arr, codec):
 
     def compare(res, arr=arr):
 
         assert_true(isinstance(res, np.ndarray))
         assert_true(res.shape == arr.shape)
-        assert_true(res.dtype == 'object')
+        assert_true(res.dtype == arr.dtype)
 
         # numpy asserts don't compare object arrays
         # properly; assert that we have the same nans
diff --git a/numcodecs/tests/test_lzma.py b/numcodecs/tests/test_lzma.py
@@ -1,54 +1,49 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, print_function, division
+import itertools
 
 
-_lzma = None
+import nose
+import numpy as np
+
 try:
-    import lzma as _lzma
+    from numcodecs.lzma import LZMA, _lzma
 except ImportError:  # pragma: no cover
-    try:
-        from backports import lzma as _lzma
-    except ImportError:
-        pass
-
-
-if _lzma:
-
-    import itertools
-    import numpy as np
-    from numcodecs.lzma import LZMA
-    from numcodecs.tests.common import check_encode_decode, check_config, \
-        check_repr
-
-    codecs = [
-        LZMA(),
-        LZMA(preset=1),
-        LZMA(preset=5),
-        LZMA(preset=9),
-        LZMA(format=_lzma.FORMAT_RAW,
-             filters=[dict(id=_lzma.FILTER_LZMA2, preset=1)])
-    ]
-
-    # mix of dtypes: integer, float, bool, string
-    # mix of shapes: 1D, 2D, 3D
-    # mix of orders: C, F
-    arrays = [
-        np.arange(1000, dtype='i4'),
-        np.linspace(1000, 1001, 1000, dtype='f8'),
-        np.random.normal(loc=1000, scale=1, size=(100, 10)),
-        np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10,
-                                                               order='F'),
-        np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10)
-    ]
-
-    def test_encode_decode():
-        for arr, codec in itertools.product(arrays, codecs):
-            check_encode_decode(arr, codec)
-
-    def test_config():
-        codec = LZMA(preset=1, format=_lzma.FORMAT_XZ,
-                     check=_lzma.CHECK_NONE, filters=None)
-        check_config(codec)
-
-    def test_repr():
-        check_repr('LZMA(format=1, check=0, preset=1, filters=None)')
+    raise nose.SkipTest("LZMA not available")
+
+from numcodecs.tests.common import check_encode_decode, check_config, check_repr
+
+
+codecs = [
+    LZMA(),
+    LZMA(preset=1),
+    LZMA(preset=5),
+    LZMA(preset=9),
+    LZMA(format=_lzma.FORMAT_RAW, filters=[dict(id=_lzma.FILTER_LZMA2, preset=1)])
+]
+
+
+# mix of dtypes: integer, float, bool, string
+# mix of shapes: 1D, 2D, 3D
+# mix of orders: C, F
+arrays = [
+    np.arange(1000, dtype='i4'),
+    np.linspace(1000, 1001, 1000, dtype='f8'),
+    np.random.normal(loc=1000, scale=1, size=(100, 10)),
+    np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F'),
+    np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10)
+]
+
+
+def test_encode_decode():
+    for arr, codec in itertools.product(arrays, codecs):
+        check_encode_decode(arr, codec)
+
+
+def test_config():
+    codec = LZMA(preset=1, format=_lzma.FORMAT_XZ, check=_lzma.CHECK_NONE, filters=None)
+    check_config(codec)
+
+
+def test_repr():
+    check_repr('LZMA(format=1, check=0, preset=1, filters=None)')
diff --git a/numcodecs/tests/test_msgpacks.py b/numcodecs/tests/test_msgpacks.py
@@ -3,10 +3,14 @@
 
 
 import numpy as np
-from numpy.testing import assert_raises
-from numcodecs.msgpacks import MsgPack
-from numcodecs.tests.common import (check_config, check_repr,
-                                    check_encode_decode_objects)
+import nose
+
+try:
+    from numcodecs.msgpacks import MsgPack
+except ImportError:  # pragma: no cover
+    raise nose.SkipTest("msgpack-python not available")
+
+from numcodecs.tests.common import check_config, check_repr, check_encode_decode_array
 
 
 # object array with strings
@@ -16,25 +20,15 @@
     np.array(['foo', 'bar', 'baz'] * 300, dtype=object),
     np.array([['foo', 'bar', np.nan]] * 300, dtype=object),
     np.array(['foo', 1.0, 2] * 300, dtype=object),
-]
-
-# non-object ndarrays
-arrays_incompat = [
     np.arange(1000, dtype='i4'),
     np.array(['foo', 'bar', 'baz'] * 300),
 ]
 
 
-def test_encode_errors():
-    for arr in arrays_incompat:
-        codec = MsgPack()
-        assert_raises(ValueError, codec.encode, arr)
-
-
 def test_encode_decode():
     for arr in arrays:
         codec = MsgPack()
-        check_encode_decode_objects(arr, codec)
+        check_encode_decode_array(arr, codec)
 
 
 def test_config():
@@ -43,4 +37,5 @@ def test_config():
 
 
 def test_repr():
-    check_repr("MsgPack()")
+    check_repr("MsgPack(encoding='utf-8')")
+    check_repr("MsgPack(encoding='ascii')")
diff --git a/numcodecs/tests/test_pickles.py b/numcodecs/tests/test_pickles.py
@@ -3,12 +3,10 @@
 
 
 import numpy as np
-from numpy.testing import assert_raises
 
 
 from numcodecs.pickles import Pickle
-from numcodecs.tests.common import (check_config, check_repr,
-                                    check_encode_decode_objects)
+from numcodecs.tests.common import check_config, check_repr, check_encode_decode_array
 
 
 # object array with strings
@@ -18,25 +16,15 @@
     np.array(['foo', 'bar', 'baz'] * 300, dtype=object),
     np.array([['foo', 'bar', np.nan]] * 300, dtype=object),
     np.array(['foo', 1.0, 2] * 300, dtype=object),
-]
-
-# non-object ndarrays
-arrays_incompat = [
     np.arange(1000, dtype='i4'),
     np.array(['foo', 'bar', 'baz'] * 300),
 ]
 
 
-def test_encode_errors():
-    for arr in arrays_incompat:
-        codec = Pickle()
-        assert_raises(ValueError, codec.encode, arr)
-
-
 def test_encode_decode():
+    codec = Pickle()
     for arr in arrays:
-        codec = Pickle()
-        check_encode_decode_objects(arr, codec)
+        check_encode_decode_array(arr, codec)
 
 
 def test_config():
diff --git a/setup.py b/setup.py
@@ -172,6 +172,9 @@ def run_setup(with_extensions):
         install_requires=[
             'numpy>=1.7',
         ],
+        extras_require={
+            'msgpack':  ["msgpack-python"],
+        },
         ext_modules=ext_modules,
         cmdclass=cmdclass,
         package_dir={'': '.'},
diff --git a/tox.ini b/tox.ini
@@ -10,14 +10,14 @@ envlist = py27, py34, py35, py36, docs
 setenv =
     PYTHONHASHSEED = 42
 commands =
-    py27: pip install -U backports.lzma
     python setup.py build_ext --inplace
     py27,py34,py35: nosetests -v numcodecs --with-coverage --cover-erase --cover-package=numcodecs
     py36: nosetests -v --with-coverage --cover-erase --cover-package=numcodecs --with-doctest --doctest-options=+NORMALIZE_WHITESPACE numcodecs
     py36: flake8 --max-line-length=100 numcodecs
     python setup.py bdist_wheel
     coverage report -m
 deps =
+    py27: backports.lzma
     -rrequirements_dev.txt
 
 [testenv:docs]