Document surrogateescape support and enable it for bytes decoding (issue #116)

edschofield · edschofield · commit ade1bd3cc902 · 2015-07-25T11:03:31.000+10:00
diff --git a/README.rst b/README.rst
@@ -53,6 +53,9 @@ Features
     ``past.utils`` selected from Py2/3 compatibility interfaces from projects
     like ``six``, ``IPython``, ``Jinja2``, ``Django``, and ``Pandas``.
 
+-   partial support for the ``surrogateescape`` error handler when encoding and
+    decoding the backported ``str`` and ``bytes`` objects. (This is currently
+    in alpha.)
 
 .. _code-examples:
 
@@ -152,7 +155,7 @@ interface works like this:
     # Then, for example:
     from itertools import filterfalse, zip_longest
     from urllib.request import urlopen
-    from collections import Counter, OrderedDict   # backported to Py2.6
+    from collections import Counter, OrderedDict, ChainMap   # backported to Py2.6
     from collections import UserDict, UserList, UserString
     from subprocess import getoutput, getstatusoutput
 
diff --git a/docs/bytes_object.rst b/docs/bytes_object.rst
@@ -66,26 +66,19 @@ code incompatibilities caused by the many differences between Py3 bytes
 and Py2 strings.
 
 
-..
-    .. _bytes-test-results:
-    
-    bytes test results
-    ~~~~~~~~~~~~~~~~~~
-    
-    For reference, when using Py2's default :class:`bytes` (i.e.
-    :class:`str`), running the ``bytes`` unit tests from Python 3.3's
-    ``test_bytes.py`` on Py2 (after fixing imports) gives this::
-    
-        --------------------------------------------------------------
-        Ran 203 tests in 0.209s
-        
-        FAILED (failures=31, errors=55, skipped=1)
-        --------------------------------------------------------------
-    
-    Using :mod:`future`'s backported :class:`bytes` object passes most of
-    the same Python 3.3 tests on Py2, except those requiring specific
-    wording in exception messages.
-    
-    See ``future/tests/test_bytes.py`` in the source for the actual set
-    of unit tests that are actually run.
+surrogateescape
+~~~~~~~~~~~~~~~
+
+The :class:`bytes` type from :mod:`builtins` also provides support for the
+``surrogateescape`` error handler on Python 2.x. Here is an example that works
+identically on Python 2.x and 3.x::
 
+    >>> from builtins import bytes
+    >>> b = bytes(b'\xff')
+    >>> b.decode('utf-8', 'surrogateescape')
+    '\udcff'
+
+This feature is in alpha. Please leave feedback `here
+<https://github.com/PythonCharmers/python-future/issues>`_ about whether this
+works for you.
+ 
diff --git a/docs/faq.rst b/docs/faq.rst
@@ -265,13 +265,6 @@ definitions) that greatly reduce the maintenance burden for single-source
 Py2/3 compatible code. ``future`` leverages these features and aims to
 close the remaining gap between Python 3 and 2.6 / 2.7.
 
-Python 2.6 does not offer the following features which help with Py3
-compatibility:
-- ``surrogateescape`` error handler for string encoding or decoding;
-- ``memoryview`` objects.
-
-Otherwise Python 2.6 is mostly supported.
-
 Python 3.2 could perhaps be supported too, although the illegal unicode
 literal ``u'...'`` syntax may be inconvenient to work around. The Py3.2
 userbase is very small, however. Please let us know via GitHub `issue #29
diff --git a/docs/str_object.rst b/docs/str_object.rst
@@ -84,21 +84,19 @@ same behaviours as Python 3's :class:`str`::
     >>> assert list(s) == ['A', 'B', 'C', 'D']
     >>> assert s.split('B') == ['A', 'CD']
 
-.. If you must ensure identical use of (unicode) strings across Py3 and Py2 in a
-.. single-source codebase, you can wrap string literals in a :func:`~str` call,
-.. as follows::
-..     
-..     from __future__ import unicode_literals
-..     from future.builtins import *
-..     
-..     # ...
-.. 
-..     s = str('This absolutely must behave like a Py3 string')
-.. 
-..     # ...
-.. 
-.. Most of the time this is unnecessary, but the stricter type-checking of the
-.. ``future.builtins.str`` object is useful for ensuring the same consistent
-.. separation between unicode and byte strings on Py2 as on Py3. This is
-.. important when writing protocol handlers, for example.
+surrogateescape
+~~~~~~~~~~~~~~~
+
+The :class:`str` type from :mod:`builtins` also provides support for the
+``surrogateescape`` error handler on Python 2.x. Here is an example that works
+identically on Python 2.x and 3.x::
+
+    >>> from builtins import str
+    >>> s = str(u'\udcff')
+    >>> s.encode('utf-8', 'surrogateescape')
+    b'\xff'
+
+This feature is in alpha. Please leave feedback `here
+<https://github.com/PythonCharmers/python-future/issues>`_ about whether this
+works for you.
 
diff --git a/docs/whatsnew.rst b/docs/whatsnew.rst
@@ -25,6 +25,7 @@ New features:
 - Backport of ``itertools.count`` for Py2.6 (issue #152)
 - Add constants to ``http.client`` such as ``HTTP_PORT`` and ``BAD_REQUEST`` (issue #137)
 - Backport of ``reprlib.recursive_repr`` to Py2
+- Enable support for the ``surrogateescape`` error handler for ``newstr`` and ``newbytes`` objects on Py2.x (issue #116). This feature is currently in alpha.
 
 Bug fixes:
 
diff --git a/src/future/types/newbytes.py b/src/future/types/newbytes.py
@@ -201,6 +201,11 @@ def decode(self, encoding='utf-8', errors='strict'):
         # not keyword arguments as in Python 3 str.
 
         from future.types.newstr import newstr
+
+        if errors == 'surrogateescape':
+            from future.utils.surrogateescape import register_surrogateescape
+            register_surrogateescape()
+
         return newstr(super(newbytes, self).decode(encoding, errors))
 
         # This is currently broken:
diff --git a/src/future/utils/surrogateescape.py b/src/future/utils/surrogateescape.py
@@ -186,14 +186,15 @@ def register_surrogateescape():
         codecs.register_error(FS_ERRORS, surrogateescape_handler)
 
 
-if True:
-    # Tests:
-    register_surrogateescape()
-
-    b = decodefilename(fn)
-    assert b == encoded, "%r != %r" % (b, encoded)
-    c = encodefilename(b)
-    assert c == fn, '%r != %r' % (c, fn)
-    # print("ok")
+if __name__ == '__main__':
+    pass
+    # # Tests:
+    # register_surrogateescape()
+
+    # b = decodefilename(fn)
+    # assert b == encoded, "%r != %r" % (b, encoded)
+    # c = encodefilename(b)
+    # assert c == fn, '%r != %r' % (c, fn)
+    # # print("ok")
 
 
diff --git a/tests/test_future/test_bytes.py b/tests/test_future/test_bytes.py
@@ -627,6 +627,19 @@ class MetaClass(type):
         class TestClass(with_metaclass(MetaClass, bytes)):
             pass
 
+    def test_surrogateescape_decoding(self):
+        """
+        Tests whether surrogateescape decoding works correctly.
+        """
+        pairs = [(u'\udcc3', b'\xc3'),
+                 (u'\udcff', b'\xff')]
+
+        for (s, b) in pairs:
+            decoded = bytes(b).decode('utf-8', 'surrogateescape')
+            self.assertEqual(s, decoded)
+            self.assertTrue(isinstance(decoded, str))
+            self.assertEqual(b, decoded.encode('utf-8', 'surrogateescape'))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_future/test_str.py b/tests/test_future/test_str.py
@@ -551,6 +551,19 @@ class MetaClass(type):
         class TestClass(with_metaclass(MetaClass, str)):
             pass
 
+    def test_surrogateescape_encoding(self):
+        """
+        Tests whether surrogateescape encoding works correctly.
+        """
+        pairs = [(u'\udcc3', b'\xc3'),
+                 (u'\udcff', b'\xff')]
+
+        for (s, b) in pairs:
+            encoded = str(s).encode('utf-8', 'surrogateescape')
+            self.assertEqual(b, encoded)
+            self.assertTrue(isinstance(encoded, bytes))
+            self.assertEqual(s, encoded.decode('utf-8', 'surrogateescape'))
+
 
 if __name__ == '__main__':
     unittest.main()