@@ -1751,13 +1751,15 @@ def test_basic(self):
17511751 writer .write_utf8 (b'var' , - 1 )
17521752
17531753 # test PyUnicodeWriter_WriteChar()
1754- writer .write_char ('=' )
1754+ writer .write_char (ord ( '=' ) )
17551755
17561756 # test PyUnicodeWriter_WriteSubstring()
17571757 writer .write_substring ("[long]" , 1 , 5 )
1758+ # CRASHES writer.write_substring(NULL, 0, 0)
17581759
17591760 # test PyUnicodeWriter_WriteStr()
17601761 writer .write_str (" value " )
1762+ # CRASHES writer.write_str(NULL)
17611763
17621764 # test PyUnicodeWriter_WriteRepr()
17631765 writer .write_repr ("repr" )
@@ -1772,21 +1774,38 @@ def test_repr_null(self):
17721774 self .assertEqual (writer .finish (),
17731775 "var=<NULL>" )
17741776
def test_write_char(self):
    # Exercise write_char() with integer code points across the whole
    # range: NUL, ASCII, a BMP character and the last valid code point.
    writer = self.create_writer(0)
    writer.write_char(0)
    writer.write_char(ord('$'))
    writer.write_char(0x20ac)
    writer.write_char(0x10_ffff)

    # Values above 0x10FFFF must be rejected with ValueError; the final
    # assertEqual() shows the rejected writes add nothing to the result.
    self.assertRaises(ValueError, writer.write_char, 0x11_0000)
    self.assertRaises(ValueError, writer.write_char, 0xFFFF_FFFF)

    # Exactly the four accepted characters, with nothing in between.
    self.assertEqual(writer.finish(),
                     "\0$\u20AC\U0010FFFF")
1787+
def test_utf8(self):
    # Write UTF-8 encoded byte strings of increasing width (ASCII,
    # 2-byte and 3-byte sequences), separated by single characters.
    writer = self.create_writer(0)
    writer.write_utf8(b"ascii", -1)
    writer.write_char(ord('-'))
    # b"\xC3\xA9" is the UTF-8 encoding of U+00E9.
    writer.write_utf8(b"latin1=\xC3\xA9", -1)
    writer.write_char(ord('-'))
    # b"\xE2\x82\xAC" is the UTF-8 encoding of U+20AC.
    writer.write_utf8(b"euro=\xE2\x82\xAC", -1)
    writer.write_char(ord('.'))

    # A NULL buffer is accepted together with size 0 and writes nothing.
    writer.write_utf8(NULL, 0)
    # CRASHES writer.write_utf8(NULL, 1)
    # CRASHES writer.write_utf8(NULL, -1)

    self.assertEqual(writer.finish(),
                     "ascii-latin1=\xE9-euro=\u20AC.")
17851801
def test_ascii(self):
    # Each entry is (buffer, size) and is passed to write_ascii() in
    # order.  Empty and NULL buffers with size 0 contribute nothing, and
    # an explicit size of 6 keeps only b"Python" from the last buffer.
    # CRASHES writer.write_ascii(NULL, 1)
    # CRASHES writer.write_ascii(NULL, -1)
    chunks = (
        (b"Hello ", -1),
        (b"", 0),
        (NULL, 0),
        (b"Python! <truncated>", 6),
    )

    writer = self.create_writer(0)
    for buffer, size in chunks:
        writer.write_ascii(buffer, size)

    result = writer.finish()
    self.assertEqual(result, "Hello Python")
17921811
@@ -1803,6 +1822,9 @@ def test_recover_utf8_error(self):
18031822 # write fails with an invalid string
18041823 with self .assertRaises (UnicodeDecodeError ):
18051824 writer .write_utf8 (b"invalid\xFF " , - 1 )
1825+ with self .assertRaises (UnicodeDecodeError ):
1826+ s = "truncated\u20AC " .encode ()
1827+ writer .write_utf8 (s , len (s ) - 1 )
18061828
18071829 # retry write with a valid string
18081830 writer .write_utf8 (b"valid" , - 1 )
@@ -1814,13 +1836,19 @@ def test_decode_utf8(self):
18141836 # test PyUnicodeWriter_DecodeUTF8Stateful()
18151837 writer = self .create_writer (0 )
18161838 writer .decodeutf8stateful (b"ign\xFF ore" , - 1 , b"ignore" )
1817- writer .write_char ('-' )
1839+ writer .write_char (ord ( '-' ) )
18181840 writer .decodeutf8stateful (b"replace\xFF " , - 1 , b"replace" )
1819- writer .write_char ('-' )
1841+ writer .write_char (ord ( '-' ) )
18201842
18211843 # incomplete trailing UTF-8 sequence
18221844 writer .decodeutf8stateful (b"incomplete\xC3 " , - 1 , b"replace" )
18231845
1846+ writer .decodeutf8stateful (NULL , 0 , b"replace" )
1847+ # CRASHES writer.decodeutf8stateful(NULL, 1, b"replace")
1848+ # CRASHES writer.decodeutf8stateful(NULL, -1, b"replace")
1849+ with self .assertRaises (UnicodeDecodeError ):
1850+ writer .decodeutf8stateful (b"default\xFF " , - 1 , NULL )
1851+
18241852 self .assertEqual (writer .finish (),
18251853 "ignore-replace\uFFFD -incomplete\uFFFD " )
18261854
@@ -1831,12 +1859,12 @@ def test_decode_utf8_consumed(self):
18311859 # valid string
18321860 consumed = writer .decodeutf8stateful (b"text" , - 1 , b"strict" , True )
18331861 self .assertEqual (consumed , 4 )
1834- writer .write_char ('-' )
1862+ writer .write_char (ord ( '-' ) )
18351863
18361864 # non-ASCII
18371865 consumed = writer .decodeutf8stateful (b"\xC3 \xA9 -\xE2 \x82 \xAC " , 6 , b"strict" , True )
18381866 self .assertEqual (consumed , 6 )
1839- writer .write_char ('-' )
1867+ writer .write_char (ord ( '-' ) )
18401868
18411869 # invalid UTF-8 (consumed is 0 on error)
18421870 with self .assertRaises (UnicodeDecodeError ):
@@ -1845,54 +1873,92 @@ def test_decode_utf8_consumed(self):
18451873 # ignore error handler
18461874 consumed = writer .decodeutf8stateful (b"more\xFF " , - 1 , b"ignore" , True )
18471875 self .assertEqual (consumed , 5 )
1848- writer .write_char ('-' )
1876+ writer .write_char (ord ( '-' ) )
18491877
18501878 # incomplete trailing UTF-8 sequence
18511879 consumed = writer .decodeutf8stateful (b"incomplete\xC3 " , - 1 , b"ignore" , True )
18521880 self .assertEqual (consumed , 10 )
1881+ writer .write_char (ord ('-' ))
18531882
1854- self .assertEqual (writer .finish (), "text-\xE9 -\u20AC -more-incomplete" )
1883+ consumed = writer .decodeutf8stateful (NULL , 0 , b"replace" , True )
1884+ self .assertEqual (consumed , 0 )
1885+ # CRASHES writer.decodeutf8stateful(NULL, 1, b"replace", True)
1886+ # CRASHES writer.decodeutf8stateful(NULL, -1, b"replace", True)
1887+ consumed = writer .decodeutf8stateful (b"default\xC3 " , - 1 , NULL , True )
1888+ self .assertEqual (consumed , 7 )
1889+
1890+ self .assertEqual (writer .finish (), "text-\xE9 -\u20AC -more-incomplete-default" )
18551891
def test_widechar(self):
    from _testcapi import SIZEOF_WCHAR_T

    # write_widechar() is given raw bytes, so encode the text as native
    # byte-order code units of the platform's wchar_t width.
    if SIZEOF_WCHAR_T == 2:
        encoding = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
    elif SIZEOF_WCHAR_T == 4:
        encoding = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
    else:
        # Without this branch 'encoding' would be unbound and the test
        # would die with an unrelated-looking NameError below.
        self.fail(f"unsupported SIZEOF_WCHAR_T: {SIZEOF_WCHAR_T!r}")

    writer = self.create_writer(0)
    writer.write_widechar("latin1=\xE9".encode(encoding))
    writer.write_char(ord("-"))
    writer.write_widechar("euro=\u20AC".encode(encoding))
    writer.write_char(ord("-"))
    writer.write_widechar("max=\U0010ffff".encode(encoding))
    writer.write_char(ord("-"))
    # Pad the 7-character text with zero bytes up to 10 wide characters
    # and pass the size explicitly: the 3 trailing NULs must be written.
    padded = "zeroes=".encode(encoding).ljust(SIZEOF_WCHAR_T * 10, b'\0')
    writer.write_widechar(padded, 10)
    writer.write_char(ord('.'))

    if SIZEOF_WCHAR_T == 4:
        # 0x110000 in native byte order: one past the last valid code
        # point, only representable when wchar_t is 4 bytes wide.
        invalid = (b'\x00\x00\x11\x00' if sys.byteorder == 'little' else
                   b'\x00\x11\x00\x00')
        with self.assertRaises(ValueError):
            writer.write_widechar("invalid=".encode(encoding) + invalid)
    # An empty buffer with a negative size and a NULL buffer with size 0
    # are both accepted and add nothing to the result.
    writer.write_widechar(b'', -5)
    writer.write_widechar(NULL, 0)
    # CRASHES writer.write_widechar(NULL, 1)
    # CRASHES writer.write_widechar(NULL, -1)

    self.assertEqual(writer.finish(),
                     "latin1=\xE9-euro=\u20AC-max=\U0010ffff-zeroes=\0\0\0.")
18661923
def test_ucs4(self):
    # write_ucs4() is given raw bytes: encode the text as native
    # byte-order 4-byte code units.
    encoding = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'

    writer = self.create_writer(0)
    # An explicit size of 5 keeps only "ascii" from the buffer.
    writer.write_ucs4("ascii IGNORED".encode(encoding), 5)
    writer.write_char(ord("-"))
    writer.write_ucs4("latin1=\xe9".encode(encoding))
    writer.write_char(ord("-"))
    writer.write_ucs4("euro=\u20ac".encode(encoding))
    writer.write_char(ord("-"))
    writer.write_ucs4("max=\U0010ffff".encode(encoding))
    writer.write_char(ord("."))
    self.assertEqual(writer.finish(),
                     "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.")

    # Test some special characters
    writer = self.create_writer(0)
    # Lone surrogate character ('surrogatepass' is needed to encode it)
    writer.write_ucs4("lone\uDC80".encode(encoding, 'surrogatepass'))
    writer.write_char(ord("-"))
    # Surrogate pair: written as two separate surrogate code points
    writer.write_ucs4("pair\uD83D\uDC0D".encode(encoding, 'surrogatepass'))
    writer.write_char(ord("-"))
    # An embedded NUL is preserved when the size (7) is passed explicitly
    writer.write_ucs4("null[\0]".encode(encoding), 7)
    # CRASHES on the out-of-range code point 0x110000:
    #   invalid = (b'\x00\x00\x11\x00' if sys.byteorder == 'little' else
    #              b'\x00\x11\x00\x00')
    #   writer.write_ucs4("invalid".encode(encoding) + invalid)
    # A NULL buffer is accepted together with size 0 and writes nothing.
    writer.write_ucs4(NULL, 0)
    # CRASHES writer.write_ucs4(NULL, 1)
    self.assertEqual(writer.finish(),
                     "lone\udc80-pair\ud83d\udc0d-null[\x00]")

    # invalid size: a negative size raises ValueError whatever the buffer
    writer = self.create_writer(0)
    with self.assertRaises(ValueError):
        writer.write_ucs4("text".encode(encoding), -1)
    self.assertRaises(ValueError, writer.write_ucs4, b'', -1)
    self.assertRaises(ValueError, writer.write_ucs4, NULL, -1)
18961962
18971963 def test_substring_empty (self ):
18981964 writer = self .create_writer (0 )
@@ -1918,7 +1984,7 @@ def test_format(self):
19181984 from ctypes import c_int
19191985 writer = self .create_writer (0 )
19201986 self .writer_format (writer , b'%s %i' , b'abc' , c_int (123 ))
1921- writer .write_char ('.' )
1987+ writer .write_char (ord ( '.' ) )
19221988 self .assertEqual (writer .finish (), 'abc 123.' )
19231989
19241990 def test_recover_error (self ):
0 commit comments