Skip to content

Commit 2105187

Browse files
[3.14] Improve tests for the PyUnicodeWriter C API (GH-146157) (GH-146180)
Add tests for corner cases: NULL pointers and out-of-range values. (cherry picked from commit ab47892) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 7ba9580 commit 2105187

File tree

2 files changed

+144
-83
lines changed

2 files changed

+144
-83
lines changed

Lib/test/test_capi/test_unicode.py

Lines changed: 99 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -1751,35 +1751,54 @@ def test_basic(self):
17511751
writer.write_utf8(b'var', -1)
17521752

17531753
# test PyUnicodeWriter_WriteChar()
1754-
writer.write_char('=')
1754+
writer.write_char(ord('='))
17551755

17561756
# test PyUnicodeWriter_WriteSubstring()
17571757
writer.write_substring("[long]", 1, 5)
1758+
# CRASHES writer.write_substring(NULL, 0, 0)
17581759

17591760
# test PyUnicodeWriter_WriteStr()
17601761
writer.write_str(" value ")
1762+
# CRASHES writer.write_str(NULL)
17611763

17621764
# test PyUnicodeWriter_WriteRepr()
17631765
writer.write_repr("repr")
17641766

17651767
self.assertEqual(writer.finish(),
17661768
"var=long value 'repr'")
17671769

1770+
def test_write_char(self):
1771+
writer = self.create_writer(0)
1772+
writer.write_char(0)
1773+
writer.write_char(ord('$'))
1774+
writer.write_char(0x20ac)
1775+
writer.write_char(0x10_ffff)
1776+
self.assertRaises(ValueError, writer.write_char, 0x11_0000)
1777+
self.assertRaises(ValueError, writer.write_char, 0xFFFF_FFFF)
1778+
self.assertEqual(writer.finish(),
1779+
"\0$\u20AC\U0010FFFF")
1780+
17681781
def test_utf8(self):
17691782
writer = self.create_writer(0)
17701783
writer.write_utf8(b"ascii", -1)
1771-
writer.write_char('-')
1784+
writer.write_char(ord('-'))
17721785
writer.write_utf8(b"latin1=\xC3\xA9", -1)
1773-
writer.write_char('-')
1786+
writer.write_char(ord('-'))
17741787
writer.write_utf8(b"euro=\xE2\x82\xAC", -1)
1775-
writer.write_char('.')
1788+
writer.write_char(ord('.'))
1789+
writer.write_utf8(NULL, 0)
1790+
# CRASHES writer.write_utf8(NULL, 1)
1791+
# CRASHES writer.write_utf8(NULL, -1)
17761792
self.assertEqual(writer.finish(),
17771793
"ascii-latin1=\xE9-euro=\u20AC.")
17781794

17791795
def test_ascii(self):
17801796
writer = self.create_writer(0)
17811797
writer.write_ascii(b"Hello ", -1)
17821798
writer.write_ascii(b"", 0)
1799+
writer.write_ascii(NULL, 0)
1800+
# CRASHES writer.write_ascii(NULL, 1)
1801+
# CRASHES writer.write_ascii(NULL, -1)
17831802
writer.write_ascii(b"Python! <truncated>", 6)
17841803
self.assertEqual(writer.finish(), "Hello Python")
17851804

@@ -1796,6 +1815,9 @@ def test_recover_utf8_error(self):
17961815
# write fails with an invalid string
17971816
with self.assertRaises(UnicodeDecodeError):
17981817
writer.write_utf8(b"invalid\xFF", -1)
1818+
with self.assertRaises(UnicodeDecodeError):
1819+
s = "truncated\u20AC".encode()
1820+
writer.write_utf8(s, len(s) - 1)
17991821

18001822
# retry write with a valid string
18011823
writer.write_utf8(b"valid", -1)
@@ -1807,13 +1829,19 @@ def test_decode_utf8(self):
18071829
# test PyUnicodeWriter_DecodeUTF8Stateful()
18081830
writer = self.create_writer(0)
18091831
writer.decodeutf8stateful(b"ign\xFFore", -1, b"ignore")
1810-
writer.write_char('-')
1832+
writer.write_char(ord('-'))
18111833
writer.decodeutf8stateful(b"replace\xFF", -1, b"replace")
1812-
writer.write_char('-')
1834+
writer.write_char(ord('-'))
18131835

18141836
# incomplete trailing UTF-8 sequence
18151837
writer.decodeutf8stateful(b"incomplete\xC3", -1, b"replace")
18161838

1839+
writer.decodeutf8stateful(NULL, 0, b"replace")
1840+
# CRASHES writer.decodeutf8stateful(NULL, 1, b"replace")
1841+
# CRASHES writer.decodeutf8stateful(NULL, -1, b"replace")
1842+
with self.assertRaises(UnicodeDecodeError):
1843+
writer.decodeutf8stateful(b"default\xFF", -1, NULL)
1844+
18171845
self.assertEqual(writer.finish(),
18181846
"ignore-replace\uFFFD-incomplete\uFFFD")
18191847

@@ -1824,12 +1852,12 @@ def test_decode_utf8_consumed(self):
18241852
# valid string
18251853
consumed = writer.decodeutf8stateful(b"text", -1, b"strict", True)
18261854
self.assertEqual(consumed, 4)
1827-
writer.write_char('-')
1855+
writer.write_char(ord('-'))
18281856

18291857
# non-ASCII
18301858
consumed = writer.decodeutf8stateful(b"\xC3\xA9-\xE2\x82\xAC", 6, b"strict", True)
18311859
self.assertEqual(consumed, 6)
1832-
writer.write_char('-')
1860+
writer.write_char(ord('-'))
18331861

18341862
# invalid UTF-8 (consumed is 0 on error)
18351863
with self.assertRaises(UnicodeDecodeError):
@@ -1838,54 +1866,92 @@ def test_decode_utf8_consumed(self):
18381866
# ignore error handler
18391867
consumed = writer.decodeutf8stateful(b"more\xFF", -1, b"ignore", True)
18401868
self.assertEqual(consumed, 5)
1841-
writer.write_char('-')
1869+
writer.write_char(ord('-'))
18421870

18431871
# incomplete trailing UTF-8 sequence
18441872
consumed = writer.decodeutf8stateful(b"incomplete\xC3", -1, b"ignore", True)
18451873
self.assertEqual(consumed, 10)
1874+
writer.write_char(ord('-'))
18461875

1847-
self.assertEqual(writer.finish(), "text-\xE9-\u20AC-more-incomplete")
1876+
consumed = writer.decodeutf8stateful(NULL, 0, b"replace", True)
1877+
self.assertEqual(consumed, 0)
1878+
# CRASHES writer.decodeutf8stateful(NULL, 1, b"replace", True)
1879+
# CRASHES writer.decodeutf8stateful(NULL, -1, b"replace", True)
1880+
consumed = writer.decodeutf8stateful(b"default\xC3", -1, NULL, True)
1881+
self.assertEqual(consumed, 7)
1882+
1883+
self.assertEqual(writer.finish(), "text-\xE9-\u20AC-more-incomplete-default")
18481884

18491885
def test_widechar(self):
1886+
from _testcapi import SIZEOF_WCHAR_T
1887+
1888+
if SIZEOF_WCHAR_T == 2:
1889+
encoding = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
1890+
elif SIZEOF_WCHAR_T == 4:
1891+
encoding = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
1892+
18501893
writer = self.create_writer(0)
1851-
writer.write_widechar("latin1=\xE9")
1852-
writer.write_widechar("-")
1853-
writer.write_widechar("euro=\u20AC")
1854-
writer.write_char("-")
1855-
writer.write_widechar("max=\U0010ffff")
1856-
writer.write_char('.')
1894+
writer.write_widechar("latin1=\xE9".encode(encoding))
1895+
writer.write_char(ord("-"))
1896+
writer.write_widechar("euro=\u20AC".encode(encoding))
1897+
writer.write_char(ord("-"))
1898+
writer.write_widechar("max=\U0010ffff".encode(encoding))
1899+
writer.write_char(ord("-"))
1900+
writer.write_widechar("zeroes=".encode(encoding).ljust(SIZEOF_WCHAR_T * 10, b'\0'),
1901+
10)
1902+
writer.write_char(ord('.'))
1903+
1904+
if SIZEOF_WCHAR_T == 4:
1905+
invalid = (b'\x00\x00\x11\x00' if sys.byteorder == 'little' else
1906+
b'\x00\x11\x00\x00')
1907+
with self.assertRaises(ValueError):
1908+
writer.write_widechar("invalid=".encode(encoding) + invalid)
1909+
writer.write_widechar(b'', -5)
1910+
writer.write_widechar(NULL, 0)
1911+
# CRASHES writer.write_widechar(NULL, 1)
1912+
# CRASHES writer.write_widechar(NULL, -1)
1913+
18571914
self.assertEqual(writer.finish(),
1858-
"latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
1915+
"latin1=\xE9-euro=\u20AC-max=\U0010ffff-zeroes=\0\0\0.")
18591916

18601917
def test_ucs4(self):
1918+
encoding = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
1919+
18611920
writer = self.create_writer(0)
1862-
writer.write_ucs4("ascii IGNORED", 5)
1863-
writer.write_char("-")
1864-
writer.write_ucs4("latin1=\xe9", 8)
1865-
writer.write_char("-")
1866-
writer.write_ucs4("euro=\u20ac", 6)
1867-
writer.write_char("-")
1868-
writer.write_ucs4("max=\U0010ffff", 5)
1869-
writer.write_char(".")
1921+
writer.write_ucs4("ascii IGNORED".encode(encoding), 5)
1922+
writer.write_char(ord("-"))
1923+
writer.write_ucs4("latin1=\xe9".encode(encoding))
1924+
writer.write_char(ord("-"))
1925+
writer.write_ucs4("euro=\u20ac".encode(encoding))
1926+
writer.write_char(ord("-"))
1927+
writer.write_ucs4("max=\U0010ffff".encode(encoding))
1928+
writer.write_char(ord("."))
18701929
self.assertEqual(writer.finish(),
18711930
"ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
18721931

18731932
# Test some special characters
18741933
writer = self.create_writer(0)
18751934
# Lone surrogate character
1876-
writer.write_ucs4("lone\uDC80", 5)
1877-
writer.write_char("-")
1935+
writer.write_ucs4("lone\uDC80".encode(encoding, 'surrogatepass'))
1936+
writer.write_char(ord("-"))
18781937
# Surrogate pair
1879-
writer.write_ucs4("pair\uDBFF\uDFFF", 5)
1880-
writer.write_char("-")
1881-
writer.write_ucs4("null[\0]", 7)
1938+
writer.write_ucs4("pair\uD83D\uDC0D".encode(encoding, 'surrogatepass'))
1939+
writer.write_char(ord("-"))
1940+
writer.write_ucs4("null[\0]".encode(encoding), 7)
1941+
invalid = (b'\x00\x00\x11\x00' if sys.byteorder == 'little' else
1942+
b'\x00\x11\x00\x00')
1943+
# CRASHES writer.write_ucs4("invalid".encode(encoding) + invalid)
1944+
writer.write_ucs4(NULL, 0)
1945+
# CRASHES writer.write_ucs4(NULL, 1)
18821946
self.assertEqual(writer.finish(),
1883-
"lone\udc80-pair\udbff-null[\0]")
1947+
"lone\udc80-pair\ud83d\udc0d-null[\x00]")
18841948

18851949
# invalid size
18861950
writer = self.create_writer(0)
18871951
with self.assertRaises(ValueError):
1888-
writer.write_ucs4("text", -1)
1952+
writer.write_ucs4("text".encode(encoding), -1)
1953+
self.assertRaises(ValueError, writer.write_ucs4, b'', -1)
1954+
self.assertRaises(ValueError, writer.write_ucs4, NULL, -1)
18891955

18901956
def test_substring_empty(self):
18911957
writer = self.create_writer(0)
@@ -1911,7 +1977,7 @@ def test_format(self):
19111977
from ctypes import c_int
19121978
writer = self.create_writer(0)
19131979
self.writer_format(writer, b'%s %i', b'abc', c_int(123))
1914-
writer.write_char('.')
1980+
writer.write_char(ord('.'))
19151981
self.assertEqual(writer.finish(), 'abc 123.')
19161982

19171983
def test_recover_error(self):

0 commit comments

Comments
 (0)