Skip to content

Commit e418e33

Browse files
[3.13] gh-88091: Fix unicodedata.decomposition() for Hangul Syllables (GH-144993) (GH-145190)
(cherry picked from commit 56c4f10)
1 parent a3c0a80 commit e418e33

File tree

3 files changed: 44 additions and 15 deletions.

Lib/test/test_unicodedata.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -89,9 +89,9 @@ class UnicodeFunctionsTest(unittest.TestCase):
8989

9090
# Update this if the database changes. Make sure to do a full rebuild
9191
# (e.g. 'make distclean && make') to get the correct checksum.
92-
expectedchecksum = ('a91d306c268ba7d5cdf14d49e63b3f967058869c'
92+
expectedchecksum = ('a5b8431ae6c0a0a78075c216193b7364a0497075'
9393
if quicktest else
94-
'232affd2a50ec4bd69d2482aa0291385cbdefaba')
94+
'72241cd356ce6dad7d0570d206ce869169151850')
9595

9696
def test_function_checksum(self):
9797
db = self.db
@@ -335,6 +335,12 @@ def test_decomposition(self):
335335
# New in 15.0.0
336336
self.assertEqual(self.db.decomposition('\U0001e06d'), '' if self.old else '<super> 04B1')
337337

338+
# Hangul characters
339+
self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
340+
self.assertEqual(self.db.decomposition('\uD4DB'), '1111 1171 11B6')
341+
self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
342+
self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')
343+
338344
self.assertRaises(TypeError, self.db.decomposition)
339345
self.assertRaises(TypeError, self.db.decomposition, 'xx')
340346

@@ -628,9 +634,9 @@ def test_east_asian_width_unassigned(self):
628634
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
629635
db = unicodedata.ucd_3_2_0
630636
old = True
631-
expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4'
637+
expectedchecksum = ('883824cb6c0ccf994e4451ebf281e2d6d479af47'
632638
if quicktest else
633-
'b678d38ffbf1f1de092b2af1ed155602909fcd8d')
639+
'44bbc0dfbfd746ba08180183482aa569a3830510')
634640

635641

636642
class UnicodeMiscTest(unittest.TestCase):

(Newly added file; its path was not captured in this extract.)

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Fix :func:`unicodedata.decomposition` for Hangul characters.

Modules/unicodedata.c

Lines changed: 33 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -386,6 +386,17 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
386386
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
387387
}
388388

389+
// For Hangul decomposition
390+
#define SBase 0xAC00
391+
#define LBase 0x1100
392+
#define VBase 0x1161
393+
#define TBase 0x11A7
394+
#define LCount 19
395+
#define VCount 21
396+
#define TCount 28
397+
#define NCount (VCount*TCount)
398+
#define SCount (LCount*NCount)
399+
389400
/*[clinic input]
390401
unicodedata.UCD.decomposition
391402
@@ -416,6 +427,25 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
416427
return PyUnicode_FromString(""); /* unassigned */
417428
}
418429

430+
// Hangul Decomposition.
431+
// See section 3.12.2, "Hangul Syllable Decomposition"
432+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
433+
if (SBase <= code && code < (SBase + SCount)) {
434+
int SIndex = code - SBase;
435+
int L = LBase + SIndex / NCount;
436+
int V = VBase + (SIndex % NCount) / TCount;
437+
int T = TBase + SIndex % TCount;
438+
if (T != TBase) {
439+
PyOS_snprintf(decomp, sizeof(decomp),
440+
"%04X %04X %04X", L, V, T);
441+
}
442+
else {
443+
PyOS_snprintf(decomp, sizeof(decomp),
444+
"%04X %04X", L, V);
445+
}
446+
return PyUnicode_FromString(decomp);
447+
}
448+
419449
if (code < 0 || code >= 0x110000)
420450
index = 0;
421451
else {
@@ -478,16 +508,6 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
478508
(*index)++;
479509
}
480510

481-
#define SBase 0xAC00
482-
#define LBase 0x1100
483-
#define VBase 0x1161
484-
#define TBase 0x11A7
485-
#define LCount 19
486-
#define VCount 21
487-
#define TCount 28
488-
#define NCount (VCount*TCount)
489-
#define SCount (LCount*NCount)
490-
491511
static PyObject*
492512
nfd_nfkd(PyObject *self, PyObject *input, int k)
493513
{
@@ -541,7 +561,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
541561
}
542562
output = new_output;
543563
}
544-
/* Hangul Decomposition. */
564+
// Hangul Decomposition.
565+
// See section 3.12.2, "Hangul Syllable Decomposition"
566+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
545567
if (SBase <= code && code < (SBase+SCount)) {
546568
int SIndex = code - SBase;
547569
int L = LBase + SIndex / NCount;

0 commit comments

Comments
 (0)