Skip to content

Commit 0701ce6

Browse files
[3.14] gh-88091: Fix unicodedata.decomposition() for Hangul Syllables (GH-144993) (GH-145189)
(cherry picked from commit 56c4f10)
1 parent 12092af commit 0701ce6

File tree

3 files changed

+44
-15
lines changed

3 files changed

+44
-15
lines changed

Lib/test/test_unicodedata.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -89,9 +89,9 @@ class UnicodeFunctionsTest(unittest.TestCase):
8989

9090
# Update this if the database changes. Make sure to do a full rebuild
9191
# (e.g. 'make distclean && make') to get the correct checksum.
92-
expectedchecksum = ('35e842600fa7ae2db93739db08ef201b726a2374'
92+
expectedchecksum = ('1ba453ec456896f1190d849b6e9b7c2e1a4128e0'
9393
if quicktest else
94-
'23ab09ed4abdf93db23b97359108ed630dd8311d')
94+
'46ca89d9fe34881d0be3a4a4b29f5aa8c019640c')
9595

9696
def test_function_checksum(self):
9797
db = self.db
@@ -346,6 +346,12 @@ def test_decomposition(self):
346346
# New in 16.0.0
347347
self.assertEqual(self.db.decomposition('\U0001CCD6'), '' if self.old else '<font> 0041')
348348

349+
# Hangul characters
350+
self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
351+
self.assertEqual(self.db.decomposition('\uD4DB'), '1111 1171 11B6')
352+
self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
353+
self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')
354+
349355
self.assertRaises(TypeError, self.db.decomposition)
350356
self.assertRaises(TypeError, self.db.decomposition, 'xx')
351357

@@ -649,9 +655,9 @@ def test_east_asian_width_unassigned(self):
649655
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
650656
db = unicodedata.ucd_3_2_0
651657
old = True
652-
expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4'
658+
expectedchecksum = ('883824cb6c0ccf994e4451ebf281e2d6d479af47'
653659
if quicktest else
654-
'b0a8df4ce8cf910def4e75f2d03c93defcc9bb09')
660+
'caf1a7f2f380f927461837f1901ef20683f98683')
655661

656662

657663
class UnicodeMiscTest(unittest.TestCase):
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Fix :func:`unicodedata.decomposition` for Hangul characters.

Modules/unicodedata.c

Lines changed: 33 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -388,6 +388,17 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
388388
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
389389
}
390390

391+
// For Hangul decomposition
392+
#define SBase 0xAC00
393+
#define LBase 0x1100
394+
#define VBase 0x1161
395+
#define TBase 0x11A7
396+
#define LCount 19
397+
#define VCount 21
398+
#define TCount 28
399+
#define NCount (VCount*TCount)
400+
#define SCount (LCount*NCount)
401+
391402
/*[clinic input]
392403
unicodedata.UCD.decomposition
393404
@@ -418,6 +429,25 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
418429
return Py_GetConstant(Py_CONSTANT_EMPTY_STR); /* unassigned */
419430
}
420431

432+
// Hangul Decomposition.
433+
// See section 3.12.2, "Hangul Syllable Decomposition"
434+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
435+
if (SBase <= code && code < (SBase + SCount)) {
436+
int SIndex = code - SBase;
437+
int L = LBase + SIndex / NCount;
438+
int V = VBase + (SIndex % NCount) / TCount;
439+
int T = TBase + SIndex % TCount;
440+
if (T != TBase) {
441+
PyOS_snprintf(decomp, sizeof(decomp),
442+
"%04X %04X %04X", L, V, T);
443+
}
444+
else {
445+
PyOS_snprintf(decomp, sizeof(decomp),
446+
"%04X %04X", L, V);
447+
}
448+
return PyUnicode_FromString(decomp);
449+
}
450+
421451
if (code < 0 || code >= 0x110000)
422452
index = 0;
423453
else {
@@ -480,16 +510,6 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
480510
(*index)++;
481511
}
482512

483-
#define SBase 0xAC00
484-
#define LBase 0x1100
485-
#define VBase 0x1161
486-
#define TBase 0x11A7
487-
#define LCount 19
488-
#define VCount 21
489-
#define TCount 28
490-
#define NCount (VCount*TCount)
491-
#define SCount (LCount*NCount)
492-
493513
static PyObject*
494514
nfd_nfkd(PyObject *self, PyObject *input, int k)
495515
{
@@ -543,7 +563,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
543563
}
544564
output = new_output;
545565
}
546-
/* Hangul Decomposition. */
566+
// Hangul Decomposition.
567+
// See section 3.12.2, "Hangul Syllable Decomposition"
568+
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
547569
if (SBase <= code && code < (SBase+SCount)) {
548570
int SIndex = code - SBase;
549571
int L = LBase + SIndex / NCount;

0 commit comments

Comments
 (0)