
Commit f78dd97

tamaroning authored and CohenArthur committed
gccrs: Normalize Hangul to NFC
gcc/rust/ChangeLog:

	* util/rust-unicode.cc (decomp_cano): Decompose Hangul.
	(sort_cano): Fix bounds check.
	(recomp): Use `compose_hangul`.
	(compose_hangul): Compose Hangul.
	(rust_utf8_normalize_test): Add tests.

Signed-off-by: Raiki Tamura <[email protected]>
1 parent 4f1838b · commit f78dd97

File tree

1 file changed: +123 −33

gcc/rust/util/rust-unicode.cc

+123 −33
@@ -9,6 +9,15 @@ namespace Rust {
 typedef uint32_t codepoint_t;
 typedef std::vector<codepoint_t> string_t;
 
+// These constants are used to compose and decompose Hangul syllables.
+// See `Sample Code for Hangul Algorithms` in 3.12 of
+// unicode.org/versions/Unicode15.0.0/ch03.pdf
+const uint32_t S_BASE = 0xAC00;
+const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
+const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
+const uint32_t N_COUNT = V_COUNT * T_COUNT;
+const uint32_t S_COUNT = L_COUNT * N_COUNT;
+
 template <std::size_t SIZE>
 int64_t
 binary_search_ranges (
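A quick sanity check of these constants (a standalone sketch, not part of the commit; the `main` and the chosen codepoints are illustrative): composing U+1100, U+1161 and U+11A8 with the arithmetic from Unicode 3.12 should land on U+AC01, inside the precomposed block of S_COUNT = 11172 syllables starting at S_BASE.

#include <cassert>
#include <cstdint>

// Same values as the constants introduced by the patch.
const uint32_t S_BASE = 0xAC00, L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
const uint32_t N_COUNT = V_COUNT * T_COUNT; // 588
const uint32_t S_COUNT = L_COUNT * N_COUNT; // 11172

int main ()
{
  // L (U+1100) + V (U+1161) gives the LV syllable U+AC00; adding T (U+11A8) gives U+AC01.
  uint32_t lv = S_BASE + ((0x1100 - L_BASE) * V_COUNT + (0x1161 - V_BASE)) * T_COUNT;
  uint32_t lvt = lv + (0x11A8 - T_BASE);
  assert (lv == 0xAC00 && lvt == 0xAC01);
  assert (S_COUNT == 11172);
  return 0;
}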
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
 string_t
 decomp_cano (string_t s)
 {
-  // TODO: Algorithmic lookup for Hangul
   string_t buf;
   for (codepoint_t c : s)
-    recursive_decomp_cano (c, buf);
+    {
+      int64_t s_index = c - S_BASE;
+      if (0 <= s_index && s_index < S_COUNT)
+        {
+          // decompose Hangul algorithmically
+          uint32_t l = L_BASE + s_index / N_COUNT;
+          uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
+          uint32_t t = T_BASE + s_index % T_COUNT;
+          buf.push_back (l);
+          buf.push_back (v);
+          if (t != T_BASE)
+            buf.push_back (t);
+          continue;
+        }
+
+      // Current character is not Hangul
+      recursive_decomp_cano (c, buf);
+    }
   return buf;
 }
 
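The Hangul branch above can be checked in isolation. Below is a minimal standalone mirror of it (the `decompose_hangul` helper and `main` are illustrative names, not part of the patch), using U+D4DB, which decomposes canonically to U+1111 U+1171 U+11B6.

#include <cassert>
#include <cstdint>
#include <vector>

const uint32_t S_BASE = 0xAC00, L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
const uint32_t N_COUNT = V_COUNT * T_COUNT, S_COUNT = L_COUNT * N_COUNT;

// Standalone mirror of the branch added to decomp_cano above.
static std::vector<uint32_t>
decompose_hangul (uint32_t c)
{
  int64_t s_index = (int64_t) c - S_BASE;
  if (s_index < 0 || s_index >= S_COUNT)
    return {c}; // not a precomposed Hangul syllable
  uint32_t si = (uint32_t) s_index;
  uint32_t l = L_BASE + si / N_COUNT;
  uint32_t v = V_BASE + (si % N_COUNT) / T_COUNT;
  uint32_t t = T_BASE + si % T_COUNT;
  if (t == T_BASE)
    return {l, v}; // LV syllable: no trailing consonant
  return {l, v, t}; // LVT syllable
}

int main ()
{
  // U+D4DB decomposes to U+1111 U+1171 U+11B6.
  assert ((decompose_hangul (0xD4DB) == std::vector<uint32_t>{0x1111, 0x1171, 0x11B6}));
  // U+AC00 is an LV syllable, so only two jamo come back.
  assert ((decompose_hangul (0xAC00) == std::vector<uint32_t>{0x1100, 0x1161}));
  return 0;
}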
@@ -132,7 +157,7 @@ sort_cano (string_t &s)
     {
       cc_here = lookup_cc (s[i]);
       cc_prev = lookup_cc (s[i - 1]);
-      if (cc_here >= 0 && cc_prev > cc_here)
+      if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
         {
           // swap
           int tmp = s[i];
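The tightened condition matters because starters carry combining class 0: under the old check `cc_here >= 0 && cc_prev > cc_here`, a starter following a combining mark would be swapped backwards, which canonical ordering never does. A single bubble pass with the corrected check is sketched below (standalone, not part of the commit; `cc_of` is a hypothetical stand-in for `lookup_cc`, hard-coded for two marks).

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in for lookup_cc: combining classes for the codepoints
// used below; everything else is treated as a starter (class 0).
static int
cc_of (uint32_t c)
{
  if (c == 0x0323) return 220; // COMBINING DOT BELOW
  if (c == 0x0307) return 230; // COMBINING DOT ABOVE
  return 0;
}

int main ()
{
  // D + dot-above + dot-below: the two marks must be reordered (220 < 230),
  // but neither may be swapped across the starter 'D' (class 0).
  std::vector<uint32_t> s = {0x0044, 0x0307, 0x0323};
  for (unsigned int i = 1; i < s.size (); i++)
    {
      int cc_here = cc_of (s[i]);
      int cc_prev = cc_of (s[i - 1]);
      // Corrected condition from the patch: only swap two non-starters
      // that are out of canonical order.
      if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
        std::swap (s[i], s[i - 1]);
    }
  assert ((s == std::vector<uint32_t>{0x0044, 0x0323, 0x0307}));
  return 0;
}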
@@ -145,45 +170,100 @@ sort_cano (string_t &s)
 }
 
 string_t
-recomp (string_t s)
+compose_hangul (string_t s)
 {
-  // TODO: Algorithmic lookup for Hangul
   string_t buf;
-  if (s.size () > 0)
+  if (s.size () < 2)
+    return s;
+
+  codepoint_t last = s[0];
+  buf.push_back (last);
+  for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     {
-      int last_class = -1;
-      // Assume the first character is Starter.
-      codepoint_t starter_ch = s[0];
-      for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
+      codepoint_t ch = s[src_pos];
+
+      // L V => LV
+      int64_t l_index = last - L_BASE;
+      if (0 <= l_index && l_index < L_COUNT)
         {
-          // get current character
-          codepoint_t ch = s[src_pos];
-          int ch_class = lookup_cc (ch);
-          tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
-          if (composite.has_value () && last_class < ch_class)
+          int64_t v_index = ch - V_BASE;
+          if (0 <= v_index && v_index < V_COUNT)
             {
-              // ch can be composed
-              buf.push_back (composite.value ());
-              starter_ch = composite.value ();
+              last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
+              // pop L
+              buf.pop_back ();
+              buf.push_back (last);
+              continue;
             }
-          else if (ch_class == 0)
-            {
-              // ch is Starter and cannot be composed.
-              if (src_pos == 1)
-                // FIXME: buggy?
-                buf.push_back (starter_ch);
-              // starter_pos = target_pos;
-              starter_ch = ch;
-              last_class = -1;
-              buf.push_back (ch);
-            }
-          else
+        }
+
+      // LV T => LVT
+      int64_t s_index = last - S_BASE;
+      if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
+        {
+          int64_t t_index = ch - T_BASE;
+          if (0 < t_index && t_index < T_COUNT)
             {
-              // ch is not Starter.
-              last_class = ch_class;
-              buf.push_back (ch);
+              last += t_index;
+              // pop LV
+              buf.pop_back ();
+              buf.push_back (last);
+              continue;
             }
         }
+      last = ch;
+      buf.push_back (last);
+    }
+  return buf;
+}
+
+string_t
+recomp (string_t s)
+{
+  // compose hangul first
+  s = compose_hangul (s);
+
+  string_t buf;
+  if (s.size () < 2)
+    return s;
+
+  int last_class = -1;
+  // int starter_pos = 0; // Assume the first character is Starter. Correct?
+  // int target_pos = 1;
+  codepoint_t starter_ch = s[0];
+
+  for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
+    {
+      // get current character
+      codepoint_t ch = s[src_pos];
+
+      int ch_class = lookup_cc (ch);
+      tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
+      if (composite.has_value () && last_class < ch_class)
+        {
+          // ch can be composed
+          buf.push_back (composite.value ());
+          starter_ch = composite.value ();
+        }
+      else if (ch_class == 0)
+        {
+          // ch is Starter and cannot be composed.
+          if (src_pos == 1)
+            // FIXME: buggy?
+            buf.push_back (starter_ch);
+          starter_ch = ch;
+          last_class = -1;
+          buf.push_back (ch);
+        }
+      else
+        {
+          if (src_pos == 1)
+            // FIXME: buggy?
+            buf.push_back (starter_ch);
+          // ch is not Starter.
+          last_class = ch_class;
+          buf.push_back (ch);
+        }
     }
   return buf;
 }
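The pairwise logic in compose_hangul (first L V to LV, then LV T to LVT, everything else passed through) can be exercised on its own. The sketch below mirrors that behaviour under the same constants; `compose_hangul_sketch` and `main` are illustrative names, not part of the patch.

#include <cassert>
#include <cstdint>
#include <vector>

const uint32_t S_BASE = 0xAC00, L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
const uint32_t N_COUNT = V_COUNT * T_COUNT, S_COUNT = L_COUNT * N_COUNT;

// Illustrative mirror of compose_hangul: join adjacent L+V into LV and
// LV+T into LVT, leaving other codepoints untouched.
static std::vector<uint32_t>
compose_hangul_sketch (const std::vector<uint32_t> &s)
{
  if (s.size () < 2)
    return s;
  std::vector<uint32_t> buf;
  uint32_t last = s[0];
  buf.push_back (last);
  for (size_t i = 1; i < s.size (); i++)
    {
      uint32_t ch = s[i];
      int64_t l_index = (int64_t) last - L_BASE;
      int64_t v_index = (int64_t) ch - V_BASE;
      if (0 <= l_index && l_index < L_COUNT && 0 <= v_index && v_index < V_COUNT)
        {
          // L V => LV
          last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
          buf.back () = last;
          continue;
        }
      int64_t s_index = (int64_t) last - S_BASE;
      int64_t t_index = (int64_t) ch - T_BASE;
      if (0 <= s_index && s_index < S_COUNT && s_index % T_COUNT == 0
          && 0 < t_index && t_index < T_COUNT)
        {
          // LV T => LVT
          last += t_index;
          buf.back () = last;
          continue;
        }
      last = ch;
      buf.push_back (last);
    }
  return buf;
}

int main ()
{
  // L V T composes pairwise to the single LVT syllable U+AC01.
  assert ((compose_hangul_sketch ({0x1100, 0x1161, 0x11A8}) == std::vector<uint32_t>{0xAC01}));
  // A trailing consonant after an already-complete LVT syllable is left alone.
  assert ((compose_hangul_sketch ({0xAC01, 0x11A8}) == std::vector<uint32_t>{0xAC01, 0x11A8}));
  return 0;
}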
@@ -256,6 +336,16 @@ rust_utf8_normalize_test ()
   assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
   assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
 
+  // testcases for Hangul from Part0
+  assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
+  assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
+  // testcases for Hangul from Part1
+  assert_normalize ({0x3131}, {0x3131});
+  assert_normalize ({0x3132}, {0x3132});
+  // testcases for Hangul from Part3
+  assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
+  assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
+
   // TODO: add more testcases in
   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
 }
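The remaining TODO points at the full Unicode test file. A rough sketch of turning one of its data lines into input for assert_normalize is shown below, assuming the file's documented five-column layout (source;NFC;NFD;NFKC;NFKD, each a space-separated list of hex codepoints); the parsing helper and the example line are illustrative, not part of the commit.

#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Parse one space-separated list of hex codepoints, e.g. "1100 AC00 11A8".
static std::vector<uint32_t>
parse_codepoints (const std::string &field)
{
  std::vector<uint32_t> out;
  std::istringstream iss (field);
  std::string tok;
  while (iss >> tok)
    out.push_back (std::stoul (tok, nullptr, 16));
  return out;
}

int main ()
{
  // A line shaped like the file's entries; the values match the first
  // Hangul testcase added above.
  std::string line
    = "1100 AC00 11A8;1100 AC01;1100 1100 1161 11A8;1100 AC01;1100 1100 1161 11A8";
  std::vector<std::vector<uint32_t>> cols;
  std::istringstream iss (line);
  std::string field;
  while (std::getline (iss, field, ';'))
    cols.push_back (parse_codepoints (field));
  // cols[0] is the source sequence and cols[1] its NFC form, so the
  // corresponding check would be assert_normalize (cols[0], cols[1]).
  return 0;
}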
