@@ -9,6 +9,15 @@ namespace Rust {
9
9
typedef uint32_t codepoint_t;
typedef std::vector<codepoint_t> string_t;

// Parameters for composing and decomposing Hangul syllables arithmetically.
// See "Sample Code for Hangul Algorithms" in section 3.12 of
// unicode.org/versions/Unicode15.0.0/ch03.pdf

// First code point of the precomposed syllable block.
const uint32_t S_BASE = 0xAC00;

// Bases of the leading-consonant (L), vowel (V) and trailing-consonant (T)
// jamo ranges.  A trailing index of 0 (i.e. T_BASE itself) stands for
// "no trailing consonant".
const uint32_t L_BASE = 0x1100;
const uint32_t V_BASE = 0x1161;
const uint32_t T_BASE = 0x11A7;

// Number of values in each jamo range.
const uint32_t L_COUNT = 19;
const uint32_t V_COUNT = 21;
const uint32_t T_COUNT = 28;

// Syllables per leading consonant, and syllables in total.
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;
20
+
12
21
template <std::size_t SIZE>
13
22
int64_t
14
23
binary_search_ranges (
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
115
124
// Canonically decompose every code point of S and return the result.
//
// Precomposed Hangul syllables (S_BASE .. S_BASE + S_COUNT - 1) are split
// arithmetically into a leading consonant, a vowel and, when present, a
// trailing consonant.  Every other code point is passed to
// recursive_decomp_cano together with the output buffer.
string_t
decomp_cano (string_t s)
{
  string_t buf;
  // Only a capacity hint; the buffer still grows when code points expand.
  buf.reserve (s.size ());

  for (codepoint_t c : s)
    {
      // Explicit range test: code points are unsigned, so `c - S_BASE` can
      // never be negative and must not be used to detect "below the block".
      if (c >= S_BASE && c < S_BASE + S_COUNT)
        {
          // decompose Hangul algorithmically
          const uint32_t s_index = c - S_BASE;
          const uint32_t t_index = s_index % T_COUNT;

          buf.push_back (L_BASE + s_index / N_COUNT);
          buf.push_back (V_BASE + (s_index % N_COUNT) / T_COUNT);
          // Trailing index 0 means the syllable has no trailing consonant.
          if (t_index != 0)
            buf.push_back (T_BASE + t_index);
          continue;
        }

      // Current character is not a Hangul syllable.
      recursive_decomp_cano (c, buf);
    }
  return buf;
}
124
149
@@ -132,7 +157,7 @@ sort_cano (string_t &s)
132
157
{
133
158
cc_here = lookup_cc (s[i]);
134
159
cc_prev = lookup_cc (s[i - 1 ]);
135
- if (cc_here >= 0 && cc_prev > cc_here)
160
+ if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
136
161
{
137
162
// swap
138
163
int tmp = s[i];
@@ -145,45 +170,100 @@ sort_cano (string_t &s)
145
170
}
146
171
147
172
// Compose conjoining Hangul jamo in S into precomposed syllables and return
// the resulting string.
//
// Two adjacent-pair rules are applied while scanning left to right, each
// against the most recently emitted code point:
//   L  + V => LV   (leading consonant followed by a vowel)
//   LV + T => LVT  (syllable without trailing consonant followed by one)
// Anything else is copied through unchanged.  Strings shorter than two code
// points are returned as they are.
string_t
compose_hangul (string_t s)
{
  if (s.size () < 2)
    return s;

  string_t buf;
  buf.reserve (s.size ());

  codepoint_t last = s[0];
  buf.push_back (last);
  for (std::size_t src_pos = 1; src_pos < s.size (); src_pos++)
    {
      const codepoint_t ch = s[src_pos];

      // Range tests are written out explicitly: code points are unsigned,
      // so a difference such as `last - L_BASE` can never be negative.

      // L V => LV
      const bool last_is_l = last >= L_BASE && last < L_BASE + L_COUNT;
      const bool ch_is_v = ch >= V_BASE && ch < V_BASE + V_COUNT;
      if (last_is_l && ch_is_v)
        {
          const uint32_t l_index = last - L_BASE;
          const uint32_t v_index = ch - V_BASE;
          last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
          // Overwrite the L that was emitted on the previous step.
          buf.back () = last;
          continue;
        }

      // LV T => LVT
      const bool last_is_lv = last >= S_BASE && last < S_BASE + S_COUNT
                              && (last - S_BASE) % T_COUNT == 0;
      // T_BASE itself is excluded: index 0 means "no trailing consonant".
      const bool ch_is_t = ch > T_BASE && ch < T_BASE + T_COUNT;
      if (last_is_lv && ch_is_t)
        {
          last += ch - T_BASE;
          // Overwrite the LV that was emitted on the previous step.
          buf.back () = last;
          continue;
        }

      last = ch;
      buf.push_back (last);
    }
  return buf;
}
219
+
220
+ string_t
221
+ recomp (string_t s)
222
+ {
223
+ // compose hangul first
224
+ s = compose_hangul (s);
225
+
226
+ string_t buf;
227
+ if (s.size () < 2 )
228
+ return s;
229
+
230
+ int last_class = -1 ;
231
+ // int starter_pos = 0; // Assume the first character is Starter. Correct?
232
+ // int target_pos = 1;
233
+ codepoint_t starter_ch = s[0 ];
234
+
235
+ for (unsigned int src_pos = 1 ; src_pos < s.size (); src_pos++)
236
+ {
237
+ // get current character
238
+ codepoint_t ch = s[src_pos];
239
+
240
+ int ch_class = lookup_cc (ch);
241
+ tl::optional<codepoint_t > composite = lookup_recomp (starter_ch, ch);
242
+ if (composite.has_value () && last_class < ch_class)
243
+ {
244
+ // ch can be composed
245
+ buf.push_back (composite.value ());
246
+ starter_ch = composite.value ();
247
+ }
248
+ else if (ch_class == 0 )
249
+ {
250
+ // ch is Starter and cannot be composed.
251
+ if (src_pos == 1 )
252
+ // FIXME: buggy?
253
+ buf.push_back (starter_ch);
254
+ starter_ch = ch;
255
+ last_class = -1 ;
256
+ buf.push_back (ch);
257
+ }
258
+ else
259
+ {
260
+ if (src_pos == 1 )
261
+ // FIXME: buggy?
262
+ buf.push_back (starter_ch);
263
+ // ch is not Starter.
264
+ last_class = ch_class;
265
+ buf.push_back (ch);
266
+ }
187
267
}
188
268
return buf;
189
269
}
@@ -256,6 +336,16 @@ rust_utf8_normalize_test ()
256
336
assert_normalize ({0x1e0c , 0x0307 }, {0x1e0c , 0x0307 });
257
337
assert_normalize ({0x0044 , 0x0307 , 0x0323 }, {0x1e0c , 0x0307 });
258
338
339
+ // testcases for Hangul from Part0
340
+ assert_normalize ({0x1100 , 0xac00 , 0x11a8 }, {0x1100 , 0xac01 });
341
+ assert_normalize ({0x1100 , 0xac00 , 0x11a8 , 0x11a8 }, {0x1100 , 0xac01 , 0x11a8 });
342
+ // testcases for Hangul from Part1
343
+ assert_normalize ({0x3131 }, {0x3131 });
344
+ assert_normalize ({0x3132 }, {0x3132 });
345
+ // testcases for Hangul from Part3
346
+ assert_normalize ({0x1100 , 0x0334 , 0x1161 }, {0x1100 , 0x0334 , 0x1161 });
347
+ assert_normalize ({0xac54 , 0x0334 , 0x11ae }, {0xac54 , 0x0334 , 0x11ae });
348
+
259
349
// TODO: add more testcases in
260
350
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
261
351
}
0 commit comments