@@ -9,6 +9,15 @@ namespace Rust {
99typedef uint32_t codepoint_t ;
1010typedef std::vector<codepoint_t > string_t ;
1111
12+ // These constants are used to compose and decompose Hangul syllables
13+ // algorithmically. See `Sample Code for Hangul Algorithms` in Section 3.12 of
14+ // unicode.org/versions/Unicode15.0.0/ch03.pdf
15+ const uint32_t S_BASE = 0xAC00 ; // first code point of the precomposed syllable range
16+ const uint32_t L_BASE = 0x1100 , V_BASE = 0x1161 , T_BASE = 0x11A7 ; // bases of leading (L), vowel (V) and trailing (T) jamo; T index 0 means "no trailing jamo"
17+ const uint32_t L_COUNT = 19 , V_COUNT = 21 , T_COUNT = 28 ; // size of each jamo index range (T_COUNT includes the "no trailing jamo" slot)
18+ const uint32_t N_COUNT = V_COUNT * T_COUNT; // syllables sharing one leading jamo
19+ const uint32_t S_COUNT = L_COUNT * N_COUNT; // total number of precomposed syllables
20+
1221template <std::size_t SIZE>
1322int64_t
1423binary_search_ranges (
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
115124string_t
116125decomp_cano (string_t s)
117126{
118- // TODO: Algorithmic lookup for Hangul
119127 string_t buf;
120128 for (codepoint_t c : s)
121- recursive_decomp_cano (c, buf);
129+ {
130+ int64_t s_index = c - S_BASE;
131+ if (0 <= s_index && s_index < S_COUNT)
132+ {
133+ // decompose Hangul argorithmically
134+ uint32_t l = L_BASE + s_index / N_COUNT;
135+ uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
136+ uint32_t t = T_BASE + s_index % T_COUNT;
137+ buf.push_back (l);
138+ buf.push_back (v);
139+ if (t != T_BASE)
140+ buf.push_back (t);
141+ continue ;
142+ }
143+
144+ // Current character is not hangul
145+ recursive_decomp_cano (c, buf);
146+ }
122147 return buf;
123148}
124149
@@ -132,7 +157,7 @@ sort_cano (string_t &s)
132157 {
133158 cc_here = lookup_cc (s[i]);
134159 cc_prev = lookup_cc (s[i - 1 ]);
135- if (cc_here >= 0 && cc_prev > cc_here)
160+ if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
136161 {
137162 // swap
138163 int tmp = s[i];
@@ -145,45 +170,100 @@ sort_cano (string_t &s)
145170}
146171
147172string_t
148- recomp (string_t s)
173+ compose_hangul (string_t s)
149174{
150- // TODO: Algorithmic lookup for Hangul
151175 string_t buf;
152- if (s.size () > 0 )
176+ if (s.size () < 2 )
177+ return s;
178+
179+ codepoint_t last = s[0 ];
180+ buf.push_back (last);
181+ for (unsigned int src_pos = 1 ; src_pos < s.size (); src_pos++)
153182 {
154- int last_class = -1 ;
155- // Assume the first character is Starter.
156- codepoint_t starter_ch = s[0 ];
157- for (unsigned int src_pos = 1 ; src_pos < s.size (); src_pos++)
183+ codepoint_t ch = s[src_pos];
184+
185+ // L V => LV
186+ int64_t l_index = last - L_BASE;
187+ if (0 <= l_index && l_index < L_COUNT)
158188 {
159- // get current character
160- codepoint_t ch = s[src_pos];
161- int ch_class = lookup_cc (ch);
162- tl::optional<codepoint_t > composite = lookup_recomp (starter_ch, ch);
163- if (composite.has_value () && last_class < ch_class)
189+ int64_t v_index = ch - V_BASE;
190+ if (0 <= v_index && v_index < V_COUNT)
164191 {
165- // ch can be composed
166- buf.push_back (composite.value ());
167- starter_ch = composite.value ();
192+ last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
193+ // pop L
194+ buf.pop_back ();
195+ buf.push_back (last);
196+ continue ;
168197 }
169- else if (ch_class == 0 )
170- {
171- // ch is Starter and cannot be composed.
172- if (src_pos == 1 )
173- // FIXME: buggy?
174- buf.push_back (starter_ch);
175- // starter_pos = target_pos;
176- starter_ch = ch;
177- last_class = -1 ;
178- buf.push_back (ch);
179- }
180- else
198+ }
199+
200+ // LV T => LVT
201+ int64_t s_index = last - S_BASE;
202+ if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0 )
203+ {
204+ int64_t t_index = ch - T_BASE;
205+ if (0 < t_index && t_index < T_COUNT)
181206 {
182- // ch is not Starter.
183- last_class = ch_class;
184- buf.push_back (ch);
207+ last += t_index;
208+ // pop LV
209+ buf.pop_back ();
210+ buf.push_back (last);
211+ continue ;
185212 }
186213 }
214+ last = ch;
215+ buf.push_back (last);
216+ }
217+ return buf;
218+ }
219+
220+ string_t
221+ recomp (string_t s)
222+ {
223+ // compose hangul first
224+ s = compose_hangul (s);
225+
226+ string_t buf;
227+ if (s.size () < 2 )
228+ return s;
229+
230+ int last_class = -1 ;
231+ // int starter_pos = 0; // Assume the first character is Starter. Correct?
232+ // int target_pos = 1;
233+ codepoint_t starter_ch = s[0 ];
234+
235+ for (unsigned int src_pos = 1 ; src_pos < s.size (); src_pos++)
236+ {
237+ // get current character
238+ codepoint_t ch = s[src_pos];
239+
240+ int ch_class = lookup_cc (ch);
241+ tl::optional<codepoint_t > composite = lookup_recomp (starter_ch, ch);
242+ if (composite.has_value () && last_class < ch_class)
243+ {
244+ // ch can be composed
245+ buf.push_back (composite.value ());
246+ starter_ch = composite.value ();
247+ }
248+ else if (ch_class == 0 )
249+ {
250+ // ch is Starter and cannot be composed.
251+ if (src_pos == 1 )
252+ // FIXME: buggy?
253+ buf.push_back (starter_ch);
254+ starter_ch = ch;
255+ last_class = -1 ;
256+ buf.push_back (ch);
257+ }
258+ else
259+ {
260+ if (src_pos == 1 )
261+ // FIXME: buggy?
262+ buf.push_back (starter_ch);
263+ // ch is not Starter.
264+ last_class = ch_class;
265+ buf.push_back (ch);
266+ }
187267 }
188268 return buf;
189269}
@@ -256,6 +336,16 @@ rust_utf8_normalize_test ()
256336 assert_normalize ({0x1e0c , 0x0307 }, {0x1e0c , 0x0307 });
257337 assert_normalize ({0x0044 , 0x0307 , 0x0323 }, {0x1e0c , 0x0307 });
258338
339+ // testcases for Hangul from Part0
340+ assert_normalize ({0x1100 , 0xac00 , 0x11a8 }, {0x1100 , 0xac01 });
341+ assert_normalize ({0x1100 , 0xac00 , 0x11a8 , 0x11a8 }, {0x1100 , 0xac01 , 0x11a8 });
342+ // testcases for Hangul from Part1
343+ assert_normalize ({0x3131 }, {0x3131 });
344+ assert_normalize ({0x3132 }, {0x3132 });
345+ // testcases for Hangul from Part3
346+ assert_normalize ({0x1100 , 0x0334 , 0x1161 }, {0x1100 , 0x0334 , 0x1161 });
347+ assert_normalize ({0xac54 , 0x0334 , 0x11ae }, {0xac54 , 0x0334 , 0x11ae });
348+
259349 // TODO: add more testcases in
260350 // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
261351}
0 commit comments