Skip to content

Commit 887d390

Browse files
Add API to transform into KS X 1026-1 standard Korean syllables
Gated behind the `ks_x_1026-1` feature.
1 parent c24ac7f commit 887d390

File tree

3 files changed

+262
-60
lines changed

3 files changed

+262
-60
lines changed

src/lib.rs

+40-9
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,10 @@ pub use crate::quick_check::{
7373
};
7474
pub use crate::recompose::Recompositions;
7575
pub use crate::replace::Replacements;
76-
pub use crate::standardize_korean_syllables::StandardKoreanSyllables;
76+
pub use crate::standardize_korean_syllables::StandardizeKoreanSyllables;
77+
#[cfg(feature = "ks_x_1026-1")]
78+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
79+
pub use crate::standardize_korean_syllables::StandardizeKoreanSyllablesKsX1026_1;
7780
pub use crate::stream_safe::StreamSafe;
7881
pub use crate::tables::UNICODE_VERSION;
7982
use core::{option, str::Chars};
@@ -148,9 +151,9 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
148151
/// inserted according to the Stream-Safe Text Process ([UAX15-D4](https://unicode.org/reports/tr15/#UAX15-D4))
149152
fn stream_safe(self) -> StreamSafe<I>;
150153

151-
/// An iterator over the string with Hangul choseong and jugseong filler characters inserted
154+
/// An iterator over the string with Hangul choseong and jungseong filler characters inserted
152155
/// to ensure that all Korean syllable blocks are in standard form according to [UAX29](https://www.unicode.org/reports/tr29/#Transforming_Into_SKS).
153-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I>;
156+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<I>;
154157

155158
/// An iterator over the string in the variant of Unicode Normalization Form KD
156159
/// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode
@@ -183,6 +186,12 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
183186
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
184187

185188
fn nfkc_ks_x_1026_1(self) -> RecomposeHangul<Recompositions<NormalizeJamoKdkc<I>>>;
189+
190+
/// An iterator over the string with Hangul choseong and jungseong filler characters inserted
191+
/// to ensure that all Korean syllable blocks are in standard form according to KS X 1026-1 § 7.8.
192+
#[cfg(feature = "ks_x_1026-1")]
193+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
194+
fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1<I>;
186195
}
187196

188197
impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
@@ -217,8 +226,8 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
217226
}
218227

219228
#[inline]
220-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<Chars<'a>> {
221-
StandardKoreanSyllables::new(self.chars())
229+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<Chars<'a>> {
230+
StandardizeKoreanSyllables::new(self.chars())
222231
}
223232

224233
#[cfg(feature = "ks_x_1026-1")]
@@ -243,6 +252,14 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
243252
self.chars(),
244253
)))
245254
}
255+
256+
#[cfg(feature = "ks_x_1026-1")]
257+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
258+
fn standard_korean_syllables_ks_x_1026_1(
259+
self,
260+
) -> StandardizeKoreanSyllablesKsX1026_1<Chars<'a>> {
261+
StandardizeKoreanSyllablesKsX1026_1::new(self.chars())
262+
}
246263
}
247264

248265
impl UnicodeNormalization<option::IntoIter<char>> for char {
@@ -277,8 +294,8 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
277294
}
278295

279296
#[inline]
280-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<option::IntoIter<char>> {
281-
StandardKoreanSyllables::new(Some(self).into_iter())
297+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<option::IntoIter<char>> {
298+
StandardizeKoreanSyllables::new(Some(self).into_iter())
282299
}
283300

284301
#[cfg(feature = "ks_x_1026-1")]
@@ -305,6 +322,14 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
305322
Some(self).into_iter(),
306323
)))
307324
}
325+
326+
#[cfg(feature = "ks_x_1026-1")]
327+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
328+
fn standard_korean_syllables_ks_x_1026_1(
329+
self,
330+
) -> StandardizeKoreanSyllablesKsX1026_1<option::IntoIter<char>> {
331+
StandardizeKoreanSyllablesKsX1026_1::new(Some(self).into_iter())
332+
}
308333
}
309334

310335
impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
@@ -339,8 +364,8 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
339364
}
340365

341366
#[inline]
342-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I> {
343-
StandardKoreanSyllables::new(self)
367+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<I> {
368+
StandardizeKoreanSyllables::new(self)
344369
}
345370

346371
#[cfg(feature = "ks_x_1026-1")]
@@ -363,4 +388,10 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
363388
fn nfkc_ks_x_1026_1(self) -> RecomposeHangul<Recompositions<NormalizeJamoKdkc<I>>> {
364389
RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new(self)))
365390
}
391+
392+
#[cfg(feature = "ks_x_1026-1")]
393+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
394+
fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1<I> {
395+
StandardizeKoreanSyllablesKsX1026_1::new(self)
396+
}
366397
}

src/standardize_korean_syllables.rs

+172-43
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
use core::iter::FusedIterator;
1+
use core::{iter::FusedIterator, marker::PhantomData};
22

33
use tinyvec::ArrayVec;
44

@@ -42,17 +42,27 @@ impl JamoKind {
4242
}
4343
}
4444

45-
/// Iterator over a string's characters, with '\u{115F}' and '\u{1160}' inserted
46-
/// where needed to ensure all Korean syllable blocks are in standard form
47-
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
45+
trait NormalizeKoreanSyllables {
46+
fn insert_fillers(
47+
next_c: Option<char>,
48+
prev_end_jamo_kind: Option<JamoKind>,
49+
next_start_jamo_kind: Option<JamoKind>,
50+
buf: &mut ArrayVec<[Option<char>; 3]>,
51+
) -> Option<char>;
52+
}
53+
54+
// Used to abstract over UAX29 and KS X 1026-1 rules
4855
#[derive(Clone, Debug)]
49-
pub struct StandardKoreanSyllables<I> {
56+
struct StandardizeKoreanSyllablesInner<I, N> {
5057
prev_end_jamo_kind: Option<JamoKind>,
5158
buf: ArrayVec<[Option<char>; 3]>,
5259
inner: I,
60+
normalizer: PhantomData<N>,
5361
}
5462

55-
impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
63+
impl<I: Iterator<Item = char>, N: NormalizeKoreanSyllables> Iterator
64+
for StandardizeKoreanSyllablesInner<I, N>
65+
{
5666
type Item = char;
5767

5868
fn next(&mut self) -> Option<Self::Item> {
@@ -65,7 +75,7 @@ impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
6575
next_c.map_or((None, None), JamoKind::of);
6676
self.prev_end_jamo_kind = next_end_jamo_kind;
6777

68-
insert_fillers(
78+
N::insert_fillers(
6979
next_c,
7080
prev_end_jamo_kind,
7181
next_start_jamo_kind,
@@ -87,50 +97,169 @@ impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
8797
}
8898
}
8999

90-
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StandardKoreanSyllables<I> {}
100+
impl<I: Iterator<Item = char> + FusedIterator, N: NormalizeKoreanSyllables> FusedIterator
101+
for StandardizeKoreanSyllablesInner<I, N>
102+
{
103+
}
91104

92-
#[inline]
93-
fn insert_fillers(
94-
next_c: Option<char>,
95-
prev_end_jamo_kind: Option<JamoKind>,
96-
next_start_jamo_kind: Option<JamoKind>,
97-
buf: &mut ArrayVec<[Option<char>; 3]>,
98-
) -> Option<char> {
99-
match (prev_end_jamo_kind, next_start_jamo_kind) {
100-
// Insert choseong filler before V not preceded by L or V
101-
(None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
102-
buf.push(next_c);
103-
Some('\u{115F}')
104-
}
105-
// Insert choseong and jungseong fillers before T preceded by non-jamo
106-
(None, Some(JamoKind::T)) => {
107-
buf.push(next_c);
108-
buf.push(Some('\u{1160}'));
109-
Some('\u{115F}')
110-
}
111-
// Insert V filler between L and non-jamo
112-
(Some(JamoKind::L), None) => {
113-
buf.push(next_c);
114-
Some('\u{1160}')
105+
impl<I, N> StandardizeKoreanSyllablesInner<I, N> {
106+
#[inline]
107+
fn new(iter: I) -> Self {
108+
Self {
109+
prev_end_jamo_kind: None,
110+
buf: ArrayVec::new(),
111+
inner: iter,
112+
normalizer: PhantomData,
115113
}
116-
// For L followed by T, insert V filler, L filler, then another V filler
117-
(Some(JamoKind::L), Some(JamoKind::T)) => {
118-
buf.push(next_c);
119-
buf.push(Some('\u{1160}'));
120-
buf.push(Some('\u{115F}'));
121-
Some('\u{1160}')
114+
}
115+
}
116+
117+
// UAX 29 normalization
118+
119+
#[derive(Clone, Debug)]
120+
struct Uax29;
121+
122+
impl NormalizeKoreanSyllables for Uax29 {
123+
#[inline]
124+
fn insert_fillers(
125+
next_c: Option<char>,
126+
prev_end_jamo_kind: Option<JamoKind>,
127+
next_start_jamo_kind: Option<JamoKind>,
128+
buf: &mut ArrayVec<[Option<char>; 3]>,
129+
) -> Option<char> {
130+
match (prev_end_jamo_kind, next_start_jamo_kind) {
131+
// Insert choseong filler before V not preceded by L or V
132+
(None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
133+
buf.push(next_c);
134+
Some('\u{115F}')
135+
}
136+
// Insert choseong and jungseong fillers before T preceded by non-jamo
137+
(None, Some(JamoKind::T)) => {
138+
buf.push(next_c);
139+
buf.push(Some('\u{1160}'));
140+
Some('\u{115F}')
141+
}
142+
// Insert V filler between L and non-jamo
143+
(Some(JamoKind::L), None) => {
144+
buf.push(next_c);
145+
Some('\u{1160}')
146+
}
147+
// For L followed by T, insert V filler, L filler, then another V filler
148+
(Some(JamoKind::L), Some(JamoKind::T)) => {
149+
buf.push(next_c);
150+
buf.push(Some('\u{1160}'));
151+
buf.push(Some('\u{115F}'));
152+
Some('\u{1160}')
153+
}
154+
_ => next_c,
122155
}
123-
_ => next_c,
124156
}
125157
}
126158

127-
impl<I> StandardKoreanSyllables<I> {
159+
/// Iterator over a string's characters, with U+115F and U+1160 inserted
160+
/// where needed to ensure all Korean syllable blocks are in standard form
161+
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
162+
#[derive(Clone, Debug)]
163+
pub struct StandardizeKoreanSyllables<I>(StandardizeKoreanSyllablesInner<I, Uax29>);
164+
165+
impl<I> StandardizeKoreanSyllables<I> {
128166
#[inline]
129167
pub(crate) fn new(iter: I) -> Self {
130-
Self {
131-
prev_end_jamo_kind: None,
132-
buf: ArrayVec::new(),
133-
inner: iter,
168+
Self(StandardizeKoreanSyllablesInner::new(iter))
169+
}
170+
}
171+
172+
impl<I: Iterator<Item = char>> Iterator for StandardizeKoreanSyllables<I> {
173+
type Item = char;
174+
175+
fn next(&mut self) -> Option<Self::Item> {
176+
self.0.next()
177+
}
178+
179+
fn size_hint(&self) -> (usize, Option<usize>) {
180+
self.0.size_hint()
181+
}
182+
}
183+
184+
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StandardizeKoreanSyllables<I> {}
185+
186+
// KS X 1026-1 normalization
187+
188+
#[cfg(feature = "ks_x_1026-1")]
189+
#[derive(Clone, Debug)]
190+
struct KsX1026_1;
191+
192+
#[cfg(feature = "ks_x_1026-1")]
193+
impl NormalizeKoreanSyllables for KsX1026_1 {
194+
#[inline]
195+
fn insert_fillers(
196+
next_c: Option<char>,
197+
prev_end_jamo_kind: Option<JamoKind>,
198+
next_start_jamo_kind: Option<JamoKind>,
199+
buf: &mut ArrayVec<[Option<char>; 3]>,
200+
) -> Option<char> {
201+
match (prev_end_jamo_kind, next_start_jamo_kind) {
202+
// Insert choseong filler before V preceded by V, T or non-jamo
203+
(None, Some(JamoKind::V))
204+
| (Some(JamoKind::V), Some(JamoKind::V))
205+
| (Some(JamoKind::T), Some(JamoKind::V)) => {
206+
buf.push(next_c);
207+
Some('\u{115F}')
208+
}
209+
// Insert choseong and jungseong fillers before T preceded by T or non-jamo
210+
(None, Some(JamoKind::T)) | (Some(JamoKind::T), Some(JamoKind::T)) => {
211+
buf.push(next_c);
212+
buf.push(Some('\u{1160}'));
213+
Some('\u{115F}')
214+
}
215+
// Insert V filler between L and non-jamo or other L
216+
(Some(JamoKind::L), None) | (Some(JamoKind::L), Some(JamoKind::L)) => {
217+
buf.push(next_c);
218+
Some('\u{1160}')
219+
}
220+
// For L followed by T, insert V filler, L filler, then another V filler
221+
(Some(JamoKind::L), Some(JamoKind::T)) => {
222+
buf.push(next_c);
223+
buf.push(Some('\u{1160}'));
224+
buf.push(Some('\u{115F}'));
225+
Some('\u{1160}')
226+
}
227+
_ => next_c,
134228
}
135229
}
136230
}
231+
232+
/// Iterator over a string's characters, with U+115F and U+1160 inserted
233+
/// where needed to ensure all Korean syllable blocks are in standard form
234+
/// by [KS X 1026-1](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf) rules.
235+
#[cfg(feature = "ks_x_1026-1")]
236+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
237+
#[derive(Clone, Debug)]
238+
pub struct StandardizeKoreanSyllablesKsX1026_1<I>(StandardizeKoreanSyllablesInner<I, KsX1026_1>);
239+
240+
#[cfg(feature = "ks_x_1026-1")]
241+
impl<I> StandardizeKoreanSyllablesKsX1026_1<I> {
242+
#[inline]
243+
pub(crate) fn new(iter: I) -> Self {
244+
Self(StandardizeKoreanSyllablesInner::new(iter))
245+
}
246+
}
247+
248+
#[cfg(feature = "ks_x_1026-1")]
249+
impl<I: Iterator<Item = char>> Iterator for StandardizeKoreanSyllablesKsX1026_1<I> {
250+
type Item = char;
251+
252+
fn next(&mut self) -> Option<Self::Item> {
253+
self.0.next()
254+
}
255+
256+
fn size_hint(&self) -> (usize, Option<usize>) {
257+
self.0.size_hint()
258+
}
259+
}
260+
261+
#[cfg(feature = "ks_x_1026-1")]
262+
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator
263+
for StandardizeKoreanSyllablesKsX1026_1<I>
264+
{
265+
}

0 commit comments

Comments
 (0)