|
| 1 | +# |
| 2 | +# This file is part of KoreanCodecs. |
| 3 | +# |
| 4 | +# Copyright(C) 2002-2003 Hye-Shik Chang <[email protected]>. |
| 5 | +# |
| 6 | +# KoreanCodecs is free software; you can redistribute it and/or modify |
| 7 | +# it under the terms of the GNU Lesser General Public License as published |
| 8 | +# by the Free Software Foundation; either version 2 of the License, or |
| 9 | +# (at your option) any later version. |
| 10 | +# |
| 11 | +# KoreanCodecs is distributed in the hope that it will be useful, |
| 12 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | +# GNU Lesser General Public License for more details. |
| 15 | +# |
| 16 | +# You should have received a copy of the GNU Lesser General Public License |
| 17 | +# along with KoreanCodecs; if not, write to the Free Software |
| 18 | +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 19 | +# |
| 20 | +# $Id: hangul.py,v 1.2 2003/10/15 19:24:53 perky Exp $ |
| 21 | +# |
| 22 | + |
class UnicodeHangulError(Exception):
    """Error raised by the hangul helpers when they are given unusable input.

    The message is stored on ``msg``; both ``str()`` and ``repr()`` of the
    exception return that message unchanged.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg

    # repr() deliberately yields the bare message as well.
    __repr__ = __str__
| 33 | + |
| 34 | +Null = u'' |
| 35 | +try: |
| 36 | + True |
| 37 | +except: |
| 38 | + True = 1 |
| 39 | + False = 0 |
| 40 | + |
class Jaeum:
    """Consonant table (code points U+3131 .. U+314E).

    ``Codes`` lists all thirty consonants in code-point order and each one is
    also available as a short attribute (``G``, ``GG``, ...).  ``Choseong``
    and ``Jongseong`` are the ordered lookup tables used for the initial and
    the final position of a syllable; ``Jongseong`` starts with ``Null`` for
    "no final".  ``MultiElement`` maps a compound consonant to the simple
    consonants it consists of.
    """

    Codes = (
        u'\u3131',  # G
        u'\u3132',  # GG
        u'\u3133',  # GS
        u'\u3134',  # N
        u'\u3135',  # NJ
        u'\u3136',  # NH
        u'\u3137',  # D
        u'\u3138',  # DD
        u'\u3139',  # L
        u'\u313a',  # LG
        u'\u313b',  # LM
        u'\u313c',  # LB
        u'\u313d',  # LS
        u'\u313e',  # LT
        u'\u313f',  # LP
        u'\u3140',  # LH
        u'\u3141',  # M
        u'\u3142',  # B
        u'\u3143',  # BB
        u'\u3144',  # BS
        u'\u3145',  # S
        u'\u3146',  # SS
        u'\u3147',  # NG
        u'\u3148',  # J
        u'\u3149',  # JJ
        u'\u314a',  # C
        u'\u314b',  # K
        u'\u314c',  # T
        u'\u314d',  # P
        u'\u314e',  # H
    )
    Width = len(Codes)

    # One attribute per consonant, in the same order as Codes.
    (G, GG, GS, N, NJ, NH, D, DD, L, LG, LM, LB, LS, LT, LP, LH, M, B,
     BB, BS, S, SS, NG, J, JJ, C, K, T, P, H) = Codes

    # Position in these lists is the index used by the syllable arithmetic.
    Choseong = [
        G, GG, N, D, DD, L, M, B, BB, S,
        SS, NG, J, JJ, C, K, T, P, H,
    ]
    Jongseong = [
        Null, G, GG, GS, N, NJ, NH, D, L, LG,
        LM, LB, LS, LT, LP, LH, M, B, BS, S,
        SS, NG, J, C, K, T, P, H,
    ]

    MultiElement = {
        GG: (G, G),
        GS: (G, S),
        NJ: (N, J),
        NH: (N, H),
        DD: (D, D),
        LG: (L, G),
        LM: (L, M),
        LB: (L, B),
        LS: (L, S),
        LT: (L, T),
        LP: (L, P),
        LH: (L, H),
        BB: (B, B),
        BS: (B, S),
        SS: (S, S),
        JJ: (J, J),
    }
| 65 | + |
| 66 | + |
class Moeum:
    """Vowel table (code points U+314F .. U+3163).

    ``Codes`` lists all twenty-one vowels in code-point order and each one is
    also available as a short attribute (``A``, ``AE``, ...).  ``Jungseong``
    is a list copy of ``Codes`` used as the medial lookup table, and
    ``MultiElement`` maps a compound vowel to the vowels it consists of.
    """

    Codes = (
        u'\u314f',  # A
        u'\u3150',  # AE
        u'\u3151',  # YA
        u'\u3152',  # YAE
        u'\u3153',  # EO
        u'\u3154',  # E
        u'\u3155',  # YEO
        u'\u3156',  # YE
        u'\u3157',  # O
        u'\u3158',  # WA
        u'\u3159',  # WAE
        u'\u315a',  # OE
        u'\u315b',  # YO
        u'\u315c',  # U
        u'\u315d',  # WEO
        u'\u315e',  # WE
        u'\u315f',  # WI
        u'\u3160',  # YU
        u'\u3161',  # EU
        u'\u3162',  # YI
        u'\u3163',  # I
    )
    Width = len(Codes)

    # One attribute per vowel, in the same order as Codes.
    (A, AE, YA, YAE, EO, E, YEO, YE, O, WA, WAE, OE, YO,
     U, WEO, WE, WI, YU, EU, YI, I) = Codes

    Jungseong = list(Codes)

    MultiElement = {
        AE: (A, I),
        YAE: (YA, I),
        YE: (YEO, I),
        WA: (O, A),
        WAE: (O, A, I),
        OE: (O, I),
        WEO: (U, EO),
        WE: (U, E),
        WI: (U, I),
        YI: (EU, I),
    }
| 85 | + |
# Aliases for your convenience
Choseong = Jaeum.Choseong
Jungseong = Moeum.Jungseong
Jongseong = Jaeum.Jongseong

# Re-export every short upper-case jamo attribute of both classes (G, GG,
# A, AE, ...) as a module-level name.  The value is assigned directly rather
# than rebuilt by exec-ing its repr, and each items() result is copied into a
# list so the concatenation does not depend on items() returning a list.
for name, code in list(Jaeum.__dict__.items()) + list(Moeum.__dict__.items()):
    if name.isupper() and len(name) <= 3:
        globals()[name] = code
del name, code
| 95 | + |
# Unicode Hangul Syllables Characteristics

# First and last code point of the precomposed syllable range.
ZONE = (u'\uAC00', u'\uD7A3')

# Sizes of the three lookup tables used by the syllable arithmetic.
NCHOSEONG, NJUNGSEONG, NJONGSEONG = (
    len(Choseong), len(Jungseong), len(Jongseong))

# Base code points that a table index is added to (or subtracted from) when
# converting between table entries and conjoining jamo.  Jongseong indices
# are shifted by one in that conversion because Jongseong[0] is Null.
JBASE_CHOSEONG = u'\u1100'
JBASE_JUNGSEONG = u'\u1161'
JBASE_JONGSEONG = u'\u11A8'

# Placeholders emitted when a syllable slot is empty.
CHOSEONG_FILLER = u'\u115F'
JUNGSEONG_FILLER = u'\u1160'
| 106 | + |
def _ishangul(code):
    """Tell whether ``code`` lies in ZONE or is listed in Jaeum/Moeum Codes."""
    in_syllable_zone = ZONE[0] <= code <= ZONE[1]
    return in_syllable_zone or code in Jaeum.Codes or code in Moeum.Codes
| 113 | + |
# Alternative Suffixes : do not use outside
# Every suffix maps to a pair: element 0 is the form used when the preceding
# character has no final consonant, element 1 the form used when it has one.
# Both members of a pair are registered as keys for that same pair.
ALT_SUFFIXES = {}
for _pair in (
    (u'\ub97c', u'\uc744'),  # reul, eul
    (u'\ub294', u'\uc740'),  # neun, eun
    (u'\uac00', u'\uc774'),  # ga, yi
    (u'\uc640', u'\uacfc'),  # wa, gwa
):
    ALT_SUFFIXES[_pair[0]] = ALT_SUFFIXES[_pair[1]] = _pair
del _pair
| 125 | + |
# Ida-Variation Suffixes : do not use outside
# Key: the three-character parenthesised marker found in a format string.
# Value: (what to do after a character without a final consonant,
#         text to append after a character with a final consonant).
# A numeric first element is added to the code point of a preceding hangul
# syllable; 17 and 4 are the positions of B and N in Jaeum.Jongseong.
IDA_SUFFIXES = {}
IDA_SUFFIXES[u'(\uc774)'] = (u'', u'\uc774')   # (yi)da
IDA_SUFFIXES[u'(\uc785)'] = (17, u'\uc785')    # (ip)nida
IDA_SUFFIXES[u'(\uc778)'] = (4, u'\uc778')     # (in)-
| 132 | + |
def isChoseong(u):
    """Return True if ``u`` is non-empty and every item is in Jaeum.Choseong."""
    if not u:
        return False
    for ch in u:
        if ch not in Jaeum.Choseong:
            return False
    return True
| 141 | + |
def isJungseong(u):
    """Return True if ``u`` is non-empty and every item is in Moeum.Jungseong."""
    if not u:
        return False
    for ch in u:
        if ch not in Moeum.Jungseong:
            return False
    return True
| 150 | + |
def isJongseong(u):
    """Return True if ``u`` is non-empty and every item is in Jaeum.Jongseong."""
    if not u:
        return False
    for ch in u:
        if ch not in Jaeum.Jongseong:
            return False
    return True
| 159 | + |
def isJaeum(u):
    """Return True if ``u`` is non-empty and every item is in Jaeum.Codes."""
    if not u:
        return False
    for ch in u:
        if ch not in Jaeum.Codes:
            return False
    return True
| 168 | + |
def isMoeum(u):
    """Return True if ``u`` is non-empty and every item is in Moeum.Codes."""
    if not u:
        return False
    for ch in u:
        if ch not in Moeum.Codes:
            return False
    return True
| 177 | + |
def ishangul(u):
    """Return True if ``u`` is non-empty and every item passes _ishangul()."""
    if not u:
        return False
    for ch in u:
        if not _ishangul(ch):
            return False
    return True
| 186 | + |
def join(codes):
    """ Join function which makes hangul syllable from jamos

    ``codes`` is a 3-element sequence (choseong, jungseong, jongseong) whose
    items come from the Choseong, Jungseong and Jongseong tables; use
    ``Null`` for an absent jongseong.  When the choseong or the jungseong is
    empty, the other one is returned as a single jamo and the jongseong is
    ignored.

    Raises UnicodeHangulError when ``codes`` does not have exactly three
    elements.  An item missing from its table surfaces as the ValueError
    raised by ``list.index``.
    """
    # Compare by value: identity tests on integers only work by accident.
    if len(codes) != 3:
        raise UnicodeHangulError("needs 3-element tuple")
    if not codes[0] or not codes[1]: # single jamo
        return codes[0] or codes[1]

    return unichr(
        0xac00 + (
            Choseong.index(codes[0])*NJUNGSEONG +
            Jungseong.index(codes[1])
        )*NJONGSEONG + Jongseong.index(codes[2])
    )
| 200 | + |
def split(code):
    """ Split function which splits hangul syllable into jamos

    Returns a (choseong, jungseong, jongseong) tuple.  A lone consonant or
    vowel comes back in the first or second slot with ``Null`` in the others.
    Raises UnicodeHangulError unless ``code`` is exactly one hangul letter.
    """
    if len(code) != 1 or not _ishangul(code):
        raise UnicodeHangulError("needs 1 hangul letter")

    # Single jamo: nothing to decompose.
    if code in Jaeum.Codes:
        return (code, Null, Null)
    if code in Moeum.Codes:
        return (Null, code, Null)

    # Peel the three table indices off the syllable offset, innermost first.
    offset = ord(code) - 0xac00
    rest, jong_index = divmod(offset, NJONGSEONG)
    cho_index, jung_index = divmod(rest, NJUNGSEONG)
    return (Choseong[cho_index], Jungseong[jung_index], Jongseong[jong_index])
| 216 | + |
def conjoin(s):
    """Compose conjoining jamo sequences in ``s`` into hangul letters.

    A choseong (U+1100..U+1112 or CHOSEONG_FILLER) followed by a jungseong
    (JUNGSEONG_FILLER..U+1175) and an optional jongseong (U+11A8..U+11C2) is
    replaced by what join() builds from them.  A choseong or jungseong that
    stands alone is replaced by its Choseong/Jungseong table entry.  Every
    other character is copied unchanged.

    The fillers mean "slot is empty" and are mapped to ``Null`` instead of
    being used as table indices, so filler-padded single jamo (as produced
    by disjoint()) are turned back into that single jamo.
    """
    obuff = []
    ncur = 0
    slen = len(s)

    while ncur < slen:
        c = s[ncur]
        if JBASE_CHOSEONG <= c <= u'\u1112' or c == CHOSEONG_FILLER: # starts with choseong
            if c == CHOSEONG_FILLER:
                cho = Null
            else:
                cho = Choseong[ord(c) - ord(JBASE_CHOSEONG)]

            if slen > ncur+1 and JUNGSEONG_FILLER <= s[ncur+1] <= u'\u1175':
                if s[ncur+1] == JUNGSEONG_FILLER:
                    jung = Null
                else:
                    jung = Jungseong[ord(s[ncur+1]) - ord(JBASE_JUNGSEONG)]

                # A final can only be attached to a complete cho+jung pair;
                # otherwise it is left in place and copied through later.
                if (cho and jung and slen > ncur+2 and
                        JBASE_JONGSEONG <= s[ncur+2] <= u'\u11C2'):
                    # +1 because Jongseong[0] is the "no final" entry
                    jong = Jongseong[ord(s[ncur+2]) - ord(JBASE_JONGSEONG) + 1]
                    ncur += 2
                else:
                    jong = Null
                    ncur += 1
                obuff.append(join([cho, jung, jong]))
            elif cho:
                obuff.append(join([cho, Null, Null]))
            else:
                # lone filler with nothing to attach to: keep it as it is
                obuff.append(c)
        elif JBASE_JUNGSEONG <= c <= u'\u1175':
            obuff.append(join([Null, Jungseong[ord(c) - ord(JBASE_JUNGSEONG)], Null]))
        else:
            obuff.append(c)
        ncur += 1

    return u''.join(obuff)
| 243 | + |
def disjoint(s):
    """Decompose the hangul letters in ``s`` into conjoining jamo.

    Each hangul letter becomes a choseong, a jungseong and, when it has a
    final, a jongseong.  For a single jamo the empty slot is written as
    CHOSEONG_FILLER or JUNGSEONG_FILLER.  Characters that are not hangul are
    copied unchanged, and so are consonants that have no entry in the
    Choseong table (for example GS), since they cannot be written as a
    choseong; looking them up used to raise ValueError.
    """
    obuff = []
    for c in s:
        if not _ishangul(c):
            obuff.append(c)
            continue

        cho, jung, jong = split(c)
        if cho and cho not in Choseong:
            # final-only consonant standing alone: no choseong form exists
            obuff.append(c)
            continue

        if cho:
            obuff.append( unichr(ord(JBASE_CHOSEONG) + Choseong.index(cho)) )
        else:
            obuff.append( CHOSEONG_FILLER )

        if jung:
            obuff.append( unichr(ord(JBASE_JUNGSEONG) + Jungseong.index(jung)) )
        else:
            obuff.append( JUNGSEONG_FILLER )

        if jong:
            # -1 because Jongseong[0] is the "no final" entry
            obuff.append( unichr(ord(JBASE_JONGSEONG) + Jongseong.index(jong) - 1) )
    return u''.join(obuff)
| 264 | + |
| 265 | +def _has_final(c): |
| 266 | + # for internal use only |
| 267 | + if u'\uac00' <= c <= u'\ud7a3': # hangul |
| 268 | + return 1, (ord(c) - 0xac00) % 28 > 0 |
| 269 | + else: |
| 270 | + return 0, c in u'013678.bklmnptLMNRZ' |
| 271 | + |
# Iterator emulator for ancient versions that lack the iter() builtin.
# Only the missing name is caught, so unrelated errors are not hidden.
try:
    iter
except NameError:
    class iter:
        """Minimal stand-in: next() returns obj[0], obj[1], ... in turn."""

        def __init__(self, obj):
            self.obj = obj
            self.ptr = 0

        def next(self):
            # The position advances even when the lookup raises.
            try:
                return self.obj[self.ptr]
            finally:
                self.ptr += 1
| 285 | + |
# Nested scope lambda emulation for versions before 2.2
# sys.hexversion is an integer, so it has to be compared with an integer;
# comparing it with the string '0x2020000' never performs a version check.
import sys
if sys.hexversion < 0x2020000:
    class plambda:
        """Callable that returns the object it was constructed with."""
        def __init__(self, obj):
            self.obj = obj
        def __call__(self):
            return self.obj
else:
    # Nested scopes are available: format() can use a plain lambda.
    plambda = None
del sys
| 297 | + |
def format(fmtstr, *args, **kwargs):
    """%-style formatting that adapts Korean suffixes to the inserted value.

    ``fmtstr`` is scanned character by character:

    * a backslash copies the following character literally;
    * ``%`` opens a conversion spec, which ends at the first alphabetic
      character outside a parenthesised key.  The spec is then applied with
      the ``%`` operator to the whole ``kwargs`` dict when keyword arguments
      were given, otherwise to the next positional argument;
    * ``%%`` produces a single literal ``%``;
    * the character right after an inserted value is looked up in
      ALT_SUFFIXES, and the three characters starting there in IDA_SUFFIXES;
      the variant is chosen by _has_final() of the value's last character.

    Returns the resulting unicode string.  Running out of positional
    arguments surfaces as the exception raised by the argument iterator.
    """
    if kwargs:
        argget = lambda:kwargs
        if plambda:
            argget = plambda(kwargs)
    else:
        argget = iter(args).next

    obuff = []
    ncur = escape = fmtinpth = 0
    # fmt: conversion spec being collected; ofmt: text of the value inserted
    # immediately before the current character ('' when there is none)
    ofmt = fmt = u''

    while ncur < len(fmtstr):
        c = fmtstr[ncur]

        if escape:
            obuff.append(c)
            escape = 0
            ofmt = u''
        elif c == u'\\':
            escape = 1
        elif fmt:
            fmt += c
            if not fmtinpth and c.isalpha():
                ofmt = fmt % argget()
                obuff.append(ofmt)
                fmt = u''
            elif fmtinpth and c == u')':
                fmtinpth = 0
            elif c == u'(':
                fmtinpth = 1
            elif c == u'%' and not fmtinpth:
                # literal percent sign: the spec ends here.  Without the
                # reset the following text would be collected into a bogus
                # conversion.  A '%' inside a parenthesised key belongs to
                # the key and produces no output.
                obuff.append(u'%')
                fmt = u''
        elif c == u'%':
            fmt += c
            ofmt = u''
        else:
            if ofmt and ALT_SUFFIXES.has_key(c):
                obuff.append(ALT_SUFFIXES[c][
                    _has_final(ofmt[-1])[1] and 1 or 0
                ])
            elif ofmt and IDA_SUFFIXES.has_key(fmtstr[ncur:ncur+3]):
                sel = IDA_SUFFIXES[fmtstr[ncur:ncur+3]]
                ishan, hasfinal = _has_final(ofmt[-1])

                if hasfinal:
                    obuff.append(sel[1])
                elif ishan:
                    if sel[0]:
                        # add the final consonant to the open syllable that
                        # ends the inserted value
                        obuff[-1] = obuff[-1][:-1] + unichr(ord(ofmt[-1]) + sel[0])
                else:
                    obuff.append(sel[0] and sel[1])
                ncur += 2  # the marker is three characters long
            else:
                obuff.append(c)

            ofmt = u''

        ncur += 1

    return u''.join(obuff)
| 359 | + |
0 commit comments