Skip to content

Commit 4d81507

Browse files
committed
initial commit
1 parent 45a82e5 commit 4d81507

5 files changed

+590
-0
lines changed

hangul.py

+359
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,359 @@
1+
#
2+
# This file is part of KoreanCodecs.
3+
#
4+
# Copyright(C) 2002-2003 Hye-Shik Chang <[email protected]>.
5+
#
6+
# KoreanCodecs is free software; you can redistribute it and/or modify
7+
# it under the terms of the GNU Lesser General Public License as published
8+
# by the Free Software Foundation; either version 2 of the License, or
9+
# (at your option) any later version.
10+
#
11+
# KoreanCodecs is distributed in the hope that it will be useful,
12+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
# GNU Lesser General Public License for more details.
15+
#
16+
# You should have received a copy of the GNU Lesser General Public License
17+
# along with KoreanCodecs; if not, write to the Free Software
18+
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19+
#
20+
# $Id: hangul.py,v 1.2 2003/10/15 19:24:53 perky Exp $
21+
#
22+
23+
class UnicodeHangulError(Exception):
    """Error raised by the hangul helpers in this module for invalid input.

    The message is stored on ``msg`` and is also exactly what both
    ``str()`` and ``repr()`` of the exception return.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg

    def __str__(self):
        return self.msg

    # repr() intentionally shows the bare message as well.
    __repr__ = __str__
33+
34+
# Empty string used throughout this module to mean "no jamo in this slot".
Null = u''

# Interpreters that predate the boolean constants get integer stand-ins.
# Only a missing name is caught (not every exception), and the names are
# installed through globals() because a plain `True = 1` statement is
# rejected at compile time by Python 3, even though this branch would
# never execute there.
try:
    True
except NameError:
    globals()['True'] = 1
    globals()['False'] = 0
40+
41+
class Jaeum:
    """Hangul consonant letters (code points U+3131 through U+314E).

    Each letter is available as a short uppercase attribute.  ``Codes``
    lists all of them in code point order, ``Choseong`` and ``Jongseong``
    are the sub-sequences used to index initial and final consonants, and
    ``MultiElement`` maps doubled/cluster letters to their components.
    """

    G = u'\u3131'
    GG = u'\u3132'
    GS = u'\u3133'
    N = u'\u3134'
    NJ = u'\u3135'
    NH = u'\u3136'
    D = u'\u3137'
    DD = u'\u3138'
    L = u'\u3139'
    LG = u'\u313a'
    LM = u'\u313b'
    LB = u'\u313c'
    LS = u'\u313d'
    LT = u'\u313e'
    LP = u'\u313f'
    LH = u'\u3140'
    M = u'\u3141'
    B = u'\u3142'
    BB = u'\u3143'
    BS = u'\u3144'
    S = u'\u3145'
    SS = u'\u3146'
    NG = u'\u3147'
    J = u'\u3148'
    JJ = u'\u3149'
    C = u'\u314a'
    K = u'\u314b'
    T = u'\u314c'
    P = u'\u314d'
    H = u'\u314e'

    # Every consonant letter, in code point order.
    Codes = (
        G, GG, GS, N, NJ, NH, D, DD, L, LG, LM, LB, LS, LT, LP, LH, M, B,
        BB, BS, S, SS, NG, J, JJ, C, K, T, P, H,
    )
    Width = len(Codes)

    # Consonants usable as a syllable initial; list position is the index.
    Choseong = [
        G, GG, N, D, DD, L, M, B, BB, S, SS, NG, J, JJ, C, K, T, P, H,
    ]

    # Consonants usable as a syllable final; position 0 (Null) = no final.
    Jongseong = [
        Null, G, GG, GS, N, NJ, NH, D, L, LG, LM, LB, LS, LT,
        LP, LH, M, B, BS, S, SS, NG, J, C, K, T, P, H,
    ]

    # Doubled and cluster consonants broken into their simple letters.
    MultiElement = {
        GG: (G, G),
        GS: (G, S),
        NJ: (N, J),
        NH: (N, H),
        DD: (D, D),
        LG: (L, G),
        LM: (L, M),
        LB: (L, B),
        LS: (L, S),
        LT: (L, T),
        LP: (L, P),
        LH: (L, H),
        BB: (B, B),
        BS: (B, S),
        SS: (S, S),
        JJ: (J, J),
    }
65+
66+
67+
class Moeum:
    """Hangul vowel letters (code points U+314F through U+3163).

    Each letter is available as a short uppercase attribute.  ``Codes``
    lists all of them in code point order, ``Jungseong`` is a list copy of
    ``Codes``, and ``MultiElement`` maps compound vowels to their parts.
    """

    A = u'\u314f'
    AE = u'\u3150'
    YA = u'\u3151'
    YAE = u'\u3152'
    EO = u'\u3153'
    E = u'\u3154'
    YEO = u'\u3155'
    YE = u'\u3156'
    O = u'\u3157'
    WA = u'\u3158'
    WAE = u'\u3159'
    OE = u'\u315a'
    YO = u'\u315b'
    U = u'\u315c'
    WEO = u'\u315d'
    WE = u'\u315e'
    WI = u'\u315f'
    YU = u'\u3160'
    EU = u'\u3161'
    YI = u'\u3162'
    I = u'\u3163'

    # Every vowel letter, in code point order.
    Codes = (
        A, AE, YA, YAE, EO, E, YEO, YE, O, WA, WAE, OE, YO,
        U, WEO, WE, WI, YU, EU, YI, I,
    )
    Width = len(Codes)

    # Medial vowels; list position is the index.
    Jungseong = list(Codes)

    # Compound vowels broken into their component letters.
    MultiElement = {
        AE: (A, I),
        YAE: (YA, I),
        YE: (YEO, I),
        WA: (O, A),
        WAE: (O, A, I),
        OE: (O, I),
        WEO: (U, EO),
        WE: (U, E),
        WI: (U, I),
        YI: (EU, I),
    }
85+
86+
# Aliases for your convenience
Choseong = Jaeum.Choseong
Jungseong = Moeum.Jungseong
Jongseong = Jaeum.Jongseong

# Re-export every jamo constant of Jaeum and Moeum as a module-level name.
# Only attributes whose name is all uppercase and at most 3 characters
# long are copied.  The binding goes straight into globals() instead of
# exec-ing a generated "NAME = repr(value)" statement: the resulting names
# are the same, without building and compiling source text.
# list() around items() keeps the concatenation valid where items()
# returns a view rather than a list.
for name, code in (list(Jaeum.__dict__.items()) +
                   list(Moeum.__dict__.items())):
    if name.isupper() and len(name) <= 3:
        globals()[name] = code
del name, code
95+
96+
# Unicode Hangul Syllables Characteristics

# First and last code point accepted as a precomposed syllable.
ZONE = (u'\uAC00', u'\uD7A3')

# Sizes of the three jamo index tables.
NCHOSEONG = len(Choseong)
NJUNGSEONG = len(Jungseong)
NJONGSEONG = len(Jongseong)

# Base code points that table indexes are added to (or subtracted from)
# when converting to and from conjoining jamo.
JBASE_CHOSEONG = u'\u1100'
JBASE_JUNGSEONG = u'\u1161'
JBASE_JONGSEONG = u'\u11A8'

# Placeholders emitted when a syllable has no initial / no medial.
CHOSEONG_FILLER = u'\u115F'
JUNGSEONG_FILLER = u'\u1160'


def _ishangul(code):
    """Tell whether `code` is a syllable in ZONE or a Jaeum/Moeum letter."""
    in_zone = ZONE[0] <= code <= ZONE[1]
    return in_zone or code in Jaeum.Codes or code in Moeum.Codes
113+
114+
# Alternative Suffixes : do not use outside
# Either spelling of a suffix maps to the same 2-tuple; format() picks
# element 1 when the preceding character has a final consonant and
# element 0 otherwise.
ALT_SUFFIXES = {}
for _pair in (
    (u'\ub97c', u'\uc744'),  # reul, eul
    (u'\ub294', u'\uc740'),  # neun, eun
    (u'\uac00', u'\uc774'),  # ga, yi
    (u'\uc640', u'\uacfc'),  # wa, gwa
):
    for _form in _pair:
        ALT_SUFFIXES[_form] = _pair
del _pair, _form

# Ida-Variation Suffixes : do not use outside
# Keyed by the 3-character pattern found in the format string.  Element 1
# is the syllable appended after a final consonant.  Element 0 is what is
# done after a hangul syllable without a final: a number is added to that
# syllable's code point (17 and 4 are the Jongseong positions of B and N),
# an empty string means nothing is added.
IDA_SUFFIXES = {
    u'(\uc774)': (u'', u'\uc774'),  # (yi)da
    u'(\uc785)': (17, u'\uc785'),   # (ip)nida
    u'(\uc778)': (4, u'\uc778'),    # (in)-
}
132+
133+
def isChoseong(u):
    """Return True if `u` is non-empty and every item is in Jaeum.Choseong."""
    if not u:
        return False
    for ch in u:
        if ch not in Jaeum.Choseong:
            return False
    return True
141+
142+
def isJungseong(u):
    """Return True if `u` is non-empty and every item is in Moeum.Jungseong."""
    if not u:
        return False
    for ch in u:
        if ch not in Moeum.Jungseong:
            return False
    return True
150+
151+
def isJongseong(u):
    """Return True if `u` is non-empty and every item is in Jaeum.Jongseong."""
    if not u:
        return False
    for ch in u:
        if ch not in Jaeum.Jongseong:
            return False
    return True
159+
160+
def isJaeum(u):
    """Return True if `u` is non-empty and every item is in Jaeum.Codes."""
    if not u:
        return False
    for ch in u:
        if ch not in Jaeum.Codes:
            return False
    return True
168+
169+
def isMoeum(u):
    """Return True if `u` is non-empty and every item is in Moeum.Codes."""
    if not u:
        return False
    for ch in u:
        if ch not in Moeum.Codes:
            return False
    return True
177+
178+
def ishangul(u):
    """Return True if `u` is non-empty and _ishangul() accepts every item."""
    if not u:
        return False
    for ch in u:
        if not _ishangul(ch):
            return False
    return True
186+
187+
def join(codes):
    """ Join function which makes hangul syllable from jamos

    `codes` is a 3-element sequence (choseong, jungseong, jongseong) of
    letters taken from the Choseong, Jungseong and Jongseong tables.

    If the choseong or the jungseong is empty, whichever of the two is
    present is returned unchanged (the jongseong is ignored in that case).
    Otherwise the syllable at 0xac00 + (cho*NJUNGSEONG + jung)*NJONGSEONG
    + jong is returned, using each letter's position in its table.

    Raises UnicodeHangulError when `codes` does not hold exactly 3
    elements.  A letter missing from its table makes list.index raise
    ValueError.
    """
    # Compare the length by value; an identity test against an int
    # literal only works by accident of small-integer caching.
    if len(codes) != 3:
        raise UnicodeHangulError("needs 3-element tuple")

    cho, jung, jong = codes
    if not cho or not jung: # single jamo
        return cho or jung

    return unichr(
        0xac00 + (
            Choseong.index(cho)*NJUNGSEONG +
            Jungseong.index(jung)
        )*NJONGSEONG + Jongseong.index(jong)
    )
200+
201+
def split(code):
    """ Split function which splits hangul syllable into jamos

    Returns a (choseong, jungseong, jongseong) tuple.  A single consonant
    letter comes back as (code, Null, Null) and a single vowel letter as
    (Null, code, Null).

    Raises UnicodeHangulError unless `code` is exactly one character that
    _ishangul() accepts.
    """
    if len(code) != 1 or not _ishangul(code):
        raise UnicodeHangulError("needs 1 hangul letter")

    # Stand-alone letters fill only their own slot.
    if code in Jaeum.Codes:
        return (code, Null, Null)
    if code in Moeum.Codes:
        return (Null, code, Null)

    # Precomposed syllable: peel the three table indexes off the offset
    # from the first syllable, least significant (final) first.
    offset = ord(code) - 0xac00
    initial_medial, final = divmod(offset, NJONGSEONG)
    initial, medial = divmod(initial_medial, NJUNGSEONG)
    return (Choseong[initial], Jungseong[medial], Jongseong[final])
216+
217+
def conjoin(s):
    """Compose runs of conjoining jamo in `s` into hangul characters.

    An initial consonant (U+1100..U+1112) followed by a medial vowel
    (U+1161..U+1175), and optionally a final consonant (U+11A8..U+11C2),
    is replaced by the syllable join() builds from them.  A lone initial
    or lone medial becomes the matching Choseong/Jungseong letter.  Every
    other character is copied through unchanged.

    Fillers are treated as empty slots: CHOSEONG_FILLER + medial gives the
    vowel letter and initial + JUNGSEONG_FILLER gives the consonant letter.
    The fillers must never be used as table indexes: CHOSEONG_FILLER lies
    far beyond the Choseong table and JUNGSEONG_FILLER sits one below
    JBASE_JUNGSEONG, which would wrap around to the last vowel.
    """
    obuff = []
    ncur = 0
    slen = len(s)

    while ncur < slen:
        c = s[ncur]
        if JBASE_CHOSEONG <= c <= u'\u1112' or c == CHOSEONG_FILLER: # starts with choseong
            if c == CHOSEONG_FILLER:
                cho = Null
            else:
                cho = Choseong[ord(c) - ord(JBASE_CHOSEONG)]

            if ncur+1 < slen and JUNGSEONG_FILLER <= s[ncur+1] <= u'\u1175':
                nxt = s[ncur+1]
                ncur += 1                   # medial slot consumed
                if nxt == JUNGSEONG_FILLER:
                    jung = Null
                else:
                    jung = Jungseong[ord(nxt) - ord(JBASE_JUNGSEONG)]

                if cho and jung:
                    # A final consonant is only absorbed by a full
                    # initial+medial pair; join() would drop it otherwise.
                    if ncur+1 < slen and JBASE_JONGSEONG <= s[ncur+1] <= u'\u11C2':
                        jong = Jongseong[ord(s[ncur+1]) - ord(JBASE_JONGSEONG) + 1]
                        ncur += 1
                    else:
                        jong = Null
                    obuff.append(join([cho, jung, jong]))
                elif cho or jung:
                    # One real jamo paired with a filler: a single letter.
                    obuff.append(cho or jung)
                else:
                    # Two fillers carry no letter; keep them as they are.
                    obuff.append(c + nxt)
            elif cho:
                obuff.append(cho)
            else:
                # CHOSEONG_FILLER with nothing to attach to.
                obuff.append(c)
        elif JBASE_JUNGSEONG <= c <= u'\u1175':
            obuff.append(Jungseong[ord(c) - ord(JBASE_JUNGSEONG)])
        else:
            obuff.append(c)
        ncur += 1

    return u''.join(obuff)
243+
244+
def disjoint(s):
    """Decompose the hangul characters of `s` into conjoining jamo.

    Each character accepted by _ishangul() is split() and written as an
    initial (JBASE_CHOSEONG + index) and a medial (JBASE_JUNGSEONG +
    index), with CHOSEONG_FILLER / JUNGSEONG_FILLER standing in for a
    missing slot, followed by a final (JBASE_JONGSEONG + index - 1) when
    there is one.  All other characters are copied through unchanged.

    Consonant letters that are not in the Choseong table (clusters such
    as GS or BS) are also copied through unchanged, because looking them
    up with Choseong.index() would raise ValueError.
    """
    obuff = []
    for c in s:
        if not _ishangul(c):
            obuff.append(c)
            continue

        cho, jung, jong = split(c)
        if cho and cho not in Choseong:
            # Final-only cluster letter: there is no initial to emit.
            obuff.append(c)
            continue

        if cho:
            obuff.append( unichr(ord(JBASE_CHOSEONG) + Choseong.index(cho)) )
        else:
            obuff.append( CHOSEONG_FILLER )

        if jung:
            obuff.append( unichr(ord(JBASE_JUNGSEONG) + Jungseong.index(jung)) )
        else:
            obuff.append( JUNGSEONG_FILLER )

        if jong:
            # Jongseong[0] is the "no final" slot, hence the -1.
            obuff.append( unichr(ord(JBASE_JONGSEONG) + Jongseong.index(jong) - 1) )
    return u''.join(obuff)
264+
265+
def _has_final(c):
266+
# for internal use only
267+
if u'\uac00' <= c <= u'\ud7a3': # hangul
268+
return 1, (ord(c) - 0xac00) % 28 > 0
269+
else:
270+
return 0, c in u'013678.bklmnptLMNRZ'
271+
272+
# Iterator Emulator for ancient versions before 2.1
# Only the absence of the builtin is handled; any other error propagates.
try:
    iter
except NameError:
    class iter:
        """Minimal stand-in for the builtin: walks `obj` by integer index.

        next() returns obj[0], obj[1], ... on successive calls.  Running
        past the end propagates whatever obj[ptr] raises; no separate
        end-of-iteration exception is raised.
        """
        def __init__(self, obj):
            self.obj = obj
            self.ptr = 0
        def next(self):
            try:
                return self.obj[self.ptr]
            finally:
                # Advance even when the lookup raised.
                self.ptr += 1
285+
286+
# Nested scope lambda emulation for versions before 2.2
# plambda(obj) builds a zero-argument callable returning obj, for
# interpreters where a lambda cannot see its enclosing function's locals.
# On 2.2 and later plambda is None and callers use a real lambda.
import sys
# sys.hexversion is an integer, so it has to be compared with an integer
# literal; comparing it with the string '0x2020000' never tests the version.
if sys.hexversion < 0x2020000:
    class plambda:
        def __init__(self, obj):
            self.obj = obj
        def __call__(self):
            return self.obj
else:
    plambda = None
del sys
297+
298+
def format(fmtstr, *args, **kwargs):
    """Expand %-directives in `fmtstr`, fitting Korean suffixes to the result.

    `fmtstr` is scanned one character at a time:

    * a backslash copies the next character through literally;
    * a `%` directive runs up to its first alphabetic character outside a
      `%(key)` name and is expanded with the `%` operator, using the
      `kwargs` mapping when keyword arguments were given and otherwise the
      next positional argument;
    * `%%` produces a single literal `%`;
    * a character found in ALT_SUFFIXES that directly follows an expanded
      directive is replaced by the variant chosen by _has_final() for the
      last expanded character;
    * a 3-character pattern found in IDA_SUFFIXES that directly follows an
      expanded directive is appended as a syllable, merged into the
      preceding syllable as a final consonant, or dropped, depending on
      _has_final() for the last expanded character.

    Returns the assembled unicode string.  Running out of positional
    arguments propagates the exception raised by the argument iterator.
    """
    if kwargs:
        argget = lambda:kwargs
        if plambda:
            argget = plambda(kwargs)
    else:
        argget = iter(args).next

    obuff = []
    ncur = escape = fmtinpth = 0
    # fmt: directive being collected; ofmt: text of the directive that was
    # expanded immediately before the current character, if any.
    ofmt = fmt = u''

    while ncur < len(fmtstr):
        c = fmtstr[ncur]

        if escape:
            obuff.append(c)
            escape = 0
            ofmt = u''
        elif c == u'\\':
            escape = 1
        elif fmt:
            fmt += c
            if not fmtinpth and c.isalpha():
                ofmt = fmt % argget()
                obuff.append(ofmt)
                fmt = u''
            elif fmtinpth and c == u')':
                fmtinpth = 0
            elif c == u'(':
                fmtinpth = 1
            elif c == u'%' and not fmtinpth:
                # Literal percent sign.  The pending directive has to be
                # cleared here, otherwise the text that follows keeps being
                # collected into it and is later expanded as a bogus
                # directive.  A '%' inside a %(key) name belongs to the key
                # and is not emitted.
                obuff.append(u'%')
                fmt = u''
        elif c == u'%':
            fmt += c
            ofmt = u''
        else:
            # Suffix handling only applies right after an expanded directive.
            alt = ida = None
            if ofmt:
                alt = ALT_SUFFIXES.get(c)
                if alt is None:
                    ida = IDA_SUFFIXES.get(fmtstr[ncur:ncur+3])

            if alt is not None:
                obuff.append(alt[_has_final(ofmt[-1])[1] and 1 or 0])
            elif ida is not None:
                ishan, hasfinal = _has_final(ofmt[-1])

                if hasfinal:
                    obuff.append(ida[1])
                elif ishan:
                    if ida[0]:
                        # Add the final consonant to the preceding syllable.
                        obuff[-1] = obuff[-1][:-1] + unichr(ord(ofmt[-1]) + ida[0])
                else:
                    obuff.append(ida[0] and ida[1])
                ncur += 2   # the pattern is 3 characters long
            else:
                obuff.append(c)

            ofmt = u''

        ncur += 1

    return u''.join(obuff)
359+

0 commit comments

Comments
 (0)