Skip to content

Commit 22cb29f

Browse files
committed
support ignoring characters
1 parent 632af82 commit 22cb29f

File tree

2 files changed, with 27 additions and 5 deletions

2 files changed, with 27 additions and 5 deletions

tests/test_unidecode.py

+18
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,24 @@ def test_ascii(self):
6868

6969
wlog.stop()
7070

71+
72+
def test_ignore(self):
73+
wlog = WarningLogger()
74+
wlog.start("should be ignored")
75+
76+
r = self.unidecode(u'æøåÆØÅ', ignore=u'æøå')
77+
self.assertEqual(r, u'æøåAEOA')
78+
79+
if sys.version_info[0] >= 3:
80+
self.assertEqual(type(r), str)
81+
else:
82+
self.assertEqual(type(r), unicode)
83+
84+
# unicode objects shouldn't raise warnings
85+
self.assertEqual(0, len(wlog.log))
86+
87+
wlog.stop()
88+
7189
def test_bmp(self):
7290
for n in range(0,0x10000):
7391
# skip over surrogate pairs, which throw a warning

unidecode/__init__.py

+9 -5
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def _warn_if_not_unicode(string):
2828
RuntimeWarning, 2)
2929

3030

31-
def unidecode_expect_ascii(string):
31+
def unidecode_expect_ascii(string, ignore=u''):
3232
"""Transliterate a Unicode object into an ASCII string
3333
3434
>>> unidecode(u"\u5317\u4EB0")
@@ -47,30 +47,34 @@ def unidecode_expect_ascii(string):
4747
try:
4848
bytestring = string.encode('ASCII')
4949
except UnicodeEncodeError:
50-
return _unidecode(string)
50+
return _unidecode(string, ignore)
5151
if version_info[0] >= 3:
5252
return string
5353
else:
5454
return bytestring
5555

56-
def unidecode_expect_nonascii(string):
56+
def unidecode_expect_nonascii(string, ignore=u''):
5757
"""Transliterate a Unicode object into an ASCII string
5858
5959
>>> unidecode(u"\u5317\u4EB0")
6060
"Bei Jing "
6161
"""
6262

6363
_warn_if_not_unicode(string)
64-
return _unidecode(string)
64+
return _unidecode(string, ignore)
6565

6666
unidecode = unidecode_expect_ascii
6767

68-
def _unidecode(string):
68+
def _unidecode(string, ignore=u''):
6969
retval = []
7070

7171
for char in string:
7272
codepoint = ord(char)
7373

74+
if char in ignore:
75+
retval.append(char)
76+
continue
77+
7478
if codepoint < 0x80: # Basic ASCII
7579
retval.append(str(char))
7680
continue

0 commit comments

Comments
 (0)