Skip to content

Add new unidecode_translate method #79

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import timeit

def main():
print "unidecode_expect_ascii, ASCII string"
print("unidecode_expect_ascii, ASCII string")
timeit.main([
'-s',
'from unidecode import unidecode_expect_ascii',
'unidecode_expect_ascii(u"Hello, World")'])

print "unidecode_expect_ascii, non-ASCII string"
print("unidecode_expect_ascii, non-ASCII string")
timeit.main([
'-s',
'from unidecode import unidecode_expect_ascii',
'unidecode_expect_ascii(u"¡Hola mundo!")'])

print "unidecode_expect_nonascii, ASCII string"
print("unidecode_expect_nonascii, ASCII string")
timeit.main([
'-s',
'from unidecode import unidecode_expect_nonascii',
'unidecode_expect_nonascii(u"Hello, World")'])

print "unidecode_expect_nonascii, non-ASCII string"
print("unidecode_expect_nonascii, non-ASCII string")
timeit.main([
'-s',
'from unidecode import unidecode_expect_nonascii',
Expand Down
86 changes: 84 additions & 2 deletions unidecode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
b'Knosos'
"""
import warnings
from typing import Dict, Optional, Sequence
from typing import Dict, Iterator, Optional, Sequence
from pathlib import Path

Cache: Dict[int, Optional[Sequence[Optional[str]]]] = {}
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
Translator = None # type: Optional[Dict[int, str]]

class UnidecodeError(ValueError):
def __init__(self, message: str, index: Optional[int] = None) -> None:
Expand Down Expand Up @@ -136,3 +138,83 @@ def _unidecode(string: str, errors: str, replace_str:str) -> str:
retval.append(repl)

return ''.join(retval)

def preload_translator() -> Dict[int, str]:
    """Build (once) and return the code point -> replacement translation table.

    The table is stored in the module-level ``Translator`` variable, so only
    the first call pays the cost of importing the section modules; later
    calls return the already-built dict.

    Every ``x*.py`` file located next to this module is imported as
    ``unidecode.<stem>`` and its ``data`` sequence is read. The hexadecimal
    digits following the leading ``x`` of the file name, shifted left by
    8 bits, give the code point assigned to the first entry of ``data``;
    subsequent entries get consecutive code points.

    Entries are kept only when the code point is above 127 and the
    replacement is a ``str``.

    Returns:
        The shared translation table, suitable for ``str.translate``.
    """
    global Translator

    # Fast path: the table has already been built by an earlier call.
    if Translator is not None:
        return Translator

    table = {}  # type: Dict[int, str]
    package_dir = Path(__file__).parent

    for section_path in package_dir.glob('x*.py'):
        section_name = section_path.stem
        replacements = __import__(
            f'unidecode.{section_name}', globals(), locals(), ['data']
        ).data
        # 'xABC' -> int('0xABC', 16) -> first code point of the section.
        first_codepoint = int(f'0{section_name}', base=16) << 8

        for codepoint, char in enumerate(replacements, first_codepoint):
            if codepoint > 127 and isinstance(char, str):
                table[codepoint] = char

    # Publish only after the whole table was built successfully.
    Translator = table

    return Translator

def _unidecode_translate_replace_iterator (string: str, replace_str: str) -> Iterator[int]:
replace_bytes = replace_str.encode()

for char in string:
char_ord = ord(char)

if char_ord > 127:
yield from replace_bytes

else:
yield char_ord

def unidecode_translate(
    string: str, errors: str = 'ignore', replace_str: str = '?', check_surrogates: bool = False
) -> str:
    """Transliterate a Unicode string into an ASCII string via ``str.translate``.

    This function is usually faster than unidecode_expect_nonascii/unidecode,
    but it uses more memory. To reduce the time taken by the first call,
    invoke preload_translator beforehand to build the translation table.

    >>> unidecode_translate("\u5317\u4EB0")
    'Bei Jing '

    Parameters:
        string: the text to transliterate.
        errors: what to do with non-ASCII characters that have no entry in
            the translation table:
            'ignore'   - drop them (default);
            'strict'   - raise UnidecodeError;
            'replace'  - substitute replace_str;
            'preserve' - keep them unchanged in the result.
        replace_str: substitute used when errors is 'replace'.
        check_surrogates: when true, emit a RuntimeWarning for every
            surrogate code point (U+D800..U+DFFF) found in string.

    Returns:
        The transliterated string.

    Raises:
        UnidecodeError: if errors is 'strict' and a character has no
            replacement (the error index is the position of that character
            in string), or if errors is not one of the values above.

    See unidecode_expect_nonascii.
    """
    if check_surrogates:
        for char in string:
            if 0xd800 <= ord(char) <= 0xdfff:
                warnings.warn(
                    f'Surrogate character {char} will be ignored. '
                    'You might be using a narrow Python build.',
                    RuntimeWarning, 2
                )

    table = preload_translator()
    retval = string.translate(table)

    if errors == 'preserve':
        return retval

    if errors == 'ignore':
        return retval.encode('ascii', errors='ignore').decode()

    if errors == 'strict':
        try:
            retval.encode('ascii')

        except UnicodeEncodeError as exc:
            # exc.start is an offset into the *translated* text; because one
            # input character may expand to several output characters, it
            # does not identify the offending character in the caller's
            # input. Locate the first untranslatable character in the
            # original string instead.
            for index, char in enumerate(string):
                char_ord = ord(char)

                if char_ord > 127 and char_ord not in table:
                    break

            else:
                # Not expected when every table value is ASCII; fall back to
                # what the codec reported rather than guessing.
                index = exc.start
                char = exc.object[exc.start : exc.end]

            raise UnidecodeError(
                f'no replacement found for character {char} '
                f'in position {index}',
                index
            ) from None

        # Encoding succeeded, so the translated text is already pure ASCII.
        return retval

    if errors == 'replace':
        if replace_str == '?':
            # The codec's own 'replace' handler already substitutes '?'.
            return retval.encode('ascii', errors='replace').decode()

        return bytes(_unidecode_translate_replace_iterator(retval, replace_str)).decode()

    raise UnidecodeError(f'invalid value for errors parameter {errors}')