Skip to content

Commit 6123ac1

Browse files
committed
add separate_script feature.
1 parent c83b258 commit 6123ac1

File tree

3 files changed

+95
-8
lines changed

3 files changed

+95
-8
lines changed

GlotScript/GlotScript.py

+66-6
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,18 @@
22
Author: Amir Hossein Kargaran
33
Date: August, 2023
44
5-
Description: This code detects the script (writing system) of the given text.
5+
Description: This code detects/separates the script(s) (writing system(s)) of the given text.
66
77
MIT License
88
9-
Original code is from Meta and is based on the MIT license, with permission for distribution and modification.
10-
The original code is capable of detecting less than 40 scripts: https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py
9+
The base code (from Meta, under the MIT license, which permits distribution and modification) can detect fewer than 40 scripts: https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py
10+
The currently developed code supports all 161 Unicode 15.0 scripts.
1111
"""
1212

1313
import string
1414
import typing as tp
1515
from collections import Counter, defaultdict
16-
16+
from typing import Dict, List
1717

1818
SCRIPT_RANGES = {
1919
'Latn': [(65, 90), (97, 122), (170, 170), (186, 186), (192, 214), (216, 246), (248, 696), (736, 740), (7424, 7461), (7468, 7516), (7522, 7525), (7531, 7543), (7545, 7614), (7680, 7935), (8305, 8305), (8319, 8319), (8336, 8348), (8490, 8491), (8498, 8498), (8526, 8526), (8544, 8584), (11360, 11391), (42786, 42887), (42891, 42954), (42960, 42961), (42963, 42963), (42965, 42969), (42994, 43007), (43824, 43866), (43868, 43876), (43878, 43881), (64256, 64262), (65313, 65338), (65345, 65370), (67456, 67461), (67463, 67504), (67506, 67514), (122624, 122654), (122661, 122666)], # Latin
@@ -186,7 +186,7 @@
186186
ScoredScript = tp.Tuple[tp.Optional[str], float]
187187

188188

189-
def get_script_predictor() -> tp.Callable[[str], ScoredScript]:
189+
def get_script_predictor(replace_punctuation=True, replace_digits=True) -> tp.Callable[[str], ScoredScript]:
190190

191191
hist_map: tp.Dict[int, tp.Set[str]] = {}
192192
for key, ranges in SCRIPT_RANGES.items():
@@ -199,9 +199,15 @@ def get_script_predictor() -> tp.Callable[[str], ScoredScript]:
199199
replace_by = "" # we just get rid of characters that are ubiquitous
200200
replacement_map = {
201201
ord(c): replace_by
202-
for c in string.whitespace + string.punctuation + string.digits
202+
for c in string.whitespace
203203
}
204204

205+
if replace_punctuation:
206+
replacement_map.update({ord(c): replace_by for c in string.punctuation})
207+
208+
if replace_digits:
209+
replacement_map.update({ord(c): replace_by for c in string.digits})
210+
205211
def predict_script(sent: str) -> ScoredScript:
206212
sent = sent.translate(replacement_map)
207213

@@ -243,6 +249,38 @@ def predict_script(sent: str) -> ScoredScript:
243249

244250
return predict_script
245251

252+
253+
254+
def separate_script(sent: str) -> Dict[str, str]:
    """
    Separate the characters of a string by the script they belong to.

    Every character is compared against the code-point ranges listed in
    ``SCRIPT_RANGES`` and appended, in input order, to each script that has a
    range containing it. The ASCII space is treated as shared: it is appended
    to every script that has at least one range, so word boundaries survive in
    each script's output.

    Args:
        sent (str): Input string, possibly mixing characters from several scripts.

    Returns:
        Dict[str, str]: A dictionary mapping script names to the characters
        collected for that script. Scripts whose collected text is empty or
        consists only of whitespace are omitted.
    """
    chars_by_script: Dict[str, List[str]] = {}

    for char in sent:
        if char == ' ':
            # A space belongs to every script; decide this once per character
            # instead of re-testing it against every range of every script.
            for script, ranges in SCRIPT_RANGES.items():
                if ranges:
                    chars_by_script.setdefault(script, []).append(char)
            continue

        code_point = ord(char)
        for script, ranges in SCRIPT_RANGES.items():
            # One append per script at most, even if several ranges match.
            if any(start <= code_point <= end for start, end in ranges):
                chars_by_script.setdefault(script, []).append(char)

    # Join each script's characters once, and drop scripts that only ever
    # received separators (e.g. scripts that do not occur in the input).
    separated: Dict[str, str] = {}
    for script, chars in chars_by_script.items():
        text = ''.join(chars)
        if text.strip():
            separated[script] = text

    return separated
281+
282+
283+
246284
def test_predict_script():
247285
predictor_fn = get_script_predictor()
248286

@@ -257,3 +295,25 @@ def test_predict_script():
257295
assert predictor_fn(string.digits)[:2] == (None, 0)
258296
assert predictor_fn(string.whitespace)[:2] == (None, 0)
259297
assert predictor_fn("")[:2] == (None, 0)
298+
299+
300+
def test_separate_script():
    """Check that separate_script groups a mixed-script sentence by script.

    Only the whitespace-separated tokens of each script are compared (order
    ignored), so the exact number and position of spaces does not matter.
    """
    expected = {
        'Latn': 'Hello Salut ',
        'Hebr': ' שלום ',
        'Arab': ' سلام مرحبا',
        'Hani': ' 你好 ',
        'Hira': ' こんにちは '
    }

    actual = separate_script("Hello Salut سلام 你好 こんにちは שלום مرحبا")

    for script, expected_text in expected.items():
        assert script in actual, f"Error: '{script}' script not found in detected scripts."

        # str.split() with no arguments already yields stripped, non-empty tokens.
        found_tokens = sorted(actual[script].split())
        wanted_tokens = sorted(expected_text.split())

        assert found_tokens == wanted_tokens, f"Error: Tokens for key '{script}' do not match."

GlotScript/__init__.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from .GlotScript import get_script_predictor
2+
from .GlotScript import separate_script
23

3-
sp = get_script_predictor()
4+
5+
sp = get_script_predictor(replace_punctuation=True, replace_digits=True)
6+
sc = separate_script
47

58
__version__ = '1.2'

README.md

+25-1
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,19 @@ pip3 install GlotScript
3333
pip3 install GlotScript@git+https://github.com/cisnlp/GlotScript
3434
```
3535

36-
### Usage
36+
### Usage: Script Detection
3737

3838
```python
3939
from GlotScript import get_script_predictor
4040
sp = get_script_predictor()
4141
```
4242

43+
OR
44+
45+
```python
46+
from GlotScript import sp
47+
```
48+
4349
```python
4450
sp('これは日本人です')
4551
>> ('Hira', 0.625, {'details': {'Hira': 0.625, 'Hani': 0.375}, 'tie': False, 'interval': 0.25})
@@ -60,6 +66,24 @@ sp('𝄞𝄫 𒊕𒀸')
6066
>> ('Xsux', 0.5, {'details': {'Xsux': 0.5, 'Zyyy': 0.5}, 'tie': True, 'interval': 0.0})
6167
```
6268

69+
### Usage: Script Separation
70+
71+
```python
72+
from GlotScript import separate_script
73+
```
74+
75+
```python
76+
sent = "Hello Salut سلام 你好 こんにちは שלום مرحبا"
77+
separate_script(sent)
78+
>> {
79+
"Latn":"Hello Salut ",
80+
"Hebr":" שלום ",
81+
"Arab":" سلام مرحبا",
82+
"Hani":" 你好 ",
83+
"Hira":" こんにちは "
84+
}
85+
```
86+
6387
### Exploring Unicode Blocks: Related Sources
6488
<details>
6589
<summary>Click to Expand</summary>

0 commit comments

Comments
 (0)