forked from bfsujason/bertalign
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathg.py
More file actions
108 lines (73 loc) · 2.65 KB
/
g.py
File metadata and controls
108 lines (73 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import datetime
import os
from pathlib import Path
from datasets import load_dataset
from bertalign import Bertalign
import process_zh_text
import process_en_text
from helper import dump_align_result_to_file
# Language codes: Chinese, French, Spanish, Russian.
# NOTE(review): no active code visible in this file reads LANGS; presumably these are the
# non-English languages meant to be aligned against English -- confirm before relying on it.
LANGS = ['zh', 'fr', 'es', 'ru']
def cat(*args):
    """Join the given string segments with forward slashes.

    Called with no arguments it returns the empty string; a single argument is
    returned unchanged. Every argument must be a ``str`` (``str.join`` raises
    ``TypeError`` otherwise).
    """
    separator = '/'
    return separator.join(args)
def without_preprocess(row):
    """Align a single dataset row as-is and write the alignment to disk.

    A ``Bertalign`` aligner is built directly from ``row``, sentence alignment
    is run, and the result it produces is passed to
    ``dump_align_result_to_file`` together with ``row['record']``.
    """
    engine = Bertalign(row)
    engine.align_sents()
    alignment = engine.create_result()
    # Looked up after alignment so a missing 'record' key surfaces at the same
    # point it always did.
    record_id = row['record']
    dump_align_result_to_file(record_id, alignment)
# dst = 'en'
# dst_text = row[dst].replace('\n----\n', '\n')
# # zh = row['zh'].replace('\n----\n', '\n')
# for src in langs:
# src_text = row[src].replace('\n----\n', '\n')
# output_dir = f'./aligned/{src}_{dst}/'
# Path(output_dir).mkdir(parents=True, exist_ok=True)
# output_filename = f'{output_dir}{id}.txt'
# if os.path.exists(output_filename):
# print('skip', output_filename)
# continue
# aligner = Bertalign(src_text, dst_text, src_lang=src, tgt_lang=dst)
# aligner.align_sents()
# with open(f'{output_filename}', 'w', encoding='utf-8') as f:
# for aligned in aligner.yield_sents():
# f.write(aligned + '=' * 10 + '\n')
def dump_src(row: dict):
    """Append the raw text of each language in ``row`` to its per-language dump file.

    For every code in ``zh``, ``en``, ``fr``, ``es``, ``ru`` the text
    ``row[lang]`` is appended to ``./pre/{lang}src.txt``, preceded by a banner
    that carries ``row['record']``.

    Args:
        row: Mapping with a ``'record'`` entry and one text entry per language
            code listed above.

    Raises:
        KeyError: If ``'record'`` or one of the language keys is missing.
    """
    out_dir = Path("./pre")
    # The script entry point also creates this directory; creating it here as
    # well keeps the function usable when it is called on its own.
    out_dir.mkdir(parents=True, exist_ok=True)

    rec = row['record']
    # Same for every language, so build it once outside the loop.
    banner = f"\n==========\n{rec}\n==========\n"

    for lang in ('zh', 'en', 'fr', 'es', 'ru'):
        with open(out_dir / f"{lang}src.txt", 'a', encoding='utf-8') as f:
            f.write(banner + row[lang])
def with_preprocess(row: dict):
    """Per-row hook for the preprocessing pipeline (its steps are currently disabled).

    In its present state this only ensures that ``./done`` exists and reads
    ``row['record']``; it returns ``None``.

    Args:
        row: Mapping with at least a ``'record'`` entry.

    Raises:
        KeyError: If ``row`` has no ``'record'`` entry.
    """
    Path("./done").mkdir(parents=True, exist_ok=True)
    # No active statement uses this yet; the commented-out steps that follow
    # interpolate it into banners and file names, so the lookup is kept.
    rec = row['record']
# zh_text = process_zh_text.start(row["zh"])
# en_text = process_en_text.start(row["en"])
# row["zh"] = zh_text
# row["en"] = en_text
# with open(f"./pre/zhall.txt", 'a', encoding='utf-8') as f:
# f.write(f'''
# ==========
# {rec}
# ==========
# ''' + zh_text)
# with open(f"./pre/enall.txt", 'a', encoding='utf-8') as f:
# f.write(f'''
# ==========
# {rec}
# ==========
# ''' + en_text)
# with open(f"./pre/{rec}zh.txt", "w", encoding='utf-8') as f:
# f.write(zh_text)
# with open(f"./pre/{rec}en.txt", "w", encoding='utf-8') as f:
# f.write(en_text)
# without_preprocess(row)
if __name__ == "__main__":
    # Script entry point: make sure ./pre exists, load the 'randomTest' split of
    # the UN corpus, run with_preprocess over every row and report wall-clock time.
    # os.environ['HSA_OVERRIDE_GFX_VERSION'] = '10.3.0'
    Path("./pre").mkdir(parents=True, exist_ok=True)

    started_at = datetime.datetime.now()
    corpus = load_dataset(
        "ranWang/UN_Historical_PDF_Article_Text_Corpus",
        split='randomTest',
    )
    corpus.map(with_preprocess)
    # print(process_zh_text.start(corpus[0]["zh"]))
    elapsed = datetime.datetime.now() - started_at
    print('Time elapsed:', elapsed)