preprocess_sign.py
import argparse
from bpemb import BPEmb
parser = argparse.ArgumentParser(description="Preprocess sign dataset")
parser.add_argument('--vocab-size', type=int, default=25000, help="size of pretrained word embedding vocabulary")
parser.add_argument('--dim', type=int, default=300, help="dimensionality of the word embeddings")
parser.add_argument('--save-vecs', default=None, help="save path for the extracted word embeddings")
parser.add_argument('input_file', help="input text file, one sentence per line")
parser.add_argument('output_file', help="where to write the BPE-encoded sentences")
args = parser.parse_args()
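
# Example invocation (file names are illustrative, not part of the repo):
#   python preprocess_sign.py --vocab-size 25000 --dim 300 \
#       --save-vecs sign.vecs train.de train.bpe.de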
def process(texts, vocab_size=25000, dim=300):
    # Encode German text with pretrained byte-pair-encoding embeddings.
    emb = BPEmb(lang='de', vs=vocab_size, dim=dim)
    texts = [emb.encode(t) for t in texts]
    unique_words = set(w for t in texts for w in t)
    # Keep the first three entries (the special tokens) plus every
    # subword that actually occurs in the corpus.
    vecs = [wv for (i, wv) in enumerate(zip(emb.words, emb.vectors))
            if i < 3 or wv[0] in unique_words]
    return texts, vecs
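
# For reference, BPEmb.encode splits a sentence into subword strings
# (the exact segmentation depends on the vocabulary size; this output
# is illustrative only):
#   BPEmb(lang='de', vs=25000, dim=300).encode("guten morgen")
#   -> ['▁guten', '▁morgen']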
if __name__ == "__main__":
    print(args)
    with open(args.input_file, 'r', encoding='utf-8') as f:
        texts = [l.strip() for l in f]
    texts, vecs = process(texts, args.vocab_size, args.dim)
    with open(args.output_file, 'w', encoding='utf-8') as f:
        for t in texts:
            f.write(' '.join(t) + '\n')
    if args.save_vecs:
        # Write vectors in word2vec text format: a header line with the
        # vocabulary size and dimensionality, then one "word v1 v2 ..." per line.
        with open(args.save_vecs, 'w', encoding='utf-8') as f:
            f.write('{} {}\n'.format(len(vecs), args.dim))
            for w, v in vecs:
                f.write('{} {}\n'.format(w, ' '.join(str(n) for n in v)))
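
# Because --save-vecs uses the word2vec text format, the file can be
# loaded back for downstream use, e.g. with gensim (a sketch, assuming
# gensim is installed; 'sign.vecs' is an illustrative file name):
#   from gensim.models import KeyedVectors
#   kv = KeyedVectors.load_word2vec_format('sign.vecs')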