forked from longxudou/HIT-SCIR-CoNLL2019
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathaugment_data_conllu.py
33 lines (30 loc) · 1.12 KB
/
augment_data_conllu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json
import collections
import argparse
from conllu.parser import parse_line, DEFAULT_FIELDS
# Command-line interface: three required positional paths, registered in the
# order they must be given on the command line.
parser = argparse.ArgumentParser(description='Augment Data')
for _arg_name, _arg_help in (
    ("conll", "Augment CoNLL file"),
    ("mrp", "Input MRP file"),
    ("output", "Output Augmented file"),
):
    parser.add_argument(_arg_name, type=str, help=_arg_help)
args = parser.parse_args()

# Expose the parsed paths under the module-level names the rest of the
# script reads.
conll_file, mrp_file, out_file = args.conll, args.mrp, args.output
# Build `augs`: a lookup table from sentence id to the list of parsed token
# rows of that sentence. Sentences in the CoNLL file are separated by blank
# lines; the first line of each sentence is a one-character marker
# (presumably '#' -- confirm against the data) followed by the sentence id.
augs = {}
with open(conll_file, 'r', encoding='utf8') as f_c:
    for raw_block in f_c.read().split('\n\n'):
        # Strip once so the id line and the token lines are taken from the
        # same text; otherwise a block that starts with a newline would be
        # stored under the empty id and its real id would be lost.
        block = raw_block.strip()
        if not block:
            # Trailing blank lines at the end of the file yield empty blocks;
            # do not register them as a sentence with id ''.
            continue
        header, *token_lines = block.split('\n')
        # Drop the leading marker character of the header to obtain the id.
        augs[header[1:]] = [
            parse_line(token_line, DEFAULT_FIELDS)
            for token_line in token_lines
        ]
# Stream the MRP file (one JSON object per line) and, for every record whose
# id has an entry in `augs`, attach that entry under the 'companion' key and
# write the record to the output file as one JSON line.
with open(mrp_file, 'r', encoding='utf8') as f_m, \
        open(out_file, 'w', encoding='utf8') as fo:
    for line in f_m:
        if not line.strip():
            # A stray blank line is not a record; json.loads would raise on it.
            continue
        # OrderedDict keeps the keys in the order they appear in the input.
        mrp = json.loads(line, object_pairs_hook=collections.OrderedDict)
        mrp_id = mrp['id']
        if mrp_id not in augs:
            # Report the missing companion; nothing is written for this record.
            print("id:{} not in companion".format(mrp_id))
            continue
        # NOTE(review): the source's indentation was lost; the write is
        # assumed to belong to the "has companion" branch together with the
        # assignment below -- confirm records without a companion should be
        # dropped from the output rather than written unchanged.
        mrp['companion'] = dict(sent_id=mrp_id, toks=augs[mrp_id])
        fo.write(json.dumps(mrp) + '\n')