forked from longxudou/HIT-SCIR-CoNLL2019
augment_data_conllulex.py
import argparse
import collections
import json
import re

from conllulex2json import load_sents


def add_token_ranges(toks, text):
"""
For each token, add TokenRange to the MISC field.
The format of the TokenRange field (inspired by Python) is TokenRange=start:end, where start is a zero-based
document-level index of the start of the token (counted in Unicode characters) and end is a zero-based
document-level index of the first character following the token (i.e., the length of the token is end-start).
See http://ufal.mff.cuni.cz/udpipe/users-manual#run_udpipe_tokenizer_ranges
:param toks: list of dicts for each token, modified in-place
:param text: raw text of the sentence
"""
start = end = 0
for tok in toks:
end += len(tok["word"])
assert text[start:end] == tok["word"], (text[start:end], tok["word"], text, start, end, tok)
misc = tok.get("misc")
tok["misc"] = {"TokenRange": f"{start}:{end}"}
if misc != "SpaceAfter=No":
end += 1
start = end
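
# Illustrative example (not part of the original script): for text "Great food!" tokenized as
# ["Great", "food", "!"], with "food" carrying misc="SpaceAfter=No", add_token_ranges assigns
# TokenRange=0:5 to "Great", TokenRange=6:10 to "food", and TokenRange=10:11 to "!".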


class CompanionToConllulex:
    """Wrap an MRP companion CoNLL-U file so that conllulex2json.load_sents can read it."""

    def __init__(self, f):
        self.lines = f
        self.name = f.name

    def __iter__(self):
        for ln in self.lines:
            m = re.match(r'^#(\S+)$', ln)  # MRP 2019 companion encodes sent_id as a bare comment
            if m:
                yield f"# sent_id = {m.group(1)}"
            elif ln.count("\t") == 9:  # CoNLL-U has 10 columns
                yield ln.strip() + 9 * "\t_" + "\n"  # pad to the 19 columns of CoNLL-U-Lex
            else:
                yield ln
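
# Illustrative sketch (not from the original file): given a hypothetical companion line
# "#reviews-001325-0002", __iter__ yields "# sent_id = reviews-001325-0002"; a 10-column
# token line is padded with nine "_" fields so load_sents can parse it as CoNLL-U-Lex.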


parser = argparse.ArgumentParser(description='Augment MRP data with companion CoNLL-U-Lex analyses')
parser.add_argument("conllulex", type=str, help="CoNLL-U/CoNLL-U-Lex/JSON companion file used for augmentation")
parser.add_argument("mrp", type=str, help="Input MRP file")
parser.add_argument("output", type=str, help="Output augmented MRP file")
args = parser.parse_args()

conllulex_file = args.conllulex
mrp_file = args.mrp
out_file = args.output

# Index companion sentences by sent_id, stripping the "reviews-" prefix so the keys match MRP ids
with open(conllulex_file, 'r', encoding='utf8') as f_c:
    augs = {sent["sent_id"].replace("reviews-", ""): sent for sent in load_sents(CompanionToConllulex(f_c))}

with open(mrp_file, 'r', encoding='utf8') as f_in, open(out_file, 'w', encoding='utf8') as f_out:
    for line in f_in:
        mrp = json.loads(line, object_pairs_hook=collections.OrderedDict)
        sent_id = mrp['id']
        aug = augs.get(sent_id)  # use .get() so a missing id is reported below instead of raising KeyError
        if aug is None:
            print("id:{} not in companion".format(sent_id))
        else:
            add_token_ranges(aug["toks"], aug["text"])
            mrp['companion'] = aug
            f_out.write(json.dumps(mrp) + '\n')
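
# Example invocation (hypothetical file names):
#   python augment_data_conllulex.py companion.conllulex input.mrp output.augmented.mrp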