extract_time.py
# from origin_data.riedel import Document_pb2  # only needed by main(); left commented out because parse_wiki() is what runs below
import os
import pdb
import re

import timex


# Process Wikipedia text (and, in main(), the Riedel NYT protobufs) for timex extraction.
def main():
    data_path = "./origin_data/riedel/nyt-2005-2006.backup/"
    output_path = "./data/"
    with open(output_path + "processed.txt", "wb") as fout:
        outputs = []  # (writing the outputs to fout is not implemented yet)
        for item in os.listdir(data_path):
            if item[-3:] != ".pb":
                continue
            with open(data_path + item, "rb") as fin:
                # pdb.set_trace()
                doc = Document_pb2.Document()
                doc.ParseFromString(fin.read())
                # whole_doc collects the document text for time extraction.
                whole_doc = []
                valid_set = []
                for sentence in doc.sentences:
                    s = []
                    m = []
                    # Extract the token words into one sentence.
                    for token in sentence.tokens:
                        s.append(token.word)
                    # Extract mentions: each entry is [entity_name, mfrom, to].
                    for mention in sentence.mentions:
                        m.append(["_".join(s[int(mention.mfrom):int(mention.to) + 1]),
                                  mention.mfrom, mention.to])
                    # Fewer than two mentions means this sentence may not be in
                    # my train/test set; with more than two mentions, further
                    # iteration over mention pairs would be needed.
                    if len(m) < 2:
                        valid_set.append([m, s])
                    whole_doc.append(" ".join(s))
                # Tagging has to happen once per document iteration, since the
                # whole document is needed to set the base time.
                timex_found, whole_doc = timex.tag(" ".join(whole_doc))
                if len(timex_found) > 0:
                    # Set the base time to the last time expression found.
                    base_t = timex.retrieve_Date_time(timex_found)
                    pdb.set_trace()
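

# ---------------------------------------------------------------------------
# NOTE: the `timex` module itself is not included in this file; the helper
# below is only a sketch of how main() appears to use it. The signatures
# (timex.tag returning (found_expressions, tagged_text) and
# timex.retrieve_Date_time returning a base date) are assumptions inferred
# from the calls above, not a documented API.
def _timex_usage_sketch(text):
    # Tag the raw text and collect the time expressions found in it.
    timex_found, tagged_text = timex.tag(text)
    if timex_found:
        # Pick a base time from the expressions found, as main() does.
        base_time = timex.retrieve_Date_time(timex_found)
        return base_time, tagged_text
    return None, tagged_text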
# We first need to split each input file into its individual documents;
# every document ends with a </doc> line.
def parse_wiki():
    data_path = "../wiki-data/"
    out_path = "../wiki-tagged-data/"
    dirs = os.listdir(data_path)
    count = 0
    for subdir in dirs:
        path = data_path + subdir + "/"
        for filename in os.listdir(path):
            count += 1
            with open(path + filename, "r") as f:
                lines = f.readlines()
            index = 0
            # For each document in the file:
            while index < len(lines):
                # `start` marks the opening line of a document; the title is
                # on the following line.
                start = index
                # Build the output filename from the title (keep only [a-z0-9], joined by "_").
                title = "_".join(re.split(r'[^a-z0-9]', lines[index + 1].strip().lower()))
                while index < len(lines) and lines[index].strip() != "</doc>":
                    index += 1
                end = index
                # The next line is the start of the next document.
                index += 1
                text = " ".join(lines[start:end])
                # Call the modified ground function.
                text = timex.ground(text)
                with open(out_path + title, 'a') as fout:
                    fout.write(text)
            # if count >= 5:
            #     return
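

# ---------------------------------------------------------------------------
# Assumed input layout for parse_wiki(), inferred from the loop above (only
# the title position and the "</doc>" terminator are actually required; the
# "<doc ...>" opening line is an assumption about the dump format):
#
#   <doc ...>
#   Article Title          <- second line, used to build the output filename
#   article text ...
#   </doc>
#
# Minimal illustration of the filename normalisation, using a made-up title:
#   >>> "_".join(re.split(r'[^a-z0-9]', "Tim Berners-Lee".strip().lower()))
#   'tim_berners_lee'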

if __name__ == "__main__":
    # main()
    parse_wiki()
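
# Usage note (assumption: given the relative paths above, the script is run
# from a directory where ../wiki-data/ exists and ../wiki-tagged-data/ has
# been created):
#   python extract_time.py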