-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki_revised_gtruth.py
executable file
·78 lines (47 loc) · 1.84 KB
/
wiki_revised_gtruth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: iso-8859-1 -*-
from tphyl2 import *
import unicodedata
def wr_groundtruth(data_file, length, max_len=False, randomize=True):
    """Build a ground-truth revision tree from a ``</doc>``-delimited dump.

    The file is read with ``read_file``, folded to plain ASCII (NFKD
    normalisation, characters without an ASCII form are dropped) and split
    on the ``</doc>`` + newline delimiter. The first line of every chunk is
    discarded. The chunks are then walked in order: a text that has not been
    seen before becomes a new node whose parent is the current ``father``;
    a text that was already seen adds no node and only moves ``father`` back
    to the node that holds that text.

    Args:
        data_file: Path handed to ``read_file``.
        length: Number of distinct documents wanted in the tree. Values
            below 1 yield a tree made of the first document only.
        max_len: When false, ``False`` is returned as soon as it is clear
            that ``length`` distinct documents cannot be collected. When
            true, a smaller tree is accepted.
        randomize: When true, the tree is passed through
            ``randomize_corpus`` and that result is returned.

    Returns:
        ``False`` when the dump is too small and ``max_len`` is false.
        Otherwise a dict with the keys ``"topology"`` (parent index of each
        node; node 0 is the root and points at itself) and ``"corpus"``
        (the node texts), or the return value of ``randomize_corpus``.
    """
    raw = read_file(data_file)
    # The return type of read_file is not visible from this module. Decode
    # only byte strings (which is what a Python 2 ``str`` is); text objects
    # have no usable .decode() on Python 3.
    if isinstance(raw, bytes):
        raw = raw.decode("utf-8")
    text = unicodedata.normalize("NFKD", raw).encode("ascii", "ignore")
    # On Python 3 .encode() yields bytes, which cannot be split with a str
    # separator; go back to the native str type. No-op on Python 2.
    if not isinstance(text, str):
        text = text.decode("ascii")

    docs = text.split("</doc>\n")
    # NOTE(review): when the file ends with the delimiter, the split leaves
    # a trailing empty chunk that is counted and treated like any other
    # document -- confirm this is intended.
    if len(docs) < length and not max_len:
        return False

    if docs[0].startswith("\n"):
        docs[0] = docs[0][1:]
    # Keep everything after the first newline of each chunk (i.e. drop its
    # first line); a chunk without a newline becomes the empty string.
    docs = [doc.partition("\n")[2] for doc in docs]

    new_corpus = [docs[0]]
    topology = [0]
    # text -> position in new_corpus, so repeated texts are found in O(1)
    # instead of rescanning the list for every document.
    position_of = {docs[0]: 0}
    father = 0
    for i, doc in enumerate(docs[1:], 1):
        if len(new_corpus) >= length:
            break
        # Even if every remaining chunk were new, the tree would stay too
        # small: give up unless a shorter tree is acceptable.
        if (len(docs) - i) + len(new_corpus) < length and not max_len:
            return False
        seen_at = position_of.get(doc)
        if seen_at is not None:
            # Text already in the tree: later nodes branch off from it.
            father = seen_at
        else:
            position_of[doc] = len(new_corpus)
            new_corpus.append(doc)
            topology.append(father)
            father = len(new_corpus) - 1

    if randomize:
        return randomize_corpus(new_corpus, topology)
    return {"topology": topology, "corpus": new_corpus}
def randomize_corpus(corpus, topology):
    """Relabel the nodes of a tree with a random permutation.

    Args:
        corpus: Node texts, indexed like ``topology``.
        topology: Tree description understood by ``find_root`` and
            ``find_descendants``; one entry per node.

    Returns:
        A dict with the keys ``"topology"`` and ``"corpus"``: the relabelled
        topology and the texts moved to their new positions.
    """
    size = len(topology)
    # shuffled[old] is the new label given to node ``old``.
    shuffled = random.sample(range(size), size)
    root_label = shuffled[find_root(topology)]
    # Every slot starts out pointing at the relabelled root; slots reported
    # by find_descendants are overwritten in the loop below.
    relabelled_topology = [root_label] * size
    relabelled_corpus = [""] * size
    for old in range(size):
        below = find_descendants(topology, old)
        relabelled_corpus[shuffled[old]] = corpus[old]
        for node in below:
            relabelled_topology[shuffled[node]] = shuffled[old]
    return {"topology": relabelled_topology, "corpus": relabelled_corpus}
def write_real_tree(base_folder, corpus, name):
    """Write all documents of ``corpus`` into a single text file.

    The documents are concatenated with a tphyldoc marker line between
    consecutive entries and handed to ``write_file`` together with
    ``base_folder`` and the file name ``name`` plus a ``.txt`` suffix.

    Args:
        base_folder: Destination folder, passed through to ``write_file``.
        corpus: Iterable of document strings.
        name: File name without extension.
    """
    separator = "\n<\\tphyldoc>\n"
    file_name = name + ".txt"
    write_file(base_folder, file_name, separator.join(corpus))