forked from digling/shijing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathC_cluster_rimes.py
150 lines (122 loc) · 5.38 KB
/
C_cluster_rimes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from lingpy import *
import networkx as nx
from lingpy.thirdparty import linkcomm as lc
from sys import argv
from collections import Counter
import markdown
import pickle
# Load the infomap graph into ``I``.  A pickled copy ('R_infomap.bin') is
# used as a cache; when it cannot be read, the graph is parsed from
# 'R_infomap.yaml' and the cache is (re)written.
# NOTE(review): the pickle is a cache produced by this very script -- never
# point this at a .bin file obtained from an untrusted source.
# NOTE(review): nx.read_yaml is not available in recent NetworkX releases --
# confirm the NetworkX version this project pins.
try:
    # ``with`` guarantees the cache file handle is closed again
    with open('R_infomap.bin', 'rb') as f:
        I = pickle.load(f)
except Exception:
    # Unpickling may fail with many different exception types (missing file,
    # truncated data, incompatible class layout, ...), so every ordinary
    # failure falls back to the YAML source.  Unlike a bare ``except`` this
    # lets KeyboardInterrupt and SystemExit propagate.
    # load any graph version, let's take the simple one for the moment
    I = nx.read_yaml('R_infomap.yaml')
    with open('R_infomap.bin', 'wb') as f:
        pickle.dump(I, f)
else:
    print('[i] loaded graph')
# Walk over the nodes of the infomap graph and bucket them by the community
# id stored in each node's 'infomap' attribute.  Every bucket entry is a
# tuple: the node itself followed by the attributes listed in ``_data``.
_data = ['rime', 'occurrence', 'certainty', 'shangsheng', 'reading', 'mch',
         'stanza']
D = {}
for nA, data in I.nodes(data=True):
    imp = data['infomap']
    record = (nA,) + tuple(data[x] for x in _data)
    D.setdefault(imp, []).append(record)
# Row template for one node in a rime-group table.  The five cells line up
# with the column headers declared in ``tpt`` below: {0} character (also used
# as link text and as the `char` query parameter), {1} Middle Chinese,
# {2} Old Chinese, {3} occurrence, {4} stanza links.
tpl = """<tr>
<td style="border: 1px solid lightgray;"><a
style="text-decoration:none;color:Crimson;" target="other" href="http://dighl.github.io/shijing/index.html?char={0}">{0}</a></td>
<td style="border: 1px solid lightgray;">{1}</td>
<td style="border: 1px solid lightgray;">{2}</td>
<td style="border: 1px solid lightgray;">{3}</td>
<td style="border: 1px solid lightgray;">{4}</td>
</tr>"""
# Link template for a single stanza: {0} is the stanza identifier (used both
# as the `stanza` query parameter and as the link text), {1} the character
# handed over as the `char` query parameter.
tps = """<a style="text-decoration:none;color:cornflowerblue;" target="other"
href="http://dighl.github.io/shijing/index.html?stanza={0}&break=break&char={1}">{0}</a>"""
# Table template wrapping the rows of one rime group: {0} receives the
# already formatted rows, {1} the suffix of the table's element id.  The
# table is emitted hidden (display: none).  The first four headers call
# sort_table(), which is not defined in this file (presumably provided by
# T_community.js -- confirm).
tpt = """
<table id="table_{1}" style="border:2px solid black;display: none;">
<tr>
<th onclick="sort_table(0,'table_{1}', 0);" style="cursor:pointer;border: 1px solid gray;">Character</th>
<th onclick="sort_table(1,'table_{1}', 0);" style="cursor:pointer;border: 1px solid gray;">Middle Chinese</th>
<th onclick="sort_table(2,'table_{1}', 0);" style="cursor:pointer;border: 1px solid gray;">Old Chinese</th>
<th onclick="sort_table(3,'table_{1}', 0);" style="cursor:pointer;border: 1px solid gray;">Occurrence</th>
<th style="border: 1px solid gray; max-width:500px;">Stanza</th>
</tr>
{0}
</table>"""
# Markup for one rime group inside a community listing: {0} group label,
# {1} number of members, {2} the group's (hidden) table, {3} the id suffix
# shared by the list item, the SHOW button and the toggle_table() call.
_li_tpl = """<li style="display:flex;" id="cluster_{3}"><b style="width:150px!important;">-{0}</b>
<span style="width:150px">Occurrence: {1}</span> <span
style="border:1px solid
black;background:Crimson;cursor:pointer;color:white;font-weight:bold;" id="span_{3}"
onclick="toggle_table('{3}');">SHOW</span> </li> \n{2}"""

# Walk the communities from largest to smallest.  For each one, write its
# rime statistics to 'R_stats_infomap.tsv' and collect an HTML section; the
# concatenated sections end up in ``txt``.
#
# Every entry of D[idx] is a tuple laid out as
#   (node, rime, occurrence, certainty, shangsheng, reading, mch, stanza).
#
# The file is opened as UTF-8 explicitly: the rime labels contain non-ASCII
# characters, which the platform default encoding may not be able to write.
with open('R_stats_infomap.tsv', 'w', encoding='utf-8') as f:
    most_coms = []
    sections = []
    for idx in sorted(D, key=lambda x: len(D[x]), reverse=True):
        # rime of every member, '?' standing in for a missing rime
        rimes = [line[1] if line[1] else '?' for line in D[idx]]
        # Count once instead of calling rimes.count() per element.
        # most_common() keeps first-seen order among equal counts, so the
        # ranking no longer depends on the iteration order of a set.
        rime_counts = Counter(rimes)
        rimeset = [rime for rime, _ in rime_counts.most_common()]

        # make a dictionary sorting: the group key is the rime, extended by
        # 'ʔ' when the shangsheng field contains 'ʔ' and by '[?]' when the
        # certainty field is '?'
        tmp = {}
        for line in D[idx]:
            rime = line[1] if line[1] else '?'
            if 'ʔ' in line[4]:
                rime = rime + 'ʔ'
            if line[3] == '?':
                rime = rime + '[?]'
            tmp.setdefault(rime, []).append(tuple(line))

        # largest group first; its key becomes the community's heading
        sorted_data = sorted(tmp.items(), key=lambda x: len(x[1]),
                             reverse=True)
        print(sorted_data[0][1], len(sorted_data[0][1]))
        best_char = sorted_data[0][0]
        parts = ['<h3>Community {0} (ID: {1}, Members: {2})</h3>\n<ul>'.format(
            best_char,
            idx+1,
            sum(len(x[1]) for x in sorted_data))]
        for k, v in sorted_data:
            table_id = str(idx+1)+'_'+k

            # one table row per member of the group; a missing (falsy) rime
            # sorts as '' so that None values cannot break the comparison
            rows = []
            for line in sorted(v, key=lambda x: x[1] or '', reverse=True):
                rows.append(tpl.format(
                    line[0],
                    line[6],
                    line[5].replace(' ', '') + (
                        '<sup>'+line[3]+'</sup>' if line[3] == '?' else ''),
                    line[2],
                    ', '.join(tps.format(x, line[0])
                              for x in line[-1].split(','))
                ))
            txt3 = tpt.format(''.join(rows), table_id)
            parts.append(_li_tpl.format(k, len(v), txt3, table_id))
        parts.append('</ul>\n')
        txt2 = ''.join(parts)
        sections.append('<div id="community_'+str(idx+1)+'">'+txt2+'</div>')

        # write cluster to file
        # NOTE(review): the TSV header uses the raw idx while the HTML uses
        # idx+1 -- confirm this difference is intended.
        f.write('# '+rimeset[0]+' / {0} / {1}\n'.format(
            idx,
            len(rimes)
        ))
        for elm in rimeset:
            f.write(' '+elm+'\t'+str(rime_counts[elm])+'\n')
        f.write('\n')
        # NOTE(review): ``elm`` is the loop variable left over from above,
        # i.e. the last (least frequent) rime, although the list name
        # suggests the most frequent one; ``most_coms`` is not read anywhere
        # else in this file -- confirm what was intended.
        most_coms.append((idx, elm))
    # joined once instead of growing the string section by section
    txt = ''.join(sections)
# Page skeleton: slot {1} takes the title/controls block, slot {0} the
# community listing placed inside the "mainpart" container.
head = (
    '<html><head><meta charset=utf8 /></head><body>{1}'
    '<div id="mainpart">{0}</div></body></html>'
)

# Usage hints shown below the page heading.
_usage = """
Type in the keywords you wish to search for, type
<pre><code>community=1,2,3,5</code></pre> in order to search for specific
communities by their ID.
Type
<pre><code>characters=我,你</code></pre>
in order to search for communities containing certain characters, and type
<pre><code>rimes=ak,a</code></pre>
in order to limit the search to specific rime groups.<br>
"""

# Heading, usage hints and the script include, in that order.
title = ''.join([
    '<h1>Infomap Community Detection Analysis</h1>',
    _usage,
    '<script src="T_community.js"></script>',
])
# Append the filter controls to the title block; the OK button calls
# filterCommunities() with the number of communities plus one.
title += '<input id="textfield" type="text" placeholder="filter by community" style="width:200px" /><input id="click" onclick="filterCommunities({cnum})" type="button" value="OK" />'.format(cnum=len(D)+1)
# The page declares charset utf8 and contains non-ASCII text, so it is
# written as UTF-8 explicitly instead of in the platform's default encoding.
with open('R_infomap.html', 'w', encoding='utf-8') as f:
    f.write(head.format(txt, title))