Skip to content

Commit d72532c

Browse files
author
sxs149331
committed
First Version
0 parents  commit d72532c

File tree

187 files changed

+461429
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

187 files changed

+461429
-0
lines changed

ActorDictionary.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import re
2+
3+
from ClusterSImilarity import FuzzyClusterSimilarity
4+
5+
6+
class ActorDictionary:
    """Exact-match lookup over the actor names found in the Phoenix actor files.

    On construction every file listed in ``actor_filenames`` (relative to
    ``folder``) is read and each usable line is normalised and stored in
    ``self.actor_set``.
    """

    actor_filenames = ['Phoenix.Countries.actors.txt',
                       'Phoenix.International.actors.txt',
                       'Phoenix.MilNonState.actors.txt']
    folder = 'data/dictionaries'

    # Class-level placeholder kept so ``ActorDictionary.actor_set`` still
    # exists; __init__ binds a fresh set on every instance so instances no
    # longer accumulate names into one shared object.
    actor_set = set()

    fcs = FuzzyClusterSimilarity()

    # (sic) misspelled name kept because it is a public attribute.
    # NOTE(review): only the disabled fuzzy lookup used this value, comparing
    # it against ``self.fcs.measure(...)``; FuzzyClusterSimilarity.measure
    # returns a fuzz.partial_ratio score, which appears to be on a 0-100
    # scale -- confirm the scale before re-enabling a fuzzy ``contains``.
    THERSHOLD = 0.75

    def __init__(self):
        """Load and normalise every actor name from the dictionary files."""
        self.actor_set = set()
        for filename in self.actor_filenames:
            # ``with`` closes the handle even if parsing raises.
            with open(self.folder + "/" + filename) as fs:
                for line in fs:
                    line = line.strip()
                    if not line or line.startswith('#'):  # blank or comment line
                        continue
                    # Drop a trailing inline comment.
                    line = line.split('#')[0]
                    # Remove bracketed segments, turn underscores into spaces
                    # and drop '+' markers.
                    line = re.sub(r'\[[^\]]*\]', '', line)
                    line = line.replace('_', ' ').replace('+', '').strip()
                    # Names of one character or less are ignored.
                    if len(line) > 1:
                        self.actor_set.add(line)

    def contains(self, actorname):
        """Return True if ``actorname`` is a known actor.

        Underscores are replaced by spaces and surrounding whitespace is
        stripped before an exact, case-sensitive membership test.
        """
        test = actorname.replace('_', ' ').strip()
        return test in self.actor_set
44+
45+
46+

ActorDictionary.pyc

1.66 KB
Binary file not shown.

ClusterManager.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import pprint
2+
3+
from ActorDictionary import ActorDictionary
4+
from ClusterSImilarity import FuzzyClusterSimilarity
5+
from UnionFind import UnionFind
6+
7+
8+
class ActorResolver:
    """Groups similar actor names and collects the ones missing from the
    stored actor dictionary.

    Actor frequency dictionaries map an actor name to a ``(list, frequency)``
    tuple, e.g. ``{u'HENDRICKS': ([u'HENDRICKS'], 4)}``.
    """

    stored_actor_dict = ActorDictionary()

    # Names accepted as new candidates so far. Class-level, so it is shared
    # by every instance and persists across getFrequencyCount() calls.
    possible_actor_list = set()

    clsSimilarity = FuzzyClusterSimilarity()

    compressed_dict = {}

    # A similarity score must be strictly greater than this for two names to
    # be treated as the same actor.
    MIN_MATCH_RATIO = 70

    def getFrequencyCount(self, freq_dict=None):
        """Merge per-document actor counts into one dictionary of new actors.

        ``freq_dict`` maps a document key to an actor frequency dictionary.
        Each document is first reduced with :meth:`compress`; names already
        known to ``stored_actor_dict`` are skipped. A remaining name is merged
        into its closest previously seen candidate (the longer of the two
        names becomes the key), or recorded as a new candidate.

        Returns a dict mapping actor name to ``(names, frequency)``.
        """
        updated_dict = {}
        if not freq_dict:
            return updated_dict

        for doc_key in freq_dict:
            compress_actor_dict = self.compress(freq_dict[doc_key])

            for actor_key in compress_actor_dict:
                if self.stored_actor_dict.contains(actor_key):
                    continue  # already in the CAMEO Actor Dictionary

                names, freq = compress_actor_dict[actor_key]
                possible_actor, _ratio = self.getClosestMatch(actor_key)

                if possible_actor is None:
                    updated_dict[actor_key] = compress_actor_dict[actor_key]
                    self.possible_actor_list.add(actor_key)
                    continue

                # The candidate may come from an earlier call (the candidate
                # set is class-level), so it can be absent from this call's
                # result; treat that as "nothing accumulated yet".
                prev_names, prev_freq = updated_dict.get(possible_actor, ([], 0))
                # Concatenate into a new list: list.append() returns None and
                # would have stored None as the name list.
                merged = (names + prev_names, freq + prev_freq)

                if len(possible_actor) < len(actor_key):
                    # The longer name replaces the shorter one as the key.
                    self.possible_actor_list.remove(possible_actor)
                    self.possible_actor_list.add(actor_key)
                    updated_dict.pop(possible_actor, None)
                    updated_dict[actor_key] = merged
                else:
                    updated_dict[possible_actor] = merged
        return updated_dict

    def rank(self, freq_dict=None):
        """Compute the merged frequency dictionary for ``freq_dict``.

        TODO(review): the result is computed and then discarded, so this
        method returns None; the ranking step was never written.
        """
        updated_dict = self.getFrequencyCount(freq_dict)

    def getClosestMatch(self, actor_name):
        """Return ``(name, ratio)`` for the best-scoring known candidate.

        Only candidates scoring strictly above ``MIN_MATCH_RATIO`` qualify.
        When none does, ``(None, MIN_MATCH_RATIO)`` is returned.
        """
        max_ratio = self.MIN_MATCH_RATIO
        possible_actor = None
        for name in self.possible_actor_list:
            ratio = self.clsSimilarity.measure(name, actor_name)  # scored once
            if ratio > max_ratio:
                max_ratio = ratio
                possible_actor = name
        return (possible_actor, max_ratio)

    def getParent(self, parentDict=None, key=None):
        """Follow ``parentDict`` links from ``key`` until an entry maps to
        itself, and return that entry.

        Raises KeyError if ``key`` (or any link on the way) is missing.
        """
        if parentDict is None:
            parentDict = {}
        temp = key
        while parentDict[temp] != temp:
            temp = parentDict[temp]
        return temp

    def compress(self, actor_freq_dict=None):
        """Group similar names of one actor frequency dictionary.

        Every name is unioned with the other name whose current set
        representative scores highest against its own representative, when
        that score is strictly above ``MIN_MATCH_RATIO``.

        Returns a dict mapping each set representative to
        ``(member_names, max_frequency)``.
        """
        compressed_dict = {}
        if not actor_freq_dict:
            return compressed_dict

        list_of_names = list(actor_freq_dict)
        uf = UnionFind(list_of_names)

        for i, name in enumerate(list_of_names):
            maxMatched = None
            maxRatio = self.MIN_MATCH_RATIO
            for j, other in enumerate(list_of_names):
                if i == j:
                    continue
                ratio = self.clsSimilarity.measure(uf.find(name), uf.find(other))
                if ratio > maxRatio:
                    maxRatio = ratio
                    maxMatched = other
            if maxMatched is not None:
                uf.union(maxMatched, name)

        for key in list_of_names:
            parent = uf.find(key)
            frequency = actor_freq_dict[key][1]
            if parent not in compressed_dict:
                compressed_dict[parent] = ([key], frequency)
            else:
                members, best = compressed_dict[parent]
                members.append(key)
                compressed_dict[parent] = (members, max(best, frequency))

        return compressed_dict
115+
116+
117+
118+
#===================================================================================================================
119+
if __name__ == '__main__':
    # Manual smoke test: compress a small hand-written actor frequency
    # dictionary and pretty-print the grouping.
    sample_actors = {
        u'HENDRICKS': ([u'HENDRICKS'], 4),
        u'EBTEKAR': ([u'EBTEKAR'], 4),
        u'BARBARA_HENDRICKS': ([u'HENDRICKS', u'BARBARA'], 4),
        u'MASOUMEH_EBTEKAR': ([u'EBTEKAR', u'MASOUMEH'], 4),
    }

    resolver = ActorResolver()
    grouped = resolver.compress(sample_actors)
    pprint.pprint(grouped)

ClusterManager.pyc

3.47 KB
Binary file not shown.

ClusterSImilarity.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import re
2+
from pprint import pprint
3+
4+
from fuzzywuzzy import fuzz
5+
from datasketch import MinHash, MinHashLSH
6+
7+
class ClusterSimilarity:
    """Word-level similarity between two underscore-separated actor names."""

    def matches(self, stra, strb):
        """Return True if ``stra`` occurs as a substring of ``strb``."""
        return stra in strb

    def measure(self, currentActor, newActor):
        """Return the in-order word overlap of the two names as a float.

        Both names are split on ``"_"``. The words of the shorter name are
        matched, in order, against the words of the longer name (a word
        matches when it is a substring of the longer name's word). The number
        of matched words is divided by the word count of ``newActor``.
        """
        wordsCurrentActor = currentActor.split("_")
        wordsNewActor = newActor.split("_")

        # The shorter name supplies the words to look for; on a tie the new
        # name is searched for inside the current one.
        if len(wordsCurrentActor) >= len(wordsNewActor):
            needles, haystack = wordsNewActor, wordsCurrentActor
        else:
            needles, haystack = wordsCurrentActor, wordsNewActor

        matched = 0
        for word in haystack:
            # Stop once every needle is matched; indexing needles[matched]
            # past this point raised IndexError before.
            if matched == len(needles):
                break
            if self.matches(needles[matched], word):
                matched += 1

        # NOTE(review): when currentActor is the shorter name the matched
        # words come from currentActor but the count is still divided by
        # len(wordsNewActor); kept unchanged -- confirm this is intended.
        return float(matched) / len(wordsNewActor)
33+
34+
#==================================================================================================
35+
class FuzzyClusterSimilarity(ClusterSimilarity):
    """Similarity based on fuzzywuzzy's partial-ratio string matching."""

    def measure(self, currentActor, newActor):
        """Return ``fuzz.partial_ratio`` of the two names.

        Underscores in both names are replaced by spaces before scoring.
        """
        current_text = currentActor.replace("_", " ")
        new_text = newActor.replace("_", " ")
        return fuzz.partial_ratio(current_text, new_text)
38+
39+
40+
#==================================================================================================
41+
class LSHClusterSimilarity(ClusterSimilarity):
    """Experimental comparison of two names through a MinHash LSH index."""

    # Number of permutations used for every MinHash and for the LSH index.
    NUM_PERM = 128

    def measure(self, currentActor, newActor):
        """Index ``currentActor``, query with ``newActor`` and print the hits.

        NOTE(review): unlike the other ``measure`` implementations this one
        returns None; it only prints the query result.
        """
        signatures = []
        for text in (currentActor, newActor):
            # Each name is fed to its MinHash as one UTF-8 encoded value.
            signature = MinHash(num_perm=self.NUM_PERM)
            signature.update(text.encode('utf8'))
            signatures.append(signature)
        current_signature, new_signature = signatures

        index = MinHashLSH(threshold=0.5, num_perm=self.NUM_PERM)
        index.insert(currentActor, current_signature)
        print(index.query(new_signature))
        # Alternative left disabled by the original author:
        # print mh1.jaccard(mh2)
54+
55+
56+
57+
#===================================================================================================
58+
# NOTE(review): everything below runs at import time, so any module that
# imports a class from this file also reads the dictionary file and prints
# the tables. Consider moving it under ``if __name__ == '__main__':`` once
# nothing relies on these module-level names.
clusterSimilarity = FuzzyClusterSimilarity()

currentActor = None
actorNamesDict = {}  # actor line -> list of its "+" synonym lines

# ``with`` closes the dictionary file even if parsing raises.
with open("data/dictionaries/Phoenix.International.actors.txt") as actorFile:
    for line in actorFile:
        line = line.strip()
        if not line or line.startswith('#'):  # blank or comment line
            continue
        # Drop a trailing inline comment, then any bracketed segments.
        line = line.split('#')[0]
        line = re.sub(r'\[[^\]]*\]', '', line).strip()
        if not line:
            continue

        if line.startswith("+"):
            # A synonym before any actor line has nothing to attach to;
            # storing it under None would crash measure(None, ...) below.
            if currentActor is None:
                continue
            actorNamesDict.setdefault(currentActor, []).append(line.replace("+", ""))
        else:
            currentActor = line

# Score every actor against each of its own synonyms.
actorSynsetRatio = {}
for key in actorNamesDict:
    actorSynonyms = actorNamesDict[key]
    actorSynsetRatio[key] = {}
    for other in actorSynonyms:
        actorSynsetRatio[key][other] = clusterSimilarity.measure(key, other)

pprint(actorSynsetRatio)

# Score every unordered pair of distinct actors that have synonyms.
diffActorRatio = {}
actorNames = list(actorNamesDict)

for i, name in enumerate(actorNames):
    diffActorRatio[name] = {}
    for otherName in actorNames[i + 1:]:
        diffActorRatio[name][otherName] = clusterSimilarity.measure(name, otherName)

print("\n")
pprint(diffActorRatio)

print(clusterSimilarity.measure("Donald Trump", "Melanila Trump"))

lshCS = LSHClusterSimilarity()

lshCS.measure("TRUMP", "TRUM")
119+
120+
121+
122+
123+
124+
125+
126+
127+

ClusterSImilarity.pyc

3.54 KB
Binary file not shown.

LICENSE

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2014 Open Event Data
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy of
6+
this software and associated documentation files (the "Software"), to deal in
7+
the Software without restriction, including without limitation the rights to
8+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9+
the Software, and to permit persons to whom the Software is furnished to do so,
10+
subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
recursive-include petrarch/data *

PETRARCH.log

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
INFO 2016-08-28 15:12:34,871: Running
2+
INFO 2016-08-28 15:12:34,871: Using Config file: PETR_config.ini
3+
INFO 2016-08-28 15:12:34,873: Reading CAMEO.2.0.txt
4+
INFO 2016-08-28 15:12:37,091: Reading Phoenix.agents.txt
5+
6+
WARNING 2016-08-28 15:12:37,121: Codes are required for agents in .agents file ; line skipped
7+
INFO 2016-08-28 15:12:37,128: Reading Phoenix.discards.txt
8+
INFO 2016-08-28 15:12:37,131: Reading Phoenix.IssueCoding.txt

Petrarch2.pdf

205 KB
Binary file not shown.

0 commit comments

Comments
 (0)