openeventdata
diff --git a/‎ActorDictionary.py
Lines changed: 46 additions & 0 deletions b/‎ActorDictionary.py
Lines changed: 46 additions & 0 deletions
diff --git a/‎ActorDictionary.pyc
1.66 KB b/‎ActorDictionary.pyc
1.66 KB
diff --git a/‎ClusterManager.py
Lines changed: 125 additions & 0 deletions b/‎ClusterManager.py
Lines changed: 125 additions & 0 deletions
diff --git a/‎ClusterManager.pyc
3.47 KB b/‎ClusterManager.pyc
3.47 KB
diff --git a/‎ClusterSImilarity.py
Lines changed: 127 additions & 0 deletions b/‎ClusterSImilarity.py
Lines changed: 127 additions & 0 deletions
diff --git a/‎ClusterSImilarity.pyc
3.54 KB b/‎ClusterSImilarity.pyc
3.54 KB
diff --git a/‎LICENSE
Lines changed: 20 additions & 0 deletions b/‎LICENSE
Lines changed: 20 additions & 0 deletions
diff --git a/‎MANIFEST.in
Lines changed: 1 addition & 0 deletions b/‎MANIFEST.in
Lines changed: 1 addition & 0 deletions
diff --git a/‎PETRARCH.log
Lines changed: 8 additions & 0 deletions b/‎PETRARCH.log
Lines changed: 8 additions & 0 deletions
diff --git a/‎Petrarch2.pdf
205 KB b/‎Petrarch2.pdf
205 KB
@@ -0,0 +1,46 @@
+import re
+
+from ClusterSImilarity import FuzzyClusterSimilarity
+
+
+class ActorDictionary:
+    actor_filenames= ['Phoenix.Countries.actors.txt',
+                      'Phoenix.International.actors.txt',
+                      'Phoenix.MilNonState.actors.txt']
+    folder = 'data/dictionaries'
+
+    actor_set = set()
+
+    fcs = FuzzyClusterSimilarity()
+
+    THERSHOLD = 0.75
+
+    def __init__(self):
+        for filename in self.actor_filenames:
+            fs = open(self.folder + "/" + filename)
+            for line in fs:
+                line = line.strip()
+                if line.startswith('#') or len(line) == 0:  # if it is a comment
+                    continue
+                line = line.split('#')[0]
+
+                line = re.sub(r'\[[^\]]*\]', '', line).replace('_', ' ').replace('+', '').strip()
+                #print line
+                if len(line) > 1:
+                    self.actor_set.add(line)
+
+            fs.close()
+
+    def contains(self, actorname):
+        test = actorname.replace('_',' ').strip()
+        if ('VLADIMIR' in  test) or ('PUTIN' in test):
+            print 'Found'
+
+        return test in self.actor_set
+        # for name in self.actor_set:
+        #     if self.fcs.measure(name, actorname) > self.THERSHOLD:
+        #         return True
+        # return False
+
+
+
@@ -0,0 +1,125 @@
+import pprint
+
+from ActorDictionary import ActorDictionary
+from ClusterSImilarity import FuzzyClusterSimilarity
+from UnionFind import UnionFind
+
+
+class ActorResolver:
+
+
+    stored_actor_dict = ActorDictionary()
+
+    possible_actor_list = set()
+
+    clsSimilarity = FuzzyClusterSimilarity()
+
+    compressed_dict = {}
+
+    def getFrequencyCount(self, freq_dict={}):
+
+        updated_dict = {}
+
+        for doc_key in freq_dict:
+            doc_actor_dict = freq_dict[doc_key]
+
+            compress_actor_dict = self.compress(doc_actor_dict)
+            #compress_actor_dict = doc_actor_dict
+
+            for actor_key in compress_actor_dict:
+                if self.stored_actor_dict.contains(actor_key):
+                    continue # already in the CAMEO Actor Dictionary
+                else:
+                    possible_actor, ratio = self.getClosestMatch(actor_key)
+
+                    if possible_actor is not None:
+                        tokens, freq = updated_dict[possible_actor]
+                        if len(possible_actor) < len(actor_key):
+                            self.possible_actor_list.remove(possible_actor)
+                            self.possible_actor_list.add(actor_key)
+
+                            updated_dict.pop(possible_actor, None)
+                            updated_dict[actor_key] = (compress_actor_dict[actor_key][0].append(tokens), compress_actor_dict[actor_key][1]+freq)
+                        else:
+                            updated_dict[possible_actor] = (compress_actor_dict[actor_key][0].append(tokens), compress_actor_dict[actor_key][1]+freq)
+                    else:
+                        updated_dict[actor_key] = compress_actor_dict[actor_key]
+                        self.possible_actor_list.add(actor_key)
+        return updated_dict
+
+
+    def rank(self, freq_dict={}):
+        updated_dict = self.getFrequencyCount(freq_dict)
+
+
+
+
+    def getClosestMatch(self, actor_name):
+        max_ratio = 70
+        possible_actor = None
+        for name in self.possible_actor_list:
+            if self.clsSimilarity.measure(name, actor_name) > max_ratio:
+                max_ratio = self.clsSimilarity.measure(name, actor_name)
+                possible_actor = name
+        return (possible_actor, max_ratio)
+
+
+    def getParent(self, parentDict={}, key=None):
+        temp = key
+        while parentDict[temp] != temp:
+            temp = parentDict[temp]
+        print temp
+        return temp
+
+
+    def compress(self, actor_freq_dict={}):
+        compressed_dict = {}
+        list_of_names = []
+
+        for key in actor_freq_dict:
+            list_of_names.append(key)
+
+        uf = UnionFind(list_of_names)
+
+        for i in range(0, len(list_of_names)):
+            maxMatched = None
+            maxRatio = 70
+            for j in range(0, len(list_of_names)):
+                if i == j:
+                    continue
+                ratio = self.clsSimilarity.measure(uf.find(list_of_names[i]), uf.find(list_of_names[j]))
+
+                print ratio
+                if ratio > maxRatio:
+                    maxRatio = ratio
+                    maxMatched = list_of_names[j]
+            if maxMatched is not None:
+                uf.union(maxMatched, list_of_names[i])
+        print "TEST"
+
+        for key in list_of_names:
+            print key
+            parent = uf.find(key)
+            print parent
+            if parent not in compressed_dict:
+                print "Inserting"
+                print actor_freq_dict[key][1]
+                compressed_dict[parent]=([key], actor_freq_dict[key][1])
+            else:
+                print "Updating"
+                compressed_dict[parent][0].append(key)
+                maximum = max(compressed_dict[parent][1], actor_freq_dict[key][1])
+                compressed_dict[parent]= (compressed_dict[parent][0], maximum)
+
+        return compressed_dict
+
+
+
+#===================================================================================================================
+if __name__ == '__main__':
+
+    actorResolver = ActorResolver()
+
+    test_dict = {u'HENDRICKS': ([u'HENDRICKS'], 4), u'EBTEKAR': ([u'EBTEKAR'], 4), u'BARBARA_HENDRICKS': ([u'HENDRICKS', u'BARBARA'], 4), u'MASOUMEH_EBTEKAR': ([u'EBTEKAR', u'MASOUMEH'], 4)}
+
+    pprint.pprint(actorResolver.compress(test_dict))
@@ -0,0 +1,127 @@
+import re
+from pprint import pprint
+
+from fuzzywuzzy import fuzz
+from datasketch import MinHash, MinHashLSH
+
+class ClusterSimilarity:
+
+    def matches(self, stra, strb):
+        return stra in strb
+
+    def measure(self, currentActor, newActor):
+        wordsCurrentActor = currentActor.split("_")
+        wordsNewActor = newActor.split("_")
+
+        if len(wordsCurrentActor) >= len(wordsNewActor): #current actor name may be a superset of the new name suggested
+            i = 0
+            j = 0
+            while i < len(wordsCurrentActor):
+                if self.matches(wordsNewActor[j], wordsCurrentActor[i]):
+                    j = j + 1
+                i = i + 1
+            return float(j) / len(wordsNewActor)
+
+        else:
+            i = 0
+            j = 0
+            while i < len(wordsNewActor):
+                if self.matches(wordsCurrentActor[j], wordsNewActor[i]):
+                    j = j + 1
+                i = i + 1
+            return float(j)/len(wordsNewActor)
+
+#==================================================================================================
+class FuzzyClusterSimilarity(ClusterSimilarity):
+    def measure(self, currentActor, newActor):
+        return fuzz.partial_ratio(currentActor.replace("_"," "), newActor.replace("_"," "))
+
+
+#==================================================================================================
+class LSHClusterSimilarity(ClusterSimilarity):
+
+    NUM_PERM =128
+
+    def measure(self, currentActor, newActor):
+        mh1 = MinHash(num_perm=self.NUM_PERM)
+        mh2 = MinHash(num_perm=self.NUM_PERM)
+        mh1.update(currentActor.encode('utf8'))
+        mh2.update(newActor.encode('utf8'))
+        lsh = MinHashLSH(threshold=0.5, num_perm=self.NUM_PERM)
+        lsh.insert(currentActor, mh1)
+        print lsh.query(mh2)
+        #print mh1.jaccard(mh2)
+
+
+
+#===================================================================================================
+clusterSimilarity = FuzzyClusterSimilarity()
+
+actorFile  = open("data/dictionaries/Phoenix.International.actors.txt")
+
+currentActor = None
+actorNamesDict = {}
+
+for line in actorFile:
+    line = line.strip()
+    if line.startswith('#') or len(line) == 0:  # if it is a comment
+        continue
+    line = line.split('#')[0]
+
+    line = re.sub(r'\[[^\]]*\]', '', line).strip()
+
+    if len(line) != 0:
+        if line.startswith("+"):
+            if currentActor not in actorNamesDict:
+                actorNamesDict[currentActor] = []
+            actorNamesDict[currentActor].append(line.replace("+",""))
+        else:
+            currentActor = line
+
+
+#pprint(actorNamesDict)
+
+actorSynsetRatio = {}
+
+for key in actorNamesDict:
+    actorSynonyms = actorNamesDict[key]
+    actorSynsetRatio[key] = {}
+    for other in actorSynonyms:
+        res = clusterSimilarity.measure(key, other)
+        actorSynsetRatio[key][other] = res
+
+pprint(actorSynsetRatio)
+
+
+diffActorRatio = {}
+
+actorNames = []
+
+for key in actorNamesDict:
+    actorNames.append(key)
+
+i = 0
+for i in range(0, len(actorNames)):
+    diffActorRatio[actorNames[i]] = {}
+    for j in range(i+1, len(actorNames)):
+        res = clusterSimilarity.measure(actorNames[i], actorNames[j])
+        diffActorRatio[actorNames[i]][actorNames[j]] = res
+
+print "\n"
+pprint(diffActorRatio)
+
+print clusterSimilarity.measure("Donald Trump", "Melanila Trump")
+
+
+lshCS = LSHClusterSimilarity()
+
+lshCS.measure("TRUMP", "TRUM")
+
+
+
+
+
+
+
+
+
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Open Event Data
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1 @@
+recursive-include petrarch/data *
@@ -0,0 +1,8 @@
+INFO 2016-08-28 15:12:34,871: Running
+INFO 2016-08-28 15:12:34,871: Using Config file: PETR_config.ini
+INFO 2016-08-28 15:12:34,873: Reading CAMEO.2.0.txt
+INFO 2016-08-28 15:12:37,091: Reading Phoenix.agents.txt
+
+WARNING 2016-08-28 15:12:37,121: Codes are required for agents in .agents file ; line skipped
+INFO 2016-08-28 15:12:37,128: Reading Phoenix.discards.txt
+INFO 2016-08-28 15:12:37,131: Reading Phoenix.IssueCoding.txt