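"""
RunModel.py

Tags event nuggets in raw text with a pre-trained model:
  1. Tokenize, lemmatize and POS-tag input.txt with Stanford CoreNLP.
  2. Write the tokens to a CoNLL-style .features file.
  3. Load the model from models/model_eventnuggets.obj and predict
     O / B-EVENT / I-EVENT labels.
  4. Write "token<TAB>label" lines to output.txt.

Note: this is Python 2 code (print statement, xrange, iteritems, ConfigParser).
"""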
from SENNA.IO.FeatureStore import FeatureStore
from SENNA.TrainModels.TrainITC import TrainITC
import ConfigParser
import numpy as np
from libs.stanford_corenlp_pywrapper import CoreNLP
coreNlpPath = "corenlp/*"
inputPath = "input.txt"
outputPath = "output.txt"
configPath = "config/config.txt"
featureFile = inputPath+".features"
modelPath = 'models/model_eventnuggets.obj'
# :: Convert input to a .features file ::
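# The wrapper's "pos" pipeline yields tokens, sentence splits, POS tags and
# lemmas, which is everything getFeatures() reads from the parse result.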
proc = CoreNLP("pos", corenlp_jars=[coreNlpPath])
with open(inputPath) as fIn:
    text = fIn.read()
res = proc.parse_doc(text)
fOut = open(featureFile, 'w')
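# The .features file is CoNLL-style: one token per line, formatted as
# "label<TAB>key=value<TAB>key=value ...", with a blank line between sentences.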
def getCasing(word):
    if word.isdigit():  # is a digit
        return 'numeric'
    if word.islower():  # all lower case
        return 'allLower'
    elif word.isupper():  # all upper case
        return 'allUpper'
    elif word[0].isupper():  # initial char upper case
        return 'initialUpper'
    return 'other'
def getFeatures(sent, position):
    """Token/lemma/POS/casing features for a +/-3 token window around `position`."""
    DEFAULT = "PADDING"
    features = {}
    for offset in xrange(-3, 3+1):
        idx = position + offset
        inBounds = 0 <= idx < len(sent['tokens'])
        for fieldName, featureName in {'tokens': 'Token', 'lemmas': 'Lemma', 'pos': 'POS'}.iteritems():
            features["%s[%d]" % (featureName, offset)] = sent[fieldName][idx].strip() if inBounds else DEFAULT
        features["Case[%d]" % offset] = getCasing(sent['tokens'][idx]) if inBounds else DEFAULT
    return features
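# Example keys for a single token: Token[-3] ... Token[3], Lemma[0], POS[1],
# Case[-2]; window positions outside the sentence get the PADDING value.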
for sentence in res['sentences']:
    for position in xrange(len(sentence['tokens'])):
        features = getFeatures(sentence, position)
        featureString = []
        for key in sorted(features.keys()):
            featureString.append("%s=%s" % (key, features[key]))
        featureString = "\t".join(featureString)
        label = "O"  # placeholder label; the real labels are predicted further below
        fOut.write("%s\t%s\n" % (label, featureString))
    fOut.write("\n")
fOut.close()
# :: Run the existing model ::
#featureFile = 'tacdata/dev-data.txt'
labelsMapping = {'O':0, 'B-EVENT':1, 'I-EVENT':2}
inverseLabelsMapping = {v: k for k, v in labelsMapping.items()}
config = ConfigParser.ConfigParser()
config.read(configPath)
fixedFeatureNames = set([feature.strip() for feature in config.get('Main', 'featureSet').split(',') if len(feature.strip()) > 0])
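# featureSet in config.txt names the features the model was trained with; these
# should match the keys produced by getFeatures (e.g. Token[0], POS[-1]).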
featureStore = FeatureStore(config, fixedFeatureNames, {})
featureStore.minSentenceLength = 1
allFeatureNames = fixedFeatureNames
featureStore.labelTransform = None
featureStore.initVocabs()
# Load the embeddings
print "Load embeddings"
featureStore.loadVocabs()
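# createMatrices turns the .features file into the index matrices the network
# consumes; the placeholder "O" labels are mapped via labelsMapping but play
# no role at prediction time.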
devData = featureStore.createMatrices(featureFile, labelsMapping)
model = TrainITC([])
model.setData(devData, None, None)
model.numHidden = int(config.get('Main', 'numHidden'))
model.loadModel(modelPath)
predictions = model.predictLabels(devData.setX, devData.sentenceLengths)
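# predictLabels appears to return one label index per token, flattened in
# sentence order; predictionIdx below consumes them in the same order.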
# Store the predictions
tokens = featureStore.readFeatures(featureFile, ['Token[0]'])
fOut = open(outputPath, 'w')
predictionIdx = 0
for sentence in tokens:
    for token in sentence:
        word = token[1]['Token[0]']
        pred = inverseLabelsMapping[predictions[predictionIdx]]
        predictionIdx += 1
        fOut.write("%s\t%s\n" % (word, pred))
    fOut.write("\n")
fOut.close()