-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractSentencesWithNP.py
122 lines (100 loc) · 3.75 KB
/
extractSentencesWithNP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import time
import copy
import spacy
nlp = spacy.load("en_core_web_sm")

# POS tags whose token is glued to the preceding token (no space in between)
# when the sentence text and the noun phrases are rebuilt.
_NO_LEADING_SPACE_TAGS = frozenset({"POS", ".", ","})


def _store_sentence(parsed, sentence, noun_phrases, running_np, in_np):
    """Append the finished sentence to *parsed*, closing a still-open NP first.

    Nothing is stored when no token was collected (e.g. repeated blank lines),
    so empty entries never reach the evaluation.
    """
    if in_np:
        # The sentence ended while a noun phrase was still being built.
        noun_phrases.append(running_np.strip())
    text = sentence.strip()
    if text:
        parsed.append({"sentence": text, "NPList": noun_phrases})


def _parse_conll_chunks(lines):
    """Rebuild sentences and their noun phrases from chunk-annotated lines.

    Every non-blank line must carry at least three whitespace-separated
    fields, read as: token, POS tag, chunk tag. A blank line ends the current
    sentence; the end of input ends it as well.

    A noun phrase starts at a "B-NP" tag and is extended by "I-NP" tags. A
    "B-NP" token whose POS tag is "POS" is attached to the running phrase
    instead of starting a new one.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A list of dicts with the keys "sentence" (the rebuilt text) and
        "NPList" (the noun phrases in order of appearance).

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    parsed = []
    sentence = ""
    noun_phrases = []
    running_np = ""
    in_np = False

    for line_number, raw_line in enumerate(lines, start=1):
        fields = raw_line.split()
        if not fields:
            _store_sentence(parsed, sentence, noun_phrases, running_np, in_np)
            sentence, noun_phrases, running_np, in_np = "", [], "", False
            continue
        if len(fields) < 3:
            raise ValueError(
                "line " + str(line_number)
                + ": expected 'token POS chunk-tag', got " + repr(raw_line)
            )

        word, pos_tag, chunk_tag = fields[:3]
        glue = "" if pos_tag in _NO_LEADING_SPACE_TAGS else " "

        extends_np = in_np and (
            chunk_tag == "I-NP" or (chunk_tag == "B-NP" and pos_tag == "POS")
        )
        if extends_np:
            running_np = running_np + glue + word
        else:
            if in_np:
                # Any other tag closes the phrase that was being built.
                noun_phrases.append(running_np.strip())
                running_np = ""
                in_np = False
            if chunk_tag == "B-NP":
                in_np = True
                running_np = word

        sentence = sentence + glue + word

    # Input without a trailing blank line still yields its last sentence.
    _store_sentence(parsed, sentence, noun_phrases, running_np, in_np)
    return parsed


with open("test.txt") as fp:
    sentenceList = _parse_conll_chunks(fp)
sentCount = len(sentenceList)
def _count_matching_chunks(predicted, gold):
    """Return how many predicted chunks match a gold chunk, one-to-one.

    Each gold chunk can be matched only once, so a chunk string predicted more
    often than it occurs in the ground truth is not counted repeatedly.
    """
    unmatched = list(gold)
    matched = 0
    for chunk_text in predicted:
        if chunk_text in unmatched:
            unmatched.remove(chunk_text)
            matched += 1
    return matched


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


print("Number of total sentences being evaluated: " + str(sentCount))
onlySentenceList = [item["sentence"] for item in sentenceList]
docs = list(nlp.pipe(onlySentenceList))
# Wall-clock time around noun-chunk extraction; recorded but not reported.
time1 = time.time()
chunks = [list(doc.noun_chunks) for doc in docs]
time2 = time.time()

same = 0
# Indices of the sentences whose predicted chunk list differs from the
# ground-truth list.
changes = []
totalChunksGroundTruth = 0
totalChunksPredicted = 0
totalCorrectChunksPredicted = 0
for i, (docChunks, goldEntry) in enumerate(zip(chunks, sentenceList)):
    chunkList = [str(item) for item in docChunks]
    goldNPs = goldEntry["NPList"]
    totalChunksGroundTruth = totalChunksGroundTruth + len(goldNPs)
    totalChunksPredicted = totalChunksPredicted + len(chunkList)
    totalCorrectChunksPredicted = (
        totalCorrectChunksPredicted + _count_matching_chunks(chunkList, goldNPs)
    )
    # A sentence counts as a perfect match only when both lists are equal,
    # including order.
    if chunkList == goldNPs:
        same = same + 1
    else:
        changes.append(i)

# Every ratio is guarded so that empty input, or a run without any predicted
# or gold chunk, reports 0.0 instead of raising ZeroDivisionError.
print("Fraction of sentences whose spacy predictions matched perfectly with CoNLL 2000 dataset ground truth: " + str(_safe_ratio(same, len(chunks))) + "\n")
precision = _safe_ratio(totalCorrectChunksPredicted, totalChunksPredicted) * 100
print("-----------------------------------------------------------------")
print("Precision: \t" + str(precision))
recall = _safe_ratio(totalCorrectChunksPredicted, totalChunksGroundTruth) * 100
print("Recall: \t" + str(recall))
FScore = 2 * _safe_ratio(precision * recall, precision + recall)
print("F Score: \t" + str(FScore))
print("-----------------------------------------------------------------")
# Uncomment the following lines to inspect specific cases where spaCy makes mistakes. The list "changes" holds the indices of all sentences whose predictions did not match the ground truth.
'''
print(changes)
sampleTest = 5
print("********* The sentence being analyzed *********")
print(onlySentenceList[sampleTest])
print("********* Predicted Noun Phrases *********")
print(chunks[sampleTest])
print("********* Ground Truth Noun Phrases *********")
print(sentenceList[sampleTest]["NPList"])
'''
#print(sentenceList[0:5])