Skip to content

Commit 7773cee

Browse files
author
Daniel
committed
update OpenCyc instance count
explicitly make sure that previously declared classes are not added to the instance set
1 parent 501cb97 commit 7773cee

File tree

2 files changed

+191
-0
lines changed

2 files changed

+191
-0
lines changed
File renamed without changes.

OpenCyc/statistics_opencyc_v4.py

+191
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#KG, #triples, #nodes, #properties, #classes, avgIndegree, medianIndegree, avgOutdegree, medianOutdegree
2+
import numpy
3+
import operator
4+
5+
#readFile = '../../../SeminarPaper_KG_Files/OpenCyc/opencyc-latest_s.nt'
6+
# Path of the N-Triples dump that the script scans.
readFile = '../../../SeminarPaper_KG_Files/OpenCyc/opencyc-latest.nt'

# Vocabulary terms, written exactly as they appear (angle brackets included)
# in the N-Triples input.
rdfType = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'
owlClass = '<http://www.w3.org/2002/07/owl#Class>'
owlThing = '<http://www.w3.org/2002/07/owl#Thing>'

# A progress message is printed each time this many lines have been read.
lineProgress = 1000000

# Running totals and term sets filled in while the file is scanned.
sTriples = 0             # number of well-formed triples
sNodes = set()           # URI / blank-node terms seen as subject or object
sNodesNamespace = set()  # subset of sNodes inside the OpenCyc concept namespace
sProperties = set()      # URI terms seen in predicate position
sClasses = set()         # subjects typed as owl:Class
sInstances = set()       # non-class subjects typed with a member of sClasses

# Per-term edge counts: all nodes first, then instances only.
indegreeDict = {}
outdegreeDict = {}
instanceIndegreeDict = {}
instanceOutdegreeDict = {}
22+
23+
def isLiteral(w):
    """Return True if the term *w* begins with a double quote (a literal)."""
    return w.startswith('"')
27+
28+
def isURI(w):
    """Return True if the term *w* begins with '<' (a bracketed URI)."""
    return w.startswith('<')
32+
33+
def isBlankNode(w):
    """Return True if the term *w* begins with '_:' (a blank-node label)."""
    return w.startswith('_:')
37+
38+
def isNamespaceURI(w):
    """Return True if the term *w* is a URI under the OpenCyc concept namespace."""
    return w.startswith('<http://sw.opencyc.org/concept/')
42+
43+
def getSPO(splittedLine):
    """Return the (subject, predicate, object) terms of one tokenised line.

    Only the first three tokens are used; anything after them (for example
    the terminating '.') is ignored.

    Args:
        splittedLine: sequence of whitespace-separated tokens of one line.

    Returns:
        A 3-tuple ``(subj, pred, obj)``.

    Raises:
        ValueError: if the line holds fewer than three tokens. Previously
            this surfaced as an UnboundLocalError with no hint about the
            offending input.
    """
    # NOTE(review): the object is just the third token, so a literal that
    # contains whitespace reaches this function already cut at its first
    # space when the caller tokenises with str.split().
    terms = tuple(splittedLine[:3])
    if len(terms) < 3:
        raise ValueError(
            'expected at least 3 terms (subject, predicate, object), '
            'got {}: {!r}'.format(len(terms), splittedLine))
    return terms
56+
57+
def countTriple(s,p,o):
    """Count the triple if its terms are well-formed; otherwise echo it.

    A triple is counted when the subject is a URI or blank node, the
    predicate is a URI, and the object is a URI, blank node or literal.
    Anything else is printed and left out of the count.
    """
    global sTriples
    subjectOk = isURI(s) or isBlankNode(s)
    objectOk = isURI(o) or isBlankNode(o) or isLiteral(o)
    if not (subjectOk and isURI(p) and objectOk):
        print ('{} {} {}'.format(s,p,o))
        return
    sTriples += 1
63+
64+
def checkAndAddNode(n):
    """Record *n* as a node if it is a URI or a blank node.

    Terms in the OpenCyc concept namespace are also recorded in
    sNodesNamespace. Literals are ignored.
    """
    if not (isURI(n) or isBlankNode(n)):
        return
    # set.add is a no-op for members already present.
    sNodes.add(n)
    if isNamespaceURI(n):
        sNodesNamespace.add(n)
73+
74+
def countNodes(s,o):
    """Register the subject and then the object of a triple as nodes."""
    for term in (s, o):
        checkAndAddNode(term)
77+
78+
def countProperties(p):
    """Add the predicate *p* to the set of distinct properties if it is a URI."""
    if isURI(p):
        # Re-adding a known property leaves the set unchanged.
        sProperties.add(p)
83+
84+
def countClasses(s,p,o):
    """Record *s* as a class when the triple states ``s rdf:type owl:Class``."""
    declaresClass = (p == rdfType) and (o == owlClass)
    if declaresClass:
        sClasses.add(s)
89+
90+
def countInstances(s,p,o):
    """Record *s* as an instance when it is typed with a known class.

    The triple must be ``s rdf:type o`` with *o* already in sClasses.
    Subjects that are themselves declared classes are never added to the
    instance set.
    """
    if p != rdfType or o not in sClasses:
        return
    if s in sClasses:
        return
    sInstances.add(s)
96+
97+
def addIndegree(o):
    """Increment the in-degree of *o* if it is a URI or a blank node."""
    if not (isURI(o) or isBlankNode(o)):
        return
    indegreeDict[o] = indegreeDict.get(o, 0) + 1
104+
105+
def addOutdegree(s):
    """Increment the out-degree of *s* if it is a URI or a blank node."""
    if not (isURI(s) or isBlankNode(s)):
        return
    outdegreeDict[s] = outdegreeDict.get(s, 0) + 1
112+
113+
def addInstanceIndegree(o):
    """Increment the in-degree of *o*, but only if *o* is a known instance."""
    if o in sInstances:
        instanceIndegreeDict[o] = instanceIndegreeDict.get(o, 0) + 1
121+
122+
def addInstanceOutdegree(s):
    """Increment the out-degree of *s*, but only if *s* is a known instance."""
    if s in sInstances:
        instanceOutdegreeDict[s] = instanceOutdegreeDict.get(s, 0) + 1
130+
131+
def getAvg(d):
    """Return the arithmetic mean of the values of the dict *d*.

    Args:
        d: mapping whose values are numbers (here: per-node degree counts).

    Returns:
        The mean as a float, or NaN when *d* is empty.
    """
    # On Python 3, d.values() is a view; numpy.array() would wrap the view
    # itself in a 0-d object array, so materialise it as a list first.
    values = list(d.values())
    if not values:
        # Avoids numpy's "mean of empty slice" RuntimeWarning; the result
        # numpy would produce for empty input is NaN as well.
        return numpy.nan
    return numpy.average(numpy.array(values))
133+
134+
def getMedian(d):
    """Return the median of the values of the dict *d*.

    Args:
        d: mapping whose values are numbers (here: per-node degree counts).

    Returns:
        The median as a float, or NaN when *d* is empty.
    """
    # On Python 3, d.values() is a view; numpy.array() would wrap the view
    # itself in a 0-d object array, so materialise it as a list first.
    values = list(d.values())
    if not values:
        # Avoids numpy's empty-slice RuntimeWarning; the result numpy would
        # produce for empty input is NaN as well.
        return numpy.nan
    return numpy.median(numpy.array(values))
136+
137+
def _statisticsLine():
    """Format every statistic collected so far as the one-line summary."""
    return ('#triples: {}, #nodes: {}, #namespaceNodes: {}, #properties: {}, '
            '#classes: {}, #instances: {}, '
            'avgIndegree: {}, medianIndegree: {}, '
            'avgOutdegree: {}, medianOutdegree: {}, '
            'avgInstanceIndegree: {}, medianInstanceIndegree: {}, '
            'avgInstanceOutdegree: {}, medianInstanceOutdegree: {}').format(
                sTriples, len(sNodes), len(sNodesNamespace), len(sProperties),
                len(sClasses), len(sInstances),
                getAvg(indegreeDict), getMedian(indegreeDict),
                getAvg(outdegreeDict), getMedian(outdegreeDict),
                getAvg(instanceIndegreeDict), getMedian(instanceIndegreeDict),
                getAvg(instanceOutdegreeDict), getMedian(instanceOutdegreeDict))


def _scanFile(path, handleTriple):
    """Read *path* once and call ``handleTriple(s, p, o)`` for every triple line."""
    # 'with' closes the file even when a handler raises.
    with open(path, 'r') as f:
        for lineCounter, line in enumerate(f, 1):
            splittedLine = line.rstrip('\n').split()
            # Blank lines and '#' comment lines carry no triple; skip them
            # instead of letting getSPO fail on them.
            if splittedLine and not splittedLine[0].startswith('#'):
                s, p, o = getSPO(splittedLine)
                handleTriple(s, p, o)
            if (lineCounter % lineProgress == 0):
                # Floor division keeps the message an integer on Python 3.
                print ('{} million lines read'.format(lineCounter // 1000000))


def _collectGraphStatistics(s, p, o):
    """Pass 1: triples, nodes, properties, classes and overall degrees."""
    countTriple(s,p,o)
    countNodes(s,o)
    countProperties(p)
    countClasses(s,p,o)
    addIndegree(o)
    addOutdegree(s)


def _collectInstanceDegrees(s, p, o):
    """Pass 3: degrees restricted to the instances found in pass 2."""
    addInstanceIndegree(o)
    addInstanceOutdegree(s)


# The file is read three times because each pass needs the complete result
# of the previous one: instances can only be recognised once every class is
# known, and instance degrees only once every instance is known.
try:
    print('GET STATISTICS FOR OPENCYC')
    _scanFile(readFile, _collectGraphStatistics)
    print('First run complete.')
    print(_statisticsLine())

    # Pass 2: collect the instances of the classes found in pass 1.
    _scanFile(readFile, countInstances)
    print('Second run complete.')
    print(_statisticsLine())

    _scanFile(readFile, _collectInstanceDegrees)
    print('DONE')
    print(_statisticsLine())
    #print sNodes
    #print sProperties
    #print ('###############')
    #print ('#####')
    #print ('###############')
    #print sClasses
    #print (sorted(instanceIndegreeDict.items(), key=operator.itemgetter(1)))
except Exception as err:
    # Still best-effort at top level, but report what went wrong and let
    # KeyboardInterrupt / SystemExit propagate instead of swallowing them.
    print('ERROR: {}: {}'.format(type(err).__name__, err))
191+

0 commit comments

Comments
 (0)