@@ -24,75 +24,93 @@ public static void main(String[] args) {
24
24
25
25
boolean dbpedia = false ;
26
26
boolean yago = false ;
27
- boolean opencyc = true ;
27
+ boolean opencyc = false ;
28
+ boolean nell = true ;
28
29
29
- String fInstanceTypesTransitive = "" ;
30
- String fInstanceTypes = "" ;
30
+ String fType1 = "" ;
31
+ String fType2 = "" ;
31
32
String fLabels = "" ;
32
33
33
34
if (dbpedia ) {
34
35
//DBpedia files
35
36
if (useSamples ) {
36
- fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en_s.ttl" ;
37
- fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en_s.ttl" ;
37
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en_s.ttl" ;
38
+ fType2 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en_s.ttl" ;
38
39
fLabels = "/Users/curtis/SeminarPaper_KG_files/DBpedia/labels_en_s.ttl" ;
39
40
} else { //full files
40
- fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en.ttl" ;
41
- fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en.ttl" ;
41
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en.ttl" ;
42
+ fType2 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en.ttl" ;
42
43
fLabels = "/Users/curtis/SeminarPaper_KG_files/DBpedia/labels_en.ttl" ;
43
44
}
44
45
// get all classes for DBpedia
45
46
HashSet <String > classes = getDBpediaClasses ();
46
47
System .out .println (classes );
47
48
48
- runProcess (0 , fInstanceTypesTransitive , fInstanceTypes , fLabels , classes );
49
+ runProcess (0 , fType1 , fType2 , fLabels , classes );
49
50
}
50
51
if (yago ) {
51
52
//YAGO files
52
53
if (useSamples ) {
53
- fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType_s.ttl" ;
54
- fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes_s.ttl" ;
54
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType_s.ttl" ;
55
+ fType2 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes_s.ttl" ;
55
56
fLabels = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoLabels_s.ttl" ;
56
57
} else {
57
- fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType.ttl" ;
58
- fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes.ttl" ;
58
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType.ttl" ;
59
+ fType2 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes.ttl" ;
59
60
fLabels = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoLabels.ttl" ;
60
61
}
61
62
// get all classes for YAGO
62
63
HashSet <String > classes = getYagoClasses ();
63
64
System .out .println (classes );
64
65
65
- runProcess (1 , fInstanceTypesTransitive , fInstanceTypes , fLabels , classes );
66
+ runProcess (1 , fType1 , fType2 , fLabels , classes );
66
67
67
68
}
68
69
69
70
if (opencyc ) {
70
71
//OpenCyc files
71
72
if (useSamples ) {
72
- fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest_sample.nt" ;
73
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest_sample.nt" ;
73
74
fLabels = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest_sample.nt" ;
74
75
} else {
75
- fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest.nt" ;
76
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest.nt" ;
76
77
fLabels = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest.nt" ;
77
78
}
78
79
// get all classes for OpenCyc
79
80
HashSet <String > classes = getOpenCycClasses ();
80
81
System .out .println (classes );
81
82
82
- runProcess (2 , fInstanceTypesTransitive , fInstanceTypes , fLabels , classes );
83
+ runProcess (2 , fType1 , fType2 , fLabels , classes );
84
+ }
85
+ if (nell ) {
86
+ //NELL files
87
+ if (useSamples ) {
88
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv_s.csv" ;
89
+ fType2 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.ontology_s.csv" ;
90
+ fLabels = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv_s.csv" ;
91
+ } else {
92
+ fType1 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv.csv" ;
93
+ fType2 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.ontology.csv" ;
94
+ fLabels = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv.csv" ;
95
+ }
96
+ // get all classes for NELL
97
+ HashSet <String > classes = getNellClasses ();
98
+ System .out .println (classes );
99
+
100
+ runProcess (3 , fType1 , fType2 , fLabels , classes );
83
101
}
84
102
85
103
System .out .println ("EXECUTION TIME: " + ((System .nanoTime () - startTime )/1000000000 ) + " seconds." );
86
104
}
87
105
88
106
89
107
90
- private static void runProcess (int kg , String fInstanceTypesTransitive ,
91
- String fInstanceTypes , String fLabels , HashSet <String > classes ) {
108
+ private static void runProcess (int kg , String fType1 ,
109
+ String fType2 , String fLabels , HashSet <String > classes ) {
92
110
93
111
int skipRows = 0 ;
94
- if (kg == 0 )
95
- skipRows = 1 ; //skip first row for dbpedia
112
+ if (kg == 0 || kg == 3 )
113
+ skipRows = 1 ; //skip first row for dbpedia and nell
96
114
else if (kg == 1 )
97
115
skipRows = 10 ; //skip first ten rows for yago
98
116
@@ -101,9 +119,8 @@ else if (kg == 1)
101
119
102
120
try {
103
121
// GET ALL INSTANCES FOR ALL CLASSES
104
- Map <String , Set <String >> classInstances = getClassInstances (kg , fInstanceTypesTransitive , fInstanceTypes , classes , allInstancesSet , skipRows );
122
+ Map <String , Set <String >> classInstances = getClassInstances (kg , fType1 , fType2 , classes , allInstancesSet , skipRows );
105
123
106
-
107
124
System .out .println ("allInstancesSet.size():" + allInstancesSet .size ());
108
125
int instanceCount = 0 ;
109
126
for (Entry <String , Set <String >> entry : classInstances .entrySet ()) {
@@ -120,7 +137,7 @@ else if (kg == 1)
120
137
.skip (skipRows ) //skip first row
121
138
.filter (line -> containsInstanceNameEn (kg , line , classInstances , allInstancesSet , labeledInstancesSet ))
122
139
//.collect(Collectors.toMap(line -> getS(line), Collectors.toSet(line -> getLabel(getO(line)))));
123
- .collect (Collectors .groupingBy (line -> getS (kg , line ), Collectors .mapping (line -> getLabel (kg , getO ( kg , line ) ), Collectors .toSet ())));
140
+ .collect (Collectors .groupingBy (line -> getS (kg , line ), Collectors .mapping (line -> getLabel (kg , line ), Collectors .toSet ())));
124
141
125
142
//System.out.println(classInstances);
126
143
//System.out.println(instancesWithLabel);
@@ -166,6 +183,8 @@ else if (kg == 1)
166
183
resultFolder = "yagoResults/" ;
167
184
} else if (kg == 2 ) {
168
185
resultFolder = "opencycResults/" ;
186
+ } else if (kg == 3 ) {
187
+ resultFolder ="nellResults/" ;
169
188
}
170
189
//http://stackoverflow.com/questions/2885173/how-to-create-a-file-and-write-to-a-file-in-java
171
190
//for (Entry<String, Set<String>> entry : classInstances.entrySet()) {
@@ -188,14 +207,14 @@ else if (kg == 1)
188
207
189
208
190
209
private static Map <String , Set <String >> getClassInstances (
191
- int kg , String fInstanceTypesTransitive , String fInstanceTypes , HashSet <String > classes , HashSet <String > allInstancesSet , int skipRows ) {
210
+ int kg , String fType1 , String fType2 , HashSet <String > classes , HashSet <String > allInstancesSet , int skipRows ) {
192
211
//create stream objects of the files
193
212
//http://www.oracle.com/technetwork/articles/java/ma14-java-se-8-streams-2177646.html
194
213
Map <String , Set <String >> classInstances = null ;
195
214
try {
196
- if (fInstanceTypesTransitive != "" ) { //check transitive type file
197
- Stream <String > itTransitive = Files .lines (Paths .get (fInstanceTypesTransitive ));
198
- Stream <String > it = Files .lines (Paths .get (fInstanceTypes ));
215
+ if (fType2 != "" ) { //concat if two files are passed to method
216
+ Stream <String > itTransitive = Files .lines (Paths .get (fType1 ));
217
+ Stream <String > it = Files .lines (Paths .get (fType2 ));
199
218
// read files
200
219
classInstances =
201
220
Stream .concat (itTransitive , it )
@@ -204,7 +223,7 @@ private static Map<String, Set<String>> getClassInstances(
204
223
//collect: group by className (third argument), set of all instance names (first argument): instance a className
205
224
.collect (Collectors .groupingBy (line -> getO (kg , line ), Collectors .mapping (line -> getS (kg , line ), Collectors .toSet ())));
206
225
} else { //only one file
207
- Stream <String > it = Files .lines (Paths .get (fInstanceTypes ));
226
+ Stream <String > it = Files .lines (Paths .get (fType2 ));
208
227
// read files
209
228
classInstances =
210
229
it
@@ -241,10 +260,14 @@ private static boolean containsInstanceNameEn(int kg, String line,
241
260
//DBpedia and OpenCyc
242
261
labelString = "<http://www.w3.org/2000/01/rdf-schema#label>" ;
243
262
englishLabel = "@en" ;
244
- } else {
263
+ } else if ( kg == 1 ) {
245
264
//YAGO
246
265
labelString = "rdfs:label" ;
247
266
englishLabel = "@eng" ;
267
+ } else if (kg == 3 ) {
268
+ labelString = "generalizations" ;
269
+ englishLabel = "concept" ;
270
+
248
271
}
249
272
//check if line was complete (yago contains single element references as line)
250
273
if (spo .length >= 3 ) {
@@ -290,22 +313,38 @@ private static String getS(int kg, String line) {
290
313
}
291
314
/**
292
315
* Get label of string
293
- * @param o (string)
316
+ * @param line (string)
294
317
* @returns substring without label
295
318
*/
296
- private static String getLabel (int kg , String o ) {
319
+ private static String getLabel (int kg , String line ) {
297
320
String returnString = "" ;
321
+
322
+
298
323
if (kg ==0 || kg == 2 ) {
299
324
//DBpedia and OpenCyc: LABEL_TO_KEEP@en\s
325
+ String o = getO (kg , line );
300
326
returnString = o .substring (0 , o .length ()-4 ); //-4 due to whitespace created by getSPO in the end
301
327
} else if (kg == 1 ) {
302
328
//YAGO: "LABEL_TO_KEEP"@eng .\n
329
+ String o = getO (kg , line );
303
330
returnString = o .substring (1 , o .length ()-7 );
331
+ } else if (kg == 3 ) { //NELL
332
+ returnString = getEntityLiteralStringsInNell (kg , line );
304
333
}
305
334
//System.out.println("getLabel: " + returnString);
306
335
return returnString ;
307
336
}
337
+ /**
338
+ * Get seventh argument of line
339
+ * @param line
340
+ * @returns spo[6] (String)
341
+ */
342
+ private static String getEntityLiteralStringsInNell (int kg , String line ) {
343
+ String spo [] = getSPO (kg , line );
344
+ return spo [6 ];
345
+ }
308
346
347
+
309
348
/**
310
349
* Check if line contains className
311
350
* @param kg 0:dbpedia, 1:yago, 2:opencyc, 3:nell
@@ -333,8 +372,11 @@ private static boolean containsClassName(int kg, String line, HashSet<String> cl
333
372
} else if (kg == 2 ) {//opencyc
334
373
typeString = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>" ;
335
374
classString = "<http://sw.opencyc.org/concept/" ;
336
- classString2 = "<http://sw.opencyc.org/concept/" ;
337
-
375
+ classString2 = "<http://sw.opencyc.org/concept/" ;
376
+ } else if (kg == 3 ) { //nell
377
+ typeString = "generalizations" ;
378
+ classString = "concept:" ;
379
+ classString2 = "concept:" ;
338
380
}
339
381
340
382
//check if line was complete (yago contains single element references as line)
@@ -373,6 +415,9 @@ private static String getClassNameOfURI(int kg, String o) {
373
415
} else if (kg == 2 ) {
374
416
//<http://sw.opencyc.org/concept/CLASSNAME_TO_KEEP>
375
417
returnString = o .substring (31 , o .length ()-1 );
418
+ } else if (kg == 3 ) {
419
+ //"concept:CLASSNAME_TO_KEEP"
420
+ returnString = o .substring (8 , o .length ());
376
421
}
377
422
//System.out.println("getClassNameOfURI: " + returnString);
378
423
return returnString ;
@@ -417,7 +462,7 @@ private static String[] getSPO(int kg, String line) {
417
462
String [] preWords = line .split ("\\ s+" ); //split on whitespace
418
463
words = preWords ;
419
464
}
420
- } else { //YAGO
465
+ } else { //YAGO and NELL
421
466
String [] preWords = line .split ("\\ t" ); //split on tab
422
467
words = preWords ;
423
468
}
@@ -529,6 +574,10 @@ private static HashSet<String> getYagoClasses() {
529
574
));
530
575
return classNameArray ;
531
576
}
577
+ /**
578
+ * Get HashSet containing all class names in OpenCyc
579
+ * @return HashSet of all OpenCyc class names
580
+ */
532
581
private static HashSet <String > getOpenCycClasses () {
533
582
HashSet <String > classNameArray = new HashSet <String >();
534
583
classNameArray .addAll (Arrays .asList (
@@ -576,5 +625,47 @@ private static HashSet<String> getOpenCycClasses() {
576
625
));
577
626
return classNameArray ;
578
627
}
628
+ /**
629
+ * Get HashSet containing all class names in NELL
630
+ * @return Array of all NELL classes
631
+ */
632
+ private static HashSet <String > getNellClasses () {
633
+ HashSet <String > classNameArray = new HashSet <String >();
634
+ classNameArray .addAll (Arrays .asList (
635
+ //PERSON
636
+ "humanagent" ,
637
+ "agent" ,
638
+ "person" ,
639
+ "politician" ,
640
+ "athlete" ,
641
+ "actor" ,
642
+ //ORGANIZATION
643
+ "governmentorganization" ,
644
+ "company" ,
645
+ "politicalparty" ,
646
+ //PLACE
647
+ "location" ,
648
+ "geopoliticallocation" ,
649
+ "city" ,
650
+ "country" ,
651
+ //ART
652
+ "creativework" ,
653
+ "musicalbum" ,
654
+ "musicsong" ,
655
+ "movie" ,
656
+ "book" ,
657
+ //EVENT
658
+ "event" ,
659
+ "militaryeventtype" ,
660
+ "militaryconflict" ,
661
+ "sportsevent" ,
662
+ //TRANSPORT
663
+ "vehicle" ,
664
+ //OTHER
665
+ "chemical" ,
666
+ "planet"
667
+ ));
668
+ return classNameArray ;
669
+ }
579
670
}
580
671
0 commit comments