Skip to content

Commit 30bfb95

Browse files
author
Daniel
committed
get NELL labels
update script to get the labels of the NELL KG
1 parent a037cda commit 30bfb95

36 files changed

+16204
-2780
lines changed

.idea/workspace.xml

Lines changed: 90 additions & 359 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.metadata/.plugins/org.eclipse.core.resources/.history/10/8034eae76cb500161ceeb559396e6832

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/1c/503bbc846db500161ceeb559396e6832

Lines changed: 657 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/1d/909c8ca56cb500161ceeb559396e6832

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/73/b0c903c55bb500161ceeb559396e6832

Lines changed: 629 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/7b/50e457ec6bb500161ceeb559396e6832

Lines changed: 655 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/87/300c75ca6db500161ceeb559396e6832

Lines changed: 673 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/88/7018bb756cb500161ceeb559396e6832

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/91/20ac65236cb500161ceeb559396e6832

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/93/106bc8ac6cb500161ceeb559396e6832

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/9c/1006585369b500161ceeb559396e6832

Lines changed: 653 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/a1/a01bffdc56b500161ceeb559396e6832

Lines changed: 580 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/a8/205a200f6db500161ceeb559396e6832

Lines changed: 657 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/b1/40f3cc6669b500161ceeb559396e6832

Lines changed: 653 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/bc/7088c3c869b500161ceeb559396e6832

Lines changed: 653 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/c4/10a084d25ab500161ceeb559396e6832

Lines changed: 580 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/c6/a0e3feb768b500161ceeb559396e6832

Lines changed: 650 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/da/004136cc69b500161ceeb559396e6832

Lines changed: 655 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/ef/a00eeaf55ab500161ceeb559396e6832

Lines changed: 628 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/f2/50d05e1c6cb500161ceeb559396e6832

Lines changed: 657 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/f8/c073c9b56db500161ceeb559396e6832

Lines changed: 673 additions & 0 deletions
Large diffs are not rendered by default.

.metadata/.plugins/org.eclipse.core.resources/.history/fa/504dcc8a68b500161ceeb559396e6832

Lines changed: 645 additions & 0 deletions
Large diffs are not rendered by default.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

.metadata/.plugins/org.eclipse.e4.workbench/workbench.xmi

Lines changed: 2386 additions & 2387 deletions
Large diffs are not rendered by default.
Binary file not shown.
Loading
Loading

GetInstances/src/GetInstances.java

Lines changed: 125 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -24,75 +24,93 @@ public static void main(String[] args) {
2424

2525
boolean dbpedia = false;
2626
boolean yago = false;
27-
boolean opencyc = true;
27+
boolean opencyc = false;
28+
boolean nell = true;
2829

29-
String fInstanceTypesTransitive = "";
30-
String fInstanceTypes = "";
30+
String fType1 = "";
31+
String fType2 = "";
3132
String fLabels = "";
3233

3334
if (dbpedia) {
3435
//DBpedia files
3536
if (useSamples) {
36-
fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en_s.ttl";
37-
fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en_s.ttl";
37+
fType1 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en_s.ttl";
38+
fType2 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en_s.ttl";
3839
fLabels = "/Users/curtis/SeminarPaper_KG_files/DBpedia/labels_en_s.ttl";
3940
} else { //full files
40-
fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en.ttl";
41-
fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en.ttl";
41+
fType1 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_transitive_en.ttl";
42+
fType2 = "/Users/curtis/SeminarPaper_KG_files/DBpedia/instance_types_en.ttl";
4243
fLabels = "/Users/curtis/SeminarPaper_KG_files/DBpedia/labels_en.ttl";
4344
}
4445
// get all classes for DBpedia
4546
HashSet<String> classes = getDBpediaClasses();
4647
System.out.println(classes);
4748

48-
runProcess(0, fInstanceTypesTransitive, fInstanceTypes, fLabels, classes);
49+
runProcess(0, fType1, fType2, fLabels, classes);
4950
}
5051
if (yago) {
5152
//YAGO files
5253
if (useSamples) {
53-
fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType_s.ttl";
54-
fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes_s.ttl";
54+
fType1 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType_s.ttl";
55+
fType2 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes_s.ttl";
5556
fLabels = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoLabels_s.ttl";
5657
} else {
57-
fInstanceTypesTransitive = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType.ttl";
58-
fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes.ttl";
58+
fType1 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTransitiveType.ttl";
59+
fType2 = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoTypes.ttl";
5960
fLabels = "/Users/curtis/SeminarPaper_KG_files/YAGO/yagoLabels.ttl";
6061
}
6162
// get all classes for YAGO
6263
HashSet<String> classes = getYagoClasses();
6364
System.out.println(classes);
6465

65-
runProcess(1, fInstanceTypesTransitive, fInstanceTypes, fLabels, classes);
66+
runProcess(1, fType1, fType2, fLabels, classes);
6667

6768
}
6869

6970
if (opencyc) {
7071
//OpenCyc files
7172
if (useSamples) {
72-
fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest_sample.nt";
73+
fType1 = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest_sample.nt";
7374
fLabels = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest_sample.nt";
7475
} else {
75-
fInstanceTypes = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest.nt";
76+
fType1 = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest.nt";
7677
fLabels = "/Users/curtis/SeminarPaper_KG_files/OpenCyc/opencyc-latest.nt";
7778
}
7879
// get all classes for OpenCyc
7980
HashSet<String> classes = getOpenCycClasses();
8081
System.out.println(classes);
8182

82-
runProcess(2, fInstanceTypesTransitive, fInstanceTypes, fLabels, classes);
83+
runProcess(2, fType1, fType2, fLabels, classes);
84+
}
85+
if (nell) {
86+
//NELL files
87+
if (useSamples) {
88+
fType1 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv_s.csv";
89+
fType2 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.ontology_s.csv";
90+
fLabels = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv_s.csv";
91+
} else {
92+
fType1 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv.csv";
93+
fType2 = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.ontology.csv";
94+
fLabels = "/Users/curtis/SeminarPaper_KG_files/NELL/NELL.08m.995.esv.csv";
95+
}
96+
// get all classes for NELL
97+
HashSet<String> classes = getNellClasses();
98+
System.out.println(classes);
99+
100+
runProcess(3, fType1, fType2, fLabels, classes);
83101
}
84102

85103
System.out.println("EXECUTION TIME: " + ((System.nanoTime() - startTime)/1000000000) + " seconds." );
86104
}
87105

88106

89107

90-
private static void runProcess(int kg, String fInstanceTypesTransitive,
91-
String fInstanceTypes, String fLabels, HashSet<String> classes) {
108+
private static void runProcess(int kg, String fType1,
109+
String fType2, String fLabels, HashSet<String> classes) {
92110

93111
int skipRows = 0;
94-
if (kg == 0)
95-
skipRows = 1; //skip first row for dbpedia
112+
if (kg == 0 || kg == 3)
113+
skipRows = 1; //skip first row for dbpedia and nell
96114
else if (kg == 1)
97115
skipRows = 10; //skip first ten rows for yago
98116

@@ -101,9 +119,8 @@ else if (kg == 1)
101119

102120
try {
103121
// GET ALL INSTANCES FOR ALL CLASSES
104-
Map<String, Set<String>> classInstances = getClassInstances(kg, fInstanceTypesTransitive, fInstanceTypes, classes, allInstancesSet, skipRows);
122+
Map<String, Set<String>> classInstances = getClassInstances(kg, fType1, fType2, classes, allInstancesSet, skipRows);
105123

106-
107124
System.out.println("allInstancesSet.size():" + allInstancesSet.size());
108125
int instanceCount = 0;
109126
for (Entry<String, Set<String>> entry : classInstances.entrySet()) {
@@ -120,7 +137,7 @@ else if (kg == 1)
120137
.skip(skipRows) //skip first row
121138
.filter(line -> containsInstanceNameEn(kg, line, classInstances, allInstancesSet, labeledInstancesSet))
122139
//.collect(Collectors.toMap(line -> getS(line), Collectors.toSet(line -> getLabel(getO(line)))));
123-
.collect(Collectors.groupingBy(line -> getS(kg, line), Collectors.mapping(line -> getLabel(kg, getO(kg, line)), Collectors.toSet())));
140+
.collect(Collectors.groupingBy(line -> getS(kg, line), Collectors.mapping(line -> getLabel(kg, line), Collectors.toSet())));
124141

125142
//System.out.println(classInstances);
126143
//System.out.println(instancesWithLabel);
@@ -166,6 +183,8 @@ else if (kg == 1)
166183
resultFolder = "yagoResults/";
167184
} else if (kg == 2) {
168185
resultFolder = "opencycResults/";
186+
} else if (kg == 3) {
187+
resultFolder ="nellResults/";
169188
}
170189
//http://stackoverflow.com/questions/2885173/how-to-create-a-file-and-write-to-a-file-in-java
171190
//for (Entry<String, Set<String>> entry : classInstances.entrySet()) {
@@ -188,14 +207,14 @@ else if (kg == 1)
188207

189208

190209
private static Map<String, Set<String>> getClassInstances(
191-
int kg, String fInstanceTypesTransitive, String fInstanceTypes, HashSet<String> classes, HashSet<String> allInstancesSet, int skipRows) {
210+
int kg, String fType1, String fType2, HashSet<String> classes, HashSet<String> allInstancesSet, int skipRows) {
192211
//create stream objects of the files
193212
//http://www.oracle.com/technetwork/articles/java/ma14-java-se-8-streams-2177646.html
194213
Map<String, Set<String>> classInstances = null;
195214
try {
196-
if (fInstanceTypesTransitive != "") { //check transitive type file
197-
Stream<String> itTransitive = Files.lines(Paths.get(fInstanceTypesTransitive));
198-
Stream<String> it = Files.lines(Paths.get(fInstanceTypes));
215+
if (fType2 != "") { //concat if two files are passed to method
216+
Stream<String> itTransitive = Files.lines(Paths.get(fType1));
217+
Stream<String> it = Files.lines(Paths.get(fType2));
199218
// read files
200219
classInstances =
201220
Stream.concat(itTransitive, it)
@@ -204,7 +223,7 @@ private static Map<String, Set<String>> getClassInstances(
204223
//collect: group by className (third argument), set of all instance names (first argument): instance a className
205224
.collect(Collectors.groupingBy(line -> getO(kg, line), Collectors.mapping(line -> getS(kg, line), Collectors.toSet())));
206225
} else { //only one file
207-
Stream<String> it = Files.lines(Paths.get(fInstanceTypes));
226+
Stream<String> it = Files.lines(Paths.get(fType2));
208227
// read files
209228
classInstances =
210229
it
@@ -241,10 +260,14 @@ private static boolean containsInstanceNameEn(int kg, String line,
241260
//DBpedia and OpenCyc
242261
labelString = "<http://www.w3.org/2000/01/rdf-schema#label>";
243262
englishLabel = "@en";
244-
} else {
263+
} else if (kg == 1) {
245264
//YAGO
246265
labelString = "rdfs:label";
247266
englishLabel = "@eng";
267+
} else if (kg == 3) {
268+
labelString = "generalizations";
269+
englishLabel = "concept";
270+
248271
}
249272
//check if line was complete (yago contains single element references as line)
250273
if (spo.length >= 3) {
@@ -290,22 +313,38 @@ private static String getS(int kg, String line) {
290313
}
291314
/**
292315
* Get label of string
293-
* @param o (string)
316+
* @param line (string)
294317
* @return the label text with the language tag removed (for NELL: the seventh tab-separated field of the line)
295318
*/
296-
private static String getLabel(int kg, String o) {
319+
private static String getLabel(int kg, String line) {
297320
String returnString = "";
321+
322+
298323
if (kg==0 || kg == 2) {
299324
//DBpedia and OpenCyc: LABEL_TO_KEEP@en\s
325+
String o = getO(kg, line);
300326
returnString = o.substring(0, o.length()-4); //-4 due to whitespace created by getSPO in the end
301327
} else if (kg == 1) {
302328
//YAGO: "LABEL_TO_KEEP"@eng .\n
329+
String o = getO(kg, line);
303330
returnString = o.substring(1, o.length()-7);
331+
} else if (kg == 3) { //NELL
332+
returnString = getEntityLiteralStringsInNell(kg, line);
304333
}
305334
//System.out.println("getLabel: " + returnString);
306335
return returnString;
307336
}
337+
/**
338+
* Get seventh argument of line
339+
* @param line
340+
* @return spo[6] (String)
341+
*/
342+
private static String getEntityLiteralStringsInNell(int kg, String line) {
343+
String spo[] = getSPO(kg, line);
344+
return spo[6];
345+
}
308346

347+
309348
/**
310349
* Check if line contains className
311350
* @param kg 0:dbpedia, 1:yago, 2:opencyc, 3:nell
@@ -333,8 +372,11 @@ private static boolean containsClassName(int kg, String line, HashSet<String> cl
333372
} else if (kg == 2) {//opencyc
334373
typeString = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>";
335374
classString = "<http://sw.opencyc.org/concept/";
336-
classString2 = "<http://sw.opencyc.org/concept/";
337-
375+
classString2 = "<http://sw.opencyc.org/concept/";
376+
} else if (kg == 3) { //nell
377+
typeString = "generalizations";
378+
classString = "concept:";
379+
classString2 = "concept:";
338380
}
339381

340382
//check if line was complete (yago contains single element references as line)
@@ -373,6 +415,9 @@ private static String getClassNameOfURI(int kg, String o) {
373415
} else if (kg == 2) {
374416
//<http://sw.opencyc.org/concept/CLASSNAME_TO_KEEP>
375417
returnString = o.substring(31, o.length()-1);
418+
} else if (kg == 3) {
419+
//"concept:CLASSNAME_TO_KEEP"
420+
returnString = o.substring(8, o.length());
376421
}
377422
//System.out.println("getClassNameOfURI: " + returnString);
378423
return returnString;
@@ -417,7 +462,7 @@ private static String[] getSPO(int kg, String line) {
417462
String[] preWords = line.split("\\s+"); //split on whitespace
418463
words = preWords;
419464
}
420-
} else { //YAGO
465+
} else { //YAGO and NELL
421466
String[] preWords = line.split("\\t"); //split on tab
422467
words = preWords;
423468
}
@@ -529,6 +574,10 @@ private static HashSet<String> getYagoClasses() {
529574
));
530575
return classNameArray;
531576
}
577+
/**
578+
* Get HashSet containing all class names in OpenCyc
579+
* @return HashSet of all OpenCyc class names
580+
*/
532581
private static HashSet<String> getOpenCycClasses() {
533582
HashSet<String> classNameArray = new HashSet<String>();
534583
classNameArray.addAll(Arrays.asList(
@@ -576,5 +625,47 @@ private static HashSet<String> getOpenCycClasses() {
576625
));
577626
return classNameArray;
578627
}
628+
/**
629+
* Get HashSet containing all class names in NELL
630+
* @return HashSet of all NELL class names
631+
*/
632+
private static HashSet<String> getNellClasses() {
633+
HashSet<String> classNameArray = new HashSet<String>();
634+
classNameArray.addAll(Arrays.asList(
635+
//PERSON
636+
"humanagent",
637+
"agent",
638+
"person",
639+
"politician",
640+
"athlete",
641+
"actor",
642+
//ORGANIZATION
643+
"governmentorganization",
644+
"company",
645+
"politicalparty",
646+
//PLACE
647+
"location",
648+
"geopoliticallocation",
649+
"city",
650+
"country",
651+
//ART
652+
"creativework",
653+
"musicalbum",
654+
"musicsong",
655+
"movie",
656+
"book",
657+
//EVENT
658+
"event",
659+
"militaryeventtype",
660+
"militaryconflict",
661+
"sportsevent",
662+
//TRANSPORT
663+
"vehicle",
664+
//OTHER
665+
"chemical",
666+
"planet"
667+
));
668+
return classNameArray;
669+
}
579670
}
580671

0 commit comments

Comments
 (0)