Skip to content

(WIP) Adding fine-grain entity typing to the pipeline. #602

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ public class ViewNames {
public static final String POST_ERE = "POST_ERE";
public static final String EVENT_ERE = "EVENT_ERE";

public static final String FINE_NER_TYPE = "FINE_NER_TYPE";
public static final String FINE_NER_TYPE_WSD = "FINE_NER_TYPE_WSD";
public static final String WEB_IS_A = "WEB_IS_A";

public static final String TRANSLITERATION = "TRANSLITERATION";

public static ViewTypes getViewType(String viewName) {
Expand Down
57 changes: 57 additions & 0 deletions finetyper/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Maven build descriptor for the fine-typer module (fine-grained entity typing).
The module inherits its group id and version from the illinois-cogcomp-nlp parent.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<artifactId>illinois-cogcomp-nlp</artifactId>
<groupId>edu.illinois.cs.cogcomp</groupId>
<version>4.0.1</version>
</parent>

<artifactId>fine-typer</artifactId>

<!--
NOTE(review): FinerTyperFactory imports com.google.gson, which is not declared below;
presumably it arrives transitively - confirm, or declare it explicitly.
-->
<dependencies>
<!-- Core data structures (TextAnnotation, View, Constituent, ViewNames). -->
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-core-utilities</artifactId>
<version>4.0.1</version>
</dependency>
<!-- Datastore client used by FinerResource to fetch the large model files. -->
<!-- NOTE(review): RELEASE is a floating version resolved at build time, so builds are not
reproducible; consider pinning a concrete version. -->
<dependency>
<groupId>org.cogcomp</groupId>
<artifactId>cogcomp-datastore</artifactId>
<version>RELEASE</version>
</dependency>
<!-- extJWNL library and its WordNet 3.0 data package; presumably used for synset lookup by
the hypernym typer, whose implementation is not part of this view - TODO confirm. -->
<dependency>
<groupId>net.sf.extjwnl</groupId>
<artifactId>extjwnl</artifactId>
<version>1.9.2</version>
</dependency>
<dependency>
<groupId>net.sf.extjwnl</groupId>
<artifactId>extjwnl-data-wn30</artifactId>
<version>1.2</version>
</dependency>
<!-- Tar/gzip stream support used by FinerResource to read *.tar.gz resources. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.8.1</version>
</dependency>
<!-- Test-only: the NLP pipeline. -->
<!-- NOTE(review): version 4.0.0 differs from the parent version 4.0.1 - confirm this is
intentional. -->
<dependency>
<groupId>edu.illinois.cs.cogcomp</groupId>
<artifactId>illinois-nlp-pipeline</artifactId>
<version>4.0.0</version>
<scope>test</scope>
</dependency>
<!-- Test-only: SLF4J logging API. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.25</version>
<scope>test</scope>
</dependency>

</dependencies>


</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package edu.illinois.cs.cogcomp.finetyper;

import io.minio.errors.InvalidEndpointException;
import io.minio.errors.InvalidPortException;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.cogcomp.Datastore;
import org.cogcomp.DatastoreException;

import java.io.*;

/**
 * Names of the FINER model resources kept in the CogComp datastore, plus helpers
 * for opening them as plain input streams.
 *
 * Created by haowu4 on 2/3/18.
 */
public class FinerResource {

    // NOTE(review): this group id orders its segments differently from the Java package
    // (edu.illinois.cs.cogcomp.finetyper). It is a datastore key, so it is left untouched;
    // confirm it matches the id the resources were published under.
    public static final String FINER_RESOURCE_GROUP_ID = "edu.cogcomp.cs.illinois.finetyper";

    public static final String WORD_EMBEDDING_TAR_GZ = "word_embedding.txt.tar.gz";
    public static final String SENSE_EMBEDDING_TAR_GZ = "synset_embeddings_300.txt.tar.gz";
    public static final String KB_BIAS_RESOURCE_TAR_GZ = "kbias.txt.tar.gz";
    public static final String WORD_POS_TO_SENSE_TAR_GZ = "word_pos_to_synsets.txt.tar.gz";
    public static final String SYNSET2TYPE_TAR_GZ = "synset2TypeMap.txt.tar.gz";

    /** Version under which every FINER resource is requested from the datastore. */
    private static final double RESOURCE_VERSION = 1.0;

    /**
     * Opens a gzip-compressed tar archive and positions the returned stream on the
     * archive's first entry, so that reading it yields that entry's content.
     *
     * A tar stream that has not been advanced to an entry reports end-of-stream on the
     * first read, which made every resource look empty. If the archive holds no entry
     * at all the returned stream is simply empty.
     *
     * @param file the {@code .tar.gz} file to open
     * @return a stream over the content of the first archive entry; the caller closes it
     * @throws IOException if the file cannot be opened or is not a valid gzip/tar archive
     */
    public static InputStream getTarGZInputStrem(File file) throws IOException {
        FileInputStream rawStream = new FileInputStream(file);
        try {
            TarArchiveInputStream tarStream =
                    new TarArchiveInputStream(new GzipCompressorInputStream(rawStream));
            // Move onto the first entry; without this the stream has no current entry to read.
            tarStream.getNextTarEntry();
            return tarStream;
        } catch (IOException | RuntimeException e) {
            // The wrappers were never handed to the caller, so release the file handle here.
            try {
                rawStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Creates a datastore client using the no-argument {@link Datastore} constructor.
     *
     * @return a new datastore client
     * @throws DatastoreException if the client cannot be created
     */
    public static Datastore getDefaultDatastore() throws DatastoreException {
        return new Datastore();
    }

    /**
     * Fetches the named FINER resource from the datastore and opens it for reading.
     * Names ending in {@code tar.gz} are unpacked via {@link #getTarGZInputStrem(File)};
     * any other file is returned as a raw file stream.
     *
     * @param dataStore the datastore client to fetch from
     * @param name      the resource name, e.g. {@link #KB_BIAS_RESOURCE_TAR_GZ}
     * @return an open stream over the resource content; the caller closes it
     * @throws IOException        if the fetched file cannot be opened or unpacked
     * @throws DatastoreException if the datastore cannot provide the resource
     */
    public static InputStream getResourceInputStream(Datastore dataStore, String name) throws IOException, DatastoreException {
        File file = dataStore.getFile(
                FinerResource.FINER_RESOURCE_GROUP_ID, name, RESOURCE_VERSION);
        if (name.endsWith("tar.gz")) {
            return getTarGZInputStrem(file);
        }
        return new FileInputStream(file);
    }


}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package edu.illinois.cs.cogcomp.finetyper;

import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View;

import java.util.ArrayList;
import java.util.List;

/**
 * Small helpers shared by the fine-typer components.
 *
 * Created by haowu4 on 2/7/18.
 */
public class Utils {
    /**
     * Collects the constituents of one view that lie entirely inside a sentence.
     *
     * Token spans are treated as half-open ranges {@code [start, end)}, which is the
     * convention of the CogComp text-annotation data structures. A constituent whose
     * end span equals the sentence's end span therefore still belongs to the sentence;
     * the earlier strict {@code <} comparison dropped every constituent that ended on
     * the sentence's final token.
     *
     * @param sentence the sentence whose token range bounds the result
     * @param ta       the text annotation that owns the view
     * @param viewName the name of the view to read constituents from
     * @return the constituents contained in the sentence, in the order the view returns them
     */
    public static List<Constituent> getSentenceConstituents(Sentence sentence, TextAnnotation ta, String viewName) {
        List<Constituent> ret = new ArrayList<>();
        int start = sentence.getStartSpan();
        int end = sentence.getEndSpan();
        View view = ta.getView(viewName);

        for (Constituent ct : view.getConstituents()) {
            // End spans are exclusive on both sides, so equality means "ends with the sentence".
            if (ct.getStartSpan() >= start && ct.getEndSpan() <= end) {
                ret.add(ct);
            }
        }
        return ret;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package edu.illinois.cs.cogcomp.finetyper.finer;

import edu.illinois.cs.cogcomp.annotation.Annotator;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.*;
import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
import edu.illinois.cs.cogcomp.finetyper.finer.components.typers.IFinerTyper;
import edu.illinois.cs.cogcomp.finetyper.finer.components.mention.MentionDetecter;
import edu.illinois.cs.cogcomp.finetyper.finer.datastructure.FineTypeConstituent;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

/**
 * Annotator that adds the {@link ViewNames#FINE_NER_TYPE} view to a text annotation.
 * It requires the {@link ViewNames#POS} and {@link ViewNames#NER_ONTONOTES} views.
 *
 * For each sentence a {@link MentionDetecter} proposes candidate mentions, every
 * registered {@link IFinerTyper} is then applied to those candidates in registration
 * order, and the candidates that convert to a constituent are stored in the view.
 *
 * Created by haowu4 on 1/15/17.
 */
public class FinerAnnotator extends Annotator {
    private MentionDetecter mentionDetecter;
    private List<IFinerTyper> typers;


    /**
     * @param mentionDetecter the component that proposes mention candidates per sentence
     * @param typers          the typers to apply, in order; the list is used as-is (not
     *                        copied), and {@code null} is treated as "no typers yet" so
     *                        that {@link #addTyper(IFinerTyper)} works afterwards
     */
    public FinerAnnotator(MentionDetecter mentionDetecter, List<IFinerTyper> typers) {
        super(ViewNames.FINE_NER_TYPE, new String[]{ViewNames.POS, ViewNames.NER_ONTONOTES});
        this.mentionDetecter = mentionDetecter;
        if (typers == null) {
            this.typers = new ArrayList<>();
        } else {
            this.typers = typers;
        }
    }

    /** Replaces the component that proposes mention candidates. */
    public void setMentionDetecter(MentionDetecter mentionDetecter) {
        this.mentionDetecter = mentionDetecter;
    }

    /** Appends a typer; it runs after the typers that are already registered. */
    public void addTyper(IFinerTyper typer) {
        this.typers.add(typer);
    }

    /** No-op: all components are supplied through the constructor and setters. */
    @Override
    public void initialize(ResourceManager rm) {

    }

    /**
     * Builds the {@link ViewNames#FINE_NER_TYPE} view and attaches it to {@code ta}.
     * Candidates whose {@code toConstituent} result is empty are left out of the view.
     *
     * @param ta the text annotation to annotate
     */
    @Override
    public void addView(TextAnnotation ta) {
        List<FineTypeConstituent> fineTypes = this.getAllFineTypeConstituents(ta);
        View finalAnnotation = new SpanLabelView(ViewNames.FINE_NER_TYPE, ta);
        for (FineTypeConstituent c : fineTypes) {
            Optional<Constituent> ret = c.toConstituent(ViewNames.FINE_NER_TYPE);
            ret.ifPresent(finalAnnotation::addConstituent);
        }
        ta.addView(ViewNames.FINE_NER_TYPE, finalAnnotation);
    }

    /**
     * Runs mention detection and all typers over every sentence of {@code ta}.
     *
     * @param ta the text annotation to process
     * @return all candidates of all sentences, in sentence order, each after its
     *         {@code finish()} call
     */
    public List<FineTypeConstituent> getAllFineTypeConstituents(TextAnnotation ta) {
        List<FineTypeConstituent> allCandidates = new ArrayList<>();
        for (int i = 0; i < ta.getNumberOfSentences(); i++) {
            Sentence sent = ta.getSentence(i);
            List<FineTypeConstituent> sentenceCandidates = mentionDetecter.getMentionCandidates(ta, sent);
            for (IFinerTyper typer : this.typers) {
                typer.annotate(sentenceCandidates, sent);
            }

            // finish() is invoked once per candidate after every typer has run;
            // presumably it finalizes the candidate's labels - confirm in FineTypeConstituent.
            for (FineTypeConstituent c : sentenceCandidates) {
                c.finish();
                allCandidates.add(c);
            }
        }
        return allCandidates;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
package edu.illinois.cs.cogcomp.finetyper.finer;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import edu.illinois.cs.cogcomp.finetyper.FinerResource;
import edu.illinois.cs.cogcomp.finetyper.finer.components.typers.IFinerTyper;
import edu.illinois.cs.cogcomp.finetyper.finer.components.mention.MentionDetecter;
import edu.illinois.cs.cogcomp.finetyper.finer.components.typers.HypernymTyper;
import edu.illinois.cs.cogcomp.finetyper.finer.components.typers.KBBiasTyper;
import edu.illinois.cs.cogcomp.finetyper.finer.components.mention.BasicMentionDetection;
import edu.illinois.cs.cogcomp.finetyper.finer.components.mention.TypeMapper;
import edu.illinois.cs.cogcomp.finetyper.finer.components.typers.NGramPattern;
import edu.illinois.cs.cogcomp.finetyper.finer.components.typers.NGramPatternBasedTyper;
import edu.illinois.cs.cogcomp.finetyper.finer.datastructure.types.FinerType;
import edu.illinois.cs.cogcomp.finetyper.finer.datastructure.types.TypeSystem;
import org.cogcomp.Datastore;
import org.cogcomp.DatastoreException;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Builds {@link FinerAnnotator} instances wired with the default FINER components:
 * a type system and mention detector read from classpath resources, a pattern typer
 * read from the classpath, and hypernym / KB-bias typers read from the datastore.
 *
 * Loading is lazy by default and happens on the first {@link #getAnnotator()} call.
 * Callers may supply their own mention detector or typers through the setters.
 *
 * Created by haowu4 on 5/16/17.
 */
public class FinerTyperFactory {
    private static final String TYPE_HIERARCHY_RESOURCE = "finer_resource/figer_hier.json";
    private static final String TYPE_MAPPING_RESOURCE = "finer_resource/ontonote_type_mapping.json";
    private static final String PATTERN_DB_RESOURCE = "finer_resource/patterndb.txt";

    private TypeSystem typeSystem = null;
    private MentionDetecter mentionDetecter = null;
    private List<IFinerTyper> typers = null;

    /**
     * Creates a lazily initialised factory; nothing is loaded until
     * {@link #getAnnotator()} is called.
     *
     * @throws DatastoreException declared for parity with the eager constructor
     */
    public FinerTyperFactory() throws DatastoreException {
        this(true);
    }

    /**
     * @param lazyInit {@code true} to defer loading to {@link #getAnnotator()},
     *                 {@code false} to load every default component immediately
     * @throws DatastoreException if eager loading cannot reach the datastore resources
     */
    public FinerTyperFactory(boolean lazyInit) throws DatastoreException {

        this.typers = new ArrayList<>();
        if (!lazyInit) {
            this.init();
        }
    }

    /**
     * Loads the default components.
     *
     * The mention detector and typers are committed to the factory only after every
     * resource has loaded, so a failure cannot leave a partial typer list behind. A
     * mention detector that was already supplied is kept rather than overwritten.
     * Default typers are appended to whatever typers are already registered.
     *
     * @throws DatastoreException   if a datastore resource cannot be fetched
     * @throws UncheckedIOException if a resource cannot be read
     * @throws IllegalStateException if a bundled classpath resource is missing
     */
    private void init() throws DatastoreException {

        try (InputStream is = openClasspathResource(TYPE_HIERARCHY_RESOURCE)) {
            this.typeSystem = TypeSystem.getFromJson(is);
        } catch (IOException e) {
            throw loadFailure(TYPE_HIERARCHY_RESOURCE, e);
        }

        MentionDetecter detecter = this.mentionDetecter;
        if (detecter == null) {
            try (InputStream is = openClasspathResource(TYPE_MAPPING_RESOURCE)) {
                detecter = this.getMentionDetecter(is);
            } catch (IOException e) {
                throw loadFailure(TYPE_MAPPING_RESOURCE, e);
            }
        }

        List<IFinerTyper> defaultTypers = new ArrayList<>();
        try (InputStream is = openClasspathResource(PATTERN_DB_RESOURCE)) {
            defaultTypers.add(this.getPatternTyper(is));
        } catch (IOException e) {
            throw loadFailure(PATTERN_DB_RESOURCE, e);
        }

        // Now load larger components from Datastore.
        Datastore ds = FinerResource.getDefaultDatastore();
        try (InputStream is = FinerResource.getResourceInputStream(ds, FinerResource.SYNSET2TYPE_TAR_GZ)) {
            defaultTypers.add(this.getHypTyper(is));
        } catch (IOException e) {
            throw loadFailure(FinerResource.SYNSET2TYPE_TAR_GZ, e);
        }

        try (InputStream is = FinerResource.getResourceInputStream(ds, FinerResource.KB_BIAS_RESOURCE_TAR_GZ)) {
            defaultTypers.add(this.getKBBiasTyper(is));
        } catch (IOException e) {
            throw loadFailure(FinerResource.KB_BIAS_RESOURCE_TAR_GZ, e);
        }

        // Everything loaded: publish the result in one step.
        this.mentionDetecter = detecter;
        if (this.typers == null) {
            this.typers = new ArrayList<>();
        }
        this.typers.addAll(defaultTypers);
    }


    /**
     * Returns an annotator built from the factory's current components, loading the
     * defaults first when no mention detector or no typer is available yet.
     *
     * @return a new annotator sharing this factory's typer list
     * @throws DatastoreException if default components must be loaded and the datastore fails
     */
    public FinerAnnotator getAnnotator() throws DatastoreException {
        if (this.mentionDetecter == null || this.typers == null || this.typers.isEmpty()) {
            this.init();
        }
        return new FinerAnnotator(this.mentionDetecter, this.typers);
    }

    /** Opens a bundled resource, failing with a descriptive message when it is absent. */
    private static InputStream openClasspathResource(String path) {
        InputStream is = ClassLoader.getSystemResourceAsStream(path);
        if (is == null) {
            throw new IllegalStateException("Classpath resource not found: " + path);
        }
        return is;
    }

    /** Wraps a read failure so that it is reported instead of being silently printed. */
    private static UncheckedIOException loadFailure(String resource, IOException cause) {
        return new UncheckedIOException("Failed to load FINER resource: " + resource, cause);
    }

    /** Creates a UTF-8 line reader, independent of the platform default charset. */
    private static BufferedReader newReader(InputStream is) {
        return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    }

    /**
     * Splits a tab-separated record and checks that it has enough fields.
     *
     * @throws IllegalArgumentException if the line has fewer than {@code minFields} fields
     */
    private static String[] splitRecord(String line, int minFields, String source) {
        String[] parts = line.split("\t");
        if (parts.length < minFields) {
            throw new IllegalArgumentException(
                    "Malformed line in " + source + " (expected " + minFields
                            + " tab-separated fields): [" + line + "]");
        }
        return parts;
    }

    /** Resolves space-separated type names, dropping names unknown to the type system. */
    private List<FinerType> parseTypes(String spaceSeparatedNames) {
        return Arrays.stream(spaceSeparatedNames.split(" "))
                .map(this::getType)
                .filter(Optional::isPresent)
                .map(Optional::get)
                .collect(Collectors.toList());
    }

    /**
     * Reads a JSON object of string pairs and builds the default mention detector
     * from it. The stream is closed on return.
     */
    private MentionDetecter getMentionDetecter(InputStream is) {
        Gson gson = new GsonBuilder().create();
        Map<String, String> mapping = new HashMap<>();

        try (BufferedReader reader = newReader(is)) {
            // Gson only sees the raw map class here; the resource is a flat
            // string-to-string JSON object, so the unchecked conversion is safe.
            @SuppressWarnings("unchecked")
            Map<String, String> parsed = gson.fromJson(reader, mapping.getClass());
            if (parsed != null) {
                // fromJson yields null for an empty document; keep the empty map then.
                mapping = parsed;
            }
        } catch (IOException e) {
            throw loadFailure(TYPE_MAPPING_RESOURCE, e);
        }

        return new BasicMentionDetection(new TypeMapper(this.typeSystem, mapping));
    }

    /**
     * Reads KB-bias records of the form {@code pattern<TAB>type:score type:score ...}.
     * A record that cannot be parsed is reported on stderr and skipped.
     */
    private IFinerTyper getKBBiasTyper(InputStream is) throws IOException {
        Map<String, Map<FinerType, Double>> map = new HashMap<>();
        BufferedReader reader = newReader(is);
        String line;
        while ((line = reader.readLine()) != null) {
            String[] parts = line.split("\\t");
            String pattern = parts[0];
            Map<FinerType, Double> scoreMap = new HashMap<>();
            try {
                for (String typeAndScore : parts[1].split(" ")) {
                    String[] pair = typeAndScore.split(":");
                    FinerType type = getTypeOrFail(pair[0]);
                    double score = Double.parseDouble(pair[1]);
                    scoreMap.put(type, score);
                }
                map.put(pattern, scoreMap);
            } catch (RuntimeException exp) {
                System.err.println("[" + line + "] failed to process..");
            }

        }
        return new KBBiasTyper(map);
    }

    /**
     * Reads synset records of the form {@code synsetId<TAB>type type ...}.
     * Blank lines are ignored.
     *
     * @throws IllegalArgumentException if a non-blank line lacks the type field
     */
    private IFinerTyper getHypTyper(InputStream is) throws IOException {
        Map<String, List<FinerType>> map = new HashMap<>();
        BufferedReader reader = newReader(is);
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.trim().isEmpty()) {
                continue;
            }
            String[] parts = splitRecord(line, 2, FinerResource.SYNSET2TYPE_TAR_GZ);
            String synsetId = parts[0];
            map.put(synsetId, parseTypes(parts[1]));
        }
        return new HypernymTyper(map);
    }

    /**
     * Reads pattern records of the form
     * {@code before<TAB>token token ...<TAB>after<TAB>type type ...}.
     * Blank lines are ignored.
     *
     * @throws IllegalArgumentException if a non-blank line has fewer than four fields
     *                                  or a non-numeric {@code before}/{@code after}
     */
    private IFinerTyper getPatternTyper(InputStream is) throws IOException {
        Map<NGramPattern, List<FinerType>> map = new HashMap<>();
        BufferedReader reader = newReader(is);
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.trim().isEmpty()) {
                continue;
            }
            String[] parts = splitRecord(line, 4, PATTERN_DB_RESOURCE);

            int before = Integer.parseInt(parts[0]);
            String[] tokens = parts[1].split(" ");
            int after = Integer.parseInt(parts[2]);

            NGramPattern pattern = new NGramPattern(before, after, tokens);
            map.put(pattern, parseTypes(parts[3]));
        }
        return new NGramPatternBasedTyper(map);

    }

    private FinerType getTypeOrFail(String name) {
        return this.typeSystem.getTypeOrFail(name);
    }

    private Optional<FinerType> getType(String name) {
        return this.typeSystem.getType(name);
    }


    /** Supplies a custom mention detector; it is kept when defaults are loaded later. */
    public void setMentionDetecter(MentionDetecter mentionDetecter) {
        this.mentionDetecter = mentionDetecter;
    }

    /** Replaces the typer list; the given list is used as-is (not copied). */
    public void setTypers(List<IFinerTyper> typers) {
        this.typers = typers;
    }

    /** Returns the factory's live typer list (not a copy). */
    public List<IFinerTyper> getTypers() {
        return typers;
    }
}
Loading