This repository was archived by the owner on Jan 14, 2021. It is now read-only.

Hi Julien, I have committed the changes to optionally generate the vector in the same step, as well as to expose the vector parameters to the plugin #1

Open · wants to merge 5 commits into base: master
Changes from all commits
README (6 changes: 2 additions & 4 deletions)
@@ -8,9 +8,7 @@ Unzip the distribution archive into GATE/plugins or to the directory of your cho

Usage
The plugin contains 3 Processing Resources:
- TrainingCorpusCreator: generates a lexicon + raw file in the specified directory. See https://code.google.com/p/textclassification/ for instructions on
how to generate a vector file and model from a raw file.
- TrainingCorpusCreator: generates a lexicon + raw file + training vector in the specified directory. See https://code.google.com/p/textclassification/ for instructions on
how to generate a model file using the base text classification API, or http://www.csie.ntu.edu.tw/~cjlin/libsvm/ on how to use the libsvm tools directly on the vector file.
- ClassifierPR: takes a model and lexicon to classify the annotations specified in textAnnotationType
- NGram maker: generates n-grams that can be used as input for corpus generation or classification
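For readers embedding GATE rather than using the GUI, the resources listed above can be wired up programmatically. The following is a minimal sketch only: the plugin folder name, output directory and document path are placeholders, and the remaining runtime parameters (annotation set and type names such as textAnnotationType) still have to be set to match your corpus.

import java.io.File;

import gate.Corpus;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.ProcessingResource;
import gate.creole.SerialAnalyserController;

public class TrainingCorpusExample {
    public static void main(String[] args) throws Exception {
        Gate.init();
        // make the plugin's CREOLE resources available (folder name is an assumption)
        Gate.getCreoleRegister().registerDirectories(
                new File(Gate.getPluginsHome(), "TextClassificationPlugin").toURI().toURL());

        // init-time parameters of TrainingCorpusCreator: output directory + corpus reinit
        FeatureMap initParams = Factory.newFeatureMap();
        initParams.put("directory", new File("/tmp/tc-corpus").toURI().toURL());
        initParams.put("reinitCorpus", Boolean.TRUE);
        ProcessingResource creator = (ProcessingResource) Factory.createResource(
                "com.digitalpebble.gate.textclassification.TrainingCorpusCreatorPR", initParams);

        // run the PR over a corpus; lexicon, raw file and vector end up in /tmp/tc-corpus
        SerialAnalyserController pipeline = (SerialAnalyserController) Factory
                .createResource("gate.creole.SerialAnalyserController");
        pipeline.add(creator);
        Corpus corpus = Factory.newCorpus("training");
        corpus.add(Factory.newDocument(new File("doc1.xml").toURI().toURL()));
        pipeline.setCorpus(corpus);
        pipeline.execute();
    }
}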


build.properties (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
version=1.1-dev
version=1.2-dev
organisation=com.digitalpebble
plugin.name=TextClassificationPlugin
dist.dir=distrib
creole.xml (16 changes: 10 additions & 6 deletions)
@@ -5,8 +5,8 @@
<RESOURCE>
<NAME>TrainingCorpusCreator</NAME>
<CLASS>com.digitalpebble.gate.textclassification.TrainingCorpusCreatorPR</CLASS>
<JAR>TextClassificationPlugin-1.1-dev.jar</JAR>
<JAR>lib/textclassification-1.4.1-SNAPSHOT.jar</JAR>
<JAR>TextClassificationPlugin-1.2-dev.jar</JAR>
<JAR>lib/textclassification-1.6.1-SNAPSHOT.jar</JAR>
<JAR>lib/liblinear-1.51-with-deps.jar</JAR>
<PARAMETER NAME="document" RUNTIME="true">gate.Document</PARAMETER>
<PARAMETER NAME="inputAnnotationSet" RUNTIME="true" OPTIONAL="true">java.lang.String</PARAMETER>
@@ -17,6 +17,10 @@
<PARAMETER NAME="weightingScheme" RUNTIME="true" DEFAULT="frequency" OPTIONAL="true">java.lang.String</PARAMETER>
<PARAMETER NAME="directory" RUNTIME="false" DEFAULT="" OPTIONAL="false">java.net.URL</PARAMETER>
<PARAMETER NAME="reinitCorpus" RUNTIME="false" DEFAULT="True" OPTIONAL="false">java.lang.Boolean</PARAMETER>
<PARAMETER NAME="minFreq" RUNTIME="true" DEFAULT="1" OPTIONAL="true">java.lang.Integer</PARAMETER>
<PARAMETER NAME="maxFreq" RUNTIME="true" DEFAULT="2147483647" OPTIONAL="true">java.lang.Integer</PARAMETER>
<PARAMETER NAME="keepNBestAttributes" RUNTIME="true" DEFAULT="0" OPTIONAL="true">java.lang.Integer</PARAMETER>
<PARAMETER NAME="compactLexicon" RUNTIME="true" DEFAULT="True" OPTIONAL="true">java.lang.Boolean</PARAMETER>
<ICON>/dipe.png</ICON>
</RESOURCE>
</CREOLE>
@@ -25,8 +29,8 @@
<RESOURCE>
<NAME>Classifier</NAME>
<CLASS>com.digitalpebble.gate.textclassification.ClassifierPR</CLASS>
<JAR>TextClassificationPlugin-1.1-dev.jar</JAR>
<JAR>lib/textclassification-1.4.1-SNAPSHOT.jar</JAR>
<JAR>TextClassificationPlugin-1.2-dev.jar</JAR>
<JAR>lib/textclassification-1.6.1-SNAPSHOT.jar</JAR>
<JAR>lib/liblinear-1.51-with-deps.jar</JAR>
<PARAMETER NAME="document" RUNTIME="true">gate.Document</PARAMETER>
<PARAMETER NAME="inputAnnotationSet" RUNTIME="true" OPTIONAL="true">java.lang.String</PARAMETER>
@@ -43,8 +47,8 @@
<RESOURCE>
<NAME>NGram maker</NAME>
<CLASS>com.digitalpebble.gate.textclassification.NGram</CLASS>
<JAR>TextClassificationPlugin-1.1-dev.jar</JAR>
<JAR>lib/textclassification-1.4.1-SNAPSHOT.jar</JAR>
<JAR>TextClassificationPlugin-1.2-dev.jar</JAR>
<JAR>lib/textclassification-1.6.1-SNAPSHOT.jar</JAR>
<JAR>lib/liblinear-1.51-with-deps.jar</JAR>
<PARAMETER NAME="document" RUNTIME="true">gate.Document</PARAMETER>
<PARAMETER NAME="inputAnnotationSet" RUNTIME="true" OPTIONAL="true">java.lang.String</PARAMETER>
Binary file added lib/libsvm-3.0.jar
Binary file removed lib/textclassification-1.4.1-SNAPSHOT.jar
Binary file added lib/textclassification-1.6.1-SNAPSHOT.jar
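The newly bundled lib/libsvm-3.0.jar can also be used outside GATE to train a model directly on the vector file that TrainingCorpusCreator now writes, as the updated README suggests. The sketch below illustrates the libsvm Java API only: the vector path and SVM parameters are placeholders, and whether ClassifierPR can load the resulting model as-is depends on the textclassification API.

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;

import libsvm.svm;
import libsvm.svm_model;
import libsvm.svm_node;
import libsvm.svm_parameter;
import libsvm.svm_problem;

public class TrainFromVector {
    public static void main(String[] args) throws Exception {
        // read the libsvm-format vector file: "<label> <index>:<value> <index>:<value> ..."
        List<Double> labels = new ArrayList<Double>();
        List<svm_node[]> rows = new ArrayList<svm_node[]>();
        BufferedReader in = new BufferedReader(new FileReader("/tmp/tc-corpus/vector"));
        String line;
        while ((line = in.readLine()) != null) {
            if (line.trim().isEmpty()) continue;
            String[] tokens = line.trim().split("\\s+");
            labels.add(Double.valueOf(tokens[0]));
            svm_node[] row = new svm_node[tokens.length - 1];
            for (int i = 1; i < tokens.length; i++) {
                String[] kv = tokens[i].split(":");
                row[i - 1] = new svm_node();
                row[i - 1].index = Integer.parseInt(kv[0]);
                row[i - 1].value = Double.parseDouble(kv[1]);
            }
            rows.add(row);
        }
        in.close();

        svm_problem prob = new svm_problem();
        prob.l = labels.size();
        prob.y = new double[prob.l];
        for (int i = 0; i < prob.l; i++) prob.y[i] = labels.get(i).doubleValue();
        prob.x = rows.toArray(new svm_node[prob.l][]);

        svm_parameter param = new svm_parameter();
        param.svm_type = svm_parameter.C_SVC;
        param.kernel_type = svm_parameter.LINEAR;
        param.C = 1.0;
        param.eps = 0.001;
        param.cache_size = 100;

        svm_model model = svm.svm_train(prob, param);
        svm.svm_save_model("/tmp/tc-corpus/model.libsvm", model);
    }
}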
TrainingCorpusCreatorPR.java
@@ -32,13 +32,20 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import com.digitalpebble.classification.Document;
import com.digitalpebble.classification.FileTrainingCorpus;
import com.digitalpebble.classification.Learner;
import com.digitalpebble.classification.Lexicon;
import com.digitalpebble.classification.Parameters;
import com.digitalpebble.classification.TrainingCorpus;
import com.digitalpebble.classification.Parameters.WeightingMethod;
import com.digitalpebble.classification.RAMTrainingCorpus;
import com.digitalpebble.classification.util.CorpusUtils;
import com.digitalpebble.classification.util.scorers.AttributeScorer;
import com.digitalpebble.classification.util.scorers.logLikelihoodAttributeScorer;
import com.digitalpebble.classification.libsvm.Utils;

public class TrainingCorpusCreatorPR extends AbstractLanguageAnalyser
implements ProcessingResource {
@@ -66,11 +73,32 @@ public class TrainingCorpusCreatorPR extends AbstractLanguageAnalyser
* ComponentAnnotationValue (e.g. form) The feature value used for the ML attributes.
*/
private String attributeAnnotationValue;
/**
* Directory where lexicon, vector and raw model will be saved
*/
private URL directory;
private FileTrainingCorpus trainingcorpus;
/**
* Feature weighting scheme
*/
private String weightingScheme;

private Integer minFreq = 1;
private Integer maxFreq = Integer.MAX_VALUE;
/**
* Keep only the N best attributes, applied after pruning by min and max frequency
*/
private Integer keepNBestAttributes = 0;
/**
* Compact the lexicon after pruning
*/
Boolean compactLexicon = true;

private Boolean reinitCorpus = true;

private FileTrainingCorpus trainingcorpus;
private String implementation = Learner.LibSVMModelCreator;
String pathDirectory;
private String libsvmVectorPath;

/*
* this method gets called whenever an object of this class is created
@@ -92,8 +120,12 @@ public Resource init() throws ResourceInstantiationException {
}

// initializes the modelCreator
String pathDirectory = new File(URI.create(directory.toExternalForm()))
pathDirectory = new File(URI.create(directory.toExternalForm()))
.getAbsolutePath();
if(libsvmVectorPath == null || libsvmVectorPath.isEmpty()){
libsvmVectorPath = pathDirectory+File.separator+"vector";
}


try {
this.creator = Learner.getLearner(pathDirectory, implementation,
@@ -199,7 +231,28 @@ public void execute() throws ExecutionException {
.methodFromString(getWeightingScheme());
this.creator.setMethod(method);
trainingcorpus.close();
Lexicon lexicon = creator.getLexicon();
creator.saveLexicon();
//prune by frequency
lexicon.pruneTermsDocFreq(minFreq, maxFreq);
//further keep only the N best attributes
if (keepNBestAttributes >0) {
AttributeScorer scorer = logLikelihoodAttributeScorer.getScorer(
trainingcorpus, lexicon);
lexicon.setAttributeScorer(scorer);
lexicon.applyAttributeFilter(scorer, keepNBestAttributes);
}
// change the indices of the attributes to remove
// gaps between them
Map<Integer, Integer> equiv = null;
if (compactLexicon){
// create a new Lexicon object
equiv = lexicon.compact();
}
// save the modified lexicon file
lexicon.saveToFile(this.pathDirectory + File.separator + "lexicon.compact");
Utils.writeExamples(trainingcorpus,lexicon,
this.libsvmVectorPath, equiv);
} catch (Exception e) {
throw new ExecutionException(e);
} finally {
@@ -304,4 +357,44 @@ public void setImplementation(String implementation) {
this.implementation = implementation;
}

public Learner getCreator() {
return creator;
}

public void setCreator(Learner creator) {
this.creator = creator;
}

public Integer getMinFreq() {
return minFreq;
}

public void setMinFreq(Integer minFreq) {
this.minFreq = minFreq;
}

public Integer getMaxFreq() {
return maxFreq;
}

public void setMaxFreq(Integer maxFreq) {
this.maxFreq = maxFreq;
}

public Integer getKeepNBestAttributes() {
return keepNBestAttributes;
}

public void setKeepNBestAttributes(Integer keepNBestAttributes) {
this.keepNBestAttributes = keepNBestAttributes;
}

public Boolean getCompactLexicon() {
return compactLexicon;
}

public void setCompactLexicon(Boolean compactLexicon) {
this.compactLexicon = compactLexicon;
}

}
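To make the new flow in execute() easier to follow, here is the same pruning-and-export sequence isolated as a standalone helper with comments. The method names (pruneTermsDocFreq, applyAttributeFilter, compact, Utils.writeExamples) are taken verbatim from the diff above; the helper class, its signature and the output paths are purely illustrative.

import java.io.File;
import java.util.Map;

import com.digitalpebble.classification.FileTrainingCorpus;
import com.digitalpebble.classification.Learner;
import com.digitalpebble.classification.Lexicon;
import com.digitalpebble.classification.libsvm.Utils;
import com.digitalpebble.classification.util.scorers.AttributeScorer;
import com.digitalpebble.classification.util.scorers.logLikelihoodAttributeScorer;

public class PruneAndExport {
    public static void run(Learner creator, FileTrainingCorpus corpus, String dir,
            int minFreq, int maxFreq, int keepNBest, boolean compact) throws Exception {
        Lexicon lexicon = creator.getLexicon();
        creator.saveLexicon();                        // keep the full, unpruned lexicon as well
        lexicon.pruneTermsDocFreq(minFreq, maxFreq);  // drop too-rare and too-frequent terms
        if (keepNBest > 0) {
            // score the remaining attributes by log-likelihood and keep the N best
            AttributeScorer scorer = logLikelihoodAttributeScorer.getScorer(corpus, lexicon);
            lexicon.setAttributeScorer(scorer);
            lexicon.applyAttributeFilter(scorer, keepNBest);
        }
        Map<Integer, Integer> equiv = null;
        if (compact) {
            equiv = lexicon.compact();                // old index -> new index, with gaps removed
        }
        lexicon.saveToFile(dir + File.separator + "lexicon.compact");
        // write the training examples in libsvm format, remapped through equiv when present
        Utils.writeExamples(corpus, lexicon, dir + File.separator + "vector", equiv);
    }
}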