Description
German (like some other European languages) uses a dot to denote ordinals.
For example, instead of "1st place", German uses "1. Platz".
Instead of "July 28th", German uses "28. Juli".
Examples can be found en masse, for example:
dewiki:Fußball-Bundesliga (28. Juli, 2. Bundesliga, 1. Liga)
dewiki:9/11 (11. September)
dewiki:Stanford University (Der Grund und Boden wurde am 11. November 1885 von Leland Stanford zur Gründung der Universität gestiftet)
And the Duden, the "prescriptive source for German language spelling" (Wikipedia), uses:
Duden - Die deutsche Rechtschreibung, 26. Auflage
Unfortunately, CoreNLP will split all these sentences at the dot.
So CoreNLP currently cannot reliably split German sentences if they contain ordinal numbers or dates.
I am currently using the following workaround hack:
/**
 * Annotator that runs a wrapped {@link TokenizerAnnotator} and then folds a
 * lone "." token into the all-digit token immediately before it, so that
 * ordinals and dates written like "28." remain a single token.
 *
 * <p>Requirement declarations are delegated unchanged to the wrapped tokenizer.
 */
private static class FilteredTokenizer implements Annotator {
  /** Tokenizer whose output is post-processed. */
  private final TokenizerAnnotator inner;

  /**
   * @param inner the tokenizer to run before merging number-plus-dot tokens
   */
  public FilteredTokenizer(TokenizerAnnotator inner) {
    this.inner = inner;
  }

  /**
   * Tokenizes via the wrapped annotator, then replaces the token list with one
   * in which every "." that was merged into its predecessor has been dropped.
   *
   * @param annotation the annotation to tokenize; its token list is overwritten
   */
  @Override
  public void annotate(Annotation annotation) {
    inner.annotate(annotation);
    List<CoreLabel> original = annotation.get(CoreAnnotations.TokensAnnotation.class);
    List<CoreLabel> kept = new ArrayList<>(original.size());
    CoreLabel last = null;
    for (CoreLabel token : original) {
      // A token absorbed into the previously kept token is not emitted itself.
      if (last != null && updateAnnotation(last, token)) {
        continue;
      }
      kept.add(token);
      last = token;
    }
    annotation.set(CoreAnnotations.TokensAnnotation.class, kept);
  }

  /**
   * Merges {@code curr} into {@code prev} when {@code curr} is a single "."
   * directly adjacent to {@code prev} and the original text of {@code prev}
   * consists only of digits.
   *
   * @param prev the most recently kept token; extended in place on a merge
   * @param curr the candidate token to absorb
   * @return {@code true} if {@code curr} was merged into {@code prev}
   */
  private boolean updateAnnotation(CoreLabel prev, CoreLabel curr) {
    final int currBegin = curr.beginPosition();
    final int currEnd = curr.endPosition();

    final boolean singleChar = currBegin + 1 == currEnd;
    final boolean adjacent = currBegin == prev.endPosition();
    final boolean prevHasExtent = prev.beginPosition() != prev.endPosition();
    if (!(singleChar && adjacent && prevHasExtent)) {
      return false;
    }

    final String currText = curr.getString(CoreAnnotations.OriginalTextAnnotation.class);
    if (!".".equals(currText)) {
      return false;
    }

    final String prevText = prev.getString(CoreAnnotations.OriginalTextAnnotation.class);
    if (!consistsOfDigits(prevText)) {
      return false;
    }

    // Only the original text and the end offset are extended. TextAnnotation is
    // deliberately left unmodified so that a token such as "1." still gets
    // labeled CARDINAL (per the original author's note).
    prev.set(CoreAnnotations.OriginalTextAnnotation.class, prevText + currText);
    prev.setEndPosition(currEnd);
    return true;
  }

  /** Returns {@code false} as soon as a non-digit character is found, {@code true} otherwise. */
  private static boolean consistsOfDigits(String text) {
    for (char c : text.toCharArray()) {
      if (!Character.isDigit(c)) {
        return false;
      }
    }
    return true;
  }

  /** Reports exactly what the wrapped tokenizer reports as satisfied. */
  @SuppressWarnings("rawtypes")
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return inner.requirementsSatisfied();
  }

  /** Reports exactly what the wrapped tokenizer requires. */
  @SuppressWarnings("rawtypes")
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return inner.requires();
  }
}