Skip to content

Commit 87d0bd2

Browse files
committed
A few more tokenizer test cases
1 parent 4064293 commit 87d0bd2

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

test/src/edu/stanford/nlp/process/PTBTokenizerTest.java

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -958,6 +958,10 @@ public void testHyphensQuoteAndBOM() {
958 958
"Let's shoot'em up",
959 959
"In Louis L'Amour's 1985 historical novel",
960 960
"Grace O'Malley's Castle.",
961+
"The Z-R relationship was Z = 408R9.20",
962+
"I use SPSS28.0 to measure Lee’s指数 as 其IC50约为4.814μmol / L",
963+
"Some people write BA2.12.1. Tests were DM899.00.",
964+
"@Insanomania They do... Their mentality doesn't :(\n",
961 965

962 966
};
963 967

@@ -982,13 +986,18 @@ public void testHyphensQuoteAndBOM() {
982 986
{ "Let", "'s", "shoot", "'em", "up" },
983 987
{ "In", "Louis", "L'Amour", "'s", "1985", "historical", "novel" },
984 988
{ "Grace", "O'Malley", "'s", "Castle", "." },
989+
{ "The", "Z-R", "relationship", "was", "Z", "=", "408R", "9.20" },
990+
{ "I", "use", "SPSS", "28.0", "to", "measure", "Lee's指数", "as", "其IC50约为", "4.814", "μmol", "/", "L" }, // could use \p{Latin} more in patterns?
991+
{ "Some", "people", "write", "BA", "2.12.1", ".","Tests", "were", "DM", "899.00", "." },
992+
{ "@Insanomania", "They", "do", "...", "Their", "mentality", "does", "n't", ":(" },
985 993

986 994
};
987 995

988 996
@Test
989 997
public void testApostrophes() {
990 998
// Note that this is running with "latex" normalization of quotes!
991-
TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.coreLabelFactory("normalizeCurrency=false,invertible,ptb3Escaping");
999+
TokenizerFactory<CoreLabel> tokFactory =
1000+
PTBTokenizer.coreLabelFactory("invertible,ptb3Escaping,normalizeCurrency=false,normalizeParentheses=false");
992 1001
runOnTwoArrays(tokFactory, apostropheInputs, apostropheGold);
993 1002
runAgainstOrig(tokFactory, apostropheInputs);
994 1003
}

0 commit comments

Comments
 (0)