Skip to content

Commit 03eb77b

Browse files
committed
Update deberta unit tests
1 parent 95c8cc5 commit 03eb77b

File tree

1 file changed

+25
-1
lines changed

1 file changed

+25
-1
lines changed

tests/models/deberta-v2/tokenization.js

Lines changed: 25 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import { DebertaV2Tokenizer } from "../../../src/tokenizers.js";
2-
import { BASE_TEST_STRINGS } from "../test_strings.js";
2+
import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
33

44
export const TOKENIZER_CLASS = DebertaV2Tokenizer;
55
export const TEST_CONFIG = {
@@ -154,6 +154,30 @@ export const TEST_CONFIG = {
154154
ids: [1, 507, 110119, 507, 123983, 507, 127294, 25377, 507, 3, 108391, 507, 3, 507, 117868, 25377, 507, 3, 108391, 507, 117868, 507, 125199, 108391, 507, 3, 507, 3, 507, 3, 507, 3, 507, 124709, 49509, 507, 124327, 507, 125199, 507, 124709, 507, 124709, 507, 126640, 507, 126853, 507, 3, 108391, 507, 3, 507, 3, 108391, 507, 126132, 3, 507, 125199, 108391, 49509, 25377, 507, 124327, 507, 125199, 118155, 2],
155155
decoded: "[CLS] \u2728 \ud83e\udd17 \ud83d\udc41\ufe0f [UNK]\ud83c\udffb [UNK] \u2642\ufe0f [UNK]\ud83c\udffb \u2642 \ud83d\udc68\ud83c\udffb [UNK] [UNK] [UNK] [UNK] \ud83d\udc69 \u2764 \ud83d\udc8b \ud83d\udc68 \ud83d\udc69 \ud83d\udc69 \ud83d\udc67 \ud83d\udc66 [UNK]\ud83c\udffb [UNK] [UNK]\ud83c\udffb \ud83c\udff4[UNK] \ud83d\udc68\ud83c\udffb \u2764\ufe0f \ud83d\udc8b \ud83d\udc68\ud83c\udffc[SEP]",
156156
},
157+
CHINESE_LATIN_MIXED: {
158+
text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
159+
tokens: ["\u2581a", "h", "\u535a", "\u63a8", "zz"],
160+
ids: [1, 266, 1537, 122598, 111743, 23260, 2],
161+
decoded: "[CLS] ah\u535a\u63a8zz[SEP]",
162+
},
163+
SIMPLE_WITH_ACCENTS: {
164+
text: BERT_TEST_STRINGS.SIMPLE_WITH_ACCENTS,
165+
tokens: ["\u2581H\u00e9", "llo"],
166+
ids: [1, 93519, 25341, 2],
167+
decoded: "[CLS] H\u00e9llo[SEP]",
168+
},
169+
MIXED_CASE_WITHOUT_ACCENTS: {
170+
text: BERT_TEST_STRINGS.MIXED_CASE_WITHOUT_ACCENTS,
171+
tokens: ["\u2581He", "LL", "o", "!", "how", "\u2581Are", "\u2581yo", "U", "?"],
172+
ids: [1, 383, 17145, 795, 300, 5608, 1396, 14469, 2628, 302, 2],
173+
decoded: "[CLS] HeLLo!how Are yoU?[SEP]",
174+
},
175+
MIXED_CASE_WITH_ACCENTS: {
176+
text: BERT_TEST_STRINGS.MIXED_CASE_WITH_ACCENTS,
177+
tokens: ["\u2581H\u00e4", "LL", "o", "!", "how", "\u2581Are", "\u2581yo", "U", "?"],
178+
ids: [1, 62693, 17145, 795, 300, 5608, 1396, 14469, 2628, 302, 2],
179+
decoded: "[CLS] H\u00e4LLo!how Are yoU?[SEP]",
180+
},
157181
},
158182
"Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7": {
159183
SIMPLE: {

0 commit comments

Comments
 (0)