|
1 | 1 | import { DebertaV2Tokenizer } from "../../../src/tokenizers.js";
|
2 |
| -import { BASE_TEST_STRINGS } from "../test_strings.js"; |
| 2 | +import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js"; |
3 | 3 |
|
4 | 4 | export const TOKENIZER_CLASS = DebertaV2Tokenizer;
|
5 | 5 | export const TEST_CONFIG = {
|
@@ -154,6 +154,30 @@ export const TEST_CONFIG = {
|
154 | 154 | ids: [1, 507, 110119, 507, 123983, 507, 127294, 25377, 507, 3, 108391, 507, 3, 507, 117868, 25377, 507, 3, 108391, 507, 117868, 507, 125199, 108391, 507, 3, 507, 3, 507, 3, 507, 3, 507, 124709, 49509, 507, 124327, 507, 125199, 507, 124709, 507, 124709, 507, 126640, 507, 126853, 507, 3, 108391, 507, 3, 507, 3, 108391, 507, 126132, 3, 507, 125199, 108391, 49509, 25377, 507, 124327, 507, 125199, 118155, 2],
|
155 | 155 | decoded: "[CLS] \u2728 \ud83e\udd17 \ud83d\udc41\ufe0f [UNK]\ud83c\udffb [UNK] \u2642\ufe0f [UNK]\ud83c\udffb \u2642 \ud83d\udc68\ud83c\udffb [UNK] [UNK] [UNK] [UNK] \ud83d\udc69 \u2764 \ud83d\udc8b \ud83d\udc68 \ud83d\udc69 \ud83d\udc69 \ud83d\udc67 \ud83d\udc66 [UNK]\ud83c\udffb [UNK] [UNK]\ud83c\udffb \ud83c\udff4[UNK] \ud83d\udc68\ud83c\udffb \u2764\ufe0f \ud83d\udc8b \ud83d\udc68\ud83c\udffc[SEP]",
|
156 | 156 | },
|
| 157 | + CHINESE_LATIN_MIXED: { |
| 158 | + text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED, |
| 159 | + tokens: ["\u2581a", "h", "\u535a", "\u63a8", "zz"], |
| 160 | + ids: [1, 266, 1537, 122598, 111743, 23260, 2], |
| 161 | + decoded: "[CLS] ah\u535a\u63a8zz[SEP]", |
| 162 | + }, |
| 163 | + SIMPLE_WITH_ACCENTS: { |
| 164 | + text: BERT_TEST_STRINGS.SIMPLE_WITH_ACCENTS, |
| 165 | + tokens: ["\u2581H\u00e9", "llo"], |
| 166 | + ids: [1, 93519, 25341, 2], |
| 167 | + decoded: "[CLS] H\u00e9llo[SEP]", |
| 168 | + }, |
| 169 | + MIXED_CASE_WITHOUT_ACCENTS: { |
| 170 | + text: BERT_TEST_STRINGS.MIXED_CASE_WITHOUT_ACCENTS, |
| 171 | + tokens: ["\u2581He", "LL", "o", "!", "how", "\u2581Are", "\u2581yo", "U", "?"], |
| 172 | + ids: [1, 383, 17145, 795, 300, 5608, 1396, 14469, 2628, 302, 2], |
| 173 | + decoded: "[CLS] HeLLo!how Are yoU?[SEP]", |
| 174 | + }, |
| 175 | + MIXED_CASE_WITH_ACCENTS: { |
| 176 | + text: BERT_TEST_STRINGS.MIXED_CASE_WITH_ACCENTS, |
| 177 | + tokens: ["\u2581H\u00e4", "LL", "o", "!", "how", "\u2581Are", "\u2581yo", "U", "?"], |
| 178 | + ids: [1, 62693, 17145, 795, 300, 5608, 1396, 14469, 2628, 302, 2], |
| 179 | + decoded: "[CLS] H\u00e4LLo!how Are yoU?[SEP]", |
| 180 | + }, |
157 | 181 | },
|
158 | 182 | "Xenova/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7": {
|
159 | 183 | SIMPLE: {
|
|
0 commit comments