Skip to content

Commit 32d8df4

Browse files
committed
Support inferring unigram tokenizer type
1 parent c61a76b commit 32d8df4

File tree

2 files changed

+19
-3
lines changed

2 files changed

+19
-3
lines changed

src/tokenizers.js

+10-3
Original file line numberDiff line numberDiff line change
@@ -357,14 +357,21 @@ export class TokenizerModel extends Callable {
357357
case 'Unigram':
358358
// @ts-ignore
359359
return new Unigram(config, ...args);
360-
361360
case 'BPE':
362361
return new BPE(config);
363362

364363
default:
364+
// Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
365+
// In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
365366
if (config.vocab) {
366-
// @ts-ignore
367-
return new LegacyTokenizerModel(config, ...args);
367+
if (Array.isArray(config.vocab)) {
368+
// config.vocab is of type `[string, number][]`
369+
// @ts-ignore
370+
return new Unigram(config, ...args);
371+
} else {
372+
// @ts-ignore
373+
return new LegacyTokenizerModel(config, ...args);
374+
}
368375
}
369376
throw new Error(`Unknown TokenizerModel type: ${config.type}`);
370377
}

tests/models/t5/tokenization.js

+9
Original file line numberDiff line numberDiff line change
@@ -237,4 +237,13 @@ export const TEST_CONFIG = {
237237
decoded: "Hey </s>. how are you</s>",
238238
},
239239
},
240+
"google-t5/t5-small": {
241+
// Test that tokenizer type can be inferred (`type: "Unigram"` is missing)
242+
SIMPLE: {
243+
text: BASE_TEST_STRINGS.SIMPLE,
244+
tokens: ["\u2581How", "\u2581are", "\u2581you", "\u2581doing", "?"],
245+
ids: [571, 33, 25, 692, 58, 1],
246+
decoded: "How are you doing?</s>",
247+
},
248+
}
240249
};

0 commit comments

Comments
 (0)