Support inferring Unigram tokenizer type

xenova · xenova · commit 32d8df40c184 · 2024-10-04T08:56:46.000Z
diff --git a/src/tokenizers.js b/src/tokenizers.js
@@ -357,14 +357,21 @@ export class TokenizerModel extends Callable {
             case 'Unigram':
                 // @ts-ignore
                 return new Unigram(config, ...args);
-
             case 'BPE':
                 return new BPE(config);
 
             default:
+                // Some tokenizers, such as the one for google-t5/t5-small, do not have a `type` field.
+                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
                 if (config.vocab) {
-                    // @ts-ignore
-                    return new LegacyTokenizerModel(config, ...args);
+                    if (Array.isArray(config.vocab)) {
+                        // config.vocab is of type `[string, number][]`
+                        // @ts-ignore
+                        return new Unigram(config, ...args);
+                    } else {
+                        // @ts-ignore
+                        return new LegacyTokenizerModel(config, ...args);
+                    }
                 }
                 throw new Error(`Unknown TokenizerModel type: ${config.type}`);
         }
diff --git a/tests/models/t5/tokenization.js b/tests/models/t5/tokenization.js
@@ -237,4 +237,13 @@ export const TEST_CONFIG = {
       decoded: "Hey </s>. how are you</s>",
     },
   },
+  "google-t5/t5-small": {
+    // Test that tokenizer type can be inferred (`type: "Unigram"` is missing)
+    SIMPLE: {
+      text: BASE_TEST_STRINGS.SIMPLE,
+      tokens: ["\u2581How", "\u2581are", "\u2581you", "\u2581doing", "?"],
+      ids: [571, 33, 25, 692, 58, 1],
+      decoded: "How are you doing?</s>",
+    },
+  }
 };