pytorch · manuelcandales · Nov 15, 2025 · Nov 12, 2025 · Nov 14, 2025
@@ -39,10 +39,6 @@ DEFINE_string(
     audio_path,
     "",
     "Path to input audio file. Accepts .wav or raw float .bin.");
-DEFINE_string(
-    model_name,
-    "base",
-    "Whisper model name (base, small, medium, large, large-v2, large-v3, large-v3-turbo).");
 DEFINE_double(
     temperature,
     0.0,
@@ -114,21 +110,10 @@ int main(int argc, char** argv) {
   config.max_new_tokens = FLAGS_max_new_tokens;
   config.temperature = static_cast<float>(FLAGS_temperature);
 
-  // Set decoder_start_token_id based on model version
-  if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
-      FLAGS_model_name == "large-v3-turbo") {
-    config.decoder_start_token_id = 50258;
-    ET_LOG(
-        Info,
-        "Using decoder_start_token_id=50258 for model: %s",
-        FLAGS_model_name.c_str());
-  } else {
-    config.decoder_start_token_id = 50257;
-    ET_LOG(
-        Info,
-        "Using decoder_start_token_id=50257 for model: %s",
-        FLAGS_model_name.c_str());
-  }
+  // All Whisper models from HuggingFace now use the v3 tokenizer format,
+  // where token 50257 = <|endoftext|> and token 50258 = <|startoftranscript|>.
+  config.decoder_start_token_id = 50258;
+  ET_LOG(Info, "Using decoder_start_token_id=50258");
 
   auto result =
       runner.transcribe(features, config, [&](const std::string& piece) {