Commit 58bd436

added more guidance and deleted redundant check

1 parent: f720085

1 file changed: +9 −13 lines

torchtitan/components/tokenizer.py

Lines changed: 9 additions & 13 deletions
@@ -84,13 +84,15 @@ def _load_tokenizer_from_path(self, tokenizer_path: str) -> Tokenizer:
         if not os.path.exists(tokenizer_path):
             if "assets/tokenizer" in tokenizer_path:
                 raise FileNotFoundError(
-                    "Detected deprecated ./assets/tokenizer path. Remove --model.tokenizer_path "
-                    "and download to --model.hf_assets_path using ./scripts/download_hf_assets.py"
+                    "Detected ./assets/tokenizer path which was deprecated in https://github.com/pytorch/torchtitan/pull/1540. \n"
+                    "Remove --model.tokenizer_path and download to --model.hf_assets_path using ./scripts/download_hf_assets.py\n"
+                    "See example: https://github.com/pytorch/torchtitan/tree/main/torchtitan/models/deepseek_v3#download-tokenizer"
                 )
             else:
                 raise FileNotFoundError(
                     f"Tokenizer path '{tokenizer_path}' does not exist"
                 )
+
         # Define paths for different tokenizer file types
         tokenizer_json_path = os.path.join(tokenizer_path, "tokenizer.json")
         vocab_txt_path = os.path.join(tokenizer_path, "vocab.txt")
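For reference, the post-commit behavior of this first hunk in isolation. A minimal standalone sketch, assuming a hypothetical check_tokenizer_path helper extracted purely for illustration (in torchtitan the logic lives inside the tokenizer class's _load_tokenizer_from_path method):

import os

def check_tokenizer_path(tokenizer_path: str) -> None:
    # Post-commit check: a missing path always raises, with extra
    # migration guidance when the deprecated assets/tokenizer layout
    # appears in the configured path.
    if not os.path.exists(tokenizer_path):
        if "assets/tokenizer" in tokenizer_path:
            raise FileNotFoundError(
                "Detected ./assets/tokenizer path which was deprecated in "
                "https://github.com/pytorch/torchtitan/pull/1540. \n"
                "Remove --model.tokenizer_path and download to "
                "--model.hf_assets_path using ./scripts/download_hf_assets.py\n"
                "See example: https://github.com/pytorch/torchtitan/tree/main/"
                "torchtitan/models/deepseek_v3#download-tokenizer"
            )
        raise FileNotFoundError(f"Tokenizer path '{tokenizer_path}' does not exist")

try:
    # Assumes ./assets/tokenizer does not exist on disk (the deprecated default).
    check_tokenizer_path("./assets/tokenizer")
except FileNotFoundError as e:
    print(e)  # prints the new guidance message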
@@ -165,17 +167,11 @@ def _load_tokenizer_from_path(self, tokenizer_path: str) -> Tokenizer:
                 for f in os.listdir(tokenizer_path)
                 if os.path.isfile(os.path.join(tokenizer_path, f))
             ]
-            if "assets/tokenizer" in tokenizer_path:
-                raise FileNotFoundError(
-                    "Detected deprecated ./assets/tokenizer path. Remove --model.tokenizer_path "
-                    "and download to --model.hf_assets_path using ./scripts/download_hf_assets.py"
-                )
-            else:
-                raise FileNotFoundError(
-                    f"No supported tokenizer files found in '{tokenizer_path}'. "
-                    f"Available files: {available_files}. "
-                    "Looking for: tokenizer.json, vocab.txt+merges.txt, or vocab.json+merges.txt"
-                )
+            raise FileNotFoundError(
+                f"No supported tokenizer files found in '{tokenizer_path}'. "
+                f"Available files: {available_files}. "
+                "Looking for: tokenizer.json, vocab.txt+merges.txt, or vocab.json+merges.txt"
+            )

     def _get_token_from_config(self, config: dict[str, Any], key: str) -> Optional[str]:
         """
