From 2eaec4f68290f50c5e5bbe82f6298486b5504407 Mon Sep 17 00:00:00 2001
From: Suhas Kotha
Date: Fri, 14 Feb 2025 16:27:54 -0800
Subject: [PATCH] fixing datasets without max sequences

---
NOTE(review): line breaks were reconstructed from a whitespace-collapsed copy
of this patch. The base indentation of the hunk below is assumed (8 spaces);
confirm against src/levanter/data/text.py before applying, or apply with
`git apply --ignore-whitespace`.

 src/levanter/data/text.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/levanter/data/text.py b/src/levanter/data/text.py
index 94a482089..2909bf4a2 100644
--- a/src/levanter/data/text.py
+++ b/src/levanter/data/text.py
@@ -1284,6 +1284,8 @@ def shuffle_ds(ds, key):
         for name, ds in token_datasets.items():
             if self.max_sequences_dict is not None and name in self.max_sequences_dict:
                 train_token_datasets[name] = ds.slice_dataset(end_index=self.max_sequences_dict[name])
+            else:
+                train_token_datasets[name] = ds
 
         self.validation_token_datasets = {}
         for name, ds in token_datasets.items():