Use unique paths for writing shard info for different splits in SplitBuilder.submit_shard_based_generation

tomvdw · The TensorFlow Datasets Authors · commit 3ab829c3b5a0 · 2024-09-25T12:49:39.000-07:00
PiperOrigin-RevId: 678809954
diff --git a/tensorflow_datasets/core/naming.py b/tensorflow_datasets/core/naming.py
@@ -609,6 +609,7 @@ def sharded_filepaths(
   def filepath_prefix(
       self,
   ) -> str:
+    """Returns the data shard path minus the shard suffix and extension."""
     a_filepath = self.sharded_filepath(shard_index=0, num_shards=1)
     prefix = _replace_shard_pattern(os.fspath(a_filepath), '')
     return _remove_extension(prefix)
diff --git a/tensorflow_datasets/core/split_builder.py b/tensorflow_datasets/core/split_builder.py
@@ -201,7 +201,12 @@ def submit_shard_based_generation(
         )
         shard_lengths.append(num_examples)
     else:
-      shard_infos_path = filename_template.data_dir / 'shard_infos.json'
+      # Shard information is stored temporarily in a per-split file: the data
+      # shard path without the shard suffix (e.g., 00000-of-00042) and file
+      # extension, followed by `.shard_infos.json`.
+      shard_infos_path = epath.Path(
+          f'{filename_template.filepath_prefix()}.shard_infos.json'
+      )
       with self.maybe_beam_pipeline():
         shard_infos = []
         for shard_index, example_gen in enumerate(example_gen_per_shard):