Skip to content

Commit 3ab829c

Browse files
tomvdw authored and The TensorFlow Datasets Authors committed
Use unique paths for writing shard info for different splits in SplitBuilder.submit_shard_based_generation
PiperOrigin-RevId: 678809954
1 parent b927e7e commit 3ab829c

File tree

2 files changed

+7
-1
lines changed

2 files changed

+7
-1
lines changed

tensorflow_datasets/core/naming.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -609,6 +609,7 @@ def sharded_filepaths(
609609
def filepath_prefix(
610610
self,
611611
) -> str:
612+
"""Returns the path of the data shards without the shard suffix."""
612613
a_filepath = self.sharded_filepath(shard_index=0, num_shards=1)
613614
prefix = _replace_shard_pattern(os.fspath(a_filepath), '')
614615
return _remove_extension(prefix)

tensorflow_datasets/core/split_builder.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -201,7 +201,12 @@ def submit_shard_based_generation(
201201
)
202202
shard_lengths.append(num_examples)
203203
else:
204-
shard_infos_path = filename_template.data_dir / 'shard_infos.json'
204+
# To store the shard information temporarily, we use the same path as the
205+
# data shard paths, minus the shard suffix (e.g., 00000-of-00042), with
206+
# the suffix `.shard_infos.json`.
207+
shard_infos_path = epath.Path(
208+
f'{filename_template.filepath_prefix()}.shard_infos.json'
209+
)
205210
with self.maybe_beam_pipeline():
206211
shard_infos = []
207212
for shard_index, example_gen in enumerate(example_gen_per_shard):

0 commit comments

Comments (0)