Commit bdf87d0

Author: The TensorFlow Datasets Authors (committed)

Merge pull request #11070 from lgeiger:fix-no-shuffle-beam-writer

PiperOrigin-RevId: 777974077

2 parents: 1fb2e3d + 5de1d20

4 files changed, +42 -30 lines


tensorflow_datasets/core/naming.py

Lines changed: 2 additions & 2 deletions

@@ -666,7 +666,7 @@ def sharded_filepaths_pattern(
     `/path/dataset_name-split.fileformat@num_shards` or
     `/path/dataset_name-split@num_shards.fileformat` depending on the format.
     If `num_shards` is not given, then it returns
-    `/path/dataset_name-split.fileformat*`.
+    `/path/dataset_name-split.fileformat-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]`.

     Args:
       num_shards: optional specification of the number of shards.
@@ -681,7 +681,7 @@ def sharded_filepaths_pattern(
     elif use_at_notation:
       replacement = '@*'
     else:
-      replacement = '*'
+      replacement = '-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]'
     return _replace_shard_pattern(os.fspath(a_filepath), replacement)

   def glob_pattern(self, num_shards: int | None = None) -> str:
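For orientation, a minimal sketch of what the changed default pattern yields; the template values mirror test_glob_pattern below, and the expected strings are copied from this diff rather than from run output:

    from etils import epath
    from tensorflow_datasets.core import naming

    template = naming.ShardedFileTemplate(
        dataset_name='ds',
        split='train',
        filetype_suffix='tfrecord',
        data_dir=epath.Path('/data'),
    )

    # Before this change the default pattern ended in a bare '*':
    #   /data/ds-train.tfrecord*
    # Now the shard suffix is spelled out, so only names carrying a full
    # '-NNNNN-of-NNNNN' shard suffix for this split and file format match.
    print(template.glob_pattern())
    # /data/ds-train.tfrecord-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]
    print(template.glob_pattern(num_shards=42))
    # /data/ds-train.tfrecord-*-of-00042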

tensorflow_datasets/core/naming_test.py

Lines changed: 5 additions & 2 deletions

@@ -459,7 +459,7 @@ def test_sharded_file_template_shard_index():
   )
   assert (
       os.fspath(template.sharded_filepaths_pattern())
-      == '/my/path/data/mnist-train.tfrecord*'
+      == '/my/path/data/mnist-train.tfrecord-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]'
   )
   assert (
       os.fspath(template.sharded_filepaths_pattern(num_shards=100))
@@ -474,7 +474,10 @@ def test_glob_pattern():
       filetype_suffix='tfrecord',
       data_dir=epath.Path('/data'),
   )
-  assert '/data/ds-train.tfrecord*' == template.glob_pattern()
+  assert (
+      '/data/ds-train.tfrecord-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]'
+      == template.glob_pattern()
+  )
   assert '/data/ds-train.tfrecord-*-of-00042' == template.glob_pattern(
       num_shards=42
   )

tensorflow_datasets/core/writer.py

Lines changed: 3 additions & 2 deletions

@@ -816,8 +816,9 @@ def finalize(self) -> tuple[list[int], int]:
     logging.info("Finalizing writer for %s", self._filename_template.split)
     # We don't know the number of shards, the length of each shard, nor the
     # total size, so we compute them here.
-    prefix = epath.Path(self._filename_template.filepath_prefix())
-    shards = self._filename_template.data_dir.glob(f"{prefix.name}*")
+    shards = self._filename_template.data_dir.glob(
+        self._filename_template.glob_pattern()
+    )

     def _get_length_and_size(shard: epath.Path) -> tuple[epath.Path, int, int]:
       length = self._file_adapter.num_examples(shard)
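Why this matters for NoShuffleBeamWriter.finalize: the writer_test.py diff below adds a second split, 'train-b', that shares the same data_dir. The sketch here only illustrates glob semantics; the actual value of filepath_prefix() is not shown in this diff, so the "old" pattern is an assumption that the prefix stops at the split name:

    import fnmatch

    shard_files = [
        'foo-train.tfrecord-00000-of-00001',    # written by the 'train' writer
        'foo-train-b.tfrecord-00000-of-00001',  # written by the 'train-b' writer
    ]

    # Hypothetical old behaviour: a bare prefix plus '*' can also pick up
    # shards of another split whose name merely starts with 'train'.
    print(fnmatch.filter(shard_files, 'foo-train*'))
    # ['foo-train.tfrecord-00000-of-00001', 'foo-train-b.tfrecord-00000-of-00001']

    # New behaviour: the full glob pattern pins the file suffix and the
    # '-NNNNN-of-NNNNN' shard suffix, so only the 'train' shards are counted.
    new_pattern = (
        'foo-train.tfrecord-[0-9][0-9][0-9][0-9][0-9]'
        '-of-[0-9][0-9][0-9][0-9][0-9]'
    )
    print(fnmatch.filter(shard_files, new_pattern))
    # ['foo-train.tfrecord-00000-of-00001']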

tensorflow_datasets/core/writer_test.py

Lines changed: 32 additions & 24 deletions

@@ -592,39 +592,47 @@ def test_write_beam(self, file_format: file_adapters.FileFormat):

     with tempfile.TemporaryDirectory() as tmp_dir:
       tmp_dir = epath.Path(tmp_dir)
-      filename_template = naming.ShardedFileTemplate(
-          dataset_name='foo',
-          split='train',
-          filetype_suffix=file_format.file_suffix,
-          data_dir=tmp_dir,
-      )
-      writer = writer_lib.NoShuffleBeamWriter(
-          serializer=testing.DummySerializer('dummy specs'),
-          filename_template=filename_template,
-          file_format=file_format,
-      )
+
+      def get_writer(split):
+        filename_template = naming.ShardedFileTemplate(
+            dataset_name='foo',
+            split=split,
+            filetype_suffix=file_format.file_suffix,
+            data_dir=tmp_dir,
+        )
+        return writer_lib.NoShuffleBeamWriter(
+            serializer=testing.DummySerializer('dummy specs'),
+            filename_template=filename_template,
+            file_format=file_format,
+        )
+
       to_write = [(i, str(i).encode('utf-8')) for i in range(10)]
       # Here we need to disable type check as `beam.Create` is not capable of
       # inferring the type of the PCollection elements.
       options = beam.options.pipeline_options.PipelineOptions(
           pipeline_type_check=False
       )
-      with beam.Pipeline(options=options, runner=_get_runner()) as pipeline:
-
-        @beam.ptransform_fn
-        def _build_pcollection(pipeline):
-          pcollection = pipeline | 'Start' >> beam.Create(to_write)
-          return writer.write_from_pcollection(pcollection)
-
-        _ = pipeline | 'test' >> _build_pcollection()  # pylint: disable=no-value-for-parameter
-        shard_lengths, total_size = writer.finalize()
-        self.assertNotEmpty(shard_lengths)
-        self.assertEqual(sum(shard_lengths), 10)
-        self.assertGreater(total_size, 10)
+      writers = [get_writer(split) for split in ('train-b', 'train')]
+
+      for writer in writers:
+        with beam.Pipeline(options=options, runner=_get_runner()) as pipeline:
+
+          @beam.ptransform_fn
+          def _build_pcollection(pipeline, writer):
+            pcollection = pipeline | 'Start' >> beam.Create(to_write)
+            return writer.write_from_pcollection(pcollection)
+
+          _ = pipeline | 'test' >> _build_pcollection(writer)
+
       files = list(tmp_dir.iterdir())
-      self.assertGreaterEqual(len(files), 1)
+      self.assertGreaterEqual(len(files), 2)
       for f in files:
         self.assertIn(file_format.file_suffix, f.name)
+      for writer in writers:
+        shard_lengths, total_size = writer.finalize()
+        self.assertNotEmpty(shard_lengths)
+        self.assertEqual(sum(shard_lengths), 10)
+        self.assertGreater(total_size, 10)


 class CustomExampleWriter(writer_lib.ExampleWriter):
