|
43 | 43 | )
|
44 | 44 | _NUM_CORRUPT_IMAGES = 1738
|
45 | 45 | _DESCRIPTION = (
|
46 |
| - "A large set of images of cats and dogs. " |
47 |
| - "There are %d corrupted images that are dropped." % _NUM_CORRUPT_IMAGES |
| 46 | + "A large set of images of cats and dogs. " |
| 47 | + "There are %d corrupted images that are dropped." % _NUM_CORRUPT_IMAGES |
48 | 48 | )
|
49 | 49 |
|
50 | 50 | _NAME_RE = re.compile(r"^PetImages[\\/](Cat|Dog)[\\/]\d+\.jpg$")
|
51 | 51 |
|
52 | 52 |
|
class CatsVsDogs(tfds.core.GeneratorBasedBuilder):
  """Cats vs Dogs."""

  VERSION = tfds.core.Version("4.0.1")
  RELEASE_NOTES = {
      "4.0.0": "New split API (https://tensorflow.org/datasets/splits)",
      "4.0.1": (
          "Recoding images in generator to fix corrupt JPEG data warnings"
          " (https://github.com/tensorflow/datasets/issues/2188)"
      ),
  }

  def _info(self):
    """Returns the dataset metadata: features, supervised keys, homepage."""
    features = tfds.features.FeaturesDict({
        "image": tfds.features.Image(),
        "image/filename": tfds.features.Text(),  # eg 'PetImages/Dog/0.jpg'
        "label": tfds.features.ClassLabel(names=["cat", "dog"]),
    })
    homepage = "https://www.microsoft.com/en-us/download/details.aspx?id=54765"
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=features,
        supervised_keys=("image", "label"),
        homepage=homepage,
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Downloads the archive and declares the single TRAIN split."""
    archive_path = dl_manager.download(_URL)

    # There is no predefined train/val/test split for this dataset.
    train_split = tfds.core.SplitGenerator(
        name=tfds.Split.TRAIN,
        gen_kwargs={"archive": dl_manager.iter_archive(archive_path)},
    )
    return [train_split]

  def _generate_examples(self, archive):
    """Generate Cats vs Dogs images and labels given a directory path."""
    skipped_count = 0
    for member_name, member_fobj in archive:
      key = os.path.normpath(member_name)
      name_match = _NAME_RE.match(key)
      if name_match is None:
        # Non-image archive members (README file, ...) are ignored.
        continue
      label = name_match.group(1).lower()
      # Corrupt files lack the JFIF marker near the start of the stream.
      if tf.compat.as_bytes("JFIF") not in member_fobj.peek(10):
        skipped_count += 1
        continue

      # Some images caused 'Corrupt JPEG data...' messages during training or
      # any other iteration; recoding them once fixes the issue (discussion:
      # https://github.com/tensorflow/datasets/issues/2188).
      # Those messages are now displayed when generating the dataset instead.
      recoded_image = tf.io.encode_jpeg(
          tf.image.decode_image(member_fobj.read())
      )

      # Wrap the recoded bytes back into a zip container so that downstream
      # consumers still receive a file-like object.
      zip_buffer = io.BytesIO()
      with zipfile.ZipFile(zip_buffer, "w") as writer:
        writer.writestr(key, recoded_image.numpy())
      recoded_fobj = zipfile.ZipFile(zip_buffer).open(key)

      yield key, {
          "image": recoded_fobj,
          "image/filename": key,
          "label": label,
      }

    if skipped_count != _NUM_CORRUPT_IMAGES:
      raise ValueError(
          "Expected %d corrupt images, but found %d"
          % (_NUM_CORRUPT_IMAGES, skipped_count)
      )
    logging.warning("%d images were corrupted and were skipped", skipped_count)
0 commit comments