
Commit ee2d35f

raise exception when datasets would like to cache pipeline result

1 parent 6133f89

File tree

1 file changed (+5, -0 lines)

src/pytorch_ie/pipeline.py

@@ -11,6 +11,7 @@
 from torch import Tensor
 from torch.utils.data import DataLoader
 
+from datasets import is_caching_enabled
 from pytorch_ie.core.document import Document
 from pytorch_ie.core.model import PyTorchIEModel
 from pytorch_ie.core.taskmodule import (
@@ -390,6 +391,10 @@ def __call__(
                 batched=True,
                 **dataset_map_params,
             )
+            # For now, we do not allow caching of pipeline results since fingerprinting may be incorrect
+            # TODO: elaborate why it may be incorrect
+            if is_caching_enabled() and documents._fingerprint == processed_documents._fingerprint:
+                raise Exception("Caching is not allowed for pipeline calls")
         else:
             processed_documents = self._process_documents(
                 documents=documents,
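
The new guard fires only when datasets caching is globally enabled and the mapped result carries the same fingerprint as its input. The sketch below is illustrative and not part of this commit (the toy dataset and transform are made up); it shows the two mechanisms the check relies on: map() normally derives a fresh fingerprint from the input fingerprint plus a hash of the transform, and is_caching_enabled() reflects a global flag, assuming a recent version of the datasets library.

from datasets import Dataset, disable_caching, enable_caching, is_caching_enabled

# Toy dataset; under normal conditions map() produces a new fingerprint,
# so the equality the guard tests for should not hold.
ds = Dataset.from_dict({"text": ["a", "b", "c"]})
mapped = ds.map(lambda ex: {"text": ex["text"].upper()})
print(ds._fingerprint == mapped._fingerprint)  # expected: False

# The guard consults this global flag first, so disabling caching
# short-circuits the check entirely.
disable_caching()
print(is_caching_enabled())  # False
enable_caching()  # restore the default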

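A caller that runs into the new exception can sidestep it by disabling datasets caching before invoking the pipeline, since the guard checks is_caching_enabled() first. A minimal sketch under that assumption; run_pipeline_uncached and its arguments are hypothetical names, not part of pytorch-ie.

from datasets import disable_caching

def run_pipeline_uncached(pipeline, documents):
    # Hypothetical helper: with caching disabled, is_caching_enabled()
    # returns False, so the guard added in this commit cannot raise.
    disable_caching()
    return pipeline(documents)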