diff --git a/bayesflow/datasets/__init__.py b/bayesflow/datasets/__init__.py
index 54713e98c..54ce6e2a4 100644
--- a/bayesflow/datasets/__init__.py
+++ b/bayesflow/datasets/__init__.py
@@ -1,4 +1,3 @@
-from .numpy_dataset import NumpyDataset
 from .offline_dataset import OfflineDataset
 from .online_dataset import OnlineDataset
 from .disk_dataset import DiskDataset
diff --git a/bayesflow/datasets/numpy_dataset.py b/bayesflow/datasets/numpy_dataset.py
deleted file mode 100644
index 38bb93f81..000000000
--- a/bayesflow/datasets/numpy_dataset.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import keras
-import numpy as np
-import os
-import pathlib as pl
-
-from bayesflow.data_adapters import DataAdapter
-
-
-class NumpyDataset(keras.utils.PyDataset):
-    """
-    A dataset used to load numpy files from disk.
-    The training strategy will be offline.
-
-    By default, the expected file structure is as follows:
-    root
-    ├── parameter_name_1
-    │   ├── sample_1.npy
-    │   ├── ...
-    │   └── sample_n.npy
-    ├── parameter_name_2
-    │   ├── sample_1.npy
-    │   ├── ...
-    │   └── sample_n.npy
-    └── ...
-
-    where each numpy file contains a sample of a single parameter (i.e., a numpy array).
-
-    """
-
-    def __init__(
-        self,
-        root: os.PathLike,
-        *,
-        pattern: str = "*.npy",
-        batch_size: int,
-        data_adapter: DataAdapter | None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.batch_size = batch_size
-        self.root = pl.Path(root)
-        self.data_adapter = data_adapter
-
-        # TODO: the assumption on the file structure is a bit strong
-        # we should relax this assumption in the future or provide better customization
-        # via the pattern arguments
-        parameter_names = list(map(str, self.root.glob("*/")))
-
-        self.files = {
-            parameter_name: list(map(str, self.root.glob(f"{parameter_name}/{pattern}")))
-            for parameter_name in parameter_names
-        }
-
-        self.shuffle()
-
-    def __getitem__(self, item):
-        if not 0 <= item < self.num_batches:
-            raise IndexError(f"Index {item} is out of bounds for dataset with {self.num_batches} batches.")
-
-        batch = {}
-
-        for parameter_name, files in self.files.items():
-            files = files[item * self.batch_size : (item + 1) * self.batch_size]
-
-            samples = []
-            for file in files:
-                samples.append(np.load(file))
-
-            batch[parameter_name] = np.stack(samples)
-
-        if self.data_adapter is not None:
-            batch = self.data_adapter.configure(batch)
-
-        return batch
-
-    def on_epoch_end(self):
-        self.shuffle()
-
-    @property
-    def num_batches(self):
-        n = len(next(iter(self.files.values())))
-        return int(np.ceil(n / self.batch_size))
-
-    def shuffle(self):
-        permutation = np.random.permutation(len(next(iter(self.files.values()))))
-        self.files = {parameter_name: [files[i] for i in permutation] for parameter_name, files in self.files.items()}
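
With `NumpyDataset` removed, an equivalent offline workflow can still be assembled by loading the per-parameter `.npy` files into memory and passing the resulting dictionary to `OfflineDataset`, which remains exported from `bayesflow.datasets`. The sketch below is a minimal illustration of that idea, not part of this diff: the `load_numpy_samples` helper is hypothetical, and the commented-out `OfflineDataset` keyword arguments are assumptions that should be checked against the current constructor signature.

```python
# Minimal sketch (assumption): reproduce the removed NumpyDataset behaviour by
# eagerly stacking the per-parameter sample files found under root/<parameter_name>/.
import pathlib as pl

import numpy as np

from bayesflow.datasets import OfflineDataset


def load_numpy_samples(root, pattern="*.npy"):
    """Hypothetical helper: return {parameter_name: stacked array of all sample files}."""
    root = pl.Path(root)
    data = {}
    for parameter_dir in sorted(p for p in root.iterdir() if p.is_dir()):
        files = sorted(parameter_dir.glob(pattern))
        # Stack one sample per file along a new leading (batch) axis.
        data[parameter_dir.name] = np.stack([np.load(f) for f in files])
    return data


# Usage sketch (keyword names are assumptions, verify against OfflineDataset's signature):
# data = load_numpy_samples("path/to/root")
# dataset = OfflineDataset(data=data, batch_size=32, adapter=None)
```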