Merge pull request #253 from PolymathicAI/merge_public_expedite_internel

mikemccabe210 · web-flow · commit 762df7283436 · 2025-11-05T11:03:58.000-05:00
Merge public expedite internel
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -0,0 +1,65 @@
+name: Bug Report
+description: Create a report to help us reproduce and fix the bug
+title: "[Bug]: "
+labels: ["bug"]
+
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thank you for taking the time to file a bug report.
+        Before creating a new issue, please take a quick look at the [FAQ](https://github.com/PolymathicAI/the_well/discussions/categories/q-a?discussions_q=category%3AQ%26A+) and [existing issues](https://github.com/PolymathicAI/the_well/issues).
+  - type: textarea
+    attributes:
+      label: "Describe the issue:"
+      placeholder: |
+        << your issue description here >>
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: "Code to reproduce the issue:"
+      description: >
+        A short code example that reproduces the problem/missing feature.
+        It should be self-contained.
+      placeholder: |
+        << your code here >>
+      render: python
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: "Version"
+      description: |
+        Which version of the Well are you using?
+        You can obtain the version by running the following command:
+        ```sh
+        python -c "import the_well; print(the_well.__version__)"
+        ```
+      placeholder: |
+        << your version here >>
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: "Environment"
+      description: |
+        Which environment are you using? List the packages you have installed alongside the Well.
+        If you use pip, you can obtain the list of installed packages by running the following command:
+        ```sh
+        pip freeze
+        ```
+      placeholder: |
+        << your environment here >>
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: "Context for the issue:"
+      description: |
+        Please explain how this issue affects your intended use of the Well.
+        You can also provide additional context that you think might be relevant.
+      placeholder: |
+        << your explanation here >>
+    validations:
+      required: false
diff --git a/docs/datasets/rayleigh_benard_uniform.md b/docs/datasets/rayleigh_benard_uniform.md
@@ -0,0 +1 @@
+../../datasets/rayleigh_benard_uniform/README.md
diff --git a/tests/data/test_normalization.py b/tests/data/test_normalization.py
@@ -0,0 +1,105 @@
+import math
+
+import torch
+
+from the_well.data.normalization import RMSNormalization, ZScoreNormalization
+
+
+def test_zscore_normalization():
+    """Test the ZScoreNormalization actually provides the correct normalization.
+    We consider fields whose mean and std are given by a linear function of the field index.
+
+    """
+    n_fields = 4
+    h = 64
+    w = 64
+    t = 10
+    batch_size = 64
+    tol = 1e-2
+    std = torch.arange(n_fields) + 1.0
+    mean = torch.arange(n_fields)
+    delta_mean = torch.zeros_like(mean)
+    delta_std = math.sqrt(2) * std
+    stats = {
+        "mean": {f"field_{i}": mean[i] for i in range(n_fields)},
+        "std": {f"field_{i}": std[i] for i in range(n_fields)},
+        "mean_delta": {f"field_{i}": delta_mean[i] for i in range(n_fields)},
+        "std_delta": {f"field_{i}": delta_std[i] for i in range(n_fields)},
+    }
+    normalization = ZScoreNormalization(
+        stats=stats,
+        core_field_names=[f"field_{i}" for i in range(n_fields)],
+        core_constant_field_names=[],
+    )
+
+    input_tensor = std * torch.randn(batch_size, h, w, t, n_fields) + mean
+    delta_input_tensor = input_tensor[..., 1:, :] - input_tensor[..., :-1, :]
+    for i in range(n_fields):
+        normalized_tensor = normalization.normalize(input_tensor[..., i], f"field_{i}")
+        assert normalized_tensor.shape == (batch_size, h, w, t)
+        assert torch.allclose(
+            torch.mean(normalized_tensor), torch.tensor(0.0), atol=tol
+        )
+        assert torch.allclose(torch.std(normalized_tensor), torch.tensor(1.0), atol=tol)
+
+        normalized_delta_tensor = normalization.delta_normalize(
+            delta_input_tensor[..., i], f"field_{i}"
+        )
+        assert normalized_delta_tensor.shape == (batch_size, h, w, t - 1)
+        assert torch.allclose(
+            torch.mean(normalized_delta_tensor), torch.tensor(0.0), atol=tol
+        )
+        assert torch.allclose(
+            torch.std(normalized_delta_tensor), torch.tensor(1.0), atol=tol
+        )
+
+
+def test_rms_normalization():
+    """Test the RMSNormalization actually provides the correct normalization.
+    We consider fields whose mean and std are given by a linear function of the field index.
+    """
+    n_fields = 4
+    h = 64
+    w = 64
+    t = 10
+    batch_size = 64
+    tol = 1e-2
+    std = torch.arange(n_fields) + 1.0
+    mean = torch.arange(n_fields)
+    delta_std = math.sqrt(2) * std
+    stats = {
+        "rms": {f"field_{i}": std[i] for i in range(n_fields)},
+        "rms_delta": {f"field_{i}": delta_std[i] for i in range(n_fields)},
+    }
+    normalization = RMSNormalization(
+        stats=stats,
+        core_field_names=[f"field_{i}" for i in range(n_fields)],
+        core_constant_field_names=[],
+    )
+
+    input_tensor = std * torch.randn(batch_size, h, w, t, n_fields) + mean
+    delta_input_tensor = input_tensor[..., 1:, :] - input_tensor[..., :-1, :]
+    for i in range(n_fields):
+        normalized_tensor = normalization.normalize(input_tensor[..., i], f"field_{i}")
+        assert normalized_tensor.shape == (batch_size, h, w, t)
+        assert torch.allclose(
+            torch.mean(normalized_tensor),
+            mean[i].float() / std[i].float(),
+            atol=tol,
+        )
+        assert torch.allclose(
+            torch.std(normalized_tensor),
+            torch.tensor(1.0),
+            atol=tol,
+        )
+
+        normalized_delta_tensor = normalization.delta_normalize(
+            delta_input_tensor[..., i], f"field_{i}"
+        )
+        assert normalized_delta_tensor.shape == (batch_size, h, w, t - 1)
+        assert torch.allclose(
+            torch.mean(normalized_delta_tensor), torch.tensor(0.0), atol=tol
+        )
+        assert torch.allclose(
+            torch.std(normalized_delta_tensor), torch.tensor(1.0), atol=tol
+        )
diff --git a/the_well/benchmark/trainer/training.py b/the_well/benchmark/trainer/training.py
@@ -312,7 +312,11 @@ def validation_loop(
         loss_dict = {}
         time_logs = {}
         count = 0
-        denom = len(dataloader) if full else self.short_validation_length
+        denom = (
+            len(dataloader)
+            if full
+            else min(self.short_validation_length, len(dataloader))
+        )
         with torch.autocast(
             self.device.type, enabled=self.enable_amp, dtype=self.amp_type
         ):
@@ -398,7 +402,7 @@ def train_one_epoch(self, epoch: int, dataloader: DataLoader) -> float:
             backward_time = time.time() - batch_start - forward_time - batch_time
             total_time = time.time() - batch_start
             logger.info(
-                f"Epoch {epoch}, Batch {i+1}/{len(dataloader)}: loss {loss.item()}, total_time {total_time}, batch time {batch_time}, forward time {forward_time}, backward time {backward_time}"
+                f"Epoch {epoch}, Batch {i + 1}/{len(dataloader)}: loss {loss.item()}, total_time {total_time}, batch time {batch_time}, forward time {forward_time}, backward time {backward_time}"
             )
             batch_start = time.time()
         train_logs["time_per_train_iter"] = (time.time() - start_time) / len(dataloader)
@@ -458,6 +462,7 @@ def train(self):
                     self.save_model(
                         epoch, val_loss, os.path.join(self.checkpoint_folder, "best.pt")
                     )
+                    self.best_val_loss = val_loss
             # Check if time for expensive validation - periodic or final
             if epoch % self.rollout_val_frequency == 0 or (epoch == self.max_epoch):
                 logger.info(
diff --git a/the_well/benchmark/utils/experiment_utils.py b/the_well/benchmark/utils/experiment_utils.py
@@ -98,13 +98,17 @@ def configure_experiment(
         )
         folder_path = osp.join(experiment_folder, "extended_config.yaml")
         if osp.isfile(checkpoint_path):
-            logger.info(f"Config file exists relative to checkpoint override provided, \
-                            using config file {checkpoint_path}")
+            logger.info(
+                f"Config file exists relative to checkpoint override provided, \
+                            using config file {checkpoint_path}"
+            )
         elif osp.isfile(folder_path):
-            logger.warn(f"Config file not found in checkpoint override path. \
+            logger.warn(
+                f"Config file not found in checkpoint override path. \
                         Found in experiment folder, using config file {folder_path}. \
                         This could lead to weight compatibility issues if the checkpoints do not align with \
-                        the specified folder.")
+                        the specified folder."
+            )
         else:
             logger.warn(
                 "Checkpoint override provided, but config file not found in checkpoint override path \
diff --git a/the_well/data/datasets.py b/the_well/data/datasets.py
@@ -136,6 +136,8 @@ class WellDataset(Dataset):
             Whether to normalize data in the dataset
         normlization_type:
             What type of dataset normalization. Callable Options: ZSCORE and RMS
+        max_rollout_steps:
+            Maximum number of output steps to return in a single sample. If this exceeds the trajectory's actual length, the full trajectory is returned.
         n_steps_input:
             Number of steps to include in each sample
         n_steps_output:
@@ -583,7 +585,9 @@ def _pad_axes(
         expand_dims = expand_dims + (1,) * tensor_order
         return torch.tile(field_data, expand_dims)
 
-    def _reconstruct_fields(self, file, cache, sample_idx, time_idx, n_steps, dt):
+    def _reconstruct_fields(
+        self, file: h5.File, cache, sample_idx, time_idx, n_steps, dt
+    ):
         """Reconstruct space fields starting at index sample_idx, time_idx, with
         n_steps and dt stride."""
         variable_fields = {0: {}, 1: {}, 2: {}}
@@ -634,7 +638,9 @@ def _reconstruct_fields(self, file, cache, sample_idx, time_idx, n_steps, dt):
 
         return (variable_fields, constant_fields)
 
-    def _reconstruct_scalars(self, file, cache, sample_idx, time_idx, n_steps, dt):
+    def _reconstruct_scalars(
+        self, file: h5.File, cache, sample_idx, time_idx, n_steps, dt
+    ):
         """Reconstruct scalar values (not fields) starting at index sample_idx, time_idx, with
         n_steps and dt stride."""
         variable_scalars = {}
@@ -670,7 +676,9 @@ def _reconstruct_scalars(self, file, cache, sample_idx, time_idx, n_steps, dt):
 
         return (variable_scalars, constant_scalars)
 
-    def _reconstruct_grids(self, file, cache, sample_idx, time_idx, n_steps, dt):
+    def _reconstruct_grids(
+        self, file: h5.File, cache, sample_idx, time_idx, n_steps, dt
+    ):
         """Reconstruct grid values starting at index sample_idx, time_idx, with
         n_steps and dt stride."""
         # Time
@@ -705,7 +713,7 @@ def _reconstruct_grids(self, file, cache, sample_idx, time_idx, n_steps, dt):
                 self._check_cache(cache, "space_grid", space_grid)
         return space_grid, time_grid
 
-    def _padding_bcs(self, file, cache, sample_idx, time_idx, n_steps, dt):
+    def _padding_bcs(self, file: h5.File, cache, sample_idx, time_idx, n_steps, dt):
         """Handles BC case where BC corresponds to a specific padding type
 
         Note/TODO - currently assumes boundaries to be axis-aligned and cover the entire
@@ -753,7 +761,7 @@ def _padding_bcs(self, file, cache, sample_idx, time_idx, n_steps, dt):
             self._check_cache(cache, "boundary_output", boundary_output)
         return boundary_output
 
-    def _reconstruct_bcs(self, file, cache, sample_idx, time_idx, n_steps, dt):
+    def _reconstruct_bcs(self, file: h5.File, cache, sample_idx, time_idx, n_steps, dt):
         """Needs work to support arbitrary BCs.
 
         Currently supports finite set of boundary condition types that describe

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../datasets/rayleigh_benard_uniform/README.md`