Release 2.0.5 (#526)

vpratz · han-ol · daniel-habermann · web-flow · commit c52221dd4a11 · 2025-07-02T05:05:52.000-04:00
* fix trainable parameters in distributions (#520) * Improve numerical precision in MVNScore.log_prob * add log_gamma diagnostic (#522) * add log_gamma diagnostic * add missing export for log_gamma * add missing export for gamma_null_distribution, gamma_discrepancy * fix broken unit tests * rename log_gamma module to sbc * add test_log_gamma unit test * add return information to log_gamma doc string * fix typo in docstring, use fixed-length np array to collect log_gammas instead of appending to an empty list * Breaking changes: Fix bugs regarding counts in standardization layer (#525) * standardization: add test for multi-input values (failing) This test reveals two bugs in the standardization layer: - count is updated multiple times - batch_count is too small, as the sizes from reduce_axes have to be multiplied * breaking: fix bugs regarding count in standardization layer Fixes #524 This fixes the two bugs described in c4cc133: - count was accidentally updated multiple times, leading to wrong values - count was calculated wrongly, as only the batch size was used. Correct is the product of all reduce dimensions. This led to wrong standard deviations. While the batch dimension is the same for all inputs, the size of the second dimension might vary. For this reason, we need to introduce an input-specific `count` variable. This breaks serialization. * fix assert statement in test * bump version to 2.0.5, adjust deprecation warnings * rename log_gamma to calibration_log_gamma (#527) --------- Co-authored-by: han-ol <g@hans.olischlaeger.com> Co-authored-by: Daniel Habermann <133031176+daniel-habermann@users.noreply.github.com>
diff --git a/bayesflow/approximators/continuous_approximator.py b/bayesflow/approximators/continuous_approximator.py
@@ -578,9 +578,9 @@ def summarize(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
     def summaries(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
         """
         .. deprecated:: 2.0.4
-            `summaries` will be removed in version 2.0.5, it was renamed to `summarize` which should be used instead.
+            `summaries` will be removed in version 2.0.6, it was renamed to `summarize` which should be used instead.
         """
-        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.5.", FutureWarning)
+        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.6.", FutureWarning)
         return self.summarize(data=data, **kwargs)
 
     def log_prob(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
diff --git a/bayesflow/approximators/model_comparison_approximator.py b/bayesflow/approximators/model_comparison_approximator.py
@@ -442,9 +442,9 @@ def summarize(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
     def summaries(self, data: Mapping[str, np.ndarray], **kwargs) -> np.ndarray:
         """
         .. deprecated:: 2.0.4
-            `summaries` will be removed in version 2.0.5, it was renamed to `summarize` which should be used instead.
+            `summaries` will be removed in version 2.0.6, it was renamed to `summarize` which should be used instead.
         """
-        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.5.", FutureWarning)
+        warnings.warn("`summaries` was renamed to `summarize` and will be removed in version 2.0.6.", FutureWarning)
         return self.summarize(data=data, **kwargs)
 
     def _compute_logits(self, classifier_conditions: Tensor) -> Tensor:
diff --git a/bayesflow/diagnostics/metrics/__init__.py b/bayesflow/diagnostics/metrics/__init__.py
@@ -4,3 +4,4 @@
 from .expected_calibration_error import expected_calibration_error
 from .classifier_two_sample_test import classifier_two_sample_test
 from .model_misspecification import bootstrap_comparison, summary_space_comparison
+from .calibration_log_gamma import calibration_log_gamma, gamma_null_distribution, gamma_discrepancy
diff --git a/bayesflow/diagnostics/metrics/calibration_log_gamma.py b/bayesflow/diagnostics/metrics/calibration_log_gamma.py
@@ -0,0 +1,163 @@
+from collections.abc import Mapping, Sequence
+
+import numpy as np
+from scipy.stats import binom
+
+from ...utils.dict_utils import dicts_to_arrays
+
+
+def calibration_log_gamma(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
+    targets: Mapping[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
+    variable_names: Sequence[str] = None,
+    num_null_draws: int = 1000,
+    quantile: float = 0.05,
+):
+    """
+    Compute the log gamma discrepancy statistic to test posterior calibration,
+    see [1] for additional information.
+    Log gamma is log(gamma/gamma_null), where gamma_null is the chosen ``quantile``
+    (by default the 5th percentile) of the null distribution under uniformity of ranks.
+    That is, if adopting a hypothesis testing framework, then log_gamma < 0 implies
+    a rejection of the hypothesis of uniform ranks at that level (5% by default).
+    This diagnostic is typically more sensitive than the Kolmogorov-Smirnov test or
+    the chi-squared test.
+
+    [1]  Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre.
+    Kateřina Faltejsková. Andrew Gelman. Aki Vehtari.
+    "Simulation-Based Calibration Checking for Bayesian Computation:
+    The Choice of Test Quantities Shapes Sensitivity."
+    Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404
+
+    Parameters
+    ----------
+    estimates : np.ndarray of shape (num_datasets, num_draws, num_variables)
+        The random draws from the approximate posteriors, one set per data set (``num_datasets`` in total)
+    targets : np.ndarray of shape (num_datasets, num_variables)
+        The corresponding ground-truth values sampled from the prior
+    variable_keys : Sequence[str], optional (default = None)
+       Select keys from the dictionaries provided in estimates and targets.
+       By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
+    quantile : float in (0, 1), optional, default 0.05
+        The quantile from the null distribution to be used as a threshold.
+        A lower quantile increases sensitivity to deviations from uniformity.
+
+    Returns
+    -------
+    result : dict
+        Dictionary containing:
+
+        - "values" : float or np.ndarray
+            The log gamma values per variable
+        - "metric_name" : str
+            The name of the metric ("Log Gamma").
+        - "variable_names" : str
+            The (inferred) variable names.
+    """
+    samples = dicts_to_arrays(
+        estimates=estimates,
+        targets=targets,
+        variable_keys=variable_keys,
+        variable_names=variable_names,
+    )
+
+    num_ranks = samples["estimates"].shape[0]
+    num_post_draws = samples["estimates"].shape[1]
+
+    # rank statistics
+    ranks = np.sum(samples["estimates"] < samples["targets"][:, None], axis=1)
+
+    # null distribution and threshold
+    null_distribution = gamma_null_distribution(num_ranks, num_post_draws, num_null_draws)
+    null_quantile = np.quantile(null_distribution, quantile)
+
+    # compute log gamma for each parameter
+    log_gammas = np.empty(ranks.shape[-1])
+
+    for i in range(ranks.shape[-1]):
+        gamma = gamma_discrepancy(ranks[:, i], num_post_draws=num_post_draws)
+        log_gammas[i] = np.log(gamma / null_quantile)
+
+    output = {
+        "values": log_gammas,
+        "metric_name": "Log Gamma",
+        "variable_names": samples["estimates"].variable_names,
+    }
+
+    return output
+
+
+def gamma_null_distribution(num_ranks: int, num_post_draws: int = 1000, num_null_draws: int = 1000) -> np.ndarray:
+    """
+    Computes the distribution of expected gamma values under uniformity of ranks.
+
+    Parameters
+    ----------
+    num_ranks : int
+        Number of ranks to use for each gamma.
+    num_post_draws : int, optional, default 1000
+        Number of posterior draws that were used to calculate the rank distribution.
+    num_null_draws : int, optional, default 1000
+        Number of returned gamma values under uniformity of ranks.
+
+    Returns
+    -------
+    result : np.ndarray
+        Array of shape (num_null_draws,) containing gamma values under uniformity of ranks.
+    """
+    z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1)
+    gamma = np.empty(num_null_draws)
+
+    # loop non-vectorized to reduce memory footprint
+    for i in range(num_null_draws):
+        u = np.random.uniform(size=num_ranks)
+        F_z = np.mean(u[:, None] < z_i, axis=0)
+        bin_1 = binom.cdf(num_ranks * F_z, num_ranks, z_i)
+        bin_2 = 1 - binom.cdf(num_ranks * F_z - 1, num_ranks, z_i)
+
+        gamma[i] = 2 * np.min(np.minimum(bin_1, bin_2))
+
+    return gamma
+
+
+def gamma_discrepancy(ranks: np.ndarray, num_post_draws: int = 100) -> float:
+    """
+    Quantifies deviation from uniformity by the likelihood of observing the
+    most extreme point on the empirical CDF of the given rank distribution
+    according to [1] (equation 7).
+
+    [1]  Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre.
+    Kateřina Faltejsková. Andrew Gelman. Aki Vehtari.
+    "Simulation-Based Calibration Checking for Bayesian Computation:
+    The Choice of Test Quantities Shapes Sensitivity."
+    Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404
+
+    Parameters
+    ----------
+    ranks : array of shape (num_ranks,)
+        Empirical rank distribution
+    num_post_draws : int, optional, default 100
+        Number of posterior draws used to generate ranks.
+
+    Returns
+    -------
+    result : float
+        Gamma discrepancy value for the given rank distribution.
+    """
+    num_ranks = len(ranks)
+
+    # observed count of ranks smaller than i
+    R_i = np.array([sum(ranks < i) for i in range(1, num_post_draws + 2)])
+
+    # expected proportion of ranks smaller than i
+    z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1)
+
+    bin_1 = binom.cdf(R_i, num_ranks, z_i)
+    bin_2 = 1 - binom.cdf(R_i - 1, num_ranks, z_i)
+
+    # likelihood of obtaining the most extreme point on the empirical CDF
+    # if the rank distribution was indeed uniform
+    return float(2 * np.min(np.minimum(bin_1, bin_2)))
diff --git a/bayesflow/distributions/diagonal_normal.py b/bayesflow/distributions/diagonal_normal.py
@@ -58,7 +58,6 @@ def __init__(
         self.seed_generator = seed_generator or keras.random.SeedGenerator()
 
         self.dim = None
-        self.log_normalization_constant = None
         self._mean = None
         self._std = None
 
@@ -71,17 +70,18 @@ def build(self, input_shape: Shape) -> None:
         self.mean = ops.cast(ops.broadcast_to(self.mean, (self.dim,)), "float32")
         self.std = ops.cast(ops.broadcast_to(self.std, (self.dim,)), "float32")
 
-        self.log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self.std))
-
         if self.trainable_parameters:
             self._mean = self.add_weight(
                 shape=ops.shape(self.mean),
-                initializer=keras.initializers.get(self.mean),
+                initializer=keras.initializers.get(keras.ops.copy(self.mean)),
                 dtype="float32",
                 trainable=True,
             )
             self._std = self.add_weight(
-                shape=ops.shape(self.std), initializer=keras.initializers.get(self.std), dtype="float32", trainable=True
+                shape=ops.shape(self.std),
+                initializer=keras.initializers.get(keras.ops.copy(self.std)),
+                dtype="float32",
+                trainable=True,
             )
         else:
             self._mean = self.mean
@@ -91,7 +91,8 @@ def log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor:
         result = -0.5 * ops.sum((samples - self._mean) ** 2 / self._std**2, axis=-1)
 
         if normalize:
-            result += self.log_normalization_constant
+            log_normalization_constant = -0.5 * self.dim * math.log(2.0 * math.pi) - ops.sum(ops.log(self._std))
+            result += log_normalization_constant
 
         return result
 
diff --git a/bayesflow/distributions/diagonal_student_t.py b/bayesflow/distributions/diagonal_student_t.py
@@ -63,7 +63,6 @@ def __init__(
 
         self.seed_generator = seed_generator or keras.random.SeedGenerator()
 
-        self.log_normalization_constant = None
         self.dim = None
         self._loc = None
         self._scale = None
@@ -78,21 +77,16 @@ def build(self, input_shape: Shape) -> None:
         self.loc = ops.cast(ops.broadcast_to(self.loc, (self.dim,)), "float32")
         self.scale = ops.cast(ops.broadcast_to(self.scale, (self.dim,)), "float32")
 
-        self.log_normalization_constant = (
-            -0.5 * self.dim * math.log(self.df)
-            - 0.5 * self.dim * math.log(math.pi)
-            - math.lgamma(0.5 * self.df)
-            + math.lgamma(0.5 * (self.df + self.dim))
-            - ops.sum(keras.ops.log(self.scale))
-        )
-
         if self.trainable_parameters:
             self._loc = self.add_weight(
-                shape=ops.shape(self.loc), initializer=keras.initializers.get(self.loc), dtype="float32", trainable=True
+                shape=ops.shape(self.loc),
+                initializer=keras.initializers.get(keras.ops.copy(self.loc)),
+                dtype="float32",
+                trainable=True,
             )
             self._scale = self.add_weight(
                 shape=ops.shape(self.scale),
-                initializer=keras.initializers.get(self.scale),
+                initializer=keras.initializers.get(keras.ops.copy(self.scale)),
                 dtype="float32",
                 trainable=True,
             )
@@ -105,7 +99,14 @@ def log_prob(self, samples: Tensor, *, normalize: bool = True) -> Tensor:
         result = -0.5 * (self.df + self.dim) * ops.log1p(mahalanobis_term / self.df)
 
         if normalize:
-            result += self.log_normalization_constant
+            log_normalization_constant = (
+                -0.5 * self.dim * math.log(self.df)
+                - 0.5 * self.dim * math.log(math.pi)
+                - math.lgamma(0.5 * self.df)
+                + math.lgamma(0.5 * (self.df + self.dim))
+                - ops.sum(keras.ops.log(self._scale))
+            )
+            result += log_normalization_constant
 
         return result
 
diff --git a/bayesflow/distributions/mixture.py b/bayesflow/distributions/mixture.py
@@ -144,7 +144,7 @@ def build(self, input_shape: Shape) -> None:
 
         self._mixture_logits = self.add_weight(
             shape=(len(self.distributions),),
-            initializer=keras.initializers.get(self.mixture_logits),
+            initializer=keras.initializers.get(keras.ops.copy(self.mixture_logits)),
             dtype="float32",
             trainable=self.trainable_mixture,
         )
diff --git a/bayesflow/networks/standardization/standardization.py b/bayesflow/networks/standardization/standardization.py
@@ -40,7 +40,7 @@ def moving_std(self, index: int) -> Tensor:
         """
         return keras.ops.where(
             self.moving_m2[index] > 0,
-            keras.ops.sqrt(self.moving_m2[index] / self.count),
+            keras.ops.sqrt(self.moving_m2[index] / self.count[index]),
             1.0,
         )
 
@@ -53,7 +53,7 @@ def build(self, input_shape: Shape):
         self.moving_m2 = [
             self.add_weight(shape=(shape[-1],), initializer="zeros", trainable=False) for shape in flattened_shapes
         ]
-        self.count = self.add_weight(shape=(), initializer="zeros", trainable=False)
+        self.count = [self.add_weight(shape=(), initializer="zeros", trainable=False) for _ in flattened_shapes]
 
     def call(
         self,
@@ -150,7 +150,7 @@ def _update_moments(self, x: Tensor, index: int):
         """
 
         reduce_axes = tuple(range(x.ndim - 1))
-        batch_count = keras.ops.cast(keras.ops.shape(x)[0], self.count.dtype)
+        batch_count = keras.ops.cast(keras.ops.prod(keras.ops.shape(x)[:-1]), self.count[index].dtype)
 
         # Compute batch mean and M2 per feature
         batch_mean = keras.ops.mean(x, axis=reduce_axes)
@@ -159,7 +159,7 @@ def _update_moments(self, x: Tensor, index: int):
         # Read current totals
         mean = self.moving_mean[index]
         m2 = self.moving_m2[index]
-        count = self.count
+        count = self.count[index]
 
         total_count = count + batch_count
         delta = batch_mean - mean
@@ -169,4 +169,4 @@ def _update_moments(self, x: Tensor, index: int):
 
         self.moving_mean[index].assign(new_mean)
         self.moving_m2[index].assign(new_m2)
-        self.count.assign(total_count)
+        self.count[index].assign(total_count)
diff --git a/bayesflow/scores/multivariate_normal_score.py b/bayesflow/scores/multivariate_normal_score.py
@@ -82,13 +82,15 @@ def log_prob(self, x: Tensor, mean: Tensor, cov_chol: Tensor) -> Tensor:
         """
         diff = x - mean
 
-        # Calculate covariance from Cholesky factors
-        covariance = keras.ops.matmul(
-            cov_chol,
-            keras.ops.swapaxes(cov_chol, -2, -1),
+        # Calculate precision from Cholesky factors of covariance matrix
+        cov_chol_inv = keras.ops.inv(cov_chol)
+        precision = keras.ops.matmul(
+            keras.ops.swapaxes(cov_chol_inv, -2, -1),
+            cov_chol_inv,
         )
-        precision = keras.ops.inv(covariance)
-        log_det_covariance = keras.ops.slogdet(covariance)[1]  # Only take the log of the determinant part
+
+        # Compute log determinant, exploiting Cholesky factors
+        log_det_covariance = keras.ops.log(keras.ops.prod(keras.ops.diagonal(cov_chol, axis1=1, axis2=2), axis=1)) * 2
 
         # Compute the quadratic term in the exponential of the multivariate Gaussian
         quadratic_term = keras.ops.einsum("...i,...ij,...j->...", diff, precision, diff)
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "bayesflow"
-version = "2.0.4"
+version = "2.0.5"
 authors = [{ name = "The BayesFlow Team" }]
 classifiers = [
     "Development Status :: 5 - Production/Stable",
diff --git a/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py b/tests/test_approximators/test_approximator_standardization/test_approximator_standardization.py
@@ -8,7 +8,8 @@ def test_save_and_load(tmp_path, approximator, train_dataset, validation_dataset
     approximator.build(data_shapes)
     for layer in approximator.standardize_layers.values():
         assert layer.built
-        assert layer.count == 0
+        for count in layer.count:
+            assert count == 0.0
     approximator.compute_metrics(**train_dataset[0])
 
     keras.saving.save_model(approximator, tmp_path / "model.keras")
diff --git a/tests/test_approximators/test_build.py b/tests/test_approximators/test_build.py
@@ -14,4 +14,5 @@ def test_build(approximator, simulator, batch_size, adapter):
     approximator.build(batch_shapes)
     for layer in approximator.standardize_layers.values():
         assert layer.built
-        assert layer.count == 0
+        for count in layer.count:
+            assert count == 0.0
diff --git a/tests/test_diagnostics/test_diagnostics_metrics.py b/tests/test_diagnostics/test_diagnostics_metrics.py
diff --git a/tests/test_networks/test_standardization.py b/tests/test_networks/test_standardization.py

Original file line number	Diff line number	Diff line change
`@@ -144,7 +144,7 @@ def build(self, input_shape: Shape) -> None:`
`144`	`144`
`145`	`145`	`self._mixture_logits = self.add_weight(`
`146`	`146`	`shape=(len(self.distributions),),`
`147`		`- initializer=keras.initializers.get(self.mixture_logits),`
	`147`	`+ initializer=keras.initializers.get(keras.ops.copy(self.mixture_logits)),`
`148`	`148`	`dtype="float32",`
`149`	`149`	`trainable=self.trainable_mixture,`
`150`	`150`	`)`