bayesflow-org · stefanradev93 · Feb 14, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025
diff --git a/README.md b/README.md
@@ -93,12 +93,13 @@ conda env create --file environment.yaml --name bayesflow
 
 Check out some of our walk-through notebooks below. We are actively working on porting all notebooks to the new interface so more will be available soon!
 
-1. [Two moons starter toy example](examples/TwoMoons_StarterNotebook.ipynb)
-2. [Linear regression](examples/Linear_Regression.ipynb)
-3. [Bayesian experimental design](examples/Bayesian_Experimental_Design.ipynb)
-4. [SIR model with custom summary network](examples/SIR_PosteriorEstimation.ipynb)
+1. [Linear regression starter example](examples/Linear_Regression_Starter.ipynb)
+2. [Two moons starter example](examples/Two_Moons_Starter.ipynb)
+3. [SIR model with custom summary network](examples/SIR_Posterior_Estimation.ipynb)
+4. [SBML model using an external simulator](examples/SBML_Posterior_Estimation.ipynb)
 5. [Hyperparameter optimization](examples/Hyperparameter_Optimization.ipynb)
-6. Coming soon...
+6. [Bayesian experimental design](examples/Bayesian_Experimental_Design.ipynb)
+7. More coming soon...
 
 ## Documentation \& Help
 

diff --git a/bayesflow/diagnostics/metrics/calibration_error.py b/bayesflow/diagnostics/metrics/calibration_error.py
@@ -6,25 +6,31 @@
 
 
 def calibration_error(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
-    references: Mapping[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
+    variable_names: Sequence[str] = None,
     resolution: int = 20,
     aggregation: Callable = np.median,
     min_quantile: float = 0.005,
     max_quantile: float = 0.995,
-    variable_names: Sequence[str] = None,
 ) -> Mapping[str, Any]:
     """Computes an aggregate score for the marginal calibration error over an ensemble of approximate
     posteriors. The calibration error is given as the aggregate (e.g., median) of the absolute deviation
-    between an alpha-CI and the relative number of inliers from ``prior_samples`` over multiple alphas in
+    between an alpha-CI and the relative number of inliers from ``estimates`` over multiple alphas in
     (0, 1).
 
     Parameters
     ----------
-    targets  : np.ndarray of shape (num_datasets, num_draws, num_variables)
+    estimates  : np.ndarray of shape (num_datasets, num_draws, num_variables)
         The random draws from the approximate posteriors over ``num_datasets``
-    references : np.ndarray of shape (num_datasets, num_variables)
+    targets : np.ndarray of shape (num_datasets, num_variables)
         The corresponding ground-truth values sampled from the prior
+    variable_keys : Sequence[str], optional (default = None)
+       Select keys from the dictionaries provided in estimates and targets.
+       By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
     resolution    : int, optional, default: 20
         The number of credibility intervals (CIs) to consider
     aggregation   : callable or None, optional, default: np.median
@@ -34,8 +40,6 @@ def calibration_error(
         The minimum posterior quantile to consider.
     max_quantile  : float in (0, 1), optional, default: 0.995
         The maximum posterior quantile to consider.
-    variable_names : Sequence[str], optional (default = None)
-        Optional variable names to select from the available variables.
 
     Returns
     -------
@@ -49,7 +53,12 @@ def calibration_error(
             The (inferred) variable names.
     """
 
-    samples = dicts_to_arrays(targets=targets, references=references, variable_names=variable_names)
+    samples = dicts_to_arrays(
+        estimates=estimates,
+        targets=targets,
+        variable_keys=variable_keys,
+        variable_names=variable_names,
+    )
 
     # Define alpha values and the corresponding quantile bounds
     alphas = np.linspace(start=min_quantile, stop=max_quantile, num=resolution)
@@ -58,14 +67,14 @@ def calibration_error(
     uppers = 1 - lowers
 
     # Compute quantiles for each alpha, for each dataset and parameter
-    quantiles = np.quantile(samples["targets"], [lowers, uppers], axis=1)
+    quantiles = np.quantile(samples["estimates"], [lowers, uppers], axis=1)
 
     # Shape: (2, resolution, num_datasets, num_params)
     lower_bounds, upper_bounds = quantiles[0], quantiles[1]
 
     # Compute masks for inliers
-    lower_mask = lower_bounds <= samples["references"][None, ...]
-    upper_mask = upper_bounds >= samples["references"][None, ...]
+    lower_mask = lower_bounds <= samples["targets"][None, ...]
+    upper_mask = upper_bounds >= samples["targets"][None, ...]
 
     # Logical AND to identify inliers for each alpha
     inlier_id = np.logical_and(lower_mask, upper_mask)

diff --git a/bayesflow/diagnostics/metrics/posterior_contraction.py b/bayesflow/diagnostics/metrics/posterior_contraction.py
@@ -6,24 +6,28 @@
 
 
 def posterior_contraction(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
-    references: Mapping[str, np.ndarray] | np.ndarray,
-    aggregation: Callable = np.median,
+    variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
+    aggregation: Callable = np.median,
 ) -> Mapping[str, Any]:
     """Computes the posterior contraction (PC) from prior to posterior for the given samples.
 
     Parameters
     ----------
-    targets   : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
+    estimates   : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
         Posterior samples, comprising `num_draws_post` random draws from the posterior distribution
         for each data set from `num_datasets`.
-    references  : np.ndarray of shape (num_datasets, num_variables)
+    targets  : np.ndarray of shape (num_datasets, num_variables)
         Prior samples, comprising `num_datasets` ground truths.
+    variable_keys : Sequence[str], optional (default = None)
+       Select keys from the dictionaries provided in estimates and targets.
+       By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
     aggregation    : callable, optional (default = np.median)
         Function to aggregate the PC across draws. Typically `np.mean` or `np.median`.
-    variable_names : Sequence[str], optional (default = None)
-        Optional variable names to select from the available variables.
 
     Returns
     -------
@@ -43,10 +47,15 @@ def posterior_contraction(
     indicate low contraction.
     """
 
-    samples = dicts_to_arrays(targets=targets, references=references, variable_names=variable_names)
+    samples = dicts_to_arrays(
+        estimates=estimates,
+        targets=targets,
+        variable_keys=variable_keys,
+        variable_names=variable_names,
+    )
 
-    post_vars = samples["targets"].var(axis=1, ddof=1)
-    prior_vars = samples["references"].var(axis=0, keepdims=True, ddof=1)
+    post_vars = samples["estimates"].var(axis=1, ddof=1)
+    prior_vars = samples["targets"].var(axis=0, keepdims=True, ddof=1)
     contraction = 1 - (post_vars / prior_vars)
     contraction = aggregation(contraction, axis=0)
     return {"values": contraction, "metric_name": "Posterior Contraction", "variable_names": samples["variable_names"]}
diff --git a/bayesflow/diagnostics/metrics/root_mean_squared_error.py b/bayesflow/diagnostics/metrics/root_mean_squared_error.py
@@ -6,27 +6,31 @@
 
 
 def root_mean_squared_error(
+    estimates: Mapping[str, np.ndarray] | np.ndarray,
     targets: Mapping[str, np.ndarray] | np.ndarray,
-    references: Mapping[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
+    variable_names: Sequence[str] = None,
     normalize: bool = True,
     aggregation: Callable = np.median,
-    variable_names: Sequence[str] = None,
 ) -> Mapping[str, Any]:
     """Computes the (Normalized) Root Mean Squared Error (RMSE/NRMSE) for the given posterior and prior samples.
 
     Parameters
     ----------
-    targets   : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
+    estimates   : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
         Posterior samples, comprising `num_draws_post` random draws from the posterior distribution
         for each data set from `num_datasets`.
-    references  : np.ndarray of shape (num_datasets, num_variables)
+    targets  : np.ndarray of shape (num_datasets, num_variables)
         Prior samples, comprising `num_datasets` ground truths.
+    variable_keys : Sequence[str], optional (default = None)
+       Select keys from the dictionaries provided in estimates and targets.
+       By default, select all keys.
+    variable_names : Sequence[str], optional (default = None)
+        Optional variable names to show in the output.
     normalize      : bool, optional (default = True)
         Whether to normalize the RMSE using the range of the prior samples.
     aggregation    : callable, optional (default = np.median)
         Function to aggregate the RMSE across draws. Typically `np.mean` or `np.median`.
-    variable_names : Sequence[str], optional (default = None)
-        Optional variable names to select from the available variables.
 
     Notes
     -----
@@ -45,12 +49,17 @@ def root_mean_squared_error(
             The (inferred) variable names.
     """
 
-    samples = dicts_to_arrays(targets=targets, references=references, variable_names=variable_names)
+    samples = dicts_to_arrays(
+        estimates=estimates,
+        targets=targets,
+        variable_keys=variable_keys,
+        variable_names=variable_names,
+    )
 
-    rmse = np.sqrt(np.mean((samples["targets"] - samples["references"][:, None, :]) ** 2, axis=0))
+    rmse = np.sqrt(np.mean((samples["estimates"] - samples["targets"][:, None, :]) ** 2, axis=0))
 
     if normalize:
-        rmse /= (samples["references"].max(axis=0) - samples["references"].min(axis=0))[None, :]
+        rmse /= (samples["targets"].max(axis=0) - samples["targets"].min(axis=0))[None, :]
         metric_name = "NRMSE"
     else:
         metric_name = "RMSE"

diff --git a/bayesflow/diagnostics/plots/calibration_ecdf.py b/bayesflow/diagnostics/plots/calibration_ecdf.py
@@ -8,8 +8,9 @@
 
 
 def calibration_ecdf(
+    estimates: dict[str, np.ndarray] | np.ndarray,
     targets: dict[str, np.ndarray] | np.ndarray,
-    references: dict[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
     difference: bool = False,
     stacked: bool = False,
@@ -50,9 +51,9 @@ def calibration_ecdf(
 
     Parameters
     ----------
-    targets      : np.ndarray of shape (n_data_sets, n_post_draws, n_params)
+    estimates      : np.ndarray of shape (n_data_sets, n_post_draws, n_params)
         The posterior draws obtained from n_data_sets
-    references     : np.ndarray of shape (n_data_sets, n_params)
+    targets     : np.ndarray of shape (n_data_sets, n_params)
         The prior draws obtained for generating n_data_sets
     difference        : bool, optional, default: False
         If `True`, plots the ECDF difference.
@@ -67,8 +68,11 @@ def calibration_ecdf(
         If `distance`, the ranks are computed as the fraction of posterior
         samples that are closer to a reference points (default here is the origin).
         You can pass a reference array in the same shape as the
-        `prior_samples` array by setting `references` in the ``ranks_kwargs``.
+        `estimates` array by setting `targets` in the ``ranks_kwargs``.
         This is motivated by [2].
+    variable_keys       : list or None, optional, default: None
+       Select keys from the dictionaries provided in estimates and targets.
+       By default, select all keys.
     variable_names    : list or None, optional, default: None
         The parameter names for nice plot titles.
         Inferred if None. Only relevant if `stacked=False`.
@@ -108,31 +112,32 @@ def calibration_ecdf(
     Raises
     ------
     ShapeError
-        If there is a deviation form the expected shapes of `post_samples`
-        and `prior_samples`.
+        If there is a deviation form the expected shapes of `estimates`
+        and `targets`.
     ValueError
         If an unknown `rank_type` is passed.
     """
 
     plot_data = prepare_plot_data(
+        estimates=estimates,
         targets=targets,
-        references=references,
+        variable_keys=variable_keys,
         variable_names=variable_names,
         num_col=num_col,
         num_row=num_row,
         figsize=figsize,
         stacked=stacked,
     )
 
+    estimates = plot_data.pop("estimates")
     targets = plot_data.pop("targets")
-    references = plot_data.pop("references")
 
     if rank_type == "fractional":
         # Compute fractional ranks
-        ranks = fractional_ranks(targets, references)
+        ranks = fractional_ranks(estimates, targets)
     elif rank_type == "distance":
         # Compute ranks based on distance to the origin
-        ranks = distance_ranks(targets, references, stacked=stacked, **kwargs.pop("ranks_kwargs", {}))
+        ranks = distance_ranks(estimates, targets, stacked=stacked, **kwargs.pop("ranks_kwargs", {}))
     else:
         raise ValueError(f"Unknown rank type: {rank_type}. Use 'fractional' or 'distance'.")
 
@@ -157,7 +162,7 @@ def calibration_ecdf(
             plot_data["axes"].flat[j].plot(xx, yy, color=rank_ecdf_color, alpha=0.95, label="Rank ECDF")
 
     # Compute uniform ECDF and bands
-    alpha, z, L, H = simultaneous_ecdf_bands(targets.shape[0], **kwargs.pop("ecdf_bands_kwargs", {}))
+    alpha, z, L, H = simultaneous_ecdf_bands(estimates.shape[0], **kwargs.pop("ecdf_bands_kwargs", {}))
 
     # Difference, if specified
     if difference:

diff --git a/bayesflow/diagnostics/plots/calibration_histogram.py b/bayesflow/diagnostics/plots/calibration_histogram.py
@@ -10,8 +10,9 @@
 
 
 def calibration_histogram(
+    estimates: dict[str, np.ndarray] | np.ndarray,
     targets: dict[str, np.ndarray] | np.ndarray,
-    references: dict[str, np.ndarray] | np.ndarray,
+    variable_keys: Sequence[str] = None,
     variable_names: Sequence[str] = None,
     figsize: Sequence[float] = None,
     num_bins: int = 10,
@@ -35,10 +36,13 @@ def calibration_histogram(
 
     Parameters
     ----------
-    targets      : np.ndarray of shape (n_data_sets, n_post_draws, n_params)
+    estimates      : np.ndarray of shape (n_data_sets, n_post_draws, n_params)
         The posterior draws obtained from n_data_sets
-    references     : np.ndarray of shape (n_data_sets, n_params)
+    targets     : np.ndarray of shape (n_data_sets, n_params)
         The prior draws obtained for generating n_data_sets
+    variable_keys       : list or None, optional, default: None
+       Select keys from the dictionaries provided in estimates and targets.
+       By default, select all keys.
     variable_names    : list or None, optional, default: None
         The parameter names for nice plot titles. Inferred if None
     figsize          : tuple or None, optional, default : None
@@ -67,25 +71,26 @@ def calibration_histogram(
     Raises
     ------
     ShapeError
-        If there is a deviation form the expected shapes of `post_samples` and `prior_samples`.
+        If there is a deviation form the expected shapes of `estimates` and `targets`.
     """
 
     plot_data = prepare_plot_data(
+        estimates=estimates,
         targets=targets,
-        references=references,
+        variable_keys=variable_keys,
         variable_names=variable_names,
         num_col=num_col,
         num_row=num_row,
         figsize=figsize,
     )
 
+    estimates = plot_data.pop("estimates")
     targets = plot_data.pop("targets")
-    references = plot_data.pop("references")
 
     # Determine the ratio of simulations to prior draw
     # num_params = plot_data['num_variables']
-    num_sims = targets.shape[0]
-    num_draws = targets.shape[1]
+    num_sims = estimates.shape[0]
+    num_draws = estimates.shape[1]
 
     ratio = int(num_sims / num_draws)
 
@@ -105,10 +110,10 @@ def calibration_histogram(
             num_bins = 4
 
     # Compute ranks (using broadcasting)
-    ranks = np.sum(targets < references[:, np.newaxis, :], axis=1)
+    ranks = np.sum(estimates < targets[:, np.newaxis, :], axis=1)
 
     # Compute confidence interval and mean
-    num_trials = int(references.shape[0])
+    num_trials = int(targets.shape[0])
     # uniform distribution expected -> for all bins: equal probability
     # p = 1 / num_bins that a rank lands in that bin
     endpoints = binom.interval(binomial_interval, num_trials, 1 / num_bins)

diff --git a/bayesflow/diagnostics/plots/mc_calibration.py b/bayesflow/diagnostics/plots/mc_calibration.py
@@ -68,8 +68,8 @@ def mc_calibration(
 
     # Gather plot data and metadata into a dictionary
     plot_data = prepare_plot_data(
-        targets=pred_models,
-        references=true_models,
+        estimates=pred_models,
+        targets=true_models,
         variable_names=model_names,
         num_col=num_col,
         num_row=num_row,
@@ -79,7 +79,7 @@ def mc_calibration(
 
     # Compute calibration
     cal_errors, true_probs, pred_probs = expected_calibration_error(
-        plot_data["references"], plot_data["targets"], num_bins
+        plot_data["targets"], plot_data["estimates"], num_bins
     )
 
     for j, ax in enumerate(plot_data["axes"].flat):
@@ -88,8 +88,8 @@ def mc_calibration(
 
         # Plot PMP distribution over bins
         uniform_bins = np.linspace(0.0, 1.0, num_bins + 1)
-        norm_weights = np.ones_like(plot_data["targets"]) / len(plot_data["targets"])
-        ax.hist(plot_data["targets"][:, j], bins=uniform_bins, weights=norm_weights[:, j], color="grey", alpha=0.3)
+        norm_weights = np.ones_like(plot_data["estimates"]) / len(plot_data["estimates"])
+        ax.hist(plot_data["estimates"][:, j], bins=uniform_bins, weights=norm_weights[:, j], color="grey", alpha=0.3)
 
         # Plot AB line
         ax.plot((0, 1), (0, 1), "--", color="black", alpha=0.9)