
Commit ed3bd02

[FIX] Reproducible samples for PERMBU (#337)
1 parent 491c2de commit ed3bd02

File tree

4 files changed (+42, -34 lines changed):
hierarchicalforecast/core.py
hierarchicalforecast/probabilistic_methods.py
nbs/src/core.ipynb
nbs/src/probabilistic_methods.ipynb

Diff for: hierarchicalforecast/core.py (+13 -8)

@@ -50,6 +50,7 @@ def _reverse_engineer_sigmah(
     id_col: str = "unique_id",
     time_col: str = "ds",
     target_col: str = "y",
+    num_samples: int = 200,
 ) -> np.ndarray:
     """
     This function assumes that the model creates prediction intervals

@@ -81,7 +82,7 @@ def _reverse_engineer_sigmah(
         sign = -1 if "lo" in pi_col else 1
         level_cols = re.findall("[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+", pi_col)
         level_col = float(level_cols[-1])
-        z = norm.ppf(0.5 + level_col / 200)
+        z = norm.ppf(0.5 + level_col / num_samples)
         sigmah = Y_hat_df[pi_col].to_numpy().reshape(n_series, -1)
         sigmah = sign * (sigmah - y_hat) / z

@@ -430,18 +431,22 @@ def reconcile(
                 reconciler_args["y_hat_insample"] = y_hat_insample

             if level is not None:
+                reconciler_args["intervals_method"] = intervals_method
+                reconciler_args["num_samples"] = 200
+                reconciler_args["seed"] = seed
+
                 if intervals_method in ["normality", "permbu"]:
                     sigmah = _reverse_engineer_sigmah(
-                        Y_hat_df=Y_hat_nw, y_hat=y_hat, model_name=model_name
+                        Y_hat_df=Y_hat_nw,
+                        y_hat=y_hat,
+                        model_name=model_name,
+                        id_col=id_col,
+                        time_col=time_col,
+                        target_col=target_col,
+                        num_samples=reconciler_args["num_samples"],
                     )
                     reconciler_args["sigmah"] = sigmah

-                reconciler_args["intervals_method"] = intervals_method
-                reconciler_args["num_samples"] = (
-                    200  # TODO: solve duplicated num_samples
-                )
-                reconciler_args["seed"] = seed
-
             # Mean and Probabilistic reconciliation
             kwargs_ls = [
                 key
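For context on the _reverse_engineer_sigmah hunks above: the function recovers the forecast scale sigmah from a normality-based prediction-interval column by inverting y_hat ± z * sigmah, where z = norm.ppf(0.5 + level / 200); with the new num_samples argument left at its default of 200 the divisor is unchanged. A minimal illustrative sketch (the values and variable names below are made up, not library code):

import numpy as np
from scipy.stats import norm

# Illustrative inputs, not taken from the library.
y_hat = np.array([100.0, 110.0, 120.0])   # point forecasts
sigmah_true = np.array([5.0, 6.0, 7.0])   # scale the 'model' used internally
level = 90.0                              # a 90% prediction interval

# Forward direction: how a normality-based upper bound is typically built.
z = norm.ppf(0.5 + level / 200)           # 0.5 + 90/200 = 0.95 -> z ~= 1.645
hi_col = y_hat + z * sigmah_true          # e.g. a 'Model-hi-90' column

# Reverse direction, mirroring the idea behind _reverse_engineer_sigmah.
sigmah_recovered = (hi_col - y_hat) / z
assert np.allclose(sigmah_recovered, sigmah_true)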

Diff for: hierarchicalforecast/probabilistic_methods.py (+9 -9)

@@ -85,14 +85,14 @@ def get_samples(self, num_samples: int):
         **Returns:**<br>
         `samples`: Coherent samples of size (`base`, `horizon`, `num_samples`).
         """
-        state = np.random.RandomState(self.seed)
+        rng = np.random.default_rng(self.seed)
         n_series, n_horizon = self.y_hat.shape
         samples = np.empty(shape=(num_samples, n_series, n_horizon))
         for t in range(n_horizon):
             with warnings.catch_warnings():
                 # Avoid 'RuntimeWarning: covariance is not positive-semidefinite.'
                 # By definition the multivariate distribution is not full-rank
-                partial_samples = state.multivariate_normal(
+                partial_samples = rng.multivariate_normal(
                     mean=self.SP @ self.y_hat[:, t],
                     cov=self.cov_rec[t],
                     size=num_samples,

@@ -194,8 +194,8 @@ def get_samples(self, num_samples: int):
         # removing nas from residuals
         residuals = residuals[:, np.isnan(residuals).sum(axis=0) == 0]
         sample_idx = np.arange(residuals.shape[1] - h)
-        state = np.random.RandomState(self.seed)
-        samples_idx = state.choice(sample_idx, size=num_samples)
+        rng = np.random.default_rng(self.seed)
+        samples_idx = rng.choice(sample_idx, size=num_samples)
         samples = [self.y_hat + residuals[:, idx : (idx + h)] for idx in samples_idx]
         SP = self.S @ self.P
         samples = np.apply_along_axis(

@@ -382,21 +382,21 @@ def get_samples(self, num_samples: Optional[int] = None):
             num_samples = residuals.shape[1]

         # Expand residuals to match num_samples [(a,b),T] -> [(a,b),num_samples]
+        rng = np.random.default_rng(self.seed)
         if num_samples > residuals.shape[1]:
-            residuals_idxs = np.random.choice(residuals.shape[1], size=num_samples)
+            residuals_idxs = rng.choice(residuals.shape[1], size=num_samples)
         else:
-            residuals_idxs = np.random.choice(
+            residuals_idxs = rng.choice(
                 residuals.shape[1], size=num_samples, replace=False
             )
         residuals = residuals[:, residuals_idxs]
         rank_permutations = self._obtain_ranks(residuals)

-        state = np.random.RandomState(self.seed)
         n_series, n_horizon = self.y_hat.shape

         base_samples = np.array(
             [
-                state.normal(loc=m, scale=s, size=num_samples)
+                rng.normal(loc=m, scale=s, size=num_samples)
                 for m, s in zip(self.y_hat.flatten(), self.sigmah.flatten())
             ]
         )

@@ -432,7 +432,7 @@ def get_samples(self, num_samples: Optional[int] = None):
         parent_samples = np.einsum("ab,bhs->ahs", Agg, children_samples)
         random_permutation = np.array(
             [
-                np.random.permutation(np.arange(num_samples))
+                rng.permutation(np.arange(num_samples))
                 for serie in range(len(parent_samples))
             ]
         )
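The hunks above replace np.random.RandomState and the module-level np.random.* calls with one seeded np.random.default_rng Generator per get_samples call, so every draw comes from the same reproducible stream and is isolated from global state. A small standalone sketch of the behaviour this relies on (illustrative only, not library code):

import numpy as np

# Two Generators built from the same seed produce identical draws,
# and perturbing the legacy global state does not affect them.
rng_a = np.random.default_rng(123)
rng_b = np.random.default_rng(123)

np.random.seed(0)                                    # unrelated global state
draws_a = rng_a.normal(loc=0.0, scale=1.0, size=5)
np.random.seed(999)                                  # changed again, still irrelevant
draws_b = rng_b.normal(loc=0.0, scale=1.0, size=5)

assert np.array_equal(draws_a, draws_b)              # reproducible per seed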

Diff for: nbs/src/core.ipynb (+11 -8)

@@ -150,7 +150,8 @@
 "                              model_name: str,\n",
 "                              id_col: str = \"unique_id\",\n",
 "                              time_col: str = \"ds\",\n",
-"                              target_col: str = \"y\") -> np.ndarray:\n",
+"                              target_col: str = \"y\",\n",
+"                              num_samples: int = 200) -> np.ndarray:\n",
 "    \"\"\"\n",
 "    This function assumes that the model creates prediction intervals\n",
 "    under a normality with the following the Equation:\n",

@@ -179,7 +180,7 @@
 "        sign = -1 if 'lo' in pi_col else 1\n",
 "        level_cols = re.findall('[\\d]+[.,\\d]+|[\\d]*[.][\\d]+|[\\d]+', pi_col)\n",
 "        level_col = float(level_cols[-1])\n",
-"        z = norm.ppf(0.5 + level_col / 200)\n",
+"        z = norm.ppf(0.5 + level_col / num_samples)\n",
 "        sigmah = Y_hat_df[pi_col].to_numpy().reshape(n_series,-1)\n",
 "        sigmah = sign * (sigmah - y_hat) / z\n",
 "\n",

@@ -476,14 +477,17 @@
 "                reconciler_args['y_hat_insample'] = y_hat_insample\n",
 "\n",
 "            if level is not None:\n",
+"                reconciler_args['intervals_method'] = intervals_method\n",
+"                reconciler_args['num_samples'] = 200\n",
+"                reconciler_args['seed'] = seed\n",
+"\n",
 "                if intervals_method in ['normality', 'permbu']:\n",
 "                    sigmah = _reverse_engineer_sigmah(Y_hat_df=Y_hat_nw,\n",
-"                                                      y_hat=y_hat, model_name=model_name)\n",
+"                                                      y_hat=y_hat, model_name=model_name,\n",
+"                                                      id_col=id_col, time_col=time_col,\n",
+"                                                      target_col=target_col, num_samples=reconciler_args['num_samples'])\n",
 "                    reconciler_args['sigmah'] = sigmah\n",
 "\n",
-"                reconciler_args['intervals_method'] = intervals_method\n",
-"                reconciler_args['num_samples'] = 200 # TODO: solve duplicated num_samples\n",
-"                reconciler_args['seed'] = seed\n",
 "\n",
 "            # Mean and Probabilistic reconciliation\n",
 "            kwargs_ls = [key for key in signature(reconciler.fit_predict).parameters if key in reconciler_args.keys()]\n",

@@ -513,11 +517,10 @@
 "                if num_samples > 0:\n",
 "                    samples = reconciler.sample(num_samples=num_samples)\n",
 "                    self.sample_names[recmodel_name] = [f'{recmodel_name}-sample-{i}' for i in range(num_samples)]\n",
-"                    samples = np.reshape(samples, (len(Y_tilde_nw),-1)) \n",
+"                    samples = np.reshape(samples, (len(Y_tilde_nw),-1))\n",
 "                    y_tilde = dict(zip(self.sample_names[recmodel_name], samples.T))\n",
 "                    Y_tilde_nw = Y_tilde_nw.with_columns(**y_tilde)\n",
 "        \n",
-"\n",
 "            end = time.time()\n",
 "            self.execution_times[f'{model_name}/{reconcile_fn_name}'] = (end - start)\n",
 "\n",

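As a side note on the kwargs_ls context line kept above: reconcile only forwards the entries of reconciler_args that the reconciler's fit_predict actually declares, so setting intervals_method, num_samples and seed up front is harmless for reconcilers that do not accept them. A hedged sketch of that filtering pattern with a made-up DummyReconciler (not the library's class):

from inspect import signature

class DummyReconciler:
    # Hypothetical stand-in: fit_predict accepts only a subset of the args.
    def fit_predict(self, S, y_hat, sigmah=None, num_samples=200, seed=None):
        return {"num_samples": num_samples, "seed": seed}

reconciler = DummyReconciler()
reconciler_args = {
    "S": "S-matrix", "y_hat": "forecasts", "sigmah": "scales",
    "intervals_method": "permbu", "num_samples": 200, "seed": 0,
}

# Keep only the keys fit_predict declares, mirroring the kwargs_ls line above.
kwargs_ls = [k for k in signature(reconciler.fit_predict).parameters if k in reconciler_args]
kwargs = {k: reconciler_args[k] for k in kwargs_ls}
print(reconciler.fit_predict(**kwargs))   # {'num_samples': 200, 'seed': 0}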
Diff for: nbs/src/probabilistic_methods.ipynb (+9 -9)

@@ -139,14 +139,14 @@
 "        **Returns:**<br>\n",
 "        `samples`: Coherent samples of size (`base`, `horizon`, `num_samples`).\n",
 "        \"\"\"\n",
-"        state = np.random.RandomState(self.seed)\n",
+"        rng = np.random.default_rng(self.seed)\n",
 "        n_series, n_horizon = self.y_hat.shape\n",
 "        samples = np.empty(shape=(num_samples, n_series, n_horizon))\n",
 "        for t in range(n_horizon):\n",
 "            with warnings.catch_warnings():\n",
 "                # Avoid 'RuntimeWarning: covariance is not positive-semidefinite.'\n",
 "                # By definition the multivariate distribution is not full-rank\n",
-"                partial_samples = state.multivariate_normal(mean=self.SP @ self.y_hat[:,t],\n",
+"                partial_samples = rng.multivariate_normal(mean=self.SP @ self.y_hat[:,t],\n",
 "                                                            cov=self.cov_rec[t], size=num_samples)\n",
 "            samples[:,:,t] = partial_samples\n",
 "\n",

@@ -273,8 +273,8 @@
 "        #removing nas from residuals\n",
 "        residuals = residuals[:, np.isnan(residuals).sum(axis=0) == 0]\n",
 "        sample_idx = np.arange(residuals.shape[1] - h)\n",
-"        state = np.random.RandomState(self.seed)\n",
-"        samples_idx = state.choice(sample_idx, size=num_samples)\n",
+"        rng = np.random.default_rng(self.seed)\n",
+"        samples_idx = rng.choice(sample_idx, size=num_samples)\n",
 "        samples = [self.y_hat + residuals[:, idx:(idx + h)] for idx in samples_idx]\n",
 "        SP = self.S @ self.P\n",
 "        samples = np.apply_along_axis(lambda path: np.matmul(SP, path),\n",

@@ -488,19 +488,19 @@
 "            num_samples = residuals.shape[1]\n",
 "\n",
 "        # Expand residuals to match num_samples [(a,b),T] -> [(a,b),num_samples]\n",
+"        rng = np.random.default_rng(self.seed)\n",
 "        if num_samples > residuals.shape[1]:\n",
-"            residuals_idxs = np.random.choice(residuals.shape[1], size=num_samples)\n",
+"            residuals_idxs = rng.choice(residuals.shape[1], size=num_samples)\n",
 "        else:\n",
-"            residuals_idxs = np.random.choice(residuals.shape[1], size=num_samples,\n",
+"            residuals_idxs = rng.choice(residuals.shape[1], size=num_samples,\n",
 "                                              replace=False)\n",
 "        residuals = residuals[:,residuals_idxs]\n",
 "        rank_permutations = self._obtain_ranks(residuals)\n",
 "\n",
-"        state = np.random.RandomState(self.seed)\n",
 "        n_series, n_horizon = self.y_hat.shape\n",
 "\n",
 "        base_samples = np.array([\n",
-"            state.normal(loc=m, scale=s, size=num_samples) for m, s in \\\n",
+"            rng.normal(loc=m, scale=s, size=num_samples) for m, s in \\\n",
 "            zip(self.y_hat.flatten(), self.sigmah.flatten())\n",
 "        ])\n",
 "        base_samples = base_samples.reshape(n_series, n_horizon, num_samples)\n",

@@ -536,7 +536,7 @@
 "        # and randomly shuffle parent predictions after aggregation\n",
 "        parent_samples = np.einsum('ab,bhs->ahs', Agg, children_samples)\n",
 "        random_permutation = np.array([\n",
-"            np.random.permutation(np.arange(num_samples)) \\\n",
+"            rng.permutation(np.arange(num_samples)) \\\n",
 "            for serie in range(len(parent_samples))\n",
 "        ])\n",
 "        parent_samples = self._permutate_predictions(\n",

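The last hunk routes PERMBU's per-series shuffle of the aggregated parent samples through the same seeded Generator as every other draw. Below is a rough, self-contained sketch of such a shuffle, using np.take_along_axis in place of the library's _permutate_predictions helper (shapes and names are illustrative, not library code):

import numpy as np

rng = np.random.default_rng(0)
n_parents, horizon, num_samples = 2, 3, 5
parent_samples = rng.normal(size=(n_parents, horizon, num_samples))

# One independent permutation of the sample axis per parent series,
# drawn from the same seeded Generator used for all other sampling.
random_permutation = np.array(
    [rng.permutation(np.arange(num_samples)) for _ in range(len(parent_samples))]
)
shuffled = np.take_along_axis(parent_samples, random_permutation[:, None, :], axis=-1)

print(shuffled.shape)   # (2, 3, 5); rerunning with seed 0 gives identical output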