
Commit 9209452

Fix prediction fails with MOO ensemble and dummy is best (#1518)
* Init commit
* Fix DummyClassifiers in _load_pareto_set
* Add test for dummy only in classifiers
* Update no ensemble docstring
* Add automl case where automl only has dummy
* Remove tmp file
* Fix `include` statement to be regressor
1 parent 9914168 commit 9209452
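In short: with multi-objective optimization the pareto set is materialized as VotingClassifier/VotingRegressor instances, and prediction failed whenever one of their members was a bare dummy model instead of a full auto-sklearn pipeline. The sketch below is a hedged reproduction, not taken from this PR; it assumes auto-sklearn's public multi-objective API (a list-valued `metric` and `get_pareto_set()`), and the failure only shows up when the dummy baseline ends up on the pareto front.

# Hedged reproduction sketch; not part of the commit. Assumes the public
# multi-objective API: list-valued `metric` and `get_pareto_set()`.
import sklearn.datasets
import sklearn.model_selection

import autosklearn.classification
import autosklearn.metrics

X, y = sklearn.datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    metric=[autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy],
)
automl.fit(X_train, y_train)

# Before this commit, using the pareto-set voters could raise when the only
# surviving member was the dummy model.
for voter in automl.get_pareto_set():
    voter.predict(X_test)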


2 files changed: +131 -16


autosklearn/automl.py

+32 -6
@@ -48,6 +48,7 @@
     BaseShuffleSplit,
     _RepeatedSplits,
 )
+from sklearn.pipeline import Pipeline
 from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_is_fitted
 from smac.callbacks import IncorporateRunResultCallback
@@ -1473,6 +1474,7 @@ def predict(self, X, batch_size=None, n_jobs=1):
         # Each process computes predictions in chunks of batch_size rows.
         try:
             for i, tmp_model in enumerate(self.models_.values()):
+                # TODO, modify this
                 if isinstance(tmp_model, (DummyRegressor, DummyClassifier)):
                     check_is_fitted(tmp_model)
                 else:
@@ -1683,10 +1685,8 @@ def _load_best_individual_model(self):
         return ensemble
 
     def _load_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]:
-        if self._ensemble_class is not None:
+        if self.ensemble_ is None:
             self.ensemble_ = self._backend.load_ensemble(self._seed)
-        else:
-            self.ensemble_ = None
 
         # If no ensemble is loaded we cannot do anything
         if not self.ensemble_:
@@ -1716,8 +1716,10 @@ def _load_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]:
                 estimators=None,
                 voting="soft",
             )
+            kind = "classifier"
         else:
             voter = VotingRegressor(estimators=None)
+            kind = "regressor"
 
         if self._resampling_strategy in ("cv", "cv-iterative-fit"):
             models = self._backend.load_cv_models_by_identifiers(identifiers)
@@ -1730,8 +1732,32 @@ def _load_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]:
         weight_vector = []
         estimators = []
         for identifier in identifiers:
-            weight_vector.append(weights[identifier])
-            estimators.append(models[identifier])
+            estimator = models[identifier]
+            weight = weights[identifier]
+
+            # Kind of hacky, really the dummy models should
+            # act like everything else does. Doing this is
+            # required so that the VotingClassifier/Regressor
+            # can use it as intended
+            if not isinstance(estimator, Pipeline):
+                if kind == "classifier":
+                    steps = [
+                        ("data_preprocessor", None),
+                        ("balancing", None),
+                        ("feature_preprocessor", None),
+                        (kind, estimator),
+                    ]
+                else:
+                    steps = [
+                        ("data_preprocessor", None),
+                        ("feature_preprocessor", None),
+                        (kind, estimator),
+                    ]
+
+                estimator = Pipeline(steps=steps)
+
+            weight_vector.append(weight)
+            estimators.append(estimator)
 
         voter.estimators = estimators
         voter.estimators_ = estimators
@@ -2148,7 +2174,7 @@ def show_models(self) -> dict[int, Any]:
 
         ensemble_dict = {}
 
-        if self._ensemble_class is not None:
+        if self._ensemble_class is None:
             warnings.warn(
                 "No models in the ensemble. Kindly provide an ensemble class."
            )
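The core of the automl.py change is the Pipeline wrapping in `_load_pareto_set`: dummy models are stored as bare estimators rather than auto-sklearn pipelines, so the voters built for the pareto set could not treat them like the other members. Below is a minimal, self-contained sketch of that idea in plain scikit-learn; it is not auto-sklearn's code. The step names mirror the diff, and "passthrough" is used where the diff uses None (scikit-learn skips both kinds of step).

# Minimal sketch of the wrapping idea, using plain scikit-learn only.
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X = rng.random((20, 3))
y = np.array([0, 1] * 10)

# The dummy baseline is fitted directly, not through the pipeline machinery.
dummy = DummyClassifier(strategy="most_frequent").fit(X, y)

# Wrapping the already-fitted dummy gives it the same shape as every other
# ensemble member: no-op preprocessing steps plus a final "classifier" step.
wrapped = Pipeline(
    steps=[
        ("data_preprocessor", "passthrough"),
        ("balancing", "passthrough"),
        ("feature_preprocessor", "passthrough"),
        ("classifier", dummy),
    ]
)

# The wrapper predicts exactly like the bare dummy, but a VotingClassifier or
# VotingRegressor can now treat it like any other pipeline in the pareto set.
assert (wrapped.predict(X) == dummy.predict(X)).all()
assert np.allclose(wrapped.predict_proba(X), dummy.predict_proba(X))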

test/test_automl/cases.py

+99 -10
@@ -14,7 +14,6 @@
     {fitted} - If the automl case has been fitted
     {cv, holdout} - Whether explicitly cv or holdout was used
     {no_ensemble} - Fit with no ensemble size
-    {cached} - If the resulting case is then cached
     {multiobjective} - If the automl instance is multiobjective
 """
 from __future__ import annotations
@@ -24,17 +23,27 @@
 from pathlib import Path
 
 import numpy as np
+import sklearn.model_selection
 
 import autosklearn.metrics
 from autosklearn.automl import AutoMLClassifier, AutoMLRegressor
 from autosklearn.automl_common.common.utils.backend import Backend
+from autosklearn.evaluation.abstract_evaluator import (
+    MyDummyClassifier,
+    MyDummyRegressor,
+)
 
 from pytest_cases import case, parametrize
 
 from test.fixtures.backend import copy_backend
 from test.fixtures.caching import Cache
 
 
+def stop_at_first(smbo, run_info, result, time_left) -> bool:
+    """Used in some cases to enforce the only valid model is the dummy model"""
+    return False
+
+
 @case(tags=["classifier"])
 def case_classifier(
     tmp_dir: str,
@@ -60,7 +69,7 @@ def case_regressor(
 # ###################################
 # The following are fitted and cached
 # ###################################
-@case(tags=["classifier", "fitted", "holdout", "cached"])
+@case(tags=["classifier", "fitted", "holdout"])
 @parametrize("dataset", ["iris"])
 def case_classifier_fitted_holdout_iterative(
     dataset: str,
@@ -97,7 +106,7 @@ def case_classifier_fitted_holdout_iterative(
     return model
 
 
-@case(tags=["classifier", "fitted", "cv", "cached"])
+@case(tags=["classifier", "fitted", "cv"])
 @parametrize("dataset", ["iris"])
 def case_classifier_fitted_cv(
     make_cache: Callable[[str], Cache],
@@ -134,7 +143,7 @@ def case_classifier_fitted_cv(
     return model
 
 
-@case(tags=["classifier", "fitted", "holdout", "cached", "multiobjective"])
+@case(tags=["classifier", "fitted", "holdout", "multiobjective"])
 @parametrize("dataset", ["iris"])
 def case_classifier_fitted_holdout_multiobjective(
     dataset: str,
@@ -177,7 +186,7 @@ def case_classifier_fitted_holdout_multiobjective(
     return model
 
 
-@case(tags=["regressor", "fitted", "holdout", "cached"])
+@case(tags=["regressor", "fitted", "holdout"])
 @parametrize("dataset", ["boston"])
 def case_regressor_fitted_holdout(
     make_cache: Callable[[str], Cache],
@@ -212,7 +221,7 @@ def case_regressor_fitted_holdout(
     return model
 
 
-@case(tags=["regressor", "fitted", "cv", "cached"])
+@case(tags=["regressor", "fitted", "cv"])
 @parametrize("dataset", ["boston"])
 def case_regressor_fitted_cv(
     make_cache: Callable[[str], Cache],
@@ -249,7 +258,7 @@ def case_regressor_fitted_cv(
     return model
 
 
-@case(tags=["classifier", "fitted", "no_ensemble", "cached"])
+@case(tags=["classifier", "fitted", "no_ensemble"])
 @parametrize("dataset", ["iris"])
 def case_classifier_fitted_no_ensemble(
     make_cache: Callable[[str], Cache],
@@ -258,8 +267,7 @@ def case_classifier_fitted_no_ensemble(
     make_automl_classifier: Callable[..., AutoMLClassifier],
     make_sklearn_dataset: Callable[..., Tuple[np.ndarray, ...]],
 ) -> AutoMLClassifier:
-    """Case of a fitted classifier but ensemble was disabled by
-    not writing models to disk"""
+    """Case of a fitted classifier but ensemble was disabled"""
     key = f"case_classifier_fitted_no_ensemble_{dataset}"
 
     # This locks the cache for this item while we check, required for pytest-xdist
@@ -270,7 +278,6 @@ def case_classifier_fitted_no_ensemble(
                 temporary_directory=cache.path("backend"),
                 delete_tmp_folder_after_terminate=False,
                 ensemble_class=None,
-                disable_evaluator_output=True,
             )
 
             X, y, Xt, yt = make_sklearn_dataset(name=dataset)
@@ -282,3 +289,85 @@
     model._backend = copy_backend(old=model._backend, new=make_backend())
 
     return model
+
+
+@case(tags=["classifier", "fitted"])
+def case_classifier_fitted_only_dummy(
+    make_cache: Callable[[str], Cache],
+    make_backend: Callable[..., Backend],
+    make_automl_classifier: Callable[..., AutoMLClassifier],
+) -> AutoMLClassifier:
+    """Case of a fitted classifier but only dummy was found"""
+    key = "case_classifier_fitted_only_dummy"
+
+    # This locks the cache for this item while we check, required for pytest-xdist
+
+    with make_cache(key) as cache:
+        if "model" not in cache:
+            model = make_automl_classifier(
+                temporary_directory=cache.path("backend"),
+                delete_tmp_folder_after_terminate=False,
+                include={"classifier": ["bernoulli_nb"]},  # Just a meh model
+                get_trials_callback=stop_at_first,
+            )
+            rand = np.random.RandomState(2)
+            _X = rand.random((100, 50))
+            _y = rand.randint(0, 2, (100,))
+            X, Xt, y, yt = sklearn.model_selection.train_test_split(
+                _X, _y, random_state=1  # Required to ensure dummy is best
+            )
+            model.fit(X, y, dataset_name="random")
+
+            # We now validate that indeed, the only model is the Dummy
+            members = list(model.models_.values())
+            if len(members) != 1 or not isinstance(members[0], MyDummyClassifier):
+                raise ValueError("Should only have one model, dummy\n", members)
+
+            cache.save(model, "model")
+
+    model = cache.load("model")
+    model._backend = copy_backend(old=model._backend, new=make_backend())
+
+    return model
+
+
+@case(tags=["regressor", "fitted"])
+def case_regressor_fitted_only_dummy(
+    make_cache: Callable[[str], Cache],
+    make_backend: Callable[..., Backend],
+    make_automl_regressor: Callable[..., AutoMLRegressor],
+) -> AutoMLRegressor:
+    """Case of a fitted regressor but only dummy was found"""
+    key = "case_regressor_fitted_only_dummy"
+
+    # This locks the cache for this item while we check, required for pytest-xdist
+
+    with make_cache(key) as cache:
+        if "model" not in cache:
+            model = make_automl_regressor(
+                temporary_directory=cache.path("backend"),
+                delete_tmp_folder_after_terminate=False,
+                include={"regressor": ["k_nearest_neighbors"]},  # Just a meh model
+                get_trials_callback=stop_at_first,
+            )
+
+            rand = np.random.RandomState(2)
+            _X = rand.random((100, 50))
+            _y = rand.random((100,))
+
+            X, Xt, y, yt = sklearn.model_selection.train_test_split(
+                _X, _y, random_state=1  # Required to ensure dummy is best
+            )
+            model.fit(X, y, dataset_name="random")
+
+            # We now validate that indeed, the only model is the Dummy
+            members = list(model.models_.values())
+            if len(members) != 1 or not isinstance(members[0], MyDummyRegressor):
+                raise ValueError("Should only have one model, dummy\n", members)
+
+            cache.save(model, "model")
+
+    model = cache.load("model")
+    model._backend = copy_backend(old=model._backend, new=make_backend())
+
+    return model
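For context, this is roughly how such case functions are consumed with pytest-cases; the sketch below is illustrative and is not the test added by this PR. It assumes the cases module is importable as `test.test_automl.cases` (mirroring the `test.fixtures` imports above) and relies only on the `models_` attribute already used in this diff.

# Illustrative sketch; assumes the import path and tag filter described above.
from pytest_cases import parametrize_with_cases

from autosklearn.automl import AutoMLClassifier

from test.test_automl import cases


@parametrize_with_cases("automl", cases=cases, has_tag=["classifier", "fitted"])
def test_fitted_classifier_has_models(automl: AutoMLClassifier) -> None:
    # Every fitted case, including case_classifier_fitted_only_dummy where the
    # dummy is the sole member, should expose at least one model.
    assert len(automl.models_) >= 1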
