Skip to content

Commit bc3fd03

Browse files
committed
Models container, midway to train and evaluate several models
1 parent ae9ddb0 commit bc3fd03

File tree

3 files changed

+50
-9
lines changed

3 files changed

+50
-9
lines changed

models_container.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, \
44
LinearSVC, NaiveBayes
5+
from pyspark.sql import DataFrame
56

67
from spark_launcher import SparkLauncher
78

@@ -16,6 +17,7 @@ def __init__(self, model, name='', kind=ModelKinds.CLASSIFICATION):
1617
self.model = model
1718
self.name = name
1819
self.kind = kind
20+
self.fitted_model = None
1921

2022

2123
class ModelsContainer(object):
@@ -36,12 +38,40 @@ def __init__(self):
3638

3739
@property
3840
def classification(self):
39-
return [obj for name, obj in self.__dict__.items()
40-
if getattr(obj, "kind", None) == ModelKinds.CLASSIFICATION]
41+
"""Returns the classification models"""
42+
return self._get_models_of_kind(kind=ModelKinds.CLASSIFICATION)
43+
44+
45+
def fit(self, data: DataFrame, kind="*"):
46+
"""Loops though all models of some kind and generates fitted models"""
47+
if kind == "*":
48+
models = self._all_models_dict.values()
49+
else:
50+
models = self._get_models_of_kind(kind)
51+
52+
for model in models:
53+
model.fitted_model = model.model.fit(data)
54+
55+
56+
@property
57+
def fitted_models(self):
58+
return [model.fitted_model for model in self._all_models_dict.values()]
4159

4260

4361
def _wrap_models(self):
44-
for name, obj in self.__dict__.items():
45-
if self.model_path in str(obj.__class__):
46-
wrapped = Model(model=obj, name=name)
47-
setattr(self, name, wrapped)
62+
"""Wraps the pyspark model in our own Model class that
63+
provides some metadata and perhaps extra functionality"""
64+
for name, obj in self._all_models_dict.items():
65+
wrapped = Model(model=obj, name=name)
66+
setattr(self, name, wrapped)
67+
68+
69+
@property
70+
def _all_models_dict(self):
71+
return {name: obj for name, obj in self.__dict__.items()
72+
if self.model_path in str(obj.__class__)}
73+
74+
75+
def _get_models_of_kind(self, kind):
76+
return [obj for name, obj in self.__dict__.items()
77+
if getattr(obj, "kind", None) == kind]

tests/test_model_evaluator.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pyspark.mllib.evaluation import BinaryClassificationMetrics
66

77
from model_evaluator import ModelEvaluator
8+
from models_container import ModelsContainer, ModelKinds
89

910
"""Expected results with real data
1011
model dataset AUC_ROC AUC_PR
@@ -15,7 +16,7 @@
1516
CLASSIFICATION_METRICS = ["areaUnderROC", "areaUnderPR"]
1617

1718

18-
def test_model_evaluator_with_linear_regression_and_tiny_dataset(logistic_model, preprocessor):
    """Perfect separation on the tiny dataset gives AUC of 1.0 on both metrics."""
    expected = {"areaUnderROC": 1., "areaUnderPR": 1.}
    _check_evaluation(preprocessor=preprocessor, model=logistic_model, metrics=expected)
2122

@@ -26,9 +27,19 @@ def test_model_evaluator_with_linear_regression_and_full_train_data(logistic_mod
2627
metrics={"areaUnderROC": 0.764655781, "areaUnderPR": 0.63384702449})
2728

2829

30+
def test_several_classification_models_fitting(preprocessor_train_data):
    """Fits every classification model and runs the evaluator over them.

    Removed a leftover debug ``print('kk')`` statement.
    """
    preprocessor_train_data.prepare_to_model(target_col='income', to_strip=' .')
    evaluator = ModelEvaluator(metrics_class=BinaryClassificationMetrics)
    models = ModelsContainer()
    models.fit(preprocessor_train_data.train_encoded_df, kind=ModelKinds.CLASSIFICATION)
    # TODO(review): assert on the comparison result; for now this only
    # verifies the fit/compare pipeline runs end to end without raising.
    evaluator.compare({"train": preprocessor_train_data.train_encoded_df},
                      models=models.fitted_models)
37+
38+
2939
def _check_evaluation(preprocessor, model, metrics: Dict[str, float]):
3040
metrics_class = BinaryClassificationMetrics
3141
evaluator = ModelEvaluator(metrics_class=metrics_class)
42+
# The purpose of this parameter is to prove names can be arbitrary in the compare method
3243
dataframes_sets = [['train', 'test'], ['train1', 'test1']]
3344
for dataframes in dataframes_sets:
3445
comparison = evaluator.compare(

todo_list.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
(cleaning, encoding, etc)
1616
- [x] obtain evaluation metrics for a single model
1717
- [ ] **fit and compare several classification models without tuning**
18-
- [ ] create an object container for the models
19-
- [ ] initialize the models with default hyperparameters
18+
- [x] create an object container for the models
19+
- [x] initialize the models with default hyperparameters
2020
- [ ] fit and compare the results with the evaluator
2121
- [ ] fit and compare several classification models with tuning and cross-validation
2222
- [ ] be able to pass a list of hyperparameters values for each hyperparameter

0 commit comments

Comments
 (0)