Allow evaluations on new data with new categories (#96)

bambinos · Jun 9, 2023 · 0ec4fdf · 0ec4fdf
1 parent a160993
commit 0ec4fdf
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 18 deletions.
diff --git a/formulae/__init__.py b/formulae/__init__.py
@@ -1,12 +1,10 @@
 import logging
 
-from .config import Config
+from .config import config
 from .matrices import design_matrices
 from .model_description import model_description
 from .version import __version__
 
-config = Config()
-
 __all__ = [
     "config",
     "design_matrices",

diff --git a/formulae/config.py b/formulae/config.py
@@ -1,5 +1,5 @@
 class Config:
-    FIELDS = {"EVAL_NEW_CATEGORIES": ("error", "warning", "silent")}
+    FIELDS = {"EVAL_UNSEEN_CATEGORIES": ("error", "warning", "silent")}
 
     def __init__(self, config_dict: dict = None):
         config_dict = {} if config_dict is None else config_dict
@@ -35,3 +35,6 @@ def __str__(self):  # pragma: no cover
 
     def __repr__(self):  # pragma: no cover
         return str(self)
+
+
+config = Config()
diff --git a/formulae/terms/call.py b/formulae/terms/call.py
@@ -1,11 +1,13 @@
 import sys
+import warnings
 
 import numpy as np
 import pandas as pd
 
 from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype
 
 from formulae.categorical import ENCODINGS, CategoricalBox, Treatment
+from formulae.config import config
 from formulae.transforms import TRANSFORMS, Proportion, Offset
 from formulae.terms.call_utils import CallVarsExtractor
 
@@ -309,12 +311,32 @@ def eval_new_data_categoric(self, x):
         if not difference:
             idxs = pd.Categorical(x, categories=self.levels).codes
             return self.contrast_matrix.matrix[idxs]
-        else:
+
+        if config["EVAL_UNSEEN_CATEGORIES"] == "error":
             difference = [str(x) for x in difference]
             raise ValueError(
-                f"The levels {', '.join(difference)} in '{self.name}' are not present in "
-                "the original data set."
+                f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
+                "original data set."
+            )
+        # When there's an unseen category it will first use it as if it was the first category
+        # so we can still index 'contrast_matrix.matrix' but then it will replace all the values
+        # in there with all zeros.
+
+        # pandas uses '-1' for unseen levels
+        idxs_original = pd.Categorical(x, categories=self.levels).codes
+        idxs_modified = np.copy(idxs_original)
+        idxs_modified[idxs_original == -1] = 0
+        contribution = self.contrast_matrix.matrix[idxs_modified]
+        contribution[idxs_original == -1] = 0
+
+        if config["EVAL_UNSEEN_CATEGORIES"] == "warning":
+            difference = [str(x) for x in difference]
+            warnings.warn(
+                f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
+                "original data set. It's impossible to select appropriate contrasts for them. "
+                "Setting all the indicator variables to zero."
             )
+        return contribution
 
     def eval_new_data_categorical_box(self, x):
         return self.eval_new_data_categoric(x.data)

diff --git a/formulae/terms/variable.py b/formulae/terms/variable.py
@@ -1,10 +1,12 @@
 import sys
+import warnings
 
 import numpy as np
 import pandas as pd
 
 from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype
 
+from formulae.config import config
 from formulae.categorical import Treatment
 
 
@@ -218,12 +220,32 @@ def eval_new_data_categoric(self, x):
         if not difference:
             idxs = pd.Categorical(x, categories=self.levels).codes
             return self.contrast_matrix.matrix[idxs]
-        else:
+
+        if config["EVAL_UNSEEN_CATEGORIES"] == "error":
             difference = [str(x) for x in difference]
             raise ValueError(
-                f"The levels {', '.join(difference)} in '{self.name}' are not present in "
-                "the original data set."
+                f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
+                "original data set."
+            )
+        # When there's an unseen category it will first use it as if it was the first category
+        # so we can still index 'contrast_matrix.matrix' but then it will replace all the values
+        # in there with all zeros.
+
+        # pandas uses '-1' for unseen levels
+        idxs_original = pd.Categorical(x, categories=self.levels).codes
+        idxs_modified = np.copy(idxs_original)
+        idxs_modified[idxs_original == -1] = 0
+        contribution = self.contrast_matrix.matrix[idxs_modified]
+        contribution[idxs_original == -1] = 0
+
+        if config["EVAL_UNSEEN_CATEGORIES"] == "warning":
+            difference = [str(x) for x in difference]
+            warnings.warn(
+                f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
+                "original data set. It's impossible to select appropriate contrasts for them. "
+                "Setting all the indicator variables to zero."
             )
+        return contribution
 
     @property
     def labels(self):

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -6,11 +6,13 @@
 def test_config():
     config = Config()
 
-    assert config["EVAL_NEW_CATEGORIES"] == "error"
-    assert config.EVAL_NEW_CATEGORIES == "error"
+    assert config["EVAL_UNSEEN_CATEGORIES"] == "error"
+    assert config.EVAL_UNSEEN_CATEGORIES == "error"
 
-    with pytest.raises(ValueError, match="anything is not a valid value for 'EVAL_NEW_CATEGORIES'"):
-        config.EVAL_NEW_CATEGORIES = "anything"
+    with pytest.raises(
+        ValueError, match="anything is not a valid value for 'EVAL_UNSEEN_CATEGORIES'"
+    ):
+        config.EVAL_UNSEEN_CATEGORIES = "anything"
 
     with pytest.raises(KeyError, match="'DOESNT_EXIST' is not a valid configuration option"):
         config.DOESNT_EXIST = "anything"

diff --git a/tests/test_eval_new_data.py b/tests/test_eval_new_data.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pandas as pd
 
+from formulae.config import config
 from formulae.environment import Environment
 from formulae.parser import Parser
 from formulae.scanner import Scanner
@@ -72,7 +73,8 @@ def test_term_new_data_categoric():
     # It remembers it saw "A", "B", and "C", but not "D".
     # So when you pass a new level, it raises a ValueError.
     with pytest.raises(
-        ValueError, match="The levels D in 'x' are not present in the original data set."
+        ValueError, 
+        match=re.escape("The levels (D) in 'x' are not present in the original data set.")
     ):
         data = pd.DataFrame({"x": ["B", "C", "D"]})
         var_term.eval_new_data(data)
@@ -91,7 +93,8 @@ def test_term_new_data_categoric():
     # It remembers it saw "A", "B", and "C", but not "D".
     # So when you pass a new level, it raises a ValueError.
     with pytest.raises(
-        ValueError, match="The levels D in 'x' are not present in the original data set."
+        ValueError, 
+        match=re.escape("The levels (D) in 'x' are not present in the original data set.")
     ):
         data = pd.DataFrame({"x": ["B", "C", "D"]})
         var_term.eval_new_data(data)
@@ -112,7 +115,7 @@ def test_call_new_data_categoric_stateful_transform():
 
     with pytest.raises(
         ValueError,
-        match=re.escape("The levels 4 in 'C(x)' are not present in the original data set"),
+        match=re.escape("The levels (4) in 'C(x)' are not present in the original data set"),
     ):
         data = pd.DataFrame({"x": [2, 3, 4]})
         call_term.eval_new_data(data)
@@ -131,7 +134,7 @@ def test_call_new_data_categoric_stateful_transform():
     # So when you pass a new level, it raises a ValueError.
     with pytest.raises(
         ValueError,
-        match=re.escape("The levels 4 in 'C(x)' are not present in the original data set"),
+        match=re.escape("The levels (4) in 'C(x)' are not present in the original data set"),
     ):
         data = pd.DataFrame({"x": [2, 3, 4]})
         call_term.eval_new_data(data)
@@ -307,3 +310,35 @@ def test_eval_new_data_when_evaluated_false(data, data2):
         common.evaluate_new_data(data2)
     with pytest.raises(ValueError):
         group.evaluate_new_data(data2)
+
+
+def test_eval_unseen_categories():
+    """Unseen levels raise by default; otherwise their rows are all zeros."""
+    df = pd.DataFrame({"x": np.arange(10), "g": list("abcde") * 2})
+    df2 = pd.DataFrame({"g": list("abxz")})
+    dm = design_matrices("x ~ 0 + g", df)
+    # Rows for "a" and "b" keep their indicators; "x" and "z" were never seen.
+    expected = np.array(
+        [
+            [1, 0, 0, 0, 0],
+            [0, 1, 0, 0, 0],
+            [0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0],
+        ]
+    )
+
+    with pytest.raises(ValueError, match="not present in the original data set"):
+        dm.common.evaluate_new_data(df2)
+
+    # 'config' is a module-level singleton: restore it so later tests see the old value.
+    original = config.EVAL_UNSEEN_CATEGORIES
+    try:
+        config.EVAL_UNSEEN_CATEGORIES = "warning"
+        with pytest.warns(UserWarning, match="It's impossible to select appropriate contrasts"):
+            common2 = dm.common.evaluate_new_data(df2)
+        assert np.array_equal(common2.design_matrix, expected)
+        config.EVAL_UNSEEN_CATEGORIES = "silent"
+        common2 = dm.common.evaluate_new_data(df2)
+        assert np.array_equal(common2.design_matrix, expected)
+    finally:
+        config.EVAL_UNSEEN_CATEGORIES = original