Skip to content

Commit

Permalink
Allow evaluations on new data with new categories (#96)
Browse files Browse the repository at this point in the history
  • Loading branch information
tomicapretto authored Jun 9, 2023
1 parent a160993 commit 0ec4fdf
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 18 deletions.
4 changes: 1 addition & 3 deletions formulae/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import logging

from .config import Config
from .config import config
from .matrices import design_matrices
from .model_description import model_description
from .version import __version__

config = Config()

__all__ = [
"config",
"design_matrices",
Expand Down
5 changes: 4 additions & 1 deletion formulae/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
class Config:
FIELDS = {"EVAL_NEW_CATEGORIES": ("error", "warning", "silent")}
FIELDS = {"EVAL_UNSEEN_CATEGORIES": ("error", "warning", "silent")}

def __init__(self, config_dict: dict = None):
config_dict = {} if config_dict is None else config_dict
Expand Down Expand Up @@ -35,3 +35,6 @@ def __str__(self): # pragma: no cover

def __repr__(self): # pragma: no cover
return str(self)


config = Config()
28 changes: 25 additions & 3 deletions formulae/terms/call.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import sys
import warnings

import numpy as np
import pandas as pd

from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype

from formulae.categorical import ENCODINGS, CategoricalBox, Treatment
from formulae.config import config
from formulae.transforms import TRANSFORMS, Proportion, Offset
from formulae.terms.call_utils import CallVarsExtractor

Expand Down Expand Up @@ -309,12 +311,32 @@ def eval_new_data_categoric(self, x):
if not difference:
idxs = pd.Categorical(x, categories=self.levels).codes
return self.contrast_matrix.matrix[idxs]
else:

if config["EVAL_UNSEEN_CATEGORIES"] == "error":
difference = [str(x) for x in difference]
raise ValueError(
f"The levels {', '.join(difference)} in '{self.name}' are not present in "
"the original data set."
f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
"original data set."
)
# Rows holding an unseen category are first indexed as if they belonged to the first level,
# so 'contrast_matrix.matrix' can still be indexed with a valid row; those rows are then
# overwritten with zeros.

# pandas uses '-1' for unseen levels
idxs_original = pd.Categorical(x, categories=self.levels).codes
idxs_modified = np.copy(idxs_original)
idxs_modified[idxs_original == -1] = 0
contribution = self.contrast_matrix.matrix[idxs_modified]
contribution[idxs_original == -1] = 0

if config["EVAL_UNSEEN_CATEGORIES"] == "warning":
difference = [str(x) for x in difference]
warnings.warn(
f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
"original data set. It's impossible to select appropriate contrasts for them. "
"Setting all the indicator variables to zero."
)
return contribution

def eval_new_data_categorical_box(self, x):
return self.eval_new_data_categoric(x.data)
Expand Down
28 changes: 25 additions & 3 deletions formulae/terms/variable.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import sys
import warnings

import numpy as np
import pandas as pd

from pandas.api.types import is_categorical_dtype, is_numeric_dtype, is_string_dtype

from formulae.config import config
from formulae.categorical import Treatment


Expand Down Expand Up @@ -218,12 +220,32 @@ def eval_new_data_categoric(self, x):
if not difference:
idxs = pd.Categorical(x, categories=self.levels).codes
return self.contrast_matrix.matrix[idxs]
else:

if config["EVAL_UNSEEN_CATEGORIES"] == "error":
difference = [str(x) for x in difference]
raise ValueError(
f"The levels {', '.join(difference)} in '{self.name}' are not present in "
"the original data set."
f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
"original data set."
)
# Rows holding an unseen category are first indexed as if they belonged to the first level,
# so 'contrast_matrix.matrix' can still be indexed with a valid row; those rows are then
# overwritten with zeros.

# pandas uses '-1' for unseen levels
idxs_original = pd.Categorical(x, categories=self.levels).codes
idxs_modified = np.copy(idxs_original)
idxs_modified[idxs_original == -1] = 0
contribution = self.contrast_matrix.matrix[idxs_modified]
contribution[idxs_original == -1] = 0

if config["EVAL_UNSEEN_CATEGORIES"] == "warning":
difference = [str(x) for x in difference]
warnings.warn(
f"The levels ({', '.join(difference)}) in '{self.name}' are not present in the "
"original data set. It's impossible to select appropriate contrasts for them. "
"Setting all the indicator variables to zero."
)
return contribution

@property
def labels(self):
Expand Down
10 changes: 6 additions & 4 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
def test_config():
config = Config()

assert config["EVAL_NEW_CATEGORIES"] == "error"
assert config.EVAL_NEW_CATEGORIES == "error"
assert config["EVAL_UNSEEN_CATEGORIES"] == "error"
assert config.EVAL_UNSEEN_CATEGORIES == "error"

with pytest.raises(ValueError, match="anything is not a valid value for 'EVAL_NEW_CATEGORIES'"):
config.EVAL_NEW_CATEGORIES = "anything"
with pytest.raises(
ValueError, match="anything is not a valid value for 'EVAL_UNSEEN_CATEGORIES'"
):
config.EVAL_UNSEEN_CATEGORIES = "anything"

with pytest.raises(KeyError, match="'DOESNT_EXIST' is not a valid configuration option"):
config.DOESNT_EXIST = "anything"
Expand Down
43 changes: 39 additions & 4 deletions tests/test_eval_new_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd

from formulae.config import config
from formulae.environment import Environment
from formulae.parser import Parser
from formulae.scanner import Scanner
Expand Down Expand Up @@ -72,7 +73,8 @@ def test_term_new_data_categoric():
# It remembers it saw "A", "B", and "C", but not "D".
# So when you pass a new level, it raises a ValueError.
with pytest.raises(
ValueError, match="The levels D in 'x' are not present in the original data set."
ValueError,
match=re.escape("The levels (D) in 'x' are not present in the original data set.")
):
data = pd.DataFrame({"x": ["B", "C", "D"]})
var_term.eval_new_data(data)
Expand All @@ -91,7 +93,8 @@ def test_term_new_data_categoric():
# It remembers it saw "A", "B", and "C", but not "D".
# So when you pass a new level, it raises a ValueError.
with pytest.raises(
ValueError, match="The levels D in 'x' are not present in the original data set."
ValueError,
match=re.escape("The levels (D) in 'x' are not present in the original data set.")
):
data = pd.DataFrame({"x": ["B", "C", "D"]})
var_term.eval_new_data(data)
Expand All @@ -112,7 +115,7 @@ def test_call_new_data_categoric_stateful_transform():

with pytest.raises(
ValueError,
match=re.escape("The levels 4 in 'C(x)' are not present in the original data set"),
match=re.escape("The levels (4) in 'C(x)' are not present in the original data set"),
):
data = pd.DataFrame({"x": [2, 3, 4]})
call_term.eval_new_data(data)
Expand All @@ -131,7 +134,7 @@ def test_call_new_data_categoric_stateful_transform():
# So when you pass a new level, it raises a ValueError.
with pytest.raises(
ValueError,
match=re.escape("The levels 4 in 'C(x)' are not present in the original data set"),
match=re.escape("The levels (4) in 'C(x)' are not present in the original data set"),
):
data = pd.DataFrame({"x": [2, 3, 4]})
call_term.eval_new_data(data)
Expand Down Expand Up @@ -307,3 +310,35 @@ def test_eval_new_data_when_evaluated_false(data, data2):
common.evaluate_new_data(data2)
with pytest.raises(ValueError):
group.evaluate_new_data(data2)


def test_eval_unseen_categories():
    """Check evaluation of new data containing categories unseen at fit time.

    Exercises the three values of ``config.EVAL_UNSEEN_CATEGORIES``:

    * ``"error"``: evaluating the new data raises ``ValueError``.
    * ``"warning"``: a ``UserWarning`` is emitted and the rows of the unseen
      levels are all zeros.
    * ``"silent"``: same design matrix as ``"warning"``.

    The global configuration is restored on exit so other tests are unaffected.
    """
    df = pd.DataFrame({"x": np.arange(10), "g": list("abcde") * 2})
    df2 = pd.DataFrame({"g": list("abxz")})
    dm = design_matrices("x ~ 0 + g", df)

    # "a" and "b" were seen in 'df'; "x" and "z" were not, so their rows are all zeros.
    expected = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 0, 0, 0],
            [0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0],
        ]
    )

    # 'config' is a process-wide singleton: remember its value and restore it afterwards.
    previous = config.EVAL_UNSEEN_CATEGORIES
    try:
        # Set the mode explicitly instead of relying on whatever earlier tests left behind.
        config.EVAL_UNSEEN_CATEGORIES = "error"
        with pytest.raises(ValueError, match="not present in the original data set"):
            dm.common.evaluate_new_data(df2)

        config.EVAL_UNSEEN_CATEGORIES = "warning"
        with pytest.warns(UserWarning, match="It's impossible to select appropriate contrasts"):
            common2 = dm.common.evaluate_new_data(df2)
        # A bare '==' comparison would be discarded; assert so a mismatch fails the test.
        np.testing.assert_array_equal(common2.design_matrix, expected)

        config.EVAL_UNSEEN_CATEGORIES = "silent"
        common2 = dm.common.evaluate_new_data(df2)
        np.testing.assert_array_equal(common2.design_matrix, expected)
    finally:
        config.EVAL_UNSEEN_CATEGORIES = previous

0 comments on commit 0ec4fdf

Please sign in to comment.