Skip to content

Alternative scoring metrics #965

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ae2b2e3
Copy everything from the other branch, make sure to use a --sign-off …
carl-offerfit Mar 31, 2025
806c4a1
Fix: No default y,t scoring so it defaults to the models built in
carl-offerfit Apr 2, 2025
a55aa7e
Switch to passing non sklearn scoring as a function argument
carl-offerfit May 1, 2025
5661117
Fix imperative nature of docstring
carl-offerfit May 1, 2025
61623b0
Update tests to use pearsonr function in the test
carl-offerfit May 1, 2025
3df876a
Fix for naming of return result in test
carl-offerfit May 1, 2025
e8deb97
Cleaner handling of the scorer name when it is a function
carl-offerfit May 1, 2025
3df55dd
Correct the docstring to include the other alternatives
carl-offerfit May 1, 2025
50a6b59
Fix type hints to be compatible with earlier Python versions
carl-offerfit May 15, 2025
72ade93
Fix for if a scoring function is partial to np.array vs. np.ndarray (…
carl-offerfit May 15, 2025
90a5d01
Add comment
carl-offerfit May 15, 2025
98394b9
Adding tests of score function validation
carl-offerfit May 15, 2025
de716b0
Fix the tests to pass
carl-offerfit May 15, 2025
4401d69
Merge branch 'main' into carl/metrics-take2
carl-offerfit May 27, 2025
b2028b0
Remove the validation of the score function.
Jun 9, 2025
a405cac
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 9, 2025
fe5f873
Remove the validation of the score function.
Jun 9, 2025
e148384
Merge branch 'carl/metrics-take2' of github.com:carl-offerfit/EconML …
Jun 10, 2025
c1ffdd8
Merge branch 'main' into carl/metrics-take2
Jun 11, 2025
469a6e1
Fix bug in else condition on application of squeeze
Jun 11, 2025
5db1240
Merge branch 'main' into carl/metrics-take2
Jun 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions econml/_ortho_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,7 @@ def effect_inference(self, X=None, *, T0=0, T1=1):

effect_inference.__doc__ = LinearCateEstimator.effect_inference.__doc__

def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None, scoring=None):
"""
Score the fitted CATE model on a new data set.

Expand Down Expand Up @@ -1055,6 +1055,9 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
Weights for each sample
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
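scoring: str or callable, optional
Scoring to use instead of the default: either the name of an sklearn scorer (any name from
sklearn.metrics.get_scorer_names(), with or without the "neg_" prefix) or a callable taking
(Y_true, Y_pred). Only supported when the final model is an _rlearner._ModelFinal.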

Returns
-------
Expand Down Expand Up @@ -1113,9 +1116,24 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):

accumulated_nuisances += nuisances

score_kwargs = {
'X': X,
'W': W,
'Z': Z,
'sample_weight': sample_weight,
'groups': groups
}
# If using an _rlearner, the scoring parameter can be passed along, if provided
if scoring is not None:
# Imported here rather than at module level to avoid a circular import
from .dml._rlearner import _ModelFinal
if isinstance(self._ortho_learner_model_final, _ModelFinal):
score_kwargs['scoring'] = scoring
else:
raise NotImplementedError("scoring parameter only implemented for "
"_rlearner._ModelFinal")
return self._ortho_learner_model_final.score(Y, T, nuisances=accumulated_nuisances,
**filter_none_kwargs(X=X, W=W, Z=Z,
sample_weight=sample_weight, groups=groups))
**filter_none_kwargs(**score_kwargs))

@property
def ortho_learner_model_final_(self):
Expand Down
166 changes: 156 additions & 10 deletions econml/dml/_rlearner.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,16 @@

from abc import abstractmethod
import numpy as np

import pandas as pd
from sklearn.metrics import (
get_scorer,
get_scorer_names
)
from typing import Callable, Union
from ..sklearn_extensions.model_selection import ModelSelector
from ..utilities import (filter_none_kwargs)
from .._ortho_learner import _OrthoLearner


class _ModelNuisance(ModelSelector):
"""
RLearner nuisance model.
Expand All @@ -54,10 +58,13 @@ def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight
filter_none_kwargs(sample_weight=sample_weight, groups=groups))
return self

def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None,
y_scoring=None, t_scoring=None, t_score_by_dim=False):
# note that groups are not passed to score because they are only used for fitting
T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight))
Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight))
T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight),
scoring=t_scoring, score_by_dim=t_score_by_dim)
Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight),
scoring=y_scoring)
return Y_score, T_score

def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
Expand Down Expand Up @@ -98,18 +105,92 @@ def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None,
def predict(self, X=None):
return self._model_final.predict(X)

def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None,
scoring='mean_squared_error'):
"""
Score final model fit of residualized outcomes from residualized treatments and nuisances.

The default scoring method "mean_squared_error" is the score used to fit residualized
outcomes from residualized treatments and nuisances, and reproduces the behavior of this
score function from before the scoring method option.

:param Y: Unused
:param T: Unused
:param X: Combined nuisances, treatments and instruments to call _model_final.predict
:param W: Unused
:param Z: Unused
:param nuisances: tuple of the outcome (Y) residuals and treatment (T) residuals
:param sample_weight: Optional weighting on the samples
:param groups: Unused
:param scoring: Optional alternative scoring metric from sklearn.get_scorer
:return: Float score
"""
Y_res, T_res = nuisances
if Y_res.ndim == 1:
Y_res = Y_res.reshape((-1, 1))
if T_res.ndim == 1:
T_res = T_res.reshape((-1, 1))
effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1]))
Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape)
return _ModelFinal._wrap_scoring(Y_true=Y_res, Y_pred=Y_res_pred, scoring=scoring, sample_weight=sample_weight)


@staticmethod
def _wrap_scoring(scoring: Union[str, Callable], Y_true, Y_pred, sample_weight=None):
    """
    Resolve ``scoring`` to a score function and call it with Y_true, Y_pred.

    Standard score names like "mean_squared_error" are present in sklearn scoring as
    "neg_..." so score names are accepted either with or without the "neg_" prefix.
    The scorer's ``_score_func`` member is called directly because the scorer objects
    returned by get_scorer() are called with an estimator and data, whereas here the
    predictions have already been computed. As a consequence the raw metric value is
    returned (e.g. "neg_mean_squared_error" yields the un-negated mean squared error).
    Custom callable score functions are allowed but they are not validated before use;
    any errors they raise propagate to the caller.

    :param scoring: A string name of a scoring function from sklearn, or any callable that will
                    function as the score. A callable is invoked as ``scoring(Y_true, Y_pred)``,
                    with ``sample_weight=...`` added only when sample weights are given.
    :param Y_true: True Y values
    :param Y_pred: Predicted Y values
    :param sample_weight: Optional weighting on the examples
    :return: Whatever the score function returns (a float for the sklearn metrics)
    :raises NotImplementedError: If ``scoring`` is neither a known sklearn scorer name
                                 nor a callable
    """
    score_fn = None
    if callable(scoring):
        score_fn = scoring
    elif isinstance(scoring, str):
        # Look the registry up once and try the plain name before the "neg_" variant
        known_names = get_scorer_names()
        for candidate in (scoring, 'neg_' + scoring):
            if candidate in known_names:
                score_fn = get_scorer(candidate)._score_func
                break
    if score_fn is None:
        raise NotImplementedError(f"_wrap_scoring does not support '{scoring}'")

    # Some score-like functions expect 1-D vectors rather than (N, 1) columns. Only the
    # column axis is squeezed so that a single-sample input stays 1-D instead of
    # collapsing to a 0-d scalar.
    if len(Y_true.shape) == 2 and Y_true.shape[1] == 1:
        Y_true = Y_true.squeeze(axis=1)
    if len(Y_pred.shape) == 2 and Y_pred.shape[1] == 1:
        Y_pred = Y_pred.squeeze(axis=1)

    # Only forward sample_weight when provided, so callables without that keyword still work
    if sample_weight is not None:
        return score_fn(Y_true, Y_pred, sample_weight=sample_weight)
    return score_fn(Y_true, Y_pred)


@staticmethod
def wrap_scoring(scoring, Y_true, Y_pred, sample_weight=None, score_by_dim=False):
    """
    Score predictions, optionally with one score per output dimension.

    In case the caller wants a score for each dimension of a multiple treatment model,
    the single score wrapper ``_ModelFinal._wrap_scoring`` is applied to each column.

    :param scoring: A string name of a scoring function from sklearn, or a callable,
                    passed through to ``_ModelFinal._wrap_scoring``
    :param Y_true: True values
    :param Y_pred: Predicted values
    :param sample_weight: Optional weighting on the examples
    :param score_by_dim: If True, score every column separately and return a list
    :return: A single score when ``score_by_dim`` is False, otherwise a list with one
             score per column (a one-element list for 1-D inputs)
    """
    if not score_by_dim:
        return _ModelFinal._wrap_scoring(scoring, Y_true, Y_pred, sample_weight)

    assert Y_true.shape == Y_pred.shape, "Mismatch shape in wrap_scoring"
    if len(Y_pred.shape) == 1:
        # A 1-D target has a single dimension to score; indexing shape[1] would fail
        return [_ModelFinal._wrap_scoring(scoring, Y_true, Y_pred, sample_weight)]
    return [
        _ModelFinal._wrap_scoring(scoring, Y_true[:, col], Y_pred[:, col], sample_weight)
        for col in range(Y_pred.shape[1])
    ]


class _RLearner(_OrthoLearner):
Expand Down Expand Up @@ -422,7 +503,7 @@ def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sam
cache_values=cache_values,
inference=inference)

def score(self, Y, T, X=None, W=None, sample_weight=None):
def score(self, Y, T, X=None, W=None, sample_weight=None, scoring=None):
"""
Score the fitted CATE model on a new data set.

Expand Down Expand Up @@ -453,7 +534,7 @@ def score(self, Y, T, X=None, W=None, sample_weight=None):
The MSE of the final CATE model on the new data.
"""
# Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring
return super().score(Y, T, X=X, W=W, sample_weight=sample_weight)
return super().score(Y, T, X=X, W=W, sample_weight=sample_weight, scoring=scoring)

@property
def rlearner_model_final_(self):
Expand Down Expand Up @@ -493,3 +574,68 @@ def residuals_(self):
"Set to `True` to enable residual storage.")
Y_res, T_res = self._cached_values.nuisances
return Y_res, T_res, self._cached_values.X, self._cached_values.W

@staticmethod
def scoring_name(scoring: Union[str, Callable, None]) -> str:
    """
    Return a printable name for a scoring specification.

    Parameters
    ----------
    scoring: str, callable or None
        The scoring specification: a score name, a score function, or None for the
        model's default score.

    Returns
    -------
    name : str
        ``'default_score'`` for None, the string itself for a string, and for a callable
        its ``__name__``. Callables without a ``__name__`` fall back to the name of the
        wrapped function (``functools.partial``) or, failing that, to their class name.

    Raises
    ------
    ValueError
        If ``scoring`` is not a string, a callable or None.
    """
    if scoring is None:
        return 'default_score'
    if isinstance(scoring, str):
        return scoring
    if not callable(scoring):
        raise ValueError("Scoring should be str|Callable|None")

    name = getattr(scoring, '__name__', None)
    if name is None:
        # functools.partial objects and callable instances carry no __name__
        wrapped = getattr(scoring, 'func', None)
        name = getattr(wrapped, '__name__', None) or type(scoring).__name__
    return name


def score_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None, y_scoring=None,
                    t_scoring=None, t_score_by_dim=False):
    """
    Score the fitted nuisance models on arbitrary data, optionally with alternative scoring.

    Parameters
    ----------
    Y: (n, d_y) matrix or vector of length n
        Outcomes for each sample
    T: (n, d_t) matrix or vector of length n
        Treatments for each sample
    X: (n, d_x) matrix, optional
        Features for each sample
    W: (n, d_w) matrix, optional
        Controls for each sample
    Z: (n, d_z) matrix, optional
        Instruments for each sample
    sample_weight:(n,) vector, optional
        Weights for each sample
    y_scoring: str or callable, optional
        Scoring to use instead of the default for model_y; it is forwarded unchanged to
        each nuisance model's ``score``. Names are typically taken from
        sklearn.metrics.get_scorer_names().
    t_scoring: str or callable, optional
        Scoring to use instead of the default for model_t; forwarded in the same way.
    t_score_by_dim: bool, default=False
        Score prediction of treatment dimensions separately

    Returns
    -------
    score_dict : dict[str,list]
        A dictionary with one key for the Y score and one for the T score, named
        ``'Y_<scoring name>'`` and ``'T_<scoring name>'``. Each value is a list holding
        one entry per nuisance model in ``self._models_nuisance[0]``.
    """
    outcome_key = f'Y_{_RLearner.scoring_name(y_scoring)}'
    treatment_key = f'T_{_RLearner.scoring_name(t_scoring)}'

    # Discrete, single-column targets are passed through pd.get_dummies before scoring
    if self.discrete_outcome and (len(Y.shape) == 1 or Y.shape[1] == 1):
        outcome_to_score = pd.get_dummies(Y)
    else:
        outcome_to_score = Y
    if self.discrete_treatment and (len(T.shape) == 1 or T.shape[1] == 1):
        treatment_to_score = pd.get_dummies(T)
    else:
        treatment_to_score = T

    outcome_scores = []
    treatment_scores = []
    for nuisance_model in self._models_nuisance[0]:
        outcome_score, treatment_score = nuisance_model.score(
            outcome_to_score, treatment_to_score, X=X, W=W, Z=Z,
            sample_weight=sample_weight,
            y_scoring=y_scoring, t_scoring=t_scoring,
            t_score_by_dim=t_score_by_dim)
        outcome_scores.append(outcome_score)
        treatment_scores.append(treatment_score)

    return {outcome_key: outcome_scores, treatment_key: treatment_scores}
43 changes: 33 additions & 10 deletions econml/dml/dml.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
from sklearn.preprocessing import (FunctionTransformer)
from sklearn.utils import check_random_state


from .._ortho_learner import _OrthoLearner
from ._rlearner import _RLearner
from ._rlearner import _RLearner, _ModelFinal
from .._cate_estimator import (DebiasedLassoCateEstimatorMixin,
LinearModelFinalCateEstimatorMixin,
StatsModelsCateEstimatorMixin,
Expand Down Expand Up @@ -54,20 +55,42 @@ def predict(self, X, W):
raise AttributeError("Cannot use a classifier as a first stage model when the target is continuous!")
return self._model.predict(_combine(X, W, n_samples))

def score(self, X, W, Target, sample_weight=None):
if hasattr(self._model, 'score'):
if self._discrete_target:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
Target = inverse_onehot(Target)
def score(self, X, W, Target, sample_weight=None, scoring=None, score_by_dim=False):
"""
Score the first stage model on provided data.

:param X: Features
:param W: Controls
:param Target: The true targets
:param sample_weight: optional sample weights
:param scoring: optional sklearn scorer name or callable. When given, the model's predictions
are scored through _ModelFinal.wrap_scoring instead of the model's own score method
:param score_by_dim: If a multi-dimension treatment, score each treatment separately.
:return: The score (a list of per-column scores when score_by_dim is True), or None if the
model has no score method
"""
XW_combined = _combine(X, W, Target.shape[0])
if self._discrete_target:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
Target = inverse_onehot(Target)
if hasattr(self._model, 'score') and scoring is None and not score_by_dim:
# Standard default model scoring
if sample_weight is not None:
return self._model.score(_combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight)
return self._model.score(XW_combined, Target, sample_weight=sample_weight)
else:
return self._model.score(_combine(X, W, Target.shape[0]), Target)
return self._model.score(XW_combined, Target)
elif hasattr(self._model, 'score'):
return _FirstStageWrapper._wrap_scoring(scoring,Y_true=Target, X=XW_combined, est=self._model,
sample_weight=sample_weight, score_by_dim=score_by_dim)
else:
return None

@staticmethod
def _wrap_scoring(scoring, Y_true, X, est, sample_weight=None, score_by_dim=False):
    """
    Score an estimator's predictions on X against Y_true.

    Calls ``est.predict(X)`` and hands the result to ``_ModelFinal.wrap_scoring``
    together with the scoring specification, sample weights and by-dimension flag.
    """
    predictions = est.predict(X)
    return _ModelFinal.wrap_scoring(
        scoring, Y_true, predictions, sample_weight, score_by_dim=score_by_dim
    )

class _FirstStageSelector(SingleModelSelector):
def __init__(self, model: SingleModelSelector, discrete_target):
Expand Down
Loading
Loading