Alternative scoring metrics #965

Open · wants to merge 2 commits into base: main
24 changes: 21 additions & 3 deletions econml/_ortho_learner.py
@@ -1027,7 +1027,7 @@ def effect_inference(self, X=None, *, T0=0, T1=1):

effect_inference.__doc__ = LinearCateEstimator.effect_inference.__doc__

def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None, scoring=None):
"""
Score the fitted CATE model on a new data set.

@@ -1055,6 +1055,9 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
Weights for each samples
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
scoring: str, optional
Name of an sklearn scoring function to use instead of the default. Supports f1_score,
log_loss, mean_absolute_error, mean_squared_error, r2_score, and roc_auc_score.

Returns
-------
@@ -1113,9 +1116,24 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):

accumulated_nuisances += nuisances

score_kwargs = {
'X': X,
'W': W,
'Z': Z,
'sample_weight': sample_weight,
'groups': groups
}
# If using an _rlearner, the scoring parameter can be passed along, if provided
if scoring is not None:
# Imported here rather than at module level to avoid a circular import
from .dml._rlearner import _ModelFinal
if isinstance(self._ortho_learner_model_final, _ModelFinal):
score_kwargs['scoring'] = scoring
else:
raise NotImplementedError("scoring parameter only implemented for "
"_rlearner._ModelFinal")
return self._ortho_learner_model_final.score(Y, T, nuisances=accumulated_nuisances,
**filter_none_kwargs(X=X, W=W, Z=Z,
sample_weight=sample_weight, groups=groups))
**filter_none_kwargs(**score_kwargs))

@property
def ortho_learner_model_final_(self):
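For illustration, here is a minimal sketch of how the new scoring argument could be used once this change is in place. The estimator choice (LinearDML) and the synthetic data are assumptions made for the example; only the scoring= keyword comes from this PR, and a call without it keeps the previous MSE behavior.

import numpy as np
from econml.dml import LinearDML

# Synthetic data, made up purely for the example
rng = np.random.default_rng(0)
n = 2000
X = rng.normal(size=(n, 4))
W = rng.normal(size=(n, 6))
T = rng.binomial(1, 0.5, size=n)
y = (1 + X[:, 0]) * T + W[:, 0] + rng.normal(size=n)

est = LinearDML(discrete_treatment=True, cv=2, random_state=0)
est.fit(y, T, X=X, W=W)

# Default behavior is unchanged: MSE of residualized Y against residualized T
mse = est.score(y, T, X=X, W=W)
# With this PR, an alternative sklearn metric name can be passed through to the final model
mae = est.score(y, T, X=X, W=W, scoring='mean_absolute_error')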
153 changes: 143 additions & 10 deletions econml/dml/_rlearner.py
@@ -27,7 +27,12 @@

from abc import abstractmethod
import numpy as np

import pandas as pd
from sklearn.metrics import (
get_scorer,
get_scorer_names
)
from scipy.stats import pearsonr
from ..sklearn_extensions.model_selection import ModelSelector
from ..utilities import (filter_none_kwargs)
from .._ortho_learner import _OrthoLearner
@@ -54,10 +59,13 @@ def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight
filter_none_kwargs(sample_weight=sample_weight, groups=groups))
return self

def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None,
y_scoring=None, t_scoring=None, t_score_by_dim=False):
# note that groups are not passed to score because they are only used for fitting
T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight))
Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight))
T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight),
scoring=t_scoring, score_by_dim=t_score_by_dim)
Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight),
scoring=y_scoring)
return Y_score, T_score

def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None):
@@ -98,18 +106,89 @@ def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None,
def predict(self, X=None):
return self._model_final.predict(X)

def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None):
def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None,
scoring='mean_squared_error'):
"""
Score final model fit of residualized outcomes from residualized treatments and nuisances.

The default scoring method, "mean_squared_error", is the same score used to fit the residualized
outcomes from the residualized treatments and nuisances, and reproduces the behavior of this
score function from before the scoring option was added.

:param Y: Unused
:param T: Unused
:param X: Combined nuisances, treatments and instruments to call _model_final.predict
:param W: Unused
:param Z: Unused
:param nuisances: tuple of the outcome (Y) residuals and treatment (T) residuals
:param sample_weight: Optional weighting on the samples
:param groups: Unused
:param scoring: Optional alternative scoring metric from sklearn.get_scorer
:return: Float score
"""
Y_res, T_res = nuisances
if Y_res.ndim == 1:
Y_res = Y_res.reshape((-1, 1))
if T_res.ndim == 1:
T_res = T_res.reshape((-1, 1))
effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1]))
Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape)
if sample_weight is not None:
return np.mean(np.average((Y_res - Y_res_pred) ** 2, weights=sample_weight, axis=0))
else:
return np.mean((Y_res - Y_res_pred) ** 2)
return _ModelFinal._wrap_scoring(Y_true=Y_res, Y_pred=Y_res_pred, scoring=scoring, sample_weight=sample_weight)

@staticmethod
def _wrap_scoring(scoring, Y_true, Y_pred, sample_weight=None):
"""
Pull the scoring function from sklearn.get_scorer and call it with Y_true, Y_pred.

Standard score names like "mean_squared_error" are present in sklearn scoring as
"neg_..." so score names are accepted either with or without the "neg_" prefix.
The function _score_func is called directly because the scorer objects from get_scorer()
do not accept a sample_weight parameter. The _score_func member has been available in
sklearn scorers since before sklearn 1.0.

A special case is written for using 'pearsonr' as a score function, with unweighted samples.
pearsonr is a useful score function for "Zero Inflated" regression problems, in which
a regression problem has mainly zero outcomes. In that case the MSE is typically a very
small number and the r-squared may be negative; a statistically significant correlation
between predictions and outcomes is the best evidence the model fit is meaningful.

:param scoring: A string name of a scoring function from sklearn
:param Y_true: True Y values
:param Y_pred: Predicted Y values
:param sample_weight: Optional weighting on the examples
:return: Float score
"""
if scoring in get_scorer_names():
score_fn = get_scorer(scoring)._score_func
elif 'neg_' + scoring in get_scorer_names():
score_fn = get_scorer('neg_' + scoring)._score_func
elif scoring == 'pearsonr':
if sample_weight is not None:
raise NotImplementedError("scoring does not support pearsonr with sample_weight" )
return pearsonr(np.squeeze(Y_true), np.squeeze(Y_pred))
else:
raise NotImplementedError(f"_wrap_scoring does not support '{scoring}'" )

res = score_fn(Y_true, Y_pred, sample_weight=sample_weight)
return res


@staticmethod
def wrap_scoring(scoring, Y_true, Y_pred, sample_weight=None, score_by_dim=False):
"""
Score predictions with the named metric, optionally scoring each dimension of a
multi-dimensional treatment separately.

When score_by_dim is True, loop over the output dimensions and call the single-score
wrapper on each one.
"""
if not score_by_dim:
return _ModelFinal._wrap_scoring(scoring, Y_true, Y_pred, sample_weight)
else:
assert Y_true.shape == Y_pred.shape, "Mismatch shape in wrap_scoring"
n_out = Y_pred.shape[1]
res = [None] * n_out
for yidx in range(n_out):
res[yidx] = _ModelFinal.wrap_scoring(scoring, Y_true[:, yidx], Y_pred[:, yidx], sample_weight)
return res


class _RLearner(_OrthoLearner):
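As a standalone sketch of the scorer lookup that _wrap_scoring performs: plain metric names such as "mean_squared_error" are registered in sklearn as "neg_mean_squared_error", so both spellings resolve to the same underlying _score_func, and 'pearsonr' falls back to scipy. The helper name resolve_score_fn below is hypothetical, and the sketch assumes sklearn >= 1.0 (where get_scorer_names is available).

import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import get_scorer, get_scorer_names

def resolve_score_fn(scoring):
    # Hypothetical helper mirroring the lookup in _wrap_scoring above
    if scoring in get_scorer_names():
        return get_scorer(scoring)._score_func
    if 'neg_' + scoring in get_scorer_names():
        return get_scorer('neg_' + scoring)._score_func
    if scoring == 'pearsonr':
        # pearsonr returns (statistic, p-value); sample weights are not supported here
        return lambda y_true, y_pred: pearsonr(np.squeeze(y_true), np.squeeze(y_pred))
    raise NotImplementedError(f"unsupported scoring '{scoring}'")

y_true = np.array([0.0, 1.0, 2.0, 3.0])
y_pred = np.array([0.1, 0.9, 2.2, 2.8])
mse = resolve_score_fn('mean_squared_error')(y_true, y_pred)  # positive MSE, not the negated scorer value
corr, pval = resolve_score_fn('pearsonr')(y_true, y_pred)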
@@ -422,7 +501,7 @@ def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sam
cache_values=cache_values,
inference=inference)

def score(self, Y, T, X=None, W=None, sample_weight=None):
def score(self, Y, T, X=None, W=None, sample_weight=None, scoring=None):
"""
Score the fitted CATE model on a new data set.

@@ -453,7 +532,7 @@ def score(self, Y, T, X=None, W=None, sample_weight=None):
The MSE of the final CATE model on the new data.
"""
# Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring
return super().score(Y, T, X=X, W=W, sample_weight=sample_weight)
return super().score(Y, T, X=X, W=W, sample_weight=sample_weight, scoring=scoring)

@property
def rlearner_model_final_(self):
@@ -493,3 +572,57 @@ def residuals_(self):
"Set to `True` to enable residual storage.")
Y_res, T_res = self._cached_values.nuisances
return Y_res, T_res, self._cached_values.X, self._cached_values.W


def score_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None, y_scoring=None,
t_scoring=None, t_score_by_dim=False):
"""
Score the fitted nuisance models on arbitrary data, using any supported sklearn scoring function.

Parameters
----------
Y: (n, d_y) matrix or vector of length n
Outcomes for each sample
T: (n, d_t) matrix or vector of length n
Treatments for each sample
X: (n, d_x) matrix, optional
Features for each sample
W: (n, d_w) matrix, optional
Controls for each sample
Z: (n, d_z) matrix, optional
Instruments for each sample
sample_weight: (n,) vector, optional
Weights for each sample
t_scoring: str, optional
Name of an sklearn scoring function to use instead of the default for model_t; choices
are from sklearn.metrics.get_scorer_names() plus 'pearsonr'
y_scoring: str, optional
Name of an sklearn scoring function to use instead of the default for model_y; choices
are from sklearn.metrics.get_scorer_names() plus 'pearsonr'
t_score_by_dim: bool, default=False
Score prediction of treatment dimensions separately

Returns
-------
score_dict : dict[str,list[float]]
A dictionary where the keys indicate the Y and T scores used and the values are
lists of scores, one per CV fold model.
"""
Y_key = 'Y_defscore' if not y_scoring else f"Y_{y_scoring}"
T_key = 'T_defscore' if not t_scoring else f"T_{t_scoring}"
score_dict = {
Y_key: [],
T_key: []
}

# Discrete outcomes and treatments must be one-hot encoded before scoring
Y_2_score = pd.get_dummies(Y) if self.discrete_outcome and (len(Y.shape) == 1 or Y.shape[1] == 1) else Y
T_2_score = pd.get_dummies(T) if self.discrete_treatment and (len(T.shape) == 1 or T.shape[1] == 1) else T

for m in self._models_nuisance[0]:
Y_score, T_score = m.score(Y_2_score, T_2_score, X=X, W=W, Z=Z, sample_weight=sample_weight,
y_scoring=y_scoring, t_scoring=t_scoring,
t_score_by_dim=t_score_by_dim)
score_dict[Y_key].append(Y_score)
score_dict[T_key].append(T_score)
return score_dict
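A usage sketch for score_nuisances, continuing the est fitted in the earlier LinearDML example (again, the data and estimator are assumptions; the call signature and key format are from this diff):

fold_scores = est.score_nuisances(Y=y, T=T, X=X, W=W,
                                  y_scoring='mean_absolute_error',
                                  t_scoring='roc_auc')
# Keys are "Y_<scoring>" / "T_<scoring>" (or "Y_defscore" / "T_defscore" when no name is given);
# each value is a list with one score per CV fold model, e.g. two entries for cv=2.
print(fold_scores['Y_mean_absolute_error'])
print(fold_scores['T_roc_auc'])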
43 changes: 33 additions & 10 deletions econml/dml/dml.py
@@ -10,8 +10,9 @@
from sklearn.preprocessing import (FunctionTransformer)
from sklearn.utils import check_random_state


from .._ortho_learner import _OrthoLearner
from ._rlearner import _RLearner
from ._rlearner import _RLearner, _ModelFinal
from .._cate_estimator import (DebiasedLassoCateEstimatorMixin,
LinearModelFinalCateEstimatorMixin,
StatsModelsCateEstimatorMixin,
@@ -52,20 +53,42 @@ def predict(self, X, W):
raise AttributeError("Cannot use a classifier as a first stage model when the target is continuous!")
return self._model.predict(_combine(X, W, n_samples))

def score(self, X, W, Target, sample_weight=None):
if hasattr(self._model, 'score'):
if self._discrete_target:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
Target = inverse_onehot(Target)
def score(self, X, W, Target, sample_weight=None, scoring=None, score_by_dim=False):
"""
Score the first stage model on provided data.

:param X: Features for each sample
:param W: Controls for each sample
:param Target: The true targets (the treatment or outcome this first stage model predicts)
:param sample_weight: Optional sample weights
:param scoring: Optional non-default scoring function name from sklearn get_scorer; results in
a call to _rlearner._ModelFinal.wrap_scoring
:param score_by_dim: If the target has multiple dimensions, score each dimension separately.
:return: Float score (or a list of per-dimension scores), or None if the model cannot be scored
"""
XW_combined = _combine(X, W, Target.shape[0])
if self._discrete_target:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
Target = inverse_onehot(Target)
if hasattr(self._model, 'score') and scoring is None and not score_by_dim:
# Standard default model scoring
if sample_weight is not None:
return self._model.score(_combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight)
return self._model.score(XW_combined, Target, sample_weight=sample_weight)
else:
return self._model.score(_combine(X, W, Target.shape[0]), Target)
return self._model.score(XW_combined, Target)
elif hasattr(self._model, 'score'):
return _FirstStageWrapper._wrap_scoring(scoring, Y_true=Target, X=XW_combined, est=self._model,
sample_weight=sample_weight, score_by_dim=score_by_dim)
else:
return None

@staticmethod
def _wrap_scoring(scoring, Y_true, X, est, sample_weight=None, score_by_dim=False):
"""Predict from the estimator, and use the _ModelFinal.wrap_scoring function."""
Y_pred = est.predict(X)
return _ModelFinal.wrap_scoring(scoring, Y_true, Y_pred, sample_weight, score_by_dim=score_by_dim)

class _FirstStageSelector(SingleModelSelector):
def __init__(self, model: SingleModelSelector, discrete_target):
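To make the score_by_dim option concrete, here is a small self-contained sketch of what per-dimension scoring computes for a multi-column target, using sklearn's metric directly on made-up arrays (the wrap_scoring loop above does the equivalent column-by-column call):

import numpy as np
from sklearn.metrics import mean_absolute_error

# Made-up (n, d) target and prediction arrays, for illustration only
Y_true = np.array([[0.0, 1.0], [1.0, 3.0], [2.0, 5.0]])
Y_pred = np.array([[0.1, 1.5], [0.9, 2.5], [2.2, 5.5]])

# score_by_dim=False: a single score averaged over all columns
overall = mean_absolute_error(Y_true, Y_pred)

# score_by_dim=True: one score per column, as in the wrap_scoring loop
per_dim = [mean_absolute_error(Y_true[:, j], Y_pred[:, j]) for j in range(Y_pred.shape[1])]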
75 changes: 75 additions & 0 deletions econml/tests/test_dml.py
@@ -713,6 +713,81 @@ def true_fn(x):
np.testing.assert_array_less(lb - .01, truth)
np.testing.assert_array_less(truth, ub + .01)

def test_forest_dml_score_fns(self):
np.random.seed(1234)
n = 20000 # number of raw samples
d = 10

Z = np.random.binomial(1, .5, size=(n, d))
T = np.random.binomial(1, .5, size=(n,))

def true_fn(x):
return -1 + 2 * x[:, 0] + x[:, 1] * x[:, 2]

y = true_fn(Z) * T + Z[:, 0] + (1 * Z[:, 0] + 1) * np.random.normal(0, 1, size=(n,))
X = Z[:, :4]
W = Z[:, 4:]

est = CausalForestDML(model_y=GradientBoostingRegressor(n_estimators=30, min_samples_leaf=30),
model_t=GradientBoostingClassifier(n_estimators=30, min_samples_leaf=30),
discrete_treatment=True,
cv=2,
n_jobs=None,
n_estimators=1000,
max_samples=.4,
min_samples_leaf=10,
min_impurity_decrease=0.001,
verbose=0, min_var_fraction_leaf=.1,
fit_intercept=False,
random_state=12345)

est.fit(y, T, X=X, W=W)

s1 = est.score(Y=y, T=T, X=X, W=W, scoring='mean_squared_error')
s2 = est.score(Y=y, T=T, X=X, W=W)
assert s1 == s2
np.testing.assert_allclose(s1, 2.50, rtol=0, atol=.01)
s3 = est.score(Y=y, T=T, X=X, W=W, scoring='mean_absolute_error')
np.testing.assert_allclose(s3, 1.19, rtol=0, atol=.01)
s4 = est.score(Y=y, T=T, X=X, W=W, scoring='r2')
np.testing.assert_allclose(s4, 0.113, rtol=0, atol=.001)
s5 = est.score(Y=y, T=T, X=X, W=W, scoring='pearsonr')
np.testing.assert_allclose(s5[0], 0.337, rtol=0, atol=0.005)

sn1 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='mean_squared_error',
y_scoring='mean_squared_error')
np.testing.assert_allclose(sn1['Y_mean_squared_error'], [2.8, 2.8], rtol=0, atol=.1)
np.testing.assert_allclose(sn1['T_mean_squared_error'], [1.5, 1.5], rtol=0, atol=.1)

sn2 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='mean_absolute_error',
y_scoring='mean_absolute_error')
np.testing.assert_allclose(sn2['Y_mean_absolute_error'], [1.3, 1.3], rtol=0, atol=.1)
np.testing.assert_allclose(sn2['T_mean_absolute_error'], [1.0, 1.0], rtol=0, atol=.1)

sn3 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='r2',
y_scoring='r2')
np.testing.assert_allclose(sn3['Y_r2'], [0.27, 0.27], rtol=0, atol=.005)
np.testing.assert_allclose(sn3['T_r2'], [-5.1, -5.1], rtol=0, atol=0.25)

sn4 = est.score_nuisances(Y=y, T=T, X=X, W=W,
t_scoring='pearsonr',
y_scoring='pearsonr')
# Ignoring the p-values returned with the score
y_pearsonr = [s[0] for s in sn4['Y_pearsonr']]
t_pearsonr = [s[0] for s in sn4['T_pearsonr']]
np.testing.assert_allclose(y_pearsonr, [0.52, 0.52], rtol=0, atol=.01)
np.testing.assert_allclose(t_pearsonr, [.035, .035], rtol=0, atol=0.005)

# T is binary, and can be used to check binary eval functions
sn5 = est.score_nuisances(Y=y, T=T, X=X, W=W, t_scoring='roc_auc')
np.testing.assert_allclose(sn5['T_roc_auc'], [0.52, 0.52], rtol=0, atol=.01)

sn6 = est.score_nuisances(Y=y, T=T, X=X, W=W, t_scoring='log_loss')
np.testing.assert_allclose(sn6['T_log_loss'], [17.4, 17.4], rtol=0, atol=0.1)

def test_aaforest_pandas(self):
"""Test that we can use CausalForest with pandas inputs."""
df = pd.DataFrame({'a': np.random.normal(size=500),