Skip to content

Commit 7719417

Browse files
authored
Merge pull request #363 from DoubleML/s-add-propensity-score-adjustments
Refactor propensity score adjustments
2 parents 6c5abb7 + c096604 commit 7719417

File tree

81 files changed

+2170
-670
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

81 files changed

+2170
-670
lines changed

doubleml/data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .ssm_data import DoubleMLSSMData
1212

1313

14+
# TODO: Remove DoubleMLClusterData with version 0.12.0
1415
class DoubleMLClusterData(DoubleMLData):
1516
"""
1617
Backwards compatibility wrapper for DoubleMLData with cluster_cols.

doubleml/data/did_data.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io
2+
import warnings
23

34
import pandas as pd
45
from sklearn.utils import assert_all_finite
@@ -7,6 +8,7 @@
78
from doubleml.data.base_data import DoubleMLData
89

910

11+
# TODO: Remove DoubleMLDIDData with version 0.12.0
1012
class DoubleMLDIDData(DoubleMLData):
1113
"""Double machine learning data-backend for Difference-in-Differences models.
1214
@@ -81,7 +83,13 @@ def __init__(
8183
use_other_treat_as_covariate=True,
8284
force_all_x_finite=True,
8385
force_all_d_finite=True,
84-
): # Initialize _t_col to None first to avoid AttributeError during parent init
86+
):
87+
warnings.warn(
88+
"DoubleMLDIDData is deprecated and will be removed with version 0.12.0." "Use DoubleMLPanelData instead.",
89+
FutureWarning,
90+
stacklevel=2,
91+
)
92+
# Initialize _t_col to None first to avoid AttributeError during parent init
8593
self._t_col = None
8694

8795
# Store whether x_cols was originally None to reset it later

doubleml/did/did.py

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
from doubleml.data.did_data import DoubleMLDIDData
88
from doubleml.double_ml import DoubleML
99
from doubleml.double_ml_score_mixins import LinearScoreMixin
10-
from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming
10+
from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score
1111
from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls
12-
from doubleml.utils._propensity_score import _trimm
1312

1413

14+
# TODO: Remove DoubleMLDID with version 0.12.0
1515
class DoubleMLDID(LinearScoreMixin, DoubleML):
1616
"""Double machine learning for difference-in-differences models with panel data (two time periods).
1717
@@ -50,12 +50,8 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
5050
Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
5151
Default is ``True``.
5252
53-
trimming_rule : str
54-
A str (``'truncate'`` is the only choice) specifying the trimming approach.
55-
Default is ``'truncate'``.
56-
57-
trimming_threshold : float
58-
The threshold used for trimming.
53+
clipping_threshold : float
54+
The threshold used for clipping.
5955
Default is ``1e-2``.
6056
6157
draw_sample_splitting : bool
@@ -89,10 +85,14 @@ def __init__(
8985
n_rep=1,
9086
score="observational",
9187
in_sample_normalization=True,
92-
trimming_rule="truncate",
93-
trimming_threshold=1e-2,
88+
clipping_threshold=1e-2,
9489
draw_sample_splitting=True,
9590
):
91+
warnings.warn(
92+
"DoubleMLDID is deprecated and will be removed with version 0.12.0. " "Please use DoubleMLDIDBinary instead.",
93+
DeprecationWarning,
94+
stacklevel=2,
95+
)
9696
super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting)
9797

9898
self._check_data(self._dml_data)
@@ -142,9 +142,7 @@ def __init__(
142142
self._predict_method["ml_m"] = "predict_proba"
143143
self._initialize_ml_nuisance_params()
144144

145-
self._trimming_rule = trimming_rule
146-
self._trimming_threshold = trimming_threshold
147-
_check_trimming(self._trimming_rule, self._trimming_threshold)
145+
self._clipping_threshold = clipping_threshold
148146
self._sensitivity_implemented = True
149147
self._external_predictions_implemented = True
150148

@@ -156,18 +154,11 @@ def in_sample_normalization(self):
156154
return self._in_sample_normalization
157155

158156
@property
159-
def trimming_rule(self):
157+
def clipping_threshold(self):
160158
"""
161-
Specifies the used trimming rule.
159+
Specifies the used clipping threshold.
162160
"""
163-
return self._trimming_rule
164-
165-
@property
166-
def trimming_threshold(self):
167-
"""
168-
Specifies the used trimming threshold.
169-
"""
170-
return self._trimming_threshold
161+
return self._clipping_threshold
171162

172163
def _initialize_ml_nuisance_params(self):
173164
if self.score == "observational":
@@ -269,9 +260,10 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
269260
method=self._predict_method["ml_m"],
270261
return_models=return_models,
271262
)
263+
272264
_check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls)
273265
_check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12)
274-
m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold)
266+
m_hat["preds"] = np.clip(m_hat["preds"], self.clipping_threshold, 1 - self.clipping_threshold)
275267

276268
# nuisance estimates of the uncond. treatment prob.
277269
p_hat = np.full_like(d, d.mean(), dtype="float64")

doubleml/did/did_binary.py

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import warnings
2+
from typing import Optional
23

34
import numpy as np
45
from sklearn.utils import check_X_y
@@ -19,14 +20,13 @@
1920
from doubleml.utils._checks import (
2021
_check_bool,
2122
_check_finite_predictions,
22-
_check_is_propensity,
2323
_check_score,
24-
_check_trimming,
2524
)
2625
from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls
27-
from doubleml.utils._propensity_score import _trimm
26+
from doubleml.utils.propensity_score_processing import PSProcessorConfig, init_ps_processor
2827

2928

29+
# TODO [v0.12.0]: Remove support for 'trimming_rule' and 'trimming_threshold' (deprecated).
3030
class DoubleMLDIDBinary(LinearScoreMixin, DoubleML):
3131
"""Double machine learning for difference-in-differences models with panel data (binary setting in terms of group and time
3232
combinations).
@@ -83,13 +83,16 @@ class DoubleMLDIDBinary(LinearScoreMixin, DoubleML):
8383
Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
8484
Default is ``True``.
8585
86-
trimming_rule : str
87-
A str (``'truncate'`` is the only choice) specifying the trimming approach.
88-
Default is ``'truncate'``.
86+
trimming_rule : str, optional, deprecated
87+
(DEPRECATED) A str (``'truncate'`` is the only choice) specifying the trimming approach.
88+
Use `ps_processor_config` instead. Will be removed in a future version.
8989
90-
trimming_threshold : float
91-
The threshold used for trimming.
92-
Default is ``1e-2``.
90+
trimming_threshold : float, optional, deprecated
91+
(DEPRECATED) The threshold used for trimming.
92+
Use `ps_processor_config` instead. Will be removed in a future version.
93+
94+
ps_processor_config : PSProcessorConfig, optional
95+
Configuration for propensity score processing (clipping, calibration, etc.).
9396
9497
draw_sample_splitting : bool
9598
Indicates whether the sample splitting should be drawn during initialization of the object.
@@ -115,8 +118,9 @@ def __init__(
115118
n_rep=1,
116119
score="observational",
117120
in_sample_normalization=True,
118-
trimming_rule="truncate",
119-
trimming_threshold=1e-2,
121+
trimming_rule="truncate", # TODO [v0.12.0]: Remove support for 'trimming_rule' and 'trimming_threshold' (deprecated).
122+
trimming_threshold=1e-2, # TODO [v0.12.0]: Remove support for 'trimming_rule' and 'trimming_threshold' (deprecated).
123+
ps_processor_config: Optional[PSProcessorConfig] = None,
120124
draw_sample_splitting=True,
121125
print_periods=False,
122126
):
@@ -232,9 +236,12 @@ def __init__(
232236
self._predict_method["ml_m"] = "predict_proba"
233237
self._initialize_ml_nuisance_params()
234238

239+
# TODO [v0.12.0]: Remove support for 'trimming_rule' and 'trimming_threshold' (deprecated).
240+
self._ps_processor_config, self._ps_processor = init_ps_processor(
241+
ps_processor_config, trimming_rule, trimming_threshold
242+
)
235243
self._trimming_rule = trimming_rule
236-
self._trimming_threshold = trimming_threshold
237-
_check_trimming(self._trimming_rule, self._trimming_threshold)
244+
self._trimming_threshold = self._ps_processor.clipping_threshold
238245

239246
self._sensitivity_implemented = True
240247
self._external_predictions_implemented = True
@@ -321,19 +328,44 @@ def in_sample_normalization(self):
321328
"""
322329
return self._in_sample_normalization
323330

331+
@property
332+
def ps_processor_config(self):
333+
"""
334+
Configuration for propensity score processing (clipping, calibration, etc.).
335+
"""
336+
return self._ps_processor_config
337+
338+
@property
339+
def ps_processor(self):
340+
"""
341+
Propensity score processor.
342+
"""
343+
return self._ps_processor
344+
345+
# TODO [v0.12.0]: Remove support for 'trimming_rule' and 'trimming_threshold' (deprecated).
324346
@property
325347
def trimming_rule(self):
326348
"""
327349
Specifies the used trimming rule.
328350
"""
351+
warnings.warn(
352+
"'trimming_rule' is deprecated and will be removed in a future version. ", DeprecationWarning, stacklevel=2
353+
)
329354
return self._trimming_rule
330355

356+
# TODO [v0.12.0]: Remove support for 'trimming_rule' and 'trimming_threshold' (deprecated).
331357
@property
332358
def trimming_threshold(self):
333359
"""
334360
Specifies the used trimming threshold.
335361
"""
336-
return self._trimming_threshold
362+
warnings.warn(
363+
"'trimming_threshold' is deprecated and will be removed in a future version. "
364+
"Use 'ps_processor_config.clipping_threshold' or 'ps_processor.clipping_threshold' instead.",
365+
DeprecationWarning,
366+
stacklevel=2,
367+
)
368+
return self._ps_processor.clipping_threshold
337369

338370
@property
339371
def n_obs_subset(self):
@@ -499,9 +531,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
499531
method=self._predict_method["ml_m"],
500532
return_models=return_models,
501533
)
534+
502535
_check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls)
503-
_check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12)
504-
m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold)
536+
m_hat["preds"] = self._ps_processor.adjust_ps(m_hat["preds"], d, cv=smpls, learner_name="ml_m")
505537

506538
# nuisance estimates of the uncond. treatment prob.
507539
p_hat = np.full_like(d, d.mean(), dtype="float64")

doubleml/did/did_cs.py

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
from doubleml.data.did_data import DoubleMLDIDData
88
from doubleml.double_ml import DoubleML
99
from doubleml.double_ml_score_mixins import LinearScoreMixin
10-
from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming
10+
from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score
1111
from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls_2d
12-
from doubleml.utils._propensity_score import _trimm
1312

1413

14+
# TODO: Remove DoubleMLDIDCS with version 0.12.0
1515
class DoubleMLDIDCS(LinearScoreMixin, DoubleML):
1616
"""Double machine learning for difference-in-difference with repeated cross-sections.
1717
@@ -50,12 +50,8 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML):
5050
Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
5151
Default is ``True``.
5252
53-
trimming_rule : str
54-
A str (``'truncate'`` is the only choice) specifying the trimming approach.
55-
Default is ``'truncate'``.
56-
57-
trimming_threshold : float
58-
The threshold used for trimming.
53+
clipping_threshold : float
54+
The threshold used for clipping.
5955
Default is ``1e-2``.
6056
6157
draw_sample_splitting : bool
@@ -87,10 +83,14 @@ def __init__(
8783
n_rep=1,
8884
score="observational",
8985
in_sample_normalization=True,
90-
trimming_rule="truncate",
91-
trimming_threshold=1e-2,
86+
clipping_threshold=1e-2,
9287
draw_sample_splitting=True,
9388
):
89+
warnings.warn(
90+
"DoubleMLDIDCS is deprecated and will be removed with version 0.12.0. " "Please use DoubleMLDIDCSBinary instead.",
91+
DeprecationWarning,
92+
stacklevel=2,
93+
)
9494
super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting)
9595

9696
self._check_data(self._dml_data)
@@ -140,10 +140,7 @@ def __init__(
140140
self._predict_method["ml_m"] = "predict_proba"
141141
self._initialize_ml_nuisance_params()
142142

143-
self._trimming_rule = trimming_rule
144-
self._trimming_threshold = trimming_threshold
145-
_check_trimming(self._trimming_rule, self._trimming_threshold)
146-
143+
self._clipping_threshold = clipping_threshold
147144
self._sensitivity_implemented = True
148145
self._external_predictions_implemented = True
149146

@@ -155,18 +152,11 @@ def in_sample_normalization(self):
155152
return self._in_sample_normalization
156153

157154
@property
158-
def trimming_rule(self):
159-
"""
160-
Specifies the used trimming rule.
161-
"""
162-
return self._trimming_rule
163-
164-
@property
165-
def trimming_threshold(self):
155+
def clipping_threshold(self):
166156
"""
167-
Specifies the used trimming threshold.
157+
Specifies the used clipping threshold.
168158
"""
169-
return self._trimming_threshold
159+
return self._clipping_threshold
170160

171161
def _initialize_ml_nuisance_params(self):
172162
if self.score == "observational":
@@ -312,9 +302,10 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
312302
method=self._predict_method["ml_m"],
313303
return_models=return_models,
314304
)
315-
_check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls)
316-
_check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12)
317-
m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold)
305+
306+
_check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls)
307+
_check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12)
308+
m_hat["preds"] = np.clip(m_hat["preds"], self.clipping_threshold, 1 - self.clipping_threshold)
318309

319310
psi_a, psi_b = self._score_elements(
320311
y,

0 commit comments

Comments
 (0)