Skip to content

Commit a97907d

Browse files
authored
Merge pull request #365 from DoubleML/jh-logistic-model
LPLR model
2 parents 70e5afe + 6999467, commit a97907d

38 files changed

+2171 -322 lines changed

doubleml/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
from .irm.pq import DoubleMLPQ
1414
from .irm.qte import DoubleMLQTE
1515
from .irm.ssm import DoubleMLSSM
16+
from .plm.lplr import DoubleMLLPLR
1617
from .plm.pliv import DoubleMLPLIV
1718
from .plm.plr import DoubleMLPLR
1819
from .utils.blp import DoubleMLBLP
@@ -42,6 +43,7 @@
4243
"DoubleMLBLP",
4344
"DoubleMLPolicyTree",
4445
"DoubleMLSSM",
46+
"DoubleMLLPLR",
4547
]
4648

4749
__version__ = importlib.metadata.version("doubleml")

doubleml/did/did.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
7070
>>> data = make_did_SZ2020(n_obs=500, return_type='DataFrame')
7171
>>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd')
7272
>>> dml_did_obj = dml.DoubleMLDID(obj_dml_data, ml_g, ml_m)
73-
>>> dml_did_obj.fit().summary
73+
>>> dml_did_obj.fit().summary # doctest: +SKIP
7474
coef std err t P>|t| 2.5 % 97.5 %
7575
d -2.840718 1.760386 -1.613691 0.106595 -6.291011 0.609575
7676

doubleml/did/did_cs.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML):
5959
Default is ``True``.
6060
6161
Examples
62-
-------- >>> import numpy as np
62+
--------
63+
>>> import numpy as np
6364
>>> import doubleml as dml
6465
>>> from doubleml.did.datasets import make_did_SZ2020
6566
>>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -69,7 +70,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML):
6970
>>> data = make_did_SZ2020(n_obs=500, cross_sectional_data=True, return_type='DataFrame')
7071
>>> obj_dml_data = dml.DoubleMLDIDData(data, 'y', 'd', t_col='t')
7172
>>> dml_did_obj = dml.DoubleMLDIDCS(obj_dml_data, ml_g, ml_m)
72-
>>> dml_did_obj.fit().summary
73+
>>> dml_did_obj.fit().summary # doctest: +SKIP
7374
coef std err t P>|t| 2.5 % 97.5 %
7475
d -4.9944 7.561785 -0.660479 0.508947 -19.815226 9.826426
7576
"""

doubleml/did/did_multi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ class DoubleMLDIDMulti:
140140
... gt_combinations="standard",
141141
... control_group="never_treated",
142142
... )
143-
>>> print(dml_did_obj.fit().summary)
143+
>>> print(dml_did_obj.fit().summary) # doctest: +SKIP
144144
coef std err ... 2.5 % 97.5 %
145145
ATT(2025-03,2025-01,2025-02) -0.797617 0.459617 ... -1.698450 0.103215
146146
ATT(2025-03,2025-02,2025-03) 0.270311 0.456453 ... -0.624320 1.164941

doubleml/double_ml.py

Lines changed: 53 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
class DoubleML(SampleSplittingMixin, ABC):
2323
"""Double Machine Learning."""
2424

25-
def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
25+
def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting, double_sample_splitting=False):
2626
# check and pick up obj_dml_data
2727
if not isinstance(obj_dml_data, DoubleMLBaseData):
2828
raise TypeError(
@@ -34,18 +34,10 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
3434
if obj_dml_data.n_cluster_vars > 2:
3535
raise NotImplementedError("Multi-way (n_ways > 2) clustering not yet implemented.")
3636
self._is_cluster_data = True
37-
self._is_panel_data = False
38-
if isinstance(obj_dml_data, DoubleMLPanelData):
39-
self._is_panel_data = True
40-
self._is_did_data = False
41-
if isinstance(obj_dml_data, DoubleMLDIDData):
42-
self._is_did_data = True
43-
self._is_ssm_data = False
44-
if isinstance(obj_dml_data, DoubleMLSSMData):
45-
self._is_ssm_data = True
46-
self._is_rdd_data = False
47-
if isinstance(obj_dml_data, DoubleMLRDDData):
48-
self._is_rdd_data = True
37+
self._is_panel_data = isinstance(obj_dml_data, DoubleMLPanelData)
38+
self._is_did_data = isinstance(obj_dml_data, DoubleMLDIDData)
39+
self._is_ssm_data = isinstance(obj_dml_data, DoubleMLSSMData)
40+
self._is_rdd_data = isinstance(obj_dml_data, DoubleMLRDDData)
4941

5042
self._dml_data = obj_dml_data
5143
self._n_obs = self._dml_data.n_obs
@@ -108,6 +100,9 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
108100
self._smpls = None
109101
self._smpls_cluster = None
110102
self._n_obs_sample_splitting = self.n_obs
103+
self._double_sample_splitting = double_sample_splitting
104+
if self._double_sample_splitting:
105+
self._smpls_inner = None
111106
if draw_sample_splitting:
112107
self.draw_sample_splitting()
113108
self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs)
@@ -359,6 +354,21 @@ def smpls(self):
359354
raise ValueError(err_msg)
360355
return self._smpls
361356

357+
@property
358+
def smpls_inner(self):
359+
"""
360+
The partition used for cross-fitting.
361+
"""
362+
if not self._double_sample_splitting:
363+
raise ValueError("smpls_inner is only available for double sample splitting.")
364+
if self._smpls_inner is None:
365+
err_msg = (
366+
"Sample splitting not specified. Either draw samples via .draw_sample splitting() "
367+
+ "or set external samples via .set_sample_splitting()."
368+
)
369+
raise ValueError(err_msg)
370+
return self._smpls_inner
371+
362372
@property
363373
def smpls_cluster(self):
364374
"""
@@ -507,6 +517,18 @@ def summary(self):
507517
def __smpls(self):
508518
return self._smpls[self._i_rep]
509519

520+
@property
521+
def __smpls__inner(self):
522+
if not self._double_sample_splitting:
523+
raise ValueError("smpls_inner is only available for double sample splitting.")
524+
if self._smpls_inner is None:
525+
err_msg = (
526+
"Sample splitting not specified. Either draw samples via .draw_sample splitting() "
527+
+ "or set external samples via .set_sample_splitting()."
528+
)
529+
raise ValueError(err_msg)
530+
return self._smpls_inner[self._i_rep]
531+
510532
@property
511533
def __smpls_cluster(self):
512534
return self._smpls_cluster[self._i_rep]
@@ -1081,7 +1103,10 @@ def _initalize_fit(self, store_predictions, store_models):
10811103

10821104
def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models):
10831105
ext_prediction_dict = _set_external_predictions(
1084-
external_predictions, learners=self.params_names, treatment=self._dml_data.d_cols[self._i_treat], i_rep=self._i_rep
1106+
external_predictions,
1107+
learners=self.params_names,
1108+
treatment=self._dml_data.d_cols[self._i_treat],
1109+
i_rep=self._i_rep,
10851110
)
10861111

10871112
# ml estimation of nuisance models and computation of score elements
@@ -1230,7 +1255,7 @@ def evaluate_learners(self, learners=None, metric=_rmse):
12301255
>>> def mae(y_true, y_pred):
12311256
... subset = np.logical_not(np.isnan(y_true))
12321257
... return mean_absolute_error(y_true[subset], y_pred[subset])
1233-
>>> dml_irm_obj.evaluate_learners(metric=mae)
1258+
>>> dml_irm_obj.evaluate_learners(metric=mae) # doctest: +SKIP
12341259
{'ml_g0': array([[0.88173585]]), 'ml_g1': array([[0.83854057]]), 'ml_m': array([[0.35871235]])}
12351260
"""
12361261
# if no learners are provided try to evaluate all learners
@@ -1249,12 +1274,19 @@ def evaluate_learners(self, learners=None, metric=_rmse):
12491274
for learner in learners:
12501275
for rep in range(self.n_rep):
12511276
for coef_idx in range(self._dml_data.n_coefs):
1252-
res = metric(
1253-
y_pred=self.predictions[learner][:, rep, coef_idx].reshape(1, -1),
1254-
y_true=self.nuisance_targets[learner][:, rep, coef_idx].reshape(1, -1),
1255-
)
1256-
if not np.isfinite(res):
1257-
raise ValueError(f"Evaluation from learner {str(learner)} is not finite.")
1277+
targets = self.nuisance_targets[learner][:, rep, coef_idx].reshape(1, -1)
1278+
1279+
if np.all(np.isnan(targets)):
1280+
res = np.nan
1281+
else:
1282+
predictions = self.predictions[learner][:, rep, coef_idx].reshape(1, -1)
1283+
res = metric(
1284+
y_pred=predictions,
1285+
y_true=targets,
1286+
)
1287+
if not np.isfinite(res):
1288+
raise ValueError(f"Evaluation from learner {str(learner)} is not finite.")
1289+
12581290
dist[learner][rep, coef_idx] = res
12591291
return dist
12601292
else:

doubleml/double_ml_sampling_mixins.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from abc import abstractmethod
22

33
from doubleml.utils._checks import _check_sample_splitting
4-
from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
4+
from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLDoubleResampling, DoubleMLResampling
55

66

77
class SampleSplittingMixin:
@@ -17,6 +17,8 @@ class SampleSplittingMixin:
1717
`sample splitting <https://docs.doubleml.org/stable/guide/resampling.html>`_ in the DoubleML user guide.
1818
"""
1919

20+
_double_sample_splitting = False
21+
2022
def draw_sample_splitting(self):
2123
"""
2224
Draw sample splitting for DoubleML models.
@@ -29,6 +31,8 @@ def draw_sample_splitting(self):
2931
self : object
3032
"""
3133
if self._is_cluster_data:
34+
if self._double_sample_splitting:
35+
raise ValueError("Cluster data not supported for double sample splitting.")
3236
obj_dml_resampling = DoubleMLClusterResampling(
3337
n_folds=self._n_folds_per_cluster,
3438
n_rep=self.n_rep,
@@ -38,10 +42,20 @@ def draw_sample_splitting(self):
3842
)
3943
self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples()
4044
else:
41-
obj_dml_resampling = DoubleMLResampling(
42-
n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata
43-
)
44-
self._smpls = obj_dml_resampling.split_samples()
45+
if self._double_sample_splitting:
46+
obj_dml_resampling = DoubleMLDoubleResampling(
47+
n_folds=self.n_folds,
48+
n_folds_inner=self.n_folds_inner,
49+
n_rep=self.n_rep,
50+
n_obs=self._dml_data.n_obs,
51+
stratify=self._strata,
52+
)
53+
self._smpls, self._smpls_inner = obj_dml_resampling.split_samples()
54+
else:
55+
obj_dml_resampling = DoubleMLResampling(
56+
n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata
57+
)
58+
self._smpls = obj_dml_resampling.split_samples()
4559

4660
return self
4761

@@ -104,6 +118,9 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
104118
>>> dml_plr_obj.set_sample_splitting(smpls) # doctest: +ELLIPSIS
105119
<doubleml.plm.plr.DoubleMLPLR object at 0x...>
106120
"""
121+
if self._double_sample_splitting:
122+
raise ValueError("set_sample_splitting not supported for double sample splitting.")
123+
107124
self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
108125
all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self._n_obs_sample_splitting
109126
)

doubleml/double_ml_score_mixins.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ class NonLinearScoreMixin:
8686
_score_type = "nonlinear"
8787
_coef_start_val = np.nan
8888
_coef_bounds = None
89+
_error_on_convergence_failure = False
8990

9091
@property
9192
@abstractmethod
@@ -149,12 +150,16 @@ def score_deriv(theta):
149150
theta_hat = root_res.root
150151
if not root_res.converged:
151152
score_val = score(theta_hat)
152-
warnings.warn(
153+
msg = (
153154
"Could not find a root of the score function.\n "
154155
f"Flag: {root_res.flag}.\n"
155156
f"Score value found is {score_val} "
156157
f"for parameter theta equal to {theta_hat}."
157158
)
159+
if self._error_on_convergence_failure:
160+
raise ValueError(msg)
161+
else:
162+
warnings.warn(msg)
158163
else:
159164
signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds)
160165

@@ -186,12 +191,16 @@ def score_squared(theta):
186191
score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
187192
)
188193
theta_hat = theta_hat_array.item()
189-
warnings.warn(
194+
msg = (
190195
"Could not find a root of the score function.\n "
191196
f"Minimum score value found is {score_val} "
192197
f"for parameter theta equal to {theta_hat}.\n "
193198
"No theta found such that the score function evaluates to a negative value."
194199
)
200+
if self._error_on_convergence_failure:
201+
raise ValueError(msg)
202+
else:
203+
warnings.warn(msg)
195204
else:
196205

197206
def neg_score(theta):
@@ -202,11 +211,15 @@ def neg_score(theta):
202211
neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
203212
)
204213
theta_hat = theta_hat_array.item()
205-
warnings.warn(
214+
msg = (
206215
"Could not find a root of the score function. "
207216
f"Maximum score value found is {-1 * neg_score_val} "
208217
f"for parameter theta equal to {theta_hat}. "
209218
"No theta found such that the score function evaluates to a positive value."
210219
)
220+
if self._error_on_convergence_failure:
221+
raise ValueError(msg)
222+
else:
223+
warnings.warn(msg)
211224

212225
return theta_hat

doubleml/irm/cvar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class DoubleMLCVAR(LinearScoreMixin, DoubleML):
9797
>>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame')
9898
>>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
9999
>>> dml_cvar_obj = dml.DoubleMLCVAR(obj_dml_data, ml_g, ml_m, treatment=1, quantile=0.5)
100-
>>> dml_cvar_obj.fit().summary
100+
>>> dml_cvar_obj.fit().summary # doctest: +SKIP
101101
coef std err t P>|t| 2.5 % 97.5 %
102102
d 1.588364 0.096616 16.43989 9.909942e-61 1.398999 1.777728
103103

doubleml/irm/iivm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML):
9595
>>> data = make_iivm_data(theta=0.5, n_obs=1000, dim_x=20, alpha_x=1.0, return_type='DataFrame')
9696
>>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd', z_cols='z')
9797
>>> dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data, ml_g, ml_m, ml_r)
98-
>>> dml_iivm_obj.fit().summary
98+
>>> dml_iivm_obj.fit().summary # doctest: +SKIP
9999
coef std err t P>|t| 2.5 % 97.5 %
100100
d 0.362398 0.191578 1.891649 0.058538 -0.013088 0.737884
101101

doubleml/irm/irm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML):
9696
>>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame')
9797
>>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
9898
>>> dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m)
99-
>>> dml_irm_obj.fit().summary
99+
>>> dml_irm_obj.fit().summary # doctest: +SKIP
100100
coef std err t P>|t| 2.5 % 97.5 %
101101
d 0.371972 0.206802 1.798685 0.072069 -0.033353 0.777297
102102

0 commit comments

Comments (0)