Merge pull request #110 from DoubleML/m-check-smpls

MalteKurz · web-flow · commit e268dca0c332 · 2021-05-25T18:20:45.000+02:00
improve exception handling for externally provided sample splitting
diff --git a/doubleml/_helper.py b/doubleml/_helper.py
@@ -34,27 +34,32 @@ def _check_is_partition(smpls, n_obs):
     return True
 
 
-def _check_all_smpls(all_smpls, n_obs):
+def _check_all_smpls(all_smpls, n_obs, check_intersect=False):
     all_smpls_checked = list()
     for smpl in all_smpls:
-        this_smpl_checked = list()
-        for tpl in smpl:
-            this_smpl_checked.append(_check_smpl_split_tpl(tpl, n_obs))
-        all_smpls_checked.append(this_smpl_checked)
+        all_smpls_checked.append(_check_smpl_split(smpl, n_obs, check_intersect))
     return all_smpls_checked
 
 
-def _check_smpl_split_tpl(smpl, n_obs):
-    train_index = np.sort(np.array(smpl[0]))
-    test_index = np.sort(np.array(smpl[1]))
+def _check_smpl_split(smpl, n_obs, check_intersect=False):
+    smpl_checked = list()
+    for tpl in smpl:
+        smpl_checked.append(_check_smpl_split_tpl(tpl, n_obs, check_intersect))
+    return smpl_checked
+
+
+def _check_smpl_split_tpl(tpl, n_obs, check_intersect=False):
+    train_index = np.sort(np.array(tpl[0]))
+    test_index = np.sort(np.array(tpl[1]))
 
     if not issubclass(train_index.dtype.type, np.integer):
         raise TypeError('Invalid sample split. Train indices must be of type integer.')
     if not issubclass(test_index.dtype.type, np.integer):
         raise TypeError('Invalid sample split. Test indices must be of type integer.')
 
-    if set(train_index) & set(test_index):
-        raise ValueError('Invalid sample split. Intersection of train and test indices is not empty.')
+    if check_intersect:
+        if set(train_index) & set(test_index):
+            raise ValueError('Invalid sample split. Intersection of train and test indices is not empty.')
 
     if len(np.unique(train_index)) != len(train_index):
         raise ValueError('Invalid sample split. Train indices contain non-unique entries.')
diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
@@ -12,7 +12,7 @@
 
 from .double_ml_data import DoubleMLData
 from .double_ml_resampling import DoubleMLResampling
-from ._helper import _check_is_partition, _check_all_smpls, _draw_weights
+from ._helper import _check_is_partition, _check_all_smpls, _check_smpl_split, _check_smpl_split_tpl, _draw_weights
 
 
 class DoubleML(ABC):
@@ -1010,6 +1010,7 @@ def set_sample_splitting(self, all_smpls):
             if not len(all_smpls) == 2:
                 raise ValueError('Invalid partition provided. '
                                  'Tuple for train_ind and test_ind must consist of exactly two elements.')
+            all_smpls = _check_smpl_split_tpl(all_smpls, self._dml_data.n_obs)
             if (_check_is_partition([all_smpls], self._dml_data.n_obs) &
                     _check_is_partition([(all_smpls[1], all_smpls[0])], self._dml_data.n_obs)):
                 self._n_rep = 1
@@ -1020,7 +1021,7 @@ def set_sample_splitting(self, all_smpls):
                 self._n_rep = 1
                 self._n_folds = 2
                 self._apply_cross_fitting = False
-                self._smpls = _check_all_smpls([[all_smpls]], self._dml_data.n_obs)
+                self._smpls = _check_all_smpls([[all_smpls]], self._dml_data.n_obs, check_intersect=True)
         else:
             if not isinstance(all_smpls, list):
                 raise TypeError('all_smpls must be of list or tuple type. '
@@ -1031,6 +1032,7 @@ def set_sample_splitting(self, all_smpls):
                     raise ValueError('Invalid partition provided. '
                                      'All tuples for train_ind and test_ind must consist of exactly two elements.')
                 self._n_rep = 1
+                all_smpls = _check_smpl_split(all_smpls, self._dml_data.n_obs)
                 if _check_is_partition(all_smpls, self._dml_data.n_obs):
                     if ((len(all_smpls) == 1) &
                             _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], self._dml_data.n_obs)):
@@ -1040,14 +1042,14 @@ def set_sample_splitting(self, all_smpls):
                     else:
                         self._n_folds = len(all_smpls)
                         self._apply_cross_fitting = True
-                        self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs)
+                        self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True)
                 else:
                     if not len(all_smpls) == 1:
                         raise ValueError('Invalid partition provided. '
                                          'Tuples for more than one fold provided that don\'t form a partition.')
                     self._n_folds = 2
                     self._apply_cross_fitting = False
-                    self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs)
+                    self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True)
             else:
                 all_list = all([isinstance(smpl, list) for smpl in all_smpls])
                 if not all_list:
@@ -1065,6 +1067,7 @@ def set_sample_splitting(self, all_smpls):
                 if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
                     raise ValueError('Invalid partition provided. '
                                      'Different number of folds for repeated sample splitting.')
+                all_smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs)
                 smpls_are_partitions = [_check_is_partition(smpl, self._dml_data.n_obs) for smpl in all_smpls]
 
                 if all(smpls_are_partitions):
@@ -1078,7 +1081,7 @@ def set_sample_splitting(self, all_smpls):
                         self._n_rep = len(all_smpls)
                         self._n_folds = n_folds_each_smpl[0]
                         self._apply_cross_fitting = True
-                        self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs)
+                        self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True)
                 else:
                     if not n_folds_each_smpl[0] == 1:
                         raise ValueError('Invalid partition provided. '
@@ -1087,7 +1090,7 @@ def set_sample_splitting(self, all_smpls):
                     self._n_rep = len(all_smpls)
                     self._n_folds = 2
                     self._apply_cross_fitting = False
-                    self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs)
+                    self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True)
 
         self._psi, self._psi_a, self._psi_b, \
             self._coef, self._se, self._all_coef, self._all_se, self._all_dml1_coef = self._initialize_arrays()
diff --git a/doubleml/tests/test_doubleml_set_sample_splitting.py b/doubleml/tests/test_doubleml_set_sample_splitting.py
@@ -215,3 +215,63 @@ def test_doubleml_draw_vs_set():
                                 n_folds=2, n_rep=4, apply_cross_fitting=False)
     dml_plr_set.set_sample_splitting(dml_plr_drawn.smpls)
     _assert_resampling_pars(dml_plr_drawn, dml_plr_set)
+
+
+@pytest.mark.ci
+def test_doubleml_set_sample_splitting_invalid_sets():
+    # sample splitting with two folds and repeated cross-fitting with n_rep = 2
+    smpls = [[([0, 1.2, 2, 3, 4], [5, 6, 7, 8, 9]),
+              ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+             [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]),
+              ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+    msg = 'Invalid sample split. Train indices must be of type integer.'
+    with pytest.raises(TypeError, match=msg):
+        dml_plr.set_sample_splitting(smpls)
+
+    smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+              ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+             [([0, 2, 4, 6, 8], [1, 3.5, 5, 7, 9]),
+              ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+    msg = 'Invalid sample split. Test indices must be of type integer.'
+    with pytest.raises(TypeError, match=msg):
+        dml_plr.set_sample_splitting(smpls)
+
+    smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+              ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+             [([0, 2, 3, 4, 6, 8], [1, 3, 5, 7, 9]),
+              ([1, 5, 7, 9], [0, 2, 4, 6, 8])]]
+    msg = 'Invalid sample split. Intersection of train and test indices is not empty.'
+    with pytest.raises(ValueError, match=msg):
+        dml_plr.set_sample_splitting(smpls)
+
+    smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
+              ([5, 6, 7, 7, 8, 9], [0, 1, 2, 3, 4])],
+             [([0, 2, 4, 4, 6, 8], [1, 3, 5, 7, 9]),
+              ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+    msg = 'Invalid sample split. Train indices contain non-unique entries.'
+    with pytest.raises(ValueError, match=msg):
+        dml_plr.set_sample_splitting(smpls)
+
+    smpls = [[([0, 1, 2, 3, 4], [5, 5, 6, 7, 8, 9]),
+              ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+             [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]),
+              ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+    msg = 'Invalid sample split. Test indices contain non-unique entries.'
+    with pytest.raises(ValueError, match=msg):
+        dml_plr.set_sample_splitting(smpls)
+
+    smpls = [[([0, 1, 2, 3, 20], [5, 6, 7, 8, 9]),
+              ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+             [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]),
+              ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+    msg = r'Invalid sample split. Train indices must be in \[0, n_obs\).'
+    with pytest.raises(ValueError, match=msg):
+        dml_plr.set_sample_splitting(smpls)
+
+    smpls = [[([0, 1, 2, 3, 4], [5, -6, 7, 8, 9]),
+              ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+             [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]),
+              ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]]
+    msg = r'Invalid sample split. Test indices must be in \[0, n_obs\).'
+    with pytest.raises(ValueError, match=msg):
+        dml_plr.set_sample_splitting(smpls)