DCRBaseline and DCROverfitting throw incorrect warnings about missing columns. (#756)

lajohn4747 · web-flow · commit 649047a9041e · 2025-03-28T17:12:30.000-05:00
diff --git a/sdmetrics/single_table/privacy/dcr_baseline_protection.py b/sdmetrics/single_table/privacy/dcr_baseline_protection.py
@@ -5,7 +5,6 @@
 import numpy as np
 import pandas as pd
 
-from sdmetrics._utils_metadata import _process_data_with_metadata
 from sdmetrics.goal import Goal
 from sdmetrics.single_table.base import SingleTableMetric
 from sdmetrics.single_table.privacy.dcr_utils import calculate_dcr
@@ -31,7 +30,6 @@ def _validate_inputs(
         cls,
         real_data,
         synthetic_data,
-        metadata,
         num_rows_subsample,
         num_iterations,
     ):
@@ -46,12 +44,13 @@ def _validate_inputs(
             num_rows_subsample = None
             num_iterations = 1
 
-        real_data_copy = real_data.copy()
-        synthetic_data_copy = synthetic_data.copy()
-        real_data_copy = _process_data_with_metadata(real_data_copy, metadata, True)
-        synthetic_data_copy = _process_data_with_metadata(synthetic_data_copy, metadata, True)
+        if not (isinstance(real_data, pd.DataFrame) and isinstance(synthetic_data, pd.DataFrame)):
+            raise TypeError(
+                f'Both real_data ({type(real_data)}) and synthetic_data ({type(synthetic_data)}) '
+                'must be of type pandas.DataFrame.'
+            )
 
-        return real_data_copy, synthetic_data_copy, num_rows_subsample, num_iterations
+        return num_rows_subsample, num_iterations
 
     @classmethod
     def compute_breakdown(
@@ -87,38 +86,32 @@ def compute_breakdown(
                 and the median DCR score between the random data and real data.
                 Averages of the medians are returned in the case of multiple iterations.
         """
-        sanitized_data = cls._validate_inputs(
+        num_rows_subsample, num_iterations = cls._validate_inputs(
             real_data,
             synthetic_data,
-            metadata,
             num_rows_subsample,
             num_iterations,
         )
 
-        sanitized_real_data = sanitized_data[0]
-        sanitized_synthetic_data = sanitized_data[1]
-        num_rows_subsample = sanitized_data[2]
-        num_iterations = sanitized_data[3]
-
-        size_of_random_data = len(sanitized_synthetic_data)
-        random_data = cls._generate_random_data(sanitized_real_data, size_of_random_data)
+        size_of_random_data = len(synthetic_data)
+        random_data = cls._generate_random_data(real_data, size_of_random_data)
 
         sum_synthetic_median = 0
         sum_random_median = 0
         sum_score = 0
 
         for _ in range(num_iterations):
-            synthetic_sample = sanitized_synthetic_data
+            synthetic_sample = synthetic_data
             random_sample = random_data
             if num_rows_subsample is not None:
-                synthetic_sample = sanitized_synthetic_data.sample(n=num_rows_subsample)
+                synthetic_sample = synthetic_data.sample(n=num_rows_subsample)
                 random_sample = random_data.sample(n=num_rows_subsample)
 
             dcr_real = calculate_dcr(
-                reference_dataset=sanitized_real_data, dataset=synthetic_sample, metadata=metadata
+                reference_dataset=real_data, dataset=synthetic_sample, metadata=metadata
             )
             dcr_random = calculate_dcr(
-                reference_dataset=sanitized_real_data, dataset=random_sample, metadata=metadata
+                reference_dataset=real_data, dataset=random_sample, metadata=metadata
             )
             synthetic_data_median = dcr_real.median()
             random_data_median = dcr_random.median()
diff --git a/sdmetrics/single_table/privacy/dcr_overfitting_protection.py b/sdmetrics/single_table/privacy/dcr_overfitting_protection.py
@@ -3,8 +3,8 @@
 import warnings
 
 import numpy as np
+import pandas as pd
 
-from sdmetrics._utils_metadata import _process_data_with_metadata
 from sdmetrics.goal import Goal
 from sdmetrics.single_table.base import SingleTableMetric
 from sdmetrics.single_table.privacy.dcr_utils import calculate_dcr
@@ -29,7 +29,6 @@ def _validate_inputs(
         real_training_data,
         synthetic_data,
         real_validation_data,
-        metadata,
         num_rows_subsample,
         num_iterations,
     ):
@@ -44,27 +43,25 @@ def _validate_inputs(
             num_rows_subsample = None
             num_iterations = 1
 
+        if not (
+            isinstance(real_training_data, pd.DataFrame)
+            and isinstance(synthetic_data, pd.DataFrame)
+            and isinstance(real_validation_data, pd.DataFrame)
+        ):
+            raise TypeError(
+                f'All of real_training_data ({type(real_training_data)}), synthetic_data '
+                f'({type(synthetic_data)}), and real_validation_data ({type(real_validation_data)}) '
+                'must be of type pandas.DataFrame.'
+            )
+
         if len(real_training_data) * 0.5 > len(real_validation_data):
             warnings.warn(
                 f'Your real_validation_data contains {len(real_validation_data)} rows while your '
                 f'real_training_data contains {len(real_training_data)} rows. For most accurate '
                 'results, we recommend that the validation data at least half the size of the training data.'
             )
 
-        real_data_copy = real_training_data.copy()
-        synthetic_data_copy = synthetic_data.copy()
-        real_validation_copy = real_validation_data.copy()
-        real_data_copy = _process_data_with_metadata(real_data_copy, metadata, True)
-        synthetic_data_copy = _process_data_with_metadata(synthetic_data_copy, metadata, True)
-        real_validation_copy = _process_data_with_metadata(real_validation_copy, metadata, True)
-
-        return (
-            real_data_copy,
-            synthetic_data_copy,
-            real_validation_copy,
-            num_rows_subsample,
-            num_iterations,
-        )
+        return num_rows_subsample, num_iterations
 
     @classmethod
     def compute_breakdown(
@@ -104,34 +101,27 @@ def compute_breakdown(
                 closer to the real dataset. Averages of the medians are returned in the case of
                 multiple iterations.
         """
-        sanitized_data = cls._validate_inputs(
+        num_rows_subsample, num_iterations = cls._validate_inputs(
             real_training_data,
             synthetic_data,
             real_validation_data,
-            metadata,
             num_rows_subsample,
             num_iterations,
         )
 
-        training_data = sanitized_data[0]
-        sanitized_synthetic_data = sanitized_data[1]
-        validation_data = sanitized_data[2]
-        num_rows_subsample = sanitized_data[3]
-        num_iterations = sanitized_data[4]
-
         sum_of_scores = 0
         sum_percent_close_to_real = 0
         sum_percent_close_to_random = 0
         for _ in range(num_iterations):
-            synthetic_sample = sanitized_synthetic_data
+            synthetic_sample = synthetic_data
             if num_rows_subsample is not None:
-                synthetic_sample = sanitized_synthetic_data.sample(n=num_rows_subsample)
+                synthetic_sample = synthetic_data.sample(n=num_rows_subsample)
 
             dcr_real = calculate_dcr(
-                reference_dataset=training_data, dataset=synthetic_sample, metadata=metadata
+                reference_dataset=real_training_data, dataset=synthetic_sample, metadata=metadata
             )
             dcr_holdout = calculate_dcr(
-                reference_dataset=validation_data, dataset=synthetic_sample, metadata=metadata
+                reference_dataset=real_validation_data, dataset=synthetic_sample, metadata=metadata
             )
 
             num_rows_closer_to_real = np.where(dcr_real < dcr_holdout, 1.0, 0.0).sum()
diff --git a/tests/integration/single_table/privacy/test_dcr_baseline_protection.py b/tests/integration/single_table/privacy/test_dcr_baseline_protection.py
@@ -12,6 +12,7 @@
 
 
 class TestDCRBaselineProtection:
+    @pytest.mark.filterwarnings('error')
     def test_end_to_end_with_demo(self):
         """Test end to end for DCRBaslineProtection metric against the demo dataset.
 
diff --git a/tests/integration/single_table/privacy/test_dcr_overfitting_protection.py b/tests/integration/single_table/privacy/test_dcr_overfitting_protection.py
@@ -11,6 +11,7 @@
 
 
 class TestDCROverfittingProtection:
+    @pytest.mark.filterwarnings('error')
     def test_end_to_end_with_demo(self):
         """Test end to end for DCROverfittingProtection metric against the demo dataset.
 
@@ -21,7 +22,7 @@ def test_end_to_end_with_demo(self):
         """
         # Setup
         real_data, synthetic_data, metadata = load_single_table_demo()
-        train_df, holdout_df = train_test_split(real_data, test_size=0.2)
+        train_df, holdout_df = train_test_split(real_data, test_size=0.5)
 
         # Run
         num_rows_subsample = 50
diff --git a/tests/unit/single_table/privacy/test_dcr_baseline_protection.py b/tests/unit/single_table/privacy/test_dcr_baseline_protection.py
@@ -53,6 +53,13 @@ def test__validate_inputs(self, test_data):
         with pytest.raises(ValueError, match=missing_metric):
             DCRBaselineProtection.compute_breakdown(no_dcr_data, no_dcr_data, no_dcr_metadata)
 
+        no_df_msg = re.escape(
+            f'Both real_data ({type(None)}) and synthetic_data ({type({})}) '
+            'must be of type pandas.DataFrame.'
+        )
+        with pytest.raises(TypeError, match=no_df_msg):
+            DCRBaselineProtection.compute_breakdown(None, {}, metadata)
+
     @patch(
         'sdmetrics.single_table.privacy.dcr_baseline_protection.DCRBaselineProtection._generate_random_data'
     )
diff --git a/tests/unit/single_table/privacy/test_dcr_overfitting_protection.py b/tests/unit/single_table/privacy/test_dcr_overfitting_protection.py
@@ -71,6 +71,14 @@ def test__validate_inputs(self, test_data):
                 train_data, synthetic_data, small_holdout_data, metadata
             )
 
+        no_df_msg = re.escape(
+            f'All of real_training_data ({type(None)}), synthetic_data '
+            f'({type({})}), and real_validation_data ({type({})}) '
+            'must be of type pandas.DataFrame.'
+        )
+        with pytest.raises(TypeError, match=no_df_msg):
+            DCROverfittingProtection.compute_breakdown(None, {}, {}, metadata)
+
     @patch('numpy.where')
     @patch('sdmetrics.single_table.privacy.dcr_overfitting_protection.calculate_dcr')
     def test_compute_breakdown(self, mock_calculate_dcr, mock_numpy_where, test_data):