Skip to content

Commit cd4e2cc

Browse files
committed
Improve performance of DCR metrics (#762)
1 parent: 459c3bd · commit: cd4e2cc

File tree

8 files changed

+90
-46
lines changed

8 files changed

+90
-46
lines changed

sdmetrics/single_table/privacy/dcr_baseline_protection.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ class DCRBaselineProtection(SingleTableMetric):
2323
goal = Goal.MAXIMIZE
2424
min_value = 0.0
2525
max_value = 1.0
26+
CHUNK_SIZE = 1000
2627
_seed = None
2728

2829
@classmethod
@@ -103,15 +104,23 @@ def compute_breakdown(
103104
for _ in range(num_iterations):
104105
synthetic_sample = synthetic_data
105106
random_sample = random_data
107+
real_sample = real_data
106108
if num_rows_subsample is not None:
107109
synthetic_sample = synthetic_data.sample(n=num_rows_subsample)
108110
random_sample = random_data.sample(n=num_rows_subsample)
111+
real_sample = real_data.sample(n=num_rows_subsample)
109112

110113
dcr_real = calculate_dcr(
111-
reference_dataset=real_data, dataset=synthetic_sample, metadata=metadata
114+
reference_dataset=real_sample,
115+
dataset=synthetic_sample,
116+
metadata=metadata,
117+
chunk_size=cls.CHUNK_SIZE,
112118
)
113119
dcr_random = calculate_dcr(
114-
reference_dataset=real_data, dataset=random_sample, metadata=metadata
120+
reference_dataset=real_sample,
121+
dataset=random_sample,
122+
metadata=metadata,
123+
chunk_size=cls.CHUNK_SIZE,
115124
)
116125
synthetic_data_median = dcr_real.median()
117126
random_data_median = dcr_random.median()

sdmetrics/single_table/privacy/dcr_overfitting_protection.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ class DCROverfittingProtection(SingleTableMetric):
2222
goal = Goal.MAXIMIZE
2323
min_value = 0.0
2424
max_value = 1.0
25+
CHUNK_SIZE = 1000
2526

2627
@classmethod
2728
def _validate_inputs(
@@ -114,14 +115,24 @@ def compute_breakdown(
114115
sum_percent_close_to_random = 0
115116
for _ in range(num_iterations):
116117
synthetic_sample = synthetic_data
118+
real_training_sample = real_training_data
119+
real_validation_sample = real_validation_data
117120
if num_rows_subsample is not None:
118121
synthetic_sample = synthetic_data.sample(n=num_rows_subsample)
122+
real_training_sample = real_training_data.sample(n=num_rows_subsample)
123+
real_validation_sample = real_validation_data.sample(n=num_rows_subsample)
119124

120125
dcr_real = calculate_dcr(
121-
reference_dataset=real_training_data, dataset=synthetic_sample, metadata=metadata
126+
reference_dataset=real_training_sample,
127+
dataset=synthetic_sample,
128+
metadata=metadata,
129+
chunk_size=cls.CHUNK_SIZE,
122130
)
123131
dcr_holdout = calculate_dcr(
124-
reference_dataset=real_validation_data, dataset=synthetic_sample, metadata=metadata
132+
reference_dataset=real_validation_sample,
133+
dataset=synthetic_sample,
134+
metadata=metadata,
135+
chunk_size=cls.CHUNK_SIZE,
125136
)
126137

127138
num_rows_closer_to_real = np.where(dcr_real < dcr_holdout, 1.0, 0.0).sum()

sdmetrics/single_table/privacy/dcr_utils.py

Lines changed: 33 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@
99
CHUNK_SIZE = 1000
1010

1111

12-
def _process_dcr_chunk(chunk, reference_copy, cols_to_keep, metadata, ranges):
13-
full_dataset = chunk.merge(reference_copy, how='cross', suffixes=('_data', '_ref'))
12+
def _process_dcr_chunk(dataset_chunk, reference_chunk, cols_to_keep, metadata, ranges):
13+
full_dataset = dataset_chunk.merge(reference_chunk, how='cross', suffixes=('_data', '_ref'))
1414

1515
for col_name in cols_to_keep:
1616
sdtype = metadata['columns'][col_name]['sdtype']
@@ -51,7 +51,7 @@ def _process_dcr_chunk(chunk, reference_copy, cols_to_keep, metadata, ranges):
5151
return chunk_result['diff']
5252

5353

54-
def calculate_dcr(dataset, reference_dataset, metadata):
54+
def calculate_dcr(dataset, reference_dataset, metadata, chunk_size=1000):
5555
"""Calculate the Distance to Closest Record for all rows in the synthetic data.
5656
5757
Arguments:
@@ -66,10 +66,10 @@ def calculate_dcr(dataset, reference_dataset, metadata):
6666
pandas.Series:
6767
Returns a Series that shows the DCR value for every row of dataset
6868
"""
69-
dataset_copy = _process_data_with_metadata(dataset.copy(), metadata, True)
70-
reference_copy = _process_data_with_metadata(reference_dataset.copy(), metadata, True)
69+
dataset = _process_data_with_metadata(dataset.copy(), metadata, True)
70+
reference = _process_data_with_metadata(reference_dataset.copy(), metadata, True)
7171

72-
common_cols = set(dataset_copy.columns) & set(reference_copy.columns)
72+
common_cols = set(dataset.columns) & set(reference.columns)
7373
cols_to_keep = []
7474
ranges = {}
7575

@@ -83,7 +83,7 @@ def calculate_dcr(dataset, reference_dataset, metadata):
8383
cols_to_keep.append(col_name)
8484

8585
if sdtype in ['numerical', 'datetime']:
86-
col_range = reference_copy[col_name].max() - reference_copy[col_name].min()
86+
col_range = reference[col_name].max() - reference[col_name].min()
8787
if isinstance(col_range, pd.Timedelta):
8888
col_range = col_range.total_seconds()
8989

@@ -92,23 +92,35 @@ def calculate_dcr(dataset, reference_dataset, metadata):
9292
if not cols_to_keep:
9393
raise ValueError('There are no overlapping statistical columns to measure.')
9494

95-
dataset_copy = dataset_copy[cols_to_keep]
96-
dataset_copy['index'] = range(len(dataset_copy))
95+
dataset = dataset[cols_to_keep]
96+
dataset['index'] = range(len(dataset))
9797

98-
reference_copy = reference_copy[cols_to_keep]
99-
reference_copy['index'] = range(len(reference_copy))
98+
reference = reference[cols_to_keep]
99+
reference['index'] = range(len(reference))
100100
results = []
101101

102-
for chunk_start in range(0, len(dataset_copy), CHUNK_SIZE):
103-
chunk = dataset_copy.iloc[chunk_start : chunk_start + CHUNK_SIZE].copy()
104-
chunk_result = _process_dcr_chunk(
105-
chunk=chunk,
106-
reference_copy=reference_copy,
107-
cols_to_keep=cols_to_keep,
108-
metadata=metadata,
109-
ranges=ranges,
110-
)
111-
results.append(chunk_result)
102+
for dataset_chunk_start in range(0, len(dataset), chunk_size):
103+
dataset_chunk = dataset.iloc[dataset_chunk_start : dataset_chunk_start + chunk_size]
104+
minimum_chunk_distance = None
105+
for reference_chunk_start in range(0, len(reference), chunk_size):
106+
reference_chunk = reference.iloc[
107+
reference_chunk_start : reference_chunk_start + chunk_size
108+
]
109+
chunk_result = _process_dcr_chunk(
110+
dataset_chunk=dataset_chunk,
111+
reference_chunk=reference_chunk,
112+
cols_to_keep=cols_to_keep,
113+
metadata=metadata,
114+
ranges=ranges,
115+
)
116+
if minimum_chunk_distance is None:
117+
minimum_chunk_distance = chunk_result
118+
else:
119+
minimum_chunk_distance = pd.Series.min(
120+
pd.concat([minimum_chunk_distance, chunk_result], axis=1), axis=1
121+
)
122+
123+
results.append(minimum_chunk_distance)
112124

113125
result = pd.concat(results, ignore_index=True)
114126
result.name = None

tests/integration/single_table/privacy/test_dcr_baseline_protection.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -171,16 +171,12 @@ def test_end_to_end_sample_random_median(self):
171171
real_data = pd.DataFrame(data={'A': [2, 6, 3, 4, 1]})
172172
synthetic_data = pd.DataFrame(data={'A': [5, 5, 5, 5, 5]})
173173
metadata = {'columns': {'A': {'sdtype': 'numerical'}}}
174-
num_rows_sample = 1
175-
num_iterations = 5
176174

177175
# Run
178176
result = DCRBaselineProtection.compute_breakdown(
179177
real_data=real_data,
180178
synthetic_data=synthetic_data,
181179
metadata=metadata,
182-
num_rows_subsample=num_rows_sample,
183-
num_iterations=num_iterations,
184180
)
185181

186182
# Assert

tests/integration/single_table/privacy/test_dcr_overfitting_protection.py

Lines changed: 2 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -25,25 +25,23 @@ def test_end_to_end_with_demo(self):
2525
train_df, holdout_df = train_test_split(real_data, test_size=0.5)
2626

2727
# Run
28-
num_rows_subsample = 50
2928
compute_breakdown_result = DCROverfittingProtection.compute_breakdown(
3029
train_df, synthetic_data, holdout_df, metadata
3130
)
3231
compute_result = DCROverfittingProtection.compute(
3332
train_df, synthetic_data, holdout_df, metadata
3433
)
3534
compute_holdout_same = DCROverfittingProtection.compute_breakdown(
36-
train_df, synthetic_data, synthetic_data, metadata, num_rows_subsample
35+
train_df, synthetic_data, synthetic_data, metadata
3736
)
3837
compute_train_same = DCROverfittingProtection.compute_breakdown(
39-
synthetic_data, synthetic_data, holdout_df, metadata, num_rows_subsample
38+
synthetic_data, synthetic_data, holdout_df, metadata
4039
)
4140
compute_all_same = DCROverfittingProtection.compute_breakdown(
4241
synthetic_data,
4342
synthetic_data,
4443
synthetic_data,
4544
metadata,
46-
num_rows_subsample,
4745
)
4846

4947
synth_percentages_key = 'synthetic_data_percentages'
@@ -136,18 +134,9 @@ def test_compute_breakdown_iterations(self):
136134
compute_num_iteration_1000 = DCROverfittingProtection.compute_breakdown(
137135
train_data, synthetic_data, holdout_data, metadata, num_rows_subsample, num_iterations
138136
)
139-
compute_train_same = DCROverfittingProtection.compute_breakdown(
140-
synthetic_data,
141-
synthetic_data,
142-
holdout_data,
143-
metadata,
144-
num_rows_subsample,
145-
num_iterations,
146-
)
147137

148138
# Assert
149139
assert compute_num_iteration_1 != compute_num_iteration_1000
150-
assert compute_train_same['score'] == 0.0
151140

152141
def test_end_to_end_with_datetimes(self):
153142
"""Test end to end with datetime synthetic values."""

tests/integration/single_table/privacy/test_dcr_utils.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import pandas as pd
44

5+
from sdmetrics.demos import load_single_table_demo
56
from sdmetrics.single_table.privacy.dcr_utils import (
67
calculate_dcr,
78
)
@@ -54,3 +55,27 @@ def test_calculate_dcr_with_zero_col_range():
5455
# Assert
5556
expected_result = pd.Series([1.0, 1.0, 1.0, 0.5, 0.0])
5657
pd.testing.assert_series_equal(result, expected_result)
58+
59+
60+
def test_calculate_dcr_chunked():
61+
"""Test calculate_dcr with chunking calculations."""
62+
# Setup
63+
real_data, synthetic_data, metadata = load_single_table_demo()
64+
65+
# Run
66+
result = calculate_dcr(
67+
reference_dataset=real_data,
68+
dataset=synthetic_data,
69+
metadata=metadata,
70+
chunk_size=1000,
71+
)
72+
chunked_result = calculate_dcr(
73+
reference_dataset=real_data,
74+
dataset=synthetic_data,
75+
metadata=metadata,
76+
chunk_size=50,
77+
)
78+
79+
# Assert
80+
assert len(result) == len(real_data)
81+
pd.testing.assert_series_equal(result, chunked_result)

tests/unit/single_table/privacy/test_dcr_baseline_protection.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import random
22
import re
33
from datetime import datetime
4-
from unittest.mock import patch
4+
from unittest.mock import Mock, patch
55

66
import numpy as np
77
import pandas as pd
@@ -217,7 +217,9 @@ def test_compute_breakdown_with_dcr_random_same_real(self, mock_generate_random,
217217
# Setup
218218
real_data, synthetic_data, metadata = test_data
219219
num_rows_subsample = 10
220-
mock_generate_random.return_value = real_data.copy()
220+
real_data.sample = Mock()
221+
real_data.sample.return_value = real_data.iloc[:10]
222+
mock_generate_random.return_value = real_data.iloc[:10]
221223

222224
# Run
223225
result = DCRBaselineProtection.compute_breakdown(

tests/unit/single_table/privacy/test_dcr_utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -235,8 +235,8 @@ def test__process_dcr_chunk(real_data, synthetic_data, test_metadata, column_ran
235235

236236
# Run
237237
result = _process_dcr_chunk(
238-
chunk=chunk,
239-
reference_copy=real_data,
238+
dataset_chunk=chunk,
239+
reference_chunk=real_data,
240240
cols_to_keep=cols_to_keep,
241241
metadata=test_metadata,
242242
ranges=column_ranges,

0 commit comments

Comments (0)