Optimize DCR calculation using vector operations (#754)

lajohn4747 · web-flow · commit f88c7590d453 · 2025-03-27T09:11:39.000-05:00
diff --git a/sdmetrics/single_table/privacy/dcr_baseline_protection.py b/sdmetrics/single_table/privacy/dcr_baseline_protection.py
@@ -115,10 +115,10 @@ def compute_breakdown(
                 random_sample = random_data.sample(n=num_rows_subsample)
 
             dcr_real = calculate_dcr(
-                real_data=sanitized_real_data, synthetic_data=synthetic_sample, metadata=metadata
+                reference_dataset=sanitized_real_data, dataset=synthetic_sample, metadata=metadata
             )
             dcr_random = calculate_dcr(
-                real_data=sanitized_real_data, synthetic_data=random_sample, metadata=metadata
+                reference_dataset=sanitized_real_data, dataset=random_sample, metadata=metadata
             )
             synthetic_data_median = dcr_real.median()
             random_data_median = dcr_random.median()
diff --git a/sdmetrics/single_table/privacy/dcr_overfitting_protection.py b/sdmetrics/single_table/privacy/dcr_overfitting_protection.py
@@ -128,10 +128,10 @@ def compute_breakdown(
                 synthetic_sample = sanitized_synthetic_data.sample(n=num_rows_subsample)
 
             dcr_real = calculate_dcr(
-                real_data=training_data, synthetic_data=synthetic_sample, metadata=metadata
+                reference_dataset=training_data, dataset=synthetic_sample, metadata=metadata
             )
             dcr_holdout = calculate_dcr(
-                real_data=validation_data, synthetic_data=synthetic_sample, metadata=metadata
+                reference_dataset=validation_data, dataset=synthetic_sample, metadata=metadata
             )
 
             num_rows_closer_to_real = np.where(dcr_real < dcr_holdout, 1.0, 0.0).sum()
diff --git a/sdmetrics/single_table/privacy/dcr_utils.py b/sdmetrics/single_table/privacy/dcr_utils.py
@@ -1,156 +1,116 @@
 """Distance to closest record measurement functions."""
 
+import numpy as np
 import pandas as pd
 
 from sdmetrics._utils_metadata import _process_data_with_metadata
 from sdmetrics.utils import get_columns_from_metadata
 
+CHUNK_SIZE = 1000
 
-def _calculate_dcr_value(synthetic_value, real_value, sdtype, col_range=None):
-    """Calculate the Distance to Closest Record between two different values.
 
-    Arguments:
-        synthetic_value (int, float, datetime, boolean, string, or None):
-            The synthetic value that we are calculating DCR value for
-        real_value (int, float, datetime, boolean, string, or None):
-            The data value that we are referencing for measuring DCR.
-        sdtype (string):
-            The sdtype of the column values.
-        col_range (float):
-            The range of values for a column used for numerical values to calculate DCR.
-            Defaults to None.
+def _process_dcr_chunk(chunk, reference_copy, cols_to_keep, metadata, ranges):
+    full_dataset = chunk.merge(reference_copy, how='cross', suffixes=('_data', '_ref'))
 
-    Returns:
-       float:
-            Returns dcr value between two given values.
-    """
-    if pd.isna(synthetic_value) and pd.isna(real_value):
-        return 0.0
-    elif pd.isna(synthetic_value) or pd.isna(real_value):
-        return 1.0
-
-    if sdtype == 'numerical' or sdtype == 'datetime':
-        if col_range is None:
-            raise ValueError(
-                'No col_range was provided. The col_range is required '
-                'for numerical and datetime sdtype DCR calculation.'
+    for col_name in cols_to_keep:
+        sdtype = metadata['columns'][col_name]['sdtype']
+        ref_column = full_dataset[col_name + '_ref']
+        data_column = full_dataset[col_name + '_data']
+        diff_col_name = col_name + '_diff'
+        if sdtype in ['numerical', 'datetime']:
+            diff = (ref_column - data_column).abs()
+            if pd.api.types.is_timedelta64_dtype(diff):
+                diff = diff.dt.total_seconds()
+
+            full_dataset[col_name + '_diff'] = np.where(
+                ranges[col_name] == 0,
+                (diff > 0).astype(int),
+                np.minimum(diff / ranges[col_name], 1.0),
             )
 
-        difference = abs(synthetic_value - real_value)
-        if isinstance(difference, pd.Timedelta):
-            difference = difference.total_seconds()
+            xor_condition = (ref_column.isna() & ~data_column.isna()) | (
+                ~ref_column.isna() & data_column.isna()
+            )
 
-        distance = 0.0 if synthetic_value == real_value else 1.0
-        if col_range != 0:
-            distance = difference / col_range
+            full_dataset.loc[xor_condition, diff_col_name] = 1
 
-        return min(distance, 1.0)
+            both_nan_condition = ref_column.isna() & data_column.isna()
 
-    if synthetic_value == real_value:
-        return 0.0
-    else:
-        return 1.0
+            full_dataset.loc[both_nan_condition, diff_col_name] = 0
 
+        elif sdtype in ['categorical', 'boolean']:
+            equals_cat = (ref_column == data_column) | (ref_column.isna() & data_column.isna())
+            full_dataset[diff_col_name] = (~equals_cat).astype(int)
 
-def _calculate_dcr_between_rows(synthetic_row, comparison_row, column_ranges, metadata):
-    """Calculate the Distance to Closest Record between two rows.
+        full_dataset.drop(columns=[col_name + '_ref', col_name + '_data'], inplace=True)
 
-    Arguments:
-        synthetic_row (pandas.Series):
-            The synthetic row that we are calculating DCR value for.
-        comparison_row (pandas.Series):
-            The data value that we are referencing for measuring DCR.
-        column_ranges (dict):
-            A dictionary that defines the range for each numerical column.
-        metadata (dict):
-            The metadata dict.
-
-    Returns:
-        float:
-            Returns DCR value (the average value of DCR values we computed across the row).
-    """
-    dcr_values = synthetic_row.index.to_series().apply(
-        lambda synthetic_column_name: _calculate_dcr_value(
-            synthetic_row[synthetic_column_name],
-            comparison_row[synthetic_column_name],
-            metadata['columns'][synthetic_column_name]['sdtype'],
-            column_ranges.get(synthetic_column_name),
-        )
+    full_dataset['diff'] = full_dataset.iloc[:, 2:].sum(axis=1) / len(cols_to_keep)
+    chunk_result = (
+        full_dataset[['index_data', 'diff']].groupby('index_data').min().reset_index(drop=True)
     )
-
-    return dcr_values.mean()
+    return chunk_result['diff']
 
 
-def _calculate_dcr_between_row_and_data(synthetic_row, real_data, column_ranges, metadata):
-    """Calculate the DCR between a single row in the synthetic data and another dataset.
+def calculate_dcr(dataset, reference_dataset, metadata):
+    """Calculate the Distance to Closest Record for all rows in the synthetic data.
 
     Arguments:
-        synthetic_row (pandas.Series):
-            The synthetic row that we are calculating DCR against an entire dataset.
-        real_data (pandas.Dataframe):
-            The dataset that acts as the reference for DCR calculations.
-        column_ranges (dict):
-            A dictionary that defines the range for each numerical column.
+        dataset (pandas.DataFrame):
+            The dataset for which we want to compute the DCR values.
+        reference_dataset (pandas.DataFrame):
+            The reference dataset that is used for the distance computations.
         metadata (dict):
             The metadata dict.
 
     Returns:
-        float:
-            Returns the minimum distance to closest record computed between the
-            synthetic row and the reference dataset.
+        pandas.Series:
+            Returns a Series that shows the DCR value for every row of ``dataset``.
     """
-    synthetic_distance_to_all_real = real_data.apply(
-        lambda real_row: _calculate_dcr_between_rows(
-            synthetic_row, real_row, column_ranges, metadata
-        ),
-        axis=1,
-    )
-    return synthetic_distance_to_all_real.min()
+    dataset_copy = _process_data_with_metadata(dataset.copy(), metadata, True)
+    reference_copy = _process_data_with_metadata(reference_dataset.copy(), metadata, True)
 
+    common_cols = set(dataset_copy.columns) & set(reference_copy.columns)
+    cols_to_keep = []
+    ranges = {}
 
-def calculate_dcr(real_data, synthetic_data, metadata):
-    """Calculate the Distance to Closest Record for all rows in the synthetic data.
+    for col_name, col_metadata in get_columns_from_metadata(metadata).items():
+        sdtype = col_metadata['sdtype']
 
-    Arguments:
-        real_data (pandas.Dataframe):
-            The dataset that acts as the reference for DCR calculations. Ranges are determined from
-            this dataset.
-        synthetic_data (pandas.Dataframe):
-            The synthetic data that we are calculating DCR values for. Every row will be measured
-            against the comparison data.
-        metadata (dict):
-            The metadata dict.
+        if (
+            sdtype in ['numerical', 'categorical', 'boolean', 'datetime']
+            and col_name in common_cols
+        ):
+            cols_to_keep.append(col_name)
 
-    Returns:
-        pandas.Series:
-            Returns a Series that shows the DCR value for every row of synthetic data.
-    """
-    column_ranges = {}
+            if sdtype in ['numerical', 'datetime']:
+                col_range = reference_copy[col_name].max() - reference_copy[col_name].min()
+                if isinstance(col_range, pd.Timedelta):
+                    col_range = col_range.total_seconds()
 
-    real_data_copy = real_data.copy()
-    synthetic_data_copy = synthetic_data.copy()
-    real_data_copy = _process_data_with_metadata(real_data_copy, metadata, True)
-    synthetic_data_copy = _process_data_with_metadata(synthetic_data_copy, metadata, True)
+                ranges[col_name] = col_range
 
-    overlapping_columns = set(real_data_copy.columns) & set(synthetic_data_copy.columns)
-    if not overlapping_columns:
+    if not cols_to_keep:
         raise ValueError('There are no overlapping statistical columns to measure.')
 
-    for col_name, column in get_columns_from_metadata(metadata).items():
-        sdtype = column['sdtype']
-        col_range = None
-        if sdtype == 'numerical' or sdtype == 'datetime':
-            col_range = real_data_copy[col_name].max() - real_data_copy[col_name].min()
-            if isinstance(col_range, pd.Timedelta):
-                col_range = col_range.total_seconds()
-
-        column_ranges[col_name] = col_range
-
-    dcr_dist_df = synthetic_data_copy.apply(
-        lambda synth_row: _calculate_dcr_between_row_and_data(
-            synth_row, real_data_copy, column_ranges, metadata
-        ),
-        axis=1,
-    )
+    dataset_copy = dataset_copy[cols_to_keep]
+    dataset_copy['index'] = range(len(dataset_copy))
+
+    reference_copy = reference_copy[cols_to_keep]
+    reference_copy['index'] = range(len(reference_copy))
+    results = []
+
+    for chunk_start in range(0, len(dataset_copy), CHUNK_SIZE):
+        chunk = dataset_copy.iloc[chunk_start : chunk_start + CHUNK_SIZE].copy()
+        chunk_result = _process_dcr_chunk(
+            chunk=chunk,
+            reference_copy=reference_copy,
+            cols_to_keep=cols_to_keep,
+            metadata=metadata,
+            ranges=ranges,
+        )
+        results.append(chunk_result)
+
+    result = pd.concat(results, ignore_index=True)
+    result.name = None
 
-    return dcr_dist_df
+    return result
diff --git a/tests/integration/single_table/privacy/test_dcr_utils.py b/tests/integration/single_table/privacy/test_dcr_utils.py
@@ -20,7 +20,7 @@ def test_calculate_dcr():
     metadata = {'columns': {'num_col': {'sdtype': 'numerical'}}}
 
     # Run
-    result = calculate_dcr(real_data=real_df, synthetic_data=synthetic_df_diff, metadata=metadata)
+    result = calculate_dcr(reference_dataset=real_df, dataset=synthetic_df_diff, metadata=metadata)
 
     # Assert
     expected_result = pd.Series([0.2, 0.0])
@@ -49,7 +49,7 @@ def test_calculate_dcr_with_zero_col_range():
     metadata = {'columns': {'num_col': {'sdtype': 'numerical'}, 'date_col': {'sdtype': 'datetime'}}}
 
     # Run
-    result = calculate_dcr(real_data=real_df, synthetic_data=synthetic_df_diff, metadata=metadata)
+    result = calculate_dcr(reference_dataset=real_df, dataset=synthetic_df_diff, metadata=metadata)
 
     # Assert
     expected_result = pd.Series([1.0, 1.0, 1.0, 0.5, 0.0])
diff --git a/tests/unit/single_table/privacy/test_dcr_utils.py b/tests/unit/single_table/privacy/test_dcr_utils.py

Original file line number	Diff line number	Diff line change
`@@ -115,10 +115,10 @@ def compute_breakdown(`
`115`	`115`	`random_sample = random_data.sample(n=num_rows_subsample)`
`116`	`116`
`117`	`117`	`dcr_real = calculate_dcr(`
`118`		`- real_data=sanitized_real_data, synthetic_data=synthetic_sample, metadata=metadata`
	`118`	`+ reference_dataset=sanitized_real_data, dataset=synthetic_sample, metadata=metadata`
`119`	`119`	`)`
`120`	`120`	`dcr_random = calculate_dcr(`
`121`		`- real_data=sanitized_real_data, synthetic_data=random_sample, metadata=metadata`
	`121`	`+ reference_dataset=sanitized_real_data, dataset=random_sample, metadata=metadata`
`122`	`122`	`)`
`123`	`123`	`synthetic_data_median = dcr_real.median()`
`124`	`124`	`random_data_median = dcr_random.median()`
Original file line number	Diff line number	Diff line change
`@@ -128,10 +128,10 @@ def compute_breakdown(`
`128`	`128`	`synthetic_sample = sanitized_synthetic_data.sample(n=num_rows_subsample)`
`129`	`129`
`130`	`130`	`dcr_real = calculate_dcr(`
`131`		`- real_data=training_data, synthetic_data=synthetic_sample, metadata=metadata`
	`131`	`+ reference_dataset=training_data, dataset=synthetic_sample, metadata=metadata`
`132`	`132`	`)`
`133`	`133`	`dcr_holdout = calculate_dcr(`
`134`		`- real_data=validation_data, synthetic_data=synthetic_sample, metadata=metadata`
	`134`	`+ reference_dataset=validation_data, dataset=synthetic_sample, metadata=metadata`
`135`	`135`	`)`
`136`	`136`
`137`	`137`	`num_rows_closer_to_real = np.where(dcr_real < dcr_holdout, 1.0, 0.0).sum()`