Add DCRBaselineProtection and DCROverfittingProtection metrics #735

Open · wants to merge 9 commits into main
4 changes: 4 additions & 0 deletions sdmetrics/single_table/__init__.py
@@ -71,6 +71,8 @@
DisclosureProtection,
DisclosureProtectionEstimate,
)
from sdmetrics.single_table.privacy.dcr_baseline_protection import DCRBaselineProtection
from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection
from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -136,4 +138,6 @@
'RangeCoverage',
'NewRowSynthesis',
'TableStructure',
'DCRBaselineProtection',
'DCROverfittingProtection',
]
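With this change, both metrics are re-exported from sdmetrics.single_table, so they can be imported directly (a minimal sketch):

from sdmetrics.single_table import DCRBaselineProtection, DCROverfittingProtection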
4 changes: 4 additions & 0 deletions sdmetrics/single_table/privacy/__init__.py
@@ -16,6 +16,8 @@
DisclosureProtection,
DisclosureProtectionEstimate,
)
from sdmetrics.single_table.privacy.dcr_baseline_protection import DCRBaselineProtection
from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection
from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -37,4 +39,6 @@
'NumericalPrivacyMetric',
'NumericalRadiusNearestNeighbor',
'NumericalSVR',
'DCRBaselineProtection',
'DCROverfittingProtection',
]
223 changes: 223 additions & 0 deletions sdmetrics/single_table/privacy/dcr_baseline_protection.py
@@ -0,0 +1,223 @@
"""DCR Baseline Protection metrics."""

import warnings

import numpy as np
import pandas as pd

from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.single_table.privacy.dcr_utils import calculate_dcr
from sdmetrics.single_table.privacy.util import validate_num_samples_num_iteration
from sdmetrics.utils import is_datetime


class DCRBaselineProtection(SingleTableMetric):
"""DCR Baseline Protection metric.

This metric uses a DCR (distance to closest record) computation to measure how close the
synthetic data is to the real data as opposed to a baseline of random data.
"""

name = 'DCRBaselineProtection'
goal = Goal.MAXIMIZE
min_value = 0.0
max_value = 1.0
_seed = None

@classmethod
def _validate_inputs(
cls,
real_data,
synthetic_data,
num_rows_subsample,
num_iterations,
):
validate_num_samples_num_iteration(num_rows_subsample, num_iterations)

# Validate types first so a clear TypeError is raised before len() is called
# or a subsampling warning is emitted for invalid inputs.
if not (isinstance(real_data, pd.DataFrame) and isinstance(synthetic_data, pd.DataFrame)):
raise TypeError(
f'Both real_data ({type(real_data)}) and synthetic_data ({type(synthetic_data)}) '
'must be of type pandas.DataFrame.'
)

if num_rows_subsample and num_rows_subsample > len(synthetic_data):
warnings.warn(
f'num_rows_subsample ({num_rows_subsample}) is greater than the length of the '
f'synthetic data ({len(synthetic_data)}). Ignoring the num_rows_subsample and '
'num_iterations args.',
)
num_rows_subsample = None
num_iterations = 1

return num_rows_subsample, num_iterations

@classmethod
def compute_breakdown(
cls,
real_data,
synthetic_data,
metadata,
num_rows_subsample=None,
num_iterations=1,
):
"""Compute the DCRBaselineProtection metric.

Args:
real_data (pd.DataFrame):
A pandas.DataFrame object containing the real data used for training the synthesizer.
synthetic_data (pd.DataFrame):
A pandas.DataFrame object containing the synthetic data sampled
from the synthesizer.
metadata (dict):
A metadata dictionary that describes the table of data.
num_rows_subsample (int or None):
The number of synthetic data rows to subsample from the synthetic data.
This is used to increase the speed of the computation, if the dataset is large.
Defaults to None which means no subsampling will be done.
num_iterations (int):
The number of iterations to perform when subsampling.
The final score will be the average of all iterations. Default is 1 iteration.

Returns:
dict:
Returns a dictionary that contains the overall score,
the median DCR score between the synthetic data and real data,
and the median DCR score between the random data and real data.
Averages of the medians are returned in the case of multiple iterations.
"""
num_rows_subsample, num_iterations = cls._validate_inputs(
real_data,
synthetic_data,
num_rows_subsample,
num_iterations,
)

size_of_random_data = len(synthetic_data)
random_data = cls._generate_random_data(real_data, size_of_random_data)

sum_synthetic_median = 0
sum_random_median = 0
sum_score = 0

for _ in range(num_iterations):
synthetic_sample = synthetic_data
random_sample = random_data
if num_rows_subsample is not None:
synthetic_sample = synthetic_data.sample(n=num_rows_subsample)
random_sample = random_data.sample(n=num_rows_subsample)

# Distance to the closest real record, for the synthetic rows and for the
# random baseline rows.
dcr_real = calculate_dcr(
reference_dataset=real_data, dataset=synthetic_sample, metadata=metadata
)
dcr_random = calculate_dcr(
reference_dataset=real_data, dataset=random_sample, metadata=metadata
)
synthetic_data_median = dcr_real.median()
random_data_median = dcr_random.median()

# Per-iteration score: the synthetic median DCR relative to the random
# baseline median, capped at 1.0. A zero baseline median makes the ratio
# undefined, so the score stays NaN.
score = np.nan
if random_data_median != 0.0:
score = min((synthetic_data_median / random_data_median), 1.0)

sum_synthetic_median += synthetic_data_median
sum_random_median += random_data_median
sum_score += score

if sum_random_median == 0.0:
sum_score = np.nan

result = {
'score': sum_score / num_iterations,
'median_DCR_to_real_data': {
'synthetic_data': sum_synthetic_median / num_iterations,
'random_data_baseline': sum_random_median / num_iterations,
},
}

return result
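# Worked example (illustrative numbers, not from this diff): if the median DCR
# from synthetic rows to the real data is 0.2 and the median DCR from the
# random baseline to the real data is 0.5, then
# score = min(0.2 / 0.5, 1.0) = 0.4,
# i.e. the synthetic rows sit noticeably closer to the real data than random
# noise would.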

@classmethod
def compute(
cls,
real_data,
synthetic_data,
metadata,
num_rows_subsample=None,
num_iterations=1,
):
"""Compute the DCRBaselineProtection metric.

Args:
real_data (pd.DataFrame):
A pandas.DataFrame object containing the real data used for training the synthesizer.
synthetic_data (pd.DataFrame):
A pandas.DataFrame object containing the synthetic data sampled
from the synthesizer.
metadata (dict):
A metadata dictionary that describes the table of data.
num_rows_subsample (int or None):
The number of synthetic data rows to subsample from the synthetic data.
This is used to increase the speed of the computation, if the dataset is large.
Defaults to None which means no subsampling will be done.
num_iterations (int):
The number of iterations to perform when subsampling.
The final score will be the average of all iterations. Default is 1 iteration.

Returns:
float:
The score for the DCRBaselineProtection metric.
"""
result = cls.compute_breakdown(
real_data,
synthetic_data,
metadata,
num_rows_subsample,
num_iterations,
)

return result.get('score')

@classmethod
def _generate_random_data(cls, real_data, num_samples=None):
random_data = {}
num_samples = len(real_data) if num_samples is None else num_samples
seed = getattr(cls, '_seed', None)
randomizer = np.random.default_rng(seed)

for col in real_data.columns:
# Match each column's missing-value rate in the generated random data.
nan_ratio = real_data[col].isna().mean()

if pd.api.types.is_integer_dtype(real_data[col]):
random_values = randomizer.integers(
low=real_data[col].min(), high=real_data[col].max() + 1, size=num_samples
)

elif pd.api.types.is_float_dtype(real_data[col]):
random_values = randomizer.uniform(
low=real_data[col].min(), high=real_data[col].max(), size=num_samples
)

elif is_datetime(real_data[col]):
min_date, max_date = real_data[col].min(), real_data[col].max()
total_seconds = (max_date - min_date).total_seconds()
random_values = min_date + pd.to_timedelta(
randomizer.uniform(low=0, high=total_seconds, size=num_samples), unit='s'
)

else:
random_values = randomizer.choice(
real_data[col].dropna().unique(), size=num_samples
)

# Use the seeded generator (not the global NumPy state) so NaN placement
# respects the class-level seed.
nan_mask = randomizer.random(num_samples) < nan_ratio
random_values = pd.Series(random_values)
if is_datetime(real_data[col]):
random_values[nan_mask] = pd.NaT
else:
random_values[nan_mask] = np.nan

random_data[col] = random_values

return pd.DataFrame(random_data)
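As a usage illustration for the new metric, here is a minimal sketch; the toy tables, the column name, and the metadata dictionary shape are assumptions for demonstration, not taken from this diff:

import pandas as pd

from sdmetrics.single_table import DCRBaselineProtection

# Hypothetical real (training) data and synthesizer output.
real_data = pd.DataFrame({'age': [23, 35, 41, 52, 60]})
synthetic_data = pd.DataFrame({'age': [25, 33, 44, 50, 58]})

# Assumed single-table metadata shape; adjust to the format your sdmetrics
# version expects.
metadata = {'columns': {'age': {'sdtype': 'numerical'}}}

# Overall score in [0, 1]; values near 1.0 mean the synthetic rows are no
# closer to the real rows than the random baseline.
score = DCRBaselineProtection.compute(real_data, synthetic_data, metadata)

# Full breakdown: the score plus the median DCR of the synthetic data and of
# the random baseline against the real data.
breakdown = DCRBaselineProtection.compute_breakdown(
    real_data,
    synthetic_data,
    metadata,
    num_rows_subsample=None,  # set an int to subsample for speed on large tables
    num_iterations=1,
)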