"""DCR Baseline Protection metrics."""
import warnings
import numpy as np
import pandas as pd
from sdmetrics.goal import Goal
from sdmetrics.single_table.base import SingleTableMetric
from sdmetrics.single_table.privacy.dcr_utils import calculate_dcr
from sdmetrics.single_table.privacy.util import validate_num_samples_num_iteration
from sdmetrics.utils import is_datetime
class DCRBaselineProtection(SingleTableMetric):
    """DCR Baseline Protection metric.

    This metric uses a DCR (distance to closest record) computation to measure how close the
    synthetic data is to the real data, as opposed to a baseline of random data.
    """

    name = 'DCRBaselineProtection'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0
    _seed = None
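
    # Score semantics (derived from compute_breakdown below): DCR is the distance
    # from a row to its closest real record. The score is
    # median(DCR of synthetic rows) / median(DCR of random baseline rows), capped
    # at 1.0. A score near 1.0 means the synthetic rows are no closer to the real
    # data than random rows would be; a score near 0.0 means the synthetic rows
    # sit unusually close to real records.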

    @classmethod
    def _validate_inputs(
        cls,
        real_data,
        synthetic_data,
        num_rows_subsample,
        num_iterations,
    ):
        # Check the types first, so len(synthetic_data) below is only called on
        # an actual DataFrame and bad inputs get the intended error message.
        if not (isinstance(real_data, pd.DataFrame) and isinstance(synthetic_data, pd.DataFrame)):
            raise TypeError(
                f'Both real_data ({type(real_data)}) and synthetic_data ({type(synthetic_data)}) '
                'must be of type pandas.DataFrame.'
            )

        validate_num_samples_num_iteration(num_rows_subsample, num_iterations)
        if num_rows_subsample and num_rows_subsample > len(synthetic_data):
            warnings.warn(
                f'num_rows_subsample ({num_rows_subsample}) is greater than the length of the '
                f'synthetic data ({len(synthetic_data)}). Ignoring the num_rows_subsample and '
                'num_iterations args.'
            )
            num_rows_subsample = None
            num_iterations = 1

        return num_rows_subsample, num_iterations

    @classmethod
    def compute_breakdown(
        cls,
        real_data,
        synthetic_data,
        metadata,
        num_rows_subsample=None,
        num_iterations=1,
    ):
        """Compute the DCRBaselineProtection metric.

        Args:
            real_data (pd.DataFrame):
                A pandas.DataFrame object containing the real data used for
                training the synthesizer.
            synthetic_data (pd.DataFrame):
                A pandas.DataFrame object containing the synthetic data sampled
                from the synthesizer.
            metadata (dict):
                A metadata dictionary that describes the table of data.
            num_rows_subsample (int or None):
                The number of rows to subsample from the synthetic data.
                This is used to increase the speed of the computation if the dataset is large.
                Defaults to None, which means no subsampling will be done.
            num_iterations (int):
                The number of iterations to perform when subsampling.
                The final score will be the average over all iterations. Defaults to 1.

        Returns:
            dict:
                A dictionary that contains the overall score,
                the median DCR between the synthetic data and the real data,
                and the median DCR between the random baseline data and the real data.
                With multiple iterations, the averages of the medians are returned.
        """
        num_rows_subsample, num_iterations = cls._validate_inputs(
            real_data,
            synthetic_data,
            num_rows_subsample,
            num_iterations,
        )

        size_of_random_data = len(synthetic_data)
        random_data = cls._generate_random_data(real_data, size_of_random_data)
        sum_synthetic_median = 0
        sum_random_median = 0
        sum_score = 0
        for _ in range(num_iterations):
            synthetic_sample = synthetic_data
            random_sample = random_data
            if num_rows_subsample is not None:
                synthetic_sample = synthetic_data.sample(n=num_rows_subsample)
                random_sample = random_data.sample(n=num_rows_subsample)

            dcr_real = calculate_dcr(
                reference_dataset=real_data, dataset=synthetic_sample, metadata=metadata
            )
            dcr_random = calculate_dcr(
                reference_dataset=real_data, dataset=random_sample, metadata=metadata
            )
            synthetic_data_median = dcr_real.median()
            random_data_median = dcr_random.median()
            score = np.nan
            if random_data_median != 0.0:
                score = min((synthetic_data_median / random_data_median), 1.0)

            sum_synthetic_median += synthetic_data_median
            sum_random_median += random_data_median
            sum_score += score

        if sum_random_median == 0.0:
            sum_score = np.nan

        result = {
            'score': sum_score / num_iterations,
            'median_DCR_to_real_data': {
                'synthetic_data': sum_synthetic_median / num_iterations,
                'random_data_baseline': sum_random_median / num_iterations,
            },
        }

        return result
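
    # A minimal usage sketch (illustrative names, not part of the API): ``real``
    # and ``synthetic`` are pandas DataFrames and ``metadata`` follows the
    # SDMetrics single-table format.
    #
    #   breakdown = DCRBaselineProtection.compute_breakdown(
    #       real, synthetic, metadata, num_rows_subsample=1000, num_iterations=3
    #   )
    #   # e.g. {'score': 0.92, 'median_DCR_to_real_data':
    #   #       {'synthetic_data': 0.46, 'random_data_baseline': 0.5}}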

    @classmethod
    def compute(
        cls,
        real_data,
        synthetic_data,
        metadata,
        num_rows_subsample=None,
        num_iterations=1,
    ):
        """Compute the DCRBaselineProtection metric.

        Args:
            real_data (pd.DataFrame):
                A pandas.DataFrame object containing the real data used for
                training the synthesizer.
            synthetic_data (pd.DataFrame):
                A pandas.DataFrame object containing the synthetic data sampled
                from the synthesizer.
            metadata (dict):
                A metadata dictionary that describes the table of data.
            num_rows_subsample (int or None):
                The number of rows to subsample from the synthetic data.
                This is used to increase the speed of the computation if the dataset is large.
                Defaults to None, which means no subsampling will be done.
            num_iterations (int):
                The number of iterations to perform when subsampling.
                The final score will be the average over all iterations. Defaults to 1.

        Returns:
            float:
                The score for the DCRBaselineProtection metric.
        """
        result = cls.compute_breakdown(
            real_data,
            synthetic_data,
            metadata,
            num_rows_subsample,
            num_iterations,
        )

        return result.get('score')

    @classmethod
    def _generate_random_data(cls, real_data, num_samples=None):
        """Generate uniformly random data matching the schema of ``real_data``."""
        random_data = {}
        num_samples = len(real_data) if num_samples is None else num_samples
        seed = getattr(cls, '_seed', None)
        randomizer = np.random.default_rng(seed)
        for col in real_data.columns:
            nan_ratio = real_data[col].isna().mean()
            if pd.api.types.is_integer_dtype(real_data[col]):
                random_values = randomizer.integers(
                    low=real_data[col].min(), high=real_data[col].max() + 1, size=num_samples
                )
            elif pd.api.types.is_float_dtype(real_data[col]):
                random_values = randomizer.uniform(
                    low=real_data[col].min(), high=real_data[col].max(), size=num_samples
                )
            elif is_datetime(real_data[col]):
                # Sample uniformly between the earliest and latest observed timestamps.
                min_date, max_date = real_data[col].min(), real_data[col].max()
                total_seconds = (max_date - min_date).total_seconds()
                random_values = min_date + pd.to_timedelta(
                    randomizer.uniform(low=0, high=total_seconds, size=num_samples), unit='s'
                )
            else:
                # Categorical/object columns: sample uniformly from the observed values.
                random_values = randomizer.choice(
                    real_data[col].dropna().unique(), size=num_samples
                )

            # Reinject missing values at the same rate as the real column, drawing the
            # mask from the seeded generator so results are reproducible.
            nan_mask = randomizer.random(num_samples) < nan_ratio
            random_values = pd.Series(random_values)
            if is_datetime(real_data[col]):
                random_values[nan_mask] = pd.NaT
            else:
                random_values[nan_mask] = np.nan

            random_data[col] = random_values

        return pd.DataFrame(random_data)
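

# ---------------------------------------------------------------------------
# Minimal runnable sketch, not part of the library. It assumes the SDMetrics
# single-table metadata format ({'columns': {<name>: {'sdtype': ...}}}); the
# data values below are made up purely for illustration.
if __name__ == '__main__':
    real = pd.DataFrame({
        'age': [23, 35, 44, 51, 29, 62],
        'income': [40000.0, 52500.0, 61250.0, 78000.0, 45300.0, 90100.0],
    })
    synthetic = pd.DataFrame({
        'age': [25, 33, 47, 55, 30, 60],
        'income': [41000.0, 50000.0, 64000.0, 75500.0, 47000.0, 88000.0],
    })
    metadata = {
        'columns': {
            'age': {'sdtype': 'numerical'},
            'income': {'sdtype': 'numerical'},
        }
    }

    # The breakdown dict carries the medians; compute() returns only the score.
    print(DCRBaselineProtection.compute_breakdown(real, synthetic, metadata))
    print(DCRBaselineProtection.compute(real, synthetic, metadata))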