Add DCROverfittingProtection metric (#733)

lajohn4747 · R-Palazzo · web-flow · commit b1d36127f066 · 2025-02-26T15:50:02.000-06:00
Co-authored-by: R-Palazzo <romainpalazzo@gmail.com>
diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py
@@ -71,6 +71,7 @@
     DisclosureProtection,
     DisclosureProtectionEstimate,
 )
+from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection
 from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
 from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -136,4 +137,5 @@
     'RangeCoverage',
     'NewRowSynthesis',
     'TableStructure',
+    'DCROverfittingProtection',
 ]
diff --git a/sdmetrics/single_table/privacy/__init__.py b/sdmetrics/single_table/privacy/__init__.py
@@ -16,6 +16,7 @@
     DisclosureProtection,
     DisclosureProtectionEstimate,
 )
+from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection
 from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble
 from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR
 from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor
@@ -37,4 +38,5 @@
     'NumericalPrivacyMetric',
     'NumericalRadiusNearestNeighbor',
     'NumericalSVR',
+    'DCROverfittingProtection',
 ]
diff --git a/sdmetrics/single_table/privacy/dcr_overfitting_protection.py b/sdmetrics/single_table/privacy/dcr_overfitting_protection.py
@@ -0,0 +1,196 @@
+"""DCR Overfitting Protection metrics."""
+
+import warnings
+
+import numpy as np
+
+from sdmetrics._utils_metadata import _process_data_with_metadata
+from sdmetrics.goal import Goal
+from sdmetrics.single_table.base import SingleTableMetric
+from sdmetrics.single_table.privacy.dcr_utils import calculate_dcr
+from sdmetrics.single_table.privacy.util import validate_num_samples_num_iteration
+
+
+class DCROverfittingProtection(SingleTableMetric):
+    """DCR Overfitting Protection metric.
+
+    For every synthetic row, the DCR (distance to closest record) against the real training
+    data is compared with the DCR against a real holdout (validation) set. The larger the
+    share of synthetic rows that sit strictly closer to the training data, the lower the score.
+    """
+
+    name = 'DCROverfittingProtection'
+    goal = Goal.MAXIMIZE
+    min_value = 0.0
+    max_value = 1.0
+
+    @classmethod
+    def _validate_inputs(
+        cls,
+        real_training_data,
+        synthetic_data,
+        real_validation_data,
+        metadata,
+        num_rows_subsample,
+        num_iterations,
+    ):
+        """Validate the arguments and process copies of the three tables.
+
+        Args:
+            real_training_data (pd.DataFrame):
+                The real data used for training the synthesizer.
+            synthetic_data (pd.DataFrame):
+                The synthetic data sampled from the synthesizer.
+            real_validation_data (pd.DataFrame):
+                The real holdout data.
+            metadata (dict):
+                A metadata dictionary that describes the table of data.
+            num_rows_subsample (int or None):
+                The requested number of synthetic rows per subsample.
+            num_iterations (int):
+                The requested number of subsampling iterations.
+
+        Returns:
+            tuple:
+                The processed training, synthetic and validation tables, followed by the
+                effective ``num_rows_subsample`` and ``num_iterations``. The last two are
+                reset to ``None`` and ``1`` when the requested subsample is larger than the
+                synthetic data.
+        """
+        validate_num_samples_num_iteration(num_rows_subsample, num_iterations)
+
+        synthetic_row_count = len(synthetic_data)
+        if num_rows_subsample and num_rows_subsample > synthetic_row_count:
+            # A subsample larger than the data is not possible: warn and use the full table once.
+            warnings.warn(
+                f'num_rows_subsample ({num_rows_subsample}) is greater than the length of the '
+                f'synthetic data ({synthetic_row_count}). Ignoring the num_rows_subsample and '
+                'num_iterations args.',
+            )
+            num_rows_subsample, num_iterations = None, 1
+
+        training_row_count = len(real_training_data)
+        validation_row_count = len(real_validation_data)
+        if training_row_count * 0.5 > validation_row_count:
+            warnings.warn(
+                f'Your real_validation_data contains {validation_row_count} rows while your '
+                f'real_training_data contains {training_row_count} rows. For most accurate '
+                'results, we recommend that the validation data at least half the size of the training data.'
+            )
+
+        # Work on copies so the caller's DataFrames are never handed to the processing helper.
+        processed_tables = [
+            _process_data_with_metadata(table.copy(), metadata, True)
+            for table in (real_training_data, synthetic_data, real_validation_data)
+        ]
+
+        return (*processed_tables, num_rows_subsample, num_iterations)
+
+    @classmethod
+    def compute_breakdown(
+        cls,
+        real_training_data,
+        synthetic_data,
+        real_validation_data,
+        metadata,
+        num_rows_subsample=None,
+        num_iterations=1,
+    ):
+        """Compute the DCROverfittingProtection metric with its breakdown.
+
+        Args:
+            real_training_data (pd.DataFrame):
+                A pd.DataFrame object containing the real data used for training the synthesizer.
+            synthetic_data (pd.DataFrame):
+                A pandas.DataFrame object containing the synthetic data sampled
+                from the synthesizer.
+            real_validation_data (pd.DataFrame):
+                A pandas.DataFrame object containing a validation set of real data.
+                This data should not have been used to train the synthesizer.
+            metadata (dict):
+                A metadata dictionary that describes the table of data.
+            num_rows_subsample (int or None):
+                The number of synthetic data rows to subsample from the synthetic data.
+                This is used to increase the speed of the computation, if the dataset is large.
+                Defaults to None which means no subsampling will be done.
+            num_iterations (int):
+                The number of iterations to perform when subsampling.
+                The final score will be the average of all iterations. Default is 1 iteration.
+
+        Returns:
+            dict:
+                A dictionary with the overall ``score`` and, under
+                ``synthetic_data_percentages``, the fraction of synthetic rows that were
+                ``closer_to_training`` and the fraction that were ``closer_to_holdout``.
+                With multiple iterations, each value is the average over the iterations.
+        """
+        (
+            training_data,
+            processed_synthetic_data,
+            validation_data,
+            num_rows_subsample,
+            num_iterations,
+        ) = cls._validate_inputs(
+            real_training_data,
+            synthetic_data,
+            real_validation_data,
+            metadata,
+            num_rows_subsample,
+            num_iterations,
+        )
+
+        score_total = 0
+        training_fraction_total = 0
+        holdout_fraction_total = 0
+        for _ in range(num_iterations):
+            if num_rows_subsample is None:
+                synthetic_sample = processed_synthetic_data
+            else:
+                synthetic_sample = processed_synthetic_data.sample(n=num_rows_subsample)
+
+            dcr_training = calculate_dcr(training_data, synthetic_sample, metadata)
+            dcr_holdout = calculate_dcr(validation_data, synthetic_sample, metadata)
+
+            # Strict comparison: a row equally distant from both sets counts as closer to holdout.
+            closer_to_training_flags = np.where(dcr_training < dcr_holdout, 1.0, 0.0)
+            training_fraction = closer_to_training_flags.sum() / dcr_training.size
+            holdout_fraction = 1 - training_fraction
+
+            # Half or more of the rows being closer to holdout already yields the maximum score.
+            score_total += min((1.0 - training_fraction) * 2, 1.0)
+            training_fraction_total += training_fraction
+            holdout_fraction_total += holdout_fraction
+
+        return {
+            'score': score_total / num_iterations,
+            'synthetic_data_percentages': {
+                'closer_to_training': training_fraction_total / num_iterations,
+                'closer_to_holdout': holdout_fraction_total / num_iterations,
+            },
+        }
+
+    @classmethod
+    def compute(
+        cls,
+        real_training_data,
+        synthetic_data,
+        real_validation_data,
+        metadata,
+        num_rows_subsample=None,
+        num_iterations=1,
+    ):
+        """Compute the DCROverfittingProtection score.
+
+        Delegates to ``compute_breakdown`` and returns only its ``score`` entry.
+
+        Args:
+            real_training_data (pd.DataFrame):
+                A pd.DataFrame object containing the real data used for training the synthesizer.
+            synthetic_data (pd.DataFrame):
+                A pandas.DataFrame object containing the synthetic data sampled
+                from the synthesizer.
+            real_validation_data (pd.DataFrame):
+                A pandas.DataFrame object containing a validation set of real data.
+                This data should not have been used to train the synthesizer.
+            metadata (dict):
+                A metadata dictionary that describes the table of data.
+            num_rows_subsample (int or None):
+                The number of synthetic data rows to subsample from the synthetic data.
+                This is used to increase the speed of the computation, if the dataset is large.
+                Defaults to None which means no subsampling will be done.
+            num_iterations (int):
+                The number of iterations to perform when subsampling.
+                The final score will be the average of all iterations. Default is 1 iteration.
+
+        Returns:
+            float:
+                The score for the DCROverfittingProtection metric.
+        """
+        breakdown = cls.compute_breakdown(
+            real_training_data,
+            synthetic_data,
+            real_validation_data,
+            metadata,
+            num_rows_subsample,
+            num_iterations,
+        )
+
+        return breakdown.get('score')
diff --git a/sdmetrics/single_table/privacy/util.py b/sdmetrics/single_table/privacy/util.py
@@ -148,3 +148,17 @@ def allow_nan_array(attributes):
             ret.append(entry)
 
     return ret
+
+
+def validate_num_samples_num_iteration(num_rows_subsample, num_iterations):
+    """Validate the ``num_rows_subsample`` and ``num_iterations`` arguments.
+
+    Args:
+        num_rows_subsample (int or None):
+            The number of rows to subsample, or None for no subsampling.
+            When given, it must be an integer of at least 1.
+        num_iterations (int):
+            The number of subsampling iterations. It must be an integer of at least 1,
+            and exactly 1 when ``num_rows_subsample`` is None.
+
+    Raises:
+        ValueError:
+            If either argument has the wrong type or is below 1, or if more than one
+            iteration is requested without subsampling.
+    """
+    # NOTE: both bounds are inclusive of 1, although the messages say "greater than 1".
+    # The message text is kept unchanged because callers or tests may match on it.
+    if num_rows_subsample is not None:
+        if not isinstance(num_rows_subsample, int) or num_rows_subsample < 1:
+            raise ValueError(
+                f'num_rows_subsample ({num_rows_subsample}) must be an integer greater than 1.'
+            )
+
+    # Check the type before any numeric comparison so that a non-numeric value
+    # (e.g. a string or None) raises ValueError rather than a TypeError.
+    if not isinstance(num_iterations, int) or num_iterations < 1:
+        raise ValueError(f'num_iterations ({num_iterations}) must be an integer greater than 1.')
+
+    if num_rows_subsample is None and num_iterations > 1:
+        raise ValueError('num_iterations should not be greater than 1 if there is no subsampling.')
diff --git a/tests/integration/single_table/privacy/test_dcr_overfitting_protection.py b/tests/integration/single_table/privacy/test_dcr_overfitting_protection.py
@@ -0,0 +1,148 @@
+import random
+import re
+
+import pandas as pd
+import pytest
+from sklearn.model_selection import train_test_split
+
+from sdmetrics.demos import load_single_table_demo
+from sdmetrics.single_table.privacy import DCROverfittingProtection
+
+
+class TestDCROverfittingProtection:
+    @staticmethod
+    def _random_numerical_frame(num_rows):
+        """Build a one-column frame of ``num_rows`` random integers between 1 and 1000."""
+        return pd.DataFrame({'num_col': [random.randint(1, 1000) for _ in range(num_rows)]})
+
+    def test_end_to_end_with_demo(self):
+        """Test DCROverfittingProtection end to end against the demo dataset.
+
+        Subsampling is used for the degenerate cases to speed up the test. When two of the
+        datasets are identical the expected values must hold even with subsampling. A
+        synthetic row that is equally distant from the training data and the holdout data
+        is labeled as closer to the holdout data.
+        """
+        # Setup
+        real_data, synthetic_data, metadata = load_single_table_demo()
+        train_df, holdout_df = train_test_split(real_data, test_size=0.2)
+        num_rows_subsample = 50
+
+        # Run
+        full_breakdown = DCROverfittingProtection.compute_breakdown(
+            train_df, synthetic_data, holdout_df, metadata
+        )
+        full_score = DCROverfittingProtection.compute(
+            train_df, synthetic_data, holdout_df, metadata
+        )
+        holdout_is_synthetic = DCROverfittingProtection.compute_breakdown(
+            train_df, synthetic_data, synthetic_data, metadata, num_rows_subsample
+        )
+        train_is_synthetic = DCROverfittingProtection.compute_breakdown(
+            synthetic_data, synthetic_data, holdout_df, metadata, num_rows_subsample
+        )
+        all_are_synthetic = DCROverfittingProtection.compute_breakdown(
+            synthetic_data,
+            synthetic_data,
+            synthetic_data,
+            metadata,
+            num_rows_subsample,
+        )
+
+        # Assert
+        assert full_score == full_breakdown['score']
+
+        # Each entry: (breakdown, expected score, expected closer_to_training,
+        # expected closer_to_holdout).
+        expectations = [
+            (holdout_is_synthetic, 1.0, 0.0, 1.0),
+            (train_is_synthetic, 0.0, 1.0, 0.0),
+            (all_are_synthetic, 1.0, 0.0, 1.0),
+        ]
+        for breakdown, expected_score, expected_training, expected_holdout in expectations:
+            percentages = breakdown['synthetic_data_percentages']
+            assert breakdown['score'] == expected_score
+            assert percentages['closer_to_training'] == expected_training
+            assert percentages['closer_to_holdout'] == expected_holdout
+
+    def test_compute_breakdown_drop_all_columns(self):
+        """Test with an invalid sdtype to ensure only appropriate columns are measured."""
+        # Setup
+        metadata = {
+            'columns': {
+                'bad_col': {'sdtype': 'unknown'},
+                'num_col': {'sdtype': 'numerical'},
+            }
+        }
+        train_data = pd.DataFrame({'bad_col': [10.0, 15.0], 'num_col': [1.0, 2.0]})
+        synth_data = pd.DataFrame({'bad_col': [2.0, 1.0], 'num_col': [1.0, 2.0]})
+        holdout_data = pd.DataFrame({'bad_col': [2.0, 1.0], 'num_col': [3.0, 4.0]})
+
+        # Run
+        breakdown = DCROverfittingProtection.compute_breakdown(
+            train_data, synth_data, holdout_data, metadata
+        )
+
+        # Assert
+        percentages = breakdown['synthetic_data_percentages']
+        assert breakdown['score'] == 0.0
+        assert percentages['closer_to_training'] == 1.0
+        assert percentages['closer_to_holdout'] == 0.0
+
+    def test_compute_breakdown_subsampling(self):
+        """Test subsampling produces different values.
+
+        NOTE(review): the data and the subsample are drawn without a fixed seed, so the
+        inequality assertion presumably can fail on rare draws - confirm whether seeding
+        is wanted.
+        """
+        # Setup
+        train_data = self._random_numerical_frame(50)
+        holdout_data = self._random_numerical_frame(50)
+        synthetic_data = self._random_numerical_frame(50)
+        metadata = {'columns': {'num_col': {'sdtype': 'numerical'}}}
+        small_subsample = 4
+        oversized_subsample = len(synthetic_data) * 2
+        oversized_warning = re.escape('Ignoring the num_rows_subsample and num_iterations args.')
+
+        # Run
+        subsampled_breakdown = DCROverfittingProtection.compute_breakdown(
+            train_data, synthetic_data, holdout_data, metadata, small_subsample
+        )
+
+        with pytest.warns(UserWarning, match=oversized_warning):
+            oversized_breakdown = DCROverfittingProtection.compute_breakdown(
+                train_data, synthetic_data, holdout_data, metadata, oversized_subsample
+            )
+
+        first_full_breakdown = DCROverfittingProtection.compute_breakdown(
+            train_data, synthetic_data, holdout_data, metadata
+        )
+        second_full_breakdown = DCROverfittingProtection.compute_breakdown(
+            train_data, synthetic_data, holdout_data, metadata
+        )
+
+        # Assert that subsampling provides different values if smaller than data length.
+        assert subsampled_breakdown != first_full_breakdown
+        assert first_full_breakdown == second_full_breakdown
+        assert oversized_breakdown == first_full_breakdown
+
+    def test_compute_breakdown_iterations(self):
+        """Test that number iterations for subsampling works as expected.
+
+        NOTE(review): relies on unseeded random data; the inequality between one and many
+        iterations is presumably very likely but not guaranteed - confirm.
+        """
+        # Setup
+        train_data = self._random_numerical_frame(10)
+        holdout_data = self._random_numerical_frame(10)
+        synthetic_data = self._random_numerical_frame(10)
+        metadata = {'columns': {'num_col': {'sdtype': 'numerical'}}}
+        num_rows_subsample = 3
+        num_iterations = 1000
+
+        # Run
+        single_iteration = DCROverfittingProtection.compute_breakdown(
+            train_data, synthetic_data, holdout_data, metadata, num_rows_subsample, 1
+        )
+        many_iterations = DCROverfittingProtection.compute_breakdown(
+            train_data, synthetic_data, holdout_data, metadata, num_rows_subsample, num_iterations
+        )
+        train_is_synthetic = DCROverfittingProtection.compute_breakdown(
+            synthetic_data,
+            synthetic_data,
+            holdout_data,
+            metadata,
+            num_rows_subsample,
+            num_iterations,
+        )
+
+        # Assert
+        assert single_iteration != many_iterations
+        assert train_is_synthetic['score'] == 0.0
diff --git a/tests/unit/single_table/privacy/test_dcr_overfitting_protection.py b/tests/unit/single_table/privacy/test_dcr_overfitting_protection.py
diff --git a/tests/unit/single_table/privacy/test_util.py b/tests/unit/single_table/privacy/test_util.py

Original file line number	Diff line number	Diff line change
`@@ -71,6 +71,7 @@`
`71`	`71`	`DisclosureProtection,`
`72`	`72`	`DisclosureProtectionEstimate,`
`73`	`73`	`)`
	`74`	`+from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection`
`74`	`75`	`from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble`
`75`	`76`	`from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR`
`76`	`77`	`from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor`
`@@ -136,4 +137,5 @@`
`136`	`137`	`'RangeCoverage',`
`137`	`138`	`'NewRowSynthesis',`
`138`	`139`	`'TableStructure',`
	`140`	`+ 'DCROverfittingProtection',`
`139`	`141`	`]`
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@`
`16`	`16`	`DisclosureProtection,`
`17`	`17`	`DisclosureProtectionEstimate,`
`18`	`18`	`)`
	`19`	`+from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection`
`19`	`20`	`from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble`
`20`	`21`	`from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR`
`21`	`22`	`from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor`
`@@ -37,4 +38,5 @@`
`37`	`38`	`'NumericalPrivacyMetric',`
`38`	`39`	`'NumericalRadiusNearestNeighbor',`
`39`	`40`	`'NumericalSVR',`
	`41`	`+ 'DCROverfittingProtection',`
`40`	`42`	`]`