@@ -61,6 +61,7 @@ def test_end_to_end_with_demo(self):
61
61
62
62
def test_compute_breakdown_drop_all_columns (self ):
63
63
"""Testing invalid sdtypes and ensure only appropriate columns are measured."""
64
+ # Setup
64
65
train_data = pd .DataFrame ({'bad_col' : [10.0 , 15.0 ], 'num_col' : [1.0 , 2.0 ]})
65
66
synth_data = pd .DataFrame ({'bad_col' : [2.0 , 1.0 ], 'num_col' : [1.0 , 2.0 ]})
66
67
holdout_data = pd .DataFrame ({'bad_col' : [2.0 , 1.0 ], 'num_col' : [3.0 , 4.0 ]})
@@ -71,9 +72,12 @@ def test_compute_breakdown_drop_all_columns(self):
71
72
}
72
73
}
73
74
75
+ # Run
74
76
result = DCROverfittingProtection .compute_breakdown (
75
77
train_data , synth_data , holdout_data , metadata
76
78
)
79
+
80
+ # Assert
77
81
assert result ['score' ] == 0.0
78
82
assert result ['synthetic_data_percentages' ]['closer_to_training' ] == 1.0
79
83
assert result ['synthetic_data_percentages' ]['closer_to_holdout' ] == 0.0
@@ -104,24 +108,31 @@ def test_compute_breakdown_subsampling(self):
104
108
assert compute_full_1 == compute_full_2
105
109
106
110
def test_compute_breakdown_iterations(self):
    """Check that the subsampling iteration count is honoured by ``compute_breakdown``.

    A breakdown computed with a single iteration is compared against one
    computed with 1000 iterations, and a run in which the synthetic data is
    also passed as the training data is expected to score ``0.0``.
    """
    # Setup
    def make_frame():
        # Ten random integers in [1, 1000] under a single 'num_col' column.
        return pd.DataFrame({'num_col': [random.randint(1, 1000) for _ in range(10)]})

    # Frames are drawn in the order train -> holdout -> synthetic.
    train_data = make_frame()
    holdout_data = make_frame()
    synthetic_data = make_frame()
    metadata = {'columns': {'num_col': {'sdtype': 'numerical'}}}
    num_rows_subsample = 3
    num_iterations = 1000
    compute = DCROverfittingProtection.compute_breakdown

    # Run
    single_pass = compute(
        train_data, synthetic_data, holdout_data, metadata, num_rows_subsample, 1
    )
    many_passes = compute(
        train_data, synthetic_data, holdout_data, metadata, num_rows_subsample, num_iterations
    )
    train_is_synthetic = compute(
        synthetic_data,
        synthetic_data,
        holdout_data,
        metadata,
        num_rows_subsample,
        num_iterations,
    )

    # Assert
    # NOTE(review): no seed is set inside this test, so the inequality below
    # relies on the random inputs -- confirm whether seeding happens elsewhere.
    assert single_pass != many_passes
    assert train_is_synthetic['score'] == 0.0
0 commit comments