1
+ import re
1
2
from unittest .mock import patch
2
3
3
4
import pandas as pd
7
8
8
9
9
10
class TestContingencySimilarity :
11
def test__validate_inputs(self):
    """Check that ``_validate_inputs`` rejects every kind of invalid argument.

    Four failure scenarios are exercised in order, each expected to raise a
    ``ValueError`` carrying a specific message:

    1. the data is not a two-column ``pandas.DataFrame``;
    2. the real and synthetic column names differ;
    3. a requested continuous column is missing from the data;
    4. ``num_discrete_bins`` is not a positive integer.
    """
    # Setup
    not_a_frame = pd.Series(range(5))
    real_frame = pd.DataFrame({'col1': range(10), 'col2': range(10, 20)})
    mismatched_frame = pd.DataFrame({'bad_column': range(10), 'col2': range(10)})
    synthetic_frame = pd.DataFrame({'col1': range(5), 'col2': range(5)})

    # Each entry pairs the literal error message with the keyword arguments
    # that are expected to trigger it.
    failing_cases = [
        (
            'The data must be a pandas DataFrame with two columns.',
            {
                'real_data': not_a_frame,
                'synthetic_data': not_a_frame,
                'continuous_column_names': None,
                'num_discrete_bins': 10,
            },
        ),
        (
            'The columns in the real and synthetic data must match.',
            {
                'real_data': real_frame,
                'synthetic_data': mismatched_frame,
                'continuous_column_names': None,
                'num_discrete_bins': 10,
            },
        ),
        (
            "Continuous column(s) 'missing_col' not found in the data.",
            {
                'real_data': real_frame,
                'synthetic_data': synthetic_frame,
                'continuous_column_names': ['col1', 'missing_col'],
                'num_discrete_bins': 10,
            },
        ),
        (
            '`num_discrete_bins` must be an integer greater than zero.',
            {
                'real_data': real_frame,
                'synthetic_data': synthetic_frame,
                'continuous_column_names': ['col1'],
                'num_discrete_bins': -1,
            },
        ),
    ]

    # Run and Assert
    for message, kwargs in failing_cases:
        # ``re.escape`` keeps the parentheses, dots and backticks literal,
        # because ``match`` interprets its argument as a regular expression.
        with pytest.raises(ValueError, match=re.escape(message)):
            ContingencySimilarity._validate_inputs(**kwargs)
63
+
10
64
def test_compute (self ):
11
65
"""Test the ``compute`` method.
12
66
@@ -32,6 +86,22 @@ def test_compute(self):
32
86
# Assert
33
87
assert result == expected_score
34
88
89
def test_compute_with_discretization(self):
    """Test ``compute`` when ``col2`` is passed as a continuous column.

    ``col2`` is named in ``continuous_column_names`` with ``num_discrete_bins=4``;
    the resulting score is expected to equal ``0.25`` exactly.
    """
    # Setup
    real_frame = pd.DataFrame({'col1': [1.0, 2.4, 2.6, 0.8], 'col2': [1, 2, 3, 4]})
    synthetic_frame = pd.DataFrame({'col1': [1.0, 1.8, 2.6, 1.0], 'col2': [2, 3, 7, -10]})
    compute_options = {'continuous_column_names': ['col2'], 'num_discrete_bins': 4}

    # Run
    score = ContingencySimilarity().compute(real_frame, synthetic_frame, **compute_options)

    # Assert
    assert score == 0.25
104
+
35
105
@patch ('sdmetrics.column_pairs.statistical.contingency_similarity.ColumnPairsMetric.normalize' )
36
106
def test_normalize (self , normalize_mock ):
37
107
"""Test the ``normalize`` method.
0 commit comments