|
3 | 3 | import pandas as pd
|
4 | 4 |
|
5 | 5 |
|
6 |
| -def _validate_parameters( |
7 |
| - real_training_data, |
8 |
| - synthetic_data, |
9 |
| - real_validation_data, |
10 |
| - metadata, |
11 |
| - prediction_column_name, |
12 |
| - classifier, |
13 |
| - fixed_recall_value, |
14 |
| -): |
15 |
| - """Validate the parameters of the Data Augmentation metrics.""" |
| 6 | +def _validate_tables(real_training_data, synthetic_data, real_validation_data): |
| 7 | + """Validate the tables of the Data Augmentation metrics.""" |
16 | 8 | tables = [real_training_data, synthetic_data, real_validation_data]
|
17 | 9 | if any(not isinstance(table, pd.DataFrame) for table in tables):
|
18 | 10 | raise ValueError(
|
19 | 11 | '`real_training_data`, `synthetic_data` and `real_validation_data` must be '
|
20 | 12 | 'pandas DataFrames.'
|
21 | 13 | )
|
22 | 14 |
|
| 15 | + |
| 16 | +def _validate_metadata(metadata): |
| 17 | + """Validate the metadata of the Data Augmentation metrics.""" |
23 | 18 | if not isinstance(metadata, dict):
|
24 | 19 | raise TypeError(
|
25 | 20 | f"Expected a dictionary but received a '{type(metadata).__name__}' instead."
|
26 | 21 | " For SDV metadata objects, please use the 'to_dict' function to convert it"
|
27 | 22 | ' to a dictionary.'
|
28 | 23 | )
|
29 | 24 |
|
| 25 | + |
| 26 | +def _validate_prediction_column_name(prediction_column_name): |
| 27 | + """Validate the prediction column name of the Data Augmentation metrics.""" |
30 | 28 | if not isinstance(prediction_column_name, str):
|
31 | 29 | raise TypeError('`prediction_column_name` must be a string.')
|
32 | 30 |
|
| 31 | + |
| 32 | +def _validate_classifier(classifier): |
| 33 | + """Validate the classifier of the Data Augmentation metrics.""" |
33 | 34 | if classifier is not None and not isinstance(classifier, str):
|
34 | 35 | raise TypeError('`classifier` must be a string or None.')
|
35 | 36 |
|
36 | 37 | if classifier != 'XGBoost':
|
37 | 38 | raise ValueError('Currently only `XGBoost` is supported as classifier.')
|
38 | 39 |
|
| 40 | + |
| 41 | +def _validate_fixed_recall_value(fixed_recall_value): |
| 42 | + """Validate the fixed recall value of the Data Augmentation metrics.""" |
39 | 43 | if not isinstance(fixed_recall_value, (int, float)) or not (0 < fixed_recall_value < 1):
|
40 | 44 | raise TypeError('`fixed_recall_value` must be a float in the range (0, 1).')
|
41 | 45 |
|
42 | 46 |
|
def _validate_parameters(
    real_training_data,
    synthetic_data,
    real_validation_data,
    metadata,
    prediction_column_name,
    classifier,
    fixed_recall_value,
):
    """Run every per-parameter validator of the Data Augmentation metrics.

    The checks run in a fixed order: tables, metadata, prediction column name,
    classifier, fixed recall value. The first failing validator raises and the
    remaining ones are not run.

    Args:
        real_training_data:
            Forwarded to ``_validate_tables``.
        synthetic_data:
            Forwarded to ``_validate_tables``.
        real_validation_data:
            Forwarded to ``_validate_tables``.
        metadata:
            Forwarded to ``_validate_metadata``.
        prediction_column_name:
            Forwarded to ``_validate_prediction_column_name``.
        classifier:
            Forwarded to ``_validate_classifier``.
        fixed_recall_value:
            Forwarded to ``_validate_fixed_recall_value``.
    """
    ordered_checks = (
        (_validate_tables, (real_training_data, synthetic_data, real_validation_data)),
        (_validate_metadata, (metadata,)),
        (_validate_prediction_column_name, (prediction_column_name,)),
        (_validate_classifier, (classifier,)),
        (_validate_fixed_recall_value, (fixed_recall_value,)),
    )
    for check, arguments in ordered_checks:
        check(*arguments)
| 62 | + |
| 63 | + |
43 | 64 | def _validate_data_and_metadata(
|
44 | 65 | real_training_data,
|
45 | 66 | synthetic_data,
|
@@ -89,10 +110,11 @@ def _validate_data_and_metadata(
|
89 | 110 | synthetic_labels = set(synthetic_data[prediction_column_name].unique())
|
90 | 111 | real_labels = set(real_training_data[prediction_column_name].unique())
|
91 | 112 | if not synthetic_labels.issubset(real_labels):
|
| 113 | + to_print = "', '".join(sorted(synthetic_labels - real_labels)) |
92 | 114 | raise ValueError(
|
93 | 115 | f'The ``{prediction_column_name}`` column must have the same values in the real '
|
94 |
| - 'and synthetic data. The synthetic data has the following unseen values: ' |
95 |
| - f'{sorted(synthetic_labels - real_labels)}' |
| 116 | + 'and synthetic data. The following values are present in the synthetic data and' |
| 117 | + f" not the real data: '{to_print}'" |
96 | 118 | )
|
97 | 119 |
|
98 | 120 |
|
|
0 commit comments