From 057e73fe4643bde4a5426e48f0d1d8646934b603 Mon Sep 17 00:00:00 2001 From: sofiene26000 Date: Fri, 19 Jan 2024 14:12:51 +0100 Subject: [PATCH 01/12] first commit --- clinicadl/utils/maps_manager/maps_manager.py | 18 +-- clinicadl/utils/metric_module.py | 117 +++++++++++++++++- .../utils/task_manager/classification.py | 7 +- .../utils/task_manager/reconstruction.py | 2 +- clinicadl/utils/task_manager/regression.py | 3 +- clinicadl/utils/task_manager/task_manager.py | 16 ++- 6 files changed, 145 insertions(+), 18 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index ef776d234..289be43b8 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -1739,13 +1739,13 @@ def _test_loader( model = DDP(model) prediction_df, metrics = self.task_manager.test( - model, dataloader, criterion, use_labels=use_labels, amp=amp + model, dataloader, criterion, use_labels=use_labels, amp=amp, ci = True ) if use_labels: if network is not None: metrics[f"{self.mode}_id"] = network logger.info( - f"{self.mode} level {data_group} loss is {metrics['loss']} for model selected on {selection_metric}" + f"{self.mode} level {data_group} loss is {metrics['Metric_values'][-1]} for model selected on {selection_metric}" ) if cluster.master: @@ -2558,12 +2558,12 @@ def _mode_level_to_tsv( metrics_path = performance_dir / f"{data_group}_{self.mode}_level_metrics.tsv" if metrics is not None: if not metrics_path.is_file(): - pd.DataFrame(metrics, index=[0]).to_csv( - metrics_path, index=False, sep="\t" + pd.DataFrame(metrics).T.to_csv( + metrics_path, index=False, sep="\t", header=None ) else: - pd.DataFrame(metrics, index=[0]).to_csv( - metrics_path, index=False, sep="\t", mode="a", header=False + pd.DataFrame(metrics).T.to_csv( + metrics_path, index=False, sep="\t", mode="a", header=None ) def _ensemble_to_tsv( @@ -2612,6 +2612,10 @@ def _ensemble_to_tsv( use_labels=use_labels, ) + print(df_final) + col = df_final['true_label'] + df_final['predicted_label'] + if df_final is not None: df_final.to_csv( performance_dir / f"{data_group}_image_level_prediction.tsv", @@ -2619,7 +2623,7 @@ def _ensemble_to_tsv( sep="\t", ) if metrics is not None: - pd.DataFrame(metrics, index=[0]).to_csv( + pd.DataFrame(metrics).to_csv( performance_dir / f"{data_group}_image_level_metrics.tsv", index=False, sep="\t", diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index e9f63f681..ca0c49615 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -1,5 +1,6 @@ from logging import getLogger from typing import Dict, List +from sklearn.utils import resample import numpy as np @@ -11,6 +12,7 @@ "specificity": "max", "PPV": "max", "NPV": "max", + "F1_score": "max", "BA": "max", "PSNR": "max", "SSIM": "max", @@ -40,7 +42,49 @@ def __init__(self, metrics, n_classes=2): f"The metric {metric} is not implemented in the module." ) - def apply(self, y, y_pred): + + def compute_confidence_interval(self, + y, + y_pred, + metric_fn, + class_number=None, + confidence_level=0.95, + num_bootstrap_samples=1000): + + + """ + Compute confidence interval for a given metric using bootstrapping. + + Args: + y (array-like): True labels. + y_pred (array-like): Predicted labels. + metric_fn (callable): Metric function. + class_number (int, optional): Class number for class-specific metrics. + confidence_level (float, optional): Desired confidence level for intervals. 
+ num_bootstrap_samples (int, optional): Number of bootstrap samples. + + Returns: + Tuple[float, float, float]: Lower bound, upper bound, and standard error of the metric. + """ + bootstrap_samples = np.zeros(num_bootstrap_samples) + + for i in range(num_bootstrap_samples): + indices = np.random.choice(len(y), len(y), replace=True) + y_bootstrap, y_pred_bootstrap = y[indices], y_pred[indices] + + if class_number is not None: + bootstrap_samples[i] = metric_fn(y_bootstrap, y_pred_bootstrap, class_number) + else: + bootstrap_samples[i] = metric_fn(y_bootstrap, y_pred_bootstrap) + + lower_ci, upper_ci = np.percentile(bootstrap_samples, + [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) + + standard_error = np.std(bootstrap_samples) + + return lower_ci, upper_ci, standard_error + + def apply(self, y, y_pred, ci): """ This is a function to calculate the different metrics based on the list of true label and predicted label @@ -55,21 +99,66 @@ def apply(self, y, y_pred): y = np.array(y) y_pred = np.array(y_pred) + metric_names = ["Metrics"] + metric_values = ["Values"] # Collect metric values + lower_ci_values = ["Lower bound CI"] # Collect lower CI values + upper_ci_values = ["Upper bound CI"] # Collect upper CI values + se_values = ["SE"] # Collect standard error values for metric_key, metric_fn in self.metrics.items(): + metric_args = list(metric_fn.__code__.co_varnames) if "class_number" in metric_args and self.n_classes > 2: for class_number in range(self.n_classes): - results[f"{metric_key}-{class_number}"] = metric_fn( + if ci : + metric_result = metric_fn(y, y_pred, class_number) + lower_ci, upper_ci, standard_error = self.compute_confidence_interval(y, y_pred, metric_fn, class_number) + + metric_values.append(metric_result) + lower_ci_values.append(lower_ci) + upper_ci_values.append(upper_ci) + se_values.append(standard_error) + metric_names.append(f"{metric_key}-{class_number}") + else: + results[f"{metric_key}-{class_number}"] = metric_fn( y, y_pred, class_number ) + elif "class_number" in metric_args: - results[f"{metric_key}"] = metric_fn(y, y_pred, 0) + if ci: + metric_result = metric_fn(y, y_pred, 0) + metric_values.append(metric_result) + lower_ci, upper_ci, standard_error = self.compute_confidence_interval(y, y_pred, metric_fn, 0) + lower_ci_values.append(lower_ci) + upper_ci_values.append(upper_ci) + se_values.append(standard_error) + metric_names.append(f"{metric_key}") + else: + results[f"{metric_key}"] = metric_fn(y, y_pred, 0) + else: - results[metric_key] = metric_fn(y, y_pred) + if ci: + metric_result = metric_fn(y, y_pred) + metric_values.append(metric_result) + lower_ci, upper_ci, standard_error = self.compute_confidence_interval(y, y_pred, metric_fn) + lower_ci_values.append(lower_ci) + upper_ci_values.append(upper_ci) + se_values.append(standard_error) + metric_names.append(f"{metric_key}") + else: + results[f"{metric_key}"] = metric_fn(y, y_pred) + + if ci: + # Construct the final results dictionary + results["Metric_names"] = metric_names + results["Metric_values"] = metric_values + results["Lower_CI"] = lower_ci_values + results["Upper_CI"] = upper_ci_values + results["SE"] = se_values + else: results = dict() - return results + return results @staticmethod def mae_fn(y, y_pred): @@ -179,6 +268,24 @@ def npv_fn(y, y_pred, class_number): return true_negative / (true_negative + false_negative) else: return 0.0 + + @staticmethod + def f1_score_fn(y, y_pred, class_number): + """ + Args: + y (List): list of labels + y_pred (List): list of 
predictions + class_number (int): number of the class studied + Returns: + (float) F1 score + """ + + precision = MetricModule.ppv_fn(y, y_pred, class_number) + recall = MetricModule.sensitivity_fn(y, y_pred, class_number) + + f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0 + + return f1_score @staticmethod def ba_fn(y, y_pred, class_number): diff --git a/clinicadl/utils/task_manager/classification.py b/clinicadl/utils/task_manager/classification.py index 3265698fa..e37bc8fb5 100644 --- a/clinicadl/utils/task_manager/classification.py +++ b/clinicadl/utils/task_manager/classification.py @@ -42,7 +42,7 @@ def columns(self): @property def evaluation_metrics(self): - return ["accuracy", "sensitivity", "specificity", "PPV", "NPV", "BA"] + return ["BA", "accuracy","F1_score", "sensitivity", "specificity", "PPV", "NPV"] @property def save_outputs(self): @@ -62,10 +62,11 @@ def generate_test_row(self, idx, data, outputs): + [normalized_output[i].item() for i in range(self.n_classes)] ] - def compute_metrics(self, results_df): + def compute_metrics(self, results_df, ci): return self.metrics_module.apply( results_df.true_label.values, results_df.predicted_label.values, + ci = ci ) @staticmethod @@ -223,6 +224,8 @@ def check_prediction(row): row = [[subject, session, 0, label, prediction] + proba_list] row_df = pd.DataFrame(row, columns=self.columns) df_final = pd.concat([df_final, row_df]) + + print(df_final) if use_labels: results = self.compute_metrics(df_final) diff --git a/clinicadl/utils/task_manager/reconstruction.py b/clinicadl/utils/task_manager/reconstruction.py index ec4b73fa5..7be28b896 100644 --- a/clinicadl/utils/task_manager/reconstruction.py +++ b/clinicadl/utils/task_manager/reconstruction.py @@ -41,7 +41,7 @@ def generate_test_row(self, idx, data, outputs): row.append(metrics[metric]) return [row] - def compute_metrics(self, results_df): + def compute_metrics(self, results_df, ci = False): metrics = dict() for metric in self.evaluation_metrics: metrics[metric] = results_df[metric].mean() diff --git a/clinicadl/utils/task_manager/regression.py b/clinicadl/utils/task_manager/regression.py index 88d2f1064..7f35e908e 100644 --- a/clinicadl/utils/task_manager/regression.py +++ b/clinicadl/utils/task_manager/regression.py @@ -44,10 +44,11 @@ def generate_test_row(self, idx, data, outputs): ] ] - def compute_metrics(self, results_df): + def compute_metrics(self, results_df, ci): return self.metrics_module.apply( results_df.true_label.values, results_df.predicted_label.values, + ci = ci, ) @staticmethod diff --git a/clinicadl/utils/task_manager/task_manager.py b/clinicadl/utils/task_manager/task_manager.py index b5c7602b1..8f5b8f915 100644 --- a/clinicadl/utils/task_manager/task_manager.py +++ b/clinicadl/utils/task_manager/task_manager.py @@ -181,6 +181,7 @@ def test( criterion: _Loss, use_labels: bool = True, amp: bool = False, + ci = False, ) -> Tuple[pd.DataFrame, Dict[str, float]]: """ Computes the predictions and evaluation metrics. 
@@ -231,10 +232,20 @@ def test( if not use_labels: metrics_dict = None else: - metrics_dict = self.compute_metrics(results_df) + metrics_dict = self.compute_metrics(results_df, ci = ci) for loss_component in total_loss.keys(): dist.reduce(total_loss[loss_component], dst=0) - metrics_dict[loss_component] = total_loss[loss_component].item() + + if ci: + metrics_dict["Metric_names"].append(loss_component) + metrics_dict["Metric_values"].append(total_loss[loss_component].item()) + metrics_dict["Lower_CI"].append("N/A") + metrics_dict["Upper_CI"].append("N/A") + metrics_dict["SE"].append("N/A") + + else: + metrics_dict[loss_component] = total_loss[loss_component].item() + torch.cuda.empty_cache() return results_df, metrics_dict @@ -285,6 +296,7 @@ def test_da( else: metrics_dict = self.compute_metrics(results_df) metrics_dict["loss"] = total_loss + torch.cuda.empty_cache() return results_df, metrics_dict From ca624f75bea76494bfd61659defa49f8f503892a Mon Sep 17 00:00:00 2001 From: sofiene26000 Date: Tue, 23 Jan 2024 14:59:38 +0100 Subject: [PATCH 02/12] Add metrics and CI --- clinicadl/utils/maps_manager/maps_manager.py | 33 +++- clinicadl/utils/metric_module.py | 186 +++++++++++++++--- .../utils/task_manager/classification.py | 2 +- .../utils/task_manager/reconstruction.py | 50 ++++- clinicadl/utils/task_manager/regression.py | 2 +- clinicadl/utils/task_manager/task_manager.py | 1 - 6 files changed, 229 insertions(+), 45 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 289be43b8..cd20dae01 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -1331,6 +1331,7 @@ def _train( self.selection_metrics, amp=self.amp, network=network, + # ci = False, ) self._test_loader( valid_loader, @@ -1340,6 +1341,7 @@ def _train( self.selection_metrics, amp=self.amp, network=network, + #ci = False, ) if self.task_manager.save_outputs: @@ -1698,6 +1700,7 @@ def _test_loader( gpu=None, amp=False, network=None, + ci = True, ): """ Launches the testing task on a dataset wrapped by a DataLoader and writes prediction TSV files. 
@@ -1739,13 +1742,19 @@ def _test_loader( model = DDP(model) prediction_df, metrics = self.task_manager.test( - model, dataloader, criterion, use_labels=use_labels, amp=amp, ci = True + model, dataloader, criterion, use_labels=use_labels, amp=amp, ci = ci ) if use_labels: if network is not None: metrics[f"{self.mode}_id"] = network + + if ci: + loss_to_log = metrics['Metric_values'][-1] + else: + loss_to_log = metrics['loss'] + logger.info( - f"{self.mode} level {data_group} loss is {metrics['Metric_values'][-1]} for model selected on {selection_metric}" + f"{self.mode} level {data_group} loss is {loss_to_log} for model selected on {selection_metric}" ) if cluster.master: @@ -2557,13 +2566,24 @@ def _mode_level_to_tsv( metrics_path = performance_dir / f"{data_group}_{self.mode}_level_metrics.tsv" if metrics is not None: + + # if data_group == "train" or data_group == "validation": + # pd_metrics = pd.DataFrame(metrics, index = [0]) + # header = True + # else: + # pd_metrics = pd.DataFrame(metrics).T + # header = False + + pd_metrics = pd.DataFrame(metrics).T + header = False + #import ipdb; ipdb.set_trace() if not metrics_path.is_file(): - pd.DataFrame(metrics).T.to_csv( - metrics_path, index=False, sep="\t", header=None + pd_metrics.to_csv( + metrics_path, index=False, sep="\t", header=header ) else: - pd.DataFrame(metrics).T.to_csv( - metrics_path, index=False, sep="\t", mode="a", header=None + pd_metrics.to_csv( + metrics_path, index=False, sep="\t", mode="a", header=header ) def _ensemble_to_tsv( @@ -2612,7 +2632,6 @@ def _ensemble_to_tsv( use_labels=use_labels, ) - print(df_final) col = df_final['true_label'] df_final['predicted_label'] diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index ca0c49615..6bfe40222 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -6,7 +6,7 @@ metric_optimum = { "MAE": "min", - "MSE": "min", + "RMSE": "min", "accuracy": "max", "sensitivity": "max", "specificity": "max", @@ -43,47 +43,76 @@ def __init__(self, metrics, n_classes=2): ) - def compute_confidence_interval(self, - y, - y_pred, - metric_fn, - class_number=None, - confidence_level=0.95, - num_bootstrap_samples=1000): + # def compute_confidence_interval(self, y, y_pred, metric_fn, class_number=None, confidence_level=0.95, num_bootstrap_samples=1000): + # # Generate a matrix of random indices for bootstrapping + # indices_matrix = np.random.choice(len(y), (num_bootstrap_samples, len(y)), replace=True) - - """ - Compute confidence interval for a given metric using bootstrapping. + # # Index the true labels (y) and predicted labels (y_pred) using the generated indices matrix + # y_bootstrap_matrix, y_pred_bootstrap_matrix = y[indices_matrix], y_pred[indices_matrix] - Args: - y (array-like): True labels. - y_pred (array-like): Predicted labels. - metric_fn (callable): Metric function. - class_number (int, optional): Class number for class-specific metrics. - confidence_level (float, optional): Desired confidence level for intervals. - num_bootstrap_samples (int, optional): Number of bootstrap samples. + # # Define a lambda function to compute the metric for each bootstrap sample along axis 1 + # compute_metric = ( + # lambda x: metric_fn(x, y_pred_bootstrap_matrix, class_number) + # if class_number is not None + # else metric_fn(x, y_pred_bootstrap_matrix) + # ) - Returns: - Tuple[float, float, float]: Lower bound, upper bound, and standard error of the metric. 
- """ + # #import ipdb; ipdb.set_trace() + # # Compute the metric for each bootstrap sample along axis 1 + # bootstrap_samples = np.apply_along_axis(compute_metric, axis=1, arr=y_bootstrap_matrix) + + # # Calculate confidence interval and standard error + # lower_ci, upper_ci = np.percentile(bootstrap_samples, [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) + # standard_error = np.std(bootstrap_samples) + + # return lower_ci, upper_ci, standard_error + + # def compute_confidence_interval(self, y, y_pred, metric_fn, class_number=None, confidence_level=0.95, num_bootstrap_samples=1000): + # # Generate a matrix of random indices for bootstrapping + # indices_matrix = np.random.choice(len(y), (num_bootstrap_samples, len(y)), replace=True) + + # # Index the true labels (y) and predicted labels (y_pred) using the generated indices matrix + # y_bootstrap_matrix, y_pred_bootstrap_matrix = y[indices_matrix], y_pred[indices_matrix] + + # # Define a lambda function to compute the metric for each bootstrap sample along axis 1 + # compute_metric = ( + # lambda x, y_pred_matrix: metric_fn(x, y_pred_matrix, class_number) + # if class_number is not None + # else metric_fn(x, y_pred_matrix) + # ) + + # # Apply the function to each pair of rows in y_bootstrap_matrix and y_pred_bootstrap_matrix + # bootstrap_samples = np.apply_along_axis(compute_metric, axis=1, arr=y_bootstrap_matrix, y_pred_matrix=y_pred_bootstrap_matrix) + + # # Calculate confidence interval and standard error + # lower_ci, upper_ci = np.percentile(bootstrap_samples, [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) + # standard_error = np.std(bootstrap_samples) + + # return lower_ci, upper_ci, standard_error + + def compute_confidence_interval(self, y, y_pred, metric_fn, class_number=None, confidence_level=0.95, num_bootstrap_samples=3000): + bootstrap_samples = np.zeros(num_bootstrap_samples) for i in range(num_bootstrap_samples): indices = np.random.choice(len(y), len(y), replace=True) + + y_bootstrap, y_pred_bootstrap = y[indices], y_pred[indices] if class_number is not None: - bootstrap_samples[i] = metric_fn(y_bootstrap, y_pred_bootstrap, class_number) + metric_result = metric_fn(y_bootstrap, y_pred_bootstrap, class_number) else: - bootstrap_samples[i] = metric_fn(y_bootstrap, y_pred_bootstrap) + metric_result = metric_fn(y_bootstrap, y_pred_bootstrap) - lower_ci, upper_ci = np.percentile(bootstrap_samples, - [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) - + bootstrap_samples[i] = metric_result + + lower_ci, upper_ci = np.percentile(bootstrap_samples, [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) standard_error = np.std(bootstrap_samples) return lower_ci, upper_ci, standard_error + def apply(self, y, y_pred, ci): """ This is a function to calculate the different metrics based on the list of true label and predicted label @@ -91,6 +120,7 @@ def apply(self, y, y_pred, ci): Args: y (List): list of labels y_pred (List): list of predictions + ci (bool) : If True confidence intervals are reported Returns: (Dict[str:float]) metrics results """ @@ -104,12 +134,13 @@ def apply(self, y, y_pred, ci): lower_ci_values = ["Lower bound CI"] # Collect lower CI values upper_ci_values = ["Upper bound CI"] # Collect upper CI values se_values = ["SE"] # Collect standard error values + for metric_key, metric_fn in self.metrics.items(): metric_args = list(metric_fn.__code__.co_varnames) if "class_number" in metric_args and self.n_classes > 2: for class_number in 
range(self.n_classes): - if ci : + if ci : metric_result = metric_fn(y, y_pred, class_number) lower_ci, upper_ci, standard_error = self.compute_confidence_interval(y, y_pred, metric_fn, class_number) @@ -173,16 +204,35 @@ def mae_fn(y, y_pred): return np.mean(np.abs(y - y_pred)) @staticmethod - def mse_fn(y, y_pred): + def rmse_fn(y, y_pred): """ Args: y (List): list of labels y_pred (List): list of predictions Returns: - (float) mean squared error + (float) root mean squared error """ - return np.mean(np.square(y - y_pred)) + return np.sqrt(np.mean(np.square(y - y_pred))) + + @staticmethod + def r2_score_fn(y, y_pred): + """ + Calculate the R-squared (coefficient of determination) score. + + Args: + y (List): List of actual labels + y_pred (List): List of predicted labels + + Returns: + (float) R-squared score + """ + mean_y = np.mean(y) + total_sum_squares = np.sum((y - mean_y) ** 2) + residual_sum_squares = np.sum((y - y_pred) ** 2) + r2_score = 1 - (residual_sum_squares / total_sum_squares) if total_sum_squares != 0 else 0 + + return r2_score @staticmethod def accuracy_fn(y, y_pred): @@ -303,6 +353,82 @@ def ba_fn(y, y_pred, class_number): + MetricModule.specificity_fn(y, y_pred, class_number) ) / 2 + @staticmethod + def mcc_fn(y, y_pred, class_number): + """ + Calculate the Matthews correlation coefficient (MCC) for a specific class. + + Args: + y (List): List of actual labels + y_pred (List): List of predicted labels + class_number (int): Number of the class studied + + Returns: + (float) Matthews correlation coefficient for the specified class + """ + true_positive = np.sum((y_pred == class_number) & (y == class_number)) + true_negative = np.sum((y_pred != class_number) & (y != class_number)) + false_positive = np.sum((y_pred == class_number) & (y != class_number)) + false_negative = np.sum((y_pred != class_number) & (y == class_number)) + denominator = np.sqrt((true_positive + false_positive) * (true_positive + false_negative) * (true_negative + false_positive) * (true_negative + false_negative)) + mcc = (true_positive * true_negative - false_positive * false_negative) / denominator if denominator != 0 else 0 + return mcc + + @staticmethod + def mk_fn(y, y_pred, class_number): + """ + Calculate Markedness (MK) for a specific class. + + Args: + y (List): List of actual labels + y_pred (List): List of predicted labels + class_number (int): Number of the class studied + + Returns: + (float) Markedness for the specified class + """ + precision = MetricModule.ppv_fn(y, y_pred, class_number) + npv = MetricModule.npv_fn(y, y_pred, class_number) + mk = precision + npv - 1 + return mk + + + @staticmethod + def lr_plus_fn(y, y_pred, class_number): + """ + Calculate Positive Likelihood Ratio (LR+). + + Args: + y (List): List of actual labels + y_pred (List): List of predicted labels + class_number (int): Number of the class studied + + Returns: + (float) Positive Likelihood Ratio + """ + sensitivity = MetricModule.sensitivity_fn(y, y_pred, class_number) + specificity = MetricModule.specificity_fn(y, y_pred, class_number) + lr_plus = sensitivity / (1 - specificity) if (1 - specificity) != 0 else 0 + return lr_plus + + @staticmethod + def lr_minus_fn(y, y_pred, class_number): + """ + Calculate Negative Likelihood Ratio (LR-). 
+ + Args: + y (List): List of actual labels + y_pred (List): List of predicted labels + class_number (int): Number of the class studied + + Returns: + (float) Negative Likelihood Ratio + """ + sensitivity = MetricModule.sensitivity_fn(y, y_pred, class_number) + specificity = MetricModule.specificity_fn(y, y_pred, class_number) + lr_minus = (1 - sensitivity) / specificity if specificity != 0 else 0 + return lr_minus + @staticmethod def confusion_matrix_fn(y, y_pred): """ diff --git a/clinicadl/utils/task_manager/classification.py b/clinicadl/utils/task_manager/classification.py index e37bc8fb5..3dbf36734 100644 --- a/clinicadl/utils/task_manager/classification.py +++ b/clinicadl/utils/task_manager/classification.py @@ -42,7 +42,7 @@ def columns(self): @property def evaluation_metrics(self): - return ["BA", "accuracy","F1_score", "sensitivity", "specificity", "PPV", "NPV"] + return ["BA", "accuracy","F1_score", "sensitivity", "specificity", "PPV", "NPV", "MCC", "MK", "LR_plus", "LR_minus"] @property def save_outputs(self): diff --git a/clinicadl/utils/task_manager/reconstruction.py b/clinicadl/utils/task_manager/reconstruction.py index 7be28b896..7a21b985c 100644 --- a/clinicadl/utils/task_manager/reconstruction.py +++ b/clinicadl/utils/task_manager/reconstruction.py @@ -22,7 +22,7 @@ def columns(self): @property def evaluation_metrics(self): - return ["MSE", "MAE", "PSNR", "SSIM"] + return ["MAE", "RMSE", "PSNR", "SSIM"] @property def save_outputs(self): @@ -31,7 +31,7 @@ def save_outputs(self): def generate_test_row(self, idx, data, outputs): y = data["image"][idx] y_pred = outputs[idx].cpu() - metrics = self.metrics_module.apply(y, y_pred) + metrics = self.metrics_module.apply(y, y_pred, ci=False) row = [ data["participant_id"][idx], data["session_id"][idx], @@ -42,10 +42,50 @@ def generate_test_row(self, idx, data, outputs): return [row] def compute_metrics(self, results_df, ci = False): + metrics = dict() - for metric in self.evaluation_metrics: - metrics[metric] = results_df[metric].mean() - return metrics + + if ci: + + from scipy.stats import bootstrap + from numpy import mean as np_mean + + metric_names = ["Metrics"] + metric_values = ["Values"] + lower_ci_values = ["Lower bound CI"] + upper_ci_values = ["Upper bound CI"] + se_values = ["SE"] + + for metric in self.evaluation_metrics: + + metric_vals = results_df[metric] + + metric_result = metric_vals.mean() + + metric_vals = (metric_vals, ) + res = bootstrap(metric_vals, np_mean, confidence_level=0.95, method="percentile") + lower_ci, upper_ci = res.confidence_interval + standard_error = res.standard_error + + metric_names.append(metric) + metric_values.append(metric_result) + lower_ci_values.append(lower_ci) + upper_ci_values.append(upper_ci) + se_values.append(standard_error) + + metrics["Metric_names"] = metric_names + metrics["Metric_values"] = metric_values + metrics["Lower_CI"] = lower_ci_values + metrics["Upper_CI"] = upper_ci_values + metrics["SE"] = se_values + + return metrics + + else: + for metric in self.evaluation_metrics: + metrics[metric] = results_df[metric].mean() + return metrics + @staticmethod def output_size(input_size, df, label): diff --git a/clinicadl/utils/task_manager/regression.py b/clinicadl/utils/task_manager/regression.py index 7f35e908e..473e8b2f8 100644 --- a/clinicadl/utils/task_manager/regression.py +++ b/clinicadl/utils/task_manager/regression.py @@ -27,7 +27,7 @@ def columns(self): @property def evaluation_metrics(self): - return ["MSE", "MAE"] + return ["R2_score", "MAE", "RMSE"] @property def 
save_outputs(self): diff --git a/clinicadl/utils/task_manager/task_manager.py b/clinicadl/utils/task_manager/task_manager.py index 8f5b8f915..af6f2952f 100644 --- a/clinicadl/utils/task_manager/task_manager.py +++ b/clinicadl/utils/task_manager/task_manager.py @@ -235,7 +235,6 @@ def test( metrics_dict = self.compute_metrics(results_df, ci = ci) for loss_component in total_loss.keys(): dist.reduce(total_loss[loss_component], dst=0) - if ci: metrics_dict["Metric_names"].append(loss_component) metrics_dict["Metric_values"].append(total_loss[loss_component].item()) From 13075994a796910b519f441a92e741fc88373d72 Mon Sep 17 00:00:00 2001 From: sofiene26000 Date: Mon, 29 Jan 2024 15:35:32 +0100 Subject: [PATCH 03/12] Add metrics (R2 score, MCC, MK, lr-, lr+) and non parametric (bootstrap) confidence intervals --- clinicadl/utils/maps_manager/maps_manager.py | 18 ++- clinicadl/utils/metric_module.py | 153 ++++-------------- .../utils/task_manager/classification.py | 4 +- .../utils/task_manager/reconstruction.py | 13 +- clinicadl/utils/task_manager/regression.py | 4 +- clinicadl/utils/task_manager/task_manager.py | 19 ++- 6 files changed, 74 insertions(+), 137 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index cd20dae01..a35f0e69b 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -1331,7 +1331,6 @@ def _train( self.selection_metrics, amp=self.amp, network=network, - # ci = False, ) self._test_loader( valid_loader, @@ -1341,7 +1340,6 @@ def _train( self.selection_metrics, amp=self.amp, network=network, - #ci = False, ) if self.task_manager.save_outputs: @@ -1700,7 +1698,7 @@ def _test_loader( gpu=None, amp=False, network=None, - ci = True, + report_ci = True, ): """ Launches the testing task on a dataset wrapped by a DataLoader and writes prediction TSV files. @@ -1742,13 +1740,13 @@ def _test_loader( model = DDP(model) prediction_df, metrics = self.task_manager.test( - model, dataloader, criterion, use_labels=use_labels, amp=amp, ci = ci + model, dataloader, criterion, use_labels=use_labels, amp=amp, report_ci = report_ci ) if use_labels: if network is not None: metrics[f"{self.mode}_id"] = network - if ci: + if report_ci: loss_to_log = metrics['Metric_values'][-1] else: loss_to_log = metrics['loss'] @@ -1779,6 +1777,7 @@ def _test_loader_ssda( gpu=None, network=None, target=False, + report_ci = True, ): """ Launches the testing task on a dataset wrapped by a DataLoader and writes prediction TSV files. @@ -1820,12 +1819,19 @@ def _test_loader_ssda( dataloader, criterion, target=target, + report_ci = report_ci ) if use_labels: if network is not None: metrics[f"{self.mode}_id"] = network + + if report_ci: + loss_to_log = metrics['Metric_values'][-1] + else: + loss_to_log = metrics['loss'] + logger.info( - f"{self.mode} level {data_group} loss is {metrics['loss']} for model selected on {selection_metric}" + f"{self.mode} level {data_group} loss is {loss_to_log} for model selected on {selection_metric}" ) # Replace here diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index 6bfe40222..d2e13b784 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -41,86 +41,15 @@ def __init__(self, metrics, n_classes=2): raise NotImplementedError( f"The metric {metric} is not implemented in the module." 
) - - - # def compute_confidence_interval(self, y, y_pred, metric_fn, class_number=None, confidence_level=0.95, num_bootstrap_samples=1000): - # # Generate a matrix of random indices for bootstrapping - # indices_matrix = np.random.choice(len(y), (num_bootstrap_samples, len(y)), replace=True) - - # # Index the true labels (y) and predicted labels (y_pred) using the generated indices matrix - # y_bootstrap_matrix, y_pred_bootstrap_matrix = y[indices_matrix], y_pred[indices_matrix] - - # # Define a lambda function to compute the metric for each bootstrap sample along axis 1 - # compute_metric = ( - # lambda x: metric_fn(x, y_pred_bootstrap_matrix, class_number) - # if class_number is not None - # else metric_fn(x, y_pred_bootstrap_matrix) - # ) - - # #import ipdb; ipdb.set_trace() - # # Compute the metric for each bootstrap sample along axis 1 - # bootstrap_samples = np.apply_along_axis(compute_metric, axis=1, arr=y_bootstrap_matrix) - - # # Calculate confidence interval and standard error - # lower_ci, upper_ci = np.percentile(bootstrap_samples, [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) - # standard_error = np.std(bootstrap_samples) - - # return lower_ci, upper_ci, standard_error - - # def compute_confidence_interval(self, y, y_pred, metric_fn, class_number=None, confidence_level=0.95, num_bootstrap_samples=1000): - # # Generate a matrix of random indices for bootstrapping - # indices_matrix = np.random.choice(len(y), (num_bootstrap_samples, len(y)), replace=True) - - # # Index the true labels (y) and predicted labels (y_pred) using the generated indices matrix - # y_bootstrap_matrix, y_pred_bootstrap_matrix = y[indices_matrix], y_pred[indices_matrix] - - # # Define a lambda function to compute the metric for each bootstrap sample along axis 1 - # compute_metric = ( - # lambda x, y_pred_matrix: metric_fn(x, y_pred_matrix, class_number) - # if class_number is not None - # else metric_fn(x, y_pred_matrix) - # ) - - # # Apply the function to each pair of rows in y_bootstrap_matrix and y_pred_bootstrap_matrix - # bootstrap_samples = np.apply_along_axis(compute_metric, axis=1, arr=y_bootstrap_matrix, y_pred_matrix=y_pred_bootstrap_matrix) - - # # Calculate confidence interval and standard error - # lower_ci, upper_ci = np.percentile(bootstrap_samples, [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) - # standard_error = np.std(bootstrap_samples) - - # return lower_ci, upper_ci, standard_error - - def compute_confidence_interval(self, y, y_pred, metric_fn, class_number=None, confidence_level=0.95, num_bootstrap_samples=3000): - - bootstrap_samples = np.zeros(num_bootstrap_samples) - - for i in range(num_bootstrap_samples): - indices = np.random.choice(len(y), len(y), replace=True) - - - y_bootstrap, y_pred_bootstrap = y[indices], y_pred[indices] - - if class_number is not None: - metric_result = metric_fn(y_bootstrap, y_pred_bootstrap, class_number) - else: - metric_result = metric_fn(y_bootstrap, y_pred_bootstrap) - - bootstrap_samples[i] = metric_result - - lower_ci, upper_ci = np.percentile(bootstrap_samples, [(1 - confidence_level) / 2 * 100, (1 + confidence_level) / 2 * 100]) - standard_error = np.std(bootstrap_samples) - - return lower_ci, upper_ci, standard_error - - - def apply(self, y, y_pred, ci): + + def apply(self, y, y_pred, report_ci): """ This is a function to calculate the different metrics based on the list of true label and predicted label Args: y (List): list of labels y_pred (List): list of predictions - ci (bool) : If True 
confidence intervals are reported + report_ci (bool) : If True confidence intervals are reported Returns: (Dict[str:float]) metrics results """ @@ -129,6 +58,9 @@ def apply(self, y, y_pred, ci): y = np.array(y) y_pred = np.array(y_pred) + if report_ci: + from scipy.stats import bootstrap + metric_names = ["Metrics"] metric_values = ["Values"] # Collect metric values lower_ci_values = ["Lower bound CI"] # Collect lower CI values @@ -138,61 +70,46 @@ def apply(self, y, y_pred, ci): for metric_key, metric_fn in self.metrics.items(): metric_args = list(metric_fn.__code__.co_varnames) - if "class_number" in metric_args and self.n_classes > 2: - for class_number in range(self.n_classes): - if ci : - metric_result = metric_fn(y, y_pred, class_number) - lower_ci, upper_ci, standard_error = self.compute_confidence_interval(y, y_pred, metric_fn, class_number) - - metric_values.append(metric_result) - lower_ci_values.append(lower_ci) - upper_ci_values.append(upper_ci) - se_values.append(standard_error) - metric_names.append(f"{metric_key}-{class_number}") - else: - results[f"{metric_key}-{class_number}"] = metric_fn( - y, y_pred, class_number - ) - - elif "class_number" in metric_args: - if ci: - metric_result = metric_fn(y, y_pred, 0) - metric_values.append(metric_result) - lower_ci, upper_ci, standard_error = self.compute_confidence_interval(y, y_pred, metric_fn, 0) - lower_ci_values.append(lower_ci) - upper_ci_values.append(upper_ci) - se_values.append(standard_error) - metric_names.append(f"{metric_key}") - else: - results[f"{metric_key}"] = metric_fn(y, y_pred, 0) - else: - if ci: - metric_result = metric_fn(y, y_pred) + class_numbers = range(self.n_classes) if "class_number" in metric_args and self.n_classes > 2 else [0] + + for class_number in class_numbers: + + metric_result = metric_fn(y, y_pred, class_number) + + if report_ci: + res = bootstrap((y, y_pred), + lambda y, y_pred : metric_fn(y, y_pred, class_number), + n_resamples = 3000, + confidence_level=0.95, + method="percentile", + paired=True) + + lower_ci, upper_ci = res.confidence_interval + standard_error = res.standard_error + metric_values.append(metric_result) - lower_ci, upper_ci, standard_error = self.compute_confidence_interval(y, y_pred, metric_fn) lower_ci_values.append(lower_ci) upper_ci_values.append(upper_ci) se_values.append(standard_error) - metric_names.append(f"{metric_key}") + metric_names.append(f"{metric_key}-{class_number}" if len(class_numbers) > 1 else f"{metric_key}") else: - results[f"{metric_key}"] = metric_fn(y, y_pred) + results[f"{metric_key}-{class_number}" if len(class_numbers) > 1 else f"{metric_key}"] = metric_result - if ci: + if report_ci: # Construct the final results dictionary results["Metric_names"] = metric_names results["Metric_values"] = metric_values results["Lower_CI"] = lower_ci_values results["Upper_CI"] = upper_ci_values results["SE"] = se_values - else: results = dict() - return results + return results @staticmethod - def mae_fn(y, y_pred): + def mae_fn(y, y_pred, *args): """ Args: y (List): list of labels @@ -204,7 +121,7 @@ def mae_fn(y, y_pred): return np.mean(np.abs(y - y_pred)) @staticmethod - def rmse_fn(y, y_pred): + def rmse_fn(y, y_pred, *args): """ Args: y (List): list of labels @@ -216,7 +133,7 @@ def rmse_fn(y, y_pred): return np.sqrt(np.mean(np.square(y - y_pred))) @staticmethod - def r2_score_fn(y, y_pred): + def r2_score_fn(y, y_pred, *args): """ Calculate the R-squared (coefficient of determination) score. 
@@ -235,7 +152,7 @@ def r2_score_fn(y, y_pred): return r2_score @staticmethod - def accuracy_fn(y, y_pred): + def accuracy_fn(y, y_pred, *args): """ Args: y (List): list of labels @@ -430,7 +347,7 @@ def lr_minus_fn(y, y_pred, class_number): return lr_minus @staticmethod - def confusion_matrix_fn(y, y_pred): + def confusion_matrix_fn(y, y_pred, *args): """ Args: y (List): list of labels @@ -451,7 +368,7 @@ def confusion_matrix_fn(y, y_pred): } @staticmethod - def ssim_fn(y, y_pred): + def ssim_fn(y, y_pred, *args): """ Args: y (List): list of labels @@ -467,7 +384,7 @@ def ssim_fn(y, y_pred): return ssim3D(y, y_pred).item() @staticmethod - def psnr_fn(y, y_pred): + def psnr_fn(y, y_pred, *args): """ Args: y (List): list of labels @@ -480,7 +397,7 @@ def psnr_fn(y, y_pred): return peak_signal_noise_ratio(y, y_pred) @staticmethod - def lncc_fn(y, y_pred): + def lncc_fn(y, y_pred, *args): """ Args: y (List): list of labels diff --git a/clinicadl/utils/task_manager/classification.py b/clinicadl/utils/task_manager/classification.py index 3dbf36734..28a86e2e8 100644 --- a/clinicadl/utils/task_manager/classification.py +++ b/clinicadl/utils/task_manager/classification.py @@ -62,11 +62,11 @@ def generate_test_row(self, idx, data, outputs): + [normalized_output[i].item() for i in range(self.n_classes)] ] - def compute_metrics(self, results_df, ci): + def compute_metrics(self, results_df, report_ci): return self.metrics_module.apply( results_df.true_label.values, results_df.predicted_label.values, - ci = ci + report_ci = report_ci ) @staticmethod diff --git a/clinicadl/utils/task_manager/reconstruction.py b/clinicadl/utils/task_manager/reconstruction.py index 7a21b985c..c7f6d06af 100644 --- a/clinicadl/utils/task_manager/reconstruction.py +++ b/clinicadl/utils/task_manager/reconstruction.py @@ -31,21 +31,22 @@ def save_outputs(self): def generate_test_row(self, idx, data, outputs): y = data["image"][idx] y_pred = outputs[idx].cpu() - metrics = self.metrics_module.apply(y, y_pred, ci=False) + metrics = self.metrics_module.apply(y, y_pred, report_ci=False) row = [ data["participant_id"][idx], data["session_id"][idx], data[f"{self.mode}_id"][idx].item(), ] + for metric in self.evaluation_metrics: row.append(metrics[metric]) return [row] - def compute_metrics(self, results_df, ci = False): + def compute_metrics(self, results_df, report_ci = False): metrics = dict() - if ci: + if report_ci: from scipy.stats import bootstrap from numpy import mean as np_mean @@ -63,7 +64,11 @@ def compute_metrics(self, results_df, ci = False): metric_result = metric_vals.mean() metric_vals = (metric_vals, ) - res = bootstrap(metric_vals, np_mean, confidence_level=0.95, method="percentile") + res = bootstrap(metric_vals, + np_mean, + n_resamples = 3000, + confidence_level=0.95, + method="percentile") lower_ci, upper_ci = res.confidence_interval standard_error = res.standard_error diff --git a/clinicadl/utils/task_manager/regression.py b/clinicadl/utils/task_manager/regression.py index 473e8b2f8..b17088bb0 100644 --- a/clinicadl/utils/task_manager/regression.py +++ b/clinicadl/utils/task_manager/regression.py @@ -44,11 +44,11 @@ def generate_test_row(self, idx, data, outputs): ] ] - def compute_metrics(self, results_df, ci): + def compute_metrics(self, results_df, report_ci): return self.metrics_module.apply( results_df.true_label.values, results_df.predicted_label.values, - ci = ci, + report_ci = report_ci, ) @staticmethod diff --git a/clinicadl/utils/task_manager/task_manager.py 
b/clinicadl/utils/task_manager/task_manager.py index af6f2952f..5c825c7bc 100644 --- a/clinicadl/utils/task_manager/task_manager.py +++ b/clinicadl/utils/task_manager/task_manager.py @@ -181,7 +181,7 @@ def test( criterion: _Loss, use_labels: bool = True, amp: bool = False, - ci = False, + report_ci = False, ) -> Tuple[pd.DataFrame, Dict[str, float]]: """ Computes the predictions and evaluation metrics. @@ -232,10 +232,10 @@ def test( if not use_labels: metrics_dict = None else: - metrics_dict = self.compute_metrics(results_df, ci = ci) + metrics_dict = self.compute_metrics(results_df, report_ci = report_ci) for loss_component in total_loss.keys(): dist.reduce(total_loss[loss_component], dst=0) - if ci: + if report_ci: metrics_dict["Metric_names"].append(loss_component) metrics_dict["Metric_values"].append(total_loss[loss_component].item()) metrics_dict["Lower_CI"].append("N/A") @@ -257,6 +257,7 @@ def test_da( alpha: float = 0, use_labels: bool = True, target: bool = True, + report_ci = False, ) -> Tuple[pd.DataFrame, Dict[str, float]]: """ Computes the predictions and evaluation metrics. @@ -293,8 +294,16 @@ def test_da( if not use_labels: metrics_dict = None else: - metrics_dict = self.compute_metrics(results_df) - metrics_dict["loss"] = total_loss + metrics_dict = self.compute_metrics(results_df, report_ci = report_ci) + if report_ci: + metrics_dict["Metric_names"].append("loss") + metrics_dict["Metric_values"].append(total_loss) + metrics_dict["Lower_CI"].append("N/A") + metrics_dict["Upper_CI"].append("N/A") + metrics_dict["SE"].append("N/A") + + else: + metrics_dict["loss"] = total_loss torch.cuda.empty_cache() From cb5167e9e6a9d60ae5265dc2fb80873ce73909b8 Mon Sep 17 00:00:00 2001 From: sofiene26000 Date: Fri, 9 Feb 2024 16:18:03 +0100 Subject: [PATCH 04/12] Run test --- clinicadl/utils/maps_manager/maps_manager.py | 42 +++++----- clinicadl/utils/metric_module.py | 78 +++++++++++++------ .../utils/task_manager/classification.py | 20 +++-- .../utils/task_manager/reconstruction.py | 47 +++++------ clinicadl/utils/task_manager/regression.py | 4 +- clinicadl/utils/task_manager/task_manager.py | 16 ++-- 6 files changed, 126 insertions(+), 81 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index a35f0e69b..8e8484bf2 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -1698,7 +1698,7 @@ def _test_loader( gpu=None, amp=False, network=None, - report_ci = True, + report_ci=True, ): """ Launches the testing task on a dataset wrapped by a DataLoader and writes prediction TSV files. @@ -1740,16 +1740,20 @@ def _test_loader( model = DDP(model) prediction_df, metrics = self.task_manager.test( - model, dataloader, criterion, use_labels=use_labels, amp=amp, report_ci = report_ci + model, + dataloader, + criterion, + use_labels=use_labels, + amp=amp, + report_ci=report_ci, ) if use_labels: if network is not None: metrics[f"{self.mode}_id"] = network - if report_ci: - loss_to_log = metrics['Metric_values'][-1] - else: - loss_to_log = metrics['loss'] + loss_to_log = ( + metrics["Metric_values"][-1] if report_ci else metrics["loss"] + ) logger.info( f"{self.mode} level {data_group} loss is {loss_to_log} for model selected on {selection_metric}" @@ -1777,7 +1781,7 @@ def _test_loader_ssda( gpu=None, network=None, target=False, - report_ci = True, + report_ci=True, ): """ Launches the testing task on a dataset wrapped by a DataLoader and writes prediction TSV files. 
@@ -1815,20 +1819,16 @@ def _test_loader_ssda( network=network, ) prediction_df, metrics = self.task_manager.test_da( - model, - dataloader, - criterion, - target=target, - report_ci = report_ci + model, dataloader, criterion, target=target, report_ci=report_ci ) if use_labels: if network is not None: metrics[f"{self.mode}_id"] = network if report_ci: - loss_to_log = metrics['Metric_values'][-1] + loss_to_log = metrics["Metric_values"][-1] else: - loss_to_log = metrics['loss'] + loss_to_log = metrics["loss"] logger.info( f"{self.mode} level {data_group} loss is {loss_to_log} for model selected on {selection_metric}" @@ -2055,6 +2055,7 @@ def _ensemble_prediction( selection_metrics = self._find_selection_metrics(split) for selection_metric in selection_metrics: + ##################### # Soft voting if self.num_networks > 1: self._ensemble_to_tsv( @@ -2572,7 +2573,6 @@ def _mode_level_to_tsv( metrics_path = performance_dir / f"{data_group}_{self.mode}_level_metrics.tsv" if metrics is not None: - # if data_group == "train" or data_group == "validation": # pd_metrics = pd.DataFrame(metrics, index = [0]) # header = True @@ -2582,11 +2582,9 @@ def _mode_level_to_tsv( pd_metrics = pd.DataFrame(metrics).T header = False - #import ipdb; ipdb.set_trace() + # import ipdb; ipdb.set_trace() if not metrics_path.is_file(): - pd_metrics.to_csv( - metrics_path, index=False, sep="\t", header=header - ) + pd_metrics.to_csv(metrics_path, index=False, sep="\t", header=header) else: pd_metrics.to_csv( metrics_path, index=False, sep="\t", mode="a", header=header @@ -2638,8 +2636,8 @@ def _ensemble_to_tsv( use_labels=use_labels, ) - col = df_final['true_label'] - df_final['predicted_label'] + col = df_final["true_label"] + df_final["predicted_label"] if df_final is not None: df_final.to_csv( @@ -2648,7 +2646,7 @@ def _ensemble_to_tsv( sep="\t", ) if metrics is not None: - pd.DataFrame(metrics).to_csv( + pd.DataFrame(metrics, index=[0]).to_csv( performance_dir / f"{data_group}_image_level_metrics.tsv", index=False, sep="\t", diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index d2e13b784..04f8f9d28 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -1,8 +1,8 @@ from logging import getLogger from typing import Dict, List -from sklearn.utils import resample import numpy as np +from sklearn.utils import resample metric_optimum = { "MAE": "min", @@ -41,7 +41,7 @@ def __init__(self, metrics, n_classes=2): raise NotImplementedError( f"The metric {metric} is not implemented in the module." 
) - + def apply(self, y, y_pred, report_ci): """ This is a function to calculate the different metrics based on the list of true label and predicted label @@ -66,24 +66,31 @@ def apply(self, y, y_pred, report_ci): lower_ci_values = ["Lower bound CI"] # Collect lower CI values upper_ci_values = ["Upper bound CI"] # Collect upper CI values se_values = ["SE"] # Collect standard error values - + for metric_key, metric_fn in self.metrics.items(): - metric_args = list(metric_fn.__code__.co_varnames) - class_numbers = range(self.n_classes) if "class_number" in metric_args and self.n_classes > 2 else [0] + class_numbers = ( + range(self.n_classes) + if "class_number" in metric_args and self.n_classes > 2 + else [0] + ) for class_number in class_numbers: - metric_result = metric_fn(y, y_pred, class_number) - if report_ci: - res = bootstrap((y, y_pred), - lambda y, y_pred : metric_fn(y, y_pred, class_number), - n_resamples = 3000, - confidence_level=0.95, - method="percentile", - paired=True) + if ( + report_ci and len(y) >= 2 + ): # Compute confidence intervals only if there are at least two samples in the data. + + res = bootstrap( + (y, y_pred), + lambda y, y_pred: metric_fn(y, y_pred, class_number), + n_resamples=3000, + confidence_level=0.95, + method="percentile", + paired=True, + ) lower_ci, upper_ci = res.confidence_interval standard_error = res.standard_error @@ -92,9 +99,17 @@ def apply(self, y, y_pred, report_ci): lower_ci_values.append(lower_ci) upper_ci_values.append(upper_ci) se_values.append(standard_error) - metric_names.append(f"{metric_key}-{class_number}" if len(class_numbers) > 1 else f"{metric_key}") + metric_names.append( + f"{metric_key}-{class_number}" + if len(class_numbers) > 1 + else f"{metric_key}" + ) else: - results[f"{metric_key}-{class_number}" if len(class_numbers) > 1 else f"{metric_key}"] = metric_result + results[ + f"{metric_key}-{class_number}" + if len(class_numbers) > 1 + else f"{metric_key}" + ] = metric_result if report_ci: # Construct the final results dictionary @@ -131,7 +146,7 @@ def rmse_fn(y, y_pred, *args): """ return np.sqrt(np.mean(np.square(y - y_pred))) - + @staticmethod def r2_score_fn(y, y_pred, *args): """ @@ -147,7 +162,11 @@ def r2_score_fn(y, y_pred, *args): mean_y = np.mean(y) total_sum_squares = np.sum((y - mean_y) ** 2) residual_sum_squares = np.sum((y - y_pred) ** 2) - r2_score = 1 - (residual_sum_squares / total_sum_squares) if total_sum_squares != 0 else 0 + r2_score = ( + 1 - (residual_sum_squares / total_sum_squares) + if total_sum_squares != 0 + else 0 + ) return r2_score @@ -235,7 +254,7 @@ def npv_fn(y, y_pred, class_number): return true_negative / (true_negative + false_negative) else: return 0.0 - + @staticmethod def f1_score_fn(y, y_pred, class_number): """ @@ -246,11 +265,15 @@ def f1_score_fn(y, y_pred, class_number): Returns: (float) F1 score """ - + precision = MetricModule.ppv_fn(y, y_pred, class_number) recall = MetricModule.sensitivity_fn(y, y_pred, class_number) - f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0 + f1_score = ( + 2 * (precision * recall) / (precision + recall) + if (precision + recall) != 0 + else 0 + ) return f1_score @@ -287,8 +310,18 @@ def mcc_fn(y, y_pred, class_number): true_negative = np.sum((y_pred != class_number) & (y != class_number)) false_positive = np.sum((y_pred == class_number) & (y != class_number)) false_negative = np.sum((y_pred != class_number) & (y == class_number)) - denominator = np.sqrt((true_positive + false_positive) * (true_positive 
+ false_negative) * (true_negative + false_positive) * (true_negative + false_negative)) - mcc = (true_positive * true_negative - false_positive * false_negative) / denominator if denominator != 0 else 0 + denominator = np.sqrt( + (true_positive + false_positive) + * (true_positive + false_negative) + * (true_negative + false_positive) + * (true_negative + false_negative) + ) + mcc = ( + (true_positive * true_negative - false_positive * false_negative) + / denominator + if denominator != 0 + else 0 + ) return mcc @staticmethod @@ -309,7 +342,6 @@ def mk_fn(y, y_pred, class_number): mk = precision + npv - 1 return mk - @staticmethod def lr_plus_fn(y, y_pred, class_number): """ diff --git a/clinicadl/utils/task_manager/classification.py b/clinicadl/utils/task_manager/classification.py index 28a86e2e8..4b95d06f1 100644 --- a/clinicadl/utils/task_manager/classification.py +++ b/clinicadl/utils/task_manager/classification.py @@ -42,7 +42,19 @@ def columns(self): @property def evaluation_metrics(self): - return ["BA", "accuracy","F1_score", "sensitivity", "specificity", "PPV", "NPV", "MCC", "MK", "LR_plus", "LR_minus"] + return [ + "BA", + "accuracy", + "F1_score", + "sensitivity", + "specificity", + "PPV", + "NPV", + "MCC", + "MK", + "LR_plus", + "LR_minus", + ] @property def save_outputs(self): @@ -66,7 +78,7 @@ def compute_metrics(self, results_df, report_ci): return self.metrics_module.apply( results_df.true_label.values, results_df.predicted_label.values, - report_ci = report_ci + report_ci=report_ci, ) @staticmethod @@ -224,11 +236,9 @@ def check_prediction(row): row = [[subject, session, 0, label, prediction] + proba_list] row_df = pd.DataFrame(row, columns=self.columns) df_final = pd.concat([df_final, row_df]) - - print(df_final) if use_labels: - results = self.compute_metrics(df_final) + results = self.compute_metrics(df_final, report_ci=False) else: results = None diff --git a/clinicadl/utils/task_manager/reconstruction.py b/clinicadl/utils/task_manager/reconstruction.py index c7f6d06af..2cf21e352 100644 --- a/clinicadl/utils/task_manager/reconstruction.py +++ b/clinicadl/utils/task_manager/reconstruction.py @@ -42,35 +42,39 @@ def generate_test_row(self, idx, data, outputs): row.append(metrics[metric]) return [row] - def compute_metrics(self, results_df, report_ci = False): - + def compute_metrics(self, results_df, report_ci=False): metrics = dict() if report_ci: - - from scipy.stats import bootstrap from numpy import mean as np_mean + from scipy.stats import bootstrap metric_names = ["Metrics"] - metric_values = ["Values"] - lower_ci_values = ["Lower bound CI"] - upper_ci_values = ["Upper bound CI"] - se_values = ["SE"] - - for metric in self.evaluation_metrics: + metric_values = ["Values"] + lower_ci_values = ["Lower bound CI"] + upper_ci_values = ["Upper bound CI"] + se_values = ["SE"] + for metric in self.evaluation_metrics: metric_vals = results_df[metric] - + metric_result = metric_vals.mean() - metric_vals = (metric_vals, ) - res = bootstrap(metric_vals, - np_mean, - n_resamples = 3000, - confidence_level=0.95, - method="percentile") - lower_ci, upper_ci = res.confidence_interval - standard_error = res.standard_error + metric_vals = (metric_vals,) + if ( + len(results_df) >= 2 + ): # Compute confidence intervals only if there are at least two samples in the data. 
+ res = bootstrap( + metric_vals, + np_mean, + n_resamples=3000, + confidence_level=0.95, + method="percentile", + ) + lower_ci, upper_ci = res.confidence_interval + standard_error = res.standard_error + else: + lower_ci, upper_ci, standard_error = "N/A" metric_names.append(metric) metric_values.append(metric_result) @@ -83,15 +87,14 @@ def compute_metrics(self, results_df, report_ci = False): metrics["Lower_CI"] = lower_ci_values metrics["Upper_CI"] = upper_ci_values metrics["SE"] = se_values - + return metrics - else: + else: for metric in self.evaluation_metrics: metrics[metric] = results_df[metric].mean() return metrics - @staticmethod def output_size(input_size, df, label): return input_size diff --git a/clinicadl/utils/task_manager/regression.py b/clinicadl/utils/task_manager/regression.py index b17088bb0..df32ee838 100644 --- a/clinicadl/utils/task_manager/regression.py +++ b/clinicadl/utils/task_manager/regression.py @@ -48,7 +48,7 @@ def compute_metrics(self, results_df, report_ci): return self.metrics_module.apply( results_df.true_label.values, results_df.predicted_label.values, - report_ci = report_ci, + report_ci=report_ci, ) @staticmethod @@ -156,7 +156,7 @@ def ensemble_prediction( df_final = pd.concat([df_final, row_df]) if use_labels: - results = self.compute_metrics(df_final) + results = self.compute_metrics(df_final, report_ci=False) else: results = None diff --git a/clinicadl/utils/task_manager/task_manager.py b/clinicadl/utils/task_manager/task_manager.py index 5c825c7bc..163340c77 100644 --- a/clinicadl/utils/task_manager/task_manager.py +++ b/clinicadl/utils/task_manager/task_manager.py @@ -181,7 +181,7 @@ def test( criterion: _Loss, use_labels: bool = True, amp: bool = False, - report_ci = False, + report_ci=False, ) -> Tuple[pd.DataFrame, Dict[str, float]]: """ Computes the predictions and evaluation metrics. @@ -232,19 +232,21 @@ def test( if not use_labels: metrics_dict = None else: - metrics_dict = self.compute_metrics(results_df, report_ci = report_ci) + metrics_dict = self.compute_metrics(results_df, report_ci=report_ci) for loss_component in total_loss.keys(): dist.reduce(total_loss[loss_component], dst=0) if report_ci: metrics_dict["Metric_names"].append(loss_component) - metrics_dict["Metric_values"].append(total_loss[loss_component].item()) + metrics_dict["Metric_values"].append( + total_loss[loss_component].item() + ) metrics_dict["Lower_CI"].append("N/A") metrics_dict["Upper_CI"].append("N/A") metrics_dict["SE"].append("N/A") else: - metrics_dict[loss_component] = total_loss[loss_component].item() - + metrics_dict[loss_component] = total_loss[loss_component].item() + torch.cuda.empty_cache() return results_df, metrics_dict @@ -257,7 +259,7 @@ def test_da( alpha: float = 0, use_labels: bool = True, target: bool = True, - report_ci = False, + report_ci=False, ) -> Tuple[pd.DataFrame, Dict[str, float]]: """ Computes the predictions and evaluation metrics. 
@@ -294,7 +296,7 @@ def test_da( if not use_labels: metrics_dict = None else: - metrics_dict = self.compute_metrics(results_df, report_ci = report_ci) + metrics_dict = self.compute_metrics(results_df, report_ci=report_ci) if report_ci: metrics_dict["Metric_names"].append("loss") metrics_dict["Metric_values"].append(total_loss) From 1ba6218b6707460236c0ac1cea9592f481a1cdff Mon Sep 17 00:00:00 2001 From: sofiene26000 Date: Mon, 12 Feb 2024 18:39:11 +0100 Subject: [PATCH 05/12] A few bugs fixed and code improved --- clinicadl/utils/maps_manager/maps_manager.py | 3 - clinicadl/utils/metric_module.py | 68 +++++++------- .../utils/task_manager/reconstruction.py | 93 +++++++++---------- 3 files changed, 79 insertions(+), 85 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index c46d99912..c293d5f5f 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -2670,9 +2670,6 @@ def _ensemble_to_tsv( use_labels=use_labels, ) - col = df_final["true_label"] - df_final["predicted_label"] - if df_final is not None: df_final.to_csv( performance_dir / f"{data_group}_image_level_prediction.tsv", diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index 04f8f9d28..0d28bb007 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -35,8 +35,10 @@ def __init__(self, metrics, n_classes=2): ] self.metrics = dict() for metric in metrics: - if f"{metric.lower()}_fn" in list_fn: - self.metrics[metric] = getattr(MetricModule, f"{metric.lower()}_fn") + if f"compute_{metric.lower()}" in list_fn: + self.metrics[metric] = getattr( + MetricModule, f"compute_{metric.lower()}" + ) else: raise NotImplementedError( f"The metric {metric} is not implemented in the module." @@ -79,10 +81,8 @@ def apply(self, y, y_pred, report_ci): for class_number in class_numbers: metric_result = metric_fn(y, y_pred, class_number) - if ( - report_ci and len(y) >= 2 - ): # Compute confidence intervals only if there are at least two samples in the data. - + # Compute confidence intervals only if there are at least two samples in the data. + if report_ci and len(y) >= 2: res = bootstrap( (y, y_pred), lambda y, y_pred: metric_fn(y, y_pred, class_number), @@ -124,7 +124,7 @@ def apply(self, y, y_pred, report_ci): return results @staticmethod - def mae_fn(y, y_pred, *args): + def compute_mae(y, y_pred, *args): """ Args: y (List): list of labels @@ -136,7 +136,7 @@ def mae_fn(y, y_pred, *args): return np.mean(np.abs(y - y_pred)) @staticmethod - def rmse_fn(y, y_pred, *args): + def compute_rmse(y, y_pred, *args): """ Args: y (List): list of labels @@ -148,7 +148,7 @@ def rmse_fn(y, y_pred, *args): return np.sqrt(np.mean(np.square(y - y_pred))) @staticmethod - def r2_score_fn(y, y_pred, *args): + def compute_r2_score(y, y_pred, *args): """ Calculate the R-squared (coefficient of determination) score. 
@@ -171,7 +171,7 @@ def r2_score_fn(y, y_pred, *args): return r2_score @staticmethod - def accuracy_fn(y, y_pred, *args): + def compute_accuracy(y, y_pred, *args): """ Args: y (List): list of labels @@ -184,7 +184,7 @@ def accuracy_fn(y, y_pred, *args): return true / len(y) @staticmethod - def sensitivity_fn(y, y_pred, class_number): + def compute_sensitivity(y, y_pred, class_number): """ Args: y (List): list of labels @@ -202,7 +202,7 @@ def sensitivity_fn(y, y_pred, class_number): return 0.0 @staticmethod - def specificity_fn(y, y_pred, class_number): + def compute_specificity(y, y_pred, class_number): """ Args: y (List): list of labels @@ -220,7 +220,7 @@ def specificity_fn(y, y_pred, class_number): return 0.0 @staticmethod - def ppv_fn(y, y_pred, class_number): + def compute_ppv(y, y_pred, class_number): """ Args: y (List): list of labels @@ -238,7 +238,7 @@ def ppv_fn(y, y_pred, class_number): return 0.0 @staticmethod - def npv_fn(y, y_pred, class_number): + def compute_npv(y, y_pred, class_number): """ Args: y (List): list of labels @@ -256,7 +256,7 @@ def npv_fn(y, y_pred, class_number): return 0.0 @staticmethod - def f1_score_fn(y, y_pred, class_number): + def compute_f1_score(y, y_pred, class_number): """ Args: y (List): list of labels @@ -266,8 +266,8 @@ def f1_score_fn(y, y_pred, class_number): (float) F1 score """ - precision = MetricModule.ppv_fn(y, y_pred, class_number) - recall = MetricModule.sensitivity_fn(y, y_pred, class_number) + precision = MetricModule.compute_ppv(y, y_pred, class_number) + recall = MetricModule.compute_sensitivity(y, y_pred, class_number) f1_score = ( 2 * (precision * recall) / (precision + recall) @@ -278,7 +278,7 @@ def f1_score_fn(y, y_pred, class_number): return f1_score @staticmethod - def ba_fn(y, y_pred, class_number): + def compute_ba(y, y_pred, class_number): """ Args: y (List): list of labels @@ -289,12 +289,12 @@ def ba_fn(y, y_pred, class_number): """ return ( - MetricModule.sensitivity_fn(y, y_pred, class_number) - + MetricModule.specificity_fn(y, y_pred, class_number) + MetricModule.compute_sensitivity(y, y_pred, class_number) + + MetricModule.compute_specificity(y, y_pred, class_number) ) / 2 @staticmethod - def mcc_fn(y, y_pred, class_number): + def compute_mcc(y, y_pred, class_number): """ Calculate the Matthews correlation coefficient (MCC) for a specific class. @@ -325,7 +325,7 @@ def mcc_fn(y, y_pred, class_number): return mcc @staticmethod - def mk_fn(y, y_pred, class_number): + def compute_mk(y, y_pred, class_number): """ Calculate Markedness (MK) for a specific class. @@ -337,13 +337,13 @@ def mk_fn(y, y_pred, class_number): Returns: (float) Markedness for the specified class """ - precision = MetricModule.ppv_fn(y, y_pred, class_number) - npv = MetricModule.npv_fn(y, y_pred, class_number) + precision = MetricModule.compute_ppv(y, y_pred, class_number) + npv = MetricModule.compute_npv(y, y_pred, class_number) mk = precision + npv - 1 return mk @staticmethod - def lr_plus_fn(y, y_pred, class_number): + def compute_lr_plus(y, y_pred, class_number): """ Calculate Positive Likelihood Ratio (LR+). 
@@ -355,13 +355,13 @@ def lr_plus_fn(y, y_pred, class_number): Returns: (float) Positive Likelihood Ratio """ - sensitivity = MetricModule.sensitivity_fn(y, y_pred, class_number) - specificity = MetricModule.specificity_fn(y, y_pred, class_number) + sensitivity = MetricModule.compute_sensitivity(y, y_pred, class_number) + specificity = MetricModule.compute_specificity(y, y_pred, class_number) lr_plus = sensitivity / (1 - specificity) if (1 - specificity) != 0 else 0 return lr_plus @staticmethod - def lr_minus_fn(y, y_pred, class_number): + def compute_lr_minus(y, y_pred, class_number): """ Calculate Negative Likelihood Ratio (LR-). @@ -373,13 +373,13 @@ def lr_minus_fn(y, y_pred, class_number): Returns: (float) Negative Likelihood Ratio """ - sensitivity = MetricModule.sensitivity_fn(y, y_pred, class_number) - specificity = MetricModule.specificity_fn(y, y_pred, class_number) + sensitivity = MetricModule.compute_sensitivity(y, y_pred, class_number) + specificity = MetricModule.compute_specificity(y, y_pred, class_number) lr_minus = (1 - sensitivity) / specificity if specificity != 0 else 0 return lr_minus @staticmethod - def confusion_matrix_fn(y, y_pred, *args): + def compute_confusion_matrix(y, y_pred, *args): """ Args: y (List): list of labels @@ -400,7 +400,7 @@ def confusion_matrix_fn(y, y_pred, *args): } @staticmethod - def ssim_fn(y, y_pred, *args): + def compute_ssim(y, y_pred, *args): """ Args: y (List): list of labels @@ -416,7 +416,7 @@ def ssim_fn(y, y_pred, *args): return ssim3D(y, y_pred).item() @staticmethod - def psnr_fn(y, y_pred, *args): + def compute_psnr(y, y_pred, *args): """ Args: y (List): list of labels @@ -429,7 +429,7 @@ def psnr_fn(y, y_pred, *args): return peak_signal_noise_ratio(y, y_pred) @staticmethod - def lncc_fn(y, y_pred, *args): + def compute_lncc(y, y_pred, *args): """ Args: y (List): list of labels diff --git a/clinicadl/utils/task_manager/reconstruction.py b/clinicadl/utils/task_manager/reconstruction.py index 2cf21e352..66905938b 100644 --- a/clinicadl/utils/task_manager/reconstruction.py +++ b/clinicadl/utils/task_manager/reconstruction.py @@ -43,57 +43,54 @@ def generate_test_row(self, idx, data, outputs): return [row] def compute_metrics(self, results_df, report_ci=False): + if not report_ci: + return { + metric: results_df[metric].mean() for metric in self.evaluation_metrics + } + + from numpy import mean as np_mean + from scipy.stats import bootstrap + metrics = dict() + metric_names = ["Metrics"] + metric_values = ["Values"] + lower_ci_values = ["Lower bound CI"] + upper_ci_values = ["Upper bound CI"] + se_values = ["SE"] + + for metric in self.evaluation_metrics: + metric_vals = results_df[metric] + + metric_result = metric_vals.mean() + + metric_vals = (metric_vals,) + # Compute confidence intervals only if there are at least two samples in the data. 
+ if len(results_df) >= 2: + res = bootstrap( + metric_vals, + np_mean, + n_resamples=3000, + confidence_level=0.95, + method="percentile", + ) + lower_ci, upper_ci = res.confidence_interval + standard_error = res.standard_error + else: + lower_ci, upper_ci, standard_error = "N/A" - if report_ci: - from numpy import mean as np_mean - from scipy.stats import bootstrap - - metric_names = ["Metrics"] - metric_values = ["Values"] - lower_ci_values = ["Lower bound CI"] - upper_ci_values = ["Upper bound CI"] - se_values = ["SE"] - - for metric in self.evaluation_metrics: - metric_vals = results_df[metric] - - metric_result = metric_vals.mean() - - metric_vals = (metric_vals,) - if ( - len(results_df) >= 2 - ): # Compute confidence intervals only if there are at least two samples in the data. - res = bootstrap( - metric_vals, - np_mean, - n_resamples=3000, - confidence_level=0.95, - method="percentile", - ) - lower_ci, upper_ci = res.confidence_interval - standard_error = res.standard_error - else: - lower_ci, upper_ci, standard_error = "N/A" - - metric_names.append(metric) - metric_values.append(metric_result) - lower_ci_values.append(lower_ci) - upper_ci_values.append(upper_ci) - se_values.append(standard_error) - - metrics["Metric_names"] = metric_names - metrics["Metric_values"] = metric_values - metrics["Lower_CI"] = lower_ci_values - metrics["Upper_CI"] = upper_ci_values - metrics["SE"] = se_values - - return metrics + metric_names.append(metric) + metric_values.append(metric_result) + lower_ci_values.append(lower_ci) + upper_ci_values.append(upper_ci) + se_values.append(standard_error) - else: - for metric in self.evaluation_metrics: - metrics[metric] = results_df[metric].mean() - return metrics + metrics["Metric_names"] = metric_names + metrics["Metric_values"] = metric_values + metrics["Lower_CI"] = lower_ci_values + metrics["Upper_CI"] = upper_ci_values + metrics["SE"] = se_values + + return metrics @staticmethod def output_size(input_size, df, label): From 6611f10a672c1d5c7d43b10fa707e2174368f917 Mon Sep 17 00:00:00 2001 From: camillebrianceau Date: Tue, 13 Feb 2024 15:30:10 +0100 Subject: [PATCH 06/12] review --- clinicadl/utils/maps_manager/maps_manager.py | 27 +++++----- clinicadl/utils/metric_module.py | 49 +++++++++++-------- .../utils/task_manager/classification.py | 2 +- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index c293d5f5f..8e85120af 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -2075,7 +2075,6 @@ def _ensemble_prediction( selection_metrics = self._find_selection_metrics(split) for selection_metric in selection_metrics: - ##################### # Soft voting if self.num_networks > 1: self._ensemble_to_tsv( @@ -2607,22 +2606,18 @@ def _mode_level_to_tsv( metrics_path = performance_dir / f"{data_group}_{self.mode}_level_metrics.tsv" if metrics is not None: - # if data_group == "train" or data_group == "validation": - # pd_metrics = pd.DataFrame(metrics, index = [0]) - # header = True - # else: - # pd_metrics = pd.DataFrame(metrics).T - # header = False - - pd_metrics = pd.DataFrame(metrics).T - header = False - # import ipdb; ipdb.set_trace() - if not metrics_path.is_file(): - pd_metrics.to_csv(metrics_path, index=False, sep="\t", header=header) + if data_group == "train" or data_group == "validation": + pd_metrics = pd.DataFrame(metrics, index=[0]) + header = True else: - pd_metrics.to_csv( - 
metrics_path, index=False, sep="\t", mode="a", header=header - ) + pd_metrics = pd.DataFrame(metrics).T + header = False + + # pd_metrics = pd.DataFrame(metrics).T + # header = False + pd_metrics.to_csv( + metrics_path, index=False, sep="\t", mode="a", header=header + ) def _ensemble_to_tsv( self, diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index 0d28bb007..bf3b6c591 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -56,18 +56,24 @@ def apply(self, y, y_pred, report_ci): (Dict[str:float]) metrics results """ if y is not None and y_pred is not None: - results = dict() + results = { + "Metric_names": ["Metrics"], + "Metric_values": ["Values"], + "Lower_CI": ["Lower bound CI"], + "Upper_CI": ["Upper bound CI"], + "SE": ["SE"], + } y = np.array(y) y_pred = np.array(y_pred) - if report_ci: - from scipy.stats import bootstrap + # if report_ci: + # from scipy.stats import bootstrap - metric_names = ["Metrics"] - metric_values = ["Values"] # Collect metric values - lower_ci_values = ["Lower bound CI"] # Collect lower CI values - upper_ci_values = ["Upper bound CI"] # Collect upper CI values - se_values = ["SE"] # Collect standard error values + # metric_names = ["Metrics"] + # metric_values = ["Values"] # Collect metric values + # lower_ci_values = ["Lower bound CI"] # Collect lower CI values + # upper_ci_values = ["Upper bound CI"] # Collect upper CI values + # se_values = ["SE"] # Collect standard error values for metric_key, metric_fn in self.metrics.items(): metric_args = list(metric_fn.__code__.co_varnames) @@ -83,6 +89,8 @@ def apply(self, y, y_pred, report_ci): # Compute confidence intervals only if there are at least two samples in the data. if report_ci and len(y) >= 2: + from scipy.stats import bootstrap + res = bootstrap( (y, y_pred), lambda y, y_pred: metric_fn(y, y_pred, class_number), @@ -93,13 +101,12 @@ def apply(self, y, y_pred, report_ci): ) lower_ci, upper_ci = res.confidence_interval - standard_error = res.standard_error - metric_values.append(metric_result) - lower_ci_values.append(lower_ci) - upper_ci_values.append(upper_ci) - se_values.append(standard_error) - metric_names.append( + results["Metric_values"].append(metric_result) + results["Lower_CI"].append(lower_ci) + results["Upper_CI"].append(upper_ci) + results["SE"].append(res.standard_error) + results["Metric_names"].append( f"{metric_key}-{class_number}" if len(class_numbers) > 1 else f"{metric_key}" @@ -111,13 +118,13 @@ def apply(self, y, y_pred, report_ci): else f"{metric_key}" ] = metric_result - if report_ci: - # Construct the final results dictionary - results["Metric_names"] = metric_names - results["Metric_values"] = metric_values - results["Lower_CI"] = lower_ci_values - results["Upper_CI"] = upper_ci_values - results["SE"] = se_values + # if report_ci: + # # Construct the final results dictionary + # results["Metric_names"] = metric_names + # results["Metric_values"] = metric_values + # results["Lower_CI"] = lower_ci_values + # results["Upper_CI"] = upper_ci_values + # results["SE"] = se_values else: results = dict() diff --git a/clinicadl/utils/task_manager/classification.py b/clinicadl/utils/task_manager/classification.py index 4b95d06f1..3ff5710cc 100644 --- a/clinicadl/utils/task_manager/classification.py +++ b/clinicadl/utils/task_manager/classification.py @@ -238,7 +238,7 @@ def check_prediction(row): df_final = pd.concat([df_final, row_df]) if use_labels: - results = self.compute_metrics(df_final, report_ci=False) + results = 
self.compute_metrics(df_final) else: results = None From 622e0a17aea42e709c45f4c7c5f5f2219ccea6e5 Mon Sep 17 00:00:00 2001 From: camillebrianceau Date: Fri, 16 Feb 2024 11:38:45 +0100 Subject: [PATCH 07/12] review --- clinicadl/utils/maps_manager/maps_manager.py | 20 ++++--------- clinicadl/utils/metric_module.py | 16 ---------- .../utils/task_manager/reconstruction.py | 29 ++++++++----------- 3 files changed, 17 insertions(+), 48 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 8e85120af..64ba1ac8e 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -2597,24 +2597,14 @@ def _mode_level_to_tsv( performance_path = ( performance_dir / f"{data_group}_{self.mode}_level_prediction.tsv" ) - if not performance_path.is_file(): - results_df.to_csv(performance_path, index=False, sep="\t") - else: - results_df.to_csv( - performance_path, index=False, sep="\t", mode="a", header=False - ) + results_df.to_csv( + performance_path, index=False, sep="\t", mode="a", header=False + ) metrics_path = performance_dir / f"{data_group}_{self.mode}_level_metrics.tsv" if metrics is not None: - if data_group == "train" or data_group == "validation": - pd_metrics = pd.DataFrame(metrics, index=[0]) - header = True - else: - pd_metrics = pd.DataFrame(metrics).T - header = False - - # pd_metrics = pd.DataFrame(metrics).T - # header = False + pd_metrics = pd.DataFrame(metrics).T + header = False pd_metrics.to_csv( metrics_path, index=False, sep="\t", mode="a", header=header ) diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index bf3b6c591..b9b2a863e 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -66,15 +66,6 @@ def apply(self, y, y_pred, report_ci): y = np.array(y) y_pred = np.array(y_pred) - # if report_ci: - # from scipy.stats import bootstrap - - # metric_names = ["Metrics"] - # metric_values = ["Values"] # Collect metric values - # lower_ci_values = ["Lower bound CI"] # Collect lower CI values - # upper_ci_values = ["Upper bound CI"] # Collect upper CI values - # se_values = ["SE"] # Collect standard error values - for metric_key, metric_fn in self.metrics.items(): metric_args = list(metric_fn.__code__.co_varnames) @@ -118,13 +109,6 @@ def apply(self, y, y_pred, report_ci): else f"{metric_key}" ] = metric_result - # if report_ci: - # # Construct the final results dictionary - # results["Metric_names"] = metric_names - # results["Metric_values"] = metric_values - # results["Lower_CI"] = lower_ci_values - # results["Upper_CI"] = upper_ci_values - # results["SE"] = se_values else: results = dict() diff --git a/clinicadl/utils/task_manager/reconstruction.py b/clinicadl/utils/task_manager/reconstruction.py index 66905938b..868192f24 100644 --- a/clinicadl/utils/task_manager/reconstruction.py +++ b/clinicadl/utils/task_manager/reconstruction.py @@ -51,12 +51,13 @@ def compute_metrics(self, results_df, report_ci=False): from numpy import mean as np_mean from scipy.stats import bootstrap - metrics = dict() - metric_names = ["Metrics"] - metric_values = ["Values"] - lower_ci_values = ["Lower bound CI"] - upper_ci_values = ["Upper bound CI"] - se_values = ["SE"] + metrics = { + "Metric_names": ["Metrics"], + "Metric_values": ["Values"], + "Lower_CI": ["Lower bound CI"], + "Upper_CI": ["Upper bound CI"], + "SE": ["SE"], + } for metric in self.evaluation_metrics: metric_vals = results_df[metric] @@ -78,17 +79,11 @@ def 
compute_metrics(self, results_df, report_ci=False): else: lower_ci, upper_ci, standard_error = "N/A" - metric_names.append(metric) - metric_values.append(metric_result) - lower_ci_values.append(lower_ci) - upper_ci_values.append(upper_ci) - se_values.append(standard_error) - - metrics["Metric_names"] = metric_names - metrics["Metric_values"] = metric_values - metrics["Lower_CI"] = lower_ci_values - metrics["Upper_CI"] = upper_ci_values - metrics["SE"] = se_values + metrics["Metric_names"].append(metric) + metrics["Metric_values"].append(metric_result) + metrics["Lower_CI"].append(lower_ci) + metrics["Upper_CI"].append(upper_ci) + metrics["SE"].append(standard_error) return metrics From f07b6f1d7823571544bbe3c4ea443face235bfe0 Mon Sep 17 00:00:00 2001 From: camillebrianceau Date: Fri, 16 Feb 2024 11:42:31 +0100 Subject: [PATCH 08/12] back --- clinicadl/prepare_data/prepare_data.py | 16 +++++--- .../quality_check/t1_linear/quality_check.py | 1 + .../quality_check/t1_volume/quality_check.py | 1 + clinicadl/random_search/random_search.py | 1 + clinicadl/utils/cli_param/argument.py | 1 + clinicadl/utils/maps_manager/maps_manager.py | 38 ++++++++++--------- clinicadl/utils/meta_maps/getter.py | 1 + clinicadl/utils/metric_module.py | 8 ++-- 8 files changed, 40 insertions(+), 27 deletions(-) diff --git a/clinicadl/prepare_data/prepare_data.py b/clinicadl/prepare_data/prepare_data.py index 6eae5cf2f..aa48425b5 100644 --- a/clinicadl/prepare_data/prepare_data.py +++ b/clinicadl/prepare_data/prepare_data.py @@ -154,18 +154,22 @@ def prepare_roi(file): parameters["masks_location"], parameters["roi_list"], parameters["roi_mask_pattern"], - None - if parameters["use_uncropped_image"] is None - else not parameters["use_uncropped_image"], + ( + None + if parameters["use_uncropped_image"] is None + else not parameters["use_uncropped_image"] + ), ) output_mode = extract_roi( Path(file), masks_location=parameters["masks_location"], mask_pattern=parameters["roi_mask_pattern"], - cropped_input=None - if parameters["use_uncropped_image"] is None - else not parameters["use_uncropped_image"], + cropped_input=( + None + if parameters["use_uncropped_image"] is None + else not parameters["use_uncropped_image"] + ), roi_names=parameters["roi_list"], uncrop_output=parameters["uncropped_roi"], ) diff --git a/clinicadl/quality_check/t1_linear/quality_check.py b/clinicadl/quality_check/t1_linear/quality_check.py index 2cf8f1f33..a28a87256 100755 --- a/clinicadl/quality_check/t1_linear/quality_check.py +++ b/clinicadl/quality_check/t1_linear/quality_check.py @@ -1,6 +1,7 @@ """ This file contains all methods needed to perform the quality check procedure after t1-linear preprocessing. """ + from logging import getLogger from pathlib import Path diff --git a/clinicadl/quality_check/t1_volume/quality_check.py b/clinicadl/quality_check/t1_volume/quality_check.py index 166350614..b9b651497 100644 --- a/clinicadl/quality_check/t1_volume/quality_check.py +++ b/clinicadl/quality_check/t1_volume/quality_check.py @@ -4,6 +4,7 @@ 2) percentage of non zero values < 15 % or > 50 % 3) frontal similarity of T1 volume with the template < 0.40 """ + from logging import getLogger from pathlib import Path diff --git a/clinicadl/random_search/random_search.py b/clinicadl/random_search/random_search.py index cd65c8a0d..c9f8efae4 100755 --- a/clinicadl/random_search/random_search.py +++ b/clinicadl/random_search/random_search.py @@ -1,6 +1,7 @@ """ Launch a random network training. 
""" + from pathlib import Path from clinicadl.random_search.random_search_utils import get_space_dict, random_sampling diff --git a/clinicadl/utils/cli_param/argument.py b/clinicadl/utils/cli_param/argument.py index ad22773f4..1f9fb1721 100644 --- a/clinicadl/utils/cli_param/argument.py +++ b/clinicadl/utils/cli_param/argument.py @@ -1,4 +1,5 @@ """Common CLI arguments used by ClinicaDL pipelines.""" + from pathlib import Path import click diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 64ba1ac8e..09dd30e34 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -304,16 +304,16 @@ def predict( multi_cohort=group_parameters["multi_cohort"], label_presence=use_labels, label=self.label if label is None else label, - label_code=self.label_code - if label_code == "default" - else label_code, + label_code=( + self.label_code if label_code == "default" else label_code + ), cnn_index=network, ) test_loader = DataLoader( data_test, - batch_size=batch_size - if batch_size is not None - else self.batch_size, + batch_size=( + batch_size if batch_size is not None else self.batch_size + ), shuffle=False, sampler=DistributedSampler( data_test, @@ -371,16 +371,16 @@ def predict( multi_cohort=group_parameters["multi_cohort"], label_presence=use_labels, label=self.label if label is None else label, - label_code=self.label_code - if label_code == "default" - else label_code, + label_code=( + self.label_code if label_code == "default" else label_code + ), ) test_loader = DataLoader( data_test, - batch_size=batch_size - if batch_size is not None - else self.batch_size, + batch_size=( + batch_size if batch_size is not None else self.batch_size + ), shuffle=False, sampler=DistributedSampler( data_test, @@ -2423,12 +2423,14 @@ def _write_data_group( self.write_parameters( group_path, { - "caps_directory": caps_directory - if caps_directory is not None - else self.caps_directory, - "multi_cohort": multi_cohort - if multi_cohort is not None - else self.multi_cohort, + "caps_directory": ( + caps_directory + if caps_directory is not None + else self.caps_directory + ), + "multi_cohort": ( + multi_cohort if multi_cohort is not None else self.multi_cohort + ), }, ) diff --git a/clinicadl/utils/meta_maps/getter.py b/clinicadl/utils/meta_maps/getter.py index 44e457e1b..2f400ffc3 100644 --- a/clinicadl/utils/meta_maps/getter.py +++ b/clinicadl/utils/meta_maps/getter.py @@ -1,6 +1,7 @@ """ Produces a tsv file to analyze the performance of one launch of the random search. 
""" + from pathlib import Path import pandas as pd diff --git a/clinicadl/utils/metric_module.py b/clinicadl/utils/metric_module.py index b9b2a863e..4effb893f 100644 --- a/clinicadl/utils/metric_module.py +++ b/clinicadl/utils/metric_module.py @@ -104,9 +104,11 @@ def apply(self, y, y_pred, report_ci): ) else: results[ - f"{metric_key}-{class_number}" - if len(class_numbers) > 1 - else f"{metric_key}" + ( + f"{metric_key}-{class_number}" + if len(class_numbers) > 1 + else f"{metric_key}" + ) ] = metric_result else: From 2cbe22ca63fba32b8f56b85990460781af7497c3 Mon Sep 17 00:00:00 2001 From: camillebrianceau Date: Fri, 16 Feb 2024 13:14:21 +0100 Subject: [PATCH 09/12] tests --- clinicadl/utils/maps_manager/maps_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 09dd30e34..444c4fd73 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -2606,9 +2606,8 @@ def _mode_level_to_tsv( metrics_path = performance_dir / f"{data_group}_{self.mode}_level_metrics.tsv" if metrics is not None: pd_metrics = pd.DataFrame(metrics).T - header = False pd_metrics.to_csv( - metrics_path, index=False, sep="\t", mode="a", header=header + metrics_path, index=False, sep="\t", mode="a", header=False ) def _ensemble_to_tsv( From 823dc7df8cd8af4bb74d140d856543f9f381a5ce Mon Sep 17 00:00:00 2001 From: camillebrianceau Date: Fri, 16 Feb 2024 14:30:36 +0100 Subject: [PATCH 10/12] tests --- clinicadl/utils/maps_manager/maps_manager.py | 20 +++++++++++++------ .../utils/task_manager/classification.py | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 444c4fd73..f8dd81807 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -2599,16 +2599,23 @@ def _mode_level_to_tsv( performance_path = ( performance_dir / f"{data_group}_{self.mode}_level_prediction.tsv" ) - results_df.to_csv( - performance_path, index=False, sep="\t", mode="a", header=False - ) + if not performance_path.is_dir(): + results_df.to_csv(performance_path, index=False, sep="\t", header=True) + else: + results_df.to_csv( + performance_path, index=False, sep="\t", mode="a", header=False + ) metrics_path = performance_dir / f"{data_group}_{self.mode}_level_metrics.tsv" if metrics is not None: pd_metrics = pd.DataFrame(metrics).T - pd_metrics.to_csv( - metrics_path, index=False, sep="\t", mode="a", header=False - ) + + if not metrics_path.is_file(): + pd_metrics.to_csv(metrics_path, index=False, sep="\t", header=True) + else: + pd_metrics.to_csv( + metrics_path, index=False, sep="\t", mode="a", header=False + ) def _ensemble_to_tsv( self, @@ -3095,6 +3102,7 @@ def get_prediction( prediction_dir / f"{data_group}_{mode}_level_prediction.tsv", sep="\t", ) + print(df) df.set_index(["participant_id", "session_id"], inplace=True, drop=True) return df diff --git a/clinicadl/utils/task_manager/classification.py b/clinicadl/utils/task_manager/classification.py index 3ff5710cc..4b95d06f1 100644 --- a/clinicadl/utils/task_manager/classification.py +++ b/clinicadl/utils/task_manager/classification.py @@ -238,7 +238,7 @@ def check_prediction(row): df_final = pd.concat([df_final, row_df]) if use_labels: - results = self.compute_metrics(df_final) + results = self.compute_metrics(df_final, report_ci=False) else: results = None 
From 16999120666145c9bdcf60e0dc43205f28f27484 Mon Sep 17 00:00:00 2001 From: camillebrianceau Date: Fri, 16 Feb 2024 14:40:21 +0100 Subject: [PATCH 11/12] tests --- clinicadl/utils/maps_manager/maps_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index f8dd81807..681d9156e 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -3102,7 +3102,6 @@ def get_prediction( prediction_dir / f"{data_group}_{mode}_level_prediction.tsv", sep="\t", ) - print(df) df.set_index(["participant_id", "session_id"], inplace=True, drop=True) return df From 5f26bbbbf1e5b9a4d4e2dfeaea0c9f0eb8f87537 Mon Sep 17 00:00:00 2001 From: camillebrianceau Date: Fri, 16 Feb 2024 15:03:32 +0100 Subject: [PATCH 12/12] tests --- clinicadl/utils/maps_manager/maps_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 681d9156e..0534d3b95 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -2600,7 +2600,7 @@ def _mode_level_to_tsv( performance_dir / f"{data_group}_{self.mode}_level_prediction.tsv" ) if not performance_path.is_dir(): - results_df.to_csv(performance_path, index=False, sep="\t", header=True) + results_df.to_csv(performance_path, index=False, sep="\t") else: results_df.to_csv( performance_path, index=False, sep="\t", mode="a", header=False @@ -2611,7 +2611,7 @@ def _mode_level_to_tsv( pd_metrics = pd.DataFrame(metrics).T if not metrics_path.is_file(): - pd_metrics.to_csv(metrics_path, index=False, sep="\t", header=True) + pd_metrics.to_csv(metrics_path, index=False, sep="\t", header=False) else: pd_metrics.to_csv( metrics_path, index=False, sep="\t", mode="a", header=False
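
Taken together, the series bootstraps each evaluation metric into a 95% percentile confidence interval and writes the result, transposed, into the *_metrics.tsv files. For anyone wanting to reproduce the interval computation outside ClinicaDL, here is a self-contained sketch using the same scipy.stats.bootstrap settings as the patch, applied to invented per-image SSIM values:

import numpy as np
from scipy.stats import bootstrap

# Hypothetical per-sample metric values (e.g. SSIM of each reconstructed image).
ssim_values = np.array([0.71, 0.68, 0.74, 0.70, 0.69, 0.73])

# Percentile bootstrap of the mean, with the same settings as compute_metrics():
# 3000 resamples, 95% confidence level, percentile method.
res = bootstrap(
    (ssim_values,),
    np.mean,
    n_resamples=3000,
    confidence_level=0.95,
    method="percentile",
)
lower_ci, upper_ci = res.confidence_interval
print(
    f"SSIM = {ssim_values.mean():.3f} "
    f"(95% CI [{lower_ci:.3f}, {upper_ci:.3f}], SE = {res.standard_error:.3f})"
)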