Add K-means algorithm and unsupervised evaluators #219

Open · wants to merge 1 commit into base: master
72 changes: 72 additions & 0 deletions src/fklearn/training/unsupervised.py
@@ -2,6 +2,7 @@

import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
import sklearn
from toolz import curry, merge

@@ -71,3 +72,74 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame:


isolation_forest_learner.__doc__ += learner_return_docstring("Isolation Forest")


@curry
@log_learner_time(learner_name='kmeans_learner')
def kmeans_learner(df: pd.DataFrame,
features: List[str],
n_clusters: int = 8,
extra_params: Dict[str, Any] = None,
prediction_column: str = "prediction",
encode_extra_cols: bool = True) -> LearnerReturnType:
"""
The KMeans algorithm clusters data by trying to separate samples into n groups of equal variance, minimizing a
criterion known as the inertia, or within-cluster sum-of-squares. This algorithm requires the number of
clusters to be specified. For now, the implementation is limited to Euclidean distance.

Parameters
----------
df : pandas.DataFrame
A Pandas' DataFrame with features.
The model will be trained to split data into k groups
from the features.

features : list of str
A list of column names that are used as features for the model. All these names
should be in `df`.

n_clusters : int
The number of clusters to form as well as the number of centroids to generate.

extra_params : dict
The KMeans parameters in the format {"par_name": param}. See:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

prediction_column : str
The name of the column with the predictions from the model.

encode_extra_cols : bool (default: True)
If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""

default_params = {"init": "k-means++", "n_init": 10, "max_iter": 300, "tol": 1e-4}
params = default_params if not extra_params else merge(default_params, extra_params)

features = features if not encode_extra_cols else expand_features_encoded(df, features)

model = KMeans(n_clusters=n_clusters)
model.set_params(**params)
model.fit(df[features].values)

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    output_col = {prediction_column: model.predict(new_df[features].values)}

    return new_df.assign(**output_col)

p.__doc__ = learner_pred_fn_docstring("kmeans_learner")

log = {'kmeans_learner': {
'features': features,
'n_clusters': n_clusters,
'centers': {i: model.cluster_centers_[i].tolist() for i in range(model.n_clusters)},
'parameters': params,
'prediction_column': prediction_column,
'package': "sklearn",
'package_version': sklearn.__version__,
'training_samples': len(df)}}

return p, p(df), log


kmeans_learner.__doc__ += learner_return_docstring("K-Means clustering")
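
A minimal usage sketch of the new learner (the DataFrame contents and column names are invented for illustration; random_state is just one pass-through KMeans parameter):

import pandas as pd
from fklearn.training.unsupervised import kmeans_learner

df = pd.DataFrame({"x1": [1.0, 1.2, 9.8, 10.1], "x2": [0.5, 0.4, 8.9, 9.2]})

# The learner is curried, so it can be partially applied before seeing data.
cluster_learner = kmeans_learner(features=["x1", "x2"], n_clusters=2,
                                 extra_params={"random_state": 0})
predict_fn, df_with_clusters, log = cluster_learner(df)

# predict_fn writes cluster labels for new data into the "prediction" column.
new_labels = predict_fn(df)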
119 changes: 118 additions & 1 deletion src/fklearn/validation/evaluators.py
@@ -8,7 +8,7 @@
from sklearn.metrics import (average_precision_score, brier_score_loss,
fbeta_score, log_loss, mean_absolute_error,
mean_squared_error, precision_score, r2_score,
recall_score, roc_auc_score)
recall_score, roc_auc_score, silhouette_score, davies_bouldin_score)
from toolz import curry, last, first
from scipy import optimize
from sklearn.linear_model import LogisticRegression
@@ -58,6 +58,50 @@ def p(test_data: pd.DataFrame,
return p


def generic_unsupervised_sklearn_evaluator(name_prefix: str,
sklearn_metric: Callable[..., float]) -> UncurriedEvalFnType:
"""
Returns an evaluator built from a sklearn.metrics metric that does not require a target column

Parameters
----------
name_prefix: str
The default name of the evaluator will be name_prefix + prediction_column.

sklearn_metric: Callable
Metric function from sklearn.metrics. It should take the feature matrix and the predicted labels as parameters, plus optional keyword arguments.

Returns
----------
eval_fn: Callable
An evaluator function that uses the provided metric
"""

def p(test_data: pd.DataFrame,
features: List[str],
prediction_column: str = "prediction",
eval_name: str = None,
**kwargs: Any) -> EvalReturnType:
try:
df = test_data[features].values

labels = test_data[prediction_column].values

score = sklearn_metric(df,
labels,
**kwargs)
except ValueError:
# some metrics are undefined for degenerate labelings, e.g. silhouette_score
# and davies_bouldin_score raise ValueError when all samples share one cluster
score = np.nan

if eval_name is None:
eval_name = name_prefix + prediction_column

return {eval_name: score}

return p
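
A sketch of what this factory generalizes to: any sklearn metric with an (X, labels) signature should plug in the same way. calinski_harabasz_score below is only an illustrative choice, not part of this PR:

from sklearn.metrics import calinski_harabasz_score

calinski_harabasz_eval_fn = generic_unsupervised_sklearn_evaluator(
    "calinski_harabasz_score__evaluator__", calinski_harabasz_score)
# calinski_harabasz_eval_fn(test_data, features=["x1", "x2"]) would return
# {"calinski_harabasz_score__evaluator__prediction": <score>}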


@curry
def auc_evaluator(test_data: pd.DataFrame,
prediction_column: str = "prediction",
@@ -1069,3 +1113,76 @@ def logistic_coefficient_evaluator(test_data: pd.DataFrame,
test_data[target_column]).coef_[0][0]

return {eval_name: score}


@curry
def silhouette_score__evaluator(test_data: pd.DataFrame,
features: List[str],
prediction_column: str = "prediction",
eval_name: str = None) -> EvalReturnType:
"""
The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and
the mean nearest-cluster distance (b) for each sample. The Silhouette Coefficient
for a sample is (b - a) / max(a, b). To clarify, b is the mean distance between a sample and the points of
the nearest cluster that the sample is not a part of.

Parameters
----------
test_data : Pandas' DataFrame
A Pandas' DataFrame with the features and the cluster predictions.

features : list of str
A list of column names that were used as features for the model. All these names
should be in `test_data`.

prediction_column : str
The name of the column in `test_data` with the predicted cluster labels.

eval_name : str, optional (default=None)
The name of the evaluator as it will appear in the logs.

Returns
----------
log: dict
A log-like dictionary with the Silhouette Score
"""

eval_fn = generic_unsupervised_sklearn_evaluator("silhouette_score__evaluator__", silhouette_score)

return eval_fn(test_data, features, prediction_column, eval_name)
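
The (b - a) / max(a, b) formula is easy to check by hand against sklearn; a small sketch with made-up one-dimensional data:

import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score

X = np.array([[0.0], [1.0], [10.0], [11.0]])
labels = np.array([0, 0, 1, 1])

# For the sample at 0.0: a = |0 - 1| = 1.0 (mean intra-cluster distance) and
# b = (10 + 11) / 2 = 10.5 (mean distance to the nearest other cluster), so its
# coefficient is (10.5 - 1.0) / 10.5, roughly 0.905.
assert np.isclose(silhouette_samples(X, labels)[0], (10.5 - 1.0) / 10.5)

# silhouette_score is the mean coefficient over all samples.
assert np.isclose(silhouette_score(X, labels), silhouette_samples(X, labels).mean())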


@curry
def davies_bouldin_score__evaluator(test_data: pd.DataFrame,
features: List[str],
prediction_column: str = "prediction",
eval_name: str = None) -> EvalReturnType:
"""
The Davies-Bouldin score is defined as the average similarity measure of each cluster with its most similar
cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. Thus,
clusters which are farther apart and less dispersed will result in a better (lower) score; zero is the minimum.

Parameters
----------
test_data : Pandas' DataFrame
A Pandas' DataFrame with the features and the cluster predictions.

features : list of str
A list of column names that were used as features for the model. All these names
should be in `test_data`.

prediction_column : str
The name of the column in `test_data` with the predicted cluster labels.

eval_name : str, optional (default=None)
The name of the evaluator as it will appear in the logs.

Returns
----------
log: dict
A log-like dictionary with the Davies-Bouldin Score
"""

eval_fn = generic_unsupervised_sklearn_evaluator("davies_bouldin_score__evaluator__", davies_bouldin_score)

return eval_fn(test_data, features, prediction_column, eval_name)
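
Both new evaluators read the feature columns plus the prediction column, so they compose directly with the learner's output; a sketch with invented data:

import pandas as pd
from fklearn.validation.evaluators import (silhouette_score__evaluator,
                                           davies_bouldin_score__evaluator)

# The "prediction" column holds cluster labels, e.g. as written by kmeans_learner.
scored_df = pd.DataFrame({"x1": [1.0, 1.2, 9.8, 10.1],
                          "x2": [0.5, 0.4, 8.9, 9.2],
                          "prediction": [0, 0, 1, 1]})

logs = {**silhouette_score__evaluator(scored_df, features=["x1", "x2"]),
        **davies_bouldin_score__evaluator(scored_df, features=["x1", "x2"])}
# logs maps each evaluator name to its score, e.g.
# {"silhouette_score__evaluator__prediction": ..., "davies_bouldin_score__evaluator__prediction": ...}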
31 changes: 30 additions & 1 deletion tests/training/test_unsupervised.py
@@ -3,7 +3,7 @@

import pandas as pd

from fklearn.training.unsupervised import isolation_forest_learner
from fklearn.training.unsupervised import isolation_forest_learner, kmeans_learner


def test_anomaly_learner():
@@ -35,3 +35,32 @@ def test_anomaly_learner():
assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
assert (pred_test.columns == pred_train.columns).all()


def test_kmeans_learner():
df_train = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [10.0, 13.0, 100.0, 13.0],
'x2': [0, 1, 100, 0]
})

df_test = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [1200.0, 19000.0, -400.0, 0.0],
'x2': [1, 101111, 111110, 1]
})

# Standard Behavior
predict_fn, pred_train, log = kmeans_learner(df_train,
features=["x1", "x2"],
n_clusters=2,
extra_params={"random_state": 42})

pred_test = predict_fn(df_test)

expected_col_train = df_train.columns.tolist() + ["prediction"]
expected_col_test = df_test.columns.tolist() + ["prediction"]

assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
assert (pred_test.columns == pred_train.columns).all()
31 changes: 28 additions & 3 deletions tests/validation/test_evaluators.py
@@ -11,7 +11,8 @@
mean_prediction_evaluator, mse_evaluator, permutation_evaluator,
pr_auc_evaluator, precision_evaluator, r2_evaluator, recall_evaluator,
roc_auc_evaluator, spearman_evaluator, linear_coefficient_evaluator, ndcg_evaluator, split_evaluator,
temporal_split_evaluator, exponential_coefficient_evaluator, logistic_coefficient_evaluator)
temporal_split_evaluator, exponential_coefficient_evaluator, logistic_coefficient_evaluator,
silhouette_score__evaluator, davies_bouldin_score__evaluator)


def test_combined_evaluators():
@@ -475,7 +476,6 @@ def test_hash_evaluator():


def test_exponential_coefficient_evaluator():

a1 = -10
a0 = -2

@@ -492,7 +492,6 @@


def test_logistic_coefficient_evaluator():

predictions = pd.DataFrame(dict(
prediction=[1, 1, 1, 2, 2, 2, 3, 3, 3],
target=[0, 0, 0, 0, 0, 0, 1, 1, 1]
@@ -501,3 +500,29 @@
result = logistic_coefficient_evaluator(predictions)

assert round(result['logistic_coefficient_evaluator__target'], 3) == 20.645


def test_silhouette_score__evaluator():
predictions = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [10.0, 13.0, 100.0, 13.0],
'x2': [0, 1, 100, 0],
'prediction': [1, 1, 0, 1]
})

result = silhouette_score__evaluator(predictions, features=["x1", "x2"])

assert round(result['silhouette_score__evaluator__prediction'], 3) == 0.737


def test_davies_bouldin_score__evaluator():
predictions = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [10.0, 13.0, 100.0, 13.0],
'x2': [0, 1, 100, 0],
'prediction': [1, 1, 0, 1]
})

result = davies_bouldin_score__evaluator(predictions, features=["x1", "x2"])

assert round(result['davies_bouldin_score__evaluator__prediction'], 3) == 0.011