Add K-means algorithm and unsupervised evaluators #219

Open · wants to merge 1 commit into base: master
72 changes: 72 additions & 0 deletions src/fklearn/training/unsupervised.py
@@ -2,6 +2,7 @@

import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
import sklearn
from toolz import curry, merge

@@ -71,3 +72,74 @@ def p(new_df: pd.DataFrame) -> pd.DataFrame:


isolation_forest_learner.__doc__ += learner_return_docstring("Isolation Forest")


@curry
@log_learner_time(learner_name='kmeans_learner')
def kmeans_learner(df: pd.DataFrame,
features: List[str],
n_clusters: int = 8,
extra_params: Dict[str, Any] = None,
prediction_column: str = "prediction",
encode_extra_cols: bool = True) -> LearnerReturnType:
"""
The KMeans algorithm clusters data by trying to separate samples into n groups of equal variance, minimizing a
criterion known as the inertia, or within-cluster sum-of-squares. This algorithm requires the number of
clusters to be specified. For now, the implementation is limited to Euclidean distance.

Parameters
----------
df : pandas.DataFrame
A Pandas' DataFrame with features.
The model will be trained to split data into k groups
from the features.

features : list of str
A list of column names that are used as features for the model. All these names
should be in `df`.

n_clusters : int
The number of clusters to form as well as the number of centroids to generate.

extra_params : dict
The KMeans parameters in the format {"par_name": param}. See:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

prediction_column : str
The name of the column with the predictions from the model.

encode_extra_cols : bool (default: True)
If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
"""

default_params = {"init": "k-means++", "n_init": 10, "max_iter": 300, "tol": 1e-4}
params = default_params if not extra_params else merge(default_params, extra_params)

features = features if not encode_extra_cols else expand_features_encoded(df, features)

model = KMeans(n_clusters=n_clusters)
model.set_params(**params)
model.fit(df[features].values)

def p(new_df: pd.DataFrame) -> pd.DataFrame:
    output_col = {prediction_column: model.predict(new_df[features].values)}

    return new_df.assign(**output_col)

p.__doc__ = learner_pred_fn_docstring("kmeans_learner")

log = {'kmeans_learner': {
'features': features,
'n_clusters': n_clusters,
'centers': {i: model.cluster_centers_[i].tolist() for i in range(model.n_clusters)},
'parameters': params,
'prediction_column': prediction_column,
'package': "sklearn",
'package_version': sklearn.__version__,
'training_samples': len(df)}}

return p, p(df), log


kmeans_learner.__doc__ += learner_return_docstring("K-Means clustering")
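
A minimal usage sketch of the new learner (the DataFrame contents and column names are invented for illustration; random_state is just one pass-through KMeans parameter):

import pandas as pd
from fklearn.training.unsupervised import kmeans_learner

df = pd.DataFrame({"x1": [1.0, 1.2, 9.8, 10.1], "x2": [0.5, 0.4, 8.9, 9.2]})

# The learner is curried, so it can be partially applied before seeing data.
cluster_learner = kmeans_learner(features=["x1", "x2"], n_clusters=2,
                                 extra_params={"random_state": 0})
predict_fn, df_with_clusters, log = cluster_learner(df)

# predict_fn writes cluster labels for new data into the "prediction" column.
new_labels = predict_fn(df)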
119 changes: 118 additions & 1 deletion src/fklearn/validation/evaluators.py
@@ -8,7 +8,7 @@
from sklearn.metrics import (average_precision_score, brier_score_loss,
fbeta_score, log_loss, mean_absolute_error,
mean_squared_error, precision_score, r2_score,
recall_score, roc_auc_score)
recall_score, roc_auc_score, silhouette_score, davies_bouldin_score)
from toolz import curry, last, first
from scipy import optimize
from sklearn.linear_model import LogisticRegression
@@ -58,6 +58,50 @@ def p(test_data: pd.DataFrame,
return p


def generic_unsupervised_sklearn_evaluator(name_prefix: str,
sklearn_metric: Callable[..., float]) -> UncurriedEvalFnType:
"""
Returns an evaluator built from a sklearn.metrics metric that does not require a target column

Parameters
----------
name_prefix: str
The default name of the evaluator will be name_prefix + prediction_column.

sklearn_metric: Callable
Metric function from sklearn.metrics. It should take the feature matrix and the predicted labels as parameters, plus optional keyword arguments.

Returns
----------
eval_fn: Callable
An evaluator function that uses the provided metric
"""

def p(test_data: pd.DataFrame,
features: List[str],
prediction_column: str = "prediction",
eval_name: str = None,
**kwargs: Any) -> EvalReturnType:
try:
df = test_data[features].values

labels = test_data[prediction_column].values

score = sklearn_metric(df,
labels,
**kwargs)
except ValueError:
# some metrics are undefined for degenerate labelings, e.g. silhouette_score
# and davies_bouldin_score raise ValueError when all samples share one cluster
score = np.nan

if eval_name is None:
eval_name = name_prefix + prediction_column

return {eval_name: score}

return p
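
A sketch of what this factory generalizes to: any sklearn metric with an (X, labels) signature should plug in the same way. calinski_harabasz_score below is only an illustrative choice, not part of this PR:

from sklearn.metrics import calinski_harabasz_score

calinski_harabasz_eval_fn = generic_unsupervised_sklearn_evaluator(
    "calinski_harabasz_score__evaluator__", calinski_harabasz_score)
# calinski_harabasz_eval_fn(test_data, features=["x1", "x2"]) would return
# {"calinski_harabasz_score__evaluator__prediction": <score>}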


@curry
def auc_evaluator(test_data: pd.DataFrame,
prediction_column: str = "prediction",
@@ -1069,3 +1113,76 @@ def logistic_coefficient_evaluator(test_data: pd.DataFrame,
test_data[target_column]).coef_[0][0]

return {eval_name: score}


@curry
def silhouette_score__evaluator(test_data: pd.DataFrame,
features: List[str],
prediction_column: str = "prediction",
eval_name: str = None) -> EvalReturnType:
"""
The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and
the mean nearest-cluster distance (b) for each sample. The Silhouette Coefficient
for a sample is (b - a) / max(a, b). To clarify, b is the mean distance between a sample and the points of
the nearest cluster that the sample is not a part of.

Parameters
----------
test_data : Pandas' DataFrame
A Pandas' DataFrame with the features and the cluster predictions.

features : list of str
A list of column names that were used as features for the model. All these names
should be in `test_data`.

prediction_column : str
The name of the column in `test_data` with the predicted cluster labels.

eval_name : str, optional (default=None)
The name of the evaluator as it will appear in the logs.

Returns
----------
log: dict
A log-like dictionary with the Silhouette Score
"""

eval_fn = generic_unsupervised_sklearn_evaluator("silhouette_score__evaluator__", silhouette_score)

return eval_fn(test_data, features, prediction_column, eval_name)
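
The (b - a) / max(a, b) formula is easy to check by hand against sklearn; a small sketch with made-up one-dimensional data:

import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score

X = np.array([[0.0], [1.0], [10.0], [11.0]])
labels = np.array([0, 0, 1, 1])

# For the sample at 0.0: a = |0 - 1| = 1.0 (mean intra-cluster distance) and
# b = (10 + 11) / 2 = 10.5 (mean distance to the nearest other cluster), so its
# coefficient is (10.5 - 1.0) / 10.5, roughly 0.905.
assert np.isclose(silhouette_samples(X, labels)[0], (10.5 - 1.0) / 10.5)

# silhouette_score is the mean coefficient over all samples.
assert np.isclose(silhouette_score(X, labels), silhouette_samples(X, labels).mean())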


@curry
def davies_bouldin_score__evaluator(test_data: pd.DataFrame,
features: List[str],
prediction_column: str = "prediction",
eval_name: str = None) -> EvalReturnType:
"""
The Davies-Bouldin score is defined as the average similarity measure of each cluster with its most similar
cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. Thus,
clusters which are farther apart and less dispersed will result in a better (lower) score; zero is the minimum.

Parameters
----------
test_data : Pandas' DataFrame
A Pandas' DataFrame with the features and the cluster predictions.

features : list of str
A list of column names that were used as features for the model. All these names
should be in `test_data`.

prediction_column : str
The name of the column in `test_data` with the predicted cluster labels.

eval_name : str, optional (default=None)
The name of the evaluator as it will appear in the logs.

Returns
----------
log: dict
A log-like dictionary with the Davies-Bouldin Score
"""

eval_fn = generic_unsupervised_sklearn_evaluator("davies_bouldin_score__evaluator__", davies_bouldin_score)

return eval_fn(test_data, features, prediction_column, eval_name)
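
Both new evaluators read the feature columns plus the prediction column, so they compose directly with the learner's output; a sketch with invented data:

import pandas as pd
from fklearn.validation.evaluators import (silhouette_score__evaluator,
                                           davies_bouldin_score__evaluator)

# The "prediction" column holds cluster labels, e.g. as written by kmeans_learner.
scored_df = pd.DataFrame({"x1": [1.0, 1.2, 9.8, 10.1],
                          "x2": [0.5, 0.4, 8.9, 9.2],
                          "prediction": [0, 0, 1, 1]})

logs = {**silhouette_score__evaluator(scored_df, features=["x1", "x2"]),
        **davies_bouldin_score__evaluator(scored_df, features=["x1", "x2"])}
# logs maps each evaluator name to its score, e.g.
# {"silhouette_score__evaluator__prediction": ..., "davies_bouldin_score__evaluator__prediction": ...}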
31 changes: 30 additions & 1 deletion tests/training/test_unsupervised.py
@@ -3,7 +3,7 @@

import pandas as pd

from fklearn.training.unsupervised import isolation_forest_learner
from fklearn.training.unsupervised import isolation_forest_learner, kmeans_learner


def test_anomaly_learner():
@@ -35,3 +35,32 @@ def test_anomaly_learner():
assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
assert (pred_test.columns == pred_train.columns).all()


def test_kmeans_learner():
df_train = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [10.0, 13.0, 100.0, 13.0],
'x2': [0, 1, 100, 0]
})

df_test = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [1200.0, 19000.0, -400.0, 0.0],
'x2': [1, 101111, 111110, 1]
})

# Standard Behavior
predict_fn, pred_train, log = kmeans_learner(df_train,
features=["x1", "x2"],
n_clusters=2,
extra_params={"random_state": 42})

pred_test = predict_fn(df_test)

expected_col_train = df_train.columns.tolist() + ["prediction"]
expected_col_test = df_test.columns.tolist() + ["prediction"]

assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
assert (pred_test.columns == pred_train.columns).all()
31 changes: 28 additions & 3 deletions tests/validation/test_evaluators.py
@@ -11,7 +11,8 @@
mean_prediction_evaluator, mse_evaluator, permutation_evaluator,
pr_auc_evaluator, precision_evaluator, r2_evaluator, recall_evaluator,
roc_auc_evaluator, spearman_evaluator, linear_coefficient_evaluator, ndcg_evaluator, split_evaluator,
temporal_split_evaluator, exponential_coefficient_evaluator, logistic_coefficient_evaluator)
temporal_split_evaluator, exponential_coefficient_evaluator, logistic_coefficient_evaluator,
silhouette_score__evaluator, davies_bouldin_score__evaluator)


def test_combined_evaluators():
@@ -475,7 +476,6 @@ def test_hash_evaluator():


def test_exponential_coefficient_evaluator():

a1 = -10
a0 = -2

@@ -492,7 +492,6 @@


def test_logistic_coefficient_evaluator():

predictions = pd.DataFrame(dict(
prediction=[1, 1, 1, 2, 2, 2, 3, 3, 3],
target=[0, 0, 0, 0, 0, 0, 1, 1, 1]
@@ -501,3 +500,29 @@
result = logistic_coefficient_evaluator(predictions)

assert round(result['logistic_coefficient_evaluator__target'], 3) == 20.645


def test_silhouette_score__evaluator():
predictions = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [10.0, 13.0, 100.0, 13.0],
'x2': [0, 1, 100, 0],
'prediction': [1, 1, 0, 1]
})

result = silhouette_score__evaluator(predictions, features=["x1", "x2"])

assert round(result['silhouette_score__evaluator__prediction'], 3) == 0.737


def test_davies_bouldin_score__evaluator():
predictions = pd.DataFrame({
'id': ["id1", "id2", "id3", "id4"],
'x1': [10.0, 13.0, 100.0, 13.0],
'x2': [0, 1, 100, 0],
'prediction': [1, 1, 0, 1]
})

result = davies_bouldin_score__evaluator(predictions, features=["x1", "x2"])

assert round(result['davies_bouldin_score__evaluator__prediction'], 3) == 0.011