Skip to content

Commit

Permalink
mixture and feature_selection modules
Browse files: browse the repository at this point in the history
  • Loading branch information
FBruzzesi committed Dec 15, 2024
1 parent 9693384 commit bac7e83
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 24 deletions.
7 changes: 4 additions & 3 deletions sklego/feature_selection/mrmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from sklearn.base import BaseEstimator
from sklearn.feature_selection import f_classif, f_regression
from sklearn.feature_selection._base import SelectorMixin
from sklearn.utils.validation import check_is_fitted, check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


def _redundancy_pearson(X, selected, left):
Expand Down Expand Up @@ -201,13 +202,13 @@ def fit(self, X, y):
k parameter is not integer type or is > n_features_in (X.shape[1]) or < 1
"""
X, y = check_X_y(X, y, dtype="numeric", y_numeric=True)
X, y = validate_data(self, X=X, y=y, dtype="numeric", y_numeric=True, reset=True)
_check_n_features(self, X, reset=True)
self._y_dtype = y.dtype

relevance = self._get_relevance
redundancy = self._get_redundancy

self.n_features_in_ = X.shape[1]
left_features = list(range(self.n_features_in_))
selected_features = []
selected_scores = []
Expand Down
16 changes: 10 additions & 6 deletions sklego/mixture/bayesian_gmm_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from scipy.special import softmax
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.mixture import BayesianGaussianMixture
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class BayesianGMMClassifier(ClassifierMixin, BaseEstimator):
Expand Down Expand Up @@ -77,9 +77,10 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "BayesianGMMClassifier":
self : BayesianGMMClassifier
The fitted estimator.
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
if X.ndim == 1:
X = np.expand_dims(X, 1)
_check_n_features(self, X, reset=True)

self.gmms_ = {}
self.classes_ = unique_labels(y)
Expand All @@ -106,7 +107,6 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "BayesianGMMClassifier":
)
self.gmms_[c] = mixture.fit(subset_x, subset_y)

self.n_features_in_ = X.shape[1]
self.n_iter_ = sum(mixture.n_iter_ for mixture in self.gmms_.values())

return self
Expand All @@ -125,7 +125,9 @@ def predict(self, X):
The predicted data.
"""
check_is_fitted(self, ["gmms_", "classes_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

return self.classes_[self.predict_proba(X).argmax(axis=1)]

def predict_proba(self, X):
Expand All @@ -141,8 +143,10 @@ def predict_proba(self, X):
array-like of shape (n_samples, n_classes)
The predicted probabilities.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["gmms_", "classes_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

res = np.zeros((X.shape[0], self.classes_.shape[0]))
for idx, c in enumerate(self.classes_):
res[:, idx] = self.gmms_[c].score_samples(X)
Expand Down
12 changes: 8 additions & 4 deletions sklego/mixture/bayesian_gmm_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from scipy.stats import gaussian_kde
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.mixture import BayesianGaussianMixture
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
Expand Down Expand Up @@ -109,10 +110,12 @@ def fit(self, X: np.ndarray, y=None) -> "BayesianGMMOutlierDetector":
"""

# GMM sometimes throws an error if you don't do this
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=True)
if len(X.shape) == 1:
X = np.expand_dims(X, 1)

_check_n_features(self, X, reset=True)

if (self.method == "quantile") and ((self.threshold > 1) or (self.threshold < 0)):
raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
if (self.method == "stddev") and (self.threshold < 0):
Expand Down Expand Up @@ -154,13 +157,14 @@ def fit(self, X: np.ndarray, y=None) -> "BayesianGMMOutlierDetector":
self.likelihood_threshold_ = mean_likelihood - (self.threshold * new_likelihoods_std)

self.n_iter_ = self.gmm_.n_iter_
self.n_features_in_ = X.shape[1]
self.offset_ = self.likelihood_threshold_
return self

def score_samples(self, X):
"""Compute the log likelihood for each sample and return the negative value."""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
if len(X.shape) == 1:
X = np.expand_dims(X, 1)
Expand Down
16 changes: 10 additions & 6 deletions sklego/mixture/gmm_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from scipy.special import softmax
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.mixture import GaussianMixture
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class GMMClassifier(ClassifierMixin, BaseEstimator):
Expand Down Expand Up @@ -72,9 +72,10 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GMMClassifier":
self : GMMClassifier
The fitted estimator.
"""
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
if X.ndim == 1:
X = np.expand_dims(X, 1)
_check_n_features(self, X, reset=True)

self.gmms_ = {}
self.classes_ = unique_labels(y)
Expand All @@ -98,7 +99,6 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GMMClassifier":
)
self.gmms_[c] = mixture.fit(subset_x, subset_y)

self.n_features_in_ = X.shape[1]
self.n_iter_ = sum(mixture.n_iter_ for mixture in self.gmms_.values())

return self
Expand All @@ -117,7 +117,9 @@ def predict(self, X):
The predicted data.
"""
check_is_fitted(self, ["gmms_", "classes_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

return self.classes_[self.predict_proba(X).argmax(axis=1)]

def predict_proba(self, X):
Expand All @@ -133,8 +135,10 @@ def predict_proba(self, X):
array-like of shape (n_samples, n_classes)
The predicted probabilities.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["gmms_", "classes_"])
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

res = np.zeros((X.shape[0], self.classes_.shape[0]))
for idx, c in enumerate(self.classes_):
res[:, idx] = self.gmms_[c].score_samples(X)
Expand Down
14 changes: 9 additions & 5 deletions sklego/mixture/gmm_outlier_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from scipy.stats import gaussian_kde
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.mixture import GaussianMixture
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class GMMOutlierDetector(OutlierMixin, BaseEstimator):
Expand Down Expand Up @@ -102,9 +103,10 @@ def fit(self, X: np.ndarray, y=None) -> "GMMOutlierDetector":
- If `method` is not in `["quantile", "stddev"]`.
"""
# GMM sometimes throws an error if you don't do this
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
if len(X.shape) == 1:
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=True)
if X.ndim == 1:
X = np.expand_dims(X, 1)
_check_n_features(self, X, reset=True)

if (self.method == "quantile") and ((self.threshold > 1) or (self.threshold < 0)):
raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
Expand Down Expand Up @@ -150,10 +152,12 @@ def fit(self, X: np.ndarray, y=None) -> "GMMOutlierDetector":

def score_samples(self, X):
"""Compute the log likelihood for each sample and return the negative value."""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
if len(X.shape) == 1:
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)

if X.ndim == 1:
X = np.expand_dims(X, 1)
_check_n_features(self, X, reset=False)

return self.gmm_.score_samples(X)

Expand Down

0 comments on commit bac7e83

Please sign in to comment.