mixture and feature_selection modules

koaning · Dec 15, 2024 · bac7e83 · bac7e83
1 parent 9693384
commit bac7e83
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 24 deletions.
diff --git a/sklego/feature_selection/mrmr.py b/sklego/feature_selection/mrmr.py
@@ -4,7 +4,8 @@
 from sklearn.base import BaseEstimator
 from sklearn.feature_selection import f_classif, f_regression
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils.validation import check_is_fitted, check_X_y
+from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 def _redundancy_pearson(X, selected, left):
@@ -201,13 +202,13 @@ def fit(self, X, y):
 
                 k parameter is not integer type or is > n_features_in (X.shape[1]) or < 1
         """
-        X, y = check_X_y(X, y, dtype="numeric", y_numeric=True)
+        X, y = validate_data(self, X=X, y=y, dtype="numeric", y_numeric=True, reset=True)
+        _check_n_features(self, X, reset=True)
         self._y_dtype = y.dtype
 
         relevance = self._get_relevance
         redundancy = self._get_redundancy
 
-        self.n_features_in_ = X.shape[1]
         left_features = list(range(self.n_features_in_))
         selected_features = []
         selected_scores = []

diff --git a/sklego/mixture/bayesian_gmm_classifier.py b/sklego/mixture/bayesian_gmm_classifier.py
@@ -2,9 +2,9 @@
 from scipy.special import softmax
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.mixture import BayesianGaussianMixture
-from sklearn.utils import check_X_y
 from sklearn.utils.multiclass import unique_labels
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class BayesianGMMClassifier(ClassifierMixin, BaseEstimator):
@@ -77,9 +77,10 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "BayesianGMMClassifier":
         self : BayesianGMMClassifier
             The fitted estimator.
         """
-        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
         if X.ndim == 1:
             X = np.expand_dims(X, 1)
+        _check_n_features(self, X, reset=True)
 
         self.gmms_ = {}
         self.classes_ = unique_labels(y)
@@ -106,7 +107,6 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "BayesianGMMClassifier":
             )
             self.gmms_[c] = mixture.fit(subset_x, subset_y)
 
-        self.n_features_in_ = X.shape[1]
         self.n_iter_ = sum(mixture.n_iter_ for mixture in self.gmms_.values())
 
         return self
@@ -125,7 +125,9 @@ def predict(self, X):
             The predicted data.
         """
         check_is_fitted(self, ["gmms_", "classes_"])
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         return self.classes_[self.predict_proba(X).argmax(axis=1)]
 
     def predict_proba(self, X):
@@ -141,8 +143,10 @@ def predict_proba(self, X):
         array-like of shape (n_samples, n_classes)
             The predicted probabilities.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
         check_is_fitted(self, ["gmms_", "classes_"])
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         res = np.zeros((X.shape[0], self.classes_.shape[0]))
         for idx, c in enumerate(self.classes_):
             res[:, idx] = self.gmms_[c].score_samples(X)

diff --git a/sklego/mixture/bayesian_gmm_detector.py b/sklego/mixture/bayesian_gmm_detector.py
@@ -5,7 +5,8 @@
 from scipy.stats import gaussian_kde
 from sklearn.base import BaseEstimator, OutlierMixin
 from sklearn.mixture import BayesianGaussianMixture
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
@@ -109,10 +110,12 @@ def fit(self, X: np.ndarray, y=None) -> "BayesianGMMOutlierDetector":
         """
 
         # GMM sometimes throws an error if you don't do this
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=True)
         if len(X.shape) == 1:
             X = np.expand_dims(X, 1)
 
+        _check_n_features(self, X, reset=True)
+
         if (self.method == "quantile") and ((self.threshold > 1) or (self.threshold < 0)):
             raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
         if (self.method == "stddev") and (self.threshold < 0):
@@ -154,13 +157,14 @@ def fit(self, X: np.ndarray, y=None) -> "BayesianGMMOutlierDetector":
             self.likelihood_threshold_ = mean_likelihood - (self.threshold * new_likelihoods_std)
 
         self.n_iter_ = self.gmm_.n_iter_
-        self.n_features_in_ = X.shape[1]
         self.offset_ = self.likelihood_threshold_
         return self
 
     def score_samples(self, X):
         """Compute the log likelihood for each sample and return the negative value."""
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
         if len(X.shape) == 1:
             X = np.expand_dims(X, 1)

diff --git a/sklego/mixture/gmm_classifier.py b/sklego/mixture/gmm_classifier.py
@@ -2,9 +2,9 @@
 from scipy.special import softmax
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.mixture import GaussianMixture
-from sklearn.utils import check_X_y
 from sklearn.utils.multiclass import unique_labels
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class GMMClassifier(ClassifierMixin, BaseEstimator):
@@ -72,9 +72,10 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GMMClassifier":
         self : GMMClassifier
             The fitted estimator.
         """
-        X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
         if X.ndim == 1:
             X = np.expand_dims(X, 1)
+        _check_n_features(self, X, reset=True)
 
         self.gmms_ = {}
         self.classes_ = unique_labels(y)
@@ -98,7 +99,6 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GMMClassifier":
             )
             self.gmms_[c] = mixture.fit(subset_x, subset_y)
 
-        self.n_features_in_ = X.shape[1]
         self.n_iter_ = sum(mixture.n_iter_ for mixture in self.gmms_.values())
 
         return self
@@ -117,7 +117,9 @@ def predict(self, X):
             The predicted data.
         """
         check_is_fitted(self, ["gmms_", "classes_"])
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         return self.classes_[self.predict_proba(X).argmax(axis=1)]
 
     def predict_proba(self, X):
@@ -133,8 +135,10 @@ def predict_proba(self, X):
         array-like of shape (n_samples, n_classes)
             The predicted probabilities.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
         check_is_fitted(self, ["gmms_", "classes_"])
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         res = np.zeros((X.shape[0], self.classes_.shape[0]))
         for idx, c in enumerate(self.classes_):
             res[:, idx] = self.gmms_[c].score_samples(X)

diff --git a/sklego/mixture/gmm_outlier_detector.py b/sklego/mixture/gmm_outlier_detector.py
@@ -5,7 +5,8 @@
 from scipy.stats import gaussian_kde
 from sklearn.base import BaseEstimator, OutlierMixin
 from sklearn.mixture import GaussianMixture
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class GMMOutlierDetector(OutlierMixin, BaseEstimator):
@@ -102,9 +103,10 @@ def fit(self, X: np.ndarray, y=None) -> "GMMOutlierDetector":
             - If `method` is not in `["quantile", "stddev"]`.
         """
         # GMM sometimes throws an error if you don't do this
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
-        if len(X.shape) == 1:
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=True)
+        if X.ndim == 1:
             X = np.expand_dims(X, 1)
+        _check_n_features(self, X, reset=True)
 
         if (self.method == "quantile") and ((self.threshold > 1) or (self.threshold < 0)):
             raise ValueError(f"Threshold {self.threshold} with method {self.method} needs to be 0 < threshold < 1")
@@ -150,10 +152,12 @@ def fit(self, X: np.ndarray, y=None) -> "GMMOutlierDetector":
 
     def score_samples(self, X):
         """Compute the log likelihood for each sample and return the negative value."""
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
         check_is_fitted(self, ["gmm_", "likelihood_threshold_"])
-        if len(X.shape) == 1:
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+
+        if X.ndim == 1:
             X = np.expand_dims(X, 1)
+        _check_n_features(self, X, reset=False)
 
         return self.gmm_.score_samples(X)