
Merge pull request #53 from ncooder/arfs_corrections
Typos/docstrings corrections
ThomasBury authored Jan 31, 2025
2 parents 586e982 + 0f3220f commit a795388
Showing 6 changed files with 30 additions and 30 deletions.
20 changes: 10 additions & 10 deletions src/arfs/association.py
@@ -271,7 +271,7 @@ def theils_u_matrix(X, sample_weight=None, n_jobs=1, handle_na="drop"):
     cat_cols = dtypes_dic["cat"]

     if cat_cols and (len(cat_cols) >= 2):
-        # explicitely store the unique 2-permutation of column names
+        # explicitly store the unique 2-permutation of column names
         # permutations and not combinations because U is asymmetric
         comb_list = [comb for comb in permutations(cat_cols, 2)]
         # define the number of cores
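The comment corrected here makes a substantive point: Theil's U is asymmetric, so the matrix needs ordered pairs. A minimal, self-contained illustration of the difference (plain itertools, nothing ARFS-specific):

```python
from itertools import combinations, permutations

cols = ["a", "b", "c"]
print(list(combinations(cols, 2)))   # [('a', 'b'), ('a', 'c'), ('b', 'c')] -- unordered pairs
print(list(permutations(cols, 2)))   # also ('b', 'a'), ('c', 'a'), ('c', 'b') -- both orderings
# Symmetric measures (e.g. Cramer's V) only need combinations; the asymmetric
# Theil's U needs permutations because U(x, y) != U(y, x).
```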
@@ -439,7 +439,7 @@ def cramer_v_matrix(X, sample_weight=None, n_jobs=1, handle_na="drop"):
     cat_cols = dtypes_dic["cat"]

     if cat_cols and (len(cat_cols) >= 2):
-        # explicitely store the unique 2-combinations of column names
+        # explicitly store the unique 2-combinations of column names
         comb_list = [comb for comb in combinations(cat_cols, 2)]
         # define the number of cores
         n_jobs = (
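For reference, the statistic this matrix is built from: a minimal, unweighted sketch of Cramér's V from a contingency table. This is the textbook definition only; the ARFS version also supports sample weights, which this omits.

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramer_v(x, y):
    """Textbook (unweighted, uncorrected) Cramer's V between two categorical series."""
    table = pd.crosstab(x, y).to_numpy()
    chi2, _, _, _ = chi2_contingency(table)
    n = table.sum()
    r, c = table.shape
    return np.sqrt((chi2 / n) / min(r - 1, c - 1))
```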
@@ -656,7 +656,7 @@ def correlation_ratio_matrix(X, sample_weight=None, n_jobs=1, handle_na="drop"):
     num_cols = dtypes_dic["num"]

     if cat_cols and num_cols:
-        # explicitely store the unique 2-combinations of column names
+        # explicitly store the unique 2-combinations of column names
         # the first one should be the categorical predictor
         comb_list = list(product(cat_cols, num_cols))
         # define the number of cores
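Likewise, a minimal unweighted sketch of the correlation ratio (eta) computed for these cat-num pairs, with the categorical variable as the predictor, matching the comment above:

```python
import numpy as np

def correlation_ratio(categories, values):
    """Unweighted correlation ratio eta in [0, 1]: between-group over total dispersion."""
    categories = np.asarray(categories)
    values = np.asarray(values, dtype=float)
    grand_mean = values.mean()
    ss_between = sum(
        values[categories == g].size * (values[categories == g].mean() - grand_mean) ** 2
        for g in np.unique(categories)
    )
    ss_total = ((values - grand_mean) ** 2).sum()
    return np.sqrt(ss_between / ss_total) if ss_total > 0 else 0.0
```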
@@ -764,9 +764,9 @@ def wcov(x, y, w):
     Parameters
     ----------
     x : array-like of shape (n_samples,)
-        the perdictor 1 array
+        the predictor 1 array
     y : array-like of shape (n_samples,)
-        the perdictor 2 array
+        the predictor 2 array
     w : array-like of shape (n_samples,)
         the sample weights array
@@ -784,9 +784,9 @@ def wcorr(x, y, w):
     Parameters
     ----------
     x : array-like of shape (n_samples,)
-        the perdictor 1 array
+        the predictor 1 array
     y : array-like of shape (n_samples,)
-        the perdictor 2 array
+        the predictor 2 array
     w : array-like of shape (n_samples,)
         the sample weights array
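The corrected docstrings describe a weighted covariance and correlation of two predictor arrays. A minimal sketch consistent with these signatures; the actual ARFS implementations may differ in detail:

```python
import numpy as np

def wcov(x, y, w):
    """Weighted covariance between predictor arrays x and y with sample weights w."""
    mx = np.average(x, weights=w)
    my = np.average(y, weights=w)
    return np.average((x - mx) * (y - my), weights=w)

def wcorr(x, y, w):
    """Weighted Pearson correlation: weighted covariance normalized by weighted std devs."""
    return wcov(x, y, w) / np.sqrt(wcov(x, x, w) * wcov(y, y, w))
```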
@@ -1432,7 +1432,7 @@ def f_oneway_weighted(*args):
     """
     # how many levels (predictor)
     n_classes = len(args)
-    # convert to float 2-uple d'array
+    # convert to float 2-tuple array
     args = [as_float_array(a) for a in args]
     # compute the total weight per level
     weight_per_class = np.array([a[1].sum() for a in args])
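The `a[1].sum()` access in the context lines implies each positional argument is a (values, weights) pair, one per factor level. A hedged illustration of that input convention (the return value of `f_oneway_weighted` is not shown in this hunk):

```python
import numpy as np

# hypothetical inputs: one (values, weights) pair per level of the categorical predictor
level_a = (np.array([1.0, 2.0, 3.0]), np.array([1.0, 1.0, 2.0]))
level_b = (np.array([2.5, 3.5]), np.array([1.0, 0.5]))

args = [np.asarray(a, dtype=float) for a in (level_a, level_b)]  # mirrors as_float_array
weight_per_class = np.array([a[1].sum() for a in args])          # total weight per level
print(weight_per_class)  # [4.  1.5]
```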
@@ -2272,7 +2272,7 @@ def plot_association_matrix(
     The sorting is done using hierarchical clustering,
     very like in seaborn or other packages.
     Categorical(nom): uncertainty coefficient & correlation ratio from 0 to 1.
-    The uncertainty coefficient is assymmetrical, (approximating how much the elements on the
+    The uncertainty coefficient is asymmetrical, (approximating how much the elements on the
     left PROVIDE INFORMATION on elements in the row). Continuous(con): symmetrical numerical
     correlations (Spearman's) from -1 to 1
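Since the asymmetry of the uncertainty coefficient comes up in both docstrings corrected below, here is a compact sketch of the underlying statistic, under the standard definition U(x|y) = (H(x) - H(x|y)) / H(x). It is unweighted, unlike ARFS's implementation:

```python
import pandas as pd
from scipy.stats import entropy

def theils_u(x, y):
    """Uncertainty coefficient U(x|y): fraction of H(x) explained by y. Asymmetric."""
    h_x = entropy(pd.Series(x).value_counts(normalize=True))
    if h_x == 0:
        return 1.0
    df = pd.DataFrame({"x": x, "y": y})
    # conditional entropy H(x|y), weighted by P(y)
    h_x_given_y = sum(
        (len(grp) / len(df)) * entropy(grp["x"].value_counts(normalize=True))
        for _, grp in df.groupby("y")
    )
    return (h_x - h_x_given_y) / h_x
```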
@@ -2427,7 +2427,7 @@ def plot_association_matrix_int(
     title_str = "**Continuous (con) and Categorical (nom) Associations **"
     sub_title_str = (
         "*Categorical(nom): uncertainty coefficient & correlation ratio from 0 to 1. The uncertainty "
-        "coefficient is assymmetrical, (approximating how much the elements on the "
+        "coefficient is asymmetrical, (approximating how much the elements on the "
         "left PROVIDE INFORMATION on elements in the row). Continuous(con): symmetrical numerical "
         "correlations (Spearman's) from -1 to 1*"
     )
6 changes: 3 additions & 3 deletions src/arfs/benchmark.py
@@ -5,7 +5,7 @@
 Module Structure:
 -----------------
 - ``sklearn_pimp_bench``: function for comparing using the sklearn permutation importance
-- ``compare_varimp``: function for comparing using 3 kinds of var.imp.
+- ``compare_varimp``: function for comparing using possible 4 kinds of variable importance
 - ``highlight_tick``: function for highlighting specific (genuine or noise for instance) predictors in the importance chart
 """

@@ -33,7 +33,7 @@ def sklearn_pimp_bench(model, X, y, task="regression", sample_weight=None):
     y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)
         Targets for supervised or None for unsupervised.
     task : str, optional
-        kind of task, either 'regression' or 'classification", by default 'regression'
+        kind of task, either 'regression' or 'classification', by default 'regression'
     sample_weight : array-like of shape (n_samples,), optional
         Sample weights, by default None
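For context, the scikit-learn call this benchmark wraps; the scoring choice and `n_repeats` here are assumptions, not necessarily what `sklearn_pimp_bench` uses:

```python
from sklearn.inspection import permutation_importance

# model, X, y, task, sample_weight as in the docstring above
result = permutation_importance(
    model, X, y,
    scoring="r2" if task == "regression" else "roc_auc",  # assumed mapping
    n_repeats=10,                                         # assumed
    random_state=42,
    sample_weight=sample_weight,
)
ranked = result.importances_mean.argsort()[::-1]  # most important first
```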
@@ -152,7 +152,7 @@ def compare_varimp(feat_selector, models, X, y, sample_weight=None):


 def highlight_tick(str_match, figure, color="red", axis="y"):
-    """Highlight the x/y tick-labels if they contains a given string
+    """Highlight the x/y tick-labels if they contain a given string
     Parameters
     ----------
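A minimal matplotlib sketch of the tick-highlighting behavior the corrected docstring describes; the real `highlight_tick` takes a figure and an axis name, whereas this helper operates on an Axes for brevity:

```python
import matplotlib.pyplot as plt

def highlight_matching_ticks(ax, str_match, color="red", axis="y"):
    """Color every tick label on the chosen axis whose text contains str_match."""
    ticks = ax.get_yticklabels() if axis == "y" else ax.get_xticklabels()
    for tick in ticks:
        if str_match in tick.get_text():
            tick.set_color(color)
```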
2 changes: 1 addition & 1 deletion src/arfs/gbm.py
@@ -475,7 +475,7 @@ def _fit_early_stopped_lgb(
     params["verbosity"] = -1

     n_trees = params["num_boost_round"] if "num_boost_round" in params else 10_000
-    # remove key if exists to avoid LGB userwarnings
+    # remove key if exists to avoid LGB user warnings
     params.pop("num_boost_round", None)

     model = lgb.train(
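A hedged sketch of the early-stopped LightGBM training pattern around this fix; dataset construction and `stopping_rounds` are assumptions:

```python
import lightgbm as lgb

dtrain = lgb.Dataset(X_train, label=y_train)                    # assumed inputs
dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

# same pop-with-default pattern as the snippet above, in one step
n_trees = params.pop("num_boost_round", 10_000)

model = lgb.train(
    params,
    dtrain,
    num_boost_round=n_trees,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],         # assumed patience
)
```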
14 changes: 7 additions & 7 deletions src/arfs/parallel.py
@@ -17,14 +17,14 @@

 def parallel_matrix_entries(func, df, comb_list, sample_weight=None, n_jobs=-1):
     """parallel_matrix_entries applies a function to each chunk of
-    combinaison of columns of the dataframe, distributed by cores.
+    combination of columns of the dataframe, distributed by cores.
     This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py
     Parameters
     ----------
     func : callable
-        function to be applied to each column
+        function to be applied to each pair of columns in comb_list
     df : pd.DataFrame
         the dataframe on which to apply the function
     comb_list : list of tuples of str
@@ -145,8 +145,8 @@ def _compute_matrix_entries(
     sample_weight=None,
     func_xyw=None,
 ):
-    """base closure for computing matrix entries appling a function to each chunk of
-    combinaison of columns of the dataframe, distributed by cores.
+    """base closure for computing matrix entries applying a function to each chunk of
+    combination of columns of the dataframe, distributed by cores.
     This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py
     Parameters
@@ -158,13 +158,13 @@ def _compute_matrix_entries(
     func_xyw : callable, optional
         callable (function) for computing the individual elements of the matrix
         takes two mandatory inputs (x and y) and an optional input w, sample_weights
-    comb_list : list of 2-uple of str
+    comb_list : list of 2-tuple of str
         Pairs of column names corresponding to the entries
     Returns
     -------
-    pd.DataFrame
-        concatenated results into a single pandas DF
+    List[pd.DataFrame]
+        a list of partial dfs to be concatenated
     """
     v_df_list = [
         func_xyw(x=X[comb[0]], y=X[comb[1]], sample_weight=sample_weight, as_frame=True)
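A self-contained sketch of the pattern `parallel_matrix_entries` describes: split the list of column pairs into chunks and evaluate each chunk on a separate core. Chunking granularity and joblib settings are assumptions:

```python
import numpy as np
from joblib import Parallel, delayed

def _chunk_entries(func, df, pairs):
    # one matrix entry per column pair in this chunk
    return [func(df[a], df[b]) for a, b in pairs]

def matrix_entries_parallel(func, df, comb_list, n_jobs=-1):
    n_chunks = min(len(comb_list), 8)  # assumed chunk count
    chunks = np.array_split(comb_list, n_chunks)
    per_chunk = Parallel(n_jobs=n_jobs)(
        delayed(_chunk_entries)(func, df, chunk) for chunk in chunks
    )
    # flatten the per-chunk lists into one list of matrix entries
    return [entry for chunk in per_chunk for entry in chunk]
```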
8 changes: 4 additions & 4 deletions src/arfs/preprocessing.py
@@ -879,12 +879,12 @@ class PatsyTransformer(BaseEstimator, TransformerMixin):
         Pasty formula used to transform the data.
     add_intercept : boolean, default=False
-        Wether to add an intersept. By default scikit-learn has built-in
+        Whether to add an intercept. By default scikit-learn has built-in
         intercepts for all models, so we don't add an intercept to the data,
         even if one is specified in the formula.
     eval_env : environment or int, default=0
-        Envirionment in which to evalute the formula.
+        Environment in which to evaluate the formula.
         Defaults to the scope in which PatsyModel was instantiated.
     NA_action : string or NAAction, default="drop"
@@ -902,14 +902,14 @@ class PatsyTransformer(BaseEstimator, TransformerMixin):
         data type that transform method will return. Default is ``"dataframe"``
         for numpy array, but if you would like to get Pandas dataframe (for
         example for using it in scikit transformers with dataframe as input
-        use ``"dataframe"`` and if numpy array use ``"ndarray"``
+        use ``"dataframe"`` and if numpy array use ``"ndarray"``)
     Note
     ----
     PastyTransformer does by default not add an intercept, even if you
     specified it in the formula. You need to set add_intercept=True.
-    As scikit-learn transformers can not ouput y, the formula
+    As scikit-learn transformers can not output y, the formula
     should not contain a left hand side. If you need to transform both
     features and targets, use PatsyModel.
     """
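A hedged usage sketch of the transformer whose docstring is corrected above; the import path and formula are assumptions based on this diff:

```python
import numpy as np
import pandas as pd
from arfs.preprocessing import PatsyTransformer  # assumed import path

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [0.1, 0.2, 0.4]})

# right-hand-side formula only: as the Note says, the transformer cannot output y
pt = PatsyTransformer("x1 + np.log(x2)", add_intercept=False, return_type="dataframe")
X_design = pt.fit_transform(df)
```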
10 changes: 5 additions & 5 deletions src/arfs/sampling.py
@@ -19,7 +19,7 @@ def sample(df, n=1000, sample_weight=None, method="gower"):
     For the clustering algorithm, clusters are determined using the Gower distance
     (mixed type data) and the dataset is shrunk from n_samples to n_clusters.
-    For the isolation forest algorithm, samples are added till a suffisant 2-samples
+    For the isolation forest algorithm, samples are added till a sufficient 2-samples
     KS statistics is reached or if the number iteration reached the max number (20)
     Parameters
@@ -278,7 +278,7 @@ def _gower_distance_row(
     feature_weight_num : np.array
         weight vector for the numerical features
     feature_weight_sum : float
-        The sum of the wieghts
+        The sum of the weights
     ranges_of_numeric : np.array
         range of the scaled numerical features (between 0 and 1)
@@ -426,7 +426,7 @@ def get_5_percent_splits(length):


 def isolation_forest(X, sample_weight=None):
-    """fits isloation forest to the dataset and gives an anomally score to every sample
+    """fits isolation forest to the dataset and gives an anomaly score to every sample
     Parameters
     ----------
@@ -440,8 +440,8 @@ def isof_find_sample(X, sample_weight=None):


 def isof_find_sample(X, sample_weight=None):
-    """Finds a sample by comparing the distributions of the anomally scores between the sample and the original
-    distribution using the KS-test. Starts of a 5% howver will increase to 10% and then 15% etc. if a significant sample can not be found
+    """Finds a sample by comparing the distributions of the anomaly scores between the sample and the original
+    distribution using the KS-test. Starts of a 5% however will increase to 10% and then 15% etc. if a significant sample can not be found
     References
     ----------
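A hedged sketch of the loop `isof_find_sample` describes: score every row with an isolation forest, then grow the sample in 5% steps until the sampled anomaly-score distribution matches the full one under a two-sample KS test. The acceptance threshold and the cap of 20 iterations are assumptions consistent with `sample`'s docstring:

```python
import numpy as np
from scipy.stats import ks_2samp
from sklearn.ensemble import IsolationForest

def find_representative_sample(X, random_state=0):
    scores = IsolationForest(random_state=random_state).fit(X).score_samples(X)
    rng = np.random.default_rng(random_state)
    for step in range(1, 21):                 # 5%, 10%, ..., capped at 20 iterations
        size = max(2, int(0.05 * step * len(X)))
        idx = rng.choice(len(X), size=size, replace=False)
        if ks_2samp(scores[idx], scores).pvalue > 0.95:  # assumed acceptance threshold
            return idx                        # indices of a representative sample
    return np.arange(len(X))                  # fall back to the full dataset
```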
