
Merge pull request #53 from ncooder/arfs_corrections
Typos/docstrings corrections
ThomasBury authored Jan 31, 2025
2 parents 586e982 + 0f3220f commit a795388
Showing 6 changed files with 30 additions and 30 deletions.
20 changes: 10 additions & 10 deletions src/arfs/association.py
@@ -271,7 +271,7 @@ def theils_u_matrix(X, sample_weight=None, n_jobs=1, handle_na="drop"):
     cat_cols = dtypes_dic["cat"]

     if cat_cols and (len(cat_cols) >= 2):
-        # explicitely store the unique 2-permutation of column names
+        # explicitly store the unique 2-permutation of column names
         # permutations and not combinations because U is asymmetric
         comb_list = [comb for comb in permutations(cat_cols, 2)]
         # define the number of cores
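The comment corrected here makes a substantive point: Theil's U is asymmetric, so the matrix needs ordered pairs. A minimal, self-contained illustration of the difference (plain itertools, nothing ARFS-specific):

```python
from itertools import combinations, permutations

cols = ["a", "b", "c"]
print(list(combinations(cols, 2)))   # [('a', 'b'), ('a', 'c'), ('b', 'c')] -- unordered pairs
print(list(permutations(cols, 2)))   # also ('b', 'a'), ('c', 'a'), ('c', 'b') -- both orderings
# Symmetric measures (e.g. Cramer's V) only need combinations; the asymmetric
# Theil's U needs permutations because U(x, y) != U(y, x).
```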
@@ -439,7 +439,7 @@ def cramer_v_matrix(X, sample_weight=None, n_jobs=1, handle_na="drop"):
     cat_cols = dtypes_dic["cat"]

     if cat_cols and (len(cat_cols) >= 2):
-        # explicitely store the unique 2-combinations of column names
+        # explicitly store the unique 2-combinations of column names
         comb_list = [comb for comb in combinations(cat_cols, 2)]
         # define the number of cores
         n_jobs = (
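For reference, the statistic this matrix is built from: a minimal, unweighted sketch of Cramér's V from a contingency table. This is the textbook definition only; the ARFS version also supports sample weights, which this omits.

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramer_v(x, y):
    """Textbook (unweighted, uncorrected) Cramer's V between two categorical series."""
    table = pd.crosstab(x, y).to_numpy()
    chi2, _, _, _ = chi2_contingency(table)
    n = table.sum()
    r, c = table.shape
    return np.sqrt((chi2 / n) / min(r - 1, c - 1))
```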
@@ -656,7 +656,7 @@ def correlation_ratio_matrix(X, sample_weight=None, n_jobs=1, handle_na="drop"):
     num_cols = dtypes_dic["num"]

     if cat_cols and num_cols:
-        # explicitely store the unique 2-combinations of column names
+        # explicitly store the unique 2-combinations of column names
         # the first one should be the categorical predictor
         comb_list = list(product(cat_cols, num_cols))
         # define the number of cores
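Likewise, a minimal unweighted sketch of the correlation ratio (eta) computed for these cat-num pairs, with the categorical variable as the predictor, matching the comment above:

```python
import numpy as np

def correlation_ratio(categories, values):
    """Unweighted correlation ratio eta in [0, 1]: between-group over total dispersion."""
    categories = np.asarray(categories)
    values = np.asarray(values, dtype=float)
    grand_mean = values.mean()
    ss_between = sum(
        values[categories == g].size * (values[categories == g].mean() - grand_mean) ** 2
        for g in np.unique(categories)
    )
    ss_total = ((values - grand_mean) ** 2).sum()
    return np.sqrt(ss_between / ss_total) if ss_total > 0 else 0.0
```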
@@ -764,9 +764,9 @@ def wcov(x, y, w):
     Parameters
     ----------
     x : array-like of shape (n_samples,)
-        the perdictor 1 array
+        the predictor 1 array
     y : array-like of shape (n_samples,)
-        the perdictor 2 array
+        the predictor 2 array
     w : array-like of shape (n_samples,)
         the sample weights array
@@ -784,9 +784,9 @@ def wcorr(x, y, w):
     Parameters
     ----------
     x : array-like of shape (n_samples,)
-        the perdictor 1 array
+        the predictor 1 array
     y : array-like of shape (n_samples,)
-        the perdictor 2 array
+        the predictor 2 array
     w : array-like of shape (n_samples,)
         the sample weights array
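The corrected docstrings describe a weighted covariance and correlation of two predictor arrays. A minimal sketch consistent with these signatures; the actual ARFS implementations may differ in detail:

```python
import numpy as np

def wcov(x, y, w):
    """Weighted covariance between predictor arrays x and y with sample weights w."""
    mx = np.average(x, weights=w)
    my = np.average(y, weights=w)
    return np.average((x - mx) * (y - my), weights=w)

def wcorr(x, y, w):
    """Weighted Pearson correlation: weighted covariance normalized by weighted std devs."""
    return wcov(x, y, w) / np.sqrt(wcov(x, x, w) * wcov(y, y, w))
```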
@@ -1432,7 +1432,7 @@ def f_oneway_weighted(*args):
     """
     # how many levels (predictor)
     n_classes = len(args)
-    # convert to float 2-uple d'array
+    # convert to float 2-tuple array
     args = [as_float_array(a) for a in args]
     # compute the total weight per level
     weight_per_class = np.array([a[1].sum() for a in args])
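The `a[1].sum()` access in the context lines implies each positional argument is a (values, weights) pair, one per factor level. A hedged illustration of that input convention (the return value of `f_oneway_weighted` is not shown in this hunk):

```python
import numpy as np

# hypothetical inputs: one (values, weights) pair per level of the categorical predictor
level_a = (np.array([1.0, 2.0, 3.0]), np.array([1.0, 1.0, 2.0]))
level_b = (np.array([2.5, 3.5]), np.array([1.0, 0.5]))

args = [np.asarray(a, dtype=float) for a in (level_a, level_b)]  # mirrors as_float_array
weight_per_class = np.array([a[1].sum() for a in args])          # total weight per level
print(weight_per_class)  # [4.  1.5]
```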
@@ -2272,7 +2272,7 @@ def plot_association_matrix(
     The sorting is done using hierarchical clustering,
     very like in seaborn or other packages.
     Categorical(nom): uncertainty coefficient & correlation ratio from 0 to 1.
-    The uncertainty coefficient is assymmetrical, (approximating how much the elements on the
+    The uncertainty coefficient is asymmetrical, (approximating how much the elements on the
     left PROVIDE INFORMATION on elements in the row). Continuous(con): symmetrical numerical
     correlations (Spearman's) from -1 to 1
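Since the asymmetry of the uncertainty coefficient comes up in both docstrings corrected below, here is a compact sketch of the underlying statistic, under the standard definition U(x|y) = (H(x) - H(x|y)) / H(x). It is unweighted, unlike ARFS's implementation:

```python
import pandas as pd
from scipy.stats import entropy

def theils_u(x, y):
    """Uncertainty coefficient U(x|y): fraction of H(x) explained by y. Asymmetric."""
    h_x = entropy(pd.Series(x).value_counts(normalize=True))
    if h_x == 0:
        return 1.0
    df = pd.DataFrame({"x": x, "y": y})
    # conditional entropy H(x|y), weighted by P(y)
    h_x_given_y = sum(
        (len(grp) / len(df)) * entropy(grp["x"].value_counts(normalize=True))
        for _, grp in df.groupby("y")
    )
    return (h_x - h_x_given_y) / h_x
```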
@@ -2427,7 +2427,7 @@ def plot_association_matrix_int(
     title_str = "**Continuous (con) and Categorical (nom) Associations **"
     sub_title_str = (
         "*Categorical(nom): uncertainty coefficient & correlation ratio from 0 to 1. The uncertainty "
-        "coefficient is assymmetrical, (approximating how much the elements on the "
+        "coefficient is asymmetrical, (approximating how much the elements on the "
         "left PROVIDE INFORMATION on elements in the row). Continuous(con): symmetrical numerical "
         "correlations (Spearman's) from -1 to 1*"
     )
6 changes: 3 additions & 3 deletions src/arfs/benchmark.py
@@ -5,7 +5,7 @@
 Module Structure:
 -----------------
 - ``sklearn_pimp_bench``: function for comparing using the sklearn permutation importance
-- ``compare_varimp``: function for comparing using 3 kinds of var.imp.
+- ``compare_varimp``: function for comparing using possible 4 kinds of variable importance
 - ``highlight_tick``: function for highlighting specific (genuine or noise for instance) predictors in the importance chart
 """

@@ -33,7 +33,7 @@ def sklearn_pimp_bench(model, X, y, task="regression", sample_weight=None):
     y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)
         Targets for supervised or None for unsupervised.
     task : str, optional
-        kind of task, either 'regression' or 'classification", by default 'regression'
+        kind of task, either 'regression' or 'classification', by default 'regression'
     sample_weight : array-like of shape (n_samples,), optional
         Sample weights, by default None
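For context, the scikit-learn call this benchmark wraps; the scoring choice and `n_repeats` here are assumptions, not necessarily what `sklearn_pimp_bench` uses:

```python
from sklearn.inspection import permutation_importance

# model, X, y, task, sample_weight as in the docstring above
result = permutation_importance(
    model, X, y,
    scoring="r2" if task == "regression" else "roc_auc",  # assumed mapping
    n_repeats=10,                                         # assumed
    random_state=42,
    sample_weight=sample_weight,
)
ranked = result.importances_mean.argsort()[::-1]  # most important first
```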
@@ -152,7 +152,7 @@ def compare_varimp(feat_selector, models, X, y, sample_weight=None):


 def highlight_tick(str_match, figure, color="red", axis="y"):
-    """Highlight the x/y tick-labels if they contains a given string
+    """Highlight the x/y tick-labels if they contain a given string
     Parameters
     ----------
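A minimal matplotlib sketch of the tick-highlighting behavior the corrected docstring describes; the real `highlight_tick` takes a figure and an axis name, whereas this helper operates on an Axes for brevity:

```python
import matplotlib.pyplot as plt

def highlight_matching_ticks(ax, str_match, color="red", axis="y"):
    """Color every tick label on the chosen axis whose text contains str_match."""
    ticks = ax.get_yticklabels() if axis == "y" else ax.get_xticklabels()
    for tick in ticks:
        if str_match in tick.get_text():
            tick.set_color(color)
```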
2 changes: 1 addition & 1 deletion src/arfs/gbm.py
@@ -475,7 +475,7 @@ def _fit_early_stopped_lgb(
     params["verbosity"] = -1

     n_trees = params["num_boost_round"] if "num_boost_round" in params else 10_000
-    # remove key if exists to avoid LGB userwarnings
+    # remove key if exists to avoid LGB user warnings
     params.pop("num_boost_round", None)

     model = lgb.train(
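A hedged sketch of the early-stopped LightGBM training pattern around this fix; dataset construction and `stopping_rounds` are assumptions:

```python
import lightgbm as lgb

dtrain = lgb.Dataset(X_train, label=y_train)                    # assumed inputs
dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

# same pop-with-default pattern as the snippet above, in one step
n_trees = params.pop("num_boost_round", 10_000)

model = lgb.train(
    params,
    dtrain,
    num_boost_round=n_trees,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],         # assumed patience
)
```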
14 changes: 7 additions & 7 deletions src/arfs/parallel.py
@@ -17,14 +17,14 @@

 def parallel_matrix_entries(func, df, comb_list, sample_weight=None, n_jobs=-1):
     """parallel_matrix_entries applies a function to each chunk of
-    combinaison of columns of the dataframe, distributed by cores.
+    combination of columns of the dataframe, distributed by cores.
     This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py
     Parameters
     ----------
     func : callable
-        function to be applied to each column
+        function to be applied to each pair of columns in comb_list
     df : pd.DataFrame
         the dataframe on which to apply the function
     comb_list : list of tuples of str
@@ -145,8 +145,8 @@ def _compute_matrix_entries(
     sample_weight=None,
     func_xyw=None,
 ):
-    """base closure for computing matrix entries appling a function to each chunk of
-    combinaison of columns of the dataframe, distributed by cores.
+    """base closure for computing matrix entries applying a function to each chunk of
+    combination of columns of the dataframe, distributed by cores.
     This is similar to https://github.com/smazzanti/mrmr/mrmr/pandas.py
     Parameters
@@ -158,13 +158,13 @@ def _compute_matrix_entries(
     func_xyw : callable, optional
         callable (function) for computing the individual elements of the matrix
         takes two mandatory inputs (x and y) and an optional input w, sample_weights
-    comb_list : list of 2-uple of str
+    comb_list : list of 2-tuple of str
         Pairs of column names corresponding to the entries
     Returns
     -------
-    pd.DataFrame
-        concatenated results into a single pandas DF
+    List[pd.DataFrame]
+        a list of partial dfs to be concatenated
     """
     v_df_list = [
         func_xyw(x=X[comb[0]], y=X[comb[1]], sample_weight=sample_weight, as_frame=True)
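A self-contained sketch of the pattern `parallel_matrix_entries` describes: split the list of column pairs into chunks and evaluate each chunk on a separate core. Chunking granularity and joblib settings are assumptions:

```python
import numpy as np
from joblib import Parallel, delayed

def _chunk_entries(func, df, pairs):
    # one matrix entry per column pair in this chunk
    return [func(df[a], df[b]) for a, b in pairs]

def matrix_entries_parallel(func, df, comb_list, n_jobs=-1):
    n_chunks = min(len(comb_list), 8)  # assumed chunk count
    chunks = np.array_split(comb_list, n_chunks)
    per_chunk = Parallel(n_jobs=n_jobs)(
        delayed(_chunk_entries)(func, df, chunk) for chunk in chunks
    )
    # flatten the per-chunk lists into one list of matrix entries
    return [entry for chunk in per_chunk for entry in chunk]
```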
8 changes: 4 additions & 4 deletions src/arfs/preprocessing.py
@@ -879,12 +879,12 @@ class PatsyTransformer(BaseEstimator, TransformerMixin):
         Pasty formula used to transform the data.
     add_intercept : boolean, default=False
-        Wether to add an intersept. By default scikit-learn has built-in
+        Whether to add an intercept. By default scikit-learn has built-in
         intercepts for all models, so we don't add an intercept to the data,
         even if one is specified in the formula.
     eval_env : environment or int, default=0
-        Envirionment in which to evalute the formula.
+        Environment in which to evaluate the formula.
         Defaults to the scope in which PatsyModel was instantiated.
     NA_action : string or NAAction, default="drop"
@@ -902,14 +902,14 @@ class PatsyTransformer(BaseEstimator, TransformerMixin):
         data type that transform method will return. Default is ``"dataframe"``
         for numpy array, but if you would like to get Pandas dataframe (for
         example for using it in scikit transformers with dataframe as input
-        use ``"dataframe"`` and if numpy array use ``"ndarray"``
+        use ``"dataframe"`` and if numpy array use ``"ndarray"``)
     Note
     ----
     PastyTransformer does by default not add an intercept, even if you
     specified it in the formula. You need to set add_intercept=True.
-    As scikit-learn transformers can not ouput y, the formula
+    As scikit-learn transformers can not output y, the formula
     should not contain a left hand side. If you need to transform both
     features and targets, use PatsyModel.
     """
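A hedged usage sketch of the transformer whose docstring is corrected above; the import path and formula are assumptions based on this diff:

```python
import numpy as np
import pandas as pd
from arfs.preprocessing import PatsyTransformer  # assumed import path

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [0.1, 0.2, 0.4]})

# right-hand-side formula only: as the Note says, the transformer cannot output y
pt = PatsyTransformer("x1 + np.log(x2)", add_intercept=False, return_type="dataframe")
X_design = pt.fit_transform(df)
```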
10 changes: 5 additions & 5 deletions src/arfs/sampling.py
@@ -19,7 +19,7 @@ def sample(df, n=1000, sample_weight=None, method="gower"):
     For the clustering algorithm, clusters are determined using the Gower distance
     (mixed type data) and the dataset is shrunk from n_samples to n_clusters.
-    For the isolation forest algorithm, samples are added till a suffisant 2-samples
+    For the isolation forest algorithm, samples are added till a sufficient 2-samples
     KS statistics is reached or if the number iteration reached the max number (20)
     Parameters
@@ -278,7 +278,7 @@ def _gower_distance_row(
     feature_weight_num : np.array
         weight vector for the numerical features
     feature_weight_sum : float
-        The sum of the wieghts
+        The sum of the weights
     ranges_of_numeric : np.array
         range of the scaled numerical features (between 0 and 1)
@@ -426,7 +426,7 @@ def get_5_percent_splits(length):


 def isolation_forest(X, sample_weight=None):
-    """fits isloation forest to the dataset and gives an anomally score to every sample
+    """fits isolation forest to the dataset and gives an anomaly score to every sample
     Parameters
     ----------
@@ -440,8 +440,8 @@ def isof_find_sample(X, sample_weight=None):


 def isof_find_sample(X, sample_weight=None):
-    """Finds a sample by comparing the distributions of the anomally scores between the sample and the original
-    distribution using the KS-test. Starts of a 5% howver will increase to 10% and then 15% etc. if a significant sample can not be found
+    """Finds a sample by comparing the distributions of the anomaly scores between the sample and the original
+    distribution using the KS-test. Starts of a 5% however will increase to 10% and then 15% etc. if a significant sample can not be found
     References
     ----------
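A hedged sketch of the loop `isof_find_sample` describes: score every row with an isolation forest, then grow the sample in 5% steps until the sampled anomaly-score distribution matches the full one under a two-sample KS test. The acceptance threshold and the cap of 20 iterations are assumptions consistent with `sample`'s docstring:

```python
import numpy as np
from scipy.stats import ks_2samp
from sklearn.ensemble import IsolationForest

def find_representative_sample(X, random_state=0):
    scores = IsolationForest(random_state=random_state).fit(X).score_samples(X)
    rng = np.random.default_rng(random_state)
    for step in range(1, 21):                 # 5%, 10%, ..., capped at 20 iterations
        size = max(2, int(0.05 * step * len(X)))
        idx = rng.choice(len(X), size=size, replace=False)
        if ks_2samp(scores[idx], scores).pvalue > 0.95:  # assumed acceptance threshold
            return idx                        # indices of a representative sample
    return np.arange(len(X))                  # fall back to the full dataset
```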
