From 146ff9d23a4b7001e06d20e3e54081221af855b6 Mon Sep 17 00:00:00 2001 From: Florian Huber <36473328+florian-huber@users.noreply.github.com> Date: Wed, 2 Feb 2022 11:23:31 +0100 Subject: [PATCH 1/4] change allowed_missing_percentage default --- spec2vec/Spec2Vec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spec2vec/Spec2Vec.py b/spec2vec/Spec2Vec.py index 2ca6f43..3987d24 100644 --- a/spec2vec/Spec2Vec.py +++ b/spec2vec/Spec2Vec.py @@ -84,7 +84,7 @@ def spectrum_processing(s): """ def __init__(self, model: Word2Vec, intensity_weighting_power: Union[float, int] = 0, - allowed_missing_percentage: Union[float, int] = 0, progress_bar: bool = False): + allowed_missing_percentage: Union[float, int] = 50, progress_bar: bool = False): """ Parameters @@ -99,8 +99,8 @@ def __init__(self, model: Word2Vec, intensity_weighting_power: Union[float, int] allowed_missing_percentage: Set the maximum allowed percentage of the document that may be missing from the input model. This is measured as percentage of the weighted, missing - words compared to all word vectors of the document. Default is 0, which - means no missing words are allowed. + words compared to all word vectors of the document. Default is 50, which + means up to 50% missing words are allowed (not a very strict setting!). progress_bar: Set to True to monitor the embedding creating with a progress bar. Default is False. From 35a6abc02b1e1d94ebc9cc3a486ef6cccfcb0158 Mon Sep 17 00:00:00 2001 From: Florian Huber <36473328+florian-huber@users.noreply.github.com> Date: Wed, 2 Feb 2022 11:34:28 +0100 Subject: [PATCH 2/4] update code example --- spec2vec/Spec2Vec.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spec2vec/Spec2Vec.py b/spec2vec/Spec2Vec.py index 3987d24..cc1f323 100644 --- a/spec2vec/Spec2Vec.py +++ b/spec2vec/Spec2Vec.py @@ -78,8 +78,6 @@ def spectrum_processing(s): .. testoutput:: - Removed adduct M-H from compound name. - ... 
['CCMSLIB00001058300', 'CCMSLIB00001058289', 'CCMSLIB00001058303', ... """ From 9ec43de3beb81dd3dbf76b0e83523e950ee20c34 Mon Sep 17 00:00:00 2001 From: Florian Huber <36473328+florian-huber@users.noreply.github.com> Date: Wed, 2 Feb 2022 12:01:35 +0100 Subject: [PATCH 3/4] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f2b7bf..bf7578a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Now supports Python 3.9 (including CI test runs) [#40](https://github.com/iomega/spec2vec/issues/40) +## Changed + +- changed default setting for `allowed_missing_percentage` to 50.0 to be less strict on model coverage [#72](https://github.com/iomega/spec2vec/pull/72) + ## [0.5.0] - 2021-06-18 ## Changed From 9d36ff79c425c6c3336f1f63bb866e3edbc38c7b Mon Sep 17 00:00:00 2001 From: Florian Huber <36473328+florian-huber@users.noreply.github.com> Date: Wed, 2 Feb 2022 12:06:16 +0100 Subject: [PATCH 4/4] Update vector_operations.py --- spec2vec/vector_operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec2vec/vector_operations.py b/spec2vec/vector_operations.py index 7d13bea..6fdf7b5 100644 --- a/spec2vec/vector_operations.py +++ b/spec2vec/vector_operations.py @@ -45,7 +45,7 @@ def _check_model_coverage(): f"Weighted missing percentage not covered by the given model is {missing_percentage:.2f}%.") message = ("Missing percentage is larger than set maximum.", - "Consider retraining the used model or increasing the allowed percentage.") + "Consider retraining the used model or change the `allowed_missing_percentage`.") assert missing_percentage <= allowed_missing_percentage, message idx_not_in_model = [i for i, x in enumerate(document.words) if x not in model.wv.key_to_index]