From de1fd987ac15af2a3a61338836b17b83cd530144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Mon, 30 Sep 2019 17:00:12 +0200 Subject: [PATCH] Fix stopwords filtering --- orangecontrib/text/preprocess/filter.py | 5 +++-- orangecontrib/text/tests/test_preprocess.py | 20 +++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py index f79ab09fa..e9e84a1f6 100644 --- a/orangecontrib/text/preprocess/filter.py +++ b/orangecontrib/text/preprocess/filter.py @@ -78,7 +78,7 @@ def supported_languages(): except LookupError: # when no NLTK data is available pass - return [file.capitalize() for file in stopwords_listdir] + return sorted(file.capitalize() for file in stopwords_listdir) @wait_nltk_data def __init__(self, language='English', word_list=None): @@ -96,7 +96,8 @@ def language(self, value): if not self._language: self.stopwords = [] else: - self.stopwords = set(stopwords.words(self.language.lower())) + self.stopwords = set( + x.strip() for x in stopwords.words(self.language.lower())) def __str__(self): config = '' diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index e2cc7f943..0dc51a3ea 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -276,10 +276,24 @@ def check(self, token): self.assertEqual(df([['a', '1']]), [['a']]) def test_stopwords(self): - filter = preprocess.StopwordsFilter('english') + f = preprocess.StopwordsFilter('english') - self.assertFalse(filter.check('a')) - self.assertTrue(filter.check('filter')) + self.assertFalse(f.check('a')) + self.assertTrue(f.check('filter')) + + self.assertListEqual( + ["snake", "house"], + f(["a", "snake", "is", "in", "a", "house"])) + + def test_stopwords_slovene(self): + f = preprocess.StopwordsFilter('slovene') + + self.assertFalse(f.check('in')) + 
self.assertTrue(f.check('abeceda')) + + self.assertListEqual( + ["kača", "hiši"], + f(["kača", "je", "v", "hiši", "in"])) def test_lexicon(self): filter = preprocess.LexiconFilter(['filter'])