From de1fd987ac15af2a3a61338836b17b83cd530144 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Mon, 30 Sep 2019 17:00:12 +0200
Subject: [PATCH] Fix stopwords filtering
---
orangecontrib/text/preprocess/filter.py | 5 +++--
orangecontrib/text/tests/test_preprocess.py | 20 +++++++++++++++++---
2 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/orangecontrib/text/preprocess/filter.py b/orangecontrib/text/preprocess/filter.py
index f79ab09fa..e9e84a1f6 100644
--- a/orangecontrib/text/preprocess/filter.py
+++ b/orangecontrib/text/preprocess/filter.py
@@ -78,7 +78,7 @@ def supported_languages():
except LookupError: # when no NLTK data is available
pass
- return [file.capitalize() for file in stopwords_listdir]
+ return sorted(file.capitalize() for file in stopwords_listdir)
@wait_nltk_data
def __init__(self, language='English', word_list=None):
@@ -96,7 +96,8 @@ def language(self, value):
if not self._language:
self.stopwords = []
else:
- self.stopwords = set(stopwords.words(self.language.lower()))
+ self.stopwords = set(
+ x.strip() for x in stopwords.words(self.language.lower()))
def __str__(self):
config = ''
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index e2cc7f943..0dc51a3ea 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -276,10 +276,24 @@ def check(self, token):
self.assertEqual(df([['a', '1']]), [['a']])
def test_stopwords(self):
- filter = preprocess.StopwordsFilter('english')
+ f = preprocess.StopwordsFilter('english')
- self.assertFalse(filter.check('a'))
- self.assertTrue(filter.check('filter'))
+ self.assertFalse(f.check('a'))
+ self.assertTrue(f.check('filter'))
+
+ self.assertListEqual(
+ ["snake", "house"],
+ f(["a", "snake", "is", "in", "a", "house"]))
+
+ def test_stopwords_slovene(self):
+ f = preprocess.StopwordsFilter('slovene')
+
+ self.assertFalse(f.check('in'))
+ self.assertTrue(f.check('abeceda'))
+
+ self.assertListEqual(
+ ["kača", "hiši"],
+ f(["kača", "je", "v", "hiši", "in"]))
def test_lexicon(self):
filter = preprocess.LexiconFilter(['filter'])