Skip to content

Commit

Permalink
Merge pull request #627 from PrimozGodec/fix-corpus
Browse files Browse the repository at this point in the history
[FIX]Corpus fix from_numpy and from_list; modify widget to work with corpuses without text_features
  • Loading branch information
ajdapretnar authored Mar 11, 2021
2 parents 165df5f + a1e87a1 commit 4c4069b
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 8 deletions.
14 changes: 10 additions & 4 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,14 +554,20 @@ def from_table(cls, domain, source, row_indices=...):

@classmethod
def from_numpy(cls, *args, **kwargs):
c = super().from_numpy(*args, **kwargs)
c._set_unique_titles()
t = super().from_numpy(*args, **kwargs)
# t is a Corpus, but its constructor was not called because from_numpy
# only invokes __new__; call the constructor here to set default values
# for attributes such as _titles, _tokens, preprocessors, text_features
c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)
return c

@classmethod
def from_list(cls, domain, rows, weights=None):
c = super().from_list(domain, rows, weights)
c._set_unique_titles()
t = super().from_list(domain, rows, weights)
# t is a Corpus, but its constructor was not called because from_list
# only invokes __new__; call the constructor here to set default values
# for attributes such as _titles, _tokens, preprocessors, text_features
c = Corpus(t.domain, t.X, t.Y, t.metas, t.W, ids=t.ids)
return c

@classmethod
Expand Down
26 changes: 26 additions & 0 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,32 @@ def test_corpus_from_file_with_tab(self):
c2 = Corpus.from_file('book-excerpts.tab')
self.assertEqual(c, c2)

def test_corpus_from_numpy(self):
    """
    Corpus.from_numpy must return a corpus with default attribute values:
    generated document titles, `title` as the only text feature, no tokens
    and an empty list of used preprocessors.
    """
    meta_vars = [StringVariable("title"), StringVariable("a")]
    meta_values = np.array([["title1", "a"], ["title2", "b"]])
    corpus = Corpus.from_numpy(
        Domain([], metas=meta_vars),
        np.empty((2, 0)),
        metas=meta_values,
    )

    self.assertEqual(2, len(corpus))
    assert_array_equal(["Document 1", "Document 2"], corpus.titles)
    self.assertListEqual([StringVariable("title")], corpus.text_features)
    self.assertIsNone(corpus._tokens)
    self.assertListEqual([], corpus.used_preprocessor.preprocessors)

def test_corpus_from_list(self):
    """
    Corpus.from_list must return a corpus with default attribute values:
    generated document titles, `title` as the only text feature, no tokens
    and an empty list of used preprocessors.
    """
    meta_vars = [StringVariable("title"), StringVariable("a")]
    rows = [["title1", "a"], ["title2", "b"]]
    corpus = Corpus.from_list(Domain([], metas=meta_vars), rows)

    self.assertEqual(2, len(corpus))
    assert_array_equal(["Document 1", "Document 2"], corpus.titles)
    self.assertListEqual([StringVariable("title")], corpus.text_features)
    self.assertIsNone(corpus._tokens)
    self.assertListEqual([], corpus.used_preprocessor.preprocessors)

def test_corpus_from_file_missing(self):
with self.assertRaises(FileNotFoundError):
Corpus.from_file('missing_file')
Expand Down
8 changes: 5 additions & 3 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def __init__(self):

# Used Text Features
fbox = gui.widgetBox(self.controlArea, orientation=0)
ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
ubox = gui.widgetBox(fbox, "Used text features")
self.used_attrs_model = VariableListModel(enable_dnd=True)
self.used_attrs_view = VariablesListItemView()
self.used_attrs_view.setModel(self.used_attrs_model)
Expand All @@ -94,7 +94,7 @@ def __init__(self):
aa.rowsRemoved.connect(self.update_feature_selection)

# Ignored Text Features
ibox = gui.widgetBox(fbox, "Ignored text features", addSpace=False)
ibox = gui.widgetBox(fbox, "Ignored text features")
self.unused_attrs_model = VariableListModel(enable_dnd=True)
self.unused_attrs_view = VariablesListItemView()
self.unused_attrs_view.setModel(self.unused_attrs_model)
Expand Down Expand Up @@ -146,6 +146,7 @@ def _load_corpus(path: str, data: Table, state: TaskState) -> Corpus:
def open_file(self, path=None, data=None):
self.closeContext()
self.Error.clear()
self.cancel()
self.unused_attrs_model[:] = []
self.used_attrs_model[:] = []
self.start(self._load_corpus, path, data)
Expand All @@ -158,7 +159,8 @@ def on_done(self, corpus: Corpus) -> None:
self.update_output_info()
self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
if not self.corpus.text_features:
all_str_features = [f for f in self.corpus.domain.metas if f.is_string]
if not all_str_features:
self.Error.corpus_without_text_features()
self.Outputs.corpus.send(None)
return
Expand Down
36 changes: 35 additions & 1 deletion orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

class TestOWCorpus(WidgetTest):
def setUp(self):
self.widget = self.create_widget(OWCorpus)
self.widget: OWCorpus = self.create_widget(OWCorpus)

def check_output(self, sel_title):
"""
Expand Down Expand Up @@ -286,6 +286,40 @@ def test_keep_selected_variables(self):
self.wait_until_finished()
self.assertListEqual(list(prew_selected), self.widget.used_attrs)

def test_no_text_feature(self):
    """
    Send a corpus whose text_features list is empty. The widget should not
    show the error, but should have all features unused.
    """
    # The widget has already loaded book-excerpts from file and stored its
    # context settings. Reset the settings to their defaults first;
    # otherwise the context moves the Text variable to the used attributes.
    widget = self.widget
    widget.settingsHandler.reset_to_original(widget)

    corpus = Corpus.from_file("book-excerpts")
    corpus.text_features = []
    self.send_signal(widget.Inputs.data, corpus)
    self.wait_until_finished()

    self.assertFalse(widget.Error.corpus_without_text_features.is_shown())
    self.assertEqual(0, len(list(widget.used_attrs_model)))
    self.assertListEqual(
        [corpus.domain["Text"]],
        list(widget.unused_attrs_model),
    )

def test_corpus_without_text_features(self):
    """
    The corpus_without_text_features error must be shown for data that has
    no text features.
    """
    iris = Table("iris")
    self.send_signal(self.widget.Inputs.data, iris)
    self.wait_until_finished()

    error = self.widget.Error.corpus_without_text_features
    self.assertTrue(error.is_shown())


if __name__ == "__main__":
unittest.main()

0 comments on commit 4c4069b

Please sign in to comment.