From 2060c7bd198fd51de0db79e4316d49d48acea1ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Thu, 8 Oct 2020 15:17:42 +0200
Subject: [PATCH] Corpus - from_table: keep text feature when renamed
---
orangecontrib/text/corpus.py | 18 ++++++++++++++++--
orangecontrib/text/tests/test_corpus.py | 12 ++++++++++++
2 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 83cbf58c2..8c0e5efcc 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -591,6 +591,7 @@ def retain_preprocessing(orig, new, key=...):
if isinstance(orig, Corpus):
if isinstance(key, tuple): # get row selection
key = key[0]
+
if orig._tokens is not None: # retain preprocessing
if isinstance(key, Integral):
new._tokens = np.array([orig._tokens[key]])
@@ -606,9 +607,22 @@ def retain_preprocessing(orig, new, key=...):
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary
+
+ if isinstance(new, Corpus):
+ # _find_identical_feature returns None when the feature is not found;
+ # filter those Nones out of the list
+ new.text_features = list(filter(None, [
+ new._find_identical_feature(tf)
+ for tf in orig.text_features
+ ]))
+ else:
+ new.text_features = [
+ tf
+ for tf in orig.text_features
+ if tf in set(new.domain.metas)
+ ]
+
new._titles = orig._titles[key]
- new_domain_metas = set(new.domain.metas)
- new.text_features = [tf for tf in orig.text_features if tf in new_domain_metas]
new.ngram_range = orig.ngram_range
new.attributes = orig.attributes
new.used_preprocessor = orig.used_preprocessor
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index f2a5db090..44cc432d6 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -180,6 +180,18 @@ def test_from_table(self):
np.testing.assert_equal(t.metas, c.metas)
self.assertEqual(c.text_features, [t.domain.metas[0]])
+ def test_from_table_renamed(self):
+ c1 = Corpus.from_file('book-excerpts')
+ new_domain = Domain(c1.domain.attributes, metas=[c1.domain.metas[0].renamed("text1")])
+
+ # when text feature renamed
+ c2 = Corpus.from_table(new_domain, c1)
+ self.assertIsInstance(c2, Corpus)
+ self.assertEqual(len(c1), len(c2))
+ np.testing.assert_equal(c1.metas, c2.metas)
+ self.assertEqual(1, len(c2.text_features))
+ self.assertEqual("text1", c2.text_features[0].name)
+
def test_infer_text_features(self):
c = Corpus.from_file('friends-transcripts')
tf = c.text_features