Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Concordance: tokenization and small fixes #232

Merged
merged 4 commits into from
May 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 29 additions & 19 deletions orangecontrib/text/widgets/owconcordance.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
from typing import Optional

from itertools import chain
from AnyQt.QtCore import Qt, QAbstractTableModel, QSize, QItemSelectionModel, \
QItemSelection
from AnyQt.QtWidgets import QSizePolicy, QApplication, QTableView, \
QStyledItemDelegate
from AnyQt.QtGui import QColor

from Orange.data import Table
from Orange.widgets import gui
from Orange.widgets.settings import Setting
from Orange.widgets.widget import OWWidget, Msg
from nltk import ConcordanceIndex
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.topics import Topic
from orangecontrib.text.preprocess import Preprocessor, WordPunctTokenizer
from orangecontrib.text.preprocess import WordPunctTokenizer


class HorizontalGridDelegate(QStyledItemDelegate):
Expand All @@ -34,6 +34,10 @@ def select(self, selection, flags):
# which rows have been selected
indexes = selection.indexes() if isinstance(selection, QItemSelection) \
else [selection]
# prevent crashing when deleting the connection
if not indexes:
super().select(selection, flags)
return
# indexes[0].row() == -1 indicates clicking outside of the table
if len(indexes) == 1 and indexes[0].row() == -1:
self.clear()
Expand All @@ -58,6 +62,9 @@ def __init__(self):
QAbstractTableModel.__init__(self)
self.word = None
self.corpus = None
self.tokens = None
self.n_tokens = None
self.n_types = None
self.indices = None
self.word_index = None
self.width = 8
Expand All @@ -72,10 +79,20 @@ def set_word(self, word):
def set_corpus(self, corpus):
self.modelAboutToBeReset.emit()
self.corpus = corpus
self.set_tokens()
self._compute_indices()
self._compute_word_index()
self.modelReset.emit()

def set_tokens(self):
if self.corpus is None:
self.tokens = None
return
tokenizer = WordPunctTokenizer()
self.tokens = tokenizer(self.corpus.documents)
self.n_tokens = sum(map(len, self.tokens))
self.n_types = len(set(chain.from_iterable(self.tokens)))

def set_width(self, width):
self.modelAboutToBeReset.emit()
self.width = width
Expand All @@ -85,7 +102,8 @@ def flags(self, _):
return Qt.ItemIsEnabled | Qt.ItemIsSelectable

def rowCount(self, parent=None, *args, **kwargs):
return 0 if parent.isValid() or self.word_index is None \
return 0 if parent is None or parent.isValid() or \
self.word_index is None \
else len(self.word_index)

def columnCount(self, parent=None, *args, **kwargs):
Expand All @@ -96,7 +114,7 @@ def data(self, index, role=Qt.DisplayRole):
doc, index = self.word_index[row]

if role == Qt.DisplayRole:
tokens = self.corpus.tokens
tokens = self.tokens
if col == 0:
return ' '.join(tokens[doc][max(index - self.width, 0):index])
if col == 1:
Expand All @@ -117,11 +135,8 @@ def _compute_indices(self): # type: () -> Optional[None, list]
if self.corpus is None:
self.indices = None
return
if self.corpus and not self.corpus.has_tokens():
preprocessor = Preprocessor(tokenizer=WordPunctTokenizer())
preprocessor(self.corpus)
self.indices = [ConcordanceIndex(doc, key=lambda x: x.lower())
for doc in self.corpus.tokens]
for doc in self.tokens]

def _compute_word_index(self):
if self.indices is None or self.word is None:
Expand All @@ -146,10 +161,10 @@ class OWConcordance(OWWidget):
priority = 30000

inputs = [
('Corpus', Table, 'set_corpus'),
('Corpus', Corpus, 'set_corpus'),
('Query Word', Topic, 'set_word_from_input'),
]
outputs = [('Selected Documents', Table, )]
outputs = [('Selected Documents', Corpus, )]

autocommit = Setting(True)
context_width = Setting(5)
Expand All @@ -164,14 +179,12 @@ def __init__(self):
super().__init__()

self.corpus = None # Corpus
self.n_documents = '' # Info on docs
self.n_matching = '' # Info on docs matching the word
self.n_tokens = '' # Info on tokens
self.n_types = '' # Info on types (unique tokens)

# Info attributes
info_box = gui.widgetBox(self.controlArea, 'Info')
gui.label(info_box, self, 'Documents: %(n_documents)s')
gui.label(info_box, self, 'Tokens: %(n_tokens)s')
gui.label(info_box, self, 'Types: %(n_types)s')
gui.label(info_box, self, 'Matching: %(n_matching)s')
Expand Down Expand Up @@ -243,6 +256,7 @@ def set_word_from_input(self, topic):
def set_word(self):
self.model.set_word(self.word)
self.update_widget()
self.commit()

def resize_columns(self):
col_width = (self.conc_view.width() -
Expand All @@ -260,16 +274,12 @@ def update_widget(self):
self.conc_view.resizeRowsToContents()

if self.corpus is not None:
self.n_documents = len(self.corpus)
self.n_matching = '{}/{}'.format(
self.model.matching_docs() if self.word else 0,
self.n_documents)
self.n_tokens = sum(map(len, self.corpus.tokens)) \
if self.corpus.has_tokens() else 'n/a'
self.n_types = len(self.corpus.dictionary) \
if self.corpus.has_tokens() else 'n/a'
len(self.corpus))
self.n_tokens = self.model.n_tokens
self.n_types = self.model.n_types
else:
self.n_documents = ''
self.n_matching = ''
self.n_tokens = ''
self.n_types = ''
Expand Down
16 changes: 16 additions & 0 deletions orangecontrib/text/widgets/tests/test_concordances.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,22 @@ def test_selection(self):
selection_model.select(ind_10, selection_model.Select)
self.assertIsNone(self.get_output("Selected Documents"))

def test_signal_to_none(self):
self.send_signal("Corpus", self.corpus)
widget = self.widget
widget.controls.word.setText("of")
view = self.widget.conc_view
nrows = widget.model.rowCount()
view.selectRow(1)

self.send_signal("Corpus", None)
self.assertIsNone(self.get_output("Selected Documents"))
self.assertEqual(widget.model.rowCount(), 0)
self.assertEqual(widget.controls.word.text(), "of")

self.send_signal("Corpus", self.corpus)
self.assertEqual(widget.model.rowCount(), nrows)


if __name__ == "__main__":
unittest.main()