Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Corpus Viewer: Output selected data and memorize selection #562

Merged
merged 1 commit into from
Oct 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 89 additions & 48 deletions orangecontrib/text/widgets/owcorpusviewer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
import sre_constants
from itertools import chain
from typing import Set

from AnyQt.QtCore import (
Qt, QUrl, QItemSelection, QItemSelectionModel, QItemSelectionRange
Expand Down Expand Up @@ -38,10 +39,9 @@ class Outputs:
search_indices = ContextSetting([], exclude_metas=False) # features included in search
display_indices = ContextSetting([], exclude_metas=False) # features for display
display_features = ContextSetting([], exclude_metas=False)
selected_documents = ContextSetting([])
regexp_filter = ContextSetting("")

selection = [0] # TODO: DataHashContextHandler

show_tokens = Setting(False)
autocommit = Setting(True)

Expand All @@ -54,7 +54,6 @@ def __init__(self):

self.corpus = None # Corpus
self.corpus_docs = None # Documents generated from Corpus
self.output_mask = [] # Output corpus indices
self.doc_webview = None # WebView for showing content
self.search_features = [] # two copies are needed since Display allows drag & drop
self.display_list_indices = [0]
Expand Down Expand Up @@ -101,7 +100,6 @@ def __init__(self):
orientation=Qt.Horizontal,
childrenCollapsible=False,
)

# Document list
self.doc_list = QTableView()
self.doc_list.setSelectionBehavior(QTableView.SelectRows)
Expand All @@ -113,8 +111,9 @@ def __init__(self):

self.doc_list_model = QStandardItemModel(self)
self.doc_list.setModel(self.doc_list_model)
self.doc_list.selectionModel().selectionChanged.connect(self.show_docs)

self.doc_list.selectionModel().selectionChanged.connect(
self.selection_changed
)
# Document contents
self.doc_webview = gui.WebviewWidget(self.splitter, debug=False)

Expand All @@ -141,7 +140,7 @@ def set_data(self, corpus=None):
self.display_features = list(filter_visible(chain(domain.variables, domain.metas)))
self.search_indices = list(range(len(self.search_features)))
self.display_indices = list(range(len(self.display_features)))
self.selection = [0]
self.selected_documents = [corpus.titles[0]]
self.openContext(self.corpus)
self.display_list_indices = self.display_indices
self.regenerate_docs()
Expand All @@ -155,7 +154,6 @@ def reset_widget(self):
# Corpus
self.corpus = None
self.corpus_docs = None
self.output_mask = []
self.display_features = []
# Widgets
self.search_listbox.clear()
Expand Down Expand Up @@ -185,7 +183,6 @@ def list_docs(self):
def is_match(x):
return not bool(search_keyword) or reg.search(x)

self.output_mask.clear()
self.doc_list_model.clear()

for i, (doc, title, content) in enumerate(zip(self.corpus, self.corpus.titles,
Expand All @@ -195,28 +192,56 @@ def is_match(x):
item.setData(str(title), Qt.DisplayRole)
item.setData(doc, Qt.UserRole)
self.doc_list_model.appendRow(item)
self.output_mask.append(i)

def reset_selection(self):
    """
    Select the first row of the document list, or blank the document
    view when the list model has no rows.
    """
    if self.doc_list_model.rowCount() <= 0:
        # nothing to select; clear the content pane instead
        self.doc_webview.setHtml('')
        return
    self.doc_list.selectRow(0)

def set_selection(self):
def get_selected_documents_from_view(self) -> Set[str]:
    """
    Collect the display-role text of every row selected in the
    document list.

    Returns
    -------
    Set with names of the documents currently selected in the QTableView
    """
    selected_rows = self.doc_list.selectionModel().selectedRows()
    names = set()
    for index in selected_rows:
        names.add(index.data(Qt.DisplayRole))
    return names

def set_selection(self) -> None:
"""
Select documents in selected_documents attribute in the view
"""
view = self.doc_list
if len(self.selection):
selection = QItemSelection()

for row in self.selection:
selection.append(
QItemSelectionRange(
view.model().index(row, 0),
view.model().index(row, 0)
)
)
view.selectionModel().select(
selection, QItemSelectionModel.ClearAndSelect)
model = view.model()

previously_selected = self.selected_documents.copy()
selection = QItemSelection()
for row in range(model.rowCount()):
document = model.data(model.index(row, 0), Qt.DisplayRole)
if document in self.selected_documents:
selection.append(QItemSelectionRange(
view.model().index(row, 0),
view.model().index(row, 0)
))
view.selectionModel().select(
selection, QItemSelectionModel.ClearAndSelect
)
if len(selection) == 0:
# in cases when selection is empty qt's selection_changed is not
# called and so we need to manually trigger show_docs
self.show_docs()
# select() emits the selection-changed signal, which calls
# selection_changed; while filtering, this removes the documents that
# are currently filtered out from self.selected_documents. We want to
# keep them so that they are selected again after the user removes the
# filter.
self.selected_documents = previously_selected

def selection_changed(self) -> None:
    """
    Slot connected to the document list's selectionChanged signal.

    Stores the documents selected in the view, refreshes the document
    display and sends the outputs.
    """
    chosen = self.get_selected_documents_from_view()
    self.selected_documents = chosen
    self.show_docs()
    self.commit()

def show_docs(self):
""" Show the selected documents in the right area """
Expand Down Expand Up @@ -308,9 +333,6 @@ def show_docs(self):
if i in self.search_indices]

html = '<table>'
selection = [i.row() for i in self.doc_list.selectionModel().selectedRows()]
if selection != []:
self.selection = selection
for doc_count, index in enumerate(self.doc_list.selectionModel().selectedRows()):
if doc_count > 0: # add split
html += '<tr class="line separator"><td/><td/></tr>' \
Expand Down Expand Up @@ -376,7 +398,7 @@ def regenerate_docs(self):
def refresh_search(self):
if self.corpus is not None:
self.list_docs()
self.reset_selection()
self.set_selection()
self.update_info()
self.commit()

Expand All @@ -399,35 +421,54 @@ def update_info(self):
self.ngram_range = ''

def commit(self):
if self.corpus is not None:
matched = self.corpus[self.output_mask]
output_mask = set(self.output_mask)
unmatched_mask = [i for i in range(len(self.corpus)) if i not in output_mask]
unmatched = self.corpus[unmatched_mask]
self.Outputs.matching_docs.send(matched)
self.Outputs.other_docs.send(unmatched)
else:
self.Outputs.matching_docs.send(None)
self.Outputs.other_docs.send(None)
matched = unmatched = None
corpus = self.corpus
if corpus is not None:
# it returns a set of selected documents which are in view
selected_docs = self.get_selected_documents_from_view()
titles = corpus.titles
matched_mask = [
i for i, t in enumerate(titles) if t in selected_docs
]
unmatched_mask = [
i for i, t in enumerate(titles) if t not in selected_docs
]

matched = corpus[matched_mask] if len(matched_mask) else None
unmatched = corpus[unmatched_mask] if len(unmatched_mask) else None
self.Outputs.matching_docs.send(matched)
self.Outputs.other_docs.send(unmatched)

def send_report(self):
    """Report the current filter query and the matching-documents value."""
    query_row = ("Query", self.regexp_filter)
    matching_row = ("Matching documents", self.n_matching)
    self.report_items((query_row, matching_row))

def showEvent(self, event):
    """Forward the show event to the base class, then adjust the splitter."""
    super().showEvent(event)
    # sizes are only adjusted here, i.e. each time the widget is shown
    self.update_splitter()

def update_splitter(self):
    """
    Make sure the document list on the left never takes more than 1/3
    of the splitter.

    It is only applied on showEvent; if the user changes the sizes
    later, they stay as they are.

    The sizes handed to the splitter are integers: QSplitter.setSizes
    takes a list of ints, and the float products used previously are
    rejected by PyQt5 on Python 3.10+.
    """
    w1, w2 = self.splitter.sizes()
    ws = w1 + w2
    # integer form of ``w2 < 2/3 * ws``; avoids float rounding at the edge
    if 3 * w2 < 2 * ws:
        left = ws // 3
        # give the remainder to the right pane so the total stays ``ws``
        self.splitter.setSizes([left, ws - left])


if __name__ == '__main__':
from orangecontrib.text.preprocess import BASE_TOKENIZER
from orangecontrib.text.tag.pos import AveragedPerceptronTagger
from orangewidget.utils.widgetpreview import WidgetPreview

app = QApplication([])
widget = OWCorpusViewer()
widget.show()
corpus = Corpus.from_file('book-excerpts')
corpus = corpus[:3]
tagger = AveragedPerceptronTagger()
tagged_corpus = tagger(BASE_TOKENIZER(corpus))
tagged_corpus.ngram_range = (1, 2)
widget.set_data(tagged_corpus)
app.exec()
WidgetPreview(OWCorpusViewer).run(tagged_corpus)
65 changes: 63 additions & 2 deletions orangecontrib/text/widgets/tests/test_owcorpusviewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,37 @@ def setUp(self):
def test_data(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(self.widget.n_documents, 9)
self.widget.doc_list.selectAll()
out_corpus = self.get_output(self.widget.Outputs.matching_docs)
self.assertEqual(out_corpus, self.corpus)

def test_search(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.widget.regexp_filter = "Human"
self.process_events()
out_corpus = self.get_output(self.widget.Outputs.matching_docs)
self.assertEqual(len(out_corpus), 1)

# the first document is selected; when filtering with a word that does
# not appear in the selected document, out_corpus is None
self.widget.regexp_filter = "graph"
self.process_events()
out_corpus = self.get_output(self.widget.Outputs.matching_docs)
self.assertEqual(len(out_corpus), 4)
self.assertIsNone(out_corpus)

def test_highlighting(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
# no intersection between filter and selection
self.widget.regexp_filter = "graph"
self.process_events()
self.widget.doc_webview.html()
spy = QSignalSpy(self.widget.doc_webview.loadFinished)
spy.wait()
html = self.widget.doc_webview.html()
self.assertNotIn('<mark data-markjs="true">', html)

# all documents are selected
self.widget.regexp_filter = "graph"
self.widget.doc_list.selectAll()
spy = QSignalSpy(self.widget.doc_webview.loadFinished)
spy.wait()
html = self.widget.doc_webview.html()
Expand All @@ -56,6 +72,51 @@ def test_highlighting_non_latin(self):
html = self.widget.doc_webview.html()
self.assertIn('<mark data-markjs="true">', html)

def test_output(self):
    """ Output is the intersection between selection and filter """
    self.send_signal(self.widget.Inputs.corpus, self.corpus)
    # apply a filter without touching the selection: nothing is expected
    # on the matching output and all 9 documents on the other output
    self.widget.regexp_filter = "graph"
    self.process_events()
    self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs))
    self.assertEqual(
        9, len(self.get_output(self.widget.Outputs.other_docs))
    )

    self.widget.doc_list.selectAll()  # selects current documents in list
    self.assertEqual(
        4, len(self.get_output(self.widget.Outputs.matching_docs))
    )
    self.assertEqual(
        5, len(self.get_output(self.widget.Outputs.other_docs))
    )

    self.widget.regexp_filter = "human"
    self.process_events()
    # empty because none of the matching documents is selected
    self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs))
    self.assertEqual(
        9, len(self.get_output(self.widget.Outputs.other_docs))
    )

    # select everything listed under the new filter
    self.widget.doc_list.selectAll()
    self.assertEqual(
        5, len(self.get_output(self.widget.Outputs.matching_docs))
    )
    self.assertEqual(
        4, len(self.get_output(self.widget.Outputs.other_docs))
    )

    # removing the input must clear both outputs
    self.send_signal(self.widget.Inputs.corpus, None)
    self.assertIsNone(self.get_output(self.widget.Outputs.matching_docs))
    self.assertIsNone(self.get_output(self.widget.Outputs.other_docs))

def test_report(self):
    """ send_report runs both before and after a filter is set """
    # no corpus is sent in this test and nothing is asserted; the test
    # only checks that the calls complete
    self.widget.send_report()

    self.widget.regexp_filter = "human"
    self.process_events()
    self.widget.send_report()


if __name__ == "__main__":
unittest.main()