diff --git a/orangecontrib/text/widgets/owimportdocuments.py b/orangecontrib/text/widgets/owimportdocuments.py index 1df27390d..a6e8fee8c 100644 --- a/orangecontrib/text/widgets/owimportdocuments.py +++ b/orangecontrib/text/widgets/owimportdocuments.py @@ -13,6 +13,7 @@ from types import SimpleNamespace as namespace from concurrent.futures._base import TimeoutError +from typing import List, Optional from AnyQt.QtCore import Qt, QEvent, QFileInfo, QThread from AnyQt.QtCore import pyqtSlot as Slot @@ -24,6 +25,7 @@ QVBoxLayout, QLabel ) +from Orange.data import Table, Domain, StringVariable from Orange.widgets import widget, gui, settings from Orange.widgets.utils.filedialogs import RecentPath from Orange.widgets.utils.concurrent import ( @@ -40,6 +42,13 @@ from Orange.canvas.preview.previewbrowser import TextLabel +# domain for skipped documents output +SKIPPED_DOMAIN = Domain([], metas=[ + StringVariable("name"), + StringVariable("path") +]) + + def prettifypath(path): home = os.path.expanduser("~/") if path.startswith(home): # case sensitivity! 
@@ -79,23 +88,21 @@ class OWImportDocuments(widget.OWWidget): class Outputs: data = Output("Corpus", Corpus) + skipped_documents = Output("Skipped documents", Table) #: list of recent paths - recent_paths = settings.Setting([]) # type: List[RecentPath] - currentPath = settings.Setting(None) + recent_paths: List[RecentPath] = settings.Setting([]) + currentPath: Optional[str] = settings.Setting(None) want_main_area = False resizing_enabled = False Modality = Qt.ApplicationModal - MaxRecentItems = 20 - class Warning(widget.OWWidget.Warning): read_error = widget.Msg("{} couldn't be read.") - def __init__(self): super().__init__() #: widget's runtime state @@ -103,7 +110,7 @@ def __init__(self): self.corpus = None self.n_text_categories = 0 self.n_text_data = 0 - self.n_skipped = 0 + self.skipped_documents = [] self.__invalidated = False self.__pendingTask = None @@ -169,7 +176,8 @@ def __init__(self): minimum=0, maximum=100 ) self.cancel_button = QPushButton( - "Cancel", icon=self.style().standardIcon(QStyle.SP_DialogCancelButton), + "Cancel", + icon=self.style().standardIcon(QStyle.SP_DialogCancelButton), ) self.cancel_button.clicked.connect(self.cancel) @@ -286,7 +294,7 @@ def __updateInfo(self): elif self.__state == State.Done: nvalid = self.n_text_data ncategories = self.n_text_categories - n_skipped = self.n_skipped + n_skipped = len(self.skipped_documents) if ncategories < 2: text = "{} document{}".format(nvalid, "s" if nvalid != 1 else "") else: @@ -536,10 +544,13 @@ def __onRunFinished(self): if corpus.domain.class_var else 0 self.corpus = corpus - self.n_skipped = len(errors) + self.corpus.name = "Documents" + self.skipped_documents = errors if len(errors): - self.Warning.read_error("Some files" if len(errors) > 1 else "One file") + self.Warning.read_error( + "Some files" if len(errors) > 1 else "One file" + ) self.__setRuntimeState(state) self.commit() @@ -561,14 +572,23 @@ def __onReportProgress(self, arg): assert QThread.currentThread() is self.thread() if 
self.__state == State.Processing: self.pathlabel.setText(prettifypath(arg.lastpath)) - self.progress_widget.setValue(arg.progress) - self.progress_widget.setValue(100 * arg.progress) + self.progress_widget.setValue(int(100 * arg.progress)) def commit(self): """ Create and commit a Corpus from the collected text meta data. """ self.Outputs.data.send(self.corpus) + # send None when no document was skipped; name only a real table + skipped_table = None + if self.skipped_documents: + skipped_table = Table.from_list( + SKIPPED_DOMAIN, + [[x, os.path.join(self.currentPath, x)] + for x in self.skipped_documents] + ) + skipped_table.name = "Skipped documents" + self.Outputs.skipped_documents.send(skipped_table) def onDeleteWidget(self): self.cancel() @@ -615,8 +635,8 @@ def send_report(self): ('Number of documents', self.n_text_data)] if self.n_text_categories: items += [('Categories', self.n_text_categories)] - if self.n_skipped: - items += [('Number of skipped', self.n_skipped)] + if self.skipped_documents: + items += [('Number of skipped', len(self.skipped_documents))] self.report_items(items, ) @@ -646,5 +666,6 @@ def main(argv=sys.argv): w.onDeleteWidget() return 0 + if __name__ == "__main__": sys.exit(main()) diff --git a/orangecontrib/text/widgets/tests/data/sample_docx.docx b/orangecontrib/text/widgets/tests/data/sample_docx.docx new file mode 100644 index 000000000..a5b6a3aef Binary files /dev/null and b/orangecontrib/text/widgets/tests/data/sample_docx.docx differ diff --git a/orangecontrib/text/widgets/tests/data/sample_odt.odt b/orangecontrib/text/widgets/tests/data/sample_odt.odt new file mode 100644 index 000000000..6ae7f0b51 Binary files /dev/null and b/orangecontrib/text/widgets/tests/data/sample_odt.odt differ diff --git a/orangecontrib/text/widgets/tests/data/sample_pdf.pdf b/orangecontrib/text/widgets/tests/data/sample_pdf.pdf new file mode 100644 index 000000000..47d89e492 Binary files /dev/null and b/orangecontrib/text/widgets/tests/data/sample_pdf.pdf differ diff --git 
a/orangecontrib/text/widgets/tests/data/sample_pdf_corrupted.pdf b/orangecontrib/text/widgets/tests/data/sample_pdf_corrupted.pdf new file mode 100644 index 000000000..1dfbf06e6 Binary files /dev/null and b/orangecontrib/text/widgets/tests/data/sample_pdf_corrupted.pdf differ diff --git a/orangecontrib/text/widgets/tests/data/sample_txt.txt b/orangecontrib/text/widgets/tests/data/sample_txt.txt new file mode 100644 index 000000000..2e500dafc --- /dev/null +++ b/orangecontrib/text/widgets/tests/data/sample_txt.txt @@ -0,0 +1 @@ +This is a test txt file \ No newline at end of file diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py new file mode 100644 index 000000000..3966772d3 --- /dev/null +++ b/orangecontrib/text/widgets/tests/test_owimportdocuments.py @@ -0,0 +1,75 @@ +import os +import unittest + +from Orange.widgets.tests.base import WidgetTest +from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments + + +class TestOWImportDocuments(WidgetTest): + def setUp(self) -> None: + self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) + path = os.path.join(os.path.dirname(__file__), "data") + self.widget.setCurrentPath(path) + self.widget.reload() + self.wait_until_finished() + + def test_current_path(self): + path = os.path.join(os.path.dirname(__file__), "data") + self.assertEqual(path, self.widget.currentPath) + + def test_output(self): + output = self.get_output(self.widget.Outputs.data) + self.assertEqual(4, len(output)) + self.assertEqual(3, len(output.domain.metas)) + names = output.get_column_view("name")[0] + self.assertListEqual( + ["sample_docx", "sample_odt", "sample_pdf", "sample_txt"], + sorted(names.tolist()), + ) + texts = output.get_column_view("content")[0] + self.assertListEqual( + [ + f"This is a test {x} file" + for x in ["docx", "odt", "pdf", "txt"] + ], + sorted([x.strip() for x in texts.tolist()]), + ) + 
self.assertEqual("content", output.text_features[0].name) + + skipped_output = self.get_output(self.widget.Outputs.skipped_documents) + self.assertEqual(1, len(skipped_output)) + self.assertEqual(2, len(skipped_output.domain.metas)) + names = skipped_output.get_column_view("name")[0] + self.assertListEqual( + ["sample_pdf_corrupted.pdf"], + sorted(names.tolist()), + ) + + def test_could_not_be_read_warning(self): + """ + sample_pdf_corrupted.pdf is a corrupted file and cannot be loaded + correctly - the widget must show the read-error warning + """ + self.assertTrue(self.widget.Warning.read_error.is_shown()) + self.assertEqual( + "One file couldn't be read.", + str(self.widget.Warning.read_error), + ) + + def test_send_report(self): + self.widget.send_report() + + def test_info_box(self): + self.assertEqual( + "4 documents, 1 skipped", self.widget.info_area.text() + ) + + # empty widget + self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) + self.assertEqual( + "No document set selected", self.widget.info_area.text() + ) + + +if __name__ == "__main__": + unittest.main()