Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Use ISO language setting in widgets #1034

Merged
merged 6 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 41 additions & 10 deletions orangecontrib/text/language.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter
from typing import Optional
from typing import Optional, Sequence

from AnyQt.QtCore import Qt
from langdetect import DetectorFactory, detect
Expand Down Expand Up @@ -41,7 +41,7 @@
"ga": "Irish",
"gl": "Galician",
"got": "Gothic",
"grc": "Ancient greek",
"grc": "Ancient Greek",
"gu": "Gujarati",
"he": "Hebrew",
"hi": "Hindi",
Expand Down Expand Up @@ -104,21 +104,38 @@
None: None,
}
LANG2ISO = {lang: code for code, lang in ISO2LANG.items()}
DEFAULT_LANGUAGE = "English"
DEFAULT_LANGUAGE = "en"


class LanguageModel(PyListModel):
"""Model for language selection dropdowns in the widgets"""

def __init__(self):
languages = sorted(filter(None, ISO2LANG.values()))
super().__init__(iterable=[None] + languages)
def __init__(
self, include_none: bool = False, languages: Optional[Sequence[str]] = None
):
"""
Parameters
----------
include_none
Indicates if "(no language)" value is available on the top of the list
languages
List of languages available in the dropdown.
If None all add-on supported languages are available.
"""
if languages is None:
# if languages not provided take all available languages
languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get)
if include_none:
languages = [None] + languages
super().__init__(iterable=languages)

def data(self, index, role=Qt.DisplayRole):
if index.row() == 0 and role == Qt.DisplayRole:
return "(no language)"
else:
return super().data(index, role)
if role == Qt.DisplayRole:
value = super().data(index, role)
if value is None:
return "(no language)"
return ISO2LANG[value]
return super().data(index, role)


DetectorFactory.seed = 0
Expand Down Expand Up @@ -167,3 +184,17 @@ def infer_language_from_variable(variable: DiscreteVariable) -> Optional[str]:
Language ISO code if all documents have the same language, None otherwise
"""
return variable.values[0] if len(variable.values) == 1 else None


# Maps language names used by earlier versions of the add-on to their
# current names.
LANGUAGE_MIGRATIONS = {"Ancient greek": "Ancient Greek"}


def migrate_language_name(language: str) -> str:
    """
    Map a language name to the name currently used by the add-on.

    Some language names were changed after they were first introduced.
    A name listed in LANGUAGE_MIGRATIONS is replaced with its new name;
    any other name is returned unchanged.
    """
    if language in LANGUAGE_MIGRATIONS:
        return LANGUAGE_MIGRATIONS[language]
    return language
21 changes: 15 additions & 6 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@

from orangecontrib.text.corpus import Corpus, get_sample_corpora_dir
from orangecontrib.text.language import (
LANG2ISO,
detect_language,
ISO2LANG,
LanguageModel,
LANG2ISO,
migrate_language_name,
)
from orangecontrib.text.widgets.utils import widgets, QSize

Expand Down Expand Up @@ -106,6 +106,7 @@ class Outputs:
key=list(FileFormat.readers.values()).index)))

settingsHandler = CorpusContextHandler()
settings_version = 2

recent_files = Setting([
"book-excerpts.tab",
Expand All @@ -116,7 +117,7 @@ class Outputs:
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")
language: str = ContextSetting("English")
language: str = ContextSetting("en")

class Error(OWWidget.Error):
read_file = Msg("Can't read file ({})")
Expand Down Expand Up @@ -163,7 +164,7 @@ def __init__(self):
self,
"language",
label="Language",
model=LanguageModel(),
model=LanguageModel(include_none=True),
sendSelectedValue=True,
**common_settings
)
Expand Down Expand Up @@ -253,7 +254,7 @@ def on_done(self, corpus: Corpus) -> None:
return
# set language on Corpus's language (when corpus with already defined
# language opened) or guess language
self.language = ISO2LANG[corpus.language or detect_language(corpus)]
self.language = corpus.language or detect_language(corpus)
self.openContext(self.corpus)
self.used_attrs_model.extend(self.used_attrs)
self.unused_attrs_model.extend(
Expand Down Expand Up @@ -341,7 +342,7 @@ def remove_duplicates(l):
self.Error.no_text_features_used()

corpus.set_title_variable(self.title_variable)
corpus.attributes["language"] = LANG2ISO[self.language]
corpus.attributes["language"] = self.language
# prevent sending "empty" corpora
dom = corpus.domain
empty = (
Expand Down Expand Up @@ -369,6 +370,14 @@ def describe(features):
('Target', describe(domain.class_vars)),
))

@classmethod
def migrate_context(cls, context, version):
    """
    Upgrade a stored context to the current settings version.

    Contexts older than version 2 keep the language as a language name.
    The name is first mapped to its current spelling and then replaced
    with the matching ISO code; the second element of the stored pair is
    kept as it was.
    """
    if version < 2 and "language" in context.values:
        old_name, type_ = context.values["language"]
        iso_code = LANG2ISO[migrate_language_name(old_name)]
        context.values["language"] = (iso_code, type_)


if __name__ == '__main__':
from orangewidget.utils.widgetpreview import WidgetPreview
Expand Down
16 changes: 13 additions & 3 deletions orangecontrib/text/widgets/owcreatecorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
from orangewidget.settings import Setting

from orangecontrib.text import Corpus
from orangecontrib.text.language import LANG2ISO, DEFAULT_LANGUAGE, LanguageModel
from orangecontrib.text.language import (
DEFAULT_LANGUAGE, LanguageModel, LANG2ISO, migrate_language_name
)


class EditorsVerticalScrollArea(gui.VerticalScrollArea):
Expand Down Expand Up @@ -78,6 +80,7 @@ class Outputs:

want_main_area = False

settings_version = 2
language: str = Setting(DEFAULT_LANGUAGE)
texts: List[Tuple[str, str]] = Setting([("", "")] * 3)
auto_commit: bool = Setting(True)
Expand All @@ -90,7 +93,7 @@ def __init__(self):
self.controlArea,
self,
"language",
model=LanguageModel(),
model=LanguageModel(include_none=True),
box="Language",
orientation=Qt.Horizontal,
callback=self.commit.deferred,
Expand Down Expand Up @@ -157,14 +160,21 @@ def commit(self):
np.empty((len(self.texts), 0)),
metas=np.array(self.texts),
text_features=[doc_var],
language=LANG2ISO[self.language],
language=self.language,
)
corpus.set_title_variable(title_var)
self.Outputs.corpus.send(corpus)

def sizeHint(self) -> QSize:
return QSize(600, 650)

@classmethod
def migrate_settings(cls, settings, version):
    """
    Upgrade stored widget settings to the current settings version.

    Settings without a version, or older than version 2, keep the language
    as a language name. The name is mapped to its current spelling and then
    replaced with the matching ISO code.
    """
    is_outdated = version is None or version < 2
    if is_outdated and "language" in settings:
        current_name = migrate_language_name(settings["language"])
        settings["language"] = LANG2ISO[current_name]


if __name__ == "__main__":
from orangewidget.utils.widgetpreview import WidgetPreview
Expand Down
27 changes: 15 additions & 12 deletions orangecontrib/text/widgets/owdocumentembedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from Orange.widgets.widget import Msg, Output, OWWidget

from orangecontrib.text.corpus import Corpus
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.language import (
ISO2LANG, DEFAULT_LANGUAGE, LanguageModel, LANG2ISO
)
from orangecontrib.text.vectorization.document_embedder import (
AGGREGATORS,
AGGREGATORS_ITEMS,
Expand Down Expand Up @@ -39,10 +41,9 @@ class OWDocumentEmbedding(OWBaseVectorizer):
priority = 300

buttons_area_orientation = Qt.Vertical
settings_version = 2
settings_version = 3

Methods = [SBERT, DocumentEmbedder]
DEFAULT_LANGUAGE = "English"

class Outputs(OWBaseVectorizer.Outputs):
skipped = Output("Skipped documents", Corpus)
Expand Down Expand Up @@ -84,7 +85,7 @@ def create_configuration_layout(self):
ibox,
self,
"language",
items=[ISO2LANG[lg] for lg in LANGUAGES],
model=LanguageModel(languages=LANGUAGES),
label="Language:",
sendSelectedValue=True, # value is actual string not index
orientation=Qt.Horizontal,
Expand All @@ -108,10 +109,10 @@ def create_configuration_layout(self):
def set_data(self, corpus):
# set language from corpus as selected language
if corpus and corpus.language in LANGUAGES:
self.language = ISO2LANG[corpus.language]
self.language = corpus.language
else:
# if Corpus's language not supported use default language
self.language = self.DEFAULT_LANGUAGE
self.language = DEFAULT_LANGUAGE

# when workflow loaded use language saved in workflow
if self.__pending_language is not None:
Expand All @@ -127,9 +128,7 @@ def update_method(self):
self.vectorizer = EmbeddingVectorizer(self.init_method(), self.corpus)

def init_method(self):
params = dict(
language=LANG2ISO[self.language], aggregator=self.aggregator
)
params = dict(language=self.language, aggregator=self.aggregator)
kwargs = ({}, params)[self.method]
return self.Methods[self.method](**kwargs)

Expand Down Expand Up @@ -170,18 +169,22 @@ def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
settings["language"] = LANGUAGES[settings["language"]]
if "aggregator" in settings:
settings["aggregator"] = AGGREGATORS[settings["aggregator"]]
if (version is None or version < 3) and "language" in settings:
    # Before version 3 the language setting held a language name; convert it
    # to its ISO code. The parentheses are required: `and` binds tighter than
    # `or`, so without them settings that have no version and no "language"
    # key would reach the lookup below and raise KeyError.
    settings["language"] = LANG2ISO[settings["language"]]

def send_report(self):
if self.method == 0:
self.report_items((
("Embedder", "Multilingual SBERT"),
))
if self.method == 1:
self.report_items((
items = (
("Embedder", "fastText"),
("Language", self.language),
("Language", ISO2LANG[self.language]),
("Aggregator", self.aggregator),
))
)
self.report_items(items)


if __name__ == "__main__":
Expand Down
21 changes: 13 additions & 8 deletions orangecontrib/text/widgets/owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,7 @@
from orangecontrib.text.corpus import Corpus
from orangecontrib.text.import_documents import ImportDocuments, NoDocumentsException
from orangecontrib.text.language import (
ISO2LANG,
detect_language,
LANG2ISO,
LanguageModel,
detect_language, LanguageModel, DEFAULT_LANGUAGE, LANG2ISO, migrate_language_name
)

# domain for skipped images output
Expand Down Expand Up @@ -124,6 +121,7 @@ class Outputs:
skipped_documents = Output("Skipped documents", Table)

settingsHandler = ImportDocumentContextHandler()
settings_version = 2

LOCAL_FILE, URL = range(2)
source = settings.Setting(LOCAL_FILE)
Expand All @@ -134,7 +132,7 @@ class Outputs:
lemma_cb = settings.Setting(True)
pos_cb = settings.Setting(False)
ner_cb = settings.Setting(False)
language: str = settings.ContextSetting("English")
language: str = settings.ContextSetting(DEFAULT_LANGUAGE)

want_main_area = False
resizing_enabled = False
Expand Down Expand Up @@ -253,7 +251,7 @@ def __init__(self):
self,
"language",
box="Language",
model=LanguageModel(),
model=LanguageModel(include_none=True),
sendSelectedValue=True,
searchable=True,
callback=self.commit,
Expand Down Expand Up @@ -665,7 +663,7 @@ def __onRunFinished(self):
self.n_text_data = len(corpus)
self.n_text_categories = len(corpus.domain.class_var.values) \
if corpus.domain.class_var else 0
self.language = ISO2LANG[corpus.language or detect_language(corpus)]
self.language = corpus.language or detect_language(corpus)
self.openContext(corpus)
else:
self.language = None
Expand Down Expand Up @@ -727,7 +725,7 @@ def commit(self):
if self.is_conllu:
self.add_features()
if self.corpus:
self.corpus.attributes["language"] = LANG2ISO[self.language]
self.corpus.attributes["language"] = self.language
self.Outputs.data.send(self.corpus)
if self.skipped_documents:
skipped_table = (
Expand Down Expand Up @@ -791,6 +789,13 @@ def send_report(self):
items += [('Number of skipped', len(self.skipped_documents))]
self.report_items(items, )

@classmethod
def migrate_context(cls, context, version):
    """
    Upgrade a stored context to the current settings version.

    Contexts older than version 2 keep the language as a language name.
    The name is mapped to its current spelling and then replaced with the
    matching ISO code.
    """
    if version < 2 and "language" in context.values:
        # NOTE(review): treats the stored value as a bare language name, not
        # a (value, type) pair - confirm against this widget's context handler.
        current_name = migrate_language_name(context.values["language"])
        context.values["language"] = LANG2ISO[current_name]


class UserInterruptError(BaseException):
"""
Expand Down
Loading
Loading