Skip to content

Commit

Permalink
norm
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 21, 2023
1 parent 5e61092 commit c83c8a6
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 50 deletions.
47 changes: 24 additions & 23 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import warnings
from typing import List, Callable, Optional
from typing import List, Callable
import os
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
Expand Down Expand Up @@ -115,9 +114,8 @@ def _find_file(self, language):
def __files_to_dict(self, files):
iso2lang = {}
for f in files:
language_name = self.__file_to_language(f[0])
iso = self.__lang2iso(language_name)
iso2lang[iso] = (language_name, f)
langauge, iso = self.__file_to_language(f[0])
iso2lang[iso] = (langauge, f[0])
return iso2lang

@property
Expand All @@ -132,28 +130,31 @@ def online(self):
except ConnectionError:
return False

# use _ since - is already used in iso standard
VARIATION_DELIMITER = "_"

# todo: improve
def __lang2iso(self, language):
    """
    Convert a language label to its ISO-based code.

    A plain label is looked up directly in LANG2ISO. A label of the form
    "Language (model)" is split at "(": the language part is stripped and
    looked up in LANG2ISO, then joined with the model name (closing ")"
    stripped) using UDPipeModels.VARIATION_DELIMITER.
    """
    # plain language name without a model variation
    if "(" not in language:
        return LANG2ISO[language]

    name, variation = language.split("(")
    iso_code = LANG2ISO[name.strip()]
    return UDPipeModels.VARIATION_DELIMITER.join((iso_code, variation.strip(")")))

def __file_to_language(self, file):
lg = file[: file.find("ud") - 1].split("-")
# if filename includes "-" then variation is part of the name
lg, model_variation = lg if len(lg) == 2 else (lg[0], "")
"""
Transform filenames to language strings and ISO codes.
Language name has the format "Language (Model)".
The ISO code consists of the real ISO code with the model variation appended,
for example "en_lines" for the English lines model.
"""
# language and potential model variation are delimited with -
name_split = file[: file.find("ud") - 1].split("-")
# capitalize multi-word languages separated by _
lg = " ".join(map(lambda x: x.capitalize(), lg.split("_")))
lg = name_split[0].replace("_", " ").title()
# fix wrong spelling for Norwegian Bokmål
lg = self.UDPIPE2LANG.get(lg, lg)
if model_variation:
model_variation = f"({model_variation})"
return " ".join((lg, model_variation)).strip()

if len(name_split) > 1:
# languages with multiple models have model name as second item in split
return f"{lg} ({name_split[1]})", self.__lang2iso(lg, name_split[1])
return lg, self.__lang2iso(lg, None)

@staticmethod
def __lang2iso(language, model):
    """
    Build the ISO-based code for a language and an optional model variation.

    `language` is looked up in LANG2ISO. When `model` is truthy it is
    appended to the ISO code with "_" as the separator; otherwise the ISO
    code alone is returned.
    """
    iso_code = LANG2ISO[language]
    # only add the variation part when a model name was given
    parts = (iso_code, model) if model else (iso_code,)
    return "_".join(parts)


class UDPipeStopIteration(StopIteration):
Expand Down
42 changes: 15 additions & 27 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,8 @@ def test_udpipe_pickle(self):
# udpipe stores the model after the first call - the model is not picklable
normalizer(self.corpus)
loaded = pickle.loads(pickle.dumps(normalizer))
self.assertEqual(normalizer._language, loaded._language)
self.assertEqual(normalizer._UDPipeLemmatizer__language,
loaded._UDPipeLemmatizer__language)
self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
loaded._UDPipeLemmatizer__use_tokenizer)
with self.corpus.unlocked():
Expand All @@ -332,7 +333,8 @@ def test_udpipe_pickle(self):
def test_udpipe_deepcopy(self):
normalizer = preprocess.UDPipeLemmatizer("lt", True)
copied = copy.deepcopy(normalizer)
self.assertEqual(normalizer._language, copied._language)
self.assertEqual(normalizer._UDPipeLemmatizer__language,
copied._UDPipeLemmatizer__language)
self.assertEqual(normalizer._UDPipeLemmatizer__use_tokenizer,
copied._UDPipeLemmatizer__use_tokenizer)
with self.corpus.unlocked():
Expand Down Expand Up @@ -363,7 +365,7 @@ def test_normalizers_picklable(self):
for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
normalizer = getattr(preprocess.normalize, nm)
normalizer = (
normalizer(language="Lithuanian")
normalizer(language="lt")
if normalizer is preprocess.UDPipeLemmatizer
else normalizer()
)
Expand All @@ -372,7 +374,7 @@ def test_normalizers_picklable(self):
loaded(self.corpus)

def test_cache(self):
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
normalizer = preprocess.UDPipeLemmatizer("lt")
with self.corpus.unlocked():
self.corpus.metas[0, 0] = "esu"
normalizer(self.corpus)
Expand All @@ -388,23 +390,17 @@ def test_cache(self):
class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self, _):
"""Test helper functions for label transformation"""
model = UDPipeModels()
self.assertEqual(
model.file_to_language("slovenian-sst-ud-2.0-170801.udpipe"),
"Slovenian (sst)"
)
self.assertEqual(model.__iso_to_file("sl_sst"), "slovenian-sst-ud")
self.assertEqual(
model.file_to_language("norwegian_bokmaal-sst-ud-2.0-170801.udpipe"),
"Norwegian Bokmål (sst)",
)
self.assertEqual(model.__iso_to_file("nb_sst"), "norwegian_bokmaal-sst-ud")
fun = UDPipeModels()._UDPipeModels__file_to_language
r = fun("slovenian-sst-ud-2.0-170801.udpipe")
self.assertTupleEqual(r, ("Slovenian (sst)", "sl_sst"))
r = fun("norwegian_bokmaal-sst-ud-2.0-170801.udpipe")
self.assertTupleEqual(r, ("Norwegian Bokmål (sst)", "nb_sst"))

@patch(SF_DOWNLOAD, download_patch)
def test_udpipe_model(self, _):
"""Test udpipe models loading from server"""
models = UDPipeModels()
self.assertIn("lt", models.supported_languages_iso())
self.assertIn(('Lithuanian', 'lt'), models.supported_languages)
self.assertEqual(7, len(models.supported_languages))

local_file = os.path.join(models.local_data, "lithuanian-ud-2.0-170801.udpipe")
Expand All @@ -420,24 +416,16 @@ def test_udpipe_local_models(self, sf_mock):
# use the Lithuanian ("lt") model here; it is small, so we can have it in the repository
_ = models["lt"]
sf_mock.side_effect = ConnectionError()
self.assertIn("lt", UDPipeModels().supported_languages_iso())
self.assertIn("Lithuanian", UDPipeModels().supported_languages)
self.assertEqual(1, len(UDPipeModels().supported_languages_iso()))
exp = {"lt": ('Lithuanian', 'lithuanian-ud-2.0-170801.udpipe')}
self.assertDictEqual(exp, models.model_files)
self.assertListEqual([('Lithuanian', 'lt')], models.supported_languages)

def test_udpipe_offline(self, sf_mock):
    """Check that `online` is True normally and False once the mock raises."""
    # with the mock behaving normally, models report being online
    online_before = UDPipeModels().online
    self.assertTrue(online_before)

    # make the mocked call raise ConnectionError; models must report offline
    sf_mock.side_effect = ConnectionError()
    online_after = UDPipeModels().online
    self.assertFalse(online_after)

def test_language_to_iso(self, _):
    """Check that language labels map to the expected ISO-based codes."""
    # label -> expected code, checked in this order
    expected_codes = {
        "English": "en",
        "English (lines)": "en_lines",
    }
    for label, iso_code in expected_codes.items():
        self.assertEqual(iso_code, UDPipeModels.lang2iso(label))

def test_iso_to_language(self, _):
    """Check that ISO-based codes map back to the expected language labels."""
    # code -> expected label, checked in this order
    expected_labels = {
        "en": "English",
        "en_lines": "English (lines)",
    }
    for iso_code, label in expected_labels.items():
        self.assertEqual(label, UDPipeModels.iso_to_language(iso_code))


class FilteringTests(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit c83c8a6

Please sign in to comment.