Import Documents: Add conllu reader #675

Merged · 6 commits · Jul 23, 2021
MANIFEST.in (2 changes: 1 addition & 1 deletion)
@@ -5,7 +5,7 @@ recursive-include orangecontrib/text/tests *.txt *.json
recursive-include orangecontrib/text/tutorials *.ows
recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai
recursive-include orangecontrib/text/widgets/resources *.js *.css *.html
recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt
recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu
include orangecontrib/text/widgets/tests/bow-test
recursive-include scripts *.sh *.py

Binary file removed doc/widgets/images/Import-Documents-stamped.png
Binary file added doc/widgets/images/ImportDocuments-Conllu.png
Binary file added doc/widgets/images/ImportDocuments.png
doc/widgets/importdocuments.md (17 changes: 14 additions & 3 deletions)
@@ -10,18 +10,29 @@ Import text documents from folders.
**Outputs**

- Corpus: A collection of documents from the local machine.
- Skipped Documents: A list of documents that couldn't be imported.

**Import Documents** widget retrieves text files from folders and creates a corpus. The widget reads .txt, .docx, .odt, .pdf and .xml files. If a folder contains subfolders, they will be used as class labels.
The **Import Documents** widget retrieves text files from folders and creates a corpus. The widget reads .txt, .docx, .odt, .pdf, .xml, and .conllu files. If a folder contains subfolders, they will be used as class labels.

![](images/Import-Documents-stamped.png)
![](images/ImportDocuments.png)

1. Folder being loaded.
2. Load folder from a local machine.
3. Reload the data.
4. Number of documents retrieved.
4. Options for importing .conllu files.
5. Number of documents retrieved.

If the widget cannot read a file for some reason, that file will be skipped. Files that were successfully retrieved will still be on the output.

Conllu files
------------

![](images/ImportDocuments-Conllu.png)

Since Text version 1.5.0, Orange supports reading [.conllu files](https://universaldependencies.org/format.html). Each file is considered a separate document in the corpus. If utterance IDs (`newdoc id` entries) are present in the file, utterances become the documents instead (each row in the corpus will be a single utterance).

Lemmas and POS tags from *Conllu import options* will be added as tokens and the corpus will be considered preprocessed. Named entities will be added as a comma-separated string (if they exist in the file).
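
For reference, here is a minimal sketch of how the underlying `conllu` Python library (which the reader uses) exposes these fields; the file name is a placeholder, and the `NER` key in the MISC column is only present in corpora that ship named-entity annotations:

```python
from conllu import parse_incr

# Minimal sketch; "example.conllu" is a placeholder path.
with open("example.conllu", encoding="utf-8") as f:
    for sentence in parse_incr(f):
        doc_id = sentence.metadata.get("newdoc id")       # utterance ID, if present
        lemmas = [token["lemma"] for token in sentence]   # become corpus tokens
        pos_tags = [token["upos"] for token in sentence]  # POS tags
        # Named entities, when annotated, live in the MISC column (e.g. NER=B-PER);
        # "O" marks tokens outside any entity.
        ner_tags = [token["misc"].get("NER", "O") if token["misc"] else "O"
                    for token in sentence]
        print(doc_id, lemmas, pos_tags, ner_tags)
```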

Example
-------

orangecontrib/text/import_documents.py (189 changes: 157 additions & 32 deletions)
@@ -7,6 +7,8 @@
import re
import yaml
from urllib.parse import quote, unquote

from conllu import parse_incr
from requests.exceptions import ConnectionError

from collections import namedtuple
@@ -39,7 +41,7 @@

from orangecontrib.text.corpus import Corpus

DefaultFormats = ("docx", "odt", "txt", "pdf", "xml")
DefaultFormats = ("docx", "odt", "txt", "pdf", "xml", "conllu")

TextData = namedtuple(
"Text",
@@ -88,7 +90,8 @@ def read(self, ):
return textdata, error

def read_file(self):
raise NotImplementedError("No reader for {}".format(pathlib.Path(self.path).suffix))
raise NotImplementedError(
"No reader for {}".format(pathlib.Path(self.path).suffix))

def make_text_data(self):
name = pathlib.Path(self.path).stem
@@ -153,7 +156,8 @@ def read_file(self):
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj,
LTTextLine):
extracted_text.append(lt_obj.get_text())
self.content = ' '.join(extracted_text).replace('\x00', '')

@@ -186,6 +190,13 @@ def read_file(self):
self.content[k] = ""


class TsvMetaReader(Reader):
ext = [".tsv"]

def read_file(self):
self.content = pd.read_csv(self.path, delimiter="\t")


class UrlReader(Reader, CoreUrlReader):
ext = [".url"]

@@ -217,8 +228,95 @@ def make_text_data(self):
text_data.category, text_data.content)


class ConlluReader(Reader):
TextData = namedtuple(
"Text",
["name", "path", "ext", "category", "doc_id", "content"]
)

ext = [".conllu"]

def __init__(self, path):
super().__init__(path)
self.tokens = None
self.pos = None
self.ner = None

@staticmethod
def parse_ner(tokens):
entities = []
temp_ner = []
for token in tokens:
if token["misc"] is None or "NER" not in token["misc"]:
continue
# "0" means the token is not named entity
if token["misc"]["NER"] != "O":
# use the lemma as the entity token
temp_ner.append(token["lemma"])
elif temp_ner:
entities.append(" ".join(temp_ner))
temp_ner = []
if temp_ner:
entities.append(" ".join(temp_ner))
return entities

def read_file(self):
content = []
file = open(self.path, "r", encoding="utf-8")
utterance_id = ""
utterance = []
tokens = []
pos = []
ner = []
temp_tokens = []
temp_pos = []
temp_ner = []
for sentence in parse_incr(file):
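# A "newdoc id" entry in the sentence metadata marks the start of a new
# utterance/document, so flush what was collected for the previous one.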
if "newdoc id" in sentence.metadata.keys():
if utterance_id:
content.append([utterance_id, " ".join(utterance)])
tokens.append(temp_tokens)
pos.append(temp_pos)
ner.append(temp_ner)
utterance = []
temp_tokens = []
temp_pos = []
temp_ner = []
utterance_id = sentence.metadata["newdoc id"]
utterance.append(sentence.metadata["text"])
temp_tokens.extend([token["lemma"] for token in sentence])
temp_pos.extend([token["upos"] for token in sentence])
temp_ner.extend(self.parse_ner(sentence))
if temp_tokens or utterance:
content.append([utterance_id, " ".join(utterance)])
tokens.append(temp_tokens)
pos.append(temp_pos)
ner.append(temp_ner)
file.close()
self.tokens = tokens
self.pos = pos
self.ner = np.array([", ".join(tokens) for tokens in ner], dtype=object)
self.content = pd.DataFrame(content, columns=["newdoc id", "text"])

def make_text_data(self):
text_objects = []
name = pathlib.Path(self.path).stem
directory = pathlib.PurePath(self.path).parent
category = directory.parts[-1] or "None"
for _, row in self.content.iterrows():
if self.replace_white_space:
row["text"] = re.sub(r'\s+', ' ', row["text"])
text_objects.append(self.TextData(name, self.path, self.ext,
category,
row["newdoc id"],
row["text"]))
return text_objects


class ImportDocuments:
META_DATA_FILE_KEY = "Text file"
# column used to merge meta data on; TODO: change to a user-set variable
CONLLU_META_DATA = "ID"

def __init__(self, startdir: str,
is_url: bool = False,
@@ -235,13 +333,19 @@ def __init__(self, startdir: str,
self._is_url = is_url
self._text_data = []
self._meta_data: pd.DataFrame = None

def run(self) -> Tuple[Corpus, List]:
self._text_data, errors_text = self._read_text_data()
self.is_conllu = False
self.tokens = None
self.pos = None
self.ner = None

def run(self) -> Tuple[Corpus, List, List, List, List, bool]:
self._text_data, errors_text, tokens, pos, ner, conllu \
= self._read_text_data()
self._meta_data, errors_meta = self._read_meta_data()
self.is_conllu = conllu
corpus = self._create_corpus()
corpus = self._add_metadata(corpus)
return corpus, errors_text + errors_meta
return corpus, errors_text + errors_meta, tokens, pos, ner, conllu

def _read_text_data(self):
text_data = []
@@ -251,6 +355,10 @@ def _read_text_data(self):
paths = scan(self.startdir, include_patterns=patterns)
n_paths = len(paths)
batch = []
tokens = []
pos = []
ner = []
conllu = False

if n_paths == 0:
raise NoDocumentsException()
@@ -267,19 +375,28 @@
else UrlReader(path)
text, error = reader.read()
if text is not None:
text_data.append(text)
if type(reader) == ConlluReader:
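# A ConlluReader returns a list of TextData objects (one per utterance)
# plus per-utterance tokens, POS tags and NER strings.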
conllu = True
for t in text:
text_data.append(t)
tokens.extend(reader.tokens)
pos.extend(reader.pos)
ner.extend(reader.ner)
else:
conllu = False
text_data.append(text)
batch.append(text_data)
else:
errors.append(error)

if self.cancelled:
return

return text_data, errors
return text_data, errors, tokens, pos, ner, conllu

def _read_meta_data(self):
scan = self.scan_url if self._is_url else self.scan
patterns = ["*.csv", "*.yaml", "*.yml"]
patterns = ["*.csv", "*.yaml", "*.yml", "*.tsv"]
paths = scan(self.startdir, include_patterns=patterns)
meta_dfs, errors = [], []
for path in paths:
@@ -301,25 +418,27 @@

def _create_corpus(self) -> Corpus:
corpus = None
names = ["name", "path", "content"]
names = ["name", "path", "content"] if not self.is_conllu else [
"name", "path", "utterance", "content"]
data = []
category_data = []
text_categories = list(set(t.category for t in self._text_data))
values = list(set(text_categories))
category_var = DiscreteVariable.make("category", values=values)
for textdata in self._text_data:
data.append(
[
# some characters are written as decomposed (č is char c
# and separate char for caron), with NFC normalization we
# normalize them to be written as precomposed (č is one
# unicode char - 0x10D)
# https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
normalize('NFC', textdata.name),
normalize('NFC', textdata.path),
normalize('NFC', textdata.content)
]
)
datum = [
# some characters are written as decomposed (č is char c
# and separate char for caron), with NFC normalization we
# normalize them to be written as precomposed (č is one
# unicode char - 0x10D)
# https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
normalize('NFC', textdata.name),
normalize('NFC', textdata.path),
normalize('NFC', textdata.content)
]
if self.is_conllu:
datum.insert(2, normalize('NFC', textdata.doc_id))
data.append(datum)
category_data.append(category_var.to_val(textdata.category))
if len(text_categories) > 1:
category_data = np.array(category_data)
Expand All @@ -335,19 +454,24 @@ def _create_corpus(self) -> Corpus:
corpus = Corpus(domain,
Y=category_data,
metas=data,
text_features=[domain.metas[2]])

text_features=[domain.metas[-1]])
return corpus

def _add_metadata(self, corpus: Corpus) -> Corpus:
if "path" not in corpus.domain or self._meta_data is None \
or self.META_DATA_FILE_KEY not in self._meta_data.columns:
or (self.META_DATA_FILE_KEY not in self._meta_data.columns
and self.CONLLU_META_DATA not in self._meta_data.columns):
return corpus

df = self._meta_data.set_index(
self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
)
path_column = corpus.get_column_view("path")[0]
if self.is_conllu:
df = self._meta_data.set_index(self.CONLLU_META_DATA)
path_column = corpus.get_column_view("utterance")[0]
else:
df = self._meta_data.set_index(
self.startdir + self._meta_data[self.META_DATA_FILE_KEY]
)
path_column = corpus.get_column_view("path")[0]

if len(df.index.drop_duplicates()) != len(df.index):
df = df[~df.index.duplicated(keep='first')]
filtered = df.reindex(path_column)
@@ -396,8 +520,9 @@ def scan(topdir, include_patterns=("*",), exclude_patterns=(".*",)):

filenames = [fname for fname in filenames
if matches_any(fname, include_patterns)
and not matches_any(fname, exclude_patterns)]
paths = paths + [os.path.join(dirpath, fname) for fname in filenames]
and not matches_any(fname, exclude_patterns)]
paths = paths + [os.path.join(dirpath, fname) for fname in
filenames]
return paths

@staticmethod