Skip to content

Commit 3fdb5e0

Browse files
authored
Merge pull request #677 from ajdapretnar/metas-proper-types
Import Documents: Read metas as the right type
2 parents a43bbef + 42405e2 commit 3fdb5e0

File tree

2 files changed

+21
-10
lines changed

2 files changed

+21
-10
lines changed

orangecontrib/text/import_documents.py

+17 -6
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import contextlib
2+
import datetime
23
import fnmatch
34
import logging
45
import os
@@ -29,8 +30,10 @@
2930

3031
import serverfiles
3132

32-
from Orange.data import DiscreteVariable, Domain, StringVariable
33-
from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader
33+
from Orange.data import DiscreteVariable, Domain, StringVariable, \
34+
guess_data_type
35+
from Orange.data.io import detect_encoding, sanitize_variable,\
36+
UrlReader as CoreUrlReader
3437
from Orange.data.util import get_unique_names
3538
from Orange.util import Registry
3639

@@ -178,6 +181,9 @@ class YamlMetaReader(Reader):
178181
def read_file(self):
179182
with open(self.path, "r") as f:
180183
self.content = yaml.safe_load(f)
184+
for k in self.content:
185+
if self.content[k] is None:
186+
self.content[k] = ""
181187

182188

183189
class UrlReader(Reader, CoreUrlReader):
@@ -345,13 +351,18 @@ def _add_metadata(self, corpus: Corpus) -> Corpus:
345351
if len(df.index.drop_duplicates()) != len(df.index):
346352
df = df[~df.index.duplicated(keep='first')]
347353
filtered = df.reindex(path_column)
348-
for column in filtered.columns:
354+
for name, column in filtered.iteritems():
355+
data = column.astype(str).values
356+
val_map, vals, var_type = guess_data_type(data)
357+
values, variable = sanitize_variable(val_map, vals, data,
358+
var_type, {},
359+
name=get_unique_names(
360+
corpus.domain, name))
349361
corpus = corpus.add_column(
350-
StringVariable(get_unique_names(corpus.domain, column)),
351-
filtered[column].to_numpy(),
362+
variable,
363+
values,
352364
to_metas=True
353365
)
354-
355366
return corpus
356367

357368
@staticmethod

orangecontrib/text/tests/test_import_documents.py

+4 -4
Original file line number | Diff line number | Diff line change
@@ -130,16 +130,16 @@ def test_run_url(self):
130130
importer = ImportDocuments(path, True)
131131
corpus2, _ = importer.run()
132132
self.assertGreater(len(corpus1), 0)
133-
self.assertEqual(corpus1.metas[mask].tolist(),
134-
corpus2.metas[mask].tolist())
133+
np.testing.assert_array_equal(corpus1.metas[mask].tolist(),
134+
corpus2.metas[mask].tolist())
135135

136136
path = "http://file.biolab.si/text-semantics/data" \
137137
"/predlogi-vladi-sample"
138138
importer = ImportDocuments(path, True)
139139
corpus3, _ = importer.run()
140140
self.assertGreater(len(corpus2), 0)
141-
self.assertEqual(corpus1.metas[mask].tolist(),
142-
corpus3.metas[mask].tolist())
141+
np.testing.assert_array_equal(corpus1.metas[mask].tolist(),
142+
corpus3.metas[mask].tolist())
143143

144144
def test_run_url_special_characters(self):
145145
path = "http://file.biolab.si/text-semantics/data/" \

0 commit comments

Comments
 (0)