|
1 | 1 | import contextlib
|
| 2 | +import datetime |
2 | 3 | import fnmatch
|
3 | 4 | import logging
|
4 | 5 | import os
|
|
29 | 30 |
|
30 | 31 | import serverfiles
|
31 | 32 |
|
32 |
| -from Orange.data import DiscreteVariable, Domain, StringVariable |
33 |
| -from Orange.data.io import detect_encoding, UrlReader as CoreUrlReader |
| 33 | +from Orange.data import DiscreteVariable, Domain, StringVariable, \ |
| 34 | + guess_data_type |
| 35 | +from Orange.data.io import detect_encoding, sanitize_variable,\ |
| 36 | + UrlReader as CoreUrlReader |
34 | 37 | from Orange.data.util import get_unique_names
|
35 | 38 | from Orange.util import Registry
|
36 | 39 |
|
@@ -178,6 +181,9 @@ class YamlMetaReader(Reader):
|
def read_file(self):
    """Load the YAML file at ``self.path`` into ``self.content``.

    The parsed document is stored on ``self.content``. When the document
    is a mapping, every key whose value parsed as ``None`` (a YAML null,
    e.g. ``key:`` with nothing after it) is rewritten to the empty string
    so later consumers see a string instead of ``None``.

    Documents that are not mappings are stored unchanged: ``safe_load``
    yields ``None`` for an empty file and may yield a list or scalar, and
    none of those can be normalised key-by-key.
    """
    with open(self.path, "r") as f:
        self.content = yaml.safe_load(f)

    # Guard before iterating: looping over / indexing a non-mapping result
    # (None, list, scalar) would raise TypeError.
    if isinstance(self.content, dict):
        # Only existing keys are reassigned, so the dict does not change
        # size while it is being iterated.
        for key, value in self.content.items():
            if value is None:
                self.content[key] = ""
181 | 187 |
|
182 | 188 |
|
183 | 189 | class UrlReader(Reader, CoreUrlReader):
|
@@ -345,13 +351,18 @@ def _add_metadata(self, corpus: Corpus) -> Corpus:
|
345 | 351 | if len(df.index.drop_duplicates()) != len(df.index):
|
346 | 352 | df = df[~df.index.duplicated(keep='first')]
|
347 | 353 | filtered = df.reindex(path_column)
|
348 |
| - for column in filtered.columns: |
| 354 | + for name, column in filtered.iteritems(): |
| 355 | + data = column.astype(str).values |
| 356 | + val_map, vals, var_type = guess_data_type(data) |
| 357 | + values, variable = sanitize_variable(val_map, vals, data, |
| 358 | + var_type, {}, |
| 359 | + name=get_unique_names( |
| 360 | + corpus.domain, name)) |
349 | 361 | corpus = corpus.add_column(
|
350 |
| - StringVariable(get_unique_names(corpus.domain, column)), |
351 |
| - filtered[column].to_numpy(), |
| 362 | + variable, |
| 363 | + values, |
352 | 364 | to_metas=True
|
353 | 365 | )
|
354 |
| - |
355 | 366 | return corpus
|
356 | 367 |
|
357 | 368 | @staticmethod
|
|
0 commit comments