cli: add annotate
2maz committed Mar 11, 2025
1 parent ecc18a1 commit 08d61be
Showing 8 changed files with 234 additions and 25 deletions.
69 changes: 69 additions & 0 deletions src/damast/cli/data_annotate.py
@@ -0,0 +1,69 @@
import re
from argparse import ArgumentParser
from pathlib import Path

from damast.cli.base import BaseParser
from damast.core.annotations import Annotation
from damast.core.dataframe import DAMAST_SPEC_SUFFIX, AnnotatedDataFrame
from damast.core.metadata import DataSpecification, MetaData


class DataAnnotateParser(BaseParser):
"""
Argument parser for creating the data specification / annotating an AnnotatedDataFrame
:param parser: The base parser
"""

def __init__(self, parser: ArgumentParser):
super().__init__(parser=parser)

parser.description = "damast annotate - data annotation subcommand called"
parser.add_argument("-f", "--filename",
help="Filename or pattern of the (annotated) data file that should be annotated",
required=True
)
parser.add_argument("-i", "--interactive",
help="Perform the annotation interactively",
default=False,
required=False)

parser.add_argument("-o", "--output-dir",
help="Output directory",
default=None,
required=False)


def execute(self, args):
super().execute(args)

base = Path(args.filename).parent
name = Path(args.filename).name
files = [x for x in base.glob(name)]

sum_st_size = 0
for idx, file in enumerate(files, start=1):
sum_st_size += file.stat().st_size

print(f"Loading dataframe ({len(files)} files) of total size: {sum_st_size / (1024**2):.2f} MB")

adf = AnnotatedDataFrame.from_file(filename=args.filename, metadata_required=False)
print(adf.head(10).collect())

metadata_filename = Path(args.filename).with_suffix(DAMAST_SPEC_SUFFIX)
if args.output_dir is not None:
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

metadata_filename = output_dir / metadata_filename.name

metadata = AnnotatedDataFrame.infer_annotation(df=adf.dataframe)
metadata.add_annotation(
Annotation(
name=Annotation.Key.Source,
value=args.filename
)
)
metadata.save_yaml(metadata_filename)

print(f"Created {metadata_filename}")
4 changes: 4 additions & 0 deletions src/damast/cli/main.py
@@ -5,6 +5,7 @@
from pathlib import Path

from damast.cli.base import BaseParser
from damast.cli.data_annotate import DataAnnotateParser
from damast.cli.data_converter import DataConvertParser
from damast.cli.data_inspect import DataInspectParser
from damast.cli.data_processing import DataProcessingParser
@@ -38,6 +39,9 @@ def run():
Run the main command line interface
"""
main_parser = MainParser()
main_parser.attach_subcommand_parser(subcommand="annotate",
help="Annotate a dataframe",
parser_klass=DataAnnotateParser)

convert_help_desc = "Convert a dataset (set of .csv-files) to a .h5-file (containing the data)" +\
" and .yml-file (containing data specification)"
55 changes: 38 additions & 17 deletions src/damast/core/dataframe.py
@@ -19,6 +19,7 @@
import pyarrow.parquet as pq

from .annotations import Annotation
from .datarange import MinMax
from .metadata import DataSpecification, MetaData, ValidationMode
from .types import DataFrame, XDataFrame

@@ -187,7 +188,7 @@ def export(self, filename: str | Path):
@classmethod
def from_file(cls,
filename: Union[str, Path],
metadata_required: bool = False
metadata_required: bool = True,
) -> AnnotatedDataFrame:
"""
Create an annotated dataframe from a data file (parquet, csv, or hdf5).
@@ -206,7 +207,10 @@ def from_file(cls,
except Exception as e:
_log.warning(f"{filename} has no (damast) annotations")
elif Path(filename).suffix in [ ".csv"]:
df = polars.scan_csv(filename)
df = polars.scan_csv(filename, separator=";")
if len(df.compat.column_names) <= 1:
# unlikely that this frame has only one column, so trying with comma
df = polars.scan_csv(filename, separator=",")
elif Path(filename).suffix in [".h5", ".hdf5"]:
df, metadata = XDataFrame.import_hdf5(filename)
else:
@@ -215,25 +219,42 @@ def from_file(cls,
"csv (.csv), and hdf5 (.h5, .hdf5 generated by vaex or pandas"
)

if not metadata:
metadata_filename = Path(filename).with_suffix(DAMAST_SPEC_SUFFIX)
if metadata_filename.exists():
metadata = MetaData.load_yaml(filename=metadata_filename)
if metadata is None:
if metadata_required:
metadata_filename = Path(filename).with_suffix(DAMAST_SPEC_SUFFIX)
if metadata_filename.exists():
metadata = MetaData.load_yaml(filename=metadata_filename)
else:
head = df.head(10).collect()
# metadata missing
raise RuntimeError(
f"{cls.__name__}.from_file:"
f" metadata is missing in '{filename}'"
f" and needs to be added'\n"
f"{head}"
)
else:
head = df.head(10).collect()
# metadata missing
raise RuntimeError(
f"{cls.__name__}.from_file:"
f" metadata is missing in '{filename}'"
f" and needs to be added'\n"
f"{head}"
)

if not metadata:
metadata = MetaData(columns=list_attrs, annotations=annotations)
metadata = MetaData(columns=[])

return cls(dataframe=df, metadata=metadata)

@classmethod
def infer_annotation(cls, df: DataFrame) -> MetaData:
column_specs: list[DataSpecification] = []
for column in df.compat.column_names:
data = {'name': column,
'is_optional': False,
'representation_type': df.compat.dtype(column)
}

if not str(df.compat.dtype(column)).lower().startswith("str"):
min_value, max_value = df.compat.minmax(column)
data['value_range'] = MinMax(min_value, max_value)

ds = DataSpecification(**data)
column_specs.append(ds)
return MetaData(columns=column_specs, annotations=[])

@classmethod
def convert_csv_to_adf(
cls,
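To illustrate what infer_annotation assembles per column — numeric columns get a MinMax value range, string columns do not — a hand-built sketch, assuming DataSpecification accepts polars dtype instances as representation_type (the adjusted __iter__ in metadata.py below serializes them); the values mirror the generated tests/damast/data/test_ais.spec.yaml:

import polars

from damast.core.datarange import MinMax
from damast.core.metadata import DataSpecification, MetaData

# Hand-built equivalents of two inferred column specifications.
lon = DataSpecification(name="lon",
                        is_optional=False,
                        representation_type=polars.Float64(),
                        value_range=MinMax(5.25503, 5.54976))
date_time_utc = DataSpecification(name="date_time_utc",
                                  is_optional=False,
                                  representation_type=polars.String())
inferred = MetaData(columns=[lon, date_time_utc], annotations=[])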
10 changes: 8 additions & 2 deletions src/damast/core/metadata.py
@@ -364,8 +364,8 @@ def __iter__(self):
if self.representation_type is not None:
if inspect.isclass(self.representation_type):
yield "representation_type", self.representation_type.__name__
elif isinstance(self.representation_type, DataType):
yield "representation_type", self.representation_type.name
elif isinstance(self.representation_type, pl.datatypes.DataType):
yield "representation_type", str(self.representation_type)
else:
raise TypeError(
f"{self.__class__.__name__}.__iter__ failed to identify representation_type from"
@@ -883,6 +883,12 @@ def __init__(
)
self._annotations = {an.name: an for an in annotations}

def add_annotation(self, annotation: Annotation):
if annotation.name in self._annotations:
raise KeyError(f"Annotation {annotation.name} already exists")

self._annotations[annotation.name] = annotation

@property
def annotations(self) -> Dict[str, Annotation]:
"""Get dictionary of annotations"""
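The new add_annotation guard keeps annotation keys unique, so registering the same key twice raises. A small sketch of the expected behaviour:

from damast.core.annotations import Annotation
from damast.core.metadata import MetaData

metadata = MetaData(columns=[])
metadata.add_annotation(Annotation(name=Annotation.Key.Source, value="a.csv"))
try:
    # A second annotation under an already registered key is rejected.
    metadata.add_annotation(Annotation(name=Annotation.Key.Source, value="b.csv"))
except KeyError as error:
    print(error)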
29 changes: 25 additions & 4 deletions tests/damast/cli/test_cli.py
@@ -4,22 +4,26 @@
from pathlib import Path

import pytest
import yaml
from pytest_console_scripts import ScriptRunner

import damast.cli.main as cli_main
from damast.cli.data_annotate import DataAnnotateParser
from damast.cli.data_converter import DataConvertParser
from damast.cli.data_inspect import DataInspectParser
from damast.cli.data_processing import DataProcessingParser
from damast.cli.experiment import ExperimentParser
from damast.core.dataframe import DAMAST_SPEC_SUFFIX


@pytest.fixture
def subparsers():
return [
"annotate",
"convert",
"experiment",
"inspect",
"process",
"convert",
"experiment"
]

def test_help(subparsers, capsys, monkeypatch):
@@ -32,10 +36,11 @@ def test_help(subparsers, capsys, monkeypatch):


@pytest.mark.parametrize("name, klass", [
[ "inspect", DataInspectParser ],
[ "process", DataProcessingParser ],
[ "annotate", DataAnnotateParser ],
[ "convert", DataConvertParser ],
[ "inspect", DataInspectParser ],
[ "experiment", ExperimentParser ],
[ "process", DataProcessingParser ],
])
def test_subparser(name, klass, script_runner):
result = script_runner.run(['damast', name, "--help"])
@@ -63,3 +68,19 @@ def test_inspect(data_path, filename, script_runner):
assert re.search("Loading dataframe \(1 files\)", result.stdout) is not None, "Process dataframe"
assert re.search("shape:", result.stdout) is not None, "Show dataframe"

@pytest.mark.parametrize("filename, spec_filename", [
["test_ais.csv", f"test_ais{DAMAST_SPEC_SUFFIX}"]
])
def test_annotate(data_path, filename, spec_filename, tmp_path, script_runner):
result = script_runner.run(['damast', 'annotate', '-f', str(data_path / filename), '-o', tmp_path])

assert result.returncode == 0

with open(tmp_path / spec_filename, "r") as f:
written_spec = yaml.load(f, Loader=yaml.SafeLoader)

with open(data_path / spec_filename, "r") as f:
expected_spec = yaml.load(f, Loader=yaml.SafeLoader)
expected_spec["annotations"]["source"] = str(data_path / filename)

assert written_spec == expected_spec
4 changes: 3 additions & 1 deletion tests/damast/conftest.py
@@ -1,6 +1,8 @@
import pytest
from pathlib import Path

import pytest


@pytest.fixture
def data_path():
return Path(__file__).parent / "data"
2 changes: 1 addition & 1 deletion tests/damast/core/test_dataframe.py
@@ -234,7 +234,7 @@ def test_force_range():
assert XDataFrame(adf._dataframe).equals(XDataFrame(df_filtered))

def test_convert_csv_to_adf(tmp_path):
output_filename = tmp_path / "test-convert-csv.pq" #hdf5"
output_filename = tmp_path / "test-convert-csv.pq"
test_path = Path(__file__).parent.parent / "data"
AnnotatedDataFrame.convert_csv_to_adf(csv_filenames=[test_path / "test_dataframe.csv"],
metadata_filename=test_path / "test_dataframe.spec.yaml",
86 changes: 86 additions & 0 deletions tests/damast/data/test_ais.spec.yaml
@@ -0,0 +1,86 @@
columns:
- name: mmsi
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 209318000
max: 230666000
allow_missing: true
- name: imo_nr
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 8911736
max: 9754446
allow_missing: true
- name: length
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 89
max: 170
allow_missing: true
- name: date_time_utc
is_optional: false
representation_type: String
- name: lon
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 5.25503
max: 5.54976
allow_missing: true
- name: lat
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 58.7635
max: 59.407
allow_missing: true
- name: sog
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 9.4
max: 18.0
allow_missing: true
- name: cog
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 2.9
max: 355.7
allow_missing: true
- name: true_heading
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 6
max: 359
allow_missing: true
- name: nav_status
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 0
max: 0
allow_missing: true
- name: message_nr
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 1
max: 1
allow_missing: true
annotations:
source: tests/damast/data/test_ais.csv
