cli: add annotate
2maz committed Mar 11, 2025
1 parent ecc18a1 commit 08d61be
Showing 8 changed files with 234 additions and 25 deletions.
69 changes: 69 additions & 0 deletions src/damast/cli/data_annotate.py
@@ -0,0 +1,69 @@
import re
from argparse import ArgumentParser
from pathlib import Path

from damast.cli.base import BaseParser
from damast.core.annotations import Annotation
from damast.core.dataframe import DAMAST_SPEC_SUFFIX, AnnotatedDataFrame
from damast.core.metadata import DataSpecification, MetaData


class DataAnnotateParser(BaseParser):
"""
Argument parser for creating the data specification / annotating an AnnotatedDataFrame
:param parser: The base parser
"""

def __init__(self, parser: ArgumentParser):
super().__init__(parser=parser)

parser.description = "damast annotate - data annotation subcommand called"
parser.add_argument("-f", "--filename",
help="Filename or pattern of the (annotated) data file that should be annotated",
required=True
)
parser.add_argument("-i", "--interactive",
help="Perform the annotation interactively",
default=False,
required=False)

parser.add_argument("-o", "--output-dir",
help="Output directory",
default=None,
required=False)


def execute(self, args):
super().execute(args)

base = Path(args.filename).parent
name = Path(args.filename).name
files = [x for x in base.glob(name)]

sum_st_size = 0
for idx, file in enumerate(files, start=1):
sum_st_size += file.stat().st_size

print(f"Loading dataframe ({len(files)} files) of total size: {sum_st_size / (1024**2):.2f} MB")

adf = AnnotatedDataFrame.from_file(filename=args.filename, metadata_required=False)
print(adf.head(10).collect())

metadata_filename = Path(args.filename).with_suffix(DAMAST_SPEC_SUFFIX)
if args.output_dir is not None:
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

metadata_filename = output_dir / metadata_filename.name

metadata = AnnotatedDataFrame.infer_annotation(df=adf.dataframe)
metadata.add_annotation(
Annotation(
name=Annotation.Key.Source,
value=args.filename
)
)
metadata.save_yaml(metadata_filename)

print(f"Created {metadata_filename}")
4 changes: 4 additions & 0 deletions src/damast/cli/main.py
@@ -5,6 +5,7 @@
from pathlib import Path

from damast.cli.base import BaseParser
from damast.cli.data_annotate import DataAnnotateParser
from damast.cli.data_converter import DataConvertParser
from damast.cli.data_inspect import DataInspectParser
from damast.cli.data_processing import DataProcessingParser
@@ -38,6 +39,9 @@ def run():
Run the main command line interface
"""
main_parser = MainParser()
main_parser.attach_subcommand_parser(subcommand="annotate",
help="Annotate a dataframe",
parser_klass=DataAnnotateParser)

convert_help_desc = "Convert a dataset (set of .csv-files) to a .h5-file (containing the data)" +\
" and .yml-file (containing data specification)"
55 changes: 38 additions & 17 deletions src/damast/core/dataframe.py
@@ -19,6 +19,7 @@
import pyarrow.parquet as pq

from .annotations import Annotation
from .datarange import MinMax
from .metadata import DataSpecification, MetaData, ValidationMode
from .types import DataFrame, XDataFrame

@@ -187,7 +188,7 @@ def export(self, filename: str | Path):
@classmethod
def from_file(cls,
filename: Union[str, Path],
metadata_required: bool = False
metadata_required: bool = True,
) -> AnnotatedDataFrame:
"""
Create an annotated dataframe from a data file (parquet, csv, or hdf5).
@@ -206,7 +207,10 @@ def from_file(cls,
except Exception as e:
_log.warning(f"{filename} has no (damast) annotations")
elif Path(filename).suffix in [ ".csv"]:
df = polars.scan_csv(filename)
df = polars.scan_csv(filename, separator=";")
if len(df.compat.column_names) <= 1:
# unlikely that this frame has only one column, so trying with comma
df = polars.scan_csv(filename, separator=",")
elif Path(filename).suffix in [".h5", ".hdf5"]:
df, metadata = XDataFrame.import_hdf5(filename)
else:
@@ -215,25 +219,42 @@ def from_file(cls,
"csv (.csv), and hdf5 (.h5, .hdf5 generated by vaex or pandas"
)

if not metadata:
metadata_filename = Path(filename).with_suffix(DAMAST_SPEC_SUFFIX)
if metadata_filename.exists():
metadata = MetaData.load_yaml(filename=metadata_filename)
if metadata is None:
if metadata_required:
metadata_filename = Path(filename).with_suffix(DAMAST_SPEC_SUFFIX)
if metadata_filename.exists():
metadata = MetaData.load_yaml(filename=metadata_filename)
else:
head = df.head(10).collect()
# metadata missing
raise RuntimeError(
f"{cls.__name__}.from_file:"
f" metadata is missing in '{filename}'"
f" and needs to be added'\n"
f"{head}"
)
else:
head = df.head(10).collect()
# metadata missing
raise RuntimeError(
f"{cls.__name__}.from_file:"
f" metadata is missing in '{filename}'"
f" and needs to be added'\n"
f"{head}"
)

if not metadata:
metadata = MetaData(columns=list_attrs, annotations=annotations)
metadata = MetaData(columns=[])

return cls(dataframe=df, metadata=metadata)

@classmethod
def infer_annotation(cls, df: DataFrame) -> MetaData:
column_specs: list[DataSpecification] = []
for column in df.compat.column_names:
data = {'name': column,
'is_optional': False,
'representation_type': df.compat.dtype(column)
}

if not str(df.compat.dtype(column)).lower().startswith("str"):
min_value, max_value = df.compat.minmax(column)
data['value_range'] = MinMax(min_value, max_value)

ds = DataSpecification(**data)
column_specs.append(ds)
return MetaData(columns=column_specs, annotations=[])

@classmethod
def convert_csv_to_adf(
cls,
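To illustrate what infer_annotation assembles per column — numeric columns get a MinMax value range, string columns do not — a hand-built sketch, assuming DataSpecification accepts polars dtype instances as representation_type (the adjusted __iter__ in metadata.py below serializes them); the values mirror the generated tests/damast/data/test_ais.spec.yaml:

import polars

from damast.core.datarange import MinMax
from damast.core.metadata import DataSpecification, MetaData

# Hand-built equivalents of two inferred column specifications.
lon = DataSpecification(name="lon",
                        is_optional=False,
                        representation_type=polars.Float64(),
                        value_range=MinMax(5.25503, 5.54976))
date_time_utc = DataSpecification(name="date_time_utc",
                                  is_optional=False,
                                  representation_type=polars.String())
inferred = MetaData(columns=[lon, date_time_utc], annotations=[])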
10 changes: 8 additions & 2 deletions src/damast/core/metadata.py
@@ -364,8 +364,8 @@ def __iter__(self):
if self.representation_type is not None:
if inspect.isclass(self.representation_type):
yield "representation_type", self.representation_type.__name__
elif isinstance(self.representation_type, DataType):
yield "representation_type", self.representation_type.name
elif isinstance(self.representation_type, pl.datatypes.DataType):
yield "representation_type", str(self.representation_type)
else:
raise TypeError(
f"{self.__class__.__name__}.__iter__ failed to identify representation_type from"
@@ -883,6 +883,12 @@ def __init__(
)
self._annotations = {an.name: an for an in annotations}

def add_annotation(self, annotation: Annotation):
if annotation.name in self._annotations:
raise KeyError(f"Annotation {annotation.name} already exists")

self._annotations[annotation.name] = annotation

@property
def annotations(self) -> Dict[str, Annotation]:
"""Get dictionary of annotations"""
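The new add_annotation guard keeps annotation keys unique, so registering the same key twice raises. A small sketch of the expected behaviour:

from damast.core.annotations import Annotation
from damast.core.metadata import MetaData

metadata = MetaData(columns=[])
metadata.add_annotation(Annotation(name=Annotation.Key.Source, value="a.csv"))
try:
    # A second annotation under an already registered key is rejected.
    metadata.add_annotation(Annotation(name=Annotation.Key.Source, value="b.csv"))
except KeyError as error:
    print(error)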
29 changes: 25 additions & 4 deletions tests/damast/cli/test_cli.py
@@ -4,22 +4,26 @@
from pathlib import Path

import pytest
import yaml
from pytest_console_scripts import ScriptRunner

import damast.cli.main as cli_main
from damast.cli.data_annotate import DataAnnotateParser
from damast.cli.data_converter import DataConvertParser
from damast.cli.data_inspect import DataInspectParser
from damast.cli.data_processing import DataProcessingParser
from damast.cli.experiment import ExperimentParser
from damast.core.dataframe import DAMAST_SPEC_SUFFIX


@pytest.fixture
def subparsers():
return [
"annotate",
"convert",
"experiment",
"inspect",
"process",
"convert",
"experiment"
]

def test_help(subparsers, capsys, monkeypatch):
@@ -32,10 +36,11 @@ def test_help(subparsers, capsys, monkeypatch):


@pytest.mark.parametrize("name, klass", [
[ "inspect", DataInspectParser ],
[ "process", DataProcessingParser ],
[ "annotate", DataAnnotateParser ],
[ "convert", DataConvertParser ],
[ "inspect", DataInspectParser ],
[ "experiment", ExperimentParser ],
[ "process", DataProcessingParser ],
])
def test_subparser(name, klass, script_runner):
result = script_runner.run(['damast', name, "--help"])
@@ -63,3 +68,19 @@ def test_inspect(data_path, filename, script_runner):
assert re.search("Loading dataframe \(1 files\)", result.stdout) is not None, "Process dataframe"
assert re.search("shape:", result.stdout) is not None, "Show dataframe"

@pytest.mark.parametrize("filename, spec_filename", [
["test_ais.csv", f"test_ais{DAMAST_SPEC_SUFFIX}"]
])
def test_annotate(data_path, filename, spec_filename, tmp_path, script_runner):
result = script_runner.run(['damast', 'annotate', '-f', str(data_path / filename), '-o', tmp_path])

assert result.returncode == 0

with open(tmp_path / spec_filename, "r") as f:
written_spec = yaml.load(f, Loader=yaml.SafeLoader)

with open(data_path / spec_filename, "r") as f:
expected_spec = yaml.load(f, Loader=yaml.SafeLoader)
expected_spec["annotations"]["source"] = str(data_path / filename)

assert written_spec == expected_spec
4 changes: 3 additions & 1 deletion tests/damast/conftest.py
@@ -1,6 +1,8 @@
import pytest
from pathlib import Path

import pytest


@pytest.fixture
def data_path():
return Path(__file__).parent / "data"
2 changes: 1 addition & 1 deletion tests/damast/core/test_dataframe.py
@@ -234,7 +234,7 @@ def test_force_range():
assert XDataFrame(adf._dataframe).equals(XDataFrame(df_filtered))

def test_convert_csv_to_adf(tmp_path):
output_filename = tmp_path / "test-convert-csv.pq" #hdf5"
output_filename = tmp_path / "test-convert-csv.pq"
test_path = Path(__file__).parent.parent / "data"
AnnotatedDataFrame.convert_csv_to_adf(csv_filenames=[test_path / "test_dataframe.csv"],
metadata_filename=test_path / "test_dataframe.spec.yaml",
86 changes: 86 additions & 0 deletions tests/damast/data/test_ais.spec.yaml
@@ -0,0 +1,86 @@
columns:
- name: mmsi
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 209318000
max: 230666000
allow_missing: true
- name: imo_nr
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 8911736
max: 9754446
allow_missing: true
- name: length
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 89
max: 170
allow_missing: true
- name: date_time_utc
is_optional: false
representation_type: String
- name: lon
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 5.25503
max: 5.54976
allow_missing: true
- name: lat
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 58.7635
max: 59.407
allow_missing: true
- name: sog
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 9.4
max: 18.0
allow_missing: true
- name: cog
is_optional: false
representation_type: Float64
value_range:
MinMax:
min: 2.9
max: 355.7
allow_missing: true
- name: true_heading
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 6
max: 359
allow_missing: true
- name: nav_status
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 0
max: 0
allow_missing: true
- name: message_nr
is_optional: false
representation_type: Int64
value_range:
MinMax:
min: 1
max: 1
allow_missing: true
annotations:
source: tests/damast/data/test_ais.csv
