Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ingest): improve error messages for unknown metadata objects #12745

Merged
merged 1 commit into from
Feb 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions metadata-ingestion/src/datahub/ingestion/source/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,10 +410,13 @@ def _from_obj_for_file(
item = MetadataChangeEvent.from_obj(obj)
elif "aspect" in obj:
item = MetadataChangeProposalWrapper.from_obj(obj)
else:
elif "bucket" in obj:
item = UsageAggregationClass.from_obj(obj)
else:
raise ValueError(f"Unknown object type: {obj}")

if not item.validate():
raise ValueError(f"failed to parse: {obj}")
raise ValueError(f"Failed to parse: {obj}")

if isinstance(item, UsageAggregationClass):
logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
Expand Down
16 changes: 16 additions & 0 deletions metadata-ingestion/tests/unit/serde/test_domain_properties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[
{
"entityType": "domain",
"entityUrn": "urn:li:domain:marketing",
"changeType": "UPSERT",
"aspectName": "domainProperties",
"aspect": {
"json": {
"customProperties": {},
"name": "Marketing",
"description": "Description of the marketing domain",
"parentDomain": "urn:li:domain:gtm"
}
}
}
]
11 changes: 11 additions & 0 deletions metadata-ingestion/tests/unit/serde/test_invalid_object.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"entityUrn": "urn:li:domain:marketing",
"aspectName": "domainProperties",
"domainProperties": {
"name": "Marketing",
"description": "Marketing domain",
"parentDomain": "urn:li:domain:gtm"
}
}
]
49 changes: 46 additions & 3 deletions metadata-ingestion/tests/unit/serde/test_serde.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,17 @@
import datahub.metadata.schema_classes as models
from datahub.cli.json_file import check_mce_file
from datahub.emitter import mce_builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.serialization_helper import post_json_transform, pre_json_transform
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.source.file import FileSourceConfig, GenericFileSource
from datahub.ingestion.source.file import (
FileSourceConfig,
GenericFileSource,
read_metadata_file,
)
from datahub.metadata.schema_classes import MetadataChangeEventClass
from datahub.metadata.schemas import getMetadataChangeEventSchema
from datahub.testing.pytest_hooks import get_golden_settings
from tests.test_helpers import mce_helpers
from tests.test_helpers.click_helpers import run_datahub_cmd

Expand Down Expand Up @@ -112,7 +118,7 @@ def test_serde_to_avro(
[
# Normal test.
"tests/unit/serde/test_serde_large.json",
# Check for backwards compatability with specifying all union types.
# Check for backwards compatibility with specifying all union types.
"tests/unit/serde/test_serde_backwards_compat.json",
# Usage stats.
"tests/unit/serde/test_serde_usage.json",
Expand All @@ -131,6 +137,43 @@ def test_check_metadata_schema(pytestconfig: pytest.Config, json_filename: str)
run_datahub_cmd(["check", "metadata-file", f"{json_file_path}"])


def test_serde_paired(pytestconfig: pytest.Config) -> None:
    """Check a Python metadata object against its paired JSON golden file.

    Exercises both directions: reading the JSON file must yield objects equal
    to the Python ones, and the Python objects must match the golden file
    when passed through the golden-stream check.
    """
    domain_properties = models.DomainPropertiesClass(
        name="Marketing",
        description="Description of the marketing domain",
        parentDomain="urn:li:domain:gtm",
    )
    python_metadata = [
        MetadataChangeProposalWrapper(
            entityUrn="urn:li:domain:marketing",
            aspect=domain_properties,
        )
    ]
    for proposal in python_metadata:
        assert proposal.validate()

    golden_file = pytestconfig.rootpath / "tests/unit/serde/test_domain_properties.json"

    # The deserialization comparison only runs when goldens are not being updated.
    updating_goldens = get_golden_settings().update_golden
    if not updating_goldens:
        parsed_metadata = list(read_metadata_file(golden_file))
        assert python_metadata == parsed_metadata

    # Serialization direction: compare the Python objects with the golden file,
    # keeping the order significant.
    mce_helpers.check_goldens_stream(
        outputs=python_metadata,
        golden_path=golden_file,
        ignore_order=False,
    )


def test_unknown_object_deser_error(pytestconfig: pytest.Config) -> None:
    """Reading the invalid-object fixture must raise an "Unknown object type" error."""
    invalid_file = pytestconfig.rootpath / "tests/unit/serde/test_invalid_object.json"

    with pytest.raises(ValueError, match="Unknown object type"):
        # Wrap in list() so the reader's results are fully consumed.
        list(read_metadata_file(invalid_file))


def test_check_metadata_rewrite(
pytestconfig: pytest.Config, tmp_path: pathlib.Path
) -> None:
Expand Down Expand Up @@ -356,7 +399,7 @@ def test_json_transforms(model, ref_server_obj):
assert recovered == model


def test_unions_with_aliases_assumptions():
def test_unions_with_aliases_assumptions() -> None:
# We have special handling for unions with aliases in our json serialization helpers.
# Specifically, we assume that cost is the only instance of a union with alias.
# This test validates that assumption.
Expand Down
Loading