Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce opossum files to CLI #173

Merged
merged 11 commits into from
Jan 14, 2025
Prev Previous commit
Next Next commit
feat: read file and validate opossum file structure -- first version
* Currently only covers the parts for which our Opossum model
already has a structure
* This is not yet fully compliant with the full opossum model
--> will be enhanced
Hellgartner committed Jan 14, 2025
commit a0d714be9d5864515c05382f4064f5780e8b9190
5 changes: 3 additions & 2 deletions src/opossum_lib/opossum/merger.py
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@
Resource,
ResourcePath,
ResourceType,
convert_resource_in_file_to_resource,
)


@@ -23,10 +24,10 @@ def merge_opossum_information(
metadata=expanded_opossum_information[0].metadata,
resources=_merge_resources(
[
opossum_information.resources
convert_resource_in_file_to_resource(opossum_information.resources)
for opossum_information in expanded_opossum_information
]
),
).convert_to_file_resource(),
externalAttributions=_merge_dicts_without_duplicates(
[
opossum_information.externalAttributions
37 changes: 33 additions & 4 deletions src/opossum_lib/opossum/opossum_file.py
Original file line number Diff line number Diff line change
@@ -6,18 +6,19 @@
from copy import deepcopy
from dataclasses import field
from enum import Enum, auto
from typing import Literal
from typing import Literal, cast

from pydantic.dataclasses import dataclass

OpossumPackageIdentifier = str
ResourcePath = str
type ResourceInFile = dict[str, ResourceInFile] | int
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in which file? perhaps better: "OpossumFileResource"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about:

  • The "Resource" class anyhow does not describe the file model --> Remove it from the file
  • Then just rename it to Resource to match the model (and maybe rename the old Resource so the two are easier to distinguish)

Copy link
Contributor

@abraemer abraemer Jan 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A good time to change this is probably when tackling #178
I think we could move the current Resource type fully to the spdx section because:

  • the opossum code now uses the (to be renamed) ResourceInFile
  • the scancode code uses its own tree data structure and used to convert to Resource but this is historic and could be changed very easily (in fact I'll just do it now)

We then could rename ResourceInFile just to Resources or OpossumResources to match the top-level key and perhaps make it a full pydantic.Model with some convenience functions for construction. That has the advantage that there is a single point for the logic of how these resources are structured and changing it (e.g. because #38) would be easy. The small downside is that we need to hook into the serialization of pydantic which shouldn't be hard.



@dataclass(frozen=True)
class OpossumInformation:
metadata: Metadata
resources: Resource
resources: ResourceInFile
externalAttributions: dict[OpossumPackageIdentifier, OpossumPackage]
resourcesToAttributions: dict[ResourcePath, list[OpossumPackageIdentifier]]
attributionBreakpoints: list[str] = field(default_factory=list)
@@ -29,7 +30,8 @@ class OpossumInformation:
@dataclass(frozen=True)
class SourceInfo:
name: str
documentConfidence: int | None = 0
documentConfidence: int | float | None = 0
additionalName: str | None = None
Comment on lines 52 to +54
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not for the first time, I'm surprised to see camel case here. In Python it's usually standard to use snake case. Is there some special need for camel case here?
If it's because of the serialization to JSON, pydantic has a configuration option to convert camel case to snake case during serialization, if I remember correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.



@dataclass(frozen=True)
@@ -51,7 +53,9 @@ class OpossumPackage:
preSelected: bool | None = None
followUp: Literal["FOLLOW_UP"] | None = None
originId: str | None = None
originIds: list[str] | None = None
criticality: Literal["high"] | Literal["medium"] | None = None
wasPreferred: bool | None = None


@dataclass(frozen=True)
@@ -123,7 +127,7 @@ def drop_element(

return resource

def to_dict(self) -> int | dict:
def to_dict(self) -> ResourceInFile:
if not self.has_children():
if self.type == ResourceType.FOLDER:
return {}
@@ -154,8 +158,33 @@ def get_paths_of_all_leaf_nodes_with_types(
def has_children(self) -> bool:
    """Return ``True`` when this resource has at least one child entry."""
    # An empty ``children`` container is falsy, a non-empty one truthy.
    return bool(self.children)

def convert_to_file_resource(self) -> ResourceInFile:
    """Return this resource as a ``ResourceInFile`` by delegating to ``to_dict``."""
    file_representation = self.to_dict()
    return file_representation


@dataclass(frozen=True)
class ExternalAttributionSource:
    """Frozen dataclass pairing a source name with an integer priority."""

    # Name of the source.
    name: str
    # Integer priority of the source.
    # NOTE(review): ordering semantics are not visible here -- confirm.
    priority: int


def build_resource_tree(resource: ResourceInFile) -> Resource:
    """Recursively turn a file-format resource entry into a ``Resource`` node.

    An integer entry becomes a ``FILE`` node. Any other entry is treated as a
    mapping of child names to nested entries and becomes a ``FOLDER`` node
    whose children are built by recursing into each nested entry.
    """
    if isinstance(resource, int):
        return Resource(type=ResourceType.FILE)

    folder = Resource(type=ResourceType.FOLDER)
    folder.children.update(
        (child_name, build_resource_tree(child_entry))
        for child_name, child_entry in resource.items()
    )
    return folder


def convert_resource_in_file_to_resource(resource: ResourceInFile) -> Resource:
    """Wrap a file-format resource tree in a ``TOP_LEVEL`` ``Resource`` node.

    For a dict input, every top-level entry is converted with
    ``build_resource_tree`` and attached to the root under its name.
    A non-dict input yields a root without children.
    """
    top_level = Resource(ResourceType.TOP_LEVEL)
    if not isinstance(resource, dict):
        return top_level

    # The cast only informs the type checker; it has no runtime effect.
    entries = cast(dict[str, ResourceInFile], resource)
    for entry_name, entry in entries.items():
        top_level.children[entry_name] = build_resource_tree(entry)
    return top_level
53 changes: 22 additions & 31 deletions src/opossum_lib/opossum/read_opossum_file.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
#
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import sys
from datetime import datetime
from zipfile import ZipFile

from pydantic import TypeAdapter

from opossum_lib.opossum.opossum_file import (
Metadata,
OpossumInformation,
Resource,
ResourceType,
)


@@ -21,34 +20,26 @@ def read_opossum_file(filename: str) -> OpossumInformation:
with (
ZipFile(filename, "r") as input_zip_file,
):
if "input.json" not in input_zip_file.namelist():
logging.error(
f"Opossum file {filename} is corrupt"
f" and does not contain 'input.json'"
)
sys.exit(1)
if "output.json" in input_zip_file.namelist():
logging.error(
f"Opossum file {filename} also contains"
f" 'output.json' which cannot be processed"
)
sys.exit(1)


validate_zip_file_contents(input_zip_file)
with input_zip_file.open("input.json") as input_json_file:
input_json = json.load(input_json_file)
return TypeAdapter(OpossumInformation).validate_python(input_json)
except Exception as e:
# handle the exception
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what value does this comment add?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None ... probably a leftover from pressing autocomplete and not cleaning up properly

print(f"Error reading file {filename}: {e}")
sys.exit(1)


dummy_metadata = Metadata(
projectId="test id",
fileCreationDate=datetime.now().isoformat(),
projectTitle="test title",
)
return OpossumInformation(
metadata=dummy_metadata,
resources=Resource(type=ResourceType.FILE, children={}),
externalAttributions={},
resourcesToAttributions={},
attributionBreakpoints=[],
externalAttributionSources={},
)
def validate_zip_file_contents(input_zip_file: ZipFile) -> None:
    """Exit with status 1 unless the archive has the expected members.

    The archive must contain ``input.json`` and must not contain
    ``output.json``. Each violation is logged as an error before the
    process exits via ``sys.exit(1)``.
    """
    archive_name = input_zip_file.filename
    member_names = input_zip_file.namelist()

    if "input.json" not in member_names:
        missing_input_message = (
            f"Opossum file {archive_name} is corrupt"
            f" and does not contain 'input.json'"
        )
        logging.error(missing_input_message)
        sys.exit(1)

    if "output.json" in member_names:
        unexpected_output_message = (
            f"Opossum file {archive_name} also contains"
            f" 'output.json' which cannot be processed"
        )
        logging.error(unexpected_output_message)
        sys.exit(1)
2 changes: 1 addition & 1 deletion src/opossum_lib/spdx/convert_to_opossum.py
Original file line number Diff line number Diff line change
@@ -117,7 +117,7 @@ def convert_tree_to_opossum_information(tree: DiGraph) -> OpossumInformation:

opossum_information = OpossumInformation(
metadata=metadata,
resources=resources,
resources=resources.convert_to_file_resource(),
externalAttributions=external_attributions,
resourcesToAttributions=resources_to_attributions,
attributionBreakpoints=attribution_breakpoints,
372 changes: 372 additions & 0 deletions tests/data/opossum_input.json

Large diffs are not rendered by default.

32 changes: 31 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -60,17 +60,47 @@ def test_successful_conversion_of_spdx_file(tmp_path: Path, options: str) -> Non
def test_successful_conversion_of_opossum_file(tmp_path: Path) -> None:
runner = CliRunner()

output_file = str(tmp_path / "output_opossum.opossum")
result = runner.invoke(
generate,
[
"--opossum",
str(Path(__file__).resolve().parent / "data" / "opossum_input.opossum"),
"-o",
str(tmp_path / "output_opossum"),
output_file,
],
)

with open(Path(__file__).resolve().parent / "data" / "opossum_input.json") as file:
expected_opossum_dict = json.load(file)

assert result.exit_code == 0

with (
ZipFile(output_file, "r") as z,
z.open("input.json") as file,
):
opossum_dict = json.load(file)

## goal
# metadata
# resources
# externalAttributions
# resourcesToAttributions
# frequentLicenses
# attributionBreakpoints
# filesWithChildren
# baseUrlsForSources
# externalAttributionSources

print(expected_opossum_dict["externalAttributions"].keys())

assert result.exit_code == 0
assert expected_opossum_dict["resources"] == opossum_dict["resources"]
assert (
expected_opossum_dict["externalAttributions"]
== opossum_dict["externalAttributions"]
)


def test_cli_no_output_file_provided() -> None:
53 changes: 41 additions & 12 deletions tests/test_opossum/test_file_generation.py
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
from typing import cast
from unittest import TestCase

from spdx_tools.spdx.model import Document
@@ -11,6 +12,7 @@
from opossum_lib.opossum.opossum_file import (
ExternalAttributionSource,
OpossumInformation,
ResourceInFile,
)
from opossum_lib.spdx.constants import (
SPDX_FILE_IDENTIFIER,
@@ -39,7 +41,7 @@ def test_different_paths_graph() -> None:
document = _create_minimal_document()
opossum_information = _get_opossum_information_from_document(document)

file_tree = opossum_information.resources.to_dict()
file_tree = opossum_information.resources
assert file_tree == expected_file_tree
TestCase().assertCountEqual(
opossum_information.attributionBreakpoints,
@@ -103,7 +105,7 @@ def test_unconnected_paths_graph() -> None:
]
opossum_information = _get_opossum_information_from_document(document)

file_tree = opossum_information.resources.to_dict()
file_tree = opossum_information.resources
assert file_tree == expected_file_tree
TestCase().assertCountEqual(
opossum_information.attributionBreakpoints,
@@ -139,6 +141,18 @@ def test_unconnected_paths_graph() -> None:
)


def get_value_at_file_tree_path(
    file_tree: ResourceInFile, path_elements: list[str]
) -> ResourceInFile:
    """Return the entry reached by following ``path_elements`` into ``file_tree``.

    Each path element is used as a key into the current level, which must be
    a dict (asserted). An empty path returns ``file_tree`` unchanged.
    """
    current = file_tree
    for element in path_elements:
        assert isinstance(current, dict)
        # The cast only informs the type checker; it has no runtime effect.
        current = cast(dict[str, ResourceInFile], current)[element]
    return current


def test_different_roots_graph() -> None:
"""Creating a tree from a connected graph where some edges are not reachable
from the SPDX Lite Document node. This means that the connected graph has multiple
@@ -152,7 +166,7 @@ def test_different_roots_graph() -> None:
document = _generate_document_with_from_root_node_unreachable_file()
opossum_information = _get_opossum_information_from_document(document)

file_tree = opossum_information.resources.to_dict()
file_tree = opossum_information.resources
assert file_tree == expected_file_tree
TestCase().assertCountEqual(
opossum_information.attributionBreakpoints,
@@ -188,7 +202,7 @@ def test_tree_generation_for_bigger_examples_json() -> None:
opossum_information = _get_opossum_information_from_file(
"SPDXJSONExample-v2.3.spdx.json"
)
file_tree = opossum_information.resources.to_dict()
file_tree = opossum_information.resources

expected_breakpoints = [
"/SPDX-Tools-v2.0/CONTAINS/glibc/CONTAINS/"
@@ -202,20 +216,27 @@ def test_tree_generation_for_bigger_examples_json() -> None:
for attribution_breakpoint in expected_breakpoints:
assert attribution_breakpoint in opossum_information.attributionBreakpoints
assert (
file_tree["SPDX-Tools-v2.0"]["COPY_OF"][
"DocumentRef-spdx-tool-1.2:SPDXRef-ToolsElement"
]
get_value_at_file_tree_path(
file_tree,
[
"SPDX-Tools-v2.0",
"COPY_OF",
"DocumentRef-spdx-tool-1.2:SPDXRef-ToolsElement",
],
)
== 1
)

assert (
file_tree["SPDX-Tools-v2.0"]["CONTAINS"]["glibc"]["DYNAMIC_LINK"]["Saxon"] == {}
get_value_at_file_tree_path(
file_tree, ["SPDX-Tools-v2.0", "CONTAINS", "glibc", "DYNAMIC_LINK", "Saxon"]
)
== {}
)


def test_tree_generation_for_bigger_examples_spdx() -> None:
opossum_information = _get_opossum_information_from_file("SPDX.spdx")
file_tree = opossum_information.resources.to_dict()
file_tree = opossum_information.resources
expected_breakpoints = [
"/SPDX Lite Document/DESCRIBES/Package A/CONTAINS/",
"/SPDX Lite Document/DESCRIBES/Package A/COPY_OF/Package C/CONTAINS/",
@@ -227,10 +248,18 @@ def test_tree_generation_for_bigger_examples_spdx() -> None:
for attribution_breakpoint in expected_breakpoints:
assert attribution_breakpoint in opossum_information.attributionBreakpoints

assert file_tree["SPDX Lite Document"]["DESCRIBES"]["Package B"] == {}
assert (
get_value_at_file_tree_path(
file_tree, ["SPDX Lite Document", "DESCRIBES", "Package B"]
)
== {}
)

assert (
file_tree["SPDX Lite Document"]["DESCRIBES"]["Package A"]["CONTAINS"]["File-C"]
get_value_at_file_tree_path(
file_tree,
["SPDX Lite Document", "DESCRIBES", "Package A", "CONTAINS", "File-C"],
)
== 1
)

50 changes: 11 additions & 39 deletions tests/test_opossum/test_merge.py
Original file line number Diff line number Diff line change
@@ -24,38 +24,18 @@
)


@mock.patch("opossum_lib.opossum.opossum_file.OpossumPackage", autospec=True)
def test_merge_opossum_information(opossum_package: OpossumPackage) -> None:
def test_merge_opossum_information() -> None:
opossum_package = OpossumPackage(source=SourceInfo("source"))
opossum_information = OpossumInformation(
Metadata("project-id", "30-05-2023", "test data"),
Resource(
ResourceType.TOP_LEVEL,
{
"A": Resource(
ResourceType.FOLDER, {"B": Resource(ResourceType.FOLDER, {})}
)
},
),
{"A": {"B": {}}},
{"SPDXRef-Package": opossum_package},
{"/A/B/": ["SPDXRef-Package"]},
)

opossum_information_2 = OpossumInformation(
Metadata("test-data-id", "29-05-2023", "second test data"),
Resource(
ResourceType.TOP_LEVEL,
{
"A": Resource(
ResourceType.FOLDER,
{
"D": Resource(
ResourceType.FOLDER,
{"C": Resource(ResourceType.FILE, {})},
)
},
)
},
),
{"A": {"D": {"C": 1}}},
{"SPDXRef-File": opossum_package},
{"/A/D/C": ["SPDXRef-File"]},
)
@@ -65,20 +45,12 @@ def test_merge_opossum_information(opossum_package: OpossumPackage) -> None:
)

assert merged_information.metadata == opossum_information.metadata
assert merged_information.resources == Resource(
ResourceType.TOP_LEVEL,
{
"A": Resource(
ResourceType.FOLDER,
{
"B": Resource(ResourceType.FOLDER, {}),
"D": Resource(
ResourceType.FOLDER, {"C": Resource(ResourceType.FILE, {})}
),
},
)
},
)
assert merged_information.resources == {
"A": {
"B": {},
"D": {"C": 1},
}
}
assert merged_information.externalAttributions == {
"project-id-SPDXRef-Package": opossum_package,
"test-data-id-SPDXRef-File": opossum_package,
@@ -208,7 +180,7 @@ def test_expand_opossum_package_identifier() -> None:
opossum_information_expanded = expand_opossum_package_identifier(
OpossumInformation(
Metadata("project-id", "2022-03-02", "project title"),
resources=Resource(ResourceType.FILE, {}),
resources={},
externalAttributions={"SPDXRef-Package": opossum_package},
resourcesToAttributions={"/path/to/resource": ["SPDXRef-Package"]},
attributionBreakpoints=[],
8 changes: 3 additions & 5 deletions tests/test_opossum/test_read_opossum_file.py
Original file line number Diff line number Diff line change
@@ -8,6 +8,8 @@

from opossum_lib.opossum.read_opossum_file import read_opossum_file

TEST_DATA_PATH = Path(__file__).resolve().parent.parent / "data"


def test_read_opossum_file_corrupted_file_exits_1(caplog: LogCaptureFixture) -> None:
input_path = (
@@ -23,11 +25,7 @@ def test_read_opossum_file_corrupted_file_exits_1(caplog: LogCaptureFixture) ->


def test_read_opossum_file_with_result_json_exits_1(caplog: LogCaptureFixture) -> None:
input_path = (
Path(__file__).resolve().parent.parent
/ "data"
/ "opossum_input_with_result.opossum"
)
input_path = TEST_DATA_PATH / "opossum_input_with_result.opossum"

with pytest.raises(SystemExit) as system_exit:
read_opossum_file(str(input_path))