Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce opossum files to CLI #173

Merged
merged 11 commits into from
Jan 14, 2025
42 changes: 32 additions & 10 deletions src/opossum_lib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import click

from opossum_lib.opossum.file_generation import write_opossum_information_to_file
from opossum_lib.opossum.opossum_file import OpossumInformation
from opossum_lib.opossum.read_opossum_file import read_opossum_file
from opossum_lib.spdx.convert_to_opossum import convert_spdx_to_opossum_information


Expand All @@ -27,6 +29,12 @@ def opossum_file() -> None:
multiple=True,
type=click.Path(exists=True),
)
@click.option(
"--opossum",
help="opossum files used as input.",
Copy link
Member

@mstykow mstykow Jan 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since you're using plural, it sounds like this is a list of files but in reality you mean that each flag of --opossum has exactly one argument. so to improve clarity, i suggest you say something like this: "Specify a path to a .opossum file that you would like to include in the final output. Option can be repeated."

also note, that i'm including "path" in the help message to make it clear that this must be a valid file path.

multiple=True,
type=click.Path(exists=True),
)
@click.option(
"--outfile",
"-o",
Expand All @@ -35,23 +43,16 @@ def opossum_file() -> None:
help="The file path to write the generated opossum document to. "
'If appropriate, the extension ".opossum" will be appended.',
)
def generate(spdx: list[str], outfile: str) -> None:
def generate(spdx: list[str], opossum: list[str], outfile: str) -> None:
"""
Generate an Opossum file from various other file formats.

\b
Currently supported input formats:
- SPDX
"""
if len(spdx) == 0:
logging.warning("No input provided. Exiting.")
sys.exit(1)
if len(spdx) > 1:
logging.error("Merging of multiple SPDX files not yet supported!")
sys.exit(1)

the_spdx_file = spdx[0]
opossum_information = convert_spdx_to_opossum_information(the_spdx_file)
validate_input_exit_on_error(spdx, opossum)
opossum_information = convert_after_valid_input(spdx, opossum)

if not outfile.endswith(".opossum"):
outfile += ".opossum"
Expand All @@ -62,5 +63,26 @@ def generate(spdx: list[str], outfile: str) -> None:
write_opossum_information_to_file(opossum_information, Path(outfile))


def validate_input_exit_on_error(spdx: list[str], opossum: list[str]) -> None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe clearer: "validate_input_and_exit_on_error". at first i read this as "validate input-exit on error".

total_number_of_files = len(spdx) + len(opossum)
if total_number_of_files == 0:
logging.warning("No input provided. Exiting.")
sys.exit(1)
if total_number_of_files > 1:
logging.error("Merging of multiple files not yet supported!")
sys.exit(1)


def convert_after_valid_input(
spdx: list[str], opossum_files: list[str]
) -> OpossumInformation:
if len(spdx) == 1:
the_spdx_file = spdx[0]
return convert_spdx_to_opossum_information(the_spdx_file)
else:
opossum_input_file = opossum_files[0]
return read_opossum_file(opossum_input_file)


if __name__ == "__main__":
opossum_file()
2 changes: 2 additions & 0 deletions src/opossum_lib/opossum/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
#
# SPDX-License-Identifier: Apache-2.0
COMPRESSION_LEVEL = 5
INPUT_JSON_NAME = "input.json"
OUTPUT_JSON_NAME = "output.json"
54 changes: 9 additions & 45 deletions src/opossum_lib/opossum/file_generation.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,14 @@
# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
#
# SPDX-License-Identifier: Apache-2.0
import json
from dataclasses import fields
from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile

from opossum_lib.opossum.constants import COMPRESSION_LEVEL
from pydantic import TypeAdapter

from opossum_lib.opossum.constants import COMPRESSION_LEVEL, INPUT_JSON_NAME
from opossum_lib.opossum.opossum_file import (
ExternalAttributionSource,
Metadata,
OpossumInformation,
OpossumPackage,
Resource,
SourceInfo,
)


Expand All @@ -23,40 +18,9 @@ def write_opossum_information_to_file(
with ZipFile(
file_path, "w", compression=ZIP_DEFLATED, compresslevel=COMPRESSION_LEVEL
) as z:
z.writestr("input.json", json.dumps(to_dict(opossum_information), indent=4))


def to_dict(
element: Resource
| Metadata
| OpossumPackage
| OpossumInformation
| SourceInfo
| ExternalAttributionSource
| str
| int
| bool
| dict[str, OpossumPackage]
| dict[str, list[str]]
| list[str]
| None,
) -> dict | str | list[str] | bool | int | None:
if isinstance(element, Resource):
return element.to_dict()
if isinstance(
element,
Metadata
| OpossumPackage
| OpossumInformation
| SourceInfo
| ExternalAttributionSource,
):
result = []
for f in fields(element):
value = to_dict(getattr(element, f.name))
result.append((f.name, value))
return {k: v for (k, v) in result if v is not None}
elif isinstance(element, dict):
return {k: to_dict(v) for k, v in element.items()}
else:
return element
z.writestr(
INPUT_JSON_NAME,
TypeAdapter(OpossumInformation).dump_json(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice improvement 👍

opossum_information, indent=4, exclude_none=True
),
)
5 changes: 3 additions & 2 deletions src/opossum_lib/opossum/merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Resource,
ResourcePath,
ResourceType,
convert_resource_in_file_to_resource,
)


Expand All @@ -23,10 +24,10 @@ def merge_opossum_information(
metadata=expanded_opossum_information[0].metadata,
resources=_merge_resources(
[
opossum_information.resources
convert_resource_in_file_to_resource(opossum_information.resources)
for opossum_information in expanded_opossum_information
]
),
).convert_to_file_resource(),
externalAttributions=_merge_dicts_without_duplicates(
[
opossum_information.externalAttributions
Expand Down
65 changes: 57 additions & 8 deletions src/opossum_lib/opossum/opossum_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,52 @@
from copy import deepcopy
from dataclasses import field
from enum import Enum, auto
from typing import Literal
from typing import Literal, cast

from pydantic import BaseModel, ConfigDict, model_serializer
from pydantic.dataclasses import dataclass

OpossumPackageIdentifier = str
ResourcePath = str
type OpossumPackageIdentifier = str
type ResourcePath = str
type ResourceInFile = dict[str, ResourceInFile] | int
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in which file? perhaps better: "OpossumFileResource"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about:

  • The "Resource" class anyhow does not describe the file model --> Remove it from the file
  • Then just rename it to Resource to match the model (and maybe rename to old Resource for easier distinguishing)

Copy link
Contributor

@abraemer abraemer Jan 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A good time to change this is probably when tackling #178
I think we could move the current Resource type fully to the spdx section because:

  • the opossum code now uses the (to be renamed) ResourceInFile
  • the scancode code uses its own tree data structure and used to convert to Resource but this is historic and could be changed very easily (in fact I'll just do it now)

We then could rename ResourceInFile just to Resources or OpossumResources to match the top-level key and perhaps make it a full pydantic.Model with some convenience functions for construction. That has the advantage that there is a single point for the logic of how these resources are structured and changing it (e.g. because #38) would be easy. The small downside is that we need to hook into the serialization of pydantic which shouldn't be hard.



@dataclass(frozen=True)
class OpossumInformation:
metadata: Metadata
resources: Resource
resources: ResourceInFile
externalAttributions: dict[OpossumPackageIdentifier, OpossumPackage]
resourcesToAttributions: dict[ResourcePath, list[OpossumPackageIdentifier]]
attributionBreakpoints: list[str] = field(default_factory=list)
externalAttributionSources: dict[str, ExternalAttributionSource] = field(
default_factory=dict
)
frequentLicenses: list[FrequentLicense] | None = None
filesWithChildren: list[str] | None = None
baseUrlsForSources: BaseUrlsForSources | None = None


class BaseUrlsForSources(BaseModel):
@model_serializer
def serialize(self) -> dict:
# hack to override not serializing keys with corresponding value none:
# In this case this is valid and should be part of the serialization
Comment on lines +37 to +38
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

above, you're using exclude_none=True so is it really a surprise that none values are not serialized?

return {k: v for k, v in self}

model_config = ConfigDict(extra="allow", frozen=True)


class FrequentLicense(BaseModel):
fullName: str
shortName: str
defaultText: str


@dataclass(frozen=True)
class SourceInfo:
name: str
documentConfidence: int | None = 0
documentConfidence: int | float | None = 0
additionalName: str | None = None
Comment on lines 52 to +54
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not the first time, but i'm surprised to see camel case here. in python it's usually standard to use snake case. is there some special need for camel here?
if it's because of the serializing to JSON, pydantic has a configuration option to convert camel to snake during serialization if i remember correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.



@dataclass(frozen=True)
Expand All @@ -51,11 +73,13 @@ class OpossumPackage:
preSelected: bool | None = None
followUp: Literal["FOLLOW_UP"] | None = None
originId: str | None = None
originIds: list[str] | None = None
criticality: Literal["high"] | Literal["medium"] | None = None
wasPreferred: bool | None = None


@dataclass(frozen=True)
class Metadata:
class Metadata(BaseModel):
model_config = ConfigDict(extra="allow", frozen=True)
projectId: str
fileCreationDate: str
projectTitle: str
Expand Down Expand Up @@ -123,7 +147,7 @@ def drop_element(

return resource

def to_dict(self) -> int | dict:
def to_dict(self) -> ResourceInFile:
if not self.has_children():
if self.type == ResourceType.FOLDER:
return {}
Expand Down Expand Up @@ -154,8 +178,33 @@ def get_paths_of_all_leaf_nodes_with_types(
def has_children(self) -> bool:
return len(self.children) > 0

def convert_to_file_resource(self) -> ResourceInFile:
return self.to_dict()


@dataclass(frozen=True)
class ExternalAttributionSource:
name: str
priority: int
isRelevantForPreferred: bool | None = None


def _build_resource_tree(resource: ResourceInFile) -> Resource:
if isinstance(resource, int):
return Resource(type=ResourceType.FILE)
else:
result = Resource(type=ResourceType.FOLDER)
for name, child_resource in resource.items():
result.children[name] = _build_resource_tree(child_resource)
return result


def convert_resource_in_file_to_resource(resource: ResourceInFile) -> Resource:
root_node = Resource(ResourceType.TOP_LEVEL)

if isinstance(resource, dict):
dict_resource = cast(dict[str, ResourceInFile], resource)
for name, child_resource in dict_resource.items():
root_node.children[name] = _build_resource_tree(child_resource)

return root_node
46 changes: 46 additions & 0 deletions src/opossum_lib/opossum/read_opossum_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: TNG Technology Consulting GmbH <https://www.tngtech.com>
#
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import sys
from zipfile import ZipFile

from pydantic import TypeAdapter

from opossum_lib.opossum.constants import INPUT_JSON_NAME, OUTPUT_JSON_NAME
from opossum_lib.opossum.opossum_file import (
OpossumInformation,
)


def read_opossum_file(filename: str) -> OpossumInformation:
logging.info(f"Converting opossum to opossum {filename}")

try:
with (
ZipFile(filename, "r") as input_zip_file,
):
validate_zip_file_contents(input_zip_file)
with input_zip_file.open(INPUT_JSON_NAME) as input_json_file:
input_json = json.load(input_json_file)
return TypeAdapter(OpossumInformation).validate_python(input_json)
except Exception as e:
# handle the exception
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what value does this comment add?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None ... probably a reminder from pressing autocomplete and not taking care appropriately

print(f"Error reading file {filename}: {e}")
sys.exit(1)


def validate_zip_file_contents(input_zip_file: ZipFile) -> None:
if INPUT_JSON_NAME not in input_zip_file.namelist():
logging.error(
f"Opossum file {input_zip_file.filename} is corrupt"
f" and does not contain '{INPUT_JSON_NAME}'"
)
sys.exit(1)
if OUTPUT_JSON_NAME in input_zip_file.namelist():
logging.error(
f"Opossum file {input_zip_file.filename} also contains"
f" '{OUTPUT_JSON_NAME}' which cannot be processed"
)
sys.exit(1)
Comment on lines +41 to +46
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we care? can we just ignore the output file?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still would argue that this would surprise the user.
There is a follow up issue to add a force modifier which implements that behavior
#177

9 changes: 7 additions & 2 deletions src/opossum_lib/spdx/convert_to_opossum.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@


def convert_spdx_to_opossum_information(filename: str) -> OpossumInformation:
logging.info(f"Converting {filename} to opossum information.")
try:
document: SpdxDocument = parse_file(filename)

Expand Down Expand Up @@ -116,7 +117,7 @@ def convert_tree_to_opossum_information(tree: DiGraph) -> OpossumInformation:

opossum_information = OpossumInformation(
metadata=metadata,
resources=resources,
resources=resources.convert_to_file_resource(),
externalAttributions=external_attributions,
resourcesToAttributions=resources_to_attributions,
attributionBreakpoints=attribution_breakpoints,
Expand Down Expand Up @@ -170,5 +171,9 @@ def create_attribution_and_link_with_resource(
def create_metadata(tree: DiGraph) -> Metadata:
doc_name = tree.nodes["SPDXRef-DOCUMENT"]["element"].name
created = tree.nodes["SPDXRef-DOCUMENT"]["element"].created
metadata = Metadata(str(uuid.uuid4()), created.isoformat(), doc_name)
metadata = Metadata(
projectId=str(uuid.uuid4()),
fileCreationDate=created.isoformat(),
projectTitle=doc_name,
)
return metadata
Loading
Loading