From ffb704bdf905bb55542521e2d3ded17edc55c7dd Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Mon, 20 Jan 2025 08:23:15 +0100 Subject: [PATCH 01/11] refactor: introduce new internal representation for Opossum files * This model also encapsulates the semantic relationships of resources, resourcesToAttributions and externalAttributions. These are not enforced by the file structure alone. * This will be used as a target for the other file format frontends and simplify their logic. * It also allows for easier testing since it makes it possible to check for semantic/structural equivalence among opossum files (e.g. the IDs of the attributions carry no semantic information themselves, i.e. they are arbitrary labels) --- src/opossum_lib/opossum_model.py | 222 +++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 src/opossum_lib/opossum_model.py diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py new file mode 100644 index 00000000..021c3de7 --- /dev/null +++ b/src/opossum_lib/opossum_model.py @@ -0,0 +1,222 @@ +# SPDX-FileCopyrightText: TNG Technology Consulting GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from dataclasses import field +from enum import Enum, auto +from os.path import relpath +from typing import Literal + +from pydantic import BaseModel, ConfigDict + +import opossum_lib.opossum.opossum_file as opossum_file + +type OpossumPackageIdentifier = str +type ResourcePath = str + + +class Opossum(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + metadata: Metadata + resources: list[Resource] + attribution_breakpoints: list[str] = field(default_factory=list) + external_attribution_sources: dict[str, ExternalAttributionSource] = field( + default_factory=dict + ) + frequent_licenses: list[FrequentLicense] | None = None + files_with_children: list[str] | None = None + base_urls_for_sources: BaseUrlsForSources | None = None + + def 
to_opossum_file_format(self) -> opossum_file.OpossumInformation: + external_attributions, resources_to_attributions = ( + self.create_attribution_mapping(self.resources) + ) + frequent_licenses = ( + None + if self.frequent_licenses is None + else [ + license.to_opossum_file_format() for license in self.frequent_licenses + ] + ) + base_urls_for_sources = ( + self.base_urls_for_sources + and self.base_urls_for_sources.to_opossum_file_format() + ) + return opossum_file.OpossumInformation( + metadata=self.metadata.to_opossum_file_format(), + resources={ + resource.path: resource.to_opossum_file_format() + for resource in self.resources + }, + external_attributions=external_attributions, + resources_to_attributions=resources_to_attributions, + attribution_breakpoints=self.attribution_breakpoints, + external_attribution_sources=self.external_attribution_sources, + frequent_licenses=frequent_licenses, + files_with_children=self.files_with_children, + base_urls_for_sources=base_urls_for_sources, + ) + + def create_attribution_mapping( + self, + root_nodes: list[Resource], + ) -> tuple[ + dict[opossum_file.OpossumPackageIdentifier, opossum_file.OpossumPackage], + dict[opossum_file.ResourcePath, list[opossum_file.OpossumPackageIdentifier]], + ]: + external_attributions: dict[ + opossum_file.OpossumPackageIdentifier, opossum_file.OpossumPackage + ] = {} + resources_to_attributions: dict[ + opossum_file.ResourcePath, list[opossum_file.OpossumPackageIdentifier] + ] = {} + + def process_node(node: Resource) -> None: + # the / is required by OpossumUI + path = "/" + node.path + attributions = node.attributions + + new_attributions_with_id = { + self.get_attribution_key(a): a.to_opossum_file_format() + for a in attributions + } + external_attributions.update(new_attributions_with_id) + + if len(new_attributions_with_id) > 0: + resources_to_attributions[path] = list(new_attributions_with_id.keys()) + + for child in node.children: + process_node(child) + + for child in root_nodes: 
+ process_node(child) + + return external_attributions, resources_to_attributions + + def get_attribution_key( + self, attribution: OpossumPackage + ) -> OpossumPackageIdentifier: + return f"{attribution.license_name}-{hash(attribution)}" + + +class Resource(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + path: str + id: str + type: ResourceType + attributions: list[OpossumPackage] + children: list[Resource] + + def to_opossum_file_format(self) -> opossum_file.ResourceInFile: + if self.type == ResourceType.FILE: + return 1 + else: + return { + relpath(child.path, self.path): child.to_opossum_file_format() + for child in self.children + } + + +class BaseUrlsForSources(BaseModel): + model_config = ConfigDict(frozen=True, extra="allow") + + def to_opossum_file_format(self) -> opossum_file.BaseUrlsForSources: + return opossum_file.BaseUrlsForSources(**self.model_dump()) + + +class FrequentLicense(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + full_name: str + short_name: str + default_text: str + + def to_opossum_file_format(self) -> opossum_file.FrequentLicense: + return opossum_file.FrequentLicense(**self.model_dump()) + + +class SourceInfo(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + name: str + document_confidence: int | float | None = 0 + additional_name: str | None = None + + def to_opossum_file_format(self) -> opossum_file.SourceInfo: + return opossum_file.SourceInfo(**self.model_dump()) + + +class OpossumPackage(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + source: SourceInfo + attribution_confidence: int | None = None + comment: str | None = None + package_name: str | None = None + package_version: str | None = None + package_namespace: str | None = None + package_type: str | None = None + package_p_u_r_l_appendix: str | None = None + copyright: str | None = None + license_name: str | None = None + license_text: str | None = None + url: str | None = None + 
first_party: bool | None = None + exclude_from_notice: bool | None = None + pre_selected: bool | None = None + follow_up: Literal["FOLLOW_UP"] | None = None + origin_id: str | None = None + origin_ids: list[str] | None = None + criticality: Literal["high"] | Literal["medium"] | None = None + was_preferred: bool | None = None + + def to_opossum_file_format(self) -> opossum_file.OpossumPackage: + return opossum_file.OpossumPackage( + source=self.source.to_opossum_file_format(), + attribution_confidence=self.attribution_confidence, + comment=self.comment, + package_name=self.package_name, + package_version=self.package_version, + package_namespace=self.package_namespace, + package_type=self.package_type, + package_p_u_r_l_appendix=self.package_p_u_r_l_appendix, + copyright=self.copyright, + license_name=self.license_name, + license_text=self.license_text, + url=self.url, + first_party=self.first_party, + exclude_from_notice=self.exclude_from_notice, + pre_selected=self.pre_selected, + follow_up=self.follow_up, + origin_id=self.origin_id, + origin_ids=self.origin_ids, + criticality=self.criticality, + was_preferred=self.was_preferred, + ) + + +class Metadata(BaseModel): + model_config = ConfigDict(frozen=True, extra="allow") + project_id: str + file_creation_date: str + project_title: str + project_version: str | None = None + expected_release_date: str | None = None + build_date: str | None = None + + def to_opossum_file_format(self) -> opossum_file.Metadata: + return opossum_file.Metadata(**self.model_dump()) + + +class ResourceType(Enum): + FILE = auto() + FOLDER = auto() + + +class ExternalAttributionSource(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + name: str + priority: int + is_relevant_for_preferred: bool | None = None + + def to_opossum_file_format(self) -> opossum_file.ExternalAttributionSource: + return opossum_file.ExternalAttributionSource(**self.model_dump()) From 9d51cdbfe07d0d5d580e674a69d3d6ebc4e2243d Mon Sep 17 00:00:00 2001 
From: Adrian Braemer Date: Mon, 20 Jan 2025 15:03:02 +0100 Subject: [PATCH 02/11] refactor: Switch ScanCode frontend to new Opossum model --- src/opossum_lib/opossum_model.py | 16 +++- .../scancode/convert_scancode_to_opossum.py | 24 ++---- src/opossum_lib/scancode/resource_tree.py | 86 ++++++------------- tests/test_scancode/test_resource_tree.py | 79 ++--------------- 4 files changed, 54 insertions(+), 151 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index 021c3de7..7c605a68 100644 --- a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -4,6 +4,8 @@ from __future__ import annotations +import uuid +from collections import defaultdict from dataclasses import field from enum import Enum, auto from os.path import relpath @@ -98,13 +100,23 @@ def process_node(node: Resource) -> None: def get_attribution_key( self, attribution: OpossumPackage ) -> OpossumPackageIdentifier: - return f"{attribution.license_name}-{hash(attribution)}" + return str(uuid.uuid4()) + + +class OpossumWithFixedAttributionIdentifiers(Opossum): + attribution_to_id: dict[OpossumPackage, str] = field(default_factory=defaultdict) + + def get_attribution_key( + self, attribution: OpossumPackage + ) -> OpossumPackageIdentifier: + id = self.attribution_to_id[attribution] + self.attribution_to_id[attribution] = id + return id class Resource(BaseModel): model_config = ConfigDict(frozen=True, extra="forbid") path: str - id: str type: ResourceType attributions: list[OpossumPackage] children: list[Resource] diff --git a/src/opossum_lib/scancode/convert_scancode_to_opossum.py b/src/opossum_lib/scancode/convert_scancode_to_opossum.py index 81db5399..5c8469a7 100644 --- a/src/opossum_lib/scancode/convert_scancode_to_opossum.py +++ b/src/opossum_lib/scancode/convert_scancode_to_opossum.py @@ -8,15 +8,10 @@ import sys import uuid -from opossum_lib.opossum.opossum_file import ( - Metadata, - OpossumInformation, -) +import 
opossum_lib.opossum_model as opossum_model from opossum_lib.opossum.opossum_file_content import OpossumFileContent from opossum_lib.scancode.model import Header, ScanCodeData from opossum_lib.scancode.resource_tree import ( - convert_to_opossum_resources, - create_attribution_mapping, scancode_to_file_tree, ) @@ -27,27 +22,26 @@ def convert_scancode_to_opossum(filename: str) -> OpossumFileContent: scancode_data = load_scancode_json(filename) filetree = scancode_to_file_tree(scancode_data) - resources = convert_to_opossum_resources(filetree) - external_attributions, resources_to_attributions = create_attribution_mapping( - filetree - ) + resources = filetree.to_opossum_resources() + with open("debug.json", "w") as out: + out.write(resources[0].model_dump_json(indent=4, by_alias=True)) scancode_header = extract_scancode_header(scancode_data, filename) - metadata = Metadata( + metadata = opossum_model.Metadata( project_id=str(uuid.uuid4()), file_creation_date=scancode_header.end_timestamp, project_title="ScanCode file", ) return OpossumFileContent( - OpossumInformation( + opossum_model.Opossum( metadata=metadata, resources=resources, - external_attributions=external_attributions, - resources_to_attributions=resources_to_attributions, attribution_breakpoints=[], external_attribution_sources={}, - ) + frequent_licenses=None, + files_with_children=None, + ).to_opossum_file_format() ) diff --git a/src/opossum_lib/scancode/resource_tree.py b/src/opossum_lib/scancode/resource_tree.py index ad1e8987..f27693f6 100644 --- a/src/opossum_lib/scancode/resource_tree.py +++ b/src/opossum_lib/scancode/resource_tree.py @@ -5,17 +5,9 @@ from __future__ import annotations -from os.path import relpath - from pydantic import BaseModel -from opossum_lib.opossum.opossum_file import ( - OpossumPackage, - OpossumPackageIdentifier, - ResourceInFile, - ResourcePath, - SourceInfo, -) +import opossum_lib.opossum_model as opossum_model from opossum_lib.scancode.constants import 
SCANCODE_SOURCE_NAME from opossum_lib.scancode.helpers import check_schema, path_segments from opossum_lib.scancode.model import File, FileType, ScanCodeData @@ -38,6 +30,21 @@ def revalidate(self) -> None: for child in self.children.values(): child.revalidate() + def to_opossum_resources( + self, + ) -> list[opossum_model.Resource]: + def process_node( + node: ScanCodeFileTree, + ) -> opossum_model.Resource: + return opossum_model.Resource( + path=node.file.path, + attributions=get_attribution_info(node.file), + type=convert_resource_type(node.file.type), + children=[process_node(child) for child in node.children.values()], + ) + + return [process_node(self)] + def scancode_to_file_tree(scancode_data: ScanCodeData) -> ScanCodeFileTree: temp_root = ScanCodeFileTree.model_construct(file=None) # type: ignore @@ -51,26 +58,11 @@ def scancode_to_file_tree(scancode_data: ScanCodeData) -> ScanCodeFileTree: return root -def convert_to_opossum_resources(root_node: ScanCodeFileTree) -> ResourceInFile: - def process_node(node: ScanCodeFileTree) -> ResourceInFile: - if node.file.type == FileType.FILE: - return 1 - else: - root_path = node.file.path - children = { - relpath(n.file.path, root_path): process_node(n) - for n in node.children.values() - } - return children - - return {root_node.file.path: process_node(root_node)} - - -def get_attribution_info(file: File) -> list[OpossumPackage]: +def get_attribution_info(file: File) -> list[opossum_model.OpossumPackage]: if file.type == FileType.DIRECTORY: return [] copyright = "\n".join(c.copyright for c in file.copyrights) - source_info = SourceInfo(name=SCANCODE_SOURCE_NAME) + source_info = opossum_model.SourceInfo(name=SCANCODE_SOURCE_NAME) attribution_infos = [] for license_detection in file.license_detections: @@ -78,10 +70,10 @@ def get_attribution_info(file: File) -> list[OpossumPackage]: max_score = max(m.score for m in license_detection.matches) attribution_confidence = int(max_score) - package = OpossumPackage( + 
package = opossum_model.OpossumPackage( source=source_info, - licenseName=license_name, - attributionConfidence=attribution_confidence, + license_name=license_name, + attribution_confidence=attribution_confidence, copyright=copyright, ) attribution_infos.append(package) @@ -89,34 +81,8 @@ def get_attribution_info(file: File) -> list[OpossumPackage]: return attribution_infos -def get_attribution_key(attribution: OpossumPackage) -> OpossumPackageIdentifier: - return f"{attribution.license_name}-{hash(attribution)}" - - -def create_attribution_mapping( - root_node: ScanCodeFileTree, -) -> tuple[ - dict[OpossumPackageIdentifier, OpossumPackage], - dict[ResourcePath, list[OpossumPackageIdentifier]], -]: - external_attributions: dict[OpossumPackageIdentifier, OpossumPackage] = {} - resources_to_attributions: dict[ResourcePath, list[OpossumPackageIdentifier]] = {} - - def process_node(node: ScanCodeFileTree) -> None: - # the / is required by OpossumUI - path = "/" + node.file.path - attributions = get_attribution_info(node.file) - - new_attributions_with_id = {get_attribution_key(a): a for a in attributions} - external_attributions.update(new_attributions_with_id) - - if len(new_attributions_with_id) > 0: - resources_to_attributions[path] = list(new_attributions_with_id.keys()) - - for child in node.children.values(): - process_node(child) - - for child in root_node.children.values(): - process_node(child) - - return external_attributions, resources_to_attributions +def convert_resource_type(val: FileType) -> opossum_model.ResourceType: + if val == FileType.FILE: + return opossum_model.ResourceType.FILE + else: + return opossum_model.ResourceType.FOLDER diff --git a/tests/test_scancode/test_resource_tree.py b/tests/test_scancode/test_resource_tree.py index fa9d3279..89a5955c 100644 --- a/tests/test_scancode/test_resource_tree.py +++ b/tests/test_scancode/test_resource_tree.py @@ -3,13 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 from copy import deepcopy -from typing 
import Any -from unittest import mock import pytest from pydantic import ValidationError -from opossum_lib.opossum.opossum_file import OpossumPackage, SourceInfo +from opossum_lib.opossum_model import OpossumPackage, SourceInfo from opossum_lib.scancode.constants import SCANCODE_SOURCE_NAME from opossum_lib.scancode.model import ( Copyright, @@ -21,8 +19,6 @@ ) from opossum_lib.scancode.resource_tree import ( ScanCodeFileTree, - convert_to_opossum_resources, - create_attribution_mapping, get_attribution_info, scancode_to_file_tree, ) @@ -85,71 +81,6 @@ def test_scancode_to_resource_tree_produces_expected_result() -> None: assert tree == reference -def test_convert_to_opossum_resources_produces_expected_result() -> None: - scancode_data = ScanCodeData( - headers=[], - packages=[], - dependencies=[], - license_detections=[], - files=_create_reference_scancode_files(), - ) - - tree = scancode_to_file_tree(scancode_data) - resources = convert_to_opossum_resources(tree) - reference = {"A": {"B": {"file3": 1}, "file1": 1, "file2.txt": 1}} - assert resources == reference - - -@mock.patch( - "opossum_lib.scancode.resource_tree.get_attribution_info", - autospec=True, - return_value=[OpossumPackage(source=SourceInfo(name="mocked"))], -) -def test_create_attribution_mapping_paths_have_root_prefix(_: Any) -> None: - rootnode = _create_reference_node_structure() - _, resources_to_attributions = create_attribution_mapping(rootnode) - # OpossumUI automatically prepends every path with a "/" - # So our resourcesToAttributions needs to start every path with "/" - # even though ScanCode paths don't start with "/". 
- assert "/A/file1" in resources_to_attributions - assert "/A/file2.txt" in resources_to_attributions - assert "/A/B/file3" in resources_to_attributions - - -def test_create_attribution_mapping() -> None: - _, _, file1, file2, file3 = _create_reference_scancode_files() - pkg1 = OpossumPackage(source=SourceInfo(name="S1")) - pkg2 = OpossumPackage(source=SourceInfo(name="S2")) - pkg3 = OpossumPackage(source=SourceInfo(name="S3")) - - def get_attribution_info_mock(file: File) -> list[OpossumPackage]: - if file == file1: - return [deepcopy(pkg1), deepcopy(pkg2)] - elif file == file2: - return [deepcopy(pkg1), deepcopy(pkg2), deepcopy(pkg3)] - elif file == file3: - return [] - else: - return [] - - root_node = _create_reference_node_structure() - - with mock.patch( - "opossum_lib.scancode.resource_tree.get_attribution_info", - new=get_attribution_info_mock, - ): - external_attributions, resources_to_attributions = create_attribution_mapping( - root_node - ) - assert len(external_attributions) == 3 # deduplication worked - - reverse_mapping = {v: k for (k, v) in external_attributions.items()} - id1, id2, id3 = reverse_mapping[pkg1], reverse_mapping[pkg2], reverse_mapping[pkg3] - assert len(resources_to_attributions) == 2 # only files with attributions - assert set(resources_to_attributions["/" + file1.path]) == {id1, id2} - assert set(resources_to_attributions["/" + file2.path]) == {id1, id2, id3} - - def test_get_attribution_info_directory() -> None: folder = _create_file("A", FileType.DIRECTORY) assert get_attribution_info(folder) == [] @@ -217,15 +148,15 @@ def test_get_attribution_info_file_multiple() -> None: attributions = get_attribution_info(file) expected1 = OpossumPackage( source=SourceInfo(name=SCANCODE_SOURCE_NAME), - licenseName="Apache-2.0", + license_name="Apache-2.0", copyright="Me\nMyself\nI", - attributionConfidence=95, + attribution_confidence=95, ) expected2 = OpossumPackage( source=SourceInfo(name=SCANCODE_SOURCE_NAME), - licenseName="MIT", + 
license_name="MIT", copyright="Me\nMyself\nI", - attributionConfidence=50, + attribution_confidence=50, ) assert set(attributions) == {expected1, expected2} From 03feb75afe8fe3dc99a9b09642ab0d386290acd6 Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Tue, 21 Jan 2025 07:28:58 +0100 Subject: [PATCH 03/11] feat: extend Opossum model * Opossum class should be able to carry all information that could be present in an .opossum file --- src/opossum_lib/opossum_model.py | 50 ++++++++++--------- .../scancode/convert_scancode_to_opossum.py | 18 +++---- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index 7c605a68..a501167f 100644 --- a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -14,11 +14,17 @@ from pydantic import BaseModel, ConfigDict import opossum_lib.opossum.opossum_file as opossum_file +import opossum_lib.opossum.opossum_file_content as opossum_file_content +from opossum_lib.opossum.output_model import OpossumOutputFile type OpossumPackageIdentifier = str type ResourcePath = str +def default_attribution_id_mapper() -> dict[OpossumPackage, str]: + return defaultdict(lambda: str(uuid.uuid4())) + + class Opossum(BaseModel): model_config = ConfigDict(frozen=True, extra="forbid") metadata: Metadata @@ -30,8 +36,12 @@ class Opossum(BaseModel): frequent_licenses: list[FrequentLicense] | None = None files_with_children: list[str] | None = None base_urls_for_sources: BaseUrlsForSources | None = None + attribution_to_id: dict[OpossumPackage, str] = field( + default_factory=default_attribution_id_mapper + ) + output_file: OpossumOutputFile | None = None - def to_opossum_file_format(self) -> opossum_file.OpossumInformation: + def to_opossum_file_format(self) -> opossum_file_content.OpossumFileContent: external_attributions, resources_to_attributions = ( self.create_attribution_mapping(self.resources) ) @@ -46,19 +56,22 @@ def to_opossum_file_format(self) -> 
opossum_file.OpossumInformation: self.base_urls_for_sources and self.base_urls_for_sources.to_opossum_file_format() ) - return opossum_file.OpossumInformation( - metadata=self.metadata.to_opossum_file_format(), - resources={ - resource.path: resource.to_opossum_file_format() - for resource in self.resources - }, - external_attributions=external_attributions, - resources_to_attributions=resources_to_attributions, - attribution_breakpoints=self.attribution_breakpoints, - external_attribution_sources=self.external_attribution_sources, - frequent_licenses=frequent_licenses, - files_with_children=self.files_with_children, - base_urls_for_sources=base_urls_for_sources, + return opossum_file_content.OpossumFileContent( + input_file=opossum_file.OpossumInformation( + metadata=self.metadata.to_opossum_file_format(), + resources={ + resource.path: resource.to_opossum_file_format() + for resource in self.resources + }, + external_attributions=external_attributions, + resources_to_attributions=resources_to_attributions, + attribution_breakpoints=self.attribution_breakpoints, + external_attribution_sources=self.external_attribution_sources, + frequent_licenses=frequent_licenses, + files_with_children=self.files_with_children, + base_urls_for_sources=base_urls_for_sources, + ), + output_file=self.output_file, ) def create_attribution_mapping( @@ -97,15 +110,6 @@ def process_node(node: Resource) -> None: return external_attributions, resources_to_attributions - def get_attribution_key( - self, attribution: OpossumPackage - ) -> OpossumPackageIdentifier: - return str(uuid.uuid4()) - - -class OpossumWithFixedAttributionIdentifiers(Opossum): - attribution_to_id: dict[OpossumPackage, str] = field(default_factory=defaultdict) - def get_attribution_key( self, attribution: OpossumPackage ) -> OpossumPackageIdentifier: diff --git a/src/opossum_lib/scancode/convert_scancode_to_opossum.py b/src/opossum_lib/scancode/convert_scancode_to_opossum.py index 5c8469a7..b39e9455 100644 --- 
a/src/opossum_lib/scancode/convert_scancode_to_opossum.py +++ b/src/opossum_lib/scancode/convert_scancode_to_opossum.py @@ -33,16 +33,14 @@ def convert_scancode_to_opossum(filename: str) -> OpossumFileContent: project_title="ScanCode file", ) - return OpossumFileContent( - opossum_model.Opossum( - metadata=metadata, - resources=resources, - attribution_breakpoints=[], - external_attribution_sources={}, - frequent_licenses=None, - files_with_children=None, - ).to_opossum_file_format() - ) + return opossum_model.Opossum( + metadata=metadata, + resources=resources, + attribution_breakpoints=[], + external_attribution_sources={}, + frequent_licenses=None, + files_with_children=None, + ).to_opossum_file_format() def load_scancode_json(filename: str) -> ScanCodeData: From d35c997eb61dc6f50a75a478621b58fabd21a3e8 Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Tue, 21 Jan 2025 11:36:06 +0100 Subject: [PATCH 04/11] refactor: helper functions for resource * resources can now be added to the resource structure without knowledge about internals * for this reason: - resources can be created with just a path (i.e. without type) - resources can now be merged together if the types are compatible. types are compatible if at least one is not set or types are identical - when converting to opossum file format, unset type defaults to folder. Maybe this should raise an error instead, but being more permissive probably just makes things more ergonomic without hurting correctness. 
--- src/opossum_lib/opossum_model.py | 69 ++++++++++++++++++----- src/opossum_lib/scancode/resource_tree.py | 4 +- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index a501167f..3861b75a 100644 --- a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -6,9 +6,10 @@ import uuid from collections import defaultdict +from collections.abc import Iterable from dataclasses import field from enum import Enum, auto -from os.path import relpath +from pathlib import Path from typing import Literal from pydantic import BaseModel, ConfigDict @@ -60,7 +61,7 @@ def to_opossum_file_format(self) -> opossum_file_content.OpossumFileContent: input_file=opossum_file.OpossumInformation( metadata=self.metadata.to_opossum_file_format(), resources={ - resource.path: resource.to_opossum_file_format() + str(resource.path): resource.to_opossum_file_format() for resource in self.resources }, external_attributions=external_attributions, @@ -89,8 +90,10 @@ def create_attribution_mapping( ] = {} def process_node(node: Resource) -> None: - # the / is required by OpossumUI - path = "/" + node.path + path = str(node.path) + if not path.startswith("/"): + # the / is required by OpossumUI + path = "/" + path attributions = node.attributions new_attributions_with_id = { @@ -102,11 +105,11 @@ def process_node(node: Resource) -> None: if len(new_attributions_with_id) > 0: resources_to_attributions[path] = list(new_attributions_with_id.keys()) - for child in node.children: + for child in node.children.values(): process_node(child) - for child in root_nodes: - process_node(child) + for root in root_nodes: + process_node(root) return external_attributions, resources_to_attributions @@ -119,21 +122,59 @@ def get_attribution_key( class Resource(BaseModel): - model_config = ConfigDict(frozen=True, extra="forbid") - path: str - type: ResourceType - attributions: list[OpossumPackage] - children: 
list[Resource] + model_config = ConfigDict(extra="forbid") + path: Path + type: ResourceType | None = None + attributions: list[OpossumPackage] = [] + children: dict[str, Resource] = {} def to_opossum_file_format(self) -> opossum_file.ResourceInFile: if self.type == ResourceType.FILE: return 1 else: return { - relpath(child.path, self.path): child.to_opossum_file_format() - for child in self.children + str(child.path.relative_to(self.path)): child.to_opossum_file_format() + for child in self.children.values() } + def add_resource(self, resource: Resource) -> None: + if not resource.path.is_relative_to(self.path): + raise RuntimeError( + f"The path {resource.path} is not a child of this node at {self.path}." + ) + remaining_path_parts = resource.path.relative_to(self.path).parts + if remaining_path_parts: + self._add_resource(resource, remaining_path_parts) + else: + self._update(resource) + + def _add_resource( + self, resource: Resource, remaining_path_parts: Iterable[str] + ) -> None: + if not remaining_path_parts: + self._update(resource) + return + next, *rest_parts = remaining_path_parts + if next not in self.children: + self.children[next] = Resource(path=self.path / next) + self.children[next]._add_resource(resource, rest_parts) + + def _update(self, other: Resource) -> None: + if self.path != other.path: + raise RuntimeError( + "Trying to merge nodes with different paths: " + + f"{self.path} vs. 
{other.path}" + ) + if self.type and other.type and self.type != other.type: + raise RuntimeError("Trying to merge incompatible node types.") + self.type = self.type or other.type + self.attributions.extend(other.attributions) + for key, child in other.children.items(): + if key in self.children: + self.children[key]._update(child) + else: + self.children[key] = child + class BaseUrlsForSources(BaseModel): model_config = ConfigDict(frozen=True, extra="allow") diff --git a/src/opossum_lib/scancode/resource_tree.py b/src/opossum_lib/scancode/resource_tree.py index f27693f6..15e50a91 100644 --- a/src/opossum_lib/scancode/resource_tree.py +++ b/src/opossum_lib/scancode/resource_tree.py @@ -40,7 +40,9 @@ def process_node( path=node.file.path, attributions=get_attribution_info(node.file), type=convert_resource_type(node.file.type), - children=[process_node(child) for child in node.children.values()], + children={ + key: process_node(child) for (key, child) in node.children.items() + }, ) return [process_node(self)] From e086bffbbc9c8754a4ceacce5334cc1b098df06e Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Tue, 21 Jan 2025 12:19:41 +0100 Subject: [PATCH 05/11] refactor: simplify scancode conversion using new Resource functions --- .../scancode/convert_scancode_to_opossum.py | 5 +- src/opossum_lib/scancode/helpers.py | 19 ---- src/opossum_lib/scancode/resource_tree.py | 54 +++-------- tests/test_scancode/model_helpers.py | 10 +++ tests/test_scancode/test_resource_tree.py | 89 ------------------- 5 files changed, 21 insertions(+), 156 deletions(-) delete mode 100644 src/opossum_lib/scancode/helpers.py diff --git a/src/opossum_lib/scancode/convert_scancode_to_opossum.py b/src/opossum_lib/scancode/convert_scancode_to_opossum.py index b39e9455..7a1439b0 100644 --- a/src/opossum_lib/scancode/convert_scancode_to_opossum.py +++ b/src/opossum_lib/scancode/convert_scancode_to_opossum.py @@ -21,10 +21,7 @@ def convert_scancode_to_opossum(filename: str) -> OpossumFileContent: 
scancode_data = load_scancode_json(filename) - filetree = scancode_to_file_tree(scancode_data) - resources = filetree.to_opossum_resources() - with open("debug.json", "w") as out: - out.write(resources[0].model_dump_json(indent=4, by_alias=True)) + resources = [scancode_to_file_tree(scancode_data)] scancode_header = extract_scancode_header(scancode_data, filename) metadata = opossum_model.Metadata( diff --git a/src/opossum_lib/scancode/helpers.py b/src/opossum_lib/scancode/helpers.py deleted file mode 100644 index 1ba1fd3f..00000000 --- a/src/opossum_lib/scancode/helpers.py +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-FileCopyrightText: TNG Technology Consulting GmbH -# -# SPDX-License-Identifier: Apache-2.0 - - -import os.path - -from pydantic import BaseModel -from pydantic_core import SchemaValidator - - -def path_segments(path: str) -> list[str]: - path = os.path.normpath(path) - return path.split(os.sep) - - -def check_schema(model: BaseModel) -> None: - schema_validator = SchemaValidator(schema=model.__pydantic_core_schema__) - schema_validator.validate_python(model.__dict__) diff --git a/src/opossum_lib/scancode/resource_tree.py b/src/opossum_lib/scancode/resource_tree.py index 15e50a91..e48399e1 100644 --- a/src/opossum_lib/scancode/resource_tree.py +++ b/src/opossum_lib/scancode/resource_tree.py @@ -5,59 +5,25 @@ from __future__ import annotations -from pydantic import BaseModel +from pathlib import Path import opossum_lib.opossum_model as opossum_model from opossum_lib.scancode.constants import SCANCODE_SOURCE_NAME -from opossum_lib.scancode.helpers import check_schema, path_segments from opossum_lib.scancode.model import File, FileType, ScanCodeData -class ScanCodeFileTree(BaseModel): - file: File - children: dict[str, ScanCodeFileTree] = {} - - def get_path(self, path: list[str]) -> ScanCodeFileTree: - if len(path) == 0: - return self - next_segment, *rest = path - if next_segment not in self.children: - self.children[next_segment] = 
ScanCodeFileTree.model_construct(None) # type: ignore - return self.children[next_segment].get_path(rest) - - def revalidate(self) -> None: - check_schema(self) - for child in self.children.values(): - child.revalidate() - - def to_opossum_resources( - self, - ) -> list[opossum_model.Resource]: - def process_node( - node: ScanCodeFileTree, - ) -> opossum_model.Resource: - return opossum_model.Resource( - path=node.file.path, - attributions=get_attribution_info(node.file), - type=convert_resource_type(node.file.type), - children={ - key: process_node(child) for (key, child) in node.children.items() - }, - ) - - return [process_node(self)] - - -def scancode_to_file_tree(scancode_data: ScanCodeData) -> ScanCodeFileTree: - temp_root = ScanCodeFileTree.model_construct(file=None) # type: ignore +def scancode_to_file_tree(scancode_data: ScanCodeData) -> opossum_model.Resource: + temp_root = opossum_model.Resource(path=Path("")) for file in scancode_data.files: - segments = path_segments(file.path) - temp_root.get_path(segments).file = file + resource = opossum_model.Resource( + path=Path(file.path), + attributions=get_attribution_info(file), + type=convert_resource_type(file.type), + ) + temp_root.add_resource(resource) assert len(temp_root.children) == 1 - root = list(temp_root.children.values())[0] - check_schema(root) - return root + return list(temp_root.children.values())[0] def get_attribution_info(file: File) -> list[opossum_model.OpossumPackage]: diff --git a/tests/test_scancode/model_helpers.py b/tests/test_scancode/model_helpers.py index d7b61629..c453b459 100644 --- a/tests/test_scancode/model_helpers.py +++ b/tests/test_scancode/model_helpers.py @@ -14,6 +14,16 @@ ) +def _create_reference_scancode_files() -> list[File]: + return [ + _create_file("A", FileType.DIRECTORY), + _create_file("A/B", FileType.DIRECTORY), + _create_file("A/file1", FileType.FILE), + _create_file("A/file2.txt", FileType.FILE), + _create_file("A/B/file3", FileType.FILE), + ] + + def 
_create_file( path: str, type: FileType, diff --git a/tests/test_scancode/test_resource_tree.py b/tests/test_scancode/test_resource_tree.py index 89a5955c..70701c0b 100644 --- a/tests/test_scancode/test_resource_tree.py +++ b/tests/test_scancode/test_resource_tree.py @@ -4,83 +4,20 @@ from copy import deepcopy -import pytest -from pydantic import ValidationError - from opossum_lib.opossum_model import OpossumPackage, SourceInfo from opossum_lib.scancode.constants import SCANCODE_SOURCE_NAME from opossum_lib.scancode.model import ( Copyright, - File, FileBasedLicenseDetection, FileType, Match, - ScanCodeData, ) from opossum_lib.scancode.resource_tree import ( - ScanCodeFileTree, get_attribution_info, - scancode_to_file_tree, ) from tests.test_scancode.model_helpers import _create_file -class TestRevalidate: - def test_successfully_revalidate_valid_file_tree(self) -> None: - dummy_file = _create_file("A", FileType.FILE) - valid_structure = ScanCodeFileTree( - file=dummy_file, - children={ - "A": ScanCodeFileTree(file=dummy_file), - "B": ScanCodeFileTree( - file=dummy_file, children={"C": ScanCodeFileTree(file=dummy_file)} - ), - }, - ) - valid_structure.revalidate() - - def test_fail_to_revalidate_file_tree_invalid_at_toplevel(self) -> None: - dummy_file = _create_file("A", FileType.FILE) - invalid_structure = ScanCodeFileTree.model_construct( - children={ - "A": ScanCodeFileTree(file=dummy_file), - "B": ScanCodeFileTree( - file=dummy_file, children={"C": ScanCodeFileTree(file=dummy_file)} - ), - }, - file=None, # type: ignore - ) - with pytest.raises(ValidationError): - invalid_structure.revalidate() - - def test_fail_to_revalidate_file_tree_invalid_only_at_lower_level(self) -> None: - dummy_file = _create_file("A", FileType.FILE) - invalid_structure = ScanCodeFileTree( - file=dummy_file, - children={ - "A": ScanCodeFileTree(file=dummy_file), - "B": ScanCodeFileTree( - file=dummy_file, - children={"C": ScanCodeFileTree.model_construct(None)}, # type: ignore - ), - 
}, - ) - with pytest.raises(ValidationError): - invalid_structure.revalidate() - - -def test_scancode_to_resource_tree_produces_expected_result() -> None: - files = _create_reference_scancode_files() - scancode_data = ScanCodeData( - headers=[], packages=[], dependencies=[], license_detections=[], files=files - ) - - tree = scancode_to_file_tree(scancode_data) - reference = _create_reference_node_structure() - - assert tree == reference - - def test_get_attribution_info_directory() -> None: folder = _create_file("A", FileType.DIRECTORY) assert get_attribution_info(folder) == [] @@ -159,29 +96,3 @@ def test_get_attribution_info_file_multiple() -> None: attribution_confidence=50, ) assert set(attributions) == {expected1, expected2} - - -def _create_reference_scancode_files() -> list[File]: - return [ - _create_file("A", FileType.DIRECTORY), - _create_file("A/B", FileType.DIRECTORY), - _create_file("A/file1", FileType.FILE), - _create_file("A/file2.txt", FileType.FILE), - _create_file("A/B/file3", FileType.FILE), - ] - - -def _create_reference_node_structure() -> ScanCodeFileTree: - folder, subfolder, file1, file2, file3 = _create_reference_scancode_files() - inner = ScanCodeFileTree( - file=subfolder, children={"file3": ScanCodeFileTree(file=file3)} - ) - reference = ScanCodeFileTree( - file=folder, - children={ - "B": inner, - "file1": ScanCodeFileTree(file=file1), - "file2.txt": ScanCodeFileTree(file=file2), - }, - ) - return reference From 301ba34ef1cbe9a24290dc3530e9179293ade556 Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Tue, 21 Jan 2025 12:37:31 +0100 Subject: [PATCH 06/11] fix: use PurePaths instead to make it work on windows --- src/opossum_lib/opossum_model.py | 4 ++-- src/opossum_lib/scancode/resource_tree.py | 6 +++--- tests/test_scancode/model_helpers.py | 9 +++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index 3861b75a..b78332fa 100644 --- 
a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -9,7 +9,7 @@ from collections.abc import Iterable from dataclasses import field from enum import Enum, auto -from pathlib import Path +from pathlib import PurePath from typing import Literal from pydantic import BaseModel, ConfigDict @@ -123,7 +123,7 @@ def get_attribution_key( class Resource(BaseModel): model_config = ConfigDict(extra="forbid") - path: Path + path: PurePath type: ResourceType | None = None attributions: list[OpossumPackage] = [] children: dict[str, Resource] = {} diff --git a/src/opossum_lib/scancode/resource_tree.py b/src/opossum_lib/scancode/resource_tree.py index e48399e1..5428f651 100644 --- a/src/opossum_lib/scancode/resource_tree.py +++ b/src/opossum_lib/scancode/resource_tree.py @@ -5,7 +5,7 @@ from __future__ import annotations -from pathlib import Path +from pathlib import PurePath import opossum_lib.opossum_model as opossum_model from opossum_lib.scancode.constants import SCANCODE_SOURCE_NAME @@ -13,10 +13,10 @@ def scancode_to_file_tree(scancode_data: ScanCodeData) -> opossum_model.Resource: - temp_root = opossum_model.Resource(path=Path("")) + temp_root = opossum_model.Resource(path=PurePath("")) for file in scancode_data.files: resource = opossum_model.Resource( - path=Path(file.path), + path=PurePath(file.path.replace("\\", "/")), attributions=get_attribution_info(file), type=convert_resource_type(file.type), ) diff --git a/tests/test_scancode/model_helpers.py b/tests/test_scancode/model_helpers.py index c453b459..40a3a1b2 100644 --- a/tests/test_scancode/model_helpers.py +++ b/tests/test_scancode/model_helpers.py @@ -2,7 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from pathlib import Path + +from pathlib import PurePath from opossum_lib.scancode.model import ( Copyright, @@ -83,11 +84,11 @@ def _create_file( if copyrights is None: copyrights = [] if name is None: - name = Path(path).name + name = PurePath(path).name if base_name is None: - base_name = 
Path(Path(path).name).stem + base_name = PurePath(PurePath(path).name).stem if extension is None: - extension = Path(path).suffix + extension = PurePath(path).suffix return File( authors=authors, base_name=base_name, From cc299e03356b9281b14ce9cccc91e152073f2b83 Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Tue, 21 Jan 2025 12:47:33 +0100 Subject: [PATCH 07/11] fix: paths on windows by replacing \ with / --- src/opossum_lib/opossum_model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index b78332fa..1bd4a1a3 100644 --- a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -90,7 +90,7 @@ def create_attribution_mapping( ] = {} def process_node(node: Resource) -> None: - path = str(node.path) + path = str(node.path).replace("\\", "/") if not path.startswith("/"): # the / is required by OpossumUI path = "/" + path @@ -133,7 +133,9 @@ def to_opossum_file_format(self) -> opossum_file.ResourceInFile: return 1 else: return { - str(child.path.relative_to(self.path)): child.to_opossum_file_format() + str(child.path.relative_to(self.path)).replace( + "\\", "/" + ): child.to_opossum_file_format() for child in self.children.values() } From 31dd40ec9c0ebd358dd617436c4d00dadcee7b1f Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Tue, 21 Jan 2025 16:18:32 +0100 Subject: [PATCH 08/11] fix: Forgot to convert external_attribution_sources --- src/opossum_lib/opossum_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index 1bd4a1a3..0b87c056 100644 --- a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -57,6 +57,12 @@ def to_opossum_file_format(self) -> opossum_file_content.OpossumFileContent: self.base_urls_for_sources and self.base_urls_for_sources.to_opossum_file_format() ) + + external_attribution_sources = { + key: 
val.to_opossum_file_format() + for (key, val) in self.external_attribution_sources.items() + } + return opossum_file_content.OpossumFileContent( input_file=opossum_file.OpossumInformation( metadata=self.metadata.to_opossum_file_format(), @@ -67,7 +73,7 @@ def to_opossum_file_format(self) -> opossum_file_content.OpossumFileContent: external_attributions=external_attributions, resources_to_attributions=resources_to_attributions, attribution_breakpoints=self.attribution_breakpoints, - external_attribution_sources=self.external_attribution_sources, + external_attribution_sources=external_attribution_sources, frequent_licenses=frequent_licenses, files_with_children=self.files_with_children, base_urls_for_sources=base_urls_for_sources, From 60f4f48217c72123cef4e6577bc382bbc6ac68bb Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Wed, 22 Jan 2025 11:14:31 +0100 Subject: [PATCH 09/11] fix: minor changes to opossum_model.py * default to treating a Resource as file if the type is undefined and no children present * provide slightly more information in an error message --- src/opossum_lib/opossum_model.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index 0b87c056..0e22774f 100644 --- a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -135,15 +135,15 @@ class Resource(BaseModel): children: dict[str, Resource] = {} def to_opossum_file_format(self) -> opossum_file.ResourceInFile: - if self.type == ResourceType.FILE: - return 1 - else: + if self.children or self.type == ResourceType.FOLDER: return { str(child.path.relative_to(self.path)).replace( "\\", "/" ): child.to_opossum_file_format() for child in self.children.values() } + else: + return 1 def add_resource(self, resource: Resource) -> None: if not resource.path.is_relative_to(self.path): @@ -174,7 +174,10 @@ def _update(self, other: Resource) -> None: + f"{self.path} vs. 
{other.path}" ) if self.type and other.type and self.type != other.type: - raise RuntimeError("Trying to merge incompatible node types.") + raise RuntimeError( + "Trying to merge incompatible node types. " + + f"Current node is {self.type}. Other is {other.type}" + ) self.type = self.type or other.type self.attributions.extend(other.attributions) for key, child in other.children.items(): From 7a9b11dc721a116b1c074db0e5ede38ff62f1efc Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Thu, 23 Jan 2025 08:39:24 +0100 Subject: [PATCH 10/11] refactor: address review comments --- src/opossum_lib/opossum_model.py | 80 +++++++++++-------- .../scancode/convert_scancode_to_opossum.py | 4 - src/opossum_lib/scancode/resource_tree.py | 6 +- 3 files changed, 49 insertions(+), 41 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index 0e22774f..8dc370e7 100644 --- a/src/opossum_lib/opossum_model.py +++ b/src/opossum_lib/opossum_model.py @@ -7,6 +7,7 @@ import uuid from collections import defaultdict from collections.abc import Iterable +from copy import deepcopy from dataclasses import field from enum import Enum, auto from pathlib import PurePath @@ -15,13 +16,17 @@ from pydantic import BaseModel, ConfigDict import opossum_lib.opossum.opossum_file as opossum_file -import opossum_lib.opossum.opossum_file_content as opossum_file_content +from opossum_lib.opossum.opossum_file_content import OpossumFileContent from opossum_lib.opossum.output_model import OpossumOutputFile type OpossumPackageIdentifier = str type ResourcePath = str +def _convert_path_to_str(path: PurePath) -> str: + return str(path).replace("\\", "/") + + def default_attribution_id_mapper() -> dict[OpossumPackage, str]: return defaultdict(lambda: str(uuid.uuid4())) @@ -30,10 +35,8 @@ class Opossum(BaseModel): model_config = ConfigDict(frozen=True, extra="forbid") metadata: Metadata resources: list[Resource] - attribution_breakpoints: list[str] = field(default_factory=list) 
- external_attribution_sources: dict[str, ExternalAttributionSource] = field( - default_factory=dict - ) + attribution_breakpoints: list[str] = [] + external_attribution_sources: dict[str, ExternalAttributionSource] = {} frequent_licenses: list[FrequentLicense] | None = None files_with_children: list[str] | None = None base_urls_for_sources: BaseUrlsForSources | None = None @@ -42,17 +45,15 @@ class Opossum(BaseModel): ) output_file: OpossumOutputFile | None = None - def to_opossum_file_format(self) -> opossum_file_content.OpossumFileContent: + def to_opossum_file_format(self) -> OpossumFileContent: external_attributions, resources_to_attributions = ( self.create_attribution_mapping(self.resources) ) - frequent_licenses = ( - None - if self.frequent_licenses is None - else [ + frequent_licenses = None + if self.frequent_licenses: + frequent_licenses = [ license.to_opossum_file_format() for license in self.frequent_licenses ] - ) base_urls_for_sources = ( self.base_urls_for_sources and self.base_urls_for_sources.to_opossum_file_format() @@ -63,7 +64,7 @@ def to_opossum_file_format(self) -> opossum_file_content.OpossumFileContent: for (key, val) in self.external_attribution_sources.items() } - return opossum_file_content.OpossumFileContent( + return OpossumFileContent( input_file=opossum_file.OpossumInformation( metadata=self.metadata.to_opossum_file_format(), resources={ @@ -72,10 +73,10 @@ def to_opossum_file_format(self) -> opossum_file_content.OpossumFileContent: }, external_attributions=external_attributions, resources_to_attributions=resources_to_attributions, - attribution_breakpoints=self.attribution_breakpoints, + attribution_breakpoints=deepcopy(self.attribution_breakpoints), external_attribution_sources=external_attribution_sources, frequent_licenses=frequent_licenses, - files_with_children=self.files_with_children, + files_with_children=deepcopy(self.files_with_children), base_urls_for_sources=base_urls_for_sources, ), output_file=self.output_file, @@ 
-96,20 +97,19 @@ def create_attribution_mapping( ] = {} def process_node(node: Resource) -> None: - path = str(node.path).replace("\\", "/") + path = _convert_path_to_str(node.path) if not path.startswith("/"): # the / is required by OpossumUI path = "/" + path - attributions = node.attributions - new_attributions_with_id = { + node_attributions_by_id = { self.get_attribution_key(a): a.to_opossum_file_format() - for a in attributions + for a in node.attributions } - external_attributions.update(new_attributions_with_id) + external_attributions.update(node_attributions_by_id) - if len(new_attributions_with_id) > 0: - resources_to_attributions[path] = list(new_attributions_with_id.keys()) + if len(node_attributions_by_id) > 0: + resources_to_attributions[path] = list(node_attributions_by_id.keys()) for child in node.children.values(): process_node(child) @@ -127,8 +127,13 @@ def get_attribution_key( return id +class ResourceType(Enum): + FILE = auto() + FOLDER = auto() + + class Resource(BaseModel): - model_config = ConfigDict(extra="forbid") + model_config = ConfigDict(frozen=False, extra="forbid") path: PurePath type: ResourceType | None = None attributions: list[OpossumPackage] = [] @@ -137,8 +142,8 @@ class Resource(BaseModel): def to_opossum_file_format(self) -> opossum_file.ResourceInFile: if self.children or self.type == ResourceType.FOLDER: return { - str(child.path.relative_to(self.path)).replace( - "\\", "/" + _convert_path_to_str( + child.path.relative_to(self.path) ): child.to_opossum_file_format() for child in self.children.values() } @@ -201,7 +206,11 @@ class FrequentLicense(BaseModel): default_text: str def to_opossum_file_format(self) -> opossum_file.FrequentLicense: - return opossum_file.FrequentLicense(**self.model_dump()) + return opossum_file.FrequentLicense( + full_name=self.full_name, + short_name=self.short_name, + default_text=self.default_text, + ) class SourceInfo(BaseModel): @@ -211,7 +220,11 @@ class SourceInfo(BaseModel): 
additional_name: str | None = None def to_opossum_file_format(self) -> opossum_file.SourceInfo: - return opossum_file.SourceInfo(**self.model_dump()) + return opossum_file.SourceInfo( + name=self.name, + document_confidence=self.document_confidence, + additional_name=self.additional_name, + ) class OpossumPackage(BaseModel): @@ -223,7 +236,7 @@ class OpossumPackage(BaseModel): package_version: str | None = None package_namespace: str | None = None package_type: str | None = None - package_p_u_r_l_appendix: str | None = None + package_purl_appendix: str | None = None copyright: str | None = None license_name: str | None = None license_text: str | None = None @@ -246,7 +259,7 @@ def to_opossum_file_format(self) -> opossum_file.OpossumPackage: package_version=self.package_version, package_namespace=self.package_namespace, package_type=self.package_type, - package_p_u_r_l_appendix=self.package_p_u_r_l_appendix, + package_p_u_r_l_appendix=self.package_purl_appendix, copyright=self.copyright, license_name=self.license_name, license_text=self.license_text, @@ -275,11 +288,6 @@ def to_opossum_file_format(self) -> opossum_file.Metadata: return opossum_file.Metadata(**self.model_dump()) -class ResourceType(Enum): - FILE = auto() - FOLDER = auto() - - class ExternalAttributionSource(BaseModel): model_config = ConfigDict(frozen=True, extra="forbid") name: str @@ -287,4 +295,8 @@ class ExternalAttributionSource(BaseModel): is_relevant_for_preferred: bool | None = None def to_opossum_file_format(self) -> opossum_file.ExternalAttributionSource: - return opossum_file.ExternalAttributionSource(**self.model_dump()) + return opossum_file.ExternalAttributionSource( + name=self.name, + priority=self.priority, + is_relevant_for_preferred=self.is_relevant_for_preferred, + ) diff --git a/src/opossum_lib/scancode/convert_scancode_to_opossum.py b/src/opossum_lib/scancode/convert_scancode_to_opossum.py index 7a1439b0..c8be7e29 100644 --- 
a/src/opossum_lib/scancode/convert_scancode_to_opossum.py +++ b/src/opossum_lib/scancode/convert_scancode_to_opossum.py @@ -33,10 +33,6 @@ def convert_scancode_to_opossum(filename: str) -> OpossumFileContent: return opossum_model.Opossum( metadata=metadata, resources=resources, - attribution_breakpoints=[], - external_attribution_sources={}, - frequent_licenses=None, - files_with_children=None, ).to_opossum_file_format() diff --git a/src/opossum_lib/scancode/resource_tree.py b/src/opossum_lib/scancode/resource_tree.py index 5428f651..89318e93 100644 --- a/src/opossum_lib/scancode/resource_tree.py +++ b/src/opossum_lib/scancode/resource_tree.py @@ -16,7 +16,7 @@ def scancode_to_file_tree(scancode_data: ScanCodeData) -> opossum_model.Resource temp_root = opossum_model.Resource(path=PurePath("")) for file in scancode_data.files: resource = opossum_model.Resource( - path=PurePath(file.path.replace("\\", "/")), + path=PurePath(file.path), attributions=get_attribution_info(file), type=convert_resource_type(file.type), ) @@ -49,8 +49,8 @@ def get_attribution_info(file: File) -> list[opossum_model.OpossumPackage]: return attribution_infos -def convert_resource_type(val: FileType) -> opossum_model.ResourceType: - if val == FileType.FILE: +def convert_resource_type(file_type: FileType) -> opossum_model.ResourceType: + if file_type == FileType.FILE: return opossum_model.ResourceType.FILE else: return opossum_model.ResourceType.FOLDER From ddf4130c850bfeaca115609999e31b81e1d1b2d4 Mon Sep 17 00:00:00 2001 From: Adrian Braemer Date: Thu, 23 Jan 2025 13:04:36 +0100 Subject: [PATCH 11/11] refactor: change Opossum to reflect structure of OpossumFileContent --- src/opossum_lib/opossum_model.py | 44 +++++++++++-------- .../scancode/convert_scancode_to_opossum.py | 6 ++- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/opossum_lib/opossum_model.py b/src/opossum_lib/opossum_model.py index 8dc370e7..b76c337c 100644 --- a/src/opossum_lib/opossum_model.py +++ 
b/src/opossum_lib/opossum_model.py @@ -32,6 +32,18 @@ def default_attribution_id_mapper() -> dict[OpossumPackage, str]: class Opossum(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + scan_results: ScanResults + review_results: OpossumOutputFile | None = None + + def to_opossum_file_format(self) -> OpossumFileContent: + return OpossumFileContent( + input_file=self.scan_results.to_opossum_file_format(), + output_file=self.review_results, + ) + + +class ScanResults(BaseModel): model_config = ConfigDict(frozen=True, extra="forbid") metadata: Metadata resources: list[Resource] @@ -43,9 +55,8 @@ class Opossum(BaseModel): attribution_to_id: dict[OpossumPackage, str] = field( default_factory=default_attribution_id_mapper ) - output_file: OpossumOutputFile | None = None - def to_opossum_file_format(self) -> OpossumFileContent: + def to_opossum_file_format(self) -> opossum_file.OpossumInformation: external_attributions, resources_to_attributions = ( self.create_attribution_mapping(self.resources) ) @@ -64,22 +75,19 @@ def to_opossum_file_format(self) -> OpossumFileContent: for (key, val) in self.external_attribution_sources.items() } - return OpossumFileContent( - input_file=opossum_file.OpossumInformation( - metadata=self.metadata.to_opossum_file_format(), - resources={ - str(resource.path): resource.to_opossum_file_format() - for resource in self.resources - }, - external_attributions=external_attributions, - resources_to_attributions=resources_to_attributions, - attribution_breakpoints=deepcopy(self.attribution_breakpoints), - external_attribution_sources=external_attribution_sources, - frequent_licenses=frequent_licenses, - files_with_children=deepcopy(self.files_with_children), - base_urls_for_sources=base_urls_for_sources, - ), - output_file=self.output_file, + return opossum_file.OpossumInformation( + metadata=self.metadata.to_opossum_file_format(), + resources={ + str(resource.path): resource.to_opossum_file_format() + for resource in 
self.resources + }, + external_attributions=external_attributions, + resources_to_attributions=resources_to_attributions, + attribution_breakpoints=deepcopy(self.attribution_breakpoints), + external_attribution_sources=external_attribution_sources, + frequent_licenses=frequent_licenses, + files_with_children=deepcopy(self.files_with_children), + base_urls_for_sources=base_urls_for_sources, ) def create_attribution_mapping( diff --git a/src/opossum_lib/scancode/convert_scancode_to_opossum.py b/src/opossum_lib/scancode/convert_scancode_to_opossum.py index c8be7e29..1a84a383 100644 --- a/src/opossum_lib/scancode/convert_scancode_to_opossum.py +++ b/src/opossum_lib/scancode/convert_scancode_to_opossum.py @@ -31,8 +31,10 @@ def convert_scancode_to_opossum(filename: str) -> OpossumFileContent: ) return opossum_model.Opossum( - metadata=metadata, - resources=resources, + scan_results=opossum_model.ScanResults( + metadata=metadata, + resources=resources, + ) ).to_opossum_file_format()