Skip to content

Commit ad2c938

Browse files
committed
chore: pypi inspector link generation now lives in a dataclass in the pypi registry code
Signed-off-by: Carl Flottmann <[email protected]>
1 parent 6aa7a4c commit ad2c938

File tree

5 files changed

+192
-236
lines changed

5 files changed

+192
-236
lines changed

src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py

Lines changed: 7 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@
66
import logging
77

88
from macaron.errors import HeuristicAnalyzerValueError
9-
from macaron.json_tools import JsonType, json_extract
9+
from macaron.json_tools import JsonType
1010
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
1111
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
1212
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
13-
from macaron.util import send_head_http_raw
1413

1514
logger: logging.Logger = logging.getLogger(__name__)
1615

@@ -23,13 +22,6 @@ class WheelAbsenceAnalyzer(BaseHeuristicAnalyzer):
2322
heuristic fails.
2423
"""
2524

26-
WHEEL: str = "bdist_wheel"
27-
# as per https://github.com/pypi/inspector/blob/main/inspector/main.py line 125
28-
INSPECTOR_TEMPLATE = (
29-
"{inspector_url_scheme}://{inspector_url_netloc}/project/"
30-
"{name}/{version}/packages/{first}/{second}/{rest}/{filename}"
31-
)
32-
3325
def __init__(self) -> None:
3426
super().__init__(
3527
name="wheel_absence_analyzer",
@@ -53,83 +45,17 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
5345
Raises
5446
------
5547
HeuristicAnalyzerValueError
56-
If there is no release information, or has other missing package information.
48+
If there is missing package information.
5749
"""
58-
releases = pypi_package_json.get_releases()
59-
if releases is None: # no release information
60-
error_msg = "There is no information for any release of this package."
61-
logger.debug(error_msg)
62-
raise HeuristicAnalyzerValueError(error_msg)
63-
64-
version = pypi_package_json.component_version
65-
if version is None: # check latest release version
66-
version = pypi_package_json.get_latest_version()
67-
68-
if version is None:
69-
error_msg = "There is no latest version of this package."
70-
logger.debug(error_msg)
71-
raise HeuristicAnalyzerValueError(error_msg)
72-
73-
# Contains a boolean field identifying if the link is reachable by this Macaron instance or not.
74-
inspector_links: dict[str, JsonType] = {}
75-
wheel_present: bool = False
76-
77-
release_distributions = json_extract(releases, [version], list)
78-
if release_distributions is None:
79-
error_msg = f"The version {version} is not available as a release."
50+
if not pypi_package_json.get_inspector_links():
51+
error_msg = "Unable to retrieve PyPI inspector information about package"
8052
logger.debug(error_msg)
8153
raise HeuristicAnalyzerValueError(error_msg)
8254

83-
for distribution in release_distributions:
84-
# validate data
85-
package_type = json_extract(distribution, ["packagetype"], str)
86-
if package_type is None:
87-
error_msg = f"The version {version} has no 'package type' field in a distribution"
88-
logger.debug(error_msg)
89-
raise HeuristicAnalyzerValueError(error_msg)
90-
91-
name = json_extract(pypi_package_json.package_json, ["info", "name"], str)
92-
if name is None:
93-
error_msg = f"The version {version} has no 'name' field in a distribution"
94-
logger.debug(error_msg)
95-
raise HeuristicAnalyzerValueError(error_msg)
96-
97-
blake2b_256 = json_extract(distribution, ["digests", "blake2b_256"], str)
98-
if blake2b_256 is None:
99-
error_msg = f"The version {version} has no 'blake2b_256' field in a distribution"
100-
logger.debug(error_msg)
101-
raise HeuristicAnalyzerValueError(error_msg)
102-
103-
filename = json_extract(distribution, ["filename"], str)
104-
if filename is None:
105-
error_msg = f"The version {version} has no 'filename' field in a distribution"
106-
logger.debug(error_msg)
107-
raise HeuristicAnalyzerValueError(error_msg)
108-
109-
if package_type == self.WHEEL:
110-
wheel_present = True
111-
112-
inspector_link = self.INSPECTOR_TEMPLATE.format(
113-
inspector_url_scheme=pypi_package_json.pypi_registry.inspector_url_scheme,
114-
inspector_url_netloc=pypi_package_json.pypi_registry.inspector_url_netloc,
115-
name=name,
116-
version=version,
117-
first=blake2b_256[0:2],
118-
second=blake2b_256[2:4],
119-
rest=blake2b_256[4:],
120-
filename=filename,
121-
)
122-
123-
# use a head request because we don't care about the response contents
124-
inspector_links[inspector_link] = False
125-
if send_head_http_raw(inspector_link):
126-
inspector_links[inspector_link] = True # link was reachable
127-
128-
detail_info: dict[str, JsonType] = {
129-
"inspector_links": inspector_links,
130-
}
55+
detail_info: dict = {"inspector_links": pypi_package_json.inspector_asset.package_link_reachability}
13156

132-
if wheel_present:
57+
# At least one wheel file exists
58+
if len(pypi_package_json.inspector_asset.package_whl_links) > 0:
13359
return HeuristicResult.PASS, detail_info
13460

13561
return HeuristicResult.FAIL, detail_info

src/macaron/repo_finder/repo_finder_pypi.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
from macaron.repo_finder.repo_finder_enums import RepoFinderInfo
1010
from macaron.repo_finder.repo_validator import find_valid_repository_url
1111
from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, PyPIRegistry
12-
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, find_or_create_pypi_asset
12+
from macaron.slsa_analyzer.package_registry.pypi_registry import (
13+
PyPIInspectorAsset,
14+
PyPIPackageJsonAsset,
15+
find_or_create_pypi_asset,
16+
)
1317
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
1418

1519
logger: logging.Logger = logging.getLogger(__name__)
@@ -58,7 +62,9 @@ def find_repo(
5862
pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None)
5963
if not pypi_registry:
6064
return "", RepoFinderInfo.PYPI_NO_REGISTRY
61-
pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "")
65+
pypi_asset = PyPIPackageJsonAsset(
66+
purl.name, purl.version, False, pypi_registry, {}, "", PyPIInspectorAsset("", [], {})
67+
)
6268

6369
if not pypi_asset:
6470
# This should be unreachable, as the pypi_registry has already been confirmed to be of type PyPIRegistry.

src/macaron/slsa_analyzer/package_registry/pypi_registry.py

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from macaron.json_tools import json_extract
2727
from macaron.malware_analyzer.datetime_parser import parse_datetime
2828
from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
29-
from macaron.util import download_file_with_size_limit, send_get_http_raw, stream_file_with_size_limit
29+
from macaron.util import download_file_with_size_limit, send_get_http_raw, stream_file_with_size_limit, send_head_http_raw
3030

3131
if TYPE_CHECKING:
3232
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
@@ -443,6 +443,33 @@ def extract_attestation(attestation_data: dict) -> dict | None:
443443
return attestations[0]
444444

445445

446+
# as per https://github.com/pypi/inspector/blob/main/inspector/main.py line 125
447+
INSPECTOR_TEMPLATE = (
448+
"{inspector_url_scheme}://{inspector_url_netloc}/project/"
449+
"{name}/{version}/packages/{first}/{second}/{rest}/{filename}"
450+
)
451+
452+
453+
@dataclass
class PyPIInspectorAsset:
    """The PyPI inspector information for a package's distributions.

    Instances are created empty (``PyPIInspectorAsset("", [], {})``) and are filled in
    once inspector links have been generated. The truth value of an instance reports
    whether it has been populated.
    """

    #: the pypi inspector link to the tarball
    package_sdist_link: str

    #: the pypi inspector link(s) to the wheel(s)
    package_whl_links: list[str]

    #: a mapping of inspector links to whether they are reachable
    package_link_reachability: dict[str, bool]

    def __bool__(self) -> bool:
        """Return True if this inspector asset has been populated, False if it is empty.

        The asset counts as populated only when it holds at least one link (the sdist
        link or any wheel link) and at least one entry in the reachability mapping.

        Returns
        -------
        bool
            True if the asset holds link information, False otherwise.
        """
        has_link = bool(self.package_sdist_link or self.package_whl_links)
        return has_link and bool(self.package_link_reachability)
471+
472+
446473
@dataclass
447474
class PyPIPackageJsonAsset:
448475
"""The package JSON hosted on the PyPI registry."""
@@ -465,6 +492,9 @@ class PyPIPackageJsonAsset:
465492
#: the source code temporary location name
466493
package_sourcecode_path: str
467494

495+
#: the pypi inspector information about this package
496+
inspector_asset: PyPIInspectorAsset
497+
468498
#: The size of the asset (in bytes). This attribute is added to match the AssetLocator
469499
#: protocol and is not used because pypi API registry does not provide it.
470500
@property
@@ -718,6 +748,91 @@ def get_sha256(self) -> str | None:
718748
logger.debug("Found sha256 hash: %s", artifact_hash)
719749
return artifact_hash
720750

751+
def get_inspector_links(self) -> bool:
    """Generate PyPI inspector links for this package version's distributions and fill in the inspector asset.

    If the inspector asset is already populated, nothing is regenerated. Otherwise the
    package JSON is downloaded if it is not yet available, and one inspector link is
    built for each ``sdist`` or ``bdist_wheel`` distribution of the component version
    (or of the latest version when no component version is set). Each generated link is
    probed with a HEAD request and its reachability recorded in the inspector asset.

    Distributions with missing fields are skipped with a warning rather than aborting
    the whole generation.

    Returns
    -------
    bool
        True if the link generation was successful, False otherwise.
    """
    # Links were already generated by an earlier call.
    if self.inspector_asset:
        return True

    if not self.package_json and not self.download(""):
        logger.warning("No package metadata available, cannot get links")
        return False

    releases = self.get_releases()
    if releases is None:
        logger.warning("Package has no releases, cannot create inspector links.")
        return False

    version = self.component_version
    if version is None:
        version = self.get_latest_version()

    if version is None:
        logger.warning("No version set, and no latest version exists. cannot create inspector links.")
        return False

    distributions = json_extract(releases, [version], list)
    if not distributions:
        logger.warning("Package has no distributions for release version %s. Cannot create inspector links.", version)
        return False

    # The name belongs to the package, not to a single distribution, so look it up once.
    # Without it no link can be built for any distribution.
    name = json_extract(self.package_json, ["info", "name"], str)
    if name is None:
        logger.warning("The package metadata has no 'name' field, cannot create inspector links.")
        return False

    for distribution in distributions:
        package_type = json_extract(distribution, ["packagetype"], str)
        if package_type is None:
            logger.warning("The version %s has no 'package type' field in a distribution", version)
            continue

        # Only sdists and wheels are recorded; skip anything else before doing any network I/O.
        if package_type not in ("sdist", "bdist_wheel"):
            logger.debug("Unknown package distribution type: %s", package_type)
            continue

        blake2b_256 = json_extract(distribution, ["digests", "blake2b_256"], str)
        if blake2b_256 is None:
            logger.warning("The version %s has no 'blake2b_256' field in a distribution", version)
            continue

        filename = json_extract(distribution, ["filename"], str)
        if filename is None:
            logger.warning("The version %s has no 'filename' field in a distribution", version)
            continue

        # The digest is split into 2/2/rest characters to form the inspector path segments.
        link = INSPECTOR_TEMPLATE.format(
            inspector_url_scheme=self.pypi_registry.inspector_url_scheme,
            inspector_url_netloc=self.pypi_registry.inspector_url_netloc,
            name=name,
            version=version,
            first=blake2b_256[0:2],
            second=blake2b_256[2:4],
            rest=blake2b_256[4:],
            filename=filename,
        )

        # use a head request because we don't care about the response contents
        reachable = bool(send_head_http_raw(link))

        if package_type == "sdist":
            self.inspector_asset.package_sdist_link = link
        else:
            self.inspector_asset.package_whl_links.append(link)
        self.inspector_asset.package_link_reachability[link] = reachable

    # False if every distribution was invalid and skipped above.
    return bool(self.inspector_asset)
835+
721836

722837
def find_or_create_pypi_asset(
723838
asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo
@@ -755,6 +870,8 @@ def find_or_create_pypi_asset(
755870
logger.debug("Failed to create PyPIPackageJson asset.")
756871
return None
757872

758-
asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, "")
873+
asset = PyPIPackageJsonAsset(
874+
asset_name, asset_version, False, package_registry, {}, "", PyPIInspectorAsset("", [], {})
875+
)
759876
pypi_registry_info.metadata.append(asset)
760877
return asset

tests/malware_analyzer/pypi/conftest.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains test configurations for malware analyzer."""
@@ -8,7 +8,7 @@
88
import pytest
99

1010
from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata
11-
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry
11+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIInspectorAsset, PyPIPackageJsonAsset, PyPIRegistry
1212

1313

1414
@pytest.fixture(autouse=True)
@@ -26,4 +26,5 @@ def pypi_package_json() -> MagicMock:
2626
pypi_package.component = Component(
2727
purl="pkg:pypi/package", analysis=Analysis(), repository=None, repo_finder_metadata=RepoFinderMetadata()
2828
)
29+
pypi_package.inspector_asset = MagicMock(spec=PyPIInspectorAsset)
2930
return pypi_package

0 commit comments

Comments
 (0)