Skip to content

Commit ae021e1

Browse files
committed
feat(heuristics): add three analyzers to detect dependency confusion and distinguish from stub packages
Signed-off-by: Amine <[email protected]>
1 parent 6a712af commit ae021e1

File tree

14 files changed

+634
-9
lines changed

14 files changed

+634
-9
lines changed

src/macaron/malware_analyzer/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,22 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
5656
- **Description**: Checks if the package name is suspiciously similar to any package name in a predefined list of popular packages. The similarity check incorporates the Jaro-Winkler distance and considers keyboard layout proximity to identify potential typosquatting.
5757
- **Rule**: Return `HeuristicResult.FAIL` if the similarity ratio between the package name and any popular package name meets or exceeds a defined threshold; otherwise, return `HeuristicResult.PASS`.
5858
- **Dependency**: None.
59+
60+
11. **Minimal Content**
61+
- **Description**: Checks if the package has a small number of files.
62+
- **Rule**: Return `HeuristicResult.FAIL` if the number of files is strictly less than FILES_THRESHOLD; otherwise, return `HeuristicResult.PASS`.
63+
- **Dependency**: None.
64+
65+
12. **Unsecure Description**
66+
- **Description**: Checks if the package description is unsecure, i.e., it does not contain any descriptive keyword indicating that it is a stub package.
67+
- **Rule**: Return `HeuristicResult.FAIL` if no descriptive keyword is found in the package description or summary; otherwise, return `HeuristicResult.PASS`.
68+
- **Dependency**: None.
69+
70+
13. **Unknown Organization**
71+
- **Description**: Checks if the package is from a known organization.
72+
- **Rule**: Return `HeuristicResult.FAIL` if no organization from the trusted organizations file is found in the package metadata; otherwise, return `HeuristicResult.PASS`.
73+
- **Dependency**: None.
74+
5975
### Source Code Analysis with Semgrep
6076
**PyPI Source Code Analyzer**
6177
- **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code.

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ class Heuristics(str, Enum):
4343
#: Indicates that the package source code contains suspicious code patterns.
4444
SUSPICIOUS_PATTERNS = "suspicious_patterns"
4545

46+
#: Indicates that the package is associated with an unknown organization.
47+
UNKNOWN_ORGANIZATION = "unknown_organization"
48+
49+
#: Indicates that the package has minimal content.
50+
MINIMAL_CONTENT = "minimal_content"
51+
52+
#: Indicates that the package's description is unsecure, i.e., it lacks descriptive keywords.
53+
UNSECURE_DESCRIPTION = "unsecure_description"
54+
4655

4756
class HeuristicResult(str, Enum):
4857
"""Result type indicating the outcome of a heuristic."""
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This analyzer checks if a PyPI package has minimal content."""
5+
6+
import logging
7+
import os
8+
9+
from macaron.errors import SourceCodeError
10+
from macaron.json_tools import JsonType
11+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
12+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
13+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
14+
15+
logger: logging.Logger = logging.getLogger(__name__)
16+
17+
18+
class MinimalContentAnalyzer(BaseHeuristicAnalyzer):
    """Flag packages whose downloaded source code contains very few files."""

    # Minimum number of files the package source must contain to pass.
    FILES_THRESHOLD = 3

    def __init__(self) -> None:
        super().__init__(name="minimal_content_analyzer", heuristic=Heuristics.MINIMAL_CONTENT, depends_on=None)

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Count the files in the package source code and compare against the threshold.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            ``FAIL`` when fewer than ``FILES_THRESHOLD`` files are found, ``PASS`` otherwise,
            together with a short explanatory message.

        Raises
        ------
        SourceCodeError
            If ``download_sourcecode()`` reports that nothing was downloaded.
        """
        if not pypi_package_json.download_sourcecode():
            error_msg = "No source code files have been downloaded"
            logger.debug(error_msg)
            raise SourceCodeError(error_msg)

        # Walk the whole source tree and tally regular file entries in every directory.
        total_files = 0
        for _root, _dirs, filenames in os.walk(pypi_package_json.package_sourcecode_path):
            total_files += len(filenames)

        if total_files < self.FILES_THRESHOLD:
            return HeuristicResult.FAIL, {"message": "Not enough files found"}

        return HeuristicResult.PASS, {"message": "Package has sufficient content"}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This analyzer checks if a PyPI package is associated with an untrusted organization."""
5+
6+
import logging
7+
import os
8+
import re
9+
10+
from macaron import MACARON_PATH
11+
from macaron.errors import HeuristicAnalyzerValueError
12+
from macaron.json_tools import JsonType, json_extract
13+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
14+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
15+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
16+
17+
logger: logging.Logger = logging.getLogger(__name__)
18+
19+
20+
class UnknownOrganizationAnalyzer(BaseHeuristicAnalyzer):
    """Check whether the package is associated with an untrusted organization.

    The package metadata (author, maintainer, author email, description and summary)
    is searched for any name listed in the trusted organizations file.
    """

    def __init__(self, trusted_organizations_path: str | None = None) -> None:
        super().__init__(
            name="unknown_organization_analyzer", heuristic=Heuristics.UNKNOWN_ORGANIZATION, depends_on=None
        )
        # Fall back to the list bundled with the package resources when no path is given.
        self.path = trusted_organizations_path or os.path.join(MACARON_PATH, "resources/trusted_organizations.txt")
        self.trusted_organizations = self._load_defaults()

    def _load_defaults(self) -> list[str]:
        """Load the trusted organization names from the file at ``self.path``.

        The file is expected to contain one organization name per line. Surrounding
        whitespace is stripped and blank lines are ignored.

        Returns
        -------
        list[str]:
            The lower-cased trusted organization names.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the trusted organizations file cannot be read.
        """
        try:
            with open(self.path, encoding="utf-8") as file:
                lines = file.read().splitlines()
        except OSError as error:
            error_message = "Could not read trusted organizations file"
            logger.debug(error_message)
            raise HeuristicAnalyzerValueError(error_message) from error

        # Blank entries must be dropped: an empty name would produce a pattern that
        # matches any text, making every package look trusted.
        return [line.strip().lower() for line in lines if line.strip()]

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            ``SKIP`` when no trusted organizations are configured, ``PASS`` when a trusted
            organization name is found in the package metadata, ``FAIL`` otherwise.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the package JSON is missing or empty.
        """
        if not self.trusted_organizations:
            warning_message = "Trusted organizations file is empty"
            logger.warning(warning_message)
            return HeuristicResult.SKIP, {"warning": warning_message}

        package_json = pypi_package_json.package_json
        if not package_json:
            error_message = "No package JSON found in metadata"
            logger.debug(error_message)
            raise HeuristicAnalyzerValueError(error_message)

        # Only keep fields that are actually present so that missing values are not
        # rendered as the literal text "None" in the searched data.
        fields = (
            json_extract(package_json, ["info", key], str)
            for key in ("author", "maintainer", "author_email", "description", "summary")
        )
        data = " ".join(value for value in fields if value)

        for org in self.trusted_organizations:
            if not org:
                # Defensive: an empty name would match everything.
                continue
            # Lookarounds instead of ``\b``: for names that start and end with a word
            # character this is equivalent, but it also works for names ending in
            # punctuation (e.g. "Google Inc."), where a trailing ``\b`` can only match
            # if a word character follows immediately.
            pattern = rf"(?<!\w){re.escape(org)}(?!\w)"
            if re.search(pattern, data, re.IGNORECASE):
                return HeuristicResult.PASS, {"message": "Package is associated with a trusted organization"}

        return HeuristicResult.FAIL, {"message": "Package is associated with an unknown organization"}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This analyzer checks if a PyPI package has an unsecure description."""
5+
6+
import logging
7+
import re
8+
9+
from macaron.errors import HeuristicAnalyzerValueError
10+
from macaron.json_tools import JsonType, json_extract
11+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
12+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
13+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
14+
15+
logger: logging.Logger = logging.getLogger(__name__)
16+
17+
18+
class UnsecureDescriptionAnalyzer(BaseHeuristicAnalyzer):
    """Flag packages whose description and summary contain none of the expected keywords."""

    # Keywords whose presence in the description or summary makes the package pass.
    SECURE_DESCRIPTION_REGEX = re.compile(
        r"\b(?:internal|private|stub|placeholder|dependency confusion|security|namespace protection|reserved)\b",
        re.IGNORECASE,
    )

    def __init__(self) -> None:
        super().__init__(
            name="unsecure_description_analyzer",
            heuristic=Heuristics.UNSECURE_DESCRIPTION,
            depends_on=None,
        )

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Search the package description and summary for the secure-description keywords.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            ``PASS`` when a keyword is found, ``FAIL`` otherwise, with an explanatory message.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the package JSON has no (or an empty) ``info`` section.
        """
        metadata = pypi_package_json.package_json
        if not metadata.get("info", {}):
            error_msg = "No package info found in metadata"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        description = json_extract(metadata, ["info", "description"], str)
        summary = json_extract(metadata, ["info", "summary"], str)

        # Both fields are searched together as a single string.
        if self.SECURE_DESCRIPTION_REGEX.search(f"{description} {summary}") is None:
            return HeuristicResult.FAIL, {"message": "Package description is unsecure"}

        return HeuristicResult.PASS, {"message": "Package description is secure"}
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
Oracle
2+
jetbrains
3+
Rami Sayar
4+
AWS SDK Common Runtime Team
5+
Rafi Kurnia Putra
6+
Frank Seide
7+
Conchylicultor
8+
Pete Bryan
9+
John Evans
10+
Daniel van Flymen
11+
James Saryerwinnie
12+
PyTorch Team
13+
torchort contributors
14+
Ravindra Marella
15+
Lars van Gemerden
16+
Mark Saniscalchi
17+
Azure Red Team
18+
Google Cloud Datastore Team
19+
Mario Vilas
20+
Google AI Princeton
21+
CRE Avengers
22+
Microsoft Research - Causica
23+
Hunter Fernandes
24+
Deep Procedural Intelligence
25+
Paul O. Hayne
26+
Gregory Kwok
27+
Microsoft
28+
TBD
29+
Microsoft Corp.
30+
Luke Harries, Sebastian Lee, Jaroslaw Rzepecki, Katya Hofmann, Sam Devlin
31+
Bing Ads SDK Team
32+
John Schroeder
33+
Recommenders contributors
34+
Florian Berger
35+
pymox maintainers
36+
Sandeep Kumar
37+
Goffi (Jérôme Poisson)
38+
Yuxuan Dong
39+
Marc-Alexandre Côté
40+
pltrdy
41+
Gregory P. Smith
42+
piePaul
43+
Natalia Maximo
44+
Jon Wayne Parrott
45+
The Brotli Authors
46+
Microsoft Corporation License-Expression: Apache-2.0
47+
Mike Bayer
48+
David Herberth
49+
Google Inc.
50+
Darcy Mason and contributors
51+
Rüdiger Voigt
52+
Forensic artifacts
53+
Charles Marsh
54+
rego-cpp Team
55+
Lukas Schwab
56+
Google Quantum AI open-source maintainers
57+
Bling
58+
Stepan Perlov
59+
Eric Bridgeford
60+
The gRPC Authors
61+
hm-distro
62+
Capirca Team
63+
Microsoft Research AI Compilers Team
64+
PyWhy contributors
65+
Andy Casey
66+
The qsim/qsimh Developers
67+
Samuel Stauffer
68+
Tomi Pajunen
69+
Google
70+
OpenCensus Authors
71+
Charles Reese
72+
Jarek Potiuk, Szymon Przedwojski, Kamil Breguła, Feng Lu, Cameron Moberg
73+
Amazon Web Service
74+
Radovan Bast
75+
Mitch Garnaat
76+
Brian Quinlan
77+
The OpenFermion FQE Developers
78+
Project AIM
79+
Benjamin S. Meyers
80+
Kivanc Yuksel
81+
Patrick Costello
82+
PiCloud, Inc.
83+
Joseph DiLallo
84+
David Berthelot
85+
Holger Krekel, Bruno Oliveira, Ronny Pfannschmidt, Floris Bruynooghe, Brianna Laugher, Florian Bruhin, Others (See AUTHORS)
86+
Jonghak Choi
87+
Ludovico Magnocavallo
88+
Google LLC
89+
Dale Myers
90+
Tink Developers
91+
Amazon Web Services
92+
The Kubeflow Authors
93+
Xuan Ma
94+
Tim Swast, Google Inc.
95+
Prem Nair
96+
Wijnand Modderman-Lenstra
97+
The PyBigQuery Authors
98+
王振华(Zhenhua WANG)
99+
PyCQA
100+
Pedro Rodriguez
101+
Elizabeth Golden
102+
Stefan-Code
103+
Idin
104+
Michael Foord
105+
Matteo Cypriani
106+
suryapoojarypy
107+
Ian Hellen
108+
Microsoft Corporation
109+
MIT Lincoln Laboratory
110+
Dax Pryce
111+
Eider Moore
112+
James Gardner
113+
Jose Roberts
114+
JAX team
115+
Thea Flowers
116+
ScenePic Team
117+
Rodrigo Moraes
118+
The OpenFermion Developers
119+
Google, Inc.

src/macaron/slsa_analyzer/build_tool/gradle.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains the Gradle class which inherits BaseBuildTool.
@@ -122,7 +122,7 @@ def get_dep_analyzer(self) -> CycloneDxGradle:
122122
raise DependencyAnalyzerError("No default dependency analyzer is found.")
123123
if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_gradle")):
124124
raise DependencyAnalyzerError(
125-
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
125+
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
126126
)
127127

128128
tool_name, tool_version = tuple(

src/macaron/slsa_analyzer/build_tool/maven.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains the Maven class which inherits BaseBuildTool.
@@ -116,7 +116,7 @@ def get_dep_analyzer(self) -> CycloneDxMaven:
116116
raise DependencyAnalyzerError("No default dependency analyzer is found.")
117117
if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_maven")):
118118
raise DependencyAnalyzerError(
119-
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_maven')} is not valid.",
119+
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_maven')} is not valid.",
120120
)
121121

122122
tool_name, tool_version = tuple(

src/macaron/slsa_analyzer/build_tool/pip.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
33

44
"""This module contains the Pip class which inherits BaseBuildTool.
@@ -88,7 +88,7 @@ def get_dep_analyzer(self) -> DependencyAnalyzer:
8888
tool_name = "cyclonedx_py"
8989
if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"):
9090
raise DependencyAnalyzerError(
91-
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
91+
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
9292
)
9393
return CycloneDxPython(
9494
resources_path=global_config.resources_path,

0 commit comments

Comments
 (0)