Skip to content

Commit cfdd0e6

Browse files
committed
feat: add batched PURL endpoint calls and conditional license fetching
- Add --max-purl-batch-size flag (default: 5000, range: 1-9999) to control batch size for license detail API calls
- Skip PURL endpoint entirely when --generate-license is not set, improving performance for scans that don't need license attribution/details
- Implement batching in get_license_text_via_purl() to process packages in configurable chunks, preventing API overload on large repos
- Add validation for max_purl_batch_size parameter with clear error messages
- Remove the check_full_scans_status() method together with its call site in create_new_diff() (cleanup)

This change optimizes license data retrieval by:
1. Only calling PURL endpoint when license output is actually needed
2. Processing packages in manageable batches to avoid timeouts/limits
3. Providing tunable batch sizes for different repo sizes
1 parent 40530ce commit cfdd0e6

File tree

4 files changed

+70
-74
lines changed

4 files changed

+70
-74
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
66

77
[project]
88
name = "socketsecurity"
9-
version = "2.2.51"
9+
version = "2.2.52"
1010
requires-python = ">= 3.10"
1111
license = {"file" = "LICENSE"}
1212
dependencies = [

socketsecurity/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
__author__ = 'socket.dev'
2-
__version__ = '2.2.51'
2+
__version__ = '2.2.52'
33
USER_AGENT = f'SocketPythonCLI/{__version__}'

socketsecurity/config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class CliConfig:
7878
reach_additional_params: Optional[List[str]] = None
7979
only_facts_file: bool = False
8080
reach_use_only_pregenerated_sboms: bool = False
81+
max_purl_batch_size: int = 5000
8182

8283
@classmethod
8384
def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
@@ -141,6 +142,7 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
141142
'reach_additional_params': args.reach_additional_params,
142143
'only_facts_file': args.only_facts_file,
143144
'reach_use_only_pregenerated_sboms': args.reach_use_only_pregenerated_sboms,
145+
'max_purl_batch_size': args.max_purl_batch_size,
144146
'version': __version__
145147
}
146148
try:
@@ -187,6 +189,11 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
187189
logging.error("--reach-concurrency must be >= 1")
188190
exit(1)
189191

192+
# Validate max_purl_batch_size is within allowed range
193+
if args.max_purl_batch_size < 1 or args.max_purl_batch_size > 9999:
194+
logging.error("--max-purl-batch-size must be between 1 and 9999")
195+
exit(1)
196+
190197
return cls(**config_args)
191198

192199
def to_dict(self) -> dict:
@@ -446,6 +453,13 @@ def create_argument_parser() -> argparse.ArgumentParser:
446453
action="store_true",
447454
help="Exclude license details from the diff report (boosts performance for large repos)"
448455
)
456+
output_group.add_argument(
457+
"--max-purl-batch-size",
458+
dest="max_purl_batch_size",
459+
type=int,
460+
default=5000,
461+
help="Maximum batch size for PURL endpoint calls when generating license info (default: 5000, min: 1, max: 9999)"
462+
)
449463

450464
output_group.add_argument(
451465
"--disable-security-issue",

socketsecurity/core/__init__.py

Lines changed: 54 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -659,54 +659,6 @@ def create_full_scan_with_report_url(
659659
# Return result in the format expected by the user
660660
return diff
661661

662-
def check_full_scans_status(self, head_full_scan_id: str, new_full_scan_id: str) -> bool:
663-
is_ready = False
664-
current_timeout = self.config.timeout
665-
self.sdk.set_timeout(0.5)
666-
try:
667-
self.sdk.fullscans.stream(self.config.org_slug, head_full_scan_id)
668-
except Exception:
669-
log.debug(f"Queued up full scan for processing ({head_full_scan_id})")
670-
671-
try:
672-
self.sdk.fullscans.stream(self.config.org_slug, new_full_scan_id)
673-
except Exception:
674-
log.debug(f"Queued up full scan for processing ({new_full_scan_id})")
675-
self.sdk.set_timeout(current_timeout)
676-
start_check = time.time()
677-
head_is_ready = False
678-
new_is_ready = False
679-
while not is_ready:
680-
head_full_scan_metadata = self.sdk.fullscans.metadata(self.config.org_slug, head_full_scan_id)
681-
if head_full_scan_metadata:
682-
head_state = head_full_scan_metadata.get("scan_state")
683-
else:
684-
head_state = None
685-
new_full_scan_metadata = self.sdk.fullscans.metadata(self.config.org_slug, new_full_scan_id)
686-
if new_full_scan_metadata:
687-
new_state = new_full_scan_metadata.get("scan_state")
688-
else:
689-
new_state = None
690-
if head_state and head_state == "resolve":
691-
head_is_ready = True
692-
if new_state and new_state == "resolve":
693-
new_is_ready = True
694-
if head_is_ready and new_is_ready:
695-
is_ready = True
696-
current_time = time.time()
697-
if current_time - start_check >= self.config.timeout:
698-
log.debug(
699-
f"Timeout reached while waiting for full scans to be ready "
700-
f"({head_full_scan_id}, {new_full_scan_id})"
701-
)
702-
break
703-
total_time = time.time() - start_check
704-
if is_ready:
705-
log.info(f"Full scans are ready in {total_time:.2f} seconds")
706-
else:
707-
log.warning(f"Full scans are not ready yet ({head_full_scan_id}, {new_full_scan_id})")
708-
return is_ready
709-
710662
def get_full_scan(self, full_scan_id: str) -> FullScan:
711663
"""
712664
Get a FullScan object for an existing full scan including sbom_artifacts and packages.
@@ -846,28 +798,54 @@ def update_package_values(pkg: Package) -> Package:
846798
pkg.url += f"/{pkg.name}/overview/{pkg.version}"
847799
return pkg
848800

849-
def get_license_text_via_purl(self, packages: dict[str, Package]) -> dict:
850-
components = []
801+
def get_license_text_via_purl(self, packages: dict[str, Package], batch_size: int = 5000) -> dict:
802+
"""Get license attribution and details via PURL endpoint in batches.
803+
804+
Args:
805+
packages: Dictionary of packages to get license info for
806+
batch_size: Maximum number of packages to process per API call (1-9999)
807+
808+
Returns:
809+
Updated packages dictionary with licenseAttrib and licenseDetails populated
810+
"""
811+
# Validate batch size
812+
batch_size = max(1, min(9999, batch_size))
813+
814+
# Build list of all components
815+
all_components = []
851816
for purl in packages:
852817
full_purl = f"pkg:/{purl}"
853-
components.append({"purl": full_purl})
854-
results = self.sdk.purl.post(
855-
license=True,
856-
components=components,
857-
licenseattrib=True,
858-
licensedetails=True
859-
)
860-
purl_packages = []
861-
for result in results:
862-
ecosystem = result["type"]
863-
name = result["name"]
864-
package_version = result["version"]
865-
licenseDetails = result.get("licenseDetails")
866-
licenseAttrib = result.get("licenseAttrib")
867-
purl = f"{ecosystem}/{name}@{package_version}"
868-
if purl not in purl_packages and purl in packages:
869-
packages[purl].licenseAttrib = licenseAttrib
870-
packages[purl].licenseDetails = licenseDetails
818+
all_components.append({"purl": full_purl})
819+
820+
# Process in batches
821+
total_components = len(all_components)
822+
log.debug(f"Processing {total_components} packages in batches of {batch_size}")
823+
824+
for i in range(0, total_components, batch_size):
825+
batch_components = all_components[i:i + batch_size]
826+
batch_num = (i // batch_size) + 1
827+
total_batches = (total_components + batch_size - 1) // batch_size
828+
log.debug(f"Processing batch {batch_num}/{total_batches} ({len(batch_components)} packages)")
829+
830+
results = self.sdk.purl.post(
831+
license=True,
832+
components=batch_components,
833+
licenseattrib=True,
834+
licensedetails=True
835+
)
836+
837+
purl_packages = []
838+
for result in results:
839+
ecosystem = result["type"]
840+
name = result["name"]
841+
package_version = result["version"]
842+
licenseDetails = result.get("licenseDetails")
843+
licenseAttrib = result.get("licenseAttrib")
844+
purl = f"{ecosystem}/{name}@{package_version}"
845+
if purl not in purl_packages and purl in packages:
846+
packages[purl].licenseAttrib = licenseAttrib
847+
packages[purl].licenseDetails = licenseDetails
848+
871849
return packages
872850

873851
def get_added_and_removed_packages(
@@ -960,7 +938,14 @@ def get_added_and_removed_packages(
960938
log.error(f"Artifact details - name: {artifact.name}, version: {artifact.version}")
961939
log.error("No matching packages found in head_full_scan")
962940

963-
packages = self.get_license_text_via_purl(packages)
941+
# Only fetch license details if generate_license is enabled
942+
if self.cli_config and self.cli_config.generate_license:
943+
log.debug("Fetching license details via PURL endpoint")
944+
batch_size = self.cli_config.max_purl_batch_size if self.cli_config else 5000
945+
packages = self.get_license_text_via_purl(packages, batch_size=batch_size)
946+
else:
947+
log.debug("Skipping PURL endpoint call (--generate-license not set)")
948+
964949
return added_packages, removed_packages, packages
965950

966951
def create_new_diff(
@@ -1092,9 +1077,6 @@ def create_new_diff(
10921077
log.warning(f"Failed to clean up temporary file {temp_file}: {e}")
10931078

10941079
# Handle diff generation - now we always have both scans
1095-
scans_ready = self.check_full_scans_status(head_full_scan_id, new_full_scan.id)
1096-
if scans_ready is False:
1097-
log.error(f"Full scans did not complete within {self.config.timeout} seconds")
10981080
(
10991081
added_packages,
11001082
removed_packages,

0 commit comments

Comments
 (0)