Skip to content

Commit 7329f50

Browse files
authored
feat: add batched PURL endpoint calls and conditional license fetching (#140)
* feat: add batched PURL endpoint calls and conditional license fetching

  - Add --max-purl-batch-size flag (default: 5000, range: 1-9999) to control batch size for license detail API calls
  - Skip PURL endpoint entirely when --generate-license is not set, improving performance for scans that don't need license attribution/details
  - Implement batching in get_license_text_via_purl() to process packages in configurable chunks, preventing API overload on large repos
  - Add validation for max_purl_batch_size parameter with clear error messages
  - Remove unused check_full_scans_status() method (dead code cleanup)

  This change optimizes license data retrieval by:
  1. Only calling PURL endpoint when license output is actually needed
  2. Processing packages in manageable batches to avoid timeouts/limits
  3. Providing tunable batch sizes for different repo sizes

* Fix --ignore-commit-files so that it works properly again
* Properly pass the enable-diff param through to the main module
* Add NPM CLI to Dockerfile
1 parent 40530ce commit 7329f50

File tree

6 files changed

+99
-89
lines changed

6 files changed

+99
-89
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ RUN if [ "$DOTNET_VERSION" = "6" ]; then \
5757
fi
5858

5959
# Install additional tools
60-
RUN npm install @coana-tech/cli -g && \
60+
RUN npm install @coana-tech/cli socket -g && \
6161
gem install bundler && \
6262
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
6363
. ~/.cargo/env && \

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
66

77
[project]
88
name = "socketsecurity"
9-
version = "2.2.51"
9+
version = "2.2.55"
1010
requires-python = ">= 3.10"
1111
license = {"file" = "LICENSE"}
1212
dependencies = [

socketsecurity/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
__author__ = 'socket.dev'
2-
__version__ = '2.2.51'
2+
__version__ = '2.2.55'
33
USER_AGENT = f'SocketPythonCLI/{__version__}'

socketsecurity/config.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class CliConfig:
7878
reach_additional_params: Optional[List[str]] = None
7979
only_facts_file: bool = False
8080
reach_use_only_pregenerated_sboms: bool = False
81+
max_purl_batch_size: int = 5000
8182

8283
@classmethod
8384
def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
@@ -106,6 +107,7 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
106107
'commit_sha': args.commit_sha,
107108
'generate_license': args.generate_license,
108109
'enable_debug': args.enable_debug,
110+
'enable_diff': args.enable_diff,
109111
'allow_unverified': args.allow_unverified,
110112
'enable_json': args.enable_json,
111113
'enable_sarif': args.enable_sarif,
@@ -141,6 +143,7 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
141143
'reach_additional_params': args.reach_additional_params,
142144
'only_facts_file': args.only_facts_file,
143145
'reach_use_only_pregenerated_sboms': args.reach_use_only_pregenerated_sboms,
146+
'max_purl_batch_size': args.max_purl_batch_size,
144147
'version': __version__
145148
}
146149
try:
@@ -187,6 +190,11 @@ def from_args(cls, args_list: Optional[List[str]] = None) -> 'CliConfig':
187190
logging.error("--reach-concurrency must be >= 1")
188191
exit(1)
189192

193+
# Validate max_purl_batch_size is within allowed range
194+
if args.max_purl_batch_size < 1 or args.max_purl_batch_size > 9999:
195+
logging.error("--max-purl-batch-size must be between 1 and 9999")
196+
exit(1)
197+
190198
return cls(**config_args)
191199

192200
def to_dict(self) -> dict:
@@ -446,6 +454,13 @@ def create_argument_parser() -> argparse.ArgumentParser:
446454
action="store_true",
447455
help="Exclude license details from the diff report (boosts performance for large repos)"
448456
)
457+
output_group.add_argument(
458+
"--max-purl-batch-size",
459+
dest="max_purl_batch_size",
460+
type=int,
461+
default=5000,
462+
help="Maximum batch size for PURL endpoint calls when generating license info (default: 5000, min: 1, max: 9999)"
463+
)
449464

450465
output_group.add_argument(
451466
"--disable-security-issue",

socketsecurity/core/__init__.py

Lines changed: 54 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -659,54 +659,6 @@ def create_full_scan_with_report_url(
659659
# Return result in the format expected by the user
660660
return diff
661661

662-
def check_full_scans_status(self, head_full_scan_id: str, new_full_scan_id: str) -> bool:
663-
is_ready = False
664-
current_timeout = self.config.timeout
665-
self.sdk.set_timeout(0.5)
666-
try:
667-
self.sdk.fullscans.stream(self.config.org_slug, head_full_scan_id)
668-
except Exception:
669-
log.debug(f"Queued up full scan for processing ({head_full_scan_id})")
670-
671-
try:
672-
self.sdk.fullscans.stream(self.config.org_slug, new_full_scan_id)
673-
except Exception:
674-
log.debug(f"Queued up full scan for processing ({new_full_scan_id})")
675-
self.sdk.set_timeout(current_timeout)
676-
start_check = time.time()
677-
head_is_ready = False
678-
new_is_ready = False
679-
while not is_ready:
680-
head_full_scan_metadata = self.sdk.fullscans.metadata(self.config.org_slug, head_full_scan_id)
681-
if head_full_scan_metadata:
682-
head_state = head_full_scan_metadata.get("scan_state")
683-
else:
684-
head_state = None
685-
new_full_scan_metadata = self.sdk.fullscans.metadata(self.config.org_slug, new_full_scan_id)
686-
if new_full_scan_metadata:
687-
new_state = new_full_scan_metadata.get("scan_state")
688-
else:
689-
new_state = None
690-
if head_state and head_state == "resolve":
691-
head_is_ready = True
692-
if new_state and new_state == "resolve":
693-
new_is_ready = True
694-
if head_is_ready and new_is_ready:
695-
is_ready = True
696-
current_time = time.time()
697-
if current_time - start_check >= self.config.timeout:
698-
log.debug(
699-
f"Timeout reached while waiting for full scans to be ready "
700-
f"({head_full_scan_id}, {new_full_scan_id})"
701-
)
702-
break
703-
total_time = time.time() - start_check
704-
if is_ready:
705-
log.info(f"Full scans are ready in {total_time:.2f} seconds")
706-
else:
707-
log.warning(f"Full scans are not ready yet ({head_full_scan_id}, {new_full_scan_id})")
708-
return is_ready
709-
710662
def get_full_scan(self, full_scan_id: str) -> FullScan:
711663
"""
712664
Get a FullScan object for an existing full scan including sbom_artifacts and packages.
@@ -846,28 +798,54 @@ def update_package_values(pkg: Package) -> Package:
846798
pkg.url += f"/{pkg.name}/overview/{pkg.version}"
847799
return pkg
848800

849-
def get_license_text_via_purl(self, packages: dict[str, Package]) -> dict:
850-
components = []
801+
def get_license_text_via_purl(self, packages: dict[str, Package], batch_size: int = 5000) -> dict:
802+
"""Get license attribution and details via PURL endpoint in batches.
803+
804+
Args:
805+
packages: Dictionary of packages to get license info for
806+
batch_size: Maximum number of packages to process per API call (1-9999)
807+
808+
Returns:
809+
Updated packages dictionary with licenseAttrib and licenseDetails populated
810+
"""
811+
# Validate batch size
812+
batch_size = max(1, min(9999, batch_size))
813+
814+
# Build list of all components
815+
all_components = []
851816
for purl in packages:
852817
full_purl = f"pkg:/{purl}"
853-
components.append({"purl": full_purl})
854-
results = self.sdk.purl.post(
855-
license=True,
856-
components=components,
857-
licenseattrib=True,
858-
licensedetails=True
859-
)
860-
purl_packages = []
861-
for result in results:
862-
ecosystem = result["type"]
863-
name = result["name"]
864-
package_version = result["version"]
865-
licenseDetails = result.get("licenseDetails")
866-
licenseAttrib = result.get("licenseAttrib")
867-
purl = f"{ecosystem}/{name}@{package_version}"
868-
if purl not in purl_packages and purl in packages:
869-
packages[purl].licenseAttrib = licenseAttrib
870-
packages[purl].licenseDetails = licenseDetails
818+
all_components.append({"purl": full_purl})
819+
820+
# Process in batches
821+
total_components = len(all_components)
822+
log.debug(f"Processing {total_components} packages in batches of {batch_size}")
823+
824+
for i in range(0, total_components, batch_size):
825+
batch_components = all_components[i:i + batch_size]
826+
batch_num = (i // batch_size) + 1
827+
total_batches = (total_components + batch_size - 1) // batch_size
828+
log.debug(f"Processing batch {batch_num}/{total_batches} ({len(batch_components)} packages)")
829+
830+
results = self.sdk.purl.post(
831+
license=True,
832+
components=batch_components,
833+
licenseattrib=True,
834+
licensedetails=True
835+
)
836+
837+
purl_packages = []
838+
for result in results:
839+
ecosystem = result["type"]
840+
name = result["name"]
841+
package_version = result["version"]
842+
licenseDetails = result.get("licenseDetails")
843+
licenseAttrib = result.get("licenseAttrib")
844+
purl = f"{ecosystem}/{name}@{package_version}"
845+
if purl not in purl_packages and purl in packages:
846+
packages[purl].licenseAttrib = licenseAttrib
847+
packages[purl].licenseDetails = licenseDetails
848+
871849
return packages
872850

873851
def get_added_and_removed_packages(
@@ -960,7 +938,14 @@ def get_added_and_removed_packages(
960938
log.error(f"Artifact details - name: {artifact.name}, version: {artifact.version}")
961939
log.error("No matching packages found in head_full_scan")
962940

963-
packages = self.get_license_text_via_purl(packages)
941+
# Only fetch license details if generate_license is enabled
942+
if self.cli_config and self.cli_config.generate_license:
943+
log.debug("Fetching license details via PURL endpoint")
944+
batch_size = self.cli_config.max_purl_batch_size if self.cli_config else 5000
945+
packages = self.get_license_text_via_purl(packages, batch_size=batch_size)
946+
else:
947+
log.debug("Skipping PURL endpoint call (--generate-license not set)")
948+
964949
return added_packages, removed_packages, packages
965950

966951
def create_new_diff(
@@ -1092,9 +1077,6 @@ def create_new_diff(
10921077
log.warning(f"Failed to clean up temporary file {temp_file}: {e}")
10931078

10941079
# Handle diff generation - now we always have both scans
1095-
scans_ready = self.check_full_scans_status(head_full_scan_id, new_full_scan.id)
1096-
if scans_ready is False:
1097-
log.error(f"Full scans did not complete within {self.config.timeout} seconds")
10981080
(
10991081
added_packages,
11001082
removed_packages,

socketsecurity/socketcli.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ def main_code():
356356
# Determine files to check based on the new logic
357357
files_to_check = []
358358
force_api_mode = False
359+
force_diff_mode = False
359360

360361
if files_explicitly_specified:
361362
# Case 2: Files are specified - use them and don't check commit details
@@ -365,10 +366,21 @@ def main_code():
365366
# Case 1: Files not specified and --ignore-commit-files not set - try to find changed files from commit
366367
files_to_check = git_repo.changed_files
367368
log.debug(f"Using changed files from commit: {files_to_check}")
369+
elif config.ignore_commit_files and is_repo:
370+
# Case 3: Git repo with --ignore-commit-files - force diff mode
371+
files_to_check = []
372+
force_diff_mode = True
373+
log.debug("Git repo with --ignore-commit-files: forcing diff mode")
368374
else:
369-
# ignore_commit_files is set or not a repo - scan everything but force API mode if no supported files
375+
# Case 4: Not a git repo (ignore_commit_files was auto-set to True)
370376
files_to_check = []
371-
log.debug("No files to check from commit (ignore_commit_files=True or not a repo)")
377+
# If --enable-diff is set, force diff mode for non-git repos
378+
log.debug(f"Case 4: Non-git repo - config.enable_diff={config.enable_diff}, type={type(config.enable_diff)}")
379+
if config.enable_diff:
380+
force_diff_mode = True
381+
log.debug("Non-git repo with --enable-diff: forcing diff mode")
382+
else:
383+
log.debug("Non-git repo without --enable-diff: will use full scan mode")
372384

373385
# Check if we have supported manifest files
374386
has_supported_files = files_to_check and core.has_manifest_files(files_to_check)
@@ -389,22 +401,21 @@ def main_code():
389401
has_supported_files = False
390402

391403
# Case 3: If no supported files or files are empty, force API mode (no PR comments)
392-
if not has_supported_files:
404+
# BUT: Don't force API mode if we're in force_diff_mode
405+
log.debug(f"files_to_check={files_to_check}, has_supported_files={has_supported_files}, force_diff_mode={force_diff_mode}, config.enable_diff={config.enable_diff}")
406+
if not has_supported_files and not force_diff_mode:
393407
force_api_mode = True
394408
log.debug("No supported manifest files found, forcing API mode")
409+
log.debug(f"force_api_mode={force_api_mode}")
395410

396411
# Determine scan behavior
397412
should_skip_scan = False # Always perform scan, but behavior changes based on supported files
398-
if config.ignore_commit_files and not files_explicitly_specified:
399-
# Force full scan when ignoring commit files and no explicit files
400-
should_skip_scan = False
401-
log.debug("Forcing full scan due to ignore_commit_files")
402-
elif not has_supported_files:
403-
# No supported files - still scan but in API mode
413+
if not has_supported_files and not force_diff_mode:
414+
# No supported files and not forcing diff - still scan but in API mode
404415
should_skip_scan = False
405416
log.debug("No supported files but will scan in API mode")
406417
else:
407-
log.debug("Found supported manifest files, proceeding with normal scan")
418+
log.debug("Found supported manifest files or forcing diff mode, proceeding with normal scan")
408419

409420
org_slug = core.config.org_slug
410421
if config.repo_is_public:
@@ -457,6 +468,7 @@ def main_code():
457468
diff.report_url = ""
458469

459470
# Handle SCM-specific flows
471+
log.debug(f"Flow decision: scm={scm is not None}, force_diff_mode={force_diff_mode}, force_api_mode={force_api_mode}, enable_diff={config.enable_diff}")
460472
if scm is not None and scm.check_event_type() == "comment":
461473
# FIXME: This entire flow should be a separate command called "filter_ignored_alerts_in_comments"
462474
# It's not related to scanning or diff generation - it just:
@@ -531,14 +543,15 @@ def main_code():
531543

532544
output_handler.handle_output(diff)
533545

534-
elif config.enable_diff and not force_api_mode:
535-
# New logic: --enable-diff forces diff mode even with --integration api (no SCM)
546+
elif (config.enable_diff or force_diff_mode) and not force_api_mode:
547+
# New logic: --enable-diff or force_diff_mode (from --ignore-commit-files in git repos) forces diff mode
536548
log.info("Diff mode enabled without SCM integration")
537549
diff = core.create_new_diff(scan_paths, params, no_change=should_skip_scan, save_files_list_path=config.save_submitted_files_list, save_manifest_tar_path=config.save_manifest_tar, base_paths=base_paths, explicit_files=sbom_files_to_submit)
538550
output_handler.handle_output(diff)
539551

540-
elif config.enable_diff and force_api_mode:
541-
# User requested diff mode but no manifest files were detected
552+
elif (config.enable_diff or force_diff_mode) and force_api_mode:
553+
# User requested diff mode but no manifest files were detected - this should not happen with new logic
554+
# but keeping as a safety net
542555
log.warning("--enable-diff was specified but no supported manifest files were detected in the changed files. Falling back to full scan mode.")
543556
log.info("Creating Socket Report (full scan)")
544557
serializable_params = {

0 commit comments

Comments
 (0)