Skip to content

Commit 4c50f11

Browse files
committed
Merge remote-tracking branch 'origin/main' into barslev/streaming-logs
# Conflicts: # CHANGELOG.md
2 parents 3b154de + 1af7934 commit 4c50f11

5 files changed

Lines changed: 350 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,23 @@
88
- New `--no-upload-logs` flag (mutually exclusive with `--upload-logs`) explicitly opts the run out of uploading logs, even when an org-level override would otherwise enable it. Use this when you need a guaranteed no-upload guarantee (e.g. legal/consent reasons).
99
- The Socket backend can also force-enable streaming for specific orgs in the absence of an explicit opt-out. The feature is best-effort — registration or upload failures silently degrade and never block the scan.
1010

11+
### Fixed: retry transient full-scan upload failures
12+
13+
- The full-scan upload (`POST /orgs/<org>/full-scans`) now retries transient
14+
gateway/connection failures — HTTP 502/503/504/408, dropped or reset connections, and
15+
request timeouts — up to 3 total attempts with increasing waits (~10s, then ~30s, plus
16+
jitter). Such failures are intermittent and a retried upload almost always succeeds.
17+
In these failure modes the server never finished reading the request body, so no scan
18+
was created and a retry does not duplicate one; in the rare case where a gateway
19+
timeout races a request the server later completes, the extra scan is benign and
20+
superseded by the retried one (as if the CLI had run twice).
21+
Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each
22+
retry logs a warning explaining what failed and when the next attempt happens.
23+
- Requires `socketdev>=3.3.0`: the SDK now records the HTTP status code on the exceptions
24+
it raises and owns the transient-vs-deterministic classification
25+
(`APIFailure.is_transient_error()`), so the CLI no longer parses status codes out of
26+
exception message text.
27+
1128
## 2.4.7
1229

1330
### Changed: pin @coana-tech/cli version; auto-update is now opt-in

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ dependencies = [
1616
'GitPython',
1717
'packaging',
1818
'python-dotenv',
19-
"socketdev>=3.2.1,<4.0.0",
19+
"socketdev>=3.3.0,<4.0.0",
2020
"bs4>=0.0.2",
2121
"markdown>=3.10",
2222
"brotli>=1.0.9; platform_python_implementation == 'CPython'",

socketsecurity/core/__init__.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
import os
3+
import random
34
import re
45
import sys
56
import tarfile
@@ -76,6 +77,21 @@
7677
TIER1_FINALIZE_MAX_ATTEMPTS = 3
7778
TIER1_FINALIZE_BACKOFF_SECONDS = 1.0
7879

80+
# Full scan upload retry policy. An upload can fail transiently at the gateway/connection
81+
# level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout)
82+
# without the server having created the scan; the SDK classifies those failures via
83+
# APIFailure.is_transient_error() (socketdev>=3.3.0). In these failure modes no scan was
84+
# created, so a retry does not duplicate one. (A duplicate is possible only if a gateway
85+
# timeout races a request the server later completes; that is benign - the retried scan
86+
# supersedes the orphaned one, same as running the CLI twice.)
87+
#
88+
# Each schedule entry is the wait before the next attempt once the current one fails (plus
89+
# a little jitter so a fleet of CI jobs hitting the same failure doesn't retry in
90+
# lock-step); the final None means the last attempt's failure is re-raised, not retried.
91+
FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None)
92+
FULL_SCAN_UPLOAD_MAX_ATTEMPTS = len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)
93+
FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0
94+
7995

8096
def _humanize_alert_type(alert_type: str) -> str:
8197
"""Convert a camelCase/PascalCase alert type into a Title-Cased label.
@@ -787,7 +803,33 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths:
787803
# facts file under the per-file upload size cap. See _compress_facts_files_for_upload.
788804
upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files)
789805
try:
790-
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
806+
# Retry transient gateway/timeout failures (502/503/504/408, dropped connections,
807+
# timeouts) with increasing waits. In these failure modes the server never finished
808+
# reading the request body, so no scan was created and a retry does not duplicate
809+
# one (see the retry-policy comment above FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS).
810+
# fullscans.post() rebuilds its lazy file loaders from the plain paths in
811+
# upload_files on every call, so simply calling it again per attempt is safe. The
812+
# loop must stay inside this try so the temp .br files (cleaned up in the finally
813+
# below) outlive every attempt.
814+
for attempt, backoff_seconds in enumerate(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS, start=1):
815+
try:
816+
res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths)
817+
break
818+
except APIFailure as error:
819+
if backoff_seconds is None or not error.is_transient_error():
820+
raise
821+
wait_seconds = backoff_seconds + random.uniform(
822+
0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS
823+
)
824+
# SDK error messages can span many lines (path + response headers); the
825+
# first line carries the status, which is all the warning needs.
826+
error_summary = str(error).strip().splitlines()[0] if str(error).strip() else ""
827+
log.warning(
828+
f"Full scan upload failed with {type(error).__name__}({error_summary}), "
829+
f"retrying in {wait_seconds:.0f}s "
830+
f"(attempt {attempt + 1}/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})"
831+
)
832+
time.sleep(wait_seconds)
791833
finally:
792834
for temp_file in compressed_temp_files:
793835
try:

tests/unit/test_full_scan_retry.py

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
"""Tests for the full-scan upload retry on transient gateway/connection failures.
2+
3+
A `POST /orgs/<org>/full-scans` upload can fail transiently (an HTTP 502/503/504/408, a
4+
dropped or reset connection, or a timeout) without the server having created the scan.
5+
`Core.create_full_scan` retries the failures the SDK classifies as transient
6+
(`APIFailure.is_transient_error()`, socketdev>=3.3.0); these tests cover the retry
7+
decision, the loop bounds, and that the temporary brotli-compressed facts files survive
8+
until every attempt has finished.
9+
"""
10+
11+
import logging
12+
from unittest.mock import MagicMock
13+
14+
import pytest
15+
from socketdev.exceptions import (
16+
APIAccessDenied,
17+
APIBadGateway,
18+
APIConnectionError,
19+
APIFailure,
20+
APIResourceNotFound,
21+
APITimeout,
22+
)
23+
from socketdev.fullscans import FullScanMetadata
24+
25+
from socketsecurity.core import (
26+
FULL_SCAN_UPLOAD_MAX_ATTEMPTS,
27+
SOCKET_FACTS_BROTLI_FILENAME,
28+
SOCKET_FACTS_FILENAME,
29+
Core,
30+
)
31+
32+
33+
def _success_response():
34+
metadata = FullScanMetadata(
35+
id="scan-1",
36+
created_at="2026-01-01T00:00:00Z",
37+
updated_at="2026-01-01T00:00:00Z",
38+
organization_id="org-1",
39+
repository_id="repo-1",
40+
branch="main",
41+
html_report_url="https://socket.dev/report",
42+
)
43+
response = MagicMock()
44+
response.success = True
45+
response.data = metadata
46+
return response
47+
48+
49+
# Catch-all APIFailure as the SDK raises it for statuses without a dedicated class
50+
# (socketdev/core/api.py); the recorded status_code drives is_transient_error().
51+
def _catch_all_failure(status_code: int) -> APIFailure:
52+
return APIFailure(
53+
f"Bad Request: HTTP original_status_code:{status_code}\n"
54+
f"Path: https://api.socket.dev/v0/orgs/org/full-scans\n\n"
55+
f"Headers:\ncf-ray: abc123",
56+
status_code=status_code,
57+
)
58+
59+
60+
@pytest.fixture
61+
def core_with_mock_sdk():
62+
# Build a Core without running org setup; we only exercise create_full_scan.
63+
core = Core.__new__(Core)
64+
core.sdk = MagicMock()
65+
core.cli_config = None # skip the tier1 finalize branch
66+
return core
67+
68+
69+
@pytest.fixture(autouse=True)
70+
def no_sleep(mocker):
71+
return mocker.patch("socketsecurity.core.time.sleep")
72+
73+
74+
def test_upload_succeeds_first_try(core_with_mock_sdk, tmp_path, no_sleep):
75+
manifest = tmp_path / "package.json"
76+
manifest.write_text("{}")
77+
core_with_mock_sdk.sdk.fullscans.post.return_value = _success_response()
78+
79+
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
80+
81+
assert full_scan.id == "scan-1"
82+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
83+
no_sleep.assert_not_called()
84+
85+
86+
def test_upload_retries_on_502_then_succeeds(
87+
core_with_mock_sdk, tmp_path, no_sleep, caplog
88+
):
89+
manifest = tmp_path / "package.json"
90+
manifest.write_text("{}")
91+
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
92+
APIBadGateway(),
93+
APIBadGateway(),
94+
_success_response(),
95+
]
96+
97+
with caplog.at_level(logging.WARNING, logger="socketdev"):
98+
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
99+
100+
assert full_scan.id == "scan-1"
101+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 3
102+
assert no_sleep.call_count == 2 # waits before attempts 2 and 3
103+
retry_warnings = [r for r in caplog.records if "retrying in" in r.message]
104+
assert len(retry_warnings) == 2
105+
assert "APIBadGateway" in retry_warnings[0].message
106+
assert f"(attempt 2/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" in retry_warnings[0].message
107+
108+
109+
def test_upload_raises_after_exhausting_attempts(
110+
core_with_mock_sdk, tmp_path, no_sleep
111+
):
112+
manifest = tmp_path / "package.json"
113+
manifest.write_text("{}")
114+
core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway()
115+
116+
with pytest.raises(APIBadGateway):
117+
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
118+
119+
assert (
120+
core_with_mock_sdk.sdk.fullscans.post.call_count
121+
== FULL_SCAN_UPLOAD_MAX_ATTEMPTS
122+
)
123+
124+
125+
@pytest.mark.parametrize("status_code", [408, 503, 504])
126+
def test_upload_retries_on_catch_all_transient_statuses(
127+
core_with_mock_sdk, tmp_path, no_sleep, status_code
128+
):
129+
manifest = tmp_path / "package.json"
130+
manifest.write_text("{}")
131+
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
132+
_catch_all_failure(status_code),
133+
_success_response(),
134+
]
135+
136+
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
137+
138+
assert full_scan.id == "scan-1"
139+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2
140+
141+
142+
@pytest.mark.parametrize("error_class", [APIConnectionError, APITimeout])
143+
def test_upload_retries_on_connection_level_errors(
144+
core_with_mock_sdk, tmp_path, no_sleep, error_class
145+
):
146+
manifest = tmp_path / "package.json"
147+
manifest.write_text("{}")
148+
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
149+
error_class(),
150+
_success_response(),
151+
]
152+
153+
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
154+
155+
assert full_scan.id == "scan-1"
156+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2
157+
158+
159+
def test_upload_does_not_retry_on_400(core_with_mock_sdk, tmp_path, no_sleep):
160+
manifest = tmp_path / "package.json"
161+
manifest.write_text("{}")
162+
core_with_mock_sdk.sdk.fullscans.post.side_effect = _catch_all_failure(400)
163+
164+
with pytest.raises(APIFailure):
165+
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
166+
167+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
168+
no_sleep.assert_not_called()
169+
170+
171+
@pytest.mark.parametrize(
172+
"error_class,status_code", [(APIAccessDenied, 401), (APIResourceNotFound, 404)]
173+
)
174+
def test_upload_does_not_retry_on_dedicated_4xx_classes(
175+
core_with_mock_sdk, tmp_path, no_sleep, error_class, status_code
176+
):
177+
manifest = tmp_path / "package.json"
178+
manifest.write_text("{}")
179+
core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class(
180+
status_code=status_code
181+
)
182+
183+
with pytest.raises(error_class):
184+
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
185+
186+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
187+
no_sleep.assert_not_called()
188+
189+
190+
def test_upload_does_not_retry_on_error_payload(core_with_mock_sdk, tmp_path, no_sleep):
191+
# A response that came back but reports failure (res.success False) is not transient.
192+
manifest = tmp_path / "package.json"
193+
manifest.write_text("{}")
194+
failed = MagicMock()
195+
failed.success = False
196+
failed.message = "tarball too large"
197+
failed.status = 200
198+
core_with_mock_sdk.sdk.fullscans.post.return_value = failed
199+
200+
with pytest.raises(Exception, match="tarball too large"):
201+
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
202+
203+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1
204+
no_sleep.assert_not_called()
205+
206+
207+
def test_temp_br_file_survives_retries_and_is_cleaned_after(
208+
core_with_mock_sdk, tmp_path, no_sleep
209+
):
210+
# The brotli-compressed facts sibling must stay on disk across every retry attempt
211+
# (the SDK re-reads it per call) and only be deleted once all attempts finished.
212+
facts = tmp_path / SOCKET_FACTS_FILENAME
213+
facts.write_text('{"components": []}')
214+
compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
215+
br_present_per_attempt = []
216+
217+
def post_side_effect(upload_files, *args, **kwargs):
218+
br_present_per_attempt.append(compressed.is_file())
219+
assert str(compressed) in upload_files
220+
if len(br_present_per_attempt) < 3:
221+
raise APIBadGateway()
222+
return _success_response()
223+
224+
core_with_mock_sdk.sdk.fullscans.post.side_effect = post_side_effect
225+
226+
full_scan = core_with_mock_sdk.create_full_scan([str(facts)], MagicMock())
227+
228+
assert full_scan.id == "scan-1"
229+
assert br_present_per_attempt == [True, True, True]
230+
assert not compressed.exists() # cleaned up after the attempts finished
231+
assert facts.is_file() # the original facts file is never touched
232+
233+
234+
def test_temp_br_file_cleaned_after_exhausted_retries(
235+
core_with_mock_sdk, tmp_path, no_sleep
236+
):
237+
facts = tmp_path / SOCKET_FACTS_FILENAME
238+
facts.write_text('{"components": []}')
239+
compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME
240+
core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway()
241+
242+
with pytest.raises(APIBadGateway):
243+
core_with_mock_sdk.create_full_scan([str(facts)], MagicMock())
244+
245+
assert (
246+
core_with_mock_sdk.sdk.fullscans.post.call_count
247+
== FULL_SCAN_UPLOAD_MAX_ATTEMPTS
248+
)
249+
assert not compressed.exists()
250+
251+
252+
class _StubFailure(APIFailure):
253+
"""An APIFailure whose transience is fixed, regardless of class or status code."""
254+
255+
def __init__(self, transient: bool):
256+
super().__init__("stub failure")
257+
self._transient = transient
258+
259+
def is_transient_error(self) -> bool:
260+
return self._transient
261+
262+
263+
@pytest.mark.parametrize("transient,expected_calls", [(True, 2), (False, 1)])
264+
def test_retry_decision_delegates_to_sdk_classification(
265+
core_with_mock_sdk, tmp_path, no_sleep, transient, expected_calls
266+
):
267+
# The CLI encodes no knowledge of the SDK's exception hierarchy or status codes:
268+
# the retry decision is exactly APIFailure.is_transient_error(). (The transient /
269+
# non-transient truth table itself is tested in the SDK, next to the code that
270+
# raises the exceptions.)
271+
manifest = tmp_path / "package.json"
272+
manifest.write_text("{}")
273+
core_with_mock_sdk.sdk.fullscans.post.side_effect = [
274+
_StubFailure(transient),
275+
_success_response(),
276+
]
277+
278+
if transient:
279+
full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
280+
assert full_scan.id == "scan-1"
281+
else:
282+
with pytest.raises(_StubFailure):
283+
core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock())
284+
285+
assert core_with_mock_sdk.sdk.fullscans.post.call_count == expected_calls

0 commit comments

Comments
 (0)