Commit f1f7c63

CM-45588 - Make batching more configurable and friendly in logs (#284)
1 parent 84b8e34 · commit f1f7c63

4 files changed: +76 -9 lines changed

Dockerfile (+1 -1)

@@ -6,7 +6,7 @@ FROM base AS builder
 ENV POETRY_VERSION=1.8.3

 # deps are required to build cffi
-RUN apk add --no-cache --virtual .build-deps gcc=14.2.0-r4 libffi-dev=3.4.6-r0 musl-dev=1.2.5-r9 && \
+RUN apk add --no-cache --virtual .build-deps gcc=14.2.0-r4 libffi-dev=3.4.7-r0 musl-dev=1.2.5-r9 && \
     pip install --no-cache-dir "poetry==$POETRY_VERSION" "poetry-dynamic-versioning[plugin]" && \
     apk del .build-deps gcc libffi-dev musl-dev

cycode/cli/commands/scan/code_scanner.py (+1 -1)

@@ -171,7 +171,7 @@ def _scan_batch_thread_func(batch: List[Document]) -> Tuple[str, CliError, Local
     should_use_sync_flow = _should_use_sync_flow(command_scan_type, scan_type, sync_option, scan_parameters)

     try:
-        logger.debug('Preparing local files, %s', {'batch_size': len(batch)})
+        logger.debug('Preparing local files, %s', {'batch_files_count': len(batch)})
         zipped_documents = zip_documents(scan_type, batch)
         zip_file_size = zipped_documents.size
         scan_result = perform_scan(
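
The renamed key makes the payload unambiguous: it logs a count of files, not bytes. Note the call's shape, lazy %-formatting with a dict argument, so the payload is only rendered into the message when DEBUG is enabled. A minimal sketch of the pattern, using the stdlib logging module rather than cycode's own logger; the logger name and batch contents are illustrative:

    # Lazy %-style logging with a dict payload, as in the diff above.
    # The dict is only formatted into the message if DEBUG is enabled.
    import logging

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('cycode.example')  # illustrative name

    batch = ['file_a.py', 'file_b.py']
    logger.debug('Preparing local files, %s', {'batch_files_count': len(batch)})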

cycode/cli/consts.py (+4)

@@ -145,7 +145,11 @@
 # scan in batches
 DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES = 9 * 1024 * 1024
 SCAN_BATCH_MAX_SIZE_IN_BYTES = {SAST_SCAN_TYPE: 50 * 1024 * 1024}
+SCAN_BATCH_MAX_SIZE_IN_BYTES_ENV_VAR_NAME = 'SCAN_BATCH_MAX_SIZE_IN_BYTES'
+
 DEFAULT_SCAN_BATCH_MAX_FILES_COUNT = 1000
+SCAN_BATCH_MAX_FILES_COUNT_ENV_VAR_NAME = 'SCAN_BATCH_MAX_FILES_COUNT'
+
 # if we increase this values, the server doesn't allow connecting (ConnectionError)
 SCAN_BATCH_MAX_PARALLEL_SCANS = 5
 SCAN_BATCH_SCANS_PER_CPU = 1
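
These two constants expose the batch limits as environment-variable overrides. A quick sketch of how a user might exercise them; the override values and the subprocess invocation are illustrative assumptions, not part of this commit:

    # Override the new batch limits via environment variables, then run a scan.
    # Env var names come from cycode/cli/consts.py above; values are examples.
    import os
    import subprocess

    os.environ['SCAN_BATCH_MAX_SIZE_IN_BYTES'] = str(20 * 1024 * 1024)  # 20 MiB per batch
    os.environ['SCAN_BATCH_MAX_FILES_COUNT'] = '500'  # cap each batch at 500 files

    # The child process inherits os.environ, so the CLI picks up the overrides.
    subprocess.run(['cycode', 'scan', 'path', '.'], check=False)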

cycode/cli/utils/scan_batch.py (+70 -7)

@@ -5,25 +5,83 @@
 from cycode.cli import consts
 from cycode.cli.models import Document
 from cycode.cli.utils.progress_bar import ScanProgressBarSection
+from cycode.cyclient import logger

 if TYPE_CHECKING:
     from cycode.cli.models import CliError, LocalScanResult
     from cycode.cli.utils.progress_bar import BaseProgressBar


+def _get_max_batch_size(scan_type: str) -> int:
+    logger.debug(
+        'You can customize the batch size by setting the environment variable "%s"',
+        consts.SCAN_BATCH_MAX_SIZE_IN_BYTES_ENV_VAR_NAME,
+    )
+
+    custom_size = os.environ.get(consts.SCAN_BATCH_MAX_SIZE_IN_BYTES_ENV_VAR_NAME)
+    if custom_size:
+        logger.debug('Custom batch size is set, %s', {'custom_size': custom_size})
+        return int(custom_size)
+
+    return consts.SCAN_BATCH_MAX_SIZE_IN_BYTES.get(scan_type, consts.DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES)
+
+
+def _get_max_batch_files_count(_: str) -> int:
+    logger.debug(
+        'You can customize the batch files count by setting the environment variable "%s"',
+        consts.SCAN_BATCH_MAX_FILES_COUNT_ENV_VAR_NAME,
+    )
+
+    custom_files_count = os.environ.get(consts.SCAN_BATCH_MAX_FILES_COUNT_ENV_VAR_NAME)
+    if custom_files_count:
+        logger.debug('Custom batch files count is set, %s', {'custom_files_count': custom_files_count})
+        return int(custom_files_count)
+
+    return consts.DEFAULT_SCAN_BATCH_MAX_FILES_COUNT
+
+
 def split_documents_into_batches(
+    scan_type: str,
     documents: List[Document],
-    max_size: int = consts.DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES,
-    max_files_count: int = consts.DEFAULT_SCAN_BATCH_MAX_FILES_COUNT,
 ) -> List[List[Document]]:
+    max_size = _get_max_batch_size(scan_type)
+    max_files_count = _get_max_batch_files_count(scan_type)
+
+    logger.debug(
+        'Splitting documents into batches, %s',
+        {'document_count': len(documents), 'max_batch_size': max_size, 'max_files_count': max_files_count},
+    )
+
     batches = []

     current_size = 0
     current_batch = []
     for document in documents:
         document_size = len(document.content.encode('UTF-8'))

-        if (current_size + document_size > max_size) or (len(current_batch) >= max_files_count):
+        exceeds_max_size = current_size + document_size > max_size
+        if exceeds_max_size:
+            logger.debug(
+                'Going to create new batch because current batch size exceeds the limit, %s',
+                {
+                    'batch_index': len(batches),
+                    'current_batch_size': current_size + document_size,
+                    'max_batch_size': max_size,
+                },
+            )
+
+        exceeds_max_files_count = len(current_batch) >= max_files_count
+        if exceeds_max_files_count:
+            logger.debug(
+                'Going to create new batch because current batch files count exceeds the limit, %s',
+                {
+                    'batch_index': len(batches),
+                    'current_batch_files_count': len(current_batch),
+                    'max_batch_files_count': max_files_count,
+                },
+            )
+
+        if exceeds_max_size or exceeds_max_files_count:
             batches.append(current_batch)

             current_batch = [document]
@@ -35,6 +93,8 @@ def split_documents_into_batches(
     if current_batch:
         batches.append(current_batch)

+    logger.debug('Documents were split into batches %s', {'batches_count': len(batches)})
+
     return batches


@@ -49,9 +109,8 @@ def run_parallel_batched_scan(
     documents: List[Document],
     progress_bar: 'BaseProgressBar',
 ) -> Tuple[Dict[str, 'CliError'], List['LocalScanResult']]:
-    max_size = consts.SCAN_BATCH_MAX_SIZE_IN_BYTES.get(scan_type, consts.DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES)
-
-    batches = [documents] if scan_type == consts.SCA_SCAN_TYPE else split_documents_into_batches(documents, max_size)
+    # batching is disabled for SCA; requested by Mor
+    batches = [documents] if scan_type == consts.SCA_SCAN_TYPE else split_documents_into_batches(scan_type, documents)

     progress_bar.set_section_length(ScanProgressBarSection.SCAN, len(batches))  # * 3
     # TODO(MarshalX): we should multiply the count of batches in SCAN section because each batch has 3 steps:
@@ -61,9 +120,13 @@ def run_parallel_batched_scan(
     # it's not possible yet because not all scan types moved to polling mechanism
     # the progress bar could be significant improved (be more dynamic) in the future

+    threads_count = _get_threads_count()
     local_scan_results: List['LocalScanResult'] = []
     cli_errors: Dict[str, 'CliError'] = {}
-    with ThreadPool(processes=_get_threads_count()) as pool:
+
+    logger.debug('Running parallel batched scan, %s', {'threads_count': threads_count, 'batches_count': len(batches)})
+
+    with ThreadPool(processes=threads_count) as pool:
         for scan_id, err, result in pool.imap(scan_function, batches):
             if result:
                 local_scan_results.append(result)
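
For intuition, here is a self-contained sketch of the greedy splitting rule that split_documents_into_batches applies: documents accumulate into the current batch until the next one would push it past the byte limit, or the batch already holds the maximum file count, at which point a new batch begins. The Doc stub and the limit values are illustrative stand-ins for cycode's Document model and the real defaults:

    # Minimal sketch of the greedy batching rule, under stated assumptions.
    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Doc:  # stand-in for cycode.cli.models.Document
        content: str

    def split_into_batches(docs: List[Doc], max_size: int, max_files: int) -> List[List[Doc]]:
        batches: List[List[Doc]] = []
        current_batch: List[Doc] = []
        current_size = 0
        for doc in docs:
            doc_size = len(doc.content.encode('UTF-8'))
            # Close the current batch when either limit would be breached.
            if current_size + doc_size > max_size or len(current_batch) >= max_files:
                batches.append(current_batch)
                current_batch, current_size = [], 0
            current_batch.append(doc)
            current_size += doc_size
        if current_batch:  # flush the remainder
            batches.append(current_batch)
        return batches

    docs = [Doc('a' * 40), Doc('b' * 40), Doc('c' * 40)]
    print([len(b) for b in split_into_batches(docs, max_size=100, max_files=2)])  # -> [2, 1]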
