Skip to content

Commit b71a11e

Browse files
Scan package files and extract archives for packages
For rootfs pipelines (rootfs, docker, docker-windows), all package files that were part of system packages had their status updated and consequently were not being scanned for licenses, copyrights, emails, and URLs. We were also not scanning package metadata files tagged as application packages in the scan_codebase and rootfs pipelines. This commit scans all package files and package metadata files to make sure we are not missing any information. Reference: #762 Reference: #1194 Reference: #83 Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent bb521c1 commit b71a11e

12 files changed

+6913
-208
lines changed

scanpipe/models.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1909,6 +1909,17 @@ def no_status(self, status=None):
19091909
return self.filter(~Q(status=status))
19101910
return self.filter(status="")
19111911

1912+
def package_files(self):
1913+
"""
1914+
Filter for CodebaseResources which are part of either an application
1915+
package or a system package.
1916+
"""
1917+
from scanpipe.pipes import flag
1918+
1919+
return self.filter(
1920+
Q(status=flag.APPLICATION_PACKAGE) | Q(status=flag.SYSTEM_PACKAGE)
1921+
)
1922+
19121923
def empty(self):
19131924
return self.filter(Q(size__isnull=True) | Q(size=0))
19141925

scanpipe/pipelines/docker.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def steps(cls):
3333
return (
3434
cls.extract_images,
3535
cls.extract_layers,
36+
cls.extract_archives,
3637
cls.find_images_os_and_distro,
3738
cls.collect_images_information,
3839
cls.collect_and_create_codebase_resources,
@@ -42,6 +43,7 @@ def steps(cls):
4243
cls.flag_ignored_resources,
4344
cls.scan_for_application_packages,
4445
cls.scan_for_files,
46+
cls.scan_package_files,
4547
cls.analyze_scanned_files,
4648
cls.flag_not_analyzed_codebase_resources,
4749
)

scanpipe/pipelines/docker_windows.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def steps(cls):
3434
return (
3535
cls.extract_images,
3636
cls.extract_layers,
37+
cls.extract_archives,
3738
cls.find_images_os_and_distro,
3839
cls.collect_images_information,
3940
cls.collect_and_create_codebase_resources,
@@ -45,6 +46,7 @@ def steps(cls):
4546
cls.flag_ignored_resources,
4647
cls.scan_for_application_packages,
4748
cls.scan_for_files,
49+
cls.scan_package_files,
4850
cls.analyze_scanned_files,
4951
cls.flag_data_files_with_no_clues,
5052
cls.flag_not_analyzed_codebase_resources,

scanpipe/pipelines/root_filesystem.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class RootFS(Pipeline):
3535
def steps(cls):
3636
return (
3737
cls.extract_input_files_to_codebase_directory,
38+
cls.extract_archives,
3839
cls.find_root_filesystems,
3940
cls.collect_rootfs_information,
4041
cls.collect_and_create_codebase_resources,
@@ -45,6 +46,7 @@ def steps(cls):
4546
cls.scan_for_application_packages,
4647
cls.match_not_analyzed_to_system_packages,
4748
cls.scan_for_files,
49+
cls.scan_package_files,
4850
cls.analyze_scanned_files,
4951
cls.flag_not_analyzed_codebase_resources,
5052
)
@@ -89,7 +91,7 @@ def collect_and_create_system_packages(self):
8991
rootfs.scan_rootfs_for_system_packages(self.project, rfs)
9092

9193
def flag_uninteresting_codebase_resources(self):
92-
"""Flag files—not worth tracking—that don’t belong to any system packages."""
94+
"""Flag files—not worth tracking—that do not belong to any system packages."""
9395
rootfs.flag_uninteresting_codebase_resources(self.project)
9496

9597
def scan_for_application_packages(self):
@@ -123,6 +125,13 @@ def scan_for_files(self):
123125
"""Scan unknown resources for copyrights, licenses, emails, and urls."""
124126
scancode.scan_for_files(self.project, progress_logger=self.log)
125127

128+
def scan_package_files(self):
129+
"""
130+
Scan files which are part of a package, for copyright, license, email
131+
and urls.
132+
"""
133+
scancode.scan_package_files(self.project, progress_logger=self.log)
134+
126135
def analyze_scanned_files(self):
127136
"""Analyze single file scan results for completeness."""
128137
flag.analyze_scanned_files(self.project)

scanpipe/pipelines/scan_codebase.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def steps(cls):
4545
cls.flag_ignored_resources,
4646
cls.scan_for_application_packages,
4747
cls.scan_for_files,
48+
cls.scan_package_files,
4849
)
4950

5051
def copy_inputs_to_codebase_directory(self):
@@ -65,3 +66,10 @@ def scan_for_application_packages(self):
6566
def scan_for_files(self):
6667
"""Scan unknown resources for copyrights, licenses, emails, and urls."""
6768
scancode.scan_for_files(self.project, progress_logger=self.log)
69+
70+
def scan_package_files(self):
71+
"""
72+
Scan files which are manifests for detected application packages, for copyright,
73+
license, email and urls.
74+
"""
75+
scancode.scan_package_files(self.project, progress_logger=self.log)

scanpipe/pipes/scancode.py

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,9 @@ def scan_for_package_data(location, with_threading=True, package_only=False, **k
235235
return _scan_resource(location, scanners, with_threading=with_threading)
236236

237237

238-
def save_scan_file_results(codebase_resource, scan_results, scan_errors):
238+
def save_scan_file_results(
239+
codebase_resource, scan_results, scan_errors, update_status=True, **kwargs
240+
):
239241
"""
240242
Save the resource scan file results in the database.
241243
Create project errors if any occurred during the scan.
@@ -246,6 +248,9 @@ def save_scan_file_results(codebase_resource, scan_results, scan_errors):
246248
codebase_resource.add_errors(scan_errors)
247249
status = flag.SCANNED_WITH_ERROR
248250

251+
if not update_status:
252+
status = None
253+
249254
codebase_resource.set_scan_results(scan_results, status)
250255

251256

@@ -266,7 +271,12 @@ def save_scan_package_results(codebase_resource, scan_results, scan_errors):
266271

267272

268273
def scan_resources(
269-
resource_qs, scan_func, save_func, scan_func_kwargs=None, progress_logger=None
274+
resource_qs,
275+
scan_func,
276+
save_func,
277+
scan_func_kwargs=None,
278+
save_func_kwargs=None,
279+
progress_logger=None,
270280
):
271281
"""
272282
Run the `scan_func` on the codebase resources of the provided `resource_qs`.
@@ -286,6 +296,9 @@ def scan_resources(
286296
if not scan_func_kwargs:
287297
scan_func_kwargs = {}
288298

299+
if not save_func_kwargs:
300+
save_func_kwargs = {}
301+
289302
resource_count = resource_qs.count()
290303
logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}")
291304
resource_iterator = resource_qs.iterator(chunk_size=2000)
@@ -300,7 +313,7 @@ def scan_resources(
300313
scan_results, scan_errors = scan_func(
301314
resource.location, with_threading, **scan_func_kwargs
302315
)
303-
save_func(resource, scan_results, scan_errors)
316+
save_func(resource, scan_results, scan_errors, **save_func_kwargs)
304317
return
305318

306319
logger.info(f"Starting ProcessPoolExecutor with {max_workers} max_workers")
@@ -319,10 +332,10 @@ def scan_resources(
319332
progress.log_progress()
320333
logger.debug(f"{scan_func.__name__} pk={resource.pk}")
321334
scan_results, scan_errors = future.result()
322-
save_func(resource, scan_results, scan_errors)
335+
save_func(resource, scan_results, scan_errors, **save_func_kwargs)
323336

324337

325-
def scan_for_files(project, resource_qs=None, progress_logger=None):
338+
def scan_for_files(project, resource_qs=None, progress_logger=None, update_status=True):
326339
"""
327340
Run a license, copyright, email, and url scan on files without a status for
328341
a `project`.
@@ -338,12 +351,39 @@ def scan_for_files(project, resource_qs=None, progress_logger=None):
338351
if license_score := project.get_env("scancode_license_score"):
339352
scan_func_kwargs["min_license_score"] = license_score
340353

354+
save_func_kwargs = {
355+
"update_status": update_status,
356+
}
357+
341358
scan_resources(
342359
resource_qs=resource_qs,
343360
scan_func=scan_file,
344361
save_func=save_scan_file_results,
345362
scan_func_kwargs=scan_func_kwargs,
363+
save_func_kwargs=save_func_kwargs,
364+
progress_logger=progress_logger,
365+
)
366+
367+
368+
def scan_package_files(
369+
project,
370+
progress_logger=None,
371+
update_status=False,
372+
):
373+
"""
374+
Scan files which are part of a package, for copyright, license, email
375+
and urls.
376+
377+
If `update_status` is False, the status field of codebase resources is not
378+
updated to `scanned` (which is a side-effect of scanning files), but rather
379+
keep the old status intact.
380+
"""
381+
package_files = project.codebaseresources.package_files()
382+
scan_for_files(
383+
project=project,
384+
resource_qs=package_files,
346385
progress_logger=progress_logger,
386+
update_status=update_status,
347387
)
348388

349389

0 commit comments

Comments
 (0)