Skip to content

Commit e65b58e

Browse files
authored
Merge pull request #119 from scanoss/feature/mdaloia/SP-2587-SCANOSS-PY-Add-directory-hashes-to-folder-hashing-command
[SP-2587] Add directory simhash, modify concatenated names to remove extensions
2 parents b66d054 + e8f040a commit e65b58e

File tree

10 files changed

+185
-82
lines changed

10 files changed

+185
-82
lines changed

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99
### Added
1010
- Upcoming changes...
1111

12+
## [1.27.0] - 2025-06-30
13+
### Added
14+
- Add directory hash calculation to folder hasher
15+
- Add rank-threshold option to folder scan command
16+
1217
## [1.26.3] - 2025-06-26
1318
### Fixed
1419
- Fixed crash in inspect subcommand when processing components that lack license information
@@ -570,4 +575,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
570575
[1.26.0]: https://github.com/scanoss/scanoss.py/compare/v1.25.2...v1.26.0
571576
[1.26.1]: https://github.com/scanoss/scanoss.py/compare/v1.26.0...v1.26.1
572577
[1.26.2]: https://github.com/scanoss/scanoss.py/compare/v1.26.1...v1.26.2
573-
[1.26.3]: https://github.com/scanoss/scanoss.py/compare/v1.26.2...v1.26.3
578+
[1.26.3]: https://github.com/scanoss/scanoss.py/compare/v1.26.2...v1.26.3
579+
[1.27.0]: https://github.com/scanoss/scanoss.py/compare/v1.26.3...v1.27.0

CLIENT_HELP.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,15 @@ The new `folder-scan` subcommand performs a comprehensive scan on an entire dire
485485
scanoss-py folder-scan /path/to/folder -o folder-scan-results.json
486486
```
487487

488+
**Options:**
489+
- `--rank-threshold`: Filter results to only show those with rank value at or below this threshold (e.g., `--rank-threshold 3` returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches.
490+
- `--format`: Result output format (json or cyclonedx, default: json)
491+
492+
**Example with rank threshold:**
493+
```shell
494+
scanoss-py folder-scan /path/to/folder --rank-threshold 3 -o folder-scan-results.json
495+
```
496+
488497
### Container-Scan a Docker Image
489498

490499
The `container-scan` subcommand allows you to scan Docker container images for dependencies. This command extracts and analyzes dependencies from container images, helping you identify open source components within containerized applications.

docs/source/index.rst

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -249,13 +249,11 @@ Performs a comprehensive scan of a directory using folder hashing to identify co
249249
* - --output <file name>, -o <file name>
250250
- Output result file name (optional - default STDOUT)
251251
* - --format <format>, -f <format>
252-
- Output format: {json} (optional - default json)
252+
- Output format: {json, cyclonedx} (optional - default json)
253253
* - --timeout <seconds>, -M <seconds>
254254
- Timeout in seconds for API communication (optional - default 600)
255-
* - --best-match, -bm
256-
- Enable best match mode (optional - default: False)
257-
* - --threshold <1-100>
258-
- Threshold for result matching (optional - default: 100)
255+
* - --rank-threshold <number>
256+
- Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches.
259257
* - --settings <file>, -st <file>
260258
- Settings file to use for scanning (optional - default scanoss.json)
261259
* - --skip-settings-file, -stf

src/scanoss/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@
2222
THE SOFTWARE.
2323
"""
2424

25-
__version__ = '1.26.3'
25+
__version__ = '1.27.0'

src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py

Lines changed: 29 additions & 23 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/scanoss/cli.py

Lines changed: 16 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
from .components import Components
5555
from .constants import (
5656
DEFAULT_API_TIMEOUT,
57+
DEFAULT_HFH_RANK_THRESHOLD,
5758
DEFAULT_POST_SIZE,
5859
DEFAULT_RETRY,
5960
DEFAULT_TIMEOUT,
@@ -623,24 +624,16 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
623624
'--format',
624625
'-f',
625626
type=str,
626-
choices=['json'],
627+
choices=['json', 'cyclonedx'],
627628
default='json',
628629
help='Result output format (optional - default: json)',
629630
)
630631
p_folder_scan.add_argument(
631-
'--best-match',
632-
'-bm',
633-
action='store_true',
634-
default=False,
635-
help='Enable best match mode (optional - default: False)',
636-
)
637-
p_folder_scan.add_argument(
638-
'--threshold',
632+
'--rank-threshold',
639633
type=int,
640-
choices=range(1, 101),
641-
metavar='1-100',
642-
default=100,
643-
help='Threshold for result matching (optional - default: 100)',
634+
default=DEFAULT_HFH_RANK_THRESHOLD,
635+
help='Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 '
636+
'returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches.',
644637
)
645638
p_folder_scan.set_defaults(func=folder_hashing_scan)
646639

@@ -1455,7 +1448,7 @@ def utils_certloc(*_):
14551448
Run the "utils certloc" sub-command
14561449
:param _: ignored/unused
14571450
"""
1458-
import certifi # noqa: PLC0415,I001
1451+
import certifi # noqa: PLC0415,I001
14591452

14601453
print(f'CA Cert File: {certifi.where()}')
14611454

@@ -1466,11 +1459,11 @@ def utils_cert_download(_, args): # pylint: disable=PLR0912 # noqa: PLR0912
14661459
:param _: ignored/unused
14671460
:param args: Parsed arguments
14681461
"""
1469-
import socket # noqa: PLC0415,I001
1470-
import traceback # noqa: PLC0415,I001
1471-
from urllib.parse import urlparse # noqa: PLC0415,I001
1462+
import socket # noqa: PLC0415,I001
1463+
import traceback # noqa: PLC0415,I001
1464+
from urllib.parse import urlparse # noqa: PLC0415,I001
14721465

1473-
from OpenSSL import SSL, crypto # noqa: PLC0415,I001
1466+
from OpenSSL import SSL, crypto # noqa: PLC0415,I001
14741467

14751468
file = sys.stdout
14761469
if args.output:
@@ -1518,7 +1511,7 @@ def utils_pac_proxy(_, args):
15181511
:param _: ignored/unused
15191512
:param args: Parsed arguments
15201513
"""
1521-
from pypac.resolver import ProxyResolver # noqa: PLC0415,I001
1514+
from pypac.resolver import ProxyResolver # noqa: PLC0415,I001
15221515

15231516
if not args.pac:
15241517
print_stderr('Error: No pac file option specified.')
@@ -1592,7 +1585,7 @@ def crypto_algorithms(parser, args):
15921585
sys.exit(1)
15931586
except Exception as e:
15941587
if args.debug:
1595-
import traceback # noqa: PLC0415,I001
1588+
import traceback # noqa: PLC0415,I001
15961589

15971590
traceback.print_exc()
15981591
print_stderr(f'ERROR: {e}')
@@ -1634,7 +1627,7 @@ def crypto_hints(parser, args):
16341627
sys.exit(1)
16351628
except Exception as e:
16361629
if args.debug:
1637-
import traceback # noqa: PLC0415,I001
1630+
import traceback # noqa: PLC0415,I001
16381631

16391632
traceback.print_exc()
16401633
print_stderr(f'ERROR: {e}')
@@ -1676,7 +1669,7 @@ def crypto_versions_in_range(parser, args):
16761669
sys.exit(1)
16771670
except Exception as e:
16781671
if args.debug:
1679-
import traceback # noqa: PLC0415,I001
1672+
import traceback # noqa: PLC0415,I001
16801673

16811674
traceback.print_exc()
16821675
print_stderr(f'ERROR: {e}')
@@ -1965,11 +1958,9 @@ def folder_hashing_scan(parser, args):
19651958
config=scanner_config,
19661959
client=client,
19671960
scanoss_settings=scanoss_settings,
1961+
rank_threshold=args.rank_threshold,
19681962
)
19691963

1970-
scanner.best_match = args.best_match
1971-
scanner.threshold = args.threshold
1972-
19731964
if scanner.scan():
19741965
scanner.present(output_file=args.output, output_format=args.format)
19751966
except ScanossGrpcError as e:

src/scanoss/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@
1212
DEFAULT_URL2 = 'https://api.scanoss.com' # default premium service URL
1313

1414
DEFAULT_API_TIMEOUT = 600
15+
16+
DEFAULT_HFH_RANK_THRESHOLD = 5

src/scanoss/file_filters.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
import os
2626
import sys
2727
from pathlib import Path
28-
from typing import List
28+
from typing import List, Optional
2929

3030
from pathspec import GitIgnoreSpec
3131

@@ -511,7 +511,7 @@ def get_filtered_files_from_folder(self, root: str) -> List[str]:
511511
# Now filter the files and return the reduced list
512512
return self.get_filtered_files_from_files(all_files, str(root_path))
513513

514-
def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]:
514+
def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]:
515515
"""
516516
Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings.
517517
@@ -615,8 +615,13 @@ def _get_operation_patterns(self, operation_type: str) -> List[str]:
615615
# Default patterns for skipping directories
616616
if not self.all_folders:
617617
DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
618+
DEFAULT_SKIPPED_DIR_EXT_LIST = (
619+
DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT
620+
)
618621
for dir_name in DEFAULT_SKIPPED_DIR_LIST:
619622
patterns.append(f'{dir_name}/')
623+
for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST:
624+
patterns.append(f'*{dir_extension}/')
620625

621626
# Custom patterns added in SCANOSS settings file
622627
if self.scanoss_settings:

0 commit comments

Comments
 (0)