Skip to content

Commit 2a062f5

Browse files
authored
Fix/44-gpl-3.0-or-later (#77)
* added own function to check if license tag matches the linked license file content * introduced similarity percentage as comparison method if tag and found license text don't match * changed order of cases, added caching for license files * added type ignore for requests library and ignore for verbose_output * added test and fixed bug of similarity percentage * added jellyfish to dependencies --------- Signed-off-by: Anton Utz <[email protected]>
1 parent 63c7406 commit 2a062f5

File tree

5 files changed

+289
-23
lines changed

5 files changed

+289
-23
lines changed

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ dependencies = [
2828
"gitpython",
2929
"rospkg",
3030
"scancode-toolkit>=32.0.8",
31-
"spdx-tools>=0.7.0rc0"
31+
"spdx-tools>=0.7.0rc0",
32+
"requests",
33+
"jellyfish"
3234
]
3335
requires-python = ">=3.7"
3436

src/ros_license_toolkit/license_checks/license_text_exists_check.py

+70-22
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,18 @@
1919
import os
2020
from typing import Any, Dict, Optional
2121

22+
import jellyfish
23+
import requests # type: ignore[import-untyped]
24+
2225
from ros_license_toolkit.checks import Check, Status
2326
from ros_license_toolkit.common import get_spdx_license_name
2427
from ros_license_toolkit.license_tag import LicenseTag, is_license_name_in_spdx_list
2528
from ros_license_toolkit.package import Package
2629
from ros_license_toolkit.ui_elements import red
2730

31+
# Value for minimal percentage between license texts for them to be accepted
32+
SIMILARITY_THRESHOLD = 90 # in percent
33+
2834

2935
class LicenseTextExistsCheck(Check):
3036
"""This ensures that the license text file referenced by the tag exists."""
@@ -85,30 +91,43 @@ def _check_licenses(self, package: Package) -> None:
8591
)
8692
self.missing_license_texts_status[license_tag] = Status.FAILURE
8793
continue
94+
8895
if actual_license != license_tag.get_license_id():
89-
self.license_tags_without_license_text[license_tag] = (
90-
f"License text file '{license_text_file}' is "
91-
+ f"of license {actual_license} but tag is "
92-
+ f"{license_tag.get_license_id()}."
93-
)
94-
# If Tag and File both are in SPDX but don't match -> Error
95-
if is_license_name_in_spdx_list(license_tag.get_license_id()):
96-
self.missing_license_texts_status[license_tag] = Status.FAILURE
97-
else:
98-
self.missing_license_texts_status[license_tag] = Status.WARNING
99-
self.files_with_wrong_tags[license_tag] = {
100-
"actual_license": actual_license,
101-
"license_tag": license_tag.get_license_id(),
102-
}
103-
continue
96+
if license_tag.has_license_text_file():
97+
license_file_for_tag = (
98+
package.abspath + "/" + license_tag.get_license_text_file()
99+
)
100+
with open(license_file_for_tag, "r", encoding="utf-8") as f:
101+
content = f.read()
102+
similarity_of_texts = self.compare_text_with_spdx_text(license_tag, content)
103+
104+
# IDEA: if accepted, add the tag to the package.found_license_texts, since scanning
105+
# has failed to do so. Also solves problem of license_file_referenced check
106+
107+
# if similarity couldn't be determined or is too low --> fail, else success
108+
if similarity_of_texts is None or similarity_of_texts < SIMILARITY_THRESHOLD:
109+
self.license_tags_without_license_text[license_tag] = (
110+
f"License text file '{license_text_file}' is "
111+
+ f"of license {actual_license} but tag is "
112+
+ f"{license_tag.get_license_id()}."
113+
)
114+
# If Tag and File both are in SPDX but don't match -> Error
115+
if is_license_name_in_spdx_list(license_tag.get_license_id()):
116+
self.missing_license_texts_status[license_tag] = Status.FAILURE
117+
else:
118+
self.missing_license_texts_status[license_tag] = Status.WARNING
119+
self.files_with_wrong_tags[license_tag] = {
120+
"actual_license": actual_license,
121+
"license_tag": license_tag.get_license_id(),
122+
}
123+
continue
104124

105125
def _evaluate_results(self):
106126
if len(self.license_tags_without_license_text) > 0:
107127
if max(self.missing_license_texts_status.values()) == Status.WARNING:
108128
self._warning(
109-
"Since they are not in the SPDX list, "
110-
"we can not check if these tags have the correct "
111-
"license text:\n"
129+
"Since they are not in the SPDX list, we can not check if these tags have the"
130+
" correct license text:\n"
112131
+ "\n".join(
113132
[
114133
f" '{x[0]}': {x[1]}"
@@ -118,18 +137,47 @@ def _evaluate_results(self):
118137
)
119138
else:
120139
self._failed(
121-
"The following license tags do not "
122-
"have a valid license text "
123-
"file:\n"
140+
"The following license tags do not have a valid license text file:\n"
124141
+ "\n".join(
125142
[
126143
f" '{x[0]}': {x[1]}"
127144
for x in self.license_tags_without_license_text.items()
128145
]
129146
)
130147
)
131-
self.verbose_output = red(
148+
self.verbose_output = red( # pylint: disable=attribute-defined-outside-init
132149
"\n".join([f" '{x[0]}': {x[1]}" for x in self.found_license_texts.items()])
133150
)
134151
else:
135152
self._success("All license tags have a valid license text file.")
153+
154+
def compare_text_with_spdx_text(self, tag, found_lic_text):
    """Get the similarity percentage between the SPDX reference text and a given license text.

    The reference text is requested from spdx.org and written to a cache file below
    ``~/.cache/ros_license_toolkit``; if that cache file already exists, it is read
    instead and no request is made.

    :param tag: license tag; its string form is used in the SPDX URL and the cache file name
    :param found_lic_text: license text to compare against the reference text
    :return: similarity in percent, or ``None`` if the reference text could not be obtained
    """
    cache_dir: str = os.path.expanduser("~/.cache/ros_license_toolkit")
    os.makedirs(cache_dir, exist_ok=True)
    license_file = os.path.join(cache_dir, f"license_{tag}.txt")

    if os.path.exists(license_file):
        with open(license_file, "r", encoding="utf-8") as f:
            original_text = f.read()
    else:
        url = f"https://spdx.org/licenses/{tag}.json"
        try:
            response = requests.get(url, timeout=100)
        except requests.RequestException:
            # No connection, timeout, ...: the similarity cannot be determined.
            # The caller treats None as "could not be determined".
            return None
        if response.status_code != 200:
            return None
        try:
            original_text = response.json()["licenseText"]
        except (ValueError, KeyError, TypeError):
            # Body is not valid JSON or does not have the expected structure.
            return None
        # Only cache after the text was extracted successfully, so that a broken
        # response never ends up in the cache.
        with open(license_file, "w", encoding="utf-8") as f:
            f.write(original_text)
    return self.get_similarity_percent(original_text, found_lic_text)
176+
177+
def get_similarity_percent(self, text1, text2):
    """Return the Levenshtein distance based similarity of text1 and text2 in percent.

    The distance is normalized by the length of the longer text and the result is
    rounded to two decimal places.

    :param text1: first text to compare
    :param text2: second text to compare
    :return: similarity in percent; 100.0 if both texts are empty
    """
    longer_length = float(max(len(text1), len(text2)))
    if longer_length == 0:
        # Both texts are empty and therefore identical. Without this guard the
        # normalization below would divide by zero.
        return 100.0
    lev_dis = float(jellyfish.levenshtein_distance(text1, text2))
    return round(100 * (longer_length - lev_dis) / longer_length, 2)

0 commit comments

Comments
 (0)