Skip to content

Commit 1e5ffcb

Browse files
committed
normalize to "copyright" and "trademark" rather than single-char variants
This improves matching correctness for licenses whose body contains the word "copyright" verbatim, including common licenses such as the 2- and 3-clause BSD licenses. The original license texts are not put through the same normalization, so rewriting "copyright" to the single-character "©" in the license file being matched made it differ from the license text it should have matched.
1 parent 63bc934 commit 1e5ffcb

File tree

2 files changed

+8
-5
lines changed

2 files changed

+8
-5
lines changed

licensedb/internal/normalize/normalize.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,13 @@ var (
7575
)
7676

7777
// 9.1.1 "©", "(c)", or "Copyright" should be considered equivalent and interchangeable.
78-
copyrightRe = regexp.MustCompile("copyright|\\(c\\)")
79-
trademarkRe = regexp.MustCompile("trademark(s?)|\\(tm\\)")
78+
copyrightRe = regexp.MustCompile("©|\\(c\\)")
79+
trademarkRe = regexp.MustCompile("trademarks|\\(tm\\)|™")
8080

8181
// extra cleanup
8282
brokenLinkRe = regexp.MustCompile("http s ://")
8383
urlCleanupRe = regexp.MustCompile("[<(](http(s?)://[^\\s]+)[)>]")
84-
copyrightLineRe = regexp.MustCompile("(?m)^((©.*)|(all rights reserved(\\.)?)|(li[cs]en[cs]e))\n")
84+
copyrightLineRe = regexp.MustCompile("(?m)^((copyright.*)|(all rights reserved(\\.)?)|(li[cs]en[cs]e))\n")
8585
nonAlphaNumRe = regexp.MustCompile("[^- \\na-z0-9]")
8686

8787
// used in Split()
@@ -128,8 +128,8 @@ func LicenseText(text string, strictness Strictness) string {
128128
text = wordReplacer.Replace(text)
129129

130130
// 9. Copyright Symbol
131-
text = copyrightRe.ReplaceAllString(text, "©")
132-
text = trademarkRe.ReplaceAllString(text, "")
131+
text = copyrightRe.ReplaceAllString(text, "copyright")
132+
text = trademarkRe.ReplaceAllString(text, "trademark")
133133

134134
// fix broken URLs in SPDX source texts
135135
text = brokenLinkRe.ReplaceAllString(text, "https://")
@@ -155,7 +155,9 @@ func LicenseText(text string, strictness Strictness) string {
155155
// there are common mismatches because of trailing dots
156156
text = strings.Replace(text, ".", "", -1)
157157
// usually copyright lines are custom and occur multiple times
158+
text = strings.Replace(text, "copyright notice", "PLACEHOLDER", -1)
158159
text = copyrightLineRe.ReplaceAllString(text, "")
160+
text = strings.Replace(text, "PLACEHOLDER", "copyright notice", -1)
159161
}
160162

161163
if strictness > Moderate {

licensedb/internal/normalize/normalize_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ permissions granted by this license.`},
2323
{"punctuation", "a-‒–—―⁓⸺⸻~˗‐‑⁃⁻₋−∼⎯⏤─➖𐆑֊﹘﹣-", "a-"},
2424
{"bullet", "-\n*\n\n\n\n\n\n\n🞄\n\n\n", ""},
2525
{"license", "", ""},
26+
{"copyright notice", "copyright notice", "copyright notice"},
2627
}
2728

2829
for _, tc := range tt {

0 commit comments

Comments
 (0)