Skip to content

Commit 529469d

Browse files
committed
Stop URLs at more invalid characters, notably '<' and '>' (#7)
Before, we would allow angle brackets if they were balanced (e.g. http://example.org/<>). But according to RFC 3987, angle brackets are not allowed in URLs, and other linkers don't seem to allow them either. Also add more control characters (the remaining ASCII control characters, DEL, and U+0080 to U+009F) to the list of characters that stop a URL.
1 parent 687656d commit 529469d

File tree

2 files changed

+94
-18
lines changed

2 files changed

+94
-18
lines changed

src/main/java/org/nibor/autolink/internal/UrlScanner.java

Lines changed: 70 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -54,21 +54,86 @@ private int findLast(CharSequence input, int beginIndex) {
5454
int round = 0;
5555
int square = 0;
5656
int curly = 0;
57-
int angle = 0;
5857
boolean doubleQuote = false;
5958
boolean singleQuote = false;
6059
int last = beginIndex;
6160
loop:
6261
for (int i = beginIndex; i < input.length(); i++) {
6362
char c = input.charAt(i);
6463
switch (c) {
65-
case ' ':
64+
case '\u0000':
65+
case '\u0001':
66+
case '\u0002':
67+
case '\u0003':
68+
case '\u0004':
69+
case '\u0005':
70+
case '\u0006':
71+
case '\u0007':
72+
case '\u0008':
6673
case '\t':
6774
case '\n':
6875
case '\u000B':
6976
case '\f':
7077
case '\r':
71-
// These can never be part of an URL, so stop now
78+
case '\u000E':
79+
case '\u000F':
80+
case '\u0010':
81+
case '\u0011':
82+
case '\u0012':
83+
case '\u0013':
84+
case '\u0014':
85+
case '\u0015':
86+
case '\u0016':
87+
case '\u0017':
88+
case '\u0018':
89+
case '\u0019':
90+
case '\u001A':
91+
case '\u001B':
92+
case '\u001C':
93+
case '\u001D':
94+
case '\u001E':
95+
case '\u001F':
96+
case ' ':
97+
case '<':
98+
case '>':
99+
case '\u007F':
100+
case '\u0080':
101+
case '\u0081':
102+
case '\u0082':
103+
case '\u0083':
104+
case '\u0084':
105+
case '\u0085':
106+
case '\u0086':
107+
case '\u0087':
108+
case '\u0088':
109+
case '\u0089':
110+
case '\u008A':
111+
case '\u008B':
112+
case '\u008C':
113+
case '\u008D':
114+
case '\u008E':
115+
case '\u008F':
116+
case '\u0090':
117+
case '\u0091':
118+
case '\u0092':
119+
case '\u0093':
120+
case '\u0094':
121+
case '\u0095':
122+
case '\u0096':
123+
case '\u0097':
124+
case '\u0098':
125+
case '\u0099':
126+
case '\u009A':
127+
case '\u009B':
128+
case '\u009C':
129+
case '\u009D':
130+
case '\u009E':
131+
case '\u009F':
132+
// These can never be part of a URL, so stop now. See RFC 3986 and RFC 3987.
133+
// Some characters are not in the above list, even though they are not in "unreserved" or "reserved":
134+
// '"', '\\', '^', '`', '{', '|', '}'
135+
// The reason for this is that other link detectors also allow them. Also see below, we require
136+
// the quote and the braces to be balanced.
72137
break loop;
73138
case '?':
74139
case '!':
@@ -97,9 +162,11 @@ private int findLast(CharSequence input, int beginIndex) {
97162
}
98163
break;
99164
case '[':
165+
// Allowed in IPv6 address host
100166
square++;
101167
break;
102168
case ']':
169+
// Allowed in IPv6 address host
103170
square--;
104171
if (square >= 0) {
105172
last = i;
@@ -120,18 +187,6 @@ private int findLast(CharSequence input, int beginIndex) {
120187
break loop;
121188
}
122189
break;
123-
case '<':
124-
angle++;
125-
break;
126-
case '>':
127-
angle--;
128-
if (angle >= 0) {
129-
last = i;
130-
} else {
131-
// More closing than opening brackets, stop now
132-
break loop;
133-
}
134-
break;
135190
case '"':
136191
doubleQuote = !doubleQuote;
137192
if (!doubleQuote) {

src/test/java/org/nibor/autolink/AutolinkUrlTest.java

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ public void linking() {
6666
assertLinked("http://example.org/", "|http://example.org/|");
6767
assertLinked("http://example.org/123", "|http://example.org/123|");
6868
assertLinked("http://example.org/?foo=test&bar=123", "|http://example.org/?foo=test&bar=123|");
69+
assertLinked("http://example.org/?foo=%20", "|http://example.org/?foo=%20|");
70+
assertLinked("http://example.org/%3C", "|http://example.org/%3C|");
6971
}
7072

7173
@Test
@@ -74,13 +76,29 @@ public void schemeSeparatedByNonAlphanumeric() {
7476
}
7577

7678
@Test
77-
public void spaceSeparation() {
79+
public void spaceCharactersStopUrl() {
7880
assertLinked("foo http://example.org/", "foo |http://example.org/|");
7981
assertLinked("http://example.org/ bar", "|http://example.org/| bar");
82+
assertLinked("http://example.org/\tbar", "|http://example.org/|\tbar");
83+
assertLinked("http://example.org/\nbar", "|http://example.org/|\nbar");
84+
assertLinked("http://example.org/\u000Bbar", "|http://example.org/|\u000Bbar");
85+
assertLinked("http://example.org/\fbar", "|http://example.org/|\fbar");
86+
assertLinked("http://example.org/\rbar", "|http://example.org/|\rbar");
8087
}
8188

8289
@Test
83-
public void delimiterSeparation() {
90+
public void illegalCharactersStopUrl() {
91+
assertLinked("http://example.org/<", "|http://example.org/|<");
92+
assertLinked("http://example.org/>", "|http://example.org/|>");
93+
assertLinked("http://example.org/<>", "|http://example.org/|<>");
94+
assertLinked("http://example.org/\u0000", "|http://example.org/|\u0000");
95+
assertLinked("http://example.org/\u000E", "|http://example.org/|\u000E");
96+
assertLinked("http://example.org/\u007F", "|http://example.org/|\u007F");
97+
assertLinked("http://example.org/\u009F", "|http://example.org/|\u009F");
98+
}
99+
100+
@Test
101+
public void delimiterAtEnd() {
84102
assertLinked("http://example.org/.", "|http://example.org/|.");
85103
assertLinked("http://example.org/..", "|http://example.org/|..");
86104
assertLinked("http://example.org/,", "|http://example.org/|,");
@@ -95,7 +113,6 @@ public void matchingPunctuation() {
95113
assertLinked("http://example.org/a(b)", "|http://example.org/a(b)|");
96114
assertLinked("http://example.org/a[b]", "|http://example.org/a[b]|");
97115
assertLinked("http://example.org/a{b}", "|http://example.org/a{b}|");
98-
assertLinked("http://example.org/a<b>", "|http://example.org/a<b>|");
99116
assertLinked("http://example.org/a\"b\"", "|http://example.org/a\"b\"|");
100117
assertLinked("http://example.org/a'b'", "|http://example.org/a'b'|");
101118
assertLinked("(http://example.org/)", "(|http://example.org/|)");
@@ -136,6 +153,8 @@ public void html() {
136153
assertLinked("http://example.org'>", "|http://example.org|'>");
137154
assertLinked("http://example.org\"/>", "|http://example.org|\"/>");
138155
assertLinked("http://example.org'/>", "|http://example.org|'/>");
156+
assertLinked("http://example.org<p>", "|http://example.org|<p>");
157+
assertLinked("http://example.org</p>", "|http://example.org|</p>");
139158
}
140159

141160
@Test
@@ -161,6 +180,8 @@ public void multiple() {
161180
@Test
162181
public void international() {
163182
assertLinked("http://üñîçøðé.com/ä", "|http://üñîçøðé.com/ä|");
183+
assertLinked("http://example.org/\u00A1", "|http://example.org/\u00A1|");
184+
assertLinked("http://example.org/\u00A2", "|http://example.org/\u00A2|");
164185
}
165186

166187
@Test

0 commit comments

Comments
 (0)