Skip to content

Commit 51b561b

Browse files
committed
Make Encoding.decodeHtml publicly available.
Fixes #91
1 parent a30315f commit 51b561b

File tree

4 files changed

+58
-6
lines changed

4 files changed

+58
-6
lines changed

RELEASE-checklist.sh

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ set -e
1010
# Make sure the build is ok via
1111
mvn -Dguava.version=27.0-jre -f aggregate clean verify javadoc:jar source:jar
1212
mvn -f aggregate clean verify jacoco:report site javadoc:jar source:jar
13+
mvn install
1314
mvn org.sonatype.ossindex.maven:ossindex-maven-plugin:audit -f aggregate
1415

1516
echo

change_log.md

+3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# OWASP Java HTML Sanitizer Change Log
22

33
Most recent at top.
4+
* Unreleased
5+
* Make Encoding class public so that clients can use HTML text decoder.
6+
* Fix bug in srcset handling.
47
* Release 20190325.1
58
* Properly parse `srcset` attribute values to apply URL policy to
69
each URL in turn.

src/main/java/org/owasp/html/Encoding.java

+53-5
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,14 @@
3333
import javax.annotation.Nullable;
3434

3535
/** Encoders and decoders for HTML. */
36-
final class Encoding {
36+
public final class Encoding {
3737

3838
/**
3939
* Decodes HTML entities to produce a string containing only valid
4040
* Unicode scalar values.
41+
*
42+
* @param s text/html
43+
* @return text/plain
4144
*/
4245
public static String decodeHtml(String s) {
4346
int firstAmp = s.indexOf('&');
@@ -151,11 +154,40 @@ private static int longestPrefixOfGoodCodeunits(String s) {
151154
return -1;
152155
}
153156

157+
/**
158+
* Appends an encoded form of plainText to output where the encoding is
159+
* sufficient to prevent an HTML parser from interpreting any characters in
160+
* the appended chunk as part of an attribute or tag boundary.
161+
*
162+
* @param plainText text/plain
163+
* @param output a buffer of text/html that has a well-formed HTML prefix that
164+
* ends after the open-quote of an attribute value and does not yet contain
165+
* a corresponding close quote.
166+
* Modified in place.
167+
*/
154168
static void encodeHtmlAttribOnto(String plainText, Appendable output)
155169
throws IOException {
156170
encodeHtmlOnto(plainText, output, "{\u200B");
157171
}
158172

173+
/**
174+
* Appends an encoded form of plainText to output where the encoding is
175+
* sufficient to prevent an HTML parser from transitioning out of the
176+
* <a href="https://html.spec.whatwg.org/multipage/parsing.html#data-state">
177+
* Data state</a>.
178+
*
179+
* This is suitable for encoding a text node inside any element that does not
180+
* require special handling as a context element (see "context element" in
181+
* <a href="https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments">
182+
* step 4</a>.)
183+
*
184+
* @param plainText text/plain
185+
* @param output a buffer of text/html that has a well-formed HTML prefix that
186+
* would leave an HTML parser in the Data state if it were to encounter a space
187+
* character as the next character. In practice this means that the buffer
188+
* does not contain partial tags or comments, and does not have an unclosed
189+
* element with a special content model.
190+
*/
159191
static void encodePcdataOnto(String plainText, Appendable output)
160192
throws IOException {
161193
// Avoid problems with client-side template languages like
@@ -166,7 +198,23 @@ static void encodePcdataOnto(String plainText, Appendable output)
166198
encodeHtmlOnto(plainText, output, "{<!-- -->");
167199
}
168200

169-
static void encodeRcdataOnto(String plainText, Appendable output)
201+
/**
202+
* Appends an encoded form of plainText to output where the encoding is
203+
* sufficient to prevent an HTML parser from transitioning out of the
204+
* <a href="https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state">
205+
* RCDATA state</a>.
206+
*
207+
* This is suitable for encoding a text node inside a {@code <textarea>} or
208+
* {@code <title>} element outside foreign content.
209+
*
210+
* @param plainText text/plain
211+
* @param output a buffer of text/html that has a well-formed HTML prefix that
212+
* would leave an HTML parser in the RCDATA state if it were to encounter a space
213+
* character as the next character. In practice this means that the buffer
214+
* does not contain partial tags or comments, and the most recently opened
215+
* element is {@code <textarea>} or {@code <title>} and that element is still open.
216+
*/
217+
public static void encodeRcdataOnto(String plainText, Appendable output)
170218
throws IOException {
171219
// Avoid problems with client-side template languages like
172220
// Angular & Polymer which attach special significance to text like
@@ -316,7 +364,7 @@ static void appendNumericEntity(int codepoint, Appendable output)
316364
};
317365

318366
/** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */
319-
static final String[] REPLACEMENTS = new String[0x80];
367+
private static final String[] REPLACEMENTS = new String[0x80];
320368
static {
321369
for (int i = 0; i < ' '; ++i) {
322370
// We elide control characters so that we can ensure that our output is
@@ -342,8 +390,8 @@ static void appendNumericEntity(int codepoint, Appendable output)
342390
}
343391

344392
/**
345-
* {@code DECODES_TO_SELF[c]} is true iff the codepoint c decodes to itself in
346-
* an HTML5 text node or properly quoted attribute value.
393+
* {@code IS_BANNED_ASCII[i]}, where i is an ASCII control character codepoint (&lt; 0x20),
394+
* is true for control characters that are not allowed in an XML source text.
347395
*/
348396
private static boolean[] IS_BANNED_ASCII = new boolean[0x20];
349397
static {

src/main/java/org/owasp/html/HtmlEntities.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
/**
3434
* Utilities for decoding HTML entities, e.g., {@code &amp;}.
3535
*/
36-
class HtmlEntities {
36+
final class HtmlEntities {
3737

3838
/**
3939
* Decodes any HTML entity at the given location. This handles both named and

0 commit comments

Comments
 (0)