Skip to content

Commit

Permalink
Mozilla bug 1499682 - SIMD-accelerate the HTML tokenizer in the inner…
Browse files Browse the repository at this point in the history
…HTML case.

Differential Revision: https://phabricator.services.mozilla.com/D227317
  • Loading branch information
hsivonen authored and sideshowbarker committed Dec 7, 2024
1 parent bfcfee0 commit 620099d
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 48 deletions.
168 changes: 121 additions & 47 deletions src/nu/validator/htmlparser/impl/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -1493,12 +1493,22 @@ public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
*/
// CPPONLY: if (mViewSource) {
// CPPONLY: mViewSource.SetBuffer(buffer);
// CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: if (htmlaccel_enabled()) {
// CPPONLY: pos = StateLoopViewSourceSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: } else {
// CPPONLY: pos = StateLoopViewSourceALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: }
// CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
// CPPONLY: } else if (tokenHandler.WantsLineAndColumn()) {
// CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: if (htmlaccel_enabled()) {
// CPPONLY: pos = StateLoopLineColSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: } else {
// CPPONLY: pos = StateLoopLineColALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: }
// CPPONLY: } else if (htmlaccel_enabled() && ((buffer.getEnd() - pos) >= 32)) {
// CPPONLY: pos = StateLoopFastestSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: } else {
// CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: pos = StateLoopFastestALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
// CPPONLY: }
// [NOCPP[
pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
Expand Down Expand Up @@ -1623,54 +1633,118 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
switch (state) {
case DATA:
dataloop: for (;;) {
// Ideally this reconsume block would be a separate state, DATA_RECONSUME above this one
// with fallthrough into this state. However, such a change would be disruptive to
// TransitionHandler and everything that works with returnState.
if (reconsume) {
reconsume = false;
} else {
if (++pos == endPos) {
break stateloop;
// This is a manual copy of the switch below with break/continue
// adjusted as relevant. Make sure to keep in sync with the switch below!
switch (c) {
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in data state.
*/
flushChars(buf, pos);
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('\u0000');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
continue stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the tag
* open state.
*/
flushChars(buf, pos);

state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
// `break` optimizes; `continue stateloop;` would be valid
break dataloop;
case '\u0000':
maybeEmitReplacementCharacter(buf, pos);
break;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
// CPPONLY: MOZ_FALLTHROUGH;
default:
/*
* Anything else Emit the input character as a
* character token.
*
* Stay in the data state.
*/
break;
}
c = checkChar(buf, pos);
}
switch (c) {
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in data state.
*/
flushChars(buf, pos);
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('\u0000');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
continue stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the tag
* open state.
*/
flushChars(buf, pos);

state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
// `break` optimizes; `continue stateloop;` would be valid
break dataloop;
case '\u0000':
maybeEmitReplacementCharacter(buf, pos);
continue;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
// CPPONLY: MOZ_FALLTHROUGH;
default:
/*
* Anything else Emit the input character as a
* character token.
*
* Stay in the data state.
*/
continue;
datamiddle: for (;;) {
++pos;
// Perhaps at some point, it will be appropriate to do SIMD in Java, but not today.
// The line below advances pos by some number of code units that this state is indifferent to.
// CPPONLY: pos += accelerateData(buf, pos, endPos);
for (;;) {
if (pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
// Make sure to keep in sync with the switch above in the reconsume block!
switch (c) {
case '&':
/*
* U+0026 AMPERSAND (&) Switch to the character
* reference in data state.
*/
flushChars(buf, pos);
assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
appendCharRefBuf(c);
setAdditionalAndRememberAmpersandLocation('\u0000');
returnState = state;
state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
continue stateloop;
case '<':
/*
* U+003C LESS-THAN SIGN (<) Switch to the tag
* open state.
*/
flushChars(buf, pos);

state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
// `break` optimizes; `continue stateloop;` would be valid
break dataloop;
case '\u0000':
maybeEmitReplacementCharacter(buf, pos);
// Climb back to the SIMD path.
continue datamiddle;
case '\r':
emitCarriageReturn(buf, pos);
break stateloop;
case '\n':
silentLineFeed();
// Climb back to the SIMD path.
continue datamiddle;
default:
/*
* Anything else Emit the input character as a
* character token.
*
* Stay in the data state.
*/
// Don't go back to SIMD. We have less than a SIMD
// stride to go if we come here in the SIMD case with
// the fastest loop policy. With other policies, we
// can come here due to a non-BMP character, in which
// case we stay on the ALU path until the end of the
// line.
// We need to increment pos!
++pos;
continue;
}
}
}
}
// CPPONLY: MOZ_FALLTHROUGH;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public class CppTypes {
"nsHtml5ArrayCopy", "nsHtml5AtomTable", "nsHtml5DocumentMode",
"nsHtml5Highlighter", "nsHtml5Macros", "nsHtml5NamedCharacters",
"nsHtml5NamedCharactersAccel", "nsHtml5String",
"nsIContent", "nsTraceRefcnt" };
"nsIContent", "nsTraceRefcnt", "mozilla/parser/htmlaccel_enabled" };

private static final String[] STACK_NODE_INCLUDES = { "nsAtom", "nsHtml5AtomTable",
"nsHtml5HtmlAttributes", "nsHtml5String", "nsNameSpaceManager", "nsIContent",
Expand Down Expand Up @@ -486,6 +486,10 @@ public String checkChar() {
return "P::checkChar";
}

/**
 * Supplies the C++ callee text that the translator prints where the Java
 * source calls {@code accelerateData(...)}.
 *
 * @return the string {@code "P::accelerateData"}
 */
public String accelerateData() {
return "P::accelerateData";
}

/**
 * Supplies the C++ callee text that the translator prints where the Java
 * source calls {@code silentLineFeed()}.
 *
 * @return the string {@code "P::silentLineFeed"}
 */
public String silentLineFeed() {
return "P::silentLineFeed";
}
Expand Down Expand Up @@ -541,4 +545,8 @@ public String crashMacro() {
/**
 * Supplies the name {@code "nsHtml5TokenizerLoopPolicies"}.
 *
 * NOTE(review): judging by the method name this is presumably the header to
 * include for the tokenizer loop policies; the call site is not visible
 * here, so confirm against the generator.
 *
 * @return the string {@code "nsHtml5TokenizerLoopPolicies"}
 */
public String loopPolicyInclude() {
return "nsHtml5TokenizerLoopPolicies";
}

/**
 * Supplies the C++ macro name that the method-declaration printer emits
 * after {@code template<class P>} for the {@code stateLoop} method.
 *
 * @return the string {@code "MOZ_ALWAYS_INLINE_EVEN_DEBUG"}
 */
public String alwaysInline() {
return "MOZ_ALWAYS_INLINE_EVEN_DEBUG";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1320,6 +1320,9 @@ public void visit(MethodCallExpr n, LocalSymbolTable arg) {
} else if ("checkChar".equals(n.getName())
&& n.getScope() == null) {
visitCheckChar(n, arg);
} else if ("accelerateData".equals(n.getName())
&& n.getScope() == null) {
visitAccelerateData(n, arg);
} else if ("silentCarriageReturn".equals(n.getName())
&& n.getScope() == null) {
visitSilentCarriageReturn(n, arg);
Expand Down Expand Up @@ -1650,6 +1653,10 @@ protected void printMethodDeclaration(MethodDeclaration n,
&& "Tokenizer".equals(javaClassName)
&& cppTypes.stateLoopPolicies().length > 0) {
printer.print("template<class P>");
if ("stateLoop".equals(n.getName())) {
printer.print(" ");
printer.print(cppTypes.alwaysInline());
}
if (inHeader()) {
printer.print(" ");
} else {
Expand Down Expand Up @@ -1956,6 +1963,22 @@ private void visitCheckChar(MethodCallExpr call, LocalSymbolTable arg) {
printer.print(")");
}

/**
 * Prints the C++ replacement for a Java {@code accelerateData(...)} call.
 *
 * The output is the callee text from {@code cppTypes.accelerateData()},
 * followed by an argument list whose first argument is {@code this} and
 * whose remaining arguments are the translated arguments of the Java call,
 * in order.
 *
 * @param call the Java method call expression being translated
 * @param arg  the symbol table passed through to each argument's visitor
 */
private void visitAccelerateData(MethodCallExpr call, LocalSymbolTable arg) {
    List<Expression> args = call.getArgs();
    printer.print(cppTypes.accelerateData());
    // Emit the separator before each extra argument rather than after
    // "this", so a call with no arguments yields "(this)" and not the
    // syntactically invalid "(this, )".
    printer.print("(this");
    if (args != null) {
        for (Expression e : args) {
            printer.print(", ");
            e.accept(this, arg);
        }
    }
    printer.print(")");
}

private void visitSilentLineFeed(MethodCallExpr call, LocalSymbolTable arg) {
printer.print(cppTypes.silentLineFeed());
printer.print("(this)");
Expand Down

0 comments on commit 620099d

Please sign in to comment.