From 620099d4ea7bec80f5083fe792c0a42121dc2ae2 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Fri, 29 Nov 2024 13:34:19 +0200 Subject: [PATCH] Mozilla bug 1499682 - SIMD-accelerate the HTML tokenizer in the innerHTML case. Differential Revision: https://phabricator.services.mozilla.com/D227317 --- .../validator/htmlparser/impl/Tokenizer.java | 168 +++++++++++++----- .../htmlparser/cpptranslate/CppTypes.java | 10 +- .../htmlparser/cpptranslate/CppVisitor.java | 23 +++ 3 files changed, 153 insertions(+), 48 deletions(-) diff --git a/src/nu/validator/htmlparser/impl/Tokenizer.java b/src/nu/validator/htmlparser/impl/Tokenizer.java index 6d7c3491..ac42bea2 100755 --- a/src/nu/validator/htmlparser/impl/Tokenizer.java +++ b/src/nu/validator/htmlparser/impl/Tokenizer.java @@ -1493,12 +1493,22 @@ public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException { */ // CPPONLY: if (mViewSource) { // CPPONLY: mViewSource.SetBuffer(buffer); - // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: if (htmlaccel_enabled()) { + // CPPONLY: pos = StateLoopViewSourceSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: } else { + // CPPONLY: pos = StateLoopViewSourceALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: } // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1); // CPPONLY: } else if (tokenHandler.WantsLineAndColumn()) { - // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: if (htmlaccel_enabled()) { + // CPPONLY: pos = StateLoopLineColSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: } else { + // CPPONLY: pos = StateLoopLineColALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: } + // CPPONLY: } else if (htmlaccel_enabled() && ((buffer.getEnd() - pos) >= 32)) { + // CPPONLY: pos = StateLoopFastestSIMD(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); // CPPONLY: } else { - // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: pos = StateLoopFastestALU(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); // CPPONLY: } // [NOCPP[ pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, @@ -1623,54 +1633,118 @@ private void ensureBufferSpace(int inputLength) throws SAXException { switch (state) { case DATA: dataloop: for (;;) { + // Ideally this reconsume block would be a separate state, DATA_RECONSUME above this one + // with fallthrough into this state. However, such a change would be disruptive to + // TransitionHandler and everything that works with returnState. if (reconsume) { reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; + // This is a manual copy of the switch below with break/continue + // adjusted as relevant. Make sure to keep in sync with the switch below! + switch (c) { + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the character + * reference in data state. + */ + flushChars(buf, pos); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); + setAdditionalAndRememberAmpersandLocation('\u0000'); + returnState = state; + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + continue stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the tag + * open state. + */ + flushChars(buf, pos); + + state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid + break dataloop; + case '\u0000': + maybeEmitReplacementCharacter(buf, pos); + break; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + // CPPONLY: MOZ_FALLTHROUGH; + default: + /* + * Anything else Emit the input character as a + * character token. + * + * Stay in the data state. + */ + break; } - c = checkChar(buf, pos); } - switch (c) { - case '&': - /* - * U+0026 AMPERSAND (&) Switch to the character - * reference in data state. - */ - flushChars(buf, pos); - assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; - appendCharRefBuf(c); - setAdditionalAndRememberAmpersandLocation('\u0000'); - returnState = state; - state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); - continue stateloop; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the tag - * open state. - */ - flushChars(buf, pos); - - state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); - // `break` optimizes; `continue stateloop;` would be valid - break dataloop; - case '\u0000': - maybeEmitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - // CPPONLY: MOZ_FALLTHROUGH; - default: - /* - * Anything else Emit the input character as a - * character token. - * - * Stay in the data state. - */ - continue; + datamiddle: for (;;) { + ++pos; + // Perhaps at some point, it will be appropriate to do SIMD in Java, but not today. + // The line below advances pos by some number of code units that this state is indifferent to. + // CPPONLY: pos += accelerateData(buf, pos, endPos); + for (;;) { + if (pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + // Make sure to keep in sync with the switch above in the reconsume block! + switch (c) { + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the character + * reference in data state. + */ + flushChars(buf, pos); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); + setAdditionalAndRememberAmpersandLocation('\u0000'); + returnState = state; + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + continue stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the tag + * open state. + */ + flushChars(buf, pos); + + state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); + // `break` optimizes; `continue stateloop;` would be valid + break dataloop; + case '\u0000': + maybeEmitReplacementCharacter(buf, pos); + // Climb back to the SIMD path. + continue datamiddle; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + // Climb back to the SIMD path. + continue datamiddle; + default: + /* + * Anything else Emit the input character as a + * character token. + * + * Stay in the data state. + */ + // Don't go back to SIMD. We have less than a SIMD + // stride to go if we come here in the SIMD case with + // the fastest loop policy. With other policies, we + // can come here due to a non-BMP character, in which + // case we stay on the ALU path until the end of the + // line. + // We need to increment pos! + ++pos; + continue; + } + } } } // CPPONLY: MOZ_FALLTHROUGH; diff --git a/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java b/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java index 7e122e93..133ceea5 100644 --- a/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java +++ b/translator-src/nu/validator/htmlparser/cpptranslate/CppTypes.java @@ -96,7 +96,7 @@ public class CppTypes { "nsHtml5ArrayCopy", "nsHtml5AtomTable", "nsHtml5DocumentMode", "nsHtml5Highlighter", "nsHtml5Macros", "nsHtml5NamedCharacters", "nsHtml5NamedCharactersAccel", "nsHtml5String", - "nsIContent", "nsTraceRefcnt" }; + "nsIContent", "nsTraceRefcnt", "mozilla/parser/htmlaccel_enabled" }; private static final String[] STACK_NODE_INCLUDES = { "nsAtom", "nsHtml5AtomTable", "nsHtml5HtmlAttributes", "nsHtml5String", "nsNameSpaceManager", "nsIContent", @@ -486,6 +486,10 @@ public String checkChar() { return "P::checkChar"; } + public String accelerateData() { + return "P::accelerateData"; + } + public String silentLineFeed() { return "P::silentLineFeed"; } @@ -541,4 +545,8 @@ public String crashMacro() { public String loopPolicyInclude() { return "nsHtml5TokenizerLoopPolicies"; } + + public String alwaysInline() { + return "MOZ_ALWAYS_INLINE_EVEN_DEBUG"; + } } diff --git a/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java b/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java index 4aed1fe0..dbe3f82a 100755 --- a/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java +++ b/translator-src/nu/validator/htmlparser/cpptranslate/CppVisitor.java @@ -1320,6 +1320,9 @@ public void visit(MethodCallExpr n, LocalSymbolTable arg) { } else if ("checkChar".equals(n.getName()) && n.getScope() == null) { visitCheckChar(n, arg); + } else if ("accelerateData".equals(n.getName()) + && n.getScope() == null) { + visitAccelerateData(n, arg); } else if ("silentCarriageReturn".equals(n.getName()) && n.getScope() == null) { visitSilentCarriageReturn(n, arg); @@ -1650,6 +1653,10 @@ protected void printMethodDeclaration(MethodDeclaration n, && "Tokenizer".equals(javaClassName) && cppTypes.stateLoopPolicies().length > 0) { printer.print("template"); + if ("stateLoop".equals(n.getName())) { + printer.print(" "); + printer.print(cppTypes.alwaysInline()); + } if (inHeader()) { printer.print(" "); } else { @@ -1956,6 +1963,22 @@ private void visitCheckChar(MethodCallExpr call, LocalSymbolTable arg) { printer.print(")"); } + private void visitAccelerateData(MethodCallExpr call, LocalSymbolTable arg) { + List args = call.getArgs(); + printer.print(cppTypes.accelerateData()); + printer.print("(this, "); + if (call.getArgs() != null) { + for (Iterator i = call.getArgs().iterator(); i.hasNext();) { + Expression e = i.next(); + e.accept(this, arg); + if (i.hasNext()) { + printer.print(", "); + } + } + } + printer.print(")"); + } + private void visitSilentLineFeed(MethodCallExpr call, LocalSymbolTable arg) { printer.print(cppTypes.silentLineFeed()); printer.print("(this)");