Skip to content

Commit 834ffad

Browse files
committed
feat: Significant performance improvements
1 parent 75280a8 commit 834ffad

File tree

1 file changed

+48
-138
lines changed

1 file changed

+48
-138
lines changed

src/nodes/html.ts

Lines changed: 48 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import Matcher from '../matcher';
77
import arr_back from '../back';
88
import CommentNode from './comment';
99

10-
// const { decode } = he;
10+
const voidTags = new Set([ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr' ]);
1111

1212
type IRawTagName =
1313
| 'LI'
@@ -154,7 +154,7 @@ export default class HTMLElement extends Node {
154154
*/
155155

156156
private quoteAttribute(attr: string) {
157-
if (attr === null) {
157+
if (attr == null) {
158158
return 'null';
159159
}
160160

@@ -241,6 +241,11 @@ export default class HTMLElement extends Node {
241241
public get localName() {
242242
return this.rawTagName.toLowerCase();
243243
}
244+
245+
public get isVoidElement() {
246+
return voidTags.has(this.localName);
247+
}
248+
244249
/**
245250
* Get escaped (as-is) text value of current node and its children.
246251
* @return {string} text content
@@ -313,14 +318,8 @@ export default class HTMLElement extends Node {
313318
public toString() {
314319
const tag = this.rawTagName;
315320
if (tag) {
316-
// const void_tags = new Set('area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'.split('|'));
317-
// const is_void = void_tags.has(tag);
318-
const is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag);
319321
const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
320-
if (is_void) {
321-
return `<${tag}${attrs}>`;
322-
}
323-
return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
322+
return this.isVoidElement ? `<${tag}${attrs}>` : `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
324323
}
325324
return this.innerHTML;
326325
}
@@ -458,64 +457,6 @@ export default class HTMLElement extends Node {
458457
xmlMode: true,
459458
adapter: Matcher,
460459
});
461-
462-
// let matcher: Matcher;
463-
// if (selector instanceof Matcher) {
464-
// matcher = selector;
465-
// matcher.reset();
466-
// } else {
467-
// if (selector.includes(',')) {
468-
// const selectors = selector.split(',');
469-
// return Array.from(selectors.reduce((pre, cur) => {
470-
// const result = this.querySelectorAll(cur.trim());
471-
// return result.reduce((p, c) => {
472-
// return p.add(c);
473-
// }, pre);
474-
// }, new Set<HTMLElement>()));
475-
// }
476-
// matcher = new Matcher(selector);
477-
// }
478-
// interface IStack {
479-
// 0: Node; // node
480-
// 1: number; // children
481-
// 2: boolean; // found flag
482-
// }
483-
// const stack = [] as IStack[];
484-
// return this.childNodes.reduce((res, cur) => {
485-
// stack.push([cur, 0, false]);
486-
// while (stack.length) {
487-
// const state = arr_back(stack); // get last element
488-
// const el = state[0];
489-
// if (state[1] === 0) {
490-
// // Seen for first time.
491-
// if (el.nodeType !== NodeType.ELEMENT_NODE) {
492-
// stack.pop();
493-
// continue;
494-
// }
495-
// const html_el = el as HTMLElement;
496-
// state[2] = matcher.advance(html_el);
497-
// if (state[2]) {
498-
// if (matcher.matched) {
499-
// res.push(html_el);
500-
// res.push(...(html_el.querySelectorAll(selector)));
501-
// // no need to go further.
502-
// matcher.rewind();
503-
// stack.pop();
504-
// continue;
505-
// }
506-
// }
507-
// }
508-
// if (state[1] < el.childNodes.length) {
509-
// stack.push([el.childNodes[state[1]++], 0, false]);
510-
// } else {
511-
// if (state[2]) {
512-
// matcher.rewind();
513-
// }
514-
// stack.pop();
515-
// }
516-
// }
517-
// return res;
518-
// }, [] as HTMLElement[]);
519460
}
520461

521462
/**
@@ -528,43 +469,6 @@ export default class HTMLElement extends Node {
528469
xmlMode: true,
529470
adapter: Matcher,
530471
});
531-
// let matcher: Matcher;
532-
// if (selector instanceof Matcher) {
533-
// matcher = selector;
534-
// matcher.reset();
535-
// } else {
536-
// matcher = new Matcher(selector);
537-
// }
538-
// const stack = [] as { 0: Node; 1: 0 | 1; 2: boolean }[];
539-
// for (const node of this.childNodes) {
540-
// stack.push([node, 0, false]);
541-
// while (stack.length) {
542-
// const state = arr_back(stack);
543-
// const el = state[0];
544-
// if (state[1] === 0) {
545-
// // Seen for first time.
546-
// if (el.nodeType !== NodeType.ELEMENT_NODE) {
547-
// stack.pop();
548-
// continue;
549-
// }
550-
// state[2] = matcher.advance(el as HTMLElement);
551-
// if (state[2]) {
552-
// if (matcher.matched) {
553-
// return el as HTMLElement;
554-
// }
555-
// }
556-
// }
557-
// if (state[1] < el.childNodes.length) {
558-
// stack.push([el.childNodes[state[1]++], 0, false]);
559-
// } else {
560-
// if (state[2]) {
561-
// matcher.rewind();
562-
// }
563-
// stack.pop();
564-
// }
565-
// }
566-
// }
567-
// return null;
568472
}
569473

570474
/**
@@ -682,7 +586,7 @@ export default class HTMLElement extends Node {
682586
}
683587

684588
/**
685-
* Get escaped (as-it) attributes
589+
* Get escaped (as-is) attributes
686590
* @return {Object} parsed attributes
687591
*/
688592
public get rawAttributes() {
@@ -691,10 +595,13 @@ export default class HTMLElement extends Node {
691595
}
692596
const attrs = {} as RawAttributes;
693597
if (this.rawAttrs) {
694-
const re = /([a-z()#][a-z0-9-_:()#]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/gi;
598+
const re = /([a-zA-Z()#][a-zA-Z0-9-_:()#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?/g;
695599
let match: RegExpExecArray;
696600
while ((match = re.exec(this.rawAttrs))) {
697-
attrs[match[1]] = match[2] || match[3] || match[4] || null;
601+
const key = match[1];
602+
let val = match[2] || null;
603+
if (val && (val[0] === `'` || val[0] === `"`)) val = val.slice(1, val.length - 1);
604+
attrs[key] = val;
698605
}
699606
}
700607
this._rawAttrs = attrs;
@@ -873,12 +780,8 @@ export default class HTMLElement extends Node {
873780
}
874781

875782
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
876-
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*((?=[/>]*?)|(?:.*?[\s\d/'"])|(?:.*?[\w]))(\/?)>/gi;
877-
// <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
878-
// <([a-z][-.:0-9_a-z]*)\s*\/>
879-
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>
880-
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
881-
const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/gi;
783+
const kMarkupPattern = /<!--[\s\S]*?-->|<(\/?)([a-zA-Z][-.:0-9_a-zA-Z]*)((?:\s+[^>]*?(?:(?:'[^']*')|(?:"[^"]*"))?)*)\s*(\/?)>/g;
784+
const kAttributePattern = /(?:^|\s)(id|class)\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+)/gi;
882785
const kSelfClosingElements = {
883786
area: true,
884787
AREA: true,
@@ -995,17 +898,22 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
995898
let match: RegExpExecArray;
996899
// https://github.com/taoqf/node-html-parser/issues/38
997900
data = `<${frameflag}>${data}</${frameflag}>`;
901+
const { lowerCaseTagName } = options;
998902

999903
const dataEndPos = data.length - (frameflag.length + 2);
1000904
const frameFlagOffset = frameflag.length + 2;
1001905

1002906
while ((match = kMarkupPattern.exec(data))) {
1003-
const tagStartPos = kMarkupPattern.lastIndex - match[0].length;
907+
// Note: Object destructuring here consistently tests as higher performance than array destructuring
908+
// eslint-disable-next-line prefer-const
909+
let { 0: matchText, 1: leadingSlash, 2: tagName, 3: attributes, 4: closingSlash } = match;
910+
const matchLength = matchText.length;
911+
const tagStartPos = kMarkupPattern.lastIndex - matchLength;
1004912
const tagEndPos = kMarkupPattern.lastIndex;
1005913

1006914
// Add TextNode if content
1007915
if (lastTextPos > -1) {
1008-
if (lastTextPos + match[0].length < tagEndPos) {
916+
if (lastTextPos + matchLength < tagEndPos) {
1009917
const text = data.substring(lastTextPos, tagStartPos);
1010918
currentParent.appendChild(new TextNode(text, currentParent, createRange(lastTextPos, tagStartPos)));
1011919
}
@@ -1015,10 +923,10 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1015923

1016924
// https://github.com/taoqf/node-html-parser/issues/38
1017925
// Skip frameflag node
1018-
if (match[2] === frameflag) continue;
926+
if (tagName === frameflag) continue;
1019927

1020928
// Handle comments
1021-
if (match[0][1] === '!') {
929+
if (matchText[1] === '!') {
1022930
if (options.comment) {
1023931
// Only keep what is in between <!-- and -->
1024932
const text = data.substring(tagStartPos + 4, tagEndPos - 3);
@@ -1029,27 +937,29 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1029937

1030938
/* -- Handle tag matching -- */
1031939
// Fix tag casing if necessary
1032-
if (options.lowerCaseTagName) match[2] = match[2].toLowerCase();
940+
if (lowerCaseTagName) tagName = tagName.toLowerCase();
1033941

1034942
// Handle opening tags (ie. <this> not </that>)
1035-
if (!match[1]) {
943+
if (!leadingSlash) {
1036944
/* Populate attributes */
1037945
const attrs = {};
1038-
for (let attMatch; (attMatch = kAttributePattern.exec(match[3])); ) {
1039-
attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
946+
for (let attMatch; (attMatch = kAttributePattern.exec(attributes)); ) {
947+
const { 1: key, 2: val } = attMatch;
948+
const isQuoted = val[0] === `'` || val[0] === `"`;
949+
attrs[key.toLowerCase()] = isQuoted ? val.slice(1, val.length - 1) : val;
1040950
}
1041951

1042-
const tagName = currentParent.rawTagName as IRawTagName;
952+
const parentTagName = currentParent.rawTagName as IRawTagName;
1043953

1044-
if (!match[4] && kElementsClosedByOpening[tagName]) {
1045-
if (kElementsClosedByOpening[tagName][match[2]]) {
954+
if (!closingSlash && kElementsClosedByOpening[parentTagName]) {
955+
if (kElementsClosedByOpening[parentTagName][tagName]) {
1046956
stack.pop();
1047957
currentParent = arr_back(stack);
1048958
}
1049959
}
1050960

1051961
// Prevent nested A tags by terminating the last A and starting a new one : see issue #144
1052-
if (match[2] === 'a' || match[2] === 'A') {
962+
if (tagName === 'a' || tagName === 'A') {
1053963
if (noNestedTagIndex !== undefined) {
1054964
stack.splice(noNestedTagIndex);
1055965
currentParent = arr_back(stack);
@@ -1058,23 +968,23 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1058968
}
1059969

1060970
const tagEndPos = kMarkupPattern.lastIndex;
1061-
const tagStartPos = tagEndPos - match[0].length;
971+
const tagStartPos = tagEndPos - matchLength;
1062972

1063973
currentParent = currentParent.appendChild(
1064974
// Initialize range (end position updated later for closed tags)
1065-
new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos, tagEndPos))
975+
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos))
1066976
);
1067977
stack.push(currentParent);
1068978

1069-
if (is_block_text_element(match[2])) {
979+
if (is_block_text_element(tagName)) {
1070980
// Find closing tag
1071-
const closeMarkup = `</${match[2]}>`;
1072-
const closeIndex = options.lowerCaseTagName
981+
const closeMarkup = `</${tagName}>`;
982+
const closeIndex = lowerCaseTagName
1073983
? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex)
1074984
: data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
1075985
const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex;
1076986

1077-
if (element_should_be_ignore(match[2])) {
987+
if (element_should_be_ignore(tagName)) {
1078988
const text = data.substring(tagEndPos, textEndPos);
1079989
if (text.length > 0 && /\S/.test(text)) {
1080990
currentParent.appendChild(new TextNode(text, currentParent, createRange(tagEndPos, textEndPos)));
@@ -1086,26 +996,26 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1086996
} else {
1087997
lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length;
1088998
// Cause to be treated as self-closing, because no close found
1089-
match[1] = 'true';
999+
leadingSlash = '/';
10901000
}
10911001
}
10921002
}
10931003

10941004
// Handle closing tags or self-closed elements (ie </tag> or <br>)
1095-
if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
1005+
if (leadingSlash || closingSlash || kSelfClosingElements[tagName]) {
10961006
while (true) {
1097-
if (match[2] === 'a' || match[2] === 'A') noNestedTagIndex = undefined;
1098-
if (currentParent.rawTagName === match[2]) {
1007+
if (tagName === 'a' || tagName === 'A') noNestedTagIndex = undefined;
1008+
if (currentParent.rawTagName === tagName) {
10991009
// Update range end for closed tag
11001010
(<[number, number]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
11011011
stack.pop();
11021012
currentParent = arr_back(stack);
11031013
break;
11041014
} else {
1105-
const tagName = currentParent.tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH';
1015+
const parentTagName = currentParent.tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH';
11061016
// Trying to close current tag, and move on
1107-
if (kElementsClosedByClosing[tagName]) {
1108-
if (kElementsClosedByClosing[tagName][match[2]]) {
1017+
if (kElementsClosedByClosing[parentTagName]) {
1018+
if (kElementsClosedByClosing[parentTagName][tagName]) {
11091019
stack.pop();
11101020
currentParent = arr_back(stack);
11111021
continue;

0 commit comments

Comments
 (0)