Skip to content

Commit 3c5b8e2

Browse files
nonaraRon S
authored and
Ron S
committed
feat: Improved parsing performance + matching (closes #164)
1 parent b387a51 commit 3c5b8e2

File tree

1 file changed

+48
-138
lines changed

1 file changed

+48
-138
lines changed

src/nodes/html.ts

Lines changed: 48 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import Matcher from '../matcher';
77
import arr_back from '../back';
88
import CommentNode from './comment';
99

10-
// const { decode } = he;
10+
const voidTags = new Set([ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr' ]);
1111

1212
type IRawTagName =
1313
| 'LI'
@@ -154,7 +154,7 @@ export default class HTMLElement extends Node {
154154
*/
155155

156156
private quoteAttribute(attr: string) {
157-
if (attr === null) {
157+
if (attr == null) {
158158
return 'null';
159159
}
160160

@@ -241,6 +241,11 @@ export default class HTMLElement extends Node {
241241
public get localName() {
242242
return this.rawTagName.toLowerCase();
243243
}
244+
245+
public get isVoidElement() {
246+
return voidTags.has(this.localName);
247+
}
248+
244249
/**
245250
* Get escaped (as-is) text value of current node and its children.
246251
* @return {string} text content
@@ -313,14 +318,8 @@ export default class HTMLElement extends Node {
313318
public toString() {
314319
const tag = this.rawTagName;
315320
if (tag) {
316-
// const void_tags = new Set('area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'.split('|'));
317-
// const is_void = void_tags.has(tag);
318-
const is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag);
319321
const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
320-
if (is_void) {
321-
return `<${tag}${attrs}>`;
322-
}
323-
return `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
322+
return this.isVoidElement ? `<${tag}${attrs}>` : `<${tag}${attrs}>${this.innerHTML}</${tag}>`;
324323
}
325324
return this.innerHTML;
326325
}
@@ -458,64 +457,6 @@ export default class HTMLElement extends Node {
458457
xmlMode: true,
459458
adapter: Matcher,
460459
});
461-
462-
// let matcher: Matcher;
463-
// if (selector instanceof Matcher) {
464-
// matcher = selector;
465-
// matcher.reset();
466-
// } else {
467-
// if (selector.includes(',')) {
468-
// const selectors = selector.split(',');
469-
// return Array.from(selectors.reduce((pre, cur) => {
470-
// const result = this.querySelectorAll(cur.trim());
471-
// return result.reduce((p, c) => {
472-
// return p.add(c);
473-
// }, pre);
474-
// }, new Set<HTMLElement>()));
475-
// }
476-
// matcher = new Matcher(selector);
477-
// }
478-
// interface IStack {
479-
// 0: Node; // node
480-
// 1: number; // children
481-
// 2: boolean; // found flag
482-
// }
483-
// const stack = [] as IStack[];
484-
// return this.childNodes.reduce((res, cur) => {
485-
// stack.push([cur, 0, false]);
486-
// while (stack.length) {
487-
// const state = arr_back(stack); // get last element
488-
// const el = state[0];
489-
// if (state[1] === 0) {
490-
// // Seen for first time.
491-
// if (el.nodeType !== NodeType.ELEMENT_NODE) {
492-
// stack.pop();
493-
// continue;
494-
// }
495-
// const html_el = el as HTMLElement;
496-
// state[2] = matcher.advance(html_el);
497-
// if (state[2]) {
498-
// if (matcher.matched) {
499-
// res.push(html_el);
500-
// res.push(...(html_el.querySelectorAll(selector)));
501-
// // no need to go further.
502-
// matcher.rewind();
503-
// stack.pop();
504-
// continue;
505-
// }
506-
// }
507-
// }
508-
// if (state[1] < el.childNodes.length) {
509-
// stack.push([el.childNodes[state[1]++], 0, false]);
510-
// } else {
511-
// if (state[2]) {
512-
// matcher.rewind();
513-
// }
514-
// stack.pop();
515-
// }
516-
// }
517-
// return res;
518-
// }, [] as HTMLElement[]);
519460
}
520461

521462
/**
@@ -528,43 +469,6 @@ export default class HTMLElement extends Node {
528469
xmlMode: true,
529470
adapter: Matcher,
530471
});
531-
// let matcher: Matcher;
532-
// if (selector instanceof Matcher) {
533-
// matcher = selector;
534-
// matcher.reset();
535-
// } else {
536-
// matcher = new Matcher(selector);
537-
// }
538-
// const stack = [] as { 0: Node; 1: 0 | 1; 2: boolean }[];
539-
// for (const node of this.childNodes) {
540-
// stack.push([node, 0, false]);
541-
// while (stack.length) {
542-
// const state = arr_back(stack);
543-
// const el = state[0];
544-
// if (state[1] === 0) {
545-
// // Seen for first time.
546-
// if (el.nodeType !== NodeType.ELEMENT_NODE) {
547-
// stack.pop();
548-
// continue;
549-
// }
550-
// state[2] = matcher.advance(el as HTMLElement);
551-
// if (state[2]) {
552-
// if (matcher.matched) {
553-
// return el as HTMLElement;
554-
// }
555-
// }
556-
// }
557-
// if (state[1] < el.childNodes.length) {
558-
// stack.push([el.childNodes[state[1]++], 0, false]);
559-
// } else {
560-
// if (state[2]) {
561-
// matcher.rewind();
562-
// }
563-
// stack.pop();
564-
// }
565-
// }
566-
// }
567-
// return null;
568472
}
569473

570474
/**
@@ -727,7 +631,7 @@ export default class HTMLElement extends Node {
727631
}
728632

729633
/**
730-
* Get escaped (as-it) attributes
634+
* Get escaped (as-is) attributes
731635
* @return {Object} parsed attributes
732636
*/
733637
public get rawAttributes() {
@@ -736,10 +640,13 @@ export default class HTMLElement extends Node {
736640
}
737641
const attrs = {} as RawAttributes;
738642
if (this.rawAttrs) {
739-
const re = /([a-z()#][a-z0-9-_:()#]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/gi;
643+
const re = /([a-zA-Z()#][a-zA-Z0-9-_:()#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?/g;
740644
let match: RegExpExecArray;
741645
while ((match = re.exec(this.rawAttrs))) {
742-
attrs[match[1]] = match[2] || match[3] || match[4] || null;
646+
const key = match[1];
647+
let val = match[2] || null;
648+
if (val && (val[0] === `'` || val[0] === `"`)) val = val.slice(1, val.length - 1);
649+
attrs[key] = val;
743650
}
744651
}
745652
this._rawAttrs = attrs;
@@ -918,12 +825,8 @@ export default class HTMLElement extends Node {
918825
}
919826

920827
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
921-
const kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*((?=[/>]*?)|(?:.*?[\s\d/'"])|(?:.*?[\w]))(\/?)>/gi;
922-
// <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
923-
// <([a-z][-.:0-9_a-z]*)\s*\/>
924-
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>
925-
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
926-
const kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/gi;
828+
const kMarkupPattern = /<!--[\s\S]*?-->|<(\/?)([a-zA-Z][-.:0-9_a-zA-Z]*)((?:\s+[^>]*?(?:(?:'[^']*')|(?:"[^"]*"))?)*)\s*(\/?)>/g;
829+
const kAttributePattern = /(?:^|\s)(id|class)\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+)/gi;
927830
const kSelfClosingElements = {
928831
area: true,
929832
AREA: true,
@@ -1040,17 +943,22 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1040943
let match: RegExpExecArray;
1041944
// https://github.com/taoqf/node-html-parser/issues/38
1042945
data = `<${frameflag}>${data}</${frameflag}>`;
946+
const { lowerCaseTagName } = options;
1043947

1044948
const dataEndPos = data.length - (frameflag.length + 2);
1045949
const frameFlagOffset = frameflag.length + 2;
1046950

1047951
while ((match = kMarkupPattern.exec(data))) {
1048-
const tagStartPos = kMarkupPattern.lastIndex - match[0].length;
952+
// Note: Object destructuring here consistently tests as higher performance than array destructuring
953+
// eslint-disable-next-line prefer-const
954+
let { 0: matchText, 1: leadingSlash, 2: tagName, 3: attributes, 4: closingSlash } = match;
955+
const matchLength = matchText.length;
956+
const tagStartPos = kMarkupPattern.lastIndex - matchLength;
1049957
const tagEndPos = kMarkupPattern.lastIndex;
1050958

1051959
// Add TextNode if content
1052960
if (lastTextPos > -1) {
1053-
if (lastTextPos + match[0].length < tagEndPos) {
961+
if (lastTextPos + matchLength < tagEndPos) {
1054962
const text = data.substring(lastTextPos, tagStartPos);
1055963
currentParent.appendChild(new TextNode(text, currentParent, createRange(lastTextPos, tagStartPos)));
1056964
}
@@ -1060,10 +968,10 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1060968

1061969
// https://github.com/taoqf/node-html-parser/issues/38
1062970
// Skip frameflag node
1063-
if (match[2] === frameflag) continue;
971+
if (tagName === frameflag) continue;
1064972

1065973
// Handle comments
1066-
if (match[0][1] === '!') {
974+
if (matchText[1] === '!') {
1067975
if (options.comment) {
1068976
// Only keep what is in between <!-- and -->
1069977
const text = data.substring(tagStartPos + 4, tagEndPos - 3);
@@ -1074,27 +982,29 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1074982

1075983
/* -- Handle tag matching -- */
1076984
// Fix tag casing if necessary
1077-
if (options.lowerCaseTagName) match[2] = match[2].toLowerCase();
985+
if (lowerCaseTagName) tagName = tagName.toLowerCase();
1078986

1079987
// Handle opening tags (i.e. <this>, not </that>)
1080-
if (!match[1]) {
988+
if (!leadingSlash) {
1081989
/* Populate attributes */
1082990
const attrs = {};
1083-
for (let attMatch; (attMatch = kAttributePattern.exec(match[3])); ) {
1084-
attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
991+
for (let attMatch; (attMatch = kAttributePattern.exec(attributes)); ) {
992+
const { 1: key, 2: val } = attMatch;
993+
const isQuoted = val[0] === `'` || val[0] === `"`;
994+
attrs[key.toLowerCase()] = isQuoted ? val.slice(1, val.length - 1) : val;
1085995
}
1086996

1087-
const tagName = currentParent.rawTagName as IRawTagName;
997+
const parentTagName = currentParent.rawTagName as IRawTagName;
1088998

1089-
if (!match[4] && kElementsClosedByOpening[tagName]) {
1090-
if (kElementsClosedByOpening[tagName][match[2]]) {
999+
if (!closingSlash && kElementsClosedByOpening[parentTagName]) {
1000+
if (kElementsClosedByOpening[parentTagName][tagName]) {
10911001
stack.pop();
10921002
currentParent = arr_back(stack);
10931003
}
10941004
}
10951005

10961006
// Prevent nested A tags by terminating the last A and starting a new one : see issue #144
1097-
if (match[2] === 'a' || match[2] === 'A') {
1007+
if (tagName === 'a' || tagName === 'A') {
10981008
if (noNestedTagIndex !== undefined) {
10991009
stack.splice(noNestedTagIndex);
11001010
currentParent = arr_back(stack);
@@ -1103,23 +1013,23 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
11031013
}
11041014

11051015
const tagEndPos = kMarkupPattern.lastIndex;
1106-
const tagStartPos = tagEndPos - match[0].length;
1016+
const tagStartPos = tagEndPos - matchLength;
11071017

11081018
currentParent = currentParent.appendChild(
11091019
// Initialize range (end position updated later for closed tags)
1110-
new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos, tagEndPos))
1020+
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos))
11111021
);
11121022
stack.push(currentParent);
11131023

1114-
if (is_block_text_element(match[2])) {
1024+
if (is_block_text_element(tagName)) {
11151025
// Find closing tag
1116-
const closeMarkup = `</${match[2]}>`;
1117-
const closeIndex = options.lowerCaseTagName
1026+
const closeMarkup = `</${tagName}>`;
1027+
const closeIndex = lowerCaseTagName
11181028
? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex)
11191029
: data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
11201030
const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex;
11211031

1122-
if (element_should_be_ignore(match[2])) {
1032+
if (element_should_be_ignore(tagName)) {
11231033
const text = data.substring(tagEndPos, textEndPos);
11241034
if (text.length > 0 && /\S/.test(text)) {
11251035
currentParent.appendChild(new TextNode(text, currentParent, createRange(tagEndPos, textEndPos)));
@@ -1131,26 +1041,26 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
11311041
} else {
11321042
lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length;
11331043
// Cause to be treated as self-closing, because no close found
1134-
match[1] = 'true';
1044+
leadingSlash = '/';
11351045
}
11361046
}
11371047
}
11381048

11391049
// Handle closing tags or self-closed elements (i.e. </tag> or <br>)
1140-
if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
1050+
if (leadingSlash || closingSlash || kSelfClosingElements[tagName]) {
11411051
while (true) {
1142-
if (match[2] === 'a' || match[2] === 'A') noNestedTagIndex = undefined;
1143-
if (currentParent.rawTagName === match[2]) {
1052+
if (tagName === 'a' || tagName === 'A') noNestedTagIndex = undefined;
1053+
if (currentParent.rawTagName === tagName) {
11441054
// Update range end for closed tag
11451055
(<[number, number]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
11461056
stack.pop();
11471057
currentParent = arr_back(stack);
11481058
break;
11491059
} else {
1150-
const tagName = currentParent.tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH';
1060+
const parentTagName = currentParent.tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH';
11511061
// Trying to close current tag, and move on
1152-
if (kElementsClosedByClosing[tagName]) {
1153-
if (kElementsClosedByClosing[tagName][match[2]]) {
1062+
if (kElementsClosedByClosing[parentTagName]) {
1063+
if (kElementsClosedByClosing[parentTagName][tagName]) {
11541064
stack.pop();
11551065
currentParent = arr_back(stack);
11561066
continue;

0 commit comments

Comments
 (0)