@@ -7,7 +7,7 @@ import Matcher from '../matcher';
7
7
import arr_back from '../back' ;
8
8
import CommentNode from './comment' ;
9
9
10
- // const { decode } = he ;
10
+ const voidTags = new Set ( [ 'area' , 'base' , 'br' , 'col' , 'embed' , 'hr' , 'img' , 'input' , 'link' , 'meta' , 'param' , 'source' , 'track' , 'wbr' ] ) ;
11
11
12
12
type IRawTagName =
13
13
| 'LI'
@@ -154,7 +154,7 @@ export default class HTMLElement extends Node {
154
154
*/
155
155
156
156
private quoteAttribute ( attr : string ) {
157
- if ( attr === null ) {
157
+ if ( attr == null ) {
158
158
return 'null' ;
159
159
}
160
160
@@ -241,6 +241,11 @@ export default class HTMLElement extends Node {
241
241
/**
 * Lower-cased form of this element's raw tag name.
 * @return {string} `rawTagName` converted with `toLowerCase()`
 */
public get localName() {
  const rawName = this.rawTagName;
  return rawName.toLowerCase();
}
244
+
245
/**
 * Whether this element's lower-cased tag name is listed in the
 * module-level `voidTags` set (area, base, br, col, ...).
 * @return {boolean} true when `voidTags` contains `localName`
 */
public get isVoidElement() {
  const name = this.localName;
  return voidTags.has(name);
}
248
+
244
249
/**
245
250
* Get escaped (as-is) text value of current node and its children.
246
251
* @return {string } text content
@@ -313,14 +318,8 @@ export default class HTMLElement extends Node {
313
318
/**
 * Serialize this element to markup.
 *
 * When the element has no raw tag name, only its inner HTML is returned.
 * Otherwise the opening tag (with the raw attribute string, if any) is
 * emitted; void elements stop there, all other elements also get their
 * inner HTML and a closing tag.
 * @return {string} markup for this element
 */
public toString() {
  const tag = this.rawTagName;
  // Tag-less element: nothing to wrap the children in.
  if (!tag) {
    return this.innerHTML;
  }

  // Raw attributes are emitted verbatim, separated from the tag name by one space.
  const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
  const openTag = `<${tag}${attrs}>`;

  // Void elements have neither content nor a closing tag.
  if (this.isVoidElement) {
    return openTag;
  }
  return `${openTag}${this.innerHTML}</${tag}>`;
}
@@ -458,64 +457,6 @@ export default class HTMLElement extends Node {
458
457
xmlMode : true ,
459
458
adapter : Matcher ,
460
459
} ) ;
461
-
462
- // let matcher: Matcher;
463
- // if (selector instanceof Matcher) {
464
- // matcher = selector;
465
- // matcher.reset();
466
- // } else {
467
- // if (selector.includes(',')) {
468
- // const selectors = selector.split(',');
469
- // return Array.from(selectors.reduce((pre, cur) => {
470
- // const result = this.querySelectorAll(cur.trim());
471
- // return result.reduce((p, c) => {
472
- // return p.add(c);
473
- // }, pre);
474
- // }, new Set<HTMLElement>()));
475
- // }
476
- // matcher = new Matcher(selector);
477
- // }
478
- // interface IStack {
479
- // 0: Node; // node
480
- // 1: number; // children
481
- // 2: boolean; // found flag
482
- // }
483
- // const stack = [] as IStack[];
484
- // return this.childNodes.reduce((res, cur) => {
485
- // stack.push([cur, 0, false]);
486
- // while (stack.length) {
487
- // const state = arr_back(stack); // get last element
488
- // const el = state[0];
489
- // if (state[1] === 0) {
490
- // // Seen for first time.
491
- // if (el.nodeType !== NodeType.ELEMENT_NODE) {
492
- // stack.pop();
493
- // continue;
494
- // }
495
- // const html_el = el as HTMLElement;
496
- // state[2] = matcher.advance(html_el);
497
- // if (state[2]) {
498
- // if (matcher.matched) {
499
- // res.push(html_el);
500
- // res.push(...(html_el.querySelectorAll(selector)));
501
- // // no need to go further.
502
- // matcher.rewind();
503
- // stack.pop();
504
- // continue;
505
- // }
506
- // }
507
- // }
508
- // if (state[1] < el.childNodes.length) {
509
- // stack.push([el.childNodes[state[1]++], 0, false]);
510
- // } else {
511
- // if (state[2]) {
512
- // matcher.rewind();
513
- // }
514
- // stack.pop();
515
- // }
516
- // }
517
- // return res;
518
- // }, [] as HTMLElement[]);
519
460
}
520
461
521
462
/**
@@ -528,43 +469,6 @@ export default class HTMLElement extends Node {
528
469
xmlMode : true ,
529
470
adapter : Matcher ,
530
471
} ) ;
531
- // let matcher: Matcher;
532
- // if (selector instanceof Matcher) {
533
- // matcher = selector;
534
- // matcher.reset();
535
- // } else {
536
- // matcher = new Matcher(selector);
537
- // }
538
- // const stack = [] as { 0: Node; 1: 0 | 1; 2: boolean }[];
539
- // for (const node of this.childNodes) {
540
- // stack.push([node, 0, false]);
541
- // while (stack.length) {
542
- // const state = arr_back(stack);
543
- // const el = state[0];
544
- // if (state[1] === 0) {
545
- // // Seen for first time.
546
- // if (el.nodeType !== NodeType.ELEMENT_NODE) {
547
- // stack.pop();
548
- // continue;
549
- // }
550
- // state[2] = matcher.advance(el as HTMLElement);
551
- // if (state[2]) {
552
- // if (matcher.matched) {
553
- // return el as HTMLElement;
554
- // }
555
- // }
556
- // }
557
- // if (state[1] < el.childNodes.length) {
558
- // stack.push([el.childNodes[state[1]++], 0, false]);
559
- // } else {
560
- // if (state[2]) {
561
- // matcher.rewind();
562
- // }
563
- // stack.pop();
564
- // }
565
- // }
566
- // }
567
- // return null;
568
472
}
569
473
570
474
/**
@@ -727,7 +631,7 @@ export default class HTMLElement extends Node {
727
631
}
728
632
729
633
/**
730
- * Get escaped (as-it ) attributes
634
+ * Get escaped (as-is ) attributes
731
635
* @return {Object } parsed attributes
732
636
*/
733
637
public get rawAttributes ( ) {
@@ -736,10 +640,13 @@ export default class HTMLElement extends Node {
736
640
}
737
641
const attrs = { } as RawAttributes ;
738
642
if ( this . rawAttrs ) {
739
- const re = / ( [ a - z ( ) # ] [ a - z 0 - 9 - _ : ( ) # ] * ) (?: \s * = \s * (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( \S + ) ) ) ? / gi ;
643
+ const re = / ( [ a - z A - Z ( ) # ] [ a - z A - Z 0 - 9 - _ : ( ) # ] * ) (?: \s * = \s * ( (?: ' [ ^ ' ] * ' ) | (?: " [ ^ " ] * " ) | \S + ) ) ? / g ;
740
644
let match : RegExpExecArray ;
741
645
while ( ( match = re . exec ( this . rawAttrs ) ) ) {
742
- attrs [ match [ 1 ] ] = match [ 2 ] || match [ 3 ] || match [ 4 ] || null ;
646
+ const key = match [ 1 ] ;
647
+ let val = match [ 2 ] || null ;
648
+ if ( val && ( val [ 0 ] === `'` || val [ 0 ] === `"` ) ) val = val . slice ( 1 , val . length - 1 ) ;
649
+ attrs [ key ] = val ;
743
650
}
744
651
}
745
652
this . _rawAttrs = attrs ;
@@ -918,12 +825,8 @@ export default class HTMLElement extends Node {
918
825
}
919
826
920
827
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
921
- const kMarkupPattern = / < ! - - [ ^ ] * ?(? = - - > ) - - > | < ( \/ ? ) ( [ a - z ] [ - . : 0 - 9 _ a - z ] * ) \s * ( (? = [ / > ] * ?) | (?: .* ?[ \s \d / ' " ] ) | (?: .* ?[ \w ] ) ) ( \/ ? ) > / gi;
922
- // <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
923
- // <([a-z][-.:0-9_a-z]*)\s*\/>
924
- // <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>
925
- // <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
926
- const kAttributePattern = / ( ^ | \s ) ( i d | c l a s s ) \s * = \s * ( " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( \S + ) ) / gi;
828
+ const kMarkupPattern = / < ! - - [ \s \S ] * ?- - > | < ( \/ ? ) ( [ a - z A - Z ] [ - . : 0 - 9 _ a - z A - Z ] * ) ( (?: \s + [ ^ > ] * ?(?: (?: ' [ ^ ' ] * ' ) | (?: " [ ^ " ] * " ) ) ? ) * ) \s * ( \/ ? ) > / g;
829
+ const kAttributePattern = / (?: ^ | \s ) ( i d | c l a s s ) \s * = \s * ( (?: ' [ ^ ' ] * ' ) | (?: " [ ^ " ] * " ) | \S + ) / gi;
927
830
const kSelfClosingElements = {
928
831
area : true ,
929
832
AREA : true ,
@@ -1040,17 +943,22 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1040
943
let match : RegExpExecArray ;
1041
944
// https://github.com/taoqf/node-html-parser/issues/38
1042
945
data = `<${ frameflag } >${ data } </${ frameflag } >` ;
946
+ const { lowerCaseTagName } = options ;
1043
947
1044
948
const dataEndPos = data . length - ( frameflag . length + 2 ) ;
1045
949
const frameFlagOffset = frameflag . length + 2 ;
1046
950
1047
951
while ( ( match = kMarkupPattern . exec ( data ) ) ) {
1048
- const tagStartPos = kMarkupPattern . lastIndex - match [ 0 ] . length ;
952
+ // Note: Object destructuring here consistently tests as higher performance than array destructuring
953
+ // eslint-disable-next-line prefer-const
954
+ let { 0 : matchText , 1 : leadingSlash , 2 : tagName , 3 : attributes , 4 : closingSlash } = match ;
955
+ const matchLength = matchText . length ;
956
+ const tagStartPos = kMarkupPattern . lastIndex - matchLength ;
1049
957
const tagEndPos = kMarkupPattern . lastIndex ;
1050
958
1051
959
// Add TextNode if content
1052
960
if ( lastTextPos > - 1 ) {
1053
- if ( lastTextPos + match [ 0 ] . length < tagEndPos ) {
961
+ if ( lastTextPos + matchLength < tagEndPos ) {
1054
962
const text = data . substring ( lastTextPos , tagStartPos ) ;
1055
963
currentParent . appendChild ( new TextNode ( text , currentParent , createRange ( lastTextPos , tagStartPos ) ) ) ;
1056
964
}
@@ -1060,10 +968,10 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1060
968
1061
969
// https://github.com/taoqf/node-html-parser/issues/38
1062
970
// Skip frameflag node
1063
- if ( match [ 2 ] === frameflag ) continue ;
971
+ if ( tagName === frameflag ) continue ;
1064
972
1065
973
// Handle comments
1066
- if ( match [ 0 ] [ 1 ] === '!' ) {
974
+ if ( matchText [ 1 ] === '!' ) {
1067
975
if ( options . comment ) {
1068
976
// Only keep what is in between <!-- and -->
1069
977
const text = data . substring ( tagStartPos + 4 , tagEndPos - 3 ) ;
@@ -1074,27 +982,29 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1074
982
1075
983
/* -- Handle tag matching -- */
1076
984
// Fix tag casing if necessary
1077
- if ( options . lowerCaseTagName ) match [ 2 ] = match [ 2 ] . toLowerCase ( ) ;
985
+ if ( lowerCaseTagName ) tagName = tagName . toLowerCase ( ) ;
1078
986
1079
987
// Handle opening tags (ie. <this> not </that>)
1080
- if ( ! match [ 1 ] ) {
988
+ if ( ! leadingSlash ) {
1081
989
/* Populate attributes */
1082
990
const attrs = { } ;
1083
- for ( let attMatch ; ( attMatch = kAttributePattern . exec ( match [ 3 ] ) ) ; ) {
1084
- attrs [ attMatch [ 2 ] . toLowerCase ( ) ] = attMatch [ 4 ] || attMatch [ 5 ] || attMatch [ 6 ] ;
991
+ for ( let attMatch ; ( attMatch = kAttributePattern . exec ( attributes ) ) ; ) {
992
+ const { 1 : key , 2 : val } = attMatch ;
993
+ const isQuoted = val [ 0 ] === `'` || val [ 0 ] === `"` ;
994
+ attrs [ key . toLowerCase ( ) ] = isQuoted ? val . slice ( 1 , val . length - 1 ) : val ;
1085
995
}
1086
996
1087
- const tagName = currentParent . rawTagName as IRawTagName ;
997
+ const parentTagName = currentParent . rawTagName as IRawTagName ;
1088
998
1089
- if ( ! match [ 4 ] && kElementsClosedByOpening [ tagName ] ) {
1090
- if ( kElementsClosedByOpening [ tagName ] [ match [ 2 ] ] ) {
999
+ if ( ! closingSlash && kElementsClosedByOpening [ parentTagName ] ) {
1000
+ if ( kElementsClosedByOpening [ parentTagName ] [ tagName ] ) {
1091
1001
stack . pop ( ) ;
1092
1002
currentParent = arr_back ( stack ) ;
1093
1003
}
1094
1004
}
1095
1005
1096
1006
// Prevent nested A tags by terminating the last A and starting a new one : see issue #144
1097
- if ( match [ 2 ] === 'a' || match [ 2 ] === 'A' ) {
1007
+ if ( tagName === 'a' || tagName === 'A' ) {
1098
1008
if ( noNestedTagIndex !== undefined ) {
1099
1009
stack . splice ( noNestedTagIndex ) ;
1100
1010
currentParent = arr_back ( stack ) ;
@@ -1103,23 +1013,23 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1103
1013
}
1104
1014
1105
1015
const tagEndPos = kMarkupPattern . lastIndex ;
1106
- const tagStartPos = tagEndPos - match [ 0 ] . length ;
1016
+ const tagStartPos = tagEndPos - matchLength ;
1107
1017
1108
1018
currentParent = currentParent . appendChild (
1109
1019
// Initialize range (end position updated later for closed tags)
1110
- new HTMLElement ( match [ 2 ] , attrs , match [ 3 ] , null , createRange ( tagStartPos , tagEndPos ) )
1020
+ new HTMLElement ( tagName , attrs , attributes . slice ( 1 ) , null , createRange ( tagStartPos , tagEndPos ) )
1111
1021
) ;
1112
1022
stack . push ( currentParent ) ;
1113
1023
1114
- if ( is_block_text_element ( match [ 2 ] ) ) {
1024
+ if ( is_block_text_element ( tagName ) ) {
1115
1025
// Find closing tag
1116
- const closeMarkup = `</${ match [ 2 ] } >` ;
1117
- const closeIndex = options . lowerCaseTagName
1026
+ const closeMarkup = `</${ tagName } >` ;
1027
+ const closeIndex = lowerCaseTagName
1118
1028
? data . toLocaleLowerCase ( ) . indexOf ( closeMarkup , kMarkupPattern . lastIndex )
1119
1029
: data . indexOf ( closeMarkup , kMarkupPattern . lastIndex ) ;
1120
1030
const textEndPos = closeIndex === - 1 ? dataEndPos : closeIndex ;
1121
1031
1122
- if ( element_should_be_ignore ( match [ 2 ] ) ) {
1032
+ if ( element_should_be_ignore ( tagName ) ) {
1123
1033
const text = data . substring ( tagEndPos , textEndPos ) ;
1124
1034
if ( text . length > 0 && / \S / . test ( text ) ) {
1125
1035
currentParent . appendChild ( new TextNode ( text , currentParent , createRange ( tagEndPos , textEndPos ) ) ) ;
@@ -1131,26 +1041,26 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1131
1041
} else {
1132
1042
lastTextPos = kMarkupPattern . lastIndex = closeIndex + closeMarkup . length ;
1133
1043
// Cause to be treated as self-closing, because no close found
1134
- match [ 1 ] = 'true ' ;
1044
+ leadingSlash = '/ ' ;
1135
1045
}
1136
1046
}
1137
1047
}
1138
1048
1139
1049
// Handle closing tags or self-closed elements (ie </tag> or <br>)
1140
- if ( match [ 1 ] || match [ 4 ] || kSelfClosingElements [ match [ 2 ] ] ) {
1050
+ if ( leadingSlash || closingSlash || kSelfClosingElements [ tagName ] ) {
1141
1051
while ( true ) {
1142
- if ( match [ 2 ] === 'a' || match [ 2 ] === 'A' ) noNestedTagIndex = undefined ;
1143
- if ( currentParent . rawTagName === match [ 2 ] ) {
1052
+ if ( tagName === 'a' || tagName === 'A' ) noNestedTagIndex = undefined ;
1053
+ if ( currentParent . rawTagName === tagName ) {
1144
1054
// Update range end for closed tag
1145
1055
( < [ number , number ] > currentParent . range ) [ 1 ] = createRange ( - 1 , Math . max ( lastTextPos , tagEndPos ) ) [ 1 ] ;
1146
1056
stack . pop ( ) ;
1147
1057
currentParent = arr_back ( stack ) ;
1148
1058
break ;
1149
1059
} else {
1150
- const tagName = currentParent . tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH' ;
1060
+ const parentTagName = currentParent . tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH' ;
1151
1061
// Trying to close current tag, and move on
1152
- if ( kElementsClosedByClosing [ tagName ] ) {
1153
- if ( kElementsClosedByClosing [ tagName ] [ match [ 2 ] ] ) {
1062
+ if ( kElementsClosedByClosing [ parentTagName ] ) {
1063
+ if ( kElementsClosedByClosing [ parentTagName ] [ tagName ] ) {
1154
1064
stack . pop ( ) ;
1155
1065
currentParent = arr_back ( stack ) ;
1156
1066
continue ;
0 commit comments