@@ -7,7 +7,7 @@ import Matcher from '../matcher';
7
7
import arr_back from '../back' ;
8
8
import CommentNode from './comment' ;
9
9
10
- // const { decode } = he ;
10
+ const voidTags = new Set ( [ 'area' , 'base' , 'br' , 'col' , 'embed' , 'hr' , 'img' , 'input' , 'link' , 'meta' , 'param' , 'source' , 'track' , 'wbr' ] ) ;
11
11
12
12
type IRawTagName =
13
13
| 'LI'
@@ -154,7 +154,7 @@ export default class HTMLElement extends Node {
154
154
*/
155
155
156
156
private quoteAttribute ( attr : string ) {
157
- if ( attr === null ) {
157
+ if ( attr == null ) {
158
158
return 'null' ;
159
159
}
160
160
@@ -241,6 +241,11 @@ export default class HTMLElement extends Node {
241
241
public get localName ( ) {
242
242
return this . rawTagName . toLowerCase ( ) ;
243
243
}
244
+
245
+ public get isVoidElement ( ) {
246
+ return voidTags . has ( this . localName ) ;
247
+ }
248
+
244
249
/**
245
250
* Get escaped (as-is) text value of current node and its children.
246
251
* @return {string } text content
@@ -313,14 +318,8 @@ export default class HTMLElement extends Node {
313
318
public toString ( ) {
314
319
const tag = this . rawTagName ;
315
320
if ( tag ) {
316
- // const void_tags = new Set('area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'.split('|'));
317
- // const is_void = void_tags.has(tag);
318
- const is_void = / ^ ( a r e a | b a s e | b r | c o l | e m b e d | h r | i m g | i n p u t | l i n k | m e t a | p a r a m | s o u r c e | t r a c k | w b r ) $ / i. test ( tag ) ;
319
321
const attrs = this . rawAttrs ? ` ${ this . rawAttrs } ` : '' ;
320
- if ( is_void ) {
321
- return `<${ tag } ${ attrs } >` ;
322
- }
323
- return `<${ tag } ${ attrs } >${ this . innerHTML } </${ tag } >` ;
322
+ return this . isVoidElement ? `<${ tag } ${ attrs } >` : `<${ tag } ${ attrs } >${ this . innerHTML } </${ tag } >` ;
324
323
}
325
324
return this . innerHTML ;
326
325
}
@@ -458,64 +457,6 @@ export default class HTMLElement extends Node {
458
457
xmlMode : true ,
459
458
adapter : Matcher ,
460
459
} ) ;
461
-
462
- // let matcher: Matcher;
463
- // if (selector instanceof Matcher) {
464
- // matcher = selector;
465
- // matcher.reset();
466
- // } else {
467
- // if (selector.includes(',')) {
468
- // const selectors = selector.split(',');
469
- // return Array.from(selectors.reduce((pre, cur) => {
470
- // const result = this.querySelectorAll(cur.trim());
471
- // return result.reduce((p, c) => {
472
- // return p.add(c);
473
- // }, pre);
474
- // }, new Set<HTMLElement>()));
475
- // }
476
- // matcher = new Matcher(selector);
477
- // }
478
- // interface IStack {
479
- // 0: Node; // node
480
- // 1: number; // children
481
- // 2: boolean; // found flag
482
- // }
483
- // const stack = [] as IStack[];
484
- // return this.childNodes.reduce((res, cur) => {
485
- // stack.push([cur, 0, false]);
486
- // while (stack.length) {
487
- // const state = arr_back(stack); // get last element
488
- // const el = state[0];
489
- // if (state[1] === 0) {
490
- // // Seen for first time.
491
- // if (el.nodeType !== NodeType.ELEMENT_NODE) {
492
- // stack.pop();
493
- // continue;
494
- // }
495
- // const html_el = el as HTMLElement;
496
- // state[2] = matcher.advance(html_el);
497
- // if (state[2]) {
498
- // if (matcher.matched) {
499
- // res.push(html_el);
500
- // res.push(...(html_el.querySelectorAll(selector)));
501
- // // no need to go further.
502
- // matcher.rewind();
503
- // stack.pop();
504
- // continue;
505
- // }
506
- // }
507
- // }
508
- // if (state[1] < el.childNodes.length) {
509
- // stack.push([el.childNodes[state[1]++], 0, false]);
510
- // } else {
511
- // if (state[2]) {
512
- // matcher.rewind();
513
- // }
514
- // stack.pop();
515
- // }
516
- // }
517
- // return res;
518
- // }, [] as HTMLElement[]);
519
460
}
520
461
521
462
/**
@@ -528,43 +469,6 @@ export default class HTMLElement extends Node {
528
469
xmlMode : true ,
529
470
adapter : Matcher ,
530
471
} ) ;
531
- // let matcher: Matcher;
532
- // if (selector instanceof Matcher) {
533
- // matcher = selector;
534
- // matcher.reset();
535
- // } else {
536
- // matcher = new Matcher(selector);
537
- // }
538
- // const stack = [] as { 0: Node; 1: 0 | 1; 2: boolean }[];
539
- // for (const node of this.childNodes) {
540
- // stack.push([node, 0, false]);
541
- // while (stack.length) {
542
- // const state = arr_back(stack);
543
- // const el = state[0];
544
- // if (state[1] === 0) {
545
- // // Seen for first time.
546
- // if (el.nodeType !== NodeType.ELEMENT_NODE) {
547
- // stack.pop();
548
- // continue;
549
- // }
550
- // state[2] = matcher.advance(el as HTMLElement);
551
- // if (state[2]) {
552
- // if (matcher.matched) {
553
- // return el as HTMLElement;
554
- // }
555
- // }
556
- // }
557
- // if (state[1] < el.childNodes.length) {
558
- // stack.push([el.childNodes[state[1]++], 0, false]);
559
- // } else {
560
- // if (state[2]) {
561
- // matcher.rewind();
562
- // }
563
- // stack.pop();
564
- // }
565
- // }
566
- // }
567
- // return null;
568
472
}
569
473
570
474
/**
@@ -682,7 +586,7 @@ export default class HTMLElement extends Node {
682
586
}
683
587
684
588
/**
685
- * Get escaped (as-it ) attributes
589
+ * Get escaped (as-is ) attributes
686
590
* @return {Object } parsed attributes
687
591
*/
688
592
public get rawAttributes ( ) {
@@ -691,10 +595,13 @@ export default class HTMLElement extends Node {
691
595
}
692
596
const attrs = { } as RawAttributes ;
693
597
if ( this . rawAttrs ) {
694
- const re = / ( [ a - z ( ) # ] [ a - z 0 - 9 - _ : ( ) # ] * ) (?: \s * = \s * (?: " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( \S + ) ) ) ? / gi ;
598
+ const re = / ( [ a - z A - Z ( ) # ] [ a - z A - Z 0 - 9 - _ : ( ) # ] * ) (?: \s * = \s * ( (?: ' [ ^ ' ] * ' ) | (?: " [ ^ " ] * " ) | \S + ) ) ? / g ;
695
599
let match : RegExpExecArray ;
696
600
while ( ( match = re . exec ( this . rawAttrs ) ) ) {
697
- attrs [ match [ 1 ] ] = match [ 2 ] || match [ 3 ] || match [ 4 ] || null ;
601
+ const key = match [ 1 ] ;
602
+ let val = match [ 2 ] || null ;
603
+ if ( val && ( val [ 0 ] === `'` || val [ 0 ] === `"` ) ) val = val . slice ( 1 , val . length - 1 ) ;
604
+ attrs [ key ] = val ;
698
605
}
699
606
}
700
607
this . _rawAttrs = attrs ;
@@ -873,12 +780,8 @@ export default class HTMLElement extends Node {
873
780
}
874
781
875
782
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
876
- const kMarkupPattern = / < ! - - [ ^ ] * ?(? = - - > ) - - > | < ( \/ ? ) ( [ a - z ] [ - . : 0 - 9 _ a - z ] * ) \s * ( (? = [ / > ] * ?) | (?: .* ?[ \s \d / ' " ] ) | (?: .* ?[ \w ] ) ) ( \/ ? ) > / gi;
877
- // <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
878
- // <([a-z][-.:0-9_a-z]*)\s*\/>
879
- // <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>
880
- // <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>>
881
- const kAttributePattern = / ( ^ | \s ) ( i d | c l a s s ) \s * = \s * ( " ( [ ^ " ] * ) " | ' ( [ ^ ' ] * ) ' | ( \S + ) ) / gi;
783
+ const kMarkupPattern = / < ! - - [ \s \S ] * ?- - > | < ( \/ ? ) ( [ a - z A - Z ] [ - . : 0 - 9 _ a - z A - Z ] * ) ( (?: \s + [ ^ > ] * ?(?: (?: ' [ ^ ' ] * ' ) | (?: " [ ^ " ] * " ) ) ? ) * ) \s * ( \/ ? ) > / g;
784
+ const kAttributePattern = / (?: ^ | \s ) ( i d | c l a s s ) \s * = \s * ( (?: ' [ ^ ' ] * ' ) | (?: " [ ^ " ] * " ) | \S + ) / gi;
882
785
const kSelfClosingElements = {
883
786
area : true ,
884
787
AREA : true ,
@@ -995,17 +898,22 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
995
898
let match : RegExpExecArray ;
996
899
// https://github.com/taoqf/node-html-parser/issues/38
997
900
data = `<${ frameflag } >${ data } </${ frameflag } >` ;
901
+ const { lowerCaseTagName } = options ;
998
902
999
903
const dataEndPos = data . length - ( frameflag . length + 2 ) ;
1000
904
const frameFlagOffset = frameflag . length + 2 ;
1001
905
1002
906
while ( ( match = kMarkupPattern . exec ( data ) ) ) {
1003
- const tagStartPos = kMarkupPattern . lastIndex - match [ 0 ] . length ;
907
+ // Note: Object destructuring here consistently tests as higher performance than array destructuring
908
+ // eslint-disable-next-line prefer-const
909
+ let { 0 : matchText , 1 : leadingSlash , 2 : tagName , 3 : attributes , 4 : closingSlash } = match ;
910
+ const matchLength = matchText . length ;
911
+ const tagStartPos = kMarkupPattern . lastIndex - matchLength ;
1004
912
const tagEndPos = kMarkupPattern . lastIndex ;
1005
913
1006
914
// Add TextNode if content
1007
915
if ( lastTextPos > - 1 ) {
1008
- if ( lastTextPos + match [ 0 ] . length < tagEndPos ) {
916
+ if ( lastTextPos + matchLength < tagEndPos ) {
1009
917
const text = data . substring ( lastTextPos , tagStartPos ) ;
1010
918
currentParent . appendChild ( new TextNode ( text , currentParent , createRange ( lastTextPos , tagStartPos ) ) ) ;
1011
919
}
@@ -1015,10 +923,10 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1015
923
1016
924
// https://github.com/taoqf/node-html-parser/issues/38
1017
925
// Skip frameflag node
1018
- if ( match [ 2 ] === frameflag ) continue ;
926
+ if ( tagName === frameflag ) continue ;
1019
927
1020
928
// Handle comments
1021
- if ( match [ 0 ] [ 1 ] === '!' ) {
929
+ if ( matchText [ 1 ] === '!' ) {
1022
930
if ( options . comment ) {
1023
931
// Only keep what is in between <!-- and -->
1024
932
const text = data . substring ( tagStartPos + 4 , tagEndPos - 3 ) ;
@@ -1029,27 +937,29 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1029
937
1030
938
/* -- Handle tag matching -- */
1031
939
// Fix tag casing if necessary
1032
- if ( options . lowerCaseTagName ) match [ 2 ] = match [ 2 ] . toLowerCase ( ) ;
940
+ if ( lowerCaseTagName ) tagName = tagName . toLowerCase ( ) ;
1033
941
1034
942
// Handle opening tags (ie. <this> not </that>)
1035
- if ( ! match [ 1 ] ) {
943
+ if ( ! leadingSlash ) {
1036
944
/* Populate attributes */
1037
945
const attrs = { } ;
1038
- for ( let attMatch ; ( attMatch = kAttributePattern . exec ( match [ 3 ] ) ) ; ) {
1039
- attrs [ attMatch [ 2 ] . toLowerCase ( ) ] = attMatch [ 4 ] || attMatch [ 5 ] || attMatch [ 6 ] ;
946
+ for ( let attMatch ; ( attMatch = kAttributePattern . exec ( attributes ) ) ; ) {
947
+ const { 1 : key , 2 : val } = attMatch ;
948
+ const isQuoted = val [ 0 ] === `'` || val [ 0 ] === `"` ;
949
+ attrs [ key . toLowerCase ( ) ] = isQuoted ? val . slice ( 1 , val . length - 1 ) : val ;
1040
950
}
1041
951
1042
- const tagName = currentParent . rawTagName as IRawTagName ;
952
+ const parentTagName = currentParent . rawTagName as IRawTagName ;
1043
953
1044
- if ( ! match [ 4 ] && kElementsClosedByOpening [ tagName ] ) {
1045
- if ( kElementsClosedByOpening [ tagName ] [ match [ 2 ] ] ) {
954
+ if ( ! closingSlash && kElementsClosedByOpening [ parentTagName ] ) {
955
+ if ( kElementsClosedByOpening [ parentTagName ] [ tagName ] ) {
1046
956
stack . pop ( ) ;
1047
957
currentParent = arr_back ( stack ) ;
1048
958
}
1049
959
}
1050
960
1051
961
// Prevent nested A tags by terminating the last A and starting a new one : see issue #144
1052
- if ( match [ 2 ] === 'a' || match [ 2 ] === 'A' ) {
962
+ if ( tagName === 'a' || tagName === 'A' ) {
1053
963
if ( noNestedTagIndex !== undefined ) {
1054
964
stack . splice ( noNestedTagIndex ) ;
1055
965
currentParent = arr_back ( stack ) ;
@@ -1058,23 +968,23 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1058
968
}
1059
969
1060
970
const tagEndPos = kMarkupPattern . lastIndex ;
1061
- const tagStartPos = tagEndPos - match [ 0 ] . length ;
971
+ const tagStartPos = tagEndPos - matchLength ;
1062
972
1063
973
currentParent = currentParent . appendChild (
1064
974
// Initialize range (end position updated later for closed tags)
1065
- new HTMLElement ( match [ 2 ] , attrs , match [ 3 ] , null , createRange ( tagStartPos , tagEndPos ) )
975
+ new HTMLElement ( tagName , attrs , attributes . slice ( 1 ) , null , createRange ( tagStartPos , tagEndPos ) )
1066
976
) ;
1067
977
stack . push ( currentParent ) ;
1068
978
1069
- if ( is_block_text_element ( match [ 2 ] ) ) {
979
+ if ( is_block_text_element ( tagName ) ) {
1070
980
// Find closing tag
1071
- const closeMarkup = `</${ match [ 2 ] } >` ;
1072
- const closeIndex = options . lowerCaseTagName
981
+ const closeMarkup = `</${ tagName } >` ;
982
+ const closeIndex = lowerCaseTagName
1073
983
? data . toLocaleLowerCase ( ) . indexOf ( closeMarkup , kMarkupPattern . lastIndex )
1074
984
: data . indexOf ( closeMarkup , kMarkupPattern . lastIndex ) ;
1075
985
const textEndPos = closeIndex === - 1 ? dataEndPos : closeIndex ;
1076
986
1077
- if ( element_should_be_ignore ( match [ 2 ] ) ) {
987
+ if ( element_should_be_ignore ( tagName ) ) {
1078
988
const text = data . substring ( tagEndPos , textEndPos ) ;
1079
989
if ( text . length > 0 && / \S / . test ( text ) ) {
1080
990
currentParent . appendChild ( new TextNode ( text , currentParent , createRange ( tagEndPos , textEndPos ) ) ) ;
@@ -1086,26 +996,26 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
1086
996
} else {
1087
997
lastTextPos = kMarkupPattern . lastIndex = closeIndex + closeMarkup . length ;
1088
998
// Cause to be treated as self-closing, because no close found
1089
- match [ 1 ] = 'true ' ;
999
+ leadingSlash = '/ ' ;
1090
1000
}
1091
1001
}
1092
1002
}
1093
1003
1094
1004
// Handle closing tags or self-closed elements (ie </tag> or <br>)
1095
- if ( match [ 1 ] || match [ 4 ] || kSelfClosingElements [ match [ 2 ] ] ) {
1005
+ if ( leadingSlash || closingSlash || kSelfClosingElements [ tagName ] ) {
1096
1006
while ( true ) {
1097
- if ( match [ 2 ] === 'a' || match [ 2 ] === 'A' ) noNestedTagIndex = undefined ;
1098
- if ( currentParent . rawTagName === match [ 2 ] ) {
1007
+ if ( tagName === 'a' || tagName === 'A' ) noNestedTagIndex = undefined ;
1008
+ if ( currentParent . rawTagName === tagName ) {
1099
1009
// Update range end for closed tag
1100
1010
( < [ number , number ] > currentParent . range ) [ 1 ] = createRange ( - 1 , Math . max ( lastTextPos , tagEndPos ) ) [ 1 ] ;
1101
1011
stack . pop ( ) ;
1102
1012
currentParent = arr_back ( stack ) ;
1103
1013
break ;
1104
1014
} else {
1105
- const tagName = currentParent . tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH' ;
1015
+ const parentTagName = currentParent . tagName as 'LI' | 'A' | 'B' | 'I' | 'P' | 'TD' | 'TH' ;
1106
1016
// Trying to close current tag, and move on
1107
- if ( kElementsClosedByClosing [ tagName ] ) {
1108
- if ( kElementsClosedByClosing [ tagName ] [ match [ 2 ] ] ) {
1017
+ if ( kElementsClosedByClosing [ parentTagName ] ) {
1018
+ if ( kElementsClosedByClosing [ parentTagName ] [ tagName ] ) {
1109
1019
stack . pop ( ) ;
1110
1020
currentParent = arr_back ( stack ) ;
1111
1021
continue ;
0 commit comments