Skip to content

Commit c9a7e2d

Browse files
authored
Simplify and fix URI regex (#29)
The inner pattern already guarantees that the URI doesn't end in whitespace or an end character. Fixes #28
1 parent 67446ac commit c9a7e2d

File tree

6 files changed

+75
-95
lines changed

6 files changed

+75
-95
lines changed

docs/js/tree-sitter-comment.wasm

-433 Bytes
Binary file not shown.

grammar.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ module.exports = grammar({
9494
function get_uri_regex() {
9595
let end_chars = escapeRegExp(END_CHARS.join(""));
9696
return new RegExp(
97-
`https?://([^\\s${end_chars}]|[${end_chars}][a-zA-Z0-9]+)*[^\\s${end_chars}]`
97+
`https?://([^\\s${end_chars}]|[${end_chars}][a-zA-Z0-9])+`
9898
);
9999
}
100100

src/grammar.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@
102102
},
103103
"uri": {
104104
"type": "PATTERN",
105-
"value": "https?:\\/\\/([^\\s\\.,:;!\\?\\\\'\"\\}\\]\\)>]|[\\.,:;!\\?\\\\'\"\\}\\]\\)>][a-zA-Z0-9]+)*[^\\s\\.,:;!\\?\\\\'\"\\}\\]\\)>]"
105+
"value": "https?:\\/\\/([^\\s\\.,:;!\\?\\\\'\"\\}\\]\\)>]|[\\.,:;!\\?\\\\'\"\\}\\]\\)>][a-zA-Z0-9])+"
106106
},
107107
"_text": {
108108
"type": "CHOICE",

src/parser.c

+70-93
Original file line numberDiff line numberDiff line change
@@ -347,35 +347,35 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) {
347347
eof = lexer->eof(lexer);
348348
switch (state) {
349349
case 0:
350-
if (eof) ADVANCE(7);
351-
if (lookahead == '!') ADVANCE(30);
352-
if (lookahead == '"') ADVANCE(23);
353-
if (lookahead == '\'') ADVANCE(22);
354-
if (lookahead == '(') ADVANCE(9);
355-
if (lookahead == ')') ADVANCE(11);
356-
if (lookahead == ',') ADVANCE(28);
357-
if (lookahead == '-') ADVANCE(36);
358-
if (lookahead == '.') ADVANCE(27);
359-
if (lookahead == '/') ADVANCE(21);
360-
if (lookahead == ':') ADVANCE(8);
361-
if (lookahead == ';') ADVANCE(29);
362-
if (lookahead == '<') ADVANCE(24);
363-
if (lookahead == '>') ADVANCE(35);
364-
if (lookahead == '?') ADVANCE(31);
365-
if (lookahead == '[') ADVANCE(25);
366-
if (lookahead == '\\') ADVANCE(32);
367-
if (lookahead == ']') ADVANCE(34);
368-
if (lookahead == 'h') ADVANCE(19);
369-
if (lookahead == '{') ADVANCE(26);
370-
if (lookahead == '}') ADVANCE(33);
350+
if (eof) ADVANCE(6);
351+
if (lookahead == '!') ADVANCE(28);
352+
if (lookahead == '"') ADVANCE(21);
353+
if (lookahead == '\'') ADVANCE(20);
354+
if (lookahead == '(') ADVANCE(8);
355+
if (lookahead == ')') ADVANCE(10);
356+
if (lookahead == ',') ADVANCE(26);
357+
if (lookahead == '-') ADVANCE(34);
358+
if (lookahead == '.') ADVANCE(25);
359+
if (lookahead == '/') ADVANCE(19);
360+
if (lookahead == ':') ADVANCE(7);
361+
if (lookahead == ';') ADVANCE(27);
362+
if (lookahead == '<') ADVANCE(22);
363+
if (lookahead == '>') ADVANCE(33);
364+
if (lookahead == '?') ADVANCE(29);
365+
if (lookahead == '[') ADVANCE(23);
366+
if (lookahead == '\\') ADVANCE(30);
367+
if (lookahead == ']') ADVANCE(32);
368+
if (lookahead == 'h') ADVANCE(17);
369+
if (lookahead == '{') ADVANCE(24);
370+
if (lookahead == '}') ADVANCE(31);
371371
if (lookahead == '\t' ||
372372
lookahead == '\n' ||
373373
lookahead == '\r' ||
374-
lookahead == ' ') ADVANCE(12);
375-
if (lookahead != 0) ADVANCE(20);
374+
lookahead == ' ') ADVANCE(11);
375+
if (lookahead != 0) ADVANCE(18);
376376
END_STATE();
377377
case 1:
378-
if (lookahead == '/') ADVANCE(5);
378+
if (lookahead == '/') ADVANCE(4);
379379
END_STATE();
380380
case 2:
381381
if (lookahead == '/') ADVANCE(1);
@@ -384,153 +384,130 @@ static bool ts_lex(TSLexer *lexer, TSStateId state) {
384384
if (lookahead == '\t' ||
385385
lookahead == '\n' ||
386386
lookahead == '\r' ||
387-
lookahead == ' ') ADVANCE(10);
387+
lookahead == ' ') ADVANCE(9);
388388
if (lookahead != 0 &&
389389
lookahead != '(' &&
390-
lookahead != ')') ADVANCE(10);
390+
lookahead != ')') ADVANCE(9);
391391
END_STATE();
392392
case 4:
393-
if (sym_uri_character_set_1(lookahead)) ADVANCE(6);
394-
if (('0' <= lookahead && lookahead <= '9') ||
395-
('A' <= lookahead && lookahead <= 'Z') ||
396-
('a' <= lookahead && lookahead <= 'z')) ADVANCE(13);
393+
if (sym_uri_character_set_1(lookahead)) ADVANCE(5);
397394
if (lookahead != 0 &&
398395
lookahead != '\t' &&
399396
lookahead != '\n' &&
400397
lookahead != '\r' &&
401-
lookahead != ' ') ADVANCE(14);
398+
lookahead != ' ') ADVANCE(12);
402399
END_STATE();
403400
case 5:
404-
if (sym_uri_character_set_1(lookahead)) ADVANCE(6);
405-
if (lookahead != 0 &&
406-
lookahead != '\t' &&
407-
lookahead != '\n' &&
408-
lookahead != '\r' &&
409-
lookahead != ' ') ADVANCE(14);
410-
END_STATE();
411-
case 6:
412401
if (('0' <= lookahead && lookahead <= '9') ||
413402
('A' <= lookahead && lookahead <= 'Z') ||
414-
('a' <= lookahead && lookahead <= 'z')) ADVANCE(4);
403+
('a' <= lookahead && lookahead <= 'z')) ADVANCE(12);
415404
END_STATE();
416-
case 7:
405+
case 6:
417406
ACCEPT_TOKEN(ts_builtin_sym_end);
418407
END_STATE();
419-
case 8:
408+
case 7:
420409
ACCEPT_TOKEN(anon_sym_COLON);
421410
END_STATE();
422-
case 9:
411+
case 8:
423412
ACCEPT_TOKEN(anon_sym_LPAREN);
424413
END_STATE();
425-
case 10:
414+
case 9:
426415
ACCEPT_TOKEN(aux_sym__user_token1);
427416
if (lookahead != 0 &&
428417
lookahead != '(' &&
429-
lookahead != ')') ADVANCE(10);
418+
lookahead != ')') ADVANCE(9);
430419
END_STATE();
431-
case 11:
420+
case 10:
432421
ACCEPT_TOKEN(anon_sym_RPAREN);
433422
END_STATE();
434-
case 12:
423+
case 11:
435424
ACCEPT_TOKEN(aux_sym__full_uri_token1);
436425
END_STATE();
437-
case 13:
426+
case 12:
438427
ACCEPT_TOKEN(sym_uri);
439-
if (sym_uri_character_set_1(lookahead)) ADVANCE(6);
440-
if (('0' <= lookahead && lookahead <= '9') ||
441-
('A' <= lookahead && lookahead <= 'Z') ||
442-
('a' <= lookahead && lookahead <= 'z')) ADVANCE(13);
428+
if (sym_uri_character_set_1(lookahead)) ADVANCE(5);
443429
if (lookahead != 0 &&
444430
lookahead != '\t' &&
445431
lookahead != '\n' &&
446432
lookahead != '\r' &&
447-
lookahead != ' ') ADVANCE(14);
433+
lookahead != ' ') ADVANCE(12);
434+
END_STATE();
435+
case 13:
436+
ACCEPT_TOKEN(aux_sym__text_token1);
437+
if (lookahead == ':') ADVANCE(2);
438+
if (lookahead == 's') ADVANCE(14);
439+
if (!aux_sym__text_token1_character_set_1(lookahead)) ADVANCE(18);
448440
END_STATE();
449441
case 14:
450-
ACCEPT_TOKEN(sym_uri);
451-
if (sym_uri_character_set_1(lookahead)) ADVANCE(6);
452-
if (lookahead != 0 &&
453-
lookahead != '\t' &&
454-
lookahead != '\n' &&
455-
lookahead != '\r' &&
456-
lookahead != ' ') ADVANCE(14);
442+
ACCEPT_TOKEN(aux_sym__text_token1);
443+
if (lookahead == ':') ADVANCE(2);
444+
if (!aux_sym__text_token1_character_set_1(lookahead)) ADVANCE(18);
457445
END_STATE();
458446
case 15:
459447
ACCEPT_TOKEN(aux_sym__text_token1);
460-
if (lookahead == ':') ADVANCE(2);
461-
if (lookahead == 's') ADVANCE(16);
462-
if (!aux_sym__text_token1_character_set_1(lookahead)) ADVANCE(20);
448+
if (lookahead == 'p') ADVANCE(13);
449+
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(18);
463450
END_STATE();
464451
case 16:
465452
ACCEPT_TOKEN(aux_sym__text_token1);
466-
if (lookahead == ':') ADVANCE(2);
467-
if (!aux_sym__text_token1_character_set_1(lookahead)) ADVANCE(20);
453+
if (lookahead == 't') ADVANCE(15);
454+
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(18);
468455
END_STATE();
469456
case 17:
470457
ACCEPT_TOKEN(aux_sym__text_token1);
471-
if (lookahead == 'p') ADVANCE(15);
472-
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(20);
458+
if (lookahead == 't') ADVANCE(16);
459+
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(18);
473460
END_STATE();
474461
case 18:
475462
ACCEPT_TOKEN(aux_sym__text_token1);
476-
if (lookahead == 't') ADVANCE(17);
477-
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(20);
463+
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(18);
478464
END_STATE();
479465
case 19:
480-
ACCEPT_TOKEN(aux_sym__text_token1);
481-
if (lookahead == 't') ADVANCE(18);
482-
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(20);
483-
END_STATE();
484-
case 20:
485-
ACCEPT_TOKEN(aux_sym__text_token1);
486-
if (!aux_sym__text_token1_character_set_2(lookahead)) ADVANCE(20);
487-
END_STATE();
488-
case 21:
489466
ACCEPT_TOKEN(anon_sym_SLASH);
490467
END_STATE();
491-
case 22:
468+
case 20:
492469
ACCEPT_TOKEN(anon_sym_SQUOTE);
493470
END_STATE();
494-
case 23:
471+
case 21:
495472
ACCEPT_TOKEN(anon_sym_DQUOTE);
496473
END_STATE();
497-
case 24:
474+
case 22:
498475
ACCEPT_TOKEN(anon_sym_LT);
499476
END_STATE();
500-
case 25:
477+
case 23:
501478
ACCEPT_TOKEN(anon_sym_LBRACK);
502479
END_STATE();
503-
case 26:
480+
case 24:
504481
ACCEPT_TOKEN(anon_sym_LBRACE);
505482
END_STATE();
506-
case 27:
483+
case 25:
507484
ACCEPT_TOKEN(anon_sym_DOT);
508485
END_STATE();
509-
case 28:
486+
case 26:
510487
ACCEPT_TOKEN(anon_sym_COMMA);
511488
END_STATE();
512-
case 29:
489+
case 27:
513490
ACCEPT_TOKEN(anon_sym_SEMI);
514491
END_STATE();
515-
case 30:
492+
case 28:
516493
ACCEPT_TOKEN(anon_sym_BANG);
517494
END_STATE();
518-
case 31:
495+
case 29:
519496
ACCEPT_TOKEN(anon_sym_QMARK);
520497
END_STATE();
521-
case 32:
498+
case 30:
522499
ACCEPT_TOKEN(anon_sym_BSLASH);
523500
END_STATE();
524-
case 33:
501+
case 31:
525502
ACCEPT_TOKEN(anon_sym_RBRACE);
526503
END_STATE();
527-
case 34:
504+
case 32:
528505
ACCEPT_TOKEN(anon_sym_RBRACK);
529506
END_STATE();
530-
case 35:
507+
case 33:
531508
ACCEPT_TOKEN(anon_sym_GT);
532509
END_STATE();
533-
case 36:
510+
case 34:
534511
ACCEPT_TOKEN(anon_sym_DASH);
535512
END_STATE();
536513
default:

test/corpus/source.txt

+3
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ URI: https://user:[email protected]/org/repo/?foo=baz
9494

9595
URI(me): (https://github.com/stsewd/?foo=bar#baz)
9696

97+
https://github.com/stsewd/tree-sitter-rst#1.1
98+
9799
--------------------------------------------------------------------------------
98100

99101
(source
@@ -111,4 +113,5 @@ URI(me): (https://github.com/stsewd/?foo=bar#baz)
111113
(tag
112114
(name)
113115
(user))
116+
(uri)
114117
(uri))

tree-sitter-comment.wasm

-433 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)