Skip to content

Fix scanning issues related to Unicode escapes in identifiers #61042

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 65 additions & 135 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -976,8 +976,8 @@ export function isIdentifierStart(ch: number, languageVersion: ScriptTarget | un

export function isIdentifierPart(ch: number, languageVersion: ScriptTarget | undefined, identifierVariant?: LanguageVariant): boolean {
return isWordCharacter(ch) || ch === CharacterCodes.$ ||
// "-" and ":" are valid in JSX Identifiers
(identifierVariant === LanguageVariant.JSX ? (ch === CharacterCodes.minus || ch === CharacterCodes.colon) : false) ||
// "-" is valid in JSX Identifiers. ":" is part of JSXNamespacedName but not JSXIdentifier.
identifierVariant === LanguageVariant.JSX && ch === CharacterCodes.minus ||
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing this line doesn't affect anything in the test suite, but I am not sure whether it affects code completion.

ch > CharacterCodes.maxAsciiCharacter && isUnicodeIdentifierPart(ch, languageVersion);
}

Expand Down Expand Up @@ -1328,9 +1328,9 @@ export function createScanner(
}

const identifierStart = pos;
const { length } = scanIdentifierParts();
const { length } = scanIdentifierParts(languageVersion);

if (length === 1 && text[identifierStart] === "n") {
if (length === 1 && codePointUnchecked(identifierStart) === CharacterCodes.n) {
if (isScientific) {
error(Diagnostics.A_bigint_literal_cannot_use_exponential_notation, numericStart, identifierStart - numericStart + 1);
}
Expand Down Expand Up @@ -1777,35 +1777,36 @@ export function createScanner(
return -1;
}

function scanIdentifierParts(): string {
function scanIdentifierParts(languageVersion: ScriptTarget, identifierVariant?: LanguageVariant): string {
let result = "";
let start = pos;
while (pos < end) {
let ch = codePointUnchecked(pos);
if (isIdentifierPart(ch, languageVersion)) {
if (isIdentifierPart(ch, languageVersion, identifierVariant)) {
pos += charSize(ch);
continue;
}
else if (ch === CharacterCodes.backslash) {

if (ch === CharacterCodes.backslash) {
ch = peekExtendedUnicodeEscape();
if (ch >= 0 && isIdentifierPart(ch, languageVersion)) {
if (ch >= 0 && isIdentifierPart(ch, languageVersion, identifierVariant)) {
result += text.substring(start, pos);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line was previously missing; its absence is what causes #61043.

result += scanExtendedUnicodeEscape(/*shouldEmitInvalidEscapeError*/ true);
start = pos;
continue;
}

ch = peekUnicodeEscape();
if (!(ch >= 0 && isIdentifierPart(ch, languageVersion))) {
break;
if (ch >= 0 && isIdentifierPart(ch, languageVersion, identifierVariant)) {
tokenFlags |= TokenFlags.UnicodeEscape;
result += text.substring(start, pos);
result += String.fromCharCode(ch);
pos += 6; // Valid Unicode escape is always six characters
start = pos;
continue;
}
tokenFlags |= TokenFlags.UnicodeEscape;
result += text.substring(start, pos);
result += utf16EncodeAsString(ch);
// Valid Unicode escape is always six characters
pos += 6;
start = pos;
}
else {
break;
}
break;
}
result += text.substring(start, pos);
return result;
Expand Down Expand Up @@ -2302,72 +2303,26 @@ export function createScanner(
case CharacterCodes.at:
pos++;
return token = SyntaxKind.AtToken;
case CharacterCodes.backslash:
const extendedCookedChar = peekExtendedUnicodeEscape();
if (extendedCookedChar >= 0 && isIdentifierStart(extendedCookedChar, languageVersion)) {
tokenValue = scanExtendedUnicodeEscape(/*shouldEmitInvalidEscapeError*/ true) + scanIdentifierParts();
return token = getIdentifierToken();
}

const cookedChar = peekUnicodeEscape();
if (cookedChar >= 0 && isIdentifierStart(cookedChar, languageVersion)) {
pos += 6;
tokenFlags |= TokenFlags.UnicodeEscape;
tokenValue = String.fromCharCode(cookedChar) + scanIdentifierParts();
return token = getIdentifierToken();
}

error(Diagnostics.Invalid_character);
pos++;
return token = SyntaxKind.Unknown;
case CharacterCodes.hash:
if (pos !== 0 && text[pos + 1] === "!") {
error(Diagnostics.can_only_be_used_at_the_start_of_a_file, pos, 2);
pos++;
if (pos !== 1 && codePointUnchecked(pos) === CharacterCodes.exclamation) {
pos++;
error(Diagnostics.can_only_be_used_at_the_start_of_a_file, pos - 2, 2);
return token = SyntaxKind.Unknown;
}

const charAfterHash = codePointUnchecked(pos + 1);
if (charAfterHash === CharacterCodes.backslash) {
pos++;
const extendedCookedChar = peekExtendedUnicodeEscape();
if (extendedCookedChar >= 0 && isIdentifierStart(extendedCookedChar, languageVersion)) {
tokenValue = "#" + scanExtendedUnicodeEscape(/*shouldEmitInvalidEscapeError*/ true) + scanIdentifierParts();
return token = SyntaxKind.PrivateIdentifier;
}

const cookedChar = peekUnicodeEscape();
if (cookedChar >= 0 && isIdentifierStart(cookedChar, languageVersion)) {
pos += 6;
tokenFlags |= TokenFlags.UnicodeEscape;
tokenValue = "#" + String.fromCharCode(cookedChar) + scanIdentifierParts();
return token = SyntaxKind.PrivateIdentifier;
}
pos--;
}

if (isIdentifierStart(charAfterHash, languageVersion)) {
pos++;
// We're relying on scanIdentifier's behavior and adjusting the token kind after the fact.
// Notably absent from this block is the fact that calling a function named "scanIdentifier",
// but identifiers don't include '#', and that function doesn't deal with it at all.
// This works because 'scanIdentifier' tries to reuse source characters and builds up substrings;
// however, it starts at the 'tokenPos' which includes the '#', and will "accidentally" prepend the '#' for us.
scanIdentifier(charAfterHash, languageVersion);
}
else {
tokenValue = "#";
error(Diagnostics.Invalid_character, pos++, charSize(ch));
if (!scanIdentifier(languageVersion)) {
error(Diagnostics.Invalid_character, pos - 1, 1);
}
tokenValue = "#" + tokenValue;
return token = SyntaxKind.PrivateIdentifier;
case CharacterCodes.replacementCharacter:
error(Diagnostics.File_appears_to_be_binary, 0, 0);
pos = end;
return token = SyntaxKind.NonTextFileMarkerTrivia;
default:
const identifierKind = scanIdentifier(ch, languageVersion);
if (identifierKind) {
return token = identifierKind;
if (scanIdentifier(languageVersion)) {
return token;
}
else if (isWhiteSpaceSingleLine(ch)) {
pos += charSize(ch);
Expand Down Expand Up @@ -2411,27 +2366,39 @@ export function createScanner(
function reScanInvalidIdentifier(): SyntaxKind {
Debug.assert(token === SyntaxKind.Unknown, "'reScanInvalidIdentifier' should only be called when the current token is 'SyntaxKind.Unknown'.");
pos = tokenStart = fullStartPos;
tokenFlags = 0;
const ch = codePointUnchecked(pos);
const identifierKind = scanIdentifier(ch, ScriptTarget.ESNext);
if (identifierKind) {
return token = identifierKind;
}
pos += charSize(ch);
return token; // Still `SyntaxKind.Unknown`
tokenFlags = TokenFlags.None;
return scanIdentifier(ScriptTarget.ESNext);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested this: nothing in the test suite is affected even if `pos` is not advanced.

}

function scanIdentifier(startCharacter: number, languageVersion: ScriptTarget) {
let ch = startCharacter;
if (isIdentifierStart(ch, languageVersion)) {
pos += charSize(ch);
while (pos < end && isIdentifierPart(ch = codePointUnchecked(pos), languageVersion)) pos += charSize(ch);
tokenValue = text.substring(tokenStart, pos);
if (ch === CharacterCodes.backslash) {
tokenValue += scanIdentifierParts();
function scanIdentifierStart(languageVersion: ScriptTarget): string {
const ch = codePointChecked(pos);
if (ch === CharacterCodes.backslash) {
const extendedCookedChar = peekExtendedUnicodeEscape();
if (extendedCookedChar >= 0 && isIdentifierStart(extendedCookedChar, languageVersion)) {
return scanExtendedUnicodeEscape(/*shouldEmitInvalidEscapeError*/ true);
}
return getIdentifierToken();

const cookedChar = peekUnicodeEscape();
if (cookedChar >= 0 && isIdentifierStart(cookedChar, languageVersion)) {
pos += 6; // Valid Unicode escape is always six characters
tokenFlags |= TokenFlags.UnicodeEscape;
return String.fromCharCode(cookedChar);
}
}
else if (isIdentifierStart(ch, languageVersion)) {
pos += charSize(ch);
return utf16EncodeAsString(ch);
}
return "";
}

function scanIdentifier(languageVersion: ScriptTarget, identifierVariant?: LanguageVariant) {
tokenValue = scanIdentifierStart(languageVersion);
if (tokenValue) {
tokenValue += scanIdentifierParts(languageVersion, identifierVariant);
return token = getIdentifierToken();
}
return token = SyntaxKind.Unknown;
}

function reScanGreaterToken(): SyntaxKind {
Expand Down Expand Up @@ -2996,8 +2963,7 @@ export function createScanner(
function scanGroupName(isReference: boolean) {
Debug.assertEqual(charCodeUnchecked(pos - 1), CharacterCodes.lessThan);
tokenStart = pos;
scanIdentifier(codePointChecked(pos), languageVersion);
if (pos === tokenStart) {
if (!scanIdentifier(languageVersion)) {
error(Diagnostics.Expected_a_capturing_group_name);
}
else if (isReference) {
Expand Down Expand Up @@ -3772,19 +3738,8 @@ export function createScanner(
// everything after it to the token
// Do note that this means that `scanJsxIdentifier` effectively _mutates_ the visible token without advancing to a new token
// Any caller should be expecting this behavior and should only read the pos or token value after calling it.
while (pos < end) {
const ch = charCodeUnchecked(pos);
if (ch === CharacterCodes.minus) {
tokenValue += "-";
pos++;
continue;
}
const oldPos = pos;
tokenValue += scanIdentifierParts(); // reuse `scanIdentifierParts` so unicode escapes are handled
if (pos === oldPos) {
break;
}
}
// Here `scanIdentifierParts` is reused to ensure unicode escapes are handled.
tokenValue += scanIdentifierParts(languageVersion, LanguageVariant.JSX);
return getIdentifierToken();
}
return token;
Expand Down Expand Up @@ -3893,36 +3848,11 @@ export function createScanner(
return token = SyntaxKind.BacktickToken;
case CharacterCodes.hash:
return token = SyntaxKind.HashToken;
case CharacterCodes.backslash:
pos--;
const extendedCookedChar = peekExtendedUnicodeEscape();
if (extendedCookedChar >= 0 && isIdentifierStart(extendedCookedChar, languageVersion)) {
tokenValue = scanExtendedUnicodeEscape(/*shouldEmitInvalidEscapeError*/ true) + scanIdentifierParts();
return token = getIdentifierToken();
}

const cookedChar = peekUnicodeEscape();
if (cookedChar >= 0 && isIdentifierStart(cookedChar, languageVersion)) {
pos += 6;
tokenFlags |= TokenFlags.UnicodeEscape;
tokenValue = String.fromCharCode(cookedChar) + scanIdentifierParts();
return token = getIdentifierToken();
}
pos++;
return token = SyntaxKind.Unknown;
}

if (isIdentifierStart(ch, languageVersion)) {
let char = ch;
while (pos < end && isIdentifierPart(char = codePointUnchecked(pos), languageVersion) || char === CharacterCodes.minus) pos += charSize(char);
tokenValue = text.substring(tokenStart, pos);
if (char === CharacterCodes.backslash) {
tokenValue += scanIdentifierParts();
}
return token = getIdentifierToken();
}
else {
return token = SyntaxKind.Unknown;
default:
pos = tokenStart;
const identifierKind = scanIdentifier(languageVersion, LanguageVariant.JSX);
if (!identifierKind) pos += charSize(ch); // skip the character
return identifierKind;
}
}

Expand Down
2 changes: 1 addition & 1 deletion tests/baselines/reference/TypeArgumentList1.errors.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ TypeArgumentList1.ts(1,14): error TS2695: Left side of comma operator is unused
!!! error TS2304: Cannot find name 'A'.
~
!!! error TS2304: Cannot find name 'B'.

~
!!! error TS1127: Invalid character.
~
!!! error TS2304: Cannot find name 'C'.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ constructorWithIncompleteTypeAnnotation.ts(261,1): error TS1128: Declaration or
var undef = undefined;

var _\uD4A5\u7204\uC316\uE59F = local;

~
!!! error TS1127: Invalid character.
var мир = local;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ invalidUnicodeEscapeSequance.ts(1,8): error TS1127: Invalid character.

==== invalidUnicodeEscapeSequance.ts (1 errors) ====
var arg\u003

~
!!! error TS1127: Invalid character.
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ invalidUnicodeEscapeSequance2.ts(1,8): error TS1127: Invalid character.

==== invalidUnicodeEscapeSequance2.ts (1 errors) ====
var arg\uxxxx

~
!!! error TS1127: Invalid character.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ invalidUnicodeEscapeSequance3.ts(1,3): error TS2304: Cannot find name 'u'.
a\u
~
!!! error TS2304: Cannot find name 'a'.

~
!!! error TS1127: Invalid character.
~
!!! error TS2304: Cannot find name 'u'.
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ invalidUnicodeEscapeSequance4.ts(2,5): error TS1127: Invalid character.
==== invalidUnicodeEscapeSequance4.ts (1 errors) ====
var a\u0031; // a1 is a valid identifier
var \u0031a; // 1a is an invalid identifier

~
!!! error TS1127: Invalid character.
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,18 @@

1 const a =!@#!@$
   ~~
a.ts:1:13 - error TS1134: Variable declaration expected.
a.ts:1:14 - error TS1134: Variable declaration expected.

1 const a =!@#!@$
   ~
a.ts:1:16 - error TS1109: Expression expected.

1 const a =!@#!@$
   
   ~
a.ts:2:13 - error TS18026: '#!' can only be used at the start of a file.

2 const b = !@#!@#!@#!
   ~~
a.ts:2:14 - error TS1134: Variable declaration expected.
a.ts:2:15 - error TS1134: Variable declaration expected.

2 const b = !@#!@#!@#!
   ~
   ~
a.ts:2:16 - error TS18026: '#!' can only be used at the start of a file.

2 const b = !@#!@#!@#!
Expand Down Expand Up @@ -76,18 +72,16 @@
  ~~~~~


==== a.ts (16 errors) ====
==== a.ts (15 errors) ====
const a =!@#!@$
~~
!!! error TS18026: '#!' can only be used at the start of a file.
~
~
!!! error TS1134: Variable declaration expected.

!!! error TS1109: Expression expected.
const b = !@#!@#!@#!
~~
!!! error TS18026: '#!' can only be used at the start of a file.
~
~
!!! error TS1134: Variable declaration expected.
~~
!!! error TS18026: '#!' can only be used at the start of a file.
Expand Down Expand Up @@ -125,8 +119,8 @@
limit
~~~~~
!!! error TS2304: Cannot find name 'limit'.
Found 19 errors in 2 files.
Found 18 errors in 2 files.

Errors Files
16 a.ts:1
15 a.ts:1
3 b.ts:1
Loading