Parse 0.2 after `...` as float literal, not member access

ahoppen · ahoppen · commit be58d53d8e68 · 2023-02-01T15:56:00.000+01:00
Deciding whether `0.2` should be lexed as a float literal or a member access is a little more difficult than just looking at the previous character because `0.2` might be preceded by an operator like `...` or `.^.`, in which case it should be lexed as a float literal and not a member access.

We might be able to do some disambiguation magic based on whether the character before the period is also an operator continuation point, but that seems fairly brittle to me. The sanest way of doing this is to store the previously lexed token’s kind in the cursor and check that.

I measured and did not see a performance regression when parsing MovieSwiftUI.

rdar://103273988
diff --git a/Sources/SwiftParser/Lexer.swift b/Sources/SwiftParser/Lexer.swift
@@ -214,6 +214,8 @@ extension Lexer {
   public struct Cursor: Equatable {
     var input: UnsafeBufferPointer<UInt8>
     var previous: UInt8
+    /// If we have already lexed a token, the kind of the previously lexed token
+    var previousTokenKind: RawTokenKind?
 
     @_spi(LexerDiagnostics)
     public init(input: UnsafeBufferPointer<UInt8>, previous: UInt8) {
@@ -338,6 +340,10 @@ extension Lexer.Cursor {
 }
 
 extension Lexer.Cursor {
+  /// Revert the lexer by `offset` bytes. This should only be used by `resetForSplit`.
+  /// This must not back up by more bytes than the last token because that would
+  /// require us to also update `previousTokenKind`, which we don't do in this
+  /// function.
   fileprivate mutating func backUp(by offset: Int) {
     assert(!self.isAtStartOfFile)
     self.previous = self.input.baseAddress!.advanced(by: -(offset + 1)).pointee
@@ -796,6 +802,9 @@ extension Lexer.Cursor {
     if newlineInLeadingTrivia == .present {
       flags.insert(.isAtStartOfLine)
     }
+
+    self.previousTokenKind = kind
+
     return .init(
       tokenKind: kind,
       flags: flags,
@@ -1393,7 +1402,15 @@ extension Lexer.Cursor {
     if !self.isAtEndOfFile, self.peek() == UInt8(ascii: ".") {
       // NextToken is the soon to be previous token
       // Therefore: x.0.1 is sub-tuple access, not x.float_literal
-      if self.input.count > 1, !Unicode.Scalar(self.peek(at: 1)).isDigit || TokStart.previous == UInt8(ascii: ".") {
+      if self.input.count <= 1 {
+        // If there are no more digits following the '.', we don't have a float
+        // literal.
+        return (.integerLiteral, [])
+      } else if !Unicode.Scalar(self.peek(at: 1)).isDigit {
+        // ".a" is a member access and certainly not a float literal
+        return (.integerLiteral, [])
+      } else if self.previousTokenKind == .period {
+        // Lex x.0.1 as sub-tuple access, not x.float_literal.
         return (.integerLiteral, [])
       }
     } else {
diff --git a/Tests/SwiftParserTest/LexerTests.swift b/Tests/SwiftParserTest/LexerTests.swift
@@ -935,6 +935,55 @@ public class LexerTests: XCTestCase {
       ]
     )
   }
+
+  func testMultiDigitTupleAccess() {
+    var data = "x.13.1"
+    data.withUTF8 { buf in
+      let lexemes = Lexer.lex(buf)
+      AssertEqualTokens(
+        lexemes,
+        [
+          lexeme(.identifier, "x"),
+          lexeme(.period, "."),
+          lexeme(.integerLiteral, "13"),
+          lexeme(.period, "."),
+          lexeme(.integerLiteral, "1"),
+          lexeme(.eof, ""),
+        ]
+      )
+    }
+  }
+
+  func testFloatingPointNumberAfterRangeOperator() {
+    var data = "0.1...0.2"
+    data.withUTF8 { buf in
+      let lexemes = Lexer.lex(buf)
+      AssertEqualTokens(
+        lexemes,
+        [
+          lexeme(.floatingLiteral, "0.1"),
+          lexeme(.unspacedBinaryOperator, "..."),
+          lexeme(.floatingLiteral, "0.2"),
+          lexeme(.eof, ""),
+        ]
+      )
+    }
+  }
+
+  func testUnterminatedFloatLiteral() {
+    var data = "0."
+    data.withUTF8 { buf in
+      let lexemes = Lexer.lex(buf)
+      AssertEqualTokens(
+        lexemes,
+        [
+          lexeme(.integerLiteral, "0"),
+          lexeme(.unknown, "."),
+          lexeme(.eof, ""),
+        ]
+      )
+    }
+  }
 }
 
 extension Lexer {