Skip to content

Add UTF8 byte matching optimization #787

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ let package = Package(
swiftSettings: [availabilityDefinition]),
.testTarget(
name: "RegexTests",
dependencies: ["_StringProcessing", "TestSupport"],
dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
swiftSettings: [
availabilityDefinition
]),
Expand Down
1 change: 1 addition & 0 deletions Sources/RegexBenchmark/BenchmarkRegistration.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ extension BenchmarkRunner {
self.addIpAddress()

self.addURLWithWordBoundaries()
self.addFSPathsRegex()
// -- end of registrations --
}
}
51 changes: 51 additions & 0 deletions Sources/RegexBenchmark/Inputs/FSPaths.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Successful match FSPaths
private let fsPathSuccess = #"""
./First/Second/Third/some/really/long/content.extension/more/stuff/OptionLeft
./First/Second/Third/some/really/long/content.extension/more/stuff/OptionRight
./First/Second/PrefixThird/some/really/long/content.extension/more/stuff/OptionLeft
./First/Second/PrefixThird/some/really/long/content.extension/more/stuff/OptionRight
"""#

// Unsuccessful match FSPaths.
//
// We will have far more failures than successful matches by interspersing
// this whole list between each success
private let fsPathFailure = #"""
a/b/c
/smol/path
/a/really/long/path/that/is/certainly/stored/out/of/line
./First/Second/Third/some/really/long/content.extension/more/stuff/NothingToSeeHere
./First/Second/PrefixThird/some/really/long/content.extension/more/stuff/NothingToSeeHere
./First/Second/Third/some/really/long/content.extension/more/stuff/OptionNeither
./First/Second/PrefixThird/some/really/long/content.extension/more/stuff/OptionNeither
/First/Second/Third/some/really/long/content.extension/more/stuff/OptionLeft
/First/Second/Third/some/really/long/content.extension/more/stuff/OptionRight
/First/Second/PrefixThird/some/really/long/content.extension/more/stuff/OptionLeft
/First/Second/PrefixThird/some/really/long/content.extension/more/stuff/OptionRight
./First/Second/Third/some/really/long/content/more/stuff/OptionLeft
./First/Second/Third/some/really/long/content/more/stuff/OptionRight
./First/Second/PrefixThird/some/really/long/content/more/stuff/OptionLeft
./First/Second/PrefixThird/some/really/long/content/more/stuff/OptionRight
"""#

extension Inputs {
  /// Benchmark input list mixing non-matching and matching paths.
  ///
  /// The list starts with every failure line, then each success line is
  /// followed by the complete failure list again, so failures heavily
  /// outnumber successes. The combined list is then doubled four times.
  static let fsPathsList: [String] = {
    // Break a multi-line literal into one `String` per line.
    func lines(of text: String) -> [String] {
      text.split(whereSeparator: \.isNewline).map(String.init)
    }

    let nonMatching = lines(of: fsPathFailure)

    var paths = nonMatching
    for matching in lines(of: fsPathSuccess) {
      paths.append(matching)
      paths.append(contentsOf: nonMatching)
    }

    // Scale the list up a bit: each pass doubles it.
    for _ in 0..<4 {
      paths.append(contentsOf: paths)
    }

    return paths
  }()
}
16 changes: 16 additions & 0 deletions Sources/RegexBenchmark/Suite/FSPathsRegex.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import _StringProcessing


extension BenchmarkRunner {
  /// Registers the "FSPathsRegex" benchmark, which runs a file-system-path
  /// pattern over every entry of `Inputs.fsPathsList`.
  mutating func addFSPathsRegex() {
    let pattern =
      #"^\./First/Second/(Prefix)?Third/.*\.extension/.*(OptionLeft|OptionRight)$"#

    CrossInputListBenchmark(
      baseName: "FSPathsRegex",
      regex: pattern,
      inputs: Inputs.fsPathsList
    ).register(&self)
  }
}

1 change: 1 addition & 0 deletions Sources/RegexBenchmark/Suite/URLRegex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ extension BenchmarkRunner {
url.register(&self)
}
}

24 changes: 24 additions & 0 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,30 @@ fileprivate extension Compiler.ByteCodeGen {
}

mutating func emitQuotedLiteral(_ s: String) {
// ASCII is normalization-invariant, so is the safe subset for
// us to optimize
if optimizationsEnabled,
!options.usesCanonicalEquivalence || s.utf8.allSatisfy(\._isASCII),
!s.isEmpty
{

// TODO: Make an optimizations configuration struct, where
// we can enable/disable specific optimizations and change
// thresholds
let longThreshold = 5

// Longer content will be matched against UTF-8 in contiguous
// memory
//
// TODO: case-insensitive variant (just add/subtract from
// ASCII value)
if s.utf8.count >= longThreshold, !options.isCaseInsensitive {
let boundaryCheck = options.semanticLevel == .graphemeCluster
builder.buildMatchUTF8(Array(s.utf8), boundaryCheck: boundaryCheck)
return
}
}

guard options.semanticLevel == .graphemeCluster else {
for char in s {
for scalar in char.unicodeScalars {
Expand Down
16 changes: 12 additions & 4 deletions Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ extension Instruction.Payload {
// and variables

case string(StringRegister)
case sequence(SequenceRegister)
case utf8(UTF8Register)
case position(PositionRegister)
case optionalString(StringRegister?)
case int(IntRegister)
Expand Down Expand Up @@ -168,10 +168,18 @@ extension Instruction.Payload {
return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck)
}

init(sequence: SequenceRegister) {
self.init(sequence)
/// Packs a `matchUTF8` payload from a UTF-8 register and a boundary-check flag.
///
/// The flag is encoded as `1` when `boundaryCheck` is true and `0` otherwise,
/// and is passed ahead of the register to the pair initializer.
init(utf8: UTF8Register, boundaryCheck: Bool) {
self.init(boundaryCheck ? 1 : 0, utf8)
}
var sequence: SequenceRegister {
/// Unpacks a `matchUTF8` payload into its UTF-8 register and the
/// boundary-check flag (true when the stored flag value equals `1`).
var matchUTF8Payload: (UTF8Register, boundaryCheck: Bool) {
  let (flag, register): (UInt64, UTF8Register) = interpretPair()
  return (register, boundaryCheck: flag == 1)
}

/// Creates a payload from a single UTF-8 register by forwarding it to the
/// unlabeled initializer.
init(utf8: UTF8Register) {
self.init(utf8)
}
/// The payload read back as a `UTF8Register` (via `interpret()`).
var utf8: UTF8Register {
interpret()
}

Expand Down
11 changes: 11 additions & 0 deletions Sources/_StringProcessing/Engine/Instruction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,17 @@ extension Instruction {
/// Operands: Scalar value to match against and booleans
case matchScalar

/// Match directly (binary semantics) against a series of UTF-8 bytes
///
/// NOTE: The compiler should only emit this instruction when normalization
/// is not required, e.g., in scalar-semantic mode or when the matched portion is entirely ASCII
/// (which is invariant under NFC). Similarly, this is case-sensitive.
///
/// TODO: should we add case-insensitive?
///
/// matchUTF8(_: UTF8Register, boundaryCheck: Bool)
case matchUTF8

/// Match a character or a scalar against a set of valid ascii values stored in a bitset
///
/// matchBitset(_: AsciiBitsetRegister, isScalar: Bool)
Expand Down
11 changes: 8 additions & 3 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ extension MEProgram {
var enableMetrics = false

var elements = TypedSetVector<Input.Element, _ElementRegister>()
var sequences = TypedSetVector<[Input.Element], _SequenceRegister>()
var utf8Contents = TypedSetVector<[UInt8], _UTF8Register>()

var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = []
var consumeFunctions: [ConsumeFunction] = []
Expand Down Expand Up @@ -198,6 +198,11 @@ extension MEProgram.Builder {
.match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive)))
}

/// Appends a `matchUTF8` instruction for the given bytes.
///
/// The bytes are stored in `utf8Contents`; the resulting register and
/// `boundaryCheck` are packed into the instruction's payload.
mutating func buildMatchUTF8(_ utf8: Array<UInt8>, boundaryCheck: Bool) {
  let register = utf8Contents.store(utf8)
  let payload: Instruction.Payload = .init(
    utf8: register, boundaryCheck: boundaryCheck)
  instructions.append(.init(.matchUTF8, payload))
}

/// Appends a `matchScalar` instruction for `s`; the payload is always built
/// with `caseInsensitive: false`.
mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) {
  let payload: Instruction.Payload = .init(
    scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck)
  instructions.append(.init(.matchScalar, payload))
}
Expand Down Expand Up @@ -416,7 +421,7 @@ extension MEProgram.Builder {

var regInfo = MEProgram.RegisterInfo()
regInfo.elements = elements.count
regInfo.sequences = sequences.count
regInfo.utf8Contents = utf8Contents.count
regInfo.ints = nextIntRegister.rawValue
regInfo.values = nextValueRegister.rawValue
regInfo.positions = nextPositionRegister.rawValue
Expand All @@ -430,7 +435,7 @@ extension MEProgram.Builder {
return MEProgram(
instructions: InstructionList(instructions),
staticElements: elements.stored,
staticSequences: sequences.stored,
staticUTF8Contents: utf8Contents.stored,
staticBitsets: asciiBitsets,
staticConsumeFunctions: consumeFunctions,
staticTransformFunctions: transformFunctions,
Expand Down
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/Engine/MEProgram.swift
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ struct MEProgram {
var instructions: InstructionList<Instruction>

var staticElements: [Input.Element]
var staticSequences: [[Input.Element]]
var staticUTF8Contents: [[UInt8]]
var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset]
var staticConsumeFunctions: [ConsumeFunction]
var staticTransformFunctions: [TransformFunction]
Expand Down
48 changes: 48 additions & 0 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,24 @@ extension Processor {
return true
}

// TODO: bytes should be a Span or RawSpan
/// Tries to match `bytes` in the input starting at `currentPosition`.
///
/// On success, moves `currentPosition` to the index returned by the match
/// and returns `true`. On failure, signals failure and returns `false`.
mutating func matchUTF8(
  _ bytes: Array<UInt8>,
  boundaryCheck: Bool
) -> Bool {
  if let matchEnd = input.matchUTF8(
    bytes,
    at: currentPosition,
    limitedBy: end,
    boundaryCheck: boundaryCheck
  ) {
    currentPosition = matchEnd
    return true
  }

  signalFailure()
  return false
}

// If we have a bitset we know that the CharacterClass only matches against
// ascii characters, so check if the current input element is ascii then
// check if it is set in the bitset
Expand Down Expand Up @@ -542,6 +560,15 @@ extension Processor {
controller.step()
}

case .matchUTF8:
let (utf8Reg, boundaryCheck) = payload.matchUTF8Payload
let utf8Content = registers[utf8Reg]
if matchUTF8(
utf8Content, boundaryCheck: boundaryCheck
) {
controller.step()
}

case .matchBitset:
let (isScalar, reg) = payload.bitsetPayload
let bitset = registers[reg]
Expand Down Expand Up @@ -752,6 +779,27 @@ extension String {
return idx
}

func matchUTF8(
_ bytes: Array<UInt8>,
at pos: Index,
limitedBy end: Index,
boundaryCheck: Bool
) -> Index? {
var cur = pos
for b in bytes {
guard cur < end, self.utf8[cur] == b else { return nil }
self.utf8.formIndex(after: &cur)
}

guard cur <= end else { return nil }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this could be an assertion, but it's probably fine here.


if boundaryCheck && !isOnGraphemeClusterBoundary(cur) {
return nil
}

return cur
}

func matchASCIIBitset(
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
at pos: Index,
Expand Down
20 changes: 9 additions & 11 deletions Sources/_StringProcessing/Engine/Registers.swift
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@ extension Processor {
// Verbatim elements to compare against
var elements: [Element]

// Verbatim sequences to compare against
//
// TODO: Degenericize Processor and store Strings
var sequences: [[Element]] = []

// Verbatim bytes to compare against
var utf8Contents: [[UInt8]]

var bitsets: [DSLTree.CustomCharacterClass.AsciiBitset]

var consumeFunctions: [MEProgram.ConsumeFunction]
Expand All @@ -55,9 +53,6 @@ extension Processor {
extension Processor.Registers {
typealias Input = String

subscript(_ i: SequenceRegister) -> [Input.Element] {
sequences[i.rawValue]
}
subscript(_ i: IntRegister) -> Int {
get { ints[i.rawValue] }
set {
Expand All @@ -82,6 +77,9 @@ extension Processor.Registers {
subscript(_ i: ElementRegister) -> Input.Element {
elements[i.rawValue]
}
/// Looks up the stored UTF-8 byte sequence for register `i`.
subscript(_ i: UTF8Register) -> [UInt8] {
  let slot = i.rawValue
  return utf8Contents[slot]
}
subscript(
_ i: AsciiBitsetRegister
) -> DSLTree.CustomCharacterClass.AsciiBitset {
Expand Down Expand Up @@ -110,8 +108,8 @@ extension Processor.Registers {
self.elements = program.staticElements
assert(elements.count == info.elements)

self.sequences = program.staticSequences
assert(sequences.count == info.sequences)
self.utf8Contents = program.staticUTF8Contents
assert(utf8Contents.count == info.utf8Contents)

self.bitsets = program.staticBitsets
assert(bitsets.count == info.bitsets)
Expand Down Expand Up @@ -156,7 +154,7 @@ extension MutableCollection {
extension MEProgram {
struct RegisterInfo {
var elements = 0
var sequences = 0
var utf8Contents = 0
var bools = 0
var strings = 0
var bitsets = 0
Expand Down
10 changes: 9 additions & 1 deletion Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,15 @@ extension MatchingOptions {
? .graphemeCluster
: .unicodeScalar
}


/// Whether matching must honor canonical equivalence.
///
/// For now this is the same thing as grapheme-cluster semantics, but it
/// could become its own option in the future.
var usesCanonicalEquivalence: Bool {
  let level = semanticLevel
  return level == .graphemeCluster
}

/// Whether the last entry of the options stack contains `.nsreCompatibleDot`.
var usesNSRECompatibleDot: Bool {
// NOTE(review): the force-unwrap traps if `stack` is empty — presumably the
// stack always holds at least one entry; confirm against its initializer.
stack.last!.contains(.nsreCompatibleDot)
}
Expand Down
5 changes: 3 additions & 2 deletions Sources/_StringProcessing/Utility/TypedInt.swift
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,9 @@ enum _SavePointAddress {}
typealias ElementRegister = TypedInt<_ElementRegister>
enum _ElementRegister {}

typealias SequenceRegister = TypedInt<_SequenceRegister>
enum _SequenceRegister {}
/// The register number for a sequence of UTF-8 bytes
typealias UTF8Register = TypedInt<_UTF8Register>
enum _UTF8Register {}

/// The register number for a stored boolean value
///
Expand Down
Loading