Skip to content

Commit ce9d3b9

Browse files
committed
Add UTF-8 byte matching optimization
1 parent 12dd0df commit ce9d3b9

File tree

11 files changed

+150
-23
lines changed

11 files changed

+150
-23
lines changed

Sources/RegexBenchmark/Suite/URLRegex.swift

+1
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ extension BenchmarkRunner {
1212
url.register(&self)
1313
}
1414
}
15+

Sources/_StringProcessing/ByteCodeGen.swift

+24
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,30 @@ fileprivate extension Compiler.ByteCodeGen {
117117
}
118118

119119
mutating func emitQuotedLiteral(_ s: String) {
120+
// ASCII is normalization-invariant, so it is the safe subset for
121+
// us to optimize
122+
if optimizationsEnabled,
123+
!options.usesCanonicalEquivalence || s.utf8.allSatisfy(\._isASCII),
124+
!s.isEmpty
125+
{
126+
127+
// TODO: Make an optimizations configuration struct, where
128+
// we can enable/disable specific optimizations and change
129+
// thresholds
130+
let longThreshold = 5
131+
132+
// Longer content will be matched against UTF-8 in contiguous
133+
// memory
134+
//
135+
// TODO: case-insensitive variant (just add/subtract from
136+
// ASCII value)
137+
if s.utf8.count >= longThreshold, !options.isCaseInsensitive {
138+
let boundaryCheck = options.semanticLevel == .graphemeCluster
139+
builder.buildMatchUTF8(Array(s.utf8), boundaryCheck: boundaryCheck)
140+
return
141+
}
142+
}
143+
120144
guard options.semanticLevel == .graphemeCluster else {
121145
for char in s {
122146
for scalar in char.unicodeScalars {

Sources/_StringProcessing/Engine/InstPayload.swift

+12-4
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ extension Instruction.Payload {
4343
// and variables
4444

4545
case string(StringRegister)
46-
case sequence(SequenceRegister)
46+
case utf8(UTF8Register)
4747
case position(PositionRegister)
4848
case optionalString(StringRegister?)
4949
case int(IntRegister)
@@ -168,10 +168,18 @@ extension Instruction.Payload {
168168
return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck)
169169
}
170170

171-
init(sequence: SequenceRegister) {
172-
self.init(sequence)
171+
init(utf8: UTF8Register, boundaryCheck: Bool) {
172+
self.init(boundaryCheck ? 1 : 0, utf8)
173173
}
174-
var sequence: SequenceRegister {
174+
var matchUTF8Payload: (UTF8Register, boundaryCheck: Bool) {
175+
let pair: (UInt64, UTF8Register) = interpretPair()
176+
return (pair.1, pair.0 == 1)
177+
}
178+
179+
init(utf8: UTF8Register) {
180+
self.init(utf8)
181+
}
182+
var utf8: UTF8Register {
175183
interpret()
176184
}
177185

Sources/_StringProcessing/Engine/Instruction.swift

+11
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,17 @@ extension Instruction {
112112
/// Operands: Scalar value to match against and booleans
113113
case matchScalar
114114

115+
/// Match directly (binary semantics) against a series of UTF-8 bytes
116+
///
117+
/// NOTE: The compiler must emit this instruction only when normalization
118+
/// is not required. E.g., scalar-semantic mode or when the matched portion is entirely ASCII
119+
/// (which is invariant under NFC). Similarly, this instruction is case-sensitive.
120+
///
121+
/// TODO: should we add case-insensitive?
122+
///
123+
/// matchUTF8(_: UTF8Register, boundaryCheck: Bool)
124+
case matchUTF8
125+
115126
/// Match a character or a scalar against a set of valid ascii values stored in a bitset
116127
///
117128
/// matchBitset(_: AsciiBitsetRegister, isScalar: Bool)

Sources/_StringProcessing/Engine/MEBuilder.swift

+8-3
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ extension MEProgram {
2020
var enableMetrics = false
2121

2222
var elements = TypedSetVector<Input.Element, _ElementRegister>()
23-
var sequences = TypedSetVector<[Input.Element], _SequenceRegister>()
23+
var utf8Contents = TypedSetVector<[UInt8], _UTF8Register>()
2424

2525
var asciiBitsets: [DSLTree.CustomCharacterClass.AsciiBitset] = []
2626
var consumeFunctions: [ConsumeFunction] = []
@@ -198,6 +198,11 @@ extension MEProgram.Builder {
198198
.match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive)))
199199
}
200200

201+
mutating func buildMatchUTF8(_ utf8: Array<UInt8>, boundaryCheck: Bool) {
202+
instructions.append(.init(.matchUTF8, .init(
203+
utf8: utf8Contents.store(utf8), boundaryCheck: boundaryCheck)))
204+
}
205+
201206
mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) {
202207
instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck)))
203208
}
@@ -416,7 +421,7 @@ extension MEProgram.Builder {
416421

417422
var regInfo = MEProgram.RegisterInfo()
418423
regInfo.elements = elements.count
419-
regInfo.sequences = sequences.count
424+
regInfo.utf8Contents = utf8Contents.count
420425
regInfo.ints = nextIntRegister.rawValue
421426
regInfo.values = nextValueRegister.rawValue
422427
regInfo.positions = nextPositionRegister.rawValue
@@ -430,7 +435,7 @@ extension MEProgram.Builder {
430435
return MEProgram(
431436
instructions: InstructionList(instructions),
432437
staticElements: elements.stored,
433-
staticSequences: sequences.stored,
438+
staticUTF8Contents: utf8Contents.stored,
434439
staticBitsets: asciiBitsets,
435440
staticConsumeFunctions: consumeFunctions,
436441
staticTransformFunctions: transformFunctions,

Sources/_StringProcessing/Engine/MEProgram.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ struct MEProgram {
2323
var instructions: InstructionList<Instruction>
2424

2525
var staticElements: [Input.Element]
26-
var staticSequences: [[Input.Element]]
26+
var staticUTF8Contents: [[UInt8]]
2727
var staticBitsets: [DSLTree.CustomCharacterClass.AsciiBitset]
2828
var staticConsumeFunctions: [ConsumeFunction]
2929
var staticTransformFunctions: [TransformFunction]

Sources/_StringProcessing/Engine/Processor.swift

+48
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,24 @@ extension Processor {
320320
return true
321321
}
322322

323+
// TODO: bytes should be a Span or RawSpan
324+
mutating func matchUTF8(
325+
_ bytes: Array<UInt8>,
326+
boundaryCheck: Bool
327+
) -> Bool {
328+
guard let next = input.matchUTF8(
329+
bytes,
330+
at: currentPosition,
331+
limitedBy: end,
332+
boundaryCheck: boundaryCheck
333+
) else {
334+
signalFailure()
335+
return false
336+
}
337+
currentPosition = next
338+
return true
339+
}
340+
323341
// If we have a bitset we know that the CharacterClass only matches against
324342
// ascii characters, so check if the current input element is ascii then
325343
// check if it is set in the bitset
@@ -542,6 +560,15 @@ extension Processor {
542560
controller.step()
543561
}
544562

563+
case .matchUTF8:
564+
let (utf8Reg, boundaryCheck) = payload.matchUTF8Payload
565+
let utf8Content = registers[utf8Reg]
566+
if matchUTF8(
567+
utf8Content, boundaryCheck: boundaryCheck
568+
) {
569+
controller.step()
570+
}
571+
545572
case .matchBitset:
546573
let (isScalar, reg) = payload.bitsetPayload
547574
let bitset = registers[reg]
@@ -752,6 +779,27 @@ extension String {
752779
return idx
753780
}
754781

782+
func matchUTF8(
783+
_ bytes: Array<UInt8>,
784+
at pos: Index,
785+
limitedBy end: Index,
786+
boundaryCheck: Bool
787+
) -> Index? {
788+
var cur = pos
789+
for b in bytes {
790+
guard cur < end, self.utf8[cur] == b else { return nil }
791+
self.utf8.formIndex(after: &cur)
792+
}
793+
794+
guard cur <= end else { return nil }
795+
796+
if boundaryCheck && !isOnGraphemeClusterBoundary(cur) {
797+
return nil
798+
}
799+
800+
return cur
801+
}
802+
755803
func matchASCIIBitset(
756804
_ bitset: DSLTree.CustomCharacterClass.AsciiBitset,
757805
at pos: Index,

Sources/_StringProcessing/Engine/Registers.swift

+9-11
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,9 @@ extension Processor {
2424
// Verbatim elements to compare against
2525
var elements: [Element]
2626

27-
// Verbatim sequences to compare against
28-
//
29-
// TODO: Degenericize Processor and store Strings
30-
var sequences: [[Element]] = []
31-
27+
// Verbatim bytes to compare against
28+
var utf8Contents: [[UInt8]]
29+
3230
var bitsets: [DSLTree.CustomCharacterClass.AsciiBitset]
3331

3432
var consumeFunctions: [MEProgram.ConsumeFunction]
@@ -55,9 +53,6 @@ extension Processor {
5553
extension Processor.Registers {
5654
typealias Input = String
5755

58-
subscript(_ i: SequenceRegister) -> [Input.Element] {
59-
sequences[i.rawValue]
60-
}
6156
subscript(_ i: IntRegister) -> Int {
6257
get { ints[i.rawValue] }
6358
set {
@@ -82,6 +77,9 @@ extension Processor.Registers {
8277
subscript(_ i: ElementRegister) -> Input.Element {
8378
elements[i.rawValue]
8479
}
80+
subscript(_ i: UTF8Register) -> [UInt8] {
81+
utf8Contents[i.rawValue]
82+
}
8583
subscript(
8684
_ i: AsciiBitsetRegister
8785
) -> DSLTree.CustomCharacterClass.AsciiBitset {
@@ -110,8 +108,8 @@ extension Processor.Registers {
110108
self.elements = program.staticElements
111109
assert(elements.count == info.elements)
112110

113-
self.sequences = program.staticSequences
114-
assert(sequences.count == info.sequences)
111+
self.utf8Contents = program.staticUTF8Contents
112+
assert(utf8Contents.count == info.utf8Contents)
115113

116114
self.bitsets = program.staticBitsets
117115
assert(bitsets.count == info.bitsets)
@@ -156,7 +154,7 @@ extension MutableCollection {
156154
extension MEProgram {
157155
struct RegisterInfo {
158156
var elements = 0
159-
var sequences = 0
157+
var utf8Contents = 0
160158
var bools = 0
161159
var strings = 0
162160
var bitsets = 0

Sources/_StringProcessing/MatchingOptions.swift

+9-1
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,15 @@ extension MatchingOptions {
125125
? .graphemeCluster
126126
: .unicodeScalar
127127
}
128-
128+
129+
/// Whether matching needs to honor canonical equivalence.
130+
///
131+
/// Currently, this is synonymous with grapheme-cluster semantics, but could
132+
/// become its own option in the future.
133+
var usesCanonicalEquivalence: Bool {
134+
semanticLevel == .graphemeCluster
135+
}
136+
129137
var usesNSRECompatibleDot: Bool {
130138
stack.last!.contains(.nsreCompatibleDot)
131139
}

Sources/_StringProcessing/Utility/TypedInt.swift

+3-2
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,9 @@ enum _SavePointAddress {}
122122
typealias ElementRegister = TypedInt<_ElementRegister>
123123
enum _ElementRegister {}
124124

125-
typealias SequenceRegister = TypedInt<_SequenceRegister>
126-
enum _SequenceRegister {}
125+
/// The register number for a sequence of UTF-8 bytes
126+
typealias UTF8Register = TypedInt<_UTF8Register>
127+
enum _UTF8Register {}
127128

128129
/// The register number for a stored boolean value
129130
///

Tests/RegexTests/CompileTests.swift

+24-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ enum DecodedInstr {
4141
case matchAnyNonNewline
4242
case matchBitset
4343
case matchBuiltin
44+
case matchUTF8
4445
case consumeBy
4546
case assertBy
4647
case matchBy
@@ -141,6 +142,8 @@ extension DecodedInstr {
141142
return .captureValue
142143
case .matchBuiltin:
143144
return .matchBuiltin
145+
case .matchUTF8:
146+
return .matchUTF8
144147
}
145148
}
146149
}
@@ -443,10 +446,30 @@ extension RegexTests {
443446
contains: [.matchScalarUnchecked],
444447
doesNotContain: [.match, .consumeBy, .matchScalar])
445448
expectProgram(
446-
for: "aaa\u{301}",
449+
for: "a\u{301}",
447450
semanticLevel: .unicodeScalar,
448451
contains: [.matchScalarUnchecked],
449452
doesNotContain: [.match, .consumeBy, .matchScalar])
453+
expectProgram(
454+
for: "abcdefg",
455+
semanticLevel: .unicodeScalar,
456+
contains: [.matchUTF8],
457+
doesNotContain: [.match, .consumeBy, .matchScalar])
458+
expectProgram(
459+
for: "abcdefg",
460+
semanticLevel: .graphemeCluster,
461+
contains: [.matchUTF8],
462+
doesNotContain: [.match, .consumeBy, .matchScalar])
463+
expectProgram(
464+
for: "aaa\u{301}",
465+
semanticLevel: .unicodeScalar,
466+
contains: [.matchUTF8],
467+
doesNotContain: [.match, .consumeBy, .matchScalar])
468+
expectProgram(
469+
for: "aaa\u{301}",
470+
semanticLevel: .graphemeCluster,
471+
contains: [.match],
472+
doesNotContain: [.matchUTF8, .consumeBy])
450473
}
451474

452475
func testCaseInsensitivityCompilation() {

0 commit comments

Comments
 (0)