Skip to content

Commit c5f32d9

Browse files
authored
Merge branch 'main' into word_breaking_test_case
2 parents 87dc7fb + 69f406c commit c5f32d9

File tree

6 files changed

+70
-13
lines changed

6 files changed

+70
-13
lines changed

Sources/_RegexParser/Regex/AST/MatchingOptions.swift

+3
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,9 @@ extension AST {
4444

4545
// Swift-only default possessive quantifier
4646
case possessiveByDefault // t.b.d.
47+
48+
// NSRegularExpression compatibility special-case
49+
case nsreCompatibleDot // no AST representation
4750
}
4851

4952
public var kind: Kind

Sources/_RegexParser/Regex/Parse/Sema.swift

+2-1
Original file line number | Diff line number | Diff line change
@@ -142,7 +142,8 @@ extension RegexValidator {
142142

143143
case .caseInsensitive, .possessiveByDefault, .reluctantByDefault,
144144
.singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended,
145-
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps:
145+
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps,
146+
.nsreCompatibleDot:
146147
break
147148
}
148149
}

Sources/_StringProcessing/ByteCodeGen.swift

+10-4
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,7 @@ fileprivate extension Compiler.ByteCodeGen {
6767
emitAnyNonNewline()
6868

6969
case .dot:
70-
emitDot()
70+
try emitDot()
7171

7272
case let .char(c):
7373
emitCharacter(c)
@@ -238,9 +238,15 @@ fileprivate extension Compiler.ByteCodeGen {
238238
}
239239
}
240240

241-
mutating func emitDot() {
241+
mutating func emitDot() throws {
242242
if options.dotMatchesNewline {
243-
emitAny()
243+
if options.usesNSRECompatibleDot {
244+
try emitAlternation([
245+
.atom(.characterClass(.newlineSequence)),
246+
.atom(.anyNonNewline)])
247+
} else {
248+
emitAny()
249+
}
244250
} else {
245251
emitAnyNonNewline()
246252
}
@@ -964,7 +970,7 @@ fileprivate extension Compiler.ByteCodeGen {
964970
case let .customCharacterClass(ccc):
965971
if ccc.containsDot {
966972
if !ccc.isInverted {
967-
emitDot()
973+
try emitDot()
968974
} else {
969975
throw Unsupported("Inverted any")
970976
}

Sources/_StringProcessing/MatchingOptions.swift

+7
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,10 @@ extension MatchingOptions {
120120
? .graphemeCluster
121121
: .unicodeScalar
122122
}
123+
124+
var usesNSRECompatibleDot: Bool {
125+
stack.last!.contains(.nsreCompatibleDot)
126+
}
123127
}
124128

125129
// MARK: - Implementation
@@ -141,6 +145,7 @@ extension MatchingOptions {
141145
// Not available via regex literal flags
142146
case transparentBounds
143147
case withoutAnchoringBounds
148+
case nsreCompatibleDot
144149

145150
// Oniguruma options
146151
case asciiOnlyDigit
@@ -197,6 +202,8 @@ extension MatchingOptions {
197202
self = .byteSemantics
198203
case .possessiveByDefault:
199204
self = .possessiveByDefault
205+
case .nsreCompatibleDot:
206+
self = .nsreCompatibleDot
200207

201208
// Whitespace options are only relevant during parsing, not compilation.
202209
case .extended, .extraExtended:

Sources/_StringProcessing/Regex/Options.swift

+12
Original file line number | Diff line number | Diff line change
@@ -159,6 +159,18 @@ extension Regex {
159159
return wrapInOption(.unicodeScalarSemantics, addingIf: true)
160160
}
161161
}
162+
163+
/// Returns a regular expression that uses an NSRegularExpression
164+
/// compatibility mode.
165+
///
166+
/// This mode uses Unicode scalar semantics and, when the separate
167+
/// dot-matches-newlines option is also enabled, lets a dot (`.`) match a
168+
/// whole newline sequence, such as `\r\n`, as a single unit.
169+
@_spi(Foundation)
170+
public var _nsreCompatibility: Regex<RegexOutput> {
171+
wrapInOption(.nsreCompatibleDot, addingIf: true)
172+
.wrapInOption(.unicodeScalarSemantics, addingIf: true)
173+
}
162174
}
163175

164176
/// A semantic level to use during regex matching.

Tests/RegexTests/MatchTests.swift

+36-8
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
import XCTest
1313
@testable import _RegexParser
14-
@testable @_spi(RegexBenchmark) import _StringProcessing
14+
@testable @_spi(RegexBenchmark) @_spi(Foundation) import _StringProcessing
1515
import TestSupport
1616

1717
struct MatchError: Error {
@@ -2749,13 +2749,41 @@ extension RegexTests {
27492749
XCTAssertNotNil(str.wholeMatch(of: possessiveRegex))
27502750
}
27512751
}
2752-
2753-
func testFoo() throws {
2754-
let re = try Regex(#"^[\u{0000}-\u{024F}]+$"#)
2755-
2756-
XCTAssertNotNil("aaa".wholeMatch(of: re))
2757-
XCTAssertNotNil("aa\u{301}a".wholeMatch(of: re))
2758-
XCTAssertNil("aa\u{301}\u{302}a".wholeMatch(of: re))
2752+
2753+
func testNSRECompatibility() throws {
2754+
// NSRE-compatibility includes scalar matching, so `[\r\n]` should match
2755+
// either `\r` or `\n`.
2756+
let text = #"""
2757+
y=sin(x)+sin(2x)+sin(3x);\#rText "This is a function of x.";\r
2758+
"""#
2759+
let lineTerminationRegex = try Regex(#";[\r\n]"#)
2760+
._nsreCompatibility
2761+
2762+
let afterLine = try XCTUnwrap(text.firstRange(of: "Text"))
2763+
let match = try lineTerminationRegex.firstMatch(in: text)
2764+
XCTAssert(match?.range.upperBound == afterLine.lowerBound)
2765+
2766+
// NSRE-compatibility treats "dot" as special, in that it can match a
2767+
// newline sequence as well as a single Unicode scalar.
2768+
let aDotBRegex = try Regex(#"a.b"#)
2769+
._nsreCompatibility
2770+
.dotMatchesNewlines()
2771+
for input in ["a\rb", "a\nb", "a\r\nb"] {
2772+
XCTAssertNotNil(try aDotBRegex.wholeMatch(in: input))
2773+
}
2774+
2775+
// NSRE-compatibility doesn't give special treatment to newline sequences
2776+
// when matching other "match everything" regex patterns, like `[[^z]z]`,
2777+
// so this pattern doesn't match "a\r\nb".
2778+
let aCCBRegex = try Regex(#"a[[^z]z]b"#)
2779+
._nsreCompatibility
2780+
for input in ["a\rb", "a\nb", "a\r\nb"] {
2781+
if input.unicodeScalars.count == 3 {
2782+
XCTAssertNotNil(try aCCBRegex.wholeMatch(in: input))
2783+
} else {
2784+
XCTAssertNil(try aCCBRegex.wholeMatch(in: input))
2785+
}
2786+
}
27592787
}
27602788
}
27612789

0 commit comments

Comments
 (0)