Skip to content

Commit 393c24d

Browse files
authored
UTF8Span (#78531) (#80890)
Add support for UTF8Span. Also, refactor validation and grapheme breaking.
1 parent c363658 commit 393c24d

22 files changed

+3340
-442
lines changed

Runtimes/Core/core/CMakeLists.txt

+7
Original file line number | Diff line number | Diff line change
@@ -208,6 +208,13 @@ add_library(swiftCore
208208
UnsafeRawPointer.swift
209209
UTFEncoding.swift
210210
UTF8.swift
211+
UTF8EncodingError.swift
212+
UTF8Span.swift
213+
UTF8SpanBits.swift
214+
UTF8SpanComparisons.swift
215+
UTF8SpanFundamentals.swift
216+
UTF8SpanInternalHelpers.swift
217+
UTF8SpanIterators.swift
211218
UTF16.swift
212219
UTF32.swift
213220
Unicode.swift # ORDER DEPENDENCY: must follow new unicode support

stdlib/public/core/CMakeLists.txt

+7
Original file line number | Diff line number | Diff line change
@@ -214,6 +214,13 @@ split_embedded_sources(
214214
EMBEDDED UnsafeRawPointer.swift
215215
EMBEDDED UTFEncoding.swift
216216
EMBEDDED UTF8.swift
217+
EMBEDDED UTF8EncodingError.swift
218+
EMBEDDED UTF8Span.swift
219+
EMBEDDED UTF8SpanBits.swift
220+
EMBEDDED UTF8SpanComparisons.swift
221+
EMBEDDED UTF8SpanFundamentals.swift
222+
EMBEDDED UTF8SpanInternalHelpers.swift
223+
EMBEDDED UTF8SpanIterators.swift
217224
EMBEDDED UTF16.swift
218225
EMBEDDED UTF32.swift
219226
EMBEDDED Unicode.swift # ORDER DEPENDENCY: must follow new unicode support

stdlib/public/core/GroupInfo.json

+9
Original file line number | Diff line number | Diff line change
@@ -205,6 +205,15 @@
205205
"RawSpan.swift",
206206
"Span.swift"
207207
],
208+
"UTF8Span": [
209+
"UTF8EncodingError.swift",
210+
"UTF8Span.swift",
211+
"UTF8SpanBits.swift",
212+
"UTF8SpanComparisons.swift",
213+
"UTF8SpanFundamentals.swift",
214+
"UTF8SpanInternalHelpers.swift",
215+
"UTF8SpanIterators.swift"
216+
],
208217
"Protocols": [
209218
"CompilerProtocols.swift",
210219
"ShadowProtocols.swift"

stdlib/public/core/String.swift

-104
Original file line number | Diff line number | Diff line change
@@ -1112,108 +1112,4 @@ extension String {
11121112
}
11131113
}
11141114

1115-
extension _StringGutsSlice {
1116-
internal func _isScalarNFCQC(
1117-
_ scalar: Unicode.Scalar,
1118-
_ prevCCC: inout UInt8
1119-
) -> Bool {
1120-
let normData = Unicode._NormData(scalar, fastUpperbound: 0x300)
11211115

1122-
if prevCCC > normData.ccc, normData.ccc != 0 {
1123-
return false
1124-
}
1125-
1126-
if !normData.isNFCQC {
1127-
return false
1128-
}
1129-
1130-
prevCCC = normData.ccc
1131-
return true
1132-
}
1133-
1134-
internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
1135-
let substring = String(_guts)[range]
1136-
// Fast path: If we're already NFC (or ASCII), then we don't need to do
1137-
// anything at all.
1138-
if _fastPath(_guts.isNFC) {
1139-
try substring.utf8.forEach(f)
1140-
return
1141-
}
1142-
1143-
var isNFCQC = true
1144-
var prevCCC: UInt8 = 0
1145-
1146-
if _guts.isFastUTF8 {
1147-
_fastNFCCheck(&isNFCQC, &prevCCC)
1148-
1149-
// Because we have access to the fastUTF8, we can go through that instead
1150-
// of accessing the UTF8 view on String.
1151-
if isNFCQC {
1152-
try unsafe withFastUTF8 {
1153-
for unsafe byte in unsafe $0 {
1154-
try f(byte)
1155-
}
1156-
}
1157-
1158-
return
1159-
}
1160-
} else {
1161-
for scalar in substring.unicodeScalars {
1162-
if !_isScalarNFCQC(scalar, &prevCCC) {
1163-
isNFCQC = false
1164-
break
1165-
}
1166-
}
1167-
1168-
if isNFCQC {
1169-
for byte in substring.utf8 {
1170-
try f(byte)
1171-
}
1172-
1173-
return
1174-
}
1175-
}
1176-
1177-
for scalar in substring.unicodeScalars._internalNFC {
1178-
try scalar.withUTF8CodeUnits {
1179-
for unsafe byte in unsafe $0 {
1180-
try f(byte)
1181-
}
1182-
}
1183-
}
1184-
}
1185-
1186-
internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
1187-
unsafe withFastUTF8 { utf8 in
1188-
var position = 0
1189-
1190-
while position < utf8.count {
1191-
// If our first byte is less than 0xCC, then it means we're under the
1192-
// 0x300 scalar value and everything up to 0x300 is NFC already.
1193-
if unsafe utf8[position] < 0xCC {
1194-
// If our first byte is less than 0xC0, then it means it is ASCII
1195-
// and only takes up a single byte.
1196-
if unsafe utf8[position] < 0xC0 {
1197-
position &+= 1
1198-
} else {
1199-
// Otherwise, this is a 2 byte < 0x300 sequence.
1200-
position &+= 2
1201-
}
1202-
// ASCII always has ccc of 0.
1203-
prevCCC = 0
1204-
1205-
continue
1206-
}
1207-
1208-
let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: position)
1209-
1210-
if !_isScalarNFCQC(scalar, &prevCCC) {
1211-
isNFCQC = false
1212-
return
1213-
}
1214-
1215-
position &+= len
1216-
}
1217-
}
1218-
}
1219-
}

stdlib/public/core/StringComparison.swift

+1-1
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ internal func _stringCompareInternal(
9797
}
9898

9999
@_effects(readonly)
100-
private func _stringCompareFastUTF8(
100+
internal func _stringCompareFastUTF8(
101101
_ utf8Left: UnsafeBufferPointer<UInt8>,
102102
_ utf8Right: UnsafeBufferPointer<UInt8>,
103103
expecting: _StringComparisonResult,

stdlib/public/core/StringCreate.swift

+2-2
Original file line number | Diff line number | Diff line change
@@ -117,7 +117,7 @@ extension String {
117117
return unsafe (String._uncheckedFromUTF8(
118118
input, asciiPreScanResult: extraInfo.isASCII
119119
), false)
120-
case .error(let initialRange):
120+
case .error(_, let initialRange):
121121
return unsafe (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
122122
}
123123
}
@@ -139,7 +139,7 @@ extension String {
139139
newIsASCII: info.isASCII
140140
)
141141
return result.asString
142-
case .error(let initialRange):
142+
case .error(_, let initialRange):
143143
defer { _fixLifetime(result) }
144144
//This could be optimized to use excess tail capacity
145145
return unsafe repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)

0 commit comments

Comments
 (0)