Skip to content

Commit 02b828d

Browse files
Sendable Config (#189)
* Sendable Config ConfigTests, BinaryDistinctDictionary removed, Config JSON serialization/deserialization, Config compatible with jinja templating system @dynamicMemberLookup brought back for backward compatibility, ConfigTests/ConfigEquatable, Config.Data equality improved @dynamicMemberLookup dot notation used in favour of the subscript formatting rebase Test cleanup Test fix * swiftformat --config .swiftformat . * swift-tools-version: 5.9, platforms: [.iOS(.v17), .macOS(.v14)] * Testing replaced with XCTest * Package.swift reverted * ConfigTests string encoding fix * Package.swift dependency cleanup * Update ConfigTests.swift Co-authored-by: Pedro Cuenca <[email protected]> * swiftformat --------- Co-authored-by: Pedro Cuenca <[email protected]>
1 parent ca72653 commit 02b828d

21 files changed

+1775
-284
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,6 @@ DerivedData/
99
.swiftpm/config/registries.json
1010
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
1111
.netrc
12-
.idea
12+
.idea
13+
.index-build
14+
*.out

Package.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ let package = Package(
3030
.target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]),
3131
.target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]),
3232
.testTarget(name: "TokenizersTests", dependencies: ["Tokenizers", "Models", "Hub"], resources: [.process("Resources"), .process("Vocabs")]),
33-
.testTarget(name: "HubTests", dependencies: ["Hub"]),
33+
.testTarget(name: "HubTests", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")]),
3434
.testTarget(name: "PreTokenizerTests", dependencies: ["Tokenizers", "Hub"]),
3535
.testTarget(name: "TensorUtilsTests", dependencies: ["TensorUtils", "Models", "Hub"], resources: [.process("Resources")]),
3636
.testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"]),

Sources/Hub/BinaryDistinct.swift

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
//
2+
// BinaryDistinct.swift
3+
// swift-transformers
4+
//
5+
// Created by Piotr Kowalczuk on 06.03.25.
6+
//
7+
8+
import Foundation
9+
10+
/// A string stored as raw UTF-16 code units, so that equality, hashing and ordering are decided by the exact
/// code-unit sequence.
///
/// It works around two limitations: `String` compares using Unicode canonical equivalence, and `NSString` is
/// not `Sendable`. For background see
/// [Modifying-and-Comparing-Strings](https://developer.apple.com/documentation/swift/string#Modifying-and-Comparing-Strings).
public struct BinaryDistinctString: Equatable, Hashable, Sendable, Comparable, CustomStringConvertible, ExpressibleByStringLiteral {
    /// The UTF-16 code units backing this string.
    public let value: [UInt16]

    /// The contents bridged to an `NSString`.
    public var nsString: NSString {
        String(utf16CodeUnits: value, count: value.count) as NSString
    }

    /// The contents as a native Swift `String`.
    public var string: String {
        String(nsString)
    }

    /// Number of `Character`s in ``string`` (not the number of UTF-16 code units).
    public var count: Int {
        string.count
    }

    /// Satisfies ``CustomStringConvertible`` protocol.
    public var description: String {
        string
    }

    /// Wraps an existing sequence of UTF-16 code units without any conversion.
    public init(_ bytes: [UInt16]) {
        value = bytes
    }

    /// Collects the UTF-16 code units of every `Character` of `str`, in order.
    public init(_ str: NSString) {
        value = (str as String).reduce(into: [UInt16]()) { units, character in
            units.append(contentsOf: character.utf16)
        }
    }

    /// Creates the value from a Swift `String` by going through the `NSString` initializer.
    public init(_ str: String) {
        self.init(str as NSString)
    }

    /// Creates a string holding exactly the code units of `character`.
    public init(_ character: BinaryDistinctCharacter) {
        value = character.bytes
    }

    /// Concatenates the code units of all `characters`, preserving their order.
    public init(_ characters: [BinaryDistinctCharacter]) {
        value = characters.flatMap { $0.bytes }
    }

    /// Satisfies ``ExpressibleByStringLiteral`` protocol.
    public init(stringLiteral value: String) {
        self.init(value)
    }

    /// Two values are equal only when their code-unit arrays are identical.
    public static func == (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> Bool {
        lhs.value == rhs.value
    }

    /// Orders values lexicographically by their code units.
    public static func < (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> Bool {
        lhs.value.lexicographicallyPrecedes(rhs.value)
    }

    /// Concatenates the code units of both operands.
    public static func + (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> BinaryDistinctString {
        BinaryDistinctString(lhs.value + rhs.value)
    }

    /// Returns `true` when the code units of `prefix` open this string's code units.
    public func hasPrefix(_ prefix: BinaryDistinctString) -> Bool {
        prefix.value.count <= value.count && value.starts(with: prefix.value)
    }

    /// Returns `true` when the code units of `suffix` close this string's code units.
    public func hasSuffix(_ suffix: BinaryDistinctString) -> Bool {
        let needed = suffix.value.count
        if needed > value.count {
            return false
        }
        return value.suffix(needed).elementsEqual(suffix.value)
    }

    /// Returns a copy lowercased with `String.lowercased()`.
    public func lowercased() -> BinaryDistinctString {
        BinaryDistinctString(string.lowercased())
    }

    /// Returns a copy in which occurrences of `of` are replaced by `with`.
    ///
    /// The search is delegated to `String.replacingOccurrences(of:with:)` on the `String` forms of the operands.
    public func replacingOccurrences(of: Self, with: Self) -> BinaryDistinctString {
        let replaced = string.replacingOccurrences(of: of.string, with: with.string)
        return BinaryDistinctString(replaced)
    }
}
90+
91+
public extension BinaryDistinctString {
    /// Positions are plain integer offsets in the range `0...count`.
    typealias Index = Int

    /// Position of the first element; always `0`.
    var startIndex: Index { 0 }

    /// One-past-the-end position, equal to `count`.
    var endIndex: Index { count }

    /// Returns the position `distance` steps away from `i`.
    ///
    /// - Parameters:
    ///   - i: The starting position.
    ///   - distance: Number of steps to move; negative values move backwards.
    /// - Returns: `i + distance`.
    /// - Note: Traps with `"Index out of bounds"` when the result falls outside `0...count`.
    func index(_ i: Index, offsetBy distance: Int) -> Index {
        let newIndex = i + distance
        guard newIndex >= 0, newIndex <= count else {
            fatalError("Index out of bounds")
        }
        return newIndex
    }

    /// Returns the position `distance` steps away from `i`, or `nil` when that would move past `limit`.
    ///
    /// - Parameters:
    ///   - i: The starting position.
    ///   - distance: Number of steps to move; negative values move backwards.
    ///   - limit: A boundary the result must not cross.
    /// - Returns: `i + distance`, or `nil` if the move crosses `limit`.
    func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? {
        let newIndex = i + distance
        if distance < 0 {
            // Moving backwards, `limit` acts as a lower bound, and (as in `Collection`) it only applies
            // when it lies at or before the starting position. Previously the backward direction was
            // checked with `newIndex <= limit`, which let results slip below the limit and rejected
            // valid ones above it.
            let crossesLimit = limit <= i && newIndex < limit
            return crossesLimit ? nil : newIndex
        }
        // Forward (and zero-distance) moves keep the original rule: the result may not exceed `limit`.
        return newIndex <= limit ? newIndex : nil
    }
}
110+
111+
extension BinaryDistinctString: Sequence {
    /// Iterates over the `Character`s of ``string``, wrapping each one in a `BinaryDistinctCharacter`.
    public func makeIterator() -> AnyIterator<BinaryDistinctCharacter> {
        // The native `String` iterator does the character segmentation.
        var characters = string.makeIterator()
        return AnyIterator {
            characters.next().map { BinaryDistinctCharacter($0) }
        }
    }
}
121+
122+
public extension BinaryDistinctString {
    /// Returns the slice from the `Character` offset `bounds.lowerBound` to the end of the string.
    ///
    /// - Note: Traps with `"Index out of bounds"` when the lower bound is negative or greater than `count`.
    subscript(bounds: PartialRangeFrom<Int>) -> BinaryDistinctString {
        // The upper bound must be expressed in the same unit as the lower bound (Characters),
        // not in UTF-16 code units; the two differ for surrogate pairs and combining sequences.
        let end = count
        guard bounds.lowerBound <= end else {
            fatalError("Index out of bounds")
        }
        return self[bounds.lowerBound..<end]
    }

    /// Returns the slice covering the `Character` offsets in `bounds`.
    ///
    /// Offsets use the same unit as `count` and iteration (one step per `Character`), so a slice never
    /// splits a grapheme cluster. The Character offsets are translated into UTF-16 offsets before the
    /// backing `value` array is sliced.
    ///
    /// - Parameter bounds: Half-open range of Character offsets; must lie within `0...count`.
    /// - Returns: A new `BinaryDistinctString` holding the code units of the selected characters.
    ///   An empty range yields an empty string.
    /// - Note: Traps with `"Index out of bounds"` when `bounds` is not within `0...count`.
    subscript(bounds: Range<Int>) -> BinaryDistinctString {
        let text = string
        guard bounds.lowerBound >= 0,
              let lower = text.index(text.startIndex, offsetBy: bounds.lowerBound, limitedBy: text.endIndex),
              let upper = text.index(lower, offsetBy: bounds.count, limitedBy: text.endIndex)
        else {
            fatalError("Index out of bounds")
        }

        // `value` holds UTF-16 code units, so the slice positions must be UTF-16 offsets.
        let unitRange = lower.utf16Offset(in: text)..<upper.utf16Offset(in: text)
        return BinaryDistinctString(Array(value[unitRange]))
    }
}
158+
159+
public extension Dictionary where Key == BinaryDistinctString {
    /// Merges a dictionary keyed by `BinaryDistinctString` into this one.
    ///
    /// - Parameters:
    ///   - other: The entries to merge in.
    ///   - strategy: Receives the existing and the incoming value for a duplicate key and returns the one
    ///     to keep. Defaults to keeping the incoming value.
    mutating func merge(_ other: [BinaryDistinctString: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        merge(other, uniquingKeysWith: strategy)
    }

    /// Merges a `[String: Value]` dictionary into this one after converting its keys.
    mutating func merge(_ other: [String: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        let pairs = other.map { entry in (BinaryDistinctString(entry.key), entry.value) }
        let rekeyed: [BinaryDistinctString: Value] = Dictionary(uniqueKeysWithValues: pairs)
        merge(rekeyed, uniquingKeysWith: strategy)
    }

    /// Merges a `[NSString: Value]` dictionary into this one after converting its keys.
    mutating func merge(_ other: [NSString: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        let pairs = other.map { entry in (BinaryDistinctString(entry.key), entry.value) }
        let rekeyed: [BinaryDistinctString: Value] = Dictionary(uniqueKeysWithValues: pairs)
        merge(rekeyed, uniquingKeysWith: strategy)
    }

    /// Returns a copy of this dictionary with the `[String: Value]` entries merged in.
    func merging(_ other: [String: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var result = self
        result.merge(other, strategy: strategy)
        return result
    }

    /// Returns a copy of this dictionary with the `[BinaryDistinctString: Value]` entries merged in.
    func merging(_ other: [BinaryDistinctString: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        merging(other, uniquingKeysWith: strategy)
    }

    /// Returns a copy of this dictionary with the `[NSString: Value]` entries merged in.
    func merging(_ other: [NSString: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var result = self
        result.merge(other, strategy: strategy)
        return result
    }
}
195+
196+
/// Marker protocol for string-like types that can be written as a string literal.
///
/// It adds no requirements beyond ``ExpressibleByStringLiteral``.
public protocol StringConvertible: ExpressibleByStringLiteral {}

extension NSString: StringConvertible {}
extension String: StringConvertible {}
extension BinaryDistinctString: StringConvertible {}
201+
202+
/// A character-like value stored as its raw UTF-16 code units.
///
/// Equality and hashing use the exact code units, so canonically equivalent but differently encoded
/// characters are distinct. The type is `Sendable`, matching `BinaryDistinctString`, because its only
/// stored state is an immutable `[UInt16]`.
public struct BinaryDistinctCharacter: Equatable, Hashable, Sendable, CustomStringConvertible, ExpressibleByStringLiteral {
    /// The UTF-16 code units of the character.
    let bytes: [UInt16]

    /// Stores the UTF-16 code units of `character`.
    public init(_ character: Character) {
        bytes = Array(character.utf16)
    }

    /// Stores the UTF-16 code units of `string` as-is; the string is not checked to be a single character.
    public init(_ string: String) {
        bytes = Array(string.utf16)
    }

    /// Stores the UTF-16 code units of `nsString` after bridging it to `String`.
    public init(_ nsString: NSString) {
        let swiftString = nsString as String
        bytes = Array(swiftString.utf16)
    }

    /// Wraps already-encoded UTF-16 code units without any conversion.
    public init(bytes: [UInt16]) {
        self.bytes = bytes
    }

    /// Satisfies ``ExpressibleByStringLiteral`` protocol.
    public init(stringLiteral value: String) {
        self.init(value)
    }

    /// The code units decoded into a `String`.
    ///
    /// The property is optional for API compatibility; `String(utf16CodeUnits:count:)` itself is non-failable.
    var stringValue: String? {
        String(utf16CodeUnits: bytes, count: bytes.count)
    }

    /// Debug-style description showing the decoded text and each code unit in hexadecimal.
    public var description: String {
        let hexUnits = bytes.map { String(format: "0x%02X", $0) }
        guard let str = stringValue else {
            // Defensive fallback; the payload is UTF-16, so the message names that encoding.
            return "BinaryDistinctCharacter(invalid UTF-16, bytes: \(hexUnits))"
        }
        return "BinaryDistinctCharacter('\(str)', bytes: \(hexUnits))"
    }

    /// Two characters are equal only when their code-unit arrays are identical.
    public static func == (lhs: BinaryDistinctCharacter, rhs: BinaryDistinctCharacter) -> Bool {
        lhs.bytes == rhs.bytes
    }
}

0 commit comments

Comments
 (0)