@@ -49,6 +49,9 @@ struct TransformersCLI: AsyncParsableCommand {
@Option(help: "Repetition penalty to discourage repeating tokens (typical: 1.0-2.0, 1.0 = no penalty)")
var repetitionPenalty: Float?

@Option(help: "Path to a local folder containing tokenizer_config.json and tokenizer.json")
var tokenizerFolder: String?
Review comment (Member):
Suggested change:
-var tokenizerFolder: String?
+var tokenizerPath: String?

(nit: this is perhaps more idiomatic in Swift APIs)


func generate(
model: LanguageModel,
config: GenerationConfig,
@@ -104,7 +107,17 @@ struct TransformersCLI: AsyncParsableCommand {
let url = URL(filePath: modelPath)
let compiledURL = try compile(at: url)
print("Loading model \(compiledURL)")
-let model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
let model: LanguageModel
if let tokenizerFolder {
let tokenizerURL = URL(filePath: tokenizerFolder, directoryHint: .isDirectory)
model = try LanguageModel.loadCompiled(
url: compiledURL,
tokenizerFolder: tokenizerURL,
computeUnits: computeUnits.asMLComputeUnits
)
} else {
model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
}

var config = model.defaultGenerationConfig
config.doSample = doSample
37 changes: 35 additions & 2 deletions README.md
@@ -88,6 +88,41 @@ example converting and running Mistral 7B using CoreML [here](https://github.com

The [modernization of Core ML](https://github.com/huggingface/swift-transformers/pull/257) and corresponding examples were primarily contributed by @joshnewnham, @1duo, @alejandro-isaza, @aseemw. Thank you 🙏

### Offline CoreML tokenizers

When you bundle a compiled CoreML model and tokenizer files with your app, you can skip any network requests by injecting
the tokenizer (or a local configuration) when constructing `LanguageModel`:

```swift
let compiledURL: URL = ... // path to .mlmodelc
let tokenizerFolder: URL = ... // folder containing tokenizer_config.json and tokenizer.json
Review comment (Member):
Suggested change:
-let tokenizerFolder: URL = ... // folder containing tokenizer_config.json and tokenizer.json
+let tokenizerURL: URL = ... // folder containing tokenizer_config.json and tokenizer.json


let model = try LanguageModel.loadCompiled(
url: compiledURL,
tokenizerFolder: tokenizerFolder
)

// Or construct the tokenizer yourself (inside an async context)
let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerFolder)
let modelWithTokenizer = try LanguageModel.loadCompiled(
url: compiledURL,
tokenizer: tokenizer
)
```

Make sure the tokenizer assets come from the same Hugging Face repo as the original checkpoint. For the
Review comment (Member):
Suggested change:
-Make sure the tokenizer assets come from the same Hugging Face repo as the original checkpoint. For the
+Make sure the tokenizer assets come from the same Hugging Face repo as the original checkpoint or are compatible with the model you use. For the

Mistral example in `Examples/Mistral7B/`, you can fetch the tokenizer like this:

```bash
huggingface-cli download \
mistralai/Mistral-7B-Instruct-v0.3 \
tokenizer.json tokenizer_config.json \
--local-dir Examples/Mistral7B/local-tokenizer
```

If the repo is gated, authenticate with `huggingface-cli login` first. Both initializers rely solely on the
local tokenizer assets you provide and never reach out to the Hugging Face Hub.
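
For example, an app that bundles the compiled model and tokenizer files in its resources could resolve everything locally like this (an illustrative sketch; the resource names are hypothetical):

```swift
// Locate the bundled assets (resource names here are examples only)
guard let tokenizerFolder = Bundle.main.url(forResource: "local-tokenizer", withExtension: nil),
      let compiledURL = Bundle.main.url(forResource: "Model", withExtension: "mlmodelc")
else {
    fatalError("Bundled model or tokenizer assets not found")
}

// No network access: the tokenizer is read straight from the app bundle
let model = try LanguageModel.loadCompiled(url: compiledURL, tokenizerFolder: tokenizerFolder)
```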

## Usage via SwiftPM

To use `swift-transformers` with SwiftPM, you can add this to your `Package.swift`:
@@ -139,5 +174,3 @@ To format your code, run `swift format -i --recursive .`.
## License

[Apache 2](LICENSE).


96 changes: 85 additions & 11 deletions Sources/Models/LanguageModel.swift
@@ -33,12 +33,30 @@ public class LanguageModel {

/// Creates a new language model instance from a CoreML model.
///
-/// - Parameter model: The CoreML model to wrap
/// - Parameters:
/// - model: The CoreML model to wrap
/// - configuration: Optional Hub configuration already resolved on disk
/// - tokenizer: Optional preconstructed tokenizer to reuse
/// - Important: Triggers a fatal error if the model doesn't have the expected input shape information
-public required init(model: MLModel) {
public required init(
model: MLModel,
configuration: LanguageModelConfigurationFromHub? = nil,
tokenizer: Tokenizer? = nil
) {
self.model = model
_tokenizer = tokenizer
(minContextLength, maxContextLength) = Self.contextRange(from: model)
-configuration = LanguageModelConfigurationFromHub(modelName: modelName)
if let configuration {
self.configuration = configuration
} else if tokenizer == nil {
self.configuration = LanguageModelConfigurationFromHub(modelName: modelName)
} else {
self.configuration = nil
}
Review comment (Member) on lines +49 to +55:
I find it a bit confusing that if configuration is provided, then tokenizer will be silently ignored. These look like two different ways to inject a tokenizer. Could we maybe use multiple initializers instead?

Another option is to just remove the configuration argument for now and discuss in a new PR. Is the main reason to add it to provide a custom HubApi? That's useful, of course, but perhaps we could just provide that instead of the full configuration.
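
A rough sketch of the multiple-initializer idea (hypothetical, shown only to illustrate the suggestion; not part of this diff):

```swift
// Each injection path gets its own entry point, so the two can never conflict.
public convenience init(model: MLModel, tokenizer: Tokenizer) {
    self.init(model: model, configuration: nil, tokenizer: tokenizer)
}

public convenience init(model: MLModel, configuration: LanguageModelConfigurationFromHub) {
    self.init(model: model, configuration: configuration, tokenizer: nil)
}
```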

}

public convenience required init(model: MLModel) {
self.init(model: model, configuration: nil, tokenizer: nil)
}

public func resetState() async {}
@@ -142,17 +160,60 @@ public extension LanguageModel {
/// - Parameters:
/// - url: The URL of the compiled CoreML model file (.mlmodelc)
/// - computeUnits: The compute units to use for model inference
/// - configuration: Optional Hub configuration describing tokenizer/model metadata
/// - tokenizer: Optional tokenizer instance to reuse instead of loading from disk
/// - Returns: A configured `LanguageModel` instance
/// - Throws: An error if the model cannot be loaded from the specified URL
-static func loadCompiled(url: URL, computeUnits: MLComputeUnits = .cpuAndGPU) throws -> LanguageModel {
static func loadCompiled(
url: URL,
computeUnits: MLComputeUnits = .cpuAndGPU,
configuration: LanguageModelConfigurationFromHub? = nil,
tokenizer: Tokenizer? = nil
) throws -> LanguageModel {
let config = MLModelConfiguration()
config.computeUnits = computeUnits
let model = try MLModel(contentsOf: url, configuration: config)
return switch kvCacheAvailability(for: model) {
-case .statefulKVCache: LanguageModelWithStatefulKVCache(model: model)
-default: LanguageModel(model: model)
case .statefulKVCache:
LanguageModelWithStatefulKVCache(
model: model,
configuration: configuration,
tokenizer: tokenizer
)
default:
LanguageModel(
model: model,
configuration: configuration,
tokenizer: tokenizer
)
}
}

static func loadCompiled(
url: URL,
tokenizerFolder: URL,
computeUnits: MLComputeUnits = .cpuAndGPU
Review comment (Member) on lines +194 to +195:
Suggested change:
-tokenizerFolder: URL,
-computeUnits: MLComputeUnits = .cpuAndGPU
+computeUnits: MLComputeUnits = .cpuAndGPU,
+tokenizer tokenizerFolder: URL,

This makes it look like an overloaded version of the previous method (keeping the same argument order and overloading the type of tokenizer, while still using tokenizerFolder internally). This is common in Swift APIs (although perhaps the tokenizer name could be somewhat misleading).
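
With that shape, the argument type selects the overload at the call site (a sketch; `compiledURL`, `tokenizerFolderURL`, and `myTokenizer` are placeholder names):

```swift
// Same method name; overload resolution picks the URL vs. Tokenizer variant
let fromFolder = try LanguageModel.loadCompiled(url: compiledURL, tokenizer: tokenizerFolderURL)
let fromInstance = try LanguageModel.loadCompiled(url: compiledURL, tokenizer: myTokenizer)
```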

) throws -> LanguageModel {
let configuration = LanguageModelConfigurationFromHub(modelFolder: tokenizerFolder)
return try loadCompiled(
url: url,
computeUnits: computeUnits,
configuration: configuration
)
}

static func loadCompiled(
url: URL,
tokenizer: Tokenizer,
computeUnits: MLComputeUnits = .cpuAndGPU
Review comment (Member) on lines +207 to +208:
Suggested change:
-tokenizer: Tokenizer,
-computeUnits: MLComputeUnits = .cpuAndGPU
+computeUnits: MLComputeUnits = .cpuAndGPU,
+tokenizer: Tokenizer,

) throws -> LanguageModel {
try loadCompiled(
url: url,
computeUnits: computeUnits,
configuration: nil,
tokenizer: tokenizer
)
}
}

@available(macOS 15.0, iOS 18.0, *)
@@ -304,7 +365,8 @@ public extension LanguageModel {
/// - Throws: An error if the configuration cannot be loaded
var modelConfig: Config? {
get async throws {
-try await configuration!.modelConfig
guard let configuration else { return nil }
return try await configuration.modelConfig
}
}

@@ -314,7 +376,8 @@ public extension LanguageModel {
/// - Throws: An error if the configuration cannot be loaded
var tokenizerConfig: Config? {
get async throws {
-try await configuration!.tokenizerConfig
guard let configuration else { return nil }
return try await configuration.tokenizerConfig
}
}

@@ -324,7 +387,10 @@ public extension LanguageModel {
/// - Throws: An error if the tokenizer data cannot be loaded
var tokenizerData: Config {
get async throws {
-try await configuration!.tokenizerData
guard let configuration else {
throw TokenizerError.missingConfig
}
return try await configuration.tokenizerData
}
}

@@ -434,8 +500,12 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {

var state: MLState?

-public required init(model: MLModel) {
-super.init(model: model)
public required init(
model: MLModel,
configuration: LanguageModelConfigurationFromHub? = nil,
tokenizer: Tokenizer? = nil
) {
super.init(model: model, configuration: configuration, tokenizer: tokenizer)
// To support pre-filling and extend, the input must support
// flexible shapes.
guard maxContextLength - minContextLength > 1 else {
@@ -506,11 +576,15 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
public enum TokenizerError: LocalizedError {
/// The tokenizer configuration file could not be found.
case tokenizerConfigNotFound
/// The language model configuration required to load tokenizer data is missing.
case missingConfig

public var errorDescription: String? {
switch self {
case .tokenizerConfigNotFound:
String(localized: "Tokenizer configuration could not be found. The model may be missing required tokenizer files.", comment: "Error when tokenizer configuration is missing")
case .missingConfig:
String(localized: "Language model configuration was not set, so tokenizer assets could not be loaded.", comment: "Error when configuration needed for tokenizer data is missing")
}
}
}