|
8 | 8 |
|
9 | 9 | import XCTest
|
10 | 10 | @testable import Tokenizers
|
11 |
| - |
| 11 | +@testable import Hub |
12 | 12 |
|
13 | 13 |
|
14 | 14 | class BertTokenizerTests: XCTestCase {
|
@@ -178,4 +178,35 @@ class BertTokenizerTests: XCTestCase {
|
178 | 178 | XCTAssertEqual(decoded, String(expected))
|
179 | 179 | }
|
180 | 180 | }
|
| 181 | + |
/// Checks that `BertTokenizer` maps added tokens to their ids and back.
///
/// Two sources of added tokens are exercised:
/// - tokens passed explicitly through the `addedTokens` initializer argument, and
/// - the special tokens (`[PAD]`, `[UNK]`, `[CLS]`, `[SEP]`, `[MASK]`) that are expected
///   to come from the model's own tokenizer data.
///
/// The tokenizer configuration for `google-bert/bert-base-uncased` is loaded through a
/// `HubApi` rooted at a `huggingface-tests` folder inside the user's caches directory.
/// NOTE(review): this presumably requires network access on a cold cache — confirm.
///
/// - Throws: Any error raised while loading the configuration or constructing the
///   tokenizer, and an `XCTUnwrap` failure when a required value is missing.
func testBertTokenizerAddedTokensRecognized() async throws {
    // Report a missing caches directory as a test failure instead of trapping on `first!`.
    let cachesDirectory = try XCTUnwrap(
        FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first,
        "caches directory unavailable"
    )
    let base: URL = cachesDirectory.appending(component: "huggingface-tests")
    let hubApi = HubApi(downloadBase: base)
    let configuration = LanguageModelConfigurationFromHub(modelName: "google-bert/bert-base-uncased", hubApi: hubApi)

    // `XCTUnwrap` takes a non-async autoclosure, so the value is awaited first.
    // Using it instead of `fatalError` fails only this test rather than aborting the whole run.
    let loadedTokenizerConfig = try await configuration.tokenizerConfig
    let tokenizerConfig = try XCTUnwrap(loadedTokenizerConfig, "missing tokenizer config")
    let tokenizerData = try await configuration.tokenizerData

    let addedTokens = [
        "[ROAD]": 60_001,
        "[RIVER]": 60_002,
        "[BUILDING]": 60_003,
        "[PARK]": 60_004,
        "[BUFFER]": 60_005,
        "[INTERSECT]": 60_006,
        "[UNION]": 60_007,
    ]
    let tokenizer = try BertTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)

    // Explicitly supplied tokens must round-trip in both directions.
    for (token, idx) in addedTokens {
        XCTAssertEqual(tokenizer.convertTokenToId(token), idx)
        XCTAssertEqual(tokenizer.convertIdToToken(idx), token)
    }

    // Reading added_tokens from tokenizer.json
    XCTAssertEqual(tokenizer.convertTokenToId("[PAD]"), 0)
    XCTAssertEqual(tokenizer.convertTokenToId("[UNK]"), 100)
    XCTAssertEqual(tokenizer.convertTokenToId("[CLS]"), 101)
    XCTAssertEqual(tokenizer.convertTokenToId("[SEP]"), 102)
    XCTAssertEqual(tokenizer.convertTokenToId("[MASK]"), 103)
}
181 | 212 | }
|
0 commit comments