Skip to content

Commit 4bdcc14

Browse files
committed
Merge remote-tracking branch 'abetlen/main' into ci/cpu
2 parents 186070b + 02d6bee commit 4bdcc14

File tree

12 files changed: +489 / -71 lines changed

12 files changed: +489 / -71 lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.3.20]
11+
12+
- refactor: Replace deprecated llama.cpp references in library, docs, and examples by @abetlen in #2170
13+
- feat: Update llama.cpp to ggerganov/llama.cpp@f49e9178767d557a522618b16ce8694f9ddac628 by @abetlen in #2169
1014
- feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168
1115
- ci: Publish release wheels as `py3-none` by @Bing-su in #2166
1216
- fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165

README.md

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -717,16 +717,20 @@ Below is a short example demonstrating how to use the low-level API to tokenize
717717
```python
718718
import llama_cpp
719719
import ctypes
720-
llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
721-
params = llama_cpp.llama_context_default_params()
720+
llama_cpp.llama_backend_init() # Must be called once at the start of each program
721+
model_params = llama_cpp.llama_model_default_params()
722+
ctx_params = llama_cpp.llama_context_default_params()
723+
prompt = b"Q: Name the planets in the solar system? A: "
722724
# use bytes for char * params
723-
model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
724-
ctx = llama_cpp.llama_new_context_with_model(model, params)
725-
max_tokens = params.n_ctx
725+
model = llama_cpp.llama_model_load_from_file(b"./models/7b/llama-model.gguf", model_params)
726+
ctx = llama_cpp.llama_init_from_model(model, ctx_params)
727+
vocab = llama_cpp.llama_model_get_vocab(model)
728+
max_tokens = ctx_params.n_ctx
726729
# use ctypes arrays for array params
727730
tokens = (llama_cpp.llama_token * int(max_tokens))()
728-
n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
731+
n_tokens = llama_cpp.llama_tokenize(vocab, prompt, len(prompt), tokens, max_tokens, True, False)
729732
llama_cpp.llama_free(ctx)
733+
llama_cpp.llama_model_free(model)
730734
```
731735

732736
Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.

examples/batch-processing/server.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
# path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf"
77

88
# model_params = llama_cpp.llama_model_default_params()
9-
# model = llama_cpp.llama_load_model_from_file(path, model_params)
9+
# model = llama_cpp.llama_model_load_from_file(path, model_params)
1010

1111
# if model is None:
1212
# raise RuntimeError(f"Failed to load model from file: {path}")
1313

1414

1515
# ctx_params = llama_cpp.llama_context_default_params()
16-
# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
16+
# ctx = llama_cpp.llama_init_from_model(model, ctx_params)
1717

1818
# if ctx is None:
1919
# raise RuntimeError("Failed to create context")

examples/low_level_api/low_level_api_chat_cpp.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -79,19 +79,22 @@ def __init__(self, params: GptParams) -> None:
7979
self.lparams.use_mlock = self.params.use_mlock
8080
self.lparams.use_mmap = self.params.use_mmap
8181

82-
self.model = llama_cpp.llama_load_model_from_file(
82+
self.model = llama_cpp.llama_model_load_from_file(
8383
self.params.model.encode("utf8"), self.lparams
8484
)
85+
self.vocab = llama_cpp.llama_model_get_vocab(self.model)
8586

8687
# Context Params.
8788
self.cparams = llama_cpp.llama_context_default_params()
8889

89-
self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
90+
self.ctx = llama_cpp.llama_init_from_model(self.model, self.cparams)
9091
if not self.ctx:
9192
raise RuntimeError(f"error: failed to load model '{self.params.model}'")
9293

9394
if self.params.ignore_eos:
94-
self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
95+
self.params.logit_bias[llama_cpp.llama_vocab_eos(self.vocab)] = -float(
96+
"inf"
97+
)
9598

9699
if len(self.params.lora_adapter) > 0:
97100
if (
@@ -153,7 +156,7 @@ def __init__(self, params: GptParams) -> None:
153156
_session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
154157
_n_token_count_out = llama_cpp.c_size_t()
155158
if (
156-
llama_cpp.llama_load_session_file(
159+
llama_cpp.llama_state_load_file(
157160
self.ctx,
158161
self.params.path_session.encode("utf8"),
159162
_session_tokens,
@@ -314,7 +317,7 @@ def __init__(self, params: GptParams) -> None:
314317
def _tokenize(self, prompt, bos=True):
315318
_arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
316319
_n = llama_cpp.llama_tokenize(
317-
self.model,
320+
self.vocab,
318321
prompt.encode("utf8", errors="ignore"),
319322
len(prompt),
320323
_arr,
@@ -406,7 +409,7 @@ def generate(self):
406409
if len(self.embd_inp) <= self.input_consumed: # && !is_interacting
407410
# out of user input, sample next token
408411
top_k = (
409-
llama_cpp.llama_n_vocab(self.ctx)
412+
llama_cpp.llama_vocab_n_tokens(self.vocab)
410413
if self.params.top_k <= 0
411414
else self.params.top_k
412415
)
@@ -419,7 +422,7 @@ def generate(self):
419422
# optionally save the session on first sample (for faster prompt loading next time)
420423
if len(self.params.path_session) > 0 and self.need_to_save_session:
421424
self.need_to_save_session = False
422-
llama_cpp.llama_save_session_file(
425+
llama_cpp.llama_state_save_file(
423426
self.ctx,
424427
self.params.path_session.encode("utf8"),
425428
(llama_cpp.llama_token * len(self.session_tokens))(
@@ -431,7 +434,7 @@ def generate(self):
431434
id = 0
432435

433436
logits = llama_cpp.llama_get_logits(self.ctx)
434-
n_vocab = llama_cpp.llama_n_vocab(self.model)
437+
n_vocab = llama_cpp.llama_vocab_n_tokens(self.vocab)
435438

436439
# Apply params.logit_bias map
437440
for key, value in self.params.logit_bias.items():
@@ -448,7 +451,7 @@ def generate(self):
448451
)
449452

450453
# Apply penalties
451-
nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
454+
nl_logit = logits[llama_cpp.llama_vocab_nl(self.vocab)]
452455
last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
453456

454457
_arr = (llama_cpp.llama_token * last_n_repeat)(
@@ -470,7 +473,7 @@ def generate(self):
470473
# last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
471474

472475
if not self.params.penalize_nl:
473-
logits[llama_cpp.llama_token_nl()] = nl_logit
476+
logits[llama_cpp.llama_vocab_nl(self.vocab)] = nl_logit
474477

475478
if self.params.temp <= 0:
476479
# Greedy sampling
@@ -539,7 +542,7 @@ def generate(self):
539542

540543
# replace end of text token with newline token when in interactive mode
541544
if (
542-
id == llama_cpp.llama_token_eos(self.ctx)
545+
id == llama_cpp.llama_vocab_eos(self.vocab)
543546
and self.params.interactive
544547
and not self.params.instruct
545548
):
@@ -599,8 +602,8 @@ def generate(self):
599602
break
600603

601604
# end of text token
602-
if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(
603-
self.ctx
605+
if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_vocab_eos(
606+
self.vocab
604607
):
605608
if not self.params.instruct:
606609
for i in self.llama_token_eot:
@@ -636,7 +639,7 @@ def token_to_str(self, token_id: int) -> bytes:
636639
size = 32
637640
buffer = (ctypes.c_char * size)()
638641
n = llama_cpp.llama_token_to_piece(
639-
self.model, llama_cpp.llama_token(token_id), buffer, size
642+
self.vocab, llama_cpp.llama_token(token_id), buffer, size, 0, False
640643
)
641644
assert n <= size
642645
return bytes(buffer[:n])

examples/low_level_api/low_level_api_llama_cpp.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import llama_cpp
66

7-
llama_cpp.llama_backend_init(numa=False)
7+
llama_cpp.llama_backend_init()
88

99
N_THREADS = multiprocessing.cpu_count()
1010
MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin")
@@ -13,8 +13,9 @@
1313

1414
lparams = llama_cpp.llama_model_default_params()
1515
cparams = llama_cpp.llama_context_default_params()
16-
model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
17-
ctx = llama_cpp.llama_new_context_with_model(model, cparams)
16+
model = llama_cpp.llama_model_load_from_file(MODEL_PATH.encode("utf-8"), lparams)
17+
ctx = llama_cpp.llama_init_from_model(model, cparams)
18+
vocab = llama_cpp.llama_model_get_vocab(model)
1819

1920
# determine the required inference memory per token:
2021
tmp = [0, 1, 2, 3]
@@ -28,13 +29,13 @@
2829

2930
embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
3031
n_of_tok = llama_cpp.llama_tokenize(
31-
model=model,
32-
text=bytes(str(prompt), "utf-8"),
33-
text_len=len(embd_inp),
32+
vocab=vocab,
33+
text=prompt,
34+
text_len=len(prompt),
3435
tokens=embd_inp,
35-
n_max_tokens=len(embd_inp),
36-
add_bos=False,
37-
special=False,
36+
n_tokens_max=len(embd_inp),
37+
add_special=False,
38+
parse_special=False,
3839
)
3940
embd_inp = embd_inp[:n_of_tok]
4041

@@ -70,7 +71,7 @@
7071
embd = []
7172
if len(embd_inp) <= input_consumed:
7273
logits = llama_cpp.llama_get_logits(ctx)
73-
n_vocab = llama_cpp.llama_n_vocab(model)
74+
n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)
7475

7576
_arr = (llama_cpp.llama_token_data * n_vocab)(
7677
*[
@@ -114,7 +115,7 @@
114115
size = 32
115116
buffer = (ctypes.c_char * size)()
116117
n = llama_cpp.llama_token_to_piece(
117-
model, llama_cpp.llama_token(id), buffer, size
118+
vocab, llama_cpp.llama_token(id), buffer, size, 0, False
118119
)
119120
assert n <= size
120121
print(
@@ -123,11 +124,12 @@
123124
flush=True,
124125
)
125126

126-
if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx):
127+
if len(embd) > 0 and embd[-1] == llama_cpp.llama_vocab_eos(vocab):
127128
break
128129

129130
print()
130131

131132
llama_cpp.llama_print_timings(ctx)
132133

133134
llama_cpp.llama_free(ctx)
135+
llama_cpp.llama_model_free(model)

examples/notebooks/Batching.ipynb

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,10 @@
122122
"source": [
123123
"params = llama_cpp.llama_model_default_params()\n",
124124
"params.n_gpu_layers = 35\n",
125-
"model = llama_cpp.llama_load_model_from_file(\n",
125+
"model = llama_cpp.llama_model_load_from_file(\n",
126126
" b\"/workspaces/llama-cpp-python/mistral-7b-v0.1.Q2_K.gguf\", params\n",
127-
") # Update this to whatever"
127+
") # Update this to whatever\n",
128+
"vocab = llama_cpp.llama_model_get_vocab(model)"
128129
]
129130
},
130131
{
@@ -149,7 +150,7 @@
149150
"\n",
150151
"tokens = (llama_cpp.llama_token * n_ctx)()\n",
151152
"tokens_len = llama_cpp.llama_tokenize(\n",
152-
" model, prompt, len(prompt), tokens, len(tokens), True, True\n",
153+
" vocab, prompt, len(prompt), tokens, len(tokens), True, True\n",
153154
")\n",
154155
"print(tokens[:tokens_len])\n",
155156
"\n",
@@ -188,7 +189,7 @@
188189
"ctx_params.n_batch = max(n_len, n_parallel)\n",
189190
"ctx_params.n_threads = 1\n",
190191
"ctx_params.n_threads_batch = 1\n",
191-
"ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)"
192+
"ctx = llama_cpp.llama_init_from_model(model, ctx_params)"
192193
]
193194
},
194195
{
@@ -338,14 +339,14 @@
338339
" # Sample the next token using the sampler chain\n",
339340
" new_token_id = llama_cpp.llama_sampler_sample(sampler_chain, ctx, -1)\n",
340341
"\n",
341-
" if new_token_id == llama_cpp.llama_token_eos(ctx) or n_cur == n_len:\n",
342+
" if new_token_id == llama_cpp.llama_vocab_eos(vocab) or n_cur == n_len:\n",
342343
" i_batch[i] = -1\n",
343344
" continue\n",
344345
"\n",
345346
" buf = (ctypes.c_char * 32)()\n",
346347
" \n",
347348
" # Convert token ID to text\n",
348-
" outlen = llama_cpp.llama_token_to_piece(model, new_token_id, buf, len(buf), 0, False)\n",
349+
" outlen = llama_cpp.llama_token_to_piece(vocab, new_token_id, buf, len(buf), 0, False)\n",
349350
" streams[i] += bytes(buf[:outlen]).decode(\"utf-8\")\n",
350351
"\n",
351352
" batch.token[batch.n_tokens] = new_token_id\n",
@@ -411,7 +412,7 @@
411412
"metadata": {},
412413
"outputs": [],
413414
"source": [
414-
"llama_cpp.llama_free_model(model)"
415+
"llama_cpp.llama_model_free(model)"
415416
]
416417
},
417418
{

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .llama_cpp import *
22
from .llama import *
33

4-
__version__ = "0.3.19"
4+
__version__ = "0.3.20"

llama_cpp/_internals.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def token_eos(self) -> int:
135135
return llama_cpp.llama_vocab_eos(self.vocab)
136136

137137
def token_cls(self) -> int:
138-
return llama_cpp.llama_vocab_cls(self.vocab)
138+
return llama_cpp.llama_vocab_bos(self.vocab)
139139

140140
def token_sep(self) -> int:
141141
return llama_cpp.llama_vocab_sep(self.vocab)
@@ -317,9 +317,9 @@ def get_state_size(self) -> int:
317317

318318
# TODO: set_state_data
319319

320-
# TODO: llama_load_session_file
320+
# TODO: llama_state_load_file
321321

322-
# TODO: llama_save_session_file
322+
# TODO: llama_state_save_file
323323

324324
def decode(self, batch: LlamaBatch):
325325
return_code = llama_cpp.llama_decode(

llama_cpp/llama.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1351,7 +1351,7 @@ def logit_bias_processor(
13511351
logits_processor=logits_processor,
13521352
grammar=grammar,
13531353
):
1354-
if llama_cpp.llama_token_is_eog(self._model.vocab, token):
1354+
if llama_cpp.llama_vocab_is_eog(self._model.vocab, token):
13551355
text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
13561356
finish_reason = "stop"
13571357
break
@@ -2148,13 +2148,13 @@ def __setstate__(self, state):
21482148
def save_state(self) -> LlamaState:
21492149
if self.verbose:
21502150
print("Llama.save_state: saving llama state", file=sys.stderr)
2151-
state_size = llama_cpp.llama_get_state_size(self._ctx.ctx)
2151+
state_size = llama_cpp.llama_state_get_size(self._ctx.ctx)
21522152
if self.verbose:
21532153
print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr)
21542154
llama_state = (ctypes.c_uint8 * int(state_size))()
21552155
if self.verbose:
21562156
print("Llama.save_state: allocated state", file=sys.stderr)
2157-
n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state)
2157+
n_bytes = llama_cpp.llama_state_get_data(self._ctx.ctx, llama_state, state_size)
21582158
if self.verbose:
21592159
print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr)
21602160
if int(n_bytes) > int(state_size):
@@ -2187,7 +2187,10 @@ def load_state(self, state: LlamaState) -> None:
21872187
LLamaStateArrayType = ctypes.c_uint8 * state_size
21882188
llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)
21892189

2190-
if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size:
2190+
if (
2191+
llama_cpp.llama_state_set_data(self._ctx.ctx, llama_state, state_size)
2192+
!= state_size
2193+
):
21912194
raise RuntimeError("Failed to set llama state data")
21922195

21932196
def n_ctx(self) -> int:

0 commit comments

Comments
 (0)