Merge remote-tracking branch 'abetlen/main' into ci/cpu

Bing-su · Bing-su · commit 4bdcc14cbbbc · 2026-04-03T16:19:00.000+09:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.20]
+
+- refactor: Replace deprecated llama.cpp references in library, docs, and examples by @abetlen in #2170
+- feat: Update llama.cpp to ggerganov/llama.cpp@f49e9178767d557a522618b16ce8694f9ddac628 by @abetlen in #2169
 - feat(server): Add model-load `chat_template_kwargs` support and document the CLI/config usage by @abetlen in #2168
 - ci: Publish release wheels as `py3-none` by @Bing-su in #2166
 - fix(ci): Publish distinct manylinux and musllinux CPU wheels by @abetlen in #2165
diff --git a/README.md b/README.md
@@ -717,16 +717,20 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 ```python
 import llama_cpp
 import ctypes
-llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
-params = llama_cpp.llama_context_default_params()
+llama_cpp.llama_backend_init()  # Must be called once at the start of each program
+model_params = llama_cpp.llama_model_default_params()
+ctx_params = llama_cpp.llama_context_default_params()
+prompt = b"Q: Name the planets in the solar system? A: "
 # use bytes for char * params
-model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
-ctx = llama_cpp.llama_new_context_with_model(model, params)
-max_tokens = params.n_ctx
+model = llama_cpp.llama_model_load_from_file(b"./models/7b/llama-model.gguf", model_params)
+ctx = llama_cpp.llama_init_from_model(model, ctx_params)
+vocab = llama_cpp.llama_model_get_vocab(model)
+max_tokens = ctx_params.n_ctx
 # use ctypes arrays for array params
 tokens = (llama_cpp.llama_token * int(max_tokens))()
-n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
+n_tokens = llama_cpp.llama_tokenize(vocab, prompt, len(prompt), tokens, max_tokens, True, False)
 llama_cpp.llama_free(ctx)
+llama_cpp.llama_model_free(model)
 ```
 
 Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
diff --git a/examples/batch-processing/server.py b/examples/batch-processing/server.py
@@ -6,14 +6,14 @@
 # path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf"
 
 # model_params = llama_cpp.llama_model_default_params()
-# model = llama_cpp.llama_load_model_from_file(path, model_params)
+# model = llama_cpp.llama_model_load_from_file(path, model_params)
 
 # if model is None:
 #     raise RuntimeError(f"Failed to load model from file: {path}")
 
 
 # ctx_params = llama_cpp.llama_context_default_params()
-# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)
+# ctx = llama_cpp.llama_init_from_model(model, ctx_params)
 
 # if ctx is None:
 #     raise RuntimeError("Failed to create context")
diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -79,19 +79,22 @@ def __init__(self, params: GptParams) -> None:
         self.lparams.use_mlock = self.params.use_mlock
         self.lparams.use_mmap = self.params.use_mmap
 
-        self.model = llama_cpp.llama_load_model_from_file(
+        self.model = llama_cpp.llama_model_load_from_file(
             self.params.model.encode("utf8"), self.lparams
         )
+        self.vocab = llama_cpp.llama_model_get_vocab(self.model)
 
         # Context Params.
         self.cparams = llama_cpp.llama_context_default_params()
 
-        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
+        self.ctx = llama_cpp.llama_init_from_model(self.model, self.cparams)
         if not self.ctx:
             raise RuntimeError(f"error: failed to load model '{self.params.model}'")
 
         if self.params.ignore_eos:
-            self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")
+            self.params.logit_bias[llama_cpp.llama_vocab_eos(self.vocab)] = -float(
+                "inf"
+            )
 
         if len(self.params.lora_adapter) > 0:
             if (
@@ -153,7 +156,7 @@ def __init__(self, params: GptParams) -> None:
                 _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
                 _n_token_count_out = llama_cpp.c_size_t()
                 if (
-                    llama_cpp.llama_load_session_file(
+                    llama_cpp.llama_state_load_file(
                         self.ctx,
                         self.params.path_session.encode("utf8"),
                         _session_tokens,
@@ -314,7 +317,7 @@ def __init__(self, params: GptParams) -> None:
     def _tokenize(self, prompt, bos=True):
         _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
         _n = llama_cpp.llama_tokenize(
-            self.model,
+            self.vocab,
             prompt.encode("utf8", errors="ignore"),
             len(prompt),
             _arr,
@@ -406,7 +409,7 @@ def generate(self):
             if len(self.embd_inp) <= self.input_consumed:  # && !is_interacting
                 # out of user input, sample next token
                 top_k = (
-                    llama_cpp.llama_n_vocab(self.ctx)
+                    llama_cpp.llama_vocab_n_tokens(self.vocab)
                     if self.params.top_k <= 0
                     else self.params.top_k
                 )
@@ -419,7 +422,7 @@ def generate(self):
                 # optionally save the session on first sample (for faster prompt loading next time)
                 if len(self.params.path_session) > 0 and self.need_to_save_session:
                     self.need_to_save_session = False
-                    llama_cpp.llama_save_session_file(
+                    llama_cpp.llama_state_save_file(
                         self.ctx,
                         self.params.path_session.encode("utf8"),
                         (llama_cpp.llama_token * len(self.session_tokens))(
@@ -431,7 +434,7 @@ def generate(self):
                 id = 0
 
                 logits = llama_cpp.llama_get_logits(self.ctx)
-                n_vocab = llama_cpp.llama_n_vocab(self.model)
+                n_vocab = llama_cpp.llama_vocab_n_tokens(self.vocab)
 
                 # Apply params.logit_bias map
                 for key, value in self.params.logit_bias.items():
@@ -448,7 +451,7 @@ def generate(self):
                 )
 
                 # Apply penalties
-                nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
+                nl_logit = logits[llama_cpp.llama_vocab_nl(self.vocab)]
                 last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
 
                 _arr = (llama_cpp.llama_token * last_n_repeat)(
@@ -470,7 +473,7 @@ def generate(self):
                 # 	last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
 
                 if not self.params.penalize_nl:
-                    logits[llama_cpp.llama_token_nl()] = nl_logit
+                    logits[llama_cpp.llama_vocab_nl(self.vocab)] = nl_logit
 
                 if self.params.temp <= 0:
                     # Greedy sampling
@@ -539,7 +542,7 @@ def generate(self):
 
                 # replace end of text token with newline token when in interactive mode
                 if (
-                    id == llama_cpp.llama_token_eos(self.ctx)
+                    id == llama_cpp.llama_vocab_eos(self.vocab)
                     and self.params.interactive
                     and not self.params.instruct
                 ):
@@ -599,8 +602,8 @@ def generate(self):
                     break
 
             # end of text token
-            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(
-                self.ctx
+            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_vocab_eos(
+                self.vocab
             ):
                 if not self.params.instruct:
                     for i in self.llama_token_eot:
@@ -636,7 +639,7 @@ def token_to_str(self, token_id: int) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
         n = llama_cpp.llama_token_to_piece(
-            self.model, llama_cpp.llama_token(token_id), buffer, size
+            self.vocab, llama_cpp.llama_token(token_id), buffer, size, 0, False
         )
         assert n <= size
         return bytes(buffer[:n])
diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py
@@ -4,7 +4,7 @@
 
 import llama_cpp
 
-llama_cpp.llama_backend_init(numa=False)
+llama_cpp.llama_backend_init()
 
 N_THREADS = multiprocessing.cpu_count()
 MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin")
@@ -13,8 +13,9 @@
 
 lparams = llama_cpp.llama_model_default_params()
 cparams = llama_cpp.llama_context_default_params()
-model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams)
-ctx = llama_cpp.llama_new_context_with_model(model, cparams)
+model = llama_cpp.llama_model_load_from_file(MODEL_PATH.encode("utf-8"), lparams)
+ctx = llama_cpp.llama_init_from_model(model, cparams)
+vocab = llama_cpp.llama_model_get_vocab(model)
 
 # determine the required inference memory per token:
 tmp = [0, 1, 2, 3]
@@ -28,13 +29,13 @@
 
 embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
 n_of_tok = llama_cpp.llama_tokenize(
-    model=model,
-    text=bytes(str(prompt), "utf-8"),
-    text_len=len(embd_inp),
+    vocab=vocab,
+    text=prompt,
+    text_len=len(prompt),
     tokens=embd_inp,
-    n_max_tokens=len(embd_inp),
-    add_bos=False,
-    special=False,
+    n_tokens_max=len(embd_inp),
+    add_special=False,
+    parse_special=False,
 )
 embd_inp = embd_inp[:n_of_tok]
 
@@ -70,7 +71,7 @@
     embd = []
     if len(embd_inp) <= input_consumed:
         logits = llama_cpp.llama_get_logits(ctx)
-        n_vocab = llama_cpp.llama_n_vocab(model)
+        n_vocab = llama_cpp.llama_vocab_n_tokens(vocab)
 
         _arr = (llama_cpp.llama_token_data * n_vocab)(
             *[
@@ -114,7 +115,7 @@
             size = 32
             buffer = (ctypes.c_char * size)()
             n = llama_cpp.llama_token_to_piece(
-                model, llama_cpp.llama_token(id), buffer, size
+                vocab, llama_cpp.llama_token(id), buffer, size, 0, False
             )
             assert n <= size
             print(
@@ -123,11 +124,12 @@
                 flush=True,
             )
 
-    if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx):
+    if len(embd) > 0 and embd[-1] == llama_cpp.llama_vocab_eos(vocab):
         break
 
 print()
 
 llama_cpp.llama_print_timings(ctx)
 
 llama_cpp.llama_free(ctx)
+llama_cpp.llama_model_free(model)
diff --git a/examples/notebooks/Batching.ipynb b/examples/notebooks/Batching.ipynb
@@ -122,9 +122,10 @@
    "source": [
     "params = llama_cpp.llama_model_default_params()\n",
     "params.n_gpu_layers = 35\n",
-    "model = llama_cpp.llama_load_model_from_file(\n",
+    "model = llama_cpp.llama_model_load_from_file(\n",
     "    b\"/workspaces/llama-cpp-python/mistral-7b-v0.1.Q2_K.gguf\", params\n",
-    ")  # Update this to whatever"
+    ")  # Update this to whatever\n",
+    "vocab = llama_cpp.llama_model_get_vocab(model)"
    ]
   },
   {
@@ -149,7 +150,7 @@
     "\n",
     "tokens = (llama_cpp.llama_token * n_ctx)()\n",
     "tokens_len = llama_cpp.llama_tokenize(\n",
-    "    model, prompt, len(prompt), tokens, len(tokens), True, True\n",
+    "    vocab, prompt, len(prompt), tokens, len(tokens), True, True\n",
     ")\n",
     "print(tokens[:tokens_len])\n",
     "\n",
@@ -188,7 +189,7 @@
     "ctx_params.n_batch = max(n_len, n_parallel)\n",
     "ctx_params.n_threads = 1\n",
     "ctx_params.n_threads_batch = 1\n",
-    "ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)"
+    "ctx = llama_cpp.llama_init_from_model(model, ctx_params)"
    ]
   },
   {
@@ -338,14 +339,14 @@
     "        # Sample the next token using the sampler chain\n",
     "        new_token_id = llama_cpp.llama_sampler_sample(sampler_chain, ctx, -1)\n",
     "\n",
-    "        if new_token_id == llama_cpp.llama_token_eos(ctx) or n_cur == n_len:\n",
+    "        if new_token_id == llama_cpp.llama_vocab_eos(vocab) or n_cur == n_len:\n",
     "            i_batch[i] = -1\n",
     "            continue\n",
     "\n",
     "        buf = (ctypes.c_char * 32)()\n",
     "        \n",
     "        # Convert token ID to text\n",
-    "        outlen = llama_cpp.llama_token_to_piece(model, new_token_id, buf, len(buf), 0, False)\n",
+    "        outlen = llama_cpp.llama_token_to_piece(vocab, new_token_id, buf, len(buf), 0, False)\n",
     "        streams[i] += bytes(buf[:outlen]).decode(\"utf-8\")\n",
     "\n",
     "        batch.token[batch.n_tokens] = new_token_id\n",
@@ -411,7 +412,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "llama_cpp.llama_free_model(model)"
+    "llama_cpp.llama_model_free(model)"
    ]
   },
   {
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.19"
+__version__ = "0.3.20"
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
@@ -135,7 +135,7 @@ def token_eos(self) -> int:
         return llama_cpp.llama_vocab_eos(self.vocab)
 
     def token_cls(self) -> int:
-        return llama_cpp.llama_vocab_cls(self.vocab)
+        return llama_cpp.llama_vocab_bos(self.vocab)
 
     def token_sep(self) -> int:
         return llama_cpp.llama_vocab_sep(self.vocab)
@@ -317,9 +317,9 @@ def get_state_size(self) -> int:
 
     # TODO: set_state_data
 
-    # TODO: llama_load_session_file
+    # TODO: llama_state_load_file
 
-    # TODO: llama_save_session_file
+    # TODO: llama_state_save_file
 
     def decode(self, batch: LlamaBatch):
         return_code = llama_cpp.llama_decode(
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
@@ -1351,7 +1351,7 @@ def logit_bias_processor(
             logits_processor=logits_processor,
             grammar=grammar,
         ):
-            if llama_cpp.llama_token_is_eog(self._model.vocab, token):
+            if llama_cpp.llama_vocab_is_eog(self._model.vocab, token):
                 text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
                 finish_reason = "stop"
                 break
@@ -2148,13 +2148,13 @@ def __setstate__(self, state):
     def save_state(self) -> LlamaState:
         if self.verbose:
             print("Llama.save_state: saving llama state", file=sys.stderr)
-        state_size = llama_cpp.llama_get_state_size(self._ctx.ctx)
+        state_size = llama_cpp.llama_state_get_size(self._ctx.ctx)
         if self.verbose:
             print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr)
         llama_state = (ctypes.c_uint8 * int(state_size))()
         if self.verbose:
             print("Llama.save_state: allocated state", file=sys.stderr)
-        n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state)
+        n_bytes = llama_cpp.llama_state_get_data(self._ctx.ctx, llama_state, state_size)
         if self.verbose:
             print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr)
         if int(n_bytes) > int(state_size):
@@ -2187,7 +2187,10 @@ def load_state(self, state: LlamaState) -> None:
         LLamaStateArrayType = ctypes.c_uint8 * state_size
         llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state)
 
-        if llama_cpp.llama_set_state_data(self._ctx.ctx, llama_state) != state_size:
+        if (
+            llama_cpp.llama_state_set_data(self._ctx.ctx, llama_state, state_size)
+            != state_size
+        ):
             raise RuntimeError("Failed to set llama state data")
 
     def n_ctx(self) -> int:
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
diff --git a/vendor/llama.cpp b/vendor/llama.cpp