Skip to content

Commit 83d5839

Browse files
committed
feat: Implement MiniCPMv45ChatHandler for MiniCPM-V 4.5 with multi-image tracking
Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 8468ac2 commit 83d5839

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
508508
| [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` |
509509
| [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
510510
| [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` |
511+
| [minicpm-v-4.5](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) | `MiniCPMv45ChatHandler` | `minicpm-v-4.5` |
511512
| [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` |
512513
| [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` |
513514
| [glm4.6v](https://huggingface.co/unsloth/GLM-4.6V-Flash-GGUF) | `GLM46VChatHandler` | `glm4.6v` |

llama_cpp/llama_chat_format.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3704,6 +3704,144 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
37043704
)
37053705

37063706

3707+
class MiniCPMv45ChatHandler(Llava15ChatHandler):
    """
    Handler for MiniCPM-V 4.5 models.

    Supports:
    - Multi-step tool calls with <tool_call> and <tool_response> XML tags.
    - Integrated reasoning (thinking) process with <think> tags.
    - Specialized system prompt handling with tool definitions.
    - Global image numbering for multi-image processing.

    Prompt layout produced by ``CHAT_FORMAT``:
    - When ``tools`` is provided, a single leading system block is emitted
      containing the first system message (if any) followed by the tool
      signatures; otherwise a leading system message is emitted as-is.
    - ``user`` / ``assistant`` messages (and any ``system`` message that is
      not the first message) are wrapped in ``<|im_start|>role ... <|im_end|>``.
      List-style content is flattened: text items are concatenated and each
      ``image_url`` item becomes ``<image_id>N</image_id>: <image>URL</image>``
      where ``N`` counts images across the whole conversation.
    - A message whose ``content`` is ``None`` (typical for assistant messages
      that only carry ``tool_calls``) renders as an empty string.
    - Consecutive ``tool`` messages are grouped into one ``user`` block, each
      wrapped in ``<tool_response>`` tags.
    - With ``add_generation_prompt``, the assistant header is emitted; if
      ``enable_thinking`` is false an empty ``<think>`` block is pre-filled,
      if true an open ``<think>`` tag is emitted.
    """

    # Model specific control tokens
    MINICPMV_BOS_TOKEN = "<|im_start|>"
    MINICPMV_EOS_TOKEN = "<|im_end|>"
    MINICPMV_PAD_TOKEN = "<|endoftext|>"

    # Image placeholder tags
    MINICPMV_IMAGE_START_TOKEN = "<image>"
    MINICPMV_IMAGE_END_TOKEN = "</image>"
    MINICPMV_IMAGE_ID_START_TOKEN = "<image_id>"
    MINICPMV_IMAGE_ID_END_TOKEN = "</image_id>"

    CHAT_FORMAT = (
        # --- 1. First System Message & Tools Definitions ---
        "{%- if tools %}"
            "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' }}"
            "{%- if messages[0].role == 'system' %}{{- messages[0].content + '\\n\\n' }}{%- endif %}"
            "{{- '# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\n' }}"
            "{{- 'You are provided with function signatures within <tools></tools> XML tags:\\n<tools>' }}"
            "{%- for tool in tools %}{{- '\\n' + (tool | tojson) }}{%- endfor %}"
            "{{- '\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\\n</tool_call>" + MINICPMV_EOS_TOKEN + "\\n' }}"
        "{%- elif messages[0].role == 'system' %}"
            "{{- '" + MINICPMV_BOS_TOKEN + "system\\n' + messages[0].content + '" + MINICPMV_EOS_TOKEN + "\\n' }}"
        "{%- endif %}"

        # --- 2. Message Stream Processing ---
        # image_count lives in a namespace so increments survive loop scoping.
        "{% set image_count = namespace(value=0) %}"
        "{%- for message in messages %}"
            # --- Unified Role Handling (User, Assistant, and subsequent Systems) ---
            # The first system message was already emitted in section 1.
            "{%- if message.role in ['user', 'assistant'] or (message.role == 'system' and not loop.first) %}"
                "{{- '" + MINICPMV_BOS_TOKEN + "' + message.role + '\\n' }}"

                "{%- set content = message.content %}"
                # A None content (e.g. assistant turn carrying only tool_calls)
                # must not reach the item loop below: iterating None raises.
                "{%- if content is none %}"
                    "{%- set content = '' %}"
                "{%- elif content is not string %}"
                    "{%- set ns = namespace(content_str='') %}"
                    "{%- for item in content %}"
                        # --- Explicit image_url type and value checking ---
                        "{%- if item.type == 'image_url' %}"
                            "{%- set image_url = item.image_url if item.image_url is string else item.image_url.url %}"
                            "{%- set image_count.value = image_count.value + 1 %}"
                            # Format: <image_id>N</image_id>: <image>IMAGE_URL</image>
                            "{%- set ns.content_str = ns.content_str + '<image_id>' + (image_count.value | string) + '</image_id>: <image>' + image_url + '</image>' %}"
                        "{%- elif item.type == 'text' %}"
                            "{%- set ns.content_str = ns.content_str + item.text %}"
                        "{%- endif %}"
                    "{%- endfor %}"
                    "{%- set content = ns.content_str %}"
                "{%- endif %}"

                "{{- content -}}"

                # Append tool_calls to assistant messages if they exist
                "{%- if message.role == 'assistant' and message.tool_calls %}"
                    "{%- for tool_call in message.tool_calls %}"
                        # Accept both {"function": {...}} wrappers and flat call dicts.
                        "{%- set tc = tool_call.function if tool_call.function else tool_call %}"
                        "{{- '\\n<tool_call>\\n{\"name\": \"' + tc.name + '\", \"arguments\": ' }}"
                        "{{- tc.arguments if tc.arguments is string else tc.arguments | tojson }}"
                        "{{- '}\\n</tool_call>' }}"
                    "{%- endfor %}"
                "{%- endif %}"
                "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}"

            # --- Specialized Tool Response Handling ---
            # Group consecutive tool responses under a single user-like block
            "{%- elif message.role == 'tool' %}"
                "{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}"
                    "{{- '" + MINICPMV_BOS_TOKEN + "user' }}"
                "{%- endif %}"
                "{{- '\\n<tool_response>\\n' + message.content + '\\n</tool_response>' }}"
                "{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}"
                    "{{- '" + MINICPMV_EOS_TOKEN + "\\n' }}"
                "{%- endif %}"
            "{%- endif %}"
        "{%- endfor %}"

        # --- 3. Generation Prompt ---
        "{%- if add_generation_prompt %}"
            "{{- '" + MINICPMV_BOS_TOKEN + "assistant\\n' }}"
            # Handle thinking/reasoning block visibility based on configuration
            "{%- if enable_thinking is defined and enable_thinking is false %}"
                "{{- '<think>\\n\\n</think>\\n\\n' }}"
            "{%- elif enable_thinking is defined and enable_thinking is true %}"
                "{{- '<think>\\n' }}"
            "{%- endif %}"
        "{%- endif %}"
    )

    def __init__(self, enable_thinking: bool = True, **kwargs):
        """
        Initializes the MiniCPM-V 4.5 Handler.

        Args:
            enable_thinking (bool): If True, the generation prompt opens a
                ``<think>`` block so the model reasons before answering; if
                False, an empty ``<think>`` block is pre-filled instead.
            **kwargs: Additional arguments for the base Llava15ChatHandler.
        """
        # Stored before delegating so the flag exists as soon as the base
        # initializer has run.
        self.enable_thinking = enable_thinking
        super().__init__(**kwargs)

    def __call__(self, **kwargs):
        """
        Run a chat completion with MiniCPM-V 4.5 specific setup.

        Before delegating to the base handler this method:
        - passes ``enable_thinking`` to the chat template,
        - ensures the model's EOS and PAD tokens are stop sequences while
          keeping any stop sequences supplied by the caller,
        - clears the ``llama`` instance state and any cached image embedding
          held on this handler.

        Args:
            **kwargs: Keyword arguments for the base handler's ``__call__``.
                Must include ``llama``; ``stop`` may be a string, a list of
                strings, or omitted/None.

        Returns:
            Whatever the base handler's ``__call__`` returns.

        Raises:
            KeyError: If ``llama`` is not supplied.
        """
        # Inject thinking control flag into the template
        # NOTE(review): extra_template_arguments is assumed to be provided by
        # the base handler - confirm.
        self.extra_template_arguments["enable_thinking"] = self.enable_thinking

        # Merge the model's end markers into the caller's stop sequences
        # rather than replacing them, so user-supplied stops keep working.
        requested_stop = kwargs.get('stop') or []
        if isinstance(requested_stop, str):
            requested_stop = [requested_stop]
        stop_sequences = list(requested_stop)
        for token in (self.MINICPMV_EOS_TOKEN, self.MINICPMV_PAD_TOKEN):
            if token not in stop_sequences:
                stop_sequences.append(token)
        kwargs['stop'] = stop_sequences

        # Start every call from a clean context.
        # NOTE(review): presumably this avoids stale cached state leaking
        # between multi-image requests - confirm against the base handler.
        llama = kwargs['llama']
        llama.reset()
        llama._ctx.memory_clear(True)
        llama.n_tokens = 0

        if hasattr(llama, 'input_ids'):
            llama.input_ids.fill(0)

        # Drop the cached image embedding, if this handler keeps one.
        if hasattr(self, '_last_image_embed'):
            self._last_image_embed = None
            self._last_image_hash = None

        if self.verbose:
            messages = kwargs.get('messages', [])
            try:
                image_count = len(self.get_image_urls(messages))
                print(f"MiniCPMV45ChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr)
            except Exception:
                # Diagnostics are best-effort: a malformed message must not
                # abort the request before the base handler validates it.
                print("MiniCPMV45ChatHandler - Cleared state", file=sys.stderr)

        return super().__call__(**kwargs)
3844+
37073845
class Gemma3ChatHandler(Llava15ChatHandler):
37083846

37093847
GEMMA3_BOI_TOKEN = "<start_of_image>"

0 commit comments

Comments
 (0)