From ca0074008089dc3cb459b6e322262f401125ecec Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 03:04:01 +0000 Subject: [PATCH 01/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/config/model.py | 4 +- vllm/entrypoints/openai/serving_engine.py | 8 +- vllm/tokenizers/__init__.py | 2 + vllm/tokenizers/deepseekv32.py | 460 ++++++++++++++++++++++ 4 files changed, 471 insertions(+), 3 deletions(-) create mode 100644 vllm/tokenizers/deepseekv32.py diff --git a/vllm/config/model.py b/vllm/config/model.py index 5de97697698a..33cccbacaf52 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -84,7 +84,7 @@ "transcription", "draft", ] -TokenizerMode = Literal["auto", "hf", "slow", "mistral"] +TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom", ] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ -141,7 +141,7 @@ class ModelConfig: - "hf" will use the fast tokenizer if available.\n - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n - - Other custom values can be supported via plugins.""" + - "custom" will use --tokenizer to select the preregistered tokenizer.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1d89aa011af2..4fa5ac9a62f9 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -106,7 +106,7 @@ from vllm.pooling_params import PoolingParams from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer, TokenizerLike from vllm.tracing import ( contains_trace_headers, extract_trace_headers, @@ -1129,6 +1129,12 @@ async def _preprocess_chat( messages=messages, **_chat_template_kwargs, ) + elif isinstance(tokenizer, DeepseekV32Tokenizer): + request_prompt = tokenizer.apply_chat_template( + messages=messages, + model_config=model_config, + **_chat_template_kwargs, + ) else: request_prompt = apply_hf_chat_template( tokenizer=tokenizer, diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py index 42487f5f5165..67a6d7c8eb3d 100644 --- a/vllm/tokenizers/__init__.py +++ b/vllm/tokenizers/__init__.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from .deepseekv32 import DeepseekV32Tokenizer from .hf import HfTokenizer from .mistral import MistralTokenizer from .protocol import TokenizerLike @@ -21,4 +22,5 @@ "get_tokenizer", "cached_tokenizer_from_config", "init_tokenizer_from_config", + "DeepseekV32Tokenizer", ] diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py new file mode 100644 index 000000000000..f6e8b28d1914 --- /dev/null +++ b/vllm/tokenizers/deepseekv32.py @@ -0,0 +1,460 @@ +import copy +import json +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from .hf import HfTokenizer +from .protocol import TokenizerLike + +TOOLS_SYSTEM_TEMPLATE = """## Tools +You have access to a set of tools you can use to answer the user's question. 
+You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user: +<{dsml_token}function_calls> +<{dsml_token}invoke name="$FUNCTION_NAME"> +<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE +... + +<{dsml_token}invoke name="$FUNCTION_NAME2"> +... + + +String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects). +If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example: +<{dsml_token}function_calls> +... + + +... + +{thinking_start_token}...thinking about results{thinking_end_token} +Here are the functions available in JSONSchema format: + +{tool_schemas} + +""" + +bos_token: str = "<|begin▁of▁sentence|>" +eos_token: str = "<|end▁of▁sentence|>" +thinking_start_token: str = "" +thinking_end_token: str = "" +dsml_token: str = "|DSML|" +system_msg_template: str = "{content}" +user_msg_template: str = "<|User|>{content}<|Assistant|>" +assistant_msg_template: str = "{reasoning}{content}{tool_calls}<|end▁of▁sentence|>" +thinking_template = "{reasoning_content}" + +response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}" +tool_call_template: str = ( + '<{dsml_token}invoke name="{name}">\n{arguments}\n' +) +tool_calls_template = ( + "<{dsml_token}function_calls>\n{tool_calls}\n" +) + +tool_output_template: str = "\n{content}" + + +def to_json(value: Any) -> str: + try: + return json.dumps(value, ensure_ascii=False) + except: + return json.dumps(value, ensure_ascii=True) + + +def tools_from_openai_format(tools): + return [tool["function"] for tool in tools] + + +def tool_calls_from_openai_format(tool_calls): + return [ + { + "name": tool_call["function"]["name"], + "arguments": tool_call["function"]["arguments"], + } + for tool_call in tool_calls + ] + + +def tool_calls_to_openai_format(tool_calls): + return [ + { + "type": "function", + "function": { + "name": tool_call["name"], + "arguments": tool_call["arguments"], + }, + } + for tool_call in tool_calls + ] + + +def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str: + p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}""" + P_dsml_strs = [] + + arguments = json.loads(tool_call["arguments"]) + + for k, v in arguments.items(): + p_dsml_str = p_dsml_template.format( + dsml_token=dsml_token, + key=k, + is_str="true" if isinstance(v, str) else "false", + value=v if isinstance(v, str) else to_json(v), + ) + + P_dsml_strs.append(p_dsml_str) + + return "\n".join(P_dsml_strs) + + +def decode_dsml_to_arguments( + tool_name: str, tool_args: Dict[str, Tuple[str, str]] +) -> Dict[str, str]: + def _decode_value(key: str, value: str, string: str): + if string == "true": + value = to_json(value) + return f"{to_json(key)}: {value}" + + tool_args_json = ( + "{" + + ", ".join( + [_decode_value(k, v, string=is_str) for k, (v, is_str) in tool_args.items()] + ) + + "}" + ) + return dict(name=tool_name, arguments=tool_args_json) + + +def render_tools(tools: List[Dict[str, Union[str, Dict[str, Any]]]]) -> str: + tools_json = [to_json(t) for t in tools] + + return TOOLS_SYSTEM_TEMPLATE.format( + tool_schemas="\n".join(tools_json), + dsml_token=dsml_token, + 
thinking_start_token=thinking_start_token, + thinking_end_token=thinking_end_token, + ) + + +def find_last_user_index(messages: List[Dict[str, Any]]) -> int: + last_user_index = -1 + for idx in range(len(messages) - 1, -1, -1): + if messages[idx].get("role") in ["user", "developer"]: + last_user_index = idx + break + return last_user_index + + +def render_message( + index: int, messages: List[Dict[str, Any]], thinking_mode: str +) -> str: + assert 0 <= index < len(messages) + assert thinking_mode in ["chat", "thinking"], ( + f"Invalid thinking_mode `{thinking_mode}`" + ) + + prompt = "" + msg = messages[index] + last_user_idx = find_last_user_index(messages) + + role = msg.get("role") + content = msg.get("content") + tools = msg.get("tools") + response_format = msg.get("response_format") + tool_calls = msg.get("tool_calls") + reasoning_content = msg.get("reasoning_content") + + if tools: + tools = tools_from_openai_format(tools) + if tool_calls: + tool_calls = tool_calls_from_openai_format(tool_calls) + + if role == "system": + prompt += system_msg_template.format(content=content or "") + if tools: + prompt += "\n\n" + render_tools(tools) + + if response_format: + prompt += "\n\n" + response_format_template.format( + schema=to_json(response_format) + ) + + elif role == "developer": + assert content, f"Invalid message for role `{role}`: {msg}" + content_developer = "" + if tools: + content_developer += "\n\n" + render_tools(tools) + + if response_format: + content_developer += "\n\n" + response_format_template.format( + schema=to_json(response_format) + ) + + content_developer += "\n\n# The user's message is: {}".format(content) + + prompt += user_msg_template.format(content=content_developer) + if index == last_user_idx and thinking_mode == "thinking": + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == "user": + prompt += user_msg_template.format(content=content) + + if index == last_user_idx and thinking_mode == "thinking": + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == "tool": + prev_assistant_idx = index - 1 + assistant_msg = messages[prev_assistant_idx] + while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool": + prev_assistant_idx -= 1 + assistant_msg = messages[prev_assistant_idx] + + assert ( + index == 0 + or prev_assistant_idx >= 0 + and assistant_msg.get("role") == "assistant" + ), f"Invalid messages at {index}:\n{assistant_msg}" + + tool_call_order = index - prev_assistant_idx + assistant_tool_calls = assistant_msg.get("tool_calls") + assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, ( + "No tool calls but found tool output" + ) + + if tool_call_order == 1: + prompt += "\n\n" + + prompt += tool_output_template.format(content=content) + + if tool_call_order == len(assistant_tool_calls): + prompt += "\n" + + if index >= last_user_idx and thinking_mode == "thinking": + prompt += "\n\n" + thinking_start_token + else: + prompt += "\n\n" + thinking_end_token + + elif role == "assistant": + prev_assistant_idx = index + thinking_part = "" + + tool_calls_content = "" + if tool_calls: + tool_calls = [ + tool_call_template.format( + dsml_token=dsml_token, + name=tool_call.get("name"), + arguments=encode_arguments_to_dsml(tool_call), + ) + for tool_call in tool_calls + ] + tool_calls_content += "\n\n" + tool_calls_template.format( + dsml_token=dsml_token, tool_calls="\n".join(tool_calls) + ) + + summary_content = content or "" + + if thinking_mode == "thinking" and index 
> last_user_idx: + assert reasoning_content or tool_calls, ( + f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message" + ) + thinking_part = ( + thinking_template.format(reasoning_content=reasoning_content or "") + + thinking_end_token + ) + + prompt += assistant_msg_template.format( + reasoning=thinking_part, + content=summary_content, + tool_calls=tool_calls_content, + ) + else: + raise NotImplementedError(f"Unknown role: {role}") + + return prompt + + +def drop_thinking_messages( + messages: List[Dict[str, Any]], last_user_idx: Optional[int] = None +) -> List[Dict[str, Any]]: + messages_wo_thinking: List[Dict[str, Any]] = [] + last_user_idx = ( + find_last_user_index(messages) if last_user_idx is None else last_user_idx + ) + for idx, msg in enumerate(messages): + role = msg.get("role") + if role in ["user", "system", "tool"] or idx >= last_user_idx: + messages_wo_thinking.append(msg) + continue + + elif role == "assistant": + msg_wo_thinking = copy.copy(msg) + msg_wo_thinking.pop("reasoning_content", None) + messages_wo_thinking.append(msg_wo_thinking) + + return messages_wo_thinking + + +def encode_messages( + messages: List[Dict[str, Any]], + thinking_mode: str, + context: Optional[List[Dict[str, Any]]] = None, + drop_thinking: bool = True, + add_default_bos_token: bool = True, +) -> str: + context = context if context else [] + full_messages = context + messages + + prompt = bos_token if add_default_bos_token and len(context) == 0 else "" + + if thinking_mode == "thinking" and drop_thinking: + full_messages = drop_thinking_messages(full_messages) + + for idx in range(len(messages)): + prompt += render_message( + idx + len(context), full_messages, thinking_mode=thinking_mode + ) + + return prompt + + +def _read_until_stop( + index: int, text: str, stop: List[str] +) -> Tuple[int, str, Optional[str]]: + min_pos = len(text) + matched_stop = None + + for s in stop: + pos = text.find(s, index) + if pos != -1 and pos < min_pos: + min_pos = pos + matched_stop = s + + if matched_stop: + content = text[index:min_pos] + return min_pos + len(matched_stop), content, matched_stop + else: + content = text[index:] + return len(text), content, None + + +def parse_tool_calls(index: int, text: str): + tool_calls: List[Dict[str, Any]] = [] + stop_token = None + tool_calls_end_token = f"" + + while index < len(text): + index, _, stop_token = _read_until_stop( + index, text, [f"<{dsml_token}invoke", tool_calls_end_token] + ) + assert _ == ">\n", "Tool call format error" + + if stop_token == tool_calls_end_token: + break + + assert stop_token is not None, "Missing special token" + + index, tool_name_content, stop_token = _read_until_stop( + index, text, [f"<{dsml_token}parameter", f"\n$', tool_name_content, flags=re.DOTALL + ) + assert len(p_tool_name) == 1, "Tool name format error" + tool_name = p_tool_name[0] + + tool_args: Dict[str, Tuple[str, str]] = {} + while stop_token == f"<{dsml_token}parameter": + index, param_content, stop_token = _read_until_stop( + index, text, [f"/{dsml_token}parameter"] + ) + + param_kv = re.findall( + r'^ name="(.*?)" string="(true|false)">(.*?)<$', + param_content, + flags=re.DOTALL, + ) + assert len(param_kv) == 1, "Parameter format error" + param_name, string, param_value = param_kv[0] + + assert param_name not in tool_args, "Duplicate parameter name" + tool_args[param_name] = (param_value, string) + + index, content, stop_token = _read_until_stop( + index, text, [f"<{dsml_token}parameter", f"\n", 
"Parameter format error" + + tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args) + tool_calls.append(tool_call) + + return index, stop_token, tool_calls + + +# NOTE: This function is designed to parse only correctly formatted string and will not attempt to correct malformed output that may be generated by the model. +def parse_message_from_completion_text(text: str, thinking_mode: str): + summary_content, reasoning_content, tool_calls = "", "", [] + index, stop_token = 0, None + tool_calls_start_token = f"\n\n<{dsml_token}function_calls" + + is_thinking, is_tool_calling = thinking_mode == "thinking", False + + if is_thinking: + index, content_delta, stop_token = _read_until_stop( + index, text, [thinking_end_token, tool_calls_start_token] + ) + reasoning_content = content_delta + assert stop_token == thinking_end_token, "Invalid thinking format" + + index, content_delta, stop_token = _read_until_stop( + index, text, [eos_token, tool_calls_start_token] + ) + summary_content = content_delta + if stop_token == tool_calls_start_token: + is_tool_calling = True + else: + assert stop_token == eos_token, "Invalid summary format" + + if is_tool_calling: + index, stop_token, tool_calls = parse_tool_calls(index, text) + + index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token]) + assert not tool_ends_text, "Unexpected content after tool calls" + + assert len(text) == index and stop_token in [eos_token, None], ( + "Unexpected content at end" + ) + + for sp_token in [ + bos_token, + eos_token, + thinking_start_token, + thinking_end_token, + dsml_token, + ]: + assert sp_token not in summary_content and sp_token not in reasoning_content, ( + "Unexpected special token in content" + ) + + return { + "role": "assistant", + "content": summary_content, + "reasoning_content": reasoning_content, + "tool_calls": tool_calls_to_openai_format(tool_calls), + } + + +class DeepseekV32Tokenizer(HfTokenizer): + def apply_chat_template(self, messages, tools=None, **kwargs): + encode_config = dict( + thinking_mode="thinking", drop_thinking=True, add_default_bos_token=True + ) + # messages -> string + prompt_str = encode_messages(messages, **encode_config) + return prompt_str From 7dd1d559092e12285e78f8b9afed61abae8d7eee Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 03:20:00 +0000 Subject: [PATCH 02/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/config/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 33cccbacaf52..0ad818c75b31 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -84,7 +84,7 @@ "transcription", "draft", ] -TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom", ] +TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ -141,7 +141,7 @@ class ModelConfig: - "hf" will use the fast tokenizer if available.\n - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n - - "custom" will use --tokenizer to select the preregistered tokenizer.""" + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" From 
713ae5b7ca6f432d9357a21e069b474d3a1b5171 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 03:24:32 +0000 Subject: [PATCH 03/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/tokenizers/deepseekv32.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index f6e8b28d1914..e41c9b9d6d5d 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union from .hf import HfTokenizer -from .protocol import TokenizerLike +from .registry import TokenizerRegistry TOOLS_SYSTEM_TEMPLATE = """## Tools You have access to a set of tools you can use to answer the user's question. @@ -450,6 +450,7 @@ def parse_message_from_completion_text(text: str, thinking_mode: str): } +@TokenizerRegistry.register("deepseek_v32") class DeepseekV32Tokenizer(HfTokenizer): def apply_chat_template(self, messages, tools=None, **kwargs): encode_config = dict( From 82596db86de2a0c282d6014af1edbacc6949a428 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 04:23:11 +0000 Subject: [PATCH 04/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_engine.py | 4 +++- vllm/tokenizers/deepseekv32.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 4fa5ac9a62f9..d8ca82a22874 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1120,7 +1120,9 @@ async def _preprocess_chat( _chat_template_kwargs.update(chat_template_kwargs or {}) request_prompt: str | list[int] - + print( + f"--------{isinstance(tokenizer, DeepseekV32Tokenizer)}--{type(tokenizer)}-----------" + ) if tokenizer is None: request_prompt = "placeholder" elif isinstance(tokenizer, MistralTokenizer): diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index e41c9b9d6d5d..ea4570945c41 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union -from .hf import HfTokenizer +from .hf import HfTokenizer, TokenizerLike from .registry import TokenizerRegistry TOOLS_SYSTEM_TEMPLATE = """## Tools From e907f8d57251dabb4d3e9cd9d11716a9f3db00ff Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 04:59:04 +0000 Subject: [PATCH 05/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_engine.py | 7 +++++-- vllm/tokenizers/hf.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d8ca82a22874..6de6b8bfcb4e 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1121,7 +1121,8 @@ async def _preprocess_chat( request_prompt: str | list[int] print( - f"--------{isinstance(tokenizer, DeepseekV32Tokenizer)}--{type(tokenizer)}-----------" + f"--------{isinstance(tokenizer, DeepseekV32Tokenizer)}--{type(tokenizer)}--" + f"-{(hasattr(tokenizer, '__raw_class__') and issubclass(tokenizer.__raw_class__, DeepseekV32Tokenizer))}--------" ) if tokenizer is None: request_prompt = "placeholder" @@ -1131,7 +1132,9 @@ async def _preprocess_chat( messages=messages, 
**_chat_template_kwargs, ) - elif isinstance(tokenizer, DeepseekV32Tokenizer): + elif hasattr(tokenizer, "__raw_class__") and issubclass( + tokenizer.__raw_class__, DeepseekV32Tokenizer + ): request_prompt = tokenizer.apply_chat_template( messages=messages, model_config=model_config, diff --git a/vllm/tokenizers/hf.py b/vllm/tokenizers/hf.py index 344507312038..cb7a34beaee8 100644 --- a/vllm/tokenizers/hf.py +++ b/vllm/tokenizers/hf.py @@ -24,6 +24,7 @@ def get_cached_tokenizer( each time they are called, leading to a significant slowdown. This proxy caches these properties for faster access. """ + tokenizer.__raw_class__ = tokenizer.__class__ cached_tokenizer = copy.copy(tokenizer) tokenizer_all_special_ids = tokenizer.all_special_ids From b32a0845ce6227b35f9c8dc81d6a58ec36af696c Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 05:12:15 +0000 Subject: [PATCH 06/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_engine.py | 5 +---- vllm/tokenizers/deepseekv32.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 6de6b8bfcb4e..48740f4f15ce 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1122,7 +1122,6 @@ async def _preprocess_chat( request_prompt: str | list[int] print( f"--------{isinstance(tokenizer, DeepseekV32Tokenizer)}--{type(tokenizer)}--" - f"-{(hasattr(tokenizer, '__raw_class__') and issubclass(tokenizer.__raw_class__, DeepseekV32Tokenizer))}--------" ) if tokenizer is None: request_prompt = "placeholder" @@ -1132,9 +1131,7 @@ async def _preprocess_chat( messages=messages, **_chat_template_kwargs, ) - elif hasattr(tokenizer, "__raw_class__") and issubclass( - tokenizer.__raw_class__, DeepseekV32Tokenizer - ): + elif isinstance(tokenizer, DeepseekV32Tokenizer): request_prompt = tokenizer.apply_chat_template( messages=messages, model_config=model_config, diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index ea4570945c41..adcc9b261613 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -452,6 +452,27 @@ def parse_message_from_completion_text(text: str, thinking_mode: str): @TokenizerRegistry.register("deepseek_v32") class DeepseekV32Tokenizer(HfTokenizer): + @classmethod + def from_pretrained( + cls, + path_or_repo_id: str | Path, + *args, + trust_remote_code: bool = False, + revision: str | None = None, + download_dir: str | None = None, + **kwargs, + ) -> "TokenizerLike": + tokenizer = super().from_pretrained( + path_or_repo_id, + *args, + trust_remote_code=trust_remote_code, + revision=revision, + download_dir=download_dir, + **kwargs, + ) + tokenizer.__class__ = DeepseekV32Tokenizer + return tokenizer + def apply_chat_template(self, messages, tools=None, **kwargs): encode_config = dict( thinking_mode="thinking", drop_thinking=True, add_default_bos_token=True From 59c7e740a86c726da7ebc1b7ad63b05d6ca0f0c4 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 05:17:26 +0000 Subject: [PATCH 07/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_engine.py | 25 ++++++++++++----------- vllm/tokenizers/deepseekv32.py | 1 - 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 
48740f4f15ce..b9448bc054ec 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1131,19 +1131,20 @@ async def _preprocess_chat( messages=messages, **_chat_template_kwargs, ) - elif isinstance(tokenizer, DeepseekV32Tokenizer): - request_prompt = tokenizer.apply_chat_template( - messages=messages, - model_config=model_config, - **_chat_template_kwargs, - ) else: - request_prompt = apply_hf_chat_template( - tokenizer=tokenizer, - conversation=conversation, - model_config=model_config, - **_chat_template_kwargs, - ) + try: + request_prompt = apply_hf_chat_template( + tokenizer=tokenizer, + conversation=conversation, + model_config=model_config, + **_chat_template_kwargs, + ) + except ValueError: + request_prompt = tokenizer.apply_chat_template( + messages=messages, + model_config=model_config, + **_chat_template_kwargs, + ) mm_data = await mm_data_future diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index adcc9b261613..66339026d5b8 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -470,7 +470,6 @@ def from_pretrained( download_dir=download_dir, **kwargs, ) - tokenizer.__class__ = DeepseekV32Tokenizer return tokenizer def apply_chat_template(self, messages, tools=None, **kwargs): From 6ce992eb3a487cf4ae22b5c82aaeab0bc95923fe Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 05:23:52 +0000 Subject: [PATCH 08/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index b9448bc054ec..530f62715344 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1141,6 +1141,7 @@ async def _preprocess_chat( ) except ValueError: request_prompt = tokenizer.apply_chat_template( + conversation=conversation, messages=messages, model_config=model_config, **_chat_template_kwargs, From e4ef0d7176abadd174caa9527071bfc62340b547 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 06:37:52 +0000 Subject: [PATCH 09/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/tokenizers/deepseekv32.py | 102 ++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index 66339026d5b8..0ab29c788cd6 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -4,6 +4,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union +from transformers import BatchEncoding + from .hf import HfTokenizer, TokenizerLike from .registry import TokenizerRegistry @@ -452,6 +454,11 @@ def parse_message_from_completion_text(text: str, thinking_mode: str): @TokenizerRegistry.register("deepseek_v32") class DeepseekV32Tokenizer(HfTokenizer): + def __init__(self, tokenizer: TokenizerLike): + super().__init__(tokenizer) + self.tokenizer = tokenizer + self.name_or_path = tokenizer.name_or_path + @classmethod def from_pretrained( cls, @@ -470,7 +477,7 @@ def from_pretrained( download_dir=download_dir, **kwargs, ) - return tokenizer + return DeepseekV32Tokenizer(tokenizer) def apply_chat_template(self, messages, tools=None, **kwargs): encode_config = dict( @@ -479,3 +486,96 @@ def apply_chat_template(self, messages, tools=None, **kwargs): # messages -> string prompt_str = 
encode_messages(messages, **encode_config) return prompt_str + + @property + def all_special_tokens(self) -> list[str]: + return self.tokenizer.all_special_tokens + + @property + def all_special_ids(self) -> list[int]: + return self.tokenizer.all_special_ids + + @property + def bos_token_id(self) -> int: + raise NotImplementedError + + @property + def eos_token_id(self) -> int: + return self.tokenizer.eos_token_id + + @property + def pad_token_id(self) -> int: + return self.tokenizer.pad_token_id + + @property + def is_fast(self) -> bool: + return self.tokenizer.is_fast + + @property + def vocab_size(self) -> int: + return self.tokenizer.vocab_size + + @property + def max_token_id(self) -> int: + return self.tokenizer.max_token_id + + @property + def truncation_side(self) -> str: + return self.tokenizer.truncation_side + + def __hash__(self) -> int: + return hash(id(self)) + + def __len__(self) -> int: + return self.vocab_size + + def __call__( + self, + text: str | list[str], + text_pair: str | None = None, + add_special_tokens: bool = True, + truncation: bool = False, + max_length: int | None = None, + ) -> "BatchEncoding": + return self.tokenizer( + text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + truncation=truncation, + max_length=max_length, + ) + + def get_vocab(self) -> dict[str, int]: + return self.tokenizer.get_vocab() + + def get_added_vocab(self) -> dict[str, int]: + return self.tokenizer.get_added_vocab() + + def encode( + self, + text: str, + truncation: bool | None = None, + max_length: int | None = None, + add_special_tokens: bool = True, + ) -> list[int]: + return self.tokenizer.encode( + text, + truncation=truncation, + max_length=max_length, + add_special_tokens=add_special_tokens, + ) + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + return self.tokenizer.convert_tokens_to_string(tokens) + + def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str: + return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens) + + def convert_ids_to_tokens( + self, + ids: list[int], + skip_special_tokens: bool = False, + ) -> list[str]: + return self.tokenizer.convert_ids_to_tokens( + ids, skip_special_tokens=skip_special_tokens + ) From 5f0caa104964a6f740f089ac91eb3bb4d56969ea Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 06:45:34 +0000 Subject: [PATCH 10/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_engine.py | 31 +- vllm/tokenizers/deepseek_v32_encoding.py | 446 +++++++++++++++++++++ vllm/tokenizers/deepseekv32.py | 449 +--------------------- 3 files changed, 462 insertions(+), 464 deletions(-) create mode 100644 vllm/tokenizers/deepseek_v32_encoding.py diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 530f62715344..c34d598a325c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1120,9 +1120,7 @@ async def _preprocess_chat( _chat_template_kwargs.update(chat_template_kwargs or {}) request_prompt: str | list[int] - print( - f"--------{isinstance(tokenizer, DeepseekV32Tokenizer)}--{type(tokenizer)}--" - ) + if tokenizer is None: request_prompt = "placeholder" elif isinstance(tokenizer, MistralTokenizer): @@ -1131,21 +1129,20 @@ async def _preprocess_chat( messages=messages, **_chat_template_kwargs, ) + elif isinstance(tokenizer, DeepseekV32Tokenizer): + request_prompt = tokenizer.apply_chat_template( + 
conversation=conversation,
+                messages=messages,
+                model_config=model_config,
+                **_chat_template_kwargs,
+            )
         else:
-            try:
-                request_prompt = apply_hf_chat_template(
-                    tokenizer=tokenizer,
-                    conversation=conversation,
-                    model_config=model_config,
-                    **_chat_template_kwargs,
-                )
-            except ValueError:
-                request_prompt = tokenizer.apply_chat_template(
-                    conversation=conversation,
-                    messages=messages,
-                    model_config=model_config,
-                    **_chat_template_kwargs,
-                )
+            request_prompt = apply_hf_chat_template(
+                tokenizer=tokenizer,
+                conversation=conversation,
+                model_config=model_config,
+                **_chat_template_kwargs,
+            )
 
         mm_data = await mm_data_future
 
diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py
new file mode 100644
index 000000000000..ba6eb98a8212
--- /dev/null
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
@@ -0,0 +1,446 @@
+import copy
+import json
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+TOOLS_SYSTEM_TEMPLATE = """## Tools
+You have access to a set of tools you can use to answer the user's question.
+You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
+<{dsml_token}function_calls>
+<{dsml_token}invoke name="$FUNCTION_NAME">
+<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
+...
+</{dsml_token}invoke>
+<{dsml_token}invoke name="$FUNCTION_NAME2">
+...
+</{dsml_token}invoke>
+</{dsml_token}function_calls>
+String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
+If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
+<{dsml_token}function_calls>
+...
+</{dsml_token}function_calls>
+<function_results>
+...
+</function_results>
+{thinking_start_token}...thinking about results{thinking_end_token}
+Here are the functions available in JSONSchema format:
+<functions>
+{tool_schemas}
+</functions>
+"""
+
+bos_token: str = "<｜begin▁of▁sentence｜>"
+eos_token: str = "<｜end▁of▁sentence｜>"
+thinking_start_token: str = "<think>"
+thinking_end_token: str = "</think>"
+dsml_token: str = "｜DSML｜"
+system_msg_template: str = "{content}"
+user_msg_template: str = "<｜User｜>{content}<｜Assistant｜>"
+assistant_msg_template: str = "{reasoning}{content}{tool_calls}<｜end▁of▁sentence｜>"
+thinking_template = "{reasoning_content}"
+
+response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
+tool_call_template: str = (
+    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
+)
+tool_calls_template = (
+    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
+)
+
+tool_output_template: str = "<result>\n{content}</result>"
+
+
+def to_json(value: Any) -> str:
+    try:
+        return json.dumps(value, ensure_ascii=False)
+    except Exception:
+        return json.dumps(value, ensure_ascii=True)
+
+
+def tools_from_openai_format(tools):
+    return [tool["function"] for tool in tools]
+
+
+def tool_calls_from_openai_format(tool_calls):
+    return [
+        {
+            "name": tool_call["function"]["name"],
+            "arguments": tool_call["function"]["arguments"],
+        }
+        for tool_call in tool_calls
+    ]
+
+
+def tool_calls_to_openai_format(tool_calls):
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": tool_call["name"],
+                "arguments": tool_call["arguments"],
+            },
+        }
+        for tool_call in tool_calls
+    ]
+
+
+def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str:
+    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
+    P_dsml_strs = []
+
+    arguments = json.loads(tool_call["arguments"])
+
+    for k, v in arguments.items():
+        p_dsml_str = p_dsml_template.format(
+            dsml_token=dsml_token,
+            key=k,
+            is_str="true" if isinstance(v, str) else "false",
+            value=v if isinstance(v, str) else to_json(v),
+        )
+
+        P_dsml_strs.append(p_dsml_str)
+
+    return "\n".join(P_dsml_strs)
+
+
+def decode_dsml_to_arguments(
+    tool_name: str, tool_args: Dict[str, Tuple[str, str]]
+) -> Dict[str, str]:
+    def _decode_value(key: str, value: str, string: str):
+        if string == "true":
+            value = to_json(value)
+        return f"{to_json(key)}: {value}"
+
+    tool_args_json = (
+        "{"
+        + ", ".join(
+            [_decode_value(k, v, string=is_str) for k, (v, is_str) in tool_args.items()]
+        )
+        + "}"
+    )
+    return dict(name=tool_name, arguments=tool_args_json)
+
+
+def render_tools(tools: List[Dict[str, Union[str, Dict[str, Any]]]]) -> str:
+    tools_json = [to_json(t) for t in tools]
+
+    return TOOLS_SYSTEM_TEMPLATE.format(
+        tool_schemas="\n".join(tools_json),
+        dsml_token=dsml_token,
+        thinking_start_token=thinking_start_token,
+        thinking_end_token=thinking_end_token,
+    )
+
+
+def find_last_user_index(messages: List[Dict[str, Any]]) -> int:
+    last_user_index = -1
+    for idx in range(len(messages) - 1, -1, -1):
+        if messages[idx].get("role") in ["user", "developer"]:
+            last_user_index = idx
+            break
+    return last_user_index
+
+
+def render_message(
+    index: int, messages: List[Dict[str, Any]], thinking_mode: str
+) -> str:
+    assert 0 <= index < len(messages)
+    assert thinking_mode in ["chat", "thinking"], (
+        f"Invalid thinking_mode `{thinking_mode}`"
+    )
+
+    prompt = ""
+    msg = messages[index]
+    last_user_idx = find_last_user_index(messages)
+
+    role = msg.get("role")
+    content = msg.get("content")
+    tools = msg.get("tools")
+    response_format = msg.get("response_format")
+    tool_calls = msg.get("tool_calls")
msg.get("tool_calls") + reasoning_content = msg.get("reasoning_content") + + if tools: + tools = tools_from_openai_format(tools) + if tool_calls: + tool_calls = tool_calls_from_openai_format(tool_calls) + + if role == "system": + prompt += system_msg_template.format(content=content or "") + if tools: + prompt += "\n\n" + render_tools(tools) + + if response_format: + prompt += "\n\n" + response_format_template.format( + schema=to_json(response_format) + ) + + elif role == "developer": + assert content, f"Invalid message for role `{role}`: {msg}" + content_developer = "" + if tools: + content_developer += "\n\n" + render_tools(tools) + + if response_format: + content_developer += "\n\n" + response_format_template.format( + schema=to_json(response_format) + ) + + content_developer += "\n\n# The user's message is: {}".format(content) + + prompt += user_msg_template.format(content=content_developer) + if index == last_user_idx and thinking_mode == "thinking": + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == "user": + prompt += user_msg_template.format(content=content) + + if index == last_user_idx and thinking_mode == "thinking": + prompt += thinking_start_token + else: + prompt += thinking_end_token + + elif role == "tool": + prev_assistant_idx = index - 1 + assistant_msg = messages[prev_assistant_idx] + while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool": + prev_assistant_idx -= 1 + assistant_msg = messages[prev_assistant_idx] + + assert ( + index == 0 + or prev_assistant_idx >= 0 + and assistant_msg.get("role") == "assistant" + ), f"Invalid messages at {index}:\n{assistant_msg}" + + tool_call_order = index - prev_assistant_idx + assistant_tool_calls = assistant_msg.get("tool_calls") + assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, ( + "No tool calls but found tool output" + ) + + if tool_call_order == 1: + prompt += "\n\n" + + prompt += tool_output_template.format(content=content) + + if tool_call_order == len(assistant_tool_calls): + prompt += "\n" + + if index >= last_user_idx and thinking_mode == "thinking": + prompt += "\n\n" + thinking_start_token + else: + prompt += "\n\n" + thinking_end_token + + elif role == "assistant": + prev_assistant_idx = index + thinking_part = "" + + tool_calls_content = "" + if tool_calls: + tool_calls = [ + tool_call_template.format( + dsml_token=dsml_token, + name=tool_call.get("name"), + arguments=encode_arguments_to_dsml(tool_call), + ) + for tool_call in tool_calls + ] + tool_calls_content += "\n\n" + tool_calls_template.format( + dsml_token=dsml_token, tool_calls="\n".join(tool_calls) + ) + + summary_content = content or "" + + if thinking_mode == "thinking" and index > last_user_idx: + assert reasoning_content or tool_calls, ( + f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message" + ) + thinking_part = ( + thinking_template.format(reasoning_content=reasoning_content or "") + + thinking_end_token + ) + + prompt += assistant_msg_template.format( + reasoning=thinking_part, + content=summary_content, + tool_calls=tool_calls_content, + ) + else: + raise NotImplementedError(f"Unknown role: {role}") + + return prompt + + +def drop_thinking_messages( + messages: List[Dict[str, Any]], last_user_idx: Optional[int] = None +) -> List[Dict[str, Any]]: + messages_wo_thinking: List[Dict[str, Any]] = [] + last_user_idx = ( + find_last_user_index(messages) if last_user_idx is None else last_user_idx + ) + for idx, 
+        role = msg.get("role")
+        if role in ["user", "system", "tool"] or idx >= last_user_idx:
+            messages_wo_thinking.append(msg)
+            continue
+
+        elif role == "assistant":
+            msg_wo_thinking = copy.copy(msg)
+            msg_wo_thinking.pop("reasoning_content", None)
+            messages_wo_thinking.append(msg_wo_thinking)
+
+    return messages_wo_thinking
+
+
+def encode_messages(
+    messages: List[Dict[str, Any]],
+    thinking_mode: str,
+    context: Optional[List[Dict[str, Any]]] = None,
+    drop_thinking: bool = True,
+    add_default_bos_token: bool = True,
+) -> str:
+    context = context if context else []
+    full_messages = context + messages
+
+    prompt = bos_token if add_default_bos_token and len(context) == 0 else ""
+
+    if thinking_mode == "thinking" and drop_thinking:
+        full_messages = drop_thinking_messages(full_messages)
+
+    for idx in range(len(messages)):
+        prompt += render_message(
+            idx + len(context), full_messages, thinking_mode=thinking_mode
+        )
+
+    return prompt
+
+
+def _read_until_stop(
+    index: int, text: str, stop: List[str]
+) -> Tuple[int, str, Optional[str]]:
+    min_pos = len(text)
+    matched_stop = None
+
+    for s in stop:
+        pos = text.find(s, index)
+        if pos != -1 and pos < min_pos:
+            min_pos = pos
+            matched_stop = s
+
+    if matched_stop:
+        content = text[index:min_pos]
+        return min_pos + len(matched_stop), content, matched_stop
+    else:
+        content = text[index:]
+        return len(text), content, None
+
+
+def parse_tool_calls(index: int, text: str):
+    tool_calls: List[Dict[str, Any]] = []
+    stop_token = None
+    tool_calls_end_token = f"</{dsml_token}function_calls>"
+
+    while index < len(text):
+        index, _, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
+        )
+        assert _ == ">\n", "Tool call format error"
+
+        if stop_token == tool_calls_end_token:
+            break
+
+        assert stop_token is not None, "Missing special token"
+
+        index, tool_name_content, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+        )
+
+        p_tool_name = re.findall(
+            r'^ name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
+        )
+        assert len(p_tool_name) == 1, "Tool name format error"
+        tool_name = p_tool_name[0]
+
+        tool_args: Dict[str, Tuple[str, str]] = {}
+        while stop_token == f"<{dsml_token}parameter":
+            index, param_content, stop_token = _read_until_stop(
+                index, text, [f"/{dsml_token}parameter"]
+            )
+
+            param_kv = re.findall(
+                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
+                param_content,
+                flags=re.DOTALL,
+            )
+            assert len(param_kv) == 1, "Parameter format error"
+            param_name, string, param_value = param_kv[0]
+
+            assert param_name not in tool_args, "Duplicate parameter name"
+            tool_args[param_name] = (param_value, string)
+
+            index, content, stop_token = _read_until_stop(
+                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+            )
+            assert content == ">\n", "Parameter format error"
+
+        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
+        tool_calls.append(tool_call)
+
+    return index, stop_token, tool_calls
+
+
+# NOTE: This function is designed to parse only correctly formatted strings and will not attempt to correct malformed output that may be generated by the model.
+def parse_message_from_completion_text(text: str, thinking_mode: str): + summary_content, reasoning_content, tool_calls = "", "", [] + index, stop_token = 0, None + tool_calls_start_token = f"\n\n<{dsml_token}function_calls" + + is_thinking, is_tool_calling = thinking_mode == "thinking", False + + if is_thinking: + index, content_delta, stop_token = _read_until_stop( + index, text, [thinking_end_token, tool_calls_start_token] + ) + reasoning_content = content_delta + assert stop_token == thinking_end_token, "Invalid thinking format" + + index, content_delta, stop_token = _read_until_stop( + index, text, [eos_token, tool_calls_start_token] + ) + summary_content = content_delta + if stop_token == tool_calls_start_token: + is_tool_calling = True + else: + assert stop_token == eos_token, "Invalid summary format" + + if is_tool_calling: + index, stop_token, tool_calls = parse_tool_calls(index, text) + + index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token]) + assert not tool_ends_text, "Unexpected content after tool calls" + + assert len(text) == index and stop_token in [eos_token, None], ( + "Unexpected content at end" + ) + + for sp_token in [ + bos_token, + eos_token, + thinking_start_token, + thinking_end_token, + dsml_token, + ]: + assert sp_token not in summary_content and sp_token not in reasoning_content, ( + "Unexpected special token in content" + ) + + return { + "role": "assistant", + "content": summary_content, + "reasoning_content": reasoning_content, + "tool_calls": tool_calls_to_openai_format(tool_calls), + } diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index 0ab29c788cd6..8b70155820b4 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -1,456 +1,11 @@ -import copy -import json -import re from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union from transformers import BatchEncoding +from .deepseek_v32_encoding import encode_messages from .hf import HfTokenizer, TokenizerLike from .registry import TokenizerRegistry -TOOLS_SYSTEM_TEMPLATE = """## Tools -You have access to a set of tools you can use to answer the user's question. -You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user: -<{dsml_token}function_calls> -<{dsml_token}invoke name="$FUNCTION_NAME"> -<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE -... - -<{dsml_token}invoke name="$FUNCTION_NAME2"> -... - - -String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects). -If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example: -<{dsml_token}function_calls> -... - - -... 
- -{thinking_start_token}...thinking about results{thinking_end_token} -Here are the functions available in JSONSchema format: - -{tool_schemas} - -""" - -bos_token: str = "<|begin▁of▁sentence|>" -eos_token: str = "<|end▁of▁sentence|>" -thinking_start_token: str = "" -thinking_end_token: str = "" -dsml_token: str = "|DSML|" -system_msg_template: str = "{content}" -user_msg_template: str = "<|User|>{content}<|Assistant|>" -assistant_msg_template: str = "{reasoning}{content}{tool_calls}<|end▁of▁sentence|>" -thinking_template = "{reasoning_content}" - -response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}" -tool_call_template: str = ( - '<{dsml_token}invoke name="{name}">\n{arguments}\n' -) -tool_calls_template = ( - "<{dsml_token}function_calls>\n{tool_calls}\n" -) - -tool_output_template: str = "\n{content}" - - -def to_json(value: Any) -> str: - try: - return json.dumps(value, ensure_ascii=False) - except: - return json.dumps(value, ensure_ascii=True) - - -def tools_from_openai_format(tools): - return [tool["function"] for tool in tools] - - -def tool_calls_from_openai_format(tool_calls): - return [ - { - "name": tool_call["function"]["name"], - "arguments": tool_call["function"]["arguments"], - } - for tool_call in tool_calls - ] - - -def tool_calls_to_openai_format(tool_calls): - return [ - { - "type": "function", - "function": { - "name": tool_call["name"], - "arguments": tool_call["arguments"], - }, - } - for tool_call in tool_calls - ] - - -def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str: - p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}""" - P_dsml_strs = [] - - arguments = json.loads(tool_call["arguments"]) - - for k, v in arguments.items(): - p_dsml_str = p_dsml_template.format( - dsml_token=dsml_token, - key=k, - is_str="true" if isinstance(v, str) else "false", - value=v if isinstance(v, str) else to_json(v), - ) - - P_dsml_strs.append(p_dsml_str) - - return "\n".join(P_dsml_strs) - - -def decode_dsml_to_arguments( - tool_name: str, tool_args: Dict[str, Tuple[str, str]] -) -> Dict[str, str]: - def _decode_value(key: str, value: str, string: str): - if string == "true": - value = to_json(value) - return f"{to_json(key)}: {value}" - - tool_args_json = ( - "{" - + ", ".join( - [_decode_value(k, v, string=is_str) for k, (v, is_str) in tool_args.items()] - ) - + "}" - ) - return dict(name=tool_name, arguments=tool_args_json) - - -def render_tools(tools: List[Dict[str, Union[str, Dict[str, Any]]]]) -> str: - tools_json = [to_json(t) for t in tools] - - return TOOLS_SYSTEM_TEMPLATE.format( - tool_schemas="\n".join(tools_json), - dsml_token=dsml_token, - thinking_start_token=thinking_start_token, - thinking_end_token=thinking_end_token, - ) - - -def find_last_user_index(messages: List[Dict[str, Any]]) -> int: - last_user_index = -1 - for idx in range(len(messages) - 1, -1, -1): - if messages[idx].get("role") in ["user", "developer"]: - last_user_index = idx - break - return last_user_index - - -def render_message( - index: int, messages: List[Dict[str, Any]], thinking_mode: str -) -> str: - assert 0 <= index < len(messages) - assert thinking_mode in ["chat", "thinking"], ( - f"Invalid thinking_mode `{thinking_mode}`" - ) - - prompt = "" - msg = messages[index] - last_user_idx = find_last_user_index(messages) - - role = msg.get("role") - content = msg.get("content") - tools = msg.get("tools") - response_format = msg.get("response_format") - tool_calls = 
msg.get("tool_calls") - reasoning_content = msg.get("reasoning_content") - - if tools: - tools = tools_from_openai_format(tools) - if tool_calls: - tool_calls = tool_calls_from_openai_format(tool_calls) - - if role == "system": - prompt += system_msg_template.format(content=content or "") - if tools: - prompt += "\n\n" + render_tools(tools) - - if response_format: - prompt += "\n\n" + response_format_template.format( - schema=to_json(response_format) - ) - - elif role == "developer": - assert content, f"Invalid message for role `{role}`: {msg}" - content_developer = "" - if tools: - content_developer += "\n\n" + render_tools(tools) - - if response_format: - content_developer += "\n\n" + response_format_template.format( - schema=to_json(response_format) - ) - - content_developer += "\n\n# The user's message is: {}".format(content) - - prompt += user_msg_template.format(content=content_developer) - if index == last_user_idx and thinking_mode == "thinking": - prompt += thinking_start_token - else: - prompt += thinking_end_token - - elif role == "user": - prompt += user_msg_template.format(content=content) - - if index == last_user_idx and thinking_mode == "thinking": - prompt += thinking_start_token - else: - prompt += thinking_end_token - - elif role == "tool": - prev_assistant_idx = index - 1 - assistant_msg = messages[prev_assistant_idx] - while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool": - prev_assistant_idx -= 1 - assistant_msg = messages[prev_assistant_idx] - - assert ( - index == 0 - or prev_assistant_idx >= 0 - and assistant_msg.get("role") == "assistant" - ), f"Invalid messages at {index}:\n{assistant_msg}" - - tool_call_order = index - prev_assistant_idx - assistant_tool_calls = assistant_msg.get("tool_calls") - assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, ( - "No tool calls but found tool output" - ) - - if tool_call_order == 1: - prompt += "\n\n" - - prompt += tool_output_template.format(content=content) - - if tool_call_order == len(assistant_tool_calls): - prompt += "\n" - - if index >= last_user_idx and thinking_mode == "thinking": - prompt += "\n\n" + thinking_start_token - else: - prompt += "\n\n" + thinking_end_token - - elif role == "assistant": - prev_assistant_idx = index - thinking_part = "" - - tool_calls_content = "" - if tool_calls: - tool_calls = [ - tool_call_template.format( - dsml_token=dsml_token, - name=tool_call.get("name"), - arguments=encode_arguments_to_dsml(tool_call), - ) - for tool_call in tool_calls - ] - tool_calls_content += "\n\n" + tool_calls_template.format( - dsml_token=dsml_token, tool_calls="\n".join(tool_calls) - ) - - summary_content = content or "" - - if thinking_mode == "thinking" and index > last_user_idx: - assert reasoning_content or tool_calls, ( - f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message" - ) - thinking_part = ( - thinking_template.format(reasoning_content=reasoning_content or "") - + thinking_end_token - ) - - prompt += assistant_msg_template.format( - reasoning=thinking_part, - content=summary_content, - tool_calls=tool_calls_content, - ) - else: - raise NotImplementedError(f"Unknown role: {role}") - - return prompt - - -def drop_thinking_messages( - messages: List[Dict[str, Any]], last_user_idx: Optional[int] = None -) -> List[Dict[str, Any]]: - messages_wo_thinking: List[Dict[str, Any]] = [] - last_user_idx = ( - find_last_user_index(messages) if last_user_idx is None else last_user_idx - ) - for idx, 
msg in enumerate(messages): - role = msg.get("role") - if role in ["user", "system", "tool"] or idx >= last_user_idx: - messages_wo_thinking.append(msg) - continue - - elif role == "assistant": - msg_wo_thinking = copy.copy(msg) - msg_wo_thinking.pop("reasoning_content", None) - messages_wo_thinking.append(msg_wo_thinking) - - return messages_wo_thinking - - -def encode_messages( - messages: List[Dict[str, Any]], - thinking_mode: str, - context: Optional[List[Dict[str, Any]]] = None, - drop_thinking: bool = True, - add_default_bos_token: bool = True, -) -> str: - context = context if context else [] - full_messages = context + messages - - prompt = bos_token if add_default_bos_token and len(context) == 0 else "" - - if thinking_mode == "thinking" and drop_thinking: - full_messages = drop_thinking_messages(full_messages) - - for idx in range(len(messages)): - prompt += render_message( - idx + len(context), full_messages, thinking_mode=thinking_mode - ) - - return prompt - - -def _read_until_stop( - index: int, text: str, stop: List[str] -) -> Tuple[int, str, Optional[str]]: - min_pos = len(text) - matched_stop = None - - for s in stop: - pos = text.find(s, index) - if pos != -1 and pos < min_pos: - min_pos = pos - matched_stop = s - - if matched_stop: - content = text[index:min_pos] - return min_pos + len(matched_stop), content, matched_stop - else: - content = text[index:] - return len(text), content, None - - -def parse_tool_calls(index: int, text: str): - tool_calls: List[Dict[str, Any]] = [] - stop_token = None - tool_calls_end_token = f"" - - while index < len(text): - index, _, stop_token = _read_until_stop( - index, text, [f"<{dsml_token}invoke", tool_calls_end_token] - ) - assert _ == ">\n", "Tool call format error" - - if stop_token == tool_calls_end_token: - break - - assert stop_token is not None, "Missing special token" - - index, tool_name_content, stop_token = _read_until_stop( - index, text, [f"<{dsml_token}parameter", f"\n$', tool_name_content, flags=re.DOTALL - ) - assert len(p_tool_name) == 1, "Tool name format error" - tool_name = p_tool_name[0] - - tool_args: Dict[str, Tuple[str, str]] = {} - while stop_token == f"<{dsml_token}parameter": - index, param_content, stop_token = _read_until_stop( - index, text, [f"/{dsml_token}parameter"] - ) - - param_kv = re.findall( - r'^ name="(.*?)" string="(true|false)">(.*?)<$', - param_content, - flags=re.DOTALL, - ) - assert len(param_kv) == 1, "Parameter format error" - param_name, string, param_value = param_kv[0] - - assert param_name not in tool_args, "Duplicate parameter name" - tool_args[param_name] = (param_value, string) - - index, content, stop_token = _read_until_stop( - index, text, [f"<{dsml_token}parameter", f"\n", "Parameter format error" - - tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args) - tool_calls.append(tool_call) - - return index, stop_token, tool_calls - - -# NOTE: This function is designed to parse only correctly formatted string and will not attempt to correct malformed output that may be generated by the model. 
-def parse_message_from_completion_text(text: str, thinking_mode: str):
-    summary_content, reasoning_content, tool_calls = "", "", []
-    index, stop_token = 0, None
-    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
-
-    is_thinking, is_tool_calling = thinking_mode == "thinking", False
-
-    if is_thinking:
-        index, content_delta, stop_token = _read_until_stop(
-            index, text, [thinking_end_token, tool_calls_start_token]
-        )
-        reasoning_content = content_delta
-        assert stop_token == thinking_end_token, "Invalid thinking format"
-
-    index, content_delta, stop_token = _read_until_stop(
-        index, text, [eos_token, tool_calls_start_token]
-    )
-    summary_content = content_delta
-    if stop_token == tool_calls_start_token:
-        is_tool_calling = True
-    else:
-        assert stop_token == eos_token, "Invalid summary format"
-
-    if is_tool_calling:
-        index, stop_token, tool_calls = parse_tool_calls(index, text)
-
-        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
-        assert not tool_ends_text, "Unexpected content after tool calls"
-
-    assert len(text) == index and stop_token in [eos_token, None], (
-        "Unexpected content at end"
-    )
-
-    for sp_token in [
-        bos_token,
-        eos_token,
-        thinking_start_token,
-        thinking_end_token,
-        dsml_token,
-    ]:
-        assert sp_token not in summary_content and sp_token not in reasoning_content, (
-            "Unexpected special token in content"
-        )
-
-    return {
-        "role": "assistant",
-        "content": summary_content,
-        "reasoning_content": reasoning_content,
-        "tool_calls": tool_calls_to_openai_format(tool_calls),
-    }
-
-
 @TokenizerRegistry.register("deepseek_v32")
 class DeepseekV32Tokenizer(HfTokenizer):
@@ -458,7 +13,7 @@ def __init__(self, tokenizer: TokenizerLike):
         super().__init__(tokenizer)
         self.tokenizer = tokenizer
         self.name_or_path = tokenizer.name_or_path
-    
+
     @classmethod
     def from_pretrained(
         cls,

From a18a43b50625479d98c04b5ba809397b05192542 Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 06:50:35 +0000
Subject: [PATCH 11/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/hf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/tokenizers/hf.py b/vllm/tokenizers/hf.py
index cb7a34beaee8..c93a6fc3ca1c 100644
--- a/vllm/tokenizers/hf.py
+++ b/vllm/tokenizers/hf.py
@@ -24,7 +24,7 @@ def get_cached_tokenizer(
     each time they are called, leading to a significant slowdown.
     This proxy caches these properties for faster access.
""" - tokenizer.__raw_class__ = tokenizer.__class__ + cached_tokenizer = copy.copy(tokenizer) tokenizer_all_special_ids = tokenizer.all_special_ids From 9abf6894ea810be1d9aa7f604a898622c08ec72c Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Tue, 2 Dec 2025 07:02:33 +0000 Subject: [PATCH 12/19] [Frontend] supports deepseekv32 chat template Signed-off-by: chaunceyjiang --- vllm/config/model.py | 3 +- vllm/tokenizers/deepseek_v32_encoding.py | 43 ++++++++++++++---------- vllm/tokenizers/deepseekv32.py | 3 ++ vllm/tokenizers/hf.py | 1 - 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 0ad818c75b31..655b7c995f6d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -141,7 +141,8 @@ class ModelConfig: - "hf" will use the fast tokenizer if available.\n - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n - - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.""" + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - Other custom values can be supported via plugins.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py index ba6eb98a8212..1e0eb571dda4 100644 --- a/vllm/tokenizers/deepseek_v32_encoding.py +++ b/vllm/tokenizers/deepseek_v32_encoding.py @@ -1,7 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py import copy import json import re -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any TOOLS_SYSTEM_TEMPLATE = """## Tools You have access to a set of tools you can use to answer the user's question. 
@@ -85,8 +90,8 @@ def tool_calls_to_openai_format(tool_calls):
     ]
 
 
-def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str:
-    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
+def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
+    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""  # noqa: E501
     P_dsml_strs = []
 
     arguments = json.loads(tool_call["arguments"])
@@ -105,8 +110,8 @@ def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str:
 
 
 def decode_dsml_to_arguments(
-    tool_name: str, tool_args: Dict[str, Tuple[str, str]]
-) -> Dict[str, str]:
+    tool_name: str, tool_args: dict[str, tuple[str, str]]
+) -> dict[str, str]:
     def _decode_value(key: str, value: str, string: str):
         if string == "true":
             value = to_json(value)
@@ -122,7 +127,7 @@ def _decode_value(key: str, value: str, string: str):
     return dict(name=tool_name, arguments=tool_args_json)
 
 
-def render_tools(tools: List[Dict[str, Union[str, Dict[str, Any]]]]) -> str:
+def render_tools(tools: list[dict[str, Union[str, dict[str, Any]]]]) -> str:
     tools_json = [to_json(t) for t in tools]
 
     return TOOLS_SYSTEM_TEMPLATE.format(
@@ -133,7 +138,7 @@ def render_tools(tools: List[Dict[str, Union[str, Dict[str, Any]]]]) -> str:
     )
 
 
-def find_last_user_index(messages: List[Dict[str, Any]]) -> int:
+def find_last_user_index(messages: list[dict[str, Any]]) -> int:
     last_user_index = -1
     for idx in range(len(messages) - 1, -1, -1):
         if messages[idx].get("role") in ["user", "developer"]:
@@ -143,7 +148,7 @@ def find_last_user_index(messages: List[Dict[str, Any]]) -> int:
 
 
 def render_message(
-    index: int, messages: List[Dict[str, Any]], thinking_mode: str
+    index: int, messages: list[dict[str, Any]], thinking_mode: str
 ) -> str:
     assert 0 <= index < len(messages)
     assert thinking_mode in ["chat", "thinking"], (
@@ -276,9 +281,9 @@ def render_message(
 
 
 def drop_thinking_messages(
-    messages: List[Dict[str, Any]], last_user_idx: Optional[int] = None
-) -> List[Dict[str, Any]]:
-    messages_wo_thinking: List[Dict[str, Any]] = []
+    messages: list[dict[str, Any]], last_user_idx: int | None = None
+) -> list[dict[str, Any]]:
+    messages_wo_thinking: list[dict[str, Any]] = []
     last_user_idx = (
         find_last_user_index(messages) if last_user_idx is None else last_user_idx
     )
@@ -297,9 +302,9 @@ def drop_thinking_messages(
 
 
 def encode_messages(
-    messages: List[Dict[str, Any]],
+    messages: list[dict[str, Any]],
     thinking_mode: str,
-    context: Optional[List[Dict[str, Any]]] = None,
+    context: list[dict[str, Any]] | None = None,
     drop_thinking: bool = True,
     add_default_bos_token: bool = True,
 ) -> str:
@@ -320,8 +325,8 @@ def encode_messages(
 
 
 def _read_until_stop(
-    index: int, text: str, stop: List[str]
-) -> Tuple[int, str, Optional[str]]:
+    index: int, text: str, stop: list[str]
+) -> tuple[int, str, None | str]:
     min_pos = len(text)
     matched_stop = None
 
@@ -340,7 +345,7 @@ def _read_until_stop(
 
 
 def parse_tool_calls(index: int, text: str):
-    tool_calls: List[Dict[str, Any]] = []
+    tool_calls: list[dict[str, Any]] = []
     stop_token = None
     tool_calls_end_token = f"</{dsml_token}function_calls>"
 
@@ -365,7 +370,7 @@ def parse_tool_calls(index: int, text: str):
         assert len(p_tool_name) == 1, "Tool name format error"
         tool_name = p_tool_name[0]
 
-        tool_args: Dict[str, Tuple[str, str]] = {}
+        tool_args: dict[str, tuple[str, str]] = {}
         while stop_token == f"<{dsml_token}parameter":
             index, param_content, stop_token = _read_until_stop(
                 index, text, [f"/{dsml_token}parameter"]
             )
@@ -393,7 +398,9 @@ def parse_tool_calls(index: int, text: str):
     return index, stop_token, tool_calls
 
 
-# NOTE: This function is designed to parse only correctly formatted string and will not attempt to correct malformed output that may be generated by the model.
+# NOTE: This function is designed to parse only correctly
+# formatted string and will not attempt to correct malformed output
+# that may be generated by the model.
 def parse_message_from_completion_text(text: str, thinking_mode: str):
     summary_content, reasoning_content, tool_calls = "", "", []
     index, stop_token = 0, None
diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index 8b70155820b4..99859215435d 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -1,3 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 from pathlib import Path
 
 from transformers import BatchEncoding
diff --git a/vllm/tokenizers/hf.py b/vllm/tokenizers/hf.py
index c93a6fc3ca1c..344507312038 100644
--- a/vllm/tokenizers/hf.py
+++ b/vllm/tokenizers/hf.py
@@ -24,7 +24,6 @@ def get_cached_tokenizer(
     each time they are called, leading to a significant slowdown.
     This proxy caches these properties for faster access.
     """
-    cached_tokenizer = copy.copy(tokenizer)
 
     tokenizer_all_special_ids = tokenizer.all_special_ids

From c0681546eb2340f112eb989609c922ed631c45f2 Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 07:13:26 +0000
Subject: [PATCH 13/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/deepseek_v32_encoding.py | 10 ++++++----
 vllm/tokenizers/deepseekv32.py           |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py
index 1e0eb571dda4..ab37f169dc55 100644
--- a/vllm/tokenizers/deepseek_v32_encoding.py
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
@@ -45,7 +45,7 @@
 assistant_msg_template: str = "{reasoning}{content}{tool_calls}<|end▁of▁sentence|>"
 thinking_template = "{reasoning_content}"
 
-response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
+response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"  # noqa: E501
 tool_call_template: str = (
     '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
 )
@@ -127,7 +127,7 @@ def _decode_value(key: str, value: str, string: str):
     return dict(name=tool_name, arguments=tool_args_json)
 
 
-def render_tools(tools: list[dict[str, Union[str, dict[str, Any]]]]) -> str:
+def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
     tools_json = [to_json(t) for t in tools]
 
     return TOOLS_SYSTEM_TEMPLATE.format(
@@ -164,7 +164,7 @@ def render_message(
     tools = msg.get("tools")
     response_format = msg.get("response_format")
     tool_calls = msg.get("tool_calls")
-    reasoning_content = msg.get("reasoning_content")
+    reasoning_content = msg.get("reasoning") or msg.get("reasoning_content")
 
     if tools:
         tools = tools_from_openai_format(tools)
@@ -262,7 +262,7 @@ def render_message(
 
         if thinking_mode == "thinking" and index > last_user_idx:
             assert reasoning_content or tool_calls, (
-                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"
+                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"  # noqa: E501
             )
             thinking_part = (
                 thinking_template.format(reasoning_content=reasoning_content or "")
@@ -296,6 +296,7 @@ def drop_thinking_messages(
         elif role == "assistant":
             msg_wo_thinking = copy.copy(msg)
             msg_wo_thinking.pop("reasoning_content", None)
+            msg_wo_thinking.pop("reasoning", None)
             messages_wo_thinking.append(msg_wo_thinking)
 
     return messages_wo_thinking
@@ -449,5 +450,6 @@ def parse_message_from_completion_text(text: str, thinking_mode: str):
         "role": "assistant",
         "content": summary_content,
         "reasoning_content": reasoning_content,
+        "reasoning": reasoning_content,
         "tool_calls": tool_calls_to_openai_format(tool_calls),
     }
diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index 99859215435d..b6232d1958c6 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -13,9 +13,10 @@
 @TokenizerRegistry.register("deepseek_v32")
 class DeepseekV32Tokenizer(HfTokenizer):
     def __init__(self, tokenizer: TokenizerLike):
-        super().__init__(tokenizer)
         self.tokenizer = tokenizer
-        self.name_or_path = tokenizer.name_or_path
+        self.name_or_path = (
+            tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
+        )
 
     @classmethod
     def from_pretrained(
@@ -41,8 +42,7 @@ def apply_chat_template(self, messages, tools=None, **kwargs):
         encode_config = dict(
             thinking_mode="thinking", drop_thinking=True, add_default_bos_token=True
         )
-        # messages -> string
-        prompt_str = encode_messages(messages, **encode_config)
+        prompt_str = encode_messages(messages, **encode_config)  # type: ignore
         return prompt_str
 
     @property

From 8d1bc671e2bf2df1468734aefea819806d24405d Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 07:20:34 +0000
Subject: [PATCH 14/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/deepseek_v32_encoding.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py
index ab37f169dc55..72f43395b192 100644
--- a/vllm/tokenizers/deepseek_v32_encoding.py
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
@@ -8,6 +8,7 @@
 import re
 from typing import Any
 
+# flake8: noqa: E501
 TOOLS_SYSTEM_TEMPLATE = """## Tools
 You have access to a set of tools you can use to answer the user's question.
 You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
@@ -45,7 +46,7 @@
 assistant_msg_template: str = "{reasoning}{content}{tool_calls}<|end▁of▁sentence|>"
 thinking_template = "{reasoning_content}"
 
-response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"  # noqa: E501
+response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
 tool_call_template: str = (
     '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
 )
@@ -59,7 +60,7 @@
 def to_json(value: Any) -> str:
     try:
         return json.dumps(value, ensure_ascii=False)
-    except:
+    except Exception:
         return json.dumps(value, ensure_ascii=True)
 
 
@@ -91,7 +92,7 @@ def tool_calls_to_openai_format(tool_calls):
 
 
 def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
-    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""  # noqa: E501
+    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
     P_dsml_strs = []
 
     arguments = json.loads(tool_call["arguments"])
@@ -262,7 +263,7 @@ def render_message(
 
         if thinking_mode == "thinking" and index > last_user_idx:
             assert reasoning_content or tool_calls, (
-                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"  # noqa: E501
+                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"
             )
             thinking_part = (
                 thinking_template.format(reasoning_content=reasoning_content or "")

From 3c83c7bb2e4c020b646ba48d8e7344912de90444 Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 09:07:08 +0000
Subject: [PATCH 15/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/deepseekv32.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index b6232d1958c6..852111713801 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -42,6 +42,12 @@ def apply_chat_template(self, messages, tools=None, **kwargs):
         encode_config = dict(
             thinking_mode="thinking", drop_thinking=True, add_default_bos_token=True
         )
+        messages.append(
+            {
+                "role": "system",
+                "tools": tools if tools is not None else [],
+            }
+        )
         prompt_str = encode_messages(messages, **encode_config)  # type: ignore
         return prompt_str

From f6bdf83fb9231452430347f3c9266f276890d0b3 Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 09:44:35 +0000
Subject: [PATCH 16/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/deepseekv32.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index 852111713801..3149d4dd67e0 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -39,14 +39,21 @@ def from_pretrained(
         return DeepseekV32Tokenizer(tokenizer)
 
     def apply_chat_template(self, messages, tools=None, **kwargs):
+        thinking = kwargs.get("thinking", True)
+        thinking_mode = "thinking"
+        if not thinking:
+            thinking_mode = "chat"
+
         encode_config = dict(
-            thinking_mode="thinking", drop_thinking=True, add_default_bos_token=True
+            thinking_mode=thinking_mode, drop_thinking=False, add_default_bos_token=True
         )
-        messages.append(
+        messages.insert(
+            0,
             {
                 "role": "system",
+                "content": "You are a helpful Assistant.",
                 "tools": tools if tools is not None else [],
-            }
+            },
         )
         prompt_str = encode_messages(messages, **encode_config)  # type: ignore
         return prompt_str

From 84766772777d3345b00ce20abf1a422b8a883988 Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 11:24:53 +0000
Subject: [PATCH 17/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/deepseekv32.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index 3149d4dd67e0..d4a13e97696f 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -39,22 +39,17 @@ def from_pretrained(
         return DeepseekV32Tokenizer(tokenizer)
 
     def apply_chat_template(self, messages, tools=None, **kwargs):
-        thinking = kwargs.get("thinking", True)
+        thinking = kwargs.get("thinking", False)
         thinking_mode = "thinking"
         if not thinking:
             thinking_mode = "chat"
-
-        encode_config = dict(
-            thinking_mode=thinking_mode, drop_thinking=False, add_default_bos_token=True
-        )
-        messages.insert(
-            0,
-            {
-                "role": "system",
-                "content": "You are a helpful Assistant.",
-                "tools": tools if tools is not None else [],
-            },
-        )
+        messages = messages.copy()
+        drop_thinking = True
+        if tools is not None and len(tools) > 0:
+            messages.insert(0, {"role": "system"})
+            messages[0]["tools"] = tools
+            drop_thinking = False
+        encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
         prompt_str = encode_messages(messages, **encode_config)  # type: ignore
         return prompt_str

From 470ad4cc3d9797a695737e0e3c4e2e0dc0a8e1 Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 16:14:00 +0000
Subject: [PATCH 18/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/deepseekv32.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index d4a13e97696f..d7dae67153e5 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -63,7 +63,7 @@ def all_special_ids(self) -> list[int]:
 
     @property
     def bos_token_id(self) -> int:
-        raise NotImplementedError
+        return self.tokenizer.bos_token_id
 
     @property
     def eos_token_id(self) -> int:
@@ -135,6 +135,7 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:
         return self.tokenizer.convert_tokens_to_string(tokens)
 
     def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
+        skip_special_tokens = False
         return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
 
     def convert_ids_to_tokens(
@@ -142,6 +143,4 @@ def convert_ids_to_tokens(
         self,
         ids: list[int],
         skip_special_tokens: bool = False,
     ) -> list[str]:
-        return self.tokenizer.convert_ids_to_tokens(
-            ids, skip_special_tokens=skip_special_tokens
-        )
+        return [self.decode(ids, skip_special_tokens=skip_special_tokens)]

From 171ce8c1e6e50eb4a1cb12c4a9f60f9680f7234c Mon Sep 17 00:00:00 2001
From: chaunceyjiang
Date: Tue, 2 Dec 2025 16:36:00 +0000
Subject: [PATCH 19/19] [Frontend] supports deepseekv32 chat template

Signed-off-by: chaunceyjiang
---
 vllm/tokenizers/deepseekv32.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py
index d7dae67153e5..7466ad4076bd 100644
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
@@ -93,7 +93,8 @@ def __hash__(self) -> int:
         return hash(id(self))
 
     def __len__(self) -> int:
-        return self.vocab_size
+        # <think> is an added token in DeepseekV32 tokenizer
+        return self.vocab_size + len(self.get_added_vocab())
 
     def __call__(
         self,
@@ -135,7 +136,6 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:
         return self.tokenizer.convert_tokens_to_string(tokens)
 
     def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
-        skip_special_tokens = False
         return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
 
     def convert_ids_to_tokens(
@@ -143,4 +143,6 @@ def convert_ids_to_tokens(
         self,
         ids: list[int],
         skip_special_tokens: bool = False,
     ) -> list[str]:
-        return [self.decode(ids, skip_special_tokens=skip_special_tokens)]
+        return self.tokenizer.convert_ids_to_tokens(
+            ids, skip_special_tokens=skip_special_tokens
+        )
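
Usage sketch (illustrative only, not part of the patch series): the encoding helpers introduced by these patches can be driven directly. `encode_messages` renders an OpenAI-style message list into a DeepSeek-V3.2 prompt string, and `parse_message_from_completion_text` maps a completion back into an assistant message dict; the generation step in the middle is assumed here, and the example message contents are hypothetical.

    from vllm.tokenizers.deepseek_v32_encoding import (
        encode_messages,
        parse_message_from_completion_text,
    )

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the weather in Paris?"},
    ]

    # "thinking" mode leaves the prompt open at the thinking start token after
    # the last user turn; "chat" mode closes the thinking block immediately.
    prompt = encode_messages(messages, thinking_mode="thinking", drop_thinking=True)

    # completion_text = ...  # model output for `prompt` (assumed, not shown)
    # parsed = parse_message_from_completion_text(completion_text, "thinking")
    # parsed["content"], parsed["reasoning_content"], parsed["tool_calls"]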