Skip to content

Commit f0a28bf

Browse files
[Misc] Unify tokenizer registration (#29767)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 86e178f commit f0a28bf

File tree

14 files changed

+237 additions, -183 deletions

tests/entrypoints/openai/test_tokenization.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ async def test_tokenize_completions(
5353
model_name: str,
5454
tokenizer_name: str,
5555
):
56-
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
56+
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
5757

5858
for add_special in [False, True]:
5959
prompt = "vllm1 This is a test prompt."
@@ -87,7 +87,7 @@ async def test_tokenize_chat(
8787
model_name: str,
8888
tokenizer_name: str,
8989
):
90-
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
90+
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
9191

9292
for add_generation in [False, True]:
9393
for add_special in [False, True]:
@@ -140,7 +140,7 @@ async def test_tokenize_chat_with_tools(
140140
model_name: str,
141141
tokenizer_name: str,
142142
):
143-
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
143+
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
144144

145145
for add_generation in [False, True]:
146146
for add_special in [False, True]:
@@ -210,7 +210,7 @@ async def test_tokenize_with_return_token_strs(
210210
model_name: str,
211211
tokenizer_name: str,
212212
):
213-
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
213+
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
214214

215215
prompt = "This is a token_strs test prompt! vllm1"
216216
response = requests.post(
@@ -240,7 +240,7 @@ async def test_detokenize(
240240
model_name: str,
241241
tokenizer_name: str,
242242
):
243-
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast")
243+
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name)
244244

245245
prompt = "This is a test prompt. vllm1"
246246
tokens = tokenizer.encode(prompt, add_special_tokens=False)

tests/entrypoints/pooling/embed/test_online.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -197,7 +197,7 @@ async def test_conversation_embedding(
197197
chat_response.raise_for_status()
198198
chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())
199199

200-
tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
200+
tokenizer = get_tokenizer(tokenizer_name=model_name)
201201
prompt = tokenizer.apply_chat_template(
202202
messages,
203203
chat_template=DUMMY_CHAT_TEMPLATE,

tests/entrypoints/pooling/pooling/test_online.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -158,11 +158,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
158158
chat_response.raise_for_status()
159159
chat_poolings = PoolingResponse.model_validate(chat_response.json())
160160

161-
tokenizer = get_tokenizer(
162-
tokenizer_name=model_name,
163-
tokenizer_mode="fast",
164-
trust_remote_code=True,
165-
)
161+
tokenizer = get_tokenizer(tokenizer_name=model_name, trust_remote_code=True)
166162
prompt = tokenizer.apply_chat_template(
167163
messages,
168164
chat_template=DUMMY_CHAT_TEMPLATE,

tests/models/registry.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ class _HfExamplesInfo:
2323
tokenizer: str | None = None
2424
"""Set the tokenizer to load for this architecture."""
2525

26-
tokenizer_mode: TokenizerMode = "auto"
26+
tokenizer_mode: TokenizerMode | str = "auto"
2727
"""Set the tokenizer type for this architecture."""
2828

2929
speculative_model: str | None = None

tests/tokenizers_/test_registry.py

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,28 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
from pathlib import Path
4+
35
from vllm.tokenizers import TokenizerLike, TokenizerRegistry
46
from vllm.transformers_utils.tokenizer import get_tokenizer
57

68

79
class TestTokenizer(TokenizerLike):
810
@classmethod
9-
def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
10-
return TestTokenizer() # type: ignore
11+
def from_pretrained(
12+
cls,
13+
path_or_repo_id: str | Path,
14+
*args,
15+
trust_remote_code: bool = False,
16+
revision: str | None = None,
17+
download_dir: str | None = None,
18+
**kwargs,
19+
) -> "TestTokenizer":
20+
return TestTokenizer(path_or_repo_id) # type: ignore
21+
22+
def __init__(self, path_or_repo_id: str | Path) -> None:
23+
super().__init__()
24+
25+
self.path_or_repo_id = path_or_repo_id
1126

1227
@property
1328
def bos_token_id(self) -> int:
@@ -29,14 +44,16 @@ def is_fast(self) -> bool:
2944
def test_customized_tokenizer():
3045
TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__)
3146

32-
tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
47+
tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc")
3348
assert isinstance(tokenizer, TestTokenizer)
49+
assert tokenizer.path_or_repo_id == "abc"
3450
assert tokenizer.bos_token_id == 0
3551
assert tokenizer.eos_token_id == 1
3652
assert tokenizer.pad_token_id == 2
3753

38-
tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
54+
tokenizer = get_tokenizer("abc", tokenizer_mode="test_tokenizer")
3955
assert isinstance(tokenizer, TestTokenizer)
56+
assert tokenizer.path_or_repo_id == "abc"
4057
assert tokenizer.bos_token_id == 0
4158
assert tokenizer.eos_token_id == 1
4259
assert tokenizer.pad_token_id == 2

tests/v1/entrypoints/llm/test_struct_output_generate.py

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
import json
66
from enum import Enum
7-
from typing import TYPE_CHECKING, Any
7+
from typing import Any
88

99
import jsonschema
1010
import pytest
@@ -24,11 +24,6 @@
2424
StructuredOutputsParams,
2525
)
2626

27-
if TYPE_CHECKING:
28-
from vllm.config.model import TokenizerMode
29-
else:
30-
TokenizerMode = str
31-
3227
NGRAM_SPEC_CONFIG = {
3328
"model": "[ngram]",
3429
"num_speculative_tokens": 5,
@@ -627,7 +622,7 @@ def test_structured_output(
627622
)
628623
def test_structured_output_with_reasoning_matrices(
629624
backend: str,
630-
tokenizer_mode: TokenizerMode,
625+
tokenizer_mode: str,
631626
reasoning_parser: str,
632627
model_name: str,
633628
speculative_config: dict[str, Any] | None,

vllm/config/model.py

Lines changed: 7 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,7 @@
8686
"transcription",
8787
"draft",
8888
]
89-
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom"]
89+
TokenizerMode = Literal["auto", "hf", "slow", "mistral"]
9090
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
9191
LogprobsMode = Literal[
9292
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -137,13 +137,13 @@ class ModelConfig:
137137
tokenizer: SkipValidation[str] = None # type: ignore
138138
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
139139
name or path will be used."""
140-
tokenizer_mode: TokenizerMode = "auto"
140+
tokenizer_mode: TokenizerMode | str = "auto"
141141
"""Tokenizer mode:\n
142142
- "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
143143
- "hf" will use the fast tokenizer if available.\n
144144
- "slow" will always use the slow tokenizer.\n
145145
- "mistral" will always use the tokenizer from `mistral_common`.\n
146-
- "custom" will use --tokenizer to select the preregistered tokenizer."""
146+
- Other custom values can be supported via plugins."""
147147
trust_remote_code: bool = False
148148
"""Trust remote code (e.g., from HuggingFace) when downloading the model
149149
and tokenizer."""
@@ -708,16 +708,17 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
708708
# can be correctly capped to sliding window size
709709
self.hf_text_config.sliding_window = None
710710

711-
if not self.skip_tokenizer_init:
712-
self._verify_tokenizer_mode()
713-
714711
# Avoid running try_verify_and_update_config multiple times
715712
self.config_updated = False
716713

717714
self._verify_quantization()
718715
self._verify_cuda_graph()
719716
self._verify_bnb_config()
720717

718+
@field_validator("tokenizer_mode", mode="after")
719+
def _lowercase_tokenizer_mode(cls, tokenizer_mode: str) -> str:
720+
return tokenizer_mode.lower()
721+
721722
@field_validator("quantization", mode="before")
722723
@classmethod
723724
def validate_quantization_before(cls, value: Any) -> Any:
@@ -829,15 +830,6 @@ def _get_encoder_config(self):
829830
model, _ = split_remote_gguf(model)
830831
return get_sentence_transformer_tokenizer_config(model, self.revision)
831832

832-
def _verify_tokenizer_mode(self) -> None:
833-
tokenizer_mode = cast(TokenizerMode, self.tokenizer_mode.lower())
834-
if tokenizer_mode not in get_args(TokenizerMode):
835-
raise ValueError(
836-
f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
837-
f"one of {get_args(TokenizerMode)}."
838-
)
839-
self.tokenizer_mode = tokenizer_mode
840-
841833
def _get_default_runner_type(
842834
self,
843835
architectures: list[str],

vllm/engine/arg_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -360,7 +360,7 @@ class EngineArgs:
360360
task: TaskOption | None = ModelConfig.task
361361
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
362362
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
363-
tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
363+
tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
364364
trust_remote_code: bool = ModelConfig.trust_remote_code
365365
allowed_local_media_path: str = ModelConfig.allowed_local_media_path
366366
allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains

vllm/entrypoints/llm.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -188,7 +188,7 @@ def __init__(
188188
runner: RunnerOption = "auto",
189189
convert: ConvertOption = "auto",
190190
tokenizer: str | None = None,
191-
tokenizer_mode: TokenizerMode = "auto",
191+
tokenizer_mode: TokenizerMode | str = "auto",
192192
skip_tokenizer_init: bool = False,
193193
trust_remote_code: bool = False,
194194
allowed_local_media_path: str = "",

vllm/tokenizers/__init__.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,12 @@
44
from .hf import HfTokenizer
55
from .mistral import MistralTokenizer
66
from .protocol import TokenizerLike
7-
from .registry import TokenizerRegistry
7+
from .registry import TokenizerRegistry, get_tokenizer
88

9-
__all__ = ["TokenizerLike", "HfTokenizer", "MistralTokenizer", "TokenizerRegistry"]
9+
__all__ = [
10+
"TokenizerLike",
11+
"HfTokenizer",
12+
"MistralTokenizer",
13+
"TokenizerRegistry",
14+
"get_tokenizer",
15+
]

0 commit comments

Comments
 (0)