5 changes: 4 additions & 1 deletion benchmarks/benchmark_ngram_proposer.py
@@ -108,7 +108,10 @@ def benchmark_batched_propose(args):
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )

     # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
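The same migration pattern repeats through every file in this diff: build the ModelConfig first, then forward its max_model_len and is_encoder_decoder into SchedulerConfig instead of leaving them to SchedulerConfig defaults. A minimal sketch of that ordering, with the model name, dtype, and seed borrowed from the test files below (constructing a ModelConfig fetches the Hugging Face model config, so this assumes network access or a local cache; the remaining VllmConfig fields are left at their defaults):

    from vllm.config import ModelConfig, SchedulerConfig, VllmConfig

    # Build the model config first so its fields can be forwarded.
    model_config = ModelConfig(model="facebook/opt-125m", dtype="float16", seed=42)

    # SchedulerConfig now expects these two values to be passed explicitly.
    scheduler_config = SchedulerConfig(
        max_model_len=model_config.max_model_len,
        is_encoder_decoder=model_config.is_encoder_decoder,
    )

    vllm_config = VllmConfig(
        model_config=model_config,
        scheduler_config=scheduler_config,
    )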
15 changes: 10 additions & 5 deletions tests/compile/test_fusion_attn.py
@@ -318,13 +318,18 @@ def test_attention_quant_pattern(
     torch.set_default_dtype(dtype)
     torch.manual_seed(42)

+    model_config = ModelConfig(
+        model=model_name,
+        max_model_len=2048,
+        dtype=dtype,
+    )
     vllm_config = VllmConfig(
-        model_config=ModelConfig(
-            model=model_name,
-            max_model_len=2048,
-            dtype=dtype,
+        model_config=model_config,
+        scheduler_config=SchedulerConfig(
+            max_num_seqs=1024,
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
         ),
-        scheduler_config=SchedulerConfig(max_num_seqs=1024),
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             custom_ops=custom_ops_list,
25 changes: 17 additions & 8 deletions tests/lora/test_worker.py
@@ -33,14 +33,16 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
             lora_requests, lora_mapping
         )

+    model_config = ModelConfig(
+        MODEL_PATH,
+        seed=0,
+        dtype="float16",
+        max_model_len=127,
+        enforce_eager=True,
+    )
+
     vllm_config = VllmConfig(
-        model_config=ModelConfig(
-            MODEL_PATH,
-            seed=0,
-            dtype="float16",
-            max_model_len=127,
-            enforce_eager=True,
-        ),
+        model_config=model_config,
         load_config=LoadConfig(
             download_dir=None,
             load_format="dummy",
@@ -50,7 +52,14 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
             tensor_parallel_size=1,
             data_parallel_size=1,
         ),
-        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+            runner_type="generate",
+            max_num_batched_tokens=32,
+            max_num_seqs=32,
+            max_num_partial_prefills=32,
+        ),
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
13 changes: 13 additions & 0 deletions tests/test_config.py
@@ -6,12 +6,14 @@
 from unittest.mock import patch

 import pytest
+from pydantic import ValidationError

 from vllm.compilation.backends import VllmBackend
 from vllm.config import (
     CompilationConfig,
     ModelConfig,
     PoolerConfig,
+    SchedulerConfig,
     VllmConfig,
     update_config,
 )
@@ -1095,3 +1097,14 @@ def test_vllm_config_explicit_overrides():
     # Other fields should still use defaults
     assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE
     assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
+
+
+def test_scheduler_config_init():
+    with pytest.raises(ValidationError):
+        # Positional InitVars missing
+        # (InitVars cannot have defaults otherwise they will become attributes)
+        SchedulerConfig()
+
+    with pytest.raises(AttributeError):
+        # InitVar does not become an attribute
+        print(SchedulerConfig.default_factory().max_model_len)
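The comments in test_scheduler_config_init describe standard dataclass InitVar behaviour: an InitVar is consumed by __init__ and __post_init__ but never stored on the instance, and giving it a default would surface it as a class-level attribute instead. A minimal sketch with a plain dataclass (SchedulerConfig itself is a pydantic-validated config, hence the ValidationError above, but the InitVar semantics are analogous; the field names here are purely illustrative):

    from dataclasses import InitVar, dataclass

    @dataclass
    class Sketch:
        # Accepted by __init__ and handed to __post_init__,
        # but never stored as an instance attribute.
        max_model_len: InitVar[int]
        max_num_batched_tokens: int = 0

        def __post_init__(self, max_model_len: int) -> None:
            # Derive dependent fields from the InitVar here.
            if self.max_num_batched_tokens == 0:
                self.max_num_batched_tokens = max_model_len

    s = Sketch(max_model_len=2048)
    print(s.max_num_batched_tokens)  # 2048
    # s.max_model_len would raise AttributeError, mirroring the test above.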
2 changes: 2 additions & 0 deletions tests/v1/attention/utils.py
@@ -185,6 +185,8 @@ def create_vllm_config(
         max_num_seqs=max_num_seqs,
         max_num_batched_tokens=max_num_batched_tokens,
         enable_chunked_prefill=enable_chunked_prefill,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )

     device_config = DeviceConfig()
11 changes: 9 additions & 2 deletions tests/v1/core/test_kv_cache_utils.py
@@ -1128,7 +1128,11 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
         dtype="float16",
         max_model_len=max_model_len,
     )
-    scheduler_config = SchedulerConfig(max_num_batched_tokens=32768)
+    scheduler_config = SchedulerConfig(
+        max_num_batched_tokens=32768,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )

     vllm_config = VllmConfig(
         model_config=model_config,
@@ -1163,7 +1167,10 @@ def test_get_max_concurrency_for_kv_cache_config():
         max_model_len=max_model_len,
     )
     scheduler_config = SchedulerConfig(
-        max_num_batched_tokens=1024, enable_chunked_prefill=True
+        max_num_batched_tokens=1024,
+        enable_chunked_prefill=True,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )

     vllm_config = VllmConfig(
13 changes: 7 additions & 6 deletions tests/v1/core/test_scheduler.py
@@ -1508,6 +1508,12 @@ def create_scheduler_with_priority(
     Returns:
       {class}`Scheduler` instance with priority scheduling
     """
+    model_config = ModelConfig(
+        model=model,
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+    )
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -1517,14 +1523,9 @@ def create_scheduler_with_priority(
         long_prefill_token_threshold=long_prefill_token_threshold,
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
+        is_encoder_decoder=model_config.is_encoder_decoder,
         policy="priority",  # Enable priority scheduling
     )
-    model_config = ModelConfig(
-        model=model,
-        trust_remote_code=True,
-        dtype="float16",
-        seed=42,
-    )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
         block_size=block_size,
15 changes: 8 additions & 7 deletions tests/v1/core/utils.py
@@ -69,6 +69,13 @@ def create_scheduler(
     Returns:
       {class}`Scheduler` instance
     """
+    model_config = ModelConfig(
+        model=model,
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+        skip_tokenizer_init=skip_tokenizer_init,
+    )
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -79,13 +86,7 @@ def create_scheduler(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=enable_chunked_prefill,
         async_scheduling=async_scheduling,
-    )
-    model_config = ModelConfig(
-        model=model,
-        trust_remote_code=True,
-        dtype="float16",
-        seed=42,
-        skip_tokenizer_init=skip_tokenizer_init,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
4 changes: 3 additions & 1 deletion tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -40,7 +40,9 @@ def _create_vllm_config(
 ) -> MagicMock:
     mock_config = MagicMock(spec=VllmConfig)
     mock_config.compilation_config = compilation_config
-    mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs)
+    mock_config.scheduler_config = SchedulerConfig.default_factory(
+        max_num_seqs=max_num_seqs,
+    )
     mock_config.parallel_config = ParallelConfig()
     mock_config.speculative_config = None  # No speculative decoding
     if not lora_config:
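Where a test builds a mocked VllmConfig and has no ModelConfig whose fields it could forward, the hunk above switches to SchedulerConfig.default_factory(), which appears to fill the now-required InitVars with defaults while still passing ordinary keyword arguments through. A sketch under that assumption:

    from unittest.mock import MagicMock

    from vllm.config import SchedulerConfig, VllmConfig

    mock_config = MagicMock(spec=VllmConfig)
    # default_factory() supplies defaults for max_model_len / is_encoder_decoder,
    # so no ModelConfig is needed here; max_num_seqs passes through as a kwarg.
    mock_config.scheduler_config = SchedulerConfig.default_factory(max_num_seqs=8)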
13 changes: 7 additions & 6 deletions tests/v1/engine/test_engine_core.py
@@ -484,19 +484,20 @@ def test_encoder_instance_zero_kv_cache(
     vision encoder, so they don't need KV cache for text generation.
     """
     # Form vllm config
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-        disable_hybrid_kv_cache_manager=True,
-    )
     model_config = ModelConfig(
         model="llava-hf/llava-1.5-7b-hf",  # Multimodal model
         enforce_eager=True,
         trust_remote_code=True,
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        disable_hybrid_kv_cache_manager=True,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=16,
         gpu_memory_utilization=gpu_memory_utilization,
13 changes: 7 additions & 6 deletions tests/v1/kv_connector/unit/utils.py
@@ -92,18 +92,19 @@ def create_vllm_config(
     enable_permute_local_kv: bool = False,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-    )
     model_config = ModelConfig(
         model=model,
         trust_remote_code=True,
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_model_len=max_model_len,
+        enable_chunked_prefill=enable_chunked_prefill,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
         block_size=block_size,
5 changes: 4 additions & 1 deletion tests/v1/spec_decode/test_eagle.py
@@ -66,7 +66,10 @@ def _create_proposer(
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )

     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
5 changes: 4 additions & 1 deletion tests/v1/spec_decode/test_mtp.py
@@ -51,7 +51,10 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )

     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
11 changes: 6 additions & 5 deletions tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -26,16 +26,17 @@


 def get_vllm_config():
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-    )
     model_config = ModelConfig(
         model="facebook/opt-125m",
         dtype="bfloat16",  # TPUs typically use bfloat16
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=16,
         gpu_memory_utilization=0.9,
20 changes: 11 additions & 9 deletions tests/v1/worker/test_gpu_model_runner.py
@@ -79,16 +79,17 @@ def initialize_kv_cache(runner: GPUModelRunner):


 def get_vllm_config():
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-    )
     model_config = ModelConfig(
         model="facebook/opt-125m",
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=BLOCK_SIZE,
         gpu_memory_utilization=0.9,
@@ -784,14 +785,15 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     initialize_model_parallel(tensor_model_parallel_size=1)
     torch.set_default_dtype(torch.float16)

+    model_config = ModelConfig(
+        model="ibm-granite/granite-4.0-tiny-preview",
+        dtype="float16",
+    )
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
         max_model_len=512,
-    )
-    model_config = ModelConfig(
-        model="ibm-granite/granite-4.0-tiny-preview",
-        dtype="float16",
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     cache_config = CacheConfig(
         block_size=BLOCK_SIZE,