Commit d183dcb

Author: PatchouliTaisa
Commit message: fix typo error
Signed-off-by: PatchouliTaisa <[email protected]>
1 parent: c92e4b8

6 files changed: +826 -18 lines changed

tests/v1/e2e/test_async_scheduling.py

Lines changed: 40 additions & 3 deletions
```diff
@@ -73,7 +73,7 @@ def test_without_spec_decoding(
     run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
 
 
-def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+def test_with_eagle3_spec_decoding(monkeypatch: pytest.MonkeyPatch):
     """Test consistency and acceptance rates with some different combos of
     preemption, executor, async scheduling, prefill chunking,
     spec decoding model length.
@@ -106,6 +106,42 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
     run_tests(monkeypatch, MTP_MODEL, test_configs, [{}])
 
 
+def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+    """Test ngram_gpu speculative decoding with different configurations.
+
+    This test specifically validates ngram_gpu behavior with various:
+    - Number of speculative tokens (2-6)
+    - Prompt lookup window sizes (min/max)
+    - Async scheduling enabled (as in production)
+    - Different executors and chunking settings
+    """
+
+    # Variant with larger speculation window
+    ngram_gpu_config = {
+        "method": "ngram_gpu",
+        "num_speculative_tokens": 3,
+        "prompt_lookup_max": 3,
+        "prompt_lookup_min": 2,
+    }
+
+    # Test configurations covering various scenarios
+    # test_preemption, executor, async_scheduling,
+    # spec_config, test_prefill_chunking
+    test_configs = [
+        (False, "mp", False, None, False),
+        (False, "mp", False, ngram_gpu_config, False),
+        (True, "mp", False, ngram_gpu_config, True),
+        (False, "mp", True, ngram_gpu_config, False),
+        (True, "mp", True, ngram_gpu_config, False),
+        (True, "uni", True, ngram_gpu_config, False),
+        (True, "mp", True, ngram_gpu_config, True),
+    ]
+
+    # Use MODEL (Qwen) for ngram_gpu tests as it's lighter weight
+    # and ngram_gpu doesn't require a specific draft model
+    run_tests(monkeypatch, MODEL, test_configs, [{}])
+
+
 @dynamo_config.patch(cache_size_limit=16)
 def run_tests(
     monkeypatch: pytest.MonkeyPatch,
@@ -217,18 +253,19 @@ def run_test(
         else dict(gpu_memory_utilization=0.9)
     )
     spec_mml = (spec_config or {}).get("max_model_len")
+    spec_method = (spec_config or {}).get("method", "none")
     test_config = (
         f"executor={executor}, preemption={test_preemption}, "
         f"async_sched={async_scheduling}, "
         f"chunk_prefill={test_prefill_chunking}, "
-        f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
+        f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}"
     )
     print("-" * 80)
     print(f"---- TESTING {test_str}: {test_config}")
     print("-" * 80)
     with VllmRunner(
         model,
-        max_model_len=512,
+        max_model_len=4096,
         enable_chunked_prefill=test_prefill_chunking,
         # Force prefill chunking
         max_num_batched_tokens=48 if test_prefill_chunking else None,
```
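
Outside the test harness, the same ngram_gpu configuration can be exercised through vLLM's offline API. The sketch below is a minimal, hedged example: the model name and sampling values are illustrative, and the speculative_config/async_scheduling keyword arguments are assumed to correspond to the engine settings this commit touches, not taken from the diff itself.

```python
# Minimal sketch (not from the commit): running the ngram_gpu config that
# the new test exercises. Model name and sampling values are illustrative.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # assumed lightweight target model
    speculative_config={
        "method": "ngram_gpu",        # GPU n-gram proposer; no draft model needed
        "num_speculative_tokens": 3,  # draft tokens proposed per step
        "prompt_lookup_max": 3,       # largest n-gram matched against the prompt
        "prompt_lookup_min": 2,       # smallest n-gram matched
    },
    async_scheduling=True,  # the setting this commit's validation now permits
)

outputs = llm.generate(
    ["Speculative decoding speeds up inference by"],
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```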

vllm/config/speculative.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -40,13 +40,15 @@
     "pangu_ultra_moe_mtp",
 ]
 EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
+NgramGPUTypes = Literal["ngram_gpu"]
 SpeculativeMethod = Literal[
     "ngram",
     "medusa",
     "mlp_speculator",
     "draft_model",
     "suffix",
     EagleModelTypes,
+    NgramGPUTypes,
 ]
 
 
@@ -260,6 +262,8 @@ def __post_init__(self):
             self.quantization = self.target_model_config.quantization
         elif self.method in ("ngram", "[ngram]"):
             self.model = "ngram"
+        elif self.method == "ngram_gpu":
+            self.model = "ngram_gpu"
         elif self.method == "suffix":
             self.model = "suffix"
         else:
@@ -274,9 +278,10 @@ def __post_init__(self):
         ):
             self.method = "ngram"
 
-        if self.method in ("ngram", "[ngram]"):
+        if self.method in ("ngram", "[ngram]", "ngram_gpu"):
             # Unified to "ngram" internally
-            self.method = "ngram"
+            if self.method in ("ngram", "[ngram]"):
+                self.method = "ngram"
             # Set default values if not provided
             if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
                 # TODO(woosuk): Tune these values. They are arbitrarily chosen.
```
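
The control flow added to __post_init__ is easy to misread: "ngram" and the legacy "[ngram]" spelling still collapse to "ngram", while "ngram_gpu" keeps its name so the GPU proposer can be selected later, yet all three fall through to the shared prompt-lookup defaults. A standalone sketch of just that branch (normalize_spec_method is a hypothetical helper, not vLLM API):

```python
# Hypothetical helper mirroring the normalization branch above; not vLLM API.
def normalize_spec_method(method: str) -> str:
    """Collapse ngram spellings while preserving the GPU variant."""
    if method in ("ngram", "[ngram]", "ngram_gpu"):
        # Legacy "[ngram]" and plain "ngram" unify internally; "ngram_gpu"
        # keeps its name but shares the prompt-lookup defaults downstream.
        if method in ("ngram", "[ngram]"):
            return "ngram"
    return method

assert normalize_spec_method("[ngram]") == "ngram"
assert normalize_spec_method("ngram_gpu") == "ngram_gpu"
assert normalize_spec_method("eagle3") == "eagle3"
```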

vllm/config/vllm.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -21,7 +21,7 @@
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
-from vllm.config.speculative import EagleModelTypes
+from vllm.config.speculative import EagleModelTypes, NgramGPUTypes
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid
@@ -378,10 +378,12 @@ def __post_init__(self):
         # Currently, async scheduling only support eagle speculative
         # decoding.
         if self.speculative_config is not None:
-            if self.speculative_config.method not in get_args(EagleModelTypes):
+            if self.speculative_config.method not in get_args(
+                EagleModelTypes
+            ) and self.speculative_config.method not in get_args(NgramGPUTypes):
                 raise ValueError(
                     "Currently, async scheduling is only supported "
-                    "with EAGLE/MTP kind of speculative decoding"
+                    "with EAGLE/MTP/NGram GPU kind of speculative decoding"
                 )
             if self.speculative_config.disable_padded_drafter_batch:
                 raise ValueError(
```
