Changes from all commits
32 commits
c92e4b8  [CI Failure] Fix Gemma3 RoPE configuration for sliding attention laye… (hl475, Nov 21, 2025)
d183dcb  fix typo error (Nov 21, 2025)
293e3ae  fix return values in ngram gpu (Nov 21, 2025)
4534c88  python3.13 pre-commit check (Nov 24, 2025)
07e6b8a  fix pre-commit and sign-off (Nov 24, 2025)
2f08629  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Nov 24, 2025)
e70b060  fix ngram gpu kernel compile issue (Nov 25, 2025)
cde94b2  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Nov 25, 2025)
33c4437  Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/… (Nov 25, 2025)
25d36b1  fix docs bug (Nov 26, 2025)
71b0dca  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Nov 26, 2025)
183556e  v.01 (Nov 29, 2025)
f6f871f  test (Nov 30, 2025)
1fbf296  fix large batch performance. (Dec 2, 2025)
b5243ec  refactor ngram gpu (Dec 2, 2025)
0081487  modify nvtx (Dec 2, 2025)
bcf454f  change copy to async (Dec 2, 2025)
0d2638b  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Dec 2, 2025)
34cc523  remove irrelevant files (Dec 2, 2025)
c9f2724  use discard_request_mask in ngram (Dec 2, 2025)
16eb87c  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Dec 2, 2025)
82ff639  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Dec 2, 2025)
3abd884  remove irrelevant computations (Dec 4, 2025)
cd9ecc9  Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/… (Dec 4, 2025)
38cf7fd  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Dec 4, 2025)
b518ef2  remove irrelevant comments (Dec 4, 2025)
d07f4a7  Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/… (Dec 4, 2025)
3d28827  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Dec 4, 2025)
8920a59  move token ids tensor gpu init inline (Dec 4, 2025)
6967bb2  Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/… (Dec 4, 2025)
25d6b1f  remove unused status check (Dec 4, 2025)
3a6df84  Merge branch 'main' into patchy/async_ngram (PatchouliTIS, Dec 4, 2025)
43 changes: 40 additions & 3 deletions tests/v1/e2e/test_async_scheduling.py
@@ -73,7 +73,7 @@ def test_without_spec_decoding(
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)


def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
def test_with_eagle3_spec_decoding(monkeypatch: pytest.MonkeyPatch):
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
spec decoding model length.
@@ -111,6 +111,42 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)


def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
"""Test ngram_gpu speculative decoding with different configurations.

This test specifically validates ngram_gpu behavior with various:
- Number of speculative tokens (2-6)
- Prompt lookup window sizes (min/max)
- Async scheduling enabled (as in production)
- Different executors and chunking settings
"""

# Variant with larger speculation window
ngram_gpu_config = {
"method": "ngram_gpu",
"num_speculative_tokens": 3,
"prompt_lookup_max": 3,
"prompt_lookup_min": 2,
}

# Test configurations covering various scenarios
# test_preemption, executor, async_scheduling,
# spec_config, test_prefill_chunking
test_configs = [
(False, "mp", False, None, False),
(False, "mp", False, ngram_gpu_config, False),
(True, "mp", False, ngram_gpu_config, True),
(False, "mp", True, ngram_gpu_config, False),
(True, "mp", True, ngram_gpu_config, False),
(True, "uni", True, ngram_gpu_config, False),
(True, "mp", True, ngram_gpu_config, True),
]

# Use MODEL (Qwen) for ngram_gpu tests as it's lighter weight
# and ngram_gpu doesn't require a specific draft model
run_tests(monkeypatch, MODEL, test_configs, [{}])


@dynamo_config.patch(cache_size_limit=16)
def run_tests(
monkeypatch: pytest.MonkeyPatch,
@@ -222,18 +258,19 @@ def run_test(
else dict(gpu_memory_utilization=0.9)
)
spec_mml = (spec_config or {}).get("max_model_len")
spec_method = (spec_config or {}).get("method", "none")
test_config = (
f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}"
)
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)
with VllmRunner(
model,
max_model_len=512,
max_model_len=4096,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
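For readers who want to exercise the new path outside of the test harness, here is a minimal usage sketch. It is an illustration under assumptions, not part of this PR: it assumes the offline `LLM` API forwards a `speculative_config` dict and an `async_scheduling` flag to the engine (the test above goes through `VllmRunner` instead), and the model name is only a placeholder.

```python
# Hedged sketch: enable the ngram_gpu speculative method with async scheduling.
# The speculative_config mirrors the ngram_gpu_config used in
# test_with_ngram_gpu_spec_decoding above; model and prompt are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # any target model; ngram_gpu needs no draft model
    speculative_config={
        "method": "ngram_gpu",
        "num_speculative_tokens": 3,
        "prompt_lookup_max": 3,
        "prompt_lookup_min": 2,
    },
    async_scheduling=True,  # assumed to be accepted as an engine argument
    max_model_len=4096,
)

outputs = llm.generate(["The quick brown fox"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```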
7 changes: 7 additions & 0 deletions vllm/compilation/backends.py
@@ -649,6 +649,13 @@ def __call__(
# Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
disable_cache = not is_compile_cache_enabled(self.inductor_config)

# TODO(patchy): ngram gpu kernel will cause vllm torch compile cache errors.

Collaborator:
Why? Can this be fixed?

Author:
I enabled torch.compile in the ngram GPU kernel, and the computational graph for the ngram operator would hit a precompiled graph cached for the main model, leading to mismatched computational-graph results. Therefore, I disabled the compile cache here directly. I tested this locally, and disabling the cache had no impact on performance.

Collaborator:
I assume disabling the compile cache would lead to longer startup time? I'm not an expert here, but maybe it's possible to add an identifier to the compile cache to avoid extraneous cache hits?

Author:
Yes, the startup time will increase a little. I attempted to add additional input parameters and other member variables to the forward method of the nn.Module decorated with @support_torch_compile to achieve cache isolation, but none of them worked. I suspect this is related to the internal implementation of @support_torch_compile within vLLM. As things stand, disabling torch.compile caching only impacts performance during the initial startup phase of the inference service.

is_ngram_gpu_enabled = (
vllm_config.speculative_config
and vllm_config.speculative_config.method == "ngram_gpu"
)
disable_cache = disable_cache or is_ngram_gpu_enabled

if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
else:
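Related to the discussion above: the surrounding code already honors `VLLM_DISABLE_COMPILE_CACHE` as a global opt-out, so the same effect as this ngram_gpu-specific guard can be forced manually. A small sketch, with the model name as a placeholder and the import-ordering detail as an assumption:

```python
import os

# Set the opt-out before constructing the engine so vLLM's env handling picks it
# up. This disables the torch.compile cache for every method, which is a superset
# of the ngram_gpu-only guard in this diff and mainly costs extra startup time.
os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"

from vllm import LLM  # noqa: E402  (import after setting the env var)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # placeholder model
```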
9 changes: 7 additions & 2 deletions vllm/config/speculative.py
@@ -39,13 +39,15 @@
"pangu_ultra_moe_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
NgramGPUTypes = Literal["ngram_gpu"]
SpeculativeMethod = Literal[
"ngram",
"medusa",
"mlp_speculator",
"draft_model",
"suffix",
EagleModelTypes,
NgramGPUTypes,
]


@@ -263,6 +265,8 @@ def __post_init__(self):
self.quantization = self.target_model_config.quantization
elif self.method in ("ngram", "[ngram]"):
self.model = "ngram"
elif self.method == "ngram_gpu":
self.model = "ngram_gpu"
elif self.method == "suffix":
self.model = "suffix"
else:
@@ -277,9 +281,10 @@
):
self.method = "ngram"

if self.method in ("ngram", "[ngram]"):
if self.method in ("ngram", "[ngram]", "ngram_gpu"):
# Unified to "ngram" internally
self.method = "ngram"
if self.method in ("ngram", "[ngram]"):
self.method = "ngram"
# Set default values if not provided
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
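A side note on the `Literal` aliases added above: Python flattens nested `Literal`s, which is why `typing.get_args` can membership-test both `EagleModelTypes` (which embeds `MTPModelTypes`) and the new `NgramGPUTypes` in the async-scheduling check below. A minimal sketch with shortened stand-in aliases (the real `MTPModelTypes` lists more entries than shown here):

```python
from typing import Literal, get_args

# Reduced stand-ins for the aliases in vllm/config/speculative.py.
MTPModelTypes = Literal["deepseek_mtp", "mimo_mtp", "pangu_ultra_moe_mtp"]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]  # nested Literal is flattened
NgramGPUTypes = Literal["ngram_gpu"]

print(get_args(EagleModelTypes))
# ('eagle', 'eagle3', 'deepseek_mtp', 'mimo_mtp', 'pangu_ultra_moe_mtp')

method = "ngram_gpu"
allowed = get_args(EagleModelTypes) + get_args(NgramGPUTypes)
print(method in allowed)  # True, so the check in vllm/config/vllm.py accepts it
```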
8 changes: 5 additions & 3 deletions vllm/config/vllm.py
@@ -21,7 +21,7 @@
from pydantic.dataclasses import dataclass

import vllm.envs as envs
from vllm.config.speculative import EagleModelTypes
from vllm.config.speculative import EagleModelTypes, NgramGPUTypes
from vllm.logger import enable_trace_function_call, init_logger
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
@@ -537,10 +537,12 @@ def __post_init__(self):
# Currently, async scheduling only support eagle speculative
# decoding.
if self.speculative_config is not None:
if self.speculative_config.method not in get_args(EagleModelTypes):
if self.speculative_config.method not in get_args(
EagleModelTypes
) and self.speculative_config.method not in get_args(NgramGPUTypes):
raise ValueError(
"Currently, async scheduling is only supported "
"with EAGLE/MTP kind of speculative decoding"
"with EAGLE/MTP/NGram GPU kind of speculative decoding"
)
if self.speculative_config.disable_padded_drafter_batch:
raise ValueError(
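To make the relaxed guard concrete, here is a small standalone sketch (not vLLM's actual class) of the validation logic above: EAGLE/MTP methods and the new ngram_gpu pass when async scheduling is enabled, while other speculative methods still raise.

```python
from typing import Literal, get_args

# Stand-ins for the aliases imported at the top of vllm/config/vllm.py; the real
# EagleModelTypes covers additional MTP variants.
EagleModelTypes = Literal["eagle", "eagle3", "deepseek_mtp"]
NgramGPUTypes = Literal["ngram_gpu"]


def check_async_scheduling(spec_method: str | None) -> None:
    """Mirror of the guard in VllmConfig.__post_init__ shown in the diff above."""
    if spec_method is None:
        return
    if spec_method not in get_args(EagleModelTypes) and spec_method not in get_args(
        NgramGPUTypes
    ):
        raise ValueError(
            "Currently, async scheduling is only supported "
            "with EAGLE/MTP/NGram GPU kind of speculative decoding"
        )


check_async_scheduling("ngram_gpu")  # passes with this PR
check_async_scheduling("eagle3")     # passes as before
try:
    check_async_scheduling("medusa")
except ValueError as e:
    print(e)
```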