
Commit 32a6d03

Merge branch 'main' into fix-gpt-oss-tool-parser-v2
2 parents: 06b38e5 + d84d8f4

File tree: 23 files changed, +128 −89 lines


setup.py

Lines changed: 0 additions & 34 deletions
@@ -74,18 +74,6 @@ def is_ninja_available() -> bool:
     return which("ninja") is not None
 
 
-def is_url_available(url: str) -> bool:
-    from urllib.request import urlopen
-
-    status = None
-    try:
-        with urlopen(url) as f:
-            status = f.status
-    except Exception:
-        return False
-    return status == 200
-
-
 class CMakeExtension(Extension):
     def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
         super().__init__(name, sources=[], py_limited_api=True, **kwa)
@@ -533,28 +521,6 @@ def get_nvcc_cuda_version() -> Version:
     return nvcc_cuda_version
 
 
-def get_gaudi_sw_version():
-    """
-    Returns the driver version.
-    """
-    # Enable console printing for `hl-smi` check
-    output = subprocess.run(
-        "hl-smi",
-        shell=True,
-        text=True,
-        capture_output=True,
-        env={"ENABLE_CONSOLE": "true"},
-    )
-    if output.returncode == 0 and output.stdout:
-        return (
-            output.stdout.split("\n")[2]
-            .replace(" ", "")
-            .split(":")[1][:-1]
-            .split("-")[0]
-        )
-    return "0.0.0"  # when hl-smi is not available
-
-
 def get_vllm_version() -> str:
     # Allow overriding the version. This is useful to build platform-specific
     # wheels (e.g. CPU, TPU) without modifying the source.

tests/v1/attention/test_mla_backends.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@
 
 BACKEND_BLOCK_SIZES = {}
 for backend in BACKENDS_TO_TEST:
-    supported_sizes = backend.get_class().supported_kernel_block_sizes
+    supported_sizes = backend.get_class().get_supported_kernel_block_sizes()
     if supported_sizes:
         default_size = supported_sizes[0]
         block_size = (

tests/v1/sample/test_logprobs.py

Lines changed: 9 additions & 4 deletions
@@ -521,8 +521,8 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
         pytest.param(
             (
                 "eagle",
-                "meta-llama/Llama-3.1-8B-Instruct",
-                "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
+                "meta-llama/Llama-3.2-1B-Instruct",
+                "nm-testing/Llama3_2_1B_speculator.eagle3",
             ),
             marks=large_gpu_mark(min_gb=32),
         ),
@@ -541,7 +541,7 @@ def test_spec_decode_logprobs(
     """
    from vllm import LLM
 
-    prompt = "Hello world"
+    prompt = "Hello world " * 50
     sampling_params = SamplingParams(
         temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
     )
@@ -582,6 +582,9 @@ def test_spec_decode_logprobs(
         seed=42,
         logprobs_mode=logprobs_mode,
         gpu_memory_utilization=0.4,
+        # Force prefill chunking
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=32,
     )
     spec_results = spec_llm.generate([prompt], sampling_params)
     # Collect logprobs outputs from spec decode LLM.
@@ -597,6 +600,8 @@
     # Per-token logprobs are expected to be the same.
     assert len(ref_logprobs) == len(spec_logprobs)
     for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
-        assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
+        assert math.isclose(
+            ref_logprob.logprob, spec_logprob.logprob, rel_tol=5e-2, abs_tol=1e-1
+        )
         assert ref_logprob.rank == spec_logprob.rank
         assert ref_logprob.decoded_token == spec_logprob.decoded_token
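
For context on the relaxed tolerance: `math.isclose(a, b, rel_tol=..., abs_tol=...)` passes when `abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)`, so the new check accepts either a 5% relative gap or a 0.1 absolute gap, whichever is larger. A minimal sketch with hypothetical logprob values (not taken from the test):

```python
import math

# Hypothetical per-token logprobs; absolute difference is 0.15.
ref, spec = -4.20, -4.35

# Old check: abs_tol=1e-3 only (rel_tol defaults to 1e-9) -> fails.
print(math.isclose(ref, spec, abs_tol=1e-3))                # False

# New check: passes because 0.15 <= max(0.05 * 4.35, 0.1) = 0.2175.
print(math.isclose(ref, spec, rel_tol=5e-2, abs_tol=1e-1))  # True
```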

tests/v1/worker/test_gpu_model_runner.py

Lines changed: 3 additions & 1 deletion
@@ -185,7 +185,9 @@ def _make_mock_backend_for_kernel_block_size(
     supported_sizes: list[int | MultipleOf],
 ):
     class _MockBackend:
-        supported_kernel_block_sizes = supported_sizes
+        @staticmethod
+        def get_supported_kernel_block_sizes():
+            return supported_sizes
 
     return _MockBackend()

vllm/attention/backends/abstract.py

Lines changed: 7 additions & 3 deletions
@@ -46,9 +46,12 @@ class AttentionBackend(ABC):
     # makes sure the output tensor is allocated inside the cudagraph.
     accept_output_buffer: bool = False
     supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
-    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(1)]
     supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = ["auto"]
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        return [MultipleOf(1)]
+
     @staticmethod
     @abstractmethod
     def get_name() -> str:
@@ -142,10 +145,11 @@ def supports_block_size(cls, block_size: int | None) -> bool:
         if block_size not in valid_sizes:
             return False
 
-        if not cls.supported_kernel_block_sizes:
+        supported_kernel_block_sizes = cls.get_supported_kernel_block_sizes()
+        if not supported_kernel_block_sizes:
             return True
 
-        for supported_size in cls.supported_kernel_block_sizes:
+        for supported_size in supported_kernel_block_sizes:
             if isinstance(supported_size, MultipleOf):
                 supported_size = supported_size.base
             # With hybrid_blocks feature, the framework-level block size
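
To illustrate the shape of this change: block-size support becomes a hook that subclasses override, and `supports_block_size` consumes its result, treating `MultipleOf` entries as divisibility constraints. The sketch below is a simplified, self-contained stand-in (no KV-cache or hybrid-block logic), not vLLM's actual classes:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class MultipleOf:
    base: int


class AttentionBackend:
    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        # Default: any block size is acceptable.
        return [MultipleOf(1)]

    @classmethod
    def supports_block_size(cls, block_size: int) -> bool:
        # Query the hook instead of a class attribute, so subclasses can
        # compute the answer at call time.
        for supported in cls.get_supported_kernel_block_sizes():
            if isinstance(supported, MultipleOf):
                if block_size % supported.base == 0:
                    return True
            elif block_size == supported:
                return True
        return False


class FixedSizeBackend(AttentionBackend):
    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [16, 32, 64]


assert AttentionBackend.supports_block_size(48)       # 48 is a multiple of 1
assert FixedSizeBackend.supports_block_size(32)       # exact match
assert not FixedSizeBackend.supports_block_size(48)   # not in the fixed list
```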

vllm/model_executor/layers/batch_invariant.py

Lines changed: 1 addition & 1 deletion
@@ -805,11 +805,11 @@ def override_envs_for_invariance():
         "FLASH_ATTN",  # best supported backend
         "FLASHINFER",
         "FLASH_ATTN_MLA",
-        "TRITON_MLA",
         # Not yet supported MLA backends
         # "FLASHMLA",
         # "FLEX_ATTENTION",  # IMA issue even if we disable batch invariance
         # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967
+        # "TRITON_MLA",
     ]
     if curr_attn_backend not in supported_backends:
         warning = (

vllm/model_executor/model_loader/utils.py

Lines changed: 7 additions & 6 deletions
@@ -19,12 +19,6 @@
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from vllm.model_executor.models.adapters import (
-    as_embedding_model,
-    as_reward_model,
-    as_seq_cls_model,
-    try_create_mm_pooling_model_cls,
-)
 from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
 from vllm.utils.platform_utils import is_pin_memory_available
 
@@ -172,6 +166,13 @@ def device_loading_context(module: torch.nn.Module, target_device: torch.device)
 
 
 def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
+    from vllm.model_executor.models.adapters import (
+        as_embedding_model,
+        as_reward_model,
+        as_seq_cls_model,
+        try_create_mm_pooling_model_cls,
+    )
+
     architectures = getattr(model_config.hf_config, "architectures", [])
 
     model_cls, arch = model_config.registry.resolve_model_cls(
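
The adapters import moves from module level into `_get_model_architecture`, the only function that uses it; a deferred import like this is the usual Python way to break an import cycle or postpone import cost. A tiny stdlib-only sketch of the mechanics, unrelated to vLLM's modules:

```python
import sys


def parse_config(text: str) -> dict:
    # The import executes on the first call only; later calls hit the
    # sys.modules cache, so the repeated cost is a single dict lookup.
    import json

    return json.loads(text)


print("json" in sys.modules)               # often False before the first call
print(parse_config('{"block_size": 16}'))  # {'block_size': 16}
print("json" in sys.modules)               # True afterwards
```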

vllm/model_executor/models/qwen2_5_vl.py

Lines changed: 16 additions & 1 deletion
@@ -230,6 +230,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
     - hidden_size must match the hidden size of language model backbone.
     - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
       format
+    - second_per_grid_ts: The video time interval (in seconds) for each
+      grid along the temporal dimension in the 3D position IDs. Returned
+      when `videos` is not `None`.
     """
 
     type: Literal["video_embeds"]
@@ -244,6 +247,11 @@
         TensorShape("nv", 3),
     ]
 
+    second_per_grid_ts: Annotated[
+        torch.Tensor | None,
+        TensorShape("nv"),
+    ] = None
+
 
 Qwen2_5_VLVideoInputs: TypeAlias = (
     Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
@@ -1311,6 +1319,7 @@ def _parse_and_validate_video_input(
             type="video_embeds",
             video_embeds=video_embeds,
             video_grid_thw=video_grid_thw,
+            second_per_grid_ts=second_per_grid_ts,
         )
 
     def _process_image_input(
@@ -1422,7 +1431,13 @@ def _postprocess_video_embeds_evs(
 
         # Cast to long to match the original code
         # https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa
-        second_per_grid_ts = video_input["second_per_grid_ts"].long()
+        second_per_grid_ts = video_input.get("second_per_grid_ts")
+        if second_per_grid_ts is None:
+            raise ValueError(
+                "second_per_grid_ts is required when video_pruning_rate > 0 "
+                "is enabled for video inputs, including the video_embeds path."
+            )
+        second_per_grid_ts = second_per_grid_ts.long()
         tokens_per_second = self.config.vision_config.tokens_per_second
 
         video_embeds_out = []
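
The new field is optional in the schema, so the EVS post-processing path now reads it with `.get()` and raises a descriptive error instead of a bare `KeyError`. A minimal sketch of that pattern; the `TypedDict` and function names below are illustrative stand-ins, not vLLM's `TensorSchema` classes:

```python
from typing import TypedDict

import torch


class VideoEmbeddingInputs(TypedDict, total=False):
    video_embeds: torch.Tensor
    second_per_grid_ts: torch.Tensor | None


def temporal_intervals(video_input: VideoEmbeddingInputs) -> torch.Tensor:
    # Optional field: a missing or None value produces a clear error message
    # rather than a KeyError deep inside the pruning code.
    second_per_grid_ts = video_input.get("second_per_grid_ts")
    if second_per_grid_ts is None:
        raise ValueError("second_per_grid_ts is required when video pruning is enabled")
    return second_per_grid_ts.long()


print(temporal_intervals({"second_per_grid_ts": torch.tensor([1.0, 2.0])}))  # tensor([1, 2])
```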

vllm/v1/attention/backends/flash_attn.py

Lines changed: 21 additions & 6 deletions
@@ -32,7 +32,7 @@
     get_scheduler_metadata,
     reshape_and_cache_flash,
 )
-from vllm.config import VllmConfig, get_layers_from_vllm_config
+from vllm.config import VllmConfig, get_current_vllm_config, get_layers_from_vllm_config
 from vllm.config.cache import CacheDType
 from vllm.distributed.parallel_state import get_dcp_group
 from vllm.logger import init_logger
@@ -56,11 +56,26 @@
 class FlashAttentionBackend(AttentionBackend):
     accept_output_buffer: bool = True
     supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
-    # NOTE(tdoublep): while in principle, FA supports
-    # MultipleOf(16), these are the block sizes that do not
-    # suffer from the NaN propagation problem described here:
-    # https://github.com/Dao-AILab/flash-attention/issues/1974
-    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64]
+
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        vllm_config = get_current_vllm_config()
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        if (
+            model_config
+            and model_config.is_hybrid
+            and (
+                cache_config.mamba_ssm_cache_dtype == "float32"
+                or cache_config.mamba_cache_dtype == "float32"
+            )
+        ):
+            # NOTE(tdoublep): while in principle, FA supports
+            # MultipleOf(16), these are the block sizes that do not
+            # suffer from the NaN propagation problem described here:
+            # https://github.com/Dao-AILab/flash-attention/issues/1974
+            return [16, 32, 64]
+        return [MultipleOf(16)]
 
     @staticmethod
     def get_name() -> str:
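
The FlashAttention block-size list is now computed at call time from the live vLLM config instead of being a class constant: hybrid attention/Mamba models with a float32 Mamba cache are pinned to 16/32/64, everything else keeps the looser multiple-of-16 rule. A self-contained sketch of that decision under stand-in names (`HybridCacheConfig` and `select_block_sizes` are illustrative, not vLLM APIs):

```python
from dataclasses import dataclass


@dataclass
class HybridCacheConfig:
    is_hybrid: bool = False
    mamba_cache_dtype: str = "auto"
    mamba_ssm_cache_dtype: str = "auto"


def select_block_sizes(cfg: HybridCacheConfig) -> list[int] | None:
    # Hybrid models with float32 Mamba caches are restricted to the block
    # sizes that avoid the FlashAttention NaN-propagation issue linked above;
    # None here stands for "any multiple of 16".
    if cfg.is_hybrid and "float32" in (cfg.mamba_cache_dtype, cfg.mamba_ssm_cache_dtype):
        return [16, 32, 64]
    return None


print(select_block_sizes(HybridCacheConfig(is_hybrid=True, mamba_ssm_cache_dtype="float32")))  # [16, 32, 64]
print(select_block_sizes(HybridCacheConfig()))  # None
```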

vllm/v1/attention/backends/flashinfer.py

Lines changed: 6 additions & 6 deletions
@@ -16,7 +16,6 @@
 from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache
 from flashinfer.prefill import trtllm_batch_context_with_kv_cache
 from flashinfer.utils import FP4Tensor
-from typing_extensions import override
 
 from vllm import envs
 from vllm.attention.backends.abstract import (
@@ -275,17 +274,19 @@ def run(
 class FlashInferBackend(AttentionBackend):
     accept_output_buffer: bool = True
     supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
-    # Note: Not sure for all platforms,
-    # but on Blackwell, only support a page size of
-    # 16, 32, 64
-    supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64]
     supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
         "auto",
         "fp8",
         "fp8_e4m3",
         "fp8_e5m2",
     ]
 
+    @staticmethod
+    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
+        # Note: Not sure for all platforms, but on Blackwell,
+        # only support a page size of 16, 32, 64.
+        return [16, 32, 64]
+
     @staticmethod
     def get_name() -> str:
         return "FLASHINFER"
@@ -566,7 +567,6 @@ def __init__(
         )
 
     @classmethod
-    @override
     def get_cudagraph_support(
         cls: type["FlashInferMetadataBuilder"],
         vllm_config: VllmConfig,
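
On the removed `@override` import: the decorator only marks intent for static type checkers (which then flag a method that does not actually override anything in a base class) and changes nothing at runtime, so dropping it here is behavior-neutral. A small illustrative example, unrelated to the FlashInfer classes:

```python
from typing_extensions import override


class Base:
    def get_cudagraph_support(self) -> str:
        return "never"


class Derived(Base):
    @override  # a type checker errors if Base had no method of this name
    def get_cudagraph_support(self) -> str:
        return "uniform"


print(Derived().get_cudagraph_support())  # "uniform" -- same with or without @override
```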
