Skip to content

Commit d84d8f4

Browse files
Fix EVS crash when using video_embeds inputs in Qwen2.5-VL (#29232)
Signed-off-by: zitian.zhao <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
1 parent ae66818 commit d84d8f4

File tree

1 file changed

+16
-1
lines changed

1 file changed

+16
-1
lines changed

vllm/model_executor/models/qwen2_5_vl.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -230,6 +230,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
230230
- hidden_size must match the hidden size of language model backbone.
231231
- video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
232232
format
233+
- second_per_grid_ts: The video time interval (in seconds) for each
234+
grid along the temporal dimension in the 3D position IDs. Returned
235+
when `videos` is not `None`.
233236
"""
234237

235238
type: Literal["video_embeds"]
@@ -244,6 +247,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
244247
TensorShape("nv", 3),
245248
]
246249

250+
second_per_grid_ts: Annotated[
251+
torch.Tensor | None,
252+
TensorShape("nv"),
253+
] = None
254+
247255

248256
Qwen2_5_VLVideoInputs: TypeAlias = (
249257
Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
@@ -1311,6 +1319,7 @@ def _parse_and_validate_video_input(
13111319
type="video_embeds",
13121320
video_embeds=video_embeds,
13131321
video_grid_thw=video_grid_thw,
1322+
second_per_grid_ts=second_per_grid_ts,
13141323
)
13151324

13161325
def _process_image_input(
@@ -1422,7 +1431,13 @@ def _postprocess_video_embeds_evs(
14221431

14231432
# Cast to long to match the original code
14241433
# https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa
1425-
second_per_grid_ts = video_input["second_per_grid_ts"].long()
1434+
second_per_grid_ts = video_input.get("second_per_grid_ts")
1435+
if second_per_grid_ts is None:
1436+
raise ValueError(
1437+
"second_per_grid_ts is required when video_pruning_rate > 0 "
1438+
"is enabled for video inputs, including the video_embeds path."
1439+
)
1440+
second_per_grid_ts = second_per_grid_ts.long()
14261441
tokens_per_second = self.config.vision_config.tokens_per_second
14271442

14281443
video_embeds_out = []

0 commit comments

Comments
 (0)