@@ -230,6 +230,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
230230 - hidden_size must match the hidden size of language model backbone.
231231 - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
232232 format
233+ - second_per_grid_ts: The video time interval (in seconds) for each
234+ grid along the temporal dimension in the 3D position IDs. Returned
235+ when `videos` is not `None`.
233236 """
234237
235238 type: Literal["video_embeds"]
@@ -244,6 +247,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
244247 TensorShape("nv", 3),
245248 ]
246249
250+ second_per_grid_ts: Annotated[
251+ torch.Tensor | None,
252+ TensorShape("nv"),
253+ ] = None
254+
247255
248256 Qwen2_5_VLVideoInputs: TypeAlias = (
249257 Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
@@ -1311,6 +1319,7 @@ def _parse_and_validate_video_input(
13111319 type="video_embeds",
13121320 video_embeds=video_embeds,
13131321 video_grid_thw=video_grid_thw,
1322+ second_per_grid_ts=second_per_grid_ts,
13141323 )
13151324
13161325 def _process_image_input(
@@ -1422,7 +1431,13 @@ def _postprocess_video_embeds_evs(
14221431
14231432 # Cast to long to match the original code
14241433 # https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa
1425- second_per_grid_ts = video_input["second_per_grid_ts"].long()
1434+ second_per_grid_ts = video_input.get("second_per_grid_ts")
1435+ if second_per_grid_ts is None:
1436+ raise ValueError(
1437+ "second_per_grid_ts is required when video_pruning_rate > 0 "
1438+ "is enabled for video inputs, including the video_embeds path."
1439+ )
1440+ second_per_grid_ts = second_per_grid_ts.long()
14261441 tokens_per_second = self.config.vision_config.tokens_per_second
14271442
14281443 video_embeds_out = []
0 commit comments