Skip to content

Commit 86e178f

Browse files
[crashfix] Eagle + multimodal can crash on mm cache miss (#29750)
Signed-off-by: Mickael Seznec <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
1 parent 014ece9 commit 86e178f

File tree

1 file changed: +11 -3 lines changed

vllm/v1/core/sched/scheduler.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -263,6 +263,7 @@ def schedule(self) -> SchedulerOutput:
263263
request.num_computed_tokens,
264264
num_new_tokens,
265265
encoder_compute_budget,
266+
shift_computed_tokens=1 if self.use_eagle else 0,
266267
)
267268

268269
if num_new_tokens == 0:
@@ -532,6 +533,7 @@ def schedule(self) -> SchedulerOutput:
532533
num_computed_tokens,
533534
num_new_tokens,
534535
encoder_compute_budget,
536+
shift_computed_tokens=1 if self.use_eagle else 0,
535537
)
536538
if num_new_tokens == 0:
537539
# The request cannot be scheduled.
@@ -829,6 +831,7 @@ def _try_schedule_encoder_inputs(
829831
num_computed_tokens: int,
830832
num_new_tokens: int,
831833
encoder_compute_budget: int,
834+
shift_computed_tokens: int = 0,
832835
) -> tuple[list[int], int, int, list[int]]:
833836
"""
834837
Determine which encoder inputs need to be scheduled in the current step,
@@ -873,7 +876,10 @@ def _try_schedule_encoder_inputs(
873876
# The encoder output is needed if the two ranges overlap:
874877
# [num_computed_tokens, num_computed_tokens + num_new_tokens) and
875878
# [start_pos, start_pos + num_encoder_tokens)
876-
if start_pos >= num_computed_tokens + num_new_tokens:
879+
if (
880+
start_pos
881+
>= num_computed_tokens + num_new_tokens + shift_computed_tokens
882+
):
877883
# The encoder input is not needed in this step.
878884
break
879885

@@ -929,10 +935,12 @@ def _try_schedule_encoder_inputs(
929935
# NOTE(woosuk): We assume that the encoder input tokens should
930936
# be processed altogether, as the encoder usually uses
931937
# bidirectional attention.
932-
if num_computed_tokens < start_pos:
938+
if num_computed_tokens + shift_computed_tokens < start_pos:
933939
# We only schedule the decoder tokens just before the
934940
# encoder input.
935-
num_new_tokens = start_pos - num_computed_tokens
941+
num_new_tokens = start_pos - (
942+
num_computed_tokens + shift_computed_tokens
943+
)
936944
else:
937945
# Because of prefix caching, num_computed_tokens is greater
938946
# than start_pos even though its encoder input is not

0 commit comments

Comments
 (0)