Skip to content

Commit bcf454f

Browse files
author
PatchouliTaisa
committed
change blocking tensor copy to asynchronous (non_blocking) copy
Signed-off-by: PatchouliTaisa <[email protected]>
1 parent 0081487 commit bcf454f

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

vllm/v1/worker/gpu_input_batch.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,13 @@ def __init__(
123123
# Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
124124
self.req_prompt_embeds: dict[int, torch.Tensor] = {}
125125
self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
126-
self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
126+
self.num_tokens_no_spec_cpu_tensor = torch.zeros(
127+
(max_num_reqs,),
128+
device="cpu",
129+
dtype=torch.int32,
130+
pin_memory=pin_memory,
131+
)
132+
self.num_tokens_no_spec = self.num_tokens_no_spec_cpu_tensor.numpy()
127133
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
128134
self.num_computed_tokens_cpu_tensor = torch.zeros(
129135
(max_num_reqs,),

vllm/v1/worker/gpu_model_runner.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,12 @@ def _update_ngram_gpu_tensors_incremental(
11001100
self.input_batch.token_ids_cpu_tensor[new_req_idx, :num_tokens],
11011101
non_blocking=True,
11021102
)
1103-
self.num_tokens_no_spec_gpu[new_req_idx] = num_tokens
1103+
self.num_tokens_no_spec_gpu[new_req_idx : new_req_idx + 1].copy_(
1104+
self.input_batch.num_tokens_no_spec_cpu_tensor[
1105+
new_req_idx : new_req_idx + 1
1106+
],
1107+
non_blocking=True,
1108+
)
11041109

11051110
def _ngram_gpu_full_init(self) -> None:
11061111
"""Initialize all GPU tensors for ngram proposer from scratch.
@@ -1116,7 +1121,10 @@ def _ngram_gpu_full_init(self) -> None:
11161121
self.input_batch.token_ids_cpu_tensor[idx, :num_tokens],
11171122
non_blocking=True,
11181123
)
1119-
self.num_tokens_no_spec_gpu[idx] = num_tokens
1124+
self.num_tokens_no_spec_gpu[idx : idx + 1].copy_(
1125+
self.input_batch.num_tokens_no_spec_cpu_tensor[idx : idx + 1],
1126+
non_blocking=True,
1127+
)
11201128

11211129
def _update_states_after_model_execute(
11221130
self, output_token_ids: torch.Tensor

0 commit comments

Comments
 (0)