Skip to content

Commit bcf454f

Browse files
author
PatchouliTaisa
committed
change blocking tensor copy to asynchronous (non_blocking) copy
Signed-off-by: PatchouliTaisa <[email protected]>
1 parent 0081487 commit bcf454f

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

vllm/v1/worker/gpu_input_batch.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,13 @@ def __init__(
123123
# Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
124124
self.req_prompt_embeds: dict[int, torch.Tensor] = {}
125125
self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
126-
self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
126+
self.num_tokens_no_spec_cpu_tensor = torch.zeros(
127+
(max_num_reqs,),
128+
device="cpu",
129+
dtype=torch.int32,
130+
pin_memory=pin_memory,
131+
)
132+
self.num_tokens_no_spec = self.num_tokens_no_spec_cpu_tensor.numpy()
127133
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
128134
self.num_computed_tokens_cpu_tensor = torch.zeros(
129135
(max_num_reqs,),

vllm/v1/worker/gpu_model_runner.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,12 @@ def _update_ngram_gpu_tensors_incremental(
11001100
self.input_batch.token_ids_cpu_tensor[new_req_idx, :num_tokens],
11011101
non_blocking=True,
11021102
)
1103-
self.num_tokens_no_spec_gpu[new_req_idx] = num_tokens
1103+
self.num_tokens_no_spec_gpu[new_req_idx : new_req_idx + 1].copy_(
1104+
self.input_batch.num_tokens_no_spec_cpu_tensor[
1105+
new_req_idx : new_req_idx + 1
1106+
],
1107+
non_blocking=True,
1108+
)
11041109

11051110
def _ngram_gpu_full_init(self) -> None:
11061111
"""Initialize all GPU tensors for ngram proposer from scratch.
@@ -1116,7 +1121,10 @@ def _ngram_gpu_full_init(self) -> None:
11161121
self.input_batch.token_ids_cpu_tensor[idx, :num_tokens],
11171122
non_blocking=True,
11181123
)
1119-
self.num_tokens_no_spec_gpu[idx] = num_tokens
1124+
self.num_tokens_no_spec_gpu[idx : idx + 1].copy_(
1125+
self.input_batch.num_tokens_no_spec_cpu_tensor[idx : idx + 1],
1126+
non_blocking=True,
1127+
)
11201128

11211129
def _update_states_after_model_execute(
11221130
self, output_token_ids: torch.Tensor

0 commit comments

Comments
 (0)