Skip to content

Commit e542c22

Browse files
committed
bugfix: fix coredump issue when both prefixcache and mtp are enabled. (#377)
* bugfix: fix coredump issue when both prefix cache and MTP (multi-token prediction) are enabled.
* bugfix: fix coredump caused by incorrect token replacement.
1 parent bf9b671 commit e542c22

File tree

8 files changed

+47
-23
lines changed

8 files changed

+47
-23
lines changed

xllm/core/framework/model/model_args.h

file mode changed: 100755 → 100644
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ struct ModelArgs {
5454
PROPERTY(int64_t, vocab_size) = -1;
5555

5656
PROPERTY(bool, use_qk_norm) = false;
57-
57+
5858
PROPERTY(float, rms_norm_eps) = 0.0f;
5959

6060
PROPERTY(float, layer_norm_eps) = 0.0f;

xllm/core/framework/request/sequence_kv_state.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ void KVCacheState::add_shared_kv_blocks(std::vector<Block>&& blocks,
5858
if (blocks.empty()) {
5959
return;
6060
}
61-
6261
// The number of matched blocks may be fewer than the number of blocks held by
6362
// the sequence itself. In this case, try to replace the blocks computed by
6463
// the sequence with blocks from the prefix_cache and release the computed
@@ -86,6 +85,10 @@ void KVCacheState::add_shared_kv_blocks(std::vector<Block>&& blocks,
8685
CHECK_GT(block_size, 0);
8786
num_shared_tokens =
8887
((current_total_num_tokens - 1) / block_size) * block_size;
88+
if (num_owned_shared_blocks_ > 0) {
89+
num_owned_shared_blocks_--;
90+
blocks_.pop_back();
91+
}
8992
}
9093
CHECK_LT(num_shared_tokens, current_total_num_tokens);
9194
// update the kv cache position

xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,8 @@ void Glm4MoeDecoderImpl::initialize_basic_parameters(
382382

383383
param.mlpLinearTransposeType = {1, -1, 1, -1};
384384

385-
param.enableSplitFuse = (FLAGS_enable_chunked_prefill || FLAGS_enable_prefix_cache) && is_prefill;
385+
param.enableSplitFuse =
386+
(FLAGS_enable_chunked_prefill || FLAGS_enable_prefix_cache) && is_prefill;
386387

387388
param.moeLinearTransposeType = (layer_id_ < args.first_k_dense_replace())
388389
? std::vector<int>{-1, -1, -1, -1}
@@ -406,7 +407,7 @@ void Glm4MoeDecoderImpl::initialize_basic_parameters(
406407
param.enableSwiGLUQuantForSharedExperts = false; // TODO
407408

408409
param.useQKNorm = args.use_qk_norm();
409-
if(args.use_qk_norm()){
410+
if (args.use_qk_norm()) {
410411
WEIGHT_COUNT_PER_LAYER = 70;
411412
WEIGHT_MAPPING_W8A8["self_attn.q_norm.weight"] = Q_NORM_WEIGHT;
412413
WEIGHT_MAPPING_W8A8["self_attn.k_norm.weight"] = K_NORM_WEIGHT;

xllm/core/runtime/llm_worker_impl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ std::optional<ForwardOutput> LLMWorkerImpl::step(
189189
// should be in same prefill stage, so, to judge empty_kv_cache,
190190
// just use micro batch 0 here
191191
if (options_.enable_speculative_decode() && !is_spec_draft_) {
192-
if (input_params_micro_batches[0].q_seq_lens_vec[0] > 1) {
192+
if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
193193
output.sample_output.embeddings = hidden_states;
194194
} else if (concated_sampling_params.sample_idxes.defined()) {
195195
// auto sample_idxes =

xllm/core/runtime/speculative_worker_impl.cpp

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step(
173173
}
174174

175175
// TODO: support data parallel case
176-
if (inputs.micro_inputs[0].input_params.q_seq_lens_vec[0] > 1) {
176+
if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
177177
return step_prefill(inputs);
178178
} else {
179179
return step_decode(inputs);
@@ -182,7 +182,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step(
182182

183183
std::optional<ForwardOutput> SpeculativeWorkerImpl::step_empty(
184184
const BatchedForwardInputs& inputs) {
185-
if (inputs.micro_inputs[0].input_params.q_seq_lens_vec[0] > 1) {
185+
if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
186186
auto output = impl_->step(inputs);
187187
auto draft_output = draft_impl_->step(inputs);
188188
return output;
@@ -230,7 +230,8 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_prefill(
230230
if (token_offset > 0) {
231231
prefill_inputs.micro_inputs[i].input_params.mm_data = MMData(
232232
MMType::EMBEDDING,
233-
{{"embedding", embeddings.narrow(0, token_start_idx, token_offset)}});
233+
{{"embedding",
234+
embeddings.narrow(0, token_start_idx, token_offset).clone()}});
234235
}
235236
if (next_tokens.defined()) {
236237
auto& token_ids = prefill_inputs.micro_inputs[i].token_ids;
@@ -293,6 +294,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_prefill(
293294
void SpeculativeWorkerImpl::prepare_prefill_inputs(
294295
const BatchedForwardInputs& inputs,
295296
BatchedForwardInputs& prefill_inputs) {
297+
prefill_inputs.micro_inputs.clear();
296298
prefill_inputs.micro_inputs.reserve(inputs.micro_inputs.size());
297299
for (auto i = 0; i < inputs.micro_inputs.size(); ++i) {
298300
auto& input = inputs.micro_inputs[i];
@@ -308,16 +310,16 @@ void SpeculativeWorkerImpl::prepare_prefill_inputs(
308310
int32_t start_idx = 0;
309311
std::vector<int32_t> new_token_ids;
310312
new_token_ids.reserve(input.token_ids.numel());
311-
for (size_t i = 0; i < input_params.num_sequences; ++i) {
313+
for (size_t j = 0; j < input_params.num_sequences; ++j) {
312314
int32_t q_len = 0;
313-
q_len = input_params.q_seq_lens_vec[i];
315+
q_len = input_params.q_seq_lens_vec[j];
314316
Slice<int32_t> tokens_ids_slice_i =
315317
tokens_ids_slice.slice(start_idx + 1, start_idx + q_len);
316318
start_idx += q_len;
317319
new_token_ids.insert(new_token_ids.end(),
318320
tokens_ids_slice_i.begin(),
319321
tokens_ids_slice_i.end());
320-
new_token_ids.emplace_back(extra_token_ids[i]);
322+
new_token_ids.emplace_back(extra_token_ids[j]);
321323
}
322324
prefill_input.token_ids =
323325
torch::tensor(new_token_ids, prefill_input.positions.options());
@@ -359,7 +361,11 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_decode(
359361
// final step
360362
prepare_validate_inputs(inputs, validate_inputs, true);
361363
} else {
362-
prepare_draft_inputs(draft_inputs, next_step_input, 1, device_);
364+
if (i == 0) {
365+
prepare_draft_inputs(inputs, next_step_input, 1, device_);
366+
} else {
367+
prepare_draft_inputs(draft_inputs, next_step_input, 1, device_);
368+
}
363369
}
364370
draft_outputs.push_back(std::move(future).get().value());
365371
// update input of next step
@@ -368,8 +374,8 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_decode(
368374
auto last_output = draft_outputs.back().sample_output;
369375
auto start_idx = 0;
370376
auto token_start_idx = 0;
371-
for (auto i = 0; i < draft_inputs.micro_inputs.size(); ++i) {
372-
auto& draft_input = draft_inputs.micro_inputs[i];
377+
for (auto j = 0; j < draft_inputs.micro_inputs.size(); ++j) {
378+
auto& draft_input = draft_inputs.micro_inputs[j];
373379
auto offset = draft_input.input_params.num_sequences;
374380
auto token_offset = draft_input.token_ids.size(0);
375381
draft_input.token_ids = safe_to(
@@ -379,6 +385,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_decode(
379385
MMType::EMBEDDING,
380386
{{"embedding",
381387
last_output.embeddings.narrow(0, token_start_idx, token_offset)
388+
.clone()
382389
.to(device_)}});
383390
}
384391
start_idx += offset;
@@ -394,9 +401,11 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step_decode(
394401
auto next_tokens =
395402
safe_to(draft_output.sample_output.next_tokens, torch::kInt);
396403
int32_t start_idx = 0;
397-
for (auto i = 0; i < validate_inputs.micro_inputs.size(); ++i) {
398-
int32_t offset = draft_inputs.micro_inputs[i].input_params.num_sequences;
399-
auto& validate_input = validate_inputs.micro_inputs[i];
404+
for (auto j = 0; j < validate_inputs.micro_inputs.size(); ++j) {
405+
int32_t offset =
406+
validate_inputs.micro_inputs[j].input_params.num_sequences /
407+
(options_.num_speculative_tokens() + 1);
408+
auto& validate_input = validate_inputs.micro_inputs[j];
400409
auto& token_ids = validate_input.token_ids;
401410
auto mask = (token_ids == -1 * (i + 1));
402411
token_ids.masked_scatter_(mask, next_tokens.narrow(0, start_idx, offset));
@@ -447,9 +456,10 @@ void SpeculativeWorkerImpl::prepare_draft_inputs(
447456
const int64_t offset,
448457
const torch::Device device) {
449458
// prepare input for MTP in decoding phase (Like Eagle).
459+
draft_inputs.micro_inputs.clear();
450460
draft_inputs.micro_inputs.reserve(inputs.micro_inputs.size());
451-
for (auto i = 0; i < inputs.micro_inputs.size(); ++i) {
452-
auto& input = inputs.micro_inputs[i];
461+
for (auto idx = 0; idx < inputs.micro_inputs.size(); ++idx) {
462+
auto& input = inputs.micro_inputs[idx];
453463
ForwardInput draft_input = input.to(device, dtype_);
454464

455465
auto& input_params = draft_input.input_params;
@@ -504,8 +514,8 @@ void SpeculativeWorkerImpl::prepare_validate_inputs(
504514
BatchedForwardInputs& validate_inputs,
505515
bool enable_schedule_overlap) {
506516
validate_inputs.micro_inputs.reserve(inputs.micro_inputs.size());
507-
for (auto i = 0; i < inputs.micro_inputs.size(); ++i) {
508-
auto& input = inputs.micro_inputs[i];
517+
for (auto idx = 0; idx < inputs.micro_inputs.size(); ++idx) {
518+
auto& input = inputs.micro_inputs[idx];
509519

510520
ForwardInput validate_input = input.to(device_, dtype_);
511521
auto& input_params = validate_input.input_params;
@@ -823,7 +833,7 @@ void SpeculativeWorkerImpl::update_sampling_params(
823833
void SpeculativeWorkerImpl::prepare_work_before_execute(
824834
const BatchedForwardInputs& inputs,
825835
BatchedForwardInputs& processed_inputs) {
826-
if (inputs.micro_inputs[0].input_params.q_seq_lens_vec[0] > 1) {
836+
if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
827837
WorkerImpl::prepare_work_before_execute(inputs, processed_inputs);
828838
} else {
829839
if (enable_schedule_overlap()) {

xllm/core/runtime/worker_impl.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -780,4 +780,13 @@ int64_t WorkerImpl::get_active_activation_memory() {
780780
.active_activation_memory;
781781
}
782782

783+
// Returns true if any sequence in the batch is still in the prefill phase,
// i.e. has more than one query token in this step (q_len > 1); decode steps
// contribute exactly one query token per sequence. Replaces the previous
// `q_seq_lens_vec[0] > 1` checks at call sites, which inspected only the
// first sequence (and crashed/behaved wrongly on mixed batches when prefix
// cache and MTP speculative decoding are both enabled).
bool WorkerImpl::check_is_prefill(const std::vector<int>& q_seq_lens_vec) {
784+
for (auto q_len : q_seq_lens_vec) {
785+
if (q_len > 1) {
786+
// At least one sequence has multiple pending query tokens -> prefill.
return true;
787+
}
788+
}
789+
// Every sequence has q_len <= 1 -> pure decode batch.
return false;
790+
}
791+
783792
} // namespace xllm

xllm/core/runtime/worker_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ class WorkerImpl {
166166

167167
torch::ScalarType dtype() const { return dtype_; }
168168

169+
bool check_is_prefill(const std::vector<int>& q_seq_lens_vec);
170+
169171
int32_t hidden_size() const {
170172
return context_.get_model_args().hidden_size();
171173
}

xllm/core/scheduler/continuous_scheduler.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ ContinuousScheduler::ContinuousScheduler(Engine* engine, const Options& options)
9393
} else {
9494
min_speculative_tokens_required_ = options_.num_speculative_tokens();
9595
}
96-
9796
}
9897

9998
ContinuousScheduler::~ContinuousScheduler() { running_requests_.clear(); }

0 commit comments

Comments (0)