
Commit de72969

aligning
Signed-off-by: Barbara Suslova <[email protected]>
1 parent f9d6dc5 commit de72969

2 files changed (+43, −29 lines):

  csrc/moe/moe_fused_gate.cu
  vllm/model_executor/layers/fused_moe/fused_moe.py

csrc/moe/moe_fused_gate.cu

Lines changed: 24 additions & 22 deletions
```diff
@@ -69,8 +69,7 @@ __device__ void moe_fused_gate_impl(void* input, void* bias, float* output_ptr,
   }
 
   // Calculate topk_excluding_share_expert_fusion from topk
-  int64_t topk_excluding_share_expert_fusion =
-      topk - (num_fused_shared_experts > 0 ? 1 : 0);
+  int64_t topk_excluding_share_expert_fusion = topk - num_fused_shared_experts;
 
   // Cast pointers to type T:
   auto* input_ptr = reinterpret_cast<T*>(input);
```
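
The old expression subtracted at most one expert no matter how many shared experts were fused; the new form subtracts the actual count. A minimal sketch of the arithmetic (the values are hypothetical):

```python
topk = 9

for num_fused_shared_experts in (0, 1, 2):
    old = topk - (1 if num_fused_shared_experts > 0 else 0)  # capped at 1
    new = topk - num_fused_shared_experts                    # actual count
    print(num_fused_shared_experts, old, new)
# 0 -> old=9, new=9
# 1 -> old=8, new=8
# 2 -> old=8, new=7   (the formulas diverge once more than one
#                      shared expert is fused)
```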
```diff
@@ -362,6 +361,9 @@ std::vector<at::Tensor> moe_fused_gate(
     at::Tensor& input, at::Tensor& bias, int64_t num_expert_group,
     int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts,
     double routed_scaling_factor, bool apply_routed_scaling_factor_on_output) {
+  TORCH_CHECK(input.dtype() == bias.dtype(),
+              "input and bias should have the same dtype");
+
   int64_t num_rows = input.size(0);
   int32_t num_experts = input.size(1);
   auto options =
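```

The new `TORCH_CHECK` rejects mismatched dtypes up front: the kernel reinterprets both buffers as the same element type `T`, so a mismatch would otherwise silently misread the bias bytes. A Python-side sketch of the same guard (`check_gate_inputs` is a hypothetical helper, not part of the extension):

```python
import torch

def check_gate_inputs(input: torch.Tensor, bias: torch.Tensor) -> None:
    # Hypothetical helper mirroring the new TORCH_CHECK: both tensors
    # are cast to the same element type T inside the kernel, so their
    # dtypes must agree before launch.
    if input.dtype != bias.dtype:
        raise TypeError(
            f"input and bias should have the same dtype, "
            f"got {input.dtype} and {bias.dtype}"
        )

scores = torch.randn(4, 256, dtype=torch.bfloat16)
bias = torch.zeros(256, dtype=torch.float32)
try:
    check_gate_inputs(scores, bias)
except TypeError as e:
    print(e)  # input and bias should have the same dtype, ...
```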
```diff
@@ -410,16 +412,16 @@ std::vector<at::Tensor> moe_fused_gate(
         LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 8);
       } else if (input.scalar_type() == at::kFloat) {
         LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 8);
-      } else if (num_expert_group == 16) {
-        // Here VPT = 256/16 = 16, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6
-        // * 2 = 12.
-        if (input.scalar_type() == at::kBFloat16) {
-          LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 16);
-        } else if (input.scalar_type() == at::kHalf) {
-          LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 16);
-        } else if (input.scalar_type() == at::kFloat) {
-          LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 16);
-        }
+      }
+    } else if (num_expert_group == 16) {
+      // Here VPT = 256/16 = 16, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6
+      // * 2 = 12.
+      if (input.scalar_type() == at::kBFloat16) {
+        LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 16);
+      } else if (input.scalar_type() == at::kHalf) {
+        LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 16);
+      } else if (input.scalar_type() == at::kFloat) {
+        LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 16);
       }
     }
     break;
```
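
This hunk and the one below apply the same structural fix: the `else if (num_expert_group == 16)` branch was chained onto the inner dtype dispatch, inside an outer block that had already matched a different group count, so it could never fire. Moving it up one brace level makes it a peer of the outer group dispatch. A minimal sketch of the bug pattern (the group counts and `"config(...)"` strings are simplified stand-ins for the real `LAUNCH_MOE_GATE_CONFIG` dispatch):

```python
def dispatch_buggy(num_expert_group, dtype):
    if num_expert_group == 8:
        if dtype == "bf16":
            return "config(256, 8)"
        elif num_expert_group == 16:  # dead: we are inside the
            return "config(256, 16)"  # num_expert_group == 8 branch
    return None

def dispatch_fixed(num_expert_group, dtype):
    if num_expert_group == 8:
        if dtype == "bf16":
            return "config(256, 8)"
    elif num_expert_group == 16:      # now a peer of the group check
        if dtype == "bf16":
            return "config(256, 16)"
    return None

assert dispatch_buggy(16, "bf16") is None              # group 16 never dispatched
assert dispatch_fixed(16, "bf16") == "config(256, 16)"
```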
```diff
@@ -433,16 +435,16 @@ std::vector<at::Tensor> moe_fused_gate(
         LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 4);
       } else if (input.scalar_type() == at::kFloat) {
         LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 4);
-      } else if (num_expert_group == 8) {
-        // VPT = 128/8 = 16, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4
-        // = 24.
-        if (input.scalar_type() == at::kBFloat16) {
-          LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 8);
-        } else if (input.scalar_type() == at::kHalf) {
-          LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 8);
-        } else if (input.scalar_type() == at::kFloat) {
-          LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 8);
-        }
+      }
+    } else if (num_expert_group == 8) {
+      // VPT = 128/8 = 16, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4
+      // = 24.
+      if (input.scalar_type() == at::kBFloat16) {
+        LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 8);
+      } else if (input.scalar_type() == at::kHalf) {
+        LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 8);
+      } else if (input.scalar_type() == at::kFloat) {
+        LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 8);
       }
     }
     break;
```
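
The comments in these two hunks derive the kernel's per-thread workload from the launch constants. A quick check of that arithmetic (WARP_SIZE = 32 and WARPS_PER_CTA = 6 are read off the comments' derivations, not taken from elsewhere in the file):

```python
WARP_SIZE, WARPS_PER_CTA = 32, 6  # inferred from the comments above

def gate_config(num_experts, num_expert_group):
    vpt = num_experts // num_expert_group         # values per thread
    rows_per_warp = WARP_SIZE // num_expert_group
    rows_per_cta = WARPS_PER_CTA * rows_per_warp
    return vpt, rows_per_warp, rows_per_cta

assert gate_config(256, 16) == (16, 2, 12)  # matches the 256/16 comment
assert gate_config(128, 8) == (16, 4, 24)   # matches the 128/8 comment
```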

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 19 additions & 7 deletions
```diff
@@ -1169,7 +1169,8 @@ def grouped_topk(
     num_fused_shared_experts: int = 0,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     use_fused_moe_grouped_topk = envs.VLLM_USE_FUSED_MOE_GROUPED_TOPK
-    if num_fused_shared_experts > 0 and use_fused_moe_grouped_topk:
+    enable_fused_shared_experts = num_fused_shared_experts > 0
+    if enable_fused_shared_experts and use_fused_moe_grouped_topk:
         logger.info(
             "Fused MoE grouped topk is enabled with fused shared experts.",
             "Only one of these options can be used at a time",
```
```diff
@@ -1235,15 +1236,23 @@ def grouped_topk(
     tmp_scores = scores.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
 
     if e_score_correction_bias is not None:
-        topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=use_sorted)[1]
+        topk_ids = torch.topk(
+            tmp_scores,
+            k=topk,
+            dim=-1,
+            sorted=(use_sorted or enable_fused_shared_experts),
+        )[1]
         # Use original unbiased scores for the routing weights
         topk_weights = original_scores.gather(1, topk_ids)
     else:
         topk_weights, topk_ids = torch.topk(
-            tmp_scores, k=topk, dim=-1, sorted=use_sorted
+            tmp_scores,
+            k=topk,
+            dim=-1,
+            sorted=(use_sorted or enable_fused_shared_experts),
         )
 
-    if num_fused_shared_experts > 0:
+    if enable_fused_shared_experts:
         assert routed_scaling_factor is not None, "With num_fused_shared_experts>0"
         ", routed_scaling_factor need to be provided"
         topk_ids[:, -1] = torch.randint(
```
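
Forcing `sorted=True` whenever shared experts are fused matters because the code below overwrites column `-1` of `topk_ids`/`topk_weights` with the shared-expert slot; only a sorted top-k guarantees that the last column holds the least important routed expert rather than an arbitrary one. A small demonstration (the tensor sizes are hypothetical):

```python
import torch

torch.manual_seed(0)
scores = torch.randn(2, 8)  # 2 tokens, 8 experts

# sorted=True guarantees descending order along dim=-1, so column -1
# is the smallest of the top-k and is safe to repurpose as the fused
# shared-expert slot.
topk_weights, topk_ids = torch.topk(scores, k=4, dim=-1, sorted=True)
assert torch.all(topk_weights[:, :-1] >= topk_weights[:, 1:])

# With sorted=False the ordering is unspecified, so overwriting
# column -1 could drop any of the top-k experts.
```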
```diff
@@ -1253,16 +1262,19 @@ def grouped_topk(
             dtype=topk_ids.dtype,
             device=topk_ids.device,
         )
-        topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
+        if routed_scaling_factor != 1.0:
+            topk_weights[:, -1] = (
+                topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
+            )
 
     if renormalize:
-        if num_fused_shared_experts == 0:
+        if not enable_fused_shared_experts:
             topk_weights_sum = topk_weights.sum(dim=-1, keepdim=True)
         else:
             topk_weights_sum = topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         topk_weights = topk_weights / topk_weights_sum
 
-    if num_fused_shared_experts == 0 and routed_scaling_factor != 1.0:
+    if not enable_fused_shared_experts and routed_scaling_factor != 1.0:
         topk_weights = topk_weights * routed_scaling_factor
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
```
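Putting the tail of the routine together: the shared-expert slot gets weight `sum(routed) / routed_scaling_factor` (now skipped when the factor is exactly 1.0), and renormalization divides by the sum of the routed columns only. A worked example with hypothetical weights:

```python
import torch

topk_weights = torch.tensor([[0.8, 0.4, 0.2, 0.1]])  # last column is the
routed_scaling_factor = 2.5                           # fused shared slot

# Shared-expert weight = sum of routed weights / scaling factor
# (the new guard skips this when the factor is exactly 1.0).
if routed_scaling_factor != 1.0:
    topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
# -> [[0.8, 0.4, 0.2, 0.56]]   since (0.8 + 0.4 + 0.2) / 2.5 = 0.56

# Renormalize over the routed columns only:
topk_weights_sum = topk_weights[:, :-1].sum(dim=-1, keepdim=True)  # 1.4
topk_weights = topk_weights / topk_weights_sum
# routed columns now sum to 1.0, and the shared slot becomes
# 0.56 / 1.4 = 0.4 = 1 / routed_scaling_factor
```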
