Commit 02fef8e

lint

Signed-off-by: Barbara Suslova <[email protected]>

1 parent 0c89e7f

20 files changed: +108 -97 lines changed

csrc/moe/moe_fused_gate.cu
Lines changed: 13 additions & 15 deletions

@@ -287,19 +287,18 @@ struct KernelParams {
 
 template <typename T, int VPT, int NUM_EXPERTS, int THREADS_PER_ROW,
           int ROWS_PER_WARP, int ROWS_PER_CTA, int WARPS_PER_CTA>
-__global__ void moe_fused_gate_kernel(void* input, void* bias,
-                                      float* output_ptr, int32_t* indices_ptr,
-                                      int64_t num_rows, int64_t topk_group,
-                                      int64_t topk,
-                                      int64_t num_fused_shared_experts,
-                                      double routed_scaling_factor,
-                                      bool apply_routed_scaling_factor_on_output) {
+__global__ void moe_fused_gate_kernel(
+    void* input, void* bias, float* output_ptr, int32_t* indices_ptr,
+    int64_t num_rows, int64_t topk_group, int64_t topk,
+    int64_t num_fused_shared_experts, double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
   KernelParams<VPT, NUM_EXPERTS, THREADS_PER_ROW, ROWS_PER_WARP, ROWS_PER_CTA,
                WARPS_PER_CTA>
       params;
   moe_fused_gate_impl<T>(input, bias, output_ptr, indices_ptr, num_rows,
                          topk_group, topk, num_fused_shared_experts,
-                         routed_scaling_factor, apply_routed_scaling_factor_on_output, params);
+                         routed_scaling_factor,
+                         apply_routed_scaling_factor_on_output, params);
 }
 
 // Macro to compute compile-time constants and launch the kernel.
@@ -352,18 +351,17 @@ __global__ void moe_fused_gate_kernel_dynamic(
 
   moe_fused_gate_impl<T>(input, bias, output_ptr, indices_ptr, num_rows,
                          topk_group, topk, num_fused_shared_experts,
-                         routed_scaling_factor, apply_routed_scaling_factor_on_output, params);
+                         routed_scaling_factor,
+                         apply_routed_scaling_factor_on_output, params);
 }
 
 //------------------------------------------------------------------------------
 // Host Launcher Function
 //------------------------------------------------------------------------------
-std::vector<at::Tensor> moe_fused_gate(at::Tensor& input, at::Tensor& bias,
-                                       int64_t num_expert_group,
-                                       int64_t topk_group, int64_t topk,
-                                       int64_t num_fused_shared_experts,
-                                       double routed_scaling_factor,
-                                       bool apply_routed_scaling_factor_on_output) {
+std::vector<at::Tensor> moe_fused_gate(
+    at::Tensor& input, at::Tensor& bias, int64_t num_expert_group,
+    int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts,
+    double routed_scaling_factor, bool apply_routed_scaling_factor_on_output) {
   int64_t num_rows = input.size(0);
   int32_t num_experts = input.size(1);
   auto options =

csrc/moe/moe_ops.h
Lines changed: 4 additions & 8 deletions

@@ -28,14 +28,10 @@ void moe_lora_align_block_size(
     torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled,
     torch::Tensor lora_ids);
 
-std::vector<at::Tensor> moe_fused_gate(torch::Tensor& input,
-                                       torch::Tensor& bias,
-                                       int64_t num_expert_group,
-                                       int64_t topk_group, int64_t topk,
-                                       int64_t num_fused_shared_experts,
-                                       double routed_scaling_factor,
-                                       bool apply_routed_scaling_factor_on_output
-);
+std::vector<at::Tensor> moe_fused_gate(
+    torch::Tensor& input, torch::Tensor& bias, int64_t num_expert_group,
+    int64_t topk_group, int64_t topk, int64_t num_fused_shared_experts,
+    double routed_scaling_factor, bool apply_routed_scaling_factor_on_output);
 
 #ifndef USE_ROCM
 torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
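
For context, this host launcher is exposed to Python as the moe_fused_gate wrapper in vllm/_custom_ops.py (also touched by this commit). A minimal usage sketch; the shapes, dtypes, and group sizes below are illustrative assumptions, not values taken from this diff:

import torch
from vllm import _custom_ops as ops

# Illustrative sizes only; the fused router path is gated on 128/256 experts.
num_tokens, num_experts = 32, 256
router_logits = torch.randn(
    num_tokens, num_experts, dtype=torch.float32, device="cuda"
)
bias = torch.zeros(num_experts, dtype=torch.float32, device="cuda")

topk_weights, topk_ids = ops.moe_fused_gate(
    input_tensor=router_logits,
    bias=bias,
    num_expert_group=8,
    topk_group=4,
    topk=8,
    num_fused_shared_experts=0,
    routed_scaling_factor=1.0,
    apply_routed_scaling_factor_on_output=False,
)
# topk_weights: float32 [num_tokens, topk]; topk_ids: int32 [num_tokens, topk]

In vllm/model_executor/layers/fused_moe/layer.py below, select_experts only takes this path when the fused router is enabled and the expert count passes a power-of-two check.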

tests/kernels/moe/test_moe_fused_gate.py
Lines changed: 15 additions & 14 deletions

@@ -10,14 +10,12 @@
 
 @pytest.mark.parametrize(
     "seq_length",
-    list(range(1, 10)) +
-    [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536],
+    list(range(1, 10))
+    + [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536],
 )
 @pytest.mark.parametrize(
     "dtype",
-    [
-        torch.float32
-    ]  # torch.float16, torch.bfloat16 - aren't working correctly yet
+    [torch.float32],  # torch.float16, torch.bfloat16 - aren't working correctly yet
 )
 @pytest.mark.parametrize(
     "params",
@@ -36,7 +34,7 @@
 )
 def test_moe_fused_gate_combined(
     seq_length, dtype, params, num_fused_shared_experts, monkeypatch
-    ):
+):
     num_experts, num_expert_group, topk_group, topk = params
     topk += 1 if num_fused_shared_experts > 0 else 0
 
@@ -82,16 +80,19 @@ def test_moe_fused_gate_combined(
         shared_indices = original_indices[:, -1]
         shared_ref_indices = original_ref_indices[:, -1]
         if shared_indices is not None:
-            assert torch.all((shared_indices >= valid_min) & (
-                shared_indices < valid_max)), (
-                "Shared expert indices out of range: ",
-                f"found values outside [{valid_min}, {valid_max})")
+            assert torch.all(
+                (shared_indices >= valid_min) & (shared_indices < valid_max)
+            ), (
+                "Shared expert indices out of range: ",
+                f"found values outside [{valid_min}, {valid_max})",
+            )
         if shared_ref_indices is not None:
             assert torch.all(
-                (shared_ref_indices >= valid_min)
-                & (shared_ref_indices < valid_max)), (
-                "Shared expert reference indices out of range: ",
-                f"found values outside [{valid_min}, {valid_max})")
+                (shared_ref_indices >= valid_min) & (shared_ref_indices < valid_max)
+            ), (
+                "Shared expert reference indices out of range: ",
+                f"found values outside [{valid_min}, {valid_max})",
+            )
 
     vllm_idx_check = torch.allclose(
         ref_vllm_indices.sort()[0].to(torch.int32),

vllm/_custom_ops.py
Lines changed: 2 additions & 0 deletions

@@ -1874,6 +1874,7 @@ def moe_lora_align_block_size(
         lora_ids,
     )
 
+
 def moe_fused_gate(
     input_tensor: torch.Tensor,
     bias: torch.Tensor,
@@ -1919,6 +1920,7 @@ def _moe_fused_gate_fake(
         device=input_tensor.device,
     )
 
+
 def moe_wna16_gemm(
     input: torch.Tensor,
     output: torch.Tensor,

vllm/config/parallel.py
Lines changed: 2 additions & 2 deletions

@@ -256,10 +256,10 @@ class is dynamically inherited by the worker class. This is used to inject
     This is an internal config that is only valid for and
     should only be set by API server scale-out.
     """
-
+
    enable_fused_shared_experts: bool = False
    """Enable the fusion of the shared experts of the model with other experts."""
-
+
    enable_fused_moe_router: bool = False
    """Use the fused grouped top-k MoE expert selection router"""

vllm/envs.py
Lines changed: 6 additions & 5 deletions

@@ -1486,12 +1486,13 @@ def get_vllm_port() -> int | None:
     # FlatLogprobs.
     "VLLM_FLAT_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLAT_LOGPROBS", "0"))),
     # Enable the fusion of the shared experts of the model with other experts.
-    "VLLM_USE_CUDA_FUSION_SHARED_EXPERTS":
-    lambda: bool(int(os.getenv("VLLM_USE_CUDA_FUSION_SHARED_EXPERTS", "0"))),
-
+    "VLLM_USE_CUDA_FUSION_SHARED_EXPERTS": lambda: bool(
+        int(os.getenv("VLLM_USE_CUDA_FUSION_SHARED_EXPERTS", "0"))
+    ),
     # Use the fused grouped top-k MoE expert selection router
-    "VLLM_USE_FUSED_MOE_ROUTER":
-    lambda: bool(int(os.getenv("VLLM_USE_FUSED_MOE_ROUTER", "0"))),
+    "VLLM_USE_FUSED_MOE_ROUTER": lambda: bool(
+        int(os.getenv("VLLM_USE_FUSED_MOE_ROUTER", "0"))
+    ),
 }
 
 # --8<-- [end:env-vars-definition]
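
Both flags follow the same parsing pattern. As a standalone illustration (the _env_flag helper below is hypothetical; only the variable names are real): an unset variable or "0" disables the feature, any other integer string enables it, and non-integer values raise ValueError.

import os


def _env_flag(name: str) -> bool:
    # Same pattern as above: bool(int(os.getenv(name, "0"))).
    return bool(int(os.getenv(name, "0")))


print(_env_flag("VLLM_USE_FUSED_MOE_ROUTER"))  # False unless exported as "1"
print(_env_flag("VLLM_USE_CUDA_FUSION_SHARED_EXPERTS"))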

vllm/model_executor/layers/fused_moe/fused_moe.py
Lines changed: 14 additions & 16 deletions

@@ -1173,7 +1173,7 @@ def grouped_topk(
         logger.info(
             "Fused MoE grouped topk is enabled with fused shared experts.",
             "Only one of these options can be used at a time",
-            "Fused MoE grouped topk is disabled."
+            "Fused MoE grouped topk is disabled.",
         )
         use_fused_moe_grouped_topk = False
 
@@ -1239,21 +1239,20 @@ def grouped_topk(
         # Use original unbiased scores for the routing weights
         topk_weights = original_scores.gather(1, topk_ids)
     else:
-        topk_weights, topk_ids = torch.topk(tmp_scores,
-                                            k=topk,
-                                            dim=-1,
-                                            sorted=use_sorted)
+        topk_weights, topk_ids = torch.topk(
+            tmp_scores, k=topk, dim=-1, sorted=use_sorted
+        )
 
     if num_fused_shared_experts > 0:
-        assert routed_scaling_factor is not None, \
-            "With num_fused_shared_experts>0"
+        assert routed_scaling_factor is not None, "With num_fused_shared_experts>0"
         ", routed_scaling_factor need to be provided"
-        topk_ids[:, -1] = torch.randint(low=num_experts,
-                                        high=num_experts +
-                                        num_fused_shared_experts,
-                                        size=(topk_ids.size(0), ),
-                                        dtype=topk_ids.dtype,
-                                        device=topk_ids.device)
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + num_fused_shared_experts,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
         topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
 
     if renormalize:
@@ -1263,9 +1262,8 @@ def grouped_topk(
         topk_weights_sum = topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         topk_weights = topk_weights / topk_weights_sum
 
-    if num_fused_shared_experts == 0:
-        if routed_scaling_factor != 1.0:
-            topk_weights = topk_weights * routed_scaling_factor
+    if num_fused_shared_experts == 0 and routed_scaling_factor != 1.0:
+        topk_weights = topk_weights * routed_scaling_factor
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
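
To make the reformatted shared-expert branch of grouped_topk easier to follow, here is a self-contained sketch of the same slot-filling step with toy sizes (the sizes are assumptions; the indexing mirrors the code above): the last top-k slot is pointed at a randomly chosen fused shared-expert id, and its weight is set to the sum of the routed weights divided by routed_scaling_factor.

import torch

num_tokens, num_experts, topk = 4, 8, 3
num_fused_shared_experts = 1
routed_scaling_factor = 2.5

scores = torch.rand(num_tokens, num_experts)
topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)

# Point the last slot at a fused shared expert (ids start at num_experts).
topk_ids[:, -1] = torch.randint(
    low=num_experts,
    high=num_experts + num_fused_shared_experts,
    size=(topk_ids.size(0),),
    dtype=topk_ids.dtype,
    device=topk_ids.device,
)
# Its weight carries the rescaled sum of the routed weights.
topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor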

vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
Lines changed: 1 addition & 0 deletions

@@ -103,6 +103,7 @@ def apply(
         expert_load_view: torch.Tensor | None = None,
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
+        enable_fused_moe_router: bool = False,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         # Is getattr needed?
         zero_expert_num = getattr(layer, "zero_expert_num", 0)

vllm/model_executor/layers/fused_moe/layer.py
Lines changed: 28 additions & 22 deletions

@@ -2,11 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from abc import abstractmethod
 from collections.abc import Callable, Iterable
 from contextlib import nullcontext
 from enum import Enum
-from functools import partial
 from typing import Literal, get_args, overload
 
 import torch
@@ -54,8 +52,9 @@
 from vllm.v1.worker.ubatching import dbo_current_ubatch_id
 
 if current_platform.is_cuda_alike():
-    from .fused_moe import eplb_map_to_physical_and_record, fused_experts
     from vllm._custom_ops import moe_fused_gate
+
+    from .fused_moe import eplb_map_to_physical_and_record, fused_experts
 else:
     fused_experts = None  # type: ignore
     FusedMoEPermuteExpertsUnpermute = object  # type: ignore
@@ -371,9 +370,10 @@ def __init__(
             dp_size_=dp_size_,
             vllm_parallel_config=vllm_config.parallel_config,
         )
-
+
         self.enable_fused_shared_experts = enable_fused_shared_experts
         if self.enable_fused_shared_experts:
+            assert n_shared_experts is not None
             num_experts += n_shared_experts
             top_k += n_shared_experts
 
@@ -414,10 +414,11 @@ def __init__(
 
         self.num_fused_shared_experts = (
             n_shared_experts
-            if (
-                n_shared_experts is not None
-                and self.aiter_fmoe_shared_expert_enabled
-            ) or self.enable_fused_shared_experts
+            if n_shared_experts is not None
+            and (
+                self.aiter_fmoe_shared_expert_enabled
+                or self.enable_fused_shared_experts
+            )
             else 0
         )
         if (
@@ -487,12 +488,15 @@ def __init__(
                 self.global_num_experts,
                 get_compressed_expert_map(self.expert_map),
             )
-            if (self.num_fused_shared_experts > 0):
+            if self.num_fused_shared_experts > 0:
                 logger.warning(
                     "With EP enabled and share expert fusion enabled"
                     ", share expert replica should be same as ep_size"
                     "got share expert replica = %d"
-                    "and ep_size = %d", self.num_fused_shared_experts, self.ep_size)
+                    "and ep_size = %d",
+                    self.num_fused_shared_experts,
+                    self.ep_size,
+                )
         else:
             self.local_num_experts, self.expert_map, self.expert_mask = (
                 self.global_num_experts,
@@ -1375,23 +1379,24 @@ def select_experts(
             assert topk_group is not None
             assert num_expert_group is not None
             if hidden_states.shape[0] == 0:
-                topk_ids = torch.full((0, top_k),
-                                      -1,
-                                      dtype=torch.int,
-                                      device=hidden_states.device)
-                topk_weights = torch.empty((0, top_k),
-                                           dtype=torch.float32,
-                                           device=hidden_states.device)
+                topk_ids = torch.full(
+                    (0, top_k), -1, dtype=torch.int, device=hidden_states.device
+                )
+                topk_weights = torch.empty(
+                    (0, top_k), dtype=torch.float32, device=hidden_states.device
+                )
             elif rocm_aiter_ops.is_fused_moe_enabled():
                 if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
                     assert num_fused_shared_experts == 0
                 grouped_topk_impl = rocm_aiter_grouped_topk
             else:
                 grouped_topk_impl = grouped_topk
 
-            if (enable_fused_moe_router
-                    and e_score_correction_bias is not None
-                    and is_power_of_two(e_score_correction_bias.shape[0])):
+            if (
+                enable_fused_moe_router
+                and e_score_correction_bias is not None
+                and is_power_of_two(e_score_correction_bias.shape[0])
+            ):
                 # The fused kernel can only work with 128/256 experts
                 topk_weights, topk_ids = moe_fused_gate(
                     input_tensor=router_logits.to(dtype=torch.float32),
@@ -1401,7 +1406,8 @@ def select_experts(
                     topk=top_k,
                     num_fused_shared_experts=num_fused_shared_experts,
                     routed_scaling_factor=routed_scaling_factor
-                    if routed_scaling_factor is not None else 1.0,
+                    if routed_scaling_factor is not None
+                    else 1.0,
                     apply_routed_scaling_factor_on_output=False,
                 )
             else:
@@ -1415,7 +1421,7 @@ def select_experts(
                     scoring_func=scoring_func,
                     routed_scaling_factor=routed_scaling_factor,
                     e_score_correction_bias=e_score_correction_bias,
-                    num_fused_shared_experts=num_fused_shared_experts
+                    num_fused_shared_experts=num_fused_shared_experts,
                 )
                 if indices_type is not None:
                     topk_ids = topk_ids.to(dtype=indices_type)
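
The select_experts dispatch above only calls moe_fused_gate when the number of experts passes a power-of-two check (hence the 128/256-expert comment). A minimal stand-in for that predicate, assuming vLLM's is_power_of_two utility behaves equivalently:

def is_power_of_two(n: int) -> bool:
    # A positive integer is a power of two iff exactly one bit is set.
    return n > 0 and (n & (n - 1)) == 0


assert is_power_of_two(256)
assert not is_power_of_two(160)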

vllm/model_executor/layers/quantization/awq_marlin.py
Lines changed: 1 addition & 1 deletion

@@ -639,7 +639,7 @@ def apply(
             indices_type=self.topk_indices_dtype,
             num_fused_shared_experts=layer.num_fused_shared_experts,
             enable_fused_moe_router=enable_fused_moe_router,
-            )
+        )
 
         return fused_marlin_moe(
             x,
