Commit dc3f820

[Optimization] Add Fused Triton Kernel for topk+softmax
Signed-off-by: ijpq <[email protected]>
1 parent d64429b commit dc3f820

4 files changed, +207 -0 lines changed
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch

from vllm.model_executor.layers.fused_moe.gpt_oss_fused_router import fused_topk_softmax


@pytest.mark.parametrize("M", [1, 32, 128, 2048])
@pytest.mark.parametrize("N", [32, 128])
@pytest.mark.parametrize("K", [1, 2])
def test_fused_router(M, N, K):
    device = "cuda"
    torch.manual_seed(0)

    logits = torch.randn((M, N), device=device, dtype=torch.float32)

    ref_vals, ref_indices = torch.topk(logits, K, dim=-1)
    ref_probs = torch.softmax(ref_vals, dim=-1)

    tri_probs, tri_indices = fused_topk_softmax(logits, K, renormalize=True)

    torch.testing.assert_close(tri_indices.long(), ref_indices)
    torch.testing.assert_close(tri_probs, ref_probs, atol=1e-4, rtol=1e-4)


if __name__ == "__main__":
    test_fused_router(128, 32, 2)
    print("Test Passed!")
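For intuition, here is a hand-checked example of the reference path this test compares against: torch.topk keeps the K largest logits per row, and the softmax is computed over just those winners rather than the full row.

import torch

logits = torch.tensor([[1.0, 3.0, 2.0, 0.0]])
vals, idx = torch.topk(logits, 2, dim=-1)  # vals = [[3., 2.]], idx = [[1, 2]]
probs = torch.softmax(vals, dim=-1)        # [[0.7311, 0.2689]], since e^3 / (e^3 + e^2) ≈ 0.7311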
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch

from vllm.model_executor.layers.fused_moe.gpt_oss_fused_router import (
    gpt_oss_custom_routing_function,
)
from vllm.model_executor.layers.fused_moe.layer import FusedMoE


@pytest.mark.parametrize("num_tokens", [10, 128, 1024])
@pytest.mark.parametrize("num_experts", [32, 128])
@pytest.mark.parametrize("top_k", [1, 2])
def test_routing_consistency(num_tokens, num_experts, top_k):
    torch.manual_seed(42)
    device = torch.device("cuda")

    hidden_states = torch.randn(num_tokens, 4096, device=device, dtype=torch.float16)
    router_logits = torch.randn(
        num_tokens, num_experts, device=device, dtype=torch.float32
    )

    ref_weights, ref_ids, _ = FusedMoE.select_experts(
        hidden_states=hidden_states,
        router_logits=router_logits,
        top_k=top_k,
        use_grouped_topk=False,
        renormalize=True,
        custom_routing_function=None,
    )

    triton_weights, triton_ids, _ = FusedMoE.select_experts(
        hidden_states=hidden_states,
        router_logits=router_logits,
        top_k=top_k,
        use_grouped_topk=False,
        renormalize=True,
        custom_routing_function=gpt_oss_custom_routing_function,
    )

    print(f"\nTesting M={num_tokens}, E={num_experts}, K={top_k}")

    torch.testing.assert_close(
        triton_ids,
        ref_ids,
        msg="Expert indices mismatch between native and Triton implementations",
    )

    torch.testing.assert_close(
        triton_weights,
        ref_weights,
        atol=1e-3,
        rtol=1e-3,
        msg="Expert weights mismatch between native and Triton implementations",
    )


if __name__ == "__main__":
    test_routing_consistency(128, 32, 2)
    print("Consistency Test Passed!")
vllm/model_executor/layers/fused_moe/gpt_oss_fused_router.py

Lines changed: 109 additions & 0 deletions

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch

from vllm.triton_utils import tl, triton


@triton.jit
def _topk_softmax_kernel(
    logits_ptr,
    weights_ptr,
    indices_ptr,
    M,
    N,
    K: tl.constexpr,
    stride_lm,
    stride_ln,
    stride_wm,
    stride_wk,
    stride_im,
    stride_ik,
    BLOCK_N: tl.constexpr,
    RENORM: tl.constexpr,
):
    # One program per row: load the full row of logits, padded to BLOCK_N.
    pid = tl.program_id(0)

    offs = tl.arange(0, BLOCK_N)
    logits_offs = logits_ptr + pid * stride_lm + offs * stride_ln
    mask = offs < N
    logits = tl.load(logits_offs, mask=mask, other=float("-inf"))

    if K == 1:
        max_val = tl.max(logits, axis=0)
        max_idx = tl.argmax(logits, axis=0)

        # Softmax over a single element is 1.0; otherwise return the raw logit.
        weight = 1.0 if RENORM else max_val

        tl.store(weights_ptr + pid * stride_wm, weight)
        tl.store(indices_ptr + pid * stride_im, max_idx)

    elif K == 2:
        # first max
        v1 = tl.max(logits, axis=0)
        i1 = tl.argmax(logits, axis=0)

        # second max: mask out the winner's column and take the row max again
        masked = tl.where(offs != i1, logits, float("-inf"))
        v2 = tl.max(masked, axis=0)
        i2 = tl.argmax(masked, axis=0)

        if RENORM:
            # Numerically stable 2-way softmax over the two winning logits.
            vmax = tl.maximum(v1, v2)
            e1 = tl.exp(v1 - vmax)
            e2 = tl.exp(v2 - vmax)
            s = e1 + e2
            w1, w2 = e1 / s, e2 / s
        else:
            w1, w2 = v1, v2

        tl.store(weights_ptr + pid * stride_wm, w1)
        tl.store(weights_ptr + pid * stride_wm + stride_wk, w2)
        tl.store(indices_ptr + pid * stride_im, i1)
        tl.store(indices_ptr + pid * stride_im + stride_ik, i2)


def fused_topk_softmax(
    router_logits: torch.Tensor,
    top_k: int,
    renormalize: bool = True,
) -> tuple[torch.Tensor, torch.Tensor]:
    if top_k not in [1, 2]:
        raise NotImplementedError(f"Only K=1,2 supported, got {top_k}")

    M, N = router_logits.shape

    weights = torch.empty((M, top_k), device=router_logits.device, dtype=torch.float32)
    indices = torch.empty((M, top_k), device=router_logits.device, dtype=torch.int32)

    BLOCK_N = triton.next_power_of_2(N)
    grid = (M,)

    _topk_softmax_kernel[grid](
        router_logits,
        weights,
        indices,
        M,
        N,
        K=top_k,
        stride_lm=router_logits.stride(0),
        stride_ln=router_logits.stride(1),
        stride_wm=weights.stride(0),
        stride_wk=weights.stride(1),
        stride_im=indices.stride(0),
        stride_ik=indices.stride(1),
        BLOCK_N=BLOCK_N,
        RENORM=renormalize,
    )

    return weights, indices


def gpt_oss_custom_routing_function(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    # only use gating_output to avoid padding issues with hidden_states
    return fused_topk_softmax(gating_output, topk, renormalize)
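The K == 2 branch finds the runner-up by masking the winner's column to -inf and taking the row max a second time, then applies a numerically stable two-way softmax over just the two winning logits. A minimal PyTorch sketch of the same strategy (function name hypothetical, for illustration only):

import torch

def top2_by_remasking(logits: torch.Tensor):
    # Mirrors the kernel's K == 2 path: take the row max, mask that
    # column to -inf, then take the max again to get the runner-up.
    v1, i1 = logits.max(dim=-1)
    masked = logits.scatter(-1, i1.unsqueeze(-1), float("-inf"))
    v2, i2 = masked.max(dim=-1)
    vals = torch.stack([v1, v2], dim=-1)
    return torch.softmax(vals, dim=-1), torch.stack([i1, i2], dim=-1)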

vllm/model_executor/models/gpt_oss.py

Lines changed: 8 additions & 0 deletions
@@ -20,6 +20,9 @@
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
+from vllm.model_executor.layers.fused_moe.gpt_oss_fused_router import (
+    gpt_oss_custom_routing_function,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -173,6 +176,11 @@ def __init__(
             has_bias=True,
             activation="swigluoai",
             is_sequence_parallel=self.is_sequence_parallel,
+            custom_routing_function=(
+                gpt_oss_custom_routing_function
+                if not current_platform.is_rocm()
+                else None
+            ),
         )

     def forward(self, x: torch.Tensor) -> torch.Tensor:
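As a quick smoke check outside the model, the routing function can also be driven directly; a sketch assuming a CUDA device, with shapes borrowed from the consistency test above (hidden_states is accepted only for interface compatibility and is not read):

import torch
from vllm.model_executor.layers.fused_moe.gpt_oss_fused_router import (
    gpt_oss_custom_routing_function,
)

hidden_states = torch.randn(8, 4096, device="cuda", dtype=torch.float16)
gating_output = torch.randn(8, 32, device="cuda", dtype=torch.float32)

weights, indices = gpt_oss_custom_routing_function(
    hidden_states, gating_output, topk=2, renormalize=True
)
print(weights.shape, indices.shape)  # torch.Size([8, 2]) torch.Size([8, 2])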
