
Commit 66e6711

[Optimization]: Optimize Fused Triton Kernel for topk+softmax

- split into two kernels, one for the renormalized case and one for the non-renormalized case
- add online softmax
- unroll along M

Signed-off-by: ijpq <[email protected]>
1 parent fca484b commit 66e6711
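The "online softmax" mentioned in the commit message is the single-pass recurrence used by the new renorm kernel: keep a running maximum m and running normalizer s, and whenever a new value x is folded in, rescale s by exp(m - new_max) before adding exp(x - new_max). A minimal pure-Python sketch of that recurrence applied while peeling off the top-k logits (the helper name online_softmax_topk is hypothetical and only illustrative, not part of this commit):

import math


def online_softmax_topk(row: list[float], k: int) -> tuple[list[float], list[int]]:
    """Illustrative reference: pick the k largest logits and softmax them in
    the same pass using a running max and running sum."""
    logits = list(row)
    running_max = float("-inf")
    running_sum = 0.0
    vals: list[float] = []
    idxs: list[int] = []
    for _ in range(k):
        cur_idx = max(range(len(logits)), key=lambda i: logits[i])
        cur_max = logits[cur_idx]
        new_max = max(running_max, cur_max)
        # Rescale the accumulated sum to the new max, then fold in the new term.
        running_sum = running_sum * math.exp(running_max - new_max) + math.exp(cur_max - new_max)
        running_max = new_max
        vals.append(cur_max)
        idxs.append(cur_idx)
        logits[cur_idx] = float("-inf")  # exclude the selected expert from later rounds
    weights = [math.exp(v - running_max) / running_sum for v in vals]
    return weights, idxs

For example, online_softmax_topk([1.0, 3.0, 2.0], 2) selects experts [1, 2] with weights of roughly [0.73, 0.27], i.e. the softmax of the two selected logits.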

File tree

2 files changed: +197 -62 lines

tests/kernels/moe/test_gpt_oss_routing_consistency.py

Lines changed: 48 additions & 12 deletions
@@ -11,8 +11,9 @@
 
 @pytest.mark.parametrize("num_tokens", [10, 128, 1024])
 @pytest.mark.parametrize("num_experts", [32, 65, 128])
-@pytest.mark.parametrize("topk", [1, 2, 3, 4, 5])
-def test_routing_consistency(num_tokens, num_experts, topk):
+@pytest.mark.parametrize("topk", [2, 4])
+@pytest.mark.parametrize("renorm", [True, False])
+def test_routing_consistency(num_tokens, num_experts, topk, renorm):
     torch.manual_seed(42)
     device = torch.device("cuda")
 
@@ -21,12 +22,24 @@ def test_routing_consistency(num_tokens, num_experts, topk):
         num_tokens, num_experts, device=device, dtype=torch.float32
     )
 
+    def native_impl(logits, topk, renorm):
+        if renorm:
+            ref_vals, ref_indices = torch.topk(logits, topk, dim=1)
+            ref_vals = torch.softmax(ref_vals, dim=1)
+        else:
+            ref_vals = torch.softmax(logits, dim=1)
+            ref_vals, ref_indices = torch.topk(ref_vals, topk, dim=1)
+
+        return ref_vals, ref_indices
+
+    native_weights, native_ids = native_impl(router_logits, topk, renorm)
+
     ref_weights, ref_ids, _ = FusedMoE.select_experts(
         hidden_states=hidden_states,
         router_logits=router_logits,
         top_k=topk,
         use_grouped_topk=False,
-        renormalize=True,
+        renormalize=renorm,
         custom_routing_function=None,
     )
 
@@ -35,27 +48,50 @@ def test_routing_consistency(num_tokens, num_experts, topk):
         router_logits=router_logits,
         top_k=topk,
         use_grouped_topk=False,
-        renormalize=True,
+        renormalize=renorm,
         custom_routing_function=gpt_oss_custom_routing_function,
     )
 
     print(f"\nTesting M={num_tokens}, E={num_experts}, K={topk}")
 
+    # compare triton with torch
+    torch.testing.assert_close(
+        triton_ids.to(native_ids.dtype),
+        native_ids,
+        msg="Expert indices mismatch between native and triton implementation",
+    )
+
+    torch.testing.assert_close(
+        triton_weights,
+        native_weights,
+        atol=1e-3,
+        rtol=1e-3,
+        msg="Expert weights mismatch between native and triton implementation",
+    )
+
+    # compare triton with origin
     torch.testing.assert_close(
         triton_ids,
         ref_ids,
-        msg="Expert indices mismatch between Native and Triton implementation",
+        msg="Expert indices mismatch between origin and triton implementation",
    )
-
     torch.testing.assert_close(
         triton_weights,
         ref_weights,
         atol=1e-3,
         rtol=1e-3,
-        msg="Expert weights mismatch between Native and Triton implementation",
+        msg="Expert weights mismatch between origin and triton implementation",
+    )
+    # compare origin with torch
+    torch.testing.assert_close(
+        native_ids,
+        ref_ids.to(native_ids.dtype),
+        msg="Expert indices mismatch between origin and native implementation",
+    )
+    torch.testing.assert_close(
+        native_weights,
+        ref_weights,
+        atol=1e-3,
+        rtol=1e-3,
+        msg="Expert weights mismatch between origin and native implementation",
     )
-
-
-if __name__ == "__main__":
-    test_routing_consistency(128, 32, 2)
-    print("Consistency Test Passed!")
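The two branches of native_impl above differ only in whether the softmax runs before or after the top-k selection. A tiny standalone PyTorch check, illustrative only and not part of this commit (CPU is fine), makes the difference concrete:

import torch

logits = torch.tensor([[1.0, 3.0, 2.0, 0.5]])

# renorm=True: take the top-k logits first, then softmax over just those k
# values, so the returned weights sum to 1 per row.
vals, idx = torch.topk(logits, k=2, dim=1)
renorm_weights = torch.softmax(vals, dim=1)

# renorm=False: softmax over all experts first, then keep the k largest
# probabilities, so the weights sum to less than 1 per row.
probs = torch.softmax(logits, dim=1)
plain_weights, plain_idx = torch.topk(probs, k=2, dim=1)

assert torch.equal(idx, plain_idx)  # same experts are selected either way
print(renorm_weights.sum(dim=1))    # approximately 1
print(plain_weights.sum(dim=1))     # less than 1

Both orderings pick the same experts because softmax is monotone; only the weights differ, and only the renormalized ones sum to 1 per token.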

vllm/model_executor/layers/fused_moe/gpt_oss_fused_router.py

Lines changed: 149 additions & 50 deletions
@@ -7,11 +7,24 @@
 from vllm.triton_utils import tl, triton
 
 
+def torch_dtype_to_tl(dtype: torch.dtype):
+    if dtype == torch.float16:
+        return tl.float16
+    elif dtype == torch.bfloat16:
+        return tl.bfloat16
+    elif dtype == torch.float32:
+        return tl.float32
+    elif dtype == torch.int32:
+        return tl.int32
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+
 @triton.jit
 def _topk_softmax_kernel(
-    logits_ptr,
-    weights_ptr,
-    indices_ptr,
+    logits_ptr: torch.Tensor,
+    weights_ptr: torch.Tensor,
+    indices_ptr: torch.Tensor,
     M,
     N,
     topk: tl.constexpr,
@@ -23,81 +36,167 @@ def _topk_softmax_kernel(
     stride_im,
     stride_ik,
     BLOCK_N: tl.constexpr,
-    RENORM: tl.constexpr,
+    num_stages: tl.constexpr,
 ):
-    token_idx = tl.program_id(0)
+    pid = tl.program_id(0)
+    num_programs = tl.num_programs(0)
 
-    offs = tl.arange(0, BLOCK_N)
-    mask = offs < N
-    logit_offs = logits_ptr + token_idx * stride_lm + offs * stride_ln
-    logits = tl.load(logit_offs, mask=mask, other=float("-inf"))
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, topk_padded)
+    mask_n = offs_n < N
 
     topk_vals = tl.zeros([topk_padded], dtype=tl.float32) + float("-inf")
     topk_idxs = tl.zeros([topk_padded], dtype=tl.int32)
 
-    working_logits = logits
+    for row_idx in tl.range(pid, M, num_programs, num_stages):
+        logits = tl.load(
+            logits_ptr + row_idx * stride_lm + offs_n * stride_ln,
+            mask=mask_n,
+            other=float("-inf"),
+        )
+        row_sub_max = logits - tl.max(logits, axis=0)
+        numerator = tl.exp(row_sub_max)
+        denominator = tl.sum(numerator, axis=0)
+        logits = numerator / denominator
+
+        for k in tl.static_range(topk):
+            cur_max = tl.max(logits, axis=0)
+            cur_idx = tl.argmax(logits, axis=0)
+
+            k_mask = offs_k == k
+            topk_vals = tl.where(k_mask, cur_max, topk_vals)
+            topk_idxs = tl.where(k_mask, cur_idx, topk_idxs)
+
+            logits = tl.where(offs_n == cur_idx, float("-inf"), logits)
+
+        store_mask = offs_k < topk
+        tl.store(
+            weights_ptr + row_idx * stride_wm + offs_k * stride_wk,
+            topk_vals,
+            mask=store_mask,
+        )
+        tl.store(
+            indices_ptr + row_idx * stride_im + offs_k * stride_ik,
+            topk_idxs,
+            mask=store_mask,
+        )
 
-    for k in range(topk):
-        cur_max = tl.max(working_logits, axis=0)
-        cur_idx = tl.argmax(working_logits, axis=0)
 
-        k_mask = tl.arange(0, topk_padded) == k
-        topk_vals = tl.where(k_mask, cur_max, topk_vals)
-        topk_idxs = tl.where(k_mask, cur_idx, topk_idxs)
+@triton.jit
+def _topk_softmax_renorm_kernel(
+    logits_ptr,
+    weights_ptr,
+    indices_ptr,
+    M,
+    N,
+    topk: tl.constexpr,
+    topk_padded: tl.constexpr,
+    stride_lm,
+    stride_ln,
+    stride_wm,
+    stride_wk,
+    stride_im,
+    stride_ik,
+    BLOCK_N: tl.constexpr,
+    num_stages: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    num_programs = tl.num_programs(0)
 
-        mask_selected = offs == cur_idx
-        working_logits = tl.where(mask_selected, float("-inf"), working_logits)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, topk_padded)
+    mask_n = offs_n < N
 
-    if RENORM:
-        max_val = tl.max(topk_vals, axis=0)
-        exp_vals = tl.exp(topk_vals - max_val)
-        sum_exp = tl.sum(exp_vals, axis=0)
-        topk_vals = exp_vals / sum_exp
+    for row_idx in tl.range(pid, M, num_programs, num_stages):
+        logits = tl.load(
+            logits_ptr + row_idx * stride_lm + offs_n * stride_ln,
+            mask=mask_n,
+            other=float("-inf"),
+        )
 
-    offs_k = tl.arange(0, topk_padded)
+        topk_vals = tl.zeros([topk_padded], dtype=tl.float32) + float("-inf")
+        topk_idxs = tl.zeros([topk_padded], dtype=tl.int32)
 
-    store_mask = offs_k < topk
+        running_max = float("-inf")
+        running_sum = 0.0
 
-    weight_ptrs = weights_ptr + token_idx * stride_wm + offs_k * stride_wk
-    tl.store(weight_ptrs, topk_vals, mask=store_mask)
+        for k in tl.static_range(topk):
+            cur_max = tl.max(logits, axis=0)
+            cur_idx = tl.argmax(logits, axis=0)
 
-    index_ptrs = indices_ptr + token_idx * stride_im + offs_k * stride_ik
-    tl.store(index_ptrs, topk_idxs, mask=store_mask)
+            new_max = tl.maximum(running_max, cur_max)
+            running_sum = running_sum * tl.exp(running_max - new_max) + tl.exp(
+                cur_max - new_max
+            )
+            running_max = new_max
+
+            k_mask = offs_k == k
+            topk_vals = tl.where(k_mask, cur_max, topk_vals)
+            topk_idxs = tl.where(k_mask, cur_idx, topk_idxs)
+
+            logits = tl.where(offs_n == cur_idx, float("-inf"), logits)
+
+        topk_vals = tl.exp(topk_vals - running_max) / running_sum
+
+        tl.store(weights_ptr + row_idx * stride_wm + offs_k * stride_wk, topk_vals)
+        tl.store(indices_ptr + row_idx * stride_im + offs_k * stride_ik, topk_idxs)
 
 
 def fused_topk_softmax(
     router_logits: torch.Tensor,
     topk: int,
     renormalize: bool = True,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    M, N = router_logits.shape
+    M, N = router_logits.shape  # num_tokens, num_experts
 
-    weights = torch.empty((M, topk), device=router_logits.device, dtype=torch.float32)
+    weights = torch.empty(
+        (M, topk), device=router_logits.device, dtype=router_logits.dtype
+    )
     indices = torch.empty((M, topk), device=router_logits.device, dtype=torch.int32)
 
-    BLOCK_N = triton.next_power_of_2(N)
+    BLOCK_N = triton.next_power_of_2(N)  # num_padded_experts
 
     topk_padded = triton.next_power_of_2(topk)
 
     grid = (M,)
-
-    _topk_softmax_kernel[grid](
-        logits_ptr=router_logits,
-        weights_ptr=weights,
-        indices_ptr=indices,
-        M=M,
-        N=N,
-        topk=topk,
-        topk_padded=topk_padded,
-        stride_lm=router_logits.stride(0),
-        stride_ln=router_logits.stride(1),
-        stride_wm=weights.stride(0),
-        stride_wk=weights.stride(1),
-        stride_im=indices.stride(0),
-        stride_ik=indices.stride(1),
-        BLOCK_N=BLOCK_N,
-        RENORM=renormalize,
-    )
+    num_stages = 2
+
+    if renormalize:
+        _topk_softmax_renorm_kernel[grid](
+            logits_ptr=router_logits,
+            weights_ptr=weights,
+            indices_ptr=indices,
+            M=M,
+            N=N,
+            topk=topk,
+            topk_padded=topk_padded,
+            stride_lm=router_logits.stride(0),
+            stride_ln=router_logits.stride(1),
+            stride_wm=weights.stride(0),
+            stride_wk=weights.stride(1),
+            stride_im=indices.stride(0),
+            stride_ik=indices.stride(1),
+            BLOCK_N=BLOCK_N,
+            num_stages=num_stages,
+        )
+    else:
+        _topk_softmax_kernel[grid](
+            logits_ptr=router_logits,
+            weights_ptr=weights,
+            indices_ptr=indices,
+            M=M,
+            N=N,
+            topk=topk,
+            topk_padded=topk_padded,
+            stride_lm=router_logits.stride(0),
+            stride_ln=router_logits.stride(1),
+            stride_wm=weights.stride(0),
+            stride_wk=weights.stride(1),
+            stride_im=indices.stride(0),
+            stride_ik=indices.stride(1),
+            BLOCK_N=BLOCK_N,
+            num_stages=num_stages,
+        )
 
     return weights, indices
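For reference, the renormalize flag in fused_topk_softmax now selects between the two kernels: _topk_softmax_renorm_kernel when True and _topk_softmax_kernel when False. A short illustrative call, assuming a CUDA device and the module path from this commit (the shapes and values are made up):

import torch

from vllm.model_executor.layers.fused_moe.gpt_oss_fused_router import (
    fused_topk_softmax,
)

# 4 tokens routed over 8 experts, keeping the top 2 experts per token.
router_logits = torch.randn(4, 8, device="cuda", dtype=torch.float32)

weights, indices = fused_topk_softmax(router_logits, topk=2, renormalize=True)

# weights: (4, 2) per-row routing weights (sum to 1 on the renorm path)
# indices: (4, 2) int32 expert ids for each token
print(weights.sum(dim=1))
print(indices)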
