From 7004ee7bc1827a0ba88c6bdb4f252c82011dd8d8 Mon Sep 17 00:00:00 2001
From: Koushik Dutta
Date: Tue, 2 Dec 2025 10:51:47 -0800
Subject: [PATCH 1/3] Update block size logic for sm120 architectures

Update block size assignments for consumer Blackwell sm120 architecture.
---
 .../attention/triton_ops/extend_attention.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
index 62132a3403b..e3813f24ca9 100644
--- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -65,11 +65,20 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
         num_warps = 4
     else:
         if _is_cuda and CUDA_CAPABILITY[0] >= 9:
-            # Hopper architecture (H100, etc.)
-            if Lq <= 256:
-                BLOCK_M, BLOCK_N = (128, 64)
+            # sm120 consumer Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
+            if CUDA_CAPABILITY[0] == 12:
+                if Lq <= 128:
+                    BLOCK_M, BLOCK_N = (64, 128)
+                elif Lq <= 256:
+                    BLOCK_M, BLOCK_N = (64, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 32)
             else:
-                BLOCK_M, BLOCK_N = (32, 64)
+                # Hopper architecture (H100, etc.)
+                if Lq <= 256:
+                    BLOCK_M, BLOCK_N = (128, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 64)
         elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
             # Ampere architecture (A100, etc.)
             # sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)

From 386c7e8660df25b7d9a07fecd0e4d10f082c68d6 Mon Sep 17 00:00:00 2001
From: Koushik Dutta
Date: Tue, 2 Dec 2025 11:02:16 -0800
Subject: [PATCH 2/3] fix up if nesting per review

---
 .../attention/triton_ops/extend_attention.py | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
index e3813f24ca9..418e5ae6efc 100644
--- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -64,21 +64,20 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
         BLOCK_M, BLOCK_N = (64, 64)
         num_warps = 4
     else:
-        if _is_cuda and CUDA_CAPABILITY[0] >= 9:
+        if _is_cuda and CUDA_CAPABILITY[0] == 12:
             # sm120 consumer Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
-            if CUDA_CAPABILITY[0] == 12:
-                if Lq <= 128:
-                    BLOCK_M, BLOCK_N = (64, 128)
-                elif Lq <= 256:
-                    BLOCK_M, BLOCK_N = (64, 64)
-                else:
-                    BLOCK_M, BLOCK_N = (32, 32)
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (64, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
             else:
-                # Hopper architecture (H100, etc.)
-                if Lq <= 256:
-                    BLOCK_M, BLOCK_N = (128, 64)
-                else:
-                    BLOCK_M, BLOCK_N = (32, 64)
+                BLOCK_M, BLOCK_N = (32, 32)
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 9:
+            # Hopper architecture (H100, etc.)
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
         elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
             # Ampere architecture (A100, etc.)
             # sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)
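Taken together, PATCH 1 and PATCH 2 leave _get_block_sizes_for_extend_attention dispatching on the compute-capability major version, with the == 12 test ordered ahead of the >= 9 test; since 12 also satisfies >= 9, the reverse order would send sm120 GPUs down the Hopper path. The sketch below restates that selection logic as a standalone function and adds a back-of-envelope shared-memory estimate suggesting why the sm120 tiles are smaller. pick_block_sizes and approx_smem_bytes are hypothetical names, the fp16 tile math is an assumption, and the real kernel's usage also depends on num_stages and staging layout not visible in these hunks.

    # Standalone sketch of the selection logic as it stands after PATCH 2.
    # pick_block_sizes and approx_smem_bytes are illustrative helpers, not
    # sglang APIs; the real _get_block_sizes_for_extend_attention also sets
    # num_warps and has fast paths and Ampere branches these hunks omit.

    def pick_block_sizes(lq: int, capability_major: int, is_cuda: bool = True):
        """Mirror the if/elif chain after the nesting fix in PATCH 2."""
        if is_cuda and capability_major == 12:
            # sm120 Blackwell: ~100KB shared memory, so keep tiles small.
            if lq <= 128:
                return (64, 128)
            elif lq <= 256:
                return (64, 64)
            else:
                return (32, 32)
        elif is_cuda and capability_major >= 9:
            # Hopper (H100, etc.): more shared memory allows a wide BLOCK_M.
            if lq <= 256:
                return (128, 64)
            else:
                return (32, 64)
        # Ampere and older continue into branches not shown in these hunks.
        return (64, 64)

    def approx_smem_bytes(block_m: int, block_n: int, lq: int, lv: int,
                          bytes_per_elem: int = 2) -> int:
        """Rough per-CTA estimate: a Q tile (BLOCK_M x Lq), a K tile
        (BLOCK_N x Lq), and a V tile (BLOCK_N x Lv), assuming fp16."""
        return (block_m * lq + block_n * lq + block_n * lv) * bytes_per_elem

    # Ordering matters: 12 >= 9, so == 12 must be tested before >= 9,
    # which is exactly what the restructuring in PATCH 2 guarantees.
    assert pick_block_sizes(128, capability_major=12) == (64, 128)
    assert pick_block_sizes(512, capability_major=12) == (32, 32)
    assert pick_block_sizes(128, capability_major=9) == (128, 64)

    print(approx_smem_bytes(128, 64, 256, 256))  # 131072 bytes (~128KB)
    print(approx_smem_bytes(64, 64, 256, 256))   # 98304 bytes (~96KB)

On these rough numbers, Hopper's (128, 64) tile at Lq = Lv = 256 needs about 128KB, fine against the roughly 228KB of shared memory per SM on H100 but over the ~100KB noted in the patch comment for sm120, which is consistent with the smaller shapes chosen above.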
From 9d3f94c89712271378a9535d9d439726b8397203 Mon Sep 17 00:00:00 2001
From: Koushik Dutta
Date: Tue, 2 Dec 2025 11:19:09 -0800
Subject: [PATCH 3/3] rename sm120 from consumer to workstation branding

---
 .../sglang/srt/layers/attention/triton_ops/extend_attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
index 418e5ae6efc..2fb428fb2a2 100644
--- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -65,7 +65,7 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
         num_warps = 4
     else:
         if _is_cuda and CUDA_CAPABILITY[0] == 12:
-            # sm120 consumer Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
+            # sm120 workstation Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
             if Lq <= 128:
                 BLOCK_M, BLOCK_N = (64, 128)
             elif Lq <= 256:
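As a final sanity check, one can confirm at runtime which branch a given GPU takes by querying its compute capability. This is a hypothetical verification snippet, not part of the patch series; it assumes a PyTorch build with a visible CUDA device.

    # Hypothetical check script (not part of the patches); assumes PyTorch
    # with a CUDA device available.
    import torch

    major, minor = torch.cuda.get_device_capability()
    if major == 12:
        branch = "sm120 Blackwell branch (small tiles, ~100KB shared memory)"
    elif major >= 9:
        branch = "Hopper branch"
    else:
        branch = "older-architecture branches"
    print(f"sm{major}{minor} -> {branch}")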