From 7004ee7bc1827a0ba88c6bdb4f252c82011dd8d8 Mon Sep 17 00:00:00 2001
From: Koushik Dutta
Date: Tue, 2 Dec 2025 10:51:47 -0800
Subject: [PATCH 1/3] Update block size logic for sm120 architectures

Update block size assignments for consumer Blackwell sm120 architecture.
---
 .../attention/triton_ops/extend_attention.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
index 62132a3403b..e3813f24ca9 100644
--- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -65,11 +65,20 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
         num_warps = 4
     else:
         if _is_cuda and CUDA_CAPABILITY[0] >= 9:
-            # Hopper architecture (H100, etc.)
-            if Lq <= 256:
-                BLOCK_M, BLOCK_N = (128, 64)
+            # sm120 consumer Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
+            if CUDA_CAPABILITY[0] == 12:
+                if Lq <= 128:
+                    BLOCK_M, BLOCK_N = (64, 128)
+                elif Lq <= 256:
+                    BLOCK_M, BLOCK_N = (64, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 32)
             else:
-                BLOCK_M, BLOCK_N = (32, 64)
+                # Hopper architecture (H100, etc.)
+                if Lq <= 256:
+                    BLOCK_M, BLOCK_N = (128, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 64)
         elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
             # Ampere architecture (A100, etc.)
             # sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)

From 386c7e8660df25b7d9a07fecd0e4d10f082c68d6 Mon Sep 17 00:00:00 2001
From: Koushik Dutta
Date: Tue, 2 Dec 2025 11:02:16 -0800
Subject: [PATCH 2/3] fix up if nesting per review

---
 .../attention/triton_ops/extend_attention.py | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
index e3813f24ca9..418e5ae6efc 100644
--- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -64,21 +64,20 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
         BLOCK_M, BLOCK_N = (64, 64)
         num_warps = 4
     else:
-        if _is_cuda and CUDA_CAPABILITY[0] >= 9:
+        if _is_cuda and CUDA_CAPABILITY[0] == 12:
             # sm120 consumer Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
-            if CUDA_CAPABILITY[0] == 12:
-                if Lq <= 128:
-                    BLOCK_M, BLOCK_N = (64, 128)
-                elif Lq <= 256:
-                    BLOCK_M, BLOCK_N = (64, 64)
-                else:
-                    BLOCK_M, BLOCK_N = (32, 32)
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (64, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
             else:
-                # Hopper architecture (H100, etc.)
-                if Lq <= 256:
-                    BLOCK_M, BLOCK_N = (128, 64)
-                else:
-                    BLOCK_M, BLOCK_N = (32, 64)
+                BLOCK_M, BLOCK_N = (32, 32)
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 9:
+            # Hopper architecture (H100, etc.)
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
         elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
             # Ampere architecture (A100, etc.)
             # sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)
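Taken together, PATCH 1 and PATCH 2 leave _get_block_sizes_for_extend_attention dispatching on the compute-capability major version, with the == 12 test ordered ahead of the >= 9 test; since 12 also satisfies >= 9, the reverse order would send sm120 GPUs down the Hopper path. The sketch below restates that selection logic as a standalone function and adds a back-of-envelope shared-memory estimate suggesting why the sm120 tiles are smaller. pick_block_sizes and approx_smem_bytes are hypothetical names, the fp16 tile math is an assumption, and the real kernel's usage also depends on num_stages and staging layout not visible in these hunks.

    # Standalone sketch of the selection logic as it stands after PATCH 2.
    # pick_block_sizes and approx_smem_bytes are illustrative helpers, not
    # sglang APIs; the real _get_block_sizes_for_extend_attention also sets
    # num_warps and has fast paths and Ampere branches these hunks omit.

    def pick_block_sizes(lq: int, capability_major: int, is_cuda: bool = True):
        """Mirror the if/elif chain after the nesting fix in PATCH 2."""
        if is_cuda and capability_major == 12:
            # sm120 Blackwell: ~100KB shared memory, so keep tiles small.
            if lq <= 128:
                return (64, 128)
            elif lq <= 256:
                return (64, 64)
            else:
                return (32, 32)
        elif is_cuda and capability_major >= 9:
            # Hopper (H100, etc.): more shared memory allows a wide BLOCK_M.
            if lq <= 256:
                return (128, 64)
            else:
                return (32, 64)
        # Ampere and older continue into branches not shown in these hunks.
        return (64, 64)

    def approx_smem_bytes(block_m: int, block_n: int, lq: int, lv: int,
                          bytes_per_elem: int = 2) -> int:
        """Rough per-CTA estimate: a Q tile (BLOCK_M x Lq), a K tile
        (BLOCK_N x Lq), and a V tile (BLOCK_N x Lv), assuming fp16."""
        return (block_m * lq + block_n * lq + block_n * lv) * bytes_per_elem

    # Ordering matters: 12 >= 9, so == 12 must be tested before >= 9,
    # which is exactly what the restructuring in PATCH 2 guarantees.
    assert pick_block_sizes(128, capability_major=12) == (64, 128)
    assert pick_block_sizes(512, capability_major=12) == (32, 32)
    assert pick_block_sizes(128, capability_major=9) == (128, 64)

    print(approx_smem_bytes(128, 64, 256, 256))  # 131072 bytes (~128KB)
    print(approx_smem_bytes(64, 64, 256, 256))   # 98304 bytes (~96KB)

On these rough numbers, Hopper's (128, 64) tile at Lq = Lv = 256 needs about 128KB, fine against the roughly 228KB of shared memory per SM on H100 but over the ~100KB noted in the patch comment for sm120, which is consistent with the smaller shapes chosen above.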
From 9d3f94c89712271378a9535d9d439726b8397203 Mon Sep 17 00:00:00 2001
From: Koushik Dutta
Date: Tue, 2 Dec 2025 11:19:09 -0800
Subject: [PATCH 3/3] rename sm120 from consumer to workstation branding

---
 .../sglang/srt/layers/attention/triton_ops/extend_attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
index 418e5ae6efc..2fb428fb2a2 100644
--- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -65,7 +65,7 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
         num_warps = 4
     else:
         if _is_cuda and CUDA_CAPABILITY[0] == 12:
-            # sm120 consumer Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
+            # sm120 workstation Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
             if Lq <= 128:
                 BLOCK_M, BLOCK_N = (64, 128)
             elif Lq <= 256:
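As a final sanity check, one can confirm at runtime which branch a given GPU takes by querying its compute capability. This is a hypothetical verification snippet, not part of the patch series; it assumes a PyTorch build with a visible CUDA device.

    # Hypothetical check script (not part of the patches); assumes PyTorch
    # with a CUDA device available.
    import torch

    major, minor = torch.cuda.get_device_capability()
    if major == 12:
        branch = "sm120 Blackwell branch (small tiles, ~100KB shared memory)"
    elif major >= 9:
        branch = "Hopper branch"
    else:
        branch = "older-architecture branches"
    print(f"sm{major}{minor} -> {branch}")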