Fix LoRA masking

gnovack · gnovack · commit 7dcee158606b · 2025-12-05T23:12:20.000Z
Signed-off-by: gnovack <gnovack@amazon.com>
diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
@@ -246,10 +246,10 @@ __device__ void _moe_align_block_size_small_batch_expert(
     int32_t rank_post_pad =
         tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
 
-    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
-    sorted_token_ids[sorted_token_ids_offset + rank_post_pad] +=
-        ((i - numel) * mask);
-    tokens_cnts[threadIdx.x * num_experts + expert_id] += mask;
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;
+      ++tokens_cnts[threadIdx.x * num_experts + expert_id];
+    }
   }
 }
 
@@ -268,11 +268,12 @@ __device__ void _count_and_sort_expert_tokens(
       continue;
     }
 
-    int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];
-    int32_t rank_post_pad = atomicAdd(
-        &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], mask);
-    sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] +=
-        ((i - numel) * mask);
+    if (token_mask == nullptr || token_mask[i / topk_num]) {
+      int32_t rank_post_pad = atomicAdd(
+          &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);
+      sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =
+          i;
+    }
   }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -246,10 +246,10 @@ __device__ void _moe_align_block_size_small_batch_expert(`
`246`	`246`	`int32_t rank_post_pad =`
`247`	`247`	`tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];`
`248`	`248`
`249`		`- int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];`
`250`		`- sorted_token_ids[sorted_token_ids_offset + rank_post_pad] +=`
`251`		`- ((i - numel) * mask);`
`252`		`- tokens_cnts[threadIdx.x * num_experts + expert_id] += mask;`
	`249`	`+ if (token_mask == nullptr || token_mask[i / topk_num]) {`
	`250`	`+ sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i;`
	`251`	`+ ++tokens_cnts[threadIdx.x * num_experts + expert_id];`
	`252`	`+ }`
`253`	`253`	`}`
`254`	`254`	`}`
`255`	`255`
`@@ -268,11 +268,12 @@ __device__ void _count_and_sort_expert_tokens(`
`268`	`268`	`continue;`
`269`	`269`	`}`
`270`	`270`
`271`		`- int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num];`
`272`		`- int32_t rank_post_pad = atomicAdd(`
`273`		`- &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], mask);`
`274`		`- sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] +=`
`275`		`- ((i - numel) * mask);`
	`271`	`+ if (token_mask == nullptr || token_mask[i / topk_num]) {`
	`272`	`+ int32_t rank_post_pad = atomicAdd(`
	`273`	`+ &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1);`
	`274`	`+ sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] =`
	`275`	`+ i;`
	`276`	`+ }`
`276`	`277`	`}`
`277`	`278`	`}`
`278`	`279`