
Commit f9230e2

Rifur13 authored and Google-ML-Automation committed
[Pallas MGPU] Use the cp.async.bulk instruction for large contiguous copies.

Currently we’re limited to 256 elements per dimension when using the tensormap in cp.async.bulk.tensor.
PiperOrigin-RevId: 838501119
1 parent 4592cfb commit f9230e2
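
The trade-off behind this change: cp.async.bulk.tensor drives the copy through a TMA tensormap, which caps each slice dimension at 256 elements, while plain cp.async.bulk moves a flat byte range with no per-dimension cap, so contiguous slices of any size can take the simpler path. A minimal standalone sketch of the resulting dispatch (plain Python; is_contiguous and pick_copy_instruction are illustrative names, not part of the commit):

def is_contiguous(slice_shape, strides):
  # A slice is contiguous iff each dimension's stride equals the product of
  # all faster-varying dimensions (size-1 dimensions may have any stride).
  expected_stride = 1
  for dim, stride in zip(reversed(slice_shape), reversed(strides)):
    if dim != 1 and stride != expected_stride:
      return False
    expected_stride *= dim
  return True

def pick_copy_instruction(slice_shape, strides):
  # The real check also requires no reduction op, no swizzle, and a
  # non-collective, non-partitioned copy (see launch_context.py below).
  if is_contiguous(slice_shape, strides):
    return "cp.async.bulk"  # flat byte copy, no per-dimension size limit
  if max(slice_shape) > 256:
    raise ValueError("Async copies only support copying <=256 elements along each dimension")
  return "cp.async.bulk.tensor"  # tiled TMA copy through a tensormap

# (8192,) is contiguous, so it now takes the bulk path instead of raising:
assert pick_copy_instruction((8192,), (1,)) == "cp.async.bulk"
# A two-row slice of a row-major (64, 128) array is strided, so it still
# goes through the tensormap:
assert pick_copy_instruction((2, 64), (128, 1)) == "cp.async.bulk.tensor"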

File tree

3 files changed: +193 -16 lines changed

jax/experimental/mosaic/gpu/launch_context.py

Lines changed: 103 additions & 10 deletions
@@ -859,11 +859,6 @@ def partition_dim(dim: int, idx: ir.Value, num_chunks: int):
           f" {collective_size}"
       )

-    if max(slice_shape) > 256:
-      raise ValueError(
-          "Async copies only support copying <=256 elements along each"
-          " dimension"
-      )
     if (zeroth_bw := slice_shape[-1] * element_bitwidth) % 128 != 0:
       raise ValueError(
           "Async copies require the number of bits copied along the last"
@@ -1264,6 +1259,109 @@ def async_copy(
       return

     assert gather_indices is None  # Only tiled TMA handled below.
+
+    def check_contiguous_slice(slice_shape, strides):
+      assert strides[-1] == 1
+
+      expected_stride = 1
+      for dim, stride in zip(reversed(slice_shape), reversed(strides), strict=True):
+        if dim != 1 and stride != expected_stride:
+          return False
+        expected_stride *= dim
+
+      return True
+
+    gmem_ref = _find_kernel_argument_for_gmem_ref(gmem_ref)
+    ref = gmem_ref
+    for t in gmem_transform:
+      ref = t.apply(ref)
+    ref_ty = ir.MemRefType(ref.type)
+    strides, _ = ref_ty.get_strides_and_offset()
+
+    # Use the simpler copy instruction for contiguous transfers.
+    is_raw_contiguous_copy = (
+        check_contiguous_slice(slice_shape, strides)
+        and reduction_op is None
+        and (
+            swizzle is None or swizzle == mgpu_dialect.SwizzlingMode.kNoSwizzle
+        )
+        and collective_size == 1
+        and partitioned is None
+    )
+    if isinstance(predicate, _DefaultPredicate):
+      predicate = utils.single_thread_predicate(utils.ThreadSubset.WARPGROUP)
+    if predicate is None:
+      predicate = c(1, ir.IntegerType.get_signless(1))
+
+    smem_ptr = utils.memref_ptr(smem_ref, memory_space=3)
+    if is_raw_contiguous_copy:
+      index = ir.IndexType.get()
+      i64 = ir.IntegerType.get_signless(64)
+      base, base_offset, *_ = memref.extract_strided_metadata(gmem_ref)
+
+      dyn_offset = base_offset
+      for dyn_idx, stride in zip(dyn_base_indices, strides):
+        step = arith.muli(dyn_idx, c(stride, index))
+        dyn_offset = arith.addi(dyn_offset, step)
+      dyn_offset_i64 = arith.index_cast(i64, dyn_offset)
+
+      gmem_base_ptr = utils.getelementptr(
+          utils.memref_ptr(base), [dyn_offset_i64], src_ref_ty.element_type
+      )
+
+      if gmem_peer_id is not None:
+        assert gmem_peer_id is not GLOBAL_BROADCAST
+        self._ensure_nvshmem_decls()
+        if not isinstance(gmem_peer_id, ir.Value):
+          gmem_peer_id = c(gmem_peer_id, i32)
+
+        gmem_base_ptr = llvm.call(
+            gmem_base_ptr.type,
+            [gmem_base_ptr, gmem_peer_id],
+            [],
+            [],
+            callee="nvshmem_ptr",
+        )
+        gmem_base_ptr = llvm.addrspacecast(
+            ir.Type.parse("!llvm.ptr<1>"), gmem_base_ptr
+        )
+
+      if gmem_ref is src_ref:
+        assert barrier is not None  # for pytype
+        barrier_ptr = barrier.get_ptr()
+        if arrive:
+          nvvm.mbarrier_arrive_expect_tx(
+              barrier_ptr, transfer_bytes, predicate=predicate
+          )
+        llvm.inline_asm(
+            ir.Type.parse("!llvm.void"),
+            [predicate, smem_ptr, gmem_base_ptr, transfer_bytes, barrier_ptr],
+            """
+            @$0 cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [$1], [$2], $3, [$4];
+            """,
+            "b,l,l,r,l",
+            has_side_effects=True,
+        )
+      else:
+        llvm.inline_asm(
+            ir.Type.parse("!llvm.void"),
+            [predicate, gmem_base_ptr, smem_ptr, transfer_bytes],
+            """
+            @$0 cp.async.bulk.global.shared::cta.bulk_group [$1], [$2], $3;
+            """,
+            "b,l,l,r",
+            has_side_effects=True,
+        )
+      if arrive:
+        nvvm.cp_async_bulk_commit_group()
+      return
+
+    # Below are tiled TMA copies using a tensormap.
+    if max(slice_shape) > 256:
+      raise ValueError(
+          "Async copies only support copying <=256 elements along each"
+          " dimension"
+      )
     tma_desc = self._get_tma_desc(
         gmem_ref, gmem_transform, gmem_peer_id,
         tuple(slice_shape), swizzle, reduction_op,
@@ -1272,11 +1370,6 @@ def async_copy(
     rev_dyn_base_indices = [
         arith.index_cast(i32, idx) for idx in reversed(dyn_base_indices)
     ]
-    if isinstance(predicate, _DefaultPredicate):
-      predicate = utils.single_thread_predicate(utils.ThreadSubset.WARPGROUP)
-    if predicate is None:
-      predicate = c(1, ir.IntegerType.get_signless(1))
-    smem_ptr = utils.memref_ptr(smem_ref, memory_space=3)
     if gmem_ref is src_ref:
       assert barrier is not None  # for pytype
       barrier_ptr = barrier.get_ptr()

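Two details of the bulk path above are worth calling out. The two inline-asm strings complete differently: the GMEM-to-SMEM instruction (cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes) posts the transferred byte count to an mbarrier, which is why mbarrier_arrive_expect_tx precedes it, while the SMEM-to-GMEM instruction (cp.async.bulk.global.shared::cta.bulk_group) is tracked through a bulk async-group that cp_async_bulk_commit_group closes. And since no tensormap is involved, the GMEM address is computed by hand: the element offset is the memref's static offset plus the dot product of the dynamic base indices with the strides, which getelementptr then scales by the element type. A small sketch of that arithmetic (gmem_element_offset is an illustrative name; row-major strides assumed):

def gmem_element_offset(base_offset, base_indices, strides):
  # Mirrors the dyn_offset loop above: the offset of the slice's first
  # element, counted in elements rather than bytes.
  offset = base_offset
  for idx, stride in zip(base_indices, strides):
    offset += idx * stride
  return offset

# For the (10, 10, 512) test case below, the slice x.at[4, 4] starts at
# element 4 * 5120 + 4 * 512 = 22528 of the flat buffer.
assert gmem_element_offset(0, (4, 4, 0), (5120, 512, 1)) == 22528
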
tests/pallas/gpu_pallas_distributed_test.py

Lines changed: 47 additions & 0 deletions
@@ -29,6 +29,7 @@
 from jax.experimental.pallas.ops.gpu.reduce_scatter_mgpu import reduce_scatter
 from jax.experimental.pallas.ops.gpu.all_gather_mgpu import all_gather
 import jax.numpy as jnp
+import math
 import numpy as np


@@ -306,6 +307,52 @@ def _store():
     ref = lax.broadcasted_iota(jnp.int32, (128, 128), 1)
     np.testing.assert_array_equal(y, np.concat([ref, ref], axis=0))

+  def test_contiguous_copy_tma(self):
+    if jax.process_index() >= 2:
+      return  # Only 2 processes needed.
+
+    shape = (512,)
+
+    def kernel(y_ref, smem_ref, sem):
+      dev_id = lax.axis_index("y")
+      other_dev_id = 1 - dev_id
+
+      # Device ID must be an int32.
+      zero = jnp.int32(0)
+
+      @pl.when(dev_id == zero)
+      def _store():
+        output = plgpu.layout_cast(
+            jnp.arange(math.prod(shape)).reshape(shape),
+            plgpu.Layout.WG_STRIDED(shape, vec_size=1),
+        )
+        smem_ref[...] = output
+        plgpu.commit_smem()
+        plgpu.copy_smem_to_gmem(smem_ref, plgpu.remote_ref(y_ref, (zero, dev_id)))
+        plgpu.copy_smem_to_gmem(smem_ref, plgpu.remote_ref(y_ref, (zero, other_dev_id)))
+        plgpu.wait_smem_to_gmem(0)
+      pl.semaphore_signal(sem, 1, device_id=(zero, other_dev_id))
+      pl.semaphore_wait(sem)
+
+    kernel_call = pl.pallas_call(
+        kernel,
+        out_specs=pl.BlockSpec(memory_space=plgpu.GMEM),
+        out_shape=jax.ShapeDtypeStruct(shape, jnp.int32),
+        scratch_shapes=[
+            plgpu.SMEM(shape, jnp.int32),
+            plgpu.SemaphoreType.REGULAR,
+        ],
+    )
+    mesh = jtu.create_mesh((1, 2), ("x", "y"))
+    y = jax.jit(
+        jax.shard_map(
+            kernel_call, mesh=mesh, in_specs=(), out_specs=P("y"), check_vma=False,
+        )
+    )()
+    y = multihost_utils.process_allgather(y, tiled=True)
+    ref = jnp.arange(math.prod(shape)).reshape(shape)
+    np.testing.assert_array_equal(y, np.concat([ref, ref], axis=0))
+

 class PallasCallMultimemTest(TestCase):

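The same bulk path is also reachable on a single device whenever a contiguous copy exceeds the old 256-elements-per-dimension limit. A minimal single-device sketch mirroring the test_copy_gmem_to_smem_contiguous cases in mosaic_gpu_test.py below; the tests run through a self.pallas_call wrapper that targets the Mosaic GPU backend, so using plain pl.pallas_call here is an assumption and your JAX version may need explicit backend selection:

import functools
import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu

shape = (8192,)  # One contiguous dimension, far beyond the old 256-element cap.

@functools.partial(
    pl.pallas_call,  # assumed configured for Mosaic GPU, as self.pallas_call is
    out_shape=jax.ShapeDtypeStruct(shape, jnp.float32),
    out_specs=pl.BlockSpec(memory_space=plgpu.GMEM),
    in_specs=(pl.BlockSpec(memory_space=plgpu.GMEM),),
    scratch_shapes=[plgpu.SMEM(shape, jnp.float32), plgpu.Barrier()],
)
def copy_kernel(x_ref, o_ref, scratch_ref, barrier_ref):
  # Contiguous GMEM -> SMEM transfer: eligible for cp.async.bulk after this commit.
  plgpu.copy_gmem_to_smem(x_ref, scratch_ref, barrier_ref)
  plgpu.barrier_wait(barrier_ref)
  plgpu.commit_smem()
  # Contiguous SMEM -> GMEM transfer back out, tracked via the bulk async-group.
  plgpu.copy_smem_to_gmem(scratch_ref, o_ref)
  plgpu.wait_smem_to_gmem(0)

x = jnp.arange(8192, dtype=jnp.float32)
np.testing.assert_array_equal(copy_kernel(x), x)
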
tests/pallas/mosaic_gpu_test.py

Lines changed: 43 additions & 6 deletions
@@ -676,6 +676,41 @@ def kernel(x_ref, o_ref_gmem, o_ref_gmem_alias, scratch_ref):
     output_val = x.reshape(-1, 128).sum(axis=0)
     np.testing.assert_array_equal(output, output_val)

+  @parameterized.parameters(
+      ((64, 128,), (slice(2, 3), slice(0, 128)), jnp.bfloat16),
+      ((256,), (...,), jnp.bfloat16),
+      ((64, 128,), (...,), jnp.bfloat16),
+      ((3, 64, 1, 128), (0, slice(0, 32), 0, slice(0, 128)), jnp.float32),
+      ((3, 64, 1, 128), (...,), jnp.float32),
+      ((3, 64, 128), (...,), jnp.float32),
+      ((10, 10, 512,), (4, 4), jnp.bfloat16),
+      ((10, 1024,), (4,), jnp.bfloat16),
+      ((8192,), (...,), jnp.bfloat16),
+      ((8192,), (slice(4096, 8192),), jnp.bfloat16),
+      ((8192,), (slice(4096, 8192),), jnp.float32),
+  )
+  def test_copy_gmem_to_smem_contiguous(self, shape, indexer, dtype):
+    @functools.partial(
+        self.pallas_call,
+        out_shape=jax.ShapeDtypeStruct(shape, dtype),
+        out_specs=pl.BlockSpec(memory_space=plgpu.GMEM),
+        in_specs=(pl.BlockSpec(memory_space=plgpu.GMEM),),
+        scratch_shapes=[plgpu.SMEM(shape, dtype), plgpu.Barrier()],
+        grid=(1,),
+    )
+    def kernel(x_ref_gmem, o_ref, scratch_ref, barrier_ref):
+      plgpu.copy_gmem_to_smem(
+          x_ref_gmem.at[indexer], scratch_ref.at[indexer], barrier_ref
+      )
+      plgpu.barrier_wait(barrier_ref)
+      scratch_ref[indexer] = scratch_ref[indexer] + 1
+      plgpu.commit_smem()
+      plgpu.copy_smem_to_gmem(scratch_ref.at[indexer], o_ref.at[indexer])
+      plgpu.wait_smem_to_gmem(0)
+
+    x = jax.random.normal(jax.random.key(0), shape, dtype=dtype)
+    np.testing.assert_allclose(kernel(x)[indexer], x[indexer] + 1.0)
+
   @parameterized.named_parameters(
       {"testcase_name": "1d_none",
        "shape": (256,), "indexers": (slice(0, 128), slice(None, 32))},
14821517
def test_program_id_in_block_spec(self):
14831518
@functools.partial(
14841519
self.pallas_call,
1485-
in_specs=(pl.BlockSpec((2, 128), lambda i: (pl.program_id(0), i)),),
1486-
out_specs=pl.BlockSpec((2, 128), lambda i: (pl.program_id(0), i)),
1487-
out_shape=jax.ShapeDtypeStruct([2, 128], jnp.int32),
1520+
in_specs=(pl.BlockSpec((1, 128), lambda i: (pl.program_id(0), i)),),
1521+
out_specs=pl.BlockSpec((1, 128), lambda i: (pl.program_id(0), i)),
1522+
out_shape=jax.ShapeDtypeStruct([2, 256], jnp.int32),
14881523
grid=2,
14891524
)
14901525
def kernel(x_ref, o_ref):
14911526
o_ref[...] = x_ref[...]
14921527

1493-
x = jnp.arange(2 * 128, dtype=jnp.int32).reshape([2, 128])
1528+
x = jnp.arange(2 * 256, dtype=jnp.int32).reshape([2, 256])
14941529
np.testing.assert_array_equal(kernel(x), x)
14951530

14961531
def test_num_programs(self):
@@ -2528,8 +2563,10 @@ def kernel(x_ref, o_ref):
25282563
ptx = output()
25292564
self.assertIn(".file", ptx)
25302565
self.assertIn(".loc", ptx)
2531-
[path] = re.findall(r'.file\s+\d+\s+"(.+)"', ptx)
2532-
self.assertEndsWith(__file__, path)
2566+
paths = re.findall(r'.file\s+\d+\s+"(.+)"', ptx)
2567+
paths = [p for p in paths if p != "-"]
2568+
self.assertLen(paths, 1)
2569+
self.assertEndsWith(__file__, paths[0])
25332570

25342571
def test_collective_arrival_count(self):
25352572
def kernel(dst, collective_barrier):
