@@ -72,6 +72,10 @@ def all_gather_lhs_matmul(
         f"{n_shard_per_sm_n=} must be divisible by {block_n=}"
     )
   num_sms_m = max_num_sms // sm_n_tile
+  if num_sms_m < (m_shard // block_m) and sm_n_tile > 1:
+    # We never synchronize the N SMs across the different steps of the M
+    # loop, so they can start overwriting each other's data.
+    raise NotImplementedError("The kernel has races when M is large and sm_n_tile > 1")
 
   swizzle = min(
       plgpu.find_swizzle(block_k * jnp.finfo(element_type).bits, "lhs"),
@@ -99,22 +103,12 @@ def _m_loop(idx):
       (mi,) = idx
       m_tile_slice = pl.ds(mi * block_m, block_m)
 
-      # For some reason ptxas spills if we unroll the loop over k
-      copy_block = 32
-      @pl.loop(0, k, step=copy_block)
-      def _k_copy_loop(ki):
-        k_slice = pl.ds(ki, copy_block)
-        scratch_ref[0, :, k_slice] = lhs_ref[m_tile_slice, k_slice]
-
-      @pl.loop(0, num_devices)
-      def _device_loop(device_offset):
+      def device_step(lhs_source_ref, next_scratch_slot, device_offset):
+        # Loop invariant: lhs_source_ref is ready to be used
         device_m_slice = pl.ds(
             lax.rem(device_offset + dev_id, num_devices) * m_shard, block_m
         )
 
-        scratch_slot = device_offset
-        next_scratch_slot = scratch_slot + 1
-
         def compute(n_tile_slice, send: bool):
           @functools.partial(
               pl.run_scoped, acc_ref=plgpu.ACC((block_m, block_n))
@@ -143,7 +137,7 @@ def k_loop(idxs, lhs_smem, rhs_smem):
             plgpu.wgmma(acc_ref, lhs_smem, rhs_smem)
           if send:
             # TODO(giorgioa): Send only for first sm_n.
-            @pl.when(next_scratch_slot <= num_devices - 1)
+            @pl.when(next_scratch_slot < num_devices - 1)
             def _():
               (ki,) = idxs
               k_slice = pl.ds(ki * block_k, block_k)
@@ -153,7 +147,7 @@ def _():
           # We only delay release by 1 step, so we need to wait for the
           # previous copies.
           plgpu.wait_smem_to_gmem(1, wait_read_only=True)
-          k_loop(scratch_ref.at[scratch_slot], rhs_ref.at[..., n_tile_slice])
+          k_loop(lhs_source_ref, rhs_ref.at[..., n_tile_slice])
           if send:
             # Make sure the copy is done and signal the receiving device.
             plgpu.wait_smem_to_gmem(0, wait_read_only=False)
@@ -176,6 +170,11 @@ def _n_loop(ni):
         # Wait for the next scratch to arrive --- see the device loop invariant.
         pl.semaphore_wait(received_sem)
 
+      device_step(lhs_ref.at[m_tile_slice], 0, 0)
+      @pl.loop(1, num_devices)
+      def _device_loop(device_offset):
+        device_step(scratch_ref.at[device_offset - 1], device_offset, device_offset)
+
       # Make sure all copies are fully done.
       plgpu.wait_smem_to_gmem(0, wait_read_only=True)
 
@@ -185,7 +184,7 @@ def _n_loop(ni):
         # The output, with its M dimension all-gathered.
         jax.ShapeDtypeStruct((axis_size * m_shard, n_shard), dtype),
         # The scratch buffer used for the all-gather.
-        jax.ShapeDtypeStruct((num_sms_m, num_devices, block_m, k), dtype),
+        jax.ShapeDtypeStruct((num_sms_m, num_devices - 1, block_m, k), dtype),
       ],
       scratch_shapes=[
         plgpu.SMEM((block_m, block_n), dtype, transforms=transforms),
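
For intuition on the new schedule: step 0 of device_step consumes the local shard straight from lhs_ref, and step i > 0 consumes the shard the previous step deposited in scratch slot i - 1. That is why the scratch buffer shrinks to num_devices - 1 slots and the send guard tightens to next_scratch_slot < num_devices - 1 (the last step has nothing left to forward). Below is a minimal plain-Python model of that ring schedule, not the kernel itself: NumPy arrays stand in for the GMEM refs and list indexing stands in for the remote copies; only device_step's shape mirrors the diff, everything else is a hypothetical stand-in.

# Plain-Python model of the ring all-gather schedule in the patch.
# Hypothetical stand-ins: NumPy arrays replace GMEM refs, list indexing
# replaces the remote DMA; only device_step's signature mirrors the kernel.
import numpy as np

num_devices = 4
m_shard, k = 2, 3
# Each device starts with only its own LHS shard (filled with its id).
lhs_shards = [np.full((m_shard, k), d, np.float32) for d in range(num_devices)]
# num_devices - 1 scratch slots per device, as in the patched ShapeDtypeStruct.
scratch = [np.zeros((num_devices - 1, m_shard, k), np.float32)
           for _ in range(num_devices)]
outputs = [np.zeros((num_devices * m_shard, k), np.float32)
           for _ in range(num_devices)]

def device_step(dev_id, lhs_source, next_scratch_slot, device_offset):
  # Loop invariant: lhs_source already holds the shard for this step.
  src_dev = (device_offset + dev_id) % num_devices  # lax.rem(...) in the kernel
  outputs[dev_id][src_dev * m_shard:(src_dev + 1) * m_shard] = lhs_source
  # Forward the shard one hop around the ring, except on the last step
  # (the patched guard: next_scratch_slot < num_devices - 1).
  if next_scratch_slot < num_devices - 1:
    scratch[(dev_id - 1) % num_devices][next_scratch_slot] = lhs_source

for step in range(num_devices):        # all devices advance in lockstep
  for dev_id in range(num_devices):
    source = lhs_shards[dev_id] if step == 0 else scratch[dev_id][step - 1]
    device_step(dev_id, source, step, step)

for out in outputs:                    # every device gathered every shard
  for d in range(num_devices):
    assert (out[d * m_shard:(d + 1) * m_shard] == d).all()

Running the model confirms that num_devices - 1 slots suffice for every device to gather all shards, which is exactly the diff's loop invariant: the source ref is ready to be used when device_step starts.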