[Mosaic GPU] Account for squeezed dims in partitioned axis of copy_gmem_to_smem.

justinjfu · Google-ML-Automation · commit 11788091197b · 2025-08-28T15:05:18.000-07:00
PiperOrigin-RevId: 800628443
diff --git a/jax/experimental/mosaic/gpu/launch_context.py b/jax/experimental/mosaic/gpu/launch_context.py
@@ -788,6 +788,9 @@ def async_copy(
       if gmem_ref is dst_ref:
         raise ValueError("Only GMEM -> SMEM copies can be collective")
     if partitioned is not None:
+      # Increment partitioned by the number of preceding squeezed dimensions.
+      partitioned = np.where(
+          np.cumsum(~np.array(is_squeezed)) == partitioned+1)[0][0]
       # Partitioning happens on the logical slice we extract from GMEM, so we do
       # it before we apply transforms.
       if collective is None:  # This implies non-gather TMA already.
diff --git a/jax/experimental/pallas/ops/gpu/blackwell_ragged_dot_mgpu.py b/jax/experimental/pallas/ops/gpu/blackwell_ragged_dot_mgpu.py
@@ -174,8 +174,7 @@ def _():
                 b_gmem.at[slice_k, slice_n],
                 b_smem.at[slot],
                 b_tma_barrier.at[slot],
-                # TODO: partitioned_axis doesn't account for squeezed dims so we have 2 instead of 1 here.
-                partitioned_axis=2 if collective else None,
+                partitioned_axis=1 if collective else None,
                 collective_axes=collective_axis,
             )
           lax.fori_loop(0, k_iters, _loop_body, None)
diff --git a/tests/pallas/mosaic_gpu_test.py b/tests/pallas/mosaic_gpu_test.py
@@ -3947,18 +3947,25 @@ def kernel(a_smem, b_smem, out_ref, acc_tmem, scratch_smem, barrier_ref):
     expected = x @ y
     np.testing.assert_allclose(result, expected, rtol=1e-3)
 
-  @parameterized.parameters((True,), (False,))
-  def test_copy_gmem_to_smem_partitioned(self, warp_level):
+  @parameterized.product(
+      warp_level=(True, False),
+      squeezed_index=(True, False),
+  )
+  def test_copy_gmem_to_smem_partitioned(self, warp_level, squeezed_index):
     self.skip_if_wg_semantics()
     block_size = (128, 128)
     partitioned_block_size = (block_size[0] // 2, block_size[1])
     a = jax.random.uniform(
         jax.random.key(0), shape=block_size, dtype=jnp.float32)
+    if squeezed_index:
+      a = a.reshape(1, *block_size)
     b = jax.random.uniform(
         jax.random.key(1), shape=block_size, dtype=jnp.float32)
     def kernel(a_gmem, b_gmem, out_gmem,
               a_smem, b_smem, out_smem,
               a_tma_barrier, b_tma_barrier, cluster_barrier):
+      if squeezed_index:
+        a_gmem = a_gmem.at[0]
       cluster_idx = lax.axis_index("x")
       out_slice = pl.ds(cluster_idx * partitioned_block_size[0],
                         partitioned_block_size[0])
@@ -4024,6 +4031,8 @@ def _():
         ),
     )
     result = f(a, b)
+    if squeezed_index:
+      a = a[0]
     np.testing.assert_array_equal(result, a + b)
 
   def test_arrive_wait_on_tc_barrier(self):