[None][fix] Fix W4A8 MoE kernel issue (NVIDIA#7072)

yuhyao · web-flow · commit 8ac7dec62371 · 2025-08-20T06:52:47.000-04:00
Signed-off-by: yuhyao <827623970@qq.com>
diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
@@ -1524,6 +1524,11 @@ struct CollectiveMmaArrayMixedInput<
     CUTLASS_DEVICE void tensormaps_cp_fence_release(
         TensorMapStorage& shared_tensormaps, cute::tuple<TMs...> const& input_tensormaps)
     {
+        if (cute::elect_one_sync())
+        {
+            cute::tma_desc_commit_group();
+            cute::tma_desc_wait_group();
+        }
         // Entire warp must do this (i.e. it's aligned)
         tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
         tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);