Skip to content

Commit 6488677

Browse files
authored
Merge branch 'main' into syr/attn_tp_config
2 parents 56d66e3 + e0253ee commit 6488677

File tree

84 files changed

+2356
-1045
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

84 files changed

+2356
-1045
lines changed

.gitattributes

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
99
docs/source/blogs/media/tech_blog3_mla_absorb.png filter=lfs diff=lfs merge=lfs -text
1010
tests/integration/test_input_files/*.png filter=lfs diff=lfs merge=lfs -text
1111
tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
12+
docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
13+
docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
14+
docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text

.github/workflows/pr-check.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,5 +69,5 @@ jobs:
6969
- name: Validate PR Checklist
7070
env:
7171
PR_BODY: ${{ github.event.pull_request.body }}
72-
ENFORCE_PR_HAS_CHECKLIST: true
72+
ENFORCE_PR_HAS_CHECKLIST: false
7373
run: python .github/scripts/pr_checklist_check.py

cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ struct GroupedMaskedScheduler
379379
}
380380
};
381381

382-
// Need to keep the same as the one in tests/unittest/_torch/thop/deep_gemm_tests.py
382+
// Need to keep the same as the one in tests/unittest/_torch/thop/parallel/deep_gemm_tests.py
383383
template <typename T_offset, typename T_index>
384384
__host__ __device__ __forceinline__ T_offset compute_padded_offset(T_offset offset, T_index problem_idx)
385385
{

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,22 @@ struct ScaledAccPerRowBias
373373
static constexpr bool IsPerRowBiasSupported = true;
374374
};
375375

376+
template<
377+
class ElementOutput_,
378+
class ElementCompute_,
379+
class ElementBias_ = ElementOutput_,
380+
class ElementScalar_ = ElementCompute_,
381+
int AlignmentBias_ = 128 / cute::sizeof_bits_v<ElementBias_>,
382+
FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
383+
>
384+
struct ScaledAccPerColBias
385+
: ScaledAcc<ElementOutput_, ElementCompute_, ElementScalar_, RoundStyle_>
386+
{
387+
using ElementBias = ElementBias_;
388+
static constexpr int AlignmentBias = AlignmentBias_;
389+
static constexpr bool IsPerColBiasSupported = true;
390+
};
391+
376392
template<
377393
class GmemLayoutTagOut,
378394
class ElementOutput,
@@ -393,6 +409,26 @@ struct ScaledAccPerRowBiasPerColScaleScatter
393409
static constexpr bool IsAuxOutSupported = true;
394410
};
395411

412+
template<
413+
class GmemLayoutTagOut,
414+
class ElementOutput,
415+
class ElementCompute,
416+
class ElementBias = ElementOutput,
417+
class ElementScale = ElementCompute,
418+
class ElementScalar = ElementCompute,
419+
int AlignmentBias = 128 / cute::sizeof_bits_v<ElementBias>,
420+
int AlignmentOutput = 128 / cute::sizeof_bits_v<ElementOutput>,
421+
FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
422+
>
423+
struct ScaledAccPerColBiasPerRowScaleScatter
424+
: ScaledAccPerColBias<ElementOutput, ElementCompute, ElementBias, ElementScalar, AlignmentBias, RoundStyle>
425+
{
426+
using ElementAux = ElementOutput;
427+
using GmemLayoutTagAux = GmemLayoutTagOut;
428+
static constexpr int AlignmentAux = AlignmentOutput;
429+
static constexpr bool IsAuxOutSupported = true;
430+
};
431+
396432
// D = alpha * acc + per-row bias
397433
template<
398434
class CtaTileShapeMNK,
@@ -410,6 +446,22 @@ using Sm90ScaledAccPerRowBiasPtrArray =
410446
Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias *, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias
411447
>;
412448

449+
template<
450+
class CtaTileShapeMNK,
451+
class ElementOutput,
452+
class ElementCompute,
453+
class ElementBias = ElementOutput,
454+
class ElementScalar = ElementCompute,
455+
int AlignmentBias = 128 / sizeof_bits_v<ElementBias>,
456+
FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
457+
>
458+
using Sm90ScaledAccPerColBiasPtrArray =
459+
Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // alpha * acc + bias
460+
Sm90ScalarBroadcastPtrArray<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
461+
Sm90AccFetch, // acc
462+
Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias *, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias
463+
>;
464+
413465
template<
414466
class CtaTileShapeMNK,
415467
class EpilogueTile,
@@ -433,6 +485,29 @@ using Sm90ScaledAccPerRowBiasPerColScaleScatterPtrArray =
433485
>
434486
>;
435487

488+
template<
489+
class CtaTileShapeMNK,
490+
class EpilogueTile,
491+
class StrideOutput,
492+
class SmemLayoutAtom,
493+
class CopyOpR2S,
494+
class ElementOutput,
495+
class ElementCompute,
496+
class ElementBias = ElementOutput,
497+
class ElementScale = ElementCompute,
498+
class ElementScalar = ElementCompute,
499+
int AlignmentBias = 128 / cute::sizeof_bits_v<ElementBias>,
500+
int AlignmentOutput = 128 / cute::sizeof_bits_v<ElementOutput>,
501+
FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
502+
>
503+
using Sm90ScaledAccPerColBiasPerRowScaleScatterPtrArray =
504+
Sm90EVT<Sm90ScatterPtrArray<EpilogueTile, StrideOutput, SmemLayoutAtom, CopyOpR2S, ElementOutput, AlignmentOutput, RoundStyle>, // scatter store
505+
Sm90EVT<Sm90Compute<multiplies, ElementCompute, ElementCompute, RoundStyle>, // scale * (alpha * acc + bias)
506+
Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar *, ElementCompute, Stride<_1,_0,int64_t>, 1>, // scale
507+
Sm90ScaledAccPerColBiasPtrArray<CtaTileShapeMNK, ElementCompute, ElementCompute, ElementBias, ElementScalar, AlignmentBias, RoundStyle> // alpha * acc + bias
508+
>
509+
>;
510+
436511
template <
437512
int StagesC,
438513
int StagesD,
@@ -556,6 +631,129 @@ struct FusionCallbacks<
556631

557632
};
558633

634+
template <
635+
int StagesC,
636+
int StagesD,
637+
int FragmentSize,
638+
bool ReuseSmemC,
639+
bool DelayTmaStore,
640+
int NumEpilogueWarpGroups,
641+
class GmemLayoutTagOut,
642+
class ElementOutput,
643+
class ElementCompute,
644+
class ElementBias,
645+
class ElementScale,
646+
class ElementScalar,
647+
int AlignmentBias,
648+
int AlignmentOutput,
649+
FloatRoundStyle RoundStyle,
650+
class CtaTileShapeMNK,
651+
class EpilogueTile,
652+
class SmemLayoutAtom,
653+
class CopyOpR2S
654+
>
655+
struct FusionCallbacks<
656+
epilogue::Sm90PtrArrayTmaWarpSpecialized<StagesC,
657+
StagesD,
658+
FragmentSize,
659+
ReuseSmemC,
660+
DelayTmaStore,
661+
NumEpilogueWarpGroups
662+
>,
663+
fusion::ScaledAccPerColBiasPerRowScaleScatter<GmemLayoutTagOut,
664+
ElementOutput,
665+
ElementCompute,
666+
ElementBias,
667+
ElementScale,
668+
ElementScalar,
669+
AlignmentBias,
670+
AlignmentOutput,
671+
RoundStyle>,
672+
CtaTileShapeMNK,
673+
EpilogueTile,
674+
SmemLayoutAtom,
675+
CopyOpR2S
676+
> : Sm90ScaledAccPerColBiasPerRowScaleScatterPtrArray<
677+
CtaTileShapeMNK,
678+
EpilogueTile,
679+
cutlass::gemm::TagToStrideC_t<GmemLayoutTagOut>,
680+
SmemLayoutAtom, CopyOpR2S,
681+
ElementOutput, ElementCompute, ElementBias, ElementScale, ElementScalar,
682+
AlignmentBias, AlignmentOutput, RoundStyle
683+
> {
684+
685+
using StrideOutput = cutlass::gemm::TagToStrideC_t<GmemLayoutTagOut>;
686+
687+
using Impl = Sm90ScaledAccPerColBiasPerRowScaleScatterPtrArray<
688+
CtaTileShapeMNK,
689+
EpilogueTile,
690+
StrideOutput,
691+
SmemLayoutAtom, CopyOpR2S,
692+
ElementOutput, ElementCompute, ElementBias, ElementScale, ElementScalar,
693+
AlignmentBias, AlignmentOutput, RoundStyle
694+
>;
695+
using Operation = fusion::ScaledAccPerColBiasPerRowScaleScatter<
696+
GmemLayoutTagOut,
697+
ElementOutput,
698+
ElementCompute,
699+
ElementBias,
700+
ElementScale,
701+
ElementScalar,
702+
AlignmentBias,
703+
AlignmentOutput,
704+
RoundStyle>;
705+
706+
struct Arguments {
707+
708+
using StrideAlpha = Stride<_0,_0,int64_t>;
709+
ElementScalar alpha = ElementScalar(1);
710+
ElementScalar const* alpha_ptr{};
711+
ElementScalar const* const* alpha_ptr_array{};
712+
StrideAlpha dAlpha{};
713+
714+
using StrideBias = Stride<_0,_1,int64_t>;
715+
ElementBias const* const* bias_ptr{};
716+
StrideBias dBias{};
717+
718+
using StrideScale = Stride<_1,_0,int64_t>;
719+
ElementScalar const* const* scale_ptr_array{};
720+
StrideScale dScale{};
721+
722+
// Nested args not usable due to a compiler bug with constexpr evaluation
723+
// using ScatterArguments = typename Sm90ScatterPtrArray<EpilogueTile, StrideOutput, SmemLayoutAtom, CopyOpR2S, ElementOutput, AlignmentOutput, RoundStyle>::Arguments;
724+
// ScatterArguments scatter{};
725+
726+
ElementOutput* ptr_out = nullptr;
727+
StrideOutput dOut = {};
728+
int const* const* ptr_index{}; // per-group pointer to the scatter index
729+
int index_modulo{}; // modulo used to transform the index before store
730+
int shape_override = -1; // override value for contiguous output tensor mode
731+
bool use_reduction = true;
732+
733+
operator typename Impl::Arguments() const {
734+
return
735+
{ // unary op: reduce(scale * (beta * C + (alpha * acc)))
736+
{ // binary op: scale * (beta * C + (alpha * acc))
737+
{ scale_ptr_array, ElementScalar(1), dScale }, // leaf args : scale broadcast
738+
{ // ternary op : alpha * acc + bias
739+
{{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha
740+
{}, // leaf args : acc
741+
{bias_ptr, ElementBias(0), dBias}, // leaf args : bias
742+
{} // ternary args : multiply_add
743+
}, // end binary op
744+
{} // binary args: multiply
745+
}, // end binary op
746+
//scatter // unary args: reduce
747+
{ ptr_out, dOut, ptr_index, index_modulo, shape_override, use_reduction }
748+
}; // end unary op
749+
}
750+
};
751+
752+
// Ctor inheritance
753+
using Impl::Impl;
754+
755+
};
756+
559757
} // namespace cutlass::epilogue::fusion
560758

561759
// clang-format on

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -440,6 +440,7 @@ struct CutlassGemmConfig
440440
};
441441

442442
EpilogueFusionType epilogue_fusion_type = EpilogueFusionType::NONE;
443+
bool swap_ab = false;
443444

444445
CutlassGemmConfig() = default;
445446

@@ -511,7 +512,8 @@ struct CutlassGemmConfig
511512
<< "\n\tcluster shape ID: " << (int) cluster_shape
512513
<< "\n\tmainloop sched: " << (int) mainloop_schedule << "\n\tepi sched: " << (int) epilogue_schedule
513514
<< "\n\tenable cuda kernel: " << (enableCudaKernel ? "true" : "false")
514-
<< "\n\tepilogue fusion type: " << (int) epilogue_fusion_type;
515+
<< "\n\tepilogue fusion type: " << (int) epilogue_fusion_type
516+
<< "\n\tswap_ab: " << (swap_ab ? "true" : "false");
515517
}
516518
else if (tile_config_sm80 != tensorrt_llm::cutlass_extensions::CutlassTileConfig::ChooseWithHeuristic)
517519
{
@@ -544,7 +546,8 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf
544546
<< ", epilogue_schedule_enum: " << int(config.epilogue_schedule)
545547
<< ", cluster_shape_enum: " << int(config.cluster_shape)
546548
<< ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false")
547-
<< ", epilogue_fusion_type: " << int(config.epilogue_fusion_type);
549+
<< ", epilogue_fusion_type: " << int(config.epilogue_fusion_type)
550+
<< ", swap_ab: " << (config.swap_ab ? "true" : "false");
548551
}
549552
else
550553
{

cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h

Lines changed: 32 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,24 @@ struct TmaWarpSpecializedGroupedGemmInput
7474
static_assert(std::is_same_v<cutlass::layout::RowMajor, TransposeLayoutTag<cutlass::layout::ColumnMajor>>);
7575
static_assert(std::is_same_v<cutlass::layout::ColumnMajor, TransposeLayoutTag<cutlass::layout::RowMajor>>);
7676

77-
// Layout for A and B is transposed and then swapped in the implementation
78-
// This uses B^T * A^T = (A * B)^T to get a better layout for the GEMM
79-
using LayoutA = TransposeLayoutTag<cutlass::layout::RowMajor>; // Layout type for A matrix operand
80-
using LayoutB = TransposeLayoutTag<cutlass::layout::ColumnMajor>; // Layout type for B matrix operand
81-
using LayoutC = TransposeLayoutTag<cutlass::layout::RowMajor>; // Layout type for C matrix operand
82-
using LayoutD = TransposeLayoutTag<cutlass::layout::RowMajor>; // Layout type for D matrix operand
77+
// These are always the layout of A & B matrices, activations and weights will be assigned to either A or B based on
78+
// swap_ab
79+
using LayoutA = cutlass::layout::RowMajor;
80+
using LayoutB = cutlass::layout::ColumnMajor;
81+
82+
// When using Swap A&B we need to transpose the output matrix
83+
using LayoutC = cutlass::layout::RowMajor;
84+
using LayoutD = cutlass::layout::RowMajor;
85+
using LayoutC_T = TransposeLayoutTag<LayoutC>;
86+
using LayoutD_T = TransposeLayoutTag<LayoutD>;
87+
88+
using StrideA = std::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
89+
using StrideB = std::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
90+
91+
using StrideC = std::remove_pointer_t<cutlass::detail::TagToStrideC_t<LayoutC*>>;
92+
using StrideD = std::remove_pointer_t<cutlass::detail::TagToStrideC_t<LayoutD*>>;
93+
using StrideC_T = std::remove_pointer_t<cutlass::detail::TagToStrideC_t<LayoutC_T*>>;
94+
using StrideD_T = std::remove_pointer_t<cutlass::detail::TagToStrideC_t<LayoutD_T*>>;
8395

8496
constexpr static int NVFP4BlockScaleVectorSize = 16;
8597
constexpr static int MXFPXBlockScaleVectorSize = 32;
@@ -110,13 +122,6 @@ struct TmaWarpSpecializedGroupedGemmInput
110122
return (dim + alignment - 1) / alignment * alignment;
111123
}
112124

113-
using StrideA
114-
= std::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutA*>>; // Use B because they will be swapped
115-
using StrideB
116-
= std::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutB*>>; // Use A because they will be swapped
117-
using StrideC = std::remove_pointer_t<cutlass::detail::TagToStrideC_t<LayoutC*>>;
118-
using StrideD = std::remove_pointer_t<cutlass::detail::TagToStrideC_t<LayoutD*>>;
119-
120125
#ifdef ENABLE_FP8
121126
template <class T>
122127
constexpr static bool IsFP8_v = std::is_same_v<T, __nv_fp8_e4m3> || std::is_same_v<T, __nv_fp8_e5m2>;
@@ -131,26 +136,29 @@ struct TmaWarpSpecializedGroupedGemmInput
131136

132137
using ProblemShape = cutlass::gemm::GroupProblemShape<cute::Shape<int64_t, int64_t, int64_t>>;
133138

139+
bool swap_ab = false;
134140
ProblemShape shape_info{};
135-
StrideA* stride_a = nullptr;
136-
StrideB* stride_b = nullptr;
141+
void* stride_act = nullptr;
142+
void* stride_weight = nullptr;
137143

138-
void const** ptr_a = nullptr;
139-
void const** ptr_b = nullptr;
144+
void const** ptr_act = nullptr;
145+
void const** ptr_weight = nullptr;
140146

141147
// C is currently the same in both epilogues
142-
StrideC* stride_c = nullptr;
148+
void* stride_c = nullptr;
143149
void const** ptr_c = nullptr;
144150

145151
// D is used in all cases except fused finalize
146-
StrideD* stride_d = nullptr;
152+
void* stride_d = nullptr;
147153
void** ptr_d = nullptr;
148154

149155
struct FusedFinalizeEpilogue
150156
{
157+
using StrideFinalOutput_T = cutlass::detail::TagToStrideC_t<LayoutD_T>;
151158
using StrideFinalOutput = cutlass::detail::TagToStrideC_t<LayoutD>;
152159

153160
void* ptr_final_output = nullptr;
161+
StrideFinalOutput_T stride_final_output_transposed{};
154162
StrideFinalOutput stride_final_output{};
155163

156164
void const** ptr_bias = nullptr;
@@ -179,11 +187,11 @@ struct TmaWarpSpecializedGroupedGemmInput
179187
using ElementSF = uint8_t;
180188
using MXFPXElementSF = ElementSF; // Just an alias for now
181189
using NVFP4ElementSF = ElementSF; // Just an alias for now
182-
ElementSF const** fpX_block_scaling_factors_A = nullptr;
183-
ElementSF const** fpX_block_scaling_factors_B = nullptr;
190+
ElementSF const** fpX_block_scaling_factors_act = nullptr;
191+
ElementSF const** fpX_block_scaling_factors_weight = nullptr;
184192

185-
void* fpX_block_scaling_factors_stride_A = nullptr;
186-
void* fpX_block_scaling_factors_stride_B = nullptr;
193+
void* fpX_block_scaling_factors_stride_act = nullptr;
194+
void* fpX_block_scaling_factors_stride_weight = nullptr;
187195

188196
enum class FpXBlockScalingType
189197
{
@@ -229,7 +237,7 @@ struct TmaWarpSpecializedGroupedGemmInput
229237

230238
bool isValid() const
231239
{
232-
return stride_a != nullptr && ptr_a != nullptr;
240+
return stride_act != nullptr && ptr_act != nullptr;
233241
}
234242

235243
void setFinalizeFusionParams(void* final_output, int hidden_size, int num_output_tokens, bool use_reduction);

0 commit comments

Comments
 (0)