@@ -27,6 +27,8 @@ static constexpr int MaxNumTopExperts = 8;
static constexpr int MaxNumExperts = 128;
static constexpr int MaxNumTokensSingleCluster = NumBlocksPerCluster * NumThreads;
static constexpr int MaxNumTokensSingleClusterScores = NumBlocksPerCluster * NumWarps;
+ static constexpr int NumThreadsSingleBlock = MaxNumExperts;
+ static constexpr int BlockKernelMaxNumTokens = 4;

template <typename DataType, typename InputType, int VecSize, bool DoSoftmaxBeforeTopK>
__forceinline__ __device__ void routingTopKExperts(cg::thread_block_tile<WarpSize> const& warp,
@@ -75,6 +77,156 @@ __forceinline__ __device__ void routingTopKExperts(cg::thread_block_tile<WarpSiz
    }
}

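+ // Single-block routing kernel for small batches (mNumTokens <= BlockKernelMaxNumTokens):
+ // one thread block handles all tokens. Each warp computes the top-K experts for one token,
+ // each thread owns one expert for the count/offset computation, and a cub::BlockScan turns
+ // the padded per-expert counts into offsets of the permuted index layout.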
+ template <typename KernelParams, bool DoSoftmaxBeforeTopK = false>
+ __global__ void __launch_bounds__(NumThreadsSingleBlock) routingIndicesBlockKernel(KernelParams params)
+ {
+     // types used in this kernel
+     using OutputT = typename KernelParams::OutputT;
+     using InputT = typename KernelParams::InputT;
+     using BaseType = std::conditional_t<KernelParams::DoSoftmaxBeforeTopK, float, InputT>;
+     using TypePacked = PackedScoreIdx<BaseType>;
+
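+     // Thread/warp roles: threadIdx.x indexes one expert; the warp index selects the token
+     // this warp computes top-K for (only the first mNumTokens warps hold valid tokens).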
+     int32_t const warpIdx = __shfl_sync(0xffffffff, threadIdx.x / WarpSize, 0);
+     int32_t const laneIdx = cutlass::arch::LaneId();
+     int32_t const expert = threadIdx.x;
+     auto scoreOffset = warpIdx * params.mNumExperts;
+     bool validToken = warpIdx < params.mNumTokens;
+
+     static constexpr int VecSize = MaxNumExperts / WarpSize;
+     static constexpr int totalExpertCounts = BlockKernelMaxNumTokens * MaxNumExperts;
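+     // Per-token rows of MaxNumExperts entries: smemKIdx[token * MaxNumExperts + expert] stores the
+     // top-K slot of a selected expert (or -1), smemOffset stores the token's rank among the tokens
+     // routed to that expert (filled for local experts only).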
+     __shared__ int8_t __attribute((aligned(128))) smemOffset[totalExpertCounts];
+     __shared__ int8_t __attribute((aligned(128))) smemKIdx[totalExpertCounts];
+
+     using Scan = cub::BlockScan<int32_t, NumThreadsSingleBlock, cub::BLOCK_SCAN_WARP_SCANS>;
+     __shared__ typename Scan::TempStorage tempStorage;
+
+     auto block = cg::this_thread_block();
+     auto warp = cg::tiled_partition<WarpSize>(block);
+
+     for (int i = threadIdx.x; i < totalExpertCounts; i += blockDim.x)
+     {
+         smemOffset[i] = int8_t{-1};
+         smemKIdx[i] = int8_t{-1};
+     }
+     __syncthreads();
+
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+     // then wait on primary grid
+     if constexpr (KernelParams::UsePdl)
+     {
+         cudaGridDependencySynchronize();
+     }
+ #endif
+
+     if (params.mPtrScores != nullptr)
+     {
+         // in this case, each warp represents a token
+         BaseType score[VecSize];
+         int32_t idx[VecSize];
+
+         BaseType warpTopKScore[MaxNumTopExperts];
+         int32_t warpTopKExpertIdx[MaxNumTopExperts];
+
+         BaseType minScore = BaseType{-INFINITY};
+         if (validToken)
+         {
+             routingTopKExperts<BaseType, InputT, VecSize, KernelParams::DoSoftmaxBeforeTopK>(warp, score, idx,
+                 warpTopKScore, warpTopKExpertIdx, laneIdx, params.mNumExperts, params.mTopK,
+                 params.mPtrScores + scoreOffset, params.mNormTopkProb);
+
+             if (laneIdx < params.mTopK)
+             {
+                 int offset = warpIdx * MaxNumExperts + warpTopKExpertIdx[laneIdx];
+                 smemKIdx[offset] = static_cast<int8_t>(laneIdx);
+                 if (params.mPtrExpertWeights != nullptr)
+                 {
+                     params.mPtrExpertWeights[warpIdx * params.mTopK + laneIdx] = OutputT{warpTopKScore[laneIdx]};
+                 }
+             }
+         } // end if (validToken)
+     }
+     __syncthreads();
+
+     // set local experts
+     auto localExpertIdx = expert - params.mLocalExpertsStartIdx;
+     auto isLocalExpert = localExpertIdx >= 0 && localExpertIdx < params.mNumLocalExperts
+         && (localExpertIdx & params.mLocalExpertsStrideLog2) == 0;
+     // Get the count of each expert and the offset for each token
+     int accExpertCount = 0;
+
+     if (isLocalExpert)
+     {
+         int offset = expert;
+         for (int j = 0; j < BlockKernelMaxNumTokens; j++)
+         {
+             if (smemKIdx[offset] >= 0)
+             {
+                 smemOffset[offset] = static_cast<int8_t>(accExpertCount);
+                 accExpertCount++;
+             }
+             offset += MaxNumExperts;
+         }
+     }
+     __syncthreads();
+     // Get the number of CTAs and the offset for each CTA
+     const int32_t numCta = divUpLog2<int32_t>(accExpertCount, params.mPaddingLog2);
+     int32_t ctaOffset = 0;
+     int32_t numNonExitingCtas;
+     Scan(tempStorage).ExclusiveSum(numCta, ctaOffset, numNonExitingCtas);
+
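+     // Scan the padded per-expert counts to get each expert's start offset in the permuted index array.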
+     int32_t expertScanCounts = 0;
+     Scan(tempStorage).ExclusiveSum(divUpMulLog2(accExpertCount, params.mPaddingLog2), expertScanCounts);
+     __syncthreads();
+
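+     // For each CTA assigned to this (local) expert, record the local expert it processes and the
+     // limit of valid (non-padded) entries for that CTA.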
+     if (isLocalExpert)
+     {
+         for (int cta = 0; cta < numCta; ++cta)
+         {
+             const int32_t localExpertIdx = (expert - params.mLocalExpertsStartIdx) >> params.mLocalExpertsStrideLog2;
+             params.mPtrCtaIdxXyToBatchIdx[ctaOffset + cta] = localExpertIdx;
+             params.mPtrCtaIdxXyToMnLimit[ctaOffset + cta]
+                 = min(mulLog2<int32_t>(ctaOffset + cta + 1, params.mPaddingLog2),
+                     mulLog2<int32_t>(ctaOffset, params.mPaddingLog2) + accExpertCount);
+         }
+     }
+
+     // at this point, we can write out padded count
+     if (threadIdx.x == 0)
+     {
+         const int32_t permutedIdxSize = mulLog2<int32_t>(numNonExitingCtas, params.mPaddingLog2);
+         params.mPtrPermutedIdxSize[0] = permutedIdxSize;
+         params.mPtrNumNonExitingCtas[0] = numNonExitingCtas;
+     }
+
+ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+ #if !defined(FDL_PROFILE) || FDL_PROFILE == 0
+     // we can trigger the next kernel at this point
+     if constexpr (KernelParams::UsePdl)
+     {
+         cudaTriggerProgrammaticLaunchCompletion();
+     }
+ #endif
+ #endif
+
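+     // Each thread (one expert) walks all tokens: for every (token, expert) pair selected in top-K,
+     // compute its permuted index from the expert's base offset plus the token's rank within the
+     // expert, then fill the expanded->permuted and permuted->token mappings.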
+     for (int tokenIdx = 0; tokenIdx < params.mNumTokens; tokenIdx++)
+     {
+         int offset = tokenIdx * MaxNumExperts + threadIdx.x;
+         if (smemKIdx[offset] >= 0)
+         {
+             int const expandedIdx = tokenIdx * params.mTopK + smemKIdx[offset];
+             int const offsetWithinExpert = static_cast<int>(smemOffset[offset]);
+             int const offsetForExpert = expertScanCounts;
+             int const permutedIdx = isLocalExpert ? offsetForExpert + offsetWithinExpert : int32_t{-1};
+
+             params.mPtrExpandedIdxToPermutedIdx[expandedIdx] = permutedIdx;
+             if (isLocalExpert)
+             {
+                 params.mPtrPermutedIdxToTokenIdx[permutedIdx] = tokenIdx;
+             }
+         }
+     }
+ }
+
template <typename KernelParams>
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
__global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(NumThreads)
@@ -234,18 +386,27 @@ void run(Data const& data, void* stream)
        data.mNumExperts % 4 == 0, "Routing kernel expects #experts %d to be a multiple of 4.", data.mNumExperts);
    TLLM_CHECK_WITH_INFO(data.mPaddingLog2 < 8, "Routing kernel expects padding log2 < 8, got %d", data.mPaddingLog2);

+     bool const useSingleBlock = data.mNumTokens <= BlockKernelMaxNumTokens;
    bool const useSingleCluster
        = data.mNumTokens <= (data.mPtrScores != nullptr ? MaxNumTokensSingleClusterScores : MaxNumTokensSingleCluster);

-     if (!useSingleCluster)
+     if (!useSingleCluster && !useSingleBlock)
    {
        TLLM_CHECK_WITH_INFO(
            data.mPtrExpertIdx != nullptr, "When #tokens is large, `mPtrExpertIdx` is a required input.");
        TLLM_CHECK_WITH_INFO(
            data.mPtrExpertCounts != nullptr, "When #tokens is large, `mPtrExpertCounts` is a required input.");
    }

-     if (useSingleCluster)
+     if (useSingleBlock)
+     {
+         // @TODO: For now we use the single-block kernel for cases with at most 4 tokens.
+         // We will tune this threshold further based on performance.
+         LAUNCH_ROUTING_WITH_EXTRA_FLAG(data, false, routingIndicesBlockKernel, 1, NumThreadsSingleBlock,
+             /*smemSize=*/0, // No dynamic smem
+             stream, data.mDoSoftmaxBeforeTopK, /*forceFloatInput=*/false);
+     }
+     else if (useSingleCluster)
    {
        LAUNCH_ROUTING_WITH_EXTRA_FLAG(data, false, routingIndicesClusterKernel, NumBlocksPerCluster, NumThreads,
            /*smemSize=*/0, // No dynamic smem