
Commit 9ae979b

Add basic support for cross-attention to XQA dispatch in support of Whisper
Signed-off-by: Dom Brown <[email protected]>
1 parent 953f4fd commit 9ae979b

File tree

7 files changed: +188 −106 lines changed


cpp/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -358,7 +358,7 @@ endif()
 
 setup_sanitizers()
 
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -lineinfo")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 if(FAST_MATH)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")

cpp/tensorrt_llm/common/attentionOp.cpp

Lines changed: 13 additions & 1 deletion

@@ -30,6 +30,7 @@
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
 #include <algorithm>
 #include <cstdint>
+#include <cstdlib>
 #include <type_traits>
 
 using namespace tensorrt_llm::kernels;
@@ -285,6 +286,13 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
     xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale;
     xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf;
 
+    xqaParams.num_tokens = generationsParams.num_tokens;
+    // Cross attention parameters.
+    xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
+    // xqaParams.cross_kv = generationsParams.cross_kv;
+    // xqaParams.cross_kv_length = generationsParams.cross_kv_length;
+    // xqaParams.num_encoder_tokens = generationsParams.num_encoder_tokens;
+
     return true;
 }
 
@@ -2210,6 +2218,10 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams<T> const& params, cud
         {
             TLLM_CHECK_WITH_INFO(false, "No available kernels are found for FP4 output.");
         }
+        else
+        {
+            TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase. mEnableXQA: %d", mEnableXQA);
+        }
     }
 
     // This is the number of kv tokens that q needs to visit, but excluding one as it will be processed before the kv
@@ -2728,7 +2740,7 @@ int AttentionOp::initialize() noexcept
        !useCustomMask() || mEnableContextFMHA, "Only Context FMHA supports custom mask input currently.");
    }
 
-   mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled) && !mCrossAttention
+   mEnableXQA = (mEnableXQA || mIsSpecDecodingEnabled)
        && (mType == nvinfer1::DataType::kHALF || mType == nvinfer1::DataType::kBF16) && mUseKVCache;
 
    if (mEnableXQA)
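
The net effect of the initialize() change is that cross-attention no longer disqualifies XQA dispatch. Below is a minimal standalone sketch of the revised gate; the AttentionConfig struct and xqaEligible() helper are illustrative stand-ins for the real AttentionOp members (mEnableXQA, mIsSpecDecodingEnabled, mCrossAttention, mType, mUseKVCache), not TensorRT-LLM code.

// Illustrative-only sketch of the XQA enable condition after this commit.
#include <cstdio>

namespace sketch
{

enum class DataType
{
    kHALF,
    kBF16,
    kFLOAT
};

struct AttentionConfig
{
    bool enableXQA = true;
    bool isSpecDecodingEnabled = false;
    bool crossAttention = false; // stand-in for mCrossAttention
    bool useKVCache = true;
    DataType type = DataType::kHALF;
};

// Before this commit the gate also required !crossAttention; now only the dtype and
// KV-cache checks remain, so Whisper-style cross-attention layers can reach XQA dispatch.
bool xqaEligible(AttentionConfig const& c)
{
    return (c.enableXQA || c.isSpecDecodingEnabled)
        && (c.type == DataType::kHALF || c.type == DataType::kBF16) && c.useKVCache;
}

} // namespace sketch

int main()
{
    sketch::AttentionConfig crossAttn;
    crossAttn.crossAttention = true; // e.g. a Whisper decoder cross-attention layer
    std::printf("XQA eligible: %s\n", sketch::xqaEligible(crossAttn) ? "true" : "false");
    return 0;
}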

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h

Lines changed: 12 additions & 0 deletions

@@ -106,6 +106,13 @@ struct XQAParams
 
     void* quant_q_buffer_ptr = nullptr;
 
+    int num_tokens;
+    // for cross attention
+    int32_t const* encoder_input_lengths = nullptr;
+    // void const* cross_kv = nullptr;
+    // int32_t cross_kv_length = 0;
+    // int32_t num_encoder_tokens = 0;
+
     cudaStream_t stream = 0;
 
     std::string toString() const
@@ -175,6 +182,11 @@ struct XQAParams
            << "total_num_input_tokens :" << total_num_input_tokens << std ::endl
            << "is_fp8_output :" << (is_fp8_output ? "true" : "false") << std ::endl
            << "fp8_out_scale :" << fp8_out_scale << std ::endl
+           << "encoder_input_lengths: " << encoder_input_lengths
+           << std::endl
+           //<< "cross_kv: " << cross_kv << std::endl
+           //<< "cross_kv_length: " << cross_kv_length << std::endl
+           //<< "num_encoder_tokens: " << num_encoder_tokens << std::endl
            << "stream :" << stream;
 
        return ss.str();
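
One detail worth noting about the toString() addition: it streams the encoder_input_lengths pointer itself, so the log line will contain a device address rather than the per-sequence lengths. A small self-contained illustration of that stream behaviour (the values here are made up):

// Streaming an int32_t const* into a stringstream prints the pointer value, not the array contents.
#include <cstdint>
#include <iostream>
#include <sstream>

int main()
{
    int32_t lengths[] = {1500, 1500}; // illustrative encoder lengths
    int32_t const* encoder_input_lengths = lengths;

    std::stringstream ss;
    ss << "encoder_input_lengths: " << encoder_input_lengths << std::endl;
    std::cout << ss.str(); // e.g. "encoder_input_lengths: 0x7ffee2c0a4b0"
    return 0;
}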

cpp/tensorrt_llm/kernels/gptKernels.h

Lines changed: 5 additions & 0 deletions

@@ -186,6 +186,7 @@ struct BuildDecoderInfoParams
     float2* rotaryEmbeddingCoeffCache;
     // Dynamic scaling;
     int rotaryEmbeddingMaxPositions;
+    bool isCrossAttention{false};
 
     bool isBuildDecoderInfoKernelNeeded()
     {
@@ -213,6 +214,10 @@ struct BuildDecoderInfoParams
         {
             return true;
         }
+        if (isCrossAttention)
+        {
+            return true;
+        }
         // Other cases don't need to call buildDecoderInfo kernel.
         return false;
     }
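
With the new flag defaulted to false, existing call sites keep their behaviour, while a caller preparing a cross-attention layer can set it to force the buildDecoderInfo kernel to run. A simplified, illustrative-only stand-in for BuildDecoderInfoParams showing that decision path:

// Sketch only: fmhaTileCounterNeeded is a placeholder for the struct's other triggers.
#include <iostream>

struct BuildDecoderInfoParamsSketch
{
    bool fmhaTileCounterNeeded = false;
    bool isCrossAttention = false; // flag introduced by this commit

    bool isBuildDecoderInfoKernelNeeded() const
    {
        if (fmhaTileCounterNeeded)
        {
            return true;
        }
        if (isCrossAttention)
        {
            return true;
        }
        // Other cases don't need to call buildDecoderInfo kernel.
        return false;
    }
};

int main()
{
    BuildDecoderInfoParamsSketch params;
    params.isCrossAttention = true; // e.g. a Whisper cross-attention layer
    std::cout << std::boolalpha << params.isBuildDecoderInfoKernelNeeded() << '\n'; // prints: true
    return 0;
}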

cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h

Lines changed: 44 additions & 39 deletions

@@ -1348,9 +1348,10 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
     int const batch_idx = blockIdx.z;
 
     // The decoder sequence length.
-    int const decoder_seq_len = params.seq_lens[batch_idx];
+    // Spec decoding not supported for cross-attention at the moment so we can set 1 and batch_idx here
+    int const decoder_seq_len = params.generation_phase ? 1 : params.seq_lens[batch_idx];
     // The decoder sequence offset.
-    int const decoder_seq_offset = params.cu_seq_lens[batch_idx];
+    int const decoder_seq_offset = params.generation_phase ? batch_idx : params.cu_seq_lens[batch_idx];
     // The decoder cache sequence length (includes the current input).
     int const decoder_cache_seq_len = params.cache_seq_lens[batch_idx];
     // The encoder sequence length.
@@ -1411,45 +1412,49 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
         }
     }
 
-    // Encoder tokens (i.e. KV tokens).
-    if (head_idx == (kv_head_idx * params.qheads_per_kv_head) && token_idx < encoder_seq_len
-        && store_encoder_kv_cache && params.kv_cache_buffer.data != nullptr)
-    {
-        // The global token idx in all sequences.
-        int global_token_idx = token_idx + encoder_seq_offset;
-
-        // The memory offset.
-        auto const src_k_idx = static_cast<size_t>(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv;
-        auto const src_v_idx
-            = static_cast<size_t>(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv;
-
-        // Only load K,V tokens from encoder qkv input.
-        auto k = *reinterpret_cast<VecT const*>(&params.cross_kv_input[src_k_idx]);
-        auto v = *reinterpret_cast<VecT const*>(&params.cross_kv_input[src_v_idx]);
-
-        // The kv cache pointers.
-        auto k_cache_block_ptr
-            = reinterpret_cast<TCache*>(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx));
-        auto v_cache_block_ptr
-            = reinterpret_cast<TCache*>(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx));
-        // The vector idx in the cache block.
-        auto block_vec_idx
-            = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx);
-
-        // Store K and V to the cache.
-        // INT8/FP8 kv cache.
-        if constexpr (sizeof(TCache) == 1)
-        {
-            // The element index inside the block.
-            auto block_elt_idx = block_vec_idx * ELTS_PER_VEC;
-            // Store 8bits kv cache.
-            mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant);
-            mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant);
-        }
-        else
-        {
-            reinterpret_cast<VecT*>(k_cache_block_ptr)[block_vec_idx] = k;
-            reinterpret_cast<VecT*>(v_cache_block_ptr)[block_vec_idx] = v;
+    if (!params.generation_phase)
+    {
+        // Encoder tokens (i.e. KV tokens).
+        if (head_idx == (kv_head_idx * params.qheads_per_kv_head) && token_idx < encoder_seq_len
+            && store_encoder_kv_cache && params.kv_cache_buffer.data != nullptr)
+        {
+            // The global token idx in all sequences.
+            int global_token_idx = token_idx + encoder_seq_offset;
+
+            // The memory offset.
+            auto const src_k_idx
+                = static_cast<size_t>(global_token_idx) * params.kv_hidden_size * 2 + hidden_idx_kv;
+            auto const src_v_idx
+                = static_cast<size_t>(global_token_idx) * params.kv_hidden_size * 2 + src_v_offset + hidden_idx_kv;
+
+            // Only load K,V tokens from encoder qkv input.
+            auto k = *reinterpret_cast<VecT const*>(&params.cross_kv_input[src_k_idx]);
+            auto v = *reinterpret_cast<VecT const*>(&params.cross_kv_input[src_v_idx]);
+
+            // The kv cache pointers.
+            auto k_cache_block_ptr
+                = reinterpret_cast<TCache*>(params.kv_cache_buffer.getKBlockPtr(batch_idx, token_idx));
+            auto v_cache_block_ptr
+                = reinterpret_cast<TCache*>(params.kv_cache_buffer.getVBlockPtr(batch_idx, token_idx));
+            // The vector idx in the cache block.
+            auto block_vec_idx
+                = params.kv_cache_buffer.getKVLocalIdx(token_idx, kv_head_idx, VECS_PER_HEAD, head_dim_vec_idx);
+
+            // Store K and V to the cache.
+            // INT8/FP8 kv cache.
+            if constexpr (sizeof(TCache) == 1)
+            {
+                // The element index inside the block.
+                auto block_elt_idx = block_vec_idx * ELTS_PER_VEC;
+                // Store 8bits kv cache.
+                mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant);
+                mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant);
+            }
+            else
+            {
+                reinterpret_cast<VecT*>(k_cache_block_ptr)[block_vec_idx] = k;
+                reinterpret_cast<VecT*>(v_cache_block_ptr)[block_vec_idx] = v;
+            }
         }
     }
 }
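
The ternaries at the top of this hunk encode the kernel's per-phase indexing: in the generation phase each sequence contributes exactly one decoder token, addressed directly by its batch index, while the context phase still reads per-sequence lengths and cumulative offsets. A host-side sketch of that mapping, using a simplified stand-in for QKVPreprocessingParams and made-up lengths:

// Host-side illustration only; not the real QKVPreprocessingParams.
#include <cstdio>
#include <vector>

struct PhaseParamsSketch
{
    bool generation_phase = false;
    std::vector<int> seq_lens;    // per-sequence decoder input lengths (context phase)
    std::vector<int> cu_seq_lens; // exclusive prefix sums of seq_lens (context phase)
};

// Mirrors the ternaries added in updateKVCacheForCrossAttention: one token per sequence,
// offset by batch_idx, when running in the generation phase (spec decoding not supported yet).
void decoderTokenRange(PhaseParamsSketch const& p, int batch_idx, int& seq_len, int& seq_offset)
{
    seq_len = p.generation_phase ? 1 : p.seq_lens[batch_idx];
    seq_offset = p.generation_phase ? batch_idx : p.cu_seq_lens[batch_idx];
}

int main()
{
    PhaseParamsSketch ctx;
    ctx.seq_lens = {4, 7};    // illustrative prompt lengths for a batch of two
    ctx.cu_seq_lens = {0, 4}; // exclusive prefix sums

    PhaseParamsSketch gen;
    gen.generation_phase = true;

    int len = 0;
    int off = 0;
    decoderTokenRange(ctx, 1, len, off);
    std::printf("context phase, batch 1: len=%d offset=%d\n", len, off); // len=7 offset=4
    decoderTokenRange(gen, 1, len, off);
    std::printf("generation phase, batch 1: len=%d offset=%d\n", len, off); // len=1 offset=1
    return 0;
}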
