
Commit 603173d

Respond to review comments

Signed-off-by: Dom Brown <[email protected]>

1 parent 310eb4b · commit 603173d

File tree: 3 files changed, +2 −19 lines


cpp/tensorrt_llm/kernels/gptKernels.h

Lines changed: 0 additions & 6 deletions
@@ -186,7 +186,6 @@ struct BuildDecoderInfoParams
     float2* rotaryEmbeddingCoeffCache;
     // Dynamic scaling;
     int rotaryEmbeddingMaxPositions;
-    bool isCrossAttention{false};

     bool isBuildDecoderInfoKernelNeeded()
     {
@@ -214,10 +213,6 @@ struct BuildDecoderInfoParams
         {
             return true;
         }
-        if (isCrossAttention)
-        {
-            return true;
-        }
         // Other cases don't need to call buildDecoderInfo kernel.
         return false;
     }
@@ -269,7 +264,6 @@ struct BuildDecoderInfoParams
         ss << "rotaryEmbeddingInvFreqCache: " << rotaryEmbeddingInvFreqCache << std::endl;
         ss << "rotaryEmbeddingCoeffCache: " << rotaryEmbeddingCoeffCache << std::endl;
         ss << "rotaryEmbeddingMaxPositions: " << rotaryEmbeddingMaxPositions << std::endl;
-        ss << "isCrossAttention: " << isCrossAttention << std::endl;

         return ss.str();
     }
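
For context, `isBuildDecoderInfoKernelNeeded()` is the predicate the dispatcher consults to decide whether the buildDecoderInfo kernel must be launched at all; with the cross-attention check gone, only the remaining conditions drive that decision. A minimal sketch of the same gating pattern, using hypothetical placeholder fields rather than the real BuildDecoderInfoParams members:

    #include <iostream>

    // Sketch only: a params struct exposing a predicate that lets the caller
    // skip a kernel launch when no field requires it. The field names below are
    // illustrative placeholders, not actual TensorRT-LLM members.
    struct DecoderInfoParamsSketch
    {
        bool needsPaddingOffsets{false}; // hypothetical condition
        bool needsRotaryInvFreq{false};  // hypothetical condition

        bool isBuildDecoderInfoKernelNeeded() const
        {
            if (needsPaddingOffsets)
            {
                return true;
            }
            if (needsRotaryInvFreq)
            {
                return true;
            }
            // Other cases don't need to call the kernel.
            return false;
        }
    };

    int main()
    {
        DecoderInfoParamsSketch params;
        params.needsRotaryInvFreq = true;
        if (params.isBuildDecoderInfoKernelNeeded())
        {
            std::cout << "launch buildDecoderInfo kernel" << std::endl;
        }
        return 0;
    }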

cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h

Lines changed: 2 additions & 1 deletion
@@ -1357,7 +1357,8 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
     // The encoder sequence length.
     int const encoder_seq_len = params.encoder_seq_lens[batch_idx];
     // The encoder sequence offset.
-    int const encoder_seq_offset = params.cu_kv_seq_lens[batch_idx];
+    // Not needed in Gen phase
+    int const encoder_seq_offset = params.generation_phase ? -1 : params.cu_kv_seq_lens[batch_idx];
     // The maximum sequence length of encoder and decoder.
     int const max_seq_len = max(decoder_seq_len, encoder_seq_len);

cpp/tensorrt_llm/kernels/xqaDispatcher.cpp

Lines changed: 0 additions & 12 deletions
@@ -114,12 +114,8 @@ QKVPreprocessingParams<T, KVCacheBuffer> makeQKVPreprocessingParams(XQAParams co

     // Cross-attention only.

-    preprocessingParms.cu_kv_seq_lens = cu_kv_seqlens;
     preprocessingParms.encoder_seq_lens = params.encoder_input_lengths;

-    // Not available in generation phase
-    preprocessingParms.mrope_rotary_cos_sin = nullptr;
-
     return preprocessingParms;
 }

@@ -355,14 +351,6 @@ void XqaDispatcher::runImpl(XQAParams params, KVCacheBuffer const& kv_cache_buff
     decoder_params.rotaryEmbeddingInvFreq = launchParams.rotary_inv_freq_buf;
     decoder_params.rotaryEmbeddingInvFreqCache = params.rotary_embedding_inv_freq_cache;
     decoder_params.rotaryEmbeddingMaxPositions = params.rotary_embedding_max_positions;
-    decoder_params.isCrossAttention = params.cross_attention;
-
-    if (params.cross_attention)
-    {
-        // cross attention only
-        decoder_params.maxEncoderQSeqLength = params.max_past_kv_length;
-        decoder_params.encoderPaddingOffsets = nullptr;
-    }

     // The rotary_embedding_inv_freq_cache for QKVPreprocessing.
     // Use the params.rotary_embedding_inv_freq_cache input when the buildDecoderInfoKernel is skipped.
