Cleanup for PR

DomBrown · DomBrown · commit 9c196470461f · 2025-08-19T15:24:21.000+01:00
Signed-off-by: Dom Brown <3886319+DomBrown@users.noreply.github.com>
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -358,7 +358,7 @@ endif()
 
 setup_sanitizers()
 
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -lineinfo")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 if(FAST_MATH)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp
@@ -30,7 +30,6 @@
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
 #include <algorithm>
 #include <cstdint>
-#include <cstdlib>
 #include <type_traits>
 
 using namespace tensorrt_llm::kernels;
@@ -286,12 +285,8 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&
     xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale;
     xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf;
 
-    xqaParams.num_tokens = generationsParams.num_tokens;
     // Cross attention parameters.
     xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;
-    // xqaParams.cross_kv = generationsParams.cross_kv;
-    // xqaParams.cross_kv_length = generationsParams.cross_kv_length;
-    // xqaParams.num_encoder_tokens = generationsParams.num_encoder_tokens;
 
     return true;
 }
@@ -2239,7 +2234,7 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams<T> const& params, cud
         }
         else
         {
-            TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase. mEnableXQA: %d", mEnableXQA);
+            TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase.");
         }
     }
 
diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h
@@ -106,12 +106,8 @@ struct XQAParams
 
     void* quant_q_buffer_ptr = nullptr;
 
-    int num_tokens;
     // for cross attention
     int32_t const* encoder_input_lengths = nullptr;
-    // void const* cross_kv = nullptr;
-    // int32_t cross_kv_length = 0;
-    // int32_t num_encoder_tokens = 0;
 
     cudaStream_t stream = 0;
 
@@ -182,11 +178,7 @@ struct XQAParams
            << "total_num_input_tokens :" << total_num_input_tokens << std ::endl
            << "is_fp8_output :" << (is_fp8_output ? "true" : "false") << std ::endl
            << "fp8_out_scale :" << fp8_out_scale << std ::endl
-           << "encoder_input_lengths: " << encoder_input_lengths
-           << std::endl
-           //<< "cross_kv: " << cross_kv << std::endl
-           //<< "cross_kv_length: " << cross_kv_length << std::endl
-           //<< "num_encoder_tokens: " << num_encoder_tokens << std::endl
+           << "encoder_input_lengths: " << encoder_input_lengths << std::endl
            << "stream :" << stream;
 
         return ss.str();
diff --git a/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp b/cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp
@@ -1041,7 +1041,6 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32
 
         common_enqueue_params.input_seq_length = max_context_q_len;
         common_enqueue_params.max_past_kv_length = max_context_kv_len;
-
         EnqueueContextParams<T> enqueue_params{common_enqueue_params};
         enqueue_params.attention_packed_mask = attention_packed_mask;
         enqueue_params.host_block_offsets = host_block_offsets;

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,6 @@`
`30`	`30`	`#include "tensorrt_llm/runtime/utils/mpiUtils.h"`
`31`	`31`	`#include <algorithm>`
`32`	`32`	`#include <cstdint>`
`33`		`-#include <cstdlib>`
`34`	`33`	`#include <type_traits>`
`35`	`34`
`36`	`35`	`using namespace tensorrt_llm::kernels;`
`@@ -286,12 +285,8 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams&`
`286`	`285`	`xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale;`
`287`	`286`	`xqaParams.start_token_idx_sf = generationsParams.start_token_idx_sf;`
`288`	`287`
`289`		`- xqaParams.num_tokens = generationsParams.num_tokens;`
`290`	`288`	`// Cross attention parameters.`
`291`	`289`	`xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths;`
`292`		`- // xqaParams.cross_kv = generationsParams.cross_kv;`
`293`		`- // xqaParams.cross_kv_length = generationsParams.cross_kv_length;`
`294`		`- // xqaParams.num_encoder_tokens = generationsParams.num_encoder_tokens;`
`295`	`290`
`296`	`291`	`return true;`
`297`	`292`	`}`
`@@ -2239,7 +2234,7 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams<T> const& params, cud`
`2239`	`2234`	`}`
`2240`	`2235`	`else`
`2241`	`2236`	`{`
`2242`		`- TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase. mEnableXQA: %d", mEnableXQA);`
	`2237`	`+ TLLM_LOG_DEBUG("XQA kernels are not selected in the generation phase.");`
`2243`	`2238`	`}`
`2244`	`2239`	`}`
`2245`	`2240`