Commit f238031

Merge branch 'main' into kv-cache-dce
2 parents 5b27f3c + 907c180

49 files changed (+2028, −559 lines)


.github/CODEOWNERS

Lines changed: 126 additions & 0 deletions
@@ -14,6 +14,103 @@
/tensorrt_llm/_torch/auto_deploy @NVIDIA/trt-llm-torch-autodeploy-devs
/tensorrt_llm/examples/auto_deploy @NVIDIA/trt-llm-torch-autodeploy-devs

+## TensorRT-LLM Pytorch - Speculative Decoding
+/tensorrt_llm/_torch/speculative @NVIDIA/trt-llm-torch-spec-decoding
+
+## TensorRT-LLM Pytorch - Graph Compiler
+/tensorrt_llm/_torch/compilation @NVIDIA/trt-llm-torch-graph-compiler
+/tensorrt_llm/_torch/custom_ops @NVIDIA/trt-llm-torch-graph-compiler
+/tensorrt_llm/_torch/autotuner.py @NVIDIA/trt-llm-torch-graph-compiler
+/tests/unittest/_torch/compilation @NVIDIA/trt-llm-torch-graph-compiler
+/tests/unittest/_torch/multi_gpu/test_ar_residual_norm.py @NVIDIA/trt-llm-torch-graph-compiler
+/tests/unittest/_torch/multi_gpu/test_user_buffers.py @NVIDIA/trt-llm-torch-graph-compiler
+/tests/unittest/_torch/test_custom_ops.py @NVIDIA/trt-llm-torch-graph-compiler
+/tests/unittest/_torch/test_autotuner.py @NVIDIA/trt-llm-torch-graph-compiler
+
+## TensorRT-LLM Pytorch - Attention
+/tensorrt_llm/_torch/attention_backend @NVIDIA/trt-llm-torch-attention-devs
+/tensorrt_llm/_torch/modules/attention.py @NVIDIA/trt-llm-torch-attention-devs
+
+## TensorRT-LLM Pytorch - Modules
+/tensorrt_llm/_torch/modules @NVIDIA/trt-llm-torch-modules
+
+
+## TensorRT-LLM Pytorch Models
+/tensorrt_llm/_torch/models @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - Gemma
+/tensorrt_llm/_torch/models/modeling_gemma3.py @NVIDIA/trt-llm-torch-models-gemma-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_gemma3vl.py @NVIDIA/trt-llm-torch-models-gemma-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_gemma3.py @NVIDIA/trt-llm-torch-models-gemma-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - Mistral & Mixtral
+/tensorrt_llm/_torch/models/modeling_mistral.py @NVIDIA/trt-llm-torch-models-mistral-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_pixtral.py @NVIDIA/trt-llm-torch-models-mistral-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_mistral.py @NVIDIA/trt-llm-torch-models-mistral-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_mixtral.py @NVIDIA/trt-llm-torch-models-mistral-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - CLIP
+/tensorrt_llm/_torch/models/modeling_clip.py @NVIDIA/trt-llm-torch-models-clip-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_clip.py @NVIDIA/trt-llm-torch-models-clip-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - Phi
+/tensorrt_llm/_torch/models/modeling_phi3.py @NVIDIA/trt-llm-torch-models-phi-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_phi4mm.py @NVIDIA/trt-llm-torch-models-phi-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_phi3.py @NVIDIA/trt-llm-torch-models-phi-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/integration/defs/examples/test_multimodal.py @NVIDIA/trt-llm-torch-models-phi-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - Deepseek
+/tensorrt_llm/_torch/models/modeling_deepseekv3.py @NVIDIA/trt-llm-torch-models-deepseek-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_deepseek.py @NVIDIA/trt-llm-torch-models-deepseek-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - Llama
+/tensorrt_llm/_torch/models/modeling_mllama.py @NVIDIA/trt-llm-torch-models-llama-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_llama.py @NVIDIA/trt-llm-torch-models-llama-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_llama_min_latency.py @NVIDIA/trt-llm-torch-models-llama-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_llama.py @NVIDIA/trt-llm-torch-models-llama-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py @NVIDIA/trt-llm-torch-models-llama-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - Qwen
+/tensorrt_llm/_torch/models/modeling_qwen3_moe.py @NVIDIA/trt-llm-torch-models-qwen-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_qwen3.py @NVIDIA/trt-llm-torch-models-qwen-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_qwen2vl.py @NVIDIA/trt-llm-torch-models-qwen-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_qwen.py @NVIDIA/trt-llm-torch-models-qwen-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_qwen_moe.py @NVIDIA/trt-llm-torch-models-qwen-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - VLMs
+/tensorrt_llm/_torch/models/modeling_vila.py @NVIDIA/trt-llm-torch-models-vlm-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_vila.py @NVIDIA/trt-llm-torch-models-vlm-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_pixtral.py @NVIDIA/trt-llm-torch-models-vlm-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_pixtral.py @NVIDIA/trt-llm-torch-models-vlm-devs @NVIDIA/trt-llm-torch-models-devs
+
+### TensorRT-LLM Pytorch - Models - Nemotron
+/tensorrt_llm/_torch/models/modeling_nemotron_nas.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_nemotron_h.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/modeling_nemotron_nas.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/pyexecutor/resource_manager.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-runtime-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/modules/mamba @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_nemotron.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+/tests/unittest/_torch/modeling/test_modeling_nemotron_nas.py @NVIDIA/trt-llm-torch-models-nemotron-devs @NVIDIA/trt-llm-torch-models-devs
+
+## TensorRT-LLM - PEFT
+/tensorrt_llm/_torch/peft @NVIDIA/trt-llm-torch-peft
+/tensorrt_llm/lora_manager.py @NVIDIA/trt-llm-torch-peft
+/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp @NVIDIA/trt-llm-torch-peft
+/cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h @NVIDIA/trt-llm-torch-peft
+/cpp/tensorrt_llm/runtime/loraCache.cpp @NVIDIA/trt-llm-torch-peft
+/cpp/include/tensorrt_llm/runtime/loraCache.h @NVIDIA/trt-llm-torch-peft
+/cpp/tensorrt_llm/runtime/loraModule.cpp @NVIDIA/trt-llm-torch-peft
+/cpp/include/tensorrt_llm/runtime/loraModule.h @NVIDIA/trt-llm-torch-peft
+/cpp/tensorrt_llm/runtime/loraManager.cpp @NVIDIA/trt-llm-torch-peft
+/cpp/tensorrt_llm/runtime/loraManager.h @NVIDIA/trt-llm-torch-peft
+/cpp/tensorrt_llm/runtime/loraUtils.cpp @NVIDIA/trt-llm-torch-peft
+/cpp/tensorrt_llm/runtime/loraUtils.h @NVIDIA/trt-llm-torch-peft
+
+## TensorRT-LLM - Triton backend
+/triton_backend @NVIDIA/trt-llm-triton-backend-devs
+
## TensorRT-LLM trtllm-bench Reviewers
/tensorrt_llm/bench @NVIDIA/trtllm-bench-reviewers
/tensorrt_llm/commands/bench.py @NVIDIA/trtllm-bench-reviewers
@@ -23,6 +120,35 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
/tensorrt_llm/llmapi @NVIDIA/trt-llm-llmapi-devs
/tensorrt_llm/executor @NVIDIA/trt-llm-llmapi-devs

+## TensorRT-LLM LLM Disaggregated
+/examples/disaggregated @NVIDIA/trt-llm-disagg-devs
+/tensorrt_llm/disaggregated_params.py @NVIDIA/trt-llm-disagg-devs
+/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py @NVIDIA/trt-llm-disagg-devs
+/tensorrt_llm/_torch/pyexecutor/py_executor.py @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/cacheFormatter.h @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/dataTransceiver.h @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp @NVIDIA/trt-llm-disagg-devs
+/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h @NVIDIA/trt-llm-disagg-devs
+
+## TensorRT-LLM Infra
+
+### CI
+/jenkins @NVIDIA/trt-llm-ci-infra-devs @NVIDIA/trt-llm-infra-devs
+### Setup
+/docker @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs
+### Github workflows
+/tensorrt_llm/.github @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs
+/tensorrt_llm/.coderabbit.yaml @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs
+
+## TensorRT-LLM - Docs
+/docs @NVIDIA/trt-llm-doc-owners
+/examples @NVIDIA/trt-llm-doc-owners
+
# The rule below requires that any PR modifying public APIs must be approved by at least one member
# of the NVIDIA/trt-llm-committed-api-review-committee or NVIDIA/trt-llm-noncommitted-api-review-committee team.
# This approval is mandatory regardless of other approvals the PR may have received. Without approval

README.md

Lines changed: 4 additions & 1 deletion
@@ -18,12 +18,15 @@ TensorRT-LLM
<div align="left">

## Tech Blogs
+* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
+[➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)
+

* [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
[➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)

* [07/26] N-Gram Speculative Decoding in TensorRT‑LLM
-[➡️ link](./docs/source/blogs/tech_blog/blog_7_NGram_performance_Analysis_And_Auto_Enablement.md)
+[➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md)

* [06/19] Disaggregated Serving in TensorRT-LLM
[➡️ link](./docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md)

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 5 additions & 0 deletions
@@ -2334,6 +2334,11 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
    void createSerializedResult(
        std::vector<char>& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0);

+    /// @brief Check if the (user-provided) tokens fall within the vocabulary range.
+    /// @details Currently only supports invocation before context phase is completed.
+    /// @return True if tokens are within range.
+    bool checkTokenIdRange(SizeType32 vocabSize);
+
    void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded,
        std::optional<SizeType32> maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false);
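The new `checkTokenIdRange` method appears here only as a declaration. As a rough illustration of the kind of check the doc comment describes, the sketch below validates a list of prompt tokens against the vocabulary size; it is not the commit's implementation, and the free-function form and type aliases are assumptions made for the example.

```cpp
#include <cstdint>
#include <vector>

using SizeType32 = std::int32_t;
using TokenIdType = std::int32_t;

// Illustrative sketch only (not the commit's implementation): return true if every
// user-provided token id lies in [0, vocabSize).
bool checkTokenIdRange(std::vector<TokenIdType> const& tokens, SizeType32 vocabSize)
{
    for (auto const tokenId : tokens)
    {
        if (tokenId < 0 || tokenId >= vocabSize)
        {
            return false; // out-of-vocabulary token found
        }
    }
    return true;
}
```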

cpp/kernels/xqa/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
@@ -23,6 +23,10 @@ set(CMAKE_CUDA_ARCHITECTURES 89-real 90a-real)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(BUILD_XQA_TESTS "Build XQA tests" OFF)
+set(PAGED_KV_CACHE_LAYOUT
+    "0"
+    CACHE STRING "Paged KV cache format (0 for XQA Original, 1 for VLLM)")
+add_definitions(-DPAGED_KV_CACHE_LAYOUT=${PAGED_KV_CACHE_LAYOUT})

# todo: remove include_directories link_directories and link libs like
# CUDA::cuda_driver CUDA::cudart CUDA::nvrtc
@@ -37,7 +41,7 @@ set(CMAKE_CXX_FLAGS
    "${CMAKE_CXX_FLAGS} -march=haswell -Wfatal-errors -Wreturn-type -Wall -Wextra -Wno-unknown-pragmas"
)
set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler --expt-relaxed-constexpr -t 0 -res-usage"
+    "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler --expt-relaxed-constexpr -t 0 -res-usage -DPAGED_KV_CACHE_LAYOUT=${PAGED_KV_CACHE_LAYOUT}"
)
set(CUDA_PTXAS_FLAGS "-warn-lmem-usage -warn-double-usage -warn-spills"
)# -Werror -v
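The layout choice is made once at configure time and reaches both host C++ and CUDA sources as an ordinary preprocessor symbol: `add_definitions` covers the C++ compiler, and the extra `-D` appended to `CMAKE_CUDA_FLAGS` covers `nvcc`. A minimal sketch of how a source file can branch on it (the layout descriptions in the comments follow the `defines.h` hunk below):

```cpp
// Sketch: PAGED_KV_CACHE_LAYOUT is injected at configure time by the CMake hunks above.
#if PAGED_KV_CACHE_LAYOUT == 1
// VLLM-style paged KV cache: separate K and V pools with sequence-first pages.
void handleVllmLayout() { /* ... */ }
#else
// XQA original paged KV cache: a single global pool of pages.
void handleXqaLayout() { /* ... */ }
#endif
```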

cpp/kernels/xqa/README.md

Lines changed: 11 additions & 1 deletion
@@ -16,7 +16,7 @@ You need to install libgtest-dev and libeigen3-dev before building. To build, us

- ```mkdir build```
- ```cd build```
-- ```cmake .. -DCMAKE_BUILD_TYPE=Release```
+- ```cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_XQA_TESTS=ON```
- ```cmake --build . -j```

To run unit tests, run `./unitTests`. There are a few runtime options that can be controlled with environment variables:
@@ -25,6 +25,16 @@ To run unit tests, run `./unitTests`. There are a few runtime options that can b
- XQA_USE_QGMMA: On Hopper, we try to use the TMA+QGMMA kernel (mha_sm90.cu) by default if possible. To force using mha.cu, set this to 0.
- XQA_NB_SUB_SEQ: The number of CUDA thread blocks used to handle one K/V head. We have a reasonable default, but if you want to change it manually, use this variable.

+## Support for VLLM Paged KV-Cache
+When `PAGED_KV_CACHE_LAYOUT=1` is enabled, XQA supports VLLM-style paged KV-cache input, with separate K and V pools and a sequence-first memory layout.
+To build and test with this feature enabled, run the following commands:
+
+- ```mkdir build```
+- ```cd build```
+- ```cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_XQA_TESTS=ON -DPAGED_KV_CACHE_LAYOUT=1```
+- ```cmake --build . -j```
+- ```./unitTests```
+
## Generation cubins used in TensorRT-LLM

Run `gen_cubin.py` in the repo workspace.

cpp/kernels/xqa/defines.h

Lines changed: 9 additions & 0 deletions
@@ -97,6 +97,15 @@ static_assert(SPEC_DEC, "SPEC_Q_SEQ_LEN should only be used when SPEC_DEC is ena
#define USE_PAGED_KV_CACHE (TOKENS_PER_PAGE > 0)
#endif

+// Paged KV Cache Format
+// 0 - XQA Original
+// 1 - separate K and V cache pools, each with layout (batch, seq_len, head, head_elem) for VLLM/SGLang
+#ifdef USE_PAGED_KV_CACHE
+#ifndef PAGED_KV_CACHE_LAYOUT
+#define PAGED_KV_CACHE_LAYOUT 0
+#endif
+#endif
+
// don't modify
#define USE_BEAM_SEARCH (BEAM_WIDTH > 1)
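The two layouts differ in where a token's K/V head lives inside a page. The helper below is illustrative only (it is not part of the commit); it mirrors the `idxHeadBeg` arithmetic in the `mha.cu` hunks that follow, with the names `seqOffset`, `tokensPerPage`, `nbKHeads`, and `idxHeadGrp` taken from that diff.

```cpp
#include <cstdint>

// Illustrative helper (not from the commit): index of the first cache head for a token
// within its page, for the two PAGED_KV_CACHE_LAYOUT settings.
inline uint32_t idxHeadBegInPage(uint32_t seqOffset, uint32_t tokensPerPage,
                                 uint32_t nbKHeads, uint32_t idxHeadGrp)
{
#if PAGED_KV_CACHE_LAYOUT == 1
    // Layout 1 (VLLM/SGLang): the heads of one token sit next to each other; tokens are sequence-first.
    return (seqOffset % tokensPerPage) * nbKHeads + idxHeadGrp;
#else
    // Layout 0 (XQA original): all tokens of one head group sit next to each other within the page.
    return tokensPerPage * idxHeadGrp + seqOffset % tokensPerPage;
#endif
}
```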

cpp/kernels/xqa/mha.cu

Lines changed: 50 additions & 10 deletions
@@ -1671,17 +1671,33 @@ CUBIN_EXPORT __global__
    uint32_t const dstHeadOffset = 0;
    uint32_t const seqOffset = ctaTile.x * seqIter + warpTile.x * warpIdx.x;
#if USE_PAGED_KV_CACHE
+#if PAGED_KV_CACHE_LAYOUT == 1
+    uint32_t const idxHeadBeg = (seqOffset % tokensPerPage) * nbKHeads + idxHeadGrp;
+
+#else
    uint32_t const idxHeadBeg = tokensPerPage * idxHeadGrp + seqOffset % tokensPerPage;
+#endif
#if BEAM_WIDTH == 1
+#if PAGED_KV_CACHE_LAYOUT == 1
+    HeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerWarpTile> const src{
+        cacheList.kCacheVLLM, pageIdx, nbKHeads, idxHeadBeg};
+#else
    HeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerWarpTile> const src{
        cacheList.pool, pageIdx, nbKHeads, idxHeadBeg};
+#endif
#else
-    IndexedHeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerWarpTile> const src{
+    IndexedHeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerWarpTile> const src
+    {
        /*indices=*/smem.gemm0CacheIndir[warpIdx.x].data,
-        /*pool=*/cacheList.pool,
-        /*pageIndices=*/smem.kCachePages[warpIdx.x].data,
-        /*nbKHeads=*/nbKHeads,
-        /*offset=*/idxHeadBeg};
+#if PAGED_KV_CACHE_LAYOUT == 1
+        /*pool=*/cacheList.kCacheVLLM,
+#else
+        /*pool=*/cacheList.pool,
+#endif
+        /*pageIndices=*/smem.kCachePages[warpIdx.x].data,
+        /*nbKHeads=*/nbKHeads,
+        /*offset=*/idxHeadBeg
+    };
#endif
#else
    uint32_t const idxHeadBeg = cacheKSeqBaseOffset + seqOffset;
@@ -1990,17 +2006,33 @@ CUBIN_EXPORT __global__
    uint32_t const seqOffset = ctaTile.x * seqIter + warpTile.x * nbXTilesPerXIter * xIter
        + cacheVTileSeqStride * vIter + cacheVTileSeqLen * warpGrpIdx;
#if USE_PAGED_KV_CACHE
+#if PAGED_KV_CACHE_LAYOUT == 1
+    uint32_t const idxHeadBeg = (seqOffset % tokensPerPage) * nbKHeads + idxHeadGrp;
+
+#else
    uint32_t const idxHeadBeg = tokensPerPage * idxHeadGrp + seqOffset % tokensPerPage;
+#endif
#if BEAM_WIDTH == 1
+#if PAGED_KV_CACHE_LAYOUT == 1
+    HeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerVTile> const src{
+        cacheList.vCacheVLLM, pageIdx, nbKHeads, idxHeadBeg};
+#else
    HeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerVTile> const src{
        cacheList.pool, pageIdx, nbKHeads, idxHeadBeg};
+#endif
#else
-    IndexedHeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerVTile> const src{
+    IndexedHeadPtr<GMemCacheHead const, tokensPerPage, nbPagesPerVTile> const src
+    {
        /*indices=*/smem.gemm1CacheIndir[grpLoadV ? warpGrpIdx : warpIdx.x].data,
-        /*pool=*/cacheList.pool,
-        /*pageIndices=*/smem.vCachePages[grpLoadV ? warpGrpIdx : warpIdx.x].data,
-        /*nbKHeads=*/nbKHeads,
-        /*offset=*/idxHeadBeg};
+#if PAGED_KV_CACHE_LAYOUT == 1
+        /*pool=*/cacheList.vCacheVLLM,
+#else
+        /*pool=*/cacheList.pool,
+#endif
+        /*pageIndices=*/smem.vCachePages[grpLoadV ? warpGrpIdx : warpIdx.x].data,
+        /*nbKHeads=*/nbKHeads,
+        /*offset=*/idxHeadBeg
+    };
#endif
#else
    uint32_t const idxHeadBeg = cacheVSeqBaseOffset + seqOffset;
@@ -2636,7 +2668,11 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
    InputHead const* q,
#endif
#if USE_PAGED_KV_CACHE
+#if PAGED_KV_CACHE_LAYOUT == 1
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+#else
    GMemCacheHead* pool, // global pool of pages
+#endif
    KVCachePageIndex const*
        kvCachePageList, // device pointer. shape: KVCachePageIndex[batchSize][beamWidth][2][maxNbPagesPerSeq].
#else
@@ -2702,7 +2738,11 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
    auto const launchCfg = makeLaunchConfig(dimGrid, dimCta, hostSmemSize, stream, ENABLE_PDL != 0);
#if USE_PAGED_KV_CACHE
    uint32_t const maxNbPagesPerSeq = exactDiv(maxSeqLen, tokensPerPage);
+#if PAGED_KV_CACHE_LAYOUT == 1
+    KVCacheList<true> const cacheList{kCacheVLLM, vCacheVLLM, kvCachePageList, seqLen, maxNbPagesPerSeq};
+#else
    KVCacheList<true> const cacheList{pool, kvCachePageList, seqLen, maxNbPagesPerSeq};
+#endif
    cudaLaunchKernelEx(&launchCfg, kernel_mha,
#if SPEC_DEC
        qSeqLen, nbKHeads, headGrpSize, qCuSeqLens,

cpp/kernels/xqa/mha.h

Lines changed: 8 additions & 0 deletions
@@ -102,7 +102,11 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads,
    InputHead const* q,
#endif
#if USE_PAGED_KV_CACHE
+#if PAGED_KV_CACHE_LAYOUT == 1
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+#else
    GMemCacheHead* pool, // global pool of pages
+#endif
    KVCachePageIndex const*
        kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq]
#else
@@ -137,7 +141,11 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads,
    InputHead const* q,
#endif
#if USE_PAGED_KV_CACHE
+#if PAGED_KV_CACHE_LAYOUT == 1
+    GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM,
+#else
    GMemCacheHead* pool, // global pool of pages
+#endif
    KVCachePageIndex const*
        kvCachePageList, // device pointer. shape: KVCachePageIndex[batchSize][beamWidth][2][maxNbPagesPerSeq].
#else
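Because the pool arguments are selected by the preprocessor, a caller of `launchMHA` or `launchHopperF8MHA` supplies either the two VLLM-style pools or the single page pool, never both. The struct below is a caller-side sketch, not code from the commit; it only groups the KV-cache arguments under the same `#if` as the signatures above, using the `GMemCacheHead` and `KVCachePageIndex` types those headers declare.

```cpp
// Illustrative caller-side grouping (not from the commit) of the paged-KV-cache
// arguments that launchMHA expects, mirroring the conditional signature above.
struct PagedKvCacheArgs
{
#if PAGED_KV_CACHE_LAYOUT == 1
    GMemCacheHead* kCacheVLLM;               // separate K pool (VLLM layout)
    GMemCacheHead* vCacheVLLM;               // separate V pool (VLLM layout)
#else
    GMemCacheHead* pool;                     // single global pool of pages (XQA original)
#endif
    KVCachePageIndex const* kvCachePageList; // page table, shape as documented above
};
```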
