Commit 590af89

Merge branch 'main' into kv-cache-dce
2 parents: 5499de9 + 1b9781e

2,188 files changed: +37,210 / −9,718 lines

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ tensorrt_llm/bindings/**/*.pyi
 tensorrt_llm/deep_ep/
 tensorrt_llm/deep_ep_cpp_tllm.*.so
 tensorrt_llm/deep_ep_cpp_tllm.pyi
+tensorrt_llm/deep_gemm/
+tensorrt_llm/deep_gemm_cpp_tllm.*.so
+tensorrt_llm/deep_gemm_cpp_tllm.pyi
 *docs/cpp_docs*
 *docs/source/_cpp_gen*
 docs/source/**/*.rst

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -26,3 +26,6 @@
 [submodule "3rdparty/cppzmq"]
 	path = 3rdparty/cppzmq
 	url = https://github.com/zeromq/cppzmq.git
+[submodule "3rdparty/DeepGEMM"]
+	path = 3rdparty/DeepGEMM
+	url = https://github.com/deepseek-ai/DeepGEMM.git

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ repos:
        args: [--allow-multiple-documents]
        exclude: ".*/gitlab/.*.yml"
      - id: trailing-whitespace
-       exclude: '\.patch$'
+       exclude: '\.(patch|md)$'
      - id: check-toml
      - id: mixed-line-ending
        args: [--fix=lf]

3rdparty/DeepGEMM

Submodule DeepGEMM added at 7b6b556

README.md

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.0.0rc6-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.1.0rc0-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
@@ -253,5 +253,5 @@ Deprecation is used to inform developers that some APIs and tools are no longer
 ## Useful Links
 - [Quantized models on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4): A growing collection of quantized (e.g., FP8, FP4) and optimized LLMs, including [DeepSeek FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), ready for fast inference with TensorRT-LLM.
 - [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo): A datacenter scale distributed inference serving framework that works seamlessly with TensorRT-LLM.
-- [AutoDeploy](./examples/auto_deploy/README.md): An experimental backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
+- [AutoDeploy](./examples/auto_deploy/README.md): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
 - [WeChat Discussion Group](https://github.com/NVIDIA/TensorRT-LLM/issues/5359): A real-time channel for TensorRT-LLM Q&A and news.

cpp/CMakeLists.txt

Lines changed: 7 additions & 2 deletions
@@ -31,6 +31,7 @@ option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON)
 option(BUILD_TESTS "Build Google tests" ON)
 option(BUILD_BENCHMARKS "Build benchmarks" ON)
 option(BUILD_DEEP_EP "Build the Deep EP module" ON)
+option(BUILD_DEEP_GEMM "Build the DeepGEMM module" ON)
 option(BUILD_MICRO_BENCHMARKS "Build C++ micro benchmarks" OFF)
 option(NVTX_DISABLE "Disable all NVTX features" ON)
 option(WARNING_IS_ERROR "Treat all warnings as errors" OFF)
@@ -199,7 +200,9 @@ set(TRT_LIB TensorRT::NvInfer)
 get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)

 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
-if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
+if(BINDING_TYPE STREQUAL "pybind"
+   OR BUILD_DEEP_EP
+   OR BUILD_DEEP_GEMM)
   add_subdirectory(${3RDPARTY_DIR}/pybind11
                    ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
 endif()
@@ -218,7 +221,9 @@ include_directories(
   ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
   ${3RDPARTY_DIR}/json/include)
-if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
+if(BINDING_TYPE STREQUAL "pybind"
+   OR BUILD_DEEP_EP
+   OR BUILD_DEEP_GEMM)
   include_directories(${3RDPARTY_DIR}/pybind11/include)
 endif()
 if(BINDING_TYPE STREQUAL "nanobind")

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 4 additions & 1 deletion
@@ -2027,7 +2027,7 @@ class GenericLlmRequest

         // Scatter the input tokens to other beam
         mTokens = BeamTokens(mSamplingConfig.beamWidth, inputTokens);
-        mLastTokens = VecTokens(mSamplingConfig.beamWidth);
+        mLastTokens = VecTokens(mSamplingConfig.beamWidth, inputTokens.back());

         // Init mUniqueTokens
         VecUniqueTokens uniqueTokens{inputTokens.size()};
@@ -2347,6 +2347,9 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
     void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager);

     void moveLoraWeightsToGpu(runtime::BufferManager const& manager);
+
+    // Remove LoRA weights and LoRA config tensors
+    void removeLoraTensors();
 };

 } // namespace tensorrt_llm::batch_manager
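
Note on the mLastTokens change: it relies on std::vector's fill constructor, which initializes every element to the given value instead of value-initializing to zero, so each beam now starts from the last prompt token. A minimal standalone sketch of that behaviour, using hypothetical toy values rather than the actual VecTokens type:

#include <cassert>
#include <vector>

int main()
{
    std::vector<int> inputTokens{11, 22, 33}; // stand-in for the prompt tokens
    int const beamWidth = 2;

    std::vector<int> zeroed(beamWidth);                     // old behaviour: {0, 0}
    std::vector<int> seeded(beamWidth, inputTokens.back()); // new behaviour: {33, 33}

    assert(zeroed == (std::vector<int>{0, 0}));
    assert(seeded == (std::vector<int>{33, 33}));
    return 0;
}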

cpp/include/tensorrt_llm/common/quantization.h

Lines changed: 57 additions & 16 deletions
@@ -122,6 +122,16 @@ class QuantMode
         return QuantMode(BaseType(1u) << 14);
     }

+    static constexpr QuantMode w4a8Mxfp4Mxfp8() noexcept
+    {
+        return QuantMode(BaseType(1u) << 15);
+    }
+
+    static constexpr QuantMode w4a16Mxfp4() noexcept
+    {
+        return QuantMode(BaseType(1u) << 16);
+    }
+
     constexpr BaseType value() const noexcept
     {
         return mValue;
@@ -202,14 +212,25 @@ class QuantMode
         return isSet(w4a8Mxfp4Fp8());
     }

+    constexpr bool hasW4a8Mxfp4Mxfp8() const noexcept
+    {
+        return isSet(w4a8Mxfp4Mxfp8());
+    }
+
+    constexpr bool hasW4a16Mxfp4() const noexcept
+    {
+        return isSet(w4a16Mxfp4());
+    }
+
     constexpr bool hasKvCacheQuant() const noexcept
     {
        return hasInt8KvCache() || hasFp8KvCache() || hasFp4KvCache();
     }

     static constexpr QuantMode fromDescription(bool quantizeWeights, bool quantizeActivations, bool perToken,
         bool perChannel, bool perGroup, bool useInt4Weights, bool useInt8KvCache, bool useFp8KvCache, bool useFp8Qdq,
-        bool useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8)
+        bool useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8,
+        bool useW4a8Mxfp4Mxfp8, bool useW4a16Mxfp4)
     {
         QuantMode quantMode{};
         if (quantizeWeights)
@@ -278,25 +299,35 @@ class QuantMode
             quantMode += w4a8Mxfp4Fp8();
         }

+        if (useW4a8Mxfp4Mxfp8)
+        {
+            quantMode += w4a8Mxfp4Mxfp8();
+        }
+
+        if (useW4a16Mxfp4)
+        {
+            quantMode += w4a16Mxfp4();
+        }
+
         return quantMode;
     }

     static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false)
     {
-        return fromDescription(
-            true, true, perToken, perChannel, false, false, false, false, false, false, false, false, false, false);
+        return fromDescription(true, true, perToken, perChannel, false, false, false, false, false, false, false, false,
+            false, false, false, false);
     }

     static constexpr QuantMode useQServe(bool perGroup)
     {
-        return fromDescription(
-            true, true, false, false, perGroup, true, false, false, false, false, true, false, false, false);
+        return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true, false, false,
+            false, false, false);
     }

     static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false)
     {
         return fromDescription(true, false, false, false, perGroup, useInt4Weights, false, false, false, false, false,
-            false, false, false);
+            false, false, false, false, false);
     }

     static QuantMode const fromQuantAlgo(
@@ -353,28 +384,38 @@ class QuantMode
         }
         else if (quantAlgo == "FP8")
         {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, true, false, false, false, false, false);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, true, false, false,
+                false, false, false, false, false);
         }
         else if (quantAlgo == "FP8_ROWWISE")
         {
-            quantMode = fromDescription(
-                false, false, true, true, false, false, false, false, false, true, false, false, false, false);
+            quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true, false, false,
+                false, false, false, false);
         }
         else if (quantAlgo == "FP4")
         {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, false, false, false, true, false, false);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                true, false, false, false, false);
         }
         else if (quantAlgo == "FP8_BLOCK_SCALES")
         {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, false, false, false, false, true, false);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, true, false, false, false);
         }
         else if (quantAlgo == "W4A8_MXFP4_FP8")
         {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, false, false, false, false, false, true);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, false, true, false, false);
+        }
+        else if (quantAlgo == "W4A8_MXFP4_MXFP8")
+        {
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, false, false, true, false);
+        }
+        else if (quantAlgo == "W4A16_MXFP4")
+        {
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, false, false, false, true);
         }

         if (kvCacheQuantAlgo == "INT8")
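
The two new modes follow the existing single-bit pattern (bits 15 and 16), so they can be combined with, and queried independently of, the other flags. A minimal usage sketch, assuming the class is reachable through the tensorrt_llm::common namespace suggested by the header path and that the factory methods are public like the existing ones; only names visible in this diff are used:

#include "tensorrt_llm/common/quantization.h"

using tensorrt_llm::common::QuantMode; // namespace assumed from the header path

int main()
{
    // Compose a mode from one of the new single-bit factories.
    QuantMode mode = QuantMode::w4a8Mxfp4Mxfp8();

    // Each scheme owns one bit, so the checks are independent mask tests.
    bool hasMxfp8Act = mode.hasW4a8Mxfp4Mxfp8(); // true: bit 15 is set
    bool hasW4a16 = mode.hasW4a16Mxfp4();        // false: bit 16 is not set

    return (hasMxfp8Act && !hasW4a16) ? 0 : 1;
}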

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ def getSMVersion():
     ids=["fp16", "bf16", "fp16-fp32", "e4m3"])
 @pytest.mark.parametrize('flag', [
     "-s-q 128 -paged-kv", "-s-q 63 -paged-kv", "-paged-kv",
-    "-softcapping-scale-bmm1 30", "-contiguous-q-kv"
+    "-softcapping-scale-bmm1 30", "-contiguous-q-kv", "-use-attention-sinks"
 ])
 @pytest.mark.parametrize('tiled_kernel', ["", "-force-non-tiled"])
 def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel):
@@ -117,8 +117,8 @@ def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel):
         f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -custom-mask -gqa 2 -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}",
         shell=True,
         check=True)
-    # alibi and softcapping-scale-bmm1 are mutually exclusive.
-    if '-softcapping-scale-bmm1' not in flag:
+    # alibi doesn't work with softcapping-scale-bmm1/use-attention-sinks.
+    if '-softcapping-scale-bmm1' not in flag and '-use-attention-sinks' not in flag:
         subprocess.run(
             f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -causal-mask -alibi -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}",
             shell=True,

cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h

Lines changed: 4 additions & 4 deletions
@@ -326,9 +326,6 @@ struct Compute
         uint32_t smem_v = __cvta_generic_to_shared(&shared->smem_v[0]);
         Compute_tile_o ctile_o(0, smem_v);

-        // BMM2 epilogue
-        Tile_o_epilogue tile_o_epilogue(params);
-
         // Mutex between two compute groups.
         OrderedMutexAccessor mutex_accessor(shared->compute_mutex, warpgroup_id, SYNC_BARRIER);
         // Notify warpgroup 0 to execute HGMMA first (overlap HGMMA and Softmax Math Instructions).
@@ -368,6 +365,9 @@ struct Compute
             sage_scale_row = head_info.bidb * params.h + head_info.bidh;
         }

+        // BMM2 epilogue
+        Tile_o_epilogue tile_o_epilogue(params, head_info);
+
         int q_step_idx = warpgroup_id;

         // Compute work.
@@ -490,7 +490,7 @@ struct Compute
         if (valid_run)
         {
             // Final step's update.
-            tile_o_epilogue.scale(ctile_o, p_sum);
+            tile_o_epilogue.scale(ctile_o, p_max, p_sum);
             // Store o_tile to gmem.
             gmem_o.store(ctile_o.acc_);
         }
