From c95b3205a181bff7eb71a5a2228d63bf308485ad Mon Sep 17 00:00:00 2001 From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Date: Fri, 5 Sep 2025 03:18:16 +0000 Subject: [PATCH] [None][Doc] Rename TensorRT-LLM to TensorRT LLM. Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> --- docs/source/architecture/add-model.md | 12 ++-- docs/source/architecture/checkpoint.md | 24 ++++---- docs/source/architecture/overview.md | 10 ++-- ...practice_on_DeepSeek-R1_in_TensorRT-LLM.md | 38 ++++++------ docs/source/blogs/Falcon180B-H200.md | 18 +++--- docs/source/blogs/H100vsA100.md | 12 ++-- docs/source/blogs/H200launch.md | 16 ++--- ...Seek-R1_Performance_on_NVIDIA_B200_GPUs.md | 8 +-- ..._R1_MTP_Implementation_and_Optimization.md | 22 +++---- ..._R1_Throughput_on_NVIDIA_Blackwell_GPUs.md | 22 +++---- ...ling_Expert_Parallelism_in_TensorRT-LLM.md | 30 +++++----- ...5_Disaggregated_Serving_in_TensorRT-LLM.md | 34 +++++------ .../blog6_Llama4_maverick_eagle_guide.md | 18 +++--- ...erformance_Analysis_And_Auto_Enablement.md | 4 +- ...xpert_Parallelism_in_TensorRT-LLM_part2.md | 26 ++++---- .../blog9_Deploying_GPT_OSS_on_TRTLLM.md | 20 +++---- docs/source/commands/trtllm-bench.rst | 2 +- docs/source/commands/trtllm-eval.rst | 4 +- .../run-benchmark-with-trtllm-serve.md | 6 +- ...-start-recipe-for-deepseek-r1-on-trtllm.md | 24 ++++---- ...start-recipe-for-llama3.3-70b-on-trtllm.md | 24 ++++---- ...start-recipe-for-llama4-scout-on-trtllm.md | 24 ++++---- .../dev-on-cloud/build-image-to-dockerhub.md | 4 +- docs/source/developer-guide/perf-analysis.md | 2 +- .../developer-guide/perf-benchmarking.md | 12 ++-- docs/source/examples/customization.md | 2 +- docs/source/examples/kvcacheconfig.md | 2 +- docs/source/features/attention.md | 24 ++++---- .../features/auto_deploy/auto-deploy.md | 12 ++-- docs/source/features/checkpoint-loading.md | 2 +- docs/source/features/multi-modality.md | 10 ++-- .../features/paged-attention-ifb-scheduler.md | 18 +++--- docs/source/features/parallel-strategy.md | 12 ++-- docs/source/features/quantization.md | 8 +-- docs/source/features/speculative-decoding.md | 2 +- docs/source/index.rst | 4 +- .../installation/build-from-source-linux.md | 59 ++++++++++--------- docs/source/installation/containers.md | 4 +- docs/source/installation/index.rst | 2 +- docs/source/installation/linux.md | 14 ++--- docs/source/overview.md | 18 +++--- docs/source/quick-start-guide.md | 10 ++-- docs/source/torch.md | 6 +- docs/source/torch/kv_cache_manager.md | 4 +- 44 files changed, 315 insertions(+), 314 deletions(-) diff --git a/docs/source/architecture/add-model.md b/docs/source/architecture/add-model.md index 4857b54cbe9..7235ee07fbd 100644 --- a/docs/source/architecture/add-model.md +++ b/docs/source/architecture/add-model.md @@ -2,11 +2,11 @@ # Adding a Model -This document describes how to add a typical decoder-only model in TensorRT-LLM. +This document describes how to add a typical decoder-only model in TensorRT LLM. ## Step 1. Write Modeling Part -TensorRT-LLM provides different levels of APIs: +TensorRT LLM provides different levels of APIs: - Low-level functions, for example, `concat`, `add`, and `sum`. - Basic layers, such as, `Linear` and `LayerNorm`. @@ -14,7 +14,7 @@ TensorRT-LLM provides different levels of APIs: - Base class for typical decoder-only models, such as, `DecoderModelForCausalLM`. 1. Create a model directory in `tensorrt_llm/models`, for example `my_model`. -2. Write a `model.py` with TensorRT-LLM's APIs +2. 
Write a `model.py` with TensorRT LLM's APIs ```python class MyDecoderLayer(Module): @@ -52,7 +52,7 @@ class MyModelForCausalLM(DecoderModelForCausalLM): ## Step 2. Implement Weight Conversion -The weights from source framework need to be converted and bound to the new added TensorRT-LLM model. Here is an example of converting HuggingFace weights: +The weights from source framework need to be converted and bound to the new added TensorRT LLM model. Here is an example of converting HuggingFace weights: ```python class MyModelForCausalLM(DecoderModelForCausalLM): @@ -62,8 +62,8 @@ class MyModelForCausalLM(DecoderModelForCausalLM): hf_model_dir, dtype='float16', mapping: Optional[Mapping] = None) -> MyModelForCausalLM - # create a TensorRT-LLM MyModelForCausalLM model object - # convert HuggingFace checkpoint to TensorRT-LLM expected weights dict + # create a TensorRT LLM MyModelForCausalLM model object + # convert HuggingFace checkpoint to TensorRT LLM expected weights dict # load the weights to MyModelForCausalLM object ``` diff --git a/docs/source/architecture/checkpoint.md b/docs/source/architecture/checkpoint.md index ad51b7e3104..427a0916f58 100644 --- a/docs/source/architecture/checkpoint.md +++ b/docs/source/architecture/checkpoint.md @@ -1,36 +1,36 @@ -# TensorRT-LLM Checkpoint +# TensorRT LLM Checkpoint ## Overview -The earlier versions (pre-0.8 version) of TensorRT-LLM were developed with a very aggressive timeline. For those versions, emphasis was not put on defining a unified workflow. Now that TensorRT-LLM has reached some level of feature richness, the development team has decided to put more effort into unifying the APIs and workflow of TensorRT-LLM. This file documents the workflow around TensorRT-LLM checkpoint and the set of CLI tools to generate checkpoint, build engines, and evaluate engines. +The earlier versions (pre-0.8 version) of TensorRT LLM were developed with a very aggressive timeline. For those versions, emphasis was not put on defining a unified workflow. Now that TensorRT LLM has reached some level of feature richness, the development team has decided to put more effort into unifying the APIs and workflow of TensorRT LLM. This file documents the workflow around TensorRT LLM checkpoint and the set of CLI tools to generate checkpoint, build engines, and evaluate engines. There are three steps in the workflow: -1. Convert weights from different source frameworks into TensorRT-LLM checkpoint. -2. Build the TensorRT-LLM checkpoint into TensorRT engines with a unified build command. -3. Load the engines to TensorRT-LLM model runner and evaluate with different evaluation tasks. +1. Convert weights from different source frameworks into TensorRT LLM checkpoint. +2. Build the TensorRT LLM checkpoint into TensorRT engines with a unified build command. +3. Load the engines to TensorRT LLM model runner and evaluate with different evaluation tasks. ``` NeMo ------------- | HuggingFace ------ | convert build load -Modelopt --------- ----------> TensorRT-LLM Checkpoint --------> TensorRT Engine ------> TensorRT-LLM ModelRunner +Modelopt --------- ----------> TensorRT LLM Checkpoint --------> TensorRT Engine ------> TensorRT LLM ModelRunner | JAX -------------- | DeepSpeed -------- ``` -## Prepare the TensorRT-LLM Checkpoint +## Prepare the TensorRT LLM Checkpoint -TensorRT-LLM aims at supporting different sources: +TensorRT LLM aims at supporting different sources: 1. Trained models from NVIDIA NeMo, Microsoft DeepSpeed, and JAX 2. Quantized models from NVIDIA Modelopt 3. 
Popular models from HuggingFace -TensorRT-LLM defines its own checkpoint format. A checkpoint directory includes: +TensorRT LLM defines its own checkpoint format. A checkpoint directory includes: 1. One config `json` file, which contains several model hyper-parameters. 2. One or several rank weights files, each file contains a dictionary of tensors (weights). @@ -107,7 +107,7 @@ Here is the model specific config list: ### Rank Weights Like PyTorch, the tensor (weight) name is a string containing hierarchical information, -which is uniquely mapped to a certain parameter of a TensorRT-LLM model. +which is uniquely mapped to a certain parameter of a TensorRT LLM model. For example, each transformer layer of the OPT model contains an `Attention` layer, an `MLP` layer. and two `LayerNorm` layers. @@ -169,7 +169,7 @@ Here is the AWQ scaling factors of `mlp.fc` linear layer: - `transformer.layers.0.mlp.fc.prequant_scaling_factor` ```{note} - The linear weights in TensorRT-LLM checkpoint always follows (`out_feature`, `in_feature`) shape, whereas some quantized linear in TensorRT-LLM implemented by plugin may use (`in_feature`, `out_fature`) shape. The `trtllm-build` command adds a transpose operation to post-process it. + The linear weights in TensorRT LLM checkpoint always follows (`out_feature`, `in_feature`) shape, whereas some quantized linear in TensorRT LLM implemented by plugin may use (`in_feature`, `out_fature`) shape. The `trtllm-build` command adds a transpose operation to post-process it. ### Example @@ -218,7 +218,7 @@ Here is the `config.json`: ## Build Checkpoint into TensorRT Engine -TensorRT-LLM provides a unified build command: `trtllm-build`. Before using it, +TensorRT LLM provides a unified build command: `trtllm-build`. Before using it, you may need to add it to the `PATH`. ```bash diff --git a/docs/source/architecture/overview.md b/docs/source/architecture/overview.md index 9a3207c93a0..f1e9b7b3b6c 100644 --- a/docs/source/architecture/overview.md +++ b/docs/source/architecture/overview.md @@ -1,6 +1,6 @@ # Architecture Overview -The `LLM` class is a core entry point for the TensorRT-LLM, providing a simplified `generate()` API for efficient large language model inference. This abstraction aims to streamline the user experience, as demonstrated with TinyLlama: +The `LLM` class is a core entry point for the TensorRT LLM, providing a simplified `generate()` API for efficient large language model inference. This abstraction aims to streamline the user experience, as demonstrated with TinyLlama: ```python from tensorrt_llm import LLM @@ -16,7 +16,7 @@ The `LLM` class automatically manages essential pre and post-processing steps, i Internally, the `LLM` class orchestrates the creation of a dedicated `PyExecutor(Worker)` process on each rank. -![TRT-LLM Architecture Overview](../media/TRTLLM_Architecture_Overview.png) +![TensorRT LLM Architecture Overview](../media/TRTLLM_Architecture_Overview.png) This `PyExecutor` operates in a continuous background loop, designed for the efficient, asynchronous processing of inference requests. @@ -45,13 +45,13 @@ During each iteration of its background loop, the `PyExecutor` performs the foll ## Runtime Optimizations -TensorRT-LLM enhances inference throughput and reduces latency by integrating a suite of runtime optimizations, including CUDA Graph, [Overlap Scheduler](../features/overlap-scheduler.md), [Speculative decoding](../features/speculative-decoding.md), etc. 
+TensorRT LLM enhances inference throughput and reduces latency by integrating a suite of runtime optimizations, including CUDA Graph, [Overlap Scheduler](../features/overlap-scheduler.md), [Speculative decoding](../features/speculative-decoding.md), etc. ### CUDA Graph CUDA Graphs drastically reduce the CPU-side overhead associated with launching GPU kernels, which is particularly impactful in PyTorch-based inference where Python's host-side code can be a bottleneck. By capturing a sequence of CUDA operations as a single graph, the entire sequence can be launched with one API call, minimizing CPU-GPU synchronization and driver overhead. -To maximize the "hit rate" of these cached graphs, TensorRT-LLM employs CUDA Graph padding. If an incoming batch's size doesn't match a captured graph, it's padded to the nearest larger, supported size for which a graph exists. While this incurs minor overhead from computing "wasted" tokens, it's often a better trade-off than falling back to slower eager mode execution. This optimization has a significant impact, demonstrating up to a 22% end-to-end throughput increase on certain models and hardware. +To maximize the "hit rate" of these cached graphs, TensorRT LLM employs CUDA Graph padding. If an incoming batch's size doesn't match a captured graph, it's padded to the nearest larger, supported size for which a graph exists. While this incurs minor overhead from computing "wasted" tokens, it's often a better trade-off than falling back to slower eager mode execution. This optimization has a significant impact, demonstrating up to a 22% end-to-end throughput increase on certain models and hardware. ### Overlap Scheduler @@ -72,4 +72,4 @@ if self.previous_batch is not None: self._process_previous_batch() ``` -This approach effectively reduces GPU idle time and improves overall hardware occupancy. While it introduces one extra decoding step into the pipeline, the resulting throughput gain is a significant trade-off. For this reason, the Overlap Scheduler is enabled by default in TensorRT-LLM. +This approach effectively reduces GPU idle time and improves overall hardware occupancy. While it introduces one extra decoding step into the pipeline, the resulting throughput gain is a significant trade-off. For this reason, the Overlap Scheduler is enabled by default in TensorRT LLM. diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index 05d18284a06..65c9738cb80 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -1,4 +1,4 @@ -# How to get best performance on DeepSeek-R1 in TensorRT-LLM +# How to get best performance on DeepSeek-R1 in TensorRT LLM NVIDIA has announced world-record DeepSeek-R1 inference performance at NVIDIA GTC 2025. A single NVIDIA DGX system with eight NVIDIA Blackwell GPUs can achieve over 250 tokens per second per user or a maximum throughput of over 30,000 tokens per second on the massive, state-of-the-art 671 billion parameter DeepSeek-R1 model. 
[NVIDIA Blackwell Delivers World-Record DeepSeek-R1 Inference Performance](https://developer.nvidia.com/blog/nvidia-blackwell-delivers-world-record-deepseek-r1-inference-performance/) @@ -6,13 +6,13 @@ In this blog, we share the configurations and procedures about how to reproduce ## Table of Contents -- [How to get best performance on DeepSeek-R1 in TensorRT-LLM](#how-to-get-best-performance-on-deepseek-r1-in-tensorrt-llm) +- [How to get best performance on DeepSeek-R1 in TensorRT LLM](#how-to-get-best-performance-on-deepseek-r1-in-tensorrt-llm) - [Table of Contents](#table-of-contents) - - [Prerequisites: Install TensorRT-LLM and download models](#prerequisites-install-tensorrt-llm-and-download-models) - - [1. Download TensorRT-LLM](#1-download-tensorrt-llm) + - [Prerequisites: Install TensorRT LLM and download models](#prerequisites-install-tensorrt-llm-and-download-models) + - [1. Download TensorRT LLM](#1-download-tensorrt-llm) - [2. Download the DeepSeek R1 models](#2-download-the-deepseek-r1-models) - - [3. Build and run TensorRT-LLM container](#3-build-and-run-tensorrt-llm-container) - - [4. Compile and Install TensorRT-LLM](#4-compile-and-install-tensorrt-llm) + - [3. Build and run TensorRT LLM container](#3-build-and-run-tensorrt-llm-container) + - [4. Compile and Install TensorRT LLM](#4-compile-and-install-tensorrt-llm) - [5. Optional: Tune GPU clocks](#5-optional-tune-gpu-clocks) - [6. Dataset preparation](#6-dataset-preparation) - [Reproducing steps](#reproducing-steps) @@ -34,13 +34,13 @@ In this blog, we share the configurations and procedures about how to reproduce - [Out of memory issues](#out-of-memory-issues) -## Prerequisites: Install TensorRT-LLM and download models +## Prerequisites: Install TensorRT LLM and download models -This section can be skipped if you already have TensorRT-LLM installed and have already downloaded the DeepSeek R1 model checkpoint. +This section can be skipped if you already have TensorRT LLM installed and have already downloaded the DeepSeek R1 model checkpoint. -#### 1. Download TensorRT-LLM +#### 1. Download TensorRT LLM -**You can also find more comprehensive instructions to install TensorRT-LLM in this [TensorRT-LLM installation guide](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html), refer to that guide for common issues if you encounter any here.** +**You can also find more comprehensive instructions to install TensorRT LLM in this [TensorRT LLM installation guide](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html), refer to that guide for common issues if you encounter any here.** ``` bash # Prerequisites @@ -50,7 +50,7 @@ git lfs install # Replace with your actual path YOUR_WORK_PATH= -# Clone the TensorRT-LLM repository +# Clone the TensorRT LLM repository cd $YOUR_WORK_PATH git clone https://github.com/NVIDIA/TensorRT-LLM.git cd TensorRT-LLM @@ -77,7 +77,7 @@ git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4 git clone https://huggingface.co/deepseek-ai/DeepSeek-R1 ``` -#### 3. Build and run TensorRT-LLM container +#### 3. Build and run TensorRT LLM container ``` bash cd TensorRT-LLM @@ -85,7 +85,7 @@ make -C docker run LOCAL_USER=1 DOCKER_RUN_ARGS="-v $YOUR_MODEL_PATH:$YOUR_MODEL ``` Here we set `LOCAL_USER=1` argument to set up the local user instead of root account inside the container, you can remove it if running as root inside container is fine. -#### 4. Compile and Install TensorRT-LLM +#### 4. 
Compile and Install TensorRT LLM Here we compile the source inside the container: ``` bash @@ -122,11 +122,11 @@ The command to generate synthetic dataset will be attached to the max throughput This section provides the reproducing steps for NVIDIA Blackwell B200 and H200 GPUs, for both min-latency and max-throughput scenarios. -All the benchmarking is done by the trtllm-bench command line tool provided in the TensorRT-LLM installation, see [TensorRT-LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details of this tool. +All the benchmarking is done by the trtllm-bench command line tool provided in the TensorRT LLM installation, see [TensorRT LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details of this tool. For brevity, we only provide the commands to reproduce the perf numbers without detailed explanation of the tools and options in this doc. -All these commands here are assumed to be running inside the container started by `make -C docker run ...` command mentioned in the [Build and run TensorRT-LLM container section](#3-build-and-run-tensorrt-llm-container) +All these commands here are assumed to be running inside the container started by `make -C docker run ...` command mentioned in the [Build and run TensorRT LLM container section](#3-build-and-run-tensorrt-llm-container) ### B200 min-latency Our benchmark results are based on **Batch = 1, ISL = 1K, OSL = 2K, num_requests = 10 from real dataset** @@ -158,7 +158,7 @@ trtllm-bench --model nvidia/DeepSeek-R1-FP4 \ ``` Explanation: -- `trtllm-bench`: A CLI benchmarking utility that aims to make it easier for users to reproduce our officially published. See [TensorRT-LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details. +- `trtllm-bench`: A CLI benchmarking utility that aims to make it easier for users to reproduce our officially published. See [TensorRT LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details. - `--dataset`: Prompt dataset used to benchmark. Our official benchmark dataset has ISL = 1K, OSL = 2K - `--num_requests`: Num requests used for the benchmark. - `--concurrency`: Total concurrency for the system. @@ -186,7 +186,7 @@ Average request latency (ms): 7456.1219 Due to our evaluation found that FP8 KV cache does not introduce obvious accuracy drop compared to BF16 KV cache. See [Precision strategy](./tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md#precision-strategy), the latest [DeepSeek-R1-0528-FP4](https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4) checkpoint had enabled FP8 KV cache by-default. -We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers here. The results are reproduced with TensorRT-LLM commit b6261862419c33d6ce2313aff1e7116067d6037d. +We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers here. The results are reproduced with TensorRT LLM commit b6261862419c33d6ce2313aff1e7116067d6037d. !! Note that the exact command to reproduce numbers can change as the API/options are refactored, the option and numbers here is a reference at given exact commit. @@ -239,7 +239,7 @@ Per GPU Output Throughput (tps/gpu): 5393.2755 ### B200 max-throughput for R1 with FP16 KV cache Our benchmark results are based on **Batch = 3072, ISL = 1K, OSL = 2K, num_requests = 49152 from synthetic dataset**. 
-The results are reproduced with TensorRT-LLM commit b6261862419c33d6ce2313aff1e7116067d6037d. +The results are reproduced with TensorRT LLM commit b6261862419c33d6ce2313aff1e7116067d6037d. !! Note that the exact command to reproduce numbers can change as the API/options are refactored, the option and numbers here is a reference at given exact commit. @@ -401,7 +401,7 @@ Average request latency (ms): 181540.5739 ## Exploring more ISL/OSL combinations -To benchmark TensorRT-LLM on DeepSeek models with more ISL/OSL combinations, you can use `prepare_dataset.py` to generate the dataset and use similar commands mentioned in the previous section. TensorRT-LLM is working on enhancements that can make the benchmark process smoother. +To benchmark TensorRT LLM on DeepSeek models with more ISL/OSL combinations, you can use `prepare_dataset.py` to generate the dataset and use similar commands mentioned in the previous section. TensorRT LLM is working on enhancements that can make the benchmark process smoother. ### WIP: Enable more features by default Currently, there are some features that need to be enabled through a user-defined file `extra-llm-api-config.yml`, such as CUDA graph, overlap scheduler and attention dp. We're working on to enable those features by default, so that users can get good out-of-the-box performance on DeepSeek models. diff --git a/docs/source/blogs/Falcon180B-H200.md b/docs/source/blogs/Falcon180B-H200.md index f2c2fe75929..01e5eeba59a 100644 --- a/docs/source/blogs/Falcon180B-H200.md +++ b/docs/source/blogs/Falcon180B-H200.md @@ -1,13 +1,13 @@ # Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100 -H200's large capacity & high memory bandwidth, paired with TensorRT-LLM's +H200's large capacity & high memory bandwidth, paired with TensorRT LLM's optimizations, maximizes inference performance. ## Falcon-180B on a single H200 with INT4 AWQ [Falcon-180B](https://huggingface.co/tiiuae/falcon-180B), one of the largest & most accurate open source models available, can run on a *single* H200 GPU. -The 141GB of memory on H200, paired with TensorRT-LLM running INT4 AWQ with +The 141GB of memory on H200, paired with TensorRT LLM running INT4 AWQ with FP8, allows for the entire large language model to fit on a single GPU, where previously eight A100s were required. H200 Falcon-180B provides up to **800** tok/s and retains high accuracy. @@ -30,7 +30,7 @@ BS: (in order) 256, 128 **Model Accuracy:** Often quantization can have adverse impacts on the accuracy of the model, -however, TensorRT-LLM's AWQ decreases memory footprint of the model by **4x** +however, TensorRT LLM's AWQ decreases memory footprint of the model by **4x** while maintaining high accuracy. Falcon-180B accuracy comparison @@ -52,18 +52,18 @@ retain higher accuracy than other 4bit methods and reduce memory usage, but requires special kernels capable of handling the change in precision performantly. -TensorRT-LLM has implemented custom kernels for AWQ, and taken the technique a +TensorRT LLM has implemented custom kernels for AWQ, and taken the technique a step further by performing FP8 computation on Hopper GPUs instead of the standard FP16. -Similar examples running Falcon-180B with quantization in TensorRT-LLM are +Similar examples running Falcon-180B with quantization in TensorRT LLM are available in [examples/models/contrib/falcon](/examples/models/contrib/falcon). 
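For a concrete starting point, below is a minimal sketch of the two-step INT4 AWQ flow referenced above: quantize the Hugging Face checkpoint into a TensorRT LLM checkpoint, then build engines with `trtllm-build`. The `examples/quantization/quantize.py` path and its flags are assumptions based on the quantization examples and may differ between releases; the FP8-activation variant described above would use a different format flag (assumed `w4a8_awq`).

```bash
# Minimal sketch, not verified against a specific release: script path and
# flags are assumed from the examples/quantization utilities.

# 1) Convert the HF checkpoint into an INT4-AWQ TensorRT LLM checkpoint.
python examples/quantization/quantize.py \
    --model_dir tiiuae/falcon-180B \
    --dtype float16 \
    --qformat int4_awq \
    --output_dir ./falcon-180b-int4-awq-ckpt

# 2) Build TensorRT engines from the quantized checkpoint.
trtllm-build \
    --checkpoint_dir ./falcon-180b-int4-awq-ckpt \
    --output_dir ./falcon-180b-int4-awq-engine
```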
## Llama-70B on H200 up to 6.7x A100 -TensorRT-LLM has improved its Group Query Attention (GQA) kernels, in the +TensorRT LLM has improved its Group Query Attention (GQA) kernels, in the generation phase, providing up to 2.4x improvement on Llama-70B over -TensorRT-LLM v0.5, achieving over **3,800** tok/s/gpu at up to **6.7x** faster +TensorRT LLM v0.5, achieving over **3,800** tok/s/gpu at up to **6.7x** faster than A100. **H200 6.7x A100** @@ -106,7 +106,7 @@ BS 192 [**Grouped Query Attention (GQA)**](https://arxiv.org/abs/2305.13245v2) (Ainslie et al., 2023), used in Llama-70B, is a variant of Multihead Attention (MHA) which groups key-value (KV) heads together, resulting in fewer KV heads -than query (Q) heads. TensorRT-LLM has a custom implementation of MHA which +than query (Q) heads. TensorRT LLM has a custom implementation of MHA which supports GQA, multi-query attention (MQA) and standard MHA. It leverages Tensor Cores, including in the generation phase, and delivers great performance on NVIDIA GPUs. @@ -116,7 +116,7 @@ NVIDIA GPUs. These improvements will be published in the `main` branch soon, and will be included in the v0.7 & v0.8 releases. -Similar examples running Llama-70B in TensorRT-LLM are published in +Similar examples running Llama-70B in TensorRT LLM are published in [examples/models/core/llama](/examples/models/core/llama). For more information about H200, please see the [H200 announcement blog](./H200launch.md). diff --git a/docs/source/blogs/H100vsA100.md b/docs/source/blogs/H100vsA100.md index bd87dc718a3..06edd816202 100644 --- a/docs/source/blogs/H100vsA100.md +++ b/docs/source/blogs/H100vsA100.md @@ -1,16 +1,16 @@ -> :bangbang: :new: *NVIDIA H200 has been announced & is optimized on TensorRT-LLM. Learn more about H200, & H100 comparison, here:* [**H200** achieves nearly **12,000 tokens/sec on Llama2-13B** with TensorRT-LLM](./H200launch.md) +> :bangbang: :new: *NVIDIA H200 has been announced & is optimized on TensorRT LLM. Learn more about H200, & H100 comparison, here:* [**H200** achieves nearly **12,000 tokens/sec on Llama2-13B** with TensorRT LLM](./H200launch.md) -# H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token +# H100 has 4.6x A100 Performance in TensorRT LLM, achieving 10,000 tok/s at 100ms to first token -TensorRT-LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x max throughput and 4.4x faster 1st token latency than A100**. H100 FP8 is able to achieve over 10,000 output tok/s at peak throughput for 64 concurrent requests, while maintaining a 1st token latency of 100ms. For min-latency applications, TRT-LLM H100 can achieve less than 10ms to 1st token latency. +TensorRT LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x max throughput and 4.4x faster 1st token latency than A100**. H100 FP8 is able to achieve over 10,000 output tok/s at peak throughput for 64 concurrent requests, while maintaining a 1st token latency of 100ms. For min-latency applications, TRT-LLM H100 can achieve less than 10ms to 1st token latency. max throughput 1st token latency -TensorRT-LLM throughput & first token latency on H100 & A100. H100 FP8, A100 FP16, SXM 80GB GPUs, ISL/OSL's provided, TP=1, BS=32/64 max throughput, BS=1 1st token latency. TensorRT-LLM v0.5.0, TensorRT 9.1. +TensorRT LLM throughput & first token latency on H100 & A100. H100 FP8, A100 FP16, SXM 80GB GPUs, ISL/OSL's provided, TP=1, BS=32/64 max throughput, BS=1 1st token latency. TensorRT LLM v0.5.0, TensorRT 9.1. 
Max throughput calculated by sweeping BS 1,2,...,64. Throughput taken at largest successful. **Max Throughput & Min Latency** @@ -26,9 +26,9 @@ TensorRT-LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x | GPT-J 6B | 64 | 128 | 128 | **3.0x** | **4.7x** | | GPT-J 6B | 1 | 128 | - | **2.4x** | 1.7x | -FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT-LLM v0.5.0., TensorRT 9.1 +FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT LLM v0.5.0., TensorRT 9.1 -The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT-LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html) +The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html) Stay tuned for a highlight on Llama coming soon! diff --git a/docs/source/blogs/H200launch.md b/docs/source/blogs/H200launch.md index baa4905613d..6fd0737c33d 100644 --- a/docs/source/blogs/H200launch.md +++ b/docs/source/blogs/H200launch.md @@ -1,8 +1,8 @@ -:loudspeaker: Note: The below data is using TensorRT-LLM v0.5. There have been significant improvements in v0.6 & later. Please see updated Llama performance [here](./Falcon180B-H200.md). +:loudspeaker: Note: The below data is using TensorRT LLM v0.5. There have been significant improvements in v0.6 & later. Please see updated Llama performance [here](./Falcon180B-H200.md). -# H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM +# H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT LLM -TensorRT-LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news/nvidia-supercharges-hopper-the-worlds-leading-ai-computing-platform) achieves **11,819 tokens/s on Llama2-13B** on a single GPU. H200 is up to **1.9x faster** than H100. This performance is enabled by H200's larger, faster [HBM3e memory](#latest-hbm-memory). +TensorRT LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news/nvidia-supercharges-hopper-the-worlds-leading-ai-computing-platform) achieves **11,819 tokens/s on Llama2-13B** on a single GPU. H200 is up to **1.9x faster** than H100. This performance is enabled by H200's larger, faster [HBM3e memory](#latest-hbm-memory). **H200 FP8 Max throughput** @@ -17,11 +17,11 @@ TensorRT-LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news | llama_70b | 64 | 1 | 2048 | 128 | 341 | | llama_70b | 32 | 1 | 2048 | 128 | 303 | -Preliminary measured performance, subject to change. TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. +Preliminary measured performance, subject to change. TensorRT LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. *(1) Largest batch supported on given TP configuration by power of 2.* *(2) TP = Tensor Parallelism* -Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT-LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html). 
+Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html). ### H200 vs H100 @@ -38,7 +38,7 @@ an online chat agent scenario (ISL/OSL=80/200) with GPT3-175B on a full HGX (TP8 H200 TPS Preliminary measured performance, subject to change. -TensorRT-LLM v0.5.0, TensorRT v9.1.0.4. | Llama-70B: H100 FP8 BS 8, H200 FP8 BS 32 | GPT3-175B: H100 FP8 BS 64, H200 FP8 BS 128 +TensorRT LLM v0.5.0, TensorRT v9.1.0.4. | Llama-70B: H100 FP8 BS 8, H200 FP8 BS 32 | GPT3-175B: H100 FP8 BS 64, H200 FP8 BS 128 **Max Throughput across TP/BS:** @@ -47,7 +47,7 @@ Max throughput(3) on H200 vs H100 varies by model, sequence lengths, max throughput llama sweep Preliminary measured performance, subject to change. -TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. +TensorRT LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. *(3) Max Throughput per GPU is defined as the highest tok/s per GPU, swept across TP configurations & BS powers of 2.* @@ -55,4 +55,4 @@ TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. ### Latest HBM Memory -H200 is the newest addition to NVIDIA’s data center GPU portfolio. To maximize that compute performance, H200 is the first GPU with HBM3e memory with 4.8TB/s of memory bandwidth, a 1.4X increase over H100. H200 also expands GPU memory capacity nearly 2X to 141 gigabytes (GB). The combination of faster and larger HBM memory accelerates performance of LLM model inference performance with faster throughput and tokens per second. These results are measured and preliminary, more updates expected as optimizations for H200 continue with TensorRT-LLM. +H200 is the newest addition to NVIDIA’s data center GPU portfolio. To maximize that compute performance, H200 is the first GPU with HBM3e memory with 4.8TB/s of memory bandwidth, a 1.4X increase over H100. H200 also expands GPU memory capacity nearly 2X to 141 gigabytes (GB). The combination of faster and larger HBM memory accelerates performance of LLM model inference performance with faster throughput and tokens per second. These results are measured and preliminary, more updates expected as optimizations for H200 continue with TensorRT LLM. diff --git a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md index 201c3781a8d..05e310c97c9 100644 --- a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md +++ b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md @@ -1,5 +1,5 @@ # Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs -by NVIDIA TensorRT-LLM team +by NVIDIA TensorRT LLM team ## Table of Contents - [Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs](#pushing-latency-boundaries-optimizing-deepseek-r1-performance-on-nvidia-b200-gpus) @@ -39,7 +39,7 @@ by NVIDIA TensorRT-LLM team ## Background Recent advancements in Large Language Reasoning Models have demonstrated remarkable success, while creating new deployment challenges. 
A critical challenge emerges from extended Output Sequence Lengths (OSL) due to complex "thinking and reasoning" processes. Longer OSL demands stricter Token-to-Token Latency (TTL) requirements, often forcing concurrency limitations. The most extreme case, single concurrency (min-latency scenario) , becomes particularly challenging for real-time applications. -This article explores how TensorRT-LLM achieves record-breaking performance for [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) in min-latency scenarios on NVIDIA's 8×B200 GPU configuration progressing from 67 tokens per second (TPS) to 253 before GTC 2025(**3.7x** speed-up), and to our current number is 368 TPS (**5.5x** speed-up). +This article explores how TensorRT LLM achieves record-breaking performance for [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) in min-latency scenarios on NVIDIA's 8×B200 GPU configuration progressing from 67 tokens per second (TPS) to 253 before GTC 2025(**3.7x** speed-up), and to our current number is 368 TPS (**5.5x** speed-up). ## Implementation Configuration @@ -65,7 +65,7 @@ We have explored a mixed precision recipe, which provides a better tradeoff betw | 3x MTP Layers | bf16 | | RouterGEMM*** | bf16 | -*TensorRT-LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla) while for this latency scenario low-precision attention computation doesn't help with performance so we choose to use bf16 precision for the Attention Modules. +*TensorRT LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla) while for this latency scenario low-precision attention computation doesn't help with performance so we choose to use bf16 precision for the Attention Modules. ** nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer). @@ -263,6 +263,6 @@ It's also needed to set `use_relaxed_acceptance_for_thinking: true`, `relaxed_to - More Exploration of MTP ## Acknowledgment -Pushing the performance boundaries of DeepSeek R1 for latency-sensitive applications has been a remarkable engineering journey. The optimizations detailed in this post represent an exceptional cross-functional collaboration across the entire AI technology stack - spanning kernel-level optimizations, runtime enhancements, model quantization techniques, algorithmic improvements, and systematic performance analysis and tuning. While we can't individually acknowledge every contributor, we're proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in TensorRT-LLM performance engineering. +Pushing the performance boundaries of DeepSeek R1 for latency-sensitive applications has been a remarkable engineering journey. The optimizations detailed in this post represent an exceptional cross-functional collaboration across the entire AI technology stack - spanning kernel-level optimizations, runtime enhancements, model quantization techniques, algorithmic improvements, and systematic performance analysis and tuning. While we can't individually acknowledge every contributor, we're proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in TensorRT LLM performance engineering. 
Through this collaborative endeavor, we've developed valuable insights into maximizing GPU utilization for large language model inference. We hope that the techniques and best practices shared in this blog will empower the developer community to better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications. diff --git a/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md b/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md index 9093aa663a3..479e3db2c28 100644 --- a/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md +++ b/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md @@ -1,5 +1,5 @@ # DeepSeek R1 MTP Implementation and Optimization -by NVIDIA TensorRT-LLM team +by NVIDIA TensorRT LLM team ## Table of Contents - [DeepSeek R1 MTP Implementation and Optimization](#deepseek-r1-mtp-implementation-and-optimization) - [Table of Contents](#table-of-contents) @@ -7,7 +7,7 @@ by NVIDIA TensorRT-LLM team - [Background](#background) - [MTP Vanilla](#mtp-vanilla) - [MTP Eagle](#mtp-eagle) - - [MTP implementation in TensorRT-LLM](#mtp-implementation-in-tensorrt-llm) + - [MTP implementation in TensorRT LLM](#mtp-implementation-in-tensorrt-llm) - [Basic Implementation](#basic-implementation) - [MTP Modules](#mtp-modules) - [Attention for MTP](#attention-for-mtp) @@ -25,7 +25,7 @@ by NVIDIA TensorRT-LLM team - [Acknowledgment](#acknowledgment) -TensorRT-LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our [previous blog post](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md), we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT-LLM. +TensorRT LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our [previous blog post](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md), we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT LLM. ## MTP for inference Inspired by a previous [research work](https://arxiv.org/pdf/2404.19737), MTP is designed to help the DeepSeek-V3 training. It adds additional MTP modules at the end of the main model and uses them to predict additional tokens. In this way, MTP can extend the prediction scope to multiple future tokens at each position to achieve better model accuracy. During inference, those MTP modules can also be used for speculative decoding to improve the generation latency further. In this section, we will introduce the MTP speculative decoding algorithm for LLM inference. @@ -74,18 +74,18 @@ Figure 3 gives an MTP Eagle example. In the context phase, the inputs of the fir In the generation phase, the verification stage is the same as MTP Vanilla. Once we get the accepted tokens, we use all of them along with their corresponding hidden states as inputs for the first MTP module forward. 
Unlike MTP Vanilla, which needs to store past tokens and hidden states, this approach is much easier to implement. Subsequent MTP module forwards follow the same input preparation method as the context phase. After predicting all draft tokens, we need to evict the key/value pairs of any rejected draft tokens from the main model’s KV cache. -## MTP implementation in TensorRT-LLM +## MTP implementation in TensorRT LLM ### Basic Implementation -TensorRT-LLM has two different paths for MTP, one for [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047) and another for [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047). MTP Eagle is the default path for DeepSeek-V3 and DeepSeek-R1 models. +TensorRT LLM has two different paths for MTP, one for [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047) and another for [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047). MTP Eagle is the default path for DeepSeek-V3 and DeepSeek-R1 models.
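For reference, one way to turn MTP on at deployment time is through the YAML options file passed to `trtllm-serve`; the "How to run DeepSeek models with MTP" section later in this blog uses `quickstart_advanced.py` instead. The `speculative_config` keys and the `--extra_llm_api_options` flag shown here are assumptions taken from related TensorRT LLM docs, so treat this as an illustrative sketch rather than a definitive recipe.

```bash
# Illustrative sketch only: the YAML keys (speculative_config.decoding_type,
# num_nextn_predict_layers) and --extra_llm_api_options are assumed from
# related TensorRT LLM documentation and may change between releases.
cat > ./mtp-config.yml <<EOF
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
EOF

trtllm-serve deepseek-ai/DeepSeek-R1 \
    --extra_llm_api_options ./mtp-config.yml
```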
[Figure: tech_blog2_overall_workflow]
-Figure 4. MTP workflow in TensorRT-LLM
+Figure 4. MTP workflow in TensorRT LLM
-Figure 4 shows the overall workflow of MTP in TensorRT-LLM. Both paths share the runtime workflow, and the differences are in the MTP modules forward. In the context phase, there is no draft token in the inputs. TensorRT-LLM model engine fetches the input IDs from the requests and inputs to the model engine forward to get the next token and the hidden state. Then we prepare the MTP module inputs, and the MTP modules forward the inputs to predict the draft tokens. +Figure 4 shows the overall workflow of MTP in TensorRT LLM. Both paths share the runtime workflow, and the differences are in the MTP modules forward. In the context phase, there is no draft token in the inputs. TensorRT LLM model engine fetches the input IDs from the requests and inputs to the model engine forward to get the next token and the hidden state. Then we prepare the MTP module inputs, and the MTP modules forward the inputs to predict the draft tokens. The generation workflow is more complicated. We need to do both the verification and draft stages. The predicted new token and draft tokens are the inputs for the main model. After the main model forward, we can sample from the output logits and get the following new tokens. Then compare them with the input draft tokens to get the final accepted tokens. The verification stage will be finished here. We will use the accepted tokens and hidden states to start a new draft stage, which uses the MTP layers to predict new draft tokens for the next iteration. Finally, we need to rewind the KV cache to evict keys/values corresponding to those rejected tokens. @@ -107,7 +107,7 @@ The MTP module follows the design in DeepSeek-V3. The embedding layer and output ### Attention for MTP -Attention is also a very important component in supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For the normal request, there will be only one input token in the generation phase, but for MTP, there will be $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Though we have an MTP Eagle path, currently, we only have the chain-based support for MTP Eagle. So, a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT-LLM will use the fp8 flashMLA generation kernel on Hopper GPU, while using TRTLLM customized attention kernels on Blackwell for better performance. +Attention is also a very important component in supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For the normal request, there will be only one input token in the generation phase, but for MTP, there will be $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Though we have an MTP Eagle path, currently, we only have the chain-based support for MTP Eagle. So, a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT LLM will use the fp8 flashMLA generation kernel on Hopper GPU, while using TRTLLM customized attention kernels on Blackwell for better performance. ### How to run DeepSeek models with MTP Run DeepSeek-V3/R1 models with MTP, use [examples/llm-api/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llm-api/quickstart_advanced.py) with additional options: @@ -237,12 +237,12 @@ We validated the Relaxed Acceptance on different datasets. In Figure 8, we show

[Figure 9. Comparison between the chain-based and tree-based speculative decoding]
-TensorRT-LLM PyTorch backend can only support chain-based speculative decoding now, both MTP Vanilla and MTP Eagle. However, the tree-based speculative decoding technique is widely used in previous advanced methods, such as Ealge2 and Eagle3, to increase the acceptance rate. MTPs in TensorRT-LLM can also be extended to support the tree-based technique. Figure 9 compares the chain-based method with the tree-based method. Both full tree and dynamic tree methods can help expand the candidate combinations, so that we can have more choices for the draft tokens. +TensorRT LLM PyTorch backend can only support chain-based speculative decoding now, both MTP Vanilla and MTP Eagle. However, the tree-based speculative decoding technique is widely used in previous advanced methods, such as Ealge2 and Eagle3, to increase the acceptance rate. MTPs in TensorRT LLM can also be extended to support the tree-based technique. Figure 9 compares the chain-based method with the tree-based method. Both full tree and dynamic tree methods can help expand the candidate combinations, so that we can have more choices for the draft tokens. ### Eagle3 support -Another important method is Eagle3. From the [Eagle3 paper](https://arxiv.org/pdf/2503.01840), the promising results show that it can help greatly increase the acceptance rate by leveraging different levels’ hidden states to predict draft tokens. Since TensorRT-LLM already has [Eagle-3 support](https://github.com/NVIDIA/TensorRT-LLM/pull/3035) now, in the future, we also want to train an Eagle3 head to support DeepSeek-V3/R1+Eagle3 to achieve better speedup. +Another important method is Eagle3. From the [Eagle3 paper](https://arxiv.org/pdf/2503.01840), the promising results show that it can help greatly increase the acceptance rate by leveraging different levels’ hidden states to predict draft tokens. Since TensorRT LLM already has [Eagle-3 support](https://github.com/NVIDIA/TensorRT-LLM/pull/3035) now, in the future, we also want to train an Eagle3 head to support DeepSeek-V3/R1+Eagle3 to achieve better speedup. ## Acknowledgment -This was a remarkable cross-team effort to support and optimize MTP in TensorRT-LLM. We would like to extend our gratitude to everyone who contributed to making this possible, as it involved a typical system/algorithm co-design approach spanning multiple technical layers—including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement & analysis. And a special thanks goes to the DeepSeek team for developing the MTP method, which lays down the foundation of this blog. +This was a remarkable cross-team effort to support and optimize MTP in TensorRT LLM. We would like to extend our gratitude to everyone who contributed to making this possible, as it involved a typical system/algorithm co-design approach spanning multiple technical layers—including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement & analysis. And a special thanks goes to the DeepSeek team for developing the MTP method, which lays down the foundation of this blog. 
diff --git a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md index a5bb524e661..2da07411a83 100644 --- a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md +++ b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md @@ -1,6 +1,6 @@ # Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers -By NVIDIA TensorRT-LLM team +By NVIDIA TensorRT LLM team ## Table of Contents - [Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers](#optimizing-deepseek-r1-throughput-on-nvidia-blackwell-gpus-a-deep-dive-for-developers) - [Table of Contents](#table-of-contents) @@ -18,7 +18,7 @@ By NVIDIA TensorRT-LLM team - [Acknowledgment](#acknowledgment) ## Introduction -The open source DeepSeek R1 model's innovative architecture including the multi-head latent attention (MLA) and large sparse Mixture-of-Experts (MoE) significantly improved the inference efficiency of the LLM models. However, harnessing the full potential of such an innovative structure requires equally important hardware/software co-optimization. This post delves into the optimization strategies for DeepSeek R1 throughput oriented scenarios (TPS/GPU), developed by NVIDIA within TensorRT-LLM on NVIDIA's Blackwell B200 GPUs. We will explore the rationale behind each enhancement. [The other min-latency optimization blog](./blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) explained in detail how TensorRT-LLM optimizes the R1 performance to achieve the best of the TPS/USER. +The open source DeepSeek R1 model's innovative architecture including the multi-head latent attention (MLA) and large sparse Mixture-of-Experts (MoE) significantly improved the inference efficiency of the LLM models. However, harnessing the full potential of such an innovative structure requires equally important hardware/software co-optimization. This post delves into the optimization strategies for DeepSeek R1 throughput oriented scenarios (TPS/GPU), developed by NVIDIA within TensorRT LLM on NVIDIA's Blackwell B200 GPUs. We will explore the rationale behind each enhancement. [The other min-latency optimization blog](./blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) explained in detail how TensorRT LLM optimizes the R1 performance to achieve the best of the TPS/USER. These optimizations have significantly boosted DeepSeek R1 throughput on Blackwell. Performance increased from approximately 2000 TPS/GPU in February to 4600 TPS/GPU on ISL/OSL 1K/2K dataset. The optimizations are general and applicable to other ISL/OSL configs too. These optimization items were broadly categorized into three areas: MLA layers, MoE layers, and runtime. @@ -29,12 +29,12 @@ The mixed precision recipe for DeepSeek R1 throughput scenario is almost the sam * FP8 KV cache and FP8 attention, rather than BF16 precision. * FP4 Allgather for better communication bandwidth utilization. -The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). 
The accuracy score of common dataset on this FP4 checkpoint and TensorRT-LLM implementations are: +The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are: | Precision | GPQA Diamond | MATH-500 | :-- | :-- | :-- | -| TensorRT-LLM FP8 | 0.697 | 0.954 | -| TensorRT-LLM FP4 | 0.705 | 0.96 | +| TensorRT LLM FP8 | 0.697 | 0.954 | +| TensorRT LLM FP4 | 0.705 | 0.96 | ** Note there are some run-to-run variance for these evaluations, so FP4 data is slight higher here. We think FP4 has comparable accuracy with FP8 on these datasets. @@ -62,12 +62,12 @@ In the following sections we will explain the rationale why DP and EP are chosen ### Weights absorb and MQA -The core idea of MLA is the low-rank joint compression for the attention keys and values to reduce KV-cache size during the inference. Based on the MLA formulas, the down-projected KV latent is up-projected to multiple heads and combined with the up-projected Q to establish a normal multi-head attention (MHA). Due to the nature of the matrix multiplication, the up projection weights matrix of the K (W^UK) can be multiplied by the up-projection weights matrix of Q (W^Q) firstly, the computed results of these 2 can be then multiplied to Q. The up-projection weights matrix of V (W^UV) and the attention output projection matrix W^O can also be multiplied after the attention output. The DeepSeek-V2 technical report calls this technique "absorb". After the weights are absorbed, the MLA is equivalent to multiple query attention(MQA). Please see the [original DeepSeek-V2 technical paper](https://arxiv.org/pdf/2405.04434) for the detailed formulas and explanations, the following block diagram shows the computational flow of weights absorbed MLA in TensorRT-LLM. +The core idea of MLA is the low-rank joint compression for the attention keys and values to reduce KV-cache size during the inference. Based on the MLA formulas, the down-projected KV latent is up-projected to multiple heads and combined with the up-projected Q to establish a normal multi-head attention (MHA). Due to the nature of the matrix multiplication, the up projection weights matrix of the K (W^UK) can be multiplied by the up-projection weights matrix of Q (W^Q) firstly, the computed results of these 2 can be then multiplied to Q. The up-projection weights matrix of V (W^UV) and the attention output projection matrix W^O can also be multiplied after the attention output. The DeepSeek-V2 technical report calls this technique "absorb". After the weights are absorbed, the MLA is equivalent to multiple query attention(MQA). Please see the [original DeepSeek-V2 technical paper](https://arxiv.org/pdf/2405.04434) for the detailed formulas and explanations, the following block diagram shows the computational flow of weights absorbed MLA in TensorRT LLM. ![Weights Absorb](../media/tech_blog3_mla_absorb.png "Weights Absorbed MLA") For the decoding phase, the weights absorb significantly reduces the math FLOPS needed to up project the K and V, since the FLOPs needed for these up projections of KV are linear to the KV cache length, while length of Q vector is always 1 in the decoding phase. 
The longer the KV cache history is, the more FLOPs are needed, and the up projections are repeated for every decoded token since only the projected KV latent were saved, which further increases the FLOPs needed. For the prefill phase, the weights absorbed version changes the dimensions of Q and KV thus increasing the number of FLOPs for attention. Based on roofline analysis, non absorbed version is beneficial for the prefill phase with input length 256 or larger -The TensorRT-LLM MLA implementation chooses different highly optimized kernels for prefill and decoding, see [MLA](../../../../tensorrt_llm/_torch/modules/attention.py). +The TensorRT LLM MLA implementation chooses different highly optimized kernels for prefill and decoding, see [MLA](../../../../tensorrt_llm/_torch/modules/attention.py). ### Data Parallel for Attention module (ADP) @@ -147,7 +147,7 @@ The following optimizations are already done for MoE layers. ## Runtime Optimizations -These optimizations target the overall execution flow, scheduling, and resource management within the inference system. They are shared between DeepSeek R1 models and other models supported in the TensorRT-LLM, here we are sharing some ablation study for the performance benefits on DeepSeek R1 on B200. +These optimizations target the overall execution flow, scheduling, and resource management within the inference system. They are shared between DeepSeek R1 models and other models supported in the TensorRT LLM, here we are sharing some ablation study for the performance benefits on DeepSeek R1 on B200. * CUDA Graph @@ -155,13 +155,13 @@ These optimizations target the overall execution flow, scheduling, and resource CUDA Graphs allow capturing a sequence of CUDA operations and launching them as a single unit, drastically reducing kernel launch overheads. This is particularly beneficial for models with many small kernels, and particularly on the PyTorch flow, because the python host code normally executes slower than C++. Since the CUDA Graph freezes the kernel launch parameters, which is normally associated with the tensor shapes, it can only be safely used with static shape, meaning that different CUDA graphs need to be captured for different batch sizes. Each graph will have some cost of memory usage, and capturing time, thus we cannot capture every possible CUDA graph for all possible batches. For the non-captured batch sizes, PyTorch eager mode code will be executed. - There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted tokens computation. + There is a feature called CUDA Graph padding in TensorRT LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted tokens computation. 
Users can opt-out the CUDA Graph padding feature to see the perf benefits, by setting the `cuda_graph_config:\n enable_padding: False`, see API here [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41) * Overlap Scheduler: - Showed a **4% E2E performance impact** and should generally **always be used**. This scheduler manages the execution of different operations (like computation and communication) to overlap them effectively on the GPU and network. The intuition is to hide latency by performing computation while waiting for data transfers or vice versa, improving overall hardware utilization. The overlap schedule is already defaulted on in TensorRT-LLM by [commit](https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428#diff-3c4f29d6594b37af0f1fbb97f5291b18e49f3f2510f9d296c7adb2829e9da0bf). In case there are corner cases where it does not work, users can still opt-out this feature by set *disable_overlap_scheduler* to true. + Showed a **4% E2E performance impact** and should generally **always be used**. This scheduler manages the execution of different operations (like computation and communication) to overlap them effectively on the GPU and network. The intuition is to hide latency by performing computation while waiting for data transfers or vice versa, improving overall hardware utilization. The overlap schedule is already defaulted on in TensorRT LLM by [commit](https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428#diff-3c4f29d6594b37af0f1fbb97f5291b18e49f3f2510f9d296c7adb2829e9da0bf). In case there are corner cases where it does not work, users can still opt-out this feature by set *disable_overlap_scheduler* to true. * Memory Optimizations @@ -179,4 +179,4 @@ See [Perf practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/sourc ## Acknowledgment -The substantial throughput advancements for DeepSeek R1 on Blackwell GPUs, as detailed in this post, are the fruit of a dedicated and collaborative engineering effort. Achieving nearly a 2.3x increase in TPS/GPU required a deep dive into MLA layers, MoE layers, and runtime optimizations. We extend our sincere appreciation to all the engineers involved in this intensive optimization process. Their collective expertise in pushing the boundaries of throughput performance within TensorRT-LLM has been instrumental. We trust that sharing these specific strategies for maximizing throughput will prove beneficial to the developer community as they tackle demanding LLM inference workloads on NVIDIA hardware. +The substantial throughput advancements for DeepSeek R1 on Blackwell GPUs, as detailed in this post, are the fruit of a dedicated and collaborative engineering effort. Achieving nearly a 2.3x increase in TPS/GPU required a deep dive into MLA layers, MoE layers, and runtime optimizations. We extend our sincere appreciation to all the engineers involved in this intensive optimization process. Their collective expertise in pushing the boundaries of throughput performance within TensorRT LLM has been instrumental. We trust that sharing these specific strategies for maximizing throughput will prove beneficial to the developer community as they tackle demanding LLM inference workloads on NVIDIA hardware. 
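As a quick illustration of the two runtime knobs discussed above, the sketch below writes them into an extra LLM API options YAML file of the kind used throughout these docs. Only the `cuda_graph_config.enable_padding` and `disable_overlap_scheduler` keys come from the text; the file path and the flag that consumes the file are assumptions.

```python
# Sketch: write an extra LLM API options file that exercises the two runtime
# knobs discussed above. The YAML keys come from the blog text; the file path
# and the way the file is passed to the server are illustrative assumptions.
import yaml  # requires PyYAML

extra_options = {
    # Opt out of CUDA Graph padding only to measure its cost/benefit trade-off;
    # padding is normally recommended to raise the CUDA Graph hit rate.
    "cuda_graph_config": {"enable_padding": False},
    # Keep the overlap scheduler on; flip to True only for corner cases.
    "disable_overlap_scheduler": False,
}

with open("/tmp/extra-llm-api-config.yml", "w") as f:
    yaml.safe_dump(extra_options, f)

# The file is then referenced when launching the server, e.g. (assumed flag):
#   trtllm-serve <model> --extra_llm_api_options /tmp/extra-llm-api-config.yml
print(open("/tmp/extra-llm-api-config.yml").read())
```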
diff --git a/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md b/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md index ed4a7a4dc54..c0dbcea6bc5 100644 --- a/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md +++ b/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md @@ -1,9 +1,9 @@ -# Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP) +# Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP) -By NVIDIA TensorRT-LLM Team +By NVIDIA TensorRT LLM Team ## Table of Contents -- [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llm-part-1-design-and-implementation-of-large-scale-ep) +- [Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llm-part-1-design-and-implementation-of-large-scale-ep) - [Table of Contents](#table-of-contents) - [Motivation for large-scale EP](#motivation-for-large-scale-ep) - [Observations over one machine translation dataset](#observations-over-one-machine-translation-dataset) @@ -39,7 +39,7 @@ In the past, we have shared TensorRT-LLM’s optimization experience to [push th The DeepSeek team has also shared their valuable experience and practice on how to optimize this kind of large-scale Expert Parallelism (EP) model, including [DeepEP](https://github.com/deepseek-ai/DeepEP) and [EPLB](https://github.com/deepseek-ai/EPLB). Also, the DeepSeek team has shared their concrete design considerations in [this](https://arxiv.org/abs/2412.19437) tech report. On top of those great sharings, there are also nice community efforts to implement large-scale EP in other inference engines, such as [this](https://lmsys.org/blog/2025-05-05-large-scale-ep/) effort from the SGLang team. -In this tech blog, we will introduce the details of the design and implementation to support E2E large-scale EP in TensorRT-LLM. This blog post mainly covers the following: +In this tech blog, we will introduce the details of the design and implementation to support E2E large-scale EP in TensorRT LLM . This blog post mainly covers the following: * How to leverage NVIDIA GB200 Multi-Node NVLink (MNNVL) HW features to implement high-performance communication kernels. * How to design and implement an online expert workload balancer to dynamically balance the expert load distribution and adapt to the changes of online traffic patterns. We present: @@ -48,16 +48,16 @@ In this tech blog, we will introduce the details of the design and implementatio * The design and implementation of the replication/placement strategy. * The MoE weight load/re-distributer to balance the online workload across multiple GPUs. * The changes needed to the MoE router and computation module to adapt to the expert load balancer needs. - * Some preliminary data demonstrating the effectiveness of the current implementation in TensorRT-LLM. + * Some preliminary data demonstrating the effectiveness of the current implementation in TensorRT LLM . In future tech blogs, we will also cover the following topics: -* The introduction of performance tuning and optimization for TensorRT-LLM large-scale EP GB200 implementation. +* The introduction of performance tuning and optimization for TensorRT LLM large-scale EP GB200 implementation. 
* How to implement efficient large-scale EP support for B200/Hopper and other NVIDIA GPUs without MNNVL.
* The best practices to leverage large-scale EP and get performance gains.
* How to combine large-scale EP with other system optimization techniques.

-Even if, in this tech blog, we focus on TensorRT-LLM, we believe the core ideas and implementation can also be applied to other inference engines to help the inference performance on NVIDIA GPUs. Also, with the help of the community, we would like to figure out how to better modularize the current TensorRT-LLM large-scale EP implementation and make it more easily reusable by the community.
+Even though, in this tech blog, we focus on TensorRT LLM, we believe the core ideas and implementation can also be applied to other inference engines to help the inference performance on NVIDIA GPUs. Also, with the help of the community, we would like to figure out how to better modularize the current TensorRT LLM large-scale EP implementation and make it more easily reusable by the community.

Finally, in this tech blog, there are implementation details which are targeted towards the GB200 system, such as the communication components leveraging the GB200 MNNVL inter-GPU connection, and the MoE weight load/re-distributer module leveraging the high bandwidth C2C connection between Grace CPU and Blackwell GPU. Nevertheless, the overall design principle and software architecture can still apply to non-GB200 NVIDIA GPU systems. To facilitate the extension to other non-GB200 system, we have, on purpose, paid attention to the generalization of the design and implementation. These changes should be easily composable with other existing components.

@@ -221,7 +221,7 @@ To make sure large-scale EP can run well, careful considerations are needed to m

-Figure 12: the high-level design of TensorRT-LLM large-scale EP
+Figure 12: the high-level design of TensorRT LLM large-scale EP

In this design, there are both CPU and GPU side logics: @@ -306,7 +306,7 @@ The current technical decision is: The considerations are: -* DeepEP is a great piece of work done by the DeepSeek team. When we started the TensorRT-LLM large-scale EP efforts, our first focus was on GB200. We chose to implement our own custom EP communication kernels as it was easier to introduce optimizations requiring the GB200 MNNVL capability. Also, based on our current evaluation, DeepEP does not provide CUDA graph compatibility for all the scenarios. We believe that CUDA graph is needed for the scenario we are interested in. +* DeepEP is a great piece of work done by the DeepSeek team. When we started the TensorRT LLM large-scale EP efforts, our first focus was on GB200. We chose to implement our own custom EP communication kernels as it was easier to introduce optimizations requiring the GB200 MNNVL capability. Also, based on our current evaluation, DeepEP does not provide CUDA graph compatibility for all the scenarios. We believe that CUDA graph is needed for the scenario we are interested in. * When we started the efforts to enable large-scale EP on Hopper, we concluded that DeepEP could be adapted and meet our needs on this platform. We plan to extend DeepEP to work for B200 in the future. We are also actively evaluating the possibility of consolidating GB200 and non-GB200 EP communication kernels into a single solution to make the system simpler, and we will keep the community posted on the status. @@ -333,7 +333,7 @@ More details can be found in [PR 3504](https://github.com/NVIDIA/TensorRT-LLM/pu ## EP Load Balancer -TensorRT-LLM implements a set of functionalities to achieve EP Load Balancing. There are several key components: +TensorRT LLM implements a set of functionalities to achieve EP Load Balancing. There are several key components: ### Python Interface @@ -364,7 +364,7 @@ The GPU core logic contains the following components: There are GPU/CPU synchronization components implemented. More details can be found in [PR 4384](https://github.com/NVIDIA/TensorRT-LLM/pull/4384) and [PR 4495](https://github.com/NVIDIA/TensorRT-LLM/pull/4495). -Based on these core utilities, there are two versions of EP Load Balancer in TensorRT-LLM: Offline EP Load Balancer and Online EP Load Balancer. +Based on these core utilities, there are two versions of EP Load Balancer in TensorRT LLM : Offline EP Load Balancer and Online EP Load Balancer. ### Online EP Load Balancer @@ -687,14 +687,14 @@ Based on our current performance analysis, when you plan to apply large-scale EP **Please use your own judgement to decide whether to use large-scale EP into your system or not, and when you use it, what is the suitable EP size and concrete deployment settings suitable for your own requirements.** -The current TensorRT-LLM large-scale EP implementation is not perfect and there are still known limitations (community contributions are welcome to help us improve). For example, we need: +The current TensorRT LLM large-scale EP implementation is not perfect and there are still known limitations (community contributions are welcome to help us improve). For example, we need: * More platforms coverage * Extending the support to cover other non-GB200 NVIDIA GPU HWs. **We are actively working on this now.** * Currently the large-EP support only covers NVFP4 data precision, incremental efforts are needed to cover FP8 and INT8/INT4 data precision. * Performance * Further performance tuning and optimizations. 
**We are actively working on this now.** - * More validation with workloads close to production traffic. **Here we highly welcome the community’s feedback to help us calibrate TensorRT-LLM large-scale EP implementation based on more concrete workloads.** + * More validation with workloads close to production traffic. **Here we highly welcome the community’s feedback to help us calibrate TensorRT LLM large-scale EP implementation based on more concrete workloads.** * The thorough validation of combination with other inference core features, such as dis-aggregated serving, speculative decoding, validation on more MoE model families, etc. **We are actively working on this now.** * Ease-of-use * Easy customization @@ -707,11 +707,11 @@ The current TensorRT-LLM large-scale EP implementation is not perfect and there * Because large-scale EP deployment solution may lead to an increased fault ratio of the online deployment system, it may increase the need for cross-layer interactions with multiple components of the E2E LLM inference system on NVIDIA GPUs. This includes the low-level communication kernel, the cluster-level orchestrator and scheduler, etc. We are actively working with various NVIDIA engineering teams to push forward on this. -We believe the current implementation can be viewed as a reasonable E2E large-scale EP implementation and we encourage the community to try new ideas and performance validation. We encourage the community to share feedback to help us move fast in this area. We are actively tracking the TensorRT-LLM large-scale EP execution in [this](https://github.com/NVIDIA/TensorRT-LLM/issues/4127) GitHub issue to ensure transparency to the community. +We believe the current implementation can be viewed as a reasonable E2E large-scale EP implementation and we encourage the community to try new ideas and performance validation. We encourage the community to share feedback to help us move fast in this area. We are actively tracking the TensorRT LLM large-scale EP execution in [this](https://github.com/NVIDIA/TensorRT-LLM/issues/4127) GitHub issue to ensure transparency to the community. ## Acknowledgement -The large-scale EP work is another great team effort, spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT-LLM. +The large-scale EP work is another great team effort, spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT LLM . Through this collaborative endeavor, we have developed valuable insights to allow us improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community to better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications. 
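To make the load-balancing idea behind the EP Load Balancer easier to picture, here is a small self-contained toy (plain Python, not the TensorRT LLM implementation) that builds a skewed expert load and applies a greedy replication/placement pass; all sizes and the heuristic itself are illustrative assumptions.

```python
# Toy model of the problem the EP Load Balancer addresses: a skewed expert
# workload is evened out by replicating the hottest experts onto the
# least-loaded GPUs and splitting their traffic across replicas. This is a
# conceptual sketch only, NOT the TensorRT LLM EP Load Balancer code.
import random

NUM_EXPERTS, NUM_GPUS, EXTRA_SLOTS_PER_GPU = 32, 8, 1

random.seed(0)
load = [random.paretovariate(1.2) for _ in range(NUM_EXPERTS)]  # skewed load

# Static EP placement: experts 0-3 on GPU0, 4-7 on GPU1, and so on.
placement = [[(e, 1.0) for e in range(g * 4, g * 4 + 4)] for g in range(NUM_GPUS)]
replicas = {e: 1 for e in range(NUM_EXPERTS)}

def per_gpu_load():
    return [sum(load[e] * share for e, share in gpu) for gpu in placement]

def imbalance():
    totals = per_gpu_load()
    return max(totals) / (sum(totals) / NUM_GPUS)

print(f"static placement imbalance (max/mean): {imbalance():.2f}")

# Greedy replication pass: fill each spare slot with the currently hottest
# expert (load per replica) and place the new replica on the coolest GPU.
for _ in range(NUM_GPUS * EXTRA_SLOTS_PER_GPU):
    hottest = max(range(NUM_EXPERTS), key=lambda e: load[e] / replicas[e])
    coolest = min(range(NUM_GPUS), key=lambda g: per_gpu_load()[g])
    replicas[hottest] += 1
    share = 1.0 / replicas[hottest]
    for gpu in placement:  # re-split this expert's traffic across all replicas
        gpu[:] = [(e, share if e == hottest else s) for e, s in gpu]
    placement[coolest].append((hottest, share))

print(f"after replication, imbalance (max/mean): {imbalance():.2f}")
```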
diff --git a/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md b/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md index ab5a2467840..15cfbbea011 100644 --- a/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md +++ b/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md @@ -1,10 +1,10 @@ -# Disaggregated Serving in TensorRT-LLM +# Disaggregated Serving in TensorRT LLM -By NVIDIA TensorRT-LLM Team +By NVIDIA TensorRT LLM Team -- [Disaggregated Serving in TensorRT-LLM](#disaggregated-serving-in-tensorrt-llm) +- [Disaggregated Serving in TensorRT LLM](#disaggregated-serving-in-tensorrt-llm) - [Motivation](#motivation) - - [Disaggregated Serving in TensorRT-LLM](#disaggregated-serving-in-tensorrt-llm-1) + - [Disaggregated Serving in TensorRT LLM](#disaggregated-serving-in-tensorrt-llm-1) - [trtllm-serve](#trtllm-serve) - [Dynamo](#dynamo) - [Triton Inference Server](#triton-inference-server) @@ -24,7 +24,7 @@ By NVIDIA TensorRT-LLM Team - [Future Work](#future-work) - [Acknowledgement](#acknowledgement) -In the past tech blogs, we have introduced optimization specifically for [low-latency](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) and [throughput](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md) oriented optimizations. For production deployment, users also care about per GPU throughput satisfying certain latency constraints. In this tech blog, we will introduce the design concept and usage of the TensorRT-LLM disaggregated serving which directly targets throughput@latency performance scenarios, together with performance study results. +In the past tech blogs, we have introduced optimization specifically for [low-latency](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) and [throughput](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md) oriented optimizations. For production deployment, users also care about per GPU throughput satisfying certain latency constraints. In this tech blog, we will introduce the design concept and usage of the TensorRT LLM disaggregated serving which directly targets throughput@latency performance scenarios, together with performance study results. ## Motivation @@ -55,15 +55,15 @@ Disaggregated serving resolves these challenges by decoupling the two phases, al You can also refer to [this paper](https://arxiv.org/pdf/2506.05508) for more details about the rational and design considerations of disaggregated serving. -## Disaggregated Serving in TensorRT-LLM +## Disaggregated Serving in TensorRT LLM -There are three different approaches to do disaggregation LLM inference with TensorRT-LLM, where each approach offers distinct architectural and operational characteristics suited to different deployment scenarios. +There are three different approaches to do disaggregation LLM inference with TensorRT LLM, where each approach offers distinct architectural and operational characteristics suited to different deployment scenarios. 
### trtllm-serve -[`trtllm-serve`](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html) is a command-line utility that facilitates the deployment of an OpenAI-compatible server for TensorRT-LLM instances. +[`trtllm-serve`](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html) is a command-line utility that facilitates the deployment of an OpenAI-compatible server for TensorRT LLM instances. -The first approach to do disaggregated LLM inference with TensorRT-LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 3 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 3). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request. +The first approach to do disaggregated LLM inference with TensorRT LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 3 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 3). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request.
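From the client's perspective, only the disaggregated orchestrator is visible, and it speaks the same OpenAI-compatible API as a regular `trtllm-serve` instance. A minimal query sketch follows; the endpoint URL, port, and model name are assumptions for illustration, and routing to context and generation servers happens entirely server-side.

```python
# Client-side view of the trtllm-serve based disaggregated flow described
# above: the client only talks to the "disaggregated" orchestrator through
# its OpenAI-compatible API.
from openai import OpenAI  # pip install openai

client = OpenAI(
    base_url="http://localhost:8000/v1",  # assumed orchestrator endpoint
    api_key="not-needed",                 # local server, no auth expected
)

response = client.completions.create(
    model="deepseek-ai/DeepSeek-R1",      # placeholder; use your served model
    prompt="Explain disaggregated serving in one sentence.",
    max_tokens=64,
)
print(response.choices[0].text)
```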
@@ -124,11 +124,11 @@ In the Dynamo workflow, requests are initially processed by pre- and post-proces Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments. -For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html). +For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html). ### Triton Inference Server -The third approach to do disaggregated LLM inference with TensorRT-LLM utilizes the Triton Inference Server. With this approach a Triton ensemble model is employed, comprising a preprocessor, an orchestrator implemented as [a Python business logic scripting (BLS) backend](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/bls.html), and a post-processor. The orchestrator is responsible for routing client requests to context and generation instances, managing the flow of prompt tokens, and handling the return of generated tokens. This approach is illustrated in Figure 5. The Triton Inference Server approach relies on the Triton TensorRT-LLM backend and the Executor API, which is supported only for the TensorRT backend. For more information on how to use this approach, please refer to [this documentation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/triton_backend/all_models/disaggregated_serving#running-disaggregated-serving-with-triton-tensorrt-llm-backend). +The third approach to do disaggregated LLM inference with TensorRT LLM utilizes the Triton Inference Server. With this approach a Triton ensemble model is employed, comprising a preprocessor, an orchestrator implemented as [a Python business logic scripting (BLS) backend](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/bls.html), and a post-processor. The orchestrator is responsible for routing client requests to context and generation instances, managing the flow of prompt tokens, and handling the return of generated tokens. This approach is illustrated in Figure 5. The Triton Inference Server approach relies on the Triton TensorRT LLM backend and the Executor API, which is supported only for the TensorRT backend. For more information on how to use this approach, please refer to [this documentation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/triton_backend/all_models/disaggregated_serving#running-disaggregated-serving-with-triton-tensorrt-llm-backend).
@@ -141,7 +141,7 @@ The third approach to do disaggregated LLM inference with TensorRT-LLM utilizes ### Multi-backend Support -In TensorRT-LLM, the KV cache exchange is modularly decoupled from the KV cache manager and the underlying communication libraries, as shown in Figure 6. The KV cache exchange module is responsible for efficient transmission and reception of the cache, promptly releasing cache space, and performing cache layout conversions during the exchange process. Currently, mainstream communication protocols—MPI, UCX, and NIXL—are all supported by TensorRT-LLM, and the underlying communication protocols utilize RDMA / NVLink. Currently, we recommend using UCX and NIXL backends, as we are adding a dynamic scaling mechanism on top of them—specifically, dynamic node joining and leaving. This allows customers to adjust the load based on traffic demands or switch roles between context and generation dynamically. +In TensorRT LLM, the KV cache exchange is modularly decoupled from the KV cache manager and the underlying communication libraries, as shown in Figure 6. The KV cache exchange module is responsible for efficient transmission and reception of the cache, promptly releasing cache space, and performing cache layout conversions during the exchange process. Currently, mainstream communication protocols—MPI, UCX, and NIXL—are all supported by TensorRT LLM, and the underlying communication protocols utilize RDMA / NVLink. Currently, we recommend using UCX and NIXL backends, as we are adding a dynamic scaling mechanism on top of them—specifically, dynamic node joining and leaving. This allows customers to adjust the load based on traffic demands or switch roles between context and generation dynamically.
@@ -152,7 +152,7 @@ In TensorRT-LLM, the KV cache exchange is modularly decoupled from the KV cache ### Overlap Optimization -To optimize the overall performance of disaggregated serving, TensorRT-LLM overlaps the KV cache transmission with computation for multiple independent requests. While one request is sending or receiving its KV cache blocks, other requests can proceed with computation, as illustrated in Figure 7. Furthermore, if context and generation instances are using multiple GPUs per instance, KV cache transmission between different sets of GPUs can occur in parallel. +To optimize the overall performance of disaggregated serving, TensorRT LLM overlaps the KV cache transmission with computation for multiple independent requests. While one request is sending or receiving its KV cache blocks, other requests can proceed with computation, as illustrated in Figure 7. Furthermore, if context and generation instances are using multiple GPUs per instance, KV cache transmission between different sets of GPUs can occur in parallel.
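The overlap described above is ordinary pipelining: keep other requests computing while one request's KV blocks are in flight. The toy sketch below shows only the scheduling pattern, with a sleep and an arithmetic loop standing in for the transfer and the forward pass; it uses no TensorRT LLM API.

```python
# Framework-free sketch of overlapping KV-cache transfer with computation
# for independent requests. The sleep and the arithmetic loop are stand-ins
# for a transfer over the interconnect and a forward pass.
import time
from concurrent.futures import ThreadPoolExecutor

def transfer_kv(request_id: int) -> int:
    time.sleep(0.05)  # pretend: KV-cache blocks moving between instances
    return request_id

def compute_step(request_id: int) -> int:
    sum(i * i for i in range(200_000))  # pretend: one forward pass
    return request_id

start = time.perf_counter()
with ThreadPoolExecutor(max_workers=1) as transfer_pool:
    pending = transfer_pool.submit(transfer_kv, 0)   # request 0 receives KV
    for rid in (1, 2, 3):                            # others keep computing
        compute_step(rid)
    pending.result()                                 # request 0 is now ready
print(f"overlapped wall time: {time.perf_counter() - start:.3f} s")
```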
@@ -163,7 +163,7 @@ To optimize the overall performance of disaggregated serving, TensorRT-LLM overl ### Cache Layout Transformation -To minimize KV cache transmission latency, TensorRT-LLM currently uses direct transmission between device memories for cache transfer. The KV cache transmission supports using different parallel strategies for the context and generation phases. In such cases, careful orchestration of KV cache block mapping is required. Figure 8 illustrates this using the example of context phase with TP2 and generation phase with PP2. +To minimize KV cache transmission latency, TensorRT LLM currently uses direct transmission between device memories for cache transfer. The KV cache transmission supports using different parallel strategies for the context and generation phases. In such cases, careful orchestration of KV cache block mapping is required. Figure 8 illustrates this using the example of context phase with TP2 and generation phase with PP2.
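The TP2-to-PP2 case in Figure 8 can be pictured with a small array re-layout: each context GPU holds half of the KV heads for every layer, while each generation GPU needs every head for half of the layers, so each generation GPU pulls a slice from both context GPUs. The shapes below are made-up toy values, not the actual block layout.

```python
# Toy sketch of the Figure 8 layout conversion: context phase with TP2
# (sharded by KV heads) feeding a generation phase with PP2 (sharded by
# layers). Shapes and sizes are illustrative only.
import numpy as np

layers, kv_heads, tokens, head_dim = 4, 8, 16, 64

# Full (conceptual) KV cache: [layers, kv_heads, tokens, head_dim].
full_kv = np.arange(layers * kv_heads * tokens * head_dim, dtype=np.float32)
full_kv = full_kv.reshape(layers, kv_heads, tokens, head_dim)

# Context instance, TP2: shard along the head dimension.
ctx_tp_shards = [full_kv[:, :4], full_kv[:, 4:]]          # ctx GPU0, GPU1

# Generation instance, PP2: each generation GPU owns a layer range, so it
# pulls its layer slice from *both* context GPUs and re-joins the heads.
gen_pp_shards = [
    np.concatenate([s[:2] for s in ctx_tp_shards], axis=1),  # layers 0-1
    np.concatenate([s[2:] for s in ctx_tp_shards], axis=1),  # layers 2-3
]

# Sanity check: nothing is lost or reordered by the conversion.
assert np.array_equal(np.concatenate(gen_pp_shards, axis=0), full_kv)
print([s.shape for s in gen_pp_shards])  # [(2, 8, 16, 64), (2, 8, 16, 64)]
```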
@@ -172,7 +172,7 @@ To minimize KV cache transmission latency, TensorRT-LLM currently uses direct tr

Figure 8. KV cache layout conversion

-The optimizations required for KV cache transmission vary depending on whether it's single-node multi-GPU, multi-node multi-GPU, or different GPU models. To accommodate this, TensorRT-LLM provides a set of environment variables for selection in different environments. Please refer to [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/features/disagg-serving.md) for details. +The optimizations required for KV cache transmission vary depending on whether it's single-node multi-GPU, multi-node multi-GPU, or different GPU models. To accommodate this, TensorRT LLM provides a set of environment variables for selection in different environments. Please refer to [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/features/disagg-serving.md) for details. ## Performance Studies @@ -281,7 +281,7 @@ We provide a set of scripts to reproduce the performance data presented in this ## Future Work -Although we can already demonstrate the performance benefits of doing disaggregated LLM inference with TensorRT-LLM, there is still work to be done to further improve the performance and ease of use. Among other things, we plan to: +Although we can already demonstrate the performance benefits of doing disaggregated LLM inference with TensorRT LLM, there is still work to be done to further improve the performance and ease of use. Among other things, we plan to: * Provide detailed steps and scripts to automate the generation of throughput-latency performance curves comparing aggregated with disaggregated. * Continue to improve performance at larger scales (large-scale EP for example). @@ -290,4 +290,4 @@ Although we can already demonstrate the performance benefits of doing disaggrega ## Acknowledgement -Adding support for disaggregated serving in TensorRT-LLM is a typical one-team effort requiring close collaboration spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT-LLM. Through this collaborative endeavor, we have developed valuable insights to allow us to improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications. +Adding support for disaggregated serving in TensorRT LLM is a typical one-team effort requiring close collaboration spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT LLM. Through this collaborative endeavor, we have developed valuable insights to allow us to improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications. 
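Until the scripts mentioned under Future Work are available, a rough throughput-versus-latency point can already be collected with a small client loop against any OpenAI-compatible `trtllm-serve` (or disaggregated orchestrator) endpoint. The endpoint, model name, concurrency, and request mix below are assumptions, not measured settings from this blog.

```python
# Rough throughput@latency probe for an OpenAI-compatible endpoint
# (aggregated or disaggregated). All values below are illustrative.
import time
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI  # pip install openai

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
CONCURRENCY, REQUESTS = 8, 32

def one_request(_: int) -> float:
    t0 = time.perf_counter()
    client.completions.create(
        model="placeholder-model",  # use your served model name
        prompt="Summarize KV-cache reuse in two sentences.",
        max_tokens=128,
    )
    return time.perf_counter() - t0

t_start = time.perf_counter()
with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
    latencies = sorted(pool.map(one_request, range(REQUESTS)))
wall = time.perf_counter() - t_start

print(f"requests/s: {REQUESTS / wall:.2f}")
print(f"p50 latency: {latencies[len(latencies) // 2]:.2f} s, "
      f"p95 latency: {latencies[int(len(latencies) * 0.95) - 1]:.2f} s")
```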
diff --git a/docs/source/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md b/docs/source/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md index 4553331b1cd..fb2bcdcee66 100644 --- a/docs/source/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md +++ b/docs/source/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md @@ -1,8 +1,8 @@ -# How to launch Llama4 Maverick + Eagle3 TensorRT-LLM server +# How to launch Llama4 Maverick + Eagle3 TensorRT LLM server -Artificial Analysis has benchmarked the Llama4 Maverick with Eagle3 enabled TensorRT-LLM server running at over [1000 tokens per second per user on 8xB200 GPUs](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/). This implementation leverages NVIDIA's TensorRT-LLM combined with speculative decoding using the Eagle3 model to further boost performance. +Artificial Analysis has benchmarked the Llama4 Maverick with Eagle3 enabled TensorRT LLM server running at over [1000 tokens per second per user on 8xB200 GPUs](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/). This implementation leverages NVIDIA's TensorRT LLM combined with speculative decoding using the Eagle3 model to further boost performance. -In the guide below, we will walk you through how to launch your own high-performance Llama4 Maverick with Eagle3 enabled TensorRT-LLM server, from build to deployment. (Note that your specific performance numbers may vary—speculative decoding speedups depend upon the dataset!) +In the guide below, we will walk you through how to launch your own high-performance Llama4 Maverick with Eagle3 enabled TensorRT LLM server, from build to deployment. (Note that your specific performance numbers may vary—speculative decoding speedups depend upon the dataset!) ## Prerequisites @@ -18,7 +18,7 @@ In the guide below, we will walk you through how to launch your own high-perform * [NVIDIA Llama 4 Maverick 17B 128E Instruct FP8](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8) * [NVIDIA Llama 4 Maverick 17B 128E Eagle3 BF16](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Eagle3) -In [Step 4: Start the TensorRT-LLM server](#step-4-start-the-tensorrt-llm-server), `/path/to/maverick` and `/path/to/eagle` refer to the download paths of the above respective models. +In [Step 4: Start the TensorRT LLM server](#step-4-start-the-tensorrt-llm-server), `/path/to/maverick` and `/path/to/eagle` refer to the download paths of the above respective models. ## Launching the server @@ -33,14 +33,14 @@ git lfs pull The last command, `git lfs pull`, ensures all large files stored with Git LFS are properly downloaded. If `git lfs` is not installed, please install following [Install Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage) -### Step 2: Prepare the TensorRT-LLM release Docker image +### Step 2: Prepare the TensorRT LLM release Docker image #### Option 1. Use weekly release NGC docker image -TensorRT-LLM provides weekly release [docker image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) +TensorRT LLM provides weekly release [docker image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) -#### Option 2. 
Build TensorRT-LLM Docker image (Alternative way) -If you want to compile a specific TensorRT-LLM commit, you can build the docker image by checking out the specific branch or commit and running a make command. This may take 15-30 minutes depending on your system. +#### Option 2. Build TensorRT LLM Docker image (Alternative way) +If you want to compile a specific TensorRT LLM commit, you can build the docker image by checking out the specific branch or commit and running a make command. This may take 15-30 minutes depending on your system. ``` make -C docker release_build @@ -57,7 +57,7 @@ docker push docker.io//tensorrt_llm:main Replace `` with your Docker Hub username or your private registry path. -### Step 4: Start the TensorRT-LLM server +### Step 4: Start the TensorRT LLM server This command launches the server with Llama4 Maverick as the main model and Eagle3 as the draft model for speculative decoding. Make sure you have downloaded both model checkpoints before running this command. diff --git a/docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md b/docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md index ed59464da46..9f65414a004 100644 --- a/docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md +++ b/docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md @@ -1,4 +1,4 @@ -# N-Gram Speculative Decoding in TensorRT‑LLM +# N-Gram Speculative Decoding in TensorRT LLM N-Gram speculative decoding leverages the natural repetition in many LLM workloads. It splits previously seen text into configurable (key, value) n‑gram pairs and, during generation, swiftly proposes draft tokens by matching the current key against n-gram pools in memory. In this blog, we introduce design choices in TensorRT‑LLM’s N-Gram speculative decoding algorithm, share our experimental results of performance gains, and explain N-Gram's low barrier to adoption by deriving a simple heuristic to enable it. @@ -35,7 +35,7 @@ Speculative decoding drafts several tokens, verifies them on the model, and keep ## Algorithm & Complexity -`NGramDecodingConfig` in TensorRT-LLM: +`NGramDecodingConfig` in TensorRT LLM: ```python spec_config = NGramDecodingConfig( max_draft_len = v , # max length of draft tokens diff --git a/docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md b/docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md index d01643860ed..d669bd713dd 100644 --- a/docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md +++ b/docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md @@ -1,11 +1,11 @@ -# Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization) +# Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization) -This blog post continues our previous work on [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), where we introduced the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT-LLM. Building upon that foundation, we have made significant performance improvements through various optimizations, achieving better throughput and latency for large-scale MoE models. 
+This blog post continues our previous work on [Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), where we introduced the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT LLM. Building upon that foundation, we have made significant performance improvements through various optimizations, achieving better throughput and latency for large-scale MoE models. -*By NVIDIA TensorRT-LLM Team* +*By NVIDIA TensorRT LLM Team* ## Table of Contents -- [Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)](#scaling-expert-parallelism-in-tensorrt-llm-part-2-performance-status-and-optimization) +- [Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)](#scaling-expert-parallelism-in-tensorrt-llm-part-2-performance-status-and-optimization) - [Table of Contents](#table-of-contents) - [Optimization Highlights](#optimization-highlights) - [Kernel Optimizations](#kernel-optimizations) @@ -28,7 +28,7 @@ This blog post continues our previous work on [Scaling Expert Parallelism in Ten ## Optimization Highlights -Following the introduction of the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT-LLM in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), the TensorRT-LLM team has focused on optimizing the large EP implementation to improve performance. +Following the introduction of the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT LLM in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), the TensorRT LLM team has focused on optimizing the large EP implementation to improve performance. At the kernel level, we analyzed kernel duration and optimized performance by either improving existing kernels or developing new kernels that perform better. At the system level, we refined and optimized the EPLB implementation (which also helps reduce kernel scalability issues), integrated additional features such as MTP, and optimized host overhead to prevent Python code from slowing down inference. @@ -94,7 +94,7 @@ This optimization was implemented in [PR 5570](https://github.com/NVIDIA/TensorR ### Expert Parallelism Load Balancer (EPLB) -As introduced in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#ep-load-balancer), EP-level workload imbalance is common for large-scale EP inference across multiple datasets and has significant performance impacts. TensorRT-LLM implements a set of functionalities to address this issue. We have refined the code and improved the usability of this feature, and the benefits of EPLB are directly reflected in kernel duration improvements. +As introduced in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#ep-load-balancer), EP-level workload imbalance is common for large-scale EP inference across multiple datasets and has significant performance impacts. TensorRT LLM implements a set of functionalities to address this issue. 
We have refined the code and improved the usability of this feature, and the benefits of EPLB are directly reflected in kernel duration improvements. The core challenge with EP scaling is that different experts receive varying amounts of work based on the routing decisions made by the MoE layer. This imbalance becomes more pronounced as EP size increases, leading to scenarios where some GPUs are heavily loaded while others remain underutilized. The Expert Parallelism Load Balancer (EPLB) addresses this by dynamically redistributing expert assignments to achieve better load balance across all participating GPUs. @@ -235,7 +235,7 @@ After implementing huge pages, we found that warmup kernels now execute in only ### Multi-Token Prediction (MTP) -MTP allows verifying and accepting several draft tokens in a single iteration, which is very beneficial for scenarios that prefer low latency. TensorRT-LLM has supported MTP, and we refer to our previous [MTP blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md#mtp-implementation-in-tensorrt-llm) for more details on the implementation. +MTP allows verifying and accepting several draft tokens in a single iteration, which is very beneficial for scenarios that prefer low latency. TensorRT LLM has supported MTP, and we refer to our previous [MTP blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md#mtp-implementation-in-tensorrt-llm) for more details on the implementation. For large EP, we have also extended the implementation so that it works well with online EPLB. This was implemented in [PR 5213](https://github.com/NVIDIA/TensorRT-LLM/pull/5213). @@ -247,11 +247,11 @@ To address the increased host overhead when scaling parallelism in the system, w #### Reduce Binding and Inter-Process Communication Overhead -TensorRT-LLM is designed to be composed of both C++ and Python code, so that C++ can handle the most performance-sensitive parts while Python handles higher-level logic. As we try to put more logic into Python to make the program easier to read and debug, there are still frequent conversations through binding interfaces between C++ and Python. Besides, since most of the logic is implemented in Python, there are several layers of implementation that communicate with each other through inter-process communication overhead. Frequent binding calls and serialization/deserialization introduced by inter-process communication slow down the core library. +TensorRT LLM is designed to be composed of both C++ and Python code, so that C++ can handle the most performance-sensitive parts while Python handles higher-level logic. As we try to put more logic into Python to make the program easier to read and debug, there are still frequent conversations through binding interfaces between C++ and Python. Besides, since most of the logic is implemented in Python, there are several layers of implementation that communicate with each other through inter-process communication overhead. Frequent binding calls and serialization/deserialization introduced by inter-process communication slow down the core library. 
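The cost pattern described above is easy to reproduce in isolation: decoding and re-encoding a response-sized payload at every process hop is much more expensive than forwarding the serialized bytes and decoding once at the final consumer. The sizes and hop count in the toy measurement below are arbitrary.

```python
# Toy measurement of inter-process serialization overhead: deserializing at
# every hop versus forwarding raw bytes and deserializing once at the end.
import pickle
import time

# A response-sized payload: a batch of token-id lists (arbitrary sizes).
payload = [list(range(2_000)) for _ in range(256)]
blob = pickle.dumps(payload)
HOPS = 4  # e.g. worker -> executor -> server -> frontend

t0 = time.perf_counter()
data = blob
for _ in range(HOPS):             # naive: decode and re-encode at every hop
    obj = pickle.loads(data)
    data = pickle.dumps(obj)
naive = time.perf_counter() - t0

t0 = time.perf_counter()
obj = pickle.loads(blob)          # forward bytes untouched, decode once
deferred = time.perf_counter() - t0

print(f"decode/encode at every hop: {naive * 1e3:.1f} ms")
print(f"decode once at the end:     {deferred * 1e3:.1f} ms")
```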
To improve program efficiency, we used environment variables introduced in the [performance analysis guidance](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-analysis.md) to measure and profile CPU overhead, and improved performance by reducing and reusing different binding calls as much as possible, and delaying Python object deserialization to avoid duplicated serialization and reduce message size when doing inter-process communication. This optimization was added in [PR 5224](https://github.com/NVIDIA/TensorRT-LLM/pull/5224). We have also reduced Python garbage collection (GC) impacts in [PR 5141](https://github.com/NVIDIA/TensorRT-LLM/pull/5141). -To enable powerful NVTX markers for easier analysis of host overheads, TensorRT-LLM provides several useful environment variables: +To enable powerful NVTX markers for easier analysis of host overheads, TensorRT LLM provides several useful environment variables: ```bash export TLLM_NVTX_DEBUG=1 # enables more NVTX markers @@ -261,9 +261,9 @@ export TLLM_PROFILE_START_STOP=100-150 # enable specific iterations profiling #### Support Stream Interval -As mentioned previously, one outcome of large-scale workloads is that they significantly increase the number of requests and responses that the system must handle, putting huge pressure on Python threads. When the GPU finishes one iteration of calculation, a batch of responses are generated under streaming mode. For each response, TensorRT-LLM must perform detokenization so that output IDs are converted to strings, and OpenAI API protocol objects need to be initialized so that responses can be returned to the user. This becomes time-consuming, especially when the number of responses is huge and the CPU must process them on each iteration. One observation from the user side will be reduced streaming performance when compared to non-streaming. +As mentioned previously, one outcome of large-scale workloads is that they significantly increase the number of requests and responses that the system must handle, putting huge pressure on Python threads. When the GPU finishes one iteration of calculation, a batch of responses are generated under streaming mode. For each response, TensorRT LLM must perform detokenization so that output IDs are converted to strings, and OpenAI API protocol objects need to be initialized so that responses can be returned to the user. This becomes time-consuming, especially when the number of responses is huge and the CPU must process them on each iteration. One observation from the user side will be reduced streaming performance when compared to non-streaming. -To address this problem, TensorRT-LLM has supported a feature called stream interval. Instead of handling all responses on each iteration, a user-specified `stream_interval` `N` indicates that responses will be handled and returned every `N` iterations. This way, on each iteration, there will still be one output ID generated, but it won't be returned to users immediately (except for the first token for the sake of time-to-first-token latency). Instead, tokens accumulate for `N` iterations, and one response is created to handle those `N` generated tokens, which greatly reduces pressure on the CPU side by giving more time for the CPU to catch up. Meanwhile, users can still get streamed output. +To address this problem, TensorRT LLM has supported a feature called stream interval. 
Instead of handling all responses on each iteration, a user-specified `stream_interval` `N` indicates that responses will be handled and returned every `N` iterations. This way, on each iteration, there will still be one output ID generated, but it won't be returned to users immediately (except for the first token for the sake of time-to-first-token latency). Instead, tokens accumulate for `N` iterations, and one response is created to handle those `N` generated tokens, which greatly reduces pressure on the CPU side by giving more time for the CPU to catch up. Meanwhile, users can still get streamed output. This feature was added in [PR 5284](https://github.com/NVIDIA/TensorRT-LLM/pull/5284), and we have verified that it works effectively to reduce host overhead. In most cases, setting `stream_interval` to 2 or 4 should close the gap (if any) between streaming and non-streaming modes. The feature can be enabled by setting the following in the YAML extra config file: @@ -307,7 +307,7 @@ When enabling MTP, there is an extra performance boost compared to the baseline.

Figure 8: DeepSeek R1 throughput on ISL/OSL 8k/1k with MTP enabled.

-To reproduce the numbers, refer to the [`examples/wide_ep/slurm_scripts`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/slurm_scripts) directory. The scripts there demonstrate how to launch TensorRT-LLM disaggregated serving with large-scale EP and other features enabled on a SLURM cluster. +To reproduce the numbers, refer to the [`examples/wide_ep/slurm_scripts`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/slurm_scripts) directory. The scripts there demonstrate how to launch TensorRT LLM disaggregated serving with large-scale EP and other features enabled on a SLURM cluster. ## Future Work @@ -317,6 +317,6 @@ We are planning to implement more performance optimizations for the large EP imp ## Acknowledgements -This work represents an outstanding example of collaborative engineering excellence within the TensorRT-LLM team. The successful implementation and optimization of large-scale Expert Parallelism required coordinated efforts across multiple domains - from low-level CUDA kernel optimizations to high-level system architecture design. The dedication and technical expertise demonstrated by our team members throughout this project has been truly remarkable. +This work represents an outstanding example of collaborative engineering excellence within the TensorRT LLM team. The successful implementation and optimization of large-scale Expert Parallelism required coordinated efforts across multiple domains - from low-level CUDA kernel optimizations to high-level system architecture design. The dedication and technical expertise demonstrated by our team members throughout this project has been truly remarkable. Large-scale Expert Parallelism represents one of the important workloads for users productive scenarios, enabling efficient deployment of large MoE models. The performance improvements achieved through this work demonstrate the transformative potential of expert parallelism at scale, and this work opens new possibilities for deploying increasingly sophisticated AI models in production environments. diff --git a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md index b50171ddf70..7ab7e6c7699 100644 --- a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md +++ b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md @@ -1,7 +1,7 @@ -# Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM +# Running a High Performance GPT-OSS-120B Inference Server with TensorRT LLM In the guide below, we will walk you through how to launch your own -high-performance TensorRT-LLM server for **gpt-oss-120b** for inference. +high-performance TensorRT LLM server for **gpt-oss-120b** for inference. This guide covers both low-latency and max-throughput cases. The typical use case for **low-latency**, is when we try to maximize the number of tokens per second per user with a limited concurrency (4, 8 or 16 users). @@ -20,11 +20,11 @@ For **maximum throughput**, the goal is to maximize the amount of tokens produce We have a forthcoming guide for getting great performance on H100, however this guide focuses on the above GPUs. -## Launching the TensorRT-LLM docker container +## Launching the TensorRT LLM docker container The container image that you will use will be pulled from NVIDIA's NGC. 
This container is multi-platform and will run on both x64 and arm64 architectures: `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev` -Run the follow docker command to start the TensorRT-LLM container in interactive mode: +Run the follow docker command to start the TensorRT LLM container in interactive mode: ```bash docker run --rm --ipc=host -it \ @@ -52,7 +52,7 @@ This command: Lastly the container mounts your user `.cache` directory to save the downloaded model checkpoints which are saved to `~/.cache/huggingface/hub/` by default. This prevents having to redownload the weights each time you rerun the container. -## Running the TensorRT-LLM Server +## Running the TensorRT LLM Server As pointed out in the introduction, this guide covers low-latency and max-throughput cases. Each requires a different configurations and commands to run. We will first cover the Low-Latency use-case, followed by the max throughput use-case. @@ -77,9 +77,9 @@ EOF > Note: If you are using NVIDIA H200 GPUs it is highly recommended to set the `moe_config.backend` to TRITON to use the OpenAI Triton MoE kernel. See the section [(H200 Only) Using OpenAI Triton Kernels for MoE](#h200-only-using-openai-triton-kernels-for-moe) for more details. -#### Launching TensorRT-LLM Serve +#### Launching TensorRT LLM Serve -To launch the TensorRT-LLM Server to serve the model with the **low latency** config, run the following command. Commands for different GPU configurations are provided (1xGPU, 8xGPU, 4xGPU): +To launch the TensorRT LLM Server to serve the model with the **low latency** config, run the following command. Commands for different GPU configurations are provided (1xGPU, 8xGPU, 4xGPU):
1x B200/GB200/H200 @@ -153,9 +153,9 @@ EOF > Note: If you are using NVIDIA H200 GPUs it is highly recommended to set the `moe_config.backend` to TRITON to use the OpenAI Triton MoE kernel. See the section [(H200 Only) Using OpenAI Triton Kernels for MoE](#h200-only-using-openai-triton-kernels-for-moe) for more details. -#### Launching TensorRT-LLM Serve +#### Launching TensorRT LLM Serve -To launch the TensorRT-LLM Server to serve the model with the **max throughput** config, run the following command. Commands for different GPU configurations are provided (1xGPU, 8xGPU, 4xGPU): +To launch the TensorRT LLM Server to serve the model with the **max throughput** config, run the following command. Commands for different GPU configurations are provided (1xGPU, 8xGPU, 4xGPU):
1x B200/GB200/H200 @@ -221,7 +221,7 @@ The initialization may take several minutes as it loads and optimizes the models ## (H200 Only) Using OpenAI Triton Kernels for MoE -OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper based GPUs like NVIDIA's H200 for best performance. The NGC TensorRT-LLM container image mentioned above already includes the required kernels so you do not need to build or install them. It is highly recommended to enable them with the steps below: +OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT LLM can leverage these kernels for Hopper based GPUs like NVIDIA's H200 for best performance. The NGC TensorRT LLM container image mentioned above already includes the required kernels so you do not need to build or install them. It is highly recommended to enable them with the steps below: ### Selecting Triton as the MoE backend diff --git a/docs/source/commands/trtllm-bench.rst b/docs/source/commands/trtllm-bench.rst index 7f03c8dfc66..cd69874e0c2 100644 --- a/docs/source/commands/trtllm-bench.rst +++ b/docs/source/commands/trtllm-bench.rst @@ -1,7 +1,7 @@ trtllm-bench =========================== -trtllm-bench is a comprehensive benchmarking tool for TensorRT-LLM engines. It provides three main subcommands for different benchmarking scenarios: +trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It provides three main subcommands for different benchmarking scenarios: **Common Options for All Commands:** diff --git a/docs/source/commands/trtllm-eval.rst b/docs/source/commands/trtllm-eval.rst index 55378c7ee74..1f4cf62c562 100644 --- a/docs/source/commands/trtllm-eval.rst +++ b/docs/source/commands/trtllm-eval.rst @@ -4,7 +4,7 @@ trtllm-eval About ----- -The ``trtllm-eval`` command provides developers with a unified entry point for accuracy evaluation. It shares the core evaluation logic with the `accuracy test suite `_ of TensorRT-LLM. +The ``trtllm-eval`` command provides developers with a unified entry point for accuracy evaluation. It shares the core evaluation logic with the `accuracy test suite `_ of TensorRT LLM. ``trtllm-eval`` is built on the offline API -- LLM API. Compared to the online ``trtllm-serve``, the offline API provides clearer error messages and simplifies the debugging workflow. @@ -47,7 +47,7 @@ The following tasks are currently supported: .. note:: - ``trtllm-eval`` originates from the TensorRT-LLM accuracy test suite and serves as a lightweight utility for verifying and debugging accuracy. At this time, ``trtllm-eval`` is intended solely for development and is not recommended for production use. + ``trtllm-eval`` originates from the TensorRT LLM accuracy test suite and serves as a lightweight utility for verifying and debugging accuracy. At this time, ``trtllm-eval`` is intended solely for development and is not recommended for production use. Usage and Examples ------------------ diff --git a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md index 161535e96e0..5824a4719cd 100644 --- a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md +++ b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md @@ -1,6 +1,6 @@ # Run benchmarking with `trtllm-serve` -TensorRT-LLM provides the OpenAI-compatiable API via `trtllm-serve` command. +TensorRT LLM provides the OpenAI-compatiable API via `trtllm-serve` command. 
A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference). This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B: @@ -19,7 +19,7 @@ The overall performance benchmarking involves: ## Launch the NGC container -TensorRT-LLM distributes the pre-built container on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags). +TensorRT LLM distributes the pre-built container on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags). You can launch the container using the following command: @@ -114,7 +114,7 @@ Then we can run the benchmark using the command below. bash -x bench.sh &> output_bench.log ``` -Below is some example TensorRT-LLM serving benchmark output. Your actual results may vary. +Below is some example TensorRT LLM serving benchmark output. Your actual results may vary. ``` ============ Serving Benchmark Result ============ diff --git a/docs/source/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md index 1a538a6dd86..47db24a15b4 100644 --- a/docs/source/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md +++ b/docs/source/deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md @@ -1,10 +1,10 @@ -# Quick Start Recipe for DeepSeek R1 on TensorRT-LLM - Blackwell & Hopper Hardware +# Quick Start Recipe for DeepSeek R1 on TensorRT LLM - Blackwell & Hopper Hardware ## Introduction -This deployment guide provides step-by-step instructions for running the DeepSeek R1 model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output. +This deployment guide provides step-by-step instructions for running the DeepSeek R1 model using TensorRT LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT LLM parameters, launching the server, and validating inference output. -The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. +The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. ## Prerequisites @@ -26,7 +26,7 @@ Note that NVFP4 is only supported on NVIDIA Blackwell platform. ### Run Docker Container -Run the docker container using the TensorRT-LLM NVIDIA NGC image. +Run the docker container using the TensorRT LLM NVIDIA NGC image. 
```shell docker run --rm -it \ @@ -46,11 +46,11 @@ Note: * The command also maps port **8000** from the container to your host so you can access the LLM API endpoint from your host * See the [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) for all the available containers. The containers published in the main branch weekly have “rcN” suffix, while the monthly release with QA tests has no “rcN” suffix. Use the rc release to get the latest model and feature support. -If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) +If you want to use latest main branch, you can choose to build from source to install TensorRT LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) ### Creating the TRT-LLM Server config -We create a YAML configuration file /tmp/config.yml for the TensorRT-LLM Server and populate it with the following recommended performance settings. +We create a YAML configuration file /tmp/config.yml for the TensorRT LLM Server and populate it with the following recommended performance settings. ```shell EXTRA_LLM_API_FILE=/tmp/config.yml @@ -131,7 +131,7 @@ These options are used directly on the command line when you start the `trtllm-s #### `--backend pytorch` - **Description:** Tells TensorRT-LLM to use the **pytorch** backend. + **Description:** Tells TensorRT LLM to use the **pytorch** backend. #### `--max_batch_size` @@ -147,7 +147,7 @@ These options are used directly on the command line when you start the `trtllm-s #### `--trust_remote_code` - **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. + **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. #### Extra LLM API Options (YAML Configuration) @@ -206,7 +206,7 @@ See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/refer ### Basic Test -Start a new terminal on the host to test the TensorRT-LLM server you just launched. +Start a new terminal on the host to test the TensorRT LLM server you just launched. You can query the health/readiness of the server using: @@ -245,7 +245,7 @@ Here is an example response, showing that the TRT-LLM server returns “New York We use the lm-eval tool to test the model’s accuracy. For more information see [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). -To run the evaluation harness exec into the running TensorRT-LLM container and install with this command: +To run the evaluation harness exec into the running TensorRT LLM container and install with this command: ```shell docker exec -it tensorrt_llm /bin/bash @@ -293,7 +293,7 @@ Sample result in Blackwell: ## Benchmarking Performance -To benchmark the performance of your TensorRT-LLM server you can leverage the built-in “benchmark\_serving.py” script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. 
+To benchmark the performance of your TensorRT LLM server you can leverage the built-in “benchmark\_serving.py” script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. ```shell cat < bench.sh @@ -341,7 +341,7 @@ Run bench.sh to begin a serving benchmark. This will take a long time if you run ./bench.sh ``` -Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations. +Sample TensorRT LLM serving benchmark output. Your results may vary due to ongoing software optimizations. ``` ============ Serving Benchmark Result ============ diff --git a/docs/source/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md b/docs/source/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md index f984046a801..ed7dd58d41c 100644 --- a/docs/source/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md +++ b/docs/source/deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md @@ -1,10 +1,10 @@ -# Quick Start Recipe for Llama3.3 70B on TensorRT-LLM - Blackwell & Hopper Hardware +# Quick Start Recipe for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware ## Introduction -This deployment guide provides step-by-step instructions for running the Llama 3.3-70B Instruct model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output. +This deployment guide provides step-by-step instructions for running the Llama 3.3-70B Instruct model using TensorRT LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT LLM parameters, launching the server, and validating inference output. -The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. +The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. ## Access & Licensing @@ -30,7 +30,7 @@ Note that NVFP4 is only supported on NVIDIA Blackwell ### Run Docker Container -Run the docker container using the TensorRT-LLM NVIDIA NGC image. +Run the docker container using the TensorRT LLM NVIDIA NGC image. ```shell docker run --rm -it \ @@ -50,11 +50,11 @@ Note: * The command also maps port **8000** from the container to your host so you can access the LLM API endpoint from your host * See the [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) for all the available containers. The containers published in the main branch weekly have “rcN” suffix, while the monthly release with QA tests has no “rcN” suffix. Use the rc release to get the latest model and feature support. 
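Because the container maps port 8000 to the host, you can sanity-check connectivity from the host once the server (launched later in this guide) is running. A minimal sketch, assuming the default port and the `/health` readiness route exposed by `trtllm-serve`:

```shell
# Prints the HTTP status code; 200 indicates the server is up and ready.
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8000/health
```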
-If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) +If you want to use latest main branch, you can choose to build from source to install TensorRT LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) ### Creating the TRT-LLM Server config -We create a YAML configuration file /tmp/config.yml for the TensorRT-LLM Server and populate it with the following recommended performance settings. +We create a YAML configuration file /tmp/config.yml for the TensorRT LLM Server and populate it with the following recommended performance settings. ```shell EXTRA_LLM_API_FILE=/tmp/config.yml @@ -109,7 +109,7 @@ These options are used directly on the command line when you start the `trtllm-s #### `--backend pytorch` - **Description:** Tells TensorRT-LLM to use the **pytorch** backend. + **Description:** Tells TensorRT LLM to use the **pytorch** backend. #### `--max_batch_size` @@ -125,7 +125,7 @@ These options are used directly on the command line when you start the `trtllm-s #### `--trust_remote_code` - **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. + **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. #### Extra LLM API Options (YAML Configuration) @@ -184,7 +184,7 @@ See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.h ### Basic Test -Start a new terminal on the host to test the TensorRT-LLM server you just launched. +Start a new terminal on the host to test the TensorRT LLM server you just launched. You can query the health/readiness of the server using: @@ -223,7 +223,7 @@ Here is an example response, showing that the TRT-LLM server returns “New York We use the lm-eval tool to test the model’s accuracy. For more information see [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). -To run the evaluation harness exec into the running TensorRT-LLM container and install with this command: +To run the evaluation harness exec into the running TensorRT LLM container and install with this command: ```shell docker exec -it tensorrt_llm /bin/bash @@ -271,7 +271,7 @@ Sample result in Blackwell ## Benchmarking Performance -To benchmark the performance of your TensorRT-LLM server you can leverage the built-in “benchmark\_serving.py” script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. +To benchmark the performance of your TensorRT LLM server you can leverage the built-in “benchmark\_serving.py” script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. ```shell cat < bench.sh @@ -319,7 +319,7 @@ Run bench.sh to begin a serving benchmark. This will take a long time if you run ./bench.sh ``` -Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations. +Sample TensorRT LLM serving benchmark output. Your results may vary due to ongoing software optimizations. 
``` ============ Serving Benchmark Result ============ diff --git a/docs/source/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md b/docs/source/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md index 7f12e5208aa..d786cb214b0 100644 --- a/docs/source/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md +++ b/docs/source/deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md @@ -1,10 +1,10 @@ -# Quick Start Recipe for Llama4 Scout 17B on TensorRT-LLM - Blackwell & Hopper Hardware +# Quick Start Recipe for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware ## Introduction -This deployment guide provides step-by-step instructions for running the Llama-4-Scout-17B-16E-Instruct model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output. +This deployment guide provides step-by-step instructions for running the Llama-4-Scout-17B-16E-Instruct model using TensorRT LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT LLM parameters, launching the server, and validating inference output. -The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. +The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution. ## Access & Licensing @@ -29,7 +29,7 @@ Note that NVFP4 is only supported on NVIDIA Blackwell platform. ### Run Docker Container -Run the docker container using the TensorRT-LLM NVIDIA NGC image. +Run the docker container using the TensorRT LLM NVIDIA NGC image. ```shell docker run --rm -it \ @@ -49,11 +49,11 @@ Note: * The command also maps port 8000 from the container to your host so you can access the LLM API endpoint from your host * See the [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) for all the available containers. The containers published in the main branch weekly have “rcN” suffix, while the monthly release with QA tests has no “rcN” suffix. Use the rc release to get the latest model and feature support. 
-If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) +If you want to use latest main branch, you can choose to build from source to install TensorRT LLM, the steps refer to [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) ### Creating the TRT-LLM Server config -We create a YAML configuration file /tmp/config.yml for the TensorRT-LLM Server and populate it with the following recommended performance settings. +We create a YAML configuration file /tmp/config.yml for the TensorRT LLM Server and populate it with the following recommended performance settings. ```shell EXTRA_LLM_API_FILE=/tmp/config.yml @@ -108,7 +108,7 @@ These options are used directly on the command line when you start the `trtllm-s #### `--backend pytorch` - **Description:** Tells TensorRT-LLM to use the **pytorch** backend. + **Description:** Tells TensorRT LLM to use the **pytorch** backend. #### `--max_batch_size` @@ -124,7 +124,7 @@ These options are used directly on the command line when you start the `trtllm-s #### `--trust_remote_code` - **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. + **Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. #### Extra LLM API Options (YAML Configuration) @@ -183,7 +183,7 @@ See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.h ### Basic Test -Start a new terminal on the host to test the TensorRT-LLM server you just launched. +Start a new terminal on the host to test the TensorRT LLM server you just launched. You can query the health/readiness of the server using: @@ -222,7 +222,7 @@ Here is an example response, showing that the TRT-LLM server returns “New York We use the lm-eval tool to test the model’s accuracy. For more information see [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). -To run the evaluation harness exec into the running TensorRT-LLM container and install with this command: +To run the evaluation harness exec into the running TensorRT LLM container and install with this command: ```shell docker exec -it tensorrt_llm /bin/bash @@ -266,7 +266,7 @@ Sample result in Blackwell ## Benchmarking Performance -To benchmark the performance of your TensorRT-LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. +To benchmark the performance of your TensorRT LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper [bench.sh](http://bench.sh) script. ```shell cat < bench.sh @@ -314,7 +314,7 @@ Run bench.sh to begin a serving benchmark. This will take a long time if you run ./bench.sh ``` -Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations. +Sample TensorRT LLM serving benchmark output. Your results may vary due to ongoing software optimizations. 
``` ============ Serving Benchmark Result ============ diff --git a/docs/source/dev-on-cloud/build-image-to-dockerhub.md b/docs/source/dev-on-cloud/build-image-to-dockerhub.md index e2402b3f794..a07fbfdbc81 100644 --- a/docs/source/dev-on-cloud/build-image-to-dockerhub.md +++ b/docs/source/dev-on-cloud/build-image-to-dockerhub.md @@ -1,9 +1,9 @@ (build-image-to-dockerhub)= -# Build the TensorRT-LLM Docker Image +# Build the TensorRT LLM Docker Image When you develop trt-llm on cloud platform such as runpod, you may need to provide a docker image for the platform. So you firstly need to upload the image to dockerhub. -## Build the TensorRT-LLM Docker Image and Upload to DockerHub +## Build the TensorRT LLM Docker Image and Upload to DockerHub ```bash make -C docker build diff --git a/docs/source/developer-guide/perf-analysis.md b/docs/source/developer-guide/perf-analysis.md index 0764b46345e..3ac01d82ed0 100644 --- a/docs/source/developer-guide/perf-analysis.md +++ b/docs/source/developer-guide/perf-analysis.md @@ -4,7 +4,7 @@ NVIDIA Nsight Systems reports at the application level are highly informative. Metric sampling capabilities have increased over generations and provide a clean middle-ground between timing analysis and kernel-level deep dives with NVIDIA Nsight Compute. -Given the potential long runtimes of Large Languages Models (LLMs) and the diversity of workloads a model may experience during a single inference pass or binary execution, NVIDIA has added features to TensorRT-LLM to get the most out of Nsight Systems capabilities. This document outlines those features as well as provides examples of how to best utilize them to understand your application. +Given the potential long runtimes of Large Languages Models (LLMs) and the diversity of workloads a model may experience during a single inference pass or binary execution, NVIDIA has added features to TensorRT LLM to get the most out of Nsight Systems capabilities. This document outlines those features as well as provides examples of how to best utilize them to understand your application. ## Feature Descriptions diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index 4afdbf4dac7..f46bd507dd6 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -1,18 +1,18 @@ (perf-benchmarking)= -# TensorRT-LLM Benchmarking +# TensorRT LLM Benchmarking ```{important} This benchmarking suite is a work in progress. Expect breaking API changes. ``` -TensorRT-LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility that aims to make it +TensorRT LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility that aims to make it easier for users to reproduce our officially published [performance overiew](./perf-overview.md#throughput-measurements). `trtllm-bench` provides the follows: - A streamlined way to build tuned engines for benchmarking for a variety of models and platforms. - An entirely Python workflow for benchmarking. -- Ability to benchmark various flows and features within TensorRT-LLM. +- Ability to benchmark various flows and features within TensorRT LLM. 
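As a rough illustration of the workflow, a throughput benchmark is a single `trtllm-bench` invocation pointing at a model and a prepared dataset. The sketch below is not a tuned command; the model name is only an example and the dataset path stands in for a file prepared as described later in this guide.

```shell
# Hypothetical example: measure throughput for a Hugging Face model
# against a previously prepared dataset file.
trtllm-bench --model meta-llama/Llama-3.1-8B-Instruct \
  throughput \
  --dataset /tmp/synthetic_128_128.txt
```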
`trtllm-bench` executes all benchmarks using [in-flight batching] -- for more information see the [in-flight batching section](../advanced/gpt-attention.md#in-flight-batching) that describes the concept @@ -65,7 +65,7 @@ sudo nvidia-smi boost-slider --vboost #### Validated Networks for Benchmarking -While `trtllm-bench` should be able to run any network that TensorRT-LLM supports, the following are the list +While `trtllm-bench` should be able to run any network that TensorRT LLM supports, the following are the list that have been validated extensively and is the same listing as seen on the [Performance Overview](./perf-overview.md) page. @@ -102,7 +102,7 @@ For more information about quantization, refer to [](../reference/precision.md) the [support matrix](../reference/precision.md#support-matrix) of the supported quantization methods for each network. ```{tip} -Although TensorRT-LLM supports more quantization modes than listed above, `trtllm-bench` currently only configures for +Although TensorRT LLM supports more quantization modes than listed above, `trtllm-bench` currently only configures for a smaller subset. ``` @@ -417,7 +417,7 @@ P99: 1059.0000 137.0000 1178.0000 #### Quantization in the PyTorch Flow To run a quantized benchmark with `trtllm-bench` utilizing the PyTorch flow, you will need to use a pre-quantized -checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkpoints via HuggingFace: +checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkpoints via HuggingFace: - [`nvidia/Llama-3.1-8B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8) - [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8) diff --git a/docs/source/examples/customization.md b/docs/source/examples/customization.md index 4d36bb9f079..fc20383857e 100644 --- a/docs/source/examples/customization.md +++ b/docs/source/examples/customization.md @@ -2,7 +2,7 @@ ## Quantization -TensorRT-LLM can quantize the Hugging Face model automatically. By setting the appropriate flags in the `LLM` instance. For example, to perform an Int4 AWQ quantization, the following code triggers the model quantization. Please refer to complete list of [supported flags](https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/quantization/mode.html#QuantAlgo) and acceptable values. +TensorRT LLM can quantize the Hugging Face model automatically. By setting the appropriate flags in the `LLM` instance. For example, to perform an Int4 AWQ quantization, the following code triggers the model quantization. Please refer to complete list of [supported flags](https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/quantization/mode.html#QuantAlgo) and acceptable values. ``` python from tensorrt_llm.llmapi import QuantConfig, QuantAlgo diff --git a/docs/source/examples/kvcacheconfig.md b/docs/source/examples/kvcacheconfig.md index 1a2c9853c57..8d1eed2bb3e 100644 --- a/docs/source/examples/kvcacheconfig.md +++ b/docs/source/examples/kvcacheconfig.md @@ -28,7 +28,7 @@ if __name__ == '__main__': main() ``` -This example runs with default KV cache properties. The default value for `free_gpu_memory_fraction` is 0.9, which means TensorRT-LLM tries to allocate 90% of free GPU memory (after loading weights) for KV cache. Depending on your use case, this allocation can be too aggressive. You can reduce this value to 0.7 by adding the following lines to the quickstart example: +This example runs with default KV cache properties. 
The default for `free_gpu_memory_fraction` is 0.9, which means TensorRT LLM will try to allocate 90% of free GPU memory for KV cache. Depending on your system, this may be too aggressive, so you decide to dial that back to 0.7. This is done by adding the following lines to the quickstart example: ```python from tensorrt_llm.llmapi import KvCacheConfig diff --git a/docs/source/features/attention.md b/docs/source/features/attention.md index 69287fd6d91..9de639d80e2 100644 --- a/docs/source/features/attention.md +++ b/docs/source/features/attention.md @@ -6,13 +6,13 @@ This document details the implementation of multi-head attention (MHA), multi-query attention (MQA), and group-query attention (GQA) for autoregressive -models in TensorRT-LLM's PyTorch backend. +models in TensorRT LLM's PyTorch backend. Multi-head attention involves a sequence of batched matrix multiplications, a softmax operation, and another batched matrix multiplication, as described in the [Attention Is All You Need](https://arxiv.org/abs/1706.03762) paper. [Multi-query Attention (MQA)](https://arxiv.org/abs/1911.02150) and [Group-query Attention (GQA)](https://arxiv.org/abs/2307.09288) are variants of MHA that use fewer KV heads than the number of query heads. -TensorRT-LLM provides several implementations using different backends in `tensorrt_llm/_torch/attention_backend/`. +TensorRT LLM provides several implementations using different backends in `tensorrt_llm/_torch/attention_backend/`. The following sections explain how to use these implementations and provide a brief guide on implementing new backends. @@ -115,7 +115,7 @@ that are shorter than the `max_sequence_length` to the maximum length. It may result in excessive memory consumption as well as unneeded computations on padding tokens (in the various matrix multiplications that surround the MHA block). -To overcome that problem, TensorRT-LLM supports a mode without padding where +To overcome that problem, TensorRT LLM supports a mode without padding where the different tokens are packed together and the user provides the operator with a 1D tensor containing the lengths of the different sequences. @@ -155,9 +155,9 @@ Please be aware that this feature is only supported on Ada, Hopper and above. #### Generation Phase The generation phase is implemented using a single kernel called the masked -multi-head attention in TensorRT-LLM. That kernel is able to apply +multi-head attention in TensorRT LLM. That kernel is able to apply pre-processing on the Q, K, and V elements on-the-fly: it adds the QKV bias, applies -RoPE, and performs dequantization and quantization. TensorRT-LLM will continue to add (or +RoPE, and performs dequantization and quantization. TensorRT LLM will continue to add (or enable) additional features in future releases, such as enabling support for IA3. The masked MHA kernel has a special version that distributes the work across @@ -202,7 +202,7 @@ Supported configurations can be found using the `shouldUse` function of the `Dec ### In-flight Batching -TensorRT-LLM supports in-flight batching of requests (also known as continuous +TensorRT LLM supports in-flight batching of requests (also known as continuous batching or iteration-level batching) for higher serving throughput. With this feature, sequences in the context phase can be processed together with sequences in the generation phase. The purpose of that technique is to better interleave @@ -236,10 +236,10 @@ to be an integer multiple of the kv-cache block size. 
In the generation phase, a common optimization is to provide the MHA kernel with a cache containing the values of the past K and V elements that have -already been computed. That cache is known as the KV cache. TensorRT-LLM uses -that technique to accelerate its generation phase. In TensorRT-LLM, there is +already been computed. That cache is known as the KV cache. TensorRT LLM uses +that technique to accelerate its generation phase. In TensorRT LLM, there is one KV cache per Transformer layer, which means that there are as many KV -caches as layers in a model. The current version of TensorRT-LLM supports two +caches as layers in a model. The current version of TensorRT LLM supports two different types of KV caches: **contiguous** and **paged** KV caches. #### Contiguous KV Cache @@ -266,7 +266,7 @@ blocks when required. See the implementation of In its current implementation, even if the rest of the network runs in INT8 or FP8, the attention operator works with FP32, FP16, and BFloat16 inputs and -outputs. However, TensorRT-LLM supports INT8 and FP8 +outputs. However, TensorRT LLM supports INT8 and FP8 (`QuantMode.INT8_KV_CACHE` and `QuantMode.FP8_KV_CACHE`) KV caches. @@ -284,7 +284,7 @@ the MHA/MQA kernel. Dequantization is defined as ### Sliding Window Attention, Cyclic (Rolling Buffer) KV Cache -TensorRT-LLM has a feature called `Cyclic KV Cache`, which treats the kv cache +TensorRT LLM has a feature called `Cyclic KV Cache`, which treats the kv cache as a circular buffer. This means that it only stores the kv cache for the last N tokens, where N is determined by the `attention_window_size` parameter in `TrtllmAttention.forward`. When the cache is full, new tokens’ kv cache will @@ -304,7 +304,7 @@ the context kv cache are shared across beams. The StreamingLLM feature uses a window attention to perform efficient and stable LLM on long texts, which means that only `N` tokens need to be stored in the KV cache. -Similar to the cyclic KV cache feature in TensorRT-LLM, `attention_window_size` +Similar to the cyclic KV cache feature in TensorRT LLM, `attention_window_size` parameter is used to determine `N`. Different from the cyclic KV cache feature, the first `S` tokens, called sink tokens, are always kept in the attention window, where `S` is determined by `sink_token_length` parameter. diff --git a/docs/source/features/auto_deploy/auto-deploy.md b/docs/source/features/auto_deploy/auto-deploy.md index 6cda4f66ab7..4e8eca49325 100644 --- a/docs/source/features/auto_deploy/auto-deploy.md +++ b/docs/source/features/auto_deploy/auto-deploy.md @@ -4,18 +4,18 @@ This project is under active development and is currently in a prototype stage. The code is a prototype, subject to change, and may include backward-incompatible updates. While we strive for correctness, there are no guarantees regarding functionality, stability, or reliability. ``` -## Seamless Model Deployment from PyTorch to TensorRT-LLM +## Seamless Model Deployment from PyTorch to TensorRT LLM -AutoDeploy is a prototype designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models such as those from the Hugging Face Transformers library, to TensorRT-LLM. +AutoDeploy is a prototype designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models such as those from the Hugging Face Transformers library, to TensorRT LLM. 
![AutoDeploy overview](../../media/ad_overview.png) -AutoDeploy overview and relation with TensorRT-LLM's LLM API +AutoDeploy overview and relation with TensorRT LLM's LLM API -AutoDeploy provides an alternative method for deploying models using the LLM API without requiring code changes to the source model (for example, Hugging Face Transformers models) or manual implementation of inference optimizations, such as KV-caches, multi-GPU parallelism, or quantization. Instead, AutoDeploy extracts a computation graph from the source model and applies inference optimizations through a series of automated graph transformations. AutoDeploy generates an inference-optimized graph that can be directly executed in the TensorRT-LLM PyTorch runtime and leverages various runtime optimizations including in-flight batching, paging, and overlap scheduling. +AutoDeploy provides an alternative method for deploying models using the LLM API without requiring code changes to the source model (for example, Hugging Face Transformers models) or manual implementation of inference optimizations, such as KV-caches, multi-GPU parallelism, or quantization. Instead, AutoDeploy extracts a computation graph from the source model and applies inference optimizations through a series of automated graph transformations. AutoDeploy generates an inference-optimized graph that can be directly executed in the TensorRT LLM PyTorch runtime and leverages various runtime optimizations including in-flight batching, paging, and overlap scheduling. ## Key Features -- **Seamless Model Translation:** Automatically converts PyTorch/Hugging Face models to TensorRT-LLM without manual rewrites. +- **Seamless Model Translation:** Automatically converts PyTorch/Hugging Face models to TensorRT LLM without manual rewrites. - **Unified Model Definition:** Maintain a single source of truth with your original PyTorch/Hugging Face model. - **Optimized Inference:** Built-in transformations for sharding, quantization, KV-cache integration, MHA fusion, and CudaGraph optimization. - **Immediate Deployment:** Day-0 support for models with continuous performance enhancements. @@ -48,7 +48,7 @@ python build_and_run_ad.py --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" AutoDeploy streamlines the model deployment process through an automated workflow designed for efficiency and performance. The workflow begins with a PyTorch model, which is exported using `torch.export` to generate a standard Torch graph. This graph contains core PyTorch ATen operations alongside custom attention operations, determined by the attention backend specified in the configuration. -The exported graph then undergoes a series of automated transformations, including graph sharding, KV-cache insertion, and GEMM fusion, to optimize model performance. After these transformations, the graph is compiled using one of the supported compile backends (like `torch-opt`), followed by deploying it via the TensorRT-LLM runtime. +The exported graph then undergoes a series of automated transformations, including graph sharding, KV-cache insertion, and GEMM fusion, to optimize model performance. After these transformations, the graph is compiled using one of the supported compile backends (like `torch-opt`), followed by deploying it via the TensorRT LLM runtime. 
- [Support Matrix](support_matrix.md) diff --git a/docs/source/features/checkpoint-loading.md b/docs/source/features/checkpoint-loading.md index 7d0bce05ae5..4a37ef76234 100644 --- a/docs/source/features/checkpoint-loading.md +++ b/docs/source/features/checkpoint-loading.md @@ -16,7 +16,7 @@ The checkpoint loading design is built around a plugin-like architecture that is - **Checkpoint Loaders**: Orchestrate the loading process for specific formats - **Config Loaders**: Handle model configuration parsing and validation - **Weight Loaders**: Manage the actual loading of model weights from storage into memory -- **Weight Mappers**: Map and transform loaded weights to TensorRT-LLM model's definition +- **Weight Mappers**: Map and transform loaded weights to TensorRT LLM model's definition This modular design allows for easy extension to support new checkpoint formats while maintaining backward compatibility and performance optimizations. By separating the checkpoint loading components into four different subcomponents, any user can employ any relevant previous work while also introducing their own custom checkpoint-specific components. diff --git a/docs/source/features/multi-modality.md b/docs/source/features/multi-modality.md index 72c962aa9b8..b9f20294127 100644 --- a/docs/source/features/multi-modality.md +++ b/docs/source/features/multi-modality.md @@ -1,6 +1,6 @@ -# Multimodal Support in TensorRT-LLM +# Multimodal Support in TensorRT LLM -TensorRT-LLM supports a variety of multimodal models, enabling efficient inference with inputs beyond just text. +TensorRT LLM supports a variety of multimodal models, enabling efficient inference with inputs beyond just text. --- @@ -14,7 +14,7 @@ Multimodal LLMs typically handle non-text inputs by combining a multimodal encod ## Optimizations -TensorRT-LLM incorporates some key optimizations to enhance the performance of multimodal inference: +TensorRT LLM incorporates some key optimizations to enhance the performance of multimodal inference: * **In-Flight Batching**: Batches multimodal requests within the GPU executor to improve GPU utilization and throughput. * **CPU/GPU Concurrency**: Asynchronously overlaps data preprocessing on the CPU with image encoding on the GPU. @@ -28,11 +28,11 @@ Please refer to the latest multimodal [support matrix](../models/supported-model ## Examples -The following examples demonstrate how to use TensorRT-LLM's multimodal support in various scenarios, including quick run examples, serving endpoints, and performance benchmarking. +The following examples demonstrate how to use TensorRT LLM's multimodal support in various scenarios, including quick run examples, serving endpoints, and performance benchmarking. 
### Quick start -Quickly try out TensorRT-LLM's multimodal support using our `LLM-API` and a ready-to-run [example](source:examples/llm-api/quickstart_multimodal.py): +Quickly try out TensorRT LLM's multimodal support using our `LLM-API` and a ready-to-run [example](source:examples/llm-api/quickstart_multimodal.py): ```bash python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --modality image diff --git a/docs/source/features/paged-attention-ifb-scheduler.md b/docs/source/features/paged-attention-ifb-scheduler.md index 3bd27920405..2057be56048 100644 --- a/docs/source/features/paged-attention-ifb-scheduler.md +++ b/docs/source/features/paged-attention-ifb-scheduler.md @@ -2,7 +2,7 @@ ## In-flight Batching -TensorRT-LLM supports in-flight batching of requests (also known as continuous +TensorRT LLM supports in-flight batching of requests (also known as continuous batching or iteration-level batching) for higher serving throughput. With this feature, sequences in the context phase can be processed together with sequences in the generation phase. The purpose of that technique is to better interleave @@ -38,7 +38,7 @@ Set a sufficiently high `max_batch_size` when building the engine so that it do `max_seq_len` defines the maximum sequence length of single request​ -Starting from TensorRT-LLM v0.11, when `--remove_input_padding` and `--context_fmha` are enabled, `max_seq_len` can replace `max_input_len` and `max_output_len`, and is set to `max_position_embeddings` by default. +Starting from TensorRT LLM v0.11, when `--remove_input_padding` and `--context_fmha` are enabled, `max_seq_len` can replace `max_input_len` and `max_output_len`, and is set to `max_position_embeddings` by default. Use default `max_seq_len` (which is `max_position_embeddings`), no need to tune it unless you are very sure what max sequence lengths would be on your workloads. If GPU memory is so limited that it cannot support even one request reaching `max_seq_len`, you need to reduce it. @@ -58,7 +58,7 @@ sequence batching is enabled, requests in context phase will be executed with requests in generation phase. Those latter requests produce a lot fewer tokens than `max_input_len` (at most, `beam_width` tokens). -Using a more realistic value for `max_num_tokens` allows TensorRT-LLM to +Using a more realistic value for `max_num_tokens` allows TensorRT LLM to allocate more memory to store the KV cache and execute more requests together. It leads to an increased efficiency. @@ -80,10 +80,10 @@ needs to be enabled. Except for the last chunk, the size of each context chunk n In the generation phase, a common optimization is to provide the MHA kernel with a cache containing the values of the past K and V elements that have -already been computed. That cache is known as the KV cache. TensorRT-LLM uses -that technique to accelerate its generation phase. In TensorRT-LLM, there is +already been computed. That cache is known as the KV cache. TensorRT LLM uses +that technique to accelerate its generation phase. In TensorRT LLM, there is one KV cache per Transformer layer, which means that there are as many KV -caches as layers in a model. The current version of TensorRT-LLM supports two +caches as layers in a model. The current version of TensorRT LLM supports two different types of KV caches: **contiguous** and **paged** KV caches. 
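How much GPU memory the KV cache may claim is controlled through the LLM API's `kv_cache_config`. As a small illustration (the field names are assumed from the `KvCacheConfig` options referenced elsewhere in these docs, and the values are examples only), an extra-options file for `trtllm-serve` could cap it like this:

```shell
cat <<EOF > /tmp/kv_cache_options.yml
kv_cache_config:
  free_gpu_memory_fraction: 0.7   # cap the KV cache at 70% of free GPU memory
  enable_block_reuse: true        # reuse paged KV blocks across requests where possible
EOF
```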
### Contiguous KV Cache @@ -106,7 +106,7 @@ A more efficient C++ implementation is included in the ## The schedulers -This section visualizes how TensorRT-LLM schedules requests based on max-batch size and max-num tokens. The example starts out with a newly initialized engine as well as a few unscheduled requests that have come in. For the sake of this example, toy values are set to `max batch size = 4` and `max num tokens = 12`. Each square block represents a token, and its color represents which request it belongs to. +This section visualizes how TensorRT LLM schedules requests based on max-batch size and max-num tokens. The example starts out with a newly initialized engine as well as a few unscheduled requests that have come in. For the sake of this example, toy values are set to `max batch size = 4` and `max num tokens = 12`. Each square block represents a token, and its color represents which request it belongs to. ![TRT-LLM Scheduler Visualization 1](../media/TRTLLM_Scheduler_Vis_1.svg) @@ -135,9 +135,9 @@ Overall, the max batch size and max num tokens limits play a key role in determi ## Revisiting Paged Context Attention and Context Chunking -[Previously](./useful-build-time-flags.md#paged-context-attention) we recommended enabling paged context attention even though in our case study it didn't affect performance significantly. Now that we understand the TensorRT-LLM scheduler, we can explain why this is beneficial. In short, we recommend enabling it because it enables context chunking, which allows the context phase of a request to be broken up into pieces and processed over several execution iterations, allowing the engine to provide a more stable balance of context and generation phase execution. +[Previously](./useful-build-time-flags.md#paged-context-attention) we recommended enabling paged context attention even though in our case study it didn't affect performance significantly. Now that we understand the TensorRT LLM scheduler, we can explain why this is beneficial. In short, we recommend enabling it because it enables context chunking, which allows the context phase of a request to be broken up into pieces and processed over several execution iterations, allowing the engine to provide a more stable balance of context and generation phase execution. -The [visualization](#the-schedulers) of the TensorRT-LLM scheduler showed that initially Request 3 couldn't be scheduled because it would put the scheduler over the max-num tokens limit. However, with context chunking, this is no longer the case, and the first chunk of Request 3 can be scheduled. +The [visualization](#the-schedulers) of the TensorRT LLM scheduler showed that initially Request 3 couldn't be scheduled because it would put the scheduler over the max-num tokens limit. However, with context chunking, this is no longer the case, and the first chunk of Request 3 can be scheduled. ![TRT-LLM Scheduler Visualization Chunked Context 1](../media/TRTLLM_Scheduler_Vis_Chunked_Context_1.svg) diff --git a/docs/source/features/parallel-strategy.md b/docs/source/features/parallel-strategy.md index 6e51cb17e69..b528c639d7f 100644 --- a/docs/source/features/parallel-strategy.md +++ b/docs/source/features/parallel-strategy.md @@ -1,10 +1,10 @@ -# Parallelism in TensorRT-LLM +# Parallelism in TensorRT LLM Parallelism across multiple GPUs becomes necessary when either * the model cannot fit in a single GPU’s memory, or * a single GPU cannot deliver the desired performance. 
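Before the individual strategies are described, here is a minimal sketch of how a multi-GPU deployment is typically expressed when serving a model. It assumes the `--tp_size` and `--pp_size` options of `trtllm-serve`; the model name and values are placeholders only.

```shell
# Hypothetical example: shard the weights across 4 GPUs with tensor parallelism,
# keeping a single pipeline stage.
trtllm-serve meta-llama/Llama-3.3-70B-Instruct \
  --backend pytorch \
  --tp_size 4 \
  --pp_size 1
```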
-TensorRT-LLM supports multiple parallelism strategies for deployment on both single and multiple nodes: +TensorRT LLM supports multiple parallelism strategies for deployment on both single and multiple nodes: * **Tensor Parallel (TP)** - Shards model weights across GPUs * **Pipeline Parallel (PP)** - Distributes model layers across GPUs * **Data Parallel (DP)** - Replicates model across GPUs for different requests @@ -48,7 +48,7 @@ Wide-EP is an advanced form of expert parallelism that addresses the inherent wo ### Attention Module -TensorRT-LLM supports two strategies for attention modules: +TensorRT LLM supports two strategies for attention modules: - **Tensor Parallelism (TP)** — best for small batch sizes - **Data Parallelism (DP)** — best for large batch sizes @@ -90,7 +90,7 @@ Tensor Parallelism is supported for the FFN layers of dense models. MoE replaces a single FFN with multiple experts. A router selects the top-k experts for each token and dispatches the corresponding hidden states. -TensorRT-LLM supports three execution patterns for MoE: +TensorRT LLM supports three execution patterns for MoE: * **TP** - Every expert's weight matrix is sliced across all GPUs. Each GPU sees all tokens. * **EP** - Full weights of each expert reside on a single GPU. Each GPU only sees tokens routed to its local experts. @@ -122,7 +122,7 @@ The product of `moe_tensor_parallel_size` and `moe_expert_parallel_size` must eq ## Wide Expert Parallelism (Wide-EP) -Wide Expert Parallelism (Wide-EP) is TensorRT-LLM's advanced solution for large-scale MoE model inference. It addresses the challenges of traditional expert parallelism through intelligent load balancing and expert replication strategies. +Wide Expert Parallelism (Wide-EP) is TensorRT LLM's advanced solution for large-scale MoE model inference. It addresses the challenges of traditional expert parallelism through intelligent load balancing and expert replication strategies. ### Motivation for Wide-EP @@ -170,7 +170,7 @@ The system maintains a routing table that maps Expert IDs to Slot IDs, which can ### References -- [Technical Blog: Scaling Expert Parallelism in TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md) +- [Technical Blog: Scaling Expert Parallelism in TensorRT LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md) - [DeepSeek-V3 Paper](https://arxiv.org/abs/2412.19437) - [EPLB Implementation](https://github.com/deepseek-ai/EPLB) diff --git a/docs/source/features/quantization.md b/docs/source/features/quantization.md index 8487e98540f..2adcc6992c8 100644 --- a/docs/source/features/quantization.md +++ b/docs/source/features/quantization.md @@ -1,10 +1,10 @@ # Quantization -## Quantization in TensorRT-LLM +## Quantization in TensorRT LLM Quantization is a technique used to reduces memory footprint and computational cost by converting the model's weights and/or activations from high-precision floating-point numbers (like BF16) to lower-precision data types, such as INT8, FP8, or FP4. -TensorRT-LLM offers a variety of quantization recipes to optimize LLM inference. These recipes can be broadly categorized as follows: +TensorRT LLM offers a variety of quantization recipes to optimize LLM inference. 
These recipes can be broadly categorized as follows: * FP4 * FP8 Per Tensor @@ -23,7 +23,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac ### Running Pre-quantized Models -TensorRT-LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). ```python from tensorrt_llm import LLM @@ -34,7 +34,7 @@ llm.generate("Hello, my name is") #### FP8 KV Cache ```{note} -TensorRT-LLM allows you to enable the FP8 KV cache manually, even for checkpoints that do not have it enabled by default. +TensorRT LLM allows you to enable the FP8 KV cache manually, even for checkpoints that do not have it enabled by default. ``` Here is an example of how to set the FP8 KV Cache option: diff --git a/docs/source/features/speculative-decoding.md b/docs/source/features/speculative-decoding.md index cd6bc7d04fc..1ee178f5cb7 100644 --- a/docs/source/features/speculative-decoding.md +++ b/docs/source/features/speculative-decoding.md @@ -149,7 +149,7 @@ nontrivial methods, however. decoding algorithms. - `extend_ctx`: If true, the speculative decoding dispatches requests with `py_draft_tokens` attached to them to the *prefill* version of the attention kernels. This usually needs to be true. The exception is when you're on -Blackwell using the TensorRT-LLM attention backend. In that case, use the generation kernels for better performance. +Blackwell using the TensorRT LLM attention backend. In that case, use the generation kernels for better performance. This optimized kernel has one limitation; all draft lengths must be the same (or padding must be used) in this case. > *These may be refactored in the future to reduce the difficulty of adding a new speculative diff --git a/docs/source/index.rst b/docs/source/index.rst index 18e6e7bfa9c..31a3d4b4ab5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,9 +1,9 @@ -.. TensorRT-LLM documentation master file, created by +.. TensorRT LLM documentation master file, created by sphinx-quickstart on Wed Sep 20 08:35:21 2023. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to TensorRT-LLM's Documentation! +Welcome to TensorRT LLM's Documentation! ======================================== .. toctree:: diff --git a/docs/source/installation/build-from-source-linux.md b/docs/source/installation/build-from-source-linux.md index bf06b3b38f1..f4b6f3836ff 100644 --- a/docs/source/installation/build-from-source-linux.md +++ b/docs/source/installation/build-from-source-linux.md @@ -2,17 +2,17 @@ # Building from Source Code on Linux -This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0 and subsequent versions, which uses the new CXX11 ABI. 
+This document provides instructions for building TensorRT LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT LLM wheel on PyPI. Note that the current pre-built TensorRT LLM wheel on PyPI is linked against PyTorch 2.7.0 and subsequent versions, which uses the new CXX11 ABI. ## Prerequisites -Use [Docker](https://www.docker.com) to build and run TensorRT-LLM. Instructions to install an environment to run Docker containers for the NVIDIA platform can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). +Use [Docker](https://www.docker.com) to build and run TensorRT LLM. Instructions to install an environment to run Docker containers for the NVIDIA platform can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). -If you intend to build any TensortRT-LLM artifacts, such as any of the container images (note that there exist pre-built [develop](#build-from-source-tip-develop-container) and [release](#build-from-source-tip-release-container) container images in NGC), or the TensorRT-LLM Python wheel, you first need to clone the TensorRT-LLM repository: +If you intend to build any TensorRT LLM artifacts, such as any of the container images (note that there exist pre-built [develop](#build-from-source-tip-develop-container) and [release](#build-from-source-tip-release-container) container images in NGC), or the TensorRT LLM Python wheel, you first need to clone the TensorRT LLM repository: ```bash -# TensorRT-LLM uses git-lfs, which needs to be installed in advance. +# TensorRT LLM uses git-lfs, which needs to be installed in advance. apt-get update && apt-get -y install git git-lfs git lfs install @@ -22,24 +22,24 @@ git submodule update --init --recursive git lfs pull ``` -## Building a TensorRT-LLM Docker Image +## Building a TensorRT LLM Docker Image -There are two options to create a TensorRT-LLM Docker image. The approximate disk space required to build the image is 63 GB. +There are two options to create a TensorRT LLM Docker image. The approximate disk space required to build the image is 63 GB. -### Option 1: Build TensorRT-LLM in One Step +### Option 1: Build TensorRT LLM in One Step ```{tip} :name: build-from-source-tip-release-container -If you just want to run TensorRT-LLM, you can instead [use the pre-built TensorRT-LLM Release container images](containers). +If you just want to run TensorRT LLM, you can instead [use the pre-built TensorRT LLM Release container images](containers). ``` -TensorRT-LLM contains a simple command to create a Docker image. Note that if you plan to develop on TensorRT-LLM, we recommend using [Option 2: Build TensorRT-LLM Step-By-Step](#option-2-build-tensorrt-llm-step-by-step). +TensorRT LLM contains a simple command to create a Docker image. Note that if you plan to develop on TensorRT LLM, we recommend using [Option 2: Build TensorRT LLM Step-By-Step](#option-2-build-tensorrt-llm-step-by-step). ```bash make -C docker release_build ``` -You can add the `CUDA_ARCHS=""` optional argument to specify which architectures should be supported by TensorRT-LLM.
It restricts the supported GPU architectures but helps reduce compilation time: +You can add the `CUDA_ARCHS=""` optional argument to specify which architectures should be supported by TensorRT LLM. It restricts the supported GPU architectures but helps reduce compilation time: ```bash # Restrict the compilation to Ada and Hopper architectures. @@ -52,19 +52,20 @@ After the image is built, the Docker container can be run. make -C docker release_run ``` -The `make` command supports the `LOCAL_USER=1` argument to switch to the local user account instead of `root` inside the container. The examples of TensorRT-LLM are installed in the `/app/tensorrt_llm/examples` directory. +The `make` command supports the `LOCAL_USER=1` argument to switch to the local user account instead of `root` inside the container. The examples of TensorRT LLM are installed in the `/app/tensorrt_llm/examples` directory. -Since TensorRT-LLM has been built and installed, you can skip the remaining steps. +Since TensorRT LLM has been built and installed, you can skip the remaining steps. -### Option 2: Container for building TensorRT-LLM Step-by-Step +(option-2-build-tensorrt-llm-step-by-step)= +### Option 2: Container for building TensorRT LLM Step-by-Step -If you are looking for more flexibility, TensorRT-LLM has commands to create and run a development container in which TensorRT-LLM can be built. +If you are looking for more flexibility, TensorRT LLM has commands to create and run a development container in which TensorRT LLM can be built. ```{tip} :name: build-from-source-tip-develop-container As an alternative to building the container image following the instructions below, -you can pull a pre-built [TensorRT-LLM Develop container image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel) from NGC (see [here](containers) for information on container tags). -Follow the linked catalog entry to enter a new container based on the pre-built container image, with the TensorRT source repository mounted into it. You can then skip this section and continue straight to [building TensorRT-LLM](#build-tensorrt-llm). +you can pull a pre-built [TensorRT LLM Develop container image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel) from NGC (see [here](containers) for information on container tags). +Follow the linked catalog entry to enter a new container based on the pre-built container image, with the TensorRT source repository mounted into it. You can then skip this section and continue straight to [building TensorRT LLM](#build-tensorrt-llm). ``` **On systems with GNU `make`** @@ -110,21 +111,21 @@ Follow the linked catalog entry to enter a new container based on the pre-built ``` Note: please make sure to set `--ipc=host` as a docker run argument to avoid `Bus error (core dumped)`. -Once inside the container, follow the next steps to build TensorRT-LLM from source. +Once inside the container, follow the next steps to build TensorRT LLM from source. ### Advanced topics -For more information on building and running various TensorRT-LLM container images, +For more information on building and running various TensorRT LLM container images, check . -## Build TensorRT-LLM +## Build TensorRT LLM ### Option 1: Full Build with C++ Compilation The following command compiles the C++ code and packages the compiled libraries along with the Python files into a wheel. When developing C++ code, you need this full build command to apply your code changes. 
```bash -# To build the TensorRT-LLM code. +# To build the TensorRT LLM code. python3 ./scripts/build_wheel.py ``` @@ -147,13 +148,13 @@ directory, add the `--clean` option: python3 ./scripts/build_wheel.py --clean ``` -It is possible to restrict the compilation of TensorRT-LLM to specific CUDA +It is possible to restrict the compilation of TensorRT LLM to specific CUDA architectures. For that purpose, the `build_wheel.py` script accepts a semicolon separated list of CUDA architecture as shown in the following example: ```bash -# Build TensorRT-LLM for Ampere. +# Build TensorRT LLM for Ampere. python3 ./scripts/build_wheel.py --cuda_architectures "80-real;86-real" ``` @@ -179,15 +180,15 @@ relevant classes. The associated unit tests should also be consulted for underst This feature will not be enabled when [`building only the C++ runtime`](#link-with-the-tensorrt-llm-c++-runtime). -#### Linking with the TensorRT-LLM C++ Runtime +#### Linking with the TensorRT LLM C++ Runtime -The `build_wheel.py` script will also compile the library containing the C++ runtime of TensorRT-LLM. If Python support and `torch` modules are not required, the script provides the option `--cpp_only` which restricts the build to the C++ runtime only. +The `build_wheel.py` script will also compile the library containing the C++ runtime of TensorRT LLM. If Python support and `torch` modules are not required, the script provides the option `--cpp_only` which restricts the build to the C++ runtime only. ```bash python3 ./scripts/build_wheel.py --cuda_architectures "80-real;86-real" --cpp_only --clean ``` -This is particularly useful for avoiding linking issues that may arise with older versions of `torch` (prior to 2.7.0) due to the [Dual ABI support in GCC](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html). The `--clean` option removes the build directory before starting a new build. By default, TensorRT-LLM uses `cpp/build` as the build directory, but you can specify a different location with the `--build_dir` option. For a complete list of available build options, run `python3 ./scripts/build_wheel.py --help`. +This is particularly useful for avoiding linking issues that may arise with older versions of `torch` (prior to 2.7.0) due to the [Dual ABI support in GCC](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html). The `--clean` option removes the build directory before starting a new build. By default, TensorRT LLM uses `cpp/build` as the build directory, but you can specify a different location with the `--build_dir` option. For a complete list of available build options, run `python3 ./scripts/build_wheel.py --help`. The shared library can be found in the following location: @@ -203,18 +204,18 @@ cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so #### Supported C++ Header Files -When using TensorRT-LLM, you need to add the `cpp` and `cpp/include` directories to the project's include paths. Only header files contained in `cpp/include` are part of the supported API and may be directly included. Other headers contained under `cpp` should not be included directly since they might change in future versions. +When using TensorRT LLM, you need to add the `cpp` and `cpp/include` directories to the project's include paths. Only header files contained in `cpp/include` are part of the supported API and may be directly included. Other headers contained under `cpp` should not be included directly since they might change in future versions. 
### Option 2: Python-Only Build without C++ Compilation -If you only need to modify Python code, it is possible to package and install TensorRT-LLM without compilation. +If you only need to modify Python code, it is possible to package and install TensorRT LLM without compilation. ```bash -# Package TensorRT-LLM wheel. +# Package TensorRT LLM wheel. TRTLLM_USE_PRECOMPILED=1 pip wheel . --no-deps --wheel-dir ./build -# Install TensorRT-LLM wheel. +# Install TensorRT LLM wheel. pip install ./build/tensorrt_llm*.whl ``` diff --git a/docs/source/installation/containers.md b/docs/source/installation/containers.md index 3dde7b352bf..d948366ef80 100644 --- a/docs/source/installation/containers.md +++ b/docs/source/installation/containers.md @@ -2,8 +2,8 @@ # Pre-built release container images on NGC -Pre-built TensorRT-LLM releases are made available as container images -on NGC. This is likely the simplest way to obtain TensorRT-LLM. Please refer to the [documentation in NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) for usage instructions. +Pre-built TensorRT LLM releases are made available as container images +on NGC. This is likely the simplest way to obtain TensorRT LLM. Please refer to the [documentation in NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) for usage instructions. {{container_tag_admonition}} diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst index 7d5dc214912..05b744a0946 100644 --- a/docs/source/installation/index.rst +++ b/docs/source/installation/index.rst @@ -3,7 +3,7 @@ Installation ============ -There are multiple ways to install and run TensorRT-LLM. For most users, the options below should be ordered from simple to complex. The approaches are equivalent in terms of the supported features. +There are multiple ways to install and run TensorRT LLM. For most users, the options below should be ordered from simple to complex. The approaches are equivalent in terms of the supported features. Note: **This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.** diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md index 9262453b660..02a0cf7817d 100644 --- a/docs/source/installation/linux.md +++ b/docs/source/installation/linux.md @@ -2,7 +2,7 @@ # Installing on Linux via `pip` -1. Install TensorRT-LLM (tested on Ubuntu 24.04). +1. Install TensorRT LLM (tested on Ubuntu 24.04). ### Install prerequisites @@ -23,14 +23,14 @@ ```{tip} Instead of manually installing the preqrequisites as described - above, it is also possible to use the pre-built [TensorRT-LLM Develop container + above, it is also possible to use the pre-built [TensorRT LLM Develop container image hosted on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel) (see [here](containers) for information on container tags). ``` - ### Install pre-built TensorRT-LLM wheel + ### Install pre-built TensorRT LLM wheel - Once all prerequisites are in place, TensorRT-LLM can be installed as follows: + Once all prerequisites are in place, TensorRT LLM can be installed as follows: ```bash pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm @@ -46,12 +46,12 @@ **Known limitations** -There are some known limitations when you pip install pre-built TensorRT-LLM wheel package. 
+There are some known limitations when you pip install the pre-built TensorRT LLM wheel package.
 
 1. MPI in the Slurm environment
 
-    If you encounter an error while running TensorRT-LLM in a Slurm-managed cluster, you need to reconfigure the MPI installation to work with Slurm.
-    The setup methods depends on your slurm configuration, pls check with your admin. This is not a TensorRT-LLM specific, rather a general mpi+slurm issue.
+    If you encounter an error while running TensorRT LLM in a Slurm-managed cluster, you need to reconfigure the MPI installation to work with Slurm.
+    The setup method depends on your Slurm configuration; please check with your admin. This is not specific to TensorRT LLM, but rather a general MPI + Slurm issue.
 
 ```
 The application appears to have been direct launched using "srun",
 but OMPI was not built with SLURM support. This usually happens
diff --git a/docs/source/overview.md b/docs/source/overview.md
index cb833f4365e..6009b41f050 100644
--- a/docs/source/overview.md
+++ b/docs/source/overview.md
@@ -2,28 +2,28 @@
 
 # Overview
 
-## About TensorRT-LLM
+## About TensorRT LLM
 
-[TensorRT-LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs.
+[TensorRT LLM](https://developer.nvidia.com/tensorrt) is NVIDIA's comprehensive open-source library for accelerating and optimizing inference performance of the latest large language models (LLMs) on NVIDIA GPUs.
 
 ## Key Capabilities
 
 ### 🔥 **Architected on Pytorch**
 
-TensorRT-LLM provides a high-level Python [LLM API](./quick-start-guide.md#run-offline-inference-with-llm-api) that supports a wide range of inference setups - from single-GPU to multi-GPU or multi-node deployments. It includes built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server).
+TensorRT LLM provides a high-level Python [LLM API](./quick-start-guide.md#run-offline-inference-with-llm-api) that supports a wide range of inference setups - from single-GPU to multi-GPU or multi-node deployments. It includes built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server).
 
-TensorRT-LLM is designed to be modular and easy to modify. Its PyTorch-native architecture allows developers to experiment with the runtime or extend functionality. Several popular models are also pre-defined and can be customized using [native PyTorch code](source:tensorrt_llm/_torch/models/modeling_deepseekv3.py), making it easy to adapt the system to specific needs.
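To make the single-GPU to multi-GPU range described above concrete, here is a minimal sketch of the LLM API with tensor parallelism. The model name and the `tensor_parallel_size` value are illustrative placeholders rather than recommendations, and constructor arguments can shift between releases, so treat this as a sketch of the pattern rather than a reference example.

```python
from tensorrt_llm import LLM, SamplingParams

# Placeholder model and parallelism degree; adjust both to your hardware.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    tensor_parallel_size=2,  # shard the model weights across two GPUs
)

prompts = ["Hello, my name is"]
sampling_params = SamplingParams(max_tokens=32)

# generate() returns one result per prompt; print each generated continuation.
for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)
```

Swapping in a different parallelism strategy is a matter of changing constructor arguments rather than rewriting the inference loop, which is what makes the same script usable from a single GPU up to multi-node deployments.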
 
 ### ⚡ **State-of-the-Art Performance**
 
-TensorRT-LLM delivers breakthrough performance on the latest NVIDIA GPUs:
+TensorRT LLM delivers breakthrough performance on the latest NVIDIA GPUs:
 
 - **DeepSeek R1**: [World-record inference performance on Blackwell GPUs](.https://developer.nvidia.com/blog/nvidia-blackwell-delivers-world-record-deepseek-r1-inference-performance/)
 - **Llama 4 Maverick**: [Breaks the 1,000 TPS/User Barrier on B200 GPUs](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/)
 
 ### 🎯 **Comprehensive Model Support**
 
-TensorRT-LLM supports the latest and most popular LLM architectures:
+TensorRT LLM supports the latest and most popular LLM architectures:
 
 - **Language Models**: GPT-OSS, Deepseek-R1/V3, Llama 3/4, Qwen2/3, Gemma 3, Phi 4...
 - **Multi-modal Models**: LLaVA-NeXT, Qwen2-VL, VILA, Llama 3.2 Vision...
@@ -46,12 +46,12 @@ TensorRT LLM strives to support the most popular models on **Day 0**.
 ### 🔧 **Latest GPU Architecture Support**
 
-TensorRT-LLM supports the full spectrum of NVIDIA GPU architectures:
+TensorRT LLM supports the full spectrum of NVIDIA GPU architectures:
 
 - **NVIDIA Blackwell**: B200, GB200, RTX Pro 6000 SE with FP4 optimization
 - **NVIDIA Hopper**: H100, H200,GH200 with FP8 acceleration
 - **NVIDIA Ada Lovelace**: L40/L40S, RTX 40 series with FP8 acceleration
 - **NVIDIA Ampere**: A100, RTX 30 series for production workloads
 
-## What Can You Do With TensorRT-LLM?
+## What Can You Do With TensorRT LLM?
 
-Whether you're building the next generation of AI applications, optimizing existing LLM deployments, or exploring the frontiers of large language model technology, TensorRT-LLM provides the tools, performance, and flexibility you need to succeed in the era of generative AI.To get started, refer to the {ref}`quick-start-guide`.
+Whether you're building the next generation of AI applications, optimizing existing LLM deployments, or exploring the frontiers of large language model technology, TensorRT LLM provides the tools, performance, and flexibility you need to succeed in the era of generative AI. To get started, refer to the {ref}`quick-start-guide`.
diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md
index 06b4a913554..0560cc83336 100644
--- a/docs/source/quick-start-guide.md
+++ b/docs/source/quick-start-guide.md
@@ -2,7 +2,7 @@
 
 # Quick Start Guide
 
-This is the starting point to try out TensorRT-LLM. Specifically, this Quick Start Guide enables you to quickly get set up and send HTTP requests using TensorRT-LLM.
+This is the starting point to try out TensorRT LLM. Specifically, this Quick Start Guide enables you to quickly get set up and send HTTP requests using TensorRT LLM.
 
 ## Launch Docker on a node with NVIDIA GPUs deployed
 
@@ -78,7 +78,7 @@ _Example Output_
 For detailed examples and command syntax, refer to the [trtllm-serve](commands/trtllm-serve/trtllm-serve.rst) section.
 
 ## Run Offline inference with LLM API
-The LLM API is a Python API designed to facilitate setup and inference with TensorRT-LLM directly within Python. It enables model optimization by simply specifying a HuggingFace repository name or a model checkpoint. The LLM API streamlines the process by managing model loading, optimization, and inference, all through a single `LLM` instance.
+The LLM API is a Python API designed to facilitate setup and inference with TensorRT LLM directly within Python. It enables model optimization by simply specifying a HuggingFace repository name or a model checkpoint. The LLM API streamlines the process by managing model loading, optimization, and inference, all through a single `LLM` instance.
 
 Here is a simple example to show how to use the LLM API with TinyLlama.
 
@@ -95,14 +95,14 @@ To learn more about the LLM API, check out the [](llm-api/index) and [](examples
 In this Quick Start Guide, you have:
 
 - Learned how to deploy a model with `trtllm-serve` for online serving
-- Explored the LLM API for offline inference with TensorRT-LLM
+- Explored the LLM API for offline inference with TensorRT LLM
 
-To continue your journey with TensorRT-LLM, explore these resources:
+To continue your journey with TensorRT LLM, explore these resources:
 
 - **[Installation Guide](installation/index.rst)** - Detailed installation instructions for different platforms
 - **[Deployment Guide](examples/llm_api_examples)** - Comprehensive examples for deploying LLM inference in various scenarios
 - **[Model Support](models/supported-models.md)** - Check which models are supported and how to add new ones
-- **CLI Reference** - Explore TensorRT-LLM command-line tools:
+- **CLI Reference** - Explore TensorRT LLM command-line tools:
   - [`trtllm-serve`](commands/trtllm-serve/trtllm-serve.rst) - Deploy models for online serving
   - [`trtllm-bench`](commands/trtllm-bench.rst) - Benchmark model performance
   - [`trtllm-eval`](commands/trtllm-eval.rst) - Evaluate model accuracy
diff --git a/docs/source/torch.md b/docs/source/torch.md
index b41f9d216c2..114453c9436 100644
--- a/docs/source/torch.md
+++ b/docs/source/torch.md
@@ -4,9 +4,9 @@
 Note: This feature is currently in beta, and the related API is subjected to change in future versions.
 ```
 
-To enhance the usability of the system and improve developer efficiency, TensorRT-LLM launches a new backend based on PyTorch.
+To enhance the usability of the system and improve developer efficiency, TensorRT LLM launches a new backend based on PyTorch.
 
-The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You can try it via importing `tensorrt_llm._torch`.
+The PyTorch backend of TensorRT LLM is available in version 0.17 and later. You can try it by importing `tensorrt_llm._torch`.
 
 ## Quick Start
 
@@ -42,4 +42,4 @@ Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama mo
 
 ## Prototype Features
 
-- [AutoDeploy: Seamless Model Deployment from PyTorch to TensorRT-LLM](./torch/auto_deploy/auto-deploy.md)
+- [AutoDeploy: Seamless Model Deployment from PyTorch to TensorRT LLM](./torch/auto_deploy/auto-deploy.md)
diff --git a/docs/source/torch/kv_cache_manager.md b/docs/source/torch/kv_cache_manager.md
index cbe2e3e40f4..1feb504accd 100644
--- a/docs/source/torch/kv_cache_manager.md
+++ b/docs/source/torch/kv_cache_manager.md
@@ -2,9 +2,9 @@
 
 In Transformer-based models, the KV (Key-Value) Cache is a mechanism used to optimize decoding efficiency, particularly during autoregressive generation tasks. Since KV Cache requires memory to store, it is also an important resource.
 
-In TensorRT-LLM, KV Cache is managed by the `KVCacheManager`.
+In TensorRT LLM, KV Cache is managed by the `KVCacheManager`.
 
-For details of the TensorRT-LLM `KVCacheManager` implementation see [KV Cache Management](../advanced/kv-cache-management.md).
+For details of the TensorRT LLM `KVCacheManager` implementation, see [KV Cache Management](../advanced/kv-cache-management.md).
 
 ## KV Cache Manager Introduction
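To tie the `KVCacheManager` description above back to the user-facing API discussed earlier, here is a minimal sketch of how KV cache behavior is typically configured through `KvCacheConfig` in the LLM API. The field values are placeholders, and the `dtype="fp8"` entry in particular is an assumed spelling of the FP8 KV cache option, so verify the exact field names against the `KvCacheConfig` reference for your release.

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Assumed field names and values; check the KvCacheConfig reference for your release.
kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.8,  # fraction of free GPU memory reserved for KV cache blocks
    enable_block_reuse=True,       # reuse cached blocks across requests that share a prefix
    dtype="fp8",                   # assumed spelling of the FP8 KV cache option
)

llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder model
    kv_cache_config=kv_cache_config,
)
llm.generate("Hello, my name is")
```

The manager itself then allocates and reuses cache blocks within the memory budget that this configuration describes.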