Add trtllm to triton bench (#379)

Aya-ZIbra · facebook-github-bot · commit 7a4063e25c73 · 2025-09-04T09:06:32.000-07:00
Summary: Pull Request resolved: #379

Run C++:
```
FLASHINFER_CUBIN_DIR=/data/users/$USER/fbsource/fbcode/deeplearning/flashinfer/fb/cubins/ buck2 run mode/opt mode/inplace -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=b200a -c fbcode.platform010_cuda_version=12.8 //deeplearning/flashinfer/trtllm_kernel_interfaces:run_example
```

-------

Run Triton bench:
```
buck2 run mode/opt mode/inplace -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=b200a -c fbcode.platform010_cuda_version=12.8 //pytorch/tritonbench:run -- --op decoding_attention --only trtllm_decode_fmha --seq-len-q 1 --metrics gbps
```

Todo: Support non-paged case

Differential Revision: D81021980
diff --git a/tritonbench/operators/decoding_attention/operator.py b/tritonbench/operators/decoding_attention/operator.py
@@ -55,6 +55,11 @@
 torch.ops.load_library(
     "//deeplearning/fbgemm/fbgemm_gpu/experimental:gen_ai_attention_ops"
 )
+torch.ops.load_library(
+    "//deeplearning/flashinfer/trtllm_kernel_interfaces:trtllm_fmha_pybind"
+)
+
+from .trtllm_utils import trtllm_decode_fmha_func
 
 from tritonbench.utils.triton_op import (
     BenchmarkOperator,
@@ -660,3 +665,24 @@ def aiter_paged_fp8kv(
             k_scale_asm,
             v_scale_asm,
         )
+
+    @register_benchmark()
+    def trtllm_decode_fmha(
+        self,
+        q: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+    ) -> Callable:
+        """Benchmark the TRTLLM FMHA decode op.
+
+        Returns a zero-argument callable that launches
+        torch.ops.trtllm_kernel_interfaces.trtllm_decode_fmha on arguments
+        built once, up front, by trtllm_decode_fmha_func.
+        """
+        kernel_args = trtllm_decode_fmha_func(q, k_cache, v_cache, cache_seqlens)
+
+        def launch():
+            return torch.ops.trtllm_kernel_interfaces.trtllm_decode_fmha(*kernel_args)
+
+        return launch
diff --git a/tritonbench/operators/decoding_attention/trtllm_utils.py b/tritonbench/operators/decoding_attention/trtllm_utils.py
@@ -0,0 +1,114 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+TRTLLM FMHA utility functions for handling tensor conversion and kernel preparation.
+"""
+
+import torch
+
+
+def _to_paged_kv(cache, page_size):
+    """Repack a dense KV cache into fixed-size pages.
+
+    Args:
+        cache: Tensor unpacked as
+            [batch, max_seq_len_kv, num_kv_heads, head_dim].
+        page_size: Number of tokens stored per page.
+
+    Returns:
+        Contiguous tensor of shape
+        [batch * pages_per_seq, num_kv_heads, page_size, head_dim], with
+        pages_per_seq = ceil(max_seq_len_kv / page_size).
+    """
+    batch_size, max_seq_len_kv, num_kv_heads, head_dim = cache.shape
+    pages_per_seq = (max_seq_len_kv + page_size - 1) // page_size
+    pad_len = pages_per_seq * page_size - max_seq_len_kv
+    if pad_len:
+        # max_seq_len_kv is not a multiple of page_size: zero-fill the tail of
+        # the last page so the reshape below is valid. Padded slots sit past
+        # max_seq_len_kv (assumes cache_seqlens never exceeds it - TODO confirm).
+        tail = cache.new_zeros((batch_size, pad_len, num_kv_heads, head_dim))
+        cache = torch.cat([cache, tail], dim=1)
+    # reshape (not view) so a non-contiguous cache is accepted too.
+    paged = cache.reshape(
+        batch_size, pages_per_seq, page_size, num_kv_heads, head_dim
+    )
+    # Swap the token and head axes: each page becomes [heads, tokens, dim].
+    paged = paged.permute(0, 1, 3, 2, 4).contiguous()
+    return paged.view(
+        batch_size * pages_per_seq, num_kv_heads, page_size, head_dim
+    )
+
+
+def trtllm_decode_fmha_func(q, k_cache, v_cache, cache_seqlens):
+    """
+    Build the positional arguments for the TRTLLM FMHA decode kernel.
+
+    Converts the dense KV caches to paged format and allocates the output,
+    workspace and block-table tensors. The kernel itself is not called here;
+    the returned tuple is meant to be splatted into it by the caller.
+
+    Args:
+        q: Query tensor unpacked as
+            [batch, seq_len_q, num_qo_heads, head_dim].
+        k_cache: Key cache unpacked as
+            [batch, max_seq_len_kv, num_kv_heads, head_dim].
+        v_cache: Value cache, same layout as k_cache.
+        cache_seqlens: Tensor of KV lengths; passed through unchanged and its
+            maximum is forwarded as max_seq_len.
+
+    Returns:
+        Tuple (out, q, k_cache_paged, v_cache_paged, workspace_buffer,
+        block_tables, cache_seqlens, max_seq_len, bmm1_scale, bmm2_scale,
+        window_left, sm_count).
+    """
+    device = q.device
+    # Four-way unpack keeps the implicit check that q is 4-D.
+    batch_size, _, _, head_dim = q.shape
+
+    # Use page size of 16 for TRTLLM FMHA
+    page_size = 16
+    k_cache_paged = _to_paged_kv(k_cache, page_size)
+    v_cache_paged = _to_paged_kv(v_cache, page_size)
+
+    # Pages are stored sequence after sequence, so the block table is the
+    # identity mapping block_tables[i, j] = i * pages_per_seq + j. A single
+    # arange replaces one device write per element.
+    pages_per_seq = (k_cache.shape[1] + page_size - 1) // page_size
+    block_tables = torch.arange(
+        batch_size * pages_per_seq, dtype=torch.int32, device=device
+    ).view(batch_size, pages_per_seq)
+
+    # Create output tensor
+    out = torch.zeros_like(q)
+
+    # Create workspace buffer
+    workspace_size = 128 * 1024 * 1024  # 128MB
+    workspace_buffer = torch.zeros(workspace_size, dtype=torch.uint8, device=device)
+
+    # Attention parameters
+    # .item() copies the scalar to the host, yielding a plain Python number.
+    max_seq_len = cache_seqlens.max().item()
+    bmm1_scale = 1.0 / (head_dim ** 0.5)
+    bmm2_scale = 1.0
+    window_left = -1  # No sliding window
+    sm_count = torch.cuda.get_device_properties(device).multi_processor_count
+
+    return (
+        out,
+        q,
+        k_cache_paged,
+        v_cache_paged,
+        workspace_buffer,
+        block_tables,
+        cache_seqlens,
+        max_seq_len,
+        bmm1_scale,
+        bmm2_scale,
+        window_left,
+        sm_count,
+    )