Commit 90e1450

feat: initialize flashinfer planinfo when forward first step.
Signed-off-by: pengtao.156 <[email protected]>
1 parent 8a00ad7 commit 90e1450

22 files changed: +183 -99 lines changed

xllm/core/kernels/cuda/batch_decode.cpp

Lines changed: 3 additions & 36 deletions
@@ -18,7 +18,9 @@ limitations under the License.
 
 namespace xllm::kernel::cuda {
 
-void batch_decode(torch::Tensor float_workspace_buffer,
+void batch_decode(const std::string& uri,
+                  torch::Tensor plan_info,
+                  torch::Tensor float_workspace_buffer,
                   torch::Tensor int_workspace_buffer,
                   torch::Tensor page_locked_int_workspace_buffer,
                   torch::Tensor query,
@@ -32,41 +34,6 @@ void batch_decode(torch::Tensor float_workspace_buffer,
                   torch::Tensor output,
                   std::optional<torch::Tensor>& output_lse,
                   bool enable_cuda_graph) {
-  std::string uri = get_batch_decode_uri(query.scalar_type(),
-                                         k_cache.scalar_type(),
-                                         output.scalar_type(),
-                                         paged_kv_indptr.scalar_type(),
-                                         query.size(-1),
-                                         v_cache.size(-1),
-                                         /*pos_encoding_mode=*/0,
-                                         /*use_sliding_window=*/false,
-                                         /*use_logits_soft_cap=*/false);
-
-  torch::Tensor paged_kv_indptr_host = paged_kv_indptr.to(torch::kCPU);
-  const int64_t batch_size = paged_kv_last_page_len.size(0);
-
-  torch::Tensor empty_q_data =
-      torch::empty({0}, torch::TensorOptions().dtype(query.scalar_type()));
-  torch::Tensor empty_kv_data =
-      torch::empty({0}, torch::TensorOptions().dtype(k_cache.scalar_type()));
-
-  auto plan_info = FunctionFactory::get_instance().decode_plan_func(uri).call(
-      float_workspace_buffer,
-      int_workspace_buffer,
-      page_locked_int_workspace_buffer,
-      paged_kv_indptr_host,
-      batch_size,
-      query.size(1),     // num_qo_heads
-      k_cache.size(2),   // num_kv_heads
-      k_cache.size(1),   // block_size
-      enable_cuda_graph,
-      window_left,
-      /*logits_soft_cap=*/0.0,
-      query.size(-1),    // head_dim_qk
-      v_cache.size(-1),  // head_dim_vo
-      empty_q_data,
-      empty_kv_data);
-
   FunctionFactory::get_instance().decode_run_func(uri).call(
       float_workspace_buffer,
       int_workspace_buffer,
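
The plan work deleted above does not disappear; per the commit title it now runs once, on the first forward step, and its results are handed back into batch_decode through the new uri/plan_info parameters on every call. Below is a minimal caller-side sketch of that one-time step. The helper name plan_batch_decode_once and the int64_t type of window_left are illustrative assumptions (the actual initialization site is in files not shown in this excerpt); the body simply reuses the get_batch_decode_uri and FunctionFactory::get_instance().decode_plan_func(uri) calls removed from batch_decode above, and treats the plan result as a torch::Tensor to match the plan_info field added to AttentionParams further down.

// Hypothetical one-time planning helper: mirrors the code removed from
// cuda::batch_decode(). Run it on the first forward step, cache the result,
// and pass uri/plan_info into every subsequent batch_decode() call.
std::pair<std::string, torch::Tensor> plan_batch_decode_once(
    torch::Tensor float_workspace_buffer,
    torch::Tensor int_workspace_buffer,
    torch::Tensor page_locked_int_workspace_buffer,
    torch::Tensor query,
    torch::Tensor k_cache,
    torch::Tensor v_cache,
    torch::Tensor paged_kv_indptr,
    torch::Tensor paged_kv_last_page_len,
    torch::Tensor output,
    int64_t window_left,  // type assumed for illustration
    bool enable_cuda_graph) {
  // The kernel uri depends only on dtypes and head dims, so it is stable
  // across decode steps and only needs to be derived once.
  std::string uri = get_batch_decode_uri(query.scalar_type(),
                                         k_cache.scalar_type(),
                                         output.scalar_type(),
                                         paged_kv_indptr.scalar_type(),
                                         query.size(-1),
                                         v_cache.size(-1),
                                         /*pos_encoding_mode=*/0,
                                         /*use_sliding_window=*/false,
                                         /*use_logits_soft_cap=*/false);

  torch::Tensor paged_kv_indptr_host = paged_kv_indptr.to(torch::kCPU);
  const int64_t batch_size = paged_kv_last_page_len.size(0);

  torch::Tensor empty_q_data =
      torch::empty({0}, torch::TensorOptions().dtype(query.scalar_type()));
  torch::Tensor empty_kv_data =
      torch::empty({0}, torch::TensorOptions().dtype(k_cache.scalar_type()));

  // Same flashinfer plan call that previously ran inside batch_decode() on
  // every invocation; here it runs once per batch layout.
  torch::Tensor plan_info =
      FunctionFactory::get_instance().decode_plan_func(uri).call(
          float_workspace_buffer,
          int_workspace_buffer,
          page_locked_int_workspace_buffer,
          paged_kv_indptr_host,
          batch_size,
          query.size(1),     // num_qo_heads
          k_cache.size(2),   // num_kv_heads
          k_cache.size(1),   // block_size
          enable_cuda_graph,
          window_left,
          /*logits_soft_cap=*/0.0,
          query.size(-1),    // head_dim_qk
          v_cache.size(-1),  // head_dim_vo
          empty_q_data,
          empty_kv_data);
  return {uri, plan_info};
}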

xllm/core/kernels/cuda/batch_prefill.cpp

Lines changed: 3 additions & 37 deletions
@@ -18,7 +18,9 @@ limitations under the License.
 
 namespace xllm::kernel::cuda {
 
-void batch_prefill(torch::Tensor float_workspace_buffer,
+void batch_prefill(const std::string& uri,
+                   torch::Tensor plan_info,
+                   torch::Tensor float_workspace_buffer,
                    torch::Tensor int_workspace_buffer,
                    torch::Tensor page_locked_int_workspace_buffer,
                    torch::Tensor query,
@@ -31,42 +33,6 @@ void batch_prefill(torch::Tensor float_workspace_buffer,
                    torch::Tensor output,
                    std::optional<torch::Tensor>& output_lse,
                    bool enable_cuda_graph) {
-  std::string uri = get_batch_prefill_uri(/*backend=*/"fa2",
-                                          query.scalar_type(),
-                                          key.scalar_type(),
-                                          output.scalar_type(),
-                                          q_cu_seq_lens.scalar_type(),
-                                          query.size(-1),
-                                          value.size(-1),
-                                          /*pos_encoding_mode=*/0,
-                                          /*use_sliding_window=*/false,
-                                          /*use_logits_soft_cap=*/false,
-                                          /*use_fp16_qk_reduction=*/false);
-
-  torch::Tensor qo_indptr_host = q_cu_seq_lens.to(torch::kCPU);
-  torch::Tensor kv_cu_seq_lens_host = kv_cu_seq_lens.to(torch::kCPU);
-  torch::Tensor kv_len_arr_host =
-      kv_cu_seq_lens_host.slice(0, 1) - kv_cu_seq_lens_host.slice(0, 0, -1);
-  const int64_t total_num_rows = qo_indptr_host[-1].item<int64_t>();
-  const int64_t batch_size = qo_indptr_host.size(0) - 1;
-
-  auto plan_info = FunctionFactory::get_instance().prefill_plan_func(uri).call(
-      float_workspace_buffer,
-      int_workspace_buffer,
-      page_locked_int_workspace_buffer,
-      qo_indptr_host,
-      kv_cu_seq_lens_host,
-      kv_len_arr_host,
-      total_num_rows,
-      batch_size,
-      query.size(1),   // num_qo_heads
-      key.size(1),     // num_kv_heads
-      /*page_size=*/1,
-      enable_cuda_graph,
-      query.size(-1),  // head_dim_qk
-      value.size(-1),  // head_dim_vo
-      /*causal=*/true);
-
   FunctionFactory::get_instance().prefill_ragged_run_func(uri).call(
       float_workspace_buffer,
       int_workspace_buffer,

xllm/core/kernels/cuda/cuda_ops_api.h

Lines changed: 6 additions & 2 deletions
@@ -45,7 +45,9 @@ void reshape_paged_cache(
     torch::Tensor key_cache,  // [n_blocks, block_size, n_heads, head_dim]
     torch::Tensor value_cache);
 
-void batch_prefill(torch::Tensor float_workspace_buffer,
+void batch_prefill(const std::string& uri,
+                   torch::Tensor plan_info,
+                   torch::Tensor float_workspace_buffer,
                    torch::Tensor int_workspace_buffer,
                    torch::Tensor page_locked_int_workspace_buffer,
                    torch::Tensor query,
@@ -59,7 +61,9 @@ void batch_prefill(torch::Tensor float_workspace_buffer,
                    std::optional<torch::Tensor>& output_lse,
                    bool enable_cuda_graph);
 
-void batch_decode(torch::Tensor float_workspace_buffer,
+void batch_decode(const std::string& uri,
+                  torch::Tensor plan_info,
+                  torch::Tensor float_workspace_buffer,
                   torch::Tensor int_workspace_buffer,
                   torch::Tensor page_locked_int_workspace_buffer,
                   torch::Tensor query,

xllm/core/kernels/cuda/utils.h

Lines changed: 2 additions & 1 deletion
@@ -15,6 +15,7 @@ limitations under the License.
 
 #pragma once
 
+#include <ATen/DynamicLibrary.h>
 #include <torch/torch.h>
 
 #include <string>
@@ -56,4 +57,4 @@ std::string get_batch_decode_uri(torch::ScalarType dtype_q,
                                  bool use_sliding_window,
                                  bool use_logits_soft_cap);
 
-} // namespace xllm::kernel::cuda
+} // namespace xllm::kernel::cuda

xllm/core/kernels/ops_api.cpp

Lines changed: 6 additions & 2 deletions
@@ -153,7 +153,9 @@ void batch_prefill(AttentionParams& params) {
                       params.scale,
                       params.output);
 #elif defined(USE_CUDA)
-  cuda::batch_prefill(params.float_workspace_buffer,
+  cuda::batch_prefill(params.uri,
+                      params.plan_info,
+                      params.float_workspace_buffer,
                       params.int_workspace_buffer,
                       params.page_locked_int_workspace_buffer,
                       params.query,
@@ -225,7 +227,9 @@ void batch_decode(AttentionParams& params) {
 #elif defined(USE_CUDA)
   params.query = params.query.squeeze(1);
   params.output = params.output.squeeze(1);
-  cuda::batch_decode(params.float_workspace_buffer,
+  cuda::batch_decode(params.uri,
+                     params.plan_info,
+                     params.float_workspace_buffer,
                      params.int_workspace_buffer,
                      params.page_locked_int_workspace_buffer,
                      params.query,

xllm/core/kernels/param.h

Lines changed: 2 additions & 0 deletions
@@ -209,6 +209,8 @@ struct AttentionParams {
   torch::Tensor kv_cu_seq_lens;
   torch::Tensor q_cu_seq_lens;
   bool enable_cuda_graph = false;
+  std::string uri;
+  torch::Tensor plan_info;
 
   // ========== Prefill-specific parameters ==========
   // Key tensor. Shape: [num_tokens, num_kv_heads, head_dim_qk] (packed) or
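
With the two new AttentionParams fields in place, a per-step caller only has to copy the cached plan into the params before calling the dispatcher shown in ops_api.cpp above, which forwards them to cuda::batch_decode. A minimal sketch follows; the run_decode_step wrapper and the cached_* arguments are illustrative assumptions, and the population of the remaining params fields (tensors, workspace buffers, sequence lengths) is taken as already done.

// Illustrative only: subsequent decode steps skip planning entirely and just
// thread the cached uri/plan_info through the new AttentionParams fields.
void run_decode_step(AttentionParams& params,
                     const std::string& cached_uri,
                     const torch::Tensor& cached_plan_info) {
  params.uri = cached_uri;              // field added by this commit
  params.plan_info = cached_plan_info;  // field added by this commit
  batch_decode(params);  // ops_api.cpp forwards uri/plan_info to cuda::batch_decode
}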

xllm/core/layers/common/qwen2_attention.cpp

Lines changed: 4 additions & 5 deletions
@@ -110,11 +110,10 @@ Qwen2AttentionImpl::Qwen2AttentionImpl(const ModelContext& context) {
                                         args.sliding_window()));
 }
 
-torch::Tensor Qwen2AttentionImpl::forward(
-    const torch::Tensor& positions,
-    const torch::Tensor& hidden_states,
-    const AttentionMetadata& attn_metadata,
-    KVCache& kv_cache) {
+torch::Tensor Qwen2AttentionImpl::forward(const torch::Tensor& positions,
+                                          const torch::Tensor& hidden_states,
+                                          AttentionMetadata& attn_metadata,
+                                          KVCache& kv_cache) {
   // 1. qkv projection
   auto qkv = qkv_proj_->forward(hidden_states);

xllm/core/layers/common/qwen2_attention.h

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ class Qwen2AttentionImpl : public torch::nn::Module {
 
   torch::Tensor forward(const torch::Tensor& positions,
                         const torch::Tensor& hidden_states,
-                        const AttentionMetadata& attn_metadata,
+                        AttentionMetadata& attn_metadata,
                         KVCache& kv_cache);
 
   void load_state_dict(const StateDict& state_dict);

xllm/core/layers/common/qwen2_decoder_layer.cpp

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ torch::Tensor Qwen2DecoderLayerImpl::forward(
     torch::Tensor& x,
     std::optional<torch::Tensor>& residual,
     torch::Tensor& positions,
-    const AttentionMetadata& attn_metadata,
+    AttentionMetadata& attn_metadata,
     KVCache& kv_cache,
     const ModelInputParams& input_params) {
   // Pre-attention norm

xllm/core/layers/common/qwen2_decoder_layer.h

Lines changed: 1 addition & 3 deletions
@@ -51,7 +51,7 @@ class Qwen2DecoderLayerImpl : public torch::nn::Module {
   torch::Tensor forward(torch::Tensor& x,
                         std::optional<torch::Tensor>& residual,
                         torch::Tensor& positions,
-                        const AttentionMetadata& attn_metadata,
+                        AttentionMetadata& attn_metadata,
                         KVCache& kv_cache,
                         const ModelInputParams& input_params);
 
@@ -64,7 +64,5 @@ class Qwen2DecoderLayerImpl : public torch::nn::Module {
   ParallelArgs parallel_args_;
 };
 
-using Qwen3DecoderLayerImpl = Qwen2DecoderLayerImpl;
-
 } // namespace layer
 } // namespace xllm
