Commit e264503 ("udpate codes")
Parent: f64cd19

File tree: 4 files changed (+7 −30 lines)

dlinfer/graph/dicp/vendor/AtbGraph/atb_op.py

Lines changed: 7 additions & 1 deletion
@@ -835,9 +835,15 @@ def infer_result(
         self, x, lora_a, lora_b, scaling, ranks, seq_lens, adapter_ids, dtype
     ):
         M, K = x.shape
+        ranks = lora_a.size(0)
         N = lora_b.size(1)
         output = torch.empty((M, N), dtype=x.dtype, device=x.device)
-        return output, output
+        # assume totalRank is the max rank
+        internal_output_x_lora_a = torch.empty(
+            (M, ranks * M), dtype=x.dtype, device=x.device
+        )
+        internal_lora_a_transpose = torch.empty_like(lora_a)
+        return output, internal_output_x_lora_a, internal_lora_a_transpose
 
 
 class AclNnInplaceAdd(Operator):
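
As a reading aid, here is a minimal, hypothetical sketch of the shapes the updated infer_result now reports. The concrete sizes and the (total_rank, K) / (total_rank, N) stacked-weight layouts are assumptions for illustration only, not part of the diff:

import torch

# Illustrative sizes (assumed): 4 tokens, hidden dim 16, output dim 32, total rank 8.
M, K, N, total_rank = 4, 16, 32, 8
x = torch.empty(M, K, dtype=torch.float16)
lora_a = torch.empty(total_rank, K, dtype=torch.float16)  # stacked LoRA-A weights (assumed layout)
lora_b = torch.empty(total_rank, N, dtype=torch.float16)  # stacked LoRA-B weights (assumed layout)

# Mirrors the updated infer_result above:
ranks = lora_a.size(0)                                                   # 8
output = torch.empty((M, lora_b.size(1)), dtype=x.dtype)                 # (4, 32)
internal_output_x_lora_a = torch.empty((M, ranks * M), dtype=x.dtype)    # (4, 32), sized per the diff
internal_lora_a_transpose = torch.empty_like(lora_a)                     # (8, 16)

The op thus exposes two internal workspaces alongside the real output (judging by their names, the x·lora_a intermediate and a transposed copy of lora_a) instead of returning output twice.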

dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_op.py

Lines changed: 0 additions & 1 deletion
@@ -1251,7 +1251,6 @@ def CustomFusedLora(
     name, x, lora_a, lora_b, scaling, ranks, seq_lens, adapter_ids, dtype
 ):
     op = Operation(name, "CustomFusedLoraOperation")
-    # TODO: add param
     param = infer_param.CustomFusedLoraParam()
     param.name = name
     param.dtype = get_ascend_dtype(dtype)

dlinfer/graph/dicp/vendor/AtbGraph/codegen/runtime/ops/custom_ops/fused_lora_operation.cpp

Lines changed: 0 additions & 22 deletions
@@ -102,9 +102,6 @@ void CustomFusedLoraOperation::ClearInternal() {
     aclWeightA_.clear();
     aclWeightB_.clear();
     aclWeightATranspose_.clear();
-    weightA_.clear();
-    weightB_.clear();
-    weightATranspose_.clear();
 
     aclScalingInput_.clear();
     scalingInput_.clear();
@@ -115,19 +112,6 @@
     aclScalingExecutor_.clear();
 }
 
-// Helper function to create weight tensor
-atb::Tensor CustomFusedLoraOperation::CreateWeightTensor(const atb::Tensor& baseTensor, int64_t rank, int64_t dim, uint64_t offset) {
-    atb::Tensor weightTensor;
-    weightTensor.desc.dtype = baseTensor.desc.dtype;
-    weightTensor.desc.format = baseTensor.desc.format;
-    weightTensor.desc.shape.dimNum = baseTensor.desc.shape.dimNum;
-    weightTensor.desc.shape.dims[0] = rank;
-    weightTensor.desc.shape.dims[1] = dim;
-    weightTensor.dataSize = atb::Utils::GetTensorSize(weightTensor.desc);
-    weightTensor.deviceData = static_cast<uint8_t*>(baseTensor.deviceData) + offset;
-    return weightTensor;
-}
-
 // Helper function to calculate offset for weight tensors
 uint64_t CustomFusedLoraOperation::CalculateWeightOffset(const std::vector<int32_t>& ranksVec, size_t adapterId, uint64_t tensorSizePerRank) {
     uint64_t offset = 0;
@@ -183,12 +167,6 @@ int CustomFusedLoraOperation::Setup(const atb::VariantPack& variantPack, uint64_
     const int64_t loraBDim = variantPack.inTensors.at(2).desc.shape.dims[1];
 
     ClearInternal();
-
-    // Pre-allocate vectors to avoid reallocations
-    weightA_.reserve(adapterIdsVec.size());
-    weightATranspose_.reserve(adapterIdsVec.size());
-    weightB_.reserve(adapterIdsVec.size());
-
     aclWeightA_.reserve(adapterIdsVec.size());
     aclWeightB_.reserve(adapterIdsVec.size());
     aclWeightATranspose_.reserve(adapterIdsVec.size());
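
The retained CalculateWeightOffset helper's body is cut off in this diff; judging from its signature and the initial "uint64_t offset = 0;", it presumably sums the ranks of the adapters stored ahead of adapterId and scales by the per-rank tensor size. A minimal Python sketch of that assumed logic, not the actual C++ implementation:

# Hypothetical sketch: assumes the weights of all adapters are packed
# contiguously and each adapter occupies rank * tensor_size_per_rank bytes.
def calculate_weight_offset(ranks_vec, adapter_id, tensor_size_per_rank):
    return sum(ranks_vec[:adapter_id]) * tensor_size_per_rank

# Example: adapters with ranks [8, 16, 8]; adapter 2 starts after 8 + 16 ranks.
assert calculate_weight_offset([8, 16, 8], 2, 64) == 24 * 64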

dlinfer/graph/dicp/vendor/AtbGraph/codegen/runtime/ops/custom_ops/fused_lora_operation.h

Lines changed: 0 additions & 6 deletions
@@ -28,19 +28,13 @@ class CustomFusedLoraOperation : public atb::Operation {
     void ClearAclScalrs();
     void ClearInternal();
 
-    // Helper functions for weight tensor creation and offset calculation
-    atb::Tensor CreateWeightTensor(const atb::Tensor& baseTensor, int64_t rank, int64_t dim, uint64_t offset);
     uint64_t CalculateWeightOffset(const std::vector<int32_t>& ranksVec, size_t adapterId, uint64_t tensorSizePerRank);
 
 private:
     std::string opName_;
     std::string dtype_;
     std::vector<aclScalar*> aclScalingScalar_;
 
-    std::vector<atb::Tensor> weightA_;
-    std::vector<atb::Tensor> weightB_;
-    std::vector<atb::Tensor> weightATranspose_;
-
     std::vector<AclNnTensor> aclWeightA_;
     std::vector<AclNnTensor> aclWeightB_;
     std::vector<AclNnTensor> aclWeightATranspose_;
