[Serving][Backend] Support zero_copy_infer in the backend and reduce output memory copies in Serving (#703)

* backend: add zero-copy infer interface

* fix bug

* fix bug

* fix bug

* paddle ipu
Author: heliqi
Date: 2022-11-28 14:07:53 +08:00 (committed by GitHub)
Parent: edcf150d33
Commit: 42f1888bb0

21 changed files with 254 additions and 109 deletions
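
For orientation, a minimal sketch of the zero-copy flow this commit moves the backend onto. The BindInputTensor / Infer() / GetOutputTensor calls and FDTensor::SetExternalData are the ones visible in the hunks below; the model path, format, input name, shape, and dtype here are illustrative assumptions, not values from the repository.

    #include <vector>
    #include "fastdeploy/runtime.h"  // fastdeploy::Runtime, FDTensor (header path may vary by version)

    int main() {
      // Illustrative setup; real option wiring depends on the deployed model.
      fastdeploy::RuntimeOption option;
      option.SetModelPath("model.onnx", "", fastdeploy::ModelFormat::ONNX);
      fastdeploy::Runtime runtime;
      if (!runtime.Init(option)) return 1;

      // Zero-copy input: wrap an existing buffer instead of copying into the tensor.
      std::vector<float> data(1 * 3 * 224 * 224, 0.0f);
      fastdeploy::FDTensor input("x");  // "x" is an assumed input name
      input.SetExternalData({1, 3, 224, 224}, fastdeploy::FDDataType::FP32,
                            data.data(), fastdeploy::Device::CPU);

      runtime.BindInputTensor("x", input);  // runtime reads `data` in place
      runtime.Infer();                      // argument-free overload used below

      // Zero-copy output: borrow the runtime-owned result instead of copying it out.
      auto* output = runtime.GetOutputTensor(runtime.GetOutputInfos()[0].name);
      return output == nullptr ? 1 : 0;
    }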

@@ -607,9 +607,6 @@ class ModelInstanceState : public BackendModelInstance {
   std::vector<std::string> output_names_;
   std::vector<fastdeploy::TensorInfo> input_tensor_infos_;
   std::vector<fastdeploy::TensorInfo> output_tensor_infos_;
-  std::vector<fastdeploy::FDTensor> input_tensors_;
-  std::vector<fastdeploy::FDTensor> output_tensors_;
 };
 TRITONSERVER_Error* ModelInstanceState::Create(
@@ -647,8 +644,6 @@ ModelInstanceState::~ModelInstanceState() { ReleaseRunResources(); }
 void ModelInstanceState::ReleaseRunResources() {
   input_names_.clear();
   output_names_.clear();
-  input_tensors_.clear();
-  output_tensors_.clear();
   input_tensor_infos_.clear();
   output_tensor_infos_.clear();
 }
@@ -671,9 +666,7 @@ TRITONSERVER_Error* ModelInstanceState::ValidateInputs() {
   input_tensor_infos_ = runtime_->GetInputInfos();
   std::vector<std::string> names;
   GetInfoNames(input_tensor_infos_, names);
-  input_tensors_.clear();
   input_names_.clear();
-  input_tensors_.reserve(input_tensor_infos_.size());
   triton::common::TritonJson::Value ios;
   RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("input", &ios));
@@ -700,7 +693,6 @@ TRITONSERVER_Error* ModelInstanceState::ValidateInputs() {
       std::set<std::string> inames(names.begin(), names.end());
       RETURN_IF_ERROR(CheckAllowedModelInput(io, inames));
     }
-    input_tensors_.emplace_back(io_name);
     auto fd_data_type = ModelConfigDataTypeToFDType(io_dtype);
     if (fd_data_type == fastdeploy::FDDataType::UNKNOWN1) {
@@ -759,11 +751,8 @@ TRITONSERVER_Error* ModelInstanceState::ValidateInputs() {
 TRITONSERVER_Error* ModelInstanceState::ValidateOutputs() {
   output_tensor_infos_ = runtime_->GetOutputInfos();
-  output_tensors_.clear();
-  output_tensors_.reserve(output_tensor_infos_.size());
   std::set<std::string> out_names;
   for (const auto& info : output_tensor_infos_) {
-    output_tensors_.emplace_back(info.name);
     out_names.insert(info.name);
   }
   output_names_.clear();
@@ -793,7 +782,6 @@ TRITONSERVER_Error* ModelInstanceState::ValidateOutputs() {
     if (index < 0) {
       RETURN_IF_ERROR(CheckAllowedModelInput(io, out_names));
     }
-    // output_tensors_.emplace_back(io_name);
     auto fd_data_type = ModelConfigDataTypeToFDType(io_dtype);
     if (fd_data_type == fastdeploy::FDDataType::UNKNOWN1) {
@@ -1009,7 +997,7 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests,
 TRITONSERVER_Error* ModelInstanceState::Run(
     std::vector<TRITONBACKEND_Response*>* responses,
     const uint32_t response_count) {
-  runtime_->Infer(input_tensors_, &output_tensors_);
+  runtime_->Infer();
 #ifdef TRITON_ENABLE_GPU
   if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
     cudaStreamSynchronize(CudaStream());
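
The one-line change above is the pivot of the patch: the old overload hands the runtime caller-owned tensor vectors to copy into and out of, while the new call works on tensors bound up front and fetched afterwards. A hedged sketch of the two call shapes (names mirror the diff; not verbatim source):

    #include <vector>
    #include "fastdeploy/runtime.h"

    // Old path: the backend owns the FDTensors; the runtime copies request
    // data into them and results back out, one memcpy pass each way.
    void InferWithCopies(fastdeploy::Runtime* runtime,
                         std::vector<fastdeploy::FDTensor>& inputs) {
      std::vector<fastdeploy::FDTensor> outputs;
      runtime->Infer(inputs, &outputs);
    }

    // New path: inputs were attached via BindInputTensor(name, tensor) and
    // outputs are borrowed later via GetOutputTensor(name), so Infer() takes
    // no arguments and moves no bytes of its own.
    void InferZeroCopy(fastdeploy::Runtime* runtime) {
      runtime->Infer();
    }

The cudaStreamSynchronize that follows carries more weight in this flow: per the output hunk further down, results may now stay in GPU memory until ReadOutputTensors hands them to the responder.
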
@@ -1042,18 +1030,7 @@ TRITONSERVER_Error* ModelInstanceState::SetInputTensors(
         input, &input_name, &input_datatype, &input_shape, &input_dims_count,
         nullptr, nullptr));
-    int index = GetInfoIndex(std::string(input_name), input_tensor_infos_);
-    if (index < 0) {
-      auto err = TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("Input name [") + input_name +
-           std::string("] is not one of the FD predictor input: ") +
-           input_tensors_[index].name)
-              .c_str());
-      // SendErrorForResponses(responses, request_count, err);
-      return err;
-    }
+    std::string in_name = std::string(input_name);
     std::vector<int64_t> batchn_shape;
     // For a ragged input tensor, the tensor shape should be
     // the flatten shape of the whole batch
@@ -1082,23 +1059,40 @@ TRITONSERVER_Error* ModelInstanceState::SetInputTensors(
       }
     }
+    const char* input_buffer;
+    size_t batchn_byte_size;
     TRITONSERVER_MemoryType memory_type;
-    int64_t device_id = 0;
-    fastdeploy::Device device;
+    int64_t memory_type_id;
+    std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>
+        allowed_input_types;
     if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
-      memory_type = TRITONSERVER_MEMORY_GPU;
+      allowed_input_types = {{TRITONSERVER_MEMORY_GPU, DeviceId()},
+                             {TRITONSERVER_MEMORY_CPU_PINNED, 0},
+                             {TRITONSERVER_MEMORY_CPU, 0}};
     } else {
+      allowed_input_types = {{TRITONSERVER_MEMORY_CPU_PINNED, 0},
+                             {TRITONSERVER_MEMORY_CPU, 0}};
     }
+    RETURN_IF_ERROR(
+        collector->ProcessTensor(
+            input_name, nullptr, 0, allowed_input_types, &input_buffer,
+            &batchn_byte_size, &memory_type, &memory_type_id));
+    int32_t device_id = -1;
+    fastdeploy::Device device;
+    if (memory_type == TRITONSERVER_MEMORY_GPU) {
+      device_id = DeviceId();
+      device = fastdeploy::Device::GPU;
+    } else {
+      memory_type = TRITONSERVER_MEMORY_CPU;
+      device = fastdeploy::Device::CPU;
+    }
-    input_tensors_[index].Resize(
-        batchn_shape, ConvertDataTypeToFD(input_datatype), input_name, device);
-    collector->ProcessTensor(
-        input_name,
-        reinterpret_cast<char*>(input_tensors_[index].MutableData()),
-        input_tensors_[index].Nbytes(), memory_type, device_id);
+    fastdeploy::FDTensor fdtensor(in_name);
+    fdtensor.SetExternalData(
+        batchn_shape, ConvertDataTypeToFD(input_datatype),
+        const_cast<char*>(input_buffer), device, device_id);
+    runtime_->BindInputTensor(in_name, fdtensor);
   }
   // Finalize...
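
The key maneuver in this hunk: collector->ProcessTensor is now asked for a pointer to Triton's gathered batch buffer (which may already sit in GPU memory) instead of being told to copy into a preallocated FDTensor, and that pointer is wrapped with SetExternalData. A minimal standalone sketch of the wrapping step, assuming an FP32 input; the function name and header path are illustrative:

    #include <cstdint>
    #include <vector>
    #include "fastdeploy/core/fd_tensor.h"  // FDTensor (header path may differ by version)

    // Wrap a Triton-owned buffer as an FDTensor without copying. The buffer
    // must stay alive and unmodified until inference completes, because the
    // runtime will read it in place.
    fastdeploy::FDTensor WrapTritonBuffer(const char* buffer,
                                          const std::vector<int64_t>& batchn_shape,
                                          bool on_gpu, int32_t gpu_id) {
      fastdeploy::FDTensor tensor("input");  // illustrative tensor name
      tensor.SetExternalData(
          batchn_shape, fastdeploy::FDDataType::FP32,  // dtype assumed FP32 here
          const_cast<char*>(buffer),
          on_gpu ? fastdeploy::Device::GPU : fastdeploy::Device::CPU,
          on_gpu ? gpu_id : -1);
      return tensor;  // cheap: the tensor only references `buffer`
    }

Note the lifetime contract this creates: the bound FDTensor borrows Triton's buffer, so the bytes must remain valid until runtime_->Infer() has consumed them.
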
@@ -1134,12 +1128,25 @@ TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors(
   // }
   for (auto& output_name : output_names_) {
-    int idx = GetInfoIndex(output_name, output_tensor_infos_);
+    auto* output_tensor = runtime_->GetOutputTensor(output_name);
+    if (output_tensor == nullptr) {
+      RETURN_IF_ERROR(
+          TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_INTERNAL,
+              (std::string("output tensor '") + output_name + "' is not found")
+                  .c_str()));
+    }
+    TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
+    int64_t memory_type_id = 0;
+    if (output_tensor->device == fastdeploy::Device::GPU) {
+      memory_type = TRITONSERVER_MEMORY_GPU;
+      memory_type_id = DeviceId();
+    }
     responder.ProcessTensor(
-        output_tensors_[idx].name, ConvertFDType(output_tensors_[idx].dtype),
-        output_tensors_[idx].shape,
-        reinterpret_cast<char*>(output_tensors_[idx].MutableData()),
-        TRITONSERVER_MEMORY_CPU, 0);
+        output_tensor->name, ConvertFDType(output_tensor->dtype),
+        output_tensor->shape,
+        reinterpret_cast<char*>(output_tensor->MutableData()),
+        memory_type, memory_type_id);
   }
   // Finalize and wait for any pending buffer copies.
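
On the output side the mapping runs in reverse: the FDTensor returned by GetOutputTensor records where its buffer lives, and that placement decides the memory type the responder copies from, so a device-to-host copy happens only when the client actually needs host memory. A small sketch of that mapping; the helper name is illustrative and gpu_device_id stands in for the instance's DeviceId():

    #include <cstdint>
    #include <utility>
    #include "fastdeploy/core/fd_tensor.h"  // FDTensor (header path may differ by version)
    #include "triton/core/tritonserver.h"   // TRITONSERVER_MemoryType

    // Translate an FDTensor's placement into the memory type/id that
    // responder.ProcessTensor expects, mirroring the hunk above.
    std::pair<TRITONSERVER_MemoryType, int64_t> OutputMemoryType(
        const fastdeploy::FDTensor& tensor, int64_t gpu_device_id) {
      if (tensor.device == fastdeploy::Device::GPU) {
        return {TRITONSERVER_MEMORY_GPU, gpu_device_id};
      }
      return {TRITONSERVER_MEMORY_CPU, 0};
    }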