From cf1ff2077da62858cb011db4fa3d57650a4d6623 Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Tue, 11 Jul 2023 13:49:47 +0800
Subject: [PATCH] [Bug Fix] fix trt backend page-locked error (#2095)

* [Bug Fix] fix trt backend page-locked error

* Update trt_backend.cc
---
 .../runtime/backends/tensorrt/trt_backend.cc  | 34 ++++++++++++++-----
 1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/fastdeploy/runtime/backends/tensorrt/trt_backend.cc b/fastdeploy/runtime/backends/tensorrt/trt_backend.cc
index 7a14221ab..f1ac6a729 100644
--- a/fastdeploy/runtime/backends/tensorrt/trt_backend.cc
+++ b/fastdeploy/runtime/backends/tensorrt/trt_backend.cc
@@ -470,16 +470,32 @@ void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
       if (item.dtype == FDDataType::INT64) {
         int64_t* data = static_cast<int64_t*>(const_cast<void*>(item.Data()));
         std::vector<int32_t> casted_data(data, data + item.Numel());
-        FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
-                                 static_cast<void*>(casted_data.data()),
-                                 item.Nbytes() / 2, cudaMemcpyHostToDevice,
-                                 stream_) == 0,
-                 "Error occurs while copy memory from CPU to GPU.");
+        // FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
+        //                          static_cast<void*>(casted_data.data()),
+        //                          item.Nbytes() / 2, cudaMemcpyHostToDevice,
+        //                          stream_) == 0,
+        //          "Error occurs while copy memory from CPU to GPU.");
+        // WARN: For the cudaMemcpyHostToDevice direction, cudaMemcpyAsync needs page-locked
+        // host memory for any overlap to occur. FDTensor does not currently guarantee that
+        // its host buffer is page-locked, so a synchronous cudaMemcpy is used. Reference:
+        // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creation-and-destruction
+        FDASSERT(cudaMemcpy(inputs_device_buffer_[item.name].data(),
+                            static_cast<void*>(casted_data.data()),
+                            item.Nbytes() / 2, cudaMemcpyHostToDevice) == 0,
+                 "Error occurs while copy memory from CPU to GPU.");         
       } else {
-        FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
-                                 item.Data(), item.Nbytes(),
-                                 cudaMemcpyHostToDevice, stream_) == 0,
-                 "Error occurs while copy memory from CPU to GPU.");
+        // FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
+        //                          item.Data(), item.Nbytes(),
+        //                          cudaMemcpyHostToDevice, stream_) == 0,
+        //          "Error occurs while copy memory from CPU to GPU.");
+        // WARN: For the cudaMemcpyHostToDevice direction, cudaMemcpyAsync needs page-locked
+        // host memory for any overlap to occur. FDTensor does not currently guarantee that
+        // its host buffer is page-locked, so a synchronous cudaMemcpy is used. Reference:
+        // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creation-and-destruction
+        FDASSERT(cudaMemcpy(inputs_device_buffer_[item.name].data(),
+                            item.Data(), item.Nbytes(),
+                            cudaMemcpyHostToDevice) == 0,
+                 "Error occurs while copy memory from CPU to GPU.");         
       }
     }
     // binding input buffer