[Serving]support 22.12 (#1974)

heliqi
2023-05-22 22:27:13 +08:00
committed by GitHub
parent 1ac6e8e614
commit 3e7cb88049
4 changed files with 82 additions and 61 deletions


@@ -296,22 +296,22 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
ShareTensorFromFDTensor(handle.get(), inputs[i]);
}
// prebinded outputs are only supported for GPU
if (!copy_to_fd) {
for (size_t i = 0; i < (*outputs).size(); ++i) {
auto output_name = (*outputs)[i].name;
// if an output is not prebinded,
// the name of the output is expected to be empty,
// so we skip it here.
if (output_name.empty()) {
continue;
}
// Record the prebinded output_name.
// Those outputs do not need PaddleTensorToFDTensor
// after predictor_.Run()
auto handle = predictor_->GetOutputHandle(output_name);
ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
}
}
// if (!copy_to_fd) {
// for (size_t i = 0; i < (*outputs).size(); ++i) {
// auto output_name = (*outputs)[i].name;
// // if an output is not prebinded,
// // the name of the output is expected to be empty,
// // so we skip it here.
// if (output_name.empty()) {
// continue;
// }
// // Record the prebinded output_name.
// // Those outputs do not need PaddleTensorToFDTensor
// // after predictor_.Run()
// auto handle = predictor_->GetOutputHandle(output_name);
// ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
// }
// }
RUNTIME_PROFILE_LOOP_BEGIN(1)
predictor_->Run();


@@ -12,51 +12,72 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
ARG http_proxy
ARG https_proxy
FROM nvcr.io/nvidia/tritonserver:21.10-py3 as full
FROM nvcr.io/nvidia/tritonserver:21.10-py3-min
# Install the build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends curl wget vim git patchelf python3-dev python3-pip openssl \
python3-setuptools build-essential libgl1-mesa-glx libglib2.0-dev ca-certificates libb64-dev datacenter-gpu-manager \
libssl-dev zlib1g-dev rapidjson-dev libboost-dev libre2-dev librdmacm-dev libnuma-dev libarchive-dev unzip && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=full /opt/tritonserver/bin/tritonserver /opt/tritonserver/bin/fastdeployserver
COPY --from=full /opt/tritonserver/lib /opt/tritonserver/lib
COPY --from=full /opt/tritonserver/include /opt/tritonserver/include
COPY --from=full /opt/tritonserver/backends/python /opt/tritonserver/backends/python
RUN python3 -m pip install --upgrade pip && python3 -m pip install redis
COPY serving/TensorRT-8.5.2.2 /opt/TensorRT-8.5.2.2
# install cmake
WORKDIR /home
RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.18.6/cmake-3.18.6-Linux-x86_64.tar.gz && tar -zxvf cmake-3.18.6-Linux-x86_64.tar.gz
ENV PATH=/home/cmake-3.18.6-Linux-x86_64/bin:$PATH
ENV TZ=Asia/Shanghai \
DEBIAN_FRONTEND=noninteractive \
DCGM_VERSION=2.2.9 \
http_proxy=$http_proxy \
https_proxy=$https_proxy
RUN apt-get update \
&& apt-key del 7fa2af80 \
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \
&& apt-get update && apt-get install -y --no-install-recommends datacenter-gpu-manager=1:2.2.9
RUN apt-get update \
&& apt-get install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev ffmpeg libsm6 libxext6 \
&& python3 -m pip install -U pip \
&& python3 -m pip install paddlenlp fast-tokenizer-python
# build Triton Inference Server from source
ENV TAG=r22.12
RUN git clone https://github.com/triton-inference-server/server.git -b $TAG && \
cd server && \
mkdir -p build/tritonserver/install && \
python3 build.py \
--build-dir `pwd`/build \
--no-container-build \
--backend=ensemble \
--enable-gpu \
--endpoint=grpc \
--endpoint=http \
--enable-stats \
--enable-tracing \
--enable-logging \
--enable-metrics \
--enable-gpu-metrics \
--enable-cpu-metrics \
--enable-nvtx \
--cmake-dir `pwd` \
--repo-tag=common:$TAG \
--repo-tag=core:$TAG \
--repo-tag=backend:$TAG \
--repo-tag=thirdparty:$TAG \
--backend=python:$TAG
COPY python/dist/*.whl /opt/fastdeploy/
RUN python3 -m pip install /opt/fastdeploy/*.whl \
&& rm -rf /opt/fastdeploy/*.whl
# unset proxy
# ENV http_proxy=''
# ENV https_proxy=''
# RUN python3 -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
RUN unset http_proxy
RUN unset https_proxy
RUN python3 -m pip install https://paddle-wheel.bj.bcebos.com/2.4.1/linux/linux-gpu-cuda11.2-cudnn8-mkl-gcc8.2-avx/paddlepaddle_gpu-2.4.1.post112-cp38-cp38-linux_x86_64.whl
# compile triton-inference-server/server
# copy tritonserver and python backend into image
# triton server
RUN mkdir -p /opt/tritonserver && cp -r /home/server/build/tritonserver/install/* /opt/tritonserver
# python backend
RUN mkdir -p /opt/tritonserver/backends/python && cp -r /home/server/build/python/install/backends/python /opt/tritonserver/backends/
# copy compiled fastdeploy backend into image
COPY serving/build/libtriton_fastdeploy.so /opt/tritonserver/backends/fastdeploy/
COPY build/fastdeploy_install /opt/fastdeploy/
ENV LD_LIBRARY_PATH="/opt/TensorRT-8.5.2.2/lib/:/opt/fastdeploy/lib:/opt/fastdeploy/third_libs/install/onnxruntime/lib:/opt/fastdeploy/third_libs/install/paddle2onnx/lib:/opt/fastdeploy/third_libs/install/tensorrt/lib:/opt/fastdeploy/third_libs/install/paddle_inference/paddle/lib:/opt/fastdeploy/third_libs/install/paddle_inference/third_party/install/mkldnn/lib:/opt/fastdeploy/third_libs/install/paddle_inference/third_party/install/mklml/lib:/opt/fastdeploy/third_libs/install/openvino/runtime/lib:$LD_LIBRARY_PATH"
ENV PATH="/opt/tritonserver/bin:$PATH"
# rename tritonserver to fastdeployserver
RUN mv /opt/tritonserver/bin/tritonserver /opt/tritonserver/bin/fastdeployserver
# copy compiled fastdeploy_install into image
COPY build/fastdeploy_install/* /opt/fastdeploy/
# Set environment variables
ENV LD_LIBRARY_PATH="/opt/fastdeploy/lib:/opt/fastdeploy/third_libs/install/onnxruntime/lib:/opt/fastdeploy/third_libs/install/paddle2onnx/lib:/opt/fastdeploy/third_libs/install/paddle_inference/paddle/lib:/opt/fastdeploy/third_libs/install/openvino/runtime/lib/:/opt/fastdeploy/third_libs/install/tensorrt/lib/:/opt/fastdeploy/third_libs/install/opencv/lib64/:$LD_LIBRARY_PATH"
ENV PATH="/opt/tritonserver/bin:$PATH"
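For reference, a minimal sketch of building and running this image from the FastDeploy repository root; the serving/Dockerfile path, image tag and model-repository mount are assumptions, not part of this diff, while the --build-arg names match the ARG http_proxy/https_proxy declarations above:

# assumed invocation; run from the repo root so COPY paths like
# serving/TensorRT-8.5.2.2, python/dist and build/fastdeploy_install resolve
docker build \
    --build-arg http_proxy=$http_proxy \
    --build-arg https_proxy=$https_proxy \
    -t fastdeploy/serving-gpu:22.12 \
    -f serving/Dockerfile .

# the renamed tritonserver binary (fastdeployserver) is on PATH inside the image
docker run --rm --gpus all \
    -v $PWD/models:/models \
    fastdeploy/serving-gpu:22.12 \
    fastdeployserver --model-repository=/models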


@@ -86,14 +86,14 @@ nvidia-docker run -i --rm --name ${docker_name} \
-e "http_proxy=${http_proxy}" \
-e "https_proxy=${https_proxy}" \
-e "trt_version=${trt_version}"\
nvcr.io/nvidia/tritonserver:21.10-py3-min \
nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 \
bash -c \
'export https_proxy_tmp=${https_proxy}
export http_proxy_tmp=${http_proxy}
cd /workspace/fastdeploy/python;
rm -rf .setuptools-cmake-build dist build fastdeploy/libs/third_libs;
apt-get update;
apt-get install -y --no-install-recommends patchelf python3-dev python3-pip rapidjson-dev;
apt-get install -y --no-install-recommends patchelf python3-dev python3-pip rapidjson-dev git;
unset http_proxy
unset https_proxy
ln -s /usr/bin/python3 /usr/bin/python;
@@ -117,7 +117,7 @@ nvidia-docker run -i --rm --name ${docker_name} \
rm -rf build; mkdir build; cd build;
export https_proxy=${https_proxy_tmp}
export http_proxy=${http_proxy_tmp}
cmake .. -DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy_install -DTRITON_COMMON_REPO_TAG=r21.10 -DTRITON_CORE_REPO_TAG=r21.10 -DTRITON_BACKEND_REPO_TAG=r21.10;
cmake .. -DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy_install -DTRITON_COMMON_REPO_TAG=r22.12 -DTRITON_CORE_REPO_TAG=r22.12 -DTRITON_BACKEND_REPO_TAG=r22.12;
make -j`nproc`'
echo "build FD GPU library done"


@@ -1145,16 +1145,16 @@ TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors(
size_t total_batch_size, TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses) {
// r22.03
// BackendOutputResponder responder(
// requests, request_count, responses,
// model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
// model_state_->EnablePinnedOutput(), CudaStream());
// r22.12
BackendOutputResponder responder(
requests, request_count, responses,
model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
model_state_->EnablePinnedOutput(), CudaStream());
// r21.10
// BackendOutputResponder responder(
// requests, request_count, responses, StateForModel()->MaxBatchSize(),
// StateForModel()->TritonMemoryManager(),
// StateForModel()->EnablePinnedOutput(), CudaStream());
// Used to hold string output contents
bool cuda_copy = false;