[Serving]support 22.12 (#1974)

heliqi
2023-05-22 22:27:13 +08:00
committed by GitHub
parent 1ac6e8e614
commit 3e7cb88049
4 changed files with 82 additions and 61 deletions


@@ -296,22 +296,22 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
ShareTensorFromFDTensor(handle.get(), inputs[i]);
}
// prebinded outputs are only supported for GPU
if (!copy_to_fd) {
for (size_t i = 0; i < (*outputs).size(); ++i) {
auto output_name = (*outputs)[i].name;
// if an output is not prebinded,
// the name of the output is expected to be empty,
// so we skip it here.
if (output_name.empty()) {
continue;
}
// Record the prebinded output_name.
// Those outputs do not need PaddleTensorToFDTensor
// after predictor_.Run()
auto handle = predictor_->GetOutputHandle(output_name);
ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
}
}
// if (!copy_to_fd) {
// for (size_t i = 0; i < (*outputs).size(); ++i) {
// auto output_name = (*outputs)[i].name;
// // if an output is not prebinded,
// // the name of the output is expected to be empty,
// // so we skip it here.
// if (output_name.empty()) {
// continue;
// }
// // Record the prebinded output_name.
// // Those outputs do not need PaddleTensorToFDTensor
// // after predictor_.Run()
// auto handle = predictor_->GetOutputHandle(output_name);
// ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
// }
// }
RUNTIME_PROFILE_LOOP_BEGIN(1)
predictor_->Run();


@@ -12,51 +12,72 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
ARG http_proxy
ARG https_proxy
FROM nvcr.io/nvidia/tritonserver:21.10-py3 as full
FROM nvcr.io/nvidia/tritonserver:21.10-py3-min
# Install the build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends curl wget vim git patchelf python3-dev python3-pip openssl \
python3-setuptools build-essential libgl1-mesa-glx libglib2.0-dev ca-certificates libb64-dev datacenter-gpu-manager \
libssl-dev zlib1g-dev rapidjson-dev libboost-dev libre2-dev librdmacm-dev libnuma-dev libarchive-dev unzip && \
apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=full /opt/tritonserver/bin/tritonserver /opt/tritonserver/bin/fastdeployserver
COPY --from=full /opt/tritonserver/lib /opt/tritonserver/lib
COPY --from=full /opt/tritonserver/include /opt/tritonserver/include
COPY --from=full /opt/tritonserver/backends/python /opt/tritonserver/backends/python
RUN python3 -m pip install --upgrade pip && python3 -m pip install redis
COPY serving/TensorRT-8.5.2.2 /opt/TensorRT-8.5.2.2
# install cmake
WORKDIR /home
RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.18.6/cmake-3.18.6-Linux-x86_64.tar.gz && tar -zxvf cmake-3.18.6-Linux-x86_64.tar.gz
ENV PATH=/home/cmake-3.18.6-Linux-x86_64/bin:$PATH
ENV TZ=Asia/Shanghai \
DEBIAN_FRONTEND=noninteractive \
DCGM_VERSION=2.2.9 \
http_proxy=$http_proxy \
https_proxy=$https_proxy
RUN apt-get update \
&& apt-key del 7fa2af80 \
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
&& dpkg -i cuda-keyring_1.0-1_all.deb \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \
&& apt-get update && apt-get install -y --no-install-recommends datacenter-gpu-manager=1:2.2.9
RUN apt-get update \
&& apt-get install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev ffmpeg libsm6 libxext6 \
&& python3 -m pip install -U pip \
&& python3 -m pip install paddlenlp fast-tokenizer-python
# build Triton Inference Server from source
ENV TAG=r22.12
RUN git clone https://github.com/triton-inference-server/server.git -b $TAG && \
cd server && \
mkdir -p build/tritonserver/install && \
python3 build.py \
--build-dir `pwd`/build \
--no-container-build \
--backend=ensemble \
--enable-gpu \
--endpoint=grpc \
--endpoint=http \
--enable-stats \
--enable-tracing \
--enable-logging \
--enable-metrics \
--enable-gpu-metrics \
--enable-cpu-metrics \
--enable-nvtx \
--cmake-dir `pwd` \
--repo-tag=common:$TAG \
--repo-tag=core:$TAG \
--repo-tag=backend:$TAG \
--repo-tag=thirdparty:$TAG \
--backend=python:$TAG
COPY python/dist/*.whl /opt/fastdeploy/
RUN python3 -m pip install /opt/fastdeploy/*.whl \
&& rm -rf /opt/fastdeploy/*.whl
# unset proxy
# ENV http_proxy=''
# ENV https_proxy=''
# RUN python3 -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
RUN unset http_proxy
RUN unset https_proxy
RUN python3 -m pip install https://paddle-wheel.bj.bcebos.com/2.4.1/linux/linux-gpu-cuda11.2-cudnn8-mkl-gcc8.2-avx/paddlepaddle_gpu-2.4.1.post112-cp38-cp38-linux_x86_64.whl
# compile triton-inference-server/server
# copy tritonserver and python backend into image
# triton server
RUN mkdir -p /opt/tritonserver && cp -r /home/server/build/tritonserver/install/* /opt/tritonserver
# python backend
RUN mkdir -p /opt/tritonserver/backends/python && cp -r /home/server/build/python/install/backends/python /opt/tritonserver/backends/
# copy compiled fastdeploy backend into image
COPY serving/build/libtriton_fastdeploy.so /opt/tritonserver/backends/fastdeploy/
COPY build/fastdeploy_install /opt/fastdeploy/
ENV LD_LIBRARY_PATH="/opt/TensorRT-8.5.2.2/lib/:/opt/fastdeploy/lib:/opt/fastdeploy/third_libs/install/onnxruntime/lib:/opt/fastdeploy/third_libs/install/paddle2onnx/lib:/opt/fastdeploy/third_libs/install/tensorrt/lib:/opt/fastdeploy/third_libs/install/paddle_inference/paddle/lib:/opt/fastdeploy/third_libs/install/paddle_inference/third_party/install/mkldnn/lib:/opt/fastdeploy/third_libs/install/paddle_inference/third_party/install/mklml/lib:/opt/fastdeploy/third_libs/install/openvino/runtime/lib:$LD_LIBRARY_PATH"
ENV PATH="/opt/tritonserver/bin:$PATH"
# rename tritonserver to fastdeployserver
RUN mv /opt/tritonserver/bin/tritonserver /opt/tritonserver/bin/fastdeployserver
# copy compiled fastdeploy_install into image
COPY build/fastdeploy_install/* /opt/fastdeploy/
# Set environment variables
ENV LD_LIBRARY_PATH="/opt/fastdeploy/lib:/opt/fastdeploy/third_libs/install/onnxruntime/lib:/opt/fastdeploy/third_libs/install/paddle2onnx/lib:/opt/fastdeploy/third_libs/install/paddle_inference/paddle/lib:/opt/fastdeploy/third_libs/install/openvino/runtime/lib/:/opt/fastdeploy/third_libs/install/tensorrt/lib/:/opt/fastdeploy/third_libs/install/opencv/lib64/:$LD_LIBRARY_PATH"
ENV PATH="/opt/tritonserver/bin:$PATH"
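For reference, a minimal sketch of building and running this image from the FastDeploy repository root; the serving/Dockerfile path, image tag and model-repository mount are assumptions, not part of this diff, while the --build-arg names match the ARG http_proxy/https_proxy declarations above:

# assumed invocation; run from the repo root so COPY paths like
# serving/TensorRT-8.5.2.2, python/dist and build/fastdeploy_install resolve
docker build \
    --build-arg http_proxy=$http_proxy \
    --build-arg https_proxy=$https_proxy \
    -t fastdeploy/serving-gpu:22.12 \
    -f serving/Dockerfile .

# the renamed tritonserver binary (fastdeployserver) is on PATH inside the image
docker run --rm --gpus all \
    -v $PWD/models:/models \
    fastdeploy/serving-gpu:22.12 \
    fastdeployserver --model-repository=/models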


@@ -86,14 +86,14 @@ nvidia-docker run -i --rm --name ${docker_name} \
-e "http_proxy=${http_proxy}" \
-e "https_proxy=${https_proxy}" \
-e "trt_version=${trt_version}"\
nvcr.io/nvidia/tritonserver:21.10-py3-min \
nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 \
bash -c \
'export https_proxy_tmp=${https_proxy}
export http_proxy_tmp=${http_proxy}
cd /workspace/fastdeploy/python;
rm -rf .setuptools-cmake-build dist build fastdeploy/libs/third_libs;
apt-get update;
apt-get install -y --no-install-recommends patchelf python3-dev python3-pip rapidjson-dev;
apt-get install -y --no-install-recommends patchelf python3-dev python3-pip rapidjson-dev git;
unset http_proxy
unset https_proxy
ln -s /usr/bin/python3 /usr/bin/python;
@@ -117,7 +117,7 @@ nvidia-docker run -i --rm --name ${docker_name} \
rm -rf build; mkdir build; cd build;
export https_proxy=${https_proxy_tmp}
export http_proxy=${http_proxy_tmp}
cmake .. -DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy_install -DTRITON_COMMON_REPO_TAG=r21.10 -DTRITON_CORE_REPO_TAG=r21.10 -DTRITON_BACKEND_REPO_TAG=r21.10;
cmake .. -DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy_install -DTRITON_COMMON_REPO_TAG=r22.12 -DTRITON_CORE_REPO_TAG=r22.12 -DTRITON_BACKEND_REPO_TAG=r22.12;
make -j`nproc`'
echo "build FD GPU library done"


@@ -1145,16 +1145,16 @@ TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors(
size_t total_batch_size, TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses) {
// r22.03
// BackendOutputResponder responder(
// requests, request_count, responses,
// model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
// model_state_->EnablePinnedOutput(), CudaStream());
// r22.12
BackendOutputResponder responder(
requests, request_count, responses,
model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
model_state_->EnablePinnedOutput(), CudaStream());
// r21.10
// BackendOutputResponder responder(
// requests, request_count, responses, StateForModel()->MaxBatchSize(),
// StateForModel()->TritonMemoryManager(),
// StateForModel()->EnablePinnedOutput(), CudaStream());
// Used to hold string output contents
bool cuda_copy = false;