Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-06 00:57:33 +08:00)
@@ -296,22 +296,22 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
    ShareTensorFromFDTensor(handle.get(), inputs[i]);
  }
  // Pre-bound outputs are only supported on GPU.
  if (!copy_to_fd) {
    for (size_t i = 0; i < (*outputs).size(); ++i) {
      auto output_name = (*outputs)[i].name;
      // If an output is not pre-bound, its name is expected to be
      // empty, so we skip it here.
      if (output_name.empty()) {
        continue;
      }
      // Record the pre-bound output_name. Those outputs do not need
      // PaddleTensorToFDTensor after predictor_->Run().
      auto handle = predictor_->GetOutputHandle(output_name);
      ShareOutTensorFromFDTensor(handle.get(), (*outputs)[i]);
    }
  }

  RUNTIME_PROFILE_LOOP_BEGIN(1)
  predictor_->Run();

@@ -12,51 +12,72 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04

ARG http_proxy
ARG https_proxy

FROM nvcr.io/nvidia/tritonserver:21.10-py3 as full
FROM nvcr.io/nvidia/tritonserver:21.10-py3-min
# Install the build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends curl wget vim git patchelf python3-dev python3-pip openssl \
    python3-setuptools build-essential libgl1-mesa-glx libglib2.0-dev ca-certificates libb64-dev datacenter-gpu-manager \
    libssl-dev zlib1g-dev rapidjson-dev libboost-dev libre2-dev librdmacm-dev libnuma-dev libarchive-dev unzip && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

COPY --from=full /opt/tritonserver/bin/tritonserver /opt/tritonserver/bin/fastdeployserver
COPY --from=full /opt/tritonserver/lib /opt/tritonserver/lib
COPY --from=full /opt/tritonserver/include /opt/tritonserver/include
COPY --from=full /opt/tritonserver/backends/python /opt/tritonserver/backends/python
RUN python3 -m pip install --upgrade pip && python3 -m pip install redis

COPY serving/TensorRT-8.5.2.2 /opt/TensorRT-8.5.2.2
# install cmake
WORKDIR /home
RUN wget -q https://github.com/Kitware/CMake/releases/download/v3.18.6/cmake-3.18.6-Linux-x86_64.tar.gz && tar -zxvf cmake-3.18.6-Linux-x86_64.tar.gz
ENV PATH=/home/cmake-3.18.6-Linux-x86_64/bin:$PATH

ENV TZ=Asia/Shanghai \
    DEBIAN_FRONTEND=noninteractive \
    DCGM_VERSION=2.2.9 \
    http_proxy=$http_proxy \
    https_proxy=$https_proxy

RUN apt-get update \
    && apt-key del 7fa2af80 \
    && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
    && dpkg -i cuda-keyring_1.0-1_all.deb \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \
    && apt-get update && apt-get install -y --no-install-recommends datacenter-gpu-manager=1:2.2.9

RUN apt-get update \
    && apt-get install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev ffmpeg libsm6 libxext6 \
    && python3 -m pip install -U pip \
    && python3 -m pip install paddlenlp fast-tokenizer-python
#install triton
|
||||
ENV TAG=r22.12
|
||||
RUN git clone https://github.com/triton-inference-server/server.git -b $TAG && \
|
||||
cd server && \
|
||||
mkdir -p build/tritonserver/install && \
|
||||
python3 build.py \
|
||||
--build-dir `pwd`/build \
|
||||
--no-container-build \
|
||||
--backend=ensemble \
|
||||
--enable-gpu \
|
||||
--endpoint=grpc \
|
||||
--endpoint=http \
|
||||
--enable-stats \
|
||||
--enable-tracing \
|
||||
--enable-logging \
|
||||
--enable-metrics \
|
||||
--enable-gpu-metrics \
|
||||
--enable-cpu-metrics \
|
||||
--enable-nvtx \
|
||||
--cmake-dir `pwd` \
|
||||
--repo-tag=common:$TAG \
|
||||
--repo-tag=core:$TAG \
|
||||
--repo-tag=backend:$TAG \
|
||||
--repo-tag=thirdparty:$TAG \
|
||||
--backend=python:$TAG
|
||||
|
||||
COPY python/dist/*.whl /opt/fastdeploy/
RUN python3 -m pip install /opt/fastdeploy/*.whl \
    && rm -rf /opt/fastdeploy/*.whl

# unset proxy
# ENV http_proxy=''
# ENV https_proxy=''
# RUN python3 -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
RUN unset http_proxy
RUN unset https_proxy
RUN python3 -m pip install https://paddle-wheel.bj.bcebos.com/2.4.1/linux/linux-gpu-cuda11.2-cudnn8-mkl-gcc8.2-avx/paddlepaddle_gpu-2.4.1.post112-cp38-cp38-linux_x86_64.whl

# compile triton-inference-server/server, copy tritonserver and python backend into image
# triton server
RUN mkdir -p /opt/tritonserver && cp -r /home/server/build/tritonserver/install/* /opt/tritonserver
# python backend
RUN mkdir -p /opt/tritonserver/backends/python && cp -r /home/server/build/python/install/backends/python /opt/tritonserver/backends/

# copy compiled fastdeploy backend into image
COPY serving/build/libtriton_fastdeploy.so /opt/tritonserver/backends/fastdeploy/
COPY build/fastdeploy_install /opt/fastdeploy/

ENV LD_LIBRARY_PATH="/opt/TensorRT-8.5.2.2/lib/:/opt/fastdeploy/lib:/opt/fastdeploy/third_libs/install/onnxruntime/lib:/opt/fastdeploy/third_libs/install/paddle2onnx/lib:/opt/fastdeploy/third_libs/install/tensorrt/lib:/opt/fastdeploy/third_libs/install/paddle_inference/paddle/lib:/opt/fastdeploy/third_libs/install/paddle_inference/third_party/install/mkldnn/lib:/opt/fastdeploy/third_libs/install/paddle_inference/third_party/install/mklml/lib:/opt/fastdeploy/third_libs/install/openvino/runtime/lib:$LD_LIBRARY_PATH"
ENV PATH="/opt/tritonserver/bin:$PATH"
# rename tritonserver to fastdeployserver
RUN mv /opt/tritonserver/bin/tritonserver /opt/tritonserver/bin/fastdeployserver

# copy compiled fastdeploy_install into image
COPY build/fastdeploy_install/* /opt/fastdeploy/

# Set environment variable
ENV LD_LIBRARY_PATH="/opt/fastdeploy/lib:/opt/fastdeploy/third_libs/install/onnxruntime/lib:/opt/fastdeploy/third_libs/install/paddle2onnx/lib:/opt/fastdeploy/third_libs/install/paddle_inference/paddle/lib:/opt/fastdeploy/third_libs/install/openvino/runtime/lib/:/opt/fastdeploy/third_libs/install/tensorrt/lib/:/opt/fastdeploy/third_libs/install/opencv/lib64/:$LD_LIBRARY_PATH"
ENV PATH="/opt/tritonserver/bin:$PATH"
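
For context, a minimal build-and-run sketch for the image above. The Dockerfile path, the image tag, and the model-repository path are placeholders chosen for illustration, not names fixed by the project; the proxy build args mirror the ARG lines in the Dockerfile.

# Hypothetical usage; serving/Dockerfile, fastdeploy-serving:local and
# /path/to/models are placeholders.
docker build \
  --build-arg http_proxy="$http_proxy" \
  --build-arg https_proxy="$https_proxy" \
  -f serving/Dockerfile -t fastdeploy-serving:local .

# Launch the renamed server binary with a mounted model repository.
docker run --gpus all --rm -v /path/to/models:/models \
  fastdeploy-serving:local fastdeployserver --model-repository=/models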
@@ -86,14 +86,14 @@ nvidia-docker run -i --rm --name ${docker_name} \
  -e "http_proxy=${http_proxy}" \
  -e "https_proxy=${https_proxy}" \
  -e "trt_version=${trt_version}" \
  nvcr.io/nvidia/tritonserver:21.10-py3-min \
  nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 \
  bash -c \
  'export https_proxy_tmp=${https_proxy}
  export http_proxy_tmp=${http_proxy}
  cd /workspace/fastdeploy/python;
  rm -rf .setuptools-cmake-build dist build fastdeploy/libs/third_libs;
  apt-get update;
  apt-get install -y --no-install-recommends patchelf python3-dev python3-pip rapidjson-dev;
  apt-get install -y --no-install-recommends patchelf python3-dev python3-pip rapidjson-dev git;
  unset http_proxy
  unset https_proxy
  ln -s /usr/bin/python3 /usr/bin/python;
@@ -117,7 +117,7 @@ nvidia-docker run -i --rm --name ${docker_name} \
  rm -rf build; mkdir build; cd build;
  export https_proxy=${https_proxy_tmp}
  export http_proxy=${http_proxy_tmp}
  cmake .. -DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy_install -DTRITON_COMMON_REPO_TAG=r21.10 -DTRITON_CORE_REPO_TAG=r21.10 -DTRITON_BACKEND_REPO_TAG=r21.10;
  cmake .. -DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy_install -DTRITON_COMMON_REPO_TAG=r22.12 -DTRITON_CORE_REPO_TAG=r22.12 -DTRITON_BACKEND_REPO_TAG=r22.12;
  make -j`nproc`'

echo "build FD GPU library done"
@@ -1145,16 +1145,16 @@ TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors(
    size_t total_batch_size, TRITONBACKEND_Request** requests,
    const uint32_t request_count,
    std::vector<TRITONBACKEND_Response*>* responses) {
  // r22.03
  // BackendOutputResponder responder(
  //     requests, request_count, responses,
  //     model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
  //     model_state_->EnablePinnedOutput(), CudaStream());
  // r22.12
  BackendOutputResponder responder(
      requests, request_count, responses,
      model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
      model_state_->EnablePinnedOutput(), CudaStream());
  // r21.10
  // BackendOutputResponder responder(
  //     requests, request_count, responses, StateForModel()->MaxBatchSize(),
  //     StateForModel()->TritonMemoryManager(),
  //     StateForModel()->EnablePinnedOutput(), CudaStream());

  // Used to hold string output contents
bool cuda_copy = false;
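
Because the BackendOutputResponder constructor differs across Triton tags (the r21.10 and r22.12 variants above), the backend library must be built against the same tag as the server that loads it. A hypothetical in-image check, reusing the placeholder image tag from the Docker sketch earlier; the backend path comes from the Dockerfile above:

# Hypothetical check that the backend's dependencies resolve inside the image.
docker run --rm fastdeploy-serving:local \
  ldd /opt/tritonserver/backends/fastdeploy/libtriton_fastdeploy.so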