diff --git a/serving/Dockerfile b/serving/Dockerfile
index 71921dc8a..c3b5f40dc 100644
--- a/serving/Dockerfile
+++ b/serving/Dockerfile
@@ -20,7 +20,7 @@
 COPY --from=full /opt/tritonserver/lib /opt/tritonserver/lib
 COPY --from=full /opt/tritonserver/include /opt/tritonserver/include
 COPY --from=full /opt/tritonserver/backends/python /opt/tritonserver/backends/python
-COPY TensorRT-8.4.1.5 /opt/
+COPY serving/TensorRT-8.4.1.5 /opt/
 
 ENV TZ=Asia/Shanghai \
     DEBIAN_FRONTEND=noninteractive \
diff --git a/serving/Dockfile_cpu b/serving/Dockerfile_cpu
similarity index 93%
rename from serving/Dockfile_cpu
rename to serving/Dockerfile_cpu
index 390272d30..e6270bb59 100644
--- a/serving/Dockfile_cpu
+++ b/serving/Dockerfile_cpu
@@ -19,9 +19,9 @@ ENV TZ=Asia/Shanghai \
 
 RUN apt-get update && apt-get install -y --no-install-recommends apt-utils libgomp1 \
     && python3 -m pip install -U pip \
-    && python3 -m pip install paddlepaddle faster_tokenizer
+    && python3 -m pip install paddlepaddle paddlenlp faster_tokenizer
 
-COPY python/dist/*.whl /opt/fastdeploy/
+COPY python/dist/*.whl *.whl /opt/fastdeploy/
 
 RUN python3 -m pip install /opt/fastdeploy/*.whl \
     && rm -rf /opt/fastdeploy/*.whl
diff --git a/serving/README_CN.md b/serving/README_CN.md
index 5849af03f..afd0d7131 100644
--- a/serving/README_CN.md
+++ b/serving/README_CN.md
@@ -6,14 +6,31 @@
 FastDeploy provides end-to-end serving deployment built on [Triton Inference Server](https://github.com/triton-inference-server/server). The backend uses the high-performance FastDeploy Runtime module and chains it with the FastDeploy pre/post-processing modules, so models are served end to end. It is quick to deploy, easy to use, and delivers excellent performance.
 
-## End-to-end deployment examples
+## Prepare the environment
 
-- [YOLOv5 detection](../examples/vision/detection/yolov5/README.md)
-- [OCR ]()
-- [ERNIE 3.0 text classification]()
-- [UIE ]()
-- [Speech ]()
+### Requirements
+- Linux
+- For the GPU image, NVIDIA Driver >= 470 is required (for older Tesla-architecture GPUs such as the T4, NVIDIA Driver 418.40+, 440.33+, 450.51+ or 460.27+ also works)
 
-## Advanced documentation
-- [Model repository](docs/zh_CN/model_repository.md)
-- [Model configuration](docs/zh_CN/model_configuration.md)
+### Get the image
+
+#### CPU image
+The CPU image only supports serving Paddle/ONNX models on CPU; the supported inference backends are OpenVINO, Paddle Inference and ONNX Runtime.
+``` shell
+docker pull paddlepaddle/fastdeploy:0.3.0-cpu-only
+```
+
+#### GPU image
+The GPU image supports serving Paddle/ONNX models on GPU/CPU; the supported inference backends are OpenVINO, TensorRT, Paddle Inference and ONNX Runtime.
+``` shell
+docker pull paddlepaddle/fastdeploy:0.3.0-gpu-cuda11.4-trt8.4-21.10
+```
+
+You can also build the image yourself, following
+- [How to build the FastDeploy serving image](docs/zh_CN/compile.md)
+
+## Other documents
+- [Serving model repository](docs/zh_CN/model_repository.md) (how to prepare the model directory)
+- [Serving runtime configuration](docs/zh_CN/model_configuration.md) (runtime configuration options)
+- [Serving deployment demos](docs/zh_CN/demo.md)
+  - [YOLOv5 detection](../examples/vision/detection/yolov5/serving/README.md)
diff --git a/serving/docs/zh_CN/compile.md b/serving/docs/zh_CN/compile.md
new file mode 100644
index 000000000..3beae64fb
--- /dev/null
+++ b/serving/docs/zh_CN/compile.md
@@ -0,0 +1 @@
+# Building the FastDeploy serving image
diff --git a/serving/docs/zh_CN/demo.md b/serving/docs/zh_CN/demo.md
new file mode 100644
index 000000000..8fdfa08e7
--- /dev/null
+++ b/serving/docs/zh_CN/demo.md
@@ -0,0 +1 @@
+# Serving deployment demos
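The `COPY serving/TensorRT-8.4.1.5 /opt/` change in serving/Dockerfile implies that the image is now built with the repository root as the Docker build context, with the TensorRT 8.4.1.5 directory unpacked under serving/ beforehand (the build scripts below download it there). A minimal sketch of such a build invocation; the image tag and directory names are illustrative assumptions, not part of this patch:

```shell
# Assumed layout: run from the FastDeploy repository root, with TensorRT
# already unpacked at serving/TensorRT-8.4.1.5. The tag is illustrative only.
cd FastDeploy
docker build -f serving/Dockerfile -t fastdeploy-serving-gpu:dev .
```

Building from the repository root keeps the COPY paths in the Dockerfiles consistent with the `-v`pwd`/..` mounts introduced in the build scripts below.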
diff --git a/serving/scripts/build_fd_backend.sh b/serving/scripts/build_fd_backend.sh
index 5d402c5d9..b7aaae7b4 100644
--- a/serving/scripts/build_fd_backend.sh
+++ b/serving/scripts/build_fd_backend.sh
@@ -24,17 +24,17 @@ if [ ! -d "./cmake-3.18.6-Linux-x86_64/" ]; then
 fi
 
 docker run -it --rm --name build_fd_backend \
-           -v`pwd`:/workspace/fastdeploy \
+           -v`pwd`/..:/workspace/fastdeploy \
            nvcr.io/nvidia/tritonserver:21.10-py3 \
            bash -c \
            'cd /workspace/fastdeploy/serving;
            rm -rf build; mkdir build; cd build;
            apt-get update; apt-get install -y --no-install-recommends rapidjson-dev;
-           export PATH=/workspace/fastdeploy/cmake-3.18.6-Linux-x86_64/bin:$PATH;
+           export PATH=/workspace/fastdeploy/serving/cmake-3.18.6-Linux-x86_64/bin:$PATH;
            cmake .. -DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy-0.0.3 -DTRITON_COMMON_REPO_TAG=r21.10 -DTRITON_CORE_REPO_TAG=r21.10 -DTRITON_BACKEND_REPO_TAG=r21.10;
            make -j`nproc`'
 else
 docker run -it --rm --name build_fd_backend \
-           -v`pwd`:/workspace/fastdeploy \
+           -v`pwd`/..:/workspace/fastdeploy \
            paddlepaddle/fastdeploy:22.09-cpu-only-buildbase \
            bash -c \
            'cd /workspace/fastdeploy/serving;
diff --git a/serving/scripts/build_fd_runtime.sh b/serving/scripts/build_fd_runtime.sh
index 7525ca6a2..723eeb366 100644
--- a/serving/scripts/build_fd_runtime.sh
+++ b/serving/scripts/build_fd_runtime.sh
@@ -29,7 +29,7 @@ if [ ! -d "./TensorRT-8.4.1.5/" ]; then
 fi
 
 docker run -it --rm --name build_fd_runtime \
-           -v`pwd`:/workspace/fastdeploy \
+           -v`pwd`/..:/workspace/fastdeploy \
            nvcr.io/nvidia/tritonserver:21.10-py3-min \
            bash -c \
            'cd /workspace/fastdeploy;
@@ -37,15 +37,15 @@ docker run -it --rm --name build_fd_runtime \
            apt-get update;
            apt-get install -y --no-install-recommends python3-dev python3-pip;
            ln -s /usr/bin/python3 /usr/bin/python;
-           export PATH=/workspace/fastdeploy/cmake-3.18.6-Linux-x86_64/bin:$PATH;
-           cmake .. -DENABLE_TRT_BACKEND=ON -DCMAKE_INSTALL_PREFIX=${PWD}/fastdeploy-0.0.3 -DWITH_GPU=ON -DTRT_DIRECTORY=${PWD}/../TensorRT-8.4.1.5/ -DENABLE_PADDLE_BACKEND=ON -DENABLE_ORT_BACKEND=ON -DENABLE_OPENVINO_BACKEND=ON -DENABLE_VISION=OFF -DBUILD_FASTDEPLOY_PYTHON=OFF -DENABLE_PADDLE_FRONTEND=ON -DENABLE_TEXT=OFF -DLIBRARY_NAME=fastdeploy_runtime;
+           export PATH=/workspace/fastdeploy/serving/cmake-3.18.6-Linux-x86_64/bin:$PATH;
+           cmake .. -DENABLE_TRT_BACKEND=ON -DCMAKE_INSTALL_PREFIX=${PWD}/fastdeploy-0.0.3 -DWITH_GPU=ON -DTRT_DIRECTORY=/workspace/fastdeploy/serving/TensorRT-8.4.1.5/ -DENABLE_PADDLE_BACKEND=ON -DENABLE_ORT_BACKEND=ON -DENABLE_OPENVINO_BACKEND=ON -DENABLE_VISION=OFF -DBUILD_FASTDEPLOY_PYTHON=OFF -DENABLE_PADDLE_FRONTEND=ON -DENABLE_TEXT=OFF -DLIBRARY_NAME=fastdeploy_runtime;
            make -j`nproc`;
            make install'
 else
 docker run -it --rm --name build_fd_runtime \
-           -v`pwd`:/workspace/fastdeploy \
+           -v`pwd`/..:/workspace/fastdeploy \
            paddlepaddle/fastdeploy:22.09-cpu-only-buildbase \
            bash -c \
            'cd /workspace/fastdeploy;
diff --git a/serving/scripts/build_fd_vison.sh b/serving/scripts/build_fd_vison.sh
index e0beb6e7f..59cd923b1 100644
--- a/serving/scripts/build_fd_vison.sh
+++ b/serving/scripts/build_fd_vison.sh
@@ -23,7 +23,7 @@ if [ ! -d "./cmake-3.18.6-Linux-x86_64/" ]; then
 fi
 
 docker run -it --rm --name build_fd_vison \
-           -v`pwd`:/workspace/fastdeploy \
+           -v`pwd`/..:/workspace/fastdeploy \
            nvcr.io/nvidia/tritonserver:21.10-py3-min \
            bash -c \
            'cd /workspace/fastdeploy/python;
@@ -31,7 +31,7 @@ docker run -it --rm --name build_fd_vison \
            apt-get update;
            apt-get install -y --no-install-recommends patchelf python3-dev python3-pip;
            ln -s /usr/bin/python3 /usr/bin/python;
-           export PATH=/workspace/fastdeploy/cmake-3.18.6-Linux-x86_64/bin:$PATH;
+           export PATH=/workspace/fastdeploy/serving/cmake-3.18.6-Linux-x86_64/bin:$PATH;
            export WITH_GPU=ON;
            export ENABLE_ORT_BACKEND=OFF;
            export ENABLE_VISION=ON;
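All three build scripts now mount `pwd`/.. rather than `pwd` at /workspace/fastdeploy, so they evidently expect to be launched from the serving/ directory, where cmake and TensorRT are downloaded, with the parent FastDeploy checkout becoming the container workspace. A hypothetical invocation under that assumption (directory names are illustrative, and any arguments or environment variables the scripts use to switch between GPU and CPU builds are omitted):

```shell
# Assumption: invoked from the serving/ directory of a FastDeploy checkout,
# so `pwd` is <repo>/serving and `pwd`/.. is the repository root that the
# scripts mount at /workspace/fastdeploy inside the build containers.
cd FastDeploy/serving
bash scripts/build_fd_runtime.sh    # builds the FastDeploy runtime library
bash scripts/build_fd_backend.sh    # builds the Triton backend against it
bash scripts/build_fd_vison.sh      # builds the vision Python wheel
```

Running the runtime script first matches the backend's cmake flag `-DFASTDEPLOY_DIR=/workspace/fastdeploy/build/fastdeploy-0.0.3`, which points at the install tree the runtime build produces.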
diff --git a/serving/src/fastdeploy_runtime.cc b/serving/src/fastdeploy_runtime.cc
index b1ed8b6b0..fd541d5f9 100644
--- a/serving/src/fastdeploy_runtime.cc
+++ b/serving/src/fastdeploy_runtime.cc
@@ -315,6 +315,8 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
           //     &runtime_options_->trt_max_workspace_size));
         } else if (param_key == "cache_file") {
           runtime_options_->SetTrtCacheFile(value_string);
+        } else if (param_key == "use_paddle") {
+          runtime_options_->EnablePaddleToTrt();
         }
       }
     }
@@ -1025,12 +1027,13 @@ TRITONSERVER_Error* ModelInstanceState::SetInputTensors(
         input, &input_name, &input_datatype, &input_shape, &input_dims_count,
         nullptr, nullptr));
 
-    if (input_tensors_[input_idx].name != std::string(input_name)) {
+    int index = GetInfoIndex(std::string(input_name), input_tensor_infos_);
+    if (index < 0) {
       auto err = TRITONSERVER_ErrorNew(
           TRITONSERVER_ERROR_INTERNAL,
           (std::string("Input name [") + input_name +
            std::string("] is not one of the FD predictor input: ") +
-           input_tensors_[input_idx].name)
+           input_tensors_[index].name)
              .c_str());
       // SendErrorForResponses(responses, request_count, err);
       return err;
@@ -1075,12 +1078,12 @@ TRITONSERVER_Error* ModelInstanceState::SetInputTensors(
        memory_type = TRITONSERVER_MEMORY_CPU;
        device = fastdeploy::Device::CPU;
      }
-     input_tensors_[input_idx].Resize(
+     input_tensors_[index].Resize(
          batchn_shape, ConvertDataTypeToFD(input_datatype), input_name,
          device);
      collector->ProcessTensor(
          input_name,
-         reinterpret_cast<char*>(input_tensors_[input_idx].MutableData()),
-         input_tensors_[input_idx].Nbytes(), memory_type, device_id);
+         reinterpret_cast<char*>(input_tensors_[index].MutableData()),
+         input_tensors_[index].Nbytes(), memory_type, device_id);
     }
 
     // Finalize...
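Once an image has been built (or pulled as described in the README above), serving is started inside the container against a Triton-style model repository. A rough launch sketch; the `fastdeployserver` command name, its flags, and the ./models repository path are assumptions based on the Triton-derived server, not something this patch defines:

```shell
# Assumptions: a Triton-style model repository at ./models on the host, and a
# server binary named fastdeployserver inside the image that mirrors
# tritonserver's CLI; both are illustrative, not taken from this patch.
docker run -it --rm --gpus all --net=host \
    -v `pwd`/models:/models \
    paddlepaddle/fastdeploy:0.3.0-gpu-cuda11.4-trt8.4-21.10 \
    fastdeployserver --model-repository=/models
```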