[Serving] Support FastDeploy XPU Triton Server (#1994)

* [patchelf] fix patchelf error for inference xpu

* [serving] add xpu dockerfile and support fd server

* [Serving] support XPU + Triton

* [Dockerfile] update xpu triton docker file -> paddle 0.0.0

* [Dockerfile] add comments for xpu triton dockerfile

* [Runtime] fix xpu infer error

* [XPU] update xpu dockerfile

* add xpu triton server docs

* update xpu triton server docs
Committed by DefTruth on 2023-05-29 14:38:25 +08:00 (via GitHub)
parent 3a9904411a
commit 434b48dda5
10 changed files with 517 additions and 49 deletions

serving/src/fastdeploy_runtime.cc (171 changed lines) Executable file → Normal file

@@ -199,6 +199,9 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
         runtime_options_->UseOrtBackend();
       } else if (name == "paddle") {
         runtime_options_->UsePaddleBackend();
+      } else if (name == "paddle_xpu") {
+        // Note(qiuyanjun): use XPU via paddle inference backend.
+        runtime_options_->UsePaddleInferBackend();
       } else if (name == "openvino") {
         runtime_options_->UseOpenVINOBackend();
       } else if (name != "") {
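With this change a deployed model can opt into the XPU path by naming the accelerator paddle_xpu in its config.pbtxt. A minimal sketch of such an entry follows; it assumes the accelerator is declared under cpu_execution_accelerator in the same way as the existing CPU-side backends, so check the XPU serving docs added by this PR for the exact layout:

    optimization {
      execution_accelerators {
        cpu_execution_accelerator : [
          {
            name : "paddle_xpu"
          }
        ]
      }
    }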
@@ -212,44 +215,118 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
       }
       triton::common::TritonJson::Value params;
-      if (ea.Find("parameters", &params)) {
-        std::vector<std::string> param_keys;
-        THROW_IF_BACKEND_MODEL_ERROR(params.Members(&param_keys));
-        for (const auto& param_key : param_keys) {
-          std::string value_string;
-          THROW_IF_BACKEND_MODEL_ERROR(
-              params.MemberAsString(param_key.c_str(), &value_string));
-          if (param_key == "cpu_threads") {
-            int cpu_thread_num;
-            THROW_IF_BACKEND_MODEL_ERROR(
-                ParseIntValue(value_string, &cpu_thread_num));
-            runtime_options_->SetCpuThreadNum(cpu_thread_num);
-          } else if (param_key == "use_mkldnn") {
-            bool pd_enable_mkldnn;
-            THROW_IF_BACKEND_MODEL_ERROR(
-                ParseBoolValue(value_string, &pd_enable_mkldnn));
-            runtime_options_->SetPaddleMKLDNN(pd_enable_mkldnn);
-          } else if (param_key == "use_paddle_log") {
-            bool use_paddle_log;
-            THROW_IF_BACKEND_MODEL_ERROR(
-                ParseBoolValue(value_string, &use_paddle_log));
-            runtime_options_->paddle_infer_option.enable_log_info =
-                use_paddle_log;
-          } else if (param_key == "num_streams") {
-            int num_streams;
-            THROW_IF_BACKEND_MODEL_ERROR(
-                ParseIntValue(value_string, &num_streams));
-            runtime_options_->openvino_option.num_streams = num_streams;
-          } else if (param_key == "is_clone") {
-            THROW_IF_BACKEND_MODEL_ERROR(
-                ParseBoolValue(value_string, &is_clone_));
-          } else if (param_key == "use_ipu") {
-            // runtime_options_->UseIpu();
-          } else if (param_key == "encryption_key") {
-            runtime_options_->SetEncryptionKey(value_string);
-          }
-        }
-      }
+      if (name == "paddle_xpu") {
+        // parse parameters for cpu host + xpu device.
+        if (ea.Find("parameters", &params)) {
+          std::vector<std::string> param_keys;
+          THROW_IF_BACKEND_MODEL_ERROR(params.Members(&param_keys));
+          // default settings for XPU.
+          int kunlunxin_id = 0;
+          int l3_workspace_size = 0xfffc00;
+          bool locked = false;
+          bool autotune = true;
+          std::string autotune_file = "";
+          std::string precision = "int16";
+          bool adaptive_seqlen = false;
+          bool enable_multi_stream = false;
+          // for future use (only support lite backend now).
+          int gm_default_size = 0;
+          // common settings for cpu host.
+          int cpu_thread_num = -1;
+          bool use_paddle_log = false;
+          for (const auto& param_key : param_keys) {
+            std::string value_string;
+            THROW_IF_BACKEND_MODEL_ERROR(
+                params.MemberAsString(param_key.c_str(), &value_string));
+            // parse common settings for cpu host.
+            if (param_key == "cpu_threads") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseIntValue(value_string, &cpu_thread_num));
+              runtime_options_->SetCpuThreadNum(cpu_thread_num);
+            } else if (param_key == "use_paddle_log") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &use_paddle_log));
+              runtime_options_->paddle_infer_option.enable_log_info =
+                  use_paddle_log;
+            } else if (param_key == "is_clone") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &is_clone_));
+            } else if (param_key == "encryption_key") {
+              runtime_options_->SetEncryptionKey(value_string);
+              // parse common settings for xpu device.
+            } else if (param_key == "kunlunxin_id") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseIntValue(value_string, &kunlunxin_id));
+            } else if (param_key == "l3_workspace_size") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseIntValue(value_string, &l3_workspace_size));
+            } else if (param_key == "locked") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &locked));
+            } else if (param_key == "autotune") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &autotune));
+            } else if (param_key == "precision") {
+              precision = value_string;
+            } else if (param_key == "adaptive_seqlen") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &adaptive_seqlen));
+            } else if (param_key == "enable_multi_stream") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &enable_multi_stream));
+            } else if (param_key == "gm_default_size") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseIntValue(value_string, &gm_default_size));
+            }
+          }
+          // initialize xpu device settings
+          runtime_options_->UseKunlunXin(
+              kunlunxin_id, l3_workspace_size, locked, autotune,
+              autotune_file, precision, adaptive_seqlen, enable_multi_stream,
+              int64_t(gm_default_size));
+        }
+      } else {
+        // parse parameters for cpu only
+        if (ea.Find("parameters", &params)) {
+          std::vector<std::string> param_keys;
+          THROW_IF_BACKEND_MODEL_ERROR(params.Members(&param_keys));
+          for (const auto& param_key : param_keys) {
+            std::string value_string;
+            THROW_IF_BACKEND_MODEL_ERROR(
+                params.MemberAsString(param_key.c_str(), &value_string));
+            if (param_key == "cpu_threads") {
+              int cpu_thread_num;
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseIntValue(value_string, &cpu_thread_num));
+              runtime_options_->SetCpuThreadNum(cpu_thread_num);
+            } else if (param_key == "use_mkldnn") {
+              bool pd_enable_mkldnn;
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &pd_enable_mkldnn));
+              runtime_options_->SetPaddleMKLDNN(pd_enable_mkldnn);
+            } else if (param_key == "use_paddle_log") {
+              bool use_paddle_log;
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &use_paddle_log));
+              runtime_options_->paddle_infer_option.enable_log_info =
+                  use_paddle_log;
+            } else if (param_key == "num_streams") {
+              int num_streams;
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseIntValue(value_string, &num_streams));
+              runtime_options_->openvino_option.num_streams = num_streams;
+            } else if (param_key == "is_clone") {
+              THROW_IF_BACKEND_MODEL_ERROR(
+                  ParseBoolValue(value_string, &is_clone_));
+            } else if (param_key == "use_ipu") {
+              // runtime_options_->UseIpu();
+            } else if (param_key == "encryption_key") {
+              runtime_options_->SetEncryptionKey(value_string);
+            }
+          }
+        }
+      } // end 'name == "paddle_xpu"'
     }
   }
 }
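Every key parsed above comes from the accelerator's parameters map, and anything left out keeps the defaults initialized at the top of the branch (kunlunxin_id 0, l3_workspace_size 0xfffc00, precision "int16", autotune on, and so on). Below is a hedged sketch of a config.pbtxt fragment that overrides a few of them; the key names are taken from the code above, while the values and the cpu_execution_accelerator placement are illustrative assumptions rather than the documented layout:

    optimization {
      execution_accelerators {
        cpu_execution_accelerator : [
          {
            name : "paddle_xpu"
            parameters { key: "kunlunxin_id" value: "0" }
            parameters { key: "l3_workspace_size" value: "16776192" }
            parameters { key: "precision" value: "int16" }
            parameters { key: "cpu_threads" value: "4" }
            parameters { key: "use_paddle_log" value: "false" }
          }
        ]
      }
    }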
@@ -422,7 +499,7 @@ TRITONSERVER_Error* ModelState::LoadModel(
     }
   }
-// GPU
+  // GPU
 #ifdef TRITON_ENABLE_GPU
   if ((instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) ||
       (instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_AUTO)) {
@@ -432,8 +509,9 @@ TRITONSERVER_Error* ModelState::LoadModel(
     runtime_options_->UseCpu();
   }
 #else
-  if (runtime_options_->device != fastdeploy::Device::IPU) {
-    // If Device is set to IPU, just skip CPU setting.
+  if ((runtime_options_->device != fastdeploy::Device::IPU) &&
+      (runtime_options_->device != fastdeploy::Device::KUNLUNXIN)) {
+    // If Device is set to IPU/XPU, just skip CPU setting.
     runtime_options_->UseCpu();
   }
 #endif // TRITON_ENABLE_GPU
@@ -972,7 +1050,7 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests,
       SetInputTensors(total_batch_size, requests, request_count, &responses,
                       &collector, &cuda_copy));
-// Wait for any in-flight input tensor copies to complete.
+  // Wait for any in-flight input tensor copies to complete.
 #ifdef TRITON_ENABLE_GPU
   if (cuda_copy) {
     cudaStreamSynchronize(CudaStream());
@@ -1146,15 +1224,16 @@ TRITONSERVER_Error* ModelInstanceState::ReadOutputTensors(
     const uint32_t request_count,
     std::vector<TRITONBACKEND_Response*>* responses) {
   // r22.12
-  BackendOutputResponder responder(
-      requests, request_count, responses,
-      model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
-      model_state_->EnablePinnedOutput(), CudaStream());
+  // BackendOutputResponder responder(
+  //     requests, request_count, responses,
+  //     model_state_->TritonMemoryManager(), model_state_->MaxBatchSize() > 0,
+  //     model_state_->EnablePinnedOutput(), CudaStream());
   // r21.10
-  // BackendOutputResponder responder(
-  //     requests, request_count, responses, StateForModel()->MaxBatchSize(),
-  //     StateForModel()->TritonMemoryManager(),
-  //     StateForModel()->EnablePinnedOutput(), CudaStream());
+  BackendOutputResponder responder(
+      requests, request_count, responses, StateForModel()->MaxBatchSize(),
+      StateForModel()->TritonMemoryManager(),
+      StateForModel()->EnablePinnedOutput(), CudaStream());
   // Use to hold string output contents
   bool cuda_copy = false;