mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 09:07:10 +08:00
[LLM] Support deploying LLM models
58 llm/server/scripts/start_server.sh Normal file
@@ -0,0 +1,58 @@
#!/usr/bin/bash

export GLOG_v=0
export GLOG_logtostderr=1
export PYTHONIOENCODING=utf8
export LC_ALL=C.UTF-8

# PaddlePaddle environment variables
export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_fraction_of_gpu_memory_to_use=0.96
export FLAGS_dynamic_static_unified_comm=0
export FLAGS_use_xqa_optim=1
export FLAGS_gemm_use_half_precision_compute_type=0
export NVIDIA_TF32_OVERRIDE=0

# Model hyperparameters
export MP_NUM=${MP_NUM:-"1"}                              # number of GPUs (model parallelism degree)
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}  # visible GPU ids
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-"8192"}
export MAX_DEC_LEN=${MAX_DEC_LEN:-"2048"}
export BATCH_SIZE=${BATCH_SIZE:-"20"}
export BLOCK_BS=${BLOCK_BS:-"4"}
export BLOCK_SIZE=${BLOCK_SIZE:-"64"}
export DTYPE=${DTYPE:-"bfloat16"}
export USE_CACHE_KV_INT8=${USE_CACHE_KV_INT8:-"0"}        # c8 models require this to be set to 1
export BLOCK_RATIO=${BLOCK_RATIO:-"0.75"}
export ENC_DEC_BLOCK_NUM=${ENC_DEC_BLOCK_NUM:-"4"}
export FIRST_TOKEN_ID=${FIRST_TOKEN_ID:-"1"}
export MAX_PREFILL_BATCH=${MAX_PREFILL_BATCH:-"4"}
export STOP_THRESHOLD=${STOP_THRESHOLD:-"0"}
export MODEL_DIR=${MODEL_DIR:-"/models/"}
export DISTRIBUTED_CONFIG=${DISTRIBUTED_CONFIG:-"${MODEL_DIR}/rank_mapping.csv"}
export CONFIG_JSON_FILE=${CONFIG_JSON_FILE:-"config.json"}
export PUSH_MODE_HTTP_WORKERS=${PUSH_MODE_HTTP_WORKERS:-"4"}

# Serving ports
export HTTP_PORT=${HTTP_PORT:-"8110"}
export GRPC_PORT=${GRPC_PORT:-"8811"}
export METRICS_PORT=${METRICS_PORT:-"8722"}
export INFER_QUEUE_PORT=${INFER_QUEUE_PORT:-"8813"}
export PUSH_MODE_HTTP_PORT=${PUSH_MODE_HTTP_PORT:-"9965"}

mkdir -p log
rm -rf console.log log/*
rm -rf /dev/shm/*

# Start the service
echo "start serving ..."

tritonserver --exit-timeout-secs 100 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
             --cuda-memory-pool-byte-size 2:0 --cuda-memory-pool-byte-size 3:0 --cuda-memory-pool-byte-size 4:0 \
             --cuda-memory-pool-byte-size 5:0 --cuda-memory-pool-byte-size 6:0 --cuda-memory-pool-byte-size 7:0 \
             --pinned-memory-pool-byte-size 0 --model-repository llm_model/ \
             --allow-http false \
             --grpc-port=${GRPC_PORT} \
             --metrics-port=${METRICS_PORT} \
             --log-file log/server.log --log-info true > log/console.log 2>&1 &

echo "For the model server startup logs, see ${PWD}/log/server.log and ${PWD}/log/workerlog.0"
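Because every parameter above is read with a ${VAR:-default} expansion, a deployment can override any of them from the environment when invoking the script. A minimal launch sketch follows; the model path and GPU layout are illustrative assumptions, not values shipped with this commit:

# Hypothetical two-GPU bfloat16 deployment; /models/my-llm is an example path.
MP_NUM=2 CUDA_VISIBLE_DEVICES=0,1 DTYPE=bfloat16 \
MODEL_DIR=/models/my-llm \
bash llm/server/scripts/start_server.sh

# HTTP inference is disabled (--allow-http false), but Triton still serves
# Prometheus metrics, so the metrics endpoint works as a cheap readiness probe.
curl -s "localhost:${METRICS_PORT:-8722}/metrics" | head

# Follow the startup logs while the model loads.
tail -f log/server.log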
69 llm/server/scripts/stop_server.sh Normal file
@@ -0,0 +1,69 @@
#!/bin/bash

pids=($(ps aux | grep -E 'tritonserver' | grep -v grep | awk '{print $2}'))

if [ ${#pids[@]} -eq 0 ]; then
    echo "No running tritonserver processes found"
    timeout=1
else
    timeout=300
fi

# Interrupt the processes gracefully (SIGINT)
for pid in "${pids[@]}"; do
    echo "Interrupting process $pid"
    kill -2 "$pid"
done

# An optional first argument overrides the shutdown timeout (in seconds)
timeout_interval=$1
if [ -n "$timeout_interval" ]; then
    timeout=$timeout_interval
    echo $timeout
fi

start_time=$(date +%s)

while : ; do
    current_time=$(date +%s)

    elapsed_time=$((current_time - start_time))

    if [ $elapsed_time -ge $timeout ]; then
        echo "tritonserver processes did not exit within the timeout"
        echo "Force killing all related processes"
        pids=$(ps auxww | grep -E "tritonserver|triton_python_backend_stub|new_infer.py|infer|multiprocessing.resource_tracker|paddle.distributed.launch|task_queue_manager|app.py|memory_log.py|spawn_main" | grep -v grep | grep -v start_both | awk '{print $2}')
        echo $pids
        for pid in ${pids[@]}; do
            kill -9 ${pid}
        done
        break
    fi

    pids=$(ps auxww | grep -E "tritonserver|triton_python_backend_stub|new_infer.py|multiprocessing.resource_tracker|paddle.distributed.launch|app.py|memory_log.py|spawn_main" | grep -v grep | awk '{print $2}')
    array=($(echo "$pids" | tr ' ' '\n'))

    if [ ${#array[*]} -ne 0 ]; then
        echo "Processes are not fully cleaned up yet, waiting for cleanup to finish"
        sleep 1
    else
        echo "All processes have been cleaned up"
        break
    fi
done

manager_pids=$(ps auxww | grep "task_queue_manager" | grep -v grep | awk '{print $2}')
echo $manager_pids
for in_pid in ${manager_pids[@]}; do
    kill -9 ${in_pid}
done
echo 'end kill queue manager'

health_checker_pids=$(ps auxww | grep "health.py" | grep -v grep | awk '{print $2}')
echo $health_checker_pids
for in_pid in ${health_checker_pids[@]}; do
    kill -9 ${in_pid}
done
echo 'end kill health checker'

echo "All processes terminated"
exit 0
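The stop script escalates: it first sends SIGINT (kill -2) to tritonserver for a graceful shutdown, polls for up to timeout seconds, then falls back to kill -9 on every related process. Since the timeout is the optional first argument, typical invocations look like this sketch:

# Wait up to the default 300 s for a graceful exit before force-killing.
bash llm/server/scripts/stop_server.sh

# Or escalate to kill -9 after only 30 s.
bash llm/server/scripts/stop_server.sh 30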