mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 09:07:10 +08:00
[LLM] Support deploying LLM models
58 llm/server/scripts/start_server.sh Normal file
@@ -0,0 +1,58 @@
#!/usr/bin/bash

export GLOG_v=0
export GLOG_logtostderr=1
export PYTHONIOENCODING=utf8
export LC_ALL=C.UTF-8

# PaddlePaddle environment variables
export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_fraction_of_gpu_memory_to_use=0.96
export FLAGS_dynamic_static_unified_comm=0
export FLAGS_use_xqa_optim=1
export FLAGS_gemm_use_half_precision_compute_type=0
export NVIDIA_TF32_OVERRIDE=0

# Model hyperparameters
export MP_NUM=${MP_NUM:-"1"}                              # number of GPUs (model parallelism degree)
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}  # visible GPU ids
export MAX_SEQ_LEN=${MAX_SEQ_LEN:-"8192"}
export MAX_DEC_LEN=${MAX_DEC_LEN:-"2048"}
export BATCH_SIZE=${BATCH_SIZE:-"20"}
export BLOCK_BS=${BLOCK_BS:-"4"}
export BLOCK_SIZE=${BLOCK_SIZE:-"64"}
export DTYPE=${DTYPE:-"bfloat16"}
export USE_CACHE_KV_INT8=${USE_CACHE_KV_INT8:-"0"}        # c8 models require this to be set to 1
export BLOCK_RATIO=${BLOCK_RATIO:-"0.75"}
export ENC_DEC_BLOCK_NUM=${ENC_DEC_BLOCK_NUM:-"4"}
export FIRST_TOKEN_ID=${FIRST_TOKEN_ID:-"1"}
export MAX_PREFILL_BATCH=${MAX_PREFILL_BATCH:-"4"}
export STOP_THRESHOLD=${STOP_THRESHOLD:-"0"}
export MODEL_DIR=${MODEL_DIR:-"/models/"}
export DISTRIBUTED_CONFIG=${DISTRIBUTED_CONFIG:-"${MODEL_DIR}/rank_mapping.csv"}
export CONFIG_JSON_FILE=${CONFIG_JSON_FILE:-"config.json"}
export PUSH_MODE_HTTP_WORKERS=${PUSH_MODE_HTTP_WORKERS:-"4"}

# Serving ports
export HTTP_PORT=${HTTP_PORT:-"8110"}
export GRPC_PORT=${GRPC_PORT:-"8811"}
export METRICS_PORT=${METRICS_PORT:-"8722"}
export INFER_QUEUE_PORT=${INFER_QUEUE_PORT:-"8813"}
export PUSH_MODE_HTTP_PORT=${PUSH_MODE_HTTP_PORT:-"9965"}

mkdir -p log
rm -rf console.log log/*
rm -rf /dev/shm/*

# Start the service
echo "start serving ..."

tritonserver --exit-timeout-secs 100 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
             --cuda-memory-pool-byte-size 2:0 --cuda-memory-pool-byte-size 3:0 --cuda-memory-pool-byte-size 4:0 \
             --cuda-memory-pool-byte-size 5:0 --cuda-memory-pool-byte-size 6:0 --cuda-memory-pool-byte-size 7:0 \
             --pinned-memory-pool-byte-size 0 --model-repository llm_model/ \
             --allow-http false \
             --grpc-port=${GRPC_PORT} \
             --metrics-port=${METRICS_PORT} \
             --log-file log/server.log --log-info true > log/console.log 2>&1 &

echo "For the model server startup logs, see ${PWD}/log/server.log and ${PWD}/log/workerlog.0"
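Because every parameter above is read with a ${VAR:-default} expansion, a deployment can override any of them from the environment when invoking the script. A minimal launch sketch follows; the model path and GPU layout are illustrative assumptions, not values shipped with this commit:

# Hypothetical two-GPU bfloat16 deployment; /models/my-llm is an example path.
MP_NUM=2 CUDA_VISIBLE_DEVICES=0,1 DTYPE=bfloat16 \
MODEL_DIR=/models/my-llm \
bash llm/server/scripts/start_server.sh

# HTTP inference is disabled (--allow-http false), but Triton still serves
# Prometheus metrics, so the metrics endpoint works as a cheap readiness probe.
curl -s "localhost:${METRICS_PORT:-8722}/metrics" | head

# Follow the startup logs while the model loads.
tail -f log/server.log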
69 llm/server/scripts/stop_server.sh Normal file
@@ -0,0 +1,69 @@
#!/bin/bash

pids=($(ps aux | grep -E 'tritonserver' | grep -v grep | awk '{print $2}'))

if [ ${#pids[@]} -eq 0 ]; then
    echo "No running tritonserver processes found"
    timeout=1
else
    timeout=300
fi

# Interrupt the processes gracefully (SIGINT)
for pid in "${pids[@]}"; do
    echo "Interrupting process $pid"
    kill -2 "$pid"
done

# An optional first argument overrides the shutdown timeout (in seconds)
timeout_interval=$1
if [ -n "$timeout_interval" ]; then
    timeout=$timeout_interval
    echo $timeout
fi

start_time=$(date +%s)

while : ; do
    current_time=$(date +%s)

    elapsed_time=$((current_time - start_time))

    if [ $elapsed_time -ge $timeout ]; then
        echo "tritonserver processes did not exit within the timeout"
        echo "Force killing all related processes"
        pids=$(ps auxww | grep -E "tritonserver|triton_python_backend_stub|new_infer.py|infer|multiprocessing.resource_tracker|paddle.distributed.launch|task_queue_manager|app.py|memory_log.py|spawn_main" | grep -v grep | grep -v start_both | awk '{print $2}')
        echo $pids
        for pid in ${pids[@]}; do
            kill -9 ${pid}
        done
        break
    fi

    pids=$(ps auxww | grep -E "tritonserver|triton_python_backend_stub|new_infer.py|multiprocessing.resource_tracker|paddle.distributed.launch|app.py|memory_log.py|spawn_main" | grep -v grep | awk '{print $2}')
    array=($(echo "$pids" | tr ' ' '\n'))

    if [ ${#array[*]} -ne 0 ]; then
        echo "Processes are not fully cleaned up yet, waiting for cleanup to finish"
        sleep 1
    else
        echo "All processes have been cleaned up"
        break
    fi
done

manager_pids=$(ps auxww | grep "task_queue_manager" | grep -v grep | awk '{print $2}')
echo $manager_pids
for in_pid in ${manager_pids[@]}; do
    kill -9 ${in_pid}
done
echo 'end kill queue manager'

health_checker_pids=$(ps auxww | grep "health.py" | grep -v grep | awk '{print $2}')
echo $health_checker_pids
for in_pid in ${health_checker_pids[@]}; do
    kill -9 ${in_pid}
done
echo 'end kill health checker'

echo "All processes terminated"
exit 0
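The stop script escalates: it first sends SIGINT (kill -2) to tritonserver for a graceful shutdown, polls for up to timeout seconds, then falls back to kill -9 on every related process. Since the timeout is the optional first argument, typical invocations look like this sketch:

# Wait up to the default 300 s for a graceful exit before force-killing.
bash llm/server/scripts/stop_server.sh

# Or escalate to kill -9 after only 30 s.
bash llm/server/scripts/stop_server.sh 30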