[LLM] Support deploy LLM model

2025-10-06 17:17:14 +08:00 · 2024-08-29 19:38:11 +08:00
parent cd0ee79c91
commit 2d7d0ee92e
37 changed files with 4521 additions and 3 deletions
--- a/llm/server/scripts/start_server.sh
+++ b/llm/server/scripts/start_server.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+#
+# Launch the Triton inference server for LLM serving in the background.
+#
+# Settings assigned with a fallback default below honor a value that is
+# already present in the environment. The GLOG, locale, FLAGS and NVIDIA
+# settings are always forced to the values listed here.
+# Run it from the directory that contains the llm_model/ model repository.
+# Logs go to the log/ directory under the current working directory.
+
+export GLOG_v=0
+export GLOG_logtostderr=1
+export PYTHONIOENCODING=utf8
+export LC_ALL=C.UTF-8
+
+# PaddlePaddle environment variables
+export FLAGS_allocator_strategy=naive_best_fit
+export FLAGS_fraction_of_gpu_memory_to_use=0.96
+export FLAGS_dynamic_static_unified_comm=0
+export FLAGS_use_xqa_optim=1
+export FLAGS_gemm_use_half_precision_compute_type=0
+export NVIDIA_TF32_OVERRIDE=0
+
+# Model hyperparameters
+export MP_NUM="${MP_NUM:-1}"                                # number of GPUs
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"    # GPU device ids
+export MAX_SEQ_LEN="${MAX_SEQ_LEN:-8192}"
+export MAX_DEC_LEN="${MAX_DEC_LEN:-2048}"
+export BATCH_SIZE="${BATCH_SIZE:-20}"
+export BLOCK_BS="${BLOCK_BS:-4}"
+export BLOCK_SIZE="${BLOCK_SIZE:-64}"
+export DTYPE="${DTYPE:-bfloat16}"
+export USE_CACHE_KV_INT8="${USE_CACHE_KV_INT8:-0}"  # c8 models require this to be 1
+export BLOCK_RATIO="${BLOCK_RATIO:-0.75}"
+export ENC_DEC_BLOCK_NUM="${ENC_DEC_BLOCK_NUM:-4}"
+export FIRST_TOKEN_ID="${FIRST_TOKEN_ID:-1}"
+export MAX_PREFILL_BATCH="${MAX_PREFILL_BATCH:-4}"
+export STOP_THRESHOLD="${STOP_THRESHOLD:-0}"
+export MODEL_DIR="${MODEL_DIR:-/models/}"
+export DISTRIBUTED_CONFIG="${DISTRIBUTED_CONFIG:-${MODEL_DIR}/rank_mapping.csv}"
+export CONFIG_JSON_FILE="${CONFIG_JSON_FILE:-config.json}"
+export PUSH_MODE_HTTP_WORKERS="${PUSH_MODE_HTTP_WORKERS:-4}"
+
+# Serving ports
+export HTTP_PORT="${HTTP_PORT:-8110}"
+export GRPC_PORT="${GRPC_PORT:-8811}"
+export METRICS_PORT="${METRICS_PORT:-8722}"
+export INFER_QUEUE_PORT="${INFER_QUEUE_PORT:-8813}"
+export PUSH_MODE_HTTP_PORT="${PUSH_MODE_HTTP_PORT:-9965}"
+
+# Recreate the log directory and drop logs from earlier runs.
+mkdir -p log
+rm -rf console.log log/*
+# NOTE for review: the next command removes every entry under /dev/shm,
+# including shared memory that belongs to unrelated processes on this host.
+# Presumably it clears leftovers from a previous run. Confirm before use
+# on a shared machine.
+rm -rf /dev/shm/*
+
+# Start the service
+echo "start serving ..."
+
+# Triton is started in the background. Its stdout and stderr are redirected
+# to log/console.log and its own log file is log/server.log. The CUDA memory
+# pool is set to 0 bytes for GPU ids 0 through 7 and the pinned memory pool
+# to 0 bytes. The Triton HTTP endpoint is disabled.
+tritonserver --exit-timeout-secs 100 --cuda-memory-pool-byte-size 0:0 --cuda-memory-pool-byte-size 1:0 \
+                 --cuda-memory-pool-byte-size 2:0 --cuda-memory-pool-byte-size 3:0 --cuda-memory-pool-byte-size 4:0 \
+                 --cuda-memory-pool-byte-size 5:0 --cuda-memory-pool-byte-size 6:0 --cuda-memory-pool-byte-size 7:0 \
+                 --pinned-memory-pool-byte-size 0 --model-repository llm_model/ \
+                 --allow-http false \
+                 --grpc-port="${GRPC_PORT}" \
+                 --metrics-port="${METRICS_PORT}" \
+                 --log-file log/server.log --log-info true  > log/console.log 2>&1 &
+# Report where the startup logs can be found. The message is user facing.
+echo "模型服务的启动日志，请查看" "${PWD}/log/server.log 和 ${PWD}/log/workerlog.0 "