Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00

[PD Disaggregation] remove splitwise deployment on single node and refine the code (#4891)

* remove splitwise deployment on single node and refine the code
* up
* up
* up
* add test
* up
@@ -29,48 +29,6 @@ In multi-instance scenarios, each incoming request needs to be assigned to diffe

 ## Usage Instructions

-### Single-machine Disaggregated Deployment
-
-#### Online Inference Service
-
-Use the following commands for service deployment:
-
-**Prefill Instance**
-
-```bash
-export FD_LOG_DIR="log_prefill"
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model ERNIE-4.5-300B-A47B-BF16 \
-    --port 8180 --metrics-port 8181 \
-    --engine-worker-queue-port 8182 \
-    --cache-queue-port 8183 \
-    --tensor-parallel-size 4 \
-    --quantization wint4 \
-    --splitwise-role "prefill"
-```
-
-**Decode Instance**
-
-```bash
-export FD_LOG_DIR="log_decode"
-export CUDA_VISIBLE_DEVICES=4,5,6,7
-# Note: innode-prefill-ports should specify the engine-worker-queue-port of the Prefill service
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model ERNIE-4.5-300B-A47B-BF16 \
-    --port 8184 --metrics-port 8185 \
-    --engine-worker-queue-port 8186 \
-    --cache-queue-port 8187 \
-    --tensor-parallel-size 4 \
-    --quantization wint4 \
-    --innode-prefill-ports 8182 \
-    --splitwise-role "decode"
-```
-
-Note: When requesting single-machine PD disaggregated service, **users should request the Decode service's port**.
-
-#### Offline Inference Service
-
-Refer to the example code `offline_disaggregated_demo.py` in the `fastdeploy/demo` directory for offline inference service deployment.
-
 ### Multi-machine Disaggregated Deployment

 #### Prerequisite: Redis

@@ -118,12 +76,14 @@ For multi-machine deployment, confirm that the NIC supports RDMA and that all no

 ```bash
 export FD_LOG_DIR="log_prefill"
 export CUDA_VISIBLE_DEVICES=0,1,2,3
+export ENABLE_V1_KVCACHE_SCHEDULER=0
 echo "set RDMA NICS"
 export $(bash scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}"
 python -m fastdeploy.entrypoints.openai.api_server \
     --model ERNIE-4.5-300B-A47B-BF16 \
-    --port 8180 --metrics-port 8181 \
+    --port 8180 \
+    --metrics-port 8181 \
     --engine-worker-queue-port 8182 \
     --cache-queue-port 8183 \
     --tensor-parallel-size 4 \

@@ -143,12 +103,14 @@ python -m fastdeploy.entrypoints.openai.api_server \

 ```bash
 export FD_LOG_DIR="log_decode"
 export CUDA_VISIBLE_DEVICES=4,5,6,7
+export ENABLE_V1_KVCACHE_SCHEDULER=0
 echo "set RDMA NICS"
 export $(bash scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}"
 python -m fastdeploy.entrypoints.openai.api_server \
     --model ERNIE-4.5-300B-A47B-BF16 \
-    --port 8184 --metrics-port 8185 \
+    --port 8184 \
+    --metrics-port 8185 \
     --engine-worker-queue-port 8186 \
     --cache-queue-port 8187 \
     --tensor-parallel-size 4 \
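For a quick sanity check of a deployment like the one above, the OpenAI-compatible endpoint can be exercised with `curl`. This mirrors the requests used by the test scripts later in this commit; the port (8180) is illustrative and should match your actual deployment:

```bash
# Minimal smoke test against the OpenAI-compatible server (illustrative port).
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [
            {"role": "user", "content": "hello"}
        ],
        "max_tokens": 20,
        "stream": true
    }'
```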
@@ -29,49 +29,6 @@

 ## Usage Instructions

-### Single-machine Disaggregated Deployment
-
-#### Online Inference Service
-
-Deploy the service with the following commands:
-
-**prefill instance**
-
-```bash
-export FD_LOG_DIR="log_prefill"
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model ERNIE-4.5-300B-A47B-BF16 \
-    --port 8180 --metrics-port 8181 \
-    --engine-worker-queue-port 8182 \
-    --cache-queue-port 8183 \
-    --tensor-parallel-size 4 \
-    --quantization wint4 \
-    --splitwise-role "prefill"
-```
-
-**decode instance**
-
-```bash
-export FD_LOG_DIR="log_decode"
-export CUDA_VISIBLE_DEVICES=4,5,6,7
-# Note: innode-prefill-ports should be set to the engine-worker-queue-port of the Prefill service
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model ERNIE-4.5-300B-A47B-BF16 \
-    --port 8184 --metrics-port 8185 \
-    --engine-worker-queue-port 8186 \
-    --cache-queue-port 8187 \
-    --tensor-parallel-size 4 \
-    --quantization wint4 \
-    --innode-prefill-ports 8182 \
-    --splitwise-role "decode"
-```
-
-Note: when requesting the single-machine PD disaggregated service, **users must send requests to the Decode service's port**.
-
-#### Offline Inference Service
-
-Refer to the `offline_disaggregated_demo.py` example code in the `fastdeploy/demo` directory to deploy an offline inference service.
-
 ### Multi-machine Disaggregated Deployment

 #### Prerequisite: Redis

@@ -120,6 +77,7 @@ sudo systemctl start redis

 export FD_LOG_DIR="log_prefill"
 export CUDA_VISIBLE_DEVICES=0,1,2,3
+export ENABLE_V1_KVCACHE_SCHEDULER=0
 echo "set RDMA NICS"
 export $(bash scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}"

@@ -146,6 +104,7 @@ python -m fastdeploy.entrypoints.openai.api_server \

 ```bash
 export FD_LOG_DIR="log_decode"
 export CUDA_VISIBLE_DEVICES=4,5,6,7
+export ENABLE_V1_KVCACHE_SCHEDULER=0
 echo "set RDMA NICS"
 export $(bash scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS ${KVCACHE_RDMA_NICS}"
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -e

+# Test mixed server + router
+
 wait_for_health() {
     local server_port=$1
     while true; do

@@ -16,7 +18,6 @@ wait_for_health() {

 # prepare environment
 MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 # MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=0

@@ -25,13 +26,16 @@ export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
 unset http_proxy && unset https_proxy
 rm -rf log_*

+S1_PORT=52400
+S2_PORT=52500
+ROUTER_PORT=52600
+
 # start router
 export FD_LOG_DIR="log_router"
 mkdir -p ${FD_LOG_DIR}

-router_port=9000
 nohup python -m fastdeploy.router.launch \
-    --port ${router_port} \
+    --port ${ROUTER_PORT} \
     2>&1 >${FD_LOG_DIR}/nohup &
 sleep 1

@@ -42,16 +46,16 @@ mkdir -p ${FD_LOG_DIR}

 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 8100 \
-    --metrics-port 8101 \
-    --engine-worker-queue-port 8102 \
-    --cache-queue-port 8103 \
+    --port ${S1_PORT} \
+    --metrics-port $((S1_PORT + 1)) \
+    --engine-worker-queue-port $((S1_PORT + 2)) \
+    --cache-queue-port $((S1_PORT + 3)) \
     --max-model-len 32768 \
-    --router "0.0.0.0:${router_port}" \
+    --router "0.0.0.0:${ROUTER_PORT}" \
     2>&1 >${FD_LOG_DIR}/nohup &
 sleep 1

-wait_for_health 8100
+wait_for_health ${S1_PORT}

 # start modelserver 1
 export CUDA_VISIBLE_DEVICES=1

@@ -60,12 +64,24 @@ mkdir -p ${FD_LOG_DIR}

 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 8200 \
-    --metrics-port 8201 \
-    --engine-worker-queue-port 8202 \
-    --cache-queue-port 8203 \
+    --port ${S2_PORT} \
+    --metrics-port $((S2_PORT + 1)) \
+    --engine-worker-queue-port $((S2_PORT + 2)) \
+    --cache-queue-port $((S2_PORT + 3)) \
     --max-model-len 32768 \
-    --router "0.0.0.0:${router_port}" \
+    --router "0.0.0.0:${ROUTER_PORT}" \
     2>&1 >${FD_LOG_DIR}/nohup &

-wait_for_health 8200
+wait_for_health ${S2_PORT}

+# send request
+sleep 10  # make sure server is registered to router
+curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": [
+            {"role": "user", "content": "hello"}
+        ],
+        "max_tokens": 20,
+        "stream": true
+    }'
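These scripts poll the `/health` endpoint in an unbounded loop, so a failed server start hangs the test forever. A bounded variant of the same helper (a sketch, not part of this commit) gives up after a retry budget:

```bash
# Bounded variant of the scripts' wait_for_health helper (illustrative).
wait_for_health_bounded() {
    local server_port=$1
    local max_retries=${2:-60}
    local i
    for ((i = 0; i < max_retries; i++)); do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            return 0
        fi
        echo "Service not ready. Retrying in 2s..."
        sleep 2
    done
    echo "Service on port ${server_port} failed to become healthy" >&2
    return 1
}
```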
@@ -2,9 +2,9 @@
 set -e

 # Test splitwise deployment
-# v0 requires prefill and decode in one node and it uses local scheduler
-# v1 supports prefill and decode in multi node and it uses splitwise scheduler
-# v2 supports prefill and decode in multi node and it uses router and local scheduler
+# There are two methods for splitwise deployment:
+# v0: using splitwise_scheduler or dp_scheduler
+# v1: using local_scheduler + router

 wait_for_health() {
     local server_port=$1

@@ -19,48 +19,97 @@ wait_for_health() {
     done
 }

 # prepare environment
 MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 # MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"
 aistudio download --model ${MODEL_NAME}

 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=1
 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

 SCRIPT_PATH=$(readlink -f "$0")
 SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
 export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
 echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
 if [ -z "${KVCACHE_RDMA_NICS}" ]; then
     echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
     exit 1
 fi

 unset http_proxy && unset https_proxy
 rm -rf log_*

+P_PORT=52400
+D_PORT=52500
+REDIS_PORT=56388
+
+# start redis
+if ! redis-cli -p ${REDIS_PORT} ping &>/dev/null; then
+    echo "Redis is not running. Starting redis-server..."
+    redis-server --daemonize yes --port ${REDIS_PORT}
+    sleep 1
+else
+    echo "Redis is already running."
+fi
 sleep 1

 # start prefill
-export CUDA_VISIBLE_DEVICES=0
 export FD_LOG_DIR="log_prefill"
 mkdir -p ${FD_LOG_DIR}

+export CUDA_VISIBLE_DEVICES=0
+export FD_DEBUG=1
+export ENABLE_V1_KVCACHE_SCHEDULER=0
+
 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 8100 \
-    --metrics-port 8101 \
-    --engine-worker-queue-port 8102 \
-    --cache-queue-port 8103 \
+    --port ${P_PORT} \
+    --metrics-port $((P_PORT + 1)) \
+    --engine-worker-queue-port $((P_PORT + 2)) \
+    --cache-queue-port $((P_PORT + 3)) \
     --max-model-len 32768 \
+    --num-gpu-blocks-override 1000 \
     --splitwise-role "prefill" \
+    --cache-transfer-protocol "rdma" \
+    --rdma-comm-ports $((P_PORT + 4)) \
+    --pd-comm-port $((P_PORT + 5)) \
+    --scheduler-name "splitwise" \
+    --scheduler-host "127.0.0.1" \
+    --scheduler-port ${REDIS_PORT} \
+    --scheduler-ttl 9000 \
     2>&1 >${FD_LOG_DIR}/nohup &
-wait_for_health 8100

+wait_for_health ${P_PORT}

 # start decode
-export CUDA_VISIBLE_DEVICES=1
 export FD_LOG_DIR="log_decode"
 mkdir -p ${FD_LOG_DIR}

+export CUDA_VISIBLE_DEVICES=1
+export FD_DEBUG=1
+export ENABLE_V1_KVCACHE_SCHEDULER=0
+
 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 9000 \
-    --metrics-port 9001 \
-    --engine-worker-queue-port 9002 \
-    --cache-queue-port 9003 \
+    --port ${D_PORT} \
+    --metrics-port $((D_PORT + 1)) \
+    --engine-worker-queue-port $((D_PORT + 2)) \
+    --cache-queue-port $((D_PORT + 3)) \
     --max-model-len 32768 \
     --splitwise-role "decode" \
-    --innode-prefill-ports 8102 \
+    --cache-transfer-protocol "rdma" \
+    --rdma-comm-ports $((D_PORT + 4)) \
+    --pd-comm-port $((D_PORT + 5)) \
+    --scheduler-name "splitwise" \
+    --scheduler-host "127.0.0.1" \
+    --scheduler-port ${REDIS_PORT} \
+    --scheduler-ttl 9000 \
     2>&1 >${FD_LOG_DIR}/nohup &
-wait_for_health 9000

+wait_for_health ${D_PORT}

+# send request
+sleep 10  # make sure server is registered to router
+curl -X POST "http://0.0.0.0:${D_PORT}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": [
+            {"role": "user", "content": "hello"}
+        ],
+        "max_tokens": 20,
+        "stream": true
+    }'
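In the v0 path above, prefill and decode coordinate through the Redis-backed splitwise scheduler (`--scheduler-name "splitwise"` pointing at `${REDIS_PORT}`). A quick way to confirm Redis is reachable and holds scheduler state after startup (a sketch; the exact keys written depend on the scheduler implementation):

```bash
# Check the scheduler's Redis backend (REDIS_PORT=56388 as in the script above).
redis-cli -p 56388 ping            # expect PONG
redis-cli -p 56388 info keyspace   # non-empty once instances have registered
```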
@@ -2,9 +2,9 @@
 set -e

 # Test splitwise deployment
-# v0 requires prefill and decode in one node and it uses local scheduler
-# v1 supports prefill and decode in multi node and it uses splitwise scheduler
-# v2 supports prefill and decode in multi node and it uses router and local scheduler
+# There are two methods for splitwise deployment:
+# v0: using splitwise_scheduler or dp_scheduler
+# v1: using local_scheduler + router

 wait_for_health() {
     local server_port=$1

@@ -21,7 +21,6 @@ wait_for_health() {

 # prepare environment
 MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 # MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=0

@@ -39,16 +38,14 @@ fi
 unset http_proxy && unset https_proxy
 rm -rf log_*

-# start router
-export FD_LOG_DIR="log_router"
-mkdir -p ${FD_LOG_DIR}
-
-echo "start router"
-router_port=9000
-nohup python -m fastdeploy.router.launch \
-    --port ${router_port} \
-    --splitwise \
-    2>&1 >${FD_LOG_DIR}/nohup &
+# start redis
+if ! redis-cli ping &>/dev/null; then
+    echo "Redis is not running. Starting redis-server..."
+    redis-server --daemonize yes
+    sleep 1
+else
+    echo "Redis is already running."
+fi
 sleep 1

 # start prefill

@@ -56,41 +53,59 @@ export CUDA_VISIBLE_DEVICES=0,1
 export FD_LOG_DIR="log_prefill"
 mkdir -p ${FD_LOG_DIR}

 echo "start prefill"
 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
     --port 8100 \
     --metrics-port 8101 \
     --engine-worker-queue-port 8102 \
     --cache-queue-port 8103 \
-    --tensor-parallel-size 2 \
     --max-model-len 32768 \
+    --tensor-parallel-size 2 \
     --splitwise-role "prefill" \
     --cache-transfer-protocol "rdma,ipc" \
     --pd-comm-port 8104 \
     --rdma-comm-ports 8105,8106 \
-    --router "0.0.0.0:${router_port}" \
+    --scheduler-name "splitwise" \
+    --scheduler-host "127.0.0.1" \
+    --scheduler-port 6379 \
+    --scheduler-ttl 9000 \
     2>&1 >${FD_LOG_DIR}/nohup &

-wait_for_health 8100
+# wait_for_health 8100

 # start decode
 export CUDA_VISIBLE_DEVICES=2,3
 export FD_LOG_DIR="log_decode"
 mkdir -p ${FD_LOG_DIR}

 echo "start decode"
 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 8200 \
-    --metrics-port 8201 \
-    --engine-worker-queue-port 8202 \
-    --cache-queue-port 8203 \
+    --port 9000 \
+    --metrics-port 9001 \
+    --engine-worker-queue-port 9002 \
+    --cache-queue-port 9003 \
     --max-model-len 32768 \
     --tensor-parallel-size 2 \
     --splitwise-role "decode" \
-    --pd-comm-port 8204 \
-    --rdma-comm-ports 8205,8206 \
-    --router "0.0.0.0:${router_port}" \
     --cache-transfer-protocol "rdma,ipc" \
+    --pd-comm-port 9004 \
+    --rdma-comm-ports 9005,9006 \
+    --scheduler-name "splitwise" \
+    --scheduler-host "127.0.0.1" \
+    --scheduler-port 6379 \
+    --scheduler-ttl 9000 \
     2>&1 >${FD_LOG_DIR}/nohup &

-wait_for_health 8200
+wait_for_health 9000

+# send request
+sleep 10  # make sure server is registered to router
+port=9000
+curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": [
+            {"role": "user", "content": "hello"}
+        ],
+        "max_tokens": 20,
+        "stream": true
+    }'
@@ -2,9 +2,9 @@
 set -e

 # Test splitwise deployment
-# v0 requires prefill and decode in one node and it uses local scheduler
-# v1 supports prefill and decode in multi node and it uses splitwise scheduler
-# v2 supports prefill and decode in multi node and it uses router and local scheduler
+# There are two methods for splitwise deployment:
+# v0: using splitwise_scheduler or dp_scheduler
+# v1: using local_scheduler + router

 wait_for_health() {
     local server_port=$1

@@ -21,10 +21,9 @@ wait_for_health() {

 # prepare environment
 MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 # MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

 export FD_DEBUG=1
-export ENABLE_V1_KVCACHE_SCHEDULER=0
+export ENABLE_V1_KVCACHE_SCHEDULER=1
 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

 SCRIPT_PATH=$(readlink -f "$0")

@@ -39,14 +38,18 @@ fi
 unset http_proxy && unset https_proxy
 rm -rf log_*

-# start redis
-if ! redis-cli ping &>/dev/null; then
-    echo "Redis is not running. Starting redis-server..."
-    redis-server --daemonize yes
-    sleep 1
-else
-    echo "Redis is already running."
-fi
+P_PORT=52400
+D_PORT=52500
+ROUTER_PORT=52600

 # start router
 export FD_LOG_DIR="log_router"
 mkdir -p ${FD_LOG_DIR}

 nohup python -m fastdeploy.router.launch \
     --port ${ROUTER_PORT} \
     --splitwise \
     2>&1 >${FD_LOG_DIR}/nohup &
 sleep 1

 # start prefill

@@ -56,21 +59,20 @@ mkdir -p ${FD_LOG_DIR}

 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 8100 \
-    --metrics-port 8101 \
-    --engine-worker-queue-port 8102 \
-    --cache-queue-port 8103 \
+    --port "${P_PORT}" \
+    --metrics-port "$((P_PORT + 1))" \
+    --engine-worker-queue-port "$((P_PORT + 2))" \
+    --cache-queue-port "$((P_PORT + 3))" \
     --max-model-len 32768 \
     --splitwise-role "prefill" \
-    --cache-transfer-protocol "rdma,ipc" \
-    --rdma-comm-ports 8104 \
-    --pd-comm-port 8105 \
-    --scheduler-name "splitwise" \
-    --scheduler-host "127.0.0.1" \
-    --scheduler-port 6379 \
-    --scheduler-ttl 9000 \
+    --cache-transfer-protocol "rdma" \
+    --rdma-comm-ports "$((P_PORT + 4))" \
+    --pd-comm-port "$((P_PORT + 5))" \
+    --num-gpu-blocks-override 2000 \
+    --router "0.0.0.0:${ROUTER_PORT}" \
     2>&1 >${FD_LOG_DIR}/nohup &

-wait_for_health 8100
+wait_for_health ${P_PORT}

 # start decode
 export CUDA_VISIBLE_DEVICES=1

@@ -79,18 +81,28 @@ mkdir -p ${FD_LOG_DIR}

 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 9000 \
-    --metrics-port 9001 \
-    --engine-worker-queue-port 9002 \
-    --cache-queue-port 9003 \
+    --port "${D_PORT}" \
+    --metrics-port "$((D_PORT + 2))" \
+    --engine-worker-queue-port "$((D_PORT + 3))" \
+    --cache-queue-port "$((D_PORT + 1))" \
     --max-model-len 32768 \
     --splitwise-role "decode" \
-    --cache-transfer-protocol "rdma,ipc" \
-    --rdma-comm-ports 9004 \
-    --pd-comm-port 9005 \
-    --scheduler-name "splitwise" \
-    --scheduler-host "127.0.0.1" \
-    --scheduler-port 6379 \
-    --scheduler-ttl 9000 \
+    --cache-transfer-protocol "rdma" \
+    --rdma-comm-ports "$((D_PORT + 4))" \
+    --pd-comm-port "$((D_PORT + 5))" \
+    --router "0.0.0.0:${ROUTER_PORT}" \
     2>&1 >${FD_LOG_DIR}/nohup &

-wait_for_health 9000
+wait_for_health ${D_PORT}

+# send request
+sleep 10  # make sure server is registered to router
+curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": [
+            {"role": "user", "content": "hello"}
+        ],
+        "max_tokens": 20,
+        "stream": true
+    }'
@@ -2,9 +2,9 @@
 set -e

 # Test splitwise deployment
-# v0 requires prefill and decode in one node and it uses local scheduler
-# v1 supports prefill and decode in multi node and it uses splitwise scheduler
-# v2 supports prefill and decode in multi node and it uses router and local scheduler
+# There are two methods for splitwise deployment:
+# v0: using splitwise_scheduler or dp_scheduler
+# v1: using local_scheduler + router

 wait_for_health() {
     local server_port=$1

@@ -21,7 +21,6 @@ wait_for_health() {

 # prepare environment
 MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 # MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"

 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=0

@@ -39,14 +38,16 @@ fi
 unset http_proxy && unset https_proxy
 rm -rf log_*

-# start redis
-if ! redis-cli ping &>/dev/null; then
-    echo "Redis is not running. Starting redis-server..."
-    redis-server --daemonize yes
-    sleep 1
-else
-    echo "Redis is already running."
-fi
+# start router
+export FD_LOG_DIR="log_router"
+mkdir -p ${FD_LOG_DIR}
+
+echo "start router"
+router_port=9000
+nohup python -m fastdeploy.router.launch \
+    --port ${router_port} \
+    --splitwise \
+    2>&1 >${FD_LOG_DIR}/nohup &
 sleep 1

 # start prefill

@@ -54,45 +55,56 @@ export CUDA_VISIBLE_DEVICES=0,1
 export FD_LOG_DIR="log_prefill"
 mkdir -p ${FD_LOG_DIR}

 echo "start prefill"
 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
     --port 8100 \
     --metrics-port 8101 \
     --engine-worker-queue-port 8102 \
     --cache-queue-port 8103 \
-    --max-model-len 32768 \
     --tensor-parallel-size 2 \
+    --max-model-len 32768 \
     --splitwise-role "prefill" \
     --cache-transfer-protocol "rdma,ipc" \
     --pd-comm-port 8104 \
     --rdma-comm-ports 8105,8106 \
-    --scheduler-name "splitwise" \
-    --scheduler-host "127.0.0.1" \
-    --scheduler-port 6379 \
-    --scheduler-ttl 9000 \
+    --router "0.0.0.0:${router_port}" \
     2>&1 >${FD_LOG_DIR}/nohup &

-wait_for_health 8100
+# wait_for_health 8100

 # start decode
 export CUDA_VISIBLE_DEVICES=2,3
 export FD_LOG_DIR="log_decode"
 mkdir -p ${FD_LOG_DIR}

 echo "start decode"
 nohup python -m fastdeploy.entrypoints.openai.api_server \
     --model ${MODEL_NAME} \
-    --port 9000 \
-    --metrics-port 9001 \
-    --engine-worker-queue-port 9002 \
-    --cache-queue-port 9003 \
+    --port 8200 \
+    --metrics-port 8201 \
+    --engine-worker-queue-port 8202 \
+    --cache-queue-port 8203 \
     --max-model-len 32768 \
     --tensor-parallel-size 2 \
     --splitwise-role "decode" \
     --cache-transfer-protocol "rdma,ipc" \
-    --pd-comm-port 9004 \
-    --rdma-comm-ports 9005,9006 \
-    --scheduler-name "splitwise" \
-    --scheduler-host "127.0.0.1" \
-    --scheduler-port 6379 \
-    --scheduler-ttl 9000 \
+    --pd-comm-port 8204 \
+    --rdma-comm-ports 8205,8206 \
+    --router "0.0.0.0:${router_port}" \
     2>&1 >${FD_LOG_DIR}/nohup &

-wait_for_health 9000
+wait_for_health 8200

+# send request
+sleep 10  # make sure server is registered to router
+port=9000
+curl -X POST "http://0.0.0.0:${port}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d '{
+        "messages": [
+            {"role": "user", "content": "hello"}
+        ],
+        "max_tokens": 20,
+        "stream": true
+    }'
@@ -1,93 +0,0 @@
-#!/bin/bash
-set -e
-
-# Test splitwise deployment
-# v0 requires prefill and decode in one node and it uses local scheduler
-# v1 supports prefill and decode in multi node and it uses splitwise scheduler
-# v2 supports prefill and decode in multi node and it uses router and local scheduler
-
-wait_for_health() {
-    local server_port=$1
-    while true; do
-        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
-        if [ "$status_code" -eq 200 ]; then
-            break
-        else
-            echo "Service not ready. Retrying in 2s..."
-            sleep 2
-        fi
-    done
-}
-
-# prepare environment
-MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
-# MODEL_NAME="baidu/ERNIE-4.5-21B-A3B-Paddle"
-
-export FD_DEBUG=1
-export ENABLE_V1_KVCACHE_SCHEDULER=0
-export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
-
-SCRIPT_PATH=$(readlink -f "$0")
-SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
-export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
-echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
-if [ -z "${KVCACHE_RDMA_NICS}" ]; then
-    echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
-    exit 1
-fi
-
-unset http_proxy && unset https_proxy
-rm -rf log_*
-
-# start router
-export FD_LOG_DIR="log_router"
-mkdir -p ${FD_LOG_DIR}
-
-router_port=9000
-nohup python -m fastdeploy.router.launch \
-    --port ${router_port} \
-    --splitwise \
-    2>&1 >${FD_LOG_DIR}/nohup &
-sleep 1
-
-# start prefill
-export CUDA_VISIBLE_DEVICES=0
-export FD_LOG_DIR="log_prefill"
-mkdir -p ${FD_LOG_DIR}
-
-nohup python -m fastdeploy.entrypoints.openai.api_server \
-    --model ${MODEL_NAME} \
-    --port 8100 \
-    --metrics-port 8101 \
-    --engine-worker-queue-port 8102 \
-    --cache-queue-port 8103 \
-    --max-model-len 32768 \
-    --splitwise-role "prefill" \
-    --cache-transfer-protocol "ipc,rdma" \
-    --rdma-comm-ports 8104 \
-    --pd-comm-port 8105 \
-    --router "0.0.0.0:${router_port}" \
-    2>&1 >${FD_LOG_DIR}/nohup &
-
-wait_for_health 8100
-
-# start decode
-export CUDA_VISIBLE_DEVICES=1
-export FD_LOG_DIR="log_decode"
-mkdir -p ${FD_LOG_DIR}
-
-nohup python -m fastdeploy.entrypoints.openai.api_server \
-    --model ${MODEL_NAME} \
-    --port 8200 \
-    --metrics-port 8201 \
-    --engine-worker-queue-port 8202 \
-    --cache-queue-port 8203 \
-    --max-model-len 32768 \
-    --splitwise-role "decode" \
-    --cache-transfer-protocol "ipc,rdma" \
-    --rdma-comm-ports 8204 \
-    --pd-comm-port 8205 \
-    --router "0.0.0.0:${router_port}" \
-    2>&1 >${FD_LOG_DIR}/nohup &
-
-wait_for_health 8200
@@ -622,8 +622,13 @@ class CacheMessagerV1:
                 target_id = int(task["rdma_ports"][self.rank])
                 if "error" in task["status"]:
                     continue

                 # TODO: use is connected to check if the connection is still alive
                 logger.debug(f"rdma, start connect decode, {target_ip}:{target_id}")
                 status = self.messager[current_transfer_protocol].connect(target_ip, target_id)
-                if not status:
+                if status:
+                    logger.info(f"connect to {target_ip}:{target_id} success")
+                else:
                     logger.error(f"connect to {target_ip}:{target_id} failed")
                     task["status"] = "connection error"
                     continue

@@ -752,7 +757,7 @@ class CacheMessagerV1:
             self.engine_worker_queue.connect_task_response_barrier.wait()
             self.engine_worker_queue.put_connect_rdma_task_response(response)
         except Exception as e:
-            logger.error(f"handle_connect_task has exception: {e}")
+            logger.error(f"handle_connect_task has exception: {e}, {traceback.format_exc()}")


 def main():
@@ -51,7 +51,6 @@ class IPCConnector:
             self.remote_key_tensor_ptr_list.append(get_data_ptr_ipc(tmp, key_unique_name))
             self.remote_value_tensor_ptr_list.append(get_data_ptr_ipc(tmp, value_unique_name))
         self.write_stream = paddle.device.Stream(f"gpu:{self.local_gpu_id}")
         self.finish_event = paddle.device.Event()


 class IPCCommManager:
@@ -20,7 +20,7 @@ import json
 import os
 from dataclasses import field
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, Literal, Optional, Union

 import paddle
 import paddle.distributed as dist

@@ -1453,7 +1453,6 @@ class FDConfig:
         use_warmup: bool = False,
         limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-        innode_prefill_ports: Optional[List[int]] = None,
         max_num_partial_prefills: int = 1,
         max_long_partial_prefills: int = 1,
         long_prefill_token_threshold: int = 0,

@@ -1517,13 +1516,10 @@ class FDConfig:
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
-        self.innode_prefill_ports = innode_prefill_ports
         self.max_num_partial_prefills = max_num_partial_prefills
         self.max_long_partial_prefills = max_long_partial_prefills
         self.long_prefill_token_threshold = long_prefill_token_threshold

-        self._str_to_list("innode_prefill_ports", int)
-
         if envs.FD_FOR_TORCH_MODEL_FORMAT:
             self.model_config.model_format = "torch"

@@ -1773,23 +1769,15 @@ class FDConfig:
         """
         initialize cache info
         """
-        # TODO: group the splitiwse params, remove code of v0
-        # v0 requires prefill and decode in one node and it uses local scheduler
-        # v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler
-        # v2 supports prefill and decode in multi node and it uses router and local scheduler
+        # TODO: group the splitiwse params
+        # There are two methods for splitwise deployment:
+        # 1. v0 splitwise_scheduler or dp_scheduler
+        # 2. v1 local_scheduler + router
         self.splitwise_version = None
-        if self.scheduler_config.name == "local" and (self.router_config is None or self.router_config.router is None):
-            self.splitwise_version = "v0"
-        elif self.scheduler_config.name in ("splitwise", "dp"):
-            self.splitwise_version = "v1"
-        elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router:
-            self.splitwise_version = "v2"
-        else:
-            raise ValueError(
-                f"Unsupported scheduler mode, scheduler_name: {self.scheduler_config.name}, "
-                f"router_config: {self.router_config}"
-            )
-        logger.info(f"splitwise_version: {self.splitwise_version}")
+        if self.scheduler_config.name in ("splitwise", "dp"):
+            self.splitwise_version = "v0"
+        elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router:
+            self.splitwise_version = "v1"

         if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)):
             engine_worker_queue_port = self.parallel_config.engine_worker_queue_port
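The two branches correspond to the two deployment methods exercised by the test scripts above. As a hedged illustration (flags drawn from those scripts; model name, ports, and the router address are placeholders):

```bash
# Method v0: splitwise (or dp) scheduler, coordinated through Redis.
python -m fastdeploy.entrypoints.openai.api_server \
    --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \
    --splitwise-role "prefill" \
    --scheduler-name "splitwise" \
    --scheduler-host "127.0.0.1" \
    --scheduler-port 6379

# Method v1: local scheduler (the default) registered with a router.
python -m fastdeploy.entrypoints.openai.api_server \
    --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \
    --splitwise-role "prefill" \
    --router "0.0.0.0:9000"
```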
@@ -1,64 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import multiprocessing
-import os
-import time
-
-from fastdeploy.entrypoints.llm import LLM
-
-model_name_or_path = "baidu/ERNIE-4.5-0.3B-Paddle"
-
-
-def start_decode(model_name_or_path):
-    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
-    os.environ["FD_LOG_DIR"] = "log_decode"
-    llm_decode = LLM(
-        model=model_name_or_path,
-        tensor_parallel_size=1,
-        splitwise_role="decode",
-        engine_worker_queue_port=6678,
-        innode_prefill_ports=[6677],
-        cache_queue_port=55668,
-    )
-    return llm_decode
-
-
-def start_prefill(model_name_or_path):
-    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-    os.environ["FD_LOG_DIR"] = "log_prefill"
-    LLM(
-        model=model_name_or_path,
-        tensor_parallel_size=1,
-        splitwise_role="prefill",
-        engine_worker_queue_port=6677,
-        cache_queue_port=55667,
-    )
-
-
-def main():
-    prefill = multiprocessing.Process(target=start_prefill, args=(model_name_or_path,)).start()
-    time.sleep(10)
-    llm_decode = start_decode(model_name_or_path)
-
-    output = llm_decode.generate(prompts=["who are you?", "what can you do?"], use_tqdm=True)
-    print(output)
-
-    prefill.join()
-
-
-if __name__ == "__main__":
-    main()
@@ -296,11 +296,6 @@ class EngineArgs:
     Port for splitwise communication.
     """

-    innode_prefill_ports: Optional[List[int]] = None
-    """
-    Ports for innode dispatch request.
-    """
-
     rdma_comm_ports: Optional[List[int]] = None
     """
     Ports for rdma communication.
     """

@@ -500,8 +495,33 @@ class EngineArgs:
         if self.max_logprobs == -1 and not envs.ENABLE_V1_KVCACHE_SCHEDULER:
             raise NotImplementedError("Only ENABLE_V1_KVCACHE_SCHEDULER=1 support max_logprobs=-1")

-        if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
-            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+        if self.splitwise_role != "mixed":
+            if self.scheduler_name == "local" and self.router is None:
+                raise ValueError(
+                    f"When using {self.splitwise_role} role and the {self.scheduler_name} "
+                    f"scheduler, please provide --router argument."
+                )
+
+            if "rdma" in self.cache_transfer_protocol:
+                if self.rdma_comm_ports is None:
+                    raise ValueError(
+                        "Please set --rdma_comm_ports argument when using " "rdma cache transfer protocol."
+                    )
+                if len(self.rdma_comm_ports) != self.tensor_parallel_size:
+                    raise ValueError("The number of rdma comm ports must be equal to tensor parallel size.")
+
+            if envs.ENABLE_V1_KVCACHE_SCHEDULER == 1:
+                if "ipc" in self.cache_transfer_protocol:
+                    # FIXME: support ipc cache transfer protocol
+                    raise NotImplementedError(
+                        "only support rdma cache transfer protocol " "when using ENABLE_V1_KVCACHE_SCHEDULER."
+                    )
+                # FIXME: fix this bug
+                if self.splitwise_role == "prefill" and self.num_gpu_blocks_override is None:
+                    raise NotImplementedError(
+                        "please set num_gpu_blocks_override for prefill " "instance using ENABLE_V1_KVCACHE_SCHEDULER."
+                    )

         if not current_platform.is_cuda() and not current_platform.is_xpu():
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":

@@ -931,13 +951,6 @@ class EngineArgs:
         'mixed'. (prefill, decode, mixed)",
     )

-    splitwise_group.add_argument(
-        "--innode-prefill-ports",
-        type=lambda s: s.split(",") if s else None,
-        default=EngineArgs.innode_prefill_ports,
-        help="port for innode prefill, only used in single machine splitwise deployment",
-    )
-
     splitwise_group.add_argument(
         "--cache-transfer-protocol",
         type=str,

@@ -1233,7 +1246,6 @@ class EngineArgs:
             limit_mm_per_prompt=self.limit_mm_per_prompt,
             mm_processor_kwargs=self.mm_processor_kwargs,
             tool_parser=self.tool_call_parser,
-            innode_prefill_ports=self.innode_prefill_ports,
             max_num_partial_prefills=self.max_num_partial_prefills,
             max_long_partial_prefills=self.max_long_partial_prefills,
             long_prefill_token_threshold=self.long_prefill_token_threshold,
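Taken together, the new checks mean a disaggregated (non-mixed) instance must either use the splitwise/dp scheduler or pair the local scheduler with `--router`, and RDMA transfer needs one `--rdma-comm-ports` entry per tensor-parallel rank. A launch that satisfies them (a sketch; ports and model drawn from the test scripts in this commit):

```bash
# Valid under the new validation: local scheduler + router, and two RDMA
# ports for tensor-parallel-size 2 (values are illustrative).
python -m fastdeploy.entrypoints.openai.api_server \
    --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \
    --port 8100 \
    --metrics-port 8101 \
    --engine-worker-queue-port 8102 \
    --cache-queue-port 8103 \
    --tensor-parallel-size 2 \
    --splitwise-role "prefill" \
    --cache-transfer-protocol "rdma" \
    --rdma-comm-ports 8105,8106 \
    --pd-comm-port 8104 \
    --num-gpu-blocks-override 2000 \
    --router "0.0.0.0:9000"
```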
@@ -899,8 +899,6 @@ class AsyncLLMEngine:

     def launch_components(self):
         if self.cfg.scheduler_config.splitwise_role != "mixed":
-            # single-machine logic
-            self.engine_service.engine_worker_queue.available_prefill_instances.put(1)
             self.engine_service.split_mode_get_tasks()
             if self.cfg.scheduler_config.name == "splitwise":
                 self.splitwise_receive_thread = threading.Thread(

@@ -160,8 +160,8 @@ class EngineService:
         self.insert_task_to_worker_thread.start()
         self.token_processor.tasks_queue = self.engine_worker_queue
         self.token_processor.run()
-        if self.cfg.scheduler_config.splitwise_role != "mixed":
-            self._process_splitwise_task()
+        if self.cfg.scheduler_config.splitwise_role == "decode":
+            self._decode_process_splitwise_requests()

         self._register_to_router()
@@ -329,54 +329,13 @@ class EngineService:
             local_data_parallel_id=self.cfg.parallel_config.local_data_parallel_id,
         )

-    def insert_tasks(self, tasks: Union[List[Request], List[RequestOutput]], current_id=-1, allocated=False):
+    def insert_tasks(self, tasks: Union[List[Request], List[RequestOutput]], current_id=-1):
         """
         Insert tasks to engine.
         """
         for task in tasks:
             start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER)

-        # TODO: return to scheduler
-        if allocated:
-            current_tasks = []
-            for task in tasks:
-                cur_task_idx = self.resource_manager.req_dict[task.request_id]
-                del self.resource_manager.req_dict[task.request_id]
-                cur_task = self.resource_manager.tasks_list[cur_task_idx]
-                if envs.FD_ENABLE_INTERNAL_ADAPTER:
-                    if not task.outputs.token_ids:  # first token is eos in Prefill, just recycle resource and continue
-                        self.resource_manager.stop_flags[cur_task_idx] = True
-                        self.resource_manager.tasks_list[cur_task_idx] = None
-                        self.resource_manager._recycle_block_tables(cur_task)
-                        if task.request_id in self.token_processor.tokens_counter:
-                            del self.token_processor.tokens_counter[task.request_id]
-                        self.llm_logger.warning(f"{task.request_id} need not decode after first token")
-                        continue
-                cur_task.prompt_token_ids[0] = task.outputs.token_ids[0]
-                cur_task.num_cached_tokens = task.num_cached_tokens
-                if (
-                    self.cfg.speculative_config.method in ["mtp"]
-                    and self.cfg.scheduler_config.splitwise_role == "decode"
-                ):
-                    cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids)
-                if task.error_code != 200:
-                    self.resource_manager.stop_flags[cur_task_idx] = True
-                    self.resource_manager.tasks_list[cur_task_idx] = None
-                    self.resource_manager._recycle_block_tables(cur_task)
-                    if task.request_id in self.token_processor.tokens_counter:
-                        del self.token_processor.tokens_counter[task.request_id]
-                    self.scheduler.put_results([task])
-                    self.llm_logger.warning(
-                        f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource."
-                    )
-                    continue
-                self.token_processor.tokens_counter[task.request_id] = 1
-                current_tasks.append(cur_task)
-            if current_tasks:
-                self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz))
-                self.llm_logger.debug(f"put task to engine worker queue, task:{current_tasks}")
-            return True
-
         self.resource_manager.check_and_free_block_tables()

         if not isinstance(tasks, list):

@@ -445,8 +404,53 @@ class EngineService:
         else:
             self.update_mm_requests_chunk_size(tasks)
         self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz))
-        if is_prefill and self.cfg.scheduler_config.name != "splitwise":
-            self.engine_worker_queue.available_prefill_instances.put(1)
         return True

+    def _insert_prefilled_requests(self, request_outputs: List[RequestOutput]):
+        """
+        insert prefilled requests into engine worker queue.
+
+        Args:
+            request_outputs: a list of RequestOutput sent by prefill instance
+        """
+        to_infer_reqs = []
+        for req_out in request_outputs:
+            solt_idx = self.resource_manager.req_dict[req_out.request_id]
+            del self.resource_manager.req_dict[req_out.request_id]
+            cur_req = self.resource_manager.tasks_list[solt_idx]
+
+            if envs.FD_ENABLE_INTERNAL_ADAPTER:
+                if not req_out.outputs.token_ids:  # first token is eos in Prefill, just recycle resource and continue
+                    self.resource_manager.stop_flags[solt_idx] = True
+                    self.resource_manager.tasks_list[solt_idx] = None
+                    self.resource_manager._recycle_block_tables(cur_req)
+                    if req_out.request_id in self.token_processor.tokens_counter:
+                        del self.token_processor.tokens_counter[req_out.request_id]
+                    self.llm_logger.warning(f"{req_out.request_id} need not decode after first token")
+                    continue
+
+            cur_req.prompt_token_ids[0] = req_out.outputs.token_ids[0]
+            cur_req.num_cached_tokens = req_out.num_cached_tokens
+            if self.cfg.speculative_config.method in ["mtp"] and self.cfg.scheduler_config.splitwise_role == "decode":
+                cur_req.draft_token_ids = copy.deepcopy(req_out.outputs.draft_token_ids)
+
+            if req_out.error_code != 200:
+                self.resource_manager.stop_flags[solt_idx] = True
+                self.resource_manager.tasks_list[solt_idx] = None
+                self.resource_manager._recycle_block_tables(cur_req)
+                if req_out.request_id in self.token_processor.tokens_counter:
+                    del self.token_processor.tokens_counter[req_out.request_id]
+                self.scheduler.put_results([req_out])
+                self.llm_logger.warning(
+                    f"{req_out.request_id} prefill failed with msg:{req_out.error_msg}, recycle resource."
+                )
+                continue
+
+            self.token_processor.tokens_counter[req_out.request_id] = 1
+            to_infer_reqs.append(cur_req)
+
+        if to_infer_reqs:
+            self.engine_worker_queue.put_tasks((to_infer_reqs, self.resource_manager.real_bsz))
+            self.llm_logger.debug(f"put requests to engine worker queue, task:{to_infer_reqs}")
+        return True

     def task_is_finished(self, index):
@@ -636,8 +640,9 @@ class EngineService:
                 if len(tasks) == 0:
                     time.sleep(0.001)
                     continue
-                if self.cfg.splitwise_version == "v2" and self.cfg.scheduler_config.splitwise_role == "decode":
-                    # the task in decode instance will processed in _process_splitwise_task thread
+                if self.cfg.scheduler_config.splitwise_role == "decode":
+                    # Decode will instert the request sent by prefill to engine,
+                    # so the task sent by client will be ignored
                     continue

                 llm_logger.debug(f"get tasks from scheduler: {tasks}")

@@ -684,7 +689,14 @@ class EngineService:
                     max_num_batched_tokens=max_num_batched_tokens,
                     batch=num_prefill_batch,
                 )
-                self.llm_logger.debug(f"get tasks from scheduler: {tasks}")

+                if self.cfg.scheduler_config.splitwise_role == "decode":
+                    # Decode will instert the request sent by prefill to engine,
+                    # so the task sent by client will be ignored
+                    is_fetching = False
+                    return
+
+                self.llm_logger.debug(f"get tasks from {type(self.scheduler)}: {tasks}")
                 if self.cfg.scheduler_config.splitwise_role != "mixed":
                     need_delete_tasks = []
                     if envs.FD_OFFLINE_PERF_TEST_FOR_PD:

@@ -705,6 +717,7 @@ class EngineService:
                     for task in tasks:
                         # assure can allocate block ids in P
                         while not self.resource_manager.preallocate_resource_in_p(task):
+                            self.llm_logger.info("wait for preallocate_resource_in_p")
                             time.sleep(0.005)
                         self.llm_logger.info(f"ask D resource for req_id: {task.request_id}")
                         self.split_connector.send_splitwise_tasks([task], task.idx)

@@ -864,7 +877,7 @@ class EngineService:
                 request.llm_engine_recv_req_timestamp = time.time()
                 start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER)
                 main_process_metrics.requests_number.inc()
-                self.llm_logger.debug(f"Receive request: {request}")
+                self.llm_logger.debug(f"Receive request from api server: {request}")
             except Exception as e:
                 self.llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")
                 err_msg = str(e)
@@ -997,156 +1010,126 @@ class EngineService:
             except Exception as e:
                 llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}")

-    def _process_splitwise_task(self):
+    def _decode_process_splitwise_requests(self):
         """
-        Processing tasks from engine worker queue in splitwise deployment.
-        For v0 version, prefill instance gets tasks from engine worker queue.
-        For v1 and v2 version, decode instance gets raw tasks from engine worker queue to preallocate resources,
-        and decode instance gets prefilled tasks from engine worker queue to generate tokens.
-        TODO: unifiy the communication between decode and prefill instances.
+        Decode processes requests from engine worker queue, which are sent by prefill.
+        TODO: merge this function to the schedule function in resource manager
         """
-        def receiver_loop():
-            waiting_resource_requests = []
-            waiting_ready_tasks = []
-
-            def _decode_process_prefilled_task_v0_scheduler(input_tasks):
-                ready_tasks = []
-                waiting_tasks = []
-                for task in input_tasks:
-                    if not hasattr(self.scheduler, "has_request") or self.scheduler.has_request(task.request_id):
-                        ready_tasks.append(task)
-                    else:
-                        waiting_tasks.append(task)
-                self.insert_tasks(ready_tasks, allocated=True)
-                if self.cfg.splitwise_version in ("v0", "v2"):
-                    self.scheduler.put_results(ready_tasks)
-                return waiting_tasks
-
-            while self.running:
-                try:
-                    processed_indices = []
-                    for idx, task in enumerate(waiting_resource_requests):
-                        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                            if self.resource_manager.preallocate_resource_in_d(task):
-                                self.llm_logger.info(f"Resource available, processing task {task.request_id}")
-                                self.split_connector.send_cache_infos([task], -1)
-                                processed_indices.append(idx)
-                            else:
-                                self.llm_logger.debug(f"Still waiting for resources {task.request_id}")
-                                break
-                        else:
-                            if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
-                                self.insert_tasks([task])
-                                self.llm_logger.info(f"Resource available, processing task {task.request_id}")
-                                processed_indices.append(idx)
-                            else:
-                                self.llm_logger.debug(f"Still waiting for resources {task.request_id}")
-                                break
-
-                    for idx in sorted(processed_indices, reverse=True):
-                        waiting_resource_requests.pop(idx)
-
-                    waiting_ready_tasks = _decode_process_prefilled_task_v0_scheduler(waiting_ready_tasks)
-
-                    if self.engine_worker_queue.disaggregate_queue_empty():
-                        time.sleep(0.001)
-                    else:
-                        items = self.engine_worker_queue.get_disaggregated_tasks()
-                        for item in items:
-                            role = item[0]
-                            tasks = item[1]
-
-                            # prefill instance gets tasks from engine worker queue
-                            if role == "prefill":
-                                for task in tasks:
-                                    task.max_tokens = task.min_tokens = 2
-                                self.insert_tasks(tasks)
-                            # decode instance gets tasks from engine worker queue
-                            elif role == "decode":
-                                if isinstance(tasks[0], RequestOutput):
-                                    self.llm_logger.debug(f"receive prefilled tasks, {tasks}")
-                                    if not isinstance(tasks, list):
-                                        tasks = [tasks]
-                                    for task in tasks:
-                                        task.finished = False
-                                    if envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                                        for task in tasks:
-                                            if envs.FD_ENABLE_INTERNAL_ADAPTER:
-                                                if (
-                                                    not task.outputs.token_ids
-                                                ):  # first token is eos in Prefill, just recycle resource and continue
-                                                    cur_task = self.resource_manager.requests[task.request_id]
-                                                    self.resource_manager.stop_flags[cur_task.idx] = True
-                                                    self.resource_manager.tasks_list[cur_task.idx] = None
-                                                    self.resource_manager._free_blocks(cur_task)
-                                                    if cur_task.request_id in self.token_processor.tokens_counter:
-                                                        del self.token_processor.tokens_counter[task.request_id]
-                                                    self.llm_logger.warning(
-                                                        f"{task.request_id} need not decode after first token"
-                                                    )
-                                                    del self.resource_manager.requests[task.request_id]
-                                                    del self.resource_manager.req_dict[task.request_id]
-                                                    continue
-                                            if task.error_code != 200:
-                                                cur_task = self.resource_manager.requests[task.request_id]
-                                                self.resource_manager.stop_flags[cur_task.idx] = True
-                                                self.resource_manager.tasks_list[cur_task.idx] = None
-                                                self.resource_manager._free_blocks(cur_task)
-                                                if cur_task.request_id in self.token_processor.tokens_counter:
-                                                    del self.token_processor.tokens_counter[task.request_id]
-                                                self.scheduler.put_results([task])
-                                                self.llm_logger.warning(
-                                                    f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource."
-                                                )
-                                                continue
-                                            self.token_processor.tokens_counter[task.request_id] = 1
-                                            self.resource_manager.insert_task_for_decoding(task)
-                                    else:
-                                        waiting_ready_tasks.extend(_decode_process_prefilled_task_v0_scheduler(tasks))
-                                elif isinstance(tasks[0], Request):
-                                    self.llm_logger.debug(f"receive tasks to preallocate resource, {tasks}")
-                                    if len(waiting_resource_requests):
-                                        self.llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}")
-                                        waiting_resource_requests.extend(tasks)
-                                    else:
-                                        new_waiting = []
-                                        for task in tasks:
-                                            can_allocate_resource = False
-                                            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                                                if self.resource_manager.preallocate_resource_in_d(task):
-                                                    self.split_connector.send_cache_infos([task], -1)
-                                                    can_allocate_resource = True
-                                            else:
-                                                if self.resource_manager.is_resource_sufficient(
-                                                    task.prompt_token_ids_len
-                                                ):
-                                                    self.insert_tasks([task])
-                                                    can_allocate_resource = True
-                                            if can_allocate_resource is False:
-                                                if not self.enable_decode_cache_task:
-                                                    task.error_msg = "Not enough resources"
-                                                new_waiting.append(task)
-
-                                        if new_waiting:
-                                            if not self.enable_decode_cache_task:
-                                                self.split_connector.send_cache_infos(new_waiting, -1)
-                                            else:
-                                                waiting_resource_requests.extend(new_waiting)
-                                                self.llm_logger.info(
-                                                    f"Added {len(new_waiting)} tasks to waiting queue"
-                                                )
-                            else:
-                                raise ValueError(f"Unsupported task type: {type(tasks[0])}")
-
-                    time.sleep(0.001)
-                except Exception as e:
-                    self.llm_logger.error(f"Error in main loop: {e}")
-                    time.sleep(0.1)
-
-        threading.Thread(target=receiver_loop, daemon=True).start()
+        allocate_resource_requests: list[Request] = []
+        prefilled_request_ouputs: list[RequestOutput] = []
+
+        def _fetch_requests():
+            if self.engine_worker_queue.disaggregate_queue_empty():
+                return
+
+            items = self.engine_worker_queue.get_disaggregated_tasks()
+            for item in items:
+                tasks = item[1]
+                if isinstance(tasks[0], Request):
+                    self.llm_logger.debug(f"receive tasks to preallocate resource, {tasks}")
+                    allocate_resource_requests.extend(tasks)
+                elif isinstance(tasks[0], RequestOutput):
+                    self.llm_logger.debug(f"receive prefilled tasks, {tasks}")
+                    if not isinstance(tasks, list):
+                        tasks = [tasks]
+                    for task in tasks:
+                        task.finished = False
+                    prefilled_request_ouputs.extend(tasks)
+
+        def _process_allocate_resource_requests():
+            processed_indices = []
+            for idx, task in enumerate(allocate_resource_requests):
+                is_success = False
+
+                if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                    if self.resource_manager.preallocate_resource_in_d(task):
+                        self.llm_logger.info(f"Resource available, processing task {task.request_id}")
+                        self.split_connector.send_cache_infos([task], -1)
+                        processed_indices.append(idx)
+                        is_success = True
+                else:
+                    if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
+                        self.llm_logger.info(f"Resource available, processing task {task.request_id}")
+                        self.insert_tasks([task])
+                        processed_indices.append(idx)
+                        is_success = True
+
+                if not is_success:
+                    if not self.enable_decode_cache_task:
+                        task.error_msg = "Not enough resources"
+                        self.split_connector.send_cache_infos([task], -1)
+                        processed_indices.append(idx)
+                    else:
+                        self.llm_logger.debug(f"Still waiting for resources {task.request_id}")
+                        break
+
+            for idx in sorted(processed_indices, reverse=True):
+                allocate_resource_requests.pop(idx)
+
+        def _process_prefilled_requests():
+            nonlocal prefilled_request_ouputs
+            ready_request_outputs = []
+            waiting_request_outputs = []
+            # Waiting for the api_server and scheduler in decode to
+            # receive the request sent by the client
+            for task in prefilled_request_ouputs:
+                if not hasattr(self.scheduler, "has_request") or self.scheduler.has_request(task.request_id):
+                    ready_request_outputs.append(task)
+                else:
+                    waiting_request_outputs.append(task)
+            prefilled_request_ouputs = waiting_request_outputs
+            if self.cfg.splitwise_version == "v1":
+                # decode return first token to client
+                self.scheduler.put_results(ready_request_outputs)
+
+            if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                self._insert_prefilled_requests(ready_request_outputs)
+            else:
+                for task in ready_request_outputs:
+                    if envs.FD_ENABLE_INTERNAL_ADAPTER:
+                        if (
+                            not task.outputs.token_ids
+                        ):  # first token is eos in Prefill, just recycle resource and continue
+                            cur_req = self.resource_manager.requests[task.request_id]
+                            self.resource_manager.stop_flags[cur_req.idx] = True
+                            self.resource_manager.tasks_list[cur_req.idx] = None
+                            self.resource_manager._free_blocks(cur_req)
+                            if cur_req.request_id in self.token_processor.tokens_counter:
+                                del self.token_processor.tokens_counter[task.request_id]
+                            self.llm_logger.warning(f"{task.request_id} need not decode after first token")
+                            del self.resource_manager.requests[task.request_id]
+                            del self.resource_manager.req_dict[task.request_id]
+                            continue
+                    if task.error_code != 200:
+                        cur_req = self.resource_manager.requests[task.request_id]
+                        self.resource_manager.stop_flags[cur_req.idx] = True
+                        self.resource_manager.tasks_list[cur_req.idx] = None
+                        self.resource_manager._free_blocks(cur_req)
+                        if cur_req.request_id in self.token_processor.tokens_counter:
+                            del self.token_processor.tokens_counter[task.request_id]
+                        self.scheduler.put_results([task])
+                        self.llm_logger.warning(
+                            f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource."
+                        )
+                        continue
+                    self.token_processor.tokens_counter[task.request_id] = 1
+                    self.resource_manager.insert_task_for_decoding(task)
+
+        def decode_loop():
+            while self.running:
+                try:
+                    _fetch_requests()
+                    _process_allocate_resource_requests()
+                    _process_prefilled_requests()
+                    time.sleep(0.001)
+                except Exception as e:
+                    self.llm_logger.error(
+                        f"Error in main loop of decode_process_splitwise_requests: " f"{e}, {traceback.format_exc()}"
+                    )
+                    time.sleep(0.01)
+
+        threading.Thread(target=decode_loop, daemon=True).start()

     def start_cache_service(self, device_ids, ipc_signal_suffix):
         return self.resource_manager.cache_manager.launch_cache_manager(
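Condensed, the refactored decode side is a single polling thread over three steps. A standalone restatement for readability (illustrative only; in the actual code these steps are local closures, not methods):

```python
import time

def decode_loop(engine):
    """Illustrative sketch of the refactored decode-side loop above."""
    while engine.running:
        try:
            engine.fetch_requests()                      # drain the disaggregate queue
            engine.process_allocate_resource_requests()  # preallocate resources, reply to prefill
            engine.process_prefilled_requests()          # hand prefilled requests to the engine
            time.sleep(0.001)
        except Exception as exc:
            print(f"error in decode loop: {exc}")
            time.sleep(0.01)
```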
@@ -690,8 +690,6 @@ class LLMEngine:

     def launch_components(self):
         if self.cfg.scheduler_config.splitwise_role != "mixed":
-            # single-machine logic
-            self.engine.engine_worker_queue.available_prefill_instances.put(1)
             self.splitwise_receive_thread = threading.Thread(
                 target=self.engine.split_connector.start_receiver, args=()
             )
@@ -75,6 +75,8 @@ class Request:
        pooling_params: Optional[PoolingParams] = None,
        preprocess_start_time: Optional[float] = None,
        preprocess_end_time: Optional[float] = None,
        inference_start_time: float = 0,
        llm_engine_recv_req_timestamp: float = 0,
        multimodal_inputs: Optional[dict] = None,
        multimodal_data: Optional[dict] = None,
        disable_chat_template: bool = False,
@@ -118,6 +120,10 @@ class Request:
        self.arrival_time = arrival_time
        self.preprocess_start_time = preprocess_start_time
        self.preprocess_end_time = preprocess_end_time
        self.inference_start_time = inference_start_time
        self.llm_engine_recv_req_timestamp = (
            llm_engine_recv_req_timestamp if llm_engine_recv_req_timestamp else time.time()
        )
        self.disable_chat_template = disable_chat_template
        self.disaggregate_info = disaggregate_info

@@ -166,7 +172,6 @@ class Request:
        self.extend_block_tables = []
        # dp
        self.dp_rank = dp_rank
        self.llm_engine_recv_req_timestamp = time.time()

    @classmethod
    def from_dict(cls, d: dict):
@@ -217,6 +222,8 @@ class Request:
            video_end=d.get("video_end", 0),
            audio_end=d.get("audio_end", 0),
            dp_rank=d.get("dp_rank", None),
            inference_start_time=d.get("inference_start_time"),
            llm_engine_recv_req_timestamp=d.get("llm_engine_recv_req_timestamp"),
        )

    @property
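The `llm_engine_recv_req_timestamp` handling above keeps a sender-provided timestamp and only falls back to the receiver's clock when the field was never set. A small stand-alone illustration of that default — note the caveat that a literal `0` counts as unset, exactly as in the truthiness check above:

```python
import time

def recv_timestamp(sent: float = 0) -> float:
    # Keep the sender's timestamp when present; otherwise stamp "now".
    # Caveat: 0.0 is falsy, so it is treated as "unset".
    return sent if sent else time.time()

print(recv_timestamp(1700000000.0))  # preserved as-is
print(recv_timestamp())              # falls back to the local clock
```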
@@ -287,12 +287,6 @@ class EngineWorkerQueue:
            callable=lambda idx: self.disaggregate_requests[idx],
        )

        self.available_prefill_instances = Queue()
        QueueManager.register(
            "get_available_prefill_instances",
            callable=lambda: self.available_prefill_instances,
        )

        QueueManager.register(
            "get_finish_request_barrier",
            callable=lambda idx: self.finish_request_barrier[idx],
@@ -351,7 +345,6 @@ class EngineWorkerQueue:
            QueueManager.register("get_client_read_info_flag")
            QueueManager.register("get_lock_info")
            QueueManager.register("get_disaggregate_requests")
            QueueManager.register("get_available_prefill_instances")
            QueueManager.register("get_finish_request_barrier")
            QueueManager.register("get_finish_add_cache_task_barrier")
            QueueManager.register("get_connect_task_barrier")
@@ -390,7 +383,6 @@ class EngineWorkerQueue:

            # fetch the P/D disaggregation objects
            self.disaggregate_requests = self.manager.get_disaggregate_requests(self.local_data_parallel_id)
            self.available_prefill_instances = self.manager.get_available_prefill_instances()
            self.finish_request_barrier = self.manager.get_finish_request_barrier(self.local_data_parallel_id)
            self.finish_add_cache_task_barrier = self.manager.get_finish_add_cache_task_barrier(
                self.local_data_parallel_id
@@ -652,15 +644,6 @@ class EngineWorkerQueue:
        self.connect_task_response_lock.release()
        return task_response

    def get_prefill_instances(self):
        """
        check if the prefill queue is empty
        """
        if self.available_prefill_instances.qsize() == 0:
            return 0
        else:
            return self.available_prefill_instances.get()

    def put_cache_info(self, cache_info) -> None:
        """
        Args:
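The registrations being removed above follow the standard `multiprocessing.managers` pattern: the process that owns a `Queue` exposes it via `register(..., callable=...)`, and clients register the same name without a callable and connect. A runnable toy version of that pattern — the address, authkey, and names are illustrative, not FastDeploy's:

```python
import threading
from multiprocessing.managers import BaseManager
from queue import Queue

tasks = Queue()

class ServerManager(BaseManager):
    pass

# Owning side: expose the live Queue object under a name.
ServerManager.register("get_tasks", callable=lambda: tasks)

class ClientManager(BaseManager):
    pass

# Client side: register the name only; calls go through a proxy.
ClientManager.register("get_tasks")

if __name__ == "__main__":
    server = ServerManager(address=("127.0.0.1", 50007), authkey=b"fd")
    threading.Thread(target=server.get_server().serve_forever, daemon=True).start()

    client = ClientManager(address=("127.0.0.1", 50007), authkey=b"fd")
    client.connect()
    client.get_tasks().put("hello")  # routed through the manager server
    print(tasks.get())               # -> "hello": same underlying queue
```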
@@ -498,6 +498,9 @@ class TokenProcessor:
                    self.split_connector.send_first_token(task.disaggregate_info, [result])
                    break
                else:
                    # TODO: Refine checking sending cache and do not keep waiting
                    if time.time() - start_time > 30:
                        llm_logger.warning(f"wait for sending cache, {task_id}")
                    time.sleep(0.002)
        else:
            if envs.ENABLE_V1_KVCACHE_SCHEDULER:
@@ -753,10 +756,8 @@ class TokenProcessor:
                    self._recycle_resources(task_id, i, task, result, is_prefill)
                    break

            if not (is_prefill and self.cfg.splitwise_version == "v0"):
                # NOTE: prefill instance in v0 version does not return result to scheduler
                llm_logger.debug(f"get response from infer: {result}")
                batch_result.append(result)
            llm_logger.debug(f"get response from infer: {result}")
            batch_result.append(result)

        self.postprocess(batch_result, mtype)

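The 30-second wait above polls every 2 ms until the cache send completes; the TODO notes it currently never gives up. A generic bounded-wait helper along those lines — unlike the code above, this sketch returns `False` at the deadline instead of logging and continuing:

```python
import time

def wait_until(predicate, timeout_s: float = 30.0, poll_s: float = 0.002) -> bool:
    """Poll `predicate` until it returns True or `timeout_s` elapses."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(poll_s)
    return False

# Example: a flag that is never flipped times out and returns False.
flag = {"sent": False}
print(wait_until(lambda: flag["sent"], timeout_s=0.05))  # -> False
```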
@@ -175,72 +175,6 @@ class SplitwiseConnector:
            self.push_sockets[addr].close()
            del self.push_sockets[addr]

    def has_splitwise_tasks(self):
        """
        PD mode: check prefill empty
        """
        if self.cfg.innode_prefill_ports is None:
            return True
        else:
            for port in self.cfg.innode_prefill_ports:
                if port not in self.connect_innode_instances:
                    self.create_connection(port)
                if self.connect_innode_instances[port].available_prefill_instances.qsize() > 0:
                    return False
            return True

    def dispatch_innode_splitwise_tasks(self, tasks, current_id):
        """
        Dispatch splitwise tasks.

        Parameters:
            tasks (list): List of tasks.
        """
        tasks_status = "mixed"
        is_changable = envs.FD_PD_CHANGEABLE == "1"
        while True:
            for port in self.cfg.innode_prefill_ports:
                current_port = -1
                if port not in self.connect_innode_instances:
                    self.create_connection(port)
                if self.connect_innode_instances[port].get_prefill_instances() == 1:
                    for task in tasks:
                        task.disaggregate_info = {
                            "role": "prefill",
                            "transfer_protocol": "ipc",
                            "cache_info": {
                                "ipc": {
                                    "ip": "0.0.0.0",
                                    "port": self.cfg.parallel_config.engine_worker_queue_port[self.idx],
                                    "current_id": current_id,
                                },
                            },
                        }
                    self.connect_innode_instances[port].put_disaggregated_tasks(("prefill", tasks))
                    current_port = port

                if current_port != -1:
                    tasks_status = "decode"
                    break
            if current_port != -1 or is_changable:
                break
            else:
                time.sleep(0.005)

        if tasks_status == "decode":
            for task in tasks:
                task.disaggregate_info = {
                    "role": tasks_status,
                    "transfer_protocol": "ipc",
                    "cache_info": {
                        "ipc": {
                            "ip": "0.0.0.0",
                            "port": current_port,
                            "current_id": current_id,
                        },
                    },
                }

    def send_splitwise_tasks(self, tasks: List[Request], current_id):
        """
        Send splitwise tasks to all connected addresses.
@@ -249,10 +183,6 @@ class SplitwiseConnector:
            tasks (list): List of tasks.
            current_id (int): Current ID.
        """

        if self.cfg.innode_prefill_ports is not None:
            self.dispatch_innode_splitwise_tasks(tasks, current_id)
            return
        addr = None
        decode_diagg = None
        for task in tasks:
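For reference, the per-task metadata that the removed innode dispatch path attached to each request had this shape (field values here are illustrative):

```python
# Shape of the metadata built by the removed single-node dispatch path.
disaggregate_info = {
    "role": "prefill",            # flips to "decode" once a prefill slot is claimed
    "transfer_protocol": "ipc",   # innode transfer always used IPC
    "cache_info": {
        "ipc": {
            "ip": "0.0.0.0",
            "port": 8182,         # engine-worker-queue port of the peer instance
            "current_id": 0,      # id of the current request batch
        },
    },
}
```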
@@ -929,8 +929,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
    logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
    logger.info(f"- Load strategy: {load_config.load_strategy}")

    if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma":
        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
    if not current_platform.is_cuda() and not current_platform.is_xpu():
        logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
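With the first branch above removed, non-RDMA splitwise deployments no longer disable the v1 KV-cache scheduler implicitly — which is why the updated tests below export `ENABLE_V1_KVCACHE_SCHEDULER=0` themselves. The guard that remains reduces to something like this sketch, where the `platform` string stands in for the `current_platform` checks:

```python
# Minimal sketch of the remaining guard: the v1 KV-cache scheduler is only
# kept on CUDA/XPU platforms; everywhere else it is forced off.
def resolve_v1_scheduler(requested: int, platform: str) -> int:
    if platform not in ("cuda", "xpu"):
        return 0  # not supported: fall back to the legacy scheduler
    return requested

assert resolve_v1_scheduler(1, "cuda") == 1
assert resolve_v1_scheduler(1, "cpu") == 0
```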
@@ -12,23 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Test splitwise deployment which uses local_scheduler + router,
# and ENABLE_V1_KVCACHE_SCHEDULER is 0

import json
import os
import shutil
import signal
import socket
import subprocess
import sys
import time

import pytest
import requests
from utils.serving_utils import (
    FD_API_PORT,
    FD_CACHE_QUEUE_PORT,
    FD_ENGINE_QUEUE_PORT,
    FD_METRICS_PORT,
    clean_ports,
    get_registered_number,
)

# Read ports from environment variables; use default values if not set
FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433))
FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533))
@@ -48,133 +54,6 @@ PORTS_TO_CLEAN = [
]


def is_port_open(host: str, port: int, timeout=1.0):
    """
    Check if a TCP port is open on the given host.
    Returns True if connection succeeds, False otherwise.
    """
    try:
        with socket.create_connection((host, port), timeout):
            return True
    except Exception:
        return False


def check_service_health(base_url: str, timeout: int = 3) -> bool:
    """
    Check the health status of a service.

    Args:
        base_url (str): The base URL of the service, e.g. "http://127.0.0.1:8080"
        timeout (int): Request timeout in seconds.

    Returns:
        bool: True if the service is healthy, False otherwise.
    """
    if not base_url.startswith("http"):
        base_url = f"http://{base_url}"
    url = f"{base_url.rstrip('/')}/health"
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code == 200:
            return True
        else:
            return False
    except Exception:
        return False


def get_registered_number(router_url) -> list:
    """
    Get the number of registered models in the router.

    Args:
        router_url (str): The base URL of the router, e.g. "http://localhost:8080".

    Returns:
        int: The number of registered models.
    """
    if not router_url.startswith("http"):
        router_url = f"http://{router_url}"

    try:
        response = requests.get(f"{router_url}/registered_number", timeout=60)
        registered_numbers = response.json()
        return registered_numbers
    except Exception:
        return {"mixed": 0, "prefill": 0, "decode": 0}


def kill_process_on_port(port: int):
    """
    Kill processes that are listening on the given port.
    Uses multiple methods to ensure thorough cleanup.
    """
    current_pid = os.getpid()
    parent_pid = os.getppid()

    # Method 1: Use lsof to find processes
    try:
        output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip()
        for pid in output.splitlines():
            pid = int(pid)
            if pid in (current_pid, parent_pid):
                print(f"Skip killing current process (pid={pid}) on port {port}")
                continue
            try:
                # First try SIGTERM for graceful shutdown
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                # Then SIGKILL if still running
                os.kill(pid, signal.SIGKILL)
                print(f"Killed process on port {port}, pid={pid}")
            except ProcessLookupError:
                pass  # Process already terminated
    except subprocess.CalledProcessError:
        pass

    # Method 2: Use netstat and fuser as backup
    try:
        # Find processes using netstat and awk
        cmd = f"netstat -tulpn 2>/dev/null | grep :{port} | awk '{{print $7}}' | cut -d'/' -f1"
        output = subprocess.check_output(cmd, shell=True).decode().strip()
        for pid in output.splitlines():
            if pid and pid.isdigit():
                pid = int(pid)
                if pid in (current_pid, parent_pid):
                    continue
                try:
                    os.kill(pid, signal.SIGKILL)
                    print(f"Killed process (netstat) on port {port}, pid={pid}")
                except ProcessLookupError:
                    pass
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass

    # Method 3: Use fuser if available
    try:
        subprocess.run(f"fuser -k {port}/tcp", shell=True, timeout=5)
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
        pass


def clean_ports():
    """
    Kill all processes occupying the ports listed in PORTS_TO_CLEAN.
    """
    print(f"Cleaning ports: {PORTS_TO_CLEAN}")
    for port in PORTS_TO_CLEAN:
        kill_process_on_port(port)

    # Double check and retry if ports are still in use
    time.sleep(2)
    for port in PORTS_TO_CLEAN:
        if is_port_open("127.0.0.1", port, timeout=0.1):
            print(f"Port {port} still in use, retrying cleanup...")
            kill_process_on_port(port)
            time.sleep(1)

@pytest.fixture(scope="session", autouse=True)
def setup_and_run_server():
    """
@@ -185,7 +64,7 @@ def setup_and_run_server():
    - Tears down server after all tests finish
    """
    print("Pre-test port cleanup...")
    clean_ports()
    clean_ports(PORTS_TO_CLEAN)

    print("log dir clean ")
    if os.path.exists("log_router") and os.path.isdir("log_router"):
@@ -232,7 +111,6 @@ def setup_and_run_server():
    env_prefill["CUDA_VISIBLE_DEVICES"] = "0"
    env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
    env_prefill["FD_LOG_DIR"] = "log_prefill"
    env_prefill["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT)
    prefill_log_path = "server.log"
    prefill_cmd = [
        sys.executable,
@@ -282,7 +160,6 @@ def setup_and_run_server():
    env_decode = os.environ.copy()
    env_decode["CUDA_VISIBLE_DEVICES"] = "1"
    env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
    env_decode["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT + 1)
    env_decode["FD_LOG_DIR"] = "log_decode"
    decode_log_path = "decode_server.log"
    decode_cmd = [
@@ -351,7 +228,7 @@ def setup_and_run_server():
        os.killpg(process_router.pid, signal.SIGTERM)
        os.killpg(process_prefill.pid, signal.SIGTERM)
        os.killpg(process_decode.pid, signal.SIGTERM)
        clean_ports()
        clean_ports(PORTS_TO_CLEAN)
        print(f"Prefill server (pid={process_prefill.pid}) terminated")
        print(f"Decode server (pid={process_decode.pid}) terminated")
    except Exception as e:
@@ -12,23 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Test splitwise deployment which uses splitwise_scheduler,
# and ENABLE_V1_KVCACHE_SCHEDULER is 0

import json
import os
import shutil
import signal
import socket
import subprocess
import sys
import time

import pytest
import requests
from utils.serving_utils import (
    FD_API_PORT,
    FD_CACHE_QUEUE_PORT,
    FD_ENGINE_QUEUE_PORT,
    FD_METRICS_PORT,
    clean_ports,
    is_port_open,
)

# Read ports from environment variables; use default values if not set
FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433))
FD_REDIS_PORT = int(os.getenv("FD_REDIS_PORT", 8533))

# List of ports to clean before and after tests
PORTS_TO_CLEAN = [
@@ -36,95 +44,16 @@ PORTS_TO_CLEAN = [
    FD_ENGINE_QUEUE_PORT,
    FD_METRICS_PORT,
    FD_CACHE_QUEUE_PORT,
    FD_CONNECTOR_PORT,
    FD_API_PORT + 1,
    FD_ENGINE_QUEUE_PORT + 1,
    FD_METRICS_PORT + 1,
    FD_CACHE_QUEUE_PORT + 1,
    FD_CONNECTOR_PORT + 1,
    FD_REDIS_PORT,
]

def is_port_open(host: str, port: int, timeout=1.0):
    """
    Check if a TCP port is open on the given host.
    Returns True if connection succeeds, False otherwise.
    """
    try:
        with socket.create_connection((host, port), timeout):
            return True
    except Exception:
        return False


def kill_process_on_port(port: int):
    """
    Kill processes that are listening on the given port.
    Uses multiple methods to ensure thorough cleanup.
    """
    current_pid = os.getpid()
    parent_pid = os.getppid()

    # Method 1: Use lsof to find processes
    try:
        output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip()
        for pid in output.splitlines():
            pid = int(pid)
            if pid in (current_pid, parent_pid):
                print(f"Skip killing current process (pid={pid}) on port {port}")
                continue
            try:
                # First try SIGTERM for graceful shutdown
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                # Then SIGKILL if still running
                os.kill(pid, signal.SIGKILL)
                print(f"Killed process on port {port}, pid={pid}")
            except ProcessLookupError:
                pass  # Process already terminated
    except subprocess.CalledProcessError:
        pass

    # Method 2: Use netstat and fuser as backup
    try:
        # Find processes using netstat and awk
        cmd = f"netstat -tulpn 2>/dev/null | grep :{port} | awk '{{print $7}}' | cut -d'/' -f1"
        output = subprocess.check_output(cmd, shell=True).decode().strip()
        for pid in output.splitlines():
            if pid and pid.isdigit():
                pid = int(pid)
                if pid in (current_pid, parent_pid):
                    continue
                try:
                    os.kill(pid, signal.SIGKILL)
                    print(f"Killed process (netstat) on port {port}, pid={pid}")
                except ProcessLookupError:
                    pass
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass

    # Method 3: Use fuser if available
    try:
        subprocess.run(f"fuser -k {port}/tcp", shell=True, timeout=5)
    except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
        pass


def clean_ports():
    """
    Kill all processes occupying the ports listed in PORTS_TO_CLEAN.
    """
    print(f"Cleaning ports: {PORTS_TO_CLEAN}")
    for port in PORTS_TO_CLEAN:
        kill_process_on_port(port)

    # Double check and retry if ports are still in use
    time.sleep(2)
    for port in PORTS_TO_CLEAN:
        if is_port_open("127.0.0.1", port, timeout=0.1):
            print(f"Port {port} still in use, retrying cleanup...")
            kill_process_on_port(port)
            time.sleep(1)

@pytest.fixture(scope="session", autouse=True)
def setup_and_run_server():
    """
@@ -135,9 +64,11 @@ def setup_and_run_server():
    - Tears down server after all tests finish
    """
    print("Pre-test port cleanup...")
    clean_ports()
    clean_ports(PORTS_TO_CLEAN)

    print("log dir clean ")
    if os.path.exists("log_redis") and os.path.isdir("log_redis"):
        shutil.rmtree("log_redis")
    if os.path.exists("log_prefill") and os.path.isdir("log_prefill"):
        shutil.rmtree("log_prefill")
    if os.path.exists("log_decode") and os.path.isdir("log_decode"):
@@ -150,13 +81,34 @@ def setup_and_run_server():
    model_path = "baidu/ERNIE-4.5-0.3B-Paddle"
    print(f"model_path: {model_path}")

    # redis-server
    print("start redis...")
    env_copy = os.environ.copy()
    log_path = "router.log"

    cmd = [
        "redis-server",
        "--port",
        str(FD_REDIS_PORT),
        "--daemonize",
        "yes",
    ]

    with open(log_path, "w") as logfile:
        process_redis = subprocess.Popen(
            cmd,
            stdout=logfile,
            stderr=subprocess.STDOUT,
            start_new_session=True,  # Enables killing full group via os.killpg
            env=env_copy,
        )
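Because `--daemonize yes` makes redis-server fork and the launched process exit immediately, a hardening step (not part of this diff) would be to poll the Redis port before starting the prefill/decode servers, reusing the `is_port_open` helper this test already imports from `utils.serving_utils`:

```python
    # Suggested hardening (not in the diff): wait for Redis to accept
    # connections before launching the servers.
    for _ in range(30):
        if is_port_open("127.0.0.1", FD_REDIS_PORT):
            break
        time.sleep(1)
    else:
        raise RuntimeError(f"redis-server did not start on port {FD_REDIS_PORT}")
```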
    # prefill instance
    print("start prefill...")
    env_prefill = os.environ.copy()
    env_prefill["CUDA_VISIBLE_DEVICES"] = "0"
    env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
    env_prefill["FD_LOG_DIR"] = "log_prefill"
    env_prefill["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT)
    prefill_log_path = "server.log"
    prefill_cmd = [
        sys.executable,
@@ -182,6 +134,16 @@ def setup_and_run_server():
        "wint8",
        "--splitwise-role",
        "prefill",
        "--cache-transfer-protocol",
        "ipc",
        "--pd-comm-port",
        str(FD_CONNECTOR_PORT),
        "--scheduler-name",
        "splitwise",
        "--scheduler-host",
        "127.0.0.1",
        "--scheduler-port",
        str(FD_REDIS_PORT),
    ]

    # Start subprocess in new process group
@@ -193,14 +155,13 @@ def setup_and_run_server():
        start_new_session=True,  # Enables killing full group via os.killpg
        env=env_prefill,
    )
    time.sleep(3)
    time.sleep(1)

    # decode instance
    print("start decode...")
    env_decode = os.environ.copy()
    env_decode["CUDA_VISIBLE_DEVICES"] = "1"
    env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
    env_decode["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT + 1)
    env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
    env_decode["FD_LOG_DIR"] = "log_decode"
    decode_log_path = "decode_server.log"
    decode_cmd = [
@@ -227,8 +188,16 @@ def setup_and_run_server():
        "wint8",
        "--splitwise-role",
        "decode",
        "--innode-prefill-ports",
        str(FD_ENGINE_QUEUE_PORT),
        "--cache-transfer-protocol",
        "ipc",
        "--pd-comm-port",
        str(FD_CONNECTOR_PORT + 1),
        "--scheduler-name",
        "splitwise",
        "--scheduler-host",
        "127.0.0.1",
        "--scheduler-port",
        str(FD_REDIS_PORT),
    ]

    # Start subprocess in new process group
@@ -242,19 +211,18 @@ def setup_and_run_server():
    )

    # Wait up to 300 seconds for API server to be ready
    for _ in range(300):
        if is_port_open("127.0.0.1", FD_API_PORT):
            if is_port_open("127.0.0.1", FD_API_PORT + 1):
                print(f"Prefill server is up on port {FD_API_PORT}")
                print(f"Decode server is up on port {FD_API_PORT + 1}")
                break
        time.sleep(1)
    for _ in range(60):
        if is_port_open("127.0.0.1", FD_API_PORT) and is_port_open("127.0.0.1", FD_API_PORT + 1):
            print(f"Prefill server is up on port {FD_API_PORT}")
            print(f"Decode server is up on port {FD_API_PORT + 1}")
            break
        time.sleep(5)
    else:
        print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...")
        try:
            os.killpg(process_prefill.pid, signal.SIGTERM)
            os.killpg(process_decode.pid, signal.SIGTERM)
            clean_ports()
            clean_ports(PORTS_TO_CLEAN)
        except Exception as e:
            print(f"Failed to kill process group: {e}")
        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")
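The readiness wait above relies on Python's `for`/`else`: the `else` branch runs only when the loop exhausts without hitting `break`. A tiny stand-alone demonstration of the idiom:

```python
def find_ready(ports, ready):
    for p in ports:
        if p in ready:
            print(f"port {p} is ready")
            break
    else:
        print("timed out: no port became ready")

find_ready([8188, 8189], ready={8189})  # prints "port 8189 is ready"
find_ready([8188, 8189], ready=set())   # prints the timeout message
```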
@@ -263,9 +231,10 @@ def setup_and_run_server():

    print("\n===== Post-test server cleanup... =====")
    try:
        os.killpg(process_redis.pid, signal.SIGTERM)
        os.killpg(process_prefill.pid, signal.SIGTERM)
        os.killpg(process_decode.pid, signal.SIGTERM)
        clean_ports()
        clean_ports(PORTS_TO_CLEAN)
        print(f"Prefill server (pid={process_prefill.pid}) terminated")
        print(f"Decode server (pid={process_decode.pid}) terminated")
    except Exception as e:
@@ -277,7 +246,7 @@ def api_url(request):
    """
    Returns the API endpoint URL for chat completions.
    """
    return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions", f"http://0.0.0.0:{FD_API_PORT + 1}/v1/chat/completions"
    return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions", f"http://0.0.0.0:{FD_API_PORT+1}/v1/chat/completions"


@pytest.fixture(scope="session")
@@ -364,15 +333,12 @@ def test_chat_usage_stream(api_url):
        "stream_options": {"include_usage": True, "continuous_usage_stats": True},
        "metadata": {"min_tokens": 10},
    }
    _, d_url = api_url  # Only the decode server receives the request

    response = send_request(url=d_url, payload=payload)
    p_url, d_url = api_url
    response = send_request(url=p_url, payload=payload)
    chunks = get_stream_chunks(response)
    result = "".join([x["choices"][0]["delta"]["content"] for x in chunks[:-1]])
    print("Decode Response:", result)
    assert result != "", "result is empty"
    # for idx, chunk in enumerate(chunks):
    #     print(f"\nchunk[{idx}]:\n{json.dumps(chunk, indent=2, ensure_ascii=False)}")
    usage = chunks[-1]["usage"]
    total_tokens = usage["completion_tokens"] + usage["prompt_tokens"]
    assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens exceeds max_tokens"
@@ -395,9 +361,9 @@ def test_chat_usage_non_stream(api_url):
        "stream": False,
        "metadata": {"min_tokens": 10},
    }
    _, d_url = api_url

    response = send_request(url=d_url, payload=payload).json()
    p_url, d_url = api_url
    response = send_request(url=p_url, payload=payload).json()
    usage = response["usage"]
    result = response["choices"][0]["message"]["content"]
    assert result != "", "result is empty"
@@ -420,13 +386,13 @@ def test_non_chat_usage_stream(api_url):
        "stream_options": {"include_usage": True, "continuous_usage_stats": True},
        "metadata": {"min_tokens": 10},
    }
    _, d_url = api_url
    d_url = d_url.replace("chat/completions", "completions")
    p_url, d_url = api_url
    p_url = p_url.replace("chat/completions", "completions")

    response = send_request(url=d_url, payload=payload)
    response = send_request(url=p_url, payload=payload)
    chunks = get_stream_chunks(response)
    result = "".join([x["choices"][0]["text"] for x in chunks[:-1]])
    print("Decode Response:", result)
    # print("Decode Response:", result)
    assert result != "", "result is empty"
    usage = chunks[-1]["usage"]
    total_tokens = usage["completion_tokens"] + usage["prompt_tokens"]
@@ -447,13 +413,13 @@ def test_non_chat_usage_non_stream(api_url):
        "stream": False,
        "metadata": {"min_tokens": 10},
    }
    _, d_url = api_url
    d_url = d_url.replace("chat/completions", "completions")
    p_url, d_url = api_url
    p_url = p_url.replace("chat/completions", "completions")

    response = send_request(url=d_url, payload=payload).json()
    response = send_request(url=p_url, payload=payload).json()
    usage = response["usage"]
    result = response["choices"][0]["text"]
    print("Decode Response:", result)
    # print("Decode Response:", result)
    assert result != "", "result is empty"
    total_tokens = usage["completion_tokens"] + usage["prompt_tokens"]
    assert payload["max_tokens"] >= usage["completion_tokens"], "completion_tokens exceeds max_tokens"
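`get_stream_chunks` is used by these tests but not shown in this diff; for an OpenAI-compatible SSE stream it is typically implemented along these lines (an assumption, not the repository's actual helper — it expects a `requests` response opened with `stream=True`):

```python
import json

def get_stream_chunks(response):
    """Collect parsed JSON chunks from an OpenAI-style SSE response."""
    chunks = []
    for line in response.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip keep-alives and non-data lines
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        chunks.append(json.loads(payload))
    return chunks
```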
@@ -4,6 +4,8 @@ import socket
import subprocess
import time

import requests

# Read ports from environment variables; use default values if not set
FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
@@ -79,18 +81,66 @@ def kill_process_on_port(port: int):
        pass


def clean_ports():
def clean_ports(ports=None):
    """
    Kill all processes occupying the ports listed in PORTS_TO_CLEAN.
    Kill all processes occupying the ports
    """
    print(f"Cleaning ports: {PORTS_TO_CLEAN}")
    for port in PORTS_TO_CLEAN:
    if ports is None:
        ports = PORTS_TO_CLEAN

    print(f"Cleaning ports: {ports}")
    for port in ports:
        kill_process_on_port(port)

    # Double check and retry if ports are still in use
    time.sleep(2)
    for port in PORTS_TO_CLEAN:
    for port in ports:
        if is_port_open("127.0.0.1", port, timeout=0.1):
            print(f"Port {port} still in use, retrying cleanup...")
            kill_process_on_port(port)
            time.sleep(1)


def check_service_health(base_url: str, timeout: int = 3) -> bool:
    """
    Check the health status of a service.

    Args:
        base_url (str): The base URL of the service, e.g. "http://127.0.0.1:8080"
        timeout (int): Request timeout in seconds.

    Returns:
        bool: True if the service is healthy, False otherwise.
    """
    if not base_url.startswith("http"):
        base_url = f"http://{base_url}"
    url = f"{base_url.rstrip('/')}/health"
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code == 200:
            return True
        else:
            return False
    except Exception:
        return False


def get_registered_number(router_url) -> dict:
    """
    Get the registered model counts by type from the router.

    Args:
        router_url (str): The base URL of the router, e.g. "http://localhost:8080".

    Returns:
        dict: A dictionary containing registered model counts with keys "mixed", "prefill", and "decode".
    """
    if not router_url.startswith("http"):
        router_url = f"http://{router_url}"

    try:
        response = requests.get(f"{router_url}/registered_number", timeout=60)
        registered_numbers = response.json()
        return registered_numbers
    except Exception:
        return {"mixed": 0, "prefill": 0, "decode": 0}