diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml index 72f1e609c..a41b9c339 100644 --- a/.github/workflows/_accuracy_test.yml +++ b/.github/workflows/_accuracy_test.yml @@ -80,12 +80,14 @@ jobs: FD_API_PORT=$((42088 + DEVICE_PORT * 100)) FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100)) echo "Test ENV Parameter:" echo "=========================================================" echo "FLASK_PORT=${FLASK_PORT}" echo "FD_API_PORT=${FD_API_PORT}" echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" echo "DEVICES=${DEVICES}" echo "=========================================================" @@ -99,7 +101,7 @@ jobs: exit 1 fi - PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT) LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" echo "==== LOG_FILE is ${LOG_FILE} ====" @@ -133,6 +135,7 @@ jobs: -e "FD_API_PORT=${FD_API_PORT}" \ -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ + -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \ -e "FLASK_PORT=${FLASK_PORT}" \ -v "${MODEL_CACHE_DIR}:/MODELDATA" \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml index be3696320..9e5f309ca 100644 --- a/.github/workflows/_base_test.yml +++ b/.github/workflows/_base_test.yml @@ -80,12 +80,14 @@ jobs: FD_API_PORT=$((42088 + DEVICE_PORT * 100)) FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100)) echo "Test ENV Parameter:" echo "=========================================================" echo "FLASK_PORT=${FLASK_PORT}" echo "FD_API_PORT=${FD_API_PORT}" echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" echo "DEVICES=${DEVICES}" echo "=========================================================" @@ -99,7 +101,7 @@ jobs: exit 1 fi - PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT) LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" echo "==== LOG_FILE is ${LOG_FILE} ====" @@ -134,6 +136,7 @@ jobs: -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ -e "FLASK_PORT=${FLASK_PORT}" \ + -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \ -v "${MODEL_CACHE_DIR}:/MODELDATA" \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ -v "${CACHE_DIR}/.cache:/root/.cache" \ diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index 435ecf81e..e15dd8722 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -70,12 +70,14 @@ jobs: FD_API_PORT=$((42088 + DEVICE_PORT * 100)) FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100)) echo "Test ENV Parameter:" echo "=========================================================" echo "FLASK_PORT=${FLASK_PORT}" echo "FD_API_PORT=${FD_API_PORT}" echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" echo "DEVICES=${DEVICES}" echo "=========================================================" @@ -89,7 +91,7 @@ jobs: exit 1 fi - PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT) LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" echo "==== LOG_FILE is ${LOG_FILE} ====" @@ -123,6 +125,7 @@ jobs: -e "FD_API_PORT=${FD_API_PORT}" \ -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ + -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \ -e "FLASK_PORT=${FLASK_PORT}" \ -v "${MODEL_CACHE_DIR}:/MODELDATA" \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml index 5ed0ad353..02a064639 100644 --- a/.github/workflows/_pre_ce_test.yml +++ b/.github/workflows/_pre_ce_test.yml @@ -81,12 +81,14 @@ jobs: FD_API_PORT=$((42088 + DEVICE_PORT * 100)) FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100)) echo "Test ENV Parameter:" echo "=========================================================" echo "FLASK_PORT=${FLASK_PORT}" echo "FD_API_PORT=${FD_API_PORT}" echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" echo "DEVICES=${DEVICES}" echo "=========================================================" @@ -96,7 +98,7 @@ jobs: touch "${CACHE_DIR}/gitconfig" fi - PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT) LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" echo "==== LOG_FILE is ${LOG_FILE} ====" @@ -134,6 +136,7 @@ jobs: -e "FD_API_PORT=${FD_API_PORT}" \ -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ + -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \ -e "FLASK_PORT=${FLASK_PORT}" \ -e "fd_wheel_url=${fd_wheel_url}" \ --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 6df21ffbf..4e9e2ef43 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -102,12 +102,14 @@ jobs: FD_API_PORT=$((42088 + DEVICE_PORT * 100)) FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100)) echo "Test ENV Parameter:" echo "=========================================================" echo "FLASK_PORT=${FLASK_PORT}" echo "FD_API_PORT=${FD_API_PORT}" echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" echo "DEVICES=${DEVICES}" echo "=========================================================" @@ -117,7 +119,7 @@ jobs: touch "${CACHE_DIR}/gitconfig" fi - PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT) LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" echo "==== LOG_FILE is ${LOG_FILE} ====" @@ -156,6 +158,7 @@ jobs: -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ -e "FLASK_PORT=${FLASK_PORT}" \ + -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \ -e TZ="Asia/Shanghai" \ -e "fd_wheel_url=${fd_wheel_url}" \ -e "BASE_REF=${BASE_REF}" \ diff --git a/tests/ce/deploy/deploy.py b/tests/ce/deploy/deploy.py index 1952d7cfe..d9a2556e3 100644 --- a/tests/ce/deploy/deploy.py +++ b/tests/ce/deploy/deploy.py @@ -59,10 +59,12 @@ FLASK_PORT = get_available_port("FLASK_PORT", base_port + 1) FD_API_PORT = get_available_port("FD_API_PORT", FLASK_PORT + 1) FD_ENGINE_QUEUE_PORT = get_available_port("FD_ENGINE_QUEUE_PORT", FD_API_PORT + 1) FD_METRICS_PORT = get_available_port("FD_METRICS_PORT", FD_ENGINE_QUEUE_PORT + 1) +FD_CACHE_QUEUE_PORT = get_available_port("FD_CACHE_QUEUE_PORT", FD_METRICS_PORT + 1) DEFAULT_PARAMS = { "--port": FD_API_PORT, "--engine-worker-queue-port": FD_ENGINE_QUEUE_PORT, "--metrics-port": FD_METRICS_PORT, + "--cache-queue-port": FD_CACHE_QUEUE_PORT, "--enable-logprob": True, } @@ -179,7 +181,7 @@ def stop_server(signum=None, frame=None): except Exception as e: print(f"Failed to stop server: {e}, {str(traceback.format_exc())}") - for port in [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT]: + for port in [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT]: try: output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() for pid in output.splitlines(): diff --git a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py index 9cdc0a9bd..e82fba5e0 100644 --- a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -28,9 +28,10 @@ import requests FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8234)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -100,6 +101,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--max-model-len", "32768", "--max-num-seqs", diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 6eb78345d..e5d7bfccd 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -29,9 +29,10 @@ import requests FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8234)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -103,6 +104,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--enable-mm", "--max-model-len", "32768", diff --git a/tests/ci_use/ERNIE_0dot3B/test_ernie_03b_pd.py b/tests/ci_use/ERNIE_0dot3B/test_ernie_03b_pd.py index f58434ea3..480ad584b 100644 --- a/tests/ci_use/ERNIE_0dot3B/test_ernie_03b_pd.py +++ b/tests/ci_use/ERNIE_0dot3B/test_ernie_03b_pd.py @@ -27,15 +27,18 @@ import requests FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [ FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, + FD_CACHE_QUEUE_PORT, FD_API_PORT + 1, FD_ENGINE_QUEUE_PORT + 1, FD_METRICS_PORT + 1, + FD_CACHE_QUEUE_PORT + 1, ] @@ -116,6 +119,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--max-model-len", "8192", "--max-num-seqs", @@ -157,7 +162,7 @@ def setup_and_run_server(): "--metrics-port", str(FD_METRICS_PORT + 1), "--cache-queue-port", - str(FD_API_PORT + 2), + str(FD_CACHE_QUEUE_PORT + 1), "--max-model-len", "8192", "--max-num-seqs", diff --git a/tests/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py b/tests/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py index de18c3d2f..3c2caffd0 100644 --- a/tests/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py +++ b/tests/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py @@ -24,6 +24,7 @@ import pytest from fastdeploy import LLM, SamplingParams FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) MAX_WAIT_SECONDS = 60 @@ -87,6 +88,7 @@ def llm(model_path): model=model_path, tensor_parallel_size=1, engine_worker_queue_port=FD_ENGINE_QUEUE_PORT, + cache_queue_port=FD_CACHE_QUEUE_PORT, max_model_len=32768, quantization="wint8", ) diff --git a/tests/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py b/tests/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py index 4b03a9835..f29f4e110 100644 --- a/tests/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py +++ b/tests/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py @@ -31,9 +31,10 @@ from jsonschema import validate FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -103,6 +104,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--max-model-len", "32768", "--max-num-seqs", diff --git a/tests/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py b/tests/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py index cb9d13d19..e5e7eb9e5 100644 --- a/tests/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py +++ b/tests/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py @@ -27,9 +27,10 @@ import requests FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -99,6 +100,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--max-model-len", "32768", "--max-num-seqs", diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 17e8891d5..8d8ee0232 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -29,9 +29,10 @@ import requests FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -110,6 +111,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--max-model-len", "32768", "--max-num-seqs", diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index 86d18a6e1..af6daf246 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -30,9 +30,10 @@ import requests FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -114,6 +115,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--enable-mm", "--max-model-len", "32768", diff --git a/tests/e2e/test_Qwen2-7B-Instruct_serving.py b/tests/e2e/test_Qwen2-7B-Instruct_serving.py index 17729fc6c..cf836ef94 100644 --- a/tests/e2e/test_Qwen2-7B-Instruct_serving.py +++ b/tests/e2e/test_Qwen2-7B-Instruct_serving.py @@ -32,9 +32,10 @@ from jsonschema import validate FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -115,6 +116,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--max-model-len", "32768", "--max-num-seqs", diff --git a/tests/entrypoints/test_generation.py b/tests/entrypoints/test_generation.py index 214f1017c..345c393b2 100644 --- a/tests/entrypoints/test_generation.py +++ b/tests/entrypoints/test_generation.py @@ -50,6 +50,7 @@ class TestGeneration(unittest.TestCase): max_num_batched_tokens=4096, tensor_parallel_size=1, engine_worker_queue_port=int(os.getenv("FD_ENGINE_QUEUE_PORT")), + cache_queue_port=int(os.getenv("FD_CACHE_QUEUE_PORT")), ) cls.llm = weakref.proxy(llm) except Exception as e: diff --git a/tests/layers/test_moba_attention.py b/tests/layers/test_moba_attention.py index a2dba74ab..b19485042 100644 --- a/tests/layers/test_moba_attention.py +++ b/tests/layers/test_moba_attention.py @@ -360,6 +360,7 @@ class TestMobaAttention(unittest.TestCase): tensor_parallel_size=2, max_model_len=131072, engine_worker_queue_port=int(os.getenv("FD_ENGINE_QUEUE_PORT")), + cache_queue_port=int(os.getenv("FD_CACHE_QUEUE_PORT")), max_num_seqs=32, quantization="wint4", enable_chunked_prefill=True, diff --git a/tests/model_loader/test_common_model.py b/tests/model_loader/test_common_model.py index f2c348195..acd18dc0a 100644 --- a/tests/model_loader/test_common_model.py +++ b/tests/model_loader/test_common_model.py @@ -30,6 +30,7 @@ from tests.model_loader.utils import ( ) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) prompts = ["解释下“温故而知新", "Hello, how are you?"] @@ -113,6 +114,7 @@ def test_common_model( "default", FD_ENGINE_QUEUE_PORT, prompts, + FD_CACHE_QUEUE_PORT, ), ) fd_outputs_v1 = run_with_timeout( @@ -127,6 +129,7 @@ def test_common_model( "default_v1", FD_ENGINE_QUEUE_PORT, prompts, + FD_CACHE_QUEUE_PORT, ), ) check_tokens_id_and_text_close( diff --git a/tests/model_loader/test_load_ernie_vl.py b/tests/model_loader/test_load_ernie_vl.py index 031b59779..9483e3cc2 100644 --- a/tests/model_loader/test_load_ernie_vl.py +++ b/tests/model_loader/test_load_ernie_vl.py @@ -32,9 +32,10 @@ if project_root not in sys.path: FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133)) FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) # List of ports to clean before and after tests -PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] def is_port_open(host: str, port: int, timeout=1.0): @@ -106,6 +107,8 @@ def setup_and_run_server(): str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), "--enable-mm", "--max-model-len", "32768", diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py index 3ae7288db..ca9bcb883 100644 --- a/tests/model_loader/test_torch_model.py +++ b/tests/model_loader/test_torch_model.py @@ -30,6 +30,7 @@ from tests.model_loader.utils import ( ) FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) prompts = ["北京天安门在哪里?"] @@ -128,6 +129,7 @@ def test_model_against_baseline( "default_v1", FD_ENGINE_QUEUE_PORT, prompts, + FD_CACHE_QUEUE_PORT, ), ) diff --git a/tests/model_loader/test_w4a8_model.py b/tests/model_loader/test_w4a8_model.py index 3007b0a1b..da58af8ac 100644 --- a/tests/model_loader/test_w4a8_model.py +++ b/tests/model_loader/test_w4a8_model.py @@ -28,6 +28,7 @@ FD_ENGINE_QUEUE_PORTS = [ [9981, 9982, 9983, 9984, 9985, 9986, 9987, 9988], [9991, 9992, 9993, 9994, 9995, 9996, 9997, 9998], ] +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) models = [ @@ -53,6 +54,7 @@ def llm(request): max_model_len=8192, num_gpu_blocks_override=1024, engine_worker_queue_port=FD_ENGINE_QUEUE_PORTS[port_index], + cache_queue_port=FD_CACHE_QUEUE_PORT, load_choices="default", enable_expert_parallel=True, ) diff --git a/tests/model_loader/utils.py b/tests/model_loader/utils.py index 316128949..0705af8ef 100644 --- a/tests/model_loader/utils.py +++ b/tests/model_loader/utils.py @@ -78,6 +78,7 @@ def form_model_get_output_topp0( load_choices, engine_worker_queue_port, prompts, + cache_queue_port, result_queue, ): try: @@ -88,6 +89,7 @@ def form_model_get_output_topp0( load_choices=load_choices, quantization=quantization, engine_worker_queue_port=engine_worker_queue_port, + cache_queue_port=cache_queue_port, ) as fd_model: fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens) result_queue.put(fd_outputs)