diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml
index 69c912d62..8eddfa6ae 100644
--- a/.github/workflows/_base_test.yml
+++ b/.github/workflows/_base_test.yml
@@ -206,13 +206,6 @@ jobs:
         check_service 90
         python -m pytest -sv test_max_waiting_time.py || TEST_EXIT_CODE=1
-        curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-          -H "Content-Type: application/json" \
-          -d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"21b_mtp.yaml\", \"--enable-logprob\": \"False\"}"
-        check_service 180
-        export TEMPLATE=TOKEN_NORMAL
-        python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1
-
         popd
         echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
         '

diff --git a/.github/workflows/pr_build_and_test.yml b/.github/workflows/pr_build_and_test.yml
index 23eb2fefa..620bf9fc4 100644
--- a/.github/workflows/pr_build_and_test.yml
+++ b/.github/workflows/pr_build_and_test.yml
@@ -75,23 +75,3 @@ jobs:
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
       MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
-
-  accuracy_test:
-    name: Run Accuracy Tests
-    needs: [clone,build]
-    uses: ./.github/workflows/_accuracy_test.yml
-    with:
-      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
-      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
-      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
-      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
-
-  stable_test:
-    name: Run Stable Tests
-    needs: [clone,build]
-    uses: ./.github/workflows/_stable_test.yml
-    with:
-      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
-      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
-      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
-      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

diff --git a/tests/ce/deploy/21b_mtp.yaml b/tests/ce/deploy/21b_mtp.yaml
deleted file mode 100644
index 752240625..000000000
--- a/tests/ce/deploy/21b_mtp.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-max_model_len: 32768
-max_num_seqs: 128
-tensor_parallel_size: 1
-quantization: wint4
-speculative_config:
-  method: mtp
-  num_speculative_tokens: 1
-  model: /MODELDATA/ernie-4_5-21b-a3b-bf16-paddle/mtp/

diff --git a/tests/ce/deploy/21b_sot.yaml b/tests/ce/deploy/21b_sot.yaml
deleted file mode 100644
index 243e5335b..000000000
--- a/tests/ce/deploy/21b_sot.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-max_model_len: 32768
-max_num_seqs: 128
-tensor_parallel_size: 1
-quantization: wint4
-graph_optimization_config:
-  graph_opt_level: 1
-  sot_warmup_sizes: [2,16,32,64]
-  use_cudagraph: True
-  full_cuda_graph: False

diff --git a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py
index 1e4944390..baeeb2598 100644
--- a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py
+++ b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py
@@ -425,7 +425,7 @@ def test_streaming_with_stop_str(openai_client):
     last_token = ""
     for chunk in response:
         last_token = chunk.choices[0].delta.content
-    assert last_token == "</s>"
+    assert last_token.endswith("</s>")

    response = openai_client.chat.completions.create(
        model="default",

diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py
index ef489ff4c..28d3d912a 100644
--- a/tests/e2e/test_EB_Lite_serving.py
+++ b/tests/e2e/test_EB_Lite_serving.py
@@ -589,7 +589,7 @@ def test_streaming_with_stop_str(openai_client):
     last_token = ""
     for chunk in response:
         last_token = chunk.choices[0].delta.content
-    assert last_token == "</s>"
+    assert last_token.endswith("</s>")

    response = openai_client.chat.completions.create(
        model="default",
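The two `test_streaming_with_stop_str` hunks above relax the check on the final streamed delta from exact equality to a suffix match: when the stop string is included in the output, the last chunk is not guaranteed to carry the bare stop string alone, since the server may emit trailing generated text and the stop string together in a single delta. A minimal sketch of the relaxed pattern follows — `last_delta` is a hypothetical helper, the stop string `</s>` mirrors the one asserted in these tests (reconstructed above; it was stripped in extraction), and an OpenAI-compatible streaming response is assumed:

```python
# Hypothetical helper: drain an OpenAI-compatible stream and keep the
# content of the last delta. Not part of the test suite itself.
def last_delta(stream) -> str:
    token = ""
    for chunk in stream:
        # delta.content can be None on the terminal chunk in some servers.
        token = chunk.choices[0].delta.content or ""
    return token


stop_str = "</s>"  # assumed stop/EOS marker, as in the hunks above
# Old check: the final delta must be exactly the stop string.
#     assert last_delta(response) == stop_str
# New check: the final delta only has to end with it.
#     assert last_delta(response).endswith(stop_str)
```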
diff --git a/tests/model_loader/test_common_model.py b/tests/model_loader/test_common_model.py
deleted file mode 100644
index bc894bddc..000000000
--- a/tests/model_loader/test_common_model.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-
-import pytest
-
-current_dir = os.path.dirname(os.path.abspath(__file__))
-project_root = os.path.abspath(os.path.join(current_dir, ".."))
-if project_root not in sys.path:
-    sys.path.insert(0, project_root)
-
-from tests.model_loader.utils import (
-    check_tokens_id_and_text_close,
-    form_model_get_output_topp0,
-    form_model_get_output_topp1,
-    get_paddle_model_path,
-    get_torch_model_path,
-    run_with_timeout,
-)
-
-FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
-FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
-
-prompts = ["解释下”温故而知新”", "Hello, how are you?"]
-
-
-model_param_map = {
-    "Qwen3-0.6B": {
-        "max_num_seqs": 1,
-        "quantizations": ["None", "wint8", "wint4"],
-    },
-    "ernie-4_5-21b-a3b-bf16-paddle": {
-        "max_num_seqs": 1,
-        "tensor_parallel_size": 2,
-        "quantizations": [
-            "wint8",
-        ],
-    },
-    "Qwen2-7B-Instruct": {
-        "max_num_seqs": 1,
-        "quantizations": ["wint4"],
-    },
-    "Qwen2.5-VL-7B-Instruct": {
-        "max_num_seqs": 1,
-        "quantizations": ["wint4"],
-        "is_mm": True,
-        "torch_model_name_or_path": "Qwen2.5-VL-7B-Instruct-PT",
-    },
-    "Qwen3-30B-A3B": {
-        "tensor_parallel_size": 2,
-        "max_num_seqs": 1,
-        "quantizations": [
-            {
-                "quant_type": "block_wise_fp8",
-                "backend": "triton",
-                "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
-            },
-            {
-                "quant_type": "block_wise_fp8",
-                "backend": "deepgemm",
-                "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17", "FD_USE_DEEP_GEMM": "1"},
-            },
-        ],
-    },
-    "DeepSeek-V3-0324": {
-        "tensor_parallel_size": 2,
-        "quantizations": [
-            {
-                "quant_type": "wint4",
-                "env": {
-                    "FD_ATTENTION_BACKEND": "MLA_ATTN",
-                    "FLAGS_mla_use_tensorcore": "1",
-                    "FLAGS_flash_attn_version": "3",
-                    "FD_USE_MACHETE": "1",
-                },
-            },
-        ],
-    },
-}
-
-
-params = []
-for model, cfg in model_param_map.items():
-    for q in cfg["quantizations"]:
-        if isinstance(q, dict):
-            quant, backend, env = q["quant_type"], q.get("backend", "default"), q.get("env", {})
-        else:
-            quant, backend, env = q, "default", {}
-        params.append(
-            pytest.param(
-                model,
-                cfg.get("torch_model_name_or_path", ""),
-                cfg.get("tensor_parallel_size", 1),
-                cfg.get("max_num_seqs", 1),
-                cfg.get("max_model_len", 1024),
-                quant,
-                cfg.get("max_tokens", 32),
-                env,
-                cfg.get("is_mm", False),
-                marks=[pytest.mark.core_model],
-                id=f"{model}.{quant}.{backend}",
-            )
-        )
-
-
-@pytest.mark.parametrize(
-    "model_name_or_path,torch_model_name_or_path,tensor_parallel_size,max_num_seqs,max_model_len,quantization,max_tokens,env,is_mm",
-    params,
-)
-def test_common_model(
-    fd_runner,
-    model_name_or_path: str,
-    torch_model_name_or_path: str,
-    tensor_parallel_size: int,
-    max_num_seqs,
-    max_model_len: int,
-    max_tokens: int,
-    quantization: str,
-    env,
-    is_mm: bool,
-    monkeypatch,
-) -> None:
-    model_path = get_paddle_model_path(model_name_or_path)
-    if env:
-        for k, v in env.items():
-            monkeypatch.setenv(k, v)
-
-    form_model_get_output = form_model_get_output_topp0 if not is_mm else form_model_get_output_topp1
-    fd_outputs_v0 = run_with_timeout(
-        target=form_model_get_output,
-        args=(
-            fd_runner,
-            model_path,
-            tensor_parallel_size,
-            max_num_seqs,
-            max_model_len,
-            max_tokens,
-            quantization,
-            "default",
-            FD_ENGINE_QUEUE_PORT,
-            prompts,
-            FD_CACHE_QUEUE_PORT,
-        ),
-    )
-    fd_outputs_v1 = run_with_timeout(
-        target=form_model_get_output,
-        args=(
-            fd_runner,
-            model_path,
-            tensor_parallel_size,
-            max_num_seqs,
-            max_model_len,
-            max_tokens,
-            quantization,
-            "default_v1",
-            FD_ENGINE_QUEUE_PORT,
-            prompts,
-            FD_CACHE_QUEUE_PORT,
-        ),
-    )
-
-    check_tokens_id_and_text_close(
-        outputs_0_lst=fd_outputs_v0,
-        outputs_1_lst=fd_outputs_v1,
-        name_0="default loader",
-        name_1="default_v1 loader",
-    )
-
-    if torch_model_name_or_path != "":
-        torch_model_path = get_torch_model_path(torch_model_name_or_path)
-        fd_outputs_v1_torch = run_with_timeout(
-            target=form_model_get_output,
-            args=(
-                fd_runner,
-                torch_model_path,
-                tensor_parallel_size,
-                max_num_seqs,
-                max_model_len,
-                max_tokens,
-                quantization,
-                "default_v1",
-                FD_ENGINE_QUEUE_PORT,
-                prompts,
-                FD_CACHE_QUEUE_PORT,
-            ),
-        )
-        check_tokens_id_and_text_close(
-            outputs_0_lst=fd_outputs_v1,
-            outputs_1_lst=fd_outputs_v1_torch,
-            name_0="default loader",
-            name_1="default_v1 loader",
-        )