diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml
new file mode 100644
index 000000000..8db47c6ef
--- /dev/null
+++ b/.github/workflows/_accuracy_test.yml
@@ -0,0 +1,184 @@
+name: Accuracy Test
+description: "Run Accuracy Tests"
+
+# Reusable workflow: unpacks the FastDeploy source archive, installs the
+# FastDeploy wheel in the CI image, serves ERNIE-4.5-0.3B-Paddle through
+# test/ce/deploy/deploy.py and runs test/ce/accuracy_cases/gsm8k.py on it.
+on:
+  workflow_call:
+    inputs:
+      DOCKER_IMAGE:
+        description: "Build Images"
+        required: true
+        type: string
+        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
+      FASTDEPLOY_ARCHIVE_URL:
+        description: "URL of the compressed FastDeploy code archive."
+        required: true
+        type: string
+      FASTDEPLOY_WHEEL_URL:
+        description: "URL of the FastDeploy Wheel."
+        required: true
+        type: string
+      CACHE_DIR:
+        description: "Host directory for the shared gitconfig, .cache and ConfigDir mounts."
+        required: false
+        type: string
+        default: ""
+      MODEL_CACHE_DIR:
+        description: "Host directory of cached models, mounted at /MODELDATA."
+        required: false
+        type: string
+        default: ""
+
+jobs:
+  accuracy_tests:
+    runs-on: [self-hosted, GPU-h20-1Cards]
+    steps:
+      - name: Code Prepare
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
+        run: |
+          set -x
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v "$(pwd):/workspace" -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            "${docker_image}" /bin/bash -c '
+            if [ -d "${REPO_NAME}" ]; then
+              echo "Directory ${REPO_NAME} exists, removing it..."
+              rm -rf "${REPO_NAME}"*
+            fi
+            '
+
+          # Quoted so that a URL with a query string is not split or globbed.
+          wget -q "${fd_archive_url}"
+          tar -xf FastDeploy.tar.gz
+          rm -rf FastDeploy.tar.gz
+          cd FastDeploy
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git log -n 3 --oneline
+
+      - name: Run FastDeploy Base Tests
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
+          CACHE_DIR: ${{ inputs.CACHE_DIR }}
+          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
+        run: |
+          # The last "-"-separated field of the runner name is split into
+          # single characters: each is a GPU id, the first offsets the ports.
+          runner_name="${{ runner.name }}"
+          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
+          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
+          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
+
+          FLASK_PORT=$((42068 + DEVICE_PORT * 100))
+          FD_API_PORT=$((42088 + DEVICE_PORT * 100))
+          FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
+          FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
+          echo "Test ENV Parameter:"
+          echo "========================================================="
+          echo "FLASK_PORT=${FLASK_PORT}"
+          echo "FD_API_PORT=${FD_API_PORT}"
+          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
+          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
+          echo "DEVICES=${DEVICES}"
+          echo "========================================================="
+
+          # Fall back to the directory two levels above the workspace.
+          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
+          echo "CACHE_DIR is set to ${CACHE_DIR}"
+          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
+            touch "${CACHE_DIR}/gitconfig"
+          fi
+          if [ ! -d "${MODEL_CACHE_DIR}" ]; then
+            echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
+            exit 1
+          fi
+
+          # Kill whatever still listens on the ports this job is about to use.
+          PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
+          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
+          echo "==== LOG_FILE is ${LOG_FILE} ===="
+
+          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a "$LOG_FILE"
+
+          for port in "${PORTS[@]}"; do
+            PIDS=$(lsof -t -i :$port || true)
+            if [ -n "$PIDS" ]; then
+              echo "Port $port is occupied by PID(s): $PIDS" | tee -a "$LOG_FILE"
+              echo "$PIDS" | xargs -r kill -9
+              echo "Port $port cleared" | tee -a "$LOG_FILE"
+            else
+              echo "Port $port is free" | tee -a "$LOG_FILE"
+            fi
+          done
+
+          echo "==== PORT CLEAN COMPLETE ====" | tee -a "$LOG_FILE"
+
+          docker run --rm --ipc=host --pid=host --net=host \
+            -v "$(pwd):/workspace" \
+            -w /workspace \
+            -e "fastdeploy_wheel_url=${fastdeploy_wheel_url}" \
+            -e "FD_API_PORT=${FD_API_PORT}" \
+            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+            -e "FLASK_PORT=${FLASK_PORT}" \
+            -v "${MODEL_CACHE_DIR}:/MODELDATA" \
+            -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
+            -v "${CACHE_DIR}/.cache:/root/.cache" \
+            -v "${CACHE_DIR}/ConfigDir:/root/.config" \
+            -e TZ="Asia/Shanghai" \
+            --gpus '"device='"${DEVICES}"'"' "${docker_image}" /bin/bash -xc '
+              python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
+              pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+              python -m pip install "${fastdeploy_wheel_url}"
+              python -m pip install pytest
+
+              wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
+              chmod +x ./llm-deploy-linux-amd64
+              ./llm-deploy-linux-amd64 -python python3.10 \
+                -model_name ERNIE-4.5-0.3B-Paddle \
+                -model_path /MODELDATA \
+                --skip install
+
+              git config --global --add safe.directory /workspace/FastDeploy
+              cd FastDeploy
+              pushd test/ce/deploy
+                python3.10 deploy.py > dd.log 2>&1 &
+                sleep 3
+                curl -X POST "http://0.0.0.0:${FLASK_PORT}/start" \
+                  -H "Content-Type: application/json" \
+                  -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
+
+                curl -X POST "http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90"
+              popd
+
+              pushd test/ce/accuracy_cases
+                export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
+                export TEMPLATE=TOKEN_LOGPROB
+                export MODEL_SIZE=0.3B
+                # Capture pass/fail; the host step reads it back from exit_code.env.
+                TEST_EXIT_CODE=0
+                python gsm8k.py || TEST_EXIT_CODE=1
+              popd
+              echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
+          '
+          if [ -f ./FastDeploy/exit_code.env ]; then
+            source ./FastDeploy/exit_code.env
+            cat ./FastDeploy/exit_code.env >> "$GITHUB_ENV"
+          fi
+          # Fail closed: without exit_code.env TEST_EXIT_CODE is unset and a bare
+          # `exit` would return the status of the preceding echo (success).
+          TEST_EXIT_CODE="${TEST_EXIT_CODE:-1}"
+          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
+          exit "${TEST_EXIT_CODE}"
diff --git a/.github/workflows/pr_build_and_test.yml b/.github/workflows/pr_build_and_test.yml
index 5b3485036..8faa7436b 100644
--- a/.github/workflows/pr_build_and_test.yml
+++ b/.github/workflows/pr_build_and_test.yml
@@ -73,3 +73,13 @@ jobs:
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
       MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
+
+  accuracy_test:
+    name: Run Accuracy Tests
+    needs: [clone,build]
+    uses: ./.github/workflows/_accuracy_test.yml
+    with:
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
+      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
diff --git a/test/ce/accuracy_cases/gsm8k.py b/test/ce/accuracy_cases/gsm8k.py
index f156f58c7..4ccfd2482 100644
--- a/test/ce/accuracy_cases/gsm8k.py
+++ b/test/ce/accuracy_cases/gsm8k.py
@@ -18,11 +18,13 @@ BASELINE = {
     "21B": 0.49,
     "300B": 0.96,
 }
-baseline = BASELINE.get(os.environ.get("MODEL"), None)
+baseline = BASELINE.get(os.environ.get("MODEL_SIZE"), None)
 base_url = os.environ.get("URL", None)
 atol = 0.03
 if baseline is None:
-    raise ValueError(f"Invalid MODEL value '{os.environ.get('MODEL')}', expected one of {list(BASELINE.keys())}")
+    raise ValueError(
+        f"Invalid MODEL_SIZE value '{os.environ.get('MODEL_SIZE')}', expected one of {list(BASELINE.keys())}"
+    )
 if base_url is None:
     raise ValueError(
         "Environment variable 'URL' is not set. "
diff --git a/test/ce/server/gsm8k.parquet b/test/ce/server/gsm8k.parquet
deleted file mode 100644
index 9f8c0207c..000000000
Binary files a/test/ce/server/gsm8k.parquet and /dev/null differ
diff --git a/test/ce/server/gsm8k.py b/test/ce/server/gsm8k.py
deleted file mode 100644
index f156f58c7..000000000
--- a/test/ce/server/gsm8k.py
+++ /dev/null
@@ -1,188 +0,0 @@
-#!/bin/env python3
-# -*- coding: utf-8 -*-
-# @author DDDivano
-# encoding=utf-8 vi:ts=4:sw=4:expandtab:ft=python
-
-
-import os
-import re
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from urllib.parse import urlparse, urlunparse
-
-import openai
-from datasets import load_dataset
-from tqdm import tqdm
-
-BASELINE = {
-    "0.3B": 0.05,
-    "21B": 0.49,
-    "300B": 0.96,
-}
-baseline = BASELINE.get(os.environ.get("MODEL"), None)
-base_url = os.environ.get("URL", None)
-atol = 0.03
-if baseline is None:
-    raise ValueError(f"Invalid MODEL value '{os.environ.get('MODEL')}', expected one of {list(BASELINE.keys())}")
-if base_url is None:
-    raise ValueError(
-        "Environment variable 'URL' is not set. "
-        "Please specify the inference service address, e.g., 'http://localhost:8191/v1'."
-    )
-
-
-def strip_path_suffix(url: str, suffix: str = "chat/completions") -> str:
-    """
-    去除 URL 中的指定路径后缀(如 chat/completions)
-    """
-    parsed = urlparse(url)
-    # 移除末尾的 suffix(注意确保只移除结尾部分)
-    if parsed.path.endswith("/" + suffix):
-        new_path = parsed.path[: -(len(suffix) + 1)]  # +1 是斜杠
-    else:
-        new_path = parsed.path
-    # 重新构造 URL
-    cleaned_url = urlunparse(
-        (
-            parsed.scheme,
-            parsed.netloc,
-            new_path.rstrip("/"),  # 去掉末尾的斜杠
-            "",
-            "",
-            "",  # 忽略 params/query/fragment
-        )
-    )
-    return cleaned_url
-
-
-# ========== OpenAI 客户端配置 ==========
-client = openai.OpenAI(
-    api_key="DDDivano",
-    # base_url="http://占位:8187/v1"
-    base_url=strip_path_suffix(base_url),
-)
-
-model_name = "eb"
-max_samples = 690
-max_tokens = 12288
-max_workers = 33
-
-# ========== 加载数据集 ==========
-dataset = load_dataset("parquet", data_files="gsm8k.parquet", split="train")
-dataset = dataset.select(range(min(len(dataset), max_samples)))
-
-
-# ========== 提取 GT 中 "#### 数字" 格式的最终答案 ==========
-def extract_gt_answer(text):
-    match = re.search(r"####\s*([\d,]+(?:\.\d+)?)", text)
-    if match:
-        return match.group(1).replace(",", "").strip()
-    return None
-
-
-# ========== 提取模型输出中的“最后一句话”中的数字 ==========
-def extract_model_answer(text):
-    if not text:
-        return None
-    text = text.replace(",", "").replace("$", "")
-    lines = text.strip().splitlines()
-    last_line = lines[-1] if lines else text
-    match = re.search(r"-?\d+(?:\.\d+)?", last_line)
-    return match.group(0) if match else None
-
-
-# ========== 数值比较函数 ==========
-def is_answer_equal(pred, gt, tol=1e-6):
-    if pred is None or gt is None:
-        return False
-    try:
-        return abs(float(pred) - float(gt)) < tol
-    except:
-        return pred == gt
-
-
-# ========== 构造 Prompt ==========
-def build_prompt(sample):
-    return f"以下是一个数学问题,请直接给出最终答案。一定要把最终答案数字在最后输出。\n\n问题:{sample['question']}\n\n答案:"
-
-
-# ========== 模型请求函数 ==========
-def query_model(prompt):
-    try:
-        response = client.chat.completions.create(
-            model=model_name,
-            messages=[
-                {"role": "system", "content": "你是一个数学专家,擅长严谨地解答数学问题。"},
-                {"role": "user", "content": prompt},
-            ],
-            temperature=1.0,
-            top_p=0.8,
-            max_tokens=max_tokens,
-        )
-        return response.choices[0].message.content.strip()
-    except Exception as e:
-        return f"[Error] {e}"
-
-
-# ========== 评估函数 ==========
-def evaluate_sample(sample):
-    prompt = build_prompt(sample)
-    model_output = query_model(prompt)
-
-    gt_value = extract_gt_answer(sample["answer"])
-    pred_value = extract_model_answer(model_output)
-    is_correct = is_answer_equal(pred_value, gt_value)
-
-    result = {
-        "question": sample["question"],
-        "gt_answer": gt_value,
-        "model_answer": pred_value,
-        "raw_gt_answer": sample["answer"],
-        "raw_model_output": model_output,
-        "is_correct": is_correct,
-    }
-
-    return result
-
-
-# ========== 主流程 ==========
-
-acc = []
-times = 3
-
-for i in range(times):
-    correct = 0
-    total = 0
-    results = []
-
-    print(f"🚀 Starting evaluation with {max_workers} threads...")
-
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = [executor.submit(evaluate_sample, sample) for sample in dataset]
-        for future in tqdm(as_completed(futures), total=len(futures), desc="Evaluating"):
-            result = future.result()
-            results.append(result)
-            total += 1
-            if result["is_correct"]:
-                correct += 1
-            else:
-                print("\n❌ Wrong prediction:")
-                print(f"Q: {result['question']}")
-                print(f"GT: {result['gt_answer']}")
-                print(f"Model: {result['model_answer']}")
-                print(f"Full GT: {result['raw_gt_answer']}")
-                print(f"Model Output: {result['raw_model_output']}")
-
-    # ========== 输出准确率 ==========
-    accuracy = correct / total * 100 if total > 0 else 0.0
-    print(f"\n🎯 Evaluation Complete: Accuracy = {accuracy:.2f}% ({correct}/{total})")
-    acc.append(accuracy)
-
-avg_acc = round(sum(acc) / times / 100, 4)  # 优化百分数
-print(f"平均准确率:{avg_acc * 100:.2f}%")
-
-assert (
-    abs(avg_acc - baseline) <= atol
-), f"模型准确率 {avg_acc:.2f} 与基准 {baseline:.2f} 相差 {abs(avg_acc - baseline):.2f},超出容忍范围 {atol:.2f}"
-
-# with open("eval_result_math.json", "w", encoding="utf-8") as f:
-#     json.dump(results, f, indent=2, ensure_ascii=False)