mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
add accuracy check ci (#3389)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add accuracy ci * fix * fix * update * rename ci jobs
This commit is contained in:
174
.github/workflows/_accuracy_test.yml
vendored
Normal file
174
.github/workflows/_accuracy_test.yml
vendored
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
name: Accuracy Test
description: "Run Accuracy Tests"

# Reusable workflow: callers invoke it via `uses:` with `workflow_call` inputs.
on:
  workflow_call:
    inputs:
      DOCKER_IMAGE:
        description: "Build Images"
        required: true
        type: string
        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
      FASTDEPLOY_ARCHIVE_URL:
        description: "URL of the compressed FastDeploy code archive."
        required: true
        type: string
      FASTDEPLOY_WHEEL_URL:
        description: "URL of the FastDeploy Wheel."
        required: true
        type: string
      CACHE_DIR:
        description: "Cache Dir Use"
        required: false
        type: string
        default: ""
      MODEL_CACHE_DIR:
        description: "Cache Dir Use"
        required: false
        type: string
        default: ""

jobs:
  accuracy_tests:
    runs-on: [self-hosted, GPU-h20-1Cards]
    steps:
      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
        run: |
          set -x
          REPO="https://github.com/${{ github.repository }}.git"
          FULL_REPO="${{ github.repository }}"
          REPO_NAME="${FULL_REPO##*/}"
          BASE_BRANCH="${{ github.base_ref }}"

          # Clean the repository directory before starting
          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
          -e "REPO_NAME=${REPO_NAME}" \
          ${docker_image} /bin/bash -c '
          if [ -d ${REPO_NAME} ]; then
            echo "Directory ${REPO_NAME} exists, removing it..."
            rm -rf ${REPO_NAME}*
          fi
          '

          # Fetch the pre-packaged source archive instead of cloning.
          wget -q ${fd_archive_url}
          tar -xf FastDeploy.tar.gz
          rm -rf FastDeploy.tar.gz
          cd FastDeploy
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git log -n 3 --oneline

      - name: Run FastDeploy Base Tests
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
          CACHE_DIR: ${{ inputs.CACHE_DIR }}
          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
        run: |
          # The runner name ends in the assigned card id(s); derive the GPU
          # device list and a per-device port offset from it so parallel jobs
          # on the same host do not collide.
          runner_name="${{ runner.name }}"
          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)

          # Disjoint port block per device.
          FLASK_PORT=$((42068 + DEVICE_PORT * 100))
          FD_API_PORT=$((42088 + DEVICE_PORT * 100))
          FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
          FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
          echo "Test ENV Parameter:"
          echo "========================================================="
          echo "FLASK_PORT=${FLASK_PORT}"
          echo "FD_API_PORT=${FD_API_PORT}"
          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
          echo "DEVICES=${DEVICES}"
          echo "========================================================="

          # Default cache dir: two levels above the workspace on the runner.
          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
          echo "CACHE_DIR is set to ${CACHE_DIR}"
          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
            touch "${CACHE_DIR}/gitconfig"
          fi
          if [ ! -d "${MODEL_CACHE_DIR}" ]; then
            echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
            exit 1
          fi

          # Kill anything still listening on our ports from a previous run.
          PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
          echo "==== LOG_FILE is ${LOG_FILE} ===="

          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE

          for port in "${PORTS[@]}"; do
            PIDS=$(lsof -t -i :$port || true)
            if [ -n "$PIDS" ]; then
              echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
              echo "$PIDS" | xargs -r kill -9
              echo "Port $port cleared" | tee -a $LOG_FILE
            else
              echo "Port $port is free" | tee -a $LOG_FILE
            fi
          done

          echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE

          # Run the deployment + accuracy suite inside the CI image.
          docker run --rm --ipc=host --pid=host --net=host \
          -v $(pwd):/workspace \
          -w /workspace \
          -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
          -e "FD_API_PORT=${FD_API_PORT}" \
          -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
          -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
          -e "FLASK_PORT=${FLASK_PORT}" \
          -v "${MODEL_CACHE_DIR}:/MODELDATA" \
          -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
          -v "${CACHE_DIR}/.cache:/root/.cache" \
          -v "${CACHE_DIR}/ConfigDir:/root/.config" \
          -e TZ="Asia/Shanghai" \
          --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
          python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/

          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

          python -m pip install ${fastdeploy_wheel_url}
          python -m pip install pytest

          # llm-deploy drives model deployment; --skip install because the
          # wheel was installed above.
          wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
          chmod +x ./llm-deploy-linux-amd64
          ./llm-deploy-linux-amd64 -python python3.10 \
          -model_name ERNIE-4.5-0.3B-Paddle \
          -model_path /MODELDATA \
          --skip install

          git config --global --add safe.directory /workspace/FastDeploy
          cd FastDeploy
          pushd test/ce/deploy
          python3.10 deploy.py > dd.log 2>&1 &
          sleep 3
          curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
          -H "Content-Type: application/json" \
          -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"

          curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
          popd

          # gsm8k.py reads URL and MODEL_SIZE from the environment.
          pushd test/ce/accuracy_cases
          export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
          export TEMPLATE=TOKEN_LOGPROB
          export MODEL_SIZE=0.3B
          TEST_EXIT_CODE=0
          python gsm8k.py || TEST_EXIT_CODE=1
          popd
          # Propagate the test result out of the container via a file.
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
          '
          if [ -f ./FastDeploy/exit_code.env ]; then
            source ./FastDeploy/exit_code.env
            cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
          fi
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
          exit ${TEST_EXIT_CODE}
|
10
.github/workflows/pr_build_and_test.yml
vendored
10
.github/workflows/pr_build_and_test.yml
vendored
@@ -73,3 +73,13 @@ jobs:
|
|||||||
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
|
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
|
||||||
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
|
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
|
||||||
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
|
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
|
||||||
|
|
||||||
|
  # Accuracy gate: reuses _accuracy_test.yml once the source archive (clone)
  # and the wheel (build) are available.
  accuracy_test:
    name: Run Accuracy Tests
    needs: [clone,build]
    uses: ./.github/workflows/_accuracy_test.yml
    with:
      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
|
||||||
|
@@ -18,11 +18,13 @@ BASELINE = {
|
|||||||
"21B": 0.49,
|
"21B": 0.49,
|
||||||
"300B": 0.96,
|
"300B": 0.96,
|
||||||
}
|
}
|
||||||
baseline = BASELINE.get(os.environ.get("MODEL"), None)
|
baseline = BASELINE.get(os.environ.get("MODEL_SIZE"), None)
|
||||||
base_url = os.environ.get("URL", None)
|
base_url = os.environ.get("URL", None)
|
||||||
atol = 0.03
|
atol = 0.03
|
||||||
if baseline is None:
|
if baseline is None:
|
||||||
raise ValueError(f"Invalid MODEL value '{os.environ.get('MODEL')}', expected one of {list(BASELINE.keys())}")
|
raise ValueError(
|
||||||
|
f"Invalid MODEL_SIZE value '{os.environ.get('MODEL_SIZE')}', expected one of {list(BASELINE.keys())}"
|
||||||
|
)
|
||||||
if base_url is None:
|
if base_url is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Environment variable 'URL' is not set. "
|
"Environment variable 'URL' is not set. "
|
||||||
|
Binary file not shown.
@@ -1,188 +0,0 @@
|
|||||||
#!/bin/env python3
# -*- coding: utf-8 -*-
# @author DDDivano
# encoding=utf-8 vi:ts=4:sw=4:expandtab:ft=python
"""GSM8K accuracy gate: query a deployed model and compare accuracy to a baseline."""

import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, urlunparse

import openai
from datasets import load_dataset
from tqdm import tqdm

# Expected GSM8K accuracy (fraction) for each supported model size.
BASELINE = {
    "0.3B": 0.05,
    "21B": 0.49,
    "300B": 0.96,
}
# The CI workflow exports MODEL_SIZE (see _accuracy_test.yml); the previous
# lookup read the never-set MODEL variable, so the baseline was always None.
baseline = BASELINE.get(os.environ.get("MODEL_SIZE"), None)
base_url = os.environ.get("URL", None)
# Absolute tolerance allowed between measured accuracy and the baseline.
atol = 0.03
if baseline is None:
    raise ValueError(
        f"Invalid MODEL_SIZE value '{os.environ.get('MODEL_SIZE')}', expected one of {list(BASELINE.keys())}"
    )
if base_url is None:
    raise ValueError(
        "Environment variable 'URL' is not set. "
        "Please specify the inference service address, e.g., 'http://localhost:8191/v1'."
    )
|
|
||||||
|
|
||||||
|
|
||||||
def strip_path_suffix(url: str, suffix: str = "chat/completions") -> str:
    """Drop a trailing path segment (default ``chat/completions``) from *url*.

    The params, query string and fragment are discarded, and any trailing
    slashes are trimmed from the remaining path before the URL is rebuilt.
    """
    parts = urlparse(url)
    path = parts.path
    tail = "/" + suffix
    # Only strip when the suffix sits at the very end of the path.
    if path.endswith(tail):
        path = path[: len(path) - len(tail)]
    return urlunparse((parts.scheme, parts.netloc, path.rstrip("/"), "", "", ""))
|
|
||||||
|
|
||||||
|
|
||||||
# ========== OpenAI client configuration ==========
client = openai.OpenAI(
    # NOTE(review): key appears to be a dummy value; presumably the service
    # does not validate it -- confirm against the deployment.
    api_key="DDDivano",
    # base_url="http://<placeholder>:8187/v1"
    # URL env var may include a trailing /chat/completions; strip it so the
    # SDK can append endpoint paths itself.
    base_url=strip_path_suffix(base_url),
)

model_name = "eb"      # model identifier sent with each request
max_samples = 690      # cap on how many GSM8K samples are evaluated
max_tokens = 12288     # generation budget per request
max_workers = 33       # concurrent request threads

# ========== Load the dataset ==========
dataset = load_dataset("parquet", data_files="gsm8k.parquet", split="train")
dataset = dataset.select(range(min(len(dataset), max_samples)))
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Extract the final '#### <number>' answer from the ground truth ==========
def extract_gt_answer(text):
    """Return the GSM8K final answer ('#### <number>') as a string, or None."""
    found = re.search(r"####\s*([\d,]+(?:\.\d+)?)", text)
    return found.group(1).replace(",", "").strip() if found else None
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Extract the number from the last line of the model output ==========
def extract_model_answer(text):
    """Return the first number in the final line of *text*, or None.

    Commas and dollar signs are removed before matching so formatted
    amounts like '$1,234.50' are parsed as plain numbers.
    """
    if not text:
        return None
    cleaned = text.replace(",", "").replace("$", "")
    rows = cleaned.strip().splitlines()
    tail = rows[-1] if rows else cleaned
    found = re.search(r"-?\d+(?:\.\d+)?", tail)
    return found.group(0) if found else None
|
|
||||||
|
|
||||||
|
|
||||||
# ========== 数值比较函数 ==========
|
|
||||||
def is_answer_equal(pred, gt, tol=1e-6):
|
|
||||||
if pred is None or gt is None:
|
|
||||||
return False
|
|
||||||
try:
|
|
||||||
return abs(float(pred) - float(gt)) < tol
|
|
||||||
except:
|
|
||||||
return pred == gt
|
|
||||||
|
|
||||||
|
|
||||||
# ========== 构造 Prompt ==========
|
|
||||||
def build_prompt(sample):
|
|
||||||
return f"以下是一个数学问题,请直接给出最终答案。一定要把最终答案数字在最后输出。\n\n问题:{sample['question']}\n\n答案:"
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Model request ==========
def query_model(prompt):
    """Send *prompt* to the chat endpoint and return the stripped reply text.

    Any exception is captured and returned as an ``[Error] ...`` string so a
    single failed request cannot abort the whole evaluation run.
    """
    chat_messages = [
        {"role": "system", "content": "你是一个数学专家,擅长严谨地解答数学问题。"},
        {"role": "user", "content": prompt},
    ]
    try:
        reply = client.chat.completions.create(
            model=model_name,
            messages=chat_messages,
            temperature=1.0,
            top_p=0.8,
            max_tokens=max_tokens,
        )
        return reply.choices[0].message.content.strip()
    except Exception as exc:
        return f"[Error] {exc}"
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Evaluation ==========
def evaluate_sample(sample):
    """Run one dataset sample through the model and score it.

    Returns a dict holding the question, the extracted ground-truth and
    model answers, the raw strings they were extracted from, and the
    correctness verdict.
    """
    model_output = query_model(build_prompt(sample))
    gt_value = extract_gt_answer(sample["answer"])
    pred_value = extract_model_answer(model_output)
    return {
        "question": sample["question"],
        "gt_answer": gt_value,
        "model_answer": pred_value,
        "raw_gt_answer": sample["answer"],
        "raw_model_output": model_output,
        "is_correct": is_answer_equal(pred_value, gt_value),
    }
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Main flow ==========
# Run the full evaluation `times` times and gate on the mean accuracy.

acc = []
times = 3

for i in range(times):
    correct = 0
    total = 0
    results = []

    print(f"🚀 Starting evaluation with {max_workers} threads...")

    # Fan the samples out over a thread pool; results arrive in completion
    # order, which is fine because accuracy is order-independent.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_sample, sample) for sample in dataset]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Evaluating"):
            result = future.result()
            results.append(result)
            total += 1
            if result["is_correct"]:
                correct += 1
            else:
                # Log every miss in full so CI output is debuggable.
                print("\n❌ Wrong prediction:")
                print(f"Q: {result['question']}")
                print(f"GT: {result['gt_answer']}")
                print(f"Model: {result['model_answer']}")
                print(f"Full GT: {result['raw_gt_answer']}")
                print(f"Model Output: {result['raw_model_output']}")

    # ========== Report per-run accuracy ==========
    accuracy = correct / total * 100 if total > 0 else 0.0
    print(f"\n🎯 Evaluation Complete: Accuracy = {accuracy:.2f}% ({correct}/{total})")
    acc.append(accuracy)

# Mean accuracy across runs, converted from a percentage back to a fraction
# so it is comparable with the BASELINE values.
avg_acc = round(sum(acc) / times / 100, 4)
print(f"平均准确率:{avg_acc * 100:.2f}%")

# Gate: fail the script (non-zero exit) when the mean accuracy drifts from
# the baseline by more than the tolerance.
assert (
    abs(avg_acc - baseline) <= atol
), f"模型准确率 {avg_acc:.2f} 与基准 {baseline:.2f} 相差 {abs(avg_acc - baseline):.2f},超出容忍范围 {atol:.2f}"

# with open("eval_result_math.json", "w", encoding="utf-8") as f:
#     json.dump(results, f, indent=2, ensure_ascii=False)
|
|
Reference in New Issue
Block a user