add accuracy check ci (#3389)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled

* add accuracy ci

* fix

* fix

* update

* rename ci jobs
This commit is contained in:
YUNSHEN XIE
2025-08-15 15:17:43 +08:00
committed by GitHub
parent 4bd6a9fa7d
commit cc8ee50f27
5 changed files with 188 additions and 190 deletions

174
.github/workflows/_accuracy_test.yml vendored Normal file
View File

@@ -0,0 +1,174 @@
# Reusable workflow: runs FastDeploy accuracy tests (GSM8K) inside a GPU
# container on a self-hosted runner. Invoked via `workflow_call` from the CI
# pipeline with the code archive and wheel produced by earlier jobs.
name: Accuracy Test
description: "Run Accuracy Tests"

on:
  workflow_call:
    inputs:
      DOCKER_IMAGE:
        description: "Build Images"
        required: true
        type: string
        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
      FASTDEPLOY_ARCHIVE_URL:
        description: "URL of the compressed FastDeploy code archive."
        required: true
        type: string
      FASTDEPLOY_WHEEL_URL:
        description: "URL of the FastDeploy Wheel."
        required: true
        type: string
      CACHE_DIR:
        description: "Cache Dir Use"
        required: false
        type: string
        default: ""
      MODEL_CACHE_DIR:
        # NOTE: was "Cache Dir Use" — copy-paste from CACHE_DIR; corrected.
        description: "Model Cache Dir Use"
        required: false
        type: string
        default: ""

jobs:
  accuracy_tests:
    runs-on: [self-hosted, GPU-h20-1Cards]
    steps:
      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
        run: |
          set -x
          REPO="https://github.com/${{ github.repository }}.git"
          FULL_REPO="${{ github.repository }}"
          REPO_NAME="${FULL_REPO##*/}"
          BASE_BRANCH="${{ github.base_ref }}"
          # Clean the repository directory before starting (done inside the
          # container so root-owned files from previous runs can be removed).
          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
            -e "REPO_NAME=${REPO_NAME}" \
            ${docker_image} /bin/bash -c '
            if [ -d ${REPO_NAME} ]; then
              echo "Directory ${REPO_NAME} exists, removing it..."
              rm -rf ${REPO_NAME}*
            fi
          '
          wget -q ${fd_archive_url}
          tar -xf FastDeploy.tar.gz
          rm -rf FastDeploy.tar.gz
          cd FastDeploy
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git log -n 3 --oneline
      - name: Run FastDeploy Base Tests
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
          CACHE_DIR: ${{ inputs.CACHE_DIR }}
          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
        run: |
          runner_name="${{ runner.name }}"
          # Derive the GPU card id from the runner-name suffix and compute a
          # per-card port block so concurrent jobs on one host do not collide.
          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)

          FLASK_PORT=$((42068 + DEVICE_PORT * 100))
          FD_API_PORT=$((42088 + DEVICE_PORT * 100))
          FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
          FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
          echo "Test ENV Parameter:"
          echo "========================================================="
          echo "FLASK_PORT=${FLASK_PORT}"
          echo "FD_API_PORT=${FD_API_PORT}"
          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
          echo "DEVICES=${DEVICES}"
          echo "========================================================="

          # Default the cache dir to the runner's parent directory.
          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
          echo "CACHE_DIR is set to ${CACHE_DIR}"
          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
            touch "${CACHE_DIR}/gitconfig"
          fi
          if [ ! -d "${MODEL_CACHE_DIR}" ]; then
            echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
            exit 1
          fi

          # Kill any stale processes still holding our ports from a previous run.
          PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
          echo "==== LOG_FILE is ${LOG_FILE} ===="
          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
          for port in "${PORTS[@]}"; do
            PIDS=$(lsof -t -i :$port || true)
            if [ -n "$PIDS" ]; then
              echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
              echo "$PIDS" | xargs -r kill -9
              echo "Port $port cleared" | tee -a $LOG_FILE
            else
              echo "Port $port is free" | tee -a $LOG_FILE
            fi
          done
          echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE

          docker run --rm --ipc=host --pid=host --net=host \
            -v $(pwd):/workspace \
            -w /workspace \
            -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
            -e "FD_API_PORT=${FD_API_PORT}" \
            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
            -e "FLASK_PORT=${FLASK_PORT}" \
            -v "${MODEL_CACHE_DIR}:/MODELDATA" \
            -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
            -v "${CACHE_DIR}/.cache:/root/.cache" \
            -v "${CACHE_DIR}/ConfigDir:/root/.config" \
            -e TZ="Asia/Shanghai" \
            --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
            python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
            python -m pip install ${fastdeploy_wheel_url}
            python -m pip install pytest

            wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
            chmod +x ./llm-deploy-linux-amd64
            ./llm-deploy-linux-amd64 -python python3.10 \
              -model_name ERNIE-4.5-0.3B-Paddle \
              -model_path /MODELDATA \
              --skip install

            git config --global --add safe.directory /workspace/FastDeploy
            cd FastDeploy
            pushd test/ce/deploy
            python3.10 deploy.py > dd.log 2>&1 &
            sleep 3
            curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
              -H "Content-Type: application/json" \
              -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"

            curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
            popd

            pushd test/ce/accuracy_cases
            export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
            export TEMPLATE=TOKEN_LOGPROB
            export MODEL_SIZE=0.3B
            TEST_EXIT_CODE=0
            python gsm8k.py || TEST_EXIT_CODE=1
            popd
            echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
          '
          if [ -f ./FastDeploy/exit_code.env ]; then
            source ./FastDeploy/exit_code.env
            cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
          fi
          # Fail closed: if the container died before writing exit_code.env,
          # TEST_EXIT_CODE would be unset and a bare `exit` would report the
          # status of the preceding echo (0), passing the step falsely.
          TEST_EXIT_CODE="${TEST_EXIT_CODE:-1}"
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
          exit ${TEST_EXIT_CODE}

View File

@@ -73,3 +73,13 @@ jobs:
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
accuracy_test:
name: Run Accuracy Tests
needs: [clone,build]
uses: ./.github/workflows/_accuracy_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

View File

@@ -18,11 +18,13 @@ BASELINE = {
"21B": 0.49,
"300B": 0.96,
}
baseline = BASELINE.get(os.environ.get("MODEL"), None)
baseline = BASELINE.get(os.environ.get("MODEL_SIZE"), None)
base_url = os.environ.get("URL", None)
atol = 0.03
if baseline is None:
raise ValueError(f"Invalid MODEL value '{os.environ.get('MODEL')}', expected one of {list(BASELINE.keys())}")
raise ValueError(
f"Invalid MODEL_SIZE value '{os.environ.get('MODEL_SIZE')}', expected one of {list(BASELINE.keys())}"
)
if base_url is None:
raise ValueError(
"Environment variable 'URL' is not set. "

Binary file not shown.

View File

@@ -1,188 +0,0 @@
#!/bin/env python3
# -*- coding: utf-8 -*-
# @author DDDivano
# encoding=utf-8 vi:ts=4:sw=4:expandtab:ft=python
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, urlunparse
import openai
from datasets import load_dataset
from tqdm import tqdm
# Accuracy baselines keyed by model size; the MODEL env var selects one.
BASELINE = {
    "0.3B": 0.05,
    "21B": 0.49,
    "300B": 0.96,
}

baseline = BASELINE.get(os.environ.get("MODEL"))
base_url = os.environ.get("URL")
atol = 0.03  # tolerated absolute deviation from the baseline accuracy

# Fail fast with an actionable message when required env vars are missing.
if baseline is None:
    raise ValueError(f"Invalid MODEL value '{os.environ.get('MODEL')}', expected one of {list(BASELINE.keys())}")
if base_url is None:
    raise ValueError(
        "Environment variable 'URL' is not set. "
        "Please specify the inference service address, e.g., 'http://localhost:8191/v1'."
    )
def strip_path_suffix(url: str, suffix: str = "chat/completions") -> str:
    """
    Remove a trailing path suffix (e.g. "chat/completions") from a URL.

    Only the path component is modified; scheme and host are preserved,
    while params/query/fragment are dropped from the rebuilt URL. Any
    trailing slash on the resulting path is stripped as well.
    """
    parts = urlparse(url)
    path = parts.path
    # Strip only when the suffix forms a whole trailing segment ("/<suffix>").
    if path.endswith("/" + suffix):
        path = path[: -(len(suffix) + 1)]  # +1 accounts for the joining slash
    # Rebuild without params/query/fragment and without a trailing slash.
    return urlunparse((parts.scheme, parts.netloc, path.rstrip("/"), "", "", ""))
# ========== OpenAI client configuration ==========
client = openai.OpenAI(
    api_key="DDDivano",
    # base_url="http://占位:8187/v1"
    base_url=strip_path_suffix(base_url),
)

model_name = "eb"  # served model alias used in chat requests
max_samples = 690  # cap on evaluated dataset rows
max_tokens = 12288  # generation budget per request
max_workers = 33  # concurrent request threads

# ========== Load the dataset ==========
dataset = load_dataset("parquet", data_files="gsm8k.parquet", split="train")
dataset = dataset.select(range(min(len(dataset), max_samples)))
# ========== Extract the final "#### <number>" answer from the ground truth ==========
def extract_gt_answer(text):
    """Pull the final numeric answer out of a GSM8K ground-truth string.

    GSM8K answers end with a marker like ``#### 1,234``; returns that number
    with thousands separators removed, or None when no marker is present.
    """
    found = re.search(r"####\s*([\d,]+(?:\.\d+)?)", text)
    if found is None:
        return None
    return found.group(1).replace(",", "").strip()
# ========== Extract the number from the last line of the model output ==========
def extract_model_answer(text):
    """Return the first number found on the final line of the model output.

    Commas and dollar signs are stripped first; returns None for empty input
    or when the final line contains no number.
    """
    if not text:
        return None
    cleaned = text.replace(",", "").replace("$", "")
    rows = cleaned.strip().splitlines()
    tail = rows[-1] if rows else cleaned
    hit = re.search(r"-?\d+(?:\.\d+)?", tail)
    return None if hit is None else hit.group(0)
# ========== Numeric answer comparison ==========
def is_answer_equal(pred, gt, tol=1e-6):
    """Compare predicted and ground-truth answers numerically within ``tol``.

    Falls back to plain equality when either value cannot be parsed as a
    float. Returns False when either side is None.
    """
    if pred is None or gt is None:
        return False
    try:
        return abs(float(pred) - float(gt)) < tol
    # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit;
    # only parsing failures should trigger the string-equality fallback.
    except (TypeError, ValueError):
        return pred == gt
# ========== Build the prompt ==========
def build_prompt(sample):
    """Wrap a GSM8K question into the Chinese math-QA prompt template."""
    template = "以下是一个数学问题,请直接给出最终答案。一定要把最终答案数字在最后输出。\n\n问题:{question}\n\n答案:"
    return template.format(question=sample["question"])
# ========== Model request helper ==========
def query_model(prompt):
    """Send one chat-completion request for ``prompt``.

    Returns the stripped reply text, or an "[Error] ..." string when the
    request (or response parsing) fails, so callers never see an exception.
    """
    chat_messages = [
        {"role": "system", "content": "你是一个数学专家,擅长严谨地解答数学问题。"},
        {"role": "user", "content": prompt},
    ]
    try:
        reply = client.chat.completions.create(
            model=model_name,
            messages=chat_messages,
            temperature=1.0,
            top_p=0.8,
            max_tokens=max_tokens,
        )
        return reply.choices[0].message.content.strip()
    except Exception as exc:
        return f"[Error] {exc}"
# ========== Per-sample evaluation ==========
def evaluate_sample(sample):
    """Run one dataset sample end-to-end: prompt -> model -> parse -> compare.

    Returns a dict with the parsed answers, the raw texts, and an
    ``is_correct`` flag.
    """
    model_output = query_model(build_prompt(sample))
    gt_value = extract_gt_answer(sample["answer"])
    pred_value = extract_model_answer(model_output)
    return {
        "question": sample["question"],
        "gt_answer": gt_value,
        "model_answer": pred_value,
        "raw_gt_answer": sample["answer"],
        "raw_model_output": model_output,
        "is_correct": is_answer_equal(pred_value, gt_value),
    }
# ========== Main flow ==========
# Run the full evaluation `times` rounds and compare the mean accuracy
# against the baseline for this model size.
acc = []
times = 3
for round_idx in range(times):
    correct = 0
    total = 0
    results = []
    print(f"🚀 Starting evaluation with {max_workers} threads...")
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(evaluate_sample, sample) for sample in dataset]
        for done in tqdm(as_completed(pending), total=len(pending), desc="Evaluating"):
            outcome = done.result()
            results.append(outcome)
            total += 1
            if outcome["is_correct"]:
                correct += 1
            else:
                print("\n❌ Wrong prediction:")
                print(f"Q: {outcome['question']}")
                print(f"GT: {outcome['gt_answer']}")
                print(f"Model: {outcome['model_answer']}")
                print(f"Full GT: {outcome['raw_gt_answer']}")
                print(f"Model Output: {outcome['raw_model_output']}")

    # ========== Report per-round accuracy ==========
    accuracy = correct / total * 100 if total > 0 else 0.0
    print(f"\n🎯 Evaluation Complete: Accuracy = {accuracy:.2f}% ({correct}/{total})")
    acc.append(accuracy)

# Mean accuracy as a fraction, rounded to 4 places.
avg_acc = round(sum(acc) / times / 100, 4)
print(f"平均准确率:{avg_acc * 100:.2f}%")
assert (
    abs(avg_acc - baseline) <= atol
), f"模型准确率 {avg_acc:.2f} 与基准 {baseline:.2f} 相差 {abs(avg_acc - baseline):.2f},超出容忍范围 {atol:.2f}"
# with open("eval_result_math.json", "w", encoding="utf-8") as f:
#     json.dump(results, f, indent=2, ensure_ascii=False)