mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
add accuracy check ci (#3389)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add accuracy ci * fix * fix * update * rename ci jobs
This commit is contained in:
174
.github/workflows/_accuracy_test.yml
vendored
Normal file
174
.github/workflows/_accuracy_test.yml
vendored
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
name: Accuracy Test
description: "Run Accuracy Tests"

# Reusable workflow: callers invoke it via `uses:` with `workflow_call` inputs.
on:
  workflow_call:
    inputs:
      DOCKER_IMAGE:
        description: "Build Images"
        required: true
        type: string
        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
      FASTDEPLOY_ARCHIVE_URL:
        description: "URL of the compressed FastDeploy code archive."
        required: true
        type: string
      FASTDEPLOY_WHEEL_URL:
        description: "URL of the FastDeploy Wheel."
        required: true
        type: string
      CACHE_DIR:
        description: "Cache Dir Use"
        required: false
        type: string
        default: ""
      MODEL_CACHE_DIR:
        description: "Cache Dir Use"
        required: false
        type: string
        default: ""

jobs:
  accuracy_tests:
    runs-on: [self-hosted, GPU-h20-1Cards]
    steps:
      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
        run: |
          set -x
          REPO="https://github.com/${{ github.repository }}.git"
          FULL_REPO="${{ github.repository }}"
          REPO_NAME="${FULL_REPO##*/}"
          BASE_BRANCH="${{ github.base_ref }}"

          # Clean the repository directory before starting
          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
          -e "REPO_NAME=${REPO_NAME}" \
          ${docker_image} /bin/bash -c '
          if [ -d ${REPO_NAME} ]; then
            echo "Directory ${REPO_NAME} exists, removing it..."
            rm -rf ${REPO_NAME}*
          fi
          '

          # Fetch the pre-packaged source archive instead of cloning.
          wget -q ${fd_archive_url}
          tar -xf FastDeploy.tar.gz
          rm -rf FastDeploy.tar.gz
          cd FastDeploy
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git log -n 3 --oneline

      - name: Run FastDeploy Base Tests
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
          CACHE_DIR: ${{ inputs.CACHE_DIR }}
          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
        run: |
          # The runner name ends in the assigned card id(s); derive the GPU
          # device list and a per-device port offset from it so parallel jobs
          # on the same host do not collide.
          runner_name="${{ runner.name }}"
          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
          DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
          DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)

          # Disjoint port block per device.
          FLASK_PORT=$((42068 + DEVICE_PORT * 100))
          FD_API_PORT=$((42088 + DEVICE_PORT * 100))
          FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
          FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
          echo "Test ENV Parameter:"
          echo "========================================================="
          echo "FLASK_PORT=${FLASK_PORT}"
          echo "FD_API_PORT=${FD_API_PORT}"
          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
          echo "DEVICES=${DEVICES}"
          echo "========================================================="

          # Default cache dir: two levels above the workspace on the runner.
          CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
          echo "CACHE_DIR is set to ${CACHE_DIR}"
          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
            touch "${CACHE_DIR}/gitconfig"
          fi
          if [ ! -d "${MODEL_CACHE_DIR}" ]; then
            echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
            exit 1
          fi

          # Kill anything still listening on our ports from a previous run.
          PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
          LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
          echo "==== LOG_FILE is ${LOG_FILE} ===="

          echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE

          for port in "${PORTS[@]}"; do
            PIDS=$(lsof -t -i :$port || true)
            if [ -n "$PIDS" ]; then
              echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
              echo "$PIDS" | xargs -r kill -9
              echo "Port $port cleared" | tee -a $LOG_FILE
            else
              echo "Port $port is free" | tee -a $LOG_FILE
            fi
          done

          echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE

          # Run the deployment + accuracy suite inside the CI image.
          docker run --rm --ipc=host --pid=host --net=host \
          -v $(pwd):/workspace \
          -w /workspace \
          -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
          -e "FD_API_PORT=${FD_API_PORT}" \
          -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
          -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
          -e "FLASK_PORT=${FLASK_PORT}" \
          -v "${MODEL_CACHE_DIR}:/MODELDATA" \
          -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
          -v "${CACHE_DIR}/.cache:/root/.cache" \
          -v "${CACHE_DIR}/ConfigDir:/root/.config" \
          -e TZ="Asia/Shanghai" \
          --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
          python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/

          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

          python -m pip install ${fastdeploy_wheel_url}
          python -m pip install pytest

          # llm-deploy drives model deployment; --skip install because the
          # wheel was installed above.
          wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
          chmod +x ./llm-deploy-linux-amd64
          ./llm-deploy-linux-amd64 -python python3.10 \
          -model_name ERNIE-4.5-0.3B-Paddle \
          -model_path /MODELDATA \
          --skip install

          git config --global --add safe.directory /workspace/FastDeploy
          cd FastDeploy
          pushd test/ce/deploy
          python3.10 deploy.py > dd.log 2>&1 &
          sleep 3
          curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
          -H "Content-Type: application/json" \
          -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"

          curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
          popd

          # gsm8k.py reads URL and MODEL_SIZE from the environment.
          pushd test/ce/accuracy_cases
          export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
          export TEMPLATE=TOKEN_LOGPROB
          export MODEL_SIZE=0.3B
          TEST_EXIT_CODE=0
          python gsm8k.py || TEST_EXIT_CODE=1
          popd
          # Propagate the test result out of the container via a file.
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
          '
          if [ -f ./FastDeploy/exit_code.env ]; then
            source ./FastDeploy/exit_code.env
            cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
          fi
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
          exit ${TEST_EXIT_CODE}
|
10
.github/workflows/pr_build_and_test.yml
vendored
10
.github/workflows/pr_build_and_test.yml
vendored
@@ -73,3 +73,13 @@ jobs:
|
|||||||
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
|
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
|
||||||
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
|
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
|
||||||
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
|
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
|
||||||
|
|
||||||
|
  # Accuracy gate: reuses _accuracy_test.yml once the source archive (clone)
  # and the wheel (build) are available.
  accuracy_test:
    name: Run Accuracy Tests
    needs: [clone,build]
    uses: ./.github/workflows/_accuracy_test.yml
    with:
      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
|
||||||
|
@@ -18,11 +18,13 @@ BASELINE = {
|
|||||||
"21B": 0.49,
|
"21B": 0.49,
|
||||||
"300B": 0.96,
|
"300B": 0.96,
|
||||||
}
|
}
|
||||||
baseline = BASELINE.get(os.environ.get("MODEL"), None)
|
baseline = BASELINE.get(os.environ.get("MODEL_SIZE"), None)
|
||||||
base_url = os.environ.get("URL", None)
|
base_url = os.environ.get("URL", None)
|
||||||
atol = 0.03
|
atol = 0.03
|
||||||
if baseline is None:
|
if baseline is None:
|
||||||
raise ValueError(f"Invalid MODEL value '{os.environ.get('MODEL')}', expected one of {list(BASELINE.keys())}")
|
raise ValueError(
|
||||||
|
f"Invalid MODEL_SIZE value '{os.environ.get('MODEL_SIZE')}', expected one of {list(BASELINE.keys())}"
|
||||||
|
)
|
||||||
if base_url is None:
|
if base_url is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Environment variable 'URL' is not set. "
|
"Environment variable 'URL' is not set. "
|
||||||
|
Binary file not shown.
@@ -1,188 +0,0 @@
|
|||||||
#!/bin/env python3
# -*- coding: utf-8 -*-
# @author DDDivano
# encoding=utf-8 vi:ts=4:sw=4:expandtab:ft=python
"""GSM8K accuracy gate: query a deployed model and compare accuracy to a baseline."""

import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, urlunparse

import openai
from datasets import load_dataset
from tqdm import tqdm

# Expected GSM8K accuracy (fraction) for each supported model size.
BASELINE = {
    "0.3B": 0.05,
    "21B": 0.49,
    "300B": 0.96,
}
# The CI workflow exports MODEL_SIZE (see _accuracy_test.yml); the previous
# lookup read the never-set MODEL variable, so the baseline was always None.
baseline = BASELINE.get(os.environ.get("MODEL_SIZE"), None)
base_url = os.environ.get("URL", None)
# Absolute tolerance allowed between measured accuracy and the baseline.
atol = 0.03
if baseline is None:
    raise ValueError(
        f"Invalid MODEL_SIZE value '{os.environ.get('MODEL_SIZE')}', expected one of {list(BASELINE.keys())}"
    )
if base_url is None:
    raise ValueError(
        "Environment variable 'URL' is not set. "
        "Please specify the inference service address, e.g., 'http://localhost:8191/v1'."
    )
|
|
||||||
|
|
||||||
|
|
||||||
def strip_path_suffix(url: str, suffix: str = "chat/completions") -> str:
    """Drop a trailing path segment (default ``chat/completions``) from *url*.

    The params, query string and fragment are discarded, and any trailing
    slashes are trimmed from the remaining path before the URL is rebuilt.
    """
    parts = urlparse(url)
    path = parts.path
    tail = "/" + suffix
    # Only strip when the suffix sits at the very end of the path.
    if path.endswith(tail):
        path = path[: len(path) - len(tail)]
    return urlunparse((parts.scheme, parts.netloc, path.rstrip("/"), "", "", ""))
|
|
||||||
|
|
||||||
|
|
||||||
# ========== OpenAI client configuration ==========
client = openai.OpenAI(
    # NOTE(review): key appears to be a dummy value; presumably the service
    # does not validate it -- confirm against the deployment.
    api_key="DDDivano",
    # base_url="http://<placeholder>:8187/v1"
    # URL env var may include a trailing /chat/completions; strip it so the
    # SDK can append endpoint paths itself.
    base_url=strip_path_suffix(base_url),
)

model_name = "eb"      # model identifier sent with each request
max_samples = 690      # cap on how many GSM8K samples are evaluated
max_tokens = 12288     # generation budget per request
max_workers = 33       # concurrent request threads

# ========== Load the dataset ==========
dataset = load_dataset("parquet", data_files="gsm8k.parquet", split="train")
dataset = dataset.select(range(min(len(dataset), max_samples)))
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Extract the final '#### <number>' answer from the ground truth ==========
def extract_gt_answer(text):
    """Return the GSM8K final answer ('#### <number>') as a string, or None."""
    found = re.search(r"####\s*([\d,]+(?:\.\d+)?)", text)
    return found.group(1).replace(",", "").strip() if found else None
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Extract the number from the last line of the model output ==========
def extract_model_answer(text):
    """Return the first number in the final line of *text*, or None.

    Commas and dollar signs are removed before matching so formatted
    amounts like '$1,234.50' are parsed as plain numbers.
    """
    if not text:
        return None
    cleaned = text.replace(",", "").replace("$", "")
    rows = cleaned.strip().splitlines()
    tail = rows[-1] if rows else cleaned
    found = re.search(r"-?\d+(?:\.\d+)?", tail)
    return found.group(0) if found else None
|
|
||||||
|
|
||||||
|
|
||||||
# ========== 数值比较函数 ==========
|
|
||||||
def is_answer_equal(pred, gt, tol=1e-6):
|
|
||||||
if pred is None or gt is None:
|
|
||||||
return False
|
|
||||||
try:
|
|
||||||
return abs(float(pred) - float(gt)) < tol
|
|
||||||
except:
|
|
||||||
return pred == gt
|
|
||||||
|
|
||||||
|
|
||||||
# ========== 构造 Prompt ==========
|
|
||||||
def build_prompt(sample):
|
|
||||||
return f"以下是一个数学问题,请直接给出最终答案。一定要把最终答案数字在最后输出。\n\n问题:{sample['question']}\n\n答案:"
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Model request ==========
def query_model(prompt):
    """Send *prompt* to the chat endpoint and return the stripped reply text.

    Any exception is captured and returned as an ``[Error] ...`` string so a
    single failed request cannot abort the whole evaluation run.
    """
    chat_messages = [
        {"role": "system", "content": "你是一个数学专家,擅长严谨地解答数学问题。"},
        {"role": "user", "content": prompt},
    ]
    try:
        reply = client.chat.completions.create(
            model=model_name,
            messages=chat_messages,
            temperature=1.0,
            top_p=0.8,
            max_tokens=max_tokens,
        )
        return reply.choices[0].message.content.strip()
    except Exception as exc:
        return f"[Error] {exc}"
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Evaluation ==========
def evaluate_sample(sample):
    """Run one dataset sample through the model and score it.

    Returns a dict holding the question, the extracted ground-truth and
    model answers, the raw strings they were extracted from, and the
    correctness verdict.
    """
    model_output = query_model(build_prompt(sample))
    gt_value = extract_gt_answer(sample["answer"])
    pred_value = extract_model_answer(model_output)
    return {
        "question": sample["question"],
        "gt_answer": gt_value,
        "model_answer": pred_value,
        "raw_gt_answer": sample["answer"],
        "raw_model_output": model_output,
        "is_correct": is_answer_equal(pred_value, gt_value),
    }
|
|
||||||
|
|
||||||
|
|
||||||
# ========== Main flow ==========
# Run the full evaluation `times` times and gate on the mean accuracy.

acc = []
times = 3

for i in range(times):
    correct = 0
    total = 0
    results = []

    print(f"🚀 Starting evaluation with {max_workers} threads...")

    # Fan the samples out over a thread pool; results arrive in completion
    # order, which is fine because accuracy is order-independent.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(evaluate_sample, sample) for sample in dataset]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Evaluating"):
            result = future.result()
            results.append(result)
            total += 1
            if result["is_correct"]:
                correct += 1
            else:
                # Log every miss in full so CI output is debuggable.
                print("\n❌ Wrong prediction:")
                print(f"Q: {result['question']}")
                print(f"GT: {result['gt_answer']}")
                print(f"Model: {result['model_answer']}")
                print(f"Full GT: {result['raw_gt_answer']}")
                print(f"Model Output: {result['raw_model_output']}")

    # ========== Report per-run accuracy ==========
    accuracy = correct / total * 100 if total > 0 else 0.0
    print(f"\n🎯 Evaluation Complete: Accuracy = {accuracy:.2f}% ({correct}/{total})")
    acc.append(accuracy)

# Mean accuracy across runs, converted from a percentage back to a fraction
# so it is comparable with the BASELINE values.
avg_acc = round(sum(acc) / times / 100, 4)
print(f"平均准确率:{avg_acc * 100:.2f}%")

# Gate: fail the script (non-zero exit) when the mean accuracy drifts from
# the baseline by more than the tolerance.
assert (
    abs(avg_acc - baseline) <= atol
), f"模型准确率 {avg_acc:.2f} 与基准 {baseline:.2f} 相差 {abs(avg_acc - baseline):.2f},超出容忍范围 {atol:.2f}"

# with open("eval_result_math.json", "w", encoding="utf-8") as f:
#     json.dump(results, f, indent=2, ensure_ascii=False)
|
|
Reference in New Issue
Block a user