[Metax] update ci test (#5652)

This commit is contained in:
MingkunZhang
2025-12-19 17:25:47 +08:00
committed by GitHub
parent 669dfe8dca
commit 46d83be065
5 changed files with 569 additions and 6 deletions

259
scripts/run_ci_metax.sh Normal file
View File

@@ -0,0 +1,259 @@
#!/bin/bash
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
tests_path="$DIR/../tests/"
export PYTEST_INI="$DIR/../tests/cov_pytest.ini"
run_path=$( realpath "$DIR/../")
export COVERAGE_FILE=${COVERAGE_FILE:-$DIR/../coveragedata/.coverage}
export COVERAGE_RCFILE=${COVERAGE_RCFILE:-$DIR/../scripts/.coveragerc}
LOG_ROOT_PATH=${run_path}/metax_log
LOG_SUBDIR=${LOG_ROOT_PATH}/logs
LOG_RESULT_TMP=$(mktemp)
PASS_FILE_LIST=${LOG_ROOT_PATH}/passed_files.txt
FAIL_FILE_LIST=${LOG_ROOT_PATH}/failed_files.txt
SUMMARY_FILE_LIST=${LOG_ROOT_PATH}/summary.txt
trap 'rm -f "$LOG_RESULT_TMP"' EXIT
mkdir -p "$LOG_ROOT_PATH" "$LOG_SUBDIR"
OVERWRITE_OLD_RESULT="yes"
METAX_GPU_TARGET=C500
PARALLEL_NUM="4"
PYTEST_EXTRA_ARGS="${3:-}"
declare -a IGNORE_PATHS=()
while IFS= read -r line; do
if [ -z "$line" ]; then
continue
fi
path=$(echo "$line" | sed 's/^\s*--ignore=//')
if [ -n "$path" ]; then
IGNORE_PATHS+=("$path")
fi
done < <(grep -E '^\s*--ignore=' "$PYTEST_INI")
declare -A SEEN=()
declare -a EXCLUDE_PATHS=()
for path in "${IGNORE_PATHS[@]}"; do
if [[ -z "${SEEN[$path]}" ]]; then
SEEN[$path]=1
EXCLUDE_PATHS+=("$path")
fi
done
# declare -a CUSTOM_EXCLUDE_PATHS=(
# "tests/e2e"
# "tests/model_loader"
# "tests/pooling"
# "tests/entrypoints"
# )
# for path in "${CUSTOM_EXCLUDE_PATHS[@]}"; do
# if [[ -z "${SEEN[$path]}" ]]; then
# SEEN[$path]=1
# EXCLUDE_PATHS+=("$path")
# fi
# done
is_excluded() {
local target_path="$1"
for exclude in "${EXCLUDE_PATHS[@]}"; do
if [ -z "$exclude" ]; then
continue
fi
if [[ "$target_path" == *"$exclude"* ]]; then
return 0
fi
done
return 1
}
# FIND_PATTERN="test_*.py"
# declare -a ALL_PATHS=()
# while IFS= read -r path; do
# [[ -n "$path" ]] && ALL_PATHS+=("$path")
# done < <(find "${tests_path}" -type f -name "$FIND_PATTERN" | sort | uniq)
declare -a FILTERED_PATHS=()
METAX_CI_CASELIST=(
"tests/metax_ci/test_fused_moe.py"
"tests/operators/test_limit_thinking_content_length.py"
"tests/operators/test_update_inputs_v1.py"
"tests/operators/test_set_value_by_flags_and_idx.py"
"tests/operators/test_get_token_penalty_multi_scores.py"
"tests/operators/test_speculate_get_token_penalty_multi_scores.py"
"tests/operators/test_token_penalty.py"
"tests/operators/test_stop_generation_multi_ends.py"
"tests/operators/test_get_padding_offset.py"
"tests/operators/test_speculate_get_padding_offset.py"
"tests/operators/test_rebuild_padding.py"
"tests/operators/test_share_external_data.py"
"tests/operators/test_rejection_top_p_sampling.py"
"tests/layers/test_min_sampling.py"
)
for path in "${METAX_CI_CASELIST[@]}"; do
FILTERED_PATHS+=("$run_path/$path")
# if ! is_excluded "$path"; then
# FILTERED_PATHS+=("$run_path/$path")
# fi
done
echo -e "\n================== Metax CI test total num ( ${#FILTERED_PATHS[@]} ) =================="
if [ "$OVERWRITE_OLD_RESULT" = "yes" ]; then
> "${SUMMARY_FILE_LIST}"
> "${PASS_FILE_LIST}"
> "${FAIL_FILE_LIST}"
rm -f "${LOG_SUBDIR}"/*.log
else
echo -e "\n================== $(date +%Y-%m-%d_%H:%M:%S) =================" >> "${SUMMARY_FILE_LIST}"
fi
get_max_free_gpu() {
local log=$(mx-smi)
local gpu_lines=$(echo "$log" | grep -E "MetaX ${METAX_GPU_TARGET}|MiB" | grep -v "Process" | grep -v "Board Name")
local GPU_INFO=()
local current_gpu_idx=""
while IFS= read -r line; do
if echo "$line" | grep -q "MetaX ${METAX_GPU_TARGET}"; then
current_gpu_idx=$(echo "$line" | awk '{print $2}' | grep -E '^[0-9]+$')
elif echo "$line" | grep -q "MiB"; then
if [ -n "$current_gpu_idx" ]; then
mem_used=$(echo "$line" | awk '{for(i=1;i<=NF;i++){if($i ~ /MiB/){split($(i-1),mem,"/");print mem[1]}}}' )
mem_total=$(echo "$line" | awk '{for(i=1;i<=NF;i++){if($i ~ /MiB/){split($(i-1),mem,"/");print mem[2]}}}' )
mem_free=$((mem_total - mem_used))
GPU_INFO+=("$current_gpu_idx:$mem_used:$mem_total:$mem_free")
# echo "$current_gpu_idx - ${mem_used}"
current_gpu_idx=""
fi
fi
done <<< "$gpu_lines"
gpu_mem_info=${GPU_INFO[@]}
local sorted_gpus=$(echo "$gpu_mem_info" | tr ' ' '\n' | sort -t ':' -k4,4nr -k1,1n)
echo "${sorted_gpus}"
local count=0
local top_n=1
local gpu_list=""
while IFS= read -r gpu && [ $count -lt $top_n ]; do
if [ -n "$gpu" ]; then
local gpu_idx=$(echo "$gpu" | cut -d ':' -f 1)
local gpu_free=$(echo "$gpu" | cut -d ':' -f 4)
gpu_list="$gpu_list,$gpu_idx"
count=$((count + 1))
fi
done <<< "$sorted_gpus"
gpu_list=${gpu_list:1:${#gpu_list}}
echo "$gpu_list"
}
test_env_init() {
export MACA_PATH=/opt/maca
if [ ! -d ${HOME}/cu-bridge ]; then
`${MACA_PATH}/tools/cu-bridge/tools/pre_make`
fi
export CUDA_PATH=${HOME}/cu-bridge/CUDA_DIR
export LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:$LD_LIBRARY_PATH
export PADDLE_XCCL_BACKEND=metax_gpu
export FLAGS_weight_only_linear_arch=80
export FD_MOE_BACKEND=cutlass # or triton
export FD_METAX_KVCACHE_MEM=8
export FD_ENC_DEC_BLOCK_NUM=2
export MACA_VISIBLE_DEVICES=$(get_max_free_gpu)
}
test_single_file() {
test_env_init
local file="$1"
local extra_args="$2"
local log_file="${LOG_SUBDIR}/$(basename "$file").log"
local start_time=$(date +%s)
pytest "$file" $extra_args > "$log_file" 2>&1
local exit_code=$?
local end_time=$(date +%s)
local cost_time=$((end_time - start_time))
echo "$file,$exit_code,$cost_time" >> "$LOG_RESULT_TMP"
if [ $exit_code -eq 0 ]; then
echo "$(basename "$file") passed (cost: ${cost_time}s)"
else
echo "$(basename "$file") failed (cost: ${cost_time}s, exit code: $exit_code)"
fi
}
export -f get_max_free_gpu
export -f test_env_init
export -f test_single_file
export LOG_RESULT_TMP PYTEST_EXTRA_ARGS LOG_SUBDIR
# if [ "$PARALLEL_NUM" = "auto" ]; then
# PARALLEL_NUM=$(nproc --all 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
# fi
printf "%s\n" "${FILTERED_PATHS[@]}" | xargs -I {} -P "$PARALLEL_NUM" bash -c 'test_single_file "{}" "$PYTEST_EXTRA_ARGS"'
PASS_COUNT=0
FAIL_COUNT=0
TOTAL_COST_TIME=0
declare -a FAIL_FILES=()
while IFS=, read -r file exit_code cost_time; do
TOTAL_COST_TIME=$((TOTAL_COST_TIME + cost_time))
if [ "$exit_code" -eq 0 ]; then
PASS_COUNT=$((PASS_COUNT + 1))
echo "$file" >> ${PASS_FILE_LIST}
else
FAIL_COUNT=$((FAIL_COUNT + 1))
FAIL_FILES+=$(basename "$file")
echo "$file" >> ${FAIL_FILE_LIST}
fi
done < "$LOG_RESULT_TMP"
{
echo "============================= TEST RESULT SUMMARY ================================"
echo "Pytest date: $(date +%Y-%m-%d\ %H:%M:%S)"
echo "Pytest parallel num: $PARALLEL_NUM"
echo "Pytest extra args: $PYTEST_EXTRA_ARGS"
echo "Pytest total num: $((PASS_COUNT + FAIL_COUNT))"
echo "Pytest successful: $PASS_COUNT"
echo "Pytest failed: $FAIL_COUNT"
echo "Pytest all cost: $TOTAL_COST_TIME s"
echo "=================================================================================="
} >> "${SUMMARY_FILE_LIST}"
cat ${SUMMARY_FILE_LIST}
if [ "$FAIL_COUNT" -ne 0 ]; then
# echo "Failed test cases are listed in $failed_tests_file"
# cat "$FAIL_FILE_LIST"
echo ${FAIL_FILES[@]}
exit 1
fi