Files
FastDeploy/scripts/get_rdma_nics.sh
2025-07-19 23:19:27 +08:00

226 lines
7.8 KiB
Bash

#!/bin/bash
Cur_Dir=$(cd `dirname $0`; pwd)
NICNAME_TYPE=xgbe # 默认检测类型
type=$1
if [ "$ENABLE_EP_DP" == "1" ]; then
gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}.txt"
else
gpu_root_port_filename="${Cur_Dir}/gpu_rootport.txt"
fi
function __NEW_GPU_ROOTPORT_FILE__() {
touch ${gpu_root_port_filename} 2>/dev/null
echo "" > ${gpu_root_port_filename} 2>/dev/null
for gpu_bus in $(lspci 2>/dev/null | grep -iE "Communication controller: | controller: NVIDIA" | awk '{print $1}')
do
readlink "/sys/bus/pci/devices/0000:${gpu_bus}" 2>/dev/null | awk -F [/] '{print $6}' >> ${gpu_root_port_filename}
done
}
function __RM_GPU_ROOTPORT_FILE__() {
rm -rf ${gpu_root_port_filename} 2>/dev/null
}
function __JUDGE_NIC_TYPE__() {
XGBE_NUM=$(ip a 2>/dev/null | grep -c ": ${NICNAME_TYPE}")
gpu_first=true
xpu_first=true
cpu_first=true
for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ ))
do
[ ! -d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue
PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}')
[ -z "$PCI_ADDRESS" ] && continue
NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}')
NIC_TYPE="CPU_NIC"
grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && NIC_TYPE="GPU_NIC"
if [[ "$type" == "gpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then
ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}')
if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then
if $gpu_first; then
printf "KVCACHE_RDMA_NICS=%s" "$ibdev"
gpu_first=false
else
printf ",%s" "$ibdev"
fi
fi
fi
if [[ "$type" == "xpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then
ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}')
if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then
if $xpu_first; then
printf "KVCACHE_RDMA_NICS=%s,%s" "$ibdev" "$ibdev"
xpu_first=false
else
printf ",%s,%s" "$ibdev" "$ibdev"
fi
fi
fi
if [[ "$type" == "cpu" ]]; then
for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ ))
do
[ ! -d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue
PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}')
[ -z "$PCI_ADDRESS" ] && continue
NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}')
grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && continue
if ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \
ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet"; then
printf "KV_CACHE_SOCKET_IFNAME=%s\n" "${NICNAME_TYPE}${xgbe_no}"
return 0
fi
done
echo "ERROR: No active CPU NIC with IP found!" >&2
return 1
fi
if [[ "$type" == "cpu_ib" && "$NIC_TYPE" == "CPU_NIC" ]]; then
ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}')
if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \
ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet "; then
if $cpu_ib_first; then
printf "KVCACHE_RDMA_NICS=%s" "$ibdev"
cpu_ib_first=false
else
printf ",%s" "$ibdev"
fi
fi
fi
done
case "$type" in
gpu) ! $gpu_first && printf "\n" ;;
xpu) ! $xpu_first && printf "\n" ;;
cpu) ! $cpu_first && printf "\n" ;;
cpu_ib) ! $cpu_ib_first && printf "\n" ;;
esac
}
function get_vxpu_nics() {
local topo_output=$(xpu-smi topo -m)
local xpu_info=$(echo "$topo_output" | grep -E '^XPU[0-9]+')
local nic_mapping=()
while IFS= read -r line; do
if [[ $line =~ NIC([0-9]+):\ +(mlx[0-9_]+) ]]; then
local nic_idx=${BASH_REMATCH[1]}
local nic_name=${BASH_REMATCH[2]}
nic_mapping[$nic_idx]=$nic_name
fi
done < <(echo "$topo_output" | grep -E '^\s*NIC[0-9]+:')
local nic_count=${#nic_mapping[@]}
declare -A priority_map=([PIX]=2 [NODE]=1 [SYS]=0)
local optimal_nics=()
while IFS= read -r line; do
local fields=($line)
local nic_start_index=5
local max_nics=$(( ${#fields[@]} - nic_start_index ))
local actual_nic_count=$(( max_nics < nic_count ? max_nics : nic_count ))
local best_priority=-1
local best_nic=""
for ((nic_idx=0; nic_idx<actual_nic_count; nic_idx++)); do
local conn_type=${fields[nic_idx+nic_start_index]}
local current_priority=${priority_map[$conn_type]:--1}
if (( current_priority > best_priority )); then
best_priority=$current_priority
best_nic="${nic_mapping[$nic_idx]}"
fi
done
if [[ -n "$best_nic" ]]; then
optimal_nics+=("$best_nic")
fi
done <<< "$xpu_info"
local IFS=,
export KVCACHE_RDMA_NICS="${optimal_nics[*]}"
echo "KVCACHE_RDMA_NICS=${optimal_nics[*]}"
}
function get_vcpu_nics() {
ip -o addr show | awk '$3 == "inet" && $4 ~ /^10\./ {print "KV_CACHE_SOCKET_IFNAME="$2; exit}'
}
function __main__() {
if [[ "$type" == "vxpu" ]]; then
get_vxpu_nics
return 0
fi
if [[ "$type" == "vcpu" ]]; then
get_vcpu_nics
return 0
fi
# 处理 bond 情况
if [[ "$type" == "cpu" ]]; then
for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do
bond_if=$(basename "$bond")
if ip link show "$bond_if" | grep -q "state UP" && \
ip a show "$bond_if" | grep -q "inet "; then
printf "KV_CACHE_SOCKET_IFNAME=%s\n" "$bond_if"
return 0
fi
done
fi
if [[ "$type" == "cpu_ib" ]]; then
first=true
for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do
bond_if=$(basename "$bond")
__NEW_GPU_ROOTPORT_FILE__
ibdev=$(ibdev2netdev 2>/dev/null | grep -w "$bond_if" | awk '{print $1}')
if [ -n "$ibdev" ] && ip link show "$bond_if" | grep -q "state UP" && \
ip a show "$bond_if" | grep -q "inet "; then
if $first; then
printf "KVCACHE_RDMA_NICS=%s" "$ibdev"
first=false
else
printf ",%s" "$ibdev"
fi
fi
bondib=$(show_gids 2>/dev/null | grep -w "$bond_if" | awk '{print $1}' | grep "mlx.*bond" | head -1)
if [ -n "$bondib" ] && ip link show "$bond_if" | grep -q "state UP" && \
ip a show "$bond_if" | grep -q "inet " && $first; then
printf "KVCACHE_RDMA_NICS=%s" "$bondib"
first=false
fi
__RM_GPU_ROOTPORT_FILE__
done
! $first && printf "\n"
[ ! $first ] && return 0
fi
local nic_types=("eth" "ib" "xgbe")
for nt in "${nic_types[@]}"; do
if ip a | grep -iq "$nt"; then
__NEW_GPU_ROOTPORT_FILE__
NICNAME_TYPE=$nt
__JUDGE_NIC_TYPE__
__RM_GPU_ROOTPORT_FILE__
fi
done
}
__main__