Compare commits

..

2 Commits

Author:  copilot-swe-agent[bot]
SHA1:    0f2b609496
Message: Fix int32 to uint32 casting issues causing -1 to become large positive number
         Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
Date:    2025-09-17 10:55:36 +00:00

Author:  copilot-swe-agent[bot]
SHA1:    3e319c0f90
Message: Initial plan
Date:    2025-09-17 10:44:11 +00:00
308 changed files with 2816 additions and 26168 deletions
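
The core of the change is a handful of `uint32_t` → `int32_t` adjustments so that a `-1` sentinel keeps its meaning instead of wrapping around. A minimal, self-contained sketch (not part of the diff) of the wraparound the commit message refers to:

```cpp
#include <cstdint>
#include <iostream>

int main() {
    int32_t  signed_len   = -1;                         // intended sentinel: "no attention mask"
    uint32_t unsigned_len = static_cast<uint32_t>(-1);  // wraps to 4294967295 (UINT32_MAX)

    std::cout << signed_len   << "\n";  // prints -1
    std::cout << unsigned_len << "\n";  // prints 4294967295

    uint32_t q_idx = 7;
    // With the unsigned sentinel, this bound check is always true, so code guarded
    // by it can index a mask that does not exist.
    std::cout << (q_idx < unsigned_len) << "\n";  // 1
    // With the signed sentinel plus an explicit positivity guard, it is skipped.
    std::cout << (signed_len > 0 && q_idx < static_cast<uint32_t>(signed_len)) << "\n";  // 0
    return 0;
}
```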

View File

@@ -143,8 +143,7 @@ jobs:
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install paddlepaddle-gpu==3.3.0.dev20250917 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

View File

@@ -106,12 +106,7 @@ jobs:
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
IFS='/' read -ra parts <<< "${GITHUB_WORKSPACE}"
len=${#parts[@]}
CCACHE_DEFAULT_DIR="/$(IFS=/; echo "${parts[*]:1:$((len-5))}")"
echo "$CCACHE_DEFAULT_DIR"
CACHE_DIR="${CACHE_DIR:-$CCACHE_DEFAULT_DIR}"
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
@@ -132,7 +127,6 @@ jobs:
-e "PADDLEVERSION=${PADDLEVERSION}" \
-e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
-e "BRANCH_REF=${BRANCH_REF}" \
-e "CCACHE_MAXSIZE=50G" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}

View File

@@ -82,9 +82,6 @@ jobs:
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100))
FD_ZMQ_RECV_REQUEST_SERVER_PORT=$((42048 + DEVICE_PORT * 100))
FD_ZMQ_SEND_RESPONSE_SERVER_PORT=$((42038 + DEVICE_PORT * 100))
FD_ZMQ_CONTROL_CMD_SERVER_PORTS=$((42028 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"

View File

@@ -28,22 +28,18 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}
fi
'
git config --global http.proxy "http://61.151.249.150:33128"
git config --global https.proxy "http://61.151.249.150:33128"
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone --recursive ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
git clone ${REPO} ${REPO_NAME}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}

View File

@@ -43,7 +43,7 @@ English | [简体中文](README_CN.md)
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
-**Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU, Intel Gaudi etc.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU etc.
## Requirements
@@ -60,7 +60,6 @@ FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**,
- [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
- [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)
- [MetaX GPU](./docs/get_started/installation/metax_gpu.md)
- [Intel Gaudi](./docs/get_started/installation/intel_gaudi.md)
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU are currently under development and testing. Stay tuned for updates!

View File

@@ -41,7 +41,7 @@
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment, compatible with the [vLLM](https://github.com/vllm-project/vllm/) interface
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more
- **Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP), and Chunked Prefill
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU, Intel Gaudi
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU, etc.
## Requirements
@@ -58,7 +58,6 @@ FastDeploy supports deployment on **NVIDIA GPU**, **Kunlunxin XPU**,
- [Enflame S60](./docs/zh/get_started/installation/Enflame_gcu.md)
- [Hygon DCU](./docs/zh/get_started/installation/hygon_dcu.md)
- [MetaX GPU](./docs/zh/get_started/installation/metax_gpu.md)
- [Intel Gaudi](./docs/zh/get_started/installation/intel_gaudi.md)
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU are currently under development and testing. Stay tuned for updates!

View File

@@ -98,7 +98,7 @@ def main(args):
raise ValueError("--max_concurrency should be same length as --s_itl_base_model")
for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
# Warmup
# Wramup
print("Starting warmup...")
with open(os.devnull, "w") as f:
with contextlib.redirect_stdout(f):

View File

@@ -965,7 +965,7 @@ if __name__ == "__main__":
parser.add_argument(
"--backend",
type=str,
default="openai-chat",
default="vllm",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
)
parser.add_argument(

View File

@@ -1,5 +0,0 @@
max_model_len: 32768
max_num_seqs: 128
tensor_parallel_size: 4
use_cudagraph: True
load_choices: "default_v1"

View File

@@ -1,6 +0,0 @@
max_model_len: 32768
max_num_seqs: 128
tensor_parallel_size: 4
use_cudagraph: True
load_choices: "default_v1"
quantization: wfp8afp8

View File

@@ -1,8 +0,0 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -128,12 +128,6 @@ function copy_ops(){
echo -e "MACA ops have been copy to fastdeploy"
return
fi
is_intel_hpu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('intel_hpu'))"`
if [ "$is_intel_hpu" = "True" ]; then
DEVICE_TYPE="intel-hpu"
echo -e "intel_hpu ops have been copy to fastdeploy"
return
fi
DEVICE_TYPE="cpu"
cd ../../../../
@@ -165,9 +159,7 @@ function build_and_install_ops() {
else
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
if [ -d "${OPS_TMP_DIR}" ]; then
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
else
echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
exit 1

View File

@@ -435,7 +435,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
OutT *__restrict__ out,
const int speculate_max_draft_token_num = 5,
const uint32_t attn_mask_len = -1) {
const int32_t attn_mask_len = -1) {
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
static_assert(NUM_WARP_Q == 1, "NUM_WARP_Q must be 1");
static_assert(NUM_WARP_KV == 4, "NUM_WARP_KV must be 4");
@@ -1089,7 +1089,7 @@ void MultiQueryAppendAttention(
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
uint32_t attn_mask_len;
int32_t attn_mask_len;
if (attn_mask) {
attn_mask_len = attn_mask.get().shape()[1];
} else {

View File

@@ -533,7 +533,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
OutT *__restrict__ out,
const int speculate_max_draft_token_num = 5,
const uint32_t attn_mask_len = -1) {
const int32_t attn_mask_len = -1) {
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
constexpr uint32_t num_vecs_per_head_k =
HEAD_DIM / 2 / num_elems_per_128b<CacheT>();
@@ -1313,7 +1313,7 @@ void MultiQueryAppendC4Attention(
}
const int num_chunks = div_up(max_seq_len, chunk_size);
uint32_t attn_mask_len;
int32_t attn_mask_len;
if (attn_mask) {
attn_mask_len = attn_mask.get().shape()[1];
} else {

View File

@@ -540,7 +540,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
OutT *__restrict__ out,
const int speculate_max_draft_token_num = 5,
const uint32_t attn_mask_len = -1) {
const int32_t attn_mask_len = -1) {
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
constexpr uint32_t num_vecs_per_head_k =
HEAD_DIM / num_elems_per_128b<CacheT>();
@@ -1372,7 +1372,7 @@ void MultiQueryAppendC8Attention(
}
const int num_chunks = div_up(max_seq_len, chunk_size);
uint32_t attn_mask_len;
int32_t attn_mask_len;
if (attn_mask) {
attn_mask_len = attn_mask.get().shape()[1];
} else {

View File

@@ -1026,7 +1026,7 @@ __device__ __forceinline__ void mask_s(const bool* attn_mask,
const uint32_t qo_len,
const uint32_t kv_len,
const uint32_t chunk_end,
const uint32_t attn_mask_len,
const int32_t attn_mask_len,
float (*s_frag)[num_frags_z][8],
const int *mask_offset = nullptr) {
const uint32_t tx = threadIdx.x;
@@ -1050,7 +1050,7 @@ __device__ __forceinline__ void mask_s(const bool* attn_mask,
(causal
? (kv_idx > kv_len + q_idx - qo_len || (kv_idx >= chunk_end))
: kv_idx >= chunk_end);
if (attn_mask != nullptr && kv_idx > kv_len - qo_len && kv_idx < chunk_end && q_idx < attn_mask_len) {
if (attn_mask != nullptr && kv_idx > kv_len - qo_len && kv_idx < chunk_end && attn_mask_len > 0 && q_idx < static_cast<uint32_t>(attn_mask_len)) {
const int32_t mask_idx = q_idx * attn_mask_len + kv_idx - kv_len + qo_len;
bool mask = attn_mask[mask_idx];
out_of_boundary |= mask;

View File

@@ -1004,8 +1004,7 @@ __global__ void cache_kernel(
const uint32_t qkv_bias = bias % hidden_size;
const uint32_t hi = qkv_bias / head_size;
const uint32_t h_bias = qkv_bias % head_size;
const int32_t ori_bi = batch_id_per_token[token_idx];
if (ori_bi == -1) continue; // skip batch_id_per_token[token_idx]=-1
const uint32_t ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const uint32_t ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
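
The `cache_kernel` hunk above reads `batch_id_per_token` as a signed value and skips entries marked `-1` before using them as an index. A host-side sketch of that pattern (the helper name and container types are illustrative, not from the repo):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the guarded lookup shown in the hunk above.
void process_tokens(const std::vector<int32_t>& batch_id_per_token,
                    const std::vector<int32_t>& seq_lens) {
    for (std::size_t token_idx = 0; token_idx < batch_id_per_token.size(); ++token_idx) {
        const int32_t ori_bi = batch_id_per_token[token_idx];
        if (ori_bi == -1) continue;           // padded slot: no batch owns this token
        if (seq_lens[ori_bi] == 0) continue;  // ori_bi is only a valid index after the check above
        // ... write this token's K/V into the cache ...
    }
}
```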

View File

@@ -571,7 +571,6 @@ std::vector<paddle::Tensor> NoauxTc(
int n_group,
int topk_group,
int topk,
bool renormalize,
float routed_scaling_factor);
#ifdef ENABLE_FP8
@@ -623,8 +622,6 @@ int64_t open_mem_handle(paddle::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);
void clear_ipc_handles(int64_t _fa);
// speculative decoding Kernel
std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
const paddle::Tensor& input_ids,
@@ -1231,8 +1228,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");
m.def("clear_ipc_handles", &clear_ipc_handles, "clear_ipc_handles");
m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");
m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");

View File

@@ -122,14 +122,10 @@ void register_graph_buffers(fptr_t _fa,
for (int i = 0; i < handles.size(); i++) {
bytes.emplace_back(handles[i].begin(), handles[i].end());
}
bytes.reserve(handles.size());
fa->register_graph_buffers(bytes, offsets);
}
void clear_ipc_handles(fptr_t _fa) {
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
fa->clear_ipc_handles();
}
std::tuple<fptr_t, paddle::Tensor> allocate_shared_buffer_and_handle(
int64_t size) {

View File

@@ -303,7 +303,7 @@ class CustomAllreduce {
bool full_nvlink_;
RankSignals sg_;
// Stores an map from a pointer to its peer pointers from all ranks.
// Stores an map from a pointer to its peer pointters from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
@@ -517,15 +517,10 @@ class CustomAllreduce {
#undef KL
}
void clear_ipc_handles(){
~CustomAllreduce() {
for (auto [_, ptr] : ipc_handles_) {
CUDACHECK(cudaIpcCloseMemHandle(ptr));
}
ipc_handles_.clear();
}
~CustomAllreduce() {
clear_ipc_handles();
}
};
} // namespace paddle

View File

@@ -39,6 +39,9 @@ void GetOutputTopK(const paddle::Tensor& x,
int k,
int64_t rank_id,
bool wait_flag) {
if (rank_id > 0) {
return;
}
static struct msgdata msg_rcv;
int msg_queue_id = 1;

View File

@@ -14,8 +14,6 @@
#pragma once
#include <cuda_fp8.h>
#ifndef PADDLE_WITH_COREX
#include "glog/logging.h"
#endif
@@ -153,34 +151,6 @@ inline int GetGPUComputeCapability(int id) {
#endif
#ifndef FP8_E4M3_MAX
#define FP8_E4M3_MAX 448.0
#endif
#ifndef DISPATCH_FLOAT_FP6_DTYPE
#define DISPATCH_FLOAT_FP6_DTYPE(pd_dtype, c_type, ...) \
switch (pd_dtype) { \
case phi::DataType::FLOAT32: { \
using c_type = float; \
__VA_ARGS__ \
break; \
} \
case phi::DataType::BFLOAT16: { \
using c_type = phi::dtype::bfloat16; \
__VA_ARGS__ \
break; \
} \
case phi::DataType::FLOAT16: { \
using c_type = phi::dtype::float16; \
__VA_ARGS__ \
break; \
} \
default: { \
PD_THROW("Only supported attr of input type in [fp32, fp16, bf16]."); \
} \
}
#endif
inline constexpr uint32_t next_pow_2(uint32_t const num) {
if (num <= 1)
return num;
@@ -223,13 +193,11 @@ public:
typedef uint8_t data_t;
};
#ifndef PADDLE_WITH_COREX
template <> class PDTraits<paddle::DataType::FLOAT8_E4M3FN> {
public:
typedef __nv_fp8_e4m3 DataType;
typedef paddle::float8_e4m3fn data_t;
};
#endif
template <typename T, int Size> struct alignas(sizeof(T) * Size) AlignedVector {
T val[Size];
@@ -603,28 +571,3 @@ inline bool GetMlaUseTensorcore() {
flags_mla_use_tensorcore && enable_mla_tensorcore;
return mla_use_tensorcore;
}
__device__ __forceinline__ float warpReduceMax(float value) {
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 16));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 8));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 4));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 2));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 1));
return value;
}
__device__ __forceinline__ float blockReduceMax(float value) {
static __shared__ float warpLevelMaxs[WARP_SIZE];
const int laneId = threadIdx.x % WARP_SIZE;
const int warpId = threadIdx.x / WARP_SIZE;
value = warpReduceMax(value);
if (laneId == 0) warpLevelMaxs[warpId] = value;
__syncthreads();
value = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelMaxs[laneId] : 0;
if (warpId == 0) value = warpReduceMax(value);
return value;
}

View File

@@ -18,6 +18,7 @@
#include "iomanip"
#include <nvml.h>
#include <iostream>
#include <nvml.h>
// #define PRINT_GPU_MEMORY
// Function to query NVIDIA GPU memory usage
bool getNvidiaGPUMemoryUsage(int callLine) {

View File

@@ -33,11 +33,6 @@
__VA_ARGS__ \
break; \
} \
case 3: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 3; \
__VA_ARGS__ \
break; \
} \
case 6: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 6; \
__VA_ARGS__ \

View File

@@ -26,7 +26,6 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
int n_group,
int topk_group,
int topk,
bool renormalize,
float routed_scaling_factor) {
auto input_shape = scores_with_bias.shape();
PD_CHECK(input_shape.size() == 2);
@@ -49,7 +48,6 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
n_group,
topk_group,
topk,
renormalize,
routed_scaling_factor,
stream);
@@ -78,7 +76,6 @@ PD_BUILD_STATIC_OP(noaux_tc)
.Attrs({"n_group: int",
"topk_group: int",
"topk:int",
"renormalize: bool",
"routed_scaling_factor: float"})
.SetKernelFn(PD_KERNEL(NoauxTc))
.SetInferShapeFn(PD_INFER_SHAPE(NoauxTcInferShape))

View File

@@ -25,23 +25,6 @@ constexpr unsigned FULL_WARP_MASK = 0xffffffff;
constexpr int32_t BLOCK_SIZE = 512;
constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
template <typename T_OUT, typename T_IN>
__device__ inline T_OUT cuda_cast(T_IN val) {
return val;
}
template <>
__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
return __bfloat162float(val);
}
template <typename T>
__device__ inline T neg_inf() {
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
// so we need to cast from fp32
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
}
namespace warp_topk {
template <int size, typename T>
@@ -58,21 +41,10 @@ constexpr __host__ __device__ bool isPowerOf2(T v) {
}
template <bool greater, typename T>
__forceinline__ __device__ bool is_better_than(T val, T baseline) {
__device__ bool is_better_than(T val, T baseline) {
return (val > baseline && greater) || (val < baseline && !greater);
}
template <bool greater, typename T, typename idxT>
__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
idxT baseline_index) {
bool res = (val > baseline && greater) || (val < baseline && !greater);
if (val == baseline) {
res = (index < baseline_index && greater) ||
(index < baseline_index && !greater);
}
return res;
}
template <typename T, typename idxT>
int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
@@ -81,8 +53,7 @@ int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
}
template <int size, bool ascending, bool reverse, typename T, typename idxT,
bool is_stable>
template <int size, bool ascending, typename T, typename idxT>
struct BitonicMerge {
// input should be a bitonic sequence, and sort it to be a monotonic sequence
__device__ static void merge(T* __restrict__ val_arr,
@@ -96,15 +67,7 @@ struct BitonicMerge {
int const other_i = i + stride;
T& val = val_arr[i];
T& other_val = val_arr[other_i];
bool is_better;
if constexpr (is_stable) {
is_better = is_better_than<ascending>(val, other_val, idx_arr[i],
idx_arr[other_i]);
} else {
is_better = is_better_than<ascending>(val, other_val);
}
if (is_better) {
if ((val > other_val && ascending) || (val < other_val && !ascending)) {
T tmp = val;
val = other_val;
other_val = tmp;
@@ -115,14 +78,13 @@ struct BitonicMerge {
}
}
BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
val_arr, idx_arr);
BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
val_arr + arr_len / 2, idx_arr + arr_len / 2);
BitonicMerge<size / 2, ascending, T, idxT>::merge(val_arr, idx_arr);
BitonicMerge<size / 2, ascending, T, idxT>::merge(val_arr + arr_len / 2,
idx_arr + arr_len / 2);
}
};
template <int size, bool ascending, typename T, typename idxT, bool is_stable>
template <int size, bool ascending, typename T, typename idxT>
struct BitonicSort {
__device__ static void sort(T* __restrict__ val_arr,
idxT* __restrict__ idx_arr) {
@@ -130,16 +92,15 @@ struct BitonicSort {
static_assert(size >= 2 * WARP_SIZE);
constexpr int arr_len = size / WARP_SIZE;
BitonicSort<size / 2, true, T, idxT, is_stable>::sort(val_arr, idx_arr);
BitonicSort<size / 2, false, T, idxT, is_stable>::sort(
val_arr + arr_len / 2, idx_arr + arr_len / 2);
BitonicMerge<size, ascending, ascending, T, idxT, is_stable>::merge(
val_arr, idx_arr);
BitonicSort<size / 2, true, T, idxT>::sort(val_arr, idx_arr);
BitonicSort<size / 2, false, T, idxT>::sort(val_arr + arr_len / 2,
idx_arr + arr_len / 2);
BitonicMerge<size, ascending, T, idxT>::merge(val_arr, idx_arr);
}
};
template <bool ascending, typename T, typename idxT, bool is_stable>
struct BitonicSort<32, ascending, T, idxT, is_stable> {
template <bool ascending, typename T, typename idxT>
struct BitonicSort<32, ascending, T, idxT> {
__device__ static void sort(T* __restrict__ val_arr,
idxT* __restrict__ idx_arr) {
int const lane = threadIdx.x % WARP_SIZE;
@@ -153,37 +114,19 @@ struct BitonicSort<32, ascending, T, idxT, is_stable> {
T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride);
idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride);
bool is_better;
if constexpr (is_stable) {
if constexpr (ascending) {
is_better = ((*val_arr > other) ||
((*val_arr == other) && (*idx_arr < other_idx))) !=
(reverse != is_second);
} else {
is_better = ((*val_arr > other) ||
((*val_arr == other) && (*idx_arr > other_idx))) !=
(reverse != is_second);
}
} else {
is_better = (*val_arr != other &&
(*val_arr > other) != (reverse != is_second));
}
if (is_better) {
if (*val_arr != other && (*val_arr > other) != (reverse != is_second)) {
*val_arr = other;
*idx_arr = other_idx;
}
}
}
BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr,
idx_arr);
BitonicMerge<32, ascending, T, idxT>::merge(val_arr, idx_arr);
}
};
template <bool ascending, bool reverse, typename T, typename idxT,
bool is_stable>
struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
template <bool ascending, typename T, typename idxT>
struct BitonicMerge<32, ascending, T, idxT> {
__device__ static void merge(T* __restrict__ val_arr,
idxT* __restrict__ idx_arr) {
int const lane = threadIdx.x % WARP_SIZE;
@@ -193,24 +136,7 @@ struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride);
idxT& idx = *idx_arr;
idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride);
bool is_better;
if constexpr (is_stable) {
if constexpr (ascending) {
is_better = ((*val_arr > other) ||
((*val_arr == other) && (*idx_arr < other_idx))) ==
(reverse != is_second); // for min
} else {
is_better = ((*val_arr > other) ||
((*val_arr == other) && (*idx_arr > other_idx))) ==
(reverse != is_second); // for max
}
} else {
is_better =
(val != other && ((val > other) == (ascending != is_second)));
}
if (is_better) {
if (val != other && ((val > other) == (ascending != is_second))) {
val = other;
idx = other_idx;
}
@@ -218,42 +144,34 @@ struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
}
};
template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
template <int capacity, bool greater, typename T, typename idxT>
class WarpSort {
public:
public:
__device__ WarpSort(idxT k, T dummy)
: lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) {
static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity));
for (int i = 0; i < max_arr_len_; ++i) {
val_arr_[i] = dummy_;
idx_arr_[i] = 0;
}
}
// load and merge k sorted values
__device__ void load_sorted(T const* __restrict__ in,
idxT const* __restrict__ in_idx, idxT start) {
idxT const* __restrict__ in_idx,
idxT start) {
idxT idx = start + WARP_SIZE - 1 - lane_;
for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) {
if (idx < start + k_) {
T t = in[idx];
bool is_better;
if constexpr (is_stable) {
is_better =
is_better_than<greater>(t, val_arr_[i], in_idx[idx], idx_arr_[i]);
} else {
is_better = is_better_than<greater>(t, val_arr_[i]);
}
if (is_better) {
if (is_better_than<greater>(t, val_arr_[i])) {
val_arr_[i] = t;
idx_arr_[i] = in_idx[idx];
}
}
}
BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
val_arr_, idx_arr_);
BitonicMerge<capacity, !greater, T, idxT>::merge(val_arr_, idx_arr_);
}
__device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const {
@@ -275,7 +193,7 @@ class WarpSort {
}
}
protected:
protected:
static constexpr int max_arr_len_ = capacity / WARP_SIZE;
T val_arr_[max_arr_len_];
@@ -287,11 +205,11 @@ class WarpSort {
}; // end class WarpSort
template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
public:
template <int capacity, bool greater, typename T, typename idxT>
class WarpSelect : public WarpSort<capacity, greater, T, idxT> {
public:
__device__ WarpSelect(idxT k, T dummy)
: WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
: WarpSort<capacity, greater, T, idxT>(k, dummy),
k_th_(dummy),
k_th_lane_((k - 1) % WARP_SIZE) {
extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[];
@@ -316,13 +234,7 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
}
__device__ void add(T val, idxT idx) {
bool do_add;
if constexpr (is_stable) {
do_add = is_better_than<greater>(val, k_th_, idx, k_th_idx_);
} else {
do_add = is_better_than<greater>(val, k_th_);
}
bool do_add = is_better_than<greater>(val, k_th_);
uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add);
if (mask == 0) {
return;
@@ -359,52 +271,37 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
__syncthreads();
}
private:
private:
__device__ void set_k_th_() {
k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_);
if constexpr (is_stable) {
k_th_idx_ =
__shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_);
}
}
__device__ void merge_buf_(T val, idxT idx) {
BitonicSort<WARP_SIZE, greater, T, idxT, is_stable>::sort(&val, &idx);
BitonicSort<WARP_SIZE, greater, T, idxT>::sort(&val, &idx);
T& old = val_arr_[max_arr_len_ - 1];
bool is_better;
if constexpr (is_stable) {
is_better =
is_better_than<greater>(val, old, idx, idx_arr_[max_arr_len_ - 1]);
} else {
is_better = is_better_than<greater>(val, old);
}
if (is_better) {
if (is_better_than<greater>(val, old)) {
old = val;
idx_arr_[max_arr_len_ - 1] = idx;
}
BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
val_arr_, idx_arr_);
BitonicMerge<capacity, !greater, T, idxT>::merge(val_arr_, idx_arr_);
set_k_th_();
}
using WarpSort<capacity, greater, T, idxT, is_stable>::max_arr_len_;
using WarpSort<capacity, greater, T, idxT, is_stable>::val_arr_;
using WarpSort<capacity, greater, T, idxT, is_stable>::idx_arr_;
using WarpSort<capacity, greater, T, idxT, is_stable>::lane_;
using WarpSort<capacity, greater, T, idxT, is_stable>::k_;
using WarpSort<capacity, greater, T, idxT, is_stable>::dummy_;
using WarpSort<capacity, greater, T, idxT>::max_arr_len_;
using WarpSort<capacity, greater, T, idxT>::val_arr_;
using WarpSort<capacity, greater, T, idxT>::idx_arr_;
using WarpSort<capacity, greater, T, idxT>::lane_;
using WarpSort<capacity, greater, T, idxT>::k_;
using WarpSort<capacity, greater, T, idxT>::dummy_;
T* val_smem_;
idxT* idx_smem_;
int smem_buf_len_ = 0;
T k_th_;
idxT k_th_idx_;
int const k_th_lane_;
}; // end class WarpSelect
} // namespace warp_topk
@@ -416,8 +313,8 @@ __device__ void topk_with_k2(T* output,
int32_t const lane_id,
int const num_experts_per_group) {
// Get the top2 per thread
T largest = neg_inf<T>();
T second_largest = neg_inf<T>();
T largest = cuda::std::numeric_limits<T>::min();
T second_largest = cuda::std::numeric_limits<T>::min();
if (num_experts_per_group > WARP_SIZE) {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@@ -471,14 +368,8 @@ __global__ void topk_with_k2_kernel(T* output,
cg::thread_block block = cg::this_thread_block();
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;");
#endif
topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
}
template <typename T, typename IdxT>
@@ -494,7 +385,6 @@ __global__ void group_idx_and_topk_idx_kernel(
int64_t const topk,
int64_t const num_experts,
int64_t const num_experts_per_group,
bool const renormalize,
double routed_scaling_factor) {
int32_t warp_id = threadIdx.x / WARP_SIZE;
int32_t lane_id = threadIdx.x % WARP_SIZE;
@@ -513,29 +403,19 @@ __global__ void group_idx_and_topk_idx_kernel(
extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to
// store the target topk idx
int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf) + warp_id * topk;
T* s_topk_value =
reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
warp_id * topk;
s_topk_idx += warp_id * topk;
T value = neg_inf<T>();
T topk_group_value = neg_inf<T>();
T value = cuda::std::numeric_limits<T>::min();
T topk_group_value = cuda::std::numeric_limits<T>::min();
int32_t num_equalto_topkth_group;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before
// acqbulk because it's ptr arithmetic
#endif
if (case_id < num_tokens) {
if ((n_group > topk_group) && (case_id < num_tokens)) {
// calculate group_idx
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
if (lane_id < n_group &&
(isfinite(cuda_cast<float, T>(
group_scores[lane_id])))) // The check is necessary to avoid
// abnormal input
{
if (lane_id < n_group) {
value = group_scores[lane_id];
}
@@ -546,23 +426,22 @@ __global__ void group_idx_and_topk_idx_kernel(
__syncwarp(); // Ensure all threads have valid data before reduction
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
if (value == topk_group_value) {
value = neg_inf<T>();
value = cuda::std::numeric_limits<T>::min();
}
pre_count_equal_to_top_value = count_equal_to_top_value;
count_equal_to_top_value = __popc(__ballot_sync(
FULL_WARP_MASK, (value == neg_inf<T>())));
FULL_WARP_MASK, (value == cuda::std::numeric_limits<T>::min())));
}
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
}
__syncthreads();
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true>
queue((int32_t)topk, neg_inf<T>());
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t>
queue((int32_t)topk, cuda::std::numeric_limits<T>::min());
int count_equalto_topkth_group = 0;
bool if_proceed_next_topk = (topk_group_value != neg_inf<T>());
if (case_id < num_tokens && if_proceed_next_topk) {
bool if_proceed_next_topk = (topk_group_value != cuda::std::numeric_limits<T>::min());
if (case_id < num_tokens) {
for (int i_group = 0; i_group < n_group; i_group++) {
if ((group_scores[i_group] > topk_group_value) ||
((group_scores[i_group] == topk_group_value) &&
@@ -570,11 +449,9 @@ __global__ void group_idx_and_topk_idx_kernel(
int32_t offset = i_group * num_experts_per_group;
for (int32_t i = lane_id; i < align_num_experts_per_group;
i += WARP_SIZE) {
T candidates =
(i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
scores_with_bias[offset + i]))
? scores_with_bias[offset + i]
: neg_inf<T>();
T candidates = i < num_experts_per_group
? scores_with_bias[offset + i]
: cuda::std::numeric_limits<T>::min();
queue.add(candidates, offset + i);
}
if (group_scores[i_group] == topk_group_value) {
@@ -592,7 +469,7 @@ __global__ void group_idx_and_topk_idx_kernel(
// Load the valid score value
// Calculate the summation
float topk_sum = 1e-20;
if (case_id < num_tokens && if_proceed_next_topk) {
if (case_id < num_tokens) {
for (int i = lane_id;
i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
i += WARP_SIZE) {
@@ -601,45 +478,33 @@ __global__ void group_idx_and_topk_idx_kernel(
if (i < topk) {
s_topk_value[i] = value;
}
topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
topk_sum += reduce(tile, value, cg::plus<float>());
}
}
__syncthreads();
if (case_id < num_tokens && if_proceed_next_topk) {
if (case_id < num_tokens) {
for (int i = lane_id; i < num_experts; i += WARP_SIZE) {
scores[i] = 0;
}
}
__syncwarp();
__threadfence();
__syncthreads();
if (case_id < num_tokens) {
if (if_proceed_next_topk) {
for (int i = lane_id; i < topk; i += WARP_SIZE) {
float value;
if (renormalize) {
value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
routed_scaling_factor;
} else {
value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
}
scores[s_topk_idx[i]] = value;
for (int i = lane_id; i < topk; i += WARP_SIZE) {
float value = s_topk_value[i] / topk_sum * routed_scaling_factor;
scores[s_topk_idx[i]] = value;
if (if_proceed_next_topk) {
topk_indices[i] = s_topk_idx[i];
topk_values[i] = cuda_cast<T, float>(value);
topk_values[i] = static_cast<T>(value);
}
} else {
for (int i = lane_id; i < topk; i += WARP_SIZE) {
else {
topk_indices[i] = i;
topk_values[i] = cuda_cast<T, float>(1.0f / topk);
topk_values[i] = static_cast<float>(1.0f / topk);
}
}
// Note: when if_proceed_next_topk==false, choose the first 8 experts as the
// default result.
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile("griddepcontrol.launch_dependents;");
#endif
}
template <typename T, typename IdxT>
@@ -653,24 +518,17 @@ void invokeNoAuxTc(T* scores,
int64_t const n_group,
int64_t const topk_group,
int64_t const topk,
bool const renormalize,
double const routed_scaling_factor,
cudaStream_t const stream) {
int64_t num_cases = num_tokens * n_group;
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
cudaLaunchConfig_t config;
config.gridDim = topk_with_k2_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = 0;
config.stream = stream;
cudaLaunchAttribute attrs[1];
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = false;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
num_tokens, num_cases, n_group, num_experts / n_group);
topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
group_scores,
scores_with_bias,
num_tokens,
num_cases,
n_group,
num_experts / n_group);
int64_t topk_with_k_group_num_blocks =
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
@@ -678,19 +536,21 @@ void invokeNoAuxTc(T* scores,
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
topk);
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
config.gridDim = topk_with_k_group_num_blocks;
config.blockDim = BLOCK_SIZE;
config.dynamicSmemBytes = dynamic_smem_in_bytes;
config.stream = stream;
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
attrs[0].val.programmaticStreamSerializationAllowed = false;
config.numAttrs = 1;
config.attrs = attrs;
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
topk_values, topk_indices, scores_with_bias, num_tokens,
n_group, topk_group, topk, num_experts,
num_experts / n_group, renormalize, routed_scaling_factor);
group_idx_and_topk_idx_kernel<T><<<topk_with_k_group_num_blocks,
BLOCK_SIZE,
dynamic_smem_in_bytes,
stream>>>(scores,
group_scores,
topk_values,
topk_indices,
scores_with_bias,
num_tokens,
n_group,
topk_group,
topk,
num_experts,
num_experts / n_group,
routed_scaling_factor);
}
#define INSTANTIATE_NOAUX_TC(T, IdxT) \
@@ -704,7 +564,6 @@ void invokeNoAuxTc(T* scores,
int64_t const n_group, \
int64_t const topk_group, \
int64_t const topk, \
bool const renormalize, \
double const routed_scaling_factor, \
cudaStream_t const stream);
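
This file's diff also touches a `renormalize` flag on `noaux_tc`. Based only on the kernel code shown above, a small host-side sketch of the two scaling modes the flag selects (function name and layout are assumptions; the epsilon matches the kernel):

```cpp
#include <vector>

// Illustrative only: scale the selected top-k values either renormalized
// (so they sum to roughly routed_scaling_factor) or as raw values.
std::vector<float> scale_topk(const std::vector<float>& topk_values,
                              bool renormalize,
                              float routed_scaling_factor) {
    float topk_sum = 1e-20f;  // same epsilon as the kernel's topk_sum initializer
    for (float v : topk_values) topk_sum += v;

    std::vector<float> out;
    out.reserve(topk_values.size());
    for (float v : topk_values) {
        out.push_back(renormalize ? v / topk_sum * routed_scaling_factor
                                  : v * routed_scaling_factor);
    }
    return out;
}
```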

View File

@@ -3,158 +3,6 @@
#include "quantization/common.cuh"
// adapted from: https://github.com/sgl-project/sglang/blob/v0.5.2rc2/sgl-kernel/csrc/gemm/per_token_quant_fp8.cu
// ---------------------------------------------------------------------------
// 1. Warp-local, no shared memory
// • One warp handles one token.
// • Eight tokens per 256-thread CTA.
// ---------------------------------------------------------------------------
template <typename T, typename DST_DTYPE, int kTokensPerCTA = 8, int kVecSize = 16>
__global__ void per_token_quant_fp8_kernel(
const T* __restrict__ input,
DST_DTYPE* __restrict__ output_q,
float* __restrict__ output_s,
const float scale_ub,
const int64_t hidden_size,
const int64_t num_tokens) {
const int warp_id = threadIdx.x / WARP_SIZE; // 0-7 (8 warps)
const int lane_id = threadIdx.x & (WARP_SIZE - 1); // 0-31
const int token_id = blockIdx.x * kTokensPerCTA + warp_id;
if (token_id >= num_tokens) return;
// Global tensors for this token
const T* token_input = input + token_id * hidden_size;
DST_DTYPE* token_output = output_q + token_id * hidden_size;
float* token_scale = output_s + token_id;
//
// Pass-1: Perform a warp reduce to find the max_value of a token's hidden_size
//
float max_value = 0.f;
using vec_t = AlignedVector<T, kVecSize>;
const int32_t num_vec_elems = hidden_size / kVecSize;
for (int32_t i = lane_id; i < num_vec_elems; i += WARP_SIZE) {
vec_t input_vec;
Load(token_input + i * kVecSize, &input_vec);
#pragma unroll
for (uint32_t j = 0; j < kVecSize; ++j) {
max_value = fmaxf(max_value, fabsf(static_cast<float>(input_vec[j])));
}
}
float warp_max = warpReduceMax(max_value);
if (scale_ub > 0){
warp_max = fminf(warp_max, scale_ub);
}
float scale;
scale = warp_max / FP8_E4M3_MAX;
// Broadcast scale
if (lane_id == 0) {
token_scale[0] = scale;
}
float scale_inv = (scale == 0.f) ? 0.f : 1.0f / scale;
//
// Pass-2: quantize and write back
//
for (int i = lane_id; i < num_vec_elems; i += WARP_SIZE) {
vec_t input_vec;
Load(token_input + i * kVecSize, &input_vec);
DST_DTYPE output_arr[kVecSize];
#pragma unroll
for (uint32_t j = 0; j < kVecSize; ++j) {
float val = static_cast<float>(input_vec[j]) * scale_inv;
val = fmaxf(fminf(val, FP8_E4M3_MAX), -FP8_E4M3_MAX);
output_arr[j] = static_cast<DST_DTYPE>(val);
}
if constexpr (kVecSize == 16) {
*(uint4*)(token_output + i * kVecSize) = *(uint4*)output_arr;
} else {
// Use element-wise copy for vector size 8 to ensure correctness
for (int k = 0; k < kVecSize; ++k) {
token_output[i * kVecSize + k] = output_arr[k];
}
}
}
}
// ---------------------------------------------------------------------------
// 2. Baseline kernel (1 token / CTA, CUB block reduce)
// ---------------------------------------------------------------------------
template <typename T, typename DST_DTYPE, int kVecSize = 16>
__global__ void per_token_quant_fp8_small_batch_kernel(
const T* __restrict__ input,
DST_DTYPE* __restrict__ output_q,
float* __restrict__ output_s,
const float scale_ub,
const int64_t hidden_size,
const int64_t num_tokens) {
const int token_idx = blockIdx.x;
if (token_idx >= num_tokens) return;
const int tid = threadIdx.x;
const int block_dim = blockDim.x;
const T* token_input = input + token_idx * hidden_size;
DST_DTYPE* token_output = output_q + token_idx * hidden_size;
float max_value = 0.0f;
// Use template parameter for vector size
using vec_t = AlignedVector<T, kVecSize>;
const int32_t num_vec_elems = hidden_size / kVecSize;
// Find max using vectorized loads
for (int32_t i = tid; i < num_vec_elems; i += block_dim) {
vec_t input_vec;
Load(token_input + i * kVecSize, &input_vec);
#pragma unroll
for (uint32_t j = 0; j < kVecSize; ++j) {
float val = static_cast<float>(input_vec[j]);
max_value = fmaxf(max_value, fabsf(val));
}
}
max_value = blockReduceMax(max_value);
if (scale_ub > 0){
max_value = fminf(max_value, scale_ub);
}
__shared__ float scale;
if (tid == 0) {
scale = max_value / FP8_E4M3_MAX;
output_s[token_idx] = scale;
}
__syncthreads();
const float scale_inv = 1.0f / scale;
// Quantize using vectorized loads
for (int32_t i = tid; i < num_vec_elems; i += block_dim) {
vec_t input_vec;
Load(token_input + i * kVecSize, &input_vec);
DST_DTYPE output_arr[kVecSize];
#pragma unroll
for (uint32_t j = 0; j < kVecSize; ++j) {
float val = fmaxf(fminf(static_cast<float>(input_vec[j]) * scale_inv, FP8_E4M3_MAX), -FP8_E4M3_MAX);
output_arr[j] = static_cast<DST_DTYPE>(val);
}
if constexpr (kVecSize == 16) {
*(uint4*)(token_output + i * kVecSize) = *(uint4*)output_arr;
} else {
// Use element-wise copy for vector size 8 to ensure correctness
for (int k = 0; k < kVecSize; ++k) {
token_output[i * kVecSize + k] = output_arr[k];
}
}
}
}
namespace fastdeploy {
template <typename scalar_t, typename fp8_type>
@@ -331,78 +179,39 @@ void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out, // [..., d]
auto rank = input.dims().size();
int const hidden_size = input.dims()[rank - 1];
int const num_tokens = input.numel() / hidden_size;
cudaStream_t stream = input.stream();
if (hidden_size % 8 == 0){
int device = 0;
cudaGetDevice(&device);
int sm_count = 0;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
const int TOKENS_PER_CTA = 8;
const bool use_warp_kernel = (num_tokens >= sm_count * 2 * TOKENS_PER_CTA);
const bool use_vec16 = (hidden_size % 16 == 0);
DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), scalar_t, {
if (use_warp_kernel) {
// -------- warp-local ---------------------------------------------------
constexpr int THREADS = TOKENS_PER_CTA * WARP_SIZE; // 256
dim3 grid((num_tokens + TOKENS_PER_CTA - 1) / TOKENS_PER_CTA);
dim3 block(THREADS);
if (use_vec16) {
per_token_quant_fp8_kernel<scalar_t, __nv_fp8_e4m3, TOKENS_PER_CTA, 16><<<grid, block, 0, stream>>>(
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
reinterpret_cast<float*>(scales.data<float>()),
scale_ub,
hidden_size,
num_tokens);
} else {
per_token_quant_fp8_kernel<scalar_t, __nv_fp8_e4m3, TOKENS_PER_CTA, 8><<<grid, block, 0, stream>>>(
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
reinterpret_cast<float*>(scales.data<float>()),
scale_ub,
hidden_size,
num_tokens);
}
} else {
// -------- baseline -----------------------------------------------------
constexpr int THREADS = 256;
dim3 grid(num_tokens);
dim3 block(THREADS);
if (use_vec16) {
per_token_quant_fp8_small_batch_kernel<scalar_t, __nv_fp8_e4m3, 16><<<grid, block, 0, stream>>>(
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
reinterpret_cast<float*>(scales.data<float>()),
scale_ub,
hidden_size,
num_tokens);
} else {
per_token_quant_fp8_small_batch_kernel<scalar_t, __nv_fp8_e4m3, 8><<<grid, block, 0, stream>>>(
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
reinterpret_cast<float*>(scales.data<float>()),
scale_ub,
hidden_size,
num_tokens);
}
}
});
return;
}
dim3 const grid(num_tokens);
dim3 const block(std::min(hidden_size, 1024));
DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), scalar_t, {
cudaStream_t stream = input.stream();
switch (input.dtype()) {
case paddle::DataType::FLOAT32: {
using scalar_t = float;
fastdeploy::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
<<<grid, block, 0, stream>>>(out.data<fp8_t>(), scales.data<float>(),
input.data<scalar_t>(), scale_ub,
hidden_size);
});
break;
}
case paddle::DataType::FLOAT16: {
using scalar_t = phi::dtype::float16;
fastdeploy::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
<<<grid, block, 0, stream>>>(out.data<fp8_t>(), scales.data<float>(),
input.data<scalar_t>(), scale_ub,
hidden_size);
break;
}
case paddle::DataType::BFLOAT16: {
using scalar_t = phi::dtype::bfloat16;
fastdeploy::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
<<<grid, block, 0, stream>>>(out.data<fp8_t>(), scales.data<float>(),
input.data<scalar_t>(), scale_ub,
hidden_size);
break;
}
default:
PD_THROW("Only supported attr of input type in [fp32, fp16, bf16].");
}
}
PD_BUILD_STATIC_OP(static_scaled_fp8_quant)
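
The per-token FP8 kernels in this file all follow the same scale-then-clamp recipe: take the max absolute value of a token's hidden vector, optionally cap it with `scale_ub`, divide by `FP8_E4M3_MAX`, then quantize with the inverse scale. A CPU-side sketch of that recipe for a single token (illustrative only, not the kernel itself):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

constexpr float FP8_E4M3_MAX_SKETCH = 448.0f;  // mirrors the FP8_E4M3_MAX macro in helper.h

// One token: compute a per-token scale from max |x|, then quantize and clamp.
void quant_one_token(const std::vector<float>& x, float scale_ub,
                     std::vector<float>& q, float& scale_out) {
    float max_abs = 0.f;
    for (float v : x) max_abs = std::max(max_abs, std::fabs(v));
    if (scale_ub > 0.f) max_abs = std::min(max_abs, scale_ub);  // optional upper bound

    const float scale = max_abs / FP8_E4M3_MAX_SKETCH;
    scale_out = scale;
    const float scale_inv = (scale == 0.f) ? 0.f : 1.0f / scale;

    q.resize(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        // clamp into the representable e4m3 range; a real kernel casts to __nv_fp8_e4m3 here
        q[i] = std::clamp(x[i] * scale_inv, -FP8_E4M3_MAX_SKETCH, FP8_E4M3_MAX_SKETCH);
    }
}
```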

View File

@@ -1,71 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "cuda_multiprocess.h"
#if !defined(_WIN32)
#include <errno.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#endif
// Optional: only remove/unlink the named shared-memory object (does not depend on a previously saved addr/fd)
static inline int sharedMemoryUnlinkByName(const char* name) {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// There is no shm_unlink semantics on Windows; a named object disappears once its last handle is closed.
// Best effort here: try to open it and close it immediately to drop one reference.
HANDLE hMap = OpenFileMappingA(FILE_MAP_ALL_ACCESS, FALSE, name);
if (hMap) {
CloseHandle(hMap);
return 0;
}
// Already gone also counts as success
return 0;
#else
// POSIX: remove the name so it cannot be opened again; existing mappings stay alive until munmap
if (shm_unlink(name) != 0) {
if (errno == ENOENT) return 0; // not existing counts as success
return errno;
}
return 0;
#endif
}
void UnsetDataIpc(const paddle::Tensor& tmp_input,
const std::string& shm_name,
bool close_ipc,
bool unlink_shm) {
// 1) Close the IPC mapping imported by the consumer (only when close_ipc=true and the pointer really came from OpenMemHandle)
if (close_ipc) {
void* ptr = const_cast<void*>(tmp_input.data());
checkCudaErrors(cudaIpcCloseMemHandle(ptr));
}
// 2) Unlink the named shared-memory object (only handles the name; does not guarantee unmapping existing mappings)
if (unlink_shm) {
int rc = sharedMemoryUnlinkByName(shm_name.c_str());
if (rc != 0) {
PD_THROW("Unlink shared memory failed: name=%s, err=%d",
shm_name.c_str(), rc);
}
}
}
PD_BUILD_STATIC_OP(unset_data_ipc)
.Inputs({"tmp_input"})
.Attrs({"shm_name: std::string", "close_ipc: bool", "unlink_shm: bool"})
.SetKernelFn(PD_KERNEL(UnsetDataIpc));

View File

@@ -32,8 +32,7 @@ __global__ void update_inputs_kernel_v1(bool *not_need_stop,
const int max_bsz,
const int input_ids_stride,
const int block_num_per_seq,
const int block_size,
bool prefill_one_step_stop) {
const int block_size) {
int thread_idx = threadIdx.x;
typedef cub::BlockReduce<int64_t, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
@@ -55,32 +54,23 @@ __global__ void update_inputs_kernel_v1(bool *not_need_stop,
seq_lens_encoder[thread_idx] = 0;
} else {
if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >= prompt_lens[thread_idx]) {
if (prefill_one_step_stop) {
// prefill done, stop
stop_flags[thread_idx] = true;
seq_lens_this_time[thread_idx] = 0;
seq_lens_decoder[thread_idx] = 0;
seq_lens_encoder[thread_idx] = 0;
stop_flag_now_int = 1;
} else{
// decoding
seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
seq_lens_this_time[thread_idx] = 1;
seq_lens_encoder[thread_idx] = 0;
int64_t *input_ids_now = input_ids + thread_idx * input_ids_stride;
input_ids_now[0] = next_tokens[thread_idx];
// decoding
seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
seq_lens_this_time[thread_idx] = 1;
seq_lens_encoder[thread_idx] = 0;
int64_t *input_ids_now = input_ids + thread_idx * input_ids_stride;
input_ids_now[0] = next_tokens[thread_idx];
// to judge whether block is not enough
int *block_table_now = block_tables + thread_idx * block_num_per_seq;
if (seq_lens_this_time[thread_idx] != 0 && block_table_now[seq_lens_decoder[thread_idx] / block_size] == -1) {
// should be scheduled by server
is_block_step[thread_idx] = true;
seq_lens_this_time[thread_idx]= 0;
stop_flags[thread_idx] = true;
step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
seq_lens_decoder[thread_idx] = 0;
stop_flag_now_int = 1;
}
// to judge whether block is not enough
int *block_table_now = block_tables + thread_idx * block_num_per_seq;
if (seq_lens_this_time[thread_idx] != 0 && block_table_now[seq_lens_decoder[thread_idx] / block_size] == -1) {
// should be scheduled by server
is_block_step[thread_idx] = true;
seq_lens_this_time[thread_idx]= 0;
stop_flags[thread_idx] = true;
step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
seq_lens_decoder[thread_idx] = 0;
stop_flag_now_int = 1;
}
} else
{
@@ -120,12 +110,6 @@ void UpdateInputesV1(const paddle::Tensor &stop_flags,
#else
auto cu_stream = input_ids.stream();
#endif
bool prefill_one_step_stop = false;
if (const char *env_p = std::getenv("PREFILL_NODE_ONE_STEP_STOP_V1")) {
if (env_p[0] == '1') {
prefill_one_step_stop = true;
}
}
const int max_bsz = stop_flags.shape()[0];
const int now_bsz = seq_lens_this_time.shape()[0];
const int input_ids_stride = input_ids.shape()[1];
@@ -149,8 +133,7 @@ void UpdateInputesV1(const paddle::Tensor &stop_flags,
max_bsz,
input_ids_stride,
block_num_per_seq,
block_size,
prefill_one_step_stop);
block_size);
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
bool *not_need_stop_data = const_cast<bool *>(not_need_stop.data<bool>());

View File

@@ -23,7 +23,7 @@
template <typename OutputType>
void DisPatchWFp8AFp8Gemm(
const cutlass::float_e4m3_t* input,
const uint32_t* sparse_idx,
const int32_t* sparse_idx,
const cutlass::float_e4m3_t* weight,
const int * tokens,
const float * weight_scale,
@@ -80,7 +80,7 @@ void WFp8AFp8Gemm(
if (is_bfloat16) {
DisPatchWFp8AFp8Gemm(
reinterpret_cast<const cutlass::float_e4m3_t*>(input.data<phi::dtype::float8_e4m3fn>()),
reinterpret_cast<const uint32_t*>(sparse_idx.data<int32_t>()),
sparse_idx.data<int32_t>(),
reinterpret_cast<const cutlass::float_e4m3_t*>(weight.data<phi::dtype::float8_e4m3fn>()),
tokens.data<int>(),
weight_scale.data<float>(),

View File

@@ -1,376 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "iluvatar_context.h"
template <paddle::DataType T>
void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& prefill_block_table,
const paddle::Tensor& decode_block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::Tensor& seq_lens,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int prefill_num_tokens,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
int window_left,
int window_right,
float softcap,
bool enable_cuda_graph,
bool use_sqrt_alibi,
paddle::Tensor& out) {
typedef PDTraits<T> traits_;
typedef typename traits_::data_t data_t;
const auto& dtype = qkv.dtype();
cuinferDataType_t cuinfer_data_type;
cudaDataType_t cu_data_type;
if (dtype == paddle::DataType::FLOAT16) {
cuinfer_data_type = CUINFER_DATA_HALF;
cu_data_type = CUDA_R_16F;
} else {
cuinfer_data_type = CUINFER_DATA_BFLOAT16;
cu_data_type = CUDA_R_16BF;
}
const auto& qkv_dims = qkv.dims();
const auto& kv_cache_dims = k_cache.dims();
const auto& prefill_block_table_dims = prefill_block_table.dims();
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
int prefill_batch_size = prefill_block_table_dims[0];
int num_tokens = qkv_dims[0];
int decode_num_tokens = num_tokens - prefill_num_tokens;
int num_total_heads = num_heads + 2 * num_kv_heads;
int max_num_blocks_per_seq = prefill_block_table_dims[1];
int qkv_stride = qkv.strides()[0];
int num_blocks = kv_cache_dims[0];
int kv_block_stride = k_cache.strides()[0];
int kv_head_stride = k_cache.strides()[1];
int block_table_stride = prefill_block_table.strides()[0];
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
cuinferTensorDescriptor_t qkv_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_desc,
cuinfer_data_type,
3,
std::vector<int>({prefill_num_tokens, num_total_heads, head_dim}).data(),
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t qkv_seqlens_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_seqlens_desc,
CUINFER_DATA_INT32,
1,
std::vector<int>({prefill_batch_size + 1}).data(),
std::vector<int>({1}).data()));
cuinferTensorDescriptor_t block_table_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
block_table_desc,
CUINFER_DATA_INT32,
2,
std::vector<int>({prefill_batch_size, block_table_stride}).data(),
std::vector<int>({block_table_stride, 1}).data()));
cuinferTensorDescriptor_t o_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
o_desc,
cuinfer_data_type,
3,
std::vector<int>({prefill_num_tokens, num_heads, head_dim}).data(),
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t k_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
k_cache_desc,
cuinfer_data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t v_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
v_cache_desc,
cuinfer_data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t cos_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
cos_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferTensorDescriptor_t sin_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
sin_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t prefill_workspace_size = 0;
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(prefill_num_tokens,
num_heads,
num_kv_heads,
head_dim,
q_rope,
k_rope,
v_rope,
cuinfer_data_type,
cuinfer_data_type,
cuinfer_data_type,
&prefill_workspace_size));
auto* allocator = paddle::GetAllocator(qkv.place());
phi::Allocator::AllocationPtr prefill_tmp_workspace = allocator->Allocate(prefill_workspace_size);
void* prefill_workspace_ptr = prefill_tmp_workspace->ptr();
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
qkv_desc,
qkv.data(),
qkv_seqlens_desc,
cu_seqlens_qkv.data<int32_t>(),
block_table_desc,
prefill_block_table.data<int32_t>(),
o_desc,
out.data(),
k_cache_desc,
k_cache.data(),
v_cache_desc,
v_cache.data(),
prefill_workspace_ptr,
prefill_workspace_size,
cos_desc,
rope_cos_ptr,
sin_desc,
rope_sin_ptr,
prefill_batch_size,
num_heads,
num_kv_heads,
head_dim,
causal,
scale,
q_rope,
k_rope,
v_rope));
size_t decode_workspace_size = 0;
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(decode_num_tokens,
num_heads,
num_kv_heads,
head_dim,
block_size,
max_seq_len,
&decode_workspace_size));
phi::Allocator::AllocationPtr decode_tmp_workspace = allocator->Allocate(decode_workspace_size);
void* decode_workspace_ptr = decode_tmp_workspace->ptr();
void* decode_qkv_ptr = (void*)(qkv.data<data_t>() + prefill_num_tokens * qkv_stride);
void* decode_out_ptr = (void*)(out.data<data_t>() + prefill_num_tokens * out.strides()[0]);
PageAttentionWithKVCacheArguments args{
static_cast<float>(scale), 1.0, 1.0, static_cast<float>(softcap), window_left, window_right,
causal, use_sqrt_alibi, enable_cuda_graph, false, nullptr, decode_qkv_ptr, decode_qkv_ptr,
decode_workspace_ptr, true, rope_sin_ptr, rope_cos_ptr};
CUINFER_CHECK(cuInferPageAttentionV7(cuinfer_handle,
decode_out_ptr,
cu_data_type,
decode_qkv_ptr,
cu_data_type,
decode_num_tokens,
num_heads,
num_kv_heads,
head_dim,
qkv_stride,
kv_block_stride,
kv_head_stride,
k_cache.data(),
cu_data_type,
v_cache.data(),
cu_data_type,
block_size,
max_num_blocks_per_seq,
max_seq_len,
decode_block_table.data<int32_t>(),
seq_lens.data<int32_t>(),
args));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}
std::vector<paddle::Tensor> MixedFusedPagedAttn(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& prefill_block_table,
const paddle::Tensor& decode_block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::Tensor& seq_lens,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int prefill_num_tokens,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
int window_left,
int window_right,
float softcap,
bool enable_cuda_graph,
bool use_sqrt_alibi) {
const auto dtype = qkv.dtype();
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
MixedFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
k_cache,
v_cache,
prefill_block_table,
decode_block_table,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
enable_cuda_graph,
use_sqrt_alibi,
out);
break;
case paddle::DataType::FLOAT16:
MixedFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
k_cache,
v_cache,
prefill_block_table,
decode_block_table,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
enable_cuda_graph,
use_sqrt_alibi,
out);
break;
default:
PD_THROW("Unsupported data type for mixed paged attn");
}
return {out};
}
std::vector<std::vector<int64_t>> MixedFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
int num_heads,
int head_dim) {
return {{qkv_shape[0], num_heads * head_dim}};
}
std::vector<paddle::DataType> MixedFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
return {qkv_dtype};
}
PD_BUILD_STATIC_OP(mixed_fused_paged_attn)
.Inputs({"qkv", "k_cache", "v_cache", "prefill_block_table", "decode_block_table",
"cu_seqlens_qkv", "seq_lens", paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
.Outputs({"out"})
.Attrs({"prefill_num_tokens:int",
"num_heads: int",
"head_dim:int",
"num_kv_heads:int",
"block_size:int",
"max_seq_len:int",
"scale:float",
"causal:bool",
"q_rope:bool",
"k_rope:bool",
"v_rope:bool",
"window_left:int",
"window_right:int",
"softcap:float",
"enable_cuda_graph:bool",
"use_sqrt_alibi:bool"})
.SetKernelFn(PD_KERNEL(MixedFusedPagedAttn))
.SetInferShapeFn(PD_INFER_SHAPE(MixedFusedPagedAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MixedFusedPagedAttnInferDtype));

View File

@@ -53,7 +53,6 @@ void MoeDispatchKernel(const paddle::Tensor& input,
const paddle::optional<paddle::Tensor>& gating_correction_bias,
const int moe_topk,
const bool group_moe,
const std::string &moe_quant_type,
const bool topk_only_mode,
const int num_rows,
const int hidden_size,
@@ -184,7 +183,6 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
const paddle::optional<paddle::Tensor>& w4a8_in_scale,
const int moe_topk,
const bool group_moe,
const std::string &moe_quant_type,
const bool topk_only_mode) {
const auto input_type = input.dtype();
auto place = input.place();
@@ -222,7 +220,6 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
gating_correction_bias,
moe_topk,
group_moe,
moe_quant_type,
topk_only_mode,
num_rows,
hidden_size,
@@ -239,7 +236,6 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
gating_correction_bias,
moe_topk,
group_moe,
moe_quant_type,
topk_only_mode,
num_rows,
hidden_size,
@@ -309,7 +305,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch)
"top_k_weight",
"top_k_indices",
"expert_idx_per_token"})
.Attrs({"moe_topk:int", "group_moe:bool", "moe_quant_type:std::string", "topk_only_mode:bool"})
.Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
.SetKernelFn(PD_KERNEL(MoeExpertDispatch))
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));

View File

@@ -27,8 +27,6 @@ void PagedAttnKernel(const paddle::Tensor& q,
const paddle::optional<paddle::Tensor> &v,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -88,36 +86,32 @@ void PagedAttnKernel(const paddle::Tensor& q,
common::errors::InvalidArgument(
"paged_attention expects seq_lens is contiguous"));
// check dim and shape
// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// k_cache: [num_blocks, kv_num_heads, block_size, head_size]
// v_cache: [num_blocks, kv_num_heads, block_size, head_size]
// block_table: [num_seqs, max_num_blocks_per_seq]
// seq_lens: [num_seqs]
// q and out:
// if merged_qkv = false:
// q:[num_seqs, hidden_size]
// out:[num_seqs, hidden_size]
// if merged_qkv = true:
// q: [num_seqs, (num_heads+2*num_kv_heads)*head_dim]
// out: [num_seqs, hidden_size]
// merged_qkv = false: [num_seqs, num_heads, head_size]
// merged_qkv = true: [num_seqs, num_heads+2*num_kv_heads, head_size]
const auto& q_dims = q.dims();
PADDLE_ENFORCE_EQ(q_dims.size(),
2,
3,
common::errors::InvalidArgument(
"paged_attn receive query dims is "
"[num_seqs, (num_heads+2*num_kv_heads)*head_dim]"));
"[num_seqs, num_heads, head_size]"));
PADDLE_ENFORCE_EQ(out.dims().size(),
2,
3,
common::errors::InvalidArgument(
"paged_attn receive out dims is "
"[num_seqs, hidden_size]"));
"[num_seqs, num_heads, head_size]"));
const auto& kv_cache_dims = k_cache.dims();
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
4,
common::errors::InvalidArgument(
"paged_attn receive kv cache dims is "
"[num_blocks, kv_num_heads, block_size, head_dim]"));
"[num_blocks, kv_num_heads, block_size, head_size]"));
const auto& block_table_dims = block_table.dims();
PADDLE_ENFORCE_EQ(block_table_dims.size(),
@@ -133,6 +127,8 @@ void PagedAttnKernel(const paddle::Tensor& q,
"paged_attn receive seq_lens dims is [num_seqs]"));
int num_seqs = q_dims[0];
int num_heads = merged_qkv ? q_dims[1] - 2 * num_kv_heads : q_dims[1];
int head_size = q_dims[2];
int max_num_blocks_per_seq = block_table_dims[1];
int q_stride = q.strides()[0];
int num_blocks = kv_cache_dims[0];
@@ -146,9 +142,9 @@ void PagedAttnKernel(const paddle::Tensor& q,
common::errors::InvalidArgument(
"kv_cache_dims[2] must be equal to block_size"));
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
head_dim,
head_size,
common::errors::InvalidArgument(
"kv_cache_dims[3] must be equal to head_dim"));
"kv_cache_dims[3] must be equal to head_size"));
PADDLE_ENFORCE_EQ(block_table_dims[0],
num_seqs,
common::errors::InvalidArgument(
@@ -166,13 +162,14 @@ void PagedAttnKernel(const paddle::Tensor& q,
const float *rope_sin_ptr = merged_qkv ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = merged_qkv ? rope_cos.get().data<float>() : nullptr;
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(q.place()));
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t workspace_size = 0;
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(num_seqs,
num_heads,
num_kv_heads,
head_dim,
head_size,
block_size,
max_context_len,
&workspace_size));
@@ -192,7 +189,7 @@ void PagedAttnKernel(const paddle::Tensor& q,
num_seqs,
num_heads,
num_kv_heads,
head_dim,
head_size,
q_stride,
kv_block_stride,
kv_head_stride,
@@ -218,8 +215,6 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
const paddle::optional<paddle::Tensor> &v,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -233,7 +228,11 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
bool merged_qkv) {
const auto dtype = q.dtype();
auto out = paddle::empty({q.shape()[0], num_heads * head_dim}, dtype, q.place());
auto out_shape = q.shape();
if (merged_qkv) {
out_shape[1] -= 2 * num_kv_heads;
}
auto out = paddle::empty(out_shape, dtype, q.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
@@ -247,8 +246,6 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
v,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
scale,
block_size,
@@ -273,8 +270,6 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
v,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
scale,
block_size,
@@ -304,8 +299,6 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
const std::vector<int64_t>& v_shape,
const std::vector<int64_t>& rope_sin_shape,
const std::vector<int64_t>& rope_cos_shape,
int num_heads,
int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -318,13 +311,36 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
bool use_sqrt_alibi,
bool merged_qkv) {
if (merged_qkv) {
return {{q_shape[0], num_heads * head_dim}};
int64_t num_tokens = q_shape[0];
int64_t num_heads = q_shape[1] - 2 * num_kv_heads;
int64_t head_dim = q_shape[2];
return {{num_tokens, num_heads, head_dim}};
} else {
return {q_shape};
}
}
std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype) {
std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype,
const paddle::DataType& k_cache_dtype,
const paddle::DataType& v_cache_dtype,
const paddle::DataType& block_table_dtype,
const paddle::DataType& seq_lens_dtype,
const paddle::DataType& alibi_slopes_dtype,
const paddle::DataType& k_dtype,
const paddle::DataType& v_dtype,
const paddle::DataType& rope_sin_dtype,
const paddle::DataType& rope_cos_dtype,
int num_kv_heads,
float scale,
int block_size,
int max_context_len,
bool causal,
int window_left,
int window_right,
float softcap,
bool enable_cuda_graph,
bool use_sqrt_alibi,
bool merged_qkv) {
return {q_dtype};
}
@@ -335,9 +351,7 @@ PD_BUILD_STATIC_OP(paged_attn)
paddle::Optional("v"), paddle::Optional("rope_sin"),
paddle::Optional("rope_cos")})
.Outputs({"out"})
.Attrs({"num_heads:int",
"head_dim:int",
"num_kv_heads:int",
.Attrs({"num_kv_heads:int",
"scale:float",
"block_size:int",
"max_context_len:int",

View File

@@ -1,378 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "iluvatar_context.h"
template <paddle::DataType T>
void PrefillFusedPagedAttnKernel(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
paddle::Tensor& out) {
// check dtype and contiguous
const auto& dtype = qkv.dtype();
cuinferDataType_t data_type;
if (dtype == paddle::DataType::FLOAT16) {
data_type = CUINFER_DATA_HALF;
} else if (dtype == paddle::DataType::BFLOAT16) {
data_type = CUINFER_DATA_BFLOAT16;
} else {
PD_THROW("paged_attention only supports half and bfloat16 now");
}
PADDLE_ENFORCE_EQ(k_cache.dtype(),
dtype,
common::errors::InvalidArgument(
"k_cache dtype must be the same as query dtype"));
PADDLE_ENFORCE_EQ(k_cache.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects k_cache is contiguous"));
PADDLE_ENFORCE_EQ(block_table.dtype(),
paddle::DataType::INT32,
common::errors::InvalidArgument(
"block_table dtype must be int32"));
PADDLE_ENFORCE_EQ(block_table.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects block_table is contiguous"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.dtype(),
paddle::DataType::INT32,
common::errors::InvalidArgument(
"cu_seqlens_qkv dtype must be int32"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects cu_seqlens_qkv is contiguous"));
// check dim and shape
// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// block_table: [batch_size, max_num_blocks_per_seq]
// seq_lens: [batch_size]
// qkv: [num_tokens, (num_heads+2*num_kv_heads)*head_dim]
// out: [num_tokens, hidden_size]
const auto& qkv_dims = qkv.dims();
PADDLE_ENFORCE_EQ(qkv_dims.size(),
2,
common::errors::InvalidArgument(
"paged_attn receive query dims is "
"[num_tokens, (num_heads+2*num_kv_heads)*head_dim]"));
PADDLE_ENFORCE_EQ(out.dims().size(),
2,
common::errors::InvalidArgument(
"paged_attn receive out dims is "
"[num_tokens, hidden_size]"));
const auto& kv_cache_dims = k_cache.dims();
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
4,
common::errors::InvalidArgument(
"paged_attn receive kv cache dims is "
"[num_blocks, kv_num_heads, block_size, head_dim]"));
const auto& block_table_dims = block_table.dims();
PADDLE_ENFORCE_EQ(block_table_dims.size(),
2,
common::errors::InvalidArgument(
"paged_attn receive block_table dims is "
"[batch_size, max_num_blocks_per_seq]"));
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims.size(),
1,
common::errors::InvalidArgument(
"paged_attn receive cu_seqlens_qkv dims is [batch_size]"));
int batch_size = block_table_dims[0];
int num_tokens = qkv_dims[0];
int num_total_heads = num_heads + 2 * num_kv_heads;
int qkv_stride = qkv.strides()[0];
int num_blocks = kv_cache_dims[0];
PADDLE_ENFORCE_EQ(kv_cache_dims[1],
num_kv_heads,
common::errors::InvalidArgument(
"kv_cache_dims[1] must be equal to num_kv_head"));
PADDLE_ENFORCE_EQ(kv_cache_dims[2],
block_size,
common::errors::InvalidArgument(
"kv_cache_dims[2] must be equal to block_size"));
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
head_dim,
common::errors::InvalidArgument(
"kv_cache_dims[3] must be equal to head_dim"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims[0],
batch_size + 1,
common::errors::InvalidArgument(
"cu_seqlens_qkv_dims[0] must be equal to batch_size + 1"));
int block_table_stride = block_table.strides()[0];
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t workspace_size = 0;
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(num_tokens,
num_heads,
num_kv_heads,
head_dim,
q_rope,
k_rope,
v_rope,
data_type,
data_type,
data_type,
&workspace_size));
auto* allocator = paddle::GetAllocator(qkv.place());
phi::Allocator::AllocationPtr tmp_workspace = allocator->Allocate(workspace_size);
void* workspace_ptr = tmp_workspace->ptr();
cuinferTensorDescriptor_t qkv_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_desc,
data_type,
3,
std::vector<int>({num_tokens, num_total_heads, head_dim}).data(),
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t qkv_seqlens_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_seqlens_desc,
CUINFER_DATA_INT32,
1,
std::vector<int>({batch_size + 1}).data(),
std::vector<int>({1}).data()));
cuinferTensorDescriptor_t block_table_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
block_table_desc,
CUINFER_DATA_INT32,
2,
std::vector<int>({batch_size, block_table_stride}).data(),
std::vector<int>({block_table_stride, 1}).data()));
cuinferTensorDescriptor_t o_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
o_desc,
data_type,
3,
std::vector<int>({num_tokens, num_heads, head_dim}).data(),
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t k_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
k_cache_desc,
data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t v_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
v_cache_desc,
data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t cos_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
cos_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferTensorDescriptor_t sin_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
sin_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
qkv_desc,
qkv.data(),
qkv_seqlens_desc,
cu_seqlens_qkv.data<int32_t>(),
block_table_desc,
block_table.data<int32_t>(),
o_desc,
out.data(),
k_cache_desc,
k_cache.data(),
v_cache_desc,
v_cache.data(),
workspace_ptr,
workspace_size,
cos_desc,
rope_cos_ptr,
sin_desc,
rope_sin_ptr,
batch_size,
num_heads,
num_kv_heads,
head_dim,
causal,
scale,
q_rope,
k_rope,
v_rope));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}
std::vector<paddle::Tensor> PrefillFusedPagedAttn(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope) {
const auto dtype = qkv.dtype();
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
PrefillFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
k_cache,
v_cache,
block_table,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
out);
break;
case paddle::DataType::FLOAT16:
PrefillFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
k_cache,
v_cache,
block_table,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
out);
break;
default:
PD_THROW("Unsupported data type for Paged attn");
}
return {out};
}
std::vector<std::vector<int64_t>> PrefillFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
const std::vector<int64_t>& k_cache_shape,
const std::vector<int64_t>& v_cache_shape,
const std::vector<int64_t>& block_table_shape,
const std::vector<int64_t>& cu_seqlens_qkv_shape,
const std::vector<int64_t>& rope_sin_shape,
const std::vector<int64_t>& rope_cos_shape,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope) {
return {{qkv_shape[0], num_heads * head_dim}};
}
std::vector<paddle::DataType> PrefillFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
return {qkv_dtype};
}
PD_BUILD_STATIC_OP(prefill_fused_paged_attn)
.Inputs({"qkv", "k_cache", "v_cache", "block_table", "cu_seqlens_qkv",
paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
.Outputs({"out"})
.Attrs({"num_heads:int",
"head_dim:int",
"num_kv_heads:int",
"block_size:int",
"max_seq_len:int",
"scale:float",
"causal:bool",
"q_rope:bool",
"k_rope:bool",
"v_rope:bool"})
.SetKernelFn(PD_KERNEL(PrefillFusedPagedAttn))
.SetInferShapeFn(PD_INFER_SHAPE(PrefillFusedPagedAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(PrefillFusedPagedAttnInferDtype));

View File

@@ -1,181 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "helper.h"
#include "mc_fused_moe_helper.h"
#include "fused_moe_op.h"
__global__ void compute_total_rows_before_expert_kernel(
int* sorted_experts,
const int64_t sorted_experts_len,
const int64_t num_experts,
int32_t* total_rows_before_expert) {
const int expert = blockIdx.x * blockDim.x + threadIdx.x;
if (expert >= num_experts) return;
total_rows_before_expert[expert] =
find_total_elts_leq_target(sorted_experts, sorted_experts_len, expert);
}
void compute_total_rows_before_expert(int* sorted_indices,
const int64_t total_indices,
const int64_t num_experts,
int32_t* total_rows_before_expert,
cudaStream_t stream) {
const int threads = std::min(int64_t(1024), num_experts);
const int blocks = (num_experts + threads - 1) / threads;
compute_total_rows_before_expert_kernel<<<blocks, threads, 0, stream>>>(
sorted_indices, total_indices, num_experts, total_rows_before_expert);
}
template <paddle::DataType T, typename ElementA, typename ElementB, typename ElementC>
void FusedMoeKernel(const paddle::Tensor& input,
const paddle::Tensor& gate_weight,
const paddle::Tensor& ffn1_weight,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn2_bias,
const std::string& quant_method,
const int moe_topk,
const bool group_moe,
const bool norm_topk_prob,
paddle::Tensor* output) {
typedef PDTraits<T> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
auto* output_data = output->data<data_t>();
auto moe_compute = McMoeHelper<data_t, ElementA, ElementB, ElementC>(quant_method);
moe_compute.computeFFN(
&input,
&gate_weight,
&ffn1_weight,
ffn1_scale ? ffn1_scale.get_ptr() : nullptr,
ffn1_bias ? ffn1_bias.get_ptr() : nullptr,
&ffn2_weight,
ffn2_scale ? ffn2_scale.get_ptr() : nullptr,
ffn2_bias ? ffn2_bias.get_ptr() : nullptr,
nullptr,
moe_topk,
group_moe,
norm_topk_prob,
1.0, // ComputeFFN
"ffn",
output);
}
std::vector<paddle::Tensor> FusedExpertMoe(
const paddle::Tensor& input,
const paddle::Tensor& gate_weight,
const paddle::Tensor& ffn1_weight,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_bias,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const std::string& quant_method,
const int moe_topk,
const bool norm_topk_prob,
const bool group_moe) {
const auto input_type = input.dtype();
auto output = paddle::empty_like(input);
switch (input_type) {
case paddle::DataType::BFLOAT16:
FusedMoeKernel<paddle::DataType::BFLOAT16, maca_bfloat16, int8_t, maca_bfloat16>(input,
gate_weight,
ffn1_weight,
ffn1_scale,
ffn1_bias,
ffn2_weight,
ffn2_scale,
ffn2_bias,
quant_method,
moe_topk,
group_moe,
norm_topk_prob,
&output);
break;
// case paddle::DataType::FLOAT16:
// FusedMoeKernel<paddle::DataType::FLOAT16>(input,
// gate_weight,
// ffn1_weight,
// ffn1_scale,
// ffn1_bias,
// ffn2_weight,
// ffn2_scale,
// ffn2_bias,
// quant_method,
// moe_topk,
// group_moe,
// norm_topk_prob,
// &output);
// break;
default:
PD_THROW("Only support bf16 for FusedMoeKernel");
}
return {output};
}
std::vector<std::vector<int64_t>> FusedExpertMoeInferShape(
const std::vector<int64_t>& input_shape,
const std::vector<int64_t>& gate_weight_shape,
const std::vector<int64_t>& ffn1_weight_shape,
const std::vector<int64_t>& ffn2_weight_shape,
const paddle::optional<std::vector<int64_t>>& ffn1_bias_shape,
const paddle::optional<std::vector<int64_t>>& ffn1_scale_shape,
const paddle::optional<std::vector<int64_t>>& ffn2_bias_shape,
const paddle::optional<std::vector<int64_t>>& ffn2_scale_shape) {
return {input_shape};
}
std::vector<paddle::DataType> FusedExpertMoeInferDtype(
const paddle::DataType& input_dtype,
const paddle::DataType& gate_weight_dtype,
const paddle::DataType& ffn1_weight_dtype,
const paddle::DataType& ffn2_weight_dtype,
const paddle::optional<paddle::DataType>& ffn1_bias_dtype,
const paddle::optional<paddle::DataType>& ffn1_scale_dtype,
const paddle::optional<paddle::DataType>& ffn2_bias_dtype,
const paddle::optional<paddle::DataType>& ffn2_scale_dtype) {
return {input_dtype};
}
PD_BUILD_OP(fused_expert_moe)
.Inputs({"input",
"gate_weight",
"ffn1_weight",
"ffn2_weight",
paddle::Optional("ffn1_bias"),
paddle::Optional("ffn1_scale"),
paddle::Optional("ffn2_bias"),
paddle::Optional("ffn2_scale")})
.Outputs({"output"})
.Attrs({"quant_method:std::string",
"moe_topk:int",
"norm_topk_prob:bool",
"group_moe:bool"})
.SetKernelFn(PD_KERNEL(FusedExpertMoe))
.SetInferShapeFn(PD_INFER_SHAPE(FusedExpertMoeInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(FusedExpertMoeInferDtype));

View File

@@ -1,53 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h"
#include "fused_moe_op.h"
using namespace phi;
template <typename T, int VecSize>
__global__ void moe_token_type_ids_kernel(T *gating_output,
const int *moe_token_type_ids_out,
const int num_rows,
const int num_experts,
const int k) {
const int moe_token_index = blockIdx.x * blockDim.x + threadIdx.x;
if (moe_token_index >= num_rows) {
return;
}
gating_output[moe_token_index * 2] =
gating_output[moe_token_index * 2] +
(moe_token_type_ids_out[moe_token_index]) * -1e10;
gating_output[moe_token_index * 2 + 1] =
gating_output[moe_token_index * 2 + 1] +
(1 - moe_token_type_ids_out[moe_token_index]) * -1e10;
}
template <typename T>
void moe_token_type_ids_kernelLauncher(T *gating_output,
const int *moe_token_type_ids_out,
const int num_rows,
const int num_experts,
const int k,
cudaStream_t stream) {
const int blocks = num_rows * k / 512 + 1;
const int threads = 512;
moe_token_type_ids_kernel<T, 1><<<blocks, 512, 0, stream>>>(
gating_output, moe_token_type_ids_out, num_rows, num_experts, k);
}
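// For example, assuming a two-column gating output (one pair of experts per token):
//   moe_token_type_ids_out[i] == 1  ->  gating_output[i * 2]     += -1e10 (expert 0 suppressed)
//   moe_token_type_ids_out[i] == 0  ->  gating_output[i * 2 + 1] += -1e10 (expert 1 suppressed)
// so the later top-k step routes each token to the expert matching its token type.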

View File

@@ -1,123 +0,0 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#include <sstream>
#include "cub/cub.cuh"
static const float HALF_FLT_MAX = 65504.F;
static const float HALF_FLT_MIN = -65504.F;
static inline size_t AlignTo16(const size_t& input) {
static constexpr int ALIGNMENT = 16;
return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT);
}
class CubKeyValueSorter {
public:
CubKeyValueSorter() : num_experts_(0), num_bits_(sizeof(int) * 8) {}
explicit CubKeyValueSorter(const int num_experts)
: num_experts_(num_experts),
num_bits_(static_cast<int>(log2(num_experts)) + 1) {}
void update_num_experts(const int num_experts) {
num_experts_ = num_experts;
num_bits_ = static_cast<int>(log2(num_experts)) + 1;
}
size_t getWorkspaceSize(const size_t num_key_value_pairs,
bool descending = false) {
num_key_value_pairs_ = num_key_value_pairs;
size_t required_storage = 0;
int* null_int = nullptr;
if (descending) {
cub::DeviceRadixSort::SortPairsDescending(NULL,
required_storage,
null_int,
null_int,
null_int,
null_int,
num_key_value_pairs,
0,
32);
} else {
cub::DeviceRadixSort::SortPairs(NULL,
required_storage,
null_int,
null_int,
null_int,
null_int,
num_key_value_pairs,
0,
num_bits_);
}
return required_storage;
}
template <typename KeyT>
void run(void* workspace,
const size_t workspace_size,
const KeyT* keys_in,
KeyT* keys_out,
const int* values_in,
int* values_out,
const size_t num_key_value_pairs,
bool descending,
cudaStream_t stream) {
size_t expected_ws_size = getWorkspaceSize(num_key_value_pairs);
size_t actual_ws_size = workspace_size;
if (expected_ws_size > workspace_size) {
std::stringstream err_ss;
err_ss << "[Error][CubKeyValueSorter::run]\n";
err_ss << "Error. The allocated workspace is too small to run this "
"problem.\n";
err_ss << "Expected workspace size of at least " << expected_ws_size
<< " but got problem size " << workspace_size << "\n";
throw std::runtime_error(err_ss.str());
}
if (descending) {
cub::DeviceRadixSort::SortPairsDescending(workspace,
actual_ws_size,
keys_in,
keys_out,
values_in,
values_out,
num_key_value_pairs,
0,
32,
stream);
} else {
cub::DeviceRadixSort::SortPairs(workspace,
actual_ws_size,
keys_in,
keys_out,
values_in,
values_out,
num_key_value_pairs,
0,
num_bits_,
stream);
}
}
private:
size_t num_key_value_pairs_;
int num_experts_;
int num_bits_;
};
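// A minimal usage sketch (assumes d_keys_in/d_keys_out/d_vals_in/d_vals_out are
// device buffers of length n and stream is a valid cudaStream_t):
//   CubKeyValueSorter sorter(num_experts);
//   size_t ws_bytes = sorter.getWorkspaceSize(n);
//   void* d_ws = nullptr;
//   cudaMalloc(&d_ws, ws_bytes);
//   sorter.run(d_ws, ws_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
//              n, /*descending=*/false, stream);
//   cudaFree(d_ws);
// The two-phase pattern mirrors cub::DeviceRadixSort: query the workspace size,
// allocate it, then sort with the same key/value arguments.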

View File

@@ -1,990 +0,0 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include "fused_moe_imp_op.h"
#include "fused_moe_helper.h"
#include "mctlass/numeric_conversion.h" // BUILD_MARK
// Ignore mctlass warnings about type punning
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-function"
// #include "paddle/phi/backends/gpu/gpu_info.h"
#pragma GCC diagnostic pop
#include "helper.h"
#define WARP_SIZE 32
struct GpuLaunchConfig {
dim3 block_per_grid;
dim3 thread_per_block;
};
inline GpuLaunchConfig Get1DBlocksAnd2DGridsMoe(const int64_t cols) {
int blocks_x = cols;
int blocks_y = 1;
int blocks_z = 1;
if (blocks_x > 1024) {
blocks_y = 256;
blocks_x = (blocks_x + blocks_y - 1) / blocks_y;
}
GpuLaunchConfig config;
config.block_per_grid.x = blocks_x;
config.block_per_grid.y = blocks_y;
config.block_per_grid.z = blocks_z;
return config;
}
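// For example, cols = 5000 yields block_per_grid = (20, 256, 1), i.e. 5120 blocks;
// the kernels below recover a flat index via
//   int globalIdx = blockIdx.x + blockIdx.y * gridDim.x;
// and early-exit when globalIdx >= num_rows, so the 120 surplus blocks are benign.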
// ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing
// the output in the softmax kernel when we extend this module to support
// expert-choice routing.
template <typename T, int TPB>
__launch_bounds__(TPB) __global__
void group_moe_softmax(const T* input,
T* output,
T* softmax_max_prob,
const int64_t num_cols,
const int64_t softmax_num_rows) {
using BlockReduce = cub::BlockReduce<float, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage;
__shared__ float normalizing_factor;
__shared__ float float_max;
__shared__ float max_out;
int globalIdx = blockIdx.x + blockIdx.y * gridDim.x;
if (globalIdx >= softmax_num_rows) {
return;
}
const int64_t thread_row_offset = globalIdx * num_cols;
cub::Sum sum;
float threadData(-FLT_MAX);
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
const int idx = thread_row_offset + ii;
threadData = max(static_cast<float>(input[idx]), threadData);
}
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
if (threadIdx.x == 0) {
float_max = maxElem;
}
__syncthreads();
threadData = 0;
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
const int idx = thread_row_offset + ii;
threadData += exp((static_cast<float>(input[idx]) - float_max));
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
if (threadIdx.x == 0) {
normalizing_factor = 1.f / Z;
}
__syncthreads();
threadData = 0;
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
const int idx = thread_row_offset + ii;
const float val =
exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
output[idx] = T(val);
threadData = max(static_cast<float>(T(val)), threadData);
}
const float maxOut = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
if (threadIdx.x == 0) {
// group max probs
max_out = 1.f / maxOut;
softmax_max_prob[globalIdx] = T(max_out);
}
__syncthreads();
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
const int idx = thread_row_offset + ii;
// group softmax normalization
output[idx] = output[idx] * static_cast<T>(max_out);
}
}
template <typename T, int TPB>
__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
T* output,
int* indices,
int* source_rows,
T* softmax_max_prob,
const int64_t num_experts,
const int64_t k,
const int64_t num_rows) {
using cub_kvp = cub::KeyValuePair<int, T>;
using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage;
cub_kvp thread_kvp;
cub::ArgMax arg_max;
const int block_row = blockIdx.x + blockIdx.y * gridDim.x;
if (block_row >= num_rows) {
return;
}
const bool should_process_row = true;
const int thread_read_offset = block_row * num_experts;
for (int k_idx = 0; k_idx < k; ++k_idx) {
thread_kvp.key = 0;
thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities
cub_kvp inp_kvp;
for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
const int idx = thread_read_offset + expert;
inp_kvp.key = expert;
inp_kvp.value = inputs_after_softmax[idx];
for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
const int prior_winning_expert = indices[k * block_row + prior_k];
if (prior_winning_expert == expert) {
inp_kvp = thread_kvp;
}
}
thread_kvp = arg_max(inp_kvp, thread_kvp);
}
const cub_kvp result_kvp =
BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
if (threadIdx.x == 0) {
const int idx = k * block_row + k_idx;
// restore normalized probabilities
output[idx] = result_kvp.value / T(softmax_max_prob[idx]);
indices[idx] = should_process_row ? result_kvp.key : num_experts;
source_rows[idx] = k_idx * num_rows + block_row;
}
__syncthreads();
}
}
template <typename T, int TPB>
__launch_bounds__(TPB) __global__ void moe_softmax(const T* input,
T* output,
const int64_t num_cols,
const int64_t num_rows) {
using BlockReduce = cub::BlockReduce<float, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage;
__shared__ float normalizing_factor;
__shared__ float float_max;
int globalIdx = blockIdx.x + blockIdx.y * gridDim.x;
if (globalIdx >= num_rows) {
return;
}
const int64_t thread_row_offset = globalIdx * num_cols;
cub::Sum sum;
float threadData(-FLT_MAX);
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
const int idx = thread_row_offset + ii;
threadData = max(static_cast<float>(input[idx]), threadData);
}
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
if (threadIdx.x == 0) {
float_max = maxElem;
}
__syncthreads();
threadData = 0;
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
const int idx = thread_row_offset + ii;
threadData += exp((static_cast<float>(input[idx]) - float_max));
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
if (threadIdx.x == 0) {
normalizing_factor = 1.f / Z;
}
__syncthreads();
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
const int idx = thread_row_offset + ii;
const float val =
exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
output[idx] = T(val);
}
}
template <typename T, int TPB>
__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
T* output,
int* indices,
int* source_rows,
const int64_t num_experts,
const int64_t k,
const int64_t num_rows) {
using cub_kvp = cub::KeyValuePair<int, T>;
using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage;
cub_kvp thread_kvp;
cub::ArgMax arg_max;
const int block_row = blockIdx.x + blockIdx.y * gridDim.x;
if (block_row >= num_rows) {
return;
}
const bool should_process_row = true;
const int thread_read_offset = block_row * num_experts;
for (int k_idx = 0; k_idx < k; ++k_idx) {
thread_kvp.key = 0;
thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities
cub_kvp inp_kvp;
for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
const int idx = thread_read_offset + expert;
inp_kvp.key = expert;
inp_kvp.value = inputs_after_softmax[idx];
for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
const int prior_winning_expert = indices[k * block_row + prior_k];
if (prior_winning_expert == expert) {
inp_kvp = thread_kvp;
}
}
thread_kvp = arg_max(inp_kvp, thread_kvp);
}
const cub_kvp result_kvp =
BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
if (threadIdx.x == 0) {
const int idx = k * block_row + k_idx;
output[idx] = result_kvp.value;
indices[idx] = should_process_row ? result_kvp.key : num_experts;
source_rows[idx] = k_idx * num_rows + block_row;
}
__syncthreads();
}
}
// ====================== TopK softmax things ===============================
/*
A Top-K gating softmax written to exploit when the number of experts in the
MoE layers are a small power of 2. This allows us to cleanly share the rows
among the threads in a single warp and eliminate communication between warps
(so no need to use shared mem).
It fuses the softmax, max and argmax into a single kernel.
Limitations:
1) This implementation is intended for when the number of experts is a small
power of 2. 2) This implementation assumes k is small, but will work for any
k.
*/
template <typename T,
int VPT,
int NUM_EXPERTS,
int WARPS_PER_CTA,
int BYTES_PER_LDG>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
void topk_gating_softmax(const T* input,
T* output,
const int64_t num_rows,
int* indices,
int* source_rows,
const int64_t k) {
// We begin by enforcing compile time assertions and setting up compile time
// constants.
static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS),
"NUM_EXPERTS must be power of 2");
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG),
"BYTES_PER_LDG must be power of 2");
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
// Number of bytes each thread pulls in per load
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
// Restrictions based on previous section.
static_assert(
VPT % ELTS_PER_LDG == 0,
"The elements per thread must be a multiple of the elements per ldg");
static_assert(WARP_SIZE % THREADS_PER_ROW == 0,
"The threads per row must cleanly divide the threads per warp");
static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW),
"THREADS_PER_ROW must be power of 2");
static_assert(THREADS_PER_ROW <= WARP_SIZE,
"THREADS_PER_ROW can be at most warp size");
// We have NUM_EXPERTS elements per row. We specialize for small #experts
static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
// Restrictions for previous section.
static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0,
"The elts per row must cleanly divide the total elt per warp");
// ===================== From this point, we finally start computing run-time
// variables. ========================
// Compute CTA and warp rows. We pack multiple rows into a single warp, and a
// block contains WARPS_PER_CTA warps. Thus, each block processes a chunk of
// rows. We start by computing the start row for each block.
const int cta_base_row = blockIdx.x * ROWS_PER_CTA;
// Now, using the base row per thread block, we compute the base row per warp.
const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP;
// The threads in a warp are split into sub-groups that will work on a row.
// We compute row offset for each thread sub-group
const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
const int thread_row = warp_base_row + thread_row_in_warp;
// Threads with indices out of bounds should early exit here.
if (thread_row >= num_rows) return;
const bool should_process_row = true;
// We finally start setting up the read pointers for each thread. First, each
// thread jumps to the start of the row it will read.
const T* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
// Now, we compute the group each thread belongs to in order to determine the
// first column to start loads.
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
const T* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Determine the pointer type to use to read in the data depending on the
// BYTES_PER_LDG template param. In theory, this can support all powers of 2
// up to 16.
using AccessType = mctlass::AlignedArray<T, ELTS_PER_LDG>;
// Finally, we pull in the data from global mem
mctlass::Array<T, VPT> row_chunk_input;
AccessType* row_chunk_vec_ptr =
reinterpret_cast<AccessType*>(&row_chunk_input);
const AccessType* vec_thread_read_ptr =
reinterpret_cast<const AccessType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
}
using ComputeType = float;
using Converter = mctlass::NumericArrayConverter<ComputeType, T, VPT>;
Converter compute_type_converter;
mctlass::Array<ComputeType, VPT> row_chunk =
compute_type_converter(row_chunk_input);
// First, we perform a max reduce within the thread. We can do the max in fp16
// safely (I think) and just convert to float afterwards for the exp + sum
// reduction.
ComputeType thread_max = row_chunk[0];
#pragma unroll
for (int ii = 1; ii < VPT; ++ii) {
thread_max = max(thread_max, row_chunk[ii]);
}
// Now, we find the max within the thread group and distribute among the
// threads. We use a butterfly reduce.
#pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
thread_max =
max(thread_max,
__shfl_xor_sync(0xFFFFFFFF, thread_max, mask, THREADS_PER_ROW));
}
// From this point, thread max in all the threads have the max within the row.
// Now, we subtract the max from each element in the thread and take the exp.
// We also compute the thread local sum.
float row_sum = 0;
#pragma unroll
for (int ii = 0; ii < VPT; ++ii) {
row_chunk[ii] = expf(row_chunk[ii] - thread_max);
row_sum += row_chunk[ii];
}
// Now, we perform the sum reduce within each thread group. Similar to the max
// reduce, we use a butterfly pattern.
#pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask, THREADS_PER_ROW);
}
// From this point, all threads have the max and the sum for their rows in the
// thread_max and thread_sum variables respectively. Finally, we can scale the
// rows for the softmax. Technically, for top-k gating we don't need to
// compute the entire softmax row. We can likely look at the maxes and only
// compute for the top-k values in the row. However, this kernel will likely
// not be a bottleneck and it seems better to more closely match torch and find the
// argmax after computing the softmax.
const float reciprocal_row_sum = 1.f / row_sum;
#pragma unroll
for (int ii = 0; ii < VPT; ++ii) {
row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
}
// Now, row_chunk contains the softmax of the row chunk. Next, we want to find
// the topk elements in each row, along with the max index.
int start_col = first_elt_read_by_thread;
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
for (int k_idx = 0; k_idx < k; ++k_idx) {
// First, each thread does the local argmax
float max_val = row_chunk[0];
int expert = start_col;
#pragma unroll
for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD;
++ldg, col += COLS_PER_GROUP_LDG) {
#pragma unroll
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
float val = row_chunk[ldg * ELTS_PER_LDG + ii];
// No check on the experts here since columns with the smallest index
// are processed first and only updated if > (not >=)
if (val > max_val) {
max_val = val;
expert = col + ii;
}
}
}
// Now, we perform the argmax reduce. We use the butterfly pattern so threads
// reach consensus about the max. This will be useful for K > 1 so that the
// threads can agree on "who" had the max value. That thread can then blank out
// their max with -inf and the warp can run more iterations...
#pragma unroll
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
float other_max =
__shfl_xor_sync(0xFFFFFFFF, max_val, mask, THREADS_PER_ROW);
int other_expert =
__shfl_xor_sync(0xFFFFFFFF, expert, mask, THREADS_PER_ROW);
// We want lower indices to "win" in every thread so we break ties this
// way
if (other_max > max_val ||
(other_max == max_val && other_expert < expert)) {
max_val = other_max;
expert = other_expert;
}
}
// Write the max for this k iteration to global memory.
if (thread_group_idx == 0) {
// The lead thread from each sub-group will write out the final results to
// global memory. (This will be a single) thread per row of the
// input/output matrices.
const int idx = k * thread_row + k_idx;
output[idx] = T(max_val);
indices[idx] = should_process_row ? expert : NUM_EXPERTS;
source_rows[idx] = k_idx * num_rows + thread_row;
}
// Finally, we clear the value in the thread with the current max if there
// is another iteration to run.
if (k_idx + 1 < k) {
const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG;
const int thread_to_clear_in_group =
(expert / ELTS_PER_LDG) % THREADS_PER_ROW;
// Only the thread in the group which produced the max will reset the
// "winning" value to -inf.
if (thread_group_idx == thread_to_clear_in_group) {
const int offset_for_expert = expert % ELTS_PER_LDG;
// Safe to set to any negative value since row_chunk values must be
// between 0 and 1.
row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] =
ComputeType(-10000.f);
}
}
}
}
namespace detail {
// Constructs some constants needed to partition the work across threads at
// compile time.
template <typename T, int EXPERTS, int BYTES_PER_LDG>
struct TopkConstants {
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 ||
EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0,
"");
static constexpr int VECs_PER_THREAD =
std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
};
} // namespace detail
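// For example, with T = half and EXPERTS = 64 the launcher below picks
// BYTES_PER_LDG = min(16, sizeof(half) * 64) = 16, so TopkConstants gives
//   ELTS_PER_LDG = 8, VPT = 8, THREADS_PER_ROW = 8, ROWS_PER_WARP = 4,
// i.e. 8 threads cooperate on one row and each warp covers 4 rows.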
template <typename T, int EXPERTS, int WARPS_PER_TB>
void topk_gating_softmax_launcher_helper(const T* input,
T* output,
int* indices,
int* source_row,
const int64_t num_rows,
const int64_t num_experts,
const int64_t k,
cudaStream_t stream) {
static constexpr uint64_t MAX_BYTES_PER_LDG = 16;
static constexpr int BYTES_PER_LDG =
std::min(MAX_BYTES_PER_LDG, sizeof(T) * EXPERTS);
using Constants = detail::TopkConstants<T, EXPERTS, BYTES_PER_LDG>;
static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
topk_gating_softmax<T, VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG>
<<<num_blocks, block_dim, 0, stream>>>(
input, output, num_rows, indices, source_row, k);
}
template <typename T>
void topk_gating_softmax_kernelLauncher(const T* input,
T* output,
T* softmax,
int* indices,
int* source_row,
T* softmax_max_prob,
const int64_t num_rows,
const int64_t num_experts,
const int64_t k,
const bool group_moe,
cudaStream_t stream,
const bool topk_only_mode = false) {
if (topk_only_mode) {
static constexpr int TPB = 256;
const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
moe_top_k<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
input, output, indices, source_row, num_experts, k, num_rows);
return;
}
static constexpr int WARPS_PER_TB = 4;
#define LAUNCH_TOPK_GATING_SOFTMAX_HELPER(N) \
case N: { \
topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB>( \
input, output, indices, source_row, num_rows, num_experts, k, stream); \
break; \
}
switch (num_experts) {
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(2)
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(4)
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(8)
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(16)
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(32)
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(64)
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(128)
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(256)
default: {
static constexpr int TPB = 256;
if (group_moe) {
const int group_experts = num_experts / k;
const int softmax_num_rows = num_rows * k;
const auto config_softmax = Get1DBlocksAnd2DGridsMoe(softmax_num_rows);
group_moe_softmax<T, TPB>
<<<config_softmax.block_per_grid, TPB, 0, stream>>>(
input,
softmax,
softmax_max_prob,
group_experts,
softmax_num_rows);
const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
moe_top_k<T, TPB>
<<<config_topk.block_per_grid, TPB, 0, stream>>>(softmax,
output,
indices,
source_row,
softmax_max_prob,
num_experts,
k,
num_rows);
} else {
const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
moe_softmax<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
input, softmax, num_experts, num_rows);
moe_top_k<T, TPB>
<<<config_topk.block_per_grid, TPB, 0, stream>>>(softmax,
output,
indices,
source_row,
num_experts,
k,
num_rows);
}
}
}
}
// ========================== Permutation things
// =======================================
// Duplicated and permutes rows for MoE. In addition, reverse the permutation
// map to help with finalizing routing.
// "expanded_x_row" simply means that the number of values is num_rows x k. It
// is "expanded" since we will have to duplicate some rows in the input matrix
// to match the dimensions. Duplicates will always get routed to separate
// experts in the end.
// Note that the expanded_dest_row_to_expanded_source_row map referred to here
// has indices in the range (0, k*rows_in_input - 1). However, it is set up so
// that index 0, rows_in_input, 2*rows_in_input ... (k-1)*rows_in_input all map
// to row 0 in the original matrix. Thus, to know where to read in the source
// matrix, we simply take the modulus of the expanded index.
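// For example, with num_rows = 4 and k = 2, expanded rows 0 and 4 both read
// original row 0, rows 1 and 5 read original row 1, and so on, since
//   source_row = expanded_source_row % num_rows.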
template <typename T, int VecSize>
__global__ void initialize_moe_routing_kernel(
const T* unpermuted_input,
T* permuted_output,
const int* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
const int64_t num_rows,
const int64_t active_rows,
const int64_t cols,
const int64_t num_rows_k) {
using LoadT = AlignedVector<T, VecSize>;
LoadT src_vec;
// Reverse permutation map.
// I do this so that later, we can use the source -> dest map to do the k-way
// reduction and unpermuting. I need the reverse map for that reduction to
// allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1
// thread block will be responsible for all k summations.
const int expanded_dest_row = blockIdx.x + blockIdx.y * gridDim.x;
if (expanded_dest_row >= num_rows_k) return;
const int expanded_source_row =
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
if (threadIdx.x == 0) {
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
expanded_dest_row;
}
if ((blockIdx.x + blockIdx.y * gridDim.x) < active_rows) {
// Duplicate and permute rows
const int source_row = expanded_source_row % num_rows;
const T* source_row_ptr = unpermuted_input + source_row * cols;
T* dest_row_ptr = permuted_output + expanded_dest_row * cols;
for (int tid = threadIdx.x * VecSize; tid < cols;
tid += blockDim.x * VecSize) {
// dest_row_ptr[tid] = source_row_ptr[tid];
Load<T, VecSize>(&source_row_ptr[tid], &src_vec);
Store<T, VecSize>(src_vec, &dest_row_ptr[tid]);
}
}
}
template <typename T>
void initialize_moe_routing_kernelLauncher(
const T* unpermuted_input,
T* permuted_output,
const int* expanded_dest_row_to_expanded_source_row,
int* expanded_source_row_to_expanded_dest_row,
const int64_t num_rows,
const int64_t active_rows,
const int64_t cols,
const int64_t k,
cudaStream_t stream) {
const int threads = std::min(cols, int64_t(1024));
constexpr int max_pack_size = 16 / sizeof(T);
const auto config_initialize = Get1DBlocksAnd2DGridsMoe(num_rows * k);
if (cols % max_pack_size == 0) {
initialize_moe_routing_kernel<T, max_pack_size>
<<<config_initialize.block_per_grid, threads, 0, stream>>>(
unpermuted_input,
permuted_output,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row,
num_rows,
k * active_rows,
cols,
num_rows * k);
} else {
initialize_moe_routing_kernel<T, 1>
<<<config_initialize.block_per_grid, threads, 0, stream>>>(
unpermuted_input,
permuted_output,
expanded_dest_row_to_expanded_source_row,
expanded_source_row_to_expanded_dest_row,
num_rows,
k * active_rows,
cols,
num_rows * k);
}
}
// ============================== Infer GEMM sizes
// =================================
__device__ inline int find_total_elts_leq_target(int* sorted_indices,
const int64_t arr_length,
const int64_t target) {
int64_t low = 0, high = arr_length - 1, target_location = -1;
while (low <= high) {
int64_t mid = (low + high) / 2;
if (sorted_indices[mid] > target) {
high = mid - 1;
} else {
low = mid + 1;
target_location = mid;
}
}
return target_location + 1;
}
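// For example, sorted_indices = {0, 0, 1, 1, 2} with target = 1 returns 4, the
// number of entries <= 1; compute_total_rows_before_expert uses this to turn the
// sorted expert ids into cumulative per-expert row offsets.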
void compute_total_rows_before_expert(int* sorted_indices,
const int64_t total_indices,
const int64_t num_experts,
int32_t* total_rows_before_expert,
cudaStream_t stream);
// Final kernel to unpermute and scale
// This kernel unpermutes the original data, does the k-way reduction and
// performs the final skip connection.
template <typename T, int RESIDUAL_NUM>
__global__ void finalize_moe_routing_kernel(
const T* expanded_permuted_rows,
T* reduced_unpermuted_output,
const T* bias,
const float* scales,
const int* expanded_source_row_to_expanded_dest_row,
const int* expert_for_source_row,
const int64_t cols,
const int64_t k,
const int64_t compute_bias,
const bool norm_topk_prob,
const float routed_scaling_factor,
const int64_t num_rows) {
const int original_row = blockIdx.x + blockIdx.y * gridDim.x;
// const int original_row = blockIdx.x;
// const int num_rows = gridDim.x;
if (original_row >= num_rows) return;
T* reduced_row_ptr = reduced_unpermuted_output + original_row * cols;
for (int tid = threadIdx.x; tid < cols; tid += blockDim.x) {
T thread_output{0.f};
float row_rescale{0.f};
for (int k_idx = 0; k_idx < k; ++k_idx) {
const int expanded_original_row = original_row + k_idx * num_rows;
const int expanded_permuted_row =
expanded_source_row_to_expanded_dest_row[expanded_original_row];
const int64_t k_offset = original_row * k + k_idx;
const float row_scale = scales[k_offset];
row_rescale = row_rescale + row_scale;
const T* expanded_permuted_rows_row_ptr =
expanded_permuted_rows + expanded_permuted_row * cols;
const int expert_idx = expert_for_source_row[k_offset];
const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr;
const T bias_value = bias_ptr ? bias_ptr[tid] : T{0.f};
thread_output =
static_cast<float>(thread_output) +
row_scale * static_cast<float>(
expanded_permuted_rows_row_ptr[tid] +
bias_value *
static_cast<T>(static_cast<float>(compute_bias)));
}
thread_output = static_cast<float>(thread_output) /
(norm_topk_prob ? row_rescale : 1.0f) *
routed_scaling_factor;
reduced_row_ptr[tid] = thread_output;
}
}
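// Reduction written out (added comment, assuming bias is present and compute_bias = 1):
//   out[row][c] = routed_scaling_factor
//                 * sum_k scale[row][k] * (permuted[dest(row, k)][c] + bias[expert(row, k)][c])
//                 / (norm_topk_prob ? sum_k scale[row][k] : 1)
// with dest(row, k) = expanded_source_row_to_expanded_dest_row[row + k * num_rows].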
template <typename T>
void finalize_moe_routing_kernelLauncher(
const T* expanded_permuted_rows,
T* reduced_unpermuted_output,
const T* bias,
const float* scales,
const int* expanded_source_row_to_expanded_dest_row,
const int* expert_for_source_row,
const int64_t num_rows,
const int64_t cols,
const int64_t k,
const int64_t compute_bias,
const bool norm_topk_prob,
const float routed_scaling_factor,
cudaStream_t stream) {
const int threads = std::min(cols, int64_t(1024));
const auto config_final = Get1DBlocksAnd2DGridsMoe(num_rows);
finalize_moe_routing_kernel<T, 1>
<<<config_final.block_per_grid, threads, 0, stream>>>(
expanded_permuted_rows,
reduced_unpermuted_output,
bias,
scales,
expanded_source_row_to_expanded_dest_row,
expert_for_source_row,
cols,
k,
compute_bias,
norm_topk_prob,
routed_scaling_factor,
num_rows);
}
// ========================= TopK Softmax specializations
// ===========================
template void topk_gating_softmax_kernelLauncher(const float*,
float*,
float*,
int*,
int*,
float*,
const int64_t,
const int64_t,
const int64_t,
const bool,
cudaStream_t,
const bool);
template void topk_gating_softmax_kernelLauncher(const half*,
half*,
half*,
int*,
int*,
half*,
const int64_t,
const int64_t,
const int64_t,
const bool,
cudaStream_t,
const bool);
#ifdef PADDLE_CUDA_BF16
template void topk_gating_softmax_kernelLauncher(const __nv_bfloat16*,
__nv_bfloat16*,
__nv_bfloat16*,
int*,
int*,
__nv_bfloat16*,
const int64_t,
const int64_t,
const int64_t,
const bool,
cudaStream_t,
const bool);
#endif
// ===================== Specializations for init routing
// =========================
template void initialize_moe_routing_kernelLauncher(const float*,
float*,
const int*,
int*,
const int64_t,
const int64_t,
const int64_t,
const int64_t,
cudaStream_t);
template void initialize_moe_routing_kernelLauncher(const half*,
half*,
const int*,
int*,
const int64_t,
const int64_t,
const int64_t,
const int64_t,
cudaStream_t);
#ifdef PADDLE_CUDA_BF16
template void initialize_moe_routing_kernelLauncher(const __nv_bfloat16*,
__nv_bfloat16*,
const int*,
int*,
const int64_t,
const int64_t,
const int64_t,
const int64_t,
cudaStream_t);
#endif
// ==================== Specializations for final routing
// ===================================
template void finalize_moe_routing_kernelLauncher(const float*,
float*,
const float*,
const float*,
const int*,
const int*,
const int64_t,
const int64_t,
const int64_t,
const int64_t,
const bool,
const float,
cudaStream_t);
template void finalize_moe_routing_kernelLauncher(const half*,
half*,
const half*,
const float*,
const int*,
const int*,
const int64_t,
const int64_t,
const int64_t,
const int64_t,
const bool,
const float,
cudaStream_t);
#ifdef PADDLE_CUDA_BF16
template void finalize_moe_routing_kernelLauncher(const __nv_bfloat16*,
__nv_bfloat16*,
const __nv_bfloat16*,
const float*,
const int*,
const int*,
const int64_t,
const int64_t,
const int64_t,
const int64_t,
const bool,
const float,
cudaStream_t);
#endif

View File

@@ -1,417 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mctlass/numeric_conversion.h"
#include "mctlassEx/mctlassEx.h"
#include "fused_moe_helper.h"
template <typename ElementA, typename ElementB, typename ElementC>
void mc_grouped_gemm_basic_kernel(
const ElementA* ptrA,
mctlassExOrder_t majorA,
const ElementB* ptrB,
mctlassExOrder_t majorB,
const ElementA* ptrScale,
const ElementA* ptrBias,
ElementC* ptrC,
mctlassExOrder_t majorC,
const int *ptrSegInd,
int numExperts,
int m, // expanded_active_expert_rows
int n, // inter_dim
int k, // hidden_size
mcStream_t stream) {
mctlassExHandle_t handle;
mctlassExHandleCreate(&handle);
int* ptrMNumTilesInd;
mcMallocAsync((void**)&ptrMNumTilesInd, sizeof(int) * numExperts, stream);
mctlassExMatrixLayout_t matLayoutA;
mctlassExMatrixLayout_t matLayoutB;
mctlassExMatrixLayout_t matLayoutC;
// mat A: (m, k)
mctlassExMatrixLayoutCreate(&matLayoutA, mctlassExDataType::MCTLASS_EX_BF16, m, k, k);
mctlassExMatrixLayoutSetAttribute(matLayoutA, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
&majorA, sizeof(mctlassExOrder_t));
// mat B: (num_experts, n, k)
mctlassExMatrixLayoutCreate(&matLayoutB, mctlassExDataType::MCTLASS_EX_INT8, k, n, k);
mctlassExMatrixLayoutSetAttribute(matLayoutB, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
&majorB, sizeof(mctlassExOrder_t));
mctlassExMatrixLayoutSetAttribute(matLayoutB, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
&numExperts, sizeof(int));
// mat C: (m, n)
mctlassExMatrixLayoutCreate(&matLayoutC, mctlassExDataType::MCTLASS_EX_BF16, m, n, n);
mctlassExMatrixLayoutSetAttribute(matLayoutC, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
&majorC, sizeof(mctlassExOrder_t));
// bias: (num_experts, n)
// scale: (num, n)
mctlassExDesc_t mctlass_desc;
mctlassExCreateDesc(&mctlass_desc);
mctlassExDataType input_type = mctlassExDataType::MCTLASS_EX_BF16;
mctlassExDataType scale_type = mctlassExDataType::MCTLASS_EX_INT8;
mctlassExDataType compute_type = mctlassExDataType::MCTLASS_EX_FP32;
mctlassExEpilogueType epilogue_type = mctlassExEpilogueType::MCTLASS_EX_GEMM_DEFAULT;
if (ptrBias) {
epilogue_type = mctlassExEpilogueType::MCTLASS_EX_GEMM_BIAS_PERGROUP;
}
// set scale
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_B_SCALE_POINTER,
&ptrScale, sizeof(ptrScale));
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_B_SCALE_TYPE,
&scale_type, sizeof(mctlassExDataType));
// set bias
if (ptrBias) {
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_BIAS_POINTER,
&ptrBias, sizeof(ptrBias));
}
// set compute type
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_COMPUTE_TYPE,
&compute_type, sizeof(mctlassExDataType));
// set epilogue type
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_EPILOGUE_TYPE,
&epilogue_type, sizeof(mctlassExEpilogueType));
const mctlassExContiguousGroupedGemmAlgo_t algo = mctlassExContiguousGroupedGemmAlgo_t::MCTLASS_EX_CONTIGUOUS_GROUPED_ALGO_SEGPTR;
int blocksizeM = mctlassExContiguousGroupedGemmGetBlocksizeM(handle, mctlass_desc, matLayoutA, matLayoutB, matLayoutC, &algo);
mctlassExContiguousGroupedGemmComputeMNumTilesIndptr(handle, mctlass_desc, matLayoutA, matLayoutB, matLayoutC, &algo, ptrSegInd, ptrMNumTilesInd, numExperts, blocksizeM);
mctlassExContiguousGroupedGemmBasic(handle, mctlass_desc,
ptrA, matLayoutA,
ptrB, matLayoutB,
ptrC, matLayoutC,
ptrSegInd, nullptr, ptrMNumTilesInd,
&algo, nullptr, 0, stream);
mctlassExHandleDestroy(handle);
mctlassExMatrixLayoutDestroy(matLayoutA);
mctlassExMatrixLayoutDestroy(matLayoutB);
mctlassExMatrixLayoutDestroy(matLayoutC);
mctlassExDestroyDesc(mctlass_desc);
mcFreeAsync(ptrMNumTilesInd, stream);
}
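// Added note: this wraps mctlassEx's contiguous grouped GEMM, where rows of A that
// belong to the same expert are stored contiguously and ptrSegInd holds the inclusive
// per-expert row prefix ("total rows before expert"). For example, experts receiving
// {2, 0, 5} rows would pass ptrSegInd = {2, 2, 7}; this indptr convention is inferred
// from the callers below rather than from mctlassEx documentation.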
template<typename T, typename ElementA, typename ElementB, typename ElementC>
class McMoeHelper {
public:
McMoeHelper(const std::string gemm_method): gemm_method_(gemm_method) {}
// -------- getWorkspaceSize -------- //
template <typename KeyT>
size_t getWorkspaceSize(const int64_t num_rows,
const int64_t hidden_size,
const int64_t inter_size,
const int64_t num_experts,
const int64_t k) {
const size_t buf_size = AlignTo16(k * num_rows * hidden_size);
const size_t interbuf_size = AlignTo16(k * num_rows * inter_size);
const size_t padded_experts = AlignTo16(num_experts);
const size_t num_moe_inputs = AlignTo16(k * num_rows);
// softmax output, permuted_rows and permuted_experts have moved to outside
// of moe kernel, allocate them in Encoder or Decoder before invoking
// FfnLayer forward.
size_t total_ws_bytes =
5 * num_moe_inputs *
sizeof(int);  // expert_for_source_row, source_rows_, permuted_rows_,
              // permuted_experts_, expanded_source_row_to_expanded_dest_row
total_ws_bytes += buf_size * sizeof(KeyT); // permuted_data
total_ws_bytes +=
padded_experts * sizeof(int32_t); // Hold total_rows_before_expert_
const size_t bytes_for_fc1_result = interbuf_size * sizeof(KeyT);
const size_t sorter_ws_size_bytes =
AlignTo16(sorter_.getWorkspaceSize(num_rows));
sorter_.update_num_experts(num_experts);
int64_t bytes_for_intermediate_and_sorting = bytes_for_fc1_result;
if (sorter_ws_size_bytes > bytes_for_fc1_result) {
int64_t remaining_bytes =
AlignTo16(sorter_ws_size_bytes - bytes_for_fc1_result);
bytes_for_intermediate_and_sorting += remaining_bytes;
}
total_ws_bytes +=
bytes_for_intermediate_and_sorting; // intermediate (fc1) output + cub
// sorting workspace
int64_t num_softmax_outs = 0;
const bool is_pow_2 =
(num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
if (!is_pow_2 || num_experts > 256) {
num_softmax_outs = AlignTo16(num_rows * num_experts);
}
total_ws_bytes += num_softmax_outs * sizeof(float);
return total_ws_bytes;
}
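// Worked example (added comment, assuming AlignTo16 rounds a count up to a multiple
// of 16): num_rows = 4, hidden_size = 8, inter_size = 16, num_experts = 4, k = 2 gives
// num_moe_inputs = 16, buf_size = 64, interbuf_size = 128 and padded_experts = 16, so
// the workspace is 5 * 16 * sizeof(int) + 64 * sizeof(KeyT) + 16 * sizeof(int32_t)
// plus roughly max(fc1 bytes, cub sorter bytes) after alignment; no softmax buffer is
// added because 4 experts is a power of two not larger than 256.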
void computeFFN(const paddle::Tensor *input,
const paddle::Tensor *gate_weight,
const paddle::Tensor *ffn1_weight,
const paddle::Tensor *ffn1_scale,
const paddle::Tensor *ffn1_bias,
const paddle::Tensor *ffn2_weight,
const paddle::Tensor *ffn2_scale,
const paddle::Tensor *ffn2_bias,
const paddle::Tensor *moe_token_type_ids,
const int moe_topk,
const bool group_moe,
const bool norm_topk_prob,
const float routed_scaling_factor,
const std::string moe_type,
paddle::Tensor *output) {
auto *input_activations = input->data<T>();
auto *gating_weights = gate_weight->data<float>();
const T *fc1_expert_biases = ffn1_bias ? ffn1_bias->data<T>() : nullptr;
const T *fc2_expert_biases = ffn2_bias ? ffn2_bias->data<T>() : nullptr;
auto *output_ = output->data<T>();
auto stream = input->stream();
auto place = input->place();
auto input_type = input->dtype();
auto input_dims = input->dims();
auto ffn1_dims = ffn1_weight->dims();
int64_t token_num = 0;
if (input_dims.size() == 3) {
token_num = input_dims[0] * input_dims[1];
} else {
token_num = input_dims[0];
}
const int64_t num_rows = token_num;
const int64_t hidden_size = ffn1_dims[2];
int64_t inter_dim = 0;
if (moe_type == "qkv") {
inter_dim = ffn1_dims[2] * ffn1_dims[3] * ffn1_dims[4];
} else {
inter_dim = ffn1_dims[1];
}
// if (gemm_method == "weight_only_int4") {
// inter_dim = inter_dim * 2;
// }
const int64_t inter_size = inter_dim;
const int64_t num_experts = ffn1_dims[0];
const int64_t k = moe_topk;
int64_t bytes =
getWorkspaceSize<T>(num_rows, hidden_size, inter_size, num_experts, k);
// Pointers
int *expert_for_source_row;
int *source_rows_;
int *permuted_rows_;
int *permuted_experts_;
int *expanded_source_row_to_expanded_dest_row;
T *permuted_data_;
int32_t *total_rows_before_expert_;
T *fc1_result_;
float *softmax_out_;
paddle::Tensor ws_ptr_tensor =
GetEmptyTensor({bytes}, paddle::DataType::INT8, place);
int8_t *ws_ptr = ws_ptr_tensor.data<int8_t>();
const int64_t buf_size = AlignTo16(k * num_rows * hidden_size);
const int64_t interbuf_size = AlignTo16(k * num_rows * inter_size);
const int64_t padded_experts = AlignTo16(num_experts);
const int64_t num_moe_inputs = AlignTo16(k * num_rows);
expert_for_source_row = reinterpret_cast<int *>(ws_ptr);
source_rows_ = expert_for_source_row + num_moe_inputs;
permuted_rows_ = source_rows_ + num_moe_inputs;
permuted_experts_ = permuted_rows_ + num_moe_inputs;
expanded_source_row_to_expanded_dest_row =
permuted_experts_ + num_moe_inputs;
permuted_data_ = reinterpret_cast<T *>(
expanded_source_row_to_expanded_dest_row + num_moe_inputs);
total_rows_before_expert_ =
reinterpret_cast<int32_t *>(permuted_data_ + buf_size);
fc1_result_ =
reinterpret_cast<T *>(total_rows_before_expert_ + padded_experts);
const bool is_pow_2 =
(num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
if (!is_pow_2 || num_experts > 256) {
softmax_out_ = reinterpret_cast<float *>(fc1_result_ + interbuf_size);
} else {
softmax_out_ = nullptr;
}
paddle::Tensor expert_scales_float_tensor =
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
float *expert_scales_float = expert_scales_float_tensor.data<float>();
float *softmax_max_prob = nullptr;
if (group_moe) {
paddle::Tensor softmax_max_prob_tensor = GetEmptyTensor(
{num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
// TODO: check that the fill actually succeeded
paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
softmax_max_prob = softmax_max_prob_tensor.data<float>();
}
paddle::Tensor fc1_out_tensor =
GetEmptyTensor({num_rows * k, inter_size}, input_type, place);
T *fc1_out = fc1_out_tensor.data<T>();
auto input_cast_tensor =
paddle::experimental::cast(*input, paddle::DataType::FLOAT32);
auto gate_tensor =
paddle::experimental::matmul(input_cast_tensor, *gate_weight);
float *gating_output = gate_tensor.data<float>();
if (moe_token_type_ids) {
auto *moe_token_type_ids_out = moe_token_type_ids->data<int>();
moe_token_type_ids_kernelLauncher<float>(gating_output,
moe_token_type_ids_out,
num_rows,
num_experts,
k,
stream);
}
topk_gating_softmax_kernelLauncher<float>(gating_output,
expert_scales_float,
softmax_out_,
expert_for_source_row,
source_rows_,
softmax_max_prob,
num_rows,
num_experts,
k,
group_moe,
stream);
const int64_t sorter_ws_size_bytes =
AlignTo16(sorter_.getWorkspaceSize(int64_t(k * num_rows)));
sorter_.run(fc1_result_,
sorter_ws_size_bytes,
expert_for_source_row,
permuted_experts_,
source_rows_,
permuted_rows_,
k * num_rows,
false,
stream);
initialize_moe_routing_kernelLauncher(
input_activations,
permuted_data_,
permuted_rows_,
expanded_source_row_to_expanded_dest_row,
num_rows,
num_rows,
hidden_size,
k,
stream);
const int64_t expanded_active_expert_rows = k * num_rows;
compute_total_rows_before_expert(permuted_experts_,
expanded_active_expert_rows,
num_experts,
total_rows_before_expert_,
stream);
mctlassExOrder_t row_major = mctlassExOrder_t::MCTLASS_EX_ROWMAJOR_ORDER;
mctlassExOrder_t column_major = mctlassExOrder_t::MCTLASS_EX_COLUMNMAJOR_ORDER;
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
reinterpret_cast<const ElementA *>(permuted_data_),
row_major,
reinterpret_cast<const ElementB *>(ffn1_weight->data<ElementB>()),
column_major,
reinterpret_cast<const ElementA *>(ffn1_scale->data<T>()),
reinterpret_cast<const ElementA *>(fc1_expert_biases),
reinterpret_cast<ElementC *>(fc1_out),
row_major,
total_rows_before_expert_,
num_experts,
expanded_active_expert_rows,
inter_size,
hidden_size,
stream);
if (moe_type == "ffn") {
auto act_out_tensor =
paddle::experimental::swiglu(fc1_out_tensor, nullptr);
auto act_out = act_out_tensor.data<T>();
paddle::Tensor fc2_output_tensor =
GetEmptyTensor({k * num_rows, hidden_size}, input_type, place);
T *fc2_result = fc2_output_tensor.data<T>();
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
reinterpret_cast<const ElementA *>(act_out),
row_major,
reinterpret_cast<const ElementB *>(ffn2_weight->data<ElementB>()),
column_major,
reinterpret_cast<const ElementA *>(ffn2_scale->data<T>()),
nullptr,
reinterpret_cast<ElementC *>(fc2_result),
row_major,
total_rows_before_expert_,
num_experts,
expanded_active_expert_rows,
hidden_size,
inter_size / 2,
stream);
finalize_moe_routing_kernelLauncher(
fc2_result,
output_,
fc2_expert_biases,
reinterpret_cast<float *>(expert_scales_float),
expanded_source_row_to_expanded_dest_row,
expert_for_source_row,
num_rows,
hidden_size,
k,
static_cast<int>(1),
norm_topk_prob,
routed_scaling_factor,
stream);
} else {
finalize_moe_routing_kernelLauncher(
// fc2_result,
fc1_out,
output_,
fc1_expert_biases, // fc2_expert_biases,
reinterpret_cast<float *>(expert_scales_float),
expanded_source_row_to_expanded_dest_row,
expert_for_source_row,
num_rows,
inter_size,
k,
static_cast<int>(0),
norm_topk_prob,
routed_scaling_factor,
stream);
}
}
private:
std::string gemm_method_;
CubKeyValueSorter sorter_;
};
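// Plausible instantiation (sketch only; the actual call site in metax_ops/fused_moe.cu
// is not shown here, so the chosen template types are assumptions):
//   McMoeHelper<paddle::bfloat16, maca_bfloat16, int8_t, maca_bfloat16>
//       helper("weight_only_int8");
//   helper.computeFFN(&input, &gate_weight, &ffn1_weight, &ffn1_scale, &ffn1_bias,
//                     &ffn2_weight, &ffn2_scale, &ffn2_bias,
//                     /*moe_token_type_ids=*/nullptr, moe_topk, /*group_moe=*/false,
//                     norm_topk_prob, routed_scaling_factor, "ffn", &output);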

View File

@@ -1,274 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma once
#include "fused_moe_helper.h"
#include "fused_moe_op.h"
#pragma GCC diagnostic pop
#include "helper.h"
template <paddle::DataType T>
void MoeDispatchKernel(const paddle::Tensor& input,
const paddle::Tensor& gating_output,
const int moe_topk,
const bool group_moe,
const bool topk_only_mode,
const int num_rows,
const int hidden_size,
const int expert_num,
paddle::Tensor* permute_input,
paddle::Tensor* tokens_expert_prefix_sum,
paddle::Tensor* permute_indices_per_token,
paddle::Tensor* top_k_weight,
paddle::Tensor* top_k_indices) {
typedef PDTraits<T> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
auto stream = input.stream();
auto place = input.place();
if (group_moe) {
// Check if expert_num is divisible by moe_topk, else throw an error
PADDLE_ENFORCE_EQ(expert_num % moe_topk,
0,
common::errors::InvalidArgument(
"The number of experts (expert_num) "
"must be divisible by moe_topk. "
"Got expert_num = %d and moe_topk = %d.",
expert_num,
moe_topk));
}
const int num_moe_inputs = AlignTo16(num_rows * moe_topk);
const int bytes = num_moe_inputs * sizeof(int);
CubKeyValueSorter sorter_;
sorter_.update_num_experts(expert_num);
const int sorter_ws_size_bytes =
AlignTo16(sorter_.getWorkspaceSize(moe_topk * num_rows));
const int sort_tmp_in_out_size = num_moe_inputs * 2 * sizeof(int);
paddle::Tensor ws_ptr_tensor =
GetEmptyTensor({bytes + sorter_ws_size_bytes + sort_tmp_in_out_size},
paddle::DataType::INT8,
place);
int8_t* ws_ptr = ws_ptr_tensor.data<int8_t>();
int* source_rows_ = reinterpret_cast<int*>(ws_ptr);
int8_t* sorter_ws_ptr = reinterpret_cast<int8_t*>(ws_ptr + bytes);
int* permuted_experts_ =
reinterpret_cast<int*>(sorter_ws_ptr + sorter_ws_size_bytes);
int* permuted_rows_ = permuted_experts_ + num_moe_inputs;
int* expert_for_source_row = top_k_indices->data<int>();
float* softmax_max_prob = nullptr;
if (group_moe) {
paddle::Tensor softmax_max_prob_tensor =
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
softmax_max_prob = softmax_max_prob_tensor.data<float>();
}
float* softmax_out_;
const bool is_pow_2 =
(expert_num != 0) && ((expert_num & (expert_num - 1)) == 0);
paddle::Tensor softmax_buffer;
if (!is_pow_2 || expert_num > 256 || group_moe) {
softmax_buffer = GetEmptyTensor(
{num_rows * expert_num}, paddle::DataType::FLOAT32, place);
softmax_out_ = softmax_buffer.data<float>();
} else {
softmax_out_ = nullptr;
}
topk_gating_softmax_kernelLauncher<float>(gating_output.data<float>(),
top_k_weight->data<float>(),
softmax_out_,
expert_for_source_row,
source_rows_,
softmax_max_prob,
num_rows,
expert_num,
moe_topk,
group_moe,
stream,
topk_only_mode);
sorter_.run(reinterpret_cast<void*>(sorter_ws_ptr),
sorter_ws_size_bytes,
expert_for_source_row,
permuted_experts_,
source_rows_,
permuted_rows_,
moe_topk * num_rows,
false,
stream);
initialize_moe_routing_kernelLauncher(
input.data<data_t>(),
permute_input->data<data_t>(),
permuted_rows_,
permute_indices_per_token->data<int32_t>(),
num_rows,
num_rows,
hidden_size,
moe_topk,
stream);
compute_total_rows_before_expert(
permuted_experts_,
moe_topk * num_rows,
expert_num,
tokens_expert_prefix_sum->data<int32_t>(),
stream);
}
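// Summary of the outputs produced above (added comment):
//   permute_input              [moe_topk * num_rows, hidden_size], rows grouped by expert
//   tokens_expert_prefix_sum   [expert_num] int32, inclusive row count per expert
//   permute_indices_per_token  [moe_topk, num_rows], expanded source row -> permuted row
//   top_k_weight / top_k_indices  [num_rows, moe_topk], gating weights and expert ids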
std::vector<paddle::Tensor> MoeExpertDispatch(
const paddle::Tensor& input,
const paddle::Tensor& gating_output,
const int moe_topk,
const bool group_moe,
const bool topk_only_mode) {
const auto input_type = input.dtype();
auto place = input.place();
int token_rows = 0;
auto input_dims = input.dims();
auto gating_dims = gating_output.dims();
const int expert_num = gating_dims[gating_dims.size() - 1];
if (input_dims.size() == 3) {
token_rows = input_dims[0] * input_dims[1];
} else {
token_rows = input_dims[0];
}
const int num_rows = token_rows;
const int hidden_size = input.dims()[input_dims.size() - 1];
auto permute_input =
GetEmptyTensor({moe_topk * num_rows, hidden_size}, input_type, place);
// top_k_weight holds the weighting coefficient applied to each selected expert's output.
auto top_k_weight =
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
auto top_k_indices =
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::INT32, place);
auto tokens_expert_prefix_sum =
GetEmptyTensor({expert_num}, paddle::DataType::INT32, place);
auto permute_indices_per_token =
GetEmptyTensor({moe_topk, num_rows}, paddle::DataType::INT32, place);
switch (input_type) {
case paddle::DataType::BFLOAT16:
MoeDispatchKernel<paddle::DataType::BFLOAT16>(input,
gating_output,
moe_topk,
group_moe,
topk_only_mode,
num_rows,
hidden_size,
expert_num,
&permute_input,
&tokens_expert_prefix_sum,
&permute_indices_per_token,
&top_k_weight,
&top_k_indices);
break;
// case paddle::DataType::FLOAT16:
// MoeDispatchKernel<paddle::DataType::FLOAT16>(input,
// gating_output,
// moe_topk,
// group_moe,
// topk_only_mode,
// num_rows,
// hidden_size,
// expert_num,
// &permute_input,
// &tokens_expert_prefix_sum,
// &permute_indices_per_token,
// &top_k_weight,
// &top_k_indices);
// break;
default:
PD_THROW("Only support bf16 for MoeDispatchKernel");
}
return {permute_input,
tokens_expert_prefix_sum,
permute_indices_per_token,
top_k_weight,
top_k_indices};
}
std::vector<std::vector<int64_t>> MoeExpertDispatchInferShape(
const std::vector<int64_t>& input_shape,
const std::vector<int64_t>& gating_output_shape,
const int moe_topk) {
int token_rows = -1;
if (input_shape.size() == 3) {
token_rows = input_shape[0] * input_shape[1];
} else {
token_rows = input_shape[0];
}
const int expert_num = gating_output_shape[gating_output_shape.size() - 1];
const int num_rows = token_rows;
const int hidden_size = input_shape[input_shape.size() - 1];
return {{moe_topk * num_rows, hidden_size},
{expert_num},
{moe_topk, num_rows},
{num_rows, moe_topk},
{num_rows, moe_topk}};
}
std::vector<paddle::DataType> MoeExpertDispatchInferDtype(
const paddle::DataType& input_dtype,
const paddle::DataType& gating_output_dtype,
const int moe_topk) {
return {input_dtype,
paddle::DataType::INT32,  // match the INT32 tokens_expert_prefix_sum allocated above
paddle::DataType::INT32,
paddle::DataType::FLOAT32,
paddle::DataType::INT32};
}
PD_BUILD_OP(moe_expert_dispatch)
.Inputs({"input", "gating_output"})
.Outputs({"permute_input",
"tokens_expert_prefix_sum",
"permute_indices_per_token",
"top_k_weight",
"top_k_indices"})
.Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
.SetKernelFn(PD_KERNEL(MoeExpertDispatch))
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));

View File

@@ -1,173 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "mc_fused_moe_helper.h"
#include "helper.h"
template <paddle::DataType T, typename ElementA, typename ElementB, typename ElementC>
void McMoeFFNKernel(const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const std::string& quant_method,
paddle::Tensor ffn_out) {
typedef PDTraits<T> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
auto ffn_out_ptr = ffn_out.data<data_t>();
auto permuted_input_ptr = permute_input.data<data_t>();
auto place = permute_input.place();
auto input_type = permute_input.dtype();
auto stream = permute_input.stream();
const int expanded_active_expert_rows = permute_input.dims()[0]; // permute_input.dims(): m, k
const int num_experts = ffn1_weight.dims()[0]; // grouped-GEMM batch count (one group per expert)
const int hidden_size = ffn1_weight.dims()[2]; // n
int inter_dim = ffn1_weight.dims()[1]; // k
const int64_t inter_size = inter_dim; // weight_only_int8 weights are unpacked, so no width factor
paddle::Tensor fc1_out_tensor = GetEmptyTensor(
{expanded_active_expert_rows, inter_size}, input_type, place);
auto fc1_out_ptr = fc1_out_tensor.data<data_t>();
mctlassExOrder_t row_major = mctlassExOrder_t::MCTLASS_EX_ROWMAJOR_ORDER;
mctlassExOrder_t column_major = mctlassExOrder_t::MCTLASS_EX_COLUMNMAJOR_ORDER;
// ffn1
auto fc1_expert_biases =
ffn1_bias
? const_cast<paddle::Tensor*>(ffn1_bias.get_ptr())->data<data_t>()
: nullptr;
auto fc1_expert_scales = const_cast<paddle::Tensor*>(ffn1_scale.get_ptr())->data<data_t>();
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
reinterpret_cast<const ElementA *>(permuted_input_ptr),
row_major,
reinterpret_cast<const ElementB *>(ffn1_weight.data<ElementB>()),
column_major,
reinterpret_cast<const ElementA *>(fc1_expert_scales),
reinterpret_cast<const ElementA *>(fc1_expert_biases),
reinterpret_cast<ElementC *>(fc1_out_ptr),
row_major,
tokens_expert_prefix_sum.data<int>(),
num_experts,
expanded_active_expert_rows,
inter_dim,
hidden_size,
stream);
// swiglu
auto act_out_tensor = paddle::experimental::swiglu(fc1_out_tensor, nullptr);
auto act_out = act_out_tensor.data<data_t>();
auto fc2_expert_scales = const_cast<paddle::Tensor*>(ffn2_scale.get_ptr())->data<data_t>();
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
reinterpret_cast<const ElementA *>(act_out),
row_major,
reinterpret_cast<const ElementB *>(ffn2_weight.data<ElementB>()),
column_major,
reinterpret_cast<const ElementA *>(fc2_expert_scales),
nullptr,
reinterpret_cast<ElementC *>(ffn_out_ptr),
row_major,
tokens_expert_prefix_sum.data<int>(),
num_experts,
expanded_active_expert_rows,
hidden_size,
inter_dim / 2,
stream);
}
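// Data flow above (added comment): grouped GEMM for ffn1 (int8 weights, per-channel
// scales, optional per-group bias) -> swiglu, which halves the inner width -> grouped
// GEMM for ffn2 writing hidden_size columns; that is why the second GEMM reduces over
// inter_dim / 2.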
std::vector<paddle::Tensor> MoeExpertFFN(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const std::string& quant_method) {
assert(quant_method == "weight_only_int8");
const auto input_type = permute_input.dtype();
auto ffn_out = paddle::empty_like(permute_input);
switch (input_type) {
case paddle::DataType::BFLOAT16:
McMoeFFNKernel<paddle::DataType::BFLOAT16, maca_bfloat16, int8_t, maca_bfloat16>(permute_input,
tokens_expert_prefix_sum,
ffn1_weight,
ffn2_weight,
ffn1_bias,
ffn1_scale,
ffn2_scale,
quant_method,
ffn_out);
break;
// case paddle::DataType::FLOAT16:
// MoeFFNKernel<paddle::DataType::FLOAT16>(permute_input,
// tokens_expert_prefix_sum,
// ffn1_weight,
// ffn2_weight,
// ffn1_bias,
// ffn1_scale,
// ffn2_scale,
// quant_method,
// ffn_out);
// break;
default:
PD_THROW("Only support bf16 for MoeExpertFFN");
}
return {ffn_out};
}
std::vector<std::vector<int64_t>> MoeExpertFFNInferShape(
const std::vector<int64_t>& permute_input_shape,
const std::vector<int64_t>& tokens_expert_prefix_sum_shape,
const std::vector<int64_t>& ffn1_weight_shape,
const std::vector<int64_t>& ffn2_weight_shape,
const paddle::optional<std::vector<int64_t>>& ffn1_bias_shape,
const paddle::optional<std::vector<int64_t>>& ffn1_scale_shape,
const paddle::optional<std::vector<int64_t>>& ffn2_scale_shape) {
return {permute_input_shape};
}
std::vector<paddle::DataType> MoeExpertFFNInferDtype(
const paddle::DataType& permute_input_dtype,
const paddle::DataType& tokens_expert_prefix_sum_dtype,
const paddle::DataType& ffn1_weight_dtype,
const paddle::DataType& ffn2_weight_dtype,
const paddle::optional<paddle::DataType>& ffn1_bias_dtype,
const paddle::optional<paddle::DataType>& ffn1_scale_dtype,
const paddle::optional<paddle::DataType>& ffn2_scale_dtype) {
return {permute_input_dtype};
}
PD_BUILD_OP(moe_expert_ffn)
.Inputs({"permute_input",
"tokens_expert_prefix_sum",
"ffn1_weight",
"ffn2_weight",
paddle::Optional("ffn1_bias"),
paddle::Optional("ffn1_scale"),
paddle::Optional("ffn2_scale")})
.Outputs({"output_tensor"})
.Attrs({"quant_method:std::string"})
.SetKernelFn(PD_KERNEL(MoeExpertFFN))
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertFFNInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertFFNInferDtype));

View File

@@ -1,143 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "helper.h"
#include "fused_moe_helper.h"
#include "fused_moe_op.h"
template <paddle::DataType T>
void MoeReduceKernel(const paddle::Tensor& ffn_out,
const paddle::Tensor& top_k_weight,
const paddle::Tensor& permute_indices_per_token,
const paddle::Tensor& top_k_indices,
const paddle::optional<paddle::Tensor>& ffn2_bias,
const bool norm_topk_prob,
const float routed_scaling_factor,
const int num_rows,
const int hidden_size,
const int topk,
paddle::Tensor* output) {
typedef PDTraits<T> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
auto stream = ffn_out.stream();
finalize_moe_routing_kernelLauncher(
ffn_out.data<data_t>(),
output->data<data_t>(),
ffn2_bias ? ffn2_bias->data<data_t>() : nullptr,
top_k_weight.data<float>(),
permute_indices_per_token.data<int32_t>(),
top_k_indices.data<int>(),
num_rows,
hidden_size,
topk,
static_cast<int>(1),
norm_topk_prob,
routed_scaling_factor,
stream);
}
std::vector<paddle::Tensor> MoeExpertReduce(
const paddle::Tensor& ffn_out,
const paddle::Tensor& top_k_weight,
const paddle::Tensor& permute_indices_per_token,
const paddle::Tensor& top_k_indices,
const paddle::optional<paddle::Tensor>& ffn2_bias,
const bool norm_topk_prob,
const float routed_scaling_factor) {
const auto input_type = ffn_out.dtype();
auto place = ffn_out.place();
const int topk = top_k_indices.dims()[1];
const int num_rows = ffn_out.dims()[0] / topk;
const int hidden_size = ffn_out.dims()[1];
auto output = GetEmptyTensor({num_rows, hidden_size}, input_type, place);
// Avoids invalid configuration argument when we launch the kernel.
if (ffn_out.dims()[0] == 0) return {output};
switch (input_type) {
case paddle::DataType::BFLOAT16:
MoeReduceKernel<paddle::DataType::BFLOAT16>(ffn_out,
top_k_weight,
permute_indices_per_token,
top_k_indices,
ffn2_bias,
norm_topk_prob,
routed_scaling_factor,
num_rows,
hidden_size,
topk,
&output);
break;
// case paddle::DataType::FLOAT16:
// MoeReduceKernel<paddle::DataType::FLOAT16>(ffn_out,
// top_k_weight,
// permute_indices_per_token,
// top_k_indices,
// ffn2_bias,
// norm_topk_prob,
// routed_scaling_factor,
// num_rows,
// hidden_size,
// topk,
// &output);
// break;
default:
PD_THROW("Only support bf16 for MoeDispatchKernel");
}
return {output};
}
std::vector<std::vector<int64_t>> MoeExpertReduceInferShape(
const std::vector<int64_t>& ffn_out_shape,
const std::vector<int64_t>& top_k_weight_shape,
const std::vector<int64_t>& permute_indices_per_token_shape,
const std::vector<int64_t>& top_k_indices_shape,
const paddle::optional<std::vector<int64_t>>& ffn2_bias_shape) {
const int topk = top_k_indices_shape[1];
std::vector<int64_t> fused_moe_out_shape = {ffn_out_shape[0] / topk,
ffn_out_shape[1]};
return {fused_moe_out_shape};
}
std::vector<paddle::DataType> MoeExpertReduceInferDtype(
const paddle::DataType& ffn_out_dtype,
const paddle::DataType& top_k_weight_dtype,
const paddle::DataType& permute_indices_per_token_dtype,
const paddle::DataType& top_k_indices_dtype,
const paddle::optional<paddle::DataType>& ffn2_bias_dtype) {
return {ffn_out_dtype};
}
PD_BUILD_OP(moe_expert_reduce)
.Inputs({"ffn_out",
"top_k_weight",
"permute_indices_per_token",
"top_k_indices",
paddle::Optional("ffn2_bias")})
.Outputs({"output"})
.Attrs({"norm_topk_prob:bool", "routed_scaling_factor:float"})
.SetKernelFn(PD_KERNEL(MoeExpertReduce))
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertReduceInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertReduceInferDtype));

View File

@@ -208,7 +208,6 @@ if paddle.is_compiled_with_rocm():
"gpu_ops/rebuild_padding.cu",
"gpu_ops/step.cu",
"gpu_ops/set_data_ipc.cu",
"gpu_ops/unset_data_ipc.cu",
"gpu_ops/moe/tritonmoe_preprocess.cu",
"gpu_ops/step_system_cache.cu",
"gpu_ops/get_output_ep.cc",
@@ -279,7 +278,6 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/beam_search_softmax.cu",
"gpu_ops/rebuild_padding.cu",
"gpu_ops/set_data_ipc.cu",
"gpu_ops/unset_data_ipc.cu",
"gpu_ops/read_data_ipc.cu",
"gpu_ops/enforce_generation.cu",
"gpu_ops/dequant_int8.cu",
@@ -538,8 +536,6 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
"iluvatar_ops/moe_dispatch.cu",
"iluvatar_ops/moe_reduce.cu",
"iluvatar_ops/paged_attn.cu",
"iluvatar_ops/prefill_fused_attn.cu",
"iluvatar_ops/mixed_fused_attn.cu",
"iluvatar_ops/w8a16_group_gemm.cu",
"iluvatar_ops/runtime/iluvatar_context.cc",
],
@@ -597,10 +593,6 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
"gpu_ops/moe/tritonmoe_preprocess.cu",
"gpu_ops/moe/moe_topk_select.cu",
"gpu_ops/recover_decode_task.cu",
"metax_ops/moe_dispatch.cu",
"metax_ops/moe_ffn.cu",
"metax_ops/moe_reduce.cu",
"metax_ops/fused_moe.cu",
]
sources += find_end_files("gpu_ops/speculate_decoding", ".cu")
@@ -621,7 +613,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
],
},
library_dirs=[os.path.join(maca_path, "lib")],
extra_link_args=["-lruntime_cu", "-lmctlassEx"],
extra_link_args=["-lruntime_cu"],
include_dirs=[
os.path.join(maca_path, "include"),
os.path.join(maca_path, "include/mcr"),
@@ -629,8 +621,6 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
],
),
)
elif paddle.is_compiled_with_custom_device("intel_hpu"):
pass
else:
use_bf16 = envs.FD_CPU_USE_BF16 == "True"

View File

@@ -41,9 +41,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
const paddle::Tensor &encoder_seq_lod_cpu,
const paddle::Tensor &encoder_batch_map_cpu,
const paddle::Tensor &decoder_context_len_cpu,
const paddle::Tensor &decoder_batch_map_cpu,
const std::string &pos_emb_type="NORMAL",
bool rope_3d=false) {
const paddle::Tensor &decoder_batch_map_cpu) {
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx =
paddle::experimental::DeviceContextPool::Instance().Get(place);
@@ -74,14 +72,6 @@ std::vector<paddle::Tensor> BlockAttnKernel(
int enc_batch = enc_batch_tensor.data<int32_t>()[0];
int dec_batch = dec_batch_tensor.data<int32_t>()[0];
int total_enc_len = total_enc_len_tensor.data<int32_t>()[0];
int rope_max_seqlen = 0;
int rope_3d_num_seqs = 1;
if (rope_3d) {
rope_max_seqlen = rotary_embs.dims()[3];
rope_3d_num_seqs = rotary_embs.dims()[0];
} else {
rope_max_seqlen = rotary_embs.dims()[2];
}
auto block_attn_out =
paddle::full({token_num, hidden_dim}, -1, qkv.type(), qkv.place());
@@ -161,10 +151,10 @@ std::vector<paddle::Tensor> BlockAttnKernel(
prefix_lens_vp, // start_tokens
param.batch_size, // batch_size
1, // emb_batch_size
rope_max_seqlen, // max_seqlen
rotary_embs.dims()[2], // max_seqlen
param.head_num, param.kv_head_num, param.head_dim,
param.max_batch_size, block_size, max_block_per_seq, "BLHD",
"HLD", pos_emb_type,
"HLD", "NORMAL",
!p_kcache_perhead_scale.defined()
? nullptr
: p_kcache_perhead_scale.data<float>() +
@@ -256,10 +246,10 @@ std::vector<paddle::Tensor> BlockAttnKernel(
vsl.slot_mapping_vp, // real_batch
param.batch_size, // batch_size
1, // emb_batch_size
rope_max_seqlen, // max_seqlen
rotary_embs.dims()[2], // max_seqlen (TODO: double-check)
param.head_num, param.kv_head_num, param.head_dim,
param.max_batch_size, block_size, max_block_per_seq, "BLHD", "HLD",
pos_emb_type,
"NORMAL",
!p_kcache_perhead_scale.defined()
? nullptr
: p_kcache_perhead_scale.data<float>() +
@@ -270,9 +260,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
param.kv_head_num, // v_cache_scale_inv
nullptr, // k_cache_zp
nullptr, // v_cache_zp
false, // b_c8_pc
rope_3d, // rope_3d
rope_3d_num_seqs);
false); // b_c8_pc
XFTBLOCK_CHECK_EQ(ret, api::SUCCESS);
// attn decode
@@ -326,7 +314,6 @@ PD_BUILD_OP(block_attn)
"decoder_context_len_cpu",
"decoder_batch_map_cpu",
})
.Attrs({"pos_emb_type:std::string", "rope_3d:bool"})
.Outputs({"block_attn_out"})
.SetKernelFn(PD_KERNEL(BlockAttnKernel))
.SetInferShapeFn(PD_INFER_SHAPE(BlockAttnInferShape))

View File

@@ -1,60 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_ids,
const paddle::Tensor& grid_thw,
const int64_t image_patch_id) {
// All tensors are on CPU
auto input_ids_ptr = task_input_ids.data<int64_t>();
int64_t seq_lens_origin = task_input_ids.numel();
auto grid_thw_ptr = grid_thw.data<int64_t>();
int token_times = 4;
int token_idx = 0;
int image_idx = 0;
std::vector<int> img_boundaries, img_nums;
img_boundaries.emplace_back(0);
img_nums.emplace_back(0);
while (token_idx < seq_lens_origin) {
if (input_ids_ptr[token_idx] != image_patch_id) {
do {
token_idx++;
} while (token_idx < seq_lens_origin && input_ids_ptr[token_idx] != image_patch_id);
} else {
int cur_image_token_len = (grid_thw_ptr[image_idx * 3 + 1] * grid_thw_ptr[image_idx * 3 + 2]) / token_times;
image_idx++;
token_idx += cur_image_token_len;
}
img_boundaries.emplace_back(token_idx);
img_nums.emplace_back(image_idx);
}
int64_t num_img_boundaries = static_cast<int64_t>(img_boundaries.size());
auto out = paddle::full({2, num_img_boundaries}, 0, paddle::DataType::INT64, paddle::CPUPlace());
for (int i = 0; i < num_img_boundaries; i++) {
out.data<int64_t>()[i] = img_boundaries[i];
out.data<int64_t>()[num_img_boundaries + i] = img_nums[i];
}
return {out};
}
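// Worked example (added comment): with image_patch_id = 100,
// task_input_ids = [1, 2, 100, 100, 100, 100, 3] and grid_thw = [[1, 4, 4]], the image
// spans (4 * 4) / 4 = 4 tokens, so img_boundaries grows to [0, 2, 6, 7] and img_nums to
// [0, 0, 1, 1]; they are returned stacked as a [2, 4] INT64 tensor on CPU.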
PD_BUILD_OP(get_img_boundaries)
.Inputs({"task_input_ids", "grid_thw"})
.Attrs({"image_patch_id: int64_t"})
.Outputs({"img_boundaries"})
.SetKernelFn(PD_KERNEL(GetImgBoundaries));

View File

@@ -72,7 +72,6 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
is_padding_input ? token_num_info : nullptr,
expert_num,
1, // moe_topk
0, // group_size
ffn1_out_shape.size() == 2 ? xftblock::MoeFCInputMode::DENSE
: xftblock::MoeFCInputMode::SPARSE);
PD_CHECK(ret == 0);
@@ -135,7 +134,6 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
is_padding_input ? token_num_info : nullptr,
expert_num,
1, // moe_topk
0, // group_size
ffn1_out_shape.size() == 2
? xftblock::MoeFCInputMode::DENSE
: xftblock::MoeFCInputMode::SPARSE); // bias_mode

View File

@@ -145,8 +145,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
? up_gate_proj_weight_scale.get_ptr()->data<float>()
: nullptr),
xftblock_tw,
std::vector<int64_t>{expert_num, inter_dim, hidden_dim}
);
std::vector<int64_t>{expert_num, inter_dim, hidden_dim});
xdown_proj_w = std::make_shared<xftblock::Tensor>(
const_cast<TW *>(down_proj_weight.data<TW>()), nullptr,
@@ -154,8 +153,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
? down_proj_weight_scale.get_ptr()->data<float>()
: nullptr),
xftblock_tw,
std::vector<int64_t>{expert_num, hidden_dim, outer_dim}
);
std::vector<int64_t>{expert_num, hidden_dim, outer_dim});
}
std::shared_ptr<xftblock::Tensor> xup_gate_proj_bias;
std::shared_ptr<xftblock::Tensor> xdown_proj_bias;

View File

@@ -1,83 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/phi/backends/xpu/xpu_context.h>
#include <xft/xdnn_plugin.h>
#include "paddle/extension.h"
#include "xpu/plugin.h"
void TextImageGatherScatter(
paddle::Tensor& input,
paddle::Tensor& text_input,
paddle::Tensor& image_input,
paddle::Tensor& token_type_ids,
paddle::Tensor& text_index,
paddle::Tensor& image_index,
const bool is_scatter) {
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int64_t token_num = input.dims()[0];
const int64_t hidden_size = input.dims()[1];
const int64_t text_token_num = text_input.dims()[0];
const int64_t image_token_num = image_input.dims()[0];
switch (input.type()) {
case paddle::DataType::BFLOAT16: {
using XPUType = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 data_t;
int r = baidu::xpu::api::plugin::text_image_gather_scatter<XPUType>(
xpu_ctx->x_context(),
reinterpret_cast<XPUType*>(input.data<data_t>()),
reinterpret_cast<XPUType*>(text_input.data<data_t>()),
reinterpret_cast<XPUType*>(image_input.data<data_t>()),
reinterpret_cast<int*>(token_type_ids.data<int>()),
reinterpret_cast<int*>(text_index.data<int>()),
reinterpret_cast<int*>(image_index.data<int>()),
token_num,
text_token_num,
image_token_num,
hidden_size,
is_scatter
);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "text_image_gather_scatter");
break;
}
default: {
PD_THROW(
"NOT supported data type. Only support BFLOAT16. ");
break;
}
}
}
PD_BUILD_OP(text_image_gather_scatter)
.Inputs({"input",
"text_input",
"image_input",
"token_type_ids",
"text_index",
"image_index"})
.Outputs({"text_input_out",
"image_input_out",
"text_index_out",
"image_index_out"})
.Attrs({"is_scatter:bool"})
.SetInplaceMap({{"text_input", "text_input_out"},
{"image_input", "image_input_out"},
{"text_index", "text_index_out"},
{"image_index", "image_index_out"}})
.SetKernelFn(PD_KERNEL(TextImageGatherScatter));

View File

@@ -1,48 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/phi/backends/xpu/xpu_context.h>
#include "paddle/extension.h"
#include "xpu/plugin.h"
void TextImageIndexOut(
const paddle::Tensor& token_type_ids,
const paddle::Tensor& text_index,
const paddle::Tensor& image_index) {
if (token_type_ids.type() != paddle::DataType::INT32 || text_index.type()
!= paddle::DataType::INT32 || image_index.type() != paddle::DataType::INT32) {
PD_THROW("NOT supported data type. Only support BFLOAT16. ");
}
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int64_t token_num = token_type_ids.shape()[0];
int r = baidu::xpu::api::plugin::text_image_index_out(xpu_ctx->x_context(),
token_type_ids.data<int32_t>(),
const_cast<int32_t*>(text_index.data<int32_t>()),
const_cast<int32_t*>(image_index.data<int32_t>()),
token_num);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "text_image_index_out");
}
PD_BUILD_OP(text_image_index_out)
.Inputs({"token_type_ids",
"text_index",
"image_index"})
.Outputs({"text_index_out",
"image_index_out"})
.SetInplaceMap({{"text_index", "text_index_out"},
{"image_index", "image_index_out"}})
.SetKernelFn(PD_KERNEL(TextImageIndexOut));

View File

@@ -140,25 +140,6 @@ DLL_EXPORT int quant2d_per_channel(api::Context *ctx, const TX *x,
const TSCALE *scale_in, TY *y,
TSCALE *scale_out, int64_t m, int64_t n);
DLL_EXPORT int text_image_index_out(Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num);
template <typename T>
DLL_EXPORT int text_image_gather_scatter(api::Context* ctx,
T* input,
T* text_input,
T* image_input,
int* token_type_ids,
int* text_index,
int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter);
/*--------------------------------------- MTP begin --------------------------------------------*/

View File

@@ -1,175 +0,0 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
template <typename T>
static __device__ inline void text_image_gather(
__global_ptr__ T* input,
__global_ptr__ T* text_input,
__global_ptr__ T* image_input,
__global_ptr__ int* token_type_ids,
__global_ptr__ int* text_index,
__global_ptr__ int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
T* input_lm) {
int cid = core_id();
int clusterid = cluster_id();
int token_start_cluster;
int token_end_cluster;
int token_start_core;
int token_end_core;
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
// cluster partition
partition(cluster_id(), cluster_num(), (int)token_num, 1, &token_start_cluster, &token_end_cluster);
if (token_start_cluster >= token_end_cluster) {
return;
}
int rows_cluster = token_end_cluster - token_start_cluster; // total rows for a cluster
// core partition
partition(core_id(), core_num(), rows_cluster, 1, &token_start_core, &token_end_core);
int rows_core = token_end_core - token_start_core; // total rows for a core
token_start_core += token_start_cluster;
token_end_core += token_start_cluster;
int read_len;
for (int i = token_start_core; i < token_end_core; i += 1) {
int token_type, text_image_token_idx;
__global_ptr__ T* text_image_input = nullptr;
__global_ptr__ int* text_image_index = nullptr;
GM2LM(token_type_ids + i, &token_type, sizeof(int));
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else {
text_image_input = image_input;
text_image_index = image_index;
}
GM2LM(text_image_index + i, &text_image_token_idx, sizeof(int));
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int j = 0; j < hidden_size; j += BUFSIZE) {
read_len = min(hidden_size - j, BUFSIZE);
GM2LM(text_image_input + text_image_offset + j, input_lm, sizeof(T) * read_len);
LM2GM(input_lm, input + input_offset + j, sizeof(T) * read_len);
}
}
}
template <typename T>
static __device__ inline void text_image_scatter(
__global_ptr__ T* input,
__global_ptr__ T* text_input,
__global_ptr__ T* image_input,
__global_ptr__ int* token_type_ids,
__global_ptr__ int* text_index,
__global_ptr__ int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
T* input_lm) {
int cid = core_id();
int clusterid = cluster_id();
int token_start_cluster;
int token_end_cluster;
int token_start_core;
int token_end_core;
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
// cluster partition
partition(cluster_id(), cluster_num(), (int)token_num, 1, &token_start_cluster, &token_end_cluster);
if (token_start_cluster >= token_end_cluster) {
return;
}
int rows_cluster = token_end_cluster - token_start_cluster; // total rows for a cluster
// core partition
partition(core_id(), core_num(), rows_cluster, 1, &token_start_core, &token_end_core);
int rows_core = token_end_core - token_start_core; // total rows for a core
token_start_core += token_start_cluster;
token_end_core += token_start_cluster;
int read_len;
for (int i = token_start_core; i < token_end_core; i += 1) {
int token_type, text_image_token_idx;
__global_ptr__ T* text_image_input = nullptr;
__global_ptr__ int* text_image_index = nullptr;
GM2LM(token_type_ids + i, &token_type, sizeof(int));
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else {
text_image_input = image_input;
text_image_index = image_index;
}
GM2LM(text_image_index + i, &text_image_token_idx, sizeof(int));
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int j = 0; j < hidden_size; j += BUFSIZE) {
read_len = min(hidden_size - j, BUFSIZE);
GM2LM(input + input_offset + j, input_lm, sizeof(T) * read_len);
LM2GM(input_lm, text_image_input + text_image_offset + j, sizeof(T) * read_len);
}
}
}
template <typename T>
__global__ void text_image_gather_scatter(
T* input,
T* text_input,
T* image_input,
int* token_type_ids,
int* text_index,
int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
int cid = core_id();
int ncores = core_num();
int clusterid = cluster_id();
int nclusters = cluster_num();
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
__simd__ T input_lm[BUFSIZE]; // 2KB for bf16 and fp32
if (is_scatter) {
text_image_scatter(
input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, input_lm);
} else {
text_image_gather(
input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, input_lm);
}
}
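// Parallelization note (added comment): tokens are partitioned first across clusters
// and then across the cores of each cluster, so every core owns a disjoint slice of
// rows; each row is streamed through the 2 KB local-memory tile in BUFSIZE-element
// GM2LM/LM2GM chunks.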
#define _XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(T) \
template __global__ void text_image_gather_scatter<T>( \
T* input, \
T* text_input, \
T* image_input, \
int* token_type_ids, \
int* text_index, \
int* image_index, \
int64_t token_num, \
int64_t text_token_num, \
int64_t image_token_num, \
int64_t hidden_size, \
bool is_scatter);
_XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(bfloat16);
} // namespace plugin
} // namespace xpu3

View File

@@ -1,97 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* copyright (C) 2025 KUNLUNXIN, Inc
*/
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
static __device__ void do_calc(const _shared_ptr_ int* lm_x, int* lm_y1, int* lm_y2, int64_t size, int& text_count, int& images_count) {
for (int j = 0; j < size; j++) {
if (lm_x[j] == 0) {
lm_y1[j] = text_count;
text_count += 1;
} else {
lm_y2[j] = images_count;
images_count += 1;
}
}
mfence_lm_sm();
}
__global__ void text_image_index_out_kernel(
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
const int cid = core_id();
const int tid = core_id() * cluster_num() + cluster_id();
const int nthreads = core_num() * cluster_num();
if (tid >= 1) return;
constexpr int BUFSIZE = 1024;
constexpr int READ_MAX_SIZE = BUFSIZE / sizeof(int);
const int64_t len = token_num;
__simd__ char buffer0[BUFSIZE * 3];
__simd__ char buffer1[BUFSIZE * 3];
__simd__ __shared__ char buffer2[64][BUFSIZE * 2];
DoublePtr<READ_MAX_SIZE, SmPtr<int>> buffer_ptr_x((SmPtr<int>((_shared_ptr_ int*)buffer2[cid])));
TriplePtr<READ_MAX_SIZE, LmPtr<int>> buffer_ptr_y1((LmPtr<int>((int*)buffer0)));
TriplePtr<READ_MAX_SIZE, LmPtr<int>> buffer_ptr_y2((LmPtr<int>((int*)buffer1)));
int64_t buflen = get_1d_buflen(len, nthreads, READ_MAX_SIZE, 64);
int64_t i = tid * buflen;
int read_size = 0;
int offset = nthreads * buflen;
int text_count = 0;
int images_count = 0;
if (i < len) {
read_size = min<int64_t>(buflen, len - i);
buffer_ptr_y1.gm_load_async(text_index + tid * buflen, read_size);
buffer_ptr_y2.gm_load_async(image_index + tid * buflen, read_size);
buffer_ptr_x.gm_load_async(token_type_ids + tid * buflen, read_size);
mfence();
}
while (i < len && i + offset < len) {
i = i + offset;
int read_size_next = min<int64_t>(buflen, len - i);
buffer_ptr_x.next().gm_load_async(token_type_ids + i, read_size_next);
buffer_ptr_y1.next().gm_load_async(text_index + i, read_size_next);
buffer_ptr_y2.next().gm_load_async(image_index + i, read_size_next);
do_calc(buffer_ptr_x.ptr, buffer_ptr_y1.ptr, buffer_ptr_y2.ptr, read_size, text_count, images_count);
buffer_ptr_y1.gm_store_async(text_index + i - offset, read_size);
buffer_ptr_y2.gm_store_async(image_index + i - offset, read_size);
buffer_ptr_x.toggle();
buffer_ptr_y1.toggle();
buffer_ptr_y2.toggle();
read_size = read_size_next;
}
if (i < len) {
do_calc(buffer_ptr_x.ptr, buffer_ptr_y1.ptr, buffer_ptr_y2.ptr, read_size, text_count, images_count);
buffer_ptr_y1.gm_store_async(text_index + i, read_size);
buffer_ptr_y2.gm_store(image_index + i, read_size);
}
}
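// Added note: text_count / images_count must accumulate sequentially over the whole
// sequence, so only thread 0 does the work (tid >= 1 returns early); the double- and
// triple-buffered async loads overlap the next chunk's transfer with do_calc on the
// current one.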
} // namespace plugin
} // namespace xpu3

View File

@@ -1,182 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
template <typename T>
__attribute__((global)) void text_image_gather_scatter(
T* input,
T* text_input,
T* image_input,
int* token_type_ids,
int* text_index,
int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter);
} // namespace plugin
} // namespace xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace plugin {
template <typename T>
static int cpu_wrapper(
Context* ctx,
T* input, // shape [token_num, hidden_size]
T* text_input, // shape [text_token_num, hidden_size]
T* image_input, // shape [image_token_num, hidden_size]
int* token_type_ids,// shape [token_num], 0 for text, 1 for image
int* text_index, // shape [token_num], mapping from input to text_input
int* image_index, // shape [token_num], mapping from input to image_input
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
if (is_scatter) {
// Scatter mode: input -> text_input/image_input
for (int64_t i = 0; i < token_num; i++) {
int token_type = token_type_ids[i];
T* text_image_input = nullptr;
int* text_image_index = nullptr;
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else { // token_type == 1
text_image_input = image_input;
text_image_index = image_index;
}
int text_image_token_idx = text_image_index[i];
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int64_t j = 0; j < hidden_size; j++) {
T value = input[input_offset + j];
text_image_input[text_image_offset + j] = value;
}
}
} else {
// Gather mode: text_input/image_input -> input
for (int64_t i = 0; i < token_num; i++) {
int token_type = token_type_ids[i];
T* text_image_input = nullptr;
int* text_image_index = nullptr;
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else { // token_type == 1
text_image_input = image_input;
text_image_index = image_index;
}
int text_image_token_idx = text_image_index[i];
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int64_t j = 0; j < hidden_size; j++) {
T value = text_image_input[text_image_offset + j];
input[input_offset + j] = value;
}
}
}
return api::SUCCESS;
}
template <typename T>
static int xpu3_wrapper(
Context* ctx,
T* input,
T* text_input,
T* image_input,
int* token_type_ids,
int* text_index,
int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
xpu3::plugin::text_image_gather_scatter<T> <<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, is_scatter
);
return api::SUCCESS;
}
template <typename T>
int text_image_gather_scatter(
Context* ctx,
T* input, // shape [token_num, hidden_size]
T* text_input, // shape [text_token_num, hidden_size]
T* image_input, // shape [image_token_num, hidden_size]
int* token_type_ids,// shape [token_num], 0 for text, 1 for image
int* text_index, // shape [token_num], mapping from input to text_input
int* image_index, // shape [token_num], mapping from input to image_input
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
WRAPPER_CHECK_CTX(ctx);
WRAPPER_DUMP_FUNCTION_T1(ctx, "text_image_gather_scatter", T);
WRAPPER_DUMP_PARAM6(ctx, input, text_input, image_input, token_type_ids, text_index, image_index);
WRAPPER_DUMP_PARAM5(ctx, token_num, text_token_num, image_token_num, hidden_size, is_scatter);
WRAPPER_DUMP(ctx);
WRAPPER_CHECK_PTR(ctx, T, token_num * hidden_size, input);
if (text_token_num != 0) { // avoiding text_input tensor with shape [0, hidden_size]
WRAPPER_CHECK_PTR(ctx, T, text_token_num * hidden_size, text_input);
}
if (image_token_num != 0) { // avoiding image_input tensor with shape [0, hidden_size]
WRAPPER_CHECK_PTR(ctx, T, image_token_num * hidden_size, image_input);
}
WRAPPER_CHECK_PTR(ctx, int, token_num, token_type_ids);
WRAPPER_CHECK_PTR(ctx, int, token_num, text_index);
WRAPPER_CHECK_PTR(ctx, int, token_num, image_index);
WRAPPER_ASSERT_EQ(ctx, token_num, text_token_num + image_token_num);
if (ctx->dev().type() == api::kCPU) {
return cpu_wrapper<T>(
ctx, input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, is_scatter
);
}
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper<T>(
ctx, input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, is_scatter
);
}
WRAPPER_UNIMPLEMENTED(ctx);
}
template int text_image_gather_scatter(
Context*, bfloat16*, bfloat16*, bfloat16*, int*, int*, int*, const int64_t, const int64_t, const int64_t, const int64_t, bool);
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
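The CPU wrapper above is the clearest statement of the data movement between the packed sequence and the per-modality buffers. The same logic as a short Python sketch (for orientation only; the names and in-place convention are illustrative, not part of the plugin API):

```python
def text_image_gather_scatter_reference(input_rows, text_rows, image_rows,
                                        token_type_ids, text_index, image_index,
                                        is_scatter):
    """input_rows: [token_num, hidden]; text_rows / image_rows hold the
    per-modality rows addressed by text_index / image_index (row indices)."""
    for i, token_type in enumerate(token_type_ids):
        dst, idx = (text_rows, text_index) if token_type == 0 else (image_rows, image_index)
        if is_scatter:
            dst[idx[i]] = input_rows[i]      # scatter: split the packed sequence by modality
        else:
            input_rows[i] = dst[idx[i]]      # gather: merge modality rows back into packed order
```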

View File

@@ -1,103 +0,0 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
__attribute__((global)) void text_image_index_out_kernel(const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num);
} // namespace plugin
} // namespace xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace plugin {
static int cpu_wrapper(Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
int text_count = 0;
int image_count = 0;
for (int64_t i = 0; i < token_num; ++i) {
if (token_type_ids[i] == 0) {
text_index[i] = text_count;
++text_count;
} else {
image_index[i] = image_count;
++image_count;
}
}
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
xpu3::plugin::text_image_index_out_kernel<<<1, 1, ctx->xpu_stream>>>(
token_type_ids,
text_index,
image_index,
token_num);
return api::SUCCESS;
}
int text_image_index_out(Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
WRAPPER_CHECK_CTX(ctx);
WRAPPER_DUMP_FUNCTION_T1(ctx, "text_image_index_out", int);
WRAPPER_DUMP_PARAM4(
ctx, token_type_ids, text_index, image_index, token_num);
WRAPPER_DUMP(ctx);
WRAPPER_ASSERT_GT(ctx, token_num, 0);
WRAPPER_CHECK_PTR(ctx, int, token_num, token_type_ids);
WRAPPER_CHECK_PTR(ctx, int, token_num, text_index);
WRAPPER_CHECK_PTR(ctx, int, token_num, image_index);
if (ctx->dev().type() == api::kCPU) {
return cpu_wrapper(ctx,
token_type_ids,
text_index,
image_index,
token_num);
} else if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
token_type_ids,
text_index,
image_index,
token_num);
}
WRAPPER_UNIMPLEMENTED(ctx);
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu

View File

@@ -31,11 +31,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--max-model-len 32768 \
--max-num-seqs 128 \
--load-choices "default_v1"
--load_choices "default_v1"
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (Hopper is needed).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
- `--load_choices`: indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md). A minimal client-side request sketch follows below.
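Once the service above is running, a plain HTTP call is enough to sanity-check it. A minimal sketch using `requests` (the port 8188 and the prompt are assumptions taken from other examples in these docs; use whatever `--port` you launched with):

```python
import requests

# Assumes the api_server above listens on port 8188 (adjust to your --port).
resp = requests.post(
    "http://0.0.0.0:8188/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    json={
        "messages": [{"role": "user", "content": "Write me a short introduction to ERNIE 4.5."}],
        "max_tokens": 128,
    },
    timeout=300,
)
print(resp.json()["choices"][0]["message"]["content"])
```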

View File

@@ -31,11 +31,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--max-model-len 32768 \
--max-num-seqs 128 \
--load-choices "default_v1"
--load_choices "default_v1"
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (Hopper is needed).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
- `--load_choices`: indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).

View File

@@ -27,7 +27,7 @@ Start the service by following command:
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
--load-choices "default_v1" \
--load_choices "default_v1" \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--quantization wint8 \
@@ -37,7 +37,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
```
- `--quantization`: Indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (Hopper is needed).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: Indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
- `--load_choices`: Indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
- `--reasoning-parser`, `--tool-call-parser`: Indicates the corresponding reasoning content and tool call parser.
For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).

View File

@@ -28,11 +28,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--max-model-len 32768 \
--max-num-seqs 128 \
--load-choices "default_v1"
--load_choices "default_v1"
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (Hopper is needed).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
- `--load_choices`: indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.
For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).
@@ -91,7 +91,7 @@ Just specify the corresponding model name in the startup command, `baidu/ERNIE-4
```
Note:
- W4A8C8 quantized models are not supported when loaded via `--load-choices "default_v1"`.
- W4A8C8 quantized models are not supported when loaded via `--load_choices "default_v1"`.
#### 2.2.6 Rejection Sampling
**Idea:**

View File

@@ -196,7 +196,7 @@ We selected a subset (longbook_sum_eng) from InfiniteBench as the performance ev
## Usage
```
export FD_ATTENTION_BACKEND="PLAS_ATTN"
export FD_ATTENTION_BACKEND="MOBA_ATTN"
python -m fastdeploy.entrypoints.openai.api_server
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -207,13 +207,13 @@ python -m fastdeploy.entrypoints.openai.api_server
--max-num-batched-tokens 8192 \
--max-model-len 131072 \
--max-num-seqs 32 \
--plas-attention-config '{"plas_encoder_top_k_left": 50, "plas_encoder_top_k_right": 60, "plas_decoder_top_k_left": 100, "plas_decoder_top_k_right": 120}'
--moba-attention-config '{"moba_encoder_top_k_left": 50, "moba_encoder_top_k_right": 60, "moba_decoder_top_k_left": 100, "moba_decoder_top_k_right": 120}'
```
**Note**: If sparse attention is enabled, the system will automatically load the MLP weights from `plas_attention_mlp_weight.safetensors` in the weight directory. If the MLP weight file is not found, mean pooling will be applied to the key representations.
**Note**: If sparse attention is enabled, the system will automatically load the MLP weights from `moba_mlp_weight.safetensors` in the weight directory. If the MLP weight file is not found, mean pooling will be applied to the key representations.
**Parameter Description:**
* Setting `FD_ATTENTION_BACKEND="PLAS_ATTN"` enables PLAS sparse attention.
* `plas_encoder_top_k_left=50, plas_encoder_top_k_right=60` indicates that the range of top-k is between 50 and 60 when the encoder is sparse.
* `plas_decoder_top_k_left=100, plas_decoder_top_k_right=120` indicates that the range of top-k is between 100 and 120 when the decoder is sparse.
* Setting `FD_ATTENTION_BACKEND="MOBA_ATTN"` enables MOBA sparse attention.
* `moba_encoder_top_k_left=50, moba_encoder_top_k_right=60` indicates that the range of top-k is between 50 and 60 when the encoder is sparse.
* `moba_decoder_top_k_left=100, moba_decoder_top_k_right=120` indicates that the range of top-k is between 100 and 120 when the decoder is sparse.

View File

@@ -18,7 +18,7 @@ Assuming you have a custom model class `MyModelForCasualLM` and a pretrained cla
```python
# File: fd_add_dummy_model/__init__.py or fd_add_dummy_model/register.py
from fastdeploy.model_executor.models.model_base import ModelRegistry
from fastdeploy.model_registry import ModelRegistry
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
from fastdeploy.config import ErnieArchitectures
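# --- Illustrative sketch (not part of the file above) ------------------------
# FastDeploy discovers plugins through Python entry points. The group name
# "fastdeploy.model_plugins" and the exact ModelRegistry call below are
# assumptions for illustration only; take the real names from the FastDeploy
# plugin documentation.
#
#   # pyproject.toml of the plugin package
#   [project.entry-points."fastdeploy.model_plugins"]
#   fd_add_dummy_model = "fd_add_dummy_model:register"

def register():
    # Hypothetical registration call: once the entry point is loaded,
    # register() makes MyModelForCasualLM resolvable by its architecture name.
    ModelRegistry.register(MyModelForCasualLM)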

View File

@@ -7,4 +7,3 @@ FastDeploy currently supports installation on the following hardware platforms:
- [Enflame S60 GCU Installation](Enflame_gcu.md)
- [Iluvatar GPU Installation](iluvatar_gpu.md)
- [Hygon DCU Installation](hygon_dcu.md)
- [Intel Gaudi Installation](intel_gaudi.md)

View File

@@ -1,4 +1,5 @@
# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine
The current version of the software serves only as a demonstration of Iluvatar CoreX combined with the FastDeploy inference framework for large models. Running the latest ERNIE4.5 300B model on the GSM8K dataset takes about 6.3 hours.
## Machine Preparation
First, the `TP=16` when running the ERNIE4.5 300B model and so you need to prepare a machine with the following configurations:
@@ -29,7 +30,7 @@ docker exec -it paddle_infer bash
### Install paddle
```bash
pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
@@ -77,7 +78,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# load the model
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, block_size=16, quantization='wint8')
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, block_size=16, quantization='wint8')
# Perform batch inference
outputs = llm.generate(prompts, sampling_params)
@@ -389,7 +390,7 @@ export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --static-decode-blocks 0 --quantization wint8
```
4. Running the Script
@@ -402,10 +403,10 @@ After the service is ready, open another terminal and run:
```bash
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
```
It takes about 4.8 hours to run the GSM8K dataset.
It takes about 6.3 hours to run the GSM8K dataset.
```
Accuracy: 0.962
Accuracy: 0.964
Invalid: 0.000
Latency: 17332.728 s
Latency: 22918.186 s
```

View File

@@ -1,75 +0,0 @@
# Intel Gaudi Installation for running ERNIE 4.5 Series Models
The following installation methods are available when your environment meets these requirements:
- Python 3.10
- Intel Gaudi 2
- Intel Gaudi software version 1.22.0
- Linux X86_64
## 1. Run Docker Container
Use the following commands to run a Docker container. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):
```{.console}
$ docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
```
### 2. Install PaddlePaddle
```bash
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
```
### 3. Install PaddleCustomDevice
```shell
git clone https://github.com/PaddlePaddle/PaddleCustomDevice
cd PaddleCustomDevice/backends/intel_hpu/
mkdir -p build
cd build
cmake ..
make -j
pip install --force-reinstall dist/paddle_intel_hpu*.whl
cd PaddleCustomDevice/backends/intel_hpu/custom_ops
python setup.py install
```
### 4. Install FastDeploy
```shell
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
bash build.sh
```
## Prepare the inference demo
### 1. Start inference service
```shell
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0
HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
```
### 2. Launch the request
```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "What is AI?"}
], "max_tokens": 24
}'
```
### 3. Successfully returns the result
```json
{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
```

View File

@@ -19,8 +19,8 @@ docker login --username=cr_temp_user --password=eyJpbnN0YW5jZUlkIjoiY3JpLXpxYTIz
## 2. paddlepaddle and custom device installation
```shell
pip install paddlepaddle==3.0.0.dev20250825 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
pip install paddle-metax-gpu==3.0.0.dev20250826 -i https://www.paddlepaddle.org.cn/packages/nightly/maca/
pip install paddlepaddle==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
pip install paddle-metax-gpu==3.0.0.dev20250807 -i https://www.paddlepaddle.org.cn/packages/nightly/maca/
```
## 3. Build Wheel from Source
@@ -47,8 +47,6 @@ from fastdeploy.model_executor.ops.gpu import beam_search_softmax
If the above code executes successfully, the environment is ready.
## 5. Demo
```python
from fastdeploy import LLM, SamplingParams
prompts = [
@@ -70,9 +68,7 @@ for output in outputs:
print(prompt)
print(generated_text)
print("-" * 50)
```
```
Output
INFO 2025-08-18 10:54:18,455 416822 engine.py[line:202] Waiting worker processes ready...
Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [03:33<00:00, 2.14s/it]
@@ -85,4 +81,3 @@ Generated 1 outputs
Hello. My name is
Alice and I'm here to help you. What can I do for you today?
Hello Alice! I'm trying to organize a small party
```

View File

@@ -10,7 +10,7 @@ The following installation methods are available when your environment meets the
## 1. Pre-built Docker Installation (Recommended)
**Notice**: The pre-built image only supports SM80/90 GPUs (e.g. H800/A800); if you are deploying on SM86/89 GPUs (L40/4090/L20), please reinstall ```fastdeploy-gpu``` after you create the container.
**Notice**: The pre-built image only supports SM80/90 GPUs (e.g. H800/A800); if you are deploying on SM86/89 GPUs (L40/4090/L20), please reinstall ```fastdpeloy-gpu``` after you create the container.
```shell
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.2.0

View File

@@ -16,7 +16,7 @@ For more information about how to install FastDeploy, refer to the [installation
After installing FastDeploy, execute the following command in the terminal to start the service. For the configuration method of the startup command, refer to [Parameter Description](../parameters.md)
> ⚠️ **Note:**
> When using HuggingFace models (torch format), you need to enable `--load-choices "default_v1"`.
> When using HuggingFace models (torch format), you need to enable `--load_choices "default_v1"`.
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
@@ -27,7 +27,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--load-choices "default_v1"
--load_choices "default_v1"
```
> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```Qwen/QWEN3-0.6b```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md).

View File

@@ -107,7 +107,7 @@ messages = [
}
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
images, videos = [], []
for message in messages:
content = message["content"]

View File

@@ -13,7 +13,7 @@ export FD_MODEL_SOURCE=AISTUDIO # "AISTUDIO", "MODELSCOPE" or "HUGGINGFACE"
export FD_MODEL_CACHE=/ssd1/download_models
```
> ⭐ **Note**: Models marked with an asterisk can directly use **HuggingFace Torch weights** and support **FP8/WINT8/WINT4** as well as **BF16**. When running inference, you need to enable **`--load-choices "default_v1"`**.
> ⭐ **Note**: Models marked with an asterisk can directly use **HuggingFace Torch weights** and support **FP8/WINT8/WINT4** as well as **BF16**. When running inference, you need to enable **`--load_choices "default_v1"`**.
> Example launch Command using baidu/ERNIE-4.5-21B-A3B-PT:
```
@@ -24,7 +24,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--load-choices "default_v1"
--load_choices "default_v1"
```
## Large Language Models

View File

@@ -20,6 +20,6 @@ Below is an overview of the FastDeploy code structure and functionality organize
- ```platforms```: Platform-specific modules for underlying hardware support.
- ```scheduler```: Request scheduling module for large models.
- ```metrics```: Core component for collecting, managing, and exporting Prometheus metrics, tracking key runtime performance data (e.g., request latency, resource utilization, successful request counts).
- ```splitwise```: Modules related to PD disaggregation deployment.
- ```splitwise```: Modules related to PD disaggragation deployment.
- ```scripts```/```tools```: Utility scripts for FastDeploy operations (e.g., compilation, unit testing, code style fixes).
- ```test```: Code for unit testing and validation.

View File

@@ -1,20 +1,20 @@
## Supported Models
|Model Name|Context Length|Quantization|XPUs Required|Deployment Commands|Minimum Version Required|
|-|-|-|-|-|-|
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95 \ <br> --load-choices "default"|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
## Quick start
@@ -28,7 +28,6 @@ Deploy an OpenAI API-compatible server using FastDeploy with the following comma
```bash
export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8188 \
@@ -36,8 +35,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--gpu-memory-utilization 0.9 \
--load-choices "default"
--gpu-memory-utilization 0.9
```
**Note:** When deploying on 4 XPUs, only two configurations are supported, constrained by hardware limitations such as interconnect capabilities.
@@ -51,7 +49,7 @@ All supported models can be found in the *Supported Models* section above.
#### Send requests
Send requests using either curl or Python.
Send requests using either curl or Python
```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \

View File

@@ -30,7 +30,7 @@ By default, logs are stored in the `log` directory under the execution path. To
* `cache_transfer_manager.log` : Logs startup parameters and received request information.
* `launch_cache_manager.log` : Records cache transfer startup parameters and error messages.
## PD Disaggregation Logs
## PD Disaggragation Logs
* `cache_messager.log` : Logs transmission protocols and messages used by the P instance.
* `splitwise_connector.log` : Records data received from P/D instances and connection establishment details.

View File

@@ -31,12 +31,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--max-model-len 32768 \
--max-num-seqs 128 \
--load-choices "default_v1"
--load_choices "default_v1"
```
其中:
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
- `--max-model-len`表示当前部署的服务所支持的最长Token数量。设置得越大模型可支持的上下文长度也越大但相应占用的显存也越多可能影响并发数。
- `--load-choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
- `--load_choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。

View File

@@ -31,12 +31,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--max-model-len 32768 \
--max-num-seqs 128 \
--load-choices "default_v1"
--load_choices "default_v1"
```
其中:
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
- `--max-model-len`表示当前部署的服务所支持的最长Token数量。设置得越大模型可支持的上下文长度也越大但相应占用的显存也越多可能影响并发数。
- `--load-choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
- `--load_choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。

View File

@@ -27,7 +27,7 @@ ERNIE-4.5-21B-A3B 各量化精度,在下列硬件上部署所需要的最小
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
--load-choices "default_v1" \
--load_choices "default_v1" \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--quantization wint8 \
@@ -38,7 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
其中:
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
- `--max-model-len`表示当前部署的服务所支持的最长Token数量。设置得越大模型可支持的上下文长度也越大但相应占用的显存也越多可能影响并发数。
- `--load-choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
- `--load_choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
- `--reasoning-parser``--tool-call-parser`: 表示对应调用的思考内容和工具调用解析器
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。

View File

@@ -28,12 +28,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--max-model-len 32768 \
--max-num-seqs 128 \
--load-choices "default_v1"
--load_choices "default_v1"
```
其中:
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
- `--max-model-len`表示当前部署的服务所支持的最长Token数量。设置得越大模型可支持的上下文长度也越大但相应占用的显存也越多可能影响并发数。
- `--load-choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
- `--load_choices`: 表示loader的版本"default_v1"表示启用v1版本的loader具有更快的加载速度和更少的内存使用。
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
@@ -92,7 +92,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
```
注:
- W4A8C8量化的模型不支持通过`--load-choices "default_v1"`载入。
- W4A8C8量化的模型不支持通过`--load_choices "default_v1"`载入。
#### 2.2.6 拒绝采样
**原理:**

View File

@@ -18,7 +18,7 @@
<img src="images/plas_training_distill.png" alt="Attention Gate Module" width="60%">
</div>
* **Attention Gate Module**: 如上图所示为了以较低的计算开销估计每个块的重要性我们设计了一个轻量级的注意力门模块。该模块首先通过一个MLP层压缩每个K个块,生成一个具有代表性的低维表示: $K_c^T=W_{kp}K^T$ ,其中 $W_{kp}$ 表示 MLP 层的权重。与直接应用均值池化相比,可学习的 MLP 可以更有效地捕捉不同 token 之间的语义关系和重要性分布,从而提供每个块的精细表示。在获得压缩表示 $K_c$ 之后,通过以下公式估计每个查询 token 相对于每个块的重要性:$Softmax(Q\cdot K_c^T)$。为了增强 MLP 层的判别能力,我们使用一维最大池化后的完整注意力结果 $1DMaxPooling(Softmax(Q \cdot K^T))$ 作为 ground truth。通过最小化两者之间的分布差异引导 MLP 层学习更符合真实注意力分布的特征表示。
* **Attention Gate Module**: 如上图所示,为了以较低的计算开销估计每个块的重要性,我们设计了一个轻量级的注意力门模块。该模块首先通过一个 MLP 层压缩每个 K 个块,生成一个具有代表性的低维表示:$K_c^T=W_{kp}K^T$,其中 $W_{kp}$ 表示 MLP 层的权重。与直接应用均值池化相比,可学习的 MLP 可以更有效地捕捉不同 token 之间的语义关系和重要性分布,从而提供每个块的精细表示。在获得压缩表示 $K_c$ 之后,通过以下公式估计每个查询 token 相对于每个块的重要性:$Softmax(Q\cdot K_c^T)$。为了增强 MLP 层的判别能力,我们使用一维最大池化后的完整注意力结果 $1DMaxPooling(Softmax(Q \cdot K^T))$ 作为 ground truth。通过最小化两者之间的分布差异引导 MLP 层学习更符合真实注意力分布的特征表示。
* **Training Data**: 得益于模型架构和训练范式的高效性,我们的方法仅使用 10 亿个 token 进行训练,便实现了近乎无损的精度。训练数据源自内部构建的包含长文本和短文本的混合语料库,从而增强了模块对不同序列长度的适应性。
@@ -36,7 +36,7 @@
* **Prefill Token Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。
* **Decode Head Union**: 鉴于GQA在现代模型中的广泛应用我们发现同一组内的不同查询头经常选择重叠的关键块。因此我们将同一组内所有查询头选择的关键块合并为一个统一的集合并联合计算稀疏注意力机制。这种方式也减少了内存访问开销并进一步提高了解码效率。
* **Decode Head Union**: 鉴于 GQA 在现代模型中的广泛应用,我们发现同一组内的不同查询头经常选择重叠的关键块。因此,我们将同一组内所有查询头选择的关键块合并为一个统一的集合,并联合计算稀疏注意力机制。这种方式也减少了内存访问开销,并进一步提高了解码效率。
* **Top-K Selection**: 传统的 Top-k 算法基于排序或直接调用 Cub 库,会带来显著的运行时开销。为了缓解这个问题,我们实现了一个基于二分查找的近似 Top-k 选择算法,该算法在保持准确率的同时显著降低了延迟,最终实现了性能的显著提升。
@@ -200,7 +200,7 @@
## 使用方式
```
export FD_ATTENTION_BACKEND="PLAS_ATTN"
export FD_ATTENTION_BACKEND="MOBA_ATTN"
python -m fastdeploy.entrypoints.openai.api_server
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
@@ -211,13 +211,13 @@ python -m fastdeploy.entrypoints.openai.api_server
--max-num-batched-tokens 8192 \
--max-model-len 131072 \
--max-num-seqs 32 \
--plas-attention-config '{"plas_encoder_top_k_left": 50, "plas_encoder_top_k_right": 60, "plas_decoder_top_k_left": 100, "plas_decoder_top_k_right": 120}'
--moba-attention-config '{"moba_encoder_top_k_left": 50, "moba_encoder_top_k_right": 60, "moba_decoder_top_k_left": 100, "moba_decoder_top_k_right": 120}'
```
**Note**: 如果启用了稀疏注意力机制,系统将自动从权重目录中的`plas_attention_mlp_weight.safetensors`文件加载 MLP 权重。如果未找到 MLP 权重文件,则将对关键表示应用均值池化
**Note**: 如果启用了稀疏注意力机制,系统将自动从权重目录中的`moba_mlp_weight.safetensors`文件加载 MLP 权重。如果未找到 MLP 权重文件,则将对关键表示应用均值池化
**Parameter Description:**
* `FD_ATTENTION_BACKEND="PLAS_ATTN"` 启用 PLAS sparse attention.
* `plas_encoder_top_k_left=50, plas_encoder_top_k_right=60` 表示当encoder时top-k的范围在50到60之间。
* `plas_decoder_top_k_left=100, plas_decoder_top_k_right=120` 表示当decoder时top-k的范围在100到120之间。
* `FD_ATTENTION_BACKEND="MOBA_ATTN"` 启用 MOBA sparse attention.
* `moba_encoder_top_k_left=50, moba_encoder_top_k_right=60` 表示当encoder时top-k的范围在50到60之间。
* `moba_decoder_top_k_left=100, moba_decoder_top_k_right=120` 表示当decoder时top-k的范围在100到120之间。

View File

@@ -18,7 +18,7 @@ FastDeploy 利用 Python 的 `entry_points` 机制来发现并加载插件。开
```python
# 文件fd_add_dummy_model/__init__.py
from fastdeploy.model_executor.models.model_base import ModelRegistry
from fastdeploy.model_registry import ModelRegistry
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
def register():

View File

@@ -7,4 +7,3 @@ FastDeploy支持如下硬件平台:
- [Enflame S60 GCU Installation](Enflame_gcu.md)
- [Iluvatar GPU Installation](iluvatar_gpu.md)
- [Hygon DCU Installation](hygon_dcu.md)
- [Intel Gaudi Installation](intel_gaudi.md)

View File

@@ -1,11 +1,12 @@
# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B
该软件的当前版本仅作为Iluvatar CoreX与大型模型的Fastdeploy推理框架相结合的演示。在GSM8K数据集上运行最新的ERNIE4.5 300B模型大约需要6.3小时。
## 准备机器
首先运行ERNIE4.5 300B模型需要`TP=16`, 所以您需要准备以下配置的机器
首先您需要准备以下配置的机器
| CPU | 内存 | 天数 | 硬盘|
|-----|------|-----|-----|
| x86 | 1TB| 16xBI150| 1TB|
| x86 | 1TB| 8xBI150| 1TB|
目前需要将完整模型 load 到 host memory 中,需要大于 600GB 的 host memory后续版本会优化。
@@ -29,7 +30,7 @@ docker exec -it paddle_infer bash
### 安装paddle
```bash
pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
获取Paddle的最新安装版本 [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
@@ -76,7 +77,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# 加载模型
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, quantization='wint8')
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
# 批量进行推理llm内部基于资源情况进行请求排队、动态插入处理
outputs = llm.generate(prompts, sampling_params)
@@ -131,281 +132,3 @@ Now, let's break down each step:
**Step 3: Drawing the
The largest ocean is the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
```
## 在GSM8K数据集上运行ernie4.5 300B模型
1. 下载GSM8K数据集
```bash
wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
```
2. 准备`bench_gsm8k.py`
```python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fastdeploy + ERNIE-4.5-Turbo 的指标评估 """
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
import argparse
import ast
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import requests
from tqdm import tqdm
INVALID = -9999999
def call_generate(prompt, **kwargs):
"""
Generates response based on the input prompt.
Args:
prompt (str): The input prompt text.
**kwargs: Keyword arguments, including server IP address and port number.
Returns:
str: The response generated based on the prompt.
"""
url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
"messages": [
{
"role": "user",
"content": prompt,
}
],
"temperature": 0.6,
"max_tokens": 2047,
"top_p": 0.95,
"do_sample": True,
}
response = requests.post(url, headers=headers, data=json.dumps(data))
out = response.json()
return out["choices"][0]["message"]["content"]
def get_one_example(lines, i, include_answer):
"""
Retrieves a question-answer example from the given list of text lines.
Args:
lines (list of dict): A list of question-answer pairs.
i (int): The index of the question-answer pair to retrieve from lines.
include_answer (bool): Whether to include the answer in the returned string.
Returns:
str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
"""
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
"""
Selects k examples from the given list of text lines and concatenates them into a single string.
Args:
lines (list): A list containing text lines.
k (int): The number of examples to select.
Returns:
str: A string composed of k examples, separated by two newline characters.
"""
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def get_answer_value(answer_str):
"""
Extracts numerical values from an answer string and returns them.
Args:
answer_str (str): The string containing the answer.
Returns:
The extracted numerical value; returns "INVALID" if extraction fails.
"""
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def read_jsonl(filename: str):
"""
Reads a JSONL file.
Args:
filename (str): Path to the JSONL file.
Yields:
dict: A dictionary object corresponding to each line in the JSONL file.
"""
with open(filename) as fin:
for line in fin:
if line.startswith("#"):
continue
yield json.loads(line)
def main(args):
"""
Process inputs and generate answers by calling the model in parallel using a thread pool.
Args:
args (argparse.Namespace):
- num_questions (int): Number of questions to process.
- num_shots (int): Number of few-shot learning examples.
- ip (str): IP address of the model service.
- port (int): Port number of the model service.
- parallel (int): Number of questions to process in parallel.
- result_file (str): File path to store the results.
Returns:
None
"""
# Read data
filename = "test.jsonl"
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
states = [None] * len(labels)
# Use thread pool
def get_one_answer(i):
answer = call_generate(
prompt=few_shot_examples + questions[i],
# stop=["Question", "Assistant:", "<|separator|>"],
ip=args.ip,
port=args.port,
)
states[i] = answer
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
latency = time.time() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": "paddlepaddle",
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1")
parser.add_argument("--port", type=str, default="8188")
parser.add_argument("--num-shots", type=int, default=10)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=1319)
parser.add_argument("--result-file", type=str, default="result.jsonl")
parser.add_argument("--parallel", type=int, default=1)
args = parser.parse_args()
main(args)
```
3. 准备`run_bench.sh`
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
```
4. 运行脚本
首先打开一个终端执行服务端命令:
```bash
./run_bench.sh
```
等服务起好后,在打开另一个终端执行客户端命令:
```bash
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
```
推理整个GSM8K数据集大概需要4.8个小时。
```
Accuracy: 0.962
Invalid: 0.000
Latency: 17332.728 s
```

View File

@@ -1,75 +0,0 @@
# 使用 Intel Gaudi 运行ERNIE 4.5 系列模型
在环境满足如下条件前提下
- Python 3.10
- Intel Gaudi 2
- Intel Gaudi software version 1.22.0
- Linux X86_64
## 1. 运行Docker容器
使用下面命令运行Docker容器. 确保更新的版本在如下列表中 [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):
```{.console}
$ docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
```
### 2. 安装 PaddlePaddle
```bash
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
```
### 3. 安装 PaddleCustomDevice
```shell
git clone https://github.com/PaddlePaddle/PaddleCustomDevice
cd PaddleCustomDevice/backends/intel_hpu/
mkdir -p build
cd build
cmake ..
make -j
pip install --force-reinstall dist/paddle_intel_hpu*.whl
cd PaddleCustomDevice/backends/intel_hpu/custom_ops
python setup.py install
```
### 4. 安装 FastDeploy
```shell
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
bash build.sh
```
## 准备推理示例
### 1. 启动推理服务
```shell
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0
HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
```
### 2. 发送请求
```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "What is AI?"}
], "max_tokens": 24
}'
```
### 3. 成功返回结果
```json
{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
```
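The same endpoint can also be exercised in streaming mode, which is handy for interactive checks. This is a sketch that assumes the server implements the OpenAI-style `"stream": true` protocol:
```bash
curl -N -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [{"role": "user", "content": "What is AI?"}],
        "max_tokens": 24,
        "stream": true
      }'
```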

View File

@@ -15,7 +15,7 @@
After installing FastDeploy, start the service by running the following command in a terminal; see the [Parameter Reference](../parameters.md) for how to configure the launch command.
> ⚠️ **Note:**
> When using HuggingFace models (torch format), `--load-choices "default_v1"` must be enabled.
> When using HuggingFace models (torch format), `--load_choices "default_v1"` must be enabled.
```shell
export ENABLE_V1_KVCACHE_SCHEDULER=1
@@ -26,7 +26,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--load-choices "default_v1"
--load_choices "default_v1"
```
>💡 Note: if the path passed to ```--model``` does not exist as a subdirectory of the current directory, FastDeploy looks up the given model name (e.g. ```Qwen/Qwen3-0.6B```) on AIStudio and, if a preset model exists there, starts downloading it automatically. The default download path is ```~/xx```. For details and configuration of automatic model download, see [Model Download](../supported_models.md).
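Once the service is up, a request such as the following can be used to verify it (a sketch; `${API_PORT}` stands for the value passed to `--port` at launch, which is outside this excerpt):
```bash
curl -X POST "http://127.0.0.1:${API_PORT}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 16}'
```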

View File

@@ -107,7 +107,7 @@ messages = [
}
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
images, videos = [], []
for message in messages:
content = message["content"]

View File

@@ -13,7 +13,7 @@ export FD_MODEL_SOURCE=AISTUDIO # "AISTUDIO", "MODELSCOPE" or "HUGGINGFACE"
export FD_MODEL_CACHE=/ssd1/download_models
```
> ⭐ **Note**: models marked with an asterisk can use **HuggingFace Torch weights** directly, support **FP8/WINT8/WINT4 dynamic quantization** and **BF16 precision** inference, and require **`--load-choices "default_v1"`** to be enabled at inference time.
> ⭐ **Note**: models marked with an asterisk can use **HuggingFace Torch weights** directly, support **FP8/WINT8/WINT4 dynamic quantization** and **BF16 precision** inference, and require **`--load_choices "default_v1"`** to be enabled at inference time.
> Taking baidu/ERNIE-4.5-21B-A3B-PT as an example, the launch command is as follows:
```
@@ -24,7 +24,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--load-choices "default_v1"
--load_choices "default_v1"
```
## Text-only model list

View File

@@ -1,20 +1,20 @@
## Supported models
|Model|Context length|Quantization|Cards required|Deployment command|Minimum version|
|-|-|-|-|-|-|
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95 \ <br> --load-choices "default"|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
## Quick start
@@ -28,7 +28,6 @@
```bash
export XPU_VISIBLE_DEVICES="0,1,2,3" # set which XPU cards to use
export ENABLE_V1_KVCACHE_SCHEDULER=0 # not supported by the V1 scheduler
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8188 \
@@ -36,8 +35,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--gpu-memory-utilization 0.9 \
--load-choices "default"
--gpu-memory-utilization 0.9
```
**Note:** when deploying on 4 XPUs with the P800, hardware constraints such as the inter-card interconnect topology mean that only the following two configurations are supported:
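Concretely, the two supported groupings follow the deployment table above:
```bash
export XPU_VISIBLE_DEVICES="0,1,2,3"   # first 4-card group
# or
export XPU_VISIBLE_DEVICES="4,5,6,7"   # second 4-card group
```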

View File

@@ -28,7 +28,7 @@ from paddleformers.utils.log import logger as pf_logger
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.utils import current_package_version, envs
from fastdeploy.utils import envs
if envs.FD_DEBUG != "1":
import logging
@@ -43,8 +43,6 @@ except ImportError:
pass
# TODO(tangbinhan): remove this code
__version__ = current_package_version()
def _patch_fastsafetensors():
try:

View File

@@ -14,10 +14,7 @@
# limitations under the License.
"""
import argparse
import json
import math
import queue
import threading
import time
import traceback
@@ -26,72 +23,16 @@ import numpy as np
import paddle
from fastdeploy.cache_manager.transfer_factory import IPCCommManager, RDMACommManager
from fastdeploy.config import SpeculativeConfig
from fastdeploy.inter_communicator import (
EngineWorkerQueue,
IPCSignal,
shared_memory_exists,
)
from fastdeploy.model_executor.ops.gpu import get_output_kv_signal, set_data_ipc
from fastdeploy.utils import envs, get_logger
from fastdeploy.utils import get_logger
logger = get_logger("cache_messager", "cache_messager.log")
def parse_args():
"""
Parse arguments from the command line.
"""
parser = argparse.ArgumentParser("Cache Messager")
parser.add_argument(
"--splitwise_role",
type=str,
default="mixed",
help="splitwise role, can be decode, prefill or mixed",
)
parser.add_argument("--rank", type=int, default=0, help="current rank")
parser.add_argument("--device_id", type=int, default=0, help="device id")
parser.add_argument("--num_layers", type=int, default=1, help="model num layers")
parser.add_argument("--head_dim", type=int, default=1, help="model head dim")
parser.add_argument("--kv_num_head", type=int, default=1, help="model kv num head")
parser.add_argument("--rdma_port", type=str, default="", help="rmda port")
parser.add_argument("--mp_num", type=int, default=1, help="number of model parallel")
parser.add_argument("--engine_pid", type=str, default=None, help="engine pid")
parser.add_argument(
"--protocol",
type=str,
default="ipc",
help="cache transfer protocol, only surport ipc now",
)
parser.add_argument("--pod_ip", type=str, default="0.0.0.0", help="pod ip")
parser.add_argument("--cache_queue_port", type=int, default=9924, help="cache queue port")
parser.add_argument(
"--engine_worker_queue_port",
type=int,
default=9923,
help="engine worker queue port",
)
parser.add_argument("--num_gpu_blocks", type=int, default=1, help="gpu cache block number")
parser.add_argument("--block_size", type=int, default=64, help="cache block size(tokens)")
parser.add_argument(
"--cache_dtype",
type=str,
default="bfloat16",
choices=["uint8", "bfloat16"],
help="cache dtype",
)
parser.add_argument(
"--speculative_config",
type=json.loads,
default="{}",
help="speculative config",
)
parser.add_argument("--local_data_parallel_id", type=int, default=0)
args = parser.parse_args()
return args
class CacheMessager:
"""
CacheMessager is used to send the cache data between the engine worker and the cache server.
@@ -128,6 +69,11 @@ class CacheMessager:
Returns:
None
"""
assert splitwise_role in [
"prefill",
"decode",
], "splitwise_role must be prefill or decode"
self.splitwise_role = splitwise_role
self.gpu_cache_kvs = gpu_cache_kvs
self.rank = rank
@@ -201,16 +147,15 @@ class CacheMessager:
self.gpu_id = gpu_id
self.cache_info = dict()
self.rank_id = self.rank + local_data_parallel_id * self.nranks
self.dp_rank_id = self.rank + local_data_parallel_id * self.nranks
if self.splitwise_role != "mixed":
connect_rdma_thread = threading.Thread(target=self._handle_connect_task)
connect_rdma_thread.daemon = True
connect_rdma_thread.start()
layerwise_send_cache_thread = threading.Thread(target=self._prefill_layerwise_send_cache_thread)
layerwise_send_cache_thread.daemon = True
layerwise_send_cache_thread.start()
logger.info(f"cache messager init finished, use {transfer_protocol}")
def prefill_layerwise_send_cache_thread(self):
def _prefill_layerwise_send_cache_thread(self):
"""
layerwise_send_cache_thread:
send cache to other instance
@@ -218,23 +163,23 @@ class CacheMessager:
try:
prefilled_step_idx_data = np.zeros(shape=[1], dtype=np.int32)
prefilled_layer_idx_data = np.zeros(shape=[1], dtype=np.int32)
prefilled_layer_name = f"splitwise_complete_prefilled_layer_{self.rank_id}.{self.gpu_id}"
prefilled_step_name = f"splitwise_complete_prefilled_step_{self.rank_id}.{self.gpu_id}"
prefilled_layer_name = f"splitwise_complete_prefilled_layer_{self.dp_rank_id}.{self.gpu_id}"
prefilled_step_name = f"splitwise_complete_prefilled_step_{self.dp_rank_id}.{self.gpu_id}"
step_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_step_{self.rank_id}",
name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}",
array=prefilled_step_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
create=not shared_memory_exists(prefilled_step_name),
)
layer_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_layer_{self.rank_id}",
name=f"splitwise_complete_prefilled_layer_{self.dp_rank_id}",
array=prefilled_layer_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
create=not shared_memory_exists(prefilled_layer_name),
)
logger.info(f"splitwise_complete_prefilled_step_{self.rank_id}, gpu_id: {self.gpu_id}")
logger.info(f"splitwise_complete_prefilled_step_{self.dp_rank_id}, gpu_id: {self.gpu_id}")
step_shm_value.value[0] = -1
layer_shm_value.value[0] = -1
@@ -242,9 +187,6 @@ class CacheMessager:
self.last_step_idx = -1
self.last_layer_idx = -1 # int32
max_step_idx = 100003
engine_recycled_count = 0
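# max_step_idx is where the engine recycles its step counter; engine_recycled_count records how many
# wrap-arounds have occurred so prefilled_step_idx can be remapped to a monotonically growing value below.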
while True:
cache_info = self.engine_worker_queue.get_cache_info()
@@ -260,13 +202,16 @@ class CacheMessager:
-len(current_info["dest_block_ids"]) :
]
current_info["src_block_ids"] = current_src_blocks
current_info["current_layer_ids"] = 0
current_info["status"] = "init"
logger.info(f"start cache_infos: {current_info}")
self.cache_info[info["request_id"]] = current_info
self.last_step_idx = min(self.last_step_idx, current_info["current_id"])
else:
self.cache_info[info["request_id"]] = info
prefilled_layer_idx = layer_shm_value.value[0]
prefilled_step_idx = step_shm_value.value[0]
logger.info(f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}")
if prefilled_layer_idx == self.num_layers - 1:
time.sleep(0.001)
prefilled_layer_idx = layer_shm_value.value[0]
@@ -278,18 +223,7 @@ class CacheMessager:
if not self.cache_info:
time.sleep(0.001)
continue
if self.last_step_idx > prefilled_step_idx:
engine_recycled_count += 1
self.last_step_idx = prefilled_step_idx # only copy value read from shm memory
prefilled_step_idx = (
prefilled_step_idx + max_step_idx * engine_recycled_count
) # remap prefilled_step_idx for comparison
logger.debug(
f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx in shm: {self.last_step_idx},"
f"prefilled_step_idx: {prefilled_step_idx} engine_recycled_count {engine_recycled_count}"
)
logger.debug(f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}")
for req_id, item in list(self.cache_info.items()):
if "status" not in item:
continue
@@ -360,493 +294,12 @@ class CacheMessager:
logger.info(f"finish write cache {item['request_id']}")
self.engine_worker_queue.finish_request_barrier.wait()
if self.rank == 0:
# TODO: make this robust under TP; we assume all ranks in TP share the same status (all fail or all succeed).
self.engine_worker_queue.put_finished_req([(item["request_id"], "finished")])
logger.info(f"put write cache {item['request_id']}")
del self.cache_info[req_id]
self.last_layer_idx = prefilled_layer_idx
self.last_step_idx = prefilled_step_idx
self.last_layer_idx = prefilled_layer_idx
except Exception as e:
logger.error(f"prefill layerwise send cache thread has exception: {e}, {str(traceback.format_exc())}")
def _handle_connect_task(self):
while True:
try:
task = self.engine_worker_queue.get_connect_rdma_task()
if task is None:
time.sleep(0.001)
continue
logger.info(f"_handle_connect_task recv task: {task}")
task_id = task["task_id"]
ip, rdma_port = task["ip"], task["rdma_port"]
status = self.messager["rdma"].connect(ip, rdma_port)
if not status:
response = {"task_id": task_id, "success": False}
else:
response = {"task_id": task_id, "success": True}
self.engine_worker_queue.put_connect_rdma_task_response(response)
except Exception as e:
logger.error(f"handle_connect_task has exception: {e}")
class CacheMessagerV1:
"""
CacheMessager is used to send the cache data between the engine worker and the cache server.
"""
def __init__(
self,
splitwise_role,
transfer_protocol,
pod_ip,
engine_worker_queue_port,
local_data_parallel_id,
gpu_cache_kvs,
rank,
nranks,
num_layers,
gpu_id=0,
block_size=64,
rdma_port=None,
):
"""
Initialize the CacheMessager object.
Args:
splitwise_role (str): splitwise_role only can be 'prefill' or 'decode'.
transfer_protocol (str): support ipc and rdma
engine_worker_queue_port (int): engine_worker_queue port
gpu_cache_kvs (dict): GPU kv cache
rank (int): current rank
nranks (int): global rank number
num_layers (int): model layer number
gpu_id (int, optional): GPU ID
rdma_port (int, optional): RDMA port
Returns:
None
"""
self.splitwise_role = splitwise_role
self.gpu_cache_kvs = gpu_cache_kvs
self.rank = rank
self.nranks = nranks
address = (pod_ip, engine_worker_queue_port)
self.engine_worker_queue = EngineWorkerQueue(
address=address,
is_server=False,
num_client=self.nranks,
client_id=self.rank,
local_data_parallel_id=local_data_parallel_id,
)
self.block_size = block_size
transfer_protocol = transfer_protocol.split(",")
logger.info(f"splitwise role: {splitwise_role}, {transfer_protocol}" f"rank: {rank}")
# 1. initialize the cache_k_ptr_list and cache_v_ptr_list
self.num_layers = num_layers
cache_k_ptr_list = []
cache_v_ptr_list = []
cache_k = []
cache_v = []
self.messager = {}
for layer_idx in range(self.num_layers):
key_cache = self.gpu_cache_kvs[f"key_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
val_cache = self.gpu_cache_kvs[f"value_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
cache_k.append(key_cache)
cache_v.append(val_cache)
cache_k_ptr_list.append(key_cache.data_ptr())
cache_v_ptr_list.append(val_cache.data_ptr())
cache_k_ptr_list = np.array(cache_k_ptr_list)
cache_v_ptr_list = np.array(cache_v_ptr_list)
# 2. initialize the block_bytes
cache_shape = key_cache.shape
max_block_num = cache_shape[0]
block_bytes = math.prod(cache_shape[1:])
if key_cache.dtype == paddle.bfloat16:
block_bytes *= 2
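# math.prod(cache_shape[1:]) counts elements per block; bfloat16 takes 2 bytes per element, hence the doubling.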
logger.info(
f"layers {num_layers} cache_shape: {cache_shape}, max_block_num: {max_block_num}, "
f"block_bytes: {block_bytes}, dtype: {key_cache.dtype}"
)
self.block_bytes = block_bytes
# 3. initialize the messager
for protocol in transfer_protocol:
if protocol == "ipc":
self.messager[protocol] = IPCCommManager(
self.rank,
gpu_id,
cache_k,
cache_v,
)
local_device_id = int(str(cache_k[0].place)[-2])
logger.info(f"done create ipc_comm with local_device_id:{local_device_id}, ")
elif protocol == "rdma":
logger.info(f"splitwise_role rdma: {self.splitwise_role}, rank: {self.rank}, gpu_id: {gpu_id}")
self.messager[protocol] = RDMACommManager(
splitwise_role,
rank,
gpu_id,
cache_k_ptr_list,
cache_v_ptr_list,
max_block_num,
block_bytes,
rdma_port,
)
self.gpu_id = gpu_id
self.cache_info = dict()
self.rank_id = self.rank + local_data_parallel_id * self.nranks
self.engine_cache_task_thread_lock = threading.Lock()
self.engine_cache_tasks = [dict() for _ in range(512)]
self.idx_cache_task_dict = {}
self.cache_prefilled_engine_ids_queue = queue.Queue() # keep batch slot index for each prefill step
if splitwise_role == "prefill":
consume_signals_thread = threading.Thread(target=self.consume_signals)
consume_signals_thread.daemon = True
consume_signals_thread.start()
add_cache_task_thread = threading.Thread(target=self._add_cache_task_thread)
add_cache_task_thread.daemon = True
add_cache_task_thread.start()
if self.splitwise_role != "mixed":
connect_rdma_thread = threading.Thread(target=self._handle_connect_task)
connect_rdma_thread.daemon = True
connect_rdma_thread.start()
logger.info(f"cache messager init finished, use {transfer_protocol}")
def _add_cache_task_thread(self):
while True:
try:
cache_info = self.engine_worker_queue.get_cache_info()
self.engine_worker_queue.finish_add_cache_task_barrier.wait()
finished_add_cache_task_req_ids = []
if cache_info:
for info in cache_info:
if info["request_id"] in self.cache_info:
self.cache_info[info["request_id"]].update(info)
current_info = self.cache_info[info["request_id"]]
assert "dest_block_ids" in current_info and "src_block_ids" in current_info
finished_add_cache_task_req_ids.append(info["request_id"])
decode_cached_block_num = len(current_info["src_block_ids"]) - len(
current_info["dest_block_ids"]
)
padding_decode_block_ids = [-1 for i in range(decode_cached_block_num)] + current_info[
"dest_block_ids"
]
current_info["dest_block_ids"] = padding_decode_block_ids
current_info["decode_cached_tokens"] = decode_cached_block_num * self.block_size
current_info["sended_layer_id"] = -1
current_info["sended_block_num"] = current_info["decode_cached_tokens"] // self.block_size
current_info["status"] = "init"
logger.info(f"finish add cache task: {current_info}")
self.cache_info[info["request_id"]] = current_info
self.idx_cache_task_dict[current_info["current_id"]] = current_info
else:
self.cache_info[info["request_id"]] = info
if self.rank == 0 and finished_add_cache_task_req_ids:
self.engine_worker_queue.put_finished_add_cache_task_req(finished_add_cache_task_req_ids)
else:
time.sleep(0.001)
except Exception as e:
logger.info(f"add cache task occured error: {e}, {traceback.format_exc()!s}.")
def prefill_layerwise_send_cache_thread(self):
"""
layerwise_send_cache_thread:
send cache to other instance
"""
while True:
try:
engine_indexes = self.cache_prefilled_engine_ids_queue.get()
self.engine_worker_queue.finish_request_barrier.wait()
block_start_end_list = []
current_prefilled_token_num_list = []
for engine_index in engine_indexes:
assert engine_index in self.idx_cache_task_dict
block_id_start = self.idx_cache_task_dict[engine_index]["sended_block_num"]
prefilled_token_num = self.engine_cache_tasks[engine_index]["prefilled_token_num"]
if (
prefilled_token_num == self.idx_cache_task_dict[engine_index]["need_prefill_tokens"]
): # all chunks have been prefilled
block_id_end = len(self.idx_cache_task_dict[engine_index]["src_block_ids"])
else:
block_id_end = prefilled_token_num // self.block_size # [block_id_start, block_id_end)
block_start_end_list.append((block_id_start, block_id_end))
current_prefilled_token_num_list.append(prefilled_token_num)
while True: # from layer0 to last layer
sended_layer_idx = self.idx_cache_task_dict[engine_indexes[0]]["sended_layer_id"]
start_layer_idx = sended_layer_idx + 1
with self.engine_cache_task_thread_lock: # to check end_layer_idx
prefilled_layer_idx = self.engine_cache_tasks[engine_indexes[0]]["prefilled_layer_idx"]
if sended_layer_idx > prefilled_layer_idx:  # computation must be in the next chunk
logger.info(
f"current_prefilled_token_num_list[0] {current_prefilled_token_num_list[0]} prefilled_token_num {self.engine_cache_tasks[engine_indexes[0]]['prefilled_token_num']}"
)
assert (
current_prefilled_token_num_list[0]
< self.engine_cache_tasks[engine_indexes[0]]["prefilled_token_num"]
), "when sended_layer_idx > prefilled_layer_idx, must be in next chunk, but not, sth wrong"
end_layer_idx = self.num_layers - 1 # [start_layer_idx, end_layer_idx)
else:
end_layer_idx = prefilled_layer_idx
if sended_layer_idx == prefilled_layer_idx:  # computation has not reached the next layer yet
time.sleep(0.01)
for layer_idx in range(start_layer_idx, end_layer_idx + 1):
for i, (block_id_start, block_id_end) in enumerate(block_start_end_list):
engine_index = engine_indexes[i]
task = self.idx_cache_task_dict[engine_index]
req_id = task["request_id"]
if (
block_id_start >= block_id_end
): # no blocks need to transfer for this request in this chunk
task["sended_layer_id"] += 1
assert task["sended_layer_id"] == layer_idx
if task["sended_layer_id"] == self.num_layers - 1:
task["sended_layer_id"] = -1
continue
else:
current_transfer_protocol = task["transfer_protocol"]
if task["transfer_protocol"] == "rdma":
target_ip = task["ip"]
target_id = int(task["rdma_ports"][self.rank])
if task["status"] == "error":
continue
status = self.messager[current_transfer_protocol].connect(target_ip, target_id)
if not status:
logger.error(f"connect to {target_ip}:{target_id} failed")
task["status"] = "connection error"
continue
elif task["transfer_protocol"] == "ipc":
target_ip = "0.0.0.0"
target_id = int(task["device_ids"][self.rank])
src_block_ids = task["src_block_ids"][block_id_start:block_id_end]
dest_block_ids = task["dest_block_ids"][block_id_start:block_id_end]
src_block_ids = paddle.to_tensor(src_block_ids, dtype="int32", place="cpu")
dest_block_ids = paddle.to_tensor(dest_block_ids, dtype="int32", place="cpu")
logger.info(
f"start write cache for a layer, {req_id}, {layer_idx}, {target_ip}, {target_id}, block_id_start {block_id_start} block_id_end {block_id_end}"
)
tic = time.time()
return_code = self.messager[current_transfer_protocol].write_cache(
target_ip,
target_id,
src_block_ids,
dest_block_ids,
layer_idx,
)
if return_code != 0:
task["status"] = "write cache error"
logger.error(
f"write cache failed, layer_idx: {layer_idx}, req_id: {req_id}, dest_ip: {target_ip}, block_id_start {block_id_start} block_id_end {block_id_end}"
)
tok = time.time()
cost_time = tok - tic
block_num = len(src_block_ids)
avg_time_per_block = cost_time * 1000 / block_num # ms
send_cache_speed = block_num * self.block_bytes / 1073741824 / cost_time # GB/s
logger.debug(
f"finish write cache for a layer, {req_id}, {layer_idx}, {target_ip}, {target_id},"
f"block_num: {block_num}, send_cache_speed(GB/s): {round(send_cache_speed, 5)},"
f"avg_time per block(ms): {round(avg_time_per_block, 5)} block_id_start {block_id_start} block_id_end {block_id_end}"
)
task["sended_layer_id"] += 1
assert task["sended_layer_id"] == layer_idx
if task["sended_layer_id"] == self.num_layers - 1:
self.idx_cache_task_dict[engine_index]["sended_block_num"] += (
block_id_end - block_id_start
)
if current_prefilled_token_num_list[i] == task["need_prefill_tokens"]:
if task["status"] != "error":
task["status"] = "finished"
logger.info(
f"finish write cache for all layers, req_id: {req_id}, block_id_end {block_id_end} need_prefill_tokens {task['need_prefill_tokens']}"
)
else:
task["sended_layer_id"] = -1
if end_layer_idx == self.num_layers - 1:
with self.engine_cache_task_thread_lock:
for engine_idx in engine_indexes:
task = self.idx_cache_task_dict[engine_idx]
if task["status"] == "finished" or ("error" in task["status"]):
target_id = int(task["rdma_ports"][self.rank])
if task["transfer_protocol"] == "ipc":
self.messager["ipc"].write_block_by_sync(target_id)
if self.rank == 0:
# TODO: make this robust under TP; we assume all ranks in TP share the same status (all fail or all succeed).
self.engine_worker_queue.put_finished_req(
[(task["request_id"], task["status"])]
)
logger.info(f"put write cache {task['request_id']}, status {task['status']}")
self.engine_cache_tasks[task["current_id"]] = dict()
del self.cache_info[task["request_id"]]
del self.idx_cache_task_dict[task["current_id"]]
break
except Exception as e:
logger.error(f"prefill layerwise send cache thread has exception: {e} {traceback.format_exc()!s}")
time.sleep(0.01)
def consume_signals(self):
paddle.device.set_device("cpu")
kv_signal_data = paddle.full(shape=[512 * 3 + 2], fill_value=-1, dtype="int32")
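# Signal layout consumed below: [0] = task count (-1 means no signal yet), [1] = layer id,
# then one (engine_idx, chunk_token_offset, current_seq_len) triple per task.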
while True:
try:
get_output_kv_signal(kv_signal_data, self.rank_id, 0) # wait_flag
if not self.cache_info:
time.sleep(0.01)
continue
tasks_count = kv_signal_data[0]
if tasks_count == -1:
time.sleep(0.001)
continue
layer_id = kv_signal_data[1].numpy().tolist()
if layer_id == self.num_layers - 1:
logger.info(f"tasks_count: {tasks_count}, layer_id: {layer_id}")
batch_engine_ids = []
with self.engine_cache_task_thread_lock:
for bi in range(tasks_count):
engine_idx = kv_signal_data[3 * bi + 2].numpy().tolist()
chuck_token_offset = kv_signal_data[3 * bi + 3].numpy().tolist()
current_seq_len = kv_signal_data[3 * bi + 4].numpy().tolist()
self.engine_cache_tasks[engine_idx]["prefilled_layer_idx"] = layer_id
self.engine_cache_tasks[engine_idx]["prefilled_token_num"] = (
chuck_token_offset + current_seq_len
)
batch_engine_ids.append(engine_idx)
if layer_id == 0:
self.cache_prefilled_engine_ids_queue.put(batch_engine_ids)
except Exception as e:
logger.error(f"Consume signals get exception: {e}")
def _handle_connect_task(self):
while True:
try:
task = self.engine_worker_queue.get_connect_rdma_task()
if task is None:
time.sleep(0.001)
continue
logger.info(f"_handle_connect_task recv task: {task}")
task_id = task["task_id"]
ip, rdma_port = task["ip"], task["rdma_port"]
status = self.messager["rdma"].connect(ip, rdma_port)
if not status:
response = {"task_id": task_id, "success": False}
else:
response = {"task_id": task_id, "success": True}
self.engine_worker_queue.put_connect_rdma_task_response(response)
except Exception as e:
logger.error(f"handle_connect_task has exception: {e}")
def main():
device = args.device_id
rank = args.rank
paddle.set_device(f"gpu:{device}")
cache_type = args.cache_dtype
speculative_config = SpeculativeConfig(args.speculative_config)
num_extra_layers = speculative_config.num_extra_cache_layer
num_extra_layer_gpu_blocks = int(args.num_gpu_blocks * speculative_config.num_gpu_block_expand_ratio)
gpu_cache_kvs = {}
gpu_cache_k_tensors = []
gpu_cache_v_tensors = []
for i in range(args.num_layers + num_extra_layers):
num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else num_extra_layer_gpu_blocks
gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"] = paddle.full(
shape=[
num_gpu_blocks,
args.kv_num_head,
args.block_size,
args.head_dim,
],
fill_value=0,
dtype=cache_type,
)
gpu_cache_k_tensors.append(gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"])
gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"] = paddle.full(
shape=[
num_gpu_blocks,
args.kv_num_head,
args.block_size,
args.head_dim,
],
fill_value=0,
dtype=cache_type,
)
gpu_cache_v_tensors.append(gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"])
set_data_ipc(
gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"],
f"key_caches_{i}_rank{rank}.device{device}",
)
set_data_ipc(
gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"],
f"value_caches_{i}_rank{rank}.device{device}",
)
cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in gpu_cache_kvs.items()])
logger.info(f"device :{device}")
logger.info(f"cache_kv_size_byte : {cache_kv_size_byte}")
logger.info(f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}")
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
cache_messager = CacheMessagerV1(
splitwise_role=args.splitwise_role,
transfer_protocol=args.protocol,
pod_ip=args.pod_ip,
engine_worker_queue_port=args.engine_worker_queue_port,
local_data_parallel_id=args.local_data_parallel_id,
gpu_cache_kvs=gpu_cache_kvs,
rank=rank,
nranks=args.mp_num,
num_layers=args.num_layers + num_extra_layers,
gpu_id=device,
rdma_port=args.rdma_port,
)
else:
cache_messager = CacheMessager(
splitwise_role=args.splitwise_role,
transfer_protocol=args.protocol,
pod_ip=args.pod_ip,
engine_worker_queue_port=args.engine_worker_queue_port,
local_data_parallel_id=args.local_data_parallel_id,
gpu_cache_kvs=gpu_cache_kvs,
rank=rank,
nranks=args.mp_num,
num_layers=args.num_layers + num_extra_layers,
gpu_id=device,
rdma_port=args.rdma_port,
)
cache_ready_signal_data = np.zeros(shape=[args.mp_num], dtype=np.int32)
cache_ready_signal = IPCSignal(
name="cache_ready_signal",
array=cache_ready_signal_data,
dtype=np.int32,
suffix=args.engine_pid,
create=False,
)
cache_ready_signal.value[rank] = 1
if args.splitwise_role == "mixed":
while True:
time.sleep(1)
cache_messager.prefill_layerwise_send_cache_thread()
if __name__ == "__main__":
args = parse_args()
rank_id = args.rank + args.local_data_parallel_id * args.mp_num
logger = get_logger("cache_messager", f"cache_messager_rank{rank_id}.log")
logger.info("create cache messager...")
logger.info(f"{args}")
main()

View File

@@ -16,27 +16,21 @@
import argparse
import concurrent.futures
import gc
import json
import queue
import threading
import time
import traceback
import numpy as np
import paddle
from fastdeploy import envs
from fastdeploy.cache_manager.cache_data import CacheStatus
from fastdeploy.config import SpeculativeConfig
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal, KVCacheStatus
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
from fastdeploy.model_executor.ops.gpu import (
cuda_host_alloc,
cuda_host_free,
set_data_ipc,
share_external_data,
swap_cache_all_layers,
unset_data_ipc,
)
from fastdeploy.utils import get_logger
@@ -99,7 +93,6 @@ def parse_args():
help="speculative config",
)
parser.add_argument("--local_data_parallel_id", type=int, default=0)
parser.add_argument("--create_cache_tensor", action="store_true")
args = parser.parse_args()
return args
@@ -117,6 +110,7 @@ class CacheTransferManager:
device = args.device_id
rank = args.rank
paddle.set_device(f"gpu:{device}")
self.gpu_cache_kvs = {}
self.cpu_cache_kvs = {}
self.gpu_cache_k_tensors = []
@@ -132,7 +126,6 @@ class CacheTransferManager:
self.n_ranks = args.mp_num
self.rank = rank
self.device = device
self.engine_pid = args.engine_pid
address = (args.pod_ip, args.cache_queue_port)
self.cache_task_queue = EngineCacheQueue(
@@ -143,27 +136,92 @@ class CacheTransferManager:
local_data_parallel_id=args.local_data_parallel_id,
)
self.num_cpu_blocks = args.num_cpu_blocks
cache_type = args.cache_dtype
for i in range(args.num_layers + self.num_extra_layers):
num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks
self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"] = paddle.full(
shape=[
num_gpu_blocks,
args.kv_num_head,
args.block_size,
args.head_dim,
],
fill_value=0,
dtype=cache_type,
)
self.gpu_cache_k_tensors.append(self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"])
self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"] = paddle.full(
shape=[
num_gpu_blocks,
args.kv_num_head,
args.block_size,
args.head_dim,
],
fill_value=0,
dtype=cache_type,
)
self.gpu_cache_v_tensors.append(self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"])
set_data_ipc(
self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"],
f"key_caches_{i}_rank{rank}.device{device}",
)
set_data_ipc(
self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"],
f"value_caches_{i}_rank{rank}.device{device}",
)
cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()])
logger.info(f"device :{self.device}")
logger.info(f"cache_kv_size_byte : {cache_kv_size_byte}")
logger.info(f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}")
paddle.set_device("cpu")
self.k_dst_ptrs = []
self.v_dst_ptrs = []
for i in range(args.num_layers + self.num_extra_layers):
self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"] = cuda_host_alloc(
args.num_cpu_blocks * args.bytes_per_layer_per_block
)
self.k_dst_ptrs.append(self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"])
self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"] = cuda_host_alloc(
args.num_cpu_blocks * args.bytes_per_layer_per_block
)
self.v_dst_ptrs.append(self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"])
cache_ready_signal_data = np.zeros(shape=[args.mp_num], dtype=np.int32)
self.cache_ready_signal = IPCSignal(
name="cache_ready_signal",
array=cache_ready_signal_data,
dtype=np.int32,
suffix=self.engine_pid,
create=False,
)
swap_space_ready_data = np.zeros(shape=[args.mp_num], dtype=np.int32)
self.swap_space_ready_signal = IPCSignal(
name="swap_space_ready_signal",
array=swap_space_ready_data,
dtype=np.int32,
suffix=self.engine_pid,
suffix=args.engine_pid,
create=False,
)
self.cache_ready_signal.value[self.rank] = 1
self.num_cpu_blocks = args.num_cpu_blocks
paddle.set_device(f"gpu:{device}")
if args.enable_splitwise:
logger.debug("create cache messager...")
logger.info(f"{args}")
from fastdeploy.cache_manager.cache_messager import CacheMessager
self._init_cpu_cache(args)
self._init_gpu_cache(args)
self.cache_messager = CacheMessager(
splitwise_role=args.splitwise_role,
transfer_protocol=args.protocol,
pod_ip=args.pod_ip,
engine_worker_queue_port=args.engine_worker_queue_port,
local_data_parallel_id=args.local_data_parallel_id,
gpu_cache_kvs=self.gpu_cache_kvs,
rank=self.rank,
nranks=args.mp_num,
num_layers=args.num_layers + self.num_extra_layers,
gpu_id=self.device,
rdma_port=args.rdma_port,
)
logger.info("successfully create cache messager")
logger.info(f"done init CacheMessager gmem alloc : {paddle.device.cuda.memory_allocated()}")
cache_task_broadcast_data = np.zeros(shape=[1], dtype=np.int32)
self.cache_task_broadcast_signal = IPCSignal(
@@ -174,76 +232,6 @@ class CacheTransferManager:
create=False,
)
threading.Thread(target=self.clear_or_update_caches, args=[args], daemon=True).start()
def _init_gpu_cache(self, args):
if not args.create_cache_tensor:
logger.info(f"[rank {self.rank}/{self.n_ranks}] Waiting for runners to create kv cache.")
while self.cache_ready_signal.value[self.rank] != 1:
time.sleep(0.1)
logger.info(f"[rank {self.rank}/{self.n_ranks}] OK! Stop waiting.")
logger.info(f"[rank {self.rank}/{self.n_ranks}] Initializing kv cache for all layers.")
paddle.set_device(f"gpu:{self.device}")
for i in range(args.num_layers + self.num_extra_layers):
num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks
cache_shape = [num_gpu_blocks, args.kv_num_head, args.block_size, args.head_dim]
key_name = f"key_caches_{i}_rank{self.rank}.device{self.device}"
val_name = f"value_caches_{i}_rank{self.rank}.device{self.device}"
if args.create_cache_tensor:
logger.info(f"[rank {self.rank}/{self.n_ranks}] ..creating kv cache for layer {i}: {cache_shape}")
key_cache = paddle.full(shape=cache_shape, fill_value=0, dtype=args.cache_dtype)
val_cache = paddle.full(shape=cache_shape, fill_value=0, dtype=args.cache_dtype)
set_data_ipc(key_cache, key_name)
set_data_ipc(val_cache, val_name)
else:
logger.info(f"[rank {self.rank}/{self.n_ranks}] ..attaching kv cache for layer {i}: {cache_shape}")
key_cache = paddle.empty(shape=[], dtype=args.cache_dtype)
val_cache = paddle.empty(shape=[], dtype=args.cache_dtype)
key_cache = share_external_data(key_cache, key_name, cache_shape)
val_cache = share_external_data(val_cache, val_name, cache_shape)
self.gpu_cache_kvs[key_name] = key_cache
self.gpu_cache_kvs[val_name] = val_cache
self.gpu_cache_k_tensors.append(self.gpu_cache_kvs[key_name])
self.gpu_cache_v_tensors.append(self.gpu_cache_kvs[val_name])
if args.create_cache_tensor:
logger.info("[rank {self.rank}/{self.n_ranks}] ✅ kv cache is ready!")
self.cache_ready_signal.value[self.rank] = 1
cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()])
logger.info(f"[rank {self.rank}/{self.n_ranks}] device :{self.device}")
logger.info(f"[rank {self.rank}/{self.n_ranks}] cache_kv_size_byte : {cache_kv_size_byte}")
logger.info(
f"[rank {self.rank}/{self.n_ranks}] done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}"
)
def _init_cpu_cache(self, args):
if args.num_cpu_blocks == 0:
logger.info(f"[rank {self.rank}/{self.n_ranks}] 💡 no swap space (cpu cache) is specified.")
self.swap_space_ready_signal.value[self.rank] = 1
return
logger.info(f"[rank {self.rank}/{self.n_ranks}] Initializing swap space (cpu cache) for all layers.")
paddle.set_device("cpu")
self.k_dst_ptrs = []
self.v_dst_ptrs = []
for i in range(args.num_layers + self.num_extra_layers):
key_name = f"key_caches_{i}_rank{self.rank}"
val_name = f"value_caches_{i}_rank{self.rank}"
need_to_allocate_bytes = args.num_cpu_blocks * args.bytes_per_layer_per_block
logger.info(
f"[rank {self.rank}/{self.n_ranks}] ..creating cpu cache for layer {i}: {2 * need_to_allocate_bytes / 1024 ** 3:.2f}GB"
)
self.cpu_cache_kvs[key_name] = cuda_host_alloc(need_to_allocate_bytes)
self.k_dst_ptrs.append(self.cpu_cache_kvs[key_name])
self.cpu_cache_kvs[val_name] = cuda_host_alloc(need_to_allocate_bytes)
self.v_dst_ptrs.append(self.cpu_cache_kvs[val_name])
logger.info(f"[rank {self.rank}/{self.n_ranks}] ✅ swap space (cpu cache) is ready!")
self.swap_space_ready_signal.value[self.rank] = 1
def _do_swap_to_cpu_task(
self,
swap_node_ids,
@@ -441,92 +429,6 @@ class CacheTransferManager:
transfer_task_id,
)
def clear_or_update_caches(self, args):
logger.info("Start a thread to clear/restore kv cache when model weights are cleared/updated.")
logger.info(f"FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}")
kv_cache_status = np.zeros([1], dtype=np.int32)
kv_cache_status_signal = IPCSignal(
name="kv_cache_status",
array=kv_cache_status,
dtype=np.int32,
suffix=self.engine_pid,
create=False,
)
while True:
if kv_cache_status_signal.value[0] == KVCacheStatus.CLEARING:
try:
logger.info(
f"[rank {self.rank}/{self.n_ranks}] Start clearing caches {self.cache_ready_signal.value}"
)
# clear cpu caches
if envs.FD_ENABLE_SWAP_SPACE_CLEARING:
paddle.set_device("cpu")
for ptrs in self.k_dst_ptrs + self.v_dst_ptrs:
cuda_host_free(ptrs)
self.cpu_cache_kvs.clear()
self.k_dst_ptrs.clear()
self.v_dst_ptrs.clear()
gc.collect()
# reset swap_space_ready_signal
self.swap_space_ready_signal.value[self.rank] = 0
while np.sum(self.swap_space_ready_signal.value) != 0:
time.sleep(0.1)
# clear gpu caches
paddle.set_device(f"gpu:{self.device}")
for name, tensor in self.gpu_cache_kvs.items():
unset_data_ipc(tensor, name, True, False)
self.gpu_cache_kvs.clear()
self.gpu_cache_k_tensors.clear()
self.gpu_cache_v_tensors.clear()
# reset cache_ready_signal
self.cache_ready_signal.value[self.rank] = 0
logger.info(
f"[rank {self.rank}/{self.n_ranks}] Finish clearing caches {self.cache_ready_signal.value}"
)
# wait for all ranks caches to be cleared
if np.sum(self.cache_ready_signal.value) != 0:
time.sleep(0.1)
# reset kv_cache_status_signal
kv_cache_status_signal.value[0] = KVCacheStatus.CLEARED
logger.info("All ranks finish clearing caches")
except Exception as e:
logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to clear caches: {e}")
elif kv_cache_status_signal.value[0] == KVCacheStatus.UPDATING:
try:
logger.info(
f"[rank {self.rank}/{self.n_ranks}] Start restoring caches {self.cache_ready_signal.value}"
)
# restore cpu cache
if envs.FD_ENABLE_SWAP_SPACE_CLEARING:
self._init_cpu_cache(args)
while np.sum(self.swap_space_ready_signal.value) != args.mp_num:
time.sleep(0.1)
# restore gpu cache and set cache_ready_signal
self._init_gpu_cache(args)
logger.info(
f"[rank {self.rank}/{self.n_ranks}] Finish restoring caches {self.cache_ready_signal.value}"
)
# wait for all ranks caches to be ready
while np.sum(self.cache_ready_signal.value) != args.mp_num:
time.sleep(0.1)
# set kv_cache_status_signal
logger.info("All ranks finish restoring caches")
kv_cache_status_signal.value[0] = KVCacheStatus.NORMAL
except Exception as e:
logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to restore caches: {e}")
time.sleep(0.1)
def main():
"""
@@ -541,7 +443,5 @@ def main():
if __name__ == "__main__":
args = parse_args()
rank_id = args.rank + args.local_data_parallel_id * args.mp_num
logger = get_logger("cache_transfer_manager", f"cache_transfer_manager_rank{rank_id}.log")
paddle.set_device(f"gpu:{args.device_id}")
logger = get_logger("cache_transfer_manager", "cache_transfer_manager.log")
main()

View File

@@ -31,7 +31,7 @@ import numpy as np
from fastdeploy import envs
from fastdeploy.cache_manager.cache_data import BlockNode, CacheStatus
from fastdeploy.cache_manager.cache_metrics import CacheMetrics
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal, PrefixTreeStatus
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.utils import get_logger
@@ -71,7 +71,6 @@ class PrefixCacheManager:
else:
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
if self.num_cpu_blocks > 0:
self.cpu_free_block_list = list(range(self.num_cpu_blocks - 1, -1, -1))
@@ -79,7 +78,6 @@ class PrefixCacheManager:
self.cpu_free_block_list = []
heapq.heapify(self.gpu_free_block_list)
heapq.heapify(self.cpu_free_block_list)
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
self.radix_tree_root = BlockNode(-1, [], 0, 0, -1, 0, None, None, None)
@@ -113,10 +111,6 @@ class PrefixCacheManager:
+ f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}"
)
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
main_process_metrics.available_gpu_resource.set(1.0)
@property
def available_gpu_resource(self):
return len(self.gpu_free_block_list) / self.num_gpu_blocks if self.num_gpu_blocks > 0 else 0.0
@@ -129,7 +123,6 @@ class PrefixCacheManager:
pod_ip,
engine_worker_queue_port,
pid_suffix,
create_cache_tensor,
):
"""
launch_cache_manager function used to initialize the cache manager.
@@ -140,7 +133,7 @@ class PrefixCacheManager:
name="cache_task_broadcast_signal",
array=broadcast_cache_task_flag_array,
dtype=np.int32,
suffix=engine_worker_queue_port,
suffix=pid_suffix,
create=True,
)
@@ -157,20 +150,6 @@ class PrefixCacheManager:
filename = "cache_transfer_manager.py"
py_path = os.path.join(current_dir_path, filename)
cache_messager_processes = []
if self.enable_splitwise:
cache_messager_processes = self.launch_cache_messager(
cache_config,
tensor_parallel_size,
device_ids,
pod_ip,
engine_worker_queue_port,
pid_suffix,
)
if cache_messager_processes is None:
raise RuntimeError("Launch cache messager failed")
return []
if (
hasattr(cache_config.model_cfg, "num_key_value_heads")
@@ -181,41 +160,20 @@ class PrefixCacheManager:
else:
kv_num_head = cache_config.model_cfg.num_attention_heads // tensor_parallel_size
kv_num_head = max(1, kv_num_head)
cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
self.cache_ready_signal = IPCSignal(
name="cache_ready_signal",
array=cache_ready_signal_data,
dtype=np.int32,
suffix=engine_worker_queue_port,
create=False,
suffix=pid_suffix,
create=True,
)
swap_space_ready_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
self.swap_space_ready_signal = IPCSignal(
name="swap_space_ready_signal",
array=swap_space_ready_data,
dtype=np.int32,
suffix=engine_worker_queue_port,
create=False,
)
prefix_tree_status = np.zeros([1], dtype=np.int32)
self.prefix_tree_status_signal = IPCSignal(
name="prefix_tree_status",
array=prefix_tree_status,
dtype=np.int32,
suffix=engine_worker_queue_port,
create=False,
)
# Run command to launch cache transfer managers
logger.info(f"create_cache_tensor: {create_cache_tensor}")
log_dir = envs.FD_LOG_DIR
cache_manager_processes = []
for i in range(tensor_parallel_size):
launch_cmd = (
"FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
+ " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0"
+ f" FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}"
+ f" {sys.executable} {py_path}"
+ f" --device_id {int(device_ids[i])}"
+ f" --rank {i}"
@@ -238,104 +196,24 @@ class PrefixCacheManager:
+ f" --local_data_parallel_id {self.local_data_parallel_id}"
+ f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}"
+ f" --speculative_config '{self.speculative_config.to_json_string()}'"
+ (" --create_cache_tensor" if create_cache_tensor else "")
+ f" >{log_dir}/launch_cache_manager_{int(device_ids[i])}.log 2>&1"
)
logger.info(f"Launch cache transfer manager, command:{launch_cmd}")
cache_manager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid))
logger.info("PrefixCacheManager is waiting for kv cache to be initialized.")
# wait for the cache to finish initializing
logger.info("Waiting for cache transfer manager ready...")
while np.sum(self.cache_ready_signal.value) != tensor_parallel_size:
time.sleep(1)
if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0:
while np.sum(self.swap_space_ready_signal.value) != tensor_parallel_size:
time.sleep(1)
exit_code = cache_manager_processes[-1].poll()
if exit_code is None:
logger.info("Launch cache transfer manager successful")
else:
logger.info("Launch cache transfer manager failed, see launch_cache_manager.log for more information")
# Start additional threads
if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0:
logger.info("Enable hierarchical cache.")
threading.Thread(target=self.recv_data_transfer_result).start()
if cache_config.enable_prefix_caching:
threading.Thread(target=self.clear_prefix_cache, daemon=True).start()
all_cache_processes = cache_messager_processes + cache_manager_processes
return all_cache_processes
def launch_cache_messager(
self, cache_config, tensor_parallel_size, device_ids, pod_ip, engine_worker_queue_port, pid_suffix
):
"""
launch_cache_messager function used to initialize the cache messager.
"""
current_dir_path = os.path.split(os.path.abspath(__file__))[0]
filename = "cache_messager.py"
if (
hasattr(cache_config.model_cfg, "num_key_value_heads")
and cache_config.model_cfg.num_key_value_heads is not None
and int(cache_config.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(cache_config.model_cfg.num_key_value_heads) // tensor_parallel_size
else:
kv_num_head = cache_config.model_cfg.num_attention_heads // tensor_parallel_size
cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
self.cache_ready_signal = IPCSignal(
name="cache_ready_signal",
array=cache_ready_signal_data,
dtype=np.int32,
suffix=pid_suffix,
create=False,
)
py_path = os.path.join(current_dir_path, filename)
log_dir = envs.FD_LOG_DIR
cache_messager_processes = []
for i in range(tensor_parallel_size):
launch_cmd = (
"FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
+ " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0"
+ f" {sys.executable} {py_path}"
+ f" --device_id {int(device_ids[i])}"
+ f" --rank {i}"
+ f" --splitwise_role {self.splitwise_role}"
+ f" --num_layers {cache_config.model_cfg.num_hidden_layers}"
+ f" --head_dim {cache_config.model_cfg.head_dim}"
+ f" --kv_num_head {kv_num_head}"
+ f" --mp_num {tensor_parallel_size}"
+ f" --cache_dtype {cache_config.cache_dtype}"
+ f" --pod_ip {pod_ip}"
+ f" --cache_queue_port {cache_config.cache_queue_port}"
+ f" --engine_worker_queue_port {engine_worker_queue_port}"
+ f" --num_gpu_blocks {cache_config.total_block_num}"
+ f" --block_size {cache_config.block_size}"
+ f" --protocol {cache_config.cache_transfer_protocol}"
+ f" --local_data_parallel_id {self.local_data_parallel_id}"
+ f" --engine_pid {pid_suffix}"
+ f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}"
+ f" --speculative_config '{self.speculative_config.to_json_string()}'"
+ f" >{log_dir}/launch_cache_messager_{int(device_ids[i])}.log 2>&1"
)
logger.info(f"Launch cache messager, command:{launch_cmd}")
cache_messager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid))
logger.info("Waiting for cache ready...")
while np.sum(self.cache_ready_signal.value) != tensor_parallel_size:
time.sleep(1)
exit_code = cache_messager_processes[-1].poll()
if exit_code is None:
logger.info("Launch cache messager successful")
else:
logger.info("Launch cache messager failed, see launch_cache_messager.log for more information")
cache_messager_processes = None
return cache_messager_processes
self._enable_cpu_cache()
return cache_manager_processes
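# --- Illustrative sketch (not part of the diff): the readiness handshake used by the
# launch functions above, with threads and a plain numpy array standing in for the
# per-rank worker processes and the shared IPCSignal. Names here are placeholders.
import threading
import time

import numpy as np

tensor_parallel_size = 4
cache_ready_signal = np.zeros(tensor_parallel_size, dtype=np.int32)

def _fake_rank(rank: int) -> None:
    time.sleep(0.05 * rank)        # pretend to allocate this rank's KV cache
    cache_ready_signal[rank] = 1   # flip this rank's slot once ready

threads = [threading.Thread(target=_fake_rank, args=(r,)) for r in range(tensor_parallel_size)]
for t in threads:
    t.start()

# Same busy-wait loop as above: block until every rank has reported readiness.
while np.sum(cache_ready_signal) != tensor_parallel_size:
    time.sleep(0.01)
for t in threads:
    t.join()
print("all ranks ready")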
def update_cache_config(self, cache_config):
"""
@@ -357,9 +235,23 @@ class PrefixCacheManager:
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
main_process_metrics.available_gpu_resource.set(1.0)
def _enable_cpu_cache(self):
"""
_enable_cpu_cache function used to enable cpu cache.
"""
# ipc_cache_queue_port = self.cache_config.cache_queue_port
# self.cache_task_queue = CacheQueueManager(
# rank=0,
# mp_num=tensor_parallel_size,
# port=ipc_cache_queue_port,
# )
# Start the listener thread that collects data-transfer task results
self.transfer_recv_thread = threading.Thread(target=self.recv_data_transfer_result)
self.transfer_recv_thread.start()
def can_allocate_gpu_blocks(self, num_blocks: int):
"""
Check if num_blocks gpu blocks can be allocated.
@@ -1403,70 +1295,3 @@ class PrefixCacheManager:
except Exception as e:
logger.warning(f"recv_data_transfer_result: error: {e}, {str(traceback.format_exc())}")
raise e
def reset(self):
"""
Reset the RadixTree.
"""
if len(self.node_map) == 0:
return
logger.info("Resetting the RadixTree!")
# wait for swap tasks to finish
if self.gpu_free_task_future is not None:
self.gpu_free_task_future.result()
self.gpu_free_task_future = None
for event in list(self.task_swapping_event.values()):
event.wait()
self.task_swapping_event.clear()
# clear node map
self.node_map.clear()
self.req_leaf_map.clear()
self.leaf_req_map.clear()
self.unfilled_req_block_map.clear()
self.cache_info.clear()
# reset gpu cache data structure
self.gpu_lru_leaf_heap.clear()
self.gpu_lru_leaf_set.clear()
# reset cpu cache data structure
self.cpu_lru_leaf_heap.clear()
self.cpu_lru_leaf_set.clear()
# reset gpu/cpu free block list
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
if self.num_cpu_blocks > 0:
self.cpu_free_block_list = list(range(self.num_cpu_blocks - 1, -1, -1))
else:
self.cpu_free_block_list = []
heapq.heapify(self.gpu_free_block_list)
heapq.heapify(self.cpu_free_block_list)
# reset node/tree
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
self.radix_tree_root = BlockNode(-1, [], 0, 0, -1, 0, None, None, None)
# reset metrics
self.metrics.reset_metrics()
main_process_metrics.free_gpu_block_num.set(len(self.gpu_free_block_list))
main_process_metrics.available_gpu_resource.set(self.available_gpu_resource)
def clear_prefix_cache(self):
"""
If the model weights status is updating or clearing, reset prefix cache tree
"""
logger.info("Start a thread to clear prefix cache when model weights are cleared.")
prefix_tree_status_signal = self.prefix_tree_status_signal
while True:
if prefix_tree_status_signal.value[0] == PrefixTreeStatus.CLEARING:
self.reset()
prefix_tree_status_signal.value[0] = PrefixTreeStatus.CLEARED
logger.info("Prefix cache tree is cleared.")
if prefix_tree_status_signal.value[0] == PrefixTreeStatus.UPDATING:
prefix_tree_status_signal.value[0] = PrefixTreeStatus.NORMAL
logger.info("Prefix cache tree is updated.")
time.sleep(0.01)
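# --- Illustrative sketch (not part of the diff): the status handshake polled by
# clear_prefix_cache above. An IntEnum and a one-element list stand in for
# PrefixTreeStatus and the shared IPCSignal; the numeric values are assumptions.
import threading
import time
from enum import IntEnum

class DemoPrefixTreeStatus(IntEnum):
    NORMAL = 0
    CLEARING = 1
    CLEARED = 2
    UPDATING = 3

status_signal = [DemoPrefixTreeStatus.NORMAL]

def cache_manager_loop() -> None:
    while True:
        if status_signal[0] == DemoPrefixTreeStatus.CLEARING:
            # reset() would rebuild the radix tree here
            status_signal[0] = DemoPrefixTreeStatus.CLEARED
            return
        time.sleep(0.01)

t = threading.Thread(target=cache_manager_loop)
t.start()
status_signal[0] = DemoPrefixTreeStatus.CLEARING   # controller requests a clear
while status_signal[0] != DemoPrefixTreeStatus.CLEARED:
    time.sleep(0.01)                               # controller waits for the ack
t.join()
print("prefix cache cleared")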

View File

@@ -61,12 +61,18 @@ class RDMACommManager:
Connect to remote gpu and write cache.
"""
assert self.splitwise_role == "prefill", "only prefill can call this method"
addr = f"{ip}:{port!s}"
if addr in self.connected_rdma:
return True
ret = self.messager.is_connected(ip, str(port))
if ret:
self.connected_rdma.add(addr)
return True
ret = self.messager.connect(ip, str(port))
logger.info(f"connect to remote rdma address {ip}:{port} status is {ret}")
if ret == 0:
self.connected_rdma.add(addr)
return ret == 0
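# --- Illustrative sketch (not part of the diff): the connect-once cache shown above.
# FakeMessager is a stand-in for the RDMA messager object; an address is dialed at
# most once and later calls short-circuit on the cached set.
class FakeMessager:
    def __init__(self):
        self.dialed = []
    def is_connected(self, ip, port):
        return f"{ip}:{port}" in self.dialed
    def connect(self, ip, port):
        self.dialed.append(f"{ip}:{port}")
        return 0                      # 0 means success, mirroring the code above

connected_rdma = set()
messager = FakeMessager()

def connect_once(ip, port):
    addr = f"{ip}:{port!s}"
    if addr in connected_rdma:
        return True                   # cached: skip the RDMA handshake entirely
    if messager.is_connected(ip, str(port)):
        connected_rdma.add(addr)
        return True
    ret = messager.connect(ip, str(port))
    if ret == 0:
        connected_rdma.add(addr)
    return ret == 0

print(connect_once("10.0.0.2", 8500), connect_once("10.0.0.2", 8500))  # True True
print(len(messager.dialed))                                            # dialed exactly once -> 1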
def write_cache(self, ip, port, local_block_ids, remote_block_ids, layer_idx):

View File

@@ -18,14 +18,12 @@ from __future__ import annotations
import json
import os
from dataclasses import field
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
import paddle
import paddle.distributed as dist
from paddleformers.transformers.configuration_utils import PretrainedConfig
from typing_extensions import assert_never
import fastdeploy
from fastdeploy import envs
@@ -33,68 +31,11 @@ from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfig
from fastdeploy.multimodal.registry import MultimodalRegistry
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.transformer_utils.config import get_pooling_config
from fastdeploy.utils import ceil_div, check_unified_ckpt, get_host_ip, get_logger
logger = get_logger("config", "config.log")
TaskOption = Literal["auto", "generate", "embedding", "embed"]
RunnerType = Literal["generate", "pooling"]
RunnerOption = Literal["auto", "generate", "pooling"]
ConvertOption = Literal["auto", "none", "embed"]
ConvertType = Literal["none", "embed"]
_ResolvedTask = Literal["generate", "encode", "embed"]
_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
"generate": [],
"pooling": ["embed"],
}
# Some model suffixes are based on auto classes from Transformers:
# https://huggingface.co/docs/transformers/en/model_doc/auto
# NOTE: Items higher on this list take priority over lower ones
_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
("ForCausalLM", ("generate", "none")),
("ForConditionalGeneration", ("generate", "none")),
("ChatModel", ("generate", "none")),
("LMHeadModel", ("generate", "none")),
("ForTextEncoding", ("pooling", "embed")),
("EmbeddingModel", ("pooling", "embed")),
("ForSequenceClassification", ("pooling", "classify")),
("ForAudioClassification", ("pooling", "classify")),
("ForImageClassification", ("pooling", "classify")),
("ForVideoClassification", ("pooling", "classify")),
("ClassificationModel", ("pooling", "classify")),
("ForRewardModeling", ("pooling", "reward")),
("RewardModel", ("pooling", "reward")),
# Let other `*Model`s take priority
("Model", ("pooling", "embed")),
]
def iter_architecture_defaults():
yield from _SUFFIX_TO_DEFAULTS
def try_match_architecture_defaults(
architecture: str,
*,
runner_type: Optional[RunnerType] = None,
convert_type: Optional[ConvertType] = None,
):
for suffix, (default_runner_type, default_convert_type) in iter_architecture_defaults():
if (
(runner_type is None or runner_type == default_runner_type)
and (convert_type is None or convert_type == default_convert_type)
and architecture.endswith(suffix)
):
return suffix, (default_runner_type, default_convert_type)
return None
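# --- Illustrative sketch (not part of the diff): how the suffix lookup above resolves
# defaults. A trimmed copy of the table is inlined so the sketch runs on its own, and
# the architecture names are hypothetical examples.
_DEMO_SUFFIX_TO_DEFAULTS = [
    ("ForCausalLM", ("generate", "none")),
    ("EmbeddingModel", ("pooling", "embed")),
    ("Model", ("pooling", "embed")),  # catch-all, lowest priority
]

def demo_match(architecture: str):
    # Iterate in priority order and return the first matching suffix, like
    # try_match_architecture_defaults above.
    for suffix, defaults in _DEMO_SUFFIX_TO_DEFAULTS:
        if architecture.endswith(suffix):
            return suffix, defaults
    return None

print(demo_match("DemoForCausalLM"))     # ('ForCausalLM', ('generate', 'none'))
print(demo_match("DemoEmbeddingModel"))  # ('EmbeddingModel', ('pooling', 'embed'))
print(demo_match("DemoTokenizer"))       # None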
TaskOption = Literal["generate"]
class MoEPhase:
@@ -192,12 +133,6 @@ class ModelConfig:
self.eos_tokens_lens: int = 2
self.lm_head_fp32: bool = False
self.model_format = "auto"
self.runner = "auto"
self.convert = "auto"
self.pooler_config: Optional["PoolerConfig"] = field(init=False)
self.override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None
self.revision = None
self.partial_rotary_factor: float = 1.0
self.num_nextn_predict_layers = 0
for key, value in args.items():
@@ -224,10 +159,8 @@ class ModelConfig:
self.vision_config = PretrainedConfig.from_dict(self.vision_config)
self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
self.think_end_id = args.get("think_end_id", -1)
architectures = self.architectures[0]
if MultimodalRegistry.contains_model(architectures):
self.enable_mm = True
else:
@@ -238,43 +171,6 @@ class ModelConfig:
self.override_name_from_config()
self.read_from_env()
self.read_model_config()
self.runner_type = self._get_runner_type(self.architectures, self.runner)
self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert)
registry = self.registry
is_generative_model = registry.is_text_generation_model(self.architectures, self)
is_pooling_model = registry.is_pooling_model(self.architectures, self)
is_multimodal_model = registry.is_multimodal_model(self.architectures, self)
if self.runner_type == "generate" and not is_generative_model:
if is_multimodal_model:
pass
else:
generate_converts = _RUNNER_CONVERTS["generate"]
if self.convert_type not in generate_converts:
raise ValueError("This model does not support '--runner generate.")
if self.runner_type == "pooling" and not is_pooling_model:
pooling_converts = _RUNNER_CONVERTS["pooling"]
if self.convert_type not in pooling_converts:
convert_option = "<" + "|".join(pooling_converts) + ">"
raise ValueError(
"This model does not support `--runner pooling`. "
f"You can pass `--convert {convert_option} to adapt "
"it into a pooling model."
)
self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type)
model_info, arch = registry.inspect_model_cls(self.architectures, self)
self._model_info = model_info
self._architecture = arch
self.pooler_config = self._init_pooler_config()
@property
def registry(self):
from fastdeploy.model_executor.models.model_base import ModelRegistry
return ModelRegistry()
def override_name_from_config(self):
"""
@@ -298,6 +194,7 @@ class ModelConfig:
def read_from_env(self):
"""
Read configuration information from environment variables and update the object's attributes.
If an attribute is not present or is an empty string in the environment variables, use the default value.
"""
self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
@@ -338,165 +235,6 @@ class ModelConfig:
f"Config file path: {config_path}"
)
def _get_default_runner_type(
self,
architectures: list[str],
) -> RunnerType:
registry = self.registry
if get_pooling_config(self.model, self.revision):
return "pooling"
for arch in architectures:
if arch in registry.get_supported_archs():
if registry.is_pooling_model(architectures, self):
return "pooling"
if registry.is_text_generation_model(architectures, self):
return "generate"
match = try_match_architecture_defaults(arch)
if match:
_, (runner_type, _) = match
return runner_type
return "generate"
def _get_default_convert_type(
self,
architectures: list[str],
runner_type: RunnerType,
) -> ConvertType:
registry = self.registry
for arch in architectures:
if arch in registry.get_supported_archs():
if runner_type == "generate" and registry.is_text_generation_model(architectures, self):
return "none"
if runner_type == "pooling" and registry.is_pooling_model(architectures, self):
return "none"
match = try_match_architecture_defaults(arch, runner_type=runner_type)
if match:
_, (_, convert_type) = match
return convert_type
# This is to handle Sentence Transformers models that use *ForCausalLM
# and also multi-modal pooling models which are not defined as
# Sentence Transformers models
if runner_type == "pooling":
return "embed"
return "none"
def _get_runner_type(
self,
architectures: list[str],
runner: RunnerOption,
) -> RunnerType:
if runner != "auto":
return runner
runner_type = self._get_default_runner_type(architectures)
if runner_type != "generate":
logger.info(
"Resolved `--runner auto` to `--runner %s`. " "Pass the value explicitly to silence this message.",
runner_type,
)
return runner_type
def _get_convert_type(
self,
architectures: list[str],
runner_type: RunnerType,
convert: ConvertOption,
) -> ConvertType:
if convert != "auto":
return convert
convert_type = self._get_default_convert_type(architectures, runner_type)
if convert_type != "none":
logger.info(
"Resolved `--convert auto` to `--convert %s`. " "Pass the value explicitly to silence this message.",
convert_type,
)
return convert_type
def _get_supported_generation_tasks(
self,
architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]:
registry = self.registry
supported_tasks = list[_ResolvedTask]()
if registry.is_text_generation_model(architectures, self) or convert_type in _RUNNER_CONVERTS["generate"]:
supported_tasks.append("generate")
# TODO: Transcription is temporarily not supported.
return supported_tasks
def _get_default_pooling_task(
self,
architectures: list[str],
) -> Literal["embed"]:
# Classification and reward are temporarily not supported.
for arch in architectures:
match = try_match_architecture_defaults(arch, runner_type="pooling")
if match:
_, (_, convert_type) = match
assert convert_type != "none"
return convert_type
return "embed"
def _get_supported_pooling_tasks(
self,
architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]:
registry = self.registry
supported_tasks = list[_ResolvedTask]()
if registry.is_pooling_model(architectures, self) or convert_type in _RUNNER_CONVERTS["pooling"]:
supported_tasks.append("encode")
extra_task = self._get_default_pooling_task(architectures) if convert_type == "none" else convert_type
supported_tasks.append(extra_task)
return supported_tasks
def _get_supported_tasks(
self,
architectures: list[str],
runner_type: RunnerType,
convert_type: ConvertType,
) -> list[_ResolvedTask]:
if runner_type == "generate":
return self._get_supported_generation_tasks(architectures, convert_type)
if runner_type == "pooling":
return self._get_supported_pooling_tasks(architectures, convert_type)
assert_never(runner_type)
def _init_pooler_config(self) -> Optional["PoolerConfig"]:
if self.runner_type == "pooling":
if isinstance(self.override_pooler_config, dict):
self.override_pooler_config = PoolerConfig(**self.override_pooler_config)
pooler_config = self.override_pooler_config or PoolerConfig()
base_config = get_pooling_config(self.model, self.revision)
if base_config is not None:
for k, v in base_config.items():
if getattr(pooler_config, k) is None:
setattr(pooler_config, k, v)
default_pooling_type = self._model_info.default_pooling_type
if pooler_config.pooling_type is None:
pooler_config.pooling_type = default_pooling_type
return pooler_config
return None
def _get_download_model(self, model_name, model_type="default"):
# TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
pass
@@ -520,6 +258,7 @@ class ParallelConfig:
):
self.sequence_parallel = False # Whether to enable sequence parallelism.
self.use_ep = False # Whether to enable Expert Parallelism
self.moe_phase = MoEPhase("prefill") # Generation phase
self.msg_queue_id = 1 # message queue id
self.tensor_parallel_rank = 0 # TP rank ID
@@ -557,6 +296,8 @@ class ParallelConfig:
# Do profile or not
self.do_profile: bool = False
# splitwise role
self.splitwise_role: str = "mixed"
# guided decoding backend
self.guided_decoding_backend: str = None
# disable any whitespace for guided decoding
@@ -578,6 +319,14 @@ class ParallelConfig:
else:
self.expert_parallel_size = 1
self.use_ep = self.expert_parallel_size > 1
if self.splitwise_role == "mixed":
self.moe_phase = MoEPhase(phase="prefill")
elif self.splitwise_role == "prefill":
self.moe_phase = MoEPhase(phase="prefill")
elif self.splitwise_role == "decode":
self.moe_phase = MoEPhase(phase="decode")
else:
raise NotImplementedError
# pd_disaggregation
use_pd_disaggregation: int = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
@@ -589,24 +338,20 @@ class ParallelConfig:
else:
self.pd_disaggregation_mode = "None"
def set_communicate_group(self):
def set_tp_group(self):
# different tp group id
# prevent different tp_groups using the same group_id
tp_gid_offset = envs.FD_TP_GROUP_GID_OFFSET
dist.collective._set_custom_gid(self.data_parallel_rank + tp_gid_offset)
self.tp_group = dist.new_group(
range(
self.data_parallel_rank * self.tensor_parallel_size,
(self.data_parallel_rank + 1) * self.tensor_parallel_size,
)
)
dist.collective._set_custom_gid(None)
# same ep group id
if self.enable_expert_parallel:
dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
self.ep_group = dist.new_group(range(self.expert_parallel_size))
dist.collective._set_custom_gid(None)
dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
self.ep_group = dist.new_group(range(self.expert_parallel_size))
logger.info(
f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}."
)
@@ -839,13 +584,8 @@ class GraphOptimizationConfig:
Capturing both decode-only and prefill-only graphs is not supported yet."""
self.full_cuda_graph: bool = True
""" Maximum CUDA Graph capture size """
self.max_capture_size: int = None
""" Record maps mapped from real shape to captured size to reduce runtime overhead """
self.real_shape_to_captured_size: dict[int, int] = None
""" Whether to use shared memory pool for multi capture_size """
self.use_unique_memory_pool: bool = False
# CINN Config ...
if args is not None:
for key, value in args.items():
@@ -948,63 +688,63 @@ class GraphOptimizationConfig:
argument = self.use_cudagraph
class PlasAttentionConfig:
class MobaAttentionConfig:
def __init__(
self,
args,
):
self.plas_encoder_top_k_left: int = None
self.plas_encoder_top_k_right: int = None
"The sparse topk of encoder attention is located at [plas_encoder_top_k_left, plas_encoder_top_k_right]"
self.plas_decoder_top_k_left: int = None
self.plas_decoder_top_k_right: int = None
"The sparse topk of decoder attention is located at [plas_decoder_top_k_left, plas_decoder_top_k_right]"
self.plas_use_encoder_seq_limit: int = None
"When the number of encoder tokens is less than plas_use_encoder_seq_limit, it is not sparse"
self.plas_use_decoder_seq_limit: int = None
"When the number of decoder tokens is less than plas_use_decoder_seq_limit, it is not sparse"
self.plas_block_size: int = 128
self.mlp_weight_name: str = "plas_attention_mlp_weight.safetensors"
self.plas_max_seq_length: int = 128 * 1024
self.moba_encoder_top_k_left: int = None
self.moba_encoder_top_k_right: int = None
"The sparse topk of encoder attention is located at [moba_encoder_top_k_left, moba_encoder_top_k_right]"
self.moba_decoder_top_k_left: int = None
self.moba_decoder_top_k_right: int = None
"The sparse topk of decoder attention is located at [moba_decoder_top_k_left, moba_decoder_top_k_right]"
self.moba_use_encoder_seq_limit: int = None
"When the number of encoder tokens is less than moba_use_encoder_seq_limit, it is not sparse"
self.moba_use_decoder_seq_limit: int = None
"When the number of decoder tokens is less than moba_use_decoder_seq_limit, it is not sparse"
self.moba_block_size: int = 128
self.mlp_weight_name: str = "moba_mlp_weight.safetensors"
self.moba_max_seq_length: int = 128 * 1024
if args is not None:
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
if self.plas_use_encoder_seq_limit is None and self.plas_encoder_top_k_left is not None:
self.plas_use_encoder_seq_limit = self.plas_encoder_top_k_left * self.plas_block_size
if self.plas_use_decoder_seq_limit is None and self.plas_decoder_top_k_left is not None:
self.plas_use_decoder_seq_limit = self.plas_decoder_top_k_left * self.plas_block_size
if self.moba_use_encoder_seq_limit is None and self.moba_encoder_top_k_left is not None:
self.moba_use_encoder_seq_limit = self.moba_encoder_top_k_left * self.moba_block_size
if self.moba_use_decoder_seq_limit is None and self.moba_decoder_top_k_left is not None:
self.moba_use_decoder_seq_limit = self.moba_decoder_top_k_left * self.moba_block_size
self.check_legality_parameters()
def check_legality_parameters(
self,
) -> None:
if self.plas_encoder_top_k_left is not None:
assert self.plas_encoder_top_k_left > 0, "plas_encoder_top_k_left must be larger than 0"
if self.moba_encoder_top_k_left is not None:
assert self.moba_encoder_top_k_left > 0, "moba_encoder_top_k_left must be larger than 0"
if self.plas_encoder_top_k_right is not None:
assert self.plas_encoder_top_k_right > 0, "plas_encoder_top_k_right must be larger than 0"
if self.moba_encoder_top_k_right is not None:
assert self.moba_encoder_top_k_right > 0, "moba_encoder_top_k_right must be larger than 0"
assert (
self.plas_encoder_top_k_right >= self.plas_encoder_top_k_left
), "plas_encoder_top_k_right must be no less than plas_encoder_top_k_left"
self.moba_encoder_top_k_right >= self.moba_encoder_top_k_left
), "moba_encoder_top_k_right must be no less than moba_encoder_top_k_left"
if self.plas_decoder_top_k_left is not None:
assert self.plas_decoder_top_k_left > 0, "plas_decoder_top_k_left must be larger than 0"
if self.moba_decoder_top_k_left is not None:
assert self.moba_decoder_top_k_left > 0, "moba_decoder_top_k_left must be larger than 0"
if self.plas_decoder_top_k_right is not None:
assert self.plas_decoder_top_k_right > 0, "plas_decoder_top_k_right must be larger than 0"
if self.moba_decoder_top_k_right is not None:
assert self.moba_decoder_top_k_right > 0, "moba_decoder_top_k_right must be larger than 0"
assert (
self.plas_decoder_top_k_right >= self.plas_decoder_top_k_left
), "plas_decoder_top_k_right must be no less than plas_decoder_top_k_left"
self.moba_decoder_top_k_right >= self.moba_decoder_top_k_left
), "moba_decoder_top_k_right must be no less than moba_decoder_top_k_left"
if self.plas_use_encoder_seq_limit is not None and self.plas_encoder_top_k_left is not None:
assert self.plas_use_encoder_seq_limit >= self.plas_encoder_top_k_left * self.plas_block_size
if self.plas_use_decoder_seq_limit is not None and self.plas_decoder_top_k_left is not None:
assert self.plas_use_decoder_seq_limit >= self.plas_decoder_top_k_left * self.plas_block_size
if self.moba_use_encoder_seq_limit is not None and self.moba_encoder_top_k_left is not None:
assert self.moba_use_encoder_seq_limit >= self.moba_encoder_top_k_left * self.moba_block_size
if self.moba_use_decoder_seq_limit is not None and self.moba_decoder_top_k_left is not None:
assert self.moba_use_decoder_seq_limit >= self.moba_decoder_top_k_left * self.moba_block_size
def to_json_string(self):
"""
Convert plas_attention_config to json string.
Convert moba_attention_config to json string.
"""
return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
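# --- Illustrative sketch (not part of the diff): the derived seq-limit default checked
# above, shown with the moba_* names from the updated side of this hunk (the plas_*
# variant is identical). The concrete numbers are placeholders.
moba_block_size = 128
moba_encoder_top_k_left = 4
moba_use_encoder_seq_limit = None

if moba_use_encoder_seq_limit is None and moba_encoder_top_k_left is not None:
    moba_use_encoder_seq_limit = moba_encoder_top_k_left * moba_block_size

assert moba_use_encoder_seq_limit == 512  # 4 * 128
# The legality check then only requires seq_limit >= top_k_left * block_size.
assert moba_use_encoder_seq_limit >= moba_encoder_top_k_left * moba_block_size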
@@ -1093,7 +833,6 @@ class LoadConfig:
load_strategy: Specifies the weight loading method when enabled:
- 'ipc': Real-time IPC streaming with automatic resharding
- 'ipc_snapshot': Load from disk snapshot of IPC weights
- 'meta': Only model meta messages
- None: No dynamic loading
"""
@@ -1104,47 +843,12 @@ class LoadConfig:
self.load_choices: Union[str, LoadChoices] = LoadChoices.DEFAULT.value
self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
self.dynamic_load_weight: bool = False
self.load_strategy: Optional[Literal["ipc", "ipc_snapshot", "meta", "normal"]] = "normal"
self.load_strategy: Optional[Literal["ipc", "ipc_snapshot"]] = None
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
class PoolerConfig:
"""Controls the behavior of output pooling in pooling models."""
pooling_type: Optional[str] = None
"""
The pooling method of the pooling model.
"""
# for embeddings models
normalize: Optional[bool] = None
"""
Whether to normalize the embeddings outputs. Defaults to True.
"""
dimensions: Optional[int] = None
"""
Reduce the dimensions of embeddings if the model
supports matryoshka representation. Defaults to None.
"""
enable_chunked_processing: Optional[bool] = None
"""
Whether to enable chunked processing for long inputs that exceed the model's
maximum position embeddings. When enabled, long inputs will be split into
chunks, processed separately, and then aggregated using weighted averaging.
This allows embedding models to handle arbitrarily long text without CUDA
errors. Defaults to False.
"""
max_embed_len: Optional[int] = None
"""
Maximum input length allowed for embedding generation. When set, allows
inputs longer than max_embed_len to be accepted for embedding models.
When an input exceeds max_embed_len, it will be handled according to
the original max_model_len validation logic.
Defaults to None (i.e. set to max_model_len).
"""
class LoRAConfig:
"""LoRA Config"""
@@ -1189,7 +893,7 @@ class CacheConfig:
self.kv_cache_ratio = 1.0
else:
self.kv_cache_ratio = 0.75
self.enc_dec_block_num = 0 if current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
self.enc_dec_block_num = 0 if current_platform.is_iluvatar() or current_platform.is_maca() else 2
self.prealloc_dec_block_slot_num_threshold = 12
self.cache_dtype = "bfloat16"
self.model_cfg = None
@@ -1399,14 +1103,16 @@ class FDConfig:
decoding_config: DecodingConfig = None,
quant_config: QuantConfigBase = None,
graph_opt_config: GraphOptimizationConfig = None,
plas_attention_config: PlasAttentionConfig = None,
moba_attention_config: MobaAttentionConfig = None,
speculative_config: SpeculativeConfig = None,
tokenizer: str = None,
max_model_len: int = 8192,
ips: str = None,
use_warmup: bool = False,
engine_worker_queue_port: str = "8002",
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
splitwise_role: str = "mixed",
innode_prefill_ports: Optional[List[int]] = None,
max_num_partial_prefills: int = 1,
max_long_partial_prefills: int = 1,
@@ -1430,7 +1136,7 @@ class FDConfig:
self.early_stop_config: Optional[EarlyStopConfig] = early_stop_config
self.decoding_config: DecodingConfig = decoding_config # type: ignore
self.cache_config: CacheConfig = cache_config # type: ignore
self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
# Initialize cuda graph capture list
if self.graph_opt_config.cudagraph_capture_sizes is None:
self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
@@ -1469,6 +1175,7 @@ class FDConfig:
self.limit_mm_per_prompt = limit_mm_per_prompt
self.mm_processor_kwargs = mm_processor_kwargs
self.use_warmup = use_warmup
self.splitwise_role = splitwise_role
self.innode_prefill_ports = innode_prefill_ports
self.max_num_partial_prefills = max_num_partial_prefills
self.max_long_partial_prefills = max_long_partial_prefills
@@ -1476,13 +1183,17 @@ class FDConfig:
self.reasoning_parser = reasoning_parser
self.guided_decoding_backend = guided_decoding_backend
self.disable_any_whitespace = disable_any_whitespace
self.engine_worker_queue_port = engine_worker_queue_port
self._str_to_list("innode_prefill_ports", int)
if isinstance(engine_worker_queue_port, int):
self.engine_worker_queue_port = str(engine_worker_queue_port)
self._str_to_list("engine_worker_queue_port", str)
if envs.FD_FOR_TORCH_MODEL_FORMAT:
self.model_config.model_format = "torch"
# TODO
self.max_prefill_batch = int(os.getenv("MAX_PREFILL_NUM", "3"))
self.max_prefill_batch = 3
if current_platform.is_xpu():
self.max_prefill_batch = 1
if self.model_config is not None and self.model_config.enable_mm:
@@ -1490,10 +1201,12 @@ class FDConfig:
num_ranks = self.parallel_config.tensor_parallel_size * self.parallel_config.data_parallel_size
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
if num_ranks > self.max_chips_per_node and self.load_config.load_strategy != "meta":
if num_ranks > self.max_chips_per_node:
self.worker_num_per_node = self.max_chips_per_node
nnode = ceil_div(num_ranks, self.worker_num_per_node)
assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
# assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
else:
self.worker_num_per_node = num_ranks
@@ -1501,8 +1214,6 @@ class FDConfig:
self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids)
if current_platform.is_xpu():
self.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.device_ids)
if current_platform.is_intel_hpu():
self.device_ids = os.getenv("HPU_VISIBLE_DEVICES", self.device_ids)
self.read_from_config()
self.postprocess()
@@ -1543,8 +1254,6 @@ class FDConfig:
self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
if self.model_config is not None and self.model_config.enable_mm:
self.cache_config.enable_prefix_caching = False
if self.guided_decoding_backend == "auto":
if current_platform.is_xpu() or self.speculative_config.method is not None:
@@ -1553,15 +1262,6 @@ class FDConfig:
else:
self.guided_decoding_backend = "xgrammar"
if self.scheduler_config.splitwise_role == "mixed":
self.model_config.moe_phase = MoEPhase(phase="prefill")
elif self.scheduler_config.splitwise_role == "prefill":
self.model_config.moe_phase = MoEPhase(phase="prefill")
elif self.scheduler_config.splitwise_role == "decode":
self.model_config.moe_phase = MoEPhase(phase="decode")
else:
raise NotImplementedError
def check(self):
"""
check the legality of config
@@ -1596,7 +1296,7 @@ class FDConfig:
f"max_long_partial_prefills: {self.max_long_partial_prefills} should "
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
)
assert self.scheduler_config.splitwise_role in ["mixed", "prefill", "decode"]
assert self.splitwise_role in ["mixed", "prefill", "decode"]
# TODO(@wufeisheng): TP and EP need to be supported simultaneously.
assert (self.parallel_config.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or (
self.parallel_config.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1
@@ -1682,8 +1382,8 @@ class FDConfig:
initialize cache info
"""
disaggregate_info = {}
if self.scheduler_config.splitwise_role != "mixed":
disaggregate_info["role"] = self.scheduler_config.splitwise_role
if self.splitwise_role != "mixed":
disaggregate_info["role"] = self.splitwise_role
disaggregate_info["cache_info"] = dict()
current_protocol = self.cache_config.cache_transfer_protocol.split(",")
disaggregate_info["transfer_protocol"] = current_protocol
@@ -1691,9 +1391,7 @@ class FDConfig:
if protocol == "ipc":
disaggregate_info["cache_info"][protocol] = {
"ip": self.host_ip,
"port": self.parallel_config.engine_worker_queue_port[
self.parallel_config.local_data_parallel_id
],
"port": self.engine_worker_queue_port[self.parallel_config.local_data_parallel_id],
"device_ids": self.local_device_ids,
}
elif protocol == "rdma":

View File

@@ -42,12 +42,6 @@ def use_custom_allreduce(custom_all_reduce_max_bytes: int = 8192 * 1024):
_TP_AR = CustomAllreduce(model_parallel_group, custom_all_reduce_max_bytes)
def custom_ar_clear_ipc_handles():
global _TP_AR
if _TP_AR is not None:
_TP_AR.clear_ipc_handles()
try:
@paddle.jit.marker.unified
@@ -72,26 +66,3 @@ try:
except:
tensor_model_parallel_all_reduce = None
from paddle.distributed.communication import stream
from paddle.distributed.communication.reduce import ReduceOp
def all_reduce(
tensor,
op,
group,
sync_op: bool = True,
):
return stream.all_reduce(tensor, op=op, group=group, sync_op=sync_op, use_calc_stream=True)
@paddle.jit.marker.unified
def tensor_model_parallel_all_reduce_custom(input_: paddle.Tensor) -> paddle.Tensor:
"""All-reduce the input tensor across model parallel group on calc stream."""
if paddle.in_dynamic_mode():
hcg = dist.fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
all_reduce(input_, op=ReduceOp.SUM, group=mp_group)
else:
dist.all_reduce(input_)

View File

@@ -25,7 +25,6 @@ from paddle.distributed.communication.group import Group
from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
from fastdeploy.model_executor.ops.gpu import (
all_reduce,
clear_ipc_handles,
dispose,
get_graph_buffer_ipc_meta,
init_custom_all_reduce,
@@ -221,9 +220,6 @@ class CustomAllreduce:
else:
return self.all_reduce(input, input, registered=False)
def clear_ipc_handles(self):
clear_ipc_handles(self._ptr)
def close(self):
if self._ptr:
dispose(self._ptr)

View File

@@ -18,23 +18,20 @@ import argparse
import json
from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional
import paddle
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
ConvertOption,
EarlyStopConfig,
FDConfig,
GraphOptimizationConfig,
LoadConfig,
MobaAttentionConfig,
ModelConfig,
ParallelConfig,
PlasAttentionConfig,
PoolerConfig,
RunnerOption,
SpeculativeConfig,
TaskOption,
)
@@ -98,20 +95,6 @@ class EngineArgs:
"""
The task to be executed by the model.
"""
runner: RunnerOption = "auto"
"""
The type of model runner to use. Each FD instance only supports one model runner,
even if the same model can be used for multiple types.
"""
convert: ConvertOption = "auto"
"""
Convert the model using adapters. The most common use case is to
adapt a text generation model to be used for pooling tasks.
"""
override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
"""
Override configuration for the pooler.
"""
max_num_seqs: int = 8
"""
Maximum number of sequences per iteration.
@@ -152,7 +135,7 @@ class EngineArgs:
"""
dynamic load weight
"""
load_strategy: str = "normal"
load_strategy: str = "ipc_snapshot"
"""
dynamic load weight strategy
"""
@@ -361,9 +344,9 @@ class EngineArgs:
"""
Configuration for graph optimization backend execution.
"""
plas_attention_config: Optional[Dict[str, Any]] = None
moba_attention_config: Optional[Dict[str, Any]] = None
"""
Configuration for plas attention.
Configuration for moba attention.
"""
enable_logprob: bool = False
@@ -403,27 +386,30 @@ class EngineArgs:
"""
Post-initialization processing to set default tokenizer if not provided.
"""
if not self.tokenizer:
self.tokenizer = self.model
if self.splitwise_role == "decode":
self.enable_prefix_caching = False
if self.speculative_config is not None:
self.enable_prefix_caching = False
if self.enable_mm:
self.enable_prefix_caching = False
if not current_platform.is_cuda():
self.enable_prefix_caching = False
# if self.dynamic_load_weight:
# self.enable_prefix_caching = False
if self.dynamic_load_weight:
self.enable_prefix_caching = False
if self.enable_logprob:
if self.speculative_config is not None:
raise NotImplementedError("Logprob does not support speculation_config.")
if self.enable_expert_parallel:
raise NotImplementedError("Logprob does not support enable_expert_parallel.")
if not current_platform.is_cuda():
raise NotImplementedError("Only CUDA platform supports logprob.")
if self.speculative_config is not None:
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
if self.splitwise_role != "mixed":
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if not current_platform.is_cuda() and not current_platform.is_xpu():
if not current_platform.is_cuda():
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
if self.guided_decoding_backend != "off":
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
@@ -489,21 +475,6 @@ class EngineArgs:
default=EngineArgs.task,
help="Task to be executed by the model.",
)
model_group.add_argument(
"--runner",
type=str,
default=EngineArgs.runner,
help="The type of model runner to use",
)
model_group.add_argument(
"--convert", type=str, default=EngineArgs.convert, help="Convert the model using adapters"
)
model_group.add_argument(
"--override-pooler-config",
type=json.loads,
default=EngineArgs.override_pooler_config,
help="Override the pooler configuration with a JSON string.",
)
model_group.add_argument(
"--use-warmup",
type=int,
@@ -600,9 +571,9 @@ class EngineArgs:
help="",
)
model_group.add_argument(
"--plas-attention-config",
"--moba-attention-config",
type=json.loads,
default=EngineArgs.plas_attention_config,
default=EngineArgs.moba_attention_config,
help="",
)
model_group.add_argument(
@@ -711,7 +682,7 @@ class EngineArgs:
# Load group
load_group = parser.add_argument_group("Load Configuration")
load_group.add_argument(
"--load-choices",
"--load_choices",
type=str,
default=EngineArgs.load_choices,
help="The format of the model weights to load.\
@@ -735,7 +706,7 @@ class EngineArgs:
cache_group.add_argument(
"--prealloc-dec-block-slot-num-threshold",
type=int,
default=EngineArgs.prealloc_dec_block_slot_num_threshold,
default=12,
help="Number of token slot threadshold to allocate next blocks for decoding.",
)
@@ -992,17 +963,17 @@ class EngineArgs:
graph_optimization_args[k] = v
return GraphOptimizationConfig(graph_optimization_args)
def create_plas_attention_config(self) -> PlasAttentionConfig:
def create_moba_attention_config(self) -> MobaAttentionConfig:
"""
Create and return a PlasAttentionConfig object based on the current settings.
Create and return a MobaAttentionConfig object based on the current settings.
"""
attention_args = asdict(self)
if self.plas_attention_config is not None:
for k, v in self.plas_attention_config.items():
if self.moba_attention_config is not None:
for k, v in self.moba_attention_config.items():
attention_args[k] = v
return PlasAttentionConfig(attention_args)
return MobaAttentionConfig(attention_args)
else:
return PlasAttentionConfig(None)
return MobaAttentionConfig(None)
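# --- Illustrative sketch (not part of the diff): how the attention-config JSON flows
# from the CLI into the config object. argparse parses the flag with json.loads and the
# create_*_attention_config method overlays those keys on the flattened EngineArgs dict.
# The dict contents below are placeholders.
import json

cli_value = '{"moba_encoder_top_k_left": 4, "moba_encoder_top_k_right": 8}'
moba_attention_config = json.loads(cli_value)   # what --moba-attention-config yields

attention_args = {"moba_block_size": 128}       # stand-in for asdict(self)
if moba_attention_config is not None:
    for k, v in moba_attention_config.items():
        attention_args[k] = v                   # CLI JSON overrides the defaults

print(attention_args)
# {'moba_block_size': 128, 'moba_encoder_top_k_left': 4, 'moba_encoder_top_k_right': 8}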
def create_early_stop_config(self) -> EarlyStopConfig:
"""
@@ -1050,11 +1021,6 @@ class EngineArgs:
else:
self.max_num_batched_tokens = self.max_model_len
if isinstance(self.engine_worker_queue_port, int):
self.engine_worker_queue_port = str(self.engine_worker_queue_port)
if isinstance(self.engine_worker_queue_port, str):
self.engine_worker_queue_port = self.engine_worker_queue_port.split(",")
all_dict = asdict(self)
all_dict["model_cfg"] = model_cfg
cache_cfg = CacheConfig(all_dict)
@@ -1063,11 +1029,16 @@ class EngineArgs:
scheduler_cfg = self.create_scheduler_config()
graph_opt_cfg = self.create_graph_optimization_config()
graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
plas_attention_config = self.create_plas_attention_config()
moba_attention_config = self.create_moba_attention_config()
early_stop_cfg = self.create_early_stop_config()
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
if isinstance(self.engine_worker_queue_port, int):
self.engine_worker_queue_port = str(self.engine_worker_queue_port)
if isinstance(self.engine_worker_queue_port, str):
self.engine_worker_queue_port = self.engine_worker_queue_port.split(",")
assert is_port_available(
"0.0.0.0", int(self.engine_worker_queue_port[parallel_cfg.local_data_parallel_id])
), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
@@ -1083,16 +1054,18 @@ class EngineArgs:
speculative_config=speculative_cfg,
ips=self.ips,
use_warmup=self.use_warmup,
engine_worker_queue_port=self.engine_worker_queue_port,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser=self.reasoning_parser,
tool_parser=self.tool_call_parser,
splitwise_role=self.splitwise_role,
innode_prefill_ports=self.innode_prefill_ports,
max_num_partial_prefills=self.max_num_partial_prefills,
max_long_partial_prefills=self.max_long_partial_prefills,
long_prefill_token_threshold=self.long_prefill_token_threshold,
graph_opt_config=graph_opt_cfg,
plas_attention_config=plas_attention_config,
moba_attention_config=moba_attention_config,
guided_decoding_backend=self.guided_decoding_backend,
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
early_stop_config=early_stop_cfg,

View File

@@ -30,23 +30,21 @@ import paddle
import zmq
from opentelemetry import trace
from fastdeploy.engine.request import Request, RequestOutput, RequestType
from fastdeploy.engine.request import Request, RequestOutput
from fastdeploy.engine.resource_manager import ResourceManager
from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
from fastdeploy.inter_communicator import (
EngineCacheQueue,
EngineWorkerQueue,
IPCSignal,
ZmqIpcServer,
ZmqTcpServer,
ZmqClient,
)
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.metrics.trace_util import start_span, start_span_request
from fastdeploy.model_executor.guided_decoding import schema_checker
from fastdeploy.plugins.token_processor import load_token_processor_plugins
from fastdeploy.splitwise.internal_adapter_utils import InternalAdapter
from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector
from fastdeploy.utils import EngineError, envs, get_logger, llm_logger
from fastdeploy.utils import EngineError, envs, llm_logger
try:
TokenProcessor = load_token_processor_plugins()
@@ -69,36 +67,32 @@ class EngineService:
"""
self.cfg = cfg
if self.cfg.parallel_config.enable_expert_parallel:
self.llm_logger = get_logger(
"fastdeploy", f"fastdeploy_rank{self.cfg.parallel_config.local_data_parallel_id}.log"
)
else:
self.llm_logger = llm_logger
self.scheduler = cfg.scheduler_config.scheduler()
self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1"
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.resource_manager = ResourceManagerV1(
cfg.scheduler_config.max_num_seqs,
cfg,
cfg.parallel_config.tensor_parallel_size,
cfg.scheduler_config.splitwise_role,
cfg.splitwise_role,
cfg.parallel_config.local_data_parallel_id,
)
if cfg.splitwise_role != "mixed":
raise NotImplementedError(
"Currently ENABLE_V1_KVCACHE_SCHEDULER=1 only supported in mixed sampling now."
)
else:
self.resource_manager = ResourceManager(
cfg.scheduler_config.max_num_seqs,
cfg,
cfg.parallel_config.tensor_parallel_size,
cfg.scheduler_config.splitwise_role,
cfg.splitwise_role,
cfg.parallel_config.local_data_parallel_id,
)
self.start_worker_queue_service(start_queue)
os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.parallel_config.engine_worker_queue_port[
os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.engine_worker_queue_port[
self.cfg.parallel_config.local_data_parallel_id
]
@@ -139,14 +133,10 @@ class EngineService:
self.insert_task_to_worker_thread.start()
self.token_processor.tasks_queue = self.engine_worker_queue
self.token_processor.run()
if self.cfg.scheduler_config.splitwise_role != "mixed":
self.split_mode_get_tasks()
def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进程感知是否有新Task需要处理
current_suffix = int(
self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
)
self.llm_logger.info(f"current_suffix: {current_suffix}")
current_suffix = int(self.cfg.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id])
llm_logger.info(f"current_suffix: {current_suffix}")
exist_task_signal_data = np.zeros([1], dtype=np.int32)
self.exist_task_signal = IPCSignal(
name="exist_task_signal",
@@ -188,24 +178,6 @@ class EngineService:
create=True,
)
cache_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32)
self.cache_ready_signal = IPCSignal(
name="cache_ready_signal",
array=cache_ready_signal_data,
dtype=np.int32,
suffix=current_suffix,
create=True,
)
swap_space_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32)
self.swap_space_ready_signal = IPCSignal(
name="swap_space_ready_signal",
array=swap_space_ready_signal_data,
dtype=np.int32,
suffix=current_suffix,
create=True,
)
model_weights_status = np.zeros([1], dtype=np.int32)
self.model_weights_status_signal = IPCSignal(
name="model_weights_status",
@@ -215,35 +187,17 @@ class EngineService:
create=True,
)
prefix_tree_status = np.zeros([1], dtype=np.int32)
self.prefix_tree_status_signal = IPCSignal(
name="prefix_tree_status",
array=prefix_tree_status,
dtype=np.int32,
suffix=current_suffix,
create=True,
)
kv_cache_status = np.zeros([1], dtype=np.int32)
self.kv_cache_status_signal = IPCSignal(
name="kv_cache_status",
array=kv_cache_status,
dtype=np.int32,
suffix=current_suffix,
create=True,
)
def start_worker_queue_service(self, start_queue):
"""
start queue service for engine worker communication
"""
address = (
self.cfg.master_ip,
int(self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]),
int(self.cfg.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]),
)
if start_queue and (self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"):
self.llm_logger.info(f"Starting engine worker queue server service at {address}")
llm_logger.info(f"Starting engine worker queue server service at {address}")
self.engine_worker_queue_server = EngineWorkerQueue(
address=address,
is_server=True,
@@ -253,7 +207,7 @@ class EngineService:
if (
self.cfg.cache_config.enable_prefix_caching
or self.cfg.scheduler_config.splitwise_role != "mixed"
or self.cfg.splitwise_role != "mixed"
and self.cfg.parallel_config.local_data_parallel_id == 0
):
self.cache_task_queue = EngineCacheQueue(
@@ -267,7 +221,7 @@ class EngineService:
client_id=-1,
local_data_parallel_size=self.cfg.parallel_config.data_parallel_size,
)
self.llm_logger.info(
llm_logger.info(
f"local {min(self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id,self.cfg.parallel_config.data_parallel_size - 1)}"
)
self.engine_worker_queue = EngineWorkerQueue(
@@ -296,21 +250,8 @@ class EngineService:
cur_task_idx = self.resource_manager.req_dict[task.request_id]
del self.resource_manager.req_dict[task.request_id]
cur_task = self.resource_manager.tasks_list[cur_task_idx]
if envs.FD_ENABLE_INTERNAL_ADAPTER:
if not task.outputs.token_ids: # first token is eos in Prefill, just recycle resource and continue
self.resource_manager.stop_flags[cur_task_idx] = True
self.resource_manager.tasks_list[cur_task_idx] = None
self.resource_manager._recycle_block_tables(cur_task)
if task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.llm_logger.warning(f"{task.request_id} need not decode after first token")
continue
cur_task.prompt_token_ids[0] = task.outputs.token_ids[0]
cur_task.num_cached_tokens = task.num_cached_tokens
if (
self.cfg.speculative_config.method in ["mtp"]
and self.cfg.scheduler_config.splitwise_role == "decode"
):
if self.cfg.speculative_config.method in ["mtp"] and self.cfg.splitwise_role == "decode":
cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids)
if task.error_code != 200:
self.resource_manager.stop_flags[cur_task_idx] = True
@@ -319,14 +260,13 @@ class EngineService:
if task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.scheduler.put_results([task])
self.llm_logger.warning(
llm_logger.warning(
f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource."
)
continue
self.token_processor.tokens_counter[task.request_id] = 1
current_tasks.append(cur_task)
if current_tasks:
self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz))
self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz))
return True
self.resource_manager.check_and_free_block_tables()
@@ -334,34 +274,13 @@ class EngineService:
if not isinstance(tasks, list):
tasks = [tasks]
need_delete_tasks = []
for task in tasks:
if self.cfg.scheduler_config.splitwise_role != "mixed":
status, msg = self.split_connector.check_decode_allocated(task)
if not status:
self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.")
self.scheduler.put_results(
[
RequestOutput(
request_id=task.request_id,
finished=True,
error_code=500,
error_msg=msg,
)
]
)
need_delete_tasks.append(task)
continue
for tmp_task in need_delete_tasks:
tasks.remove(tmp_task)
for item in tasks:
item.schedule_start_time = time.time()
available_batch = np.sum(self.resource_manager.stop_flags)
if len(tasks) > available_batch:
self.llm_logger.error(f"Inserting batch:{len(tasks)} exceeds the available batch:{available_batch}.")
self.llm_logger.error("The exceeded part will be ignored!")
llm_logger.error(f"Inserting batch:{len(tasks)} exceeds the available batch:{available_batch}.")
llm_logger.error("The exceeded part will be ignored!")
tasks = tasks[:available_batch]
req_ids = [t.request_id for t in tasks]
@@ -370,7 +289,7 @@ class EngineService:
if not tasks:
error_msg = f"The request required resources is exceed the limit, request id={req_ids}."
self.llm_logger.error(error_msg)
llm_logger.error(error_msg)
raise EngineError(error_msg, error_code=500)
return False
@@ -388,7 +307,7 @@ class EngineService:
self.split_connector.send_cache_infos(tasks, current_id)
if not is_decode:
self.llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}")
llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}")
for task in tasks:
task.inference_start_time = time.time()
if not is_prefill:
@@ -547,7 +466,7 @@ class EngineService:
Insert tasks into the engine worker thread and monitor the scheduler request queue.
If the engine has free resources, insert the task into the engine.
"""
current_id = 0
current_id = -1
while getattr(self, "running", True):
try:
if self.resource_manager.available_batch() == 0:
@@ -557,10 +476,7 @@ class EngineService:
time.sleep(0.001)
continue
if hasattr(self, "exist_prefill_task_signal") and self.exist_prefill_task_signal.value[0] > 0:
if (
self.cfg.scheduler_config.splitwise_role == "mixed"
or self.split_connector.has_splitwise_tasks()
):
if self.cfg.splitwise_role == "mixed" or self.split_connector.has_splitwise_tasks():
time.sleep(0.005)
continue
if self.engine_worker_queue.num_cache_infos() > 0:
@@ -588,21 +504,18 @@ class EngineService:
time.sleep(0.001)
continue
if self.cfg.scheduler_config.splitwise_role != "mixed":
self.llm_logger.info("Inserting splitwise tasks")
current_id = (current_id + 1) % 100003
if self.cfg.splitwise_role != "mixed":
llm_logger.info("Inserting splitwise tasks")
self.split_connector.send_splitwise_tasks(tasks, current_id)
insert_successful = self.insert_tasks(tasks, current_id)
if insert_successful:
current_id = current_id + 1
else:
continue
self.insert_tasks(tasks, current_id)
main_process_metrics.num_requests_waiting.dec(len(tasks))
main_process_metrics.num_requests_running.inc(len(tasks))
except Exception as e:
err_msg = f"Error happend while insert task to engine: {e}, {traceback.format_exc()!s}."
self.llm_logger.error(err_msg)
err_msg = f"Error happened while insert task to engine: {e}, {traceback.format_exc()!s}."
llm_logger.error(err_msg)
def _scheduler_task_to_worker_v1(self):
"""
@@ -612,145 +525,60 @@ class EngineService:
is_fetching = False
def _fetch_request():
try:
nonlocal is_fetching
is_fetching = True
num_prefill_batch = min(
int(self.resource_manager.available_batch()),
self.cfg.max_prefill_batch,
)
if self.cfg.model_config.enable_mm:
available_blocks = self.resource_manager.available_block_num()
else:
available_blocks = self.cfg.cache_config.max_block_num_per_seq
nonlocal is_fetching
is_fetching = True
num_prefill_batch = min(
int(self.resource_manager.available_batch()),
self.cfg.max_prefill_batch,
)
if self.cfg.model_config.enable_mm:
available_blocks = self.resource_manager.available_block_num()
else:
available_blocks = self.cfg.cache_config.max_block_num_per_seq
tasks = self.scheduler.get_requests(
available_blocks=available_blocks,
block_size=self.cfg.cache_config.block_size,
reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
max_num_batched_tokens=self.cfg.max_model_len,
batch=num_prefill_batch,
)
if self.cfg.scheduler_config.splitwise_role != "mixed":
for task in tasks:
# assure can allocate block ids in P
while not self.resource_manager.preallocate_resource_in_p(task):
time.sleep(0.005)
self.llm_logger.info(f"ask D resource for req_id: {task.request_id}")
self.split_connector.send_splitwise_tasks([task], task.idx)
need_delete_tasks = []
for task in tasks:
if self.cfg.scheduler_config.splitwise_role != "mixed":
# assure fetch block ids from D
status, msg = self.split_connector.check_decode_allocated(task)
if not status:
self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.")
self.scheduler.put_results(
[
RequestOutput(
request_id=task.request_id,
finished=True,
error_code=500,
error_msg=msg,
)
]
)
need_delete_tasks.append(task)
continue
for tmp_task in need_delete_tasks:
tasks.remove(tmp_task)
# release resource in P
self.resource_manager.prerelease_resource(tmp_task)
if self.cfg.scheduler_config.splitwise_role == "prefill":
# to send cache info to cache messager
if tasks:
self.split_connector.send_cache_infos(tasks, 0)
# ensure cache tasks has sent to cache_messager
need_check_req_ids = [task.request_id for task in tasks]
while need_check_req_ids:
req_ids = self.engine_worker_queue.get_finished_add_cache_task_req()
self.llm_logger.info(f"get_finished_add_cache_task_req: {req_ids}")
if req_ids:
for req_id in req_ids:
assert req_id in need_check_req_ids
need_check_req_ids.remove(req_id)
else:
time.sleep(0.001)
# Fetch requests and add them to the scheduling queue
if tasks:
if self.cfg.scheduler_config.splitwise_role == "prefill":
self.resource_manager.add_request_in_p(tasks)
else:
for task in tasks:
self.resource_manager.add_request(task)
is_fetching = False
except Exception as e:
self.llm_logger.error(f"fetching request error {e} {str(traceback.format_exc())}")
is_fetching = False
tasks = self.scheduler.get_requests(
available_blocks=available_blocks,
block_size=self.cfg.cache_config.block_size,
reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
max_num_batched_tokens=self.cfg.max_model_len,
batch=num_prefill_batch,
)
# Fetch requests and add them to the scheduling queue
for task in tasks:
self.resource_manager.add_request(task)
is_fetching = False
while self.running:
try:
if self.engine_worker_queue.num_tasks() > 0:
time.sleep(0.001)
continue
if self.cfg.scheduler_config.splitwise_role != "mixed":
if self.scheduler.get_unhandled_request_num() <= envs.FD_EP_MAX_PREFETCH_TASK_NUM and (
not is_fetching
):
get_request_pool.submit(_fetch_request)
else:
if (
len(self.resource_manager.waiting) == 0
and (not is_fetching)
and self.exist_prefill_task_signal.value[0] == 0
):
get_request_pool.submit(_fetch_request)
if (
len(self.resource_manager.waiting) == 0
and (not is_fetching)
and self.exist_prefill_task_signal.value[0] == 0
):
get_request_pool.submit(_fetch_request)
# 2. Schedule requests
tasks = self.resource_manager.schedule()
# 3. Send to engine
if tasks:
if self.cfg.scheduler_config.splitwise_role == "decode":
for task in tasks:
if task.task_type == RequestType.PREEMPTED:
msg = f"{task.request_id} decode not enough blocks, need to be rescheduled."
self.llm_logger.error(msg)
self.scheduler.put_results(
[
RequestOutput(
request_id=task.request_id,
finished=True,
error_code=500,
error_msg=msg,
)
]
)
self.resource_manager.get_real_bsz()
self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz))
else:
time.sleep(0.005)
except Exception as e:
err_msg = "Error happend while insert task to engine: {}, {}.".format(e, str(traceback.format_exc()))
self.llm_logger.error(err_msg)
err_msg = "Error happened while insert task to engine: {}, {}.".format(e, str(traceback.format_exc()))
llm_logger.error(err_msg)
def start_zmq_service(self, api_server_pid=None):
if api_server_pid is None:
return
self.api_server_pid = api_server_pid
if envs.FD_ENABLE_INTERNAL_ADAPTER:
self.recv_request_server = ZmqTcpServer(port=envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT, mode=zmq.PULL)
self.send_response_server = ZmqTcpServer(port=envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT, mode=zmq.ROUTER)
self.internal_adapter = InternalAdapter(
cfg=self.cfg, engine=self, dp_rank=self.cfg.node_rank * self.cfg.worker_num_per_node
)
else:
self.recv_request_server = ZmqIpcServer(name=api_server_pid, mode=zmq.PULL)
self.send_response_server = ZmqIpcServer(name=api_server_pid, mode=zmq.ROUTER)
self.recv_result_handle_thread = threading.Thread(
target=self.send_response_server.recv_result_handle, daemon=True
)
self.recv_result_handle_thread.start()
self.zmq_server = ZmqClient(name=api_server_pid, mode=zmq.PULL)
self.zmq_server.start_server()
self.zmq_server.create_router()
time.sleep(3)
self.insert_task_to_scheduler_thread = threading.Thread(target=self._insert_zmq_task_to_scheduler, daemon=True)
self.insert_task_to_scheduler_thread.start()
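# --- Illustrative sketch (not part of the diff): the ZMQ socket pattern wired up above,
# written directly against pyzmq. A PULL socket takes incoming requests and a ROUTER
# socket sends responses keyed by client identity; inproc endpoints keep the sketch
# self-contained, whereas the wrappers above use IPC/TCP between processes.
import zmq

ctx = zmq.Context.instance()

recv_request = ctx.socket(zmq.PULL)         # requests flow in here
recv_request.bind("inproc://recv_request")
send_response = ctx.socket(zmq.ROUTER)      # responses fan out per client identity
send_response.bind("inproc://send_response")

# Toy client: a PUSH socket submits one request, a DEALER waits for the reply.
push = ctx.socket(zmq.PUSH)
push.connect("inproc://recv_request")
dealer = ctx.socket(zmq.DEALER)
dealer.setsockopt(zmq.IDENTITY, b"req-1")   # identity must be set before connecting
dealer.connect("inproc://send_response")

push.send_json({"request_id": "req-1", "prompt": "hello"})
request = recv_request.recv_json()                      # engine side receives the request
dealer.send_multipart([b"", b"ready"])                  # register identity with the ROUTER
ident, _, _ = send_response.recv_multipart()
send_response.send_multipart([ident, b"", b"done"])     # reply routed to that identity
print(request, dealer.recv_multipart())

for s in (recv_request, send_response, push, dealer):
    s.close(linger=0)
ctx.term()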
@@ -760,18 +588,15 @@ class EngineService:
def _insert_zmq_task_to_scheduler(self):
added_requests: Dict[str, int] = dict()
if envs.FD_ENABLE_INTERNAL_ADAPTER:
if self.cfg.scheduler_config.splitwise_role == "decode":
return
while self.running:
try:
block = True if len(added_requests) == 0 else False
if not self.cfg.model_config.enable_mm:
err, data = self.recv_request_server.receive_json_once(block)
err, data = self.zmq_server.receive_json_once(block)
else:
err, data = self.recv_request_server.receive_pyobj_once(block)
err, data = self.zmq_server.receive_pyobj_once(block)
if err is not None:
self.llm_logger.error(f"Engine stops inserting zmq task into scheduler, err:{err}")
llm_logger.error(f"Engine stops inserting zmq task into scheduler, err:{err}")
break
request, insert_task = None, []
@@ -782,16 +607,16 @@ class EngineService:
request = Request.from_dict(data)
start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER)
main_process_metrics.requests_number.inc()
self.llm_logger.debug(f"Receive request: {request}")
llm_logger.debug(f"Receive request: {request}")
except Exception as e:
self.llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")
llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")
err_msg = str(e)
results.append((data["request_id"], err_msg))
if self.guided_decoding_checker is not None and err_msg is None:
request, err_msg = self.guided_decoding_checker.schema_format(request)
if err_msg is not None:
self.llm_logger.error(f"Receive request error: {err_msg}")
llm_logger.error(f"Receive request error: {err_msg}")
results.append((request.request_id, err_msg))
if err_msg is None:
@@ -823,9 +648,9 @@ class EngineService:
)
# Since the request is not in scheduler
# Send result by zmq directly
self.send_response_server.send_response(request_id, [error_result])
self.zmq_server.send_multipart(request_id, [error_result])
except Exception as e:
self.llm_logger.error(
llm_logger.error(
f"Error happened while receiving new request from zmq, details={e}, "
f"traceback={traceback.format_exc()}"
)
@@ -841,10 +666,10 @@ class EngineService:
time.sleep(0.005)
continue
for request_id, contents in results.items():
self.send_response_server.send_response(request_id, contents)
self.zmq_server.send_multipart(request_id, contents)
except Exception as e:
self.llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}")
llm_logger.error(f"Unexcepted error happened: {e}, {traceback.format_exc()!s}")
def split_mode_get_tasks(self):
"""
@@ -857,22 +682,13 @@ class EngineService:
processed_indices = []
for idx, task in enumerate(self.waiting_requests):
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
if self.resource_manager.preallocate_resource_in_d(task):
self.llm_logger.info(f"Resource available, processing task {task.request_id}")
self.split_connector.send_cache_infos([task], -1)
processed_indices.append(idx)
else:
self.llm_logger.debug(f"Still waiting for resources {task.request_id}")
break
if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
self.insert_tasks([task])
llm_logger.info(f"Resource available, processing task {task.request_id}")
processed_indices.append(idx)
else:
if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
self.insert_tasks([task])
self.llm_logger.info(f"Resource available, processing task {task.request_id}")
processed_indices.append(idx)
else:
self.llm_logger.debug(f"Still waiting for resources {task.request_id}")
break
llm_logger.debug(f"Still waiting for resources {task.request_id}")
break
for idx in sorted(processed_indices, reverse=True):
self.waiting_requests.pop(idx)
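Processed entries are popped in descending index order so that removing one item does not shift the positions of the remaining indices still to be removed. For example:

# Pop by index in reverse order so earlier indices stay valid.
waiting = ["req-a", "req-b", "req-c", "req-d"]
processed_indices = [0, 2]
for idx in sorted(processed_indices, reverse=True):
    waiting.pop(idx)
print(waiting)  # ['req-b', 'req-d']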
@@ -894,111 +710,51 @@ class EngineService:
tasks = [tasks]
for task in tasks:
task.finished = False
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
for task in tasks:
if envs.FD_ENABLE_INTERNAL_ADAPTER:
if (
not task.outputs.token_ids
): # first token is eos in Prefill, just recycle resource and continue
cur_task = self.resource_manager.requests[task.request_id]
self.resource_manager.stop_flags[cur_task.idx] = True
self.resource_manager.tasks_list[cur_task.idx] = None
self.resource_manager._free_blocks(cur_task)
if cur_task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.llm_logger.warning(
f"{task.request_id} need not decode after first token"
)
del self.resource_manager.requests[task.request_id]
del self.resource_manager.req_dict[task.request_id]
continue
if task.error_code != 200:
cur_task = self.resource_manager.requests[task.request_id]
self.resource_manager.stop_flags[cur_task.idx] = True
self.resource_manager.tasks_list[cur_task.idx] = None
self.resource_manager._free_blocks(cur_task)
if cur_task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.scheduler.put_results([task])
self.llm_logger.warning(
f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource."
)
continue
self.resource_manager.insert_task_for_decoding(task)
self.insert_tasks(tasks, allocated=True)
if self.cfg.innode_prefill_ports is not None:
self.scheduler.put_results(tasks)
else:
self.insert_tasks(tasks, allocated=True)
if self.cfg.innode_prefill_ports is not None:
self.scheduler.put_results(tasks)
else:
if len(self.waiting_requests):
self.llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}")
llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}")
self.waiting_requests.extend(tasks)
else:
new_waiting = []
for task in tasks:
can_allocate_resource = False
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
if self.resource_manager.preallocate_resource_in_d(task):
self.split_connector.send_cache_infos([task], -1)
can_allocate_resource = True
if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
self.insert_tasks([task])
else:
if self.resource_manager.is_resource_sufficient(
task.prompt_token_ids_len
):
self.insert_tasks([task])
can_allocate_resource = True
if can_allocate_resource is False:
if not self.enable_decode_cache_task:
task.error_msg = "Not enough resources"
new_waiting.append(task)
if new_waiting:
if not self.enable_decode_cache_task:
self.split_connector.send_cache_infos(new_waiting, -1)
else:
self.waiting_requests.extend(new_waiting)
self.llm_logger.info(
f"Added {len(new_waiting)} tasks to waiting queue"
)
self.waiting_requests.extend(new_waiting)
llm_logger.info(f"Added {len(new_waiting)} tasks to waiting queue")
else:
time.sleep(0.001)
except Exception as e:
self.llm_logger.error(f"Error in main loop: {e}")
llm_logger.error(f"Error in main loop: {e}")
time.sleep(0.1)
threading.Thread(target=receiver_loop, daemon=True).start()
def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor):
def start_cache_service(self, device_ids, ipc_signal_suffix):
return self.resource_manager.cache_manager.launch_cache_manager(
cache_config=self.cfg.cache_config,
tensor_parallel_size=self.cfg.parallel_config.tensor_parallel_size,
device_ids=device_ids,
pod_ip=self.cfg.master_ip,
engine_worker_queue_port=int(
self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
self.cfg.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
),
pid_suffix=ipc_signal_suffix,
create_cache_tensor=create_cache_tensor,
)
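The cache manager is started against the worker queue port of the current data-parallel rank; the port list is simply indexed by local_data_parallel_id. With illustrative values:

# Illustrative values only: each data-parallel rank owns one queue port.
engine_worker_queue_port = ["8201", "8202", "8203", "8204"]
local_data_parallel_id = 2
port = int(engine_worker_queue_port[local_data_parallel_id])
print(port)  # 8203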
def check_and_free_block_tables(self):
self.resource_manager.check_and_free_block_tables()
def clear_data(self):
try:
llm_logger.info("Clear Data: Start")
self.token_processor.clear_data()
self.engine_worker_queue.clear_data()
self.zmq_server.req_dict.clear()
llm_logger.info("Clear Data: Successfully")
return True
except Exception as e:
llm_logger.error(f"Clear data error: {e}")
return False
def _exit_sub_services(self):
"""
exit sub services
@@ -1008,15 +764,7 @@ class EngineService:
self.exist_task_signal.clear()
self.exist_swapped_task_signal.clear()
self.worker_healthy_live_signal.clear()
self.cache_ready_signal.clear()
self.swap_space_ready_signal.clear()
self.exist_prefill_task_signal.clear()
self.model_weights_status_signal.clear()
self.prefix_tree_status_signal.clear()
self.kv_cache_status_signal.clear()
if hasattr(self, "send_response_server") and self.send_response_server is not None:
self.send_response_server.close()
if hasattr(self, "recv_request_server") and self.recv_request_server is not None:
self.recv_request_server.close()
if hasattr(self, "recv_control_cmd_server") and self.recv_control_cmd_server is not None:
self.recv_control_cmd_server.close()
if hasattr(self, "zmq_server") and self.zmq_server is not None:
self.zmq_server.close()
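The shutdown path closes each optional ZMQ server only if it was ever created, hence the hasattr guards. An equivalent, more compact sketch of the same defensive pattern:

# Close optional servers defensively; any of them may never have been created.
def close_optional_servers(owner, names=("send_response_server",
                                         "recv_request_server",
                                         "recv_control_cmd_server",
                                         "zmq_server")):
    for name in names:
        server = getattr(owner, name, None)
        if server is not None:
            server.close()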

View File

@@ -34,7 +34,6 @@ import numpy as np
import paddle
from tqdm import tqdm
from fastdeploy.config import ErnieArchitectures
from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.engine.common_engine import EngineService
from fastdeploy.engine.expert_service import start_data_parallel_service
@@ -116,24 +115,25 @@ class LLMEngine:
start_time = time.time()
self.api_server_pid = api_server_pid
self.ipc_signal_suffix = self.cfg.parallel_config.engine_worker_queue_port[0]
self.ipc_signal_suffix = self.cfg.engine_worker_queue_port[0]
self._init_worker_signals()
self.data_processor = self.input_processor.create_processor()
self.engine.data_processor = self.data_processor
# Launch components: scheduler, cache_manager, expert_service et.al.
self.launch_components()
self.engine.start()
if api_server_pid is not None:
llm_logger.info(f"Start zmq server, api_server_pid: {api_server_pid}")
self.engine.start_zmq_service(api_server_pid)
# If block number is specified and model is deployed in mixed mode, start cache manager first
if not self.do_profile and self.cfg.scheduler_config.splitwise_role != "mixed":
if self.do_profile == 0 and (
self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed"
):
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, True)
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)
# Start workers
self.worker_proc = self._start_worker_service()
console_logger.info("Waiting for worker processes to be ready...")
console_logger.info("Waiting worker processes ready...")
time.sleep(5)
self.worker_init_status = dict()
@@ -157,22 +157,13 @@ class LLMEngine:
return False
time.sleep(1)
# If block number is not specified, let workers do profiling to determine the block number,
# and then start the cache manager
if self.do_profile:
self._stop_profile()
elif self.cfg.cache_config.enable_prefix_caching:
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, False)
# Launch components: scheduler, cache_manager, expert_service et.al.
if self.cfg.scheduler_config.splitwise_role != "mixed":
self.launch_components()
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
self.launched_cache_manager_signal.value[0] = 1
if api_server_pid is not None:
llm_logger.info(f"Start zmq server, api_server_pid: {api_server_pid}")
self.engine.start_zmq_service(api_server_pid)
# Worker launched
self.check_worker_initialize_status_func_thread.join()
if not result_container["worker_is_alive"]:
@@ -180,24 +171,6 @@ class LLMEngine:
return False
console_logger.info(f"Worker processes are launched with {time.time() - start_time} seconds.")
# Print blocks number & max running requests to console
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
block_size = self.cfg.cache_config.block_size
num_gpu_blocks = self.cfg.cache_config.num_gpu_blocks_override or self.cfg.cache_config.total_block_num
num_cpu_blocks = self.cfg.cache_config.num_cpu_blocks
max_running_requests = min(
(num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.max_model_len,
self.cfg.scheduler_config.max_num_seqs,
)
console_logger.info(
f"Detected {num_gpu_blocks} gpu blocks and {num_cpu_blocks} cpu blocks in cache (block size: {block_size})."
)
console_logger.info(
f"FastDeploy will be serving {max_running_requests} running requests "
f"if each sequence reaches its maximum length: {self.cfg.max_model_len}"
)
return True
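The console summary above is a plain capacity estimate: total cacheable tokens (GPU plus CPU blocks times block size) divided by the maximum sequence length, capped by max_num_seqs. With illustrative numbers:

# Illustrative numbers, not a real deployment: 2000 GPU + 400 CPU blocks of
# 64 tokens hold 153600 tokens; at max_model_len=8192 that is 18 full-length
# sequences, well under a max_num_seqs cap of 64.
block_size = 64
num_gpu_blocks, num_cpu_blocks = 2000, 400
max_model_len, max_num_seqs = 8192, 64
max_running_requests = min(
    (num_gpu_blocks + num_cpu_blocks) * block_size // max_model_len,
    max_num_seqs,
)
print(max_running_requests)  # 18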
def _get_generated_result(self):
@@ -245,9 +218,7 @@ class LLMEngine:
if sampling_params is not None:
request.sampling_params = sampling_params
request.preprocess_start_time = time.time()
chat_template_kwargs = kwargs.get("chat_template_kwargs") or {}
chat_template_kwargs["chat_template"] = kwargs.get("chat_template")
kwargs["chat_template_kwargs"] = chat_template_kwargs
request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
request.prompt_token_ids_len = len(request.prompt_token_ids)
request.need_prefill_tokens = request.prompt_token_ids_len
@@ -259,6 +230,9 @@ class LLMEngine:
request.get("max_tokens"),
),
)
if request.get("reasoning_max_tokens") is None:
default_reasoning_max_tokens = max(int(request.get("max_tokens") * 0.8), 1)
request.set("reasoning_max_tokens", default_reasoning_max_tokens)
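When reasoning_max_tokens is not supplied, it defaults to 80% of max_tokens with a floor of 1. For example:

# Illustrative: max_tokens=1024 gives a default reasoning budget of 819;
# the max(..., 1) floor keeps the budget positive even for tiny max_tokens.
max_tokens = 1024
default_reasoning_max_tokens = max(int(max_tokens * 0.8), 1)
print(default_reasoning_max_tokens)  # 819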
min_tokens = request.get("min_tokens")
if input_ids_len + min_tokens >= self.cfg.max_model_len:
error_msg = (
@@ -337,7 +311,7 @@ class LLMEngine:
)
# launched_cache_manager_signal is used to detect whether the engine has started the cache_manager
if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
launched_cache_manager_signal_data = np.zeros([1], dtype=np.int32)
self.launched_cache_manager_signal = IPCSignal(
name="launched_cache_manager_signal",
@@ -452,13 +426,10 @@ class LLMEngine:
}
)
if self.cfg.scheduler_config.splitwise_role != "mixed":
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
variables["FLAGS_use_pd_disaggregation_per_chunk"] = 1
else:
variables["FLAGS_use_pd_disaggregation"] = 1
if self.cfg.splitwise_role != "mixed":
variables["FLAGS_use_pd_disaggregation"] = 1
# TODO dynamic load environment variable
if self.cfg.scheduler_config.splitwise_role == "prefill":
if self.cfg.splitwise_role == "prefill":
variables["FLAGS_fmt_write_cache_completed_signal"] = 1
if self.cfg.model_config.enable_mm:
@@ -492,15 +463,7 @@ class LLMEngine:
else len(self.data_processor.tokenizer.vocab)
)
is_ernie = ErnieArchitectures.contains_ernie_arch(self.cfg.model_config.architectures)
if is_ernie:
self.cfg.model_config.think_end_id = self.data_processor.tokenizer.get_vocab().get("</think>", -1)
if self.cfg.model_config.think_end_id != -1:
llm_logger.info(f"Get think_end_id {self.cfg.model_config.think_end_id} from vocab.")
else:
llm_logger.info("No </think> token found in vocabulary, the model can not do reasoning.")
ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port)
ports = ",".join(self.cfg.engine_worker_queue_port)
ips = None
if self.cfg.ips is not None:
ips = ",".join(self.cfg.ips)
@@ -518,15 +481,14 @@ class LLMEngine:
f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
f" --eos_tokens_lens {self.data_processor.eos_token_id_len}"
f" --pad_token_id {self.data_processor.pad_token_id}"
f" --engine_pid {self.cfg.parallel_config.engine_worker_queue_port[0]}"
f" --engine_pid {self.cfg.engine_worker_queue_port[0]}"
f" --max_num_batched_tokens {self.cfg.scheduler_config.max_num_batched_tokens}"
f" --splitwise_role {self.cfg.scheduler_config.splitwise_role}"
f" --splitwise_role {self.cfg.splitwise_role}"
f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
f" --ori_vocab_size {ori_vocab_size}"
f" --think_end_id {self.cfg.model_config.think_end_id}"
f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
@@ -534,12 +496,8 @@ class LLMEngine:
f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
f" --reasoning_parser {self.cfg.reasoning_parser}"
f" --load_choices {self.cfg.load_config.load_choices}"
f" --plas_attention_config '{self.cfg.plas_attention_config.to_json_string()}'"
f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
f" --ips {ips}"
f" --cache-transfer-protocol {self.cfg.cache_config.cache_transfer_protocol}"
f" --runner {self.cfg.model_config.runner}"
f" --convert {self.cfg.model_config.convert}"
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
)
worker_append_flag = {
@@ -644,11 +602,9 @@ class LLMEngine:
num_gpu_blocks = self.get_profile_block_num_signal.value[0]
self.cfg.cache_config.reset(num_gpu_blocks)
self.engine.resource_manager.reset_cache_config(self.cfg.cache_config)
if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.engine.start_cache_service(
device_ids, self.ipc_signal_suffix, self.cfg.scheduler_config.splitwise_role != "mixed"
)
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)
def check_health(self, time_interval_threashold=30):
"""
@@ -663,32 +619,24 @@ class LLMEngine:
return True, ""
def launch_components(self):
if self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.splitwise_role != "mixed":
# Single-machine logic
self.engine.engine_worker_queue.available_prefill_instances.put(1)
self.splitwise_receive_thread = threading.Thread(
target=self.engine.split_connector.start_receiver, args=()
)
self.splitwise_receive_thread.daemon = True
self.splitwise_receive_thread.start()
self.engine.split_mode_get_tasks()
if self.cfg.scheduler_config.name == "splitwise":
self.splitwise_receive_thread = threading.Thread(
target=self.engine.split_connector.start_receiver, args=()
)
self.splitwise_receive_thread.daemon = True
self.splitwise_receive_thread.start()
self.cfg.init_cache_info()
role = self.cfg.scheduler_config.splitwise_role
role = self.cfg.splitwise_role
host_ip = self.cfg.host_ip
disaggregate = self.cfg.disaggregate_info
request_queues_for_dp_ipc = None
result_queue_for_dp_ipc = None
if self.cfg.scheduler_config.name == "splitwise":
self.engine.scheduler.start(role, host_ip, disaggregate)
elif self.cfg.scheduler_config.name == "dp":
request_queues_for_dp_ipc = []
result_queue_for_dp_ipc = multiprocessing.Queue()
for i in range(self.cfg.parallel_config.data_parallel_size):
request_queues_for_dp_ipc.append(multiprocessing.Queue())
self.engine.scheduler.start(
self.cfg.node_rank * self.cfg.worker_num_per_node, request_queues_for_dp_ipc, result_queue_for_dp_ipc
)
if not envs.FD_ENABLE_MULTI_API_SERVER:
if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
@@ -701,7 +649,7 @@ class LLMEngine:
):
address = (
self.cfg.master_ip,
int(self.cfg.parallel_config.engine_worker_queue_port[i]),
int(self.cfg.engine_worker_queue_port[i]),
)
llm_logger.info(f"dp start queue service {address}")
self.dp_engine_worker_queue_server.append(
@@ -718,9 +666,6 @@ class LLMEngine:
args=(
self.cfg,
i,
None,
request_queues_for_dp_ipc,
result_queue_for_dp_ipc,
),
)
)
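With the "dp" scheduler, the engine builds one request queue per data-parallel rank plus a single shared result queue, and each expert service consumes only its own rank's queue. A minimal sketch of that wiring with illustrative payloads:

# Sketch of the "dp" scheduler wiring: one request queue per data-parallel
# rank, one shared result queue. Payloads below are illustrative.
import multiprocessing

data_parallel_size = 4
request_queues_for_dp_ipc = [multiprocessing.Queue() for _ in range(data_parallel_size)]
result_queue_for_dp_ipc = multiprocessing.Queue()

# The dispatcher routes a request to rank 1; that rank later reports back
# on the shared result queue.
request_queues_for_dp_ipc[1].put({"request_id": "req-42"})
result_queue_for_dp_ipc.put({"request_id": "req-42", "finished": True})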

View File

@@ -27,7 +27,6 @@ import numpy as np
from fastdeploy.engine.common_engine import EngineService
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.splitwise.internal_adapter_utils import InternalAdapter
from fastdeploy.utils import console_logger, envs, llm_logger
@@ -51,13 +50,13 @@ class ExpertService:
self.cfg = cfg
start_pos = (local_data_parallel_id * self.cfg.parallel_config.tensor_parallel_size) % cfg.worker_num_per_node
end_pos = start_pos + self.cfg.parallel_config.tensor_parallel_size
if cfg.scheduler_config.splitwise_role != "mixed":
if cfg.splitwise_role != "mixed":
self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos]
self.cfg.local_device_ids = self.cfg.device_ids.split(",")[start_pos:end_pos]
llm_logger.info(f"local_data_parallel_id: {local_data_parallel_id}")
self.cfg.disaggregate_info = None
if cfg.scheduler_config.splitwise_role != "mixed":
if cfg.splitwise_role != "mixed":
if len(self.cfg.cache_config.pd_comm_port) == 1:
self.cfg.cache_config.pd_comm_port[0] = (
int(self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id
@@ -70,12 +69,8 @@ class ExpertService:
self.engine.scheduler.reset_nodeid(f"{self.engine.scheduler.infer.nodeid}_{local_data_parallel_id!s}")
self._finalizer = weakref.finalize(self, self._exit_sub_services)
if envs.FD_ENABLE_INTERNAL_ADAPTER:
self.internal_adapter = InternalAdapter(cfg=self.cfg, engine=self.engine, dp_rank=local_data_parallel_id)
def start(
self, ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc=None, result_queue_for_dp_ipc=None
):
def start(self, ipc_signal_suffix, local_data_parallel_id):
"""
Initializes the engine and starts its sub-services.
If `api_server_pid` is defined, will launch a thread
@@ -85,30 +80,25 @@ class ExpertService:
start_time = time.time()
self.engine.start()
if self.cfg.scheduler_config.name == "dp":
self.cfg.init_cache_info()
assert (request_queues_for_dp_ipc is not None) and (result_queue_for_dp_ipc is not None)
self.engine.scheduler.start(local_data_parallel_id, request_queues_for_dp_ipc, result_queue_for_dp_ipc)
if ipc_signal_suffix is not None:
self.api_server_pid = ipc_signal_suffix
self.engine.start_zmq_service(ipc_signal_suffix)
else:
ipc_signal_suffix = self.cfg.parallel_config.engine_worker_queue_port[0]
ipc_signal_suffix = self.cfg.engine_worker_queue_port[0]
llm_logger.info(f"start expert service {local_data_parallel_id}")
if self.cfg.scheduler_config.splitwise_role != "mixed":
ipc_signal_suffix_cache = self.cfg.parallel_config.engine_worker_queue_port[local_data_parallel_id]
self.engine.start_cache_service(self.cfg.local_device_ids, ipc_signal_suffix_cache)
if self.cfg.splitwise_role != "mixed":
self.engine.start_cache_service(self.cfg.local_device_ids, ipc_signal_suffix)
self.engine.split_mode_get_tasks()
if self.cfg.scheduler_config.name == "splitwise":
self.cfg.init_cache_info()
role = self.cfg.scheduler_config.splitwise_role
role = self.cfg.splitwise_role
host_ip = self.cfg.host_ip
disaggregate = self.cfg.disaggregate_info
self.engine.scheduler.start(role, host_ip, disaggregate)
if self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.splitwise_role != "mixed":
self.splitwise_receive_thread = threading.Thread(
target=self.engine.split_connector.start_receiver, args=()
)
@@ -142,6 +132,7 @@ class ExpertService:
if hasattr(self, "cache_manager_processes"):
self.engine.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear()
self.engine.resource_manager.cache_manager.cache_ready_signal.clear()
for p in self.cache_manager_processes:
llm_logger.info(f"Killing cache manager process {p.pid}")
try:
@@ -153,18 +144,14 @@ class ExpertService:
self.zmq_server.close()
def start_data_parallel_service(
cfg, local_data_parallel_id, ipc_signal_suffix=None, request_queues_for_dp_ipc=None, result_queue_for_dp_ipc=None
):
def start_data_parallel_service(cfg, local_data_parallel_id, ipc_signal_suffix=None):
"""
Start expert service
"""
expert_service = ExpertService(cfg, local_data_parallel_id, start_queue=False)
try:
expert_service.start(
ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc, result_queue_for_dp_ipc
)
expert_service.start(ipc_signal_suffix, local_data_parallel_id)
def deamon_thread():
while True:
@@ -172,6 +159,5 @@ def start_data_parallel_service(
t_deamon = threading.Thread(target=deamon_thread, daemon=True)
t_deamon.start()
t_deamon.join()
except Exception as e:
llm_logger.exception(f"Expert service failed to start: {e}, {str(traceback.format_exc())}")
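start_data_parallel_service runs all real work on background threads, so the launcher parks on a never-ending daemon thread and joins it to keep the process alive. The same pattern in isolation:

# Keep-alive pattern: the join() blocks forever, so the process stays up
# while the daemon worker threads do the actual serving.
import threading
import time

def keep_alive():
    while True:
        time.sleep(10)

keeper = threading.Thread(target=keep_alive, daemon=True)
keeper.start()
keeper.join()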

View File

@@ -1,170 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from copy import deepcopy
from typing import TYPE_CHECKING, Annotated, Any, Optional
import msgspec
from fastdeploy.engine.sampling_params import RequestOutputKind
from fastdeploy.engine.tasks import PoolingTask
if TYPE_CHECKING:
from fastdeploy.config import ModelConfig
class PoolingParams:
"""API parameters for pooling models.
Attributes:
normalize: Whether to normalize the embeddings outputs.
dimensions: Reduce the dimensions of embeddings
if model support matryoshka representation.
activation: Whether to apply activation function to
the classification outputs.
softmax: Whether to apply softmax to the reward outputs.
step_tag_id: Step tag ID for process reward models to identify
specific steps in multi-step reasoning tasks.
returned_token_ids: List of token IDs to return rewards for,
used for fine-grained reward calculation.
task: Internal use only. Specifies the pooling task type
("embed" for embeddings, "encode" for reward models).
requires_token_ids: Internal use only. Whether token ID information
is required for processing.
extra_kwargs: Internal use only. Dictionary for storing additional
custom parameters for extended functionality.
output_kind: Output type specification, fixed to FINAL_ONLY
(only final outputs are returned).
"""
truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=-1)]] = None
"""If set to -1, will use the truncation size supported by the model. If
set to an integer k, will use only the last k tokens from the prompt
(i.e., left truncation). If set to `None`, truncation is disabled."""
# for embeddings models
dimensions: Optional[int] = None
normalize: Optional[bool] = None
# for reward models
softmax: Optional[bool] = None
step_tag_id: Optional[int] = None
returned_token_ids: Optional[list[int]] = None
task: Optional[PoolingTask] = None
"""Internal use only."""
requires_token_ids: bool = False
"""Internal use only."""
extra_kwargs: Optional[dict[str, Any]] = None
"""Internal use only."""
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
@property
def _all_parameters(self) -> list[str]:
return ["dimensions", "normalize", "softmax", "step_tag_id", "returned_token_ids"]
@property
def valid_parameters(self):
return {
"embed": ["dimensions", "normalize"],
"encode": ["softmax", "step_tag_id", "returned_token_ids"],
}
def clone(self) -> "PoolingParams":
"""Returns a deep copy of the PoolingParams instance."""
return deepcopy(self)
def verify(self, task: PoolingTask, model_config: Optional["ModelConfig"] = None) -> None:
if self.task is None:
self.task = task
elif self.task != task:
msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
raise ValueError(msg)
# NOTE: Task validation needs to be done against the model instance,
# which is not available in model config. So, it's not included
# in this method
self._merge_default_parameters(model_config)
self._set_default_parameters(model_config)
self._verify_valid_parameters()
def _merge_default_parameters(self, model_config: Optional["ModelConfig"] = None) -> None:
if model_config is None:
return
pooler_config = model_config.pooler_config
if pooler_config is None:
return
assert self.task is not None, "task must be set"
valid_parameters = self.valid_parameters[self.task]
for k in valid_parameters:
if getattr(pooler_config, k, None) is None:
continue
if getattr(self, k, None) is None:
setattr(self, k, getattr(pooler_config, k))
def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
if self.task == "embed":
if self.normalize is None:
self.normalize = True
elif self.task == "encode":
if self.softmax is None:
self.softmax = True
else:
raise ValueError(f"Unknown pooling task: {self.task}")
def _verify_valid_parameters(self):
assert self.task is not None, "task must be set"
valid_parameters = self.valid_parameters[self.task]
invalid_parameters = []
for k in self._all_parameters:
if k in valid_parameters:
continue
if getattr(self, k, None) is not None:
invalid_parameters.append(k)
if invalid_parameters:
raise ValueError(
f"Task {self.task} only supports {valid_parameters} "
f"parameters, does not support "
f"{invalid_parameters} parameters"
)
def __repr__(self) -> str:
return (
f"PoolingParams("
f"task={self.task}, "
f"normalize={self.normalize}, "
f"dimensions={self.dimensions}, "
f"softmax={self.softmax}, "
f"step_tag_id={self.step_tag_id}, "
f"returned_token_ids={self.returned_token_ids}, "
f"requires_token_ids={self.requires_token_ids}, "
f"extra_kwargs={self.extra_kwargs})"
)
def __post_init__(self) -> None:
assert self.output_kind == RequestOutputKind.FINAL_ONLY, "For pooling output_kind has to be FINAL_ONLY"
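The PoolingParams file removed above scopes its parameters per task: verify() first merges defaults from the model's pooler config, then rejects any parameter that does not belong to the requested task. A hypothetical usage sketch, assuming the class allows default construction and attribute assignment:

# Hypothetical usage of the PoolingParams class removed above.
params = PoolingParams()          # assuming default construction is allowed
params.dimensions = 256           # valid for the "embed" task
params.verify(task="embed")       # sets normalize=True by default, checks parameters
print(params.normalize)           # True

bad = PoolingParams()
bad.softmax = True                # a reward-model parameter
bad.verify(task="embed")          # raises ValueError: not valid for the "embed" task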

View File

@@ -73,7 +73,6 @@ class Request:
guided_json_object: Optional[bool] = None,
enable_thinking: Optional[bool] = True,
trace_carrier: dict = dict(),
dp_rank: Optional[int] = None,
chat_template: Optional[str] = None,
image_start: int = 0,
video_start: int = 0,
@@ -146,8 +145,6 @@ class Request:
# extend block tables
self.use_extend_tables = False
self.extend_block_tables = []
# dp
self.dp_rank = dp_rank
@classmethod
def from_dict(cls, d: dict):
@@ -190,7 +187,6 @@ class Request:
image_end=d.get("image_end", 0),
video_end=d.get("video_end", 0),
audio_end=d.get("audio_end", 0),
dp_rank=d.get("dp_rank", None),
)
@property
@@ -308,7 +304,6 @@ class CompletionOutput:
"index": self.index,
"send_idx": self.send_idx,
"token_ids": self.token_ids,
"decode_type": self.decode_type,
"logprob": self.logprob,
"top_logprobs": self.top_logprobs,
"logprobs": self.logprobs,

Some files were not shown because too many files have changed in this diff