Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-03 15:56:49 +08:00)

Compare commits: develop ... copilot/fi (2 commits: 0f2b609496, 3e319c0f90)
.github/workflows/_base_test.yml (vendored, 3 changes)
@@ -143,8 +143,7 @@ jobs:
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install paddlepaddle-gpu==3.3.0.dev20250917 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/

pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
.github/workflows/_build_linux.yml (vendored, 8 changes)
@@ -106,12 +106,7 @@ jobs:
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)

IFS='/' read -ra parts <<< "${GITHUB_WORKSPACE}"
len=${#parts[@]}
CCACHE_DEFAULT_DIR="/$(IFS=/; echo "${parts[*]:1:$((len-5))}")"
echo "$CCACHE_DEFAULT_DIR"

CACHE_DIR="${CACHE_DIR:-$CCACHE_DEFAULT_DIR}"
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
@@ -132,7 +127,6 @@ jobs:
-e "PADDLEVERSION=${PADDLEVERSION}" \
-e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
-e "BRANCH_REF=${BRANCH_REF}" \
-e "CCACHE_MAXSIZE=50G" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}
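Note: the two CACHE_DIR defaults in the hunk above are computed differently. One strips the last four components of GITHUB_WORKSPACE via array slicing, the other applies dirname twice to github.workspace. A minimal Python sketch of what each expression yields, using a hypothetical runner path (illustrative only, not taken from the workflow):

import os

# Hypothetical workspace path; the real layout depends on the CI runner.
workspace = "/home/runner/actions-runner/_work/FastDeploy/FastDeploy"

# Array-slicing variant: keep everything except the last four path components.
parts = workspace.split("/")            # ['', 'home', 'runner', ...]
length = len(parts)
ccache_default_dir = "/" + "/".join(parts[1:1 + (length - 5)])

# dirname variant: two levels above the workspace directory.
cache_dir = os.path.dirname(os.path.dirname(workspace))

print(ccache_default_dir)  # -> /home/runner
print(cache_dir)           # -> /home/runner/actions-runner/_work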
.github/workflows/_pre_ce_test.yml (vendored, 3 changes)
@@ -82,9 +82,6 @@ jobs:
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100))
FD_ZMQ_RECV_REQUEST_SERVER_PORT=$((42048 + DEVICE_PORT * 100))
FD_ZMQ_SEND_RESPONSE_SERVER_PORT=$((42038 + DEVICE_PORT * 100))
FD_ZMQ_CONTROL_CMD_SERVER_PORTS=$((42028 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
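The per-device test ports above are all derived from the same DEVICE_PORT offset. A small Python sketch of the arithmetic, with a hypothetical DEVICE_PORT value:

# DEVICE_PORT is illustrative here; in the workflow it comes from the runner name.
DEVICE_PORT = 3

ports = {
    "FD_ZMQ_CONTROL_CMD_SERVER_PORTS": 42028 + DEVICE_PORT * 100,
    "FD_ZMQ_SEND_RESPONSE_SERVER_PORT": 42038 + DEVICE_PORT * 100,
    "FD_ZMQ_RECV_REQUEST_SERVER_PORT": 42048 + DEVICE_PORT * 100,
    "FD_ENGINE_QUEUE_PORT": 42058 + DEVICE_PORT * 100,
    "FD_METRICS_PORT": 42078 + DEVICE_PORT * 100,
    "FD_CACHE_QUEUE_PORT": 42098 + DEVICE_PORT * 100,
}
for name, port in ports.items():
    print(f"{name}={port}")   # e.g. FD_ENGINE_QUEUE_PORT=42358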
.github/workflows/ci_iluvatar.yml (vendored, 6 changes)
@@ -28,22 +28,18 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}
fi
'
git config --global http.proxy "http://61.151.249.150:33128"
git config --global https.proxy "http://61.151.249.150:33128"
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone --recursive ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
git clone ${REPO} ${REPO_NAME}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
@@ -43,7 +43,7 @@ English | [简体中文](README_CN.md)
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
- ⏩ **Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU, Intel Gaudi etc.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU etc.

## Requirements

@@ -60,7 +60,6 @@ FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**,
- [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
- [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)
- [MetaX GPU](./docs/get_started/installation/metax_gpu.md)
- [Intel Gaudi](./docs/get_started/installation/intel_gaudi.md)

**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU are currently under development and testing. Stay tuned for updates!
@@ -41,7 +41,7 @@
- 🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口
- 🧮 **全量化格式支持**:W8A16、W8A8、W4A16、W4A8、W2A16、FP8等
- ⏩ **高级加速技术**:推测解码、多令牌预测(MTP)及分块预填充
- 🖥️ **多硬件支持**:NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU、英特尔Gaudi等
- 🖥️ **多硬件支持**:NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU等

## 要求

@@ -58,7 +58,6 @@ FastDeploy 支持在**英伟达(NVIDIA)GPU**、**昆仑芯(Kunlunxin)XPU
- [燧原 S60](./docs/zh/get_started/installation/Enflame_gcu.md)
- [海光 DCU](./docs/zh/get_started/installation/hygon_dcu.md)
- [沐曦 GPU](./docs/zh/get_started/installation/metax_gpu.md)
- [英特尔 Gaudi](./docs/zh/get_started/installation/intel_gaudi.md)

**注意:** 我们正在积极拓展硬件支持范围。目前,包括昇腾(Ascend)NPU 等其他硬件平台正在开发测试中。敬请关注更新!
@@ -98,7 +98,7 @@ def main(args):
raise ValueError("--max_concurrency should be same length as --s_itl_base_model")

for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
# Warmup
# Wramup
print("Starting warmup...")
with open(os.devnull, "w") as f:
with contextlib.redirect_stdout(f):
@@ -965,7 +965,7 @@ if __name__ == "__main__":
parser.add_argument(
"--backend",
type=str,
default="openai-chat",
default="vllm",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
)
parser.add_argument(
@@ -1,5 +0,0 @@
max_model_len: 32768
max_num_seqs: 128
tensor_parallel_size: 4
use_cudagraph: True
load_choices: "default_v1"
@@ -1,6 +0,0 @@
max_model_len: 32768
max_num_seqs: 128
tensor_parallel_size: 4
use_cudagraph: True
load_choices: "default_v1"
quantization: wfp8afp8
@@ -1,8 +0,0 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0
build.sh (10 changes)
@@ -128,12 +128,6 @@ function copy_ops(){
echo -e "MACA ops have been copy to fastdeploy"
return
fi
is_intel_hpu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('intel_hpu'))"`
if [ "$is_intel_hpu" = "True" ]; then
DEVICE_TYPE="intel-hpu"
echo -e "intel_hpu ops have been copy to fastdeploy"
return
fi

DEVICE_TYPE="cpu"
cd ../../../../
@@ -165,9 +159,7 @@ function build_and_install_ops() {
else
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
if [ -d "${OPS_TMP_DIR}" ]; then
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
else
echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
exit 1
@@ -435,7 +435,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
|
||||
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
|
||||
OutT *__restrict__ out,
|
||||
const int speculate_max_draft_token_num = 5,
|
||||
const uint32_t attn_mask_len = -1) {
|
||||
const int32_t attn_mask_len = -1) {
|
||||
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
|
||||
static_assert(NUM_WARP_Q == 1, "NUM_WARP_Q must be 1");
|
||||
static_assert(NUM_WARP_KV == 4, "NUM_WARP_KV must be 4");
|
||||
@@ -1089,7 +1089,7 @@ void MultiQueryAppendAttention(
|
||||
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
|
||||
}
|
||||
|
||||
uint32_t attn_mask_len;
|
||||
int32_t attn_mask_len;
|
||||
if (attn_mask) {
|
||||
attn_mask_len = attn_mask.get().shape()[1];
|
||||
} else {
|
||||
|
@@ -533,7 +533,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
|
||||
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
|
||||
OutT *__restrict__ out,
|
||||
const int speculate_max_draft_token_num = 5,
|
||||
const uint32_t attn_mask_len = -1) {
|
||||
const int32_t attn_mask_len = -1) {
|
||||
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
|
||||
constexpr uint32_t num_vecs_per_head_k =
|
||||
HEAD_DIM / 2 / num_elems_per_128b<CacheT>();
|
||||
@@ -1313,7 +1313,7 @@ void MultiQueryAppendC4Attention(
|
||||
}
|
||||
|
||||
const int num_chunks = div_up(max_seq_len, chunk_size);
|
||||
uint32_t attn_mask_len;
|
||||
int32_t attn_mask_len;
|
||||
if (attn_mask) {
|
||||
attn_mask_len = attn_mask.get().shape()[1];
|
||||
} else {
|
||||
|
@@ -540,7 +540,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
|
||||
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
|
||||
OutT *__restrict__ out,
|
||||
const int speculate_max_draft_token_num = 5,
|
||||
const uint32_t attn_mask_len = -1) {
|
||||
const int32_t attn_mask_len = -1) {
|
||||
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
|
||||
constexpr uint32_t num_vecs_per_head_k =
|
||||
HEAD_DIM / num_elems_per_128b<CacheT>();
|
||||
@@ -1372,7 +1372,7 @@ void MultiQueryAppendC8Attention(
|
||||
}
|
||||
|
||||
const int num_chunks = div_up(max_seq_len, chunk_size);
|
||||
uint32_t attn_mask_len;
|
||||
int32_t attn_mask_len;
|
||||
if (attn_mask) {
|
||||
attn_mask_len = attn_mask.get().shape()[1];
|
||||
} else {
|
||||
|
@@ -1026,7 +1026,7 @@ __device__ __forceinline__ void mask_s(const bool* attn_mask,
const uint32_t qo_len,
const uint32_t kv_len,
const uint32_t chunk_end,
const uint32_t attn_mask_len,
const int32_t attn_mask_len,
float (*s_frag)[num_frags_z][8],
const int *mask_offset = nullptr) {
const uint32_t tx = threadIdx.x;
@@ -1050,7 +1050,7 @@ __device__ __forceinline__ void mask_s(const bool* attn_mask,
(causal
? (kv_idx > kv_len + q_idx - qo_len || (kv_idx >= chunk_end))
: kv_idx >= chunk_end);
if (attn_mask != nullptr && kv_idx > kv_len - qo_len && kv_idx < chunk_end && q_idx < attn_mask_len) {
if (attn_mask != nullptr && kv_idx > kv_len - qo_len && kv_idx < chunk_end && attn_mask_len > 0 && q_idx < static_cast<uint32_t>(attn_mask_len)) {
const int32_t mask_idx = q_idx * attn_mask_len + kv_idx - kv_len + qo_len;
bool mask = attn_mask[mask_idx];
out_of_boundary |= mask;
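The hunk above pairs an unsigned attn_mask_len (where the default of -1 wraps to 2**32 - 1, so q_idx < attn_mask_len never fails) with a signed variant plus an explicit attn_mask_len > 0 guard. A Python sketch of the guarded boundary check, with parameter names following the kernel and sizes illustrative:

def should_apply_mask(q_idx, kv_idx, kv_len, qo_len, chunk_end,
                      attn_mask_len, have_mask):
    # Mirrors the guarded condition: only read attn_mask when a mask was
    # actually provided (attn_mask_len > 0) and q_idx lies inside its first dim.
    return (have_mask
            and kv_idx > kv_len - qo_len
            and kv_idx < chunk_end
            and attn_mask_len > 0
            and q_idx < attn_mask_len)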
@@ -1004,8 +1004,7 @@ __global__ void cache_kernel(
|
||||
const uint32_t qkv_bias = bias % hidden_size;
|
||||
const uint32_t hi = qkv_bias / head_size;
|
||||
const uint32_t h_bias = qkv_bias % head_size;
|
||||
const int32_t ori_bi = batch_id_per_token[token_idx];
|
||||
if (ori_bi == -1) continue; // skip batch_id_per_token[token_idx]=-1
|
||||
const uint32_t ori_bi = batch_id_per_token[token_idx];
|
||||
if (seq_lens[ori_bi] == 0) continue;
|
||||
const uint32_t ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
|
||||
|
||||
|
@@ -571,7 +571,6 @@ std::vector<paddle::Tensor> NoauxTc(
|
||||
int n_group,
|
||||
int topk_group,
|
||||
int topk,
|
||||
bool renormalize,
|
||||
float routed_scaling_factor);
|
||||
|
||||
#ifdef ENABLE_FP8
|
||||
@@ -623,8 +622,6 @@ int64_t open_mem_handle(paddle::Tensor& mem_handle);
|
||||
|
||||
void free_shared_buffer(int64_t buffer);
|
||||
|
||||
void clear_ipc_handles(int64_t _fa);
|
||||
|
||||
// speculative decoding Kernel
|
||||
std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
|
||||
const paddle::Tensor& input_ids,
|
||||
@@ -1231,8 +1228,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
|
||||
m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");
|
||||
|
||||
m.def("clear_ipc_handles", &clear_ipc_handles, "clear_ipc_handles");
|
||||
|
||||
m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");
|
||||
|
||||
m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");
|
||||
|
@@ -122,14 +122,10 @@ void register_graph_buffers(fptr_t _fa,
|
||||
for (int i = 0; i < handles.size(); i++) {
|
||||
bytes.emplace_back(handles[i].begin(), handles[i].end());
|
||||
}
|
||||
bytes.reserve(handles.size());
|
||||
fa->register_graph_buffers(bytes, offsets);
|
||||
}
|
||||
|
||||
void clear_ipc_handles(fptr_t _fa) {
|
||||
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
|
||||
fa->clear_ipc_handles();
|
||||
}
|
||||
|
||||
std::tuple<fptr_t, paddle::Tensor> allocate_shared_buffer_and_handle(
|
||||
int64_t size) {
|
||||
|
||||
|
@@ -303,7 +303,7 @@ class CustomAllreduce {
|
||||
bool full_nvlink_;
|
||||
|
||||
RankSignals sg_;
|
||||
// Stores an map from a pointer to its peer pointers from all ranks.
|
||||
// Stores an map from a pointer to its peer pointters from all ranks.
|
||||
std::unordered_map<void*, RankData*> buffers_;
|
||||
Signal* self_sg_;
|
||||
|
||||
@@ -517,15 +517,10 @@ class CustomAllreduce {
|
||||
#undef KL
|
||||
}
|
||||
|
||||
void clear_ipc_handles(){
|
||||
~CustomAllreduce() {
|
||||
for (auto [_, ptr] : ipc_handles_) {
|
||||
CUDACHECK(cudaIpcCloseMemHandle(ptr));
|
||||
}
|
||||
ipc_handles_.clear();
|
||||
}
|
||||
|
||||
~CustomAllreduce() {
|
||||
clear_ipc_handles();
|
||||
}
|
||||
};
|
||||
} // namespace paddle
|
||||
|
@@ -39,6 +39,9 @@ void GetOutputTopK(const paddle::Tensor& x,
|
||||
int k,
|
||||
int64_t rank_id,
|
||||
bool wait_flag) {
|
||||
if (rank_id > 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
static struct msgdata msg_rcv;
|
||||
int msg_queue_id = 1;
|
||||
|
@@ -14,8 +14,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda_fp8.h>
|
||||
|
||||
#ifndef PADDLE_WITH_COREX
|
||||
#include "glog/logging.h"
|
||||
#endif
|
||||
@@ -153,34 +151,6 @@ inline int GetGPUComputeCapability(int id) {
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef FP8_E4M3_MAX
|
||||
#define FP8_E4M3_MAX 448.0
|
||||
#endif
|
||||
|
||||
#ifndef DISPATCH_FLOAT_FP6_DTYPE
|
||||
#define DISPATCH_FLOAT_FP6_DTYPE(pd_dtype, c_type, ...) \
|
||||
switch (pd_dtype) { \
|
||||
case phi::DataType::FLOAT32: { \
|
||||
using c_type = float; \
|
||||
__VA_ARGS__ \
|
||||
break; \
|
||||
} \
|
||||
case phi::DataType::BFLOAT16: { \
|
||||
using c_type = phi::dtype::bfloat16; \
|
||||
__VA_ARGS__ \
|
||||
break; \
|
||||
} \
|
||||
case phi::DataType::FLOAT16: { \
|
||||
using c_type = phi::dtype::float16; \
|
||||
__VA_ARGS__ \
|
||||
break; \
|
||||
} \
|
||||
default: { \
|
||||
PD_THROW("Only supported attr of input type in [fp32, fp16, bf16]."); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
inline constexpr uint32_t next_pow_2(uint32_t const num) {
|
||||
if (num <= 1)
|
||||
return num;
|
||||
@@ -223,13 +193,11 @@ public:
|
||||
typedef uint8_t data_t;
|
||||
};
|
||||
|
||||
#ifndef PADDLE_WITH_COREX
|
||||
template <> class PDTraits<paddle::DataType::FLOAT8_E4M3FN> {
|
||||
public:
|
||||
typedef __nv_fp8_e4m3 DataType;
|
||||
typedef paddle::float8_e4m3fn data_t;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <typename T, int Size> struct alignas(sizeof(T) * Size) AlignedVector {
|
||||
T val[Size];
|
||||
@@ -603,28 +571,3 @@ inline bool GetMlaUseTensorcore() {
flags_mla_use_tensorcore && enable_mla_tensorcore;
return mla_use_tensorcore;
}

__device__ __forceinline__ float warpReduceMax(float value) {
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 16));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 8));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 4));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 2));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 1));
return value;
}

__device__ __forceinline__ float blockReduceMax(float value) {
static __shared__ float warpLevelMaxs[WARP_SIZE];
const int laneId = threadIdx.x % WARP_SIZE;
const int warpId = threadIdx.x / WARP_SIZE;

value = warpReduceMax(value);

if (laneId == 0) warpLevelMaxs[warpId] = value;
__syncthreads();

value = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelMaxs[laneId] : 0;
if (warpId == 0) value = warpReduceMax(value);

return value;
}
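For reference, warpReduceMax above is a butterfly reduction: each __shfl_xor_sync exchanges values between lanes whose IDs differ in one bit, so after strides 16, 8, 4, 2, 1 every lane holds the warp maximum. A pure-Python model of that pattern over a hypothetical 32-element "warp" (illustration only):

import random

WARP_SIZE = 32
lanes = [random.random() for _ in range(WARP_SIZE)]   # one value per lane

values = lanes[:]
for stride in (16, 8, 4, 2, 1):
    # __shfl_xor_sync(0xffffffff, value, stride): lane i sees lane i ^ stride.
    values = [max(values[i], values[i ^ stride]) for i in range(WARP_SIZE)]

assert all(v == max(lanes) for v in values)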
@@ -18,6 +18,7 @@
|
||||
#include "iomanip"
|
||||
#include <nvml.h>
|
||||
#include <iostream>
|
||||
#include <nvml.h>
|
||||
// #define PRINT_GPU_MEMORY
|
||||
// 函数用于获取 NVIDIA GPU 显存信息
|
||||
bool getNvidiaGPUMemoryUsage(int callLine) {
|
||||
|
@@ -33,11 +33,6 @@
|
||||
__VA_ARGS__ \
|
||||
break; \
|
||||
} \
|
||||
case 3: { \
|
||||
constexpr size_t NUM_EXPERTS_PER_RANK = 3; \
|
||||
__VA_ARGS__ \
|
||||
break; \
|
||||
} \
|
||||
case 6: { \
|
||||
constexpr size_t NUM_EXPERTS_PER_RANK = 6; \
|
||||
__VA_ARGS__ \
|
||||
|
@@ -26,7 +26,6 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
|
||||
int n_group,
|
||||
int topk_group,
|
||||
int topk,
|
||||
bool renormalize,
|
||||
float routed_scaling_factor) {
|
||||
auto input_shape = scores_with_bias.shape();
|
||||
PD_CHECK(input_shape.size() == 2);
|
||||
@@ -49,7 +48,6 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
|
||||
n_group,
|
||||
topk_group,
|
||||
topk,
|
||||
renormalize,
|
||||
routed_scaling_factor,
|
||||
stream);
|
||||
|
||||
@@ -78,7 +76,6 @@ PD_BUILD_STATIC_OP(noaux_tc)
|
||||
.Attrs({"n_group: int",
|
||||
"topk_group: int",
|
||||
"topk:int",
|
||||
"renormalize: bool",
|
||||
"routed_scaling_factor: float"})
|
||||
.SetKernelFn(PD_KERNEL(NoauxTc))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(NoauxTcInferShape))
|
||||
|
@@ -25,23 +25,6 @@ constexpr unsigned FULL_WARP_MASK = 0xffffffff;
|
||||
constexpr int32_t BLOCK_SIZE = 512;
|
||||
constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
|
||||
|
||||
template <typename T_OUT, typename T_IN>
|
||||
__device__ inline T_OUT cuda_cast(T_IN val) {
|
||||
return val;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
|
||||
return __bfloat162float(val);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T neg_inf() {
|
||||
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
|
||||
// so we need to cast from fp32
|
||||
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
|
||||
}
|
||||
|
||||
namespace warp_topk {
|
||||
|
||||
template <int size, typename T>
|
||||
@@ -58,21 +41,10 @@ constexpr __host__ __device__ bool isPowerOf2(T v) {
|
||||
}
|
||||
|
||||
template <bool greater, typename T>
|
||||
__forceinline__ __device__ bool is_better_than(T val, T baseline) {
|
||||
__device__ bool is_better_than(T val, T baseline) {
|
||||
return (val > baseline && greater) || (val < baseline && !greater);
|
||||
}
|
||||
|
||||
template <bool greater, typename T, typename idxT>
|
||||
__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
|
||||
idxT baseline_index) {
|
||||
bool res = (val > baseline && greater) || (val < baseline && !greater);
|
||||
if (val == baseline) {
|
||||
res = (index < baseline_index && greater) ||
|
||||
(index < baseline_index && !greater);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T, typename idxT>
|
||||
int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
|
||||
int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k;
|
||||
@@ -81,8 +53,7 @@ int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
|
||||
round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT));
|
||||
}
|
||||
|
||||
template <int size, bool ascending, bool reverse, typename T, typename idxT,
|
||||
bool is_stable>
|
||||
template <int size, bool ascending, typename T, typename idxT>
|
||||
struct BitonicMerge {
|
||||
// input should be a bitonic sequence, and sort it to be a monotonic sequence
|
||||
__device__ static void merge(T* __restrict__ val_arr,
|
||||
@@ -96,15 +67,7 @@ struct BitonicMerge {
|
||||
int const other_i = i + stride;
|
||||
T& val = val_arr[i];
|
||||
T& other_val = val_arr[other_i];
|
||||
bool is_better;
|
||||
if constexpr (is_stable) {
|
||||
is_better = is_better_than<ascending>(val, other_val, idx_arr[i],
|
||||
idx_arr[other_i]);
|
||||
} else {
|
||||
is_better = is_better_than<ascending>(val, other_val);
|
||||
}
|
||||
|
||||
if (is_better) {
|
||||
if ((val > other_val && ascending) || (val < other_val && !ascending)) {
|
||||
T tmp = val;
|
||||
val = other_val;
|
||||
other_val = tmp;
|
||||
@@ -115,14 +78,13 @@ struct BitonicMerge {
|
||||
}
|
||||
}
|
||||
|
||||
BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
|
||||
val_arr, idx_arr);
|
||||
BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
|
||||
val_arr + arr_len / 2, idx_arr + arr_len / 2);
|
||||
BitonicMerge<size / 2, ascending, T, idxT>::merge(val_arr, idx_arr);
|
||||
BitonicMerge<size / 2, ascending, T, idxT>::merge(val_arr + arr_len / 2,
|
||||
idx_arr + arr_len / 2);
|
||||
}
|
||||
};
|
||||
|
||||
template <int size, bool ascending, typename T, typename idxT, bool is_stable>
|
||||
template <int size, bool ascending, typename T, typename idxT>
|
||||
struct BitonicSort {
|
||||
__device__ static void sort(T* __restrict__ val_arr,
|
||||
idxT* __restrict__ idx_arr) {
|
||||
@@ -130,16 +92,15 @@ struct BitonicSort {
|
||||
static_assert(size >= 2 * WARP_SIZE);
|
||||
constexpr int arr_len = size / WARP_SIZE;
|
||||
|
||||
BitonicSort<size / 2, true, T, idxT, is_stable>::sort(val_arr, idx_arr);
|
||||
BitonicSort<size / 2, false, T, idxT, is_stable>::sort(
|
||||
val_arr + arr_len / 2, idx_arr + arr_len / 2);
|
||||
BitonicMerge<size, ascending, ascending, T, idxT, is_stable>::merge(
|
||||
val_arr, idx_arr);
|
||||
BitonicSort<size / 2, true, T, idxT>::sort(val_arr, idx_arr);
|
||||
BitonicSort<size / 2, false, T, idxT>::sort(val_arr + arr_len / 2,
|
||||
idx_arr + arr_len / 2);
|
||||
BitonicMerge<size, ascending, T, idxT>::merge(val_arr, idx_arr);
|
||||
}
|
||||
};
|
||||
|
||||
template <bool ascending, typename T, typename idxT, bool is_stable>
|
||||
struct BitonicSort<32, ascending, T, idxT, is_stable> {
|
||||
template <bool ascending, typename T, typename idxT>
|
||||
struct BitonicSort<32, ascending, T, idxT> {
|
||||
__device__ static void sort(T* __restrict__ val_arr,
|
||||
idxT* __restrict__ idx_arr) {
|
||||
int const lane = threadIdx.x % WARP_SIZE;
|
||||
@@ -153,37 +114,19 @@ struct BitonicSort<32, ascending, T, idxT, is_stable> {
|
||||
|
||||
T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride);
|
||||
idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride);
|
||||
|
||||
bool is_better;
|
||||
if constexpr (is_stable) {
|
||||
if constexpr (ascending) {
|
||||
is_better = ((*val_arr > other) ||
|
||||
((*val_arr == other) && (*idx_arr < other_idx))) !=
|
||||
(reverse != is_second);
|
||||
} else {
|
||||
is_better = ((*val_arr > other) ||
|
||||
((*val_arr == other) && (*idx_arr > other_idx))) !=
|
||||
(reverse != is_second);
|
||||
}
|
||||
} else {
|
||||
is_better = (*val_arr != other &&
|
||||
(*val_arr > other) != (reverse != is_second));
|
||||
}
|
||||
if (is_better) {
|
||||
if (*val_arr != other && (*val_arr > other) != (reverse != is_second)) {
|
||||
*val_arr = other;
|
||||
*idx_arr = other_idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr,
|
||||
idx_arr);
|
||||
BitonicMerge<32, ascending, T, idxT>::merge(val_arr, idx_arr);
|
||||
}
|
||||
};
|
||||
|
||||
template <bool ascending, bool reverse, typename T, typename idxT,
|
||||
bool is_stable>
|
||||
struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
|
||||
template <bool ascending, typename T, typename idxT>
|
||||
struct BitonicMerge<32, ascending, T, idxT> {
|
||||
__device__ static void merge(T* __restrict__ val_arr,
|
||||
idxT* __restrict__ idx_arr) {
|
||||
int const lane = threadIdx.x % WARP_SIZE;
|
||||
@@ -193,24 +136,7 @@ struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
|
||||
T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride);
|
||||
idxT& idx = *idx_arr;
|
||||
idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride);
|
||||
|
||||
bool is_better;
|
||||
if constexpr (is_stable) {
|
||||
if constexpr (ascending) {
|
||||
is_better = ((*val_arr > other) ||
|
||||
((*val_arr == other) && (*idx_arr < other_idx))) ==
|
||||
(reverse != is_second); // for min
|
||||
} else {
|
||||
is_better = ((*val_arr > other) ||
|
||||
((*val_arr == other) && (*idx_arr > other_idx))) ==
|
||||
(reverse != is_second); // for max
|
||||
}
|
||||
} else {
|
||||
is_better =
|
||||
(val != other && ((val > other) == (ascending != is_second)));
|
||||
}
|
||||
|
||||
if (is_better) {
|
||||
if (val != other && ((val > other) == (ascending != is_second))) {
|
||||
val = other;
|
||||
idx = other_idx;
|
||||
}
|
||||
@@ -218,42 +144,34 @@ struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
|
||||
}
|
||||
};
|
||||
|
||||
template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
|
||||
template <int capacity, bool greater, typename T, typename idxT>
|
||||
class WarpSort {
|
||||
public:
|
||||
public:
|
||||
__device__ WarpSort(idxT k, T dummy)
|
||||
: lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) {
|
||||
static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity));
|
||||
|
||||
for (int i = 0; i < max_arr_len_; ++i) {
|
||||
val_arr_[i] = dummy_;
|
||||
idx_arr_[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// load and merge k sorted values
|
||||
__device__ void load_sorted(T const* __restrict__ in,
|
||||
idxT const* __restrict__ in_idx, idxT start) {
|
||||
idxT const* __restrict__ in_idx,
|
||||
idxT start) {
|
||||
idxT idx = start + WARP_SIZE - 1 - lane_;
|
||||
for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) {
|
||||
if (idx < start + k_) {
|
||||
T t = in[idx];
|
||||
bool is_better;
|
||||
if constexpr (is_stable) {
|
||||
is_better =
|
||||
is_better_than<greater>(t, val_arr_[i], in_idx[idx], idx_arr_[i]);
|
||||
} else {
|
||||
is_better = is_better_than<greater>(t, val_arr_[i]);
|
||||
}
|
||||
if (is_better) {
|
||||
if (is_better_than<greater>(t, val_arr_[i])) {
|
||||
val_arr_[i] = t;
|
||||
idx_arr_[i] = in_idx[idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
|
||||
val_arr_, idx_arr_);
|
||||
BitonicMerge<capacity, !greater, T, idxT>::merge(val_arr_, idx_arr_);
|
||||
}
|
||||
|
||||
__device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const {
|
||||
@@ -275,7 +193,7 @@ class WarpSort {
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
protected:
|
||||
static constexpr int max_arr_len_ = capacity / WARP_SIZE;
|
||||
|
||||
T val_arr_[max_arr_len_];
|
||||
@@ -287,11 +205,11 @@ class WarpSort {
|
||||
|
||||
}; // end class WarpSort
|
||||
|
||||
template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
|
||||
class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
|
||||
public:
|
||||
template <int capacity, bool greater, typename T, typename idxT>
|
||||
class WarpSelect : public WarpSort<capacity, greater, T, idxT> {
|
||||
public:
|
||||
__device__ WarpSelect(idxT k, T dummy)
|
||||
: WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
|
||||
: WarpSort<capacity, greater, T, idxT>(k, dummy),
|
||||
k_th_(dummy),
|
||||
k_th_lane_((k - 1) % WARP_SIZE) {
|
||||
extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[];
|
||||
@@ -316,13 +234,7 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
|
||||
}
|
||||
|
||||
__device__ void add(T val, idxT idx) {
|
||||
bool do_add;
|
||||
if constexpr (is_stable) {
|
||||
do_add = is_better_than<greater>(val, k_th_, idx, k_th_idx_);
|
||||
} else {
|
||||
do_add = is_better_than<greater>(val, k_th_);
|
||||
}
|
||||
|
||||
bool do_add = is_better_than<greater>(val, k_th_);
|
||||
uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add);
|
||||
if (mask == 0) {
|
||||
return;
|
||||
@@ -359,52 +271,37 @@ class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
__device__ void set_k_th_() {
|
||||
k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_);
|
||||
if constexpr (is_stable) {
|
||||
k_th_idx_ =
|
||||
__shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void merge_buf_(T val, idxT idx) {
|
||||
BitonicSort<WARP_SIZE, greater, T, idxT, is_stable>::sort(&val, &idx);
|
||||
BitonicSort<WARP_SIZE, greater, T, idxT>::sort(&val, &idx);
|
||||
|
||||
T& old = val_arr_[max_arr_len_ - 1];
|
||||
|
||||
bool is_better;
|
||||
if constexpr (is_stable) {
|
||||
is_better =
|
||||
is_better_than<greater>(val, old, idx, idx_arr_[max_arr_len_ - 1]);
|
||||
} else {
|
||||
is_better = is_better_than<greater>(val, old);
|
||||
}
|
||||
|
||||
if (is_better) {
|
||||
if (is_better_than<greater>(val, old)) {
|
||||
old = val;
|
||||
idx_arr_[max_arr_len_ - 1] = idx;
|
||||
}
|
||||
|
||||
BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
|
||||
val_arr_, idx_arr_);
|
||||
BitonicMerge<capacity, !greater, T, idxT>::merge(val_arr_, idx_arr_);
|
||||
|
||||
set_k_th_();
|
||||
}
|
||||
|
||||
using WarpSort<capacity, greater, T, idxT, is_stable>::max_arr_len_;
|
||||
using WarpSort<capacity, greater, T, idxT, is_stable>::val_arr_;
|
||||
using WarpSort<capacity, greater, T, idxT, is_stable>::idx_arr_;
|
||||
using WarpSort<capacity, greater, T, idxT, is_stable>::lane_;
|
||||
using WarpSort<capacity, greater, T, idxT, is_stable>::k_;
|
||||
using WarpSort<capacity, greater, T, idxT, is_stable>::dummy_;
|
||||
using WarpSort<capacity, greater, T, idxT>::max_arr_len_;
|
||||
using WarpSort<capacity, greater, T, idxT>::val_arr_;
|
||||
using WarpSort<capacity, greater, T, idxT>::idx_arr_;
|
||||
using WarpSort<capacity, greater, T, idxT>::lane_;
|
||||
using WarpSort<capacity, greater, T, idxT>::k_;
|
||||
using WarpSort<capacity, greater, T, idxT>::dummy_;
|
||||
|
||||
T* val_smem_;
|
||||
idxT* idx_smem_;
|
||||
int smem_buf_len_ = 0;
|
||||
|
||||
T k_th_;
|
||||
idxT k_th_idx_;
|
||||
int const k_th_lane_;
|
||||
}; // end class WarpSelect
|
||||
} // namespace warp_topk
|
||||
@@ -416,8 +313,8 @@ __device__ void topk_with_k2(T* output,
|
||||
int32_t const lane_id,
|
||||
int const num_experts_per_group) {
|
||||
// Get the top2 per thread
|
||||
T largest = neg_inf<T>();
|
||||
T second_largest = neg_inf<T>();
|
||||
T largest = cuda::std::numeric_limits<T>::min();
|
||||
T second_largest = cuda::std::numeric_limits<T>::min();
|
||||
|
||||
if (num_experts_per_group > WARP_SIZE) {
|
||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||
@@ -471,14 +368,8 @@ __global__ void topk_with_k2_kernel(T* output,
|
||||
cg::thread_block block = cg::this_thread_block();
|
||||
cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;");
|
||||
#endif
|
||||
topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
|
||||
}
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT>
|
||||
@@ -494,7 +385,6 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
int64_t const topk,
|
||||
int64_t const num_experts,
|
||||
int64_t const num_experts_per_group,
|
||||
bool const renormalize,
|
||||
double routed_scaling_factor) {
|
||||
int32_t warp_id = threadIdx.x / WARP_SIZE;
|
||||
int32_t lane_id = threadIdx.x % WARP_SIZE;
|
||||
@@ -513,29 +403,19 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
|
||||
extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to
|
||||
// store the target topk idx
|
||||
int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
|
||||
int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf) + warp_id * topk;
|
||||
T* s_topk_value =
|
||||
reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
|
||||
warp_id * topk;
|
||||
s_topk_idx += warp_id * topk;
|
||||
|
||||
T value = neg_inf<T>();
|
||||
T topk_group_value = neg_inf<T>();
|
||||
T value = cuda::std::numeric_limits<T>::min();
|
||||
T topk_group_value = cuda::std::numeric_limits<T>::min();
|
||||
int32_t num_equalto_topkth_group;
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before
|
||||
// acqbulk because it's ptr arithmetic
|
||||
#endif
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
if ((n_group > topk_group) && (case_id < num_tokens)) {
|
||||
// calculate group_idx
|
||||
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
|
||||
if (lane_id < n_group &&
|
||||
(isfinite(cuda_cast<float, T>(
|
||||
group_scores[lane_id])))) // The check is necessary to avoid
|
||||
// abnormal input
|
||||
{
|
||||
if (lane_id < n_group) {
|
||||
value = group_scores[lane_id];
|
||||
}
|
||||
|
||||
@@ -546,23 +426,22 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||
if (value == topk_group_value) {
|
||||
value = neg_inf<T>();
|
||||
value = cuda::std::numeric_limits<T>::min();
|
||||
}
|
||||
pre_count_equal_to_top_value = count_equal_to_top_value;
|
||||
count_equal_to_top_value = __popc(__ballot_sync(
|
||||
FULL_WARP_MASK, (value == neg_inf<T>())));
|
||||
FULL_WARP_MASK, (value == cuda::std::numeric_limits<T>::min())));
|
||||
}
|
||||
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||
/* is_stable */ true>
|
||||
queue((int32_t)topk, neg_inf<T>());
|
||||
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t>
|
||||
queue((int32_t)topk, cuda::std::numeric_limits<T>::min());
|
||||
|
||||
int count_equalto_topkth_group = 0;
|
||||
bool if_proceed_next_topk = (topk_group_value != neg_inf<T>());
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
bool if_proceed_next_topk = (topk_group_value != cuda::std::numeric_limits<T>::min());
|
||||
if (case_id < num_tokens) {
|
||||
for (int i_group = 0; i_group < n_group; i_group++) {
|
||||
if ((group_scores[i_group] > topk_group_value) ||
|
||||
((group_scores[i_group] == topk_group_value) &&
|
||||
@@ -570,11 +449,9 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
int32_t offset = i_group * num_experts_per_group;
|
||||
for (int32_t i = lane_id; i < align_num_experts_per_group;
|
||||
i += WARP_SIZE) {
|
||||
T candidates =
|
||||
(i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
|
||||
scores_with_bias[offset + i]))
|
||||
? scores_with_bias[offset + i]
|
||||
: neg_inf<T>();
|
||||
T candidates = i < num_experts_per_group
|
||||
? scores_with_bias[offset + i]
|
||||
: cuda::std::numeric_limits<T>::min();
|
||||
queue.add(candidates, offset + i);
|
||||
}
|
||||
if (group_scores[i_group] == topk_group_value) {
|
||||
@@ -592,7 +469,7 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
// Load the valid score value
|
||||
// Calculate the summation
|
||||
float topk_sum = 1e-20;
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
if (case_id < num_tokens) {
|
||||
for (int i = lane_id;
|
||||
i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
|
||||
i += WARP_SIZE) {
|
||||
@@ -601,45 +478,33 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (i < topk) {
|
||||
s_topk_value[i] = value;
|
||||
}
|
||||
topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
topk_sum += reduce(tile, value, cg::plus<float>());
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
if (case_id < num_tokens) {
|
||||
for (int i = lane_id; i < num_experts; i += WARP_SIZE) {
|
||||
scores[i] = 0;
|
||||
}
|
||||
}
|
||||
__syncwarp();
|
||||
__threadfence();
|
||||
__syncthreads();
|
||||
|
||||
if (case_id < num_tokens) {
|
||||
if (if_proceed_next_topk) {
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
float value;
|
||||
if (renormalize) {
|
||||
value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
|
||||
routed_scaling_factor;
|
||||
} else {
|
||||
value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
|
||||
}
|
||||
scores[s_topk_idx[i]] = value;
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
float value = s_topk_value[i] / topk_sum * routed_scaling_factor;
|
||||
scores[s_topk_idx[i]] = value;
|
||||
if (if_proceed_next_topk) {
|
||||
topk_indices[i] = s_topk_idx[i];
|
||||
topk_values[i] = cuda_cast<T, float>(value);
|
||||
topk_values[i] = static_cast<T>(value);
|
||||
}
|
||||
} else {
|
||||
for (int i = lane_id; i < topk; i += WARP_SIZE) {
|
||||
else {
|
||||
topk_indices[i] = i;
|
||||
topk_values[i] = cuda_cast<T, float>(1.0f / topk);
|
||||
topk_values[i] = static_cast<float>(1.0f / topk);
|
||||
}
|
||||
}
|
||||
// Note: when if_proceed_next_topk==false, choose the first 8 experts as the
|
||||
// default result.
|
||||
}
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
asm volatile("griddepcontrol.launch_dependents;");
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, typename IdxT>
|
||||
@@ -653,24 +518,17 @@ void invokeNoAuxTc(T* scores,
|
||||
int64_t const n_group,
|
||||
int64_t const topk_group,
|
||||
int64_t const topk,
|
||||
bool const renormalize,
|
||||
double const routed_scaling_factor,
|
||||
cudaStream_t const stream) {
|
||||
int64_t num_cases = num_tokens * n_group;
|
||||
int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
auto* kernel_instance1 = &topk_with_k2_kernel<T>;
|
||||
cudaLaunchConfig_t config;
|
||||
config.gridDim = topk_with_k2_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = 0;
|
||||
config.stream = stream;
|
||||
cudaLaunchAttribute attrs[1];
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = false;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
|
||||
num_tokens, num_cases, n_group, num_experts / n_group);
|
||||
topk_with_k2_kernel<T><<<topk_with_k2_num_blocks, BLOCK_SIZE, 0, stream>>>(
|
||||
group_scores,
|
||||
scores_with_bias,
|
||||
num_tokens,
|
||||
num_cases,
|
||||
n_group,
|
||||
num_experts / n_group);
|
||||
|
||||
int64_t topk_with_k_group_num_blocks =
|
||||
(num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
|
||||
@@ -678,19 +536,21 @@ void invokeNoAuxTc(T* scores,
|
||||
warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
|
||||
topk);
|
||||
|
||||
auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
|
||||
config.gridDim = topk_with_k_group_num_blocks;
|
||||
config.blockDim = BLOCK_SIZE;
|
||||
config.dynamicSmemBytes = dynamic_smem_in_bytes;
|
||||
config.stream = stream;
|
||||
attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
|
||||
attrs[0].val.programmaticStreamSerializationAllowed = false;
|
||||
config.numAttrs = 1;
|
||||
config.attrs = attrs;
|
||||
cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
|
||||
topk_values, topk_indices, scores_with_bias, num_tokens,
|
||||
n_group, topk_group, topk, num_experts,
|
||||
num_experts / n_group, renormalize, routed_scaling_factor);
|
||||
group_idx_and_topk_idx_kernel<T><<<topk_with_k_group_num_blocks,
|
||||
BLOCK_SIZE,
|
||||
dynamic_smem_in_bytes,
|
||||
stream>>>(scores,
|
||||
group_scores,
|
||||
topk_values,
|
||||
topk_indices,
|
||||
scores_with_bias,
|
||||
num_tokens,
|
||||
n_group,
|
||||
topk_group,
|
||||
topk,
|
||||
num_experts,
|
||||
num_experts / n_group,
|
||||
routed_scaling_factor);
|
||||
}
|
||||
|
||||
#define INSTANTIATE_NOAUX_TC(T, IdxT) \
|
||||
@@ -704,7 +564,6 @@ void invokeNoAuxTc(T* scores,
|
||||
int64_t const n_group, \
|
||||
int64_t const topk_group, \
|
||||
int64_t const topk, \
|
||||
bool const renormalize, \
|
||||
double const routed_scaling_factor, \
|
||||
cudaStream_t const stream);
|
||||
|
||||
|
@@ -3,158 +3,6 @@
|
||||
|
||||
#include "quantization/common.cuh"
|
||||
|
||||
// adapted from: https://github.com/sgl-project/sglang/blob/v0.5.2rc2/sgl-kernel/csrc/gemm/per_token_quant_fp8.cu
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 1. Warp‑local, no shared memory
|
||||
// • One warp handles one token.
|
||||
// • Eight tokens per 256‑thread CTA.
|
||||
// ---------------------------------------------------------------------------
|
||||
template <typename T, typename DST_DTYPE, int kTokensPerCTA = 8, int kVecSize = 16>
|
||||
__global__ void per_token_quant_fp8_kernel(
|
||||
const T* __restrict__ input,
|
||||
DST_DTYPE* __restrict__ output_q,
|
||||
float* __restrict__ output_s,
|
||||
const float scale_ub,
|
||||
const int64_t hidden_size,
|
||||
const int64_t num_tokens) {
|
||||
const int warp_id = threadIdx.x / WARP_SIZE; // 0‑7 (8 warps)
|
||||
const int lane_id = threadIdx.x & (WARP_SIZE - 1); // 0‑31
|
||||
const int token_id = blockIdx.x * kTokensPerCTA + warp_id;
|
||||
if (token_id >= num_tokens) return;
|
||||
|
||||
// Global tensors for this token
|
||||
const T* token_input = input + token_id * hidden_size;
|
||||
DST_DTYPE* token_output = output_q + token_id * hidden_size;
|
||||
float* token_scale = output_s + token_id;
|
||||
|
||||
//
|
||||
// Pass-1: Perform a warp reduce to find the max_value of a token's hidden_size
|
||||
//
|
||||
float max_value = 0.f;
|
||||
using vec_t = AlignedVector<T, kVecSize>;
|
||||
const int32_t num_vec_elems = hidden_size / kVecSize;
|
||||
|
||||
for (int32_t i = lane_id; i < num_vec_elems; i += WARP_SIZE) {
|
||||
vec_t input_vec;
|
||||
Load(token_input + i * kVecSize, &input_vec);
|
||||
|
||||
#pragma unroll
|
||||
for (uint32_t j = 0; j < kVecSize; ++j) {
|
||||
max_value = fmaxf(max_value, fabsf(static_cast<float>(input_vec[j])));
|
||||
}
|
||||
}
|
||||
|
||||
float warp_max = warpReduceMax(max_value);
|
||||
if (scale_ub > 0){
|
||||
warp_max = fminf(warp_max, scale_ub);
|
||||
}
|
||||
float scale;
|
||||
scale = warp_max / FP8_E4M3_MAX;
|
||||
// Broadcast scale
|
||||
if (lane_id == 0) {
|
||||
token_scale[0] = scale;
|
||||
}
|
||||
float scale_inv = (scale == 0.f) ? 0.f : 1.0f / scale;
|
||||
|
||||
//
|
||||
// Pass-2: quantize and write back
|
||||
//
|
||||
for (int i = lane_id; i < num_vec_elems; i += WARP_SIZE) {
|
||||
vec_t input_vec;
|
||||
Load(token_input + i * kVecSize, &input_vec);
|
||||
DST_DTYPE output_arr[kVecSize];
|
||||
#pragma unroll
|
||||
for (uint32_t j = 0; j < kVecSize; ++j) {
|
||||
float val = static_cast<float>(input_vec[j]) * scale_inv;
|
||||
val = fmaxf(fminf(val, FP8_E4M3_MAX), -FP8_E4M3_MAX);
|
||||
output_arr[j] = static_cast<DST_DTYPE>(val);
|
||||
}
|
||||
if constexpr (kVecSize == 16) {
|
||||
*(uint4*)(token_output + i * kVecSize) = *(uint4*)output_arr;
|
||||
} else {
|
||||
// Use element-wise copy for vector size 8 to ensure correctness
|
||||
for (int k = 0; k < kVecSize; ++k) {
|
||||
token_output[i * kVecSize + k] = output_arr[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// 2. Baseline kernel (1 token / CTA, CUB block reduce)
|
||||
// ---------------------------------------------------------------------------
|
||||
template <typename T, typename DST_DTYPE, int kVecSize = 16>
|
||||
__global__ void per_token_quant_fp8_small_batch_kernel(
|
||||
const T* __restrict__ input,
|
||||
DST_DTYPE* __restrict__ output_q,
|
||||
float* __restrict__ output_s,
|
||||
const float scale_ub,
|
||||
const int64_t hidden_size,
|
||||
const int64_t num_tokens) {
|
||||
const int token_idx = blockIdx.x;
|
||||
if (token_idx >= num_tokens) return;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int block_dim = blockDim.x;
|
||||
|
||||
const T* token_input = input + token_idx * hidden_size;
|
||||
DST_DTYPE* token_output = output_q + token_idx * hidden_size;
|
||||
|
||||
float max_value = 0.0f;
|
||||
|
||||
// Use template parameter for vector size
|
||||
using vec_t = AlignedVector<T, kVecSize>;
|
||||
const int32_t num_vec_elems = hidden_size / kVecSize;
|
||||
|
||||
// Find max using vectorized loads
|
||||
for (int32_t i = tid; i < num_vec_elems; i += block_dim) {
|
||||
vec_t input_vec;
|
||||
Load(token_input + i * kVecSize, &input_vec);
|
||||
|
||||
#pragma unroll
|
||||
for (uint32_t j = 0; j < kVecSize; ++j) {
|
||||
float val = static_cast<float>(input_vec[j]);
|
||||
max_value = fmaxf(max_value, fabsf(val));
|
||||
}
|
||||
}
|
||||
|
||||
max_value = blockReduceMax(max_value);
|
||||
if (scale_ub > 0){
|
||||
max_value = fminf(max_value, scale_ub);
|
||||
}
|
||||
__shared__ float scale;
|
||||
if (tid == 0) {
|
||||
scale = max_value / FP8_E4M3_MAX;
|
||||
output_s[token_idx] = scale;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const float scale_inv = 1.0f / scale;
|
||||
|
||||
// Quantize using vectorized loads
|
||||
for (int32_t i = tid; i < num_vec_elems; i += block_dim) {
|
||||
vec_t input_vec;
|
||||
Load(token_input + i * kVecSize, &input_vec);
|
||||
|
||||
DST_DTYPE output_arr[kVecSize];
|
||||
#pragma unroll
|
||||
for (uint32_t j = 0; j < kVecSize; ++j) {
|
||||
float val = fmaxf(fminf(static_cast<float>(input_vec[j]) * scale_inv, FP8_E4M3_MAX), -FP8_E4M3_MAX);
|
||||
output_arr[j] = static_cast<DST_DTYPE>(val);
|
||||
}
|
||||
|
||||
if constexpr (kVecSize == 16) {
|
||||
*(uint4*)(token_output + i * kVecSize) = *(uint4*)output_arr;
|
||||
} else {
|
||||
// Use element-wise copy for vector size 8 to ensure correctness
|
||||
for (int k = 0; k < kVecSize; ++k) {
|
||||
token_output[i * kVecSize + k] = output_arr[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
template <typename scalar_t, typename fp8_type>
|
||||
@@ -331,78 +179,39 @@ void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out, // [..., d]
|
||||
auto rank = input.dims().size();
|
||||
int const hidden_size = input.dims()[rank - 1];
|
||||
int const num_tokens = input.numel() / hidden_size;
|
||||
cudaStream_t stream = input.stream();
|
||||
|
||||
if (hidden_size % 8 == 0){
|
||||
int device = 0;
|
||||
cudaGetDevice(&device);
|
||||
int sm_count = 0;
|
||||
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
|
||||
const int TOKENS_PER_CTA = 8;
|
||||
const bool use_warp_kernel = (num_tokens >= sm_count * 2 * TOKENS_PER_CTA);
|
||||
const bool use_vec16 = (hidden_size % 16 == 0);
|
||||
DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), scalar_t, {
|
||||
if (use_warp_kernel) {
|
||||
// -------- warp‑local ---------------------------------------------------
|
||||
constexpr int THREADS = TOKENS_PER_CTA * WARP_SIZE; // 256
|
||||
dim3 grid((num_tokens + TOKENS_PER_CTA - 1) / TOKENS_PER_CTA);
|
||||
dim3 block(THREADS);
|
||||
|
||||
if (use_vec16) {
|
||||
per_token_quant_fp8_kernel<scalar_t, __nv_fp8_e4m3, TOKENS_PER_CTA, 16><<<grid, block, 0, stream>>>(
|
||||
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
|
||||
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
|
||||
reinterpret_cast<float*>(scales.data<float>()),
|
||||
scale_ub,
|
||||
hidden_size,
|
||||
num_tokens);
|
||||
} else {
|
||||
per_token_quant_fp8_kernel<scalar_t, __nv_fp8_e4m3, TOKENS_PER_CTA, 8><<<grid, block, 0, stream>>>(
|
||||
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
|
||||
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
|
||||
reinterpret_cast<float*>(scales.data<float>()),
|
||||
scale_ub,
|
||||
hidden_size,
|
||||
num_tokens);
|
||||
}
|
||||
} else {
|
||||
// -------- baseline -----------------------------------------------------
|
||||
constexpr int THREADS = 256;
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(THREADS);
|
||||
|
||||
if (use_vec16) {
|
||||
per_token_quant_fp8_small_batch_kernel<scalar_t, __nv_fp8_e4m3, 16><<<grid, block, 0, stream>>>(
|
||||
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
|
||||
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
|
||||
reinterpret_cast<float*>(scales.data<float>()),
|
||||
scale_ub,
|
||||
hidden_size,
|
||||
num_tokens);
|
||||
} else {
|
||||
per_token_quant_fp8_small_batch_kernel<scalar_t, __nv_fp8_e4m3, 8><<<grid, block, 0, stream>>>(
|
||||
reinterpret_cast<const scalar_t*>(input.data<scalar_t>()),
|
||||
reinterpret_cast<__nv_fp8_e4m3*>(out.data<fp8_t>()),
|
||||
reinterpret_cast<float*>(scales.data<float>()),
|
||||
scale_ub,
|
||||
hidden_size,
|
||||
num_tokens);
|
||||
}
|
||||
}
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
dim3 const grid(num_tokens);
|
||||
dim3 const block(std::min(hidden_size, 1024));
|
||||
|
||||
DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), scalar_t, {
|
||||
cudaStream_t stream = input.stream();
|
||||
|
||||
switch (input.dtype()) {
|
||||
case paddle::DataType::FLOAT32: {
|
||||
using scalar_t = float;
|
||||
fastdeploy::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
|
||||
<<<grid, block, 0, stream>>>(out.data<fp8_t>(), scales.data<float>(),
|
||||
input.data<scalar_t>(), scale_ub,
|
||||
hidden_size);
|
||||
});
|
||||
|
||||
break;
|
||||
}
|
||||
case paddle::DataType::FLOAT16: {
|
||||
using scalar_t = phi::dtype::float16;
|
||||
fastdeploy::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
|
||||
<<<grid, block, 0, stream>>>(out.data<fp8_t>(), scales.data<float>(),
|
||||
input.data<scalar_t>(), scale_ub,
|
||||
hidden_size);
|
||||
break;
|
||||
}
|
||||
case paddle::DataType::BFLOAT16: {
|
||||
using scalar_t = phi::dtype::bfloat16;
|
||||
fastdeploy::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
|
||||
<<<grid, block, 0, stream>>>(out.data<fp8_t>(), scales.data<float>(),
|
||||
input.data<scalar_t>(), scale_ub,
|
||||
hidden_size);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
PD_THROW("Only supported attr of input type in [fp32, fp16, bf16].");
|
||||
}
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(static_scaled_fp8_quant)
|
||||
|
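The per-token FP8 kernels above follow the usual dynamic-quantization recipe: take the per-token max of |x|, optionally clip it by scale_ub, derive scale = max / FP8_E4M3_MAX, then scale and clamp each element before the fp8 cast. A NumPy sketch of that math (shapes and the scale_ub default are illustrative):

import numpy as np

FP8_E4M3_MAX = 448.0

def per_token_quant_fp8(x, scale_ub=0.0):
    max_abs = np.abs(x).max(axis=-1, keepdims=True)       # per-token max(|x|)
    if scale_ub > 0:
        max_abs = np.minimum(max_abs, scale_ub)
    scale = max_abs / FP8_E4M3_MAX
    inv = np.where(scale == 0.0, 0.0, 1.0 / scale)        # guard against all-zero rows
    q = np.clip(x * inv, -FP8_E4M3_MAX, FP8_E4M3_MAX)     # cast to e4m3 in the kernel
    return q, scale

q, s = per_token_quant_fp8(np.random.randn(4, 128).astype(np.float32))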
@@ -1,71 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include "cuda_multiprocess.h"
|
||||
|
||||
#if !defined(_WIN32)
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#endif
|
||||
|
||||
// 可选:仅删除/解除共享内存命名对象(不依赖之前保存的 addr/fd)
|
||||
static inline int sharedMemoryUnlinkByName(const char* name) {
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
// Windows 上没有 shm_unlink 语义。命名对象在最后一个句柄关闭后消失。
|
||||
// 这里做“尽力而为”:尝试打开后立即关闭,减少一次引用。
|
||||
HANDLE hMap = OpenFileMappingA(FILE_MAP_ALL_ACCESS, FALSE, name);
|
||||
if (hMap) {
|
||||
CloseHandle(hMap);
|
||||
return 0;
|
||||
}
|
||||
// 已经不存在也算成功
|
||||
return 0;
|
||||
#else
|
||||
// POSIX: 移除名字,未来不可再 open;已映射区仍存活直至 munmap
|
||||
if (shm_unlink(name) != 0) {
|
||||
if (errno == ENOENT) return 0; // 不存在视作成功
|
||||
return errno;
|
||||
}
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
void UnsetDataIpc(const paddle::Tensor& tmp_input,
|
||||
const std::string& shm_name,
|
||||
bool close_ipc,
|
||||
bool unlink_shm) {
|
||||
// 1) 关闭消费者导入的 IPC 映射(仅当 close_ipc=true 且该指针确为 OpenMemHandle 得来)
|
||||
if (close_ipc) {
|
||||
void* ptr = const_cast<void*>(tmp_input.data());
|
||||
checkCudaErrors(cudaIpcCloseMemHandle(ptr));
|
||||
}
|
||||
|
||||
// 2) 解除共享内存命名对象(仅处理“名字”,不保证解除旧映射)
|
||||
if (unlink_shm) {
|
||||
int rc = sharedMemoryUnlinkByName(shm_name.c_str());
|
||||
if (rc != 0) {
|
||||
PD_THROW("Unlink shared memory failed: name=%s, err=%d",
|
||||
shm_name.c_str(), rc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(unset_data_ipc)
|
||||
.Inputs({"tmp_input"})
|
||||
.Attrs({"shm_name: std::string", "close_ipc: bool", "unlink_shm: bool"})
|
||||
.SetKernelFn(PD_KERNEL(UnsetDataIpc));
|
@@ -32,8 +32,7 @@ __global__ void update_inputs_kernel_v1(bool *not_need_stop,
                                        const int max_bsz,
                                        const int input_ids_stride,
                                        const int block_num_per_seq,
                                        const int block_size,
                                        bool prefill_one_step_stop) {
                                        const int block_size) {
  int thread_idx = threadIdx.x;
  typedef cub::BlockReduce<int64_t, THREADBLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -55,32 +54,23 @@ __global__ void update_inputs_kernel_v1(bool *not_need_stop,
      seq_lens_encoder[thread_idx] = 0;
    } else {
      if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >= prompt_lens[thread_idx]) {
        if (prefill_one_step_stop) {
          // prefill done, stop
          stop_flags[thread_idx] = true;
          seq_lens_this_time[thread_idx] = 0;
          seq_lens_decoder[thread_idx] = 0;
          seq_lens_encoder[thread_idx] = 0;
          stop_flag_now_int = 1;
        } else {
          // decoding
          seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
          seq_lens_this_time[thread_idx] = 1;
          seq_lens_encoder[thread_idx] = 0;
          int64_t *input_ids_now = input_ids + thread_idx * input_ids_stride;
          input_ids_now[0] = next_tokens[thread_idx];
        // decoding
        seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
        seq_lens_this_time[thread_idx] = 1;
        seq_lens_encoder[thread_idx] = 0;
        int64_t *input_ids_now = input_ids + thread_idx * input_ids_stride;
        input_ids_now[0] = next_tokens[thread_idx];

          // to judge whether block is not enough
          int *block_table_now = block_tables + thread_idx * block_num_per_seq;
          if (seq_lens_this_time[thread_idx] != 0 && block_table_now[seq_lens_decoder[thread_idx] / block_size] == -1) {
            // should be scheduled by server
            is_block_step[thread_idx] = true;
            seq_lens_this_time[thread_idx] = 0;
            stop_flags[thread_idx] = true;
            step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
            seq_lens_decoder[thread_idx] = 0;
            stop_flag_now_int = 1;
          }
        // to judge whether block is not enough
        int *block_table_now = block_tables + thread_idx * block_num_per_seq;
        if (seq_lens_this_time[thread_idx] != 0 && block_table_now[seq_lens_decoder[thread_idx] / block_size] == -1) {
          // should be scheduled by server
          is_block_step[thread_idx] = true;
          seq_lens_this_time[thread_idx] = 0;
          stop_flags[thread_idx] = true;
          step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
          seq_lens_decoder[thread_idx] = 0;
          stop_flag_now_int = 1;
        }
      } else
      {
@@ -120,12 +110,6 @@ void UpdateInputesV1(const paddle::Tensor &stop_flags,
#else
  auto cu_stream = input_ids.stream();
#endif
  bool prefill_one_step_stop = false;
  if (const char *env_p = std::getenv("PREFILL_NODE_ONE_STEP_STOP_V1")) {
    if (env_p[0] == '1') {
      prefill_one_step_stop = true;
    }
  }
  const int max_bsz = stop_flags.shape()[0];
  const int now_bsz = seq_lens_this_time.shape()[0];
  const int input_ids_stride = input_ids.shape()[1];
@@ -149,8 +133,7 @@ void UpdateInputesV1(const paddle::Tensor &stop_flags,
      max_bsz,
      input_ids_stride,
      block_num_per_seq,
      block_size,
      prefill_one_step_stop);
      block_size);
  auto not_need_stop_cpu =
      not_need_stop_gpu.copy_to(not_need_stop.place(), false);
  bool *not_need_stop_data = const_cast<bool *>(not_need_stop.data<bool>());
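// The PREFILL_NODE_ONE_STEP_STOP_V1 check above is the usual pattern for an
// environment-gated boolean: the flag counts as enabled only when the variable is set
// and its first character is '1'. A hedged sketch of that pattern as a reusable helper
// follows; the helper name is illustrative and not part of the diff.
#include <cstdlib>

// Returns true only when the environment variable exists and starts with '1',
// mirroring the getenv check in UpdateInputesV1.
static bool EnvFlagEnabled(const char* var_name) {
  const char* value = std::getenv(var_name);
  return value != nullptr && value[0] == '1';
}

// Usage (illustrative):
//   bool prefill_one_step_stop = EnvFlagEnabled("PREFILL_NODE_ONE_STEP_STOP_V1");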
@@ -23,7 +23,7 @@
template <typename OutputType>
void DisPatchWFp8AFp8Gemm(
    const cutlass::float_e4m3_t* input,
    const uint32_t* sparse_idx,
    const int32_t* sparse_idx,
    const cutlass::float_e4m3_t* weight,
    const int* tokens,
    const float* weight_scale,
@@ -80,7 +80,7 @@ void WFp8AFp8Gemm(
  if (is_bfloat16) {
    DisPatchWFp8AFp8Gemm(
        reinterpret_cast<const cutlass::float_e4m3_t*>(input.data<phi::dtype::float8_e4m3fn>()),
        reinterpret_cast<const uint32_t*>(sparse_idx.data<int32_t>()),
        sparse_idx.data<int32_t>(),
        reinterpret_cast<const cutlass::float_e4m3_t*>(weight.data<phi::dtype::float8_e4m3fn>()),
        tokens.data<int>(),
        weight_scale.data<float>(),
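// The hunks above change the sparse_idx parameter from const uint32_t* to const int32_t*,
// which lets the call site pass the tensor's int32 buffer directly instead of
// reinterpret-casting it. A minimal sketch of that simplification; the function names
// below are placeholders, not the real GEMM entry points.
#include <cstdint>
#include <cstdio>
#include <vector>

// Placeholder for a dispatch routine that, like DisPatchWFp8AFp8Gemm after the change,
// takes the index buffer with its native element type.
void ConsumeSparseIdx(const int32_t* sparse_idx, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) std::printf("%d ", sparse_idx[i]);
  std::printf("\n");
}

int main() {
  std::vector<int32_t> sparse_idx = {0, 2, 5, 7};
  // Before: reinterpret_cast<const uint32_t*>(sparse_idx.data()) was required.
  // After: the typed pointer is passed through unchanged.
  ConsumeSparseIdx(sparse_idx.data(), sparse_idx.size());
  return 0;
}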
@@ -1,376 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include "iluvatar_context.h"
|
||||
|
||||
template <paddle::DataType T>
|
||||
void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv,
|
||||
paddle::Tensor& k_cache,
|
||||
paddle::Tensor& v_cache,
|
||||
const paddle::Tensor& prefill_block_table,
|
||||
const paddle::Tensor& decode_block_table,
|
||||
const paddle::Tensor& cu_seqlens_qkv,
|
||||
const paddle::Tensor& seq_lens,
|
||||
const paddle::optional<paddle::Tensor> &rope_sin,
|
||||
const paddle::optional<paddle::Tensor> &rope_cos,
|
||||
int prefill_num_tokens,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
int block_size,
|
||||
int max_seq_len,
|
||||
float scale,
|
||||
bool causal,
|
||||
bool q_rope,
|
||||
bool k_rope,
|
||||
bool v_rope,
|
||||
int window_left,
|
||||
int window_right,
|
||||
float softcap,
|
||||
bool enable_cuda_graph,
|
||||
bool use_sqrt_alibi,
|
||||
paddle::Tensor& out) {
|
||||
|
||||
typedef PDTraits<T> traits_;
|
||||
typedef typename traits_::data_t data_t;
|
||||
|
||||
const auto& dtype = qkv.dtype();
|
||||
cuinferDataType_t cuinfer_data_type;
|
||||
cudaDataType_t cu_data_type;
|
||||
if (dtype == paddle::DataType::FLOAT16) {
|
||||
cuinfer_data_type = CUINFER_DATA_HALF;
|
||||
cu_data_type = CUDA_R_16F;
|
||||
} else {
|
||||
cuinfer_data_type = CUINFER_DATA_BFLOAT16;
|
||||
cu_data_type = CUDA_R_16BF;
|
||||
}
|
||||
|
||||
const auto& qkv_dims = qkv.dims();
|
||||
const auto& kv_cache_dims = k_cache.dims();
|
||||
const auto& prefill_block_table_dims = prefill_block_table.dims();
|
||||
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
|
||||
|
||||
int prefill_batch_size = prefill_block_table_dims[0];
|
||||
int num_tokens = qkv_dims[0];
|
||||
int decode_num_tokens = num_tokens - prefill_num_tokens;
|
||||
int num_total_heads = num_heads + 2 * num_kv_heads;
|
||||
int max_num_blocks_per_seq = prefill_block_table_dims[1];
|
||||
int qkv_stride = qkv.strides()[0];
|
||||
int num_blocks = kv_cache_dims[0];
|
||||
|
||||
int kv_block_stride = k_cache.strides()[0];
|
||||
int kv_head_stride = k_cache.strides()[1];
|
||||
int block_table_stride = prefill_block_table.strides()[0];
|
||||
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
|
||||
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
|
||||
|
||||
cuinferTensorDescriptor_t qkv_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
qkv_desc,
|
||||
cuinfer_data_type,
|
||||
3,
|
||||
std::vector<int>({prefill_num_tokens, num_total_heads, head_dim}).data(),
|
||||
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t qkv_seqlens_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
qkv_seqlens_desc,
|
||||
CUINFER_DATA_INT32,
|
||||
1,
|
||||
std::vector<int>({prefill_batch_size + 1}).data(),
|
||||
std::vector<int>({1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t block_table_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
block_table_desc,
|
||||
CUINFER_DATA_INT32,
|
||||
2,
|
||||
std::vector<int>({prefill_batch_size, block_table_stride}).data(),
|
||||
std::vector<int>({block_table_stride, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t o_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
o_desc,
|
||||
cuinfer_data_type,
|
||||
3,
|
||||
std::vector<int>({prefill_num_tokens, num_heads, head_dim}).data(),
|
||||
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t k_cache_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
k_cache_desc,
|
||||
cuinfer_data_type,
|
||||
4,
|
||||
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
|
||||
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t v_cache_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
v_cache_desc,
|
||||
cuinfer_data_type,
|
||||
4,
|
||||
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
|
||||
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t cos_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
cos_desc,
|
||||
CUINFER_DATA_FLOAT,
|
||||
2,
|
||||
std::vector<int>({max_seq_len, head_dim}).data(),
|
||||
std::vector<int>({head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t sin_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
sin_desc,
|
||||
CUINFER_DATA_FLOAT,
|
||||
2,
|
||||
std::vector<int>({max_seq_len, head_dim}).data(),
|
||||
std::vector<int>({head_dim, 1}).data()));
|
||||
|
||||
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
|
||||
|
||||
size_t prefill_workspace_size = 0;
|
||||
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(prefill_num_tokens,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope,
|
||||
cuinfer_data_type,
|
||||
cuinfer_data_type,
|
||||
cuinfer_data_type,
|
||||
&prefill_workspace_size));
|
||||
|
||||
auto* allocator = paddle::GetAllocator(qkv.place());
|
||||
|
||||
phi::Allocator::AllocationPtr prefill_tmp_workspace = allocator->Allocate(prefill_workspace_size);
|
||||
void* prefill_workspace_ptr = prefill_tmp_workspace->ptr();
|
||||
|
||||
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
|
||||
qkv_desc,
|
||||
qkv.data(),
|
||||
qkv_seqlens_desc,
|
||||
cu_seqlens_qkv.data<int32_t>(),
|
||||
block_table_desc,
|
||||
prefill_block_table.data<int32_t>(),
|
||||
o_desc,
|
||||
out.data(),
|
||||
k_cache_desc,
|
||||
k_cache.data(),
|
||||
v_cache_desc,
|
||||
v_cache.data(),
|
||||
prefill_workspace_ptr,
|
||||
prefill_workspace_size,
|
||||
cos_desc,
|
||||
rope_cos_ptr,
|
||||
sin_desc,
|
||||
rope_sin_ptr,
|
||||
prefill_batch_size,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
causal,
|
||||
scale,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope));
|
||||
|
||||
size_t decode_workspace_size = 0;
|
||||
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(decode_num_tokens,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
&decode_workspace_size));
|
||||
|
||||
phi::Allocator::AllocationPtr decode_tmp_workspace = allocator->Allocate(decode_workspace_size);
|
||||
void* decode_workspace_ptr = decode_tmp_workspace->ptr();
|
||||
|
||||
void* decode_qkv_ptr = (void*)(qkv.data<data_t>() + prefill_num_tokens * qkv_stride);
|
||||
void* decode_out_ptr = (void*)(out.data<data_t>() + prefill_num_tokens * out.strides()[0]);
|
||||
|
||||
PageAttentionWithKVCacheArguments args{
|
||||
static_cast<float>(scale), 1.0, 1.0, static_cast<float>(softcap), window_left, window_right,
|
||||
causal, use_sqrt_alibi, enable_cuda_graph, false, nullptr, decode_qkv_ptr, decode_qkv_ptr,
|
||||
decode_workspace_ptr, true, rope_sin_ptr, rope_cos_ptr};
|
||||
|
||||
CUINFER_CHECK(cuInferPageAttentionV7(cuinfer_handle,
|
||||
decode_out_ptr,
|
||||
cu_data_type,
|
||||
decode_qkv_ptr,
|
||||
cu_data_type,
|
||||
decode_num_tokens,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
qkv_stride,
|
||||
kv_block_stride,
|
||||
kv_head_stride,
|
||||
k_cache.data(),
|
||||
cu_data_type,
|
||||
v_cache.data(),
|
||||
cu_data_type,
|
||||
block_size,
|
||||
max_num_blocks_per_seq,
|
||||
max_seq_len,
|
||||
decode_block_table.data<int32_t>(),
|
||||
seq_lens.data<int32_t>(),
|
||||
args));
|
||||
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
|
||||
}
|
||||
|
||||
std::vector<paddle::Tensor> MixedFusedPagedAttn(const paddle::Tensor& qkv,
|
||||
paddle::Tensor& k_cache,
|
||||
paddle::Tensor& v_cache,
|
||||
const paddle::Tensor& prefill_block_table,
|
||||
const paddle::Tensor& decode_block_table,
|
||||
const paddle::Tensor& cu_seqlens_qkv,
|
||||
const paddle::Tensor& seq_lens,
|
||||
const paddle::optional<paddle::Tensor> &rope_sin,
|
||||
const paddle::optional<paddle::Tensor> &rope_cos,
|
||||
int prefill_num_tokens,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
int block_size,
|
||||
int max_seq_len,
|
||||
float scale,
|
||||
bool causal,
|
||||
bool q_rope,
|
||||
bool k_rope,
|
||||
bool v_rope,
|
||||
int window_left,
|
||||
int window_right,
|
||||
float softcap,
|
||||
bool enable_cuda_graph,
|
||||
bool use_sqrt_alibi) {
|
||||
const auto dtype = qkv.dtype();
|
||||
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
|
||||
|
||||
switch (dtype) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
MixedFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
|
||||
k_cache,
|
||||
v_cache,
|
||||
prefill_block_table,
|
||||
decode_block_table,
|
||||
cu_seqlens_qkv,
|
||||
seq_lens,
|
||||
rope_sin,
|
||||
rope_cos,
|
||||
prefill_num_tokens,
|
||||
num_heads,
|
||||
head_dim,
|
||||
num_kv_heads,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
scale,
|
||||
causal,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope,
|
||||
window_left,
|
||||
window_right,
|
||||
softcap,
|
||||
enable_cuda_graph,
|
||||
use_sqrt_alibi,
|
||||
out);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
MixedFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
|
||||
k_cache,
|
||||
v_cache,
|
||||
prefill_block_table,
|
||||
decode_block_table,
|
||||
cu_seqlens_qkv,
|
||||
seq_lens,
|
||||
rope_sin,
|
||||
rope_cos,
|
||||
prefill_num_tokens,
|
||||
num_heads,
|
||||
head_dim,
|
||||
num_kv_heads,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
scale,
|
||||
causal,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope,
|
||||
window_left,
|
||||
window_right,
|
||||
softcap,
|
||||
enable_cuda_graph,
|
||||
use_sqrt_alibi,
|
||||
out);
|
||||
break;
|
||||
default:
|
||||
PD_THROW("Unsupported data type for mixed paged attn");
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> MixedFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
|
||||
int num_heads,
|
||||
int head_dim) {
|
||||
return {{qkv_shape[0], num_heads * head_dim}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> MixedFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
|
||||
return {qkv_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(mixed_fused_paged_attn)
|
||||
.Inputs({"qkv", "k_cache", "v_cache", "prefill_block_table", "decode_block_table",
|
||||
"cu_seqlens_qkv", "seq_lens", paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
|
||||
.Outputs({"out"})
|
||||
.Attrs({"prefill_num_tokens:int",
|
||||
"num_heads: int",
|
||||
"head_dim:int",
|
||||
"num_kv_heads:int",
|
||||
"block_size:int",
|
||||
"max_seq_len:int",
|
||||
"scale:float",
|
||||
"causal:bool",
|
||||
"q_rope:bool",
|
||||
"k_rope:bool",
|
||||
"v_rope:bool",
|
||||
"window_left:int",
|
||||
"window_right:int",
|
||||
"softcap:float",
|
||||
"enable_cuda_graph:bool",
|
||||
"use_sqrt_alibi:bool"})
|
||||
.SetKernelFn(PD_KERNEL(MixedFusedPagedAttn))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(MixedFusedPagedAttnInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(MixedFusedPagedAttnInferDtype));
|
@@ -53,7 +53,6 @@ void MoeDispatchKernel(const paddle::Tensor& input,
                       const paddle::optional<paddle::Tensor>& gating_correction_bias,
                       const int moe_topk,
                       const bool group_moe,
                       const std::string &moe_quant_type,
                       const bool topk_only_mode,
                       const int num_rows,
                       const int hidden_size,
@@ -184,7 +183,6 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
    const paddle::optional<paddle::Tensor>& w4a8_in_scale,
    const int moe_topk,
    const bool group_moe,
    const std::string &moe_quant_type,
    const bool topk_only_mode) {
  const auto input_type = input.dtype();
  auto place = input.place();
@@ -222,7 +220,6 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
        gating_correction_bias,
        moe_topk,
        group_moe,
        moe_quant_type,
        topk_only_mode,
        num_rows,
        hidden_size,
@@ -239,7 +236,6 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
        gating_correction_bias,
        moe_topk,
        group_moe,
        moe_quant_type,
        topk_only_mode,
        num_rows,
        hidden_size,
@@ -309,7 +305,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch)
              "top_k_weight",
              "top_k_indices",
              "expert_idx_per_token"})
    .Attrs({"moe_topk:int", "group_moe:bool", "moe_quant_type:std::string", "topk_only_mode:bool"})
    .Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
    .SetKernelFn(PD_KERNEL(MoeExpertDispatch))
    .SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));
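// The .Attrs() change above has to stay in sync with the kernel signature: attribute
// strings bind, in order, to the parameters that follow the tensor inputs of the function
// passed to PD_KERNEL, so dropping "moe_quant_type:std::string" also removes the
// std::string parameter from MoeDispatchKernel/MoeExpertDispatch. A minimal hedged sketch
// of that registration pattern; the op name toy_dispatch and the paddle/extension.h
// include are assumptions for illustration, not part of the diff.
#include <vector>
#include "paddle/extension.h"  // assumed custom-op header

// Attributes declared in .Attrs() below map onto the parameters after the tensor input.
std::vector<paddle::Tensor> ToyDispatch(const paddle::Tensor& x,
                                        const int moe_topk,
                                        const bool topk_only_mode) {
  // Trivial body: return the input unchanged; a real kernel would dispatch on it.
  return {x};
}

PD_BUILD_STATIC_OP(toy_dispatch)
    .Inputs({"x"})
    .Outputs({"out"})
    .Attrs({"moe_topk:int", "topk_only_mode:bool"})
    .SetKernelFn(PD_KERNEL(ToyDispatch));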
@@ -27,8 +27,6 @@ void PagedAttnKernel(const paddle::Tensor& q,
|
||||
const paddle::optional<paddle::Tensor> &v,
|
||||
const paddle::optional<paddle::Tensor> &rope_sin,
|
||||
const paddle::optional<paddle::Tensor> &rope_cos,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
float scale,
|
||||
int block_size,
|
||||
@@ -88,36 +86,32 @@ void PagedAttnKernel(const paddle::Tensor& q,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attention expects seq_lens is contiguous"));
|
||||
// check dim and shape
|
||||
// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
|
||||
// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
|
||||
// k_cache: [num_blocks, kv_num_heads, block_size, head_size]
|
||||
// v_cache: [num_blocks, kv_num_heads, block_size, head_size]
|
||||
// block_table: [num_seqs, max_num_blocks_per_seq]
|
||||
// seq_lens: [num_seqs]
|
||||
// q and out:
|
||||
// if merged_qkv = false:
|
||||
// q:[num_seqs, hidden_size]
|
||||
// out:[num_seqs, hidden_size]
|
||||
// if merged_qkv = true:
|
||||
// q: [num_seqs, (num_heads+2*num_kv_heads)*head_dim]
|
||||
// out: [num_seqs, hidden_size]
|
||||
// merged_qkv = false: [num_seqs, num_heads, head_size]
|
||||
// merged_qkv = true: [num_seqs, num_heads+2*num_kv_heads, head_size]
|
||||
|
||||
const auto& q_dims = q.dims();
|
||||
PADDLE_ENFORCE_EQ(q_dims.size(),
|
||||
2,
|
||||
3,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive query dims is "
|
||||
"[num_seqs, (num_heads+2*num_kv_heads)*head_dim]"));
|
||||
"[num_seqs, num_heads, head_size]"));
|
||||
PADDLE_ENFORCE_EQ(out.dims().size(),
|
||||
2,
|
||||
3,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive out dims is "
|
||||
"[num_seqs, hidden_size]"));
|
||||
"[num_seqs, num_heads, head_size]"));
|
||||
|
||||
const auto& kv_cache_dims = k_cache.dims();
|
||||
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
|
||||
4,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive kv cache dims is "
|
||||
"[num_blocks, kv_num_heads, block_size, head_dim]"));
|
||||
"[num_blocks, kv_num_heads, block_size, head_size]"));
|
||||
|
||||
const auto& block_table_dims = block_table.dims();
|
||||
PADDLE_ENFORCE_EQ(block_table_dims.size(),
|
||||
@@ -133,6 +127,8 @@ void PagedAttnKernel(const paddle::Tensor& q,
|
||||
"paged_attn receive seq_lens dims is [num_seqs]"));
|
||||
|
||||
int num_seqs = q_dims[0];
|
||||
int num_heads = merged_qkv ? q_dims[1] - 2 * num_kv_heads : q_dims[1];
|
||||
int head_size = q_dims[2];
|
||||
int max_num_blocks_per_seq = block_table_dims[1];
|
||||
int q_stride = q.strides()[0];
|
||||
int num_blocks = kv_cache_dims[0];
|
||||
@@ -146,9 +142,9 @@ void PagedAttnKernel(const paddle::Tensor& q,
|
||||
common::errors::InvalidArgument(
|
||||
"kv_cache_dims[2] must be equal to block_size"));
|
||||
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
|
||||
head_dim,
|
||||
head_size,
|
||||
common::errors::InvalidArgument(
|
||||
"kv_cache_dims[3] must be equal to head_dim"));
|
||||
"kv_cache_dims[3] must be equal to head_size"));
|
||||
PADDLE_ENFORCE_EQ(block_table_dims[0],
|
||||
num_seqs,
|
||||
common::errors::InvalidArgument(
|
||||
@@ -166,13 +162,14 @@ void PagedAttnKernel(const paddle::Tensor& q,
|
||||
const float *rope_sin_ptr = merged_qkv ? rope_sin.get().data<float>() : nullptr;
|
||||
const float *rope_cos_ptr = merged_qkv ? rope_cos.get().data<float>() : nullptr;
|
||||
|
||||
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(q.place()));
|
||||
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
|
||||
|
||||
size_t workspace_size = 0;
|
||||
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(num_seqs,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
head_size,
|
||||
block_size,
|
||||
max_context_len,
|
||||
&workspace_size));
|
||||
@@ -192,7 +189,7 @@ void PagedAttnKernel(const paddle::Tensor& q,
|
||||
num_seqs,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
head_size,
|
||||
q_stride,
|
||||
kv_block_stride,
|
||||
kv_head_stride,
|
||||
@@ -218,8 +215,6 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
|
||||
const paddle::optional<paddle::Tensor> &v,
|
||||
const paddle::optional<paddle::Tensor> &rope_sin,
|
||||
const paddle::optional<paddle::Tensor> &rope_cos,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
float scale,
|
||||
int block_size,
|
||||
@@ -233,7 +228,11 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
|
||||
bool merged_qkv) {
|
||||
|
||||
const auto dtype = q.dtype();
|
||||
auto out = paddle::empty({q.shape()[0], num_heads * head_dim}, dtype, q.place());
|
||||
auto out_shape = q.shape();
|
||||
if (merged_qkv) {
|
||||
out_shape[1] -= 2 * num_kv_heads;
|
||||
}
|
||||
auto out = paddle::empty(out_shape, dtype, q.place());
|
||||
|
||||
switch (dtype) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
@@ -247,8 +246,6 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
|
||||
v,
|
||||
rope_sin,
|
||||
rope_cos,
|
||||
num_heads,
|
||||
head_dim,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_size,
|
||||
@@ -273,8 +270,6 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
|
||||
v,
|
||||
rope_sin,
|
||||
rope_cos,
|
||||
num_heads,
|
||||
head_dim,
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_size,
|
||||
@@ -304,8 +299,6 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
|
||||
const std::vector<int64_t>& v_shape,
|
||||
const std::vector<int64_t>& rope_sin_shape,
|
||||
const std::vector<int64_t>& rope_cos_shape,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
float scale,
|
||||
int block_size,
|
||||
@@ -318,13 +311,36 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
|
||||
bool use_sqrt_alibi,
|
||||
bool merged_qkv) {
|
||||
if (merged_qkv) {
|
||||
return {{q_shape[0], num_heads * head_dim}};
|
||||
int64_t num_tokens = q_shape[0];
|
||||
int64_t num_heads = q_shape[1] - 2 * num_kv_heads;
|
||||
int64_t head_dim = q_shape[2];
|
||||
return {{num_tokens, num_heads, head_dim}};
|
||||
} else {
|
||||
return {q_shape};
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype) {
|
||||
std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype,
|
||||
const paddle::DataType& k_cache_dtype,
|
||||
const paddle::DataType& v_cache_dtype,
|
||||
const paddle::DataType& block_table_dtype,
|
||||
const paddle::DataType& seq_lens_dtype,
|
||||
const paddle::DataType& alibi_slopes_dtype,
|
||||
const paddle::DataType& k_dtype,
|
||||
const paddle::DataType& v_dtype,
|
||||
const paddle::DataType& rope_sin_dtype,
|
||||
const paddle::DataType& rope_cos_dtype,
|
||||
int num_kv_heads,
|
||||
float scale,
|
||||
int block_size,
|
||||
int max_context_len,
|
||||
bool causal,
|
||||
int window_left,
|
||||
int window_right,
|
||||
float softcap,
|
||||
bool enable_cuda_graph,
|
||||
bool use_sqrt_alibi,
|
||||
bool merged_qkv) {
|
||||
return {q_dtype};
|
||||
}
|
||||
|
||||
@@ -335,9 +351,7 @@ PD_BUILD_STATIC_OP(paged_attn)
|
||||
paddle::Optional("v"), paddle::Optional("rope_sin"),
|
||||
paddle::Optional("rope_cos")})
|
||||
.Outputs({"out"})
|
||||
.Attrs({"num_heads:int",
|
||||
"head_dim:int",
|
||||
"num_kv_heads:int",
|
||||
.Attrs({"num_kv_heads:int",
|
||||
"scale:float",
|
||||
"block_size:int",
|
||||
"max_context_len:int",
|
||||
|
@@ -1,378 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include "iluvatar_context.h"
|
||||
|
||||
template <paddle::DataType T>
|
||||
void PrefillFusedPagedAttnKernel(const paddle::Tensor& qkv,
|
||||
paddle::Tensor& k_cache,
|
||||
paddle::Tensor& v_cache,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& cu_seqlens_qkv,
|
||||
const paddle::optional<paddle::Tensor> &rope_sin,
|
||||
const paddle::optional<paddle::Tensor> &rope_cos,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
int block_size,
|
||||
int max_seq_len,
|
||||
float scale,
|
||||
bool causal,
|
||||
bool q_rope,
|
||||
bool k_rope,
|
||||
bool v_rope,
|
||||
paddle::Tensor& out) {
|
||||
|
||||
// check dtype and contiguous
|
||||
const auto& dtype = qkv.dtype();
|
||||
cuinferDataType_t data_type;
|
||||
if (dtype == paddle::DataType::FLOAT16) {
|
||||
data_type = CUINFER_DATA_HALF;
|
||||
|
||||
} else if (dtype == paddle::DataType::BFLOAT16) {
|
||||
data_type = CUINFER_DATA_BFLOAT16;
|
||||
} else {
|
||||
common::errors::InvalidArgument("paged_attention support half and bfloat16 now");
|
||||
}
|
||||
|
||||
PADDLE_ENFORCE_EQ(k_cache.dtype(),
|
||||
dtype,
|
||||
common::errors::InvalidArgument(
|
||||
"k_cache dtype must be the same as query dtype"));
|
||||
PADDLE_ENFORCE_EQ(k_cache.is_contiguous(),
|
||||
true,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attention expects k_cache is contiguous"));
|
||||
PADDLE_ENFORCE_EQ(block_table.dtype(),
|
||||
paddle::DataType::INT32,
|
||||
common::errors::InvalidArgument(
|
||||
"block_table dtype must be int32"));
|
||||
PADDLE_ENFORCE_EQ(block_table.is_contiguous(),
|
||||
true,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attention expects block_table is contiguous"));
|
||||
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.dtype(),
|
||||
paddle::DataType::INT32,
|
||||
common::errors::InvalidArgument(
|
||||
"cu_seqlens_qkv dtype must be int32"));
|
||||
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.is_contiguous(),
|
||||
true,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attention expects cu_seqlens_qkv is contiguous"));
|
||||
// check dim and shape
|
||||
// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
|
||||
// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
|
||||
// block_table: [batch_size, max_num_blocks_per_seq]
|
||||
// seq_lens: [batch_size]
|
||||
// qkv: [num_tokens, (num_heads+2*num_kv_heads)*head_dim]
|
||||
// out: [num_tokens, hidden_size]
|
||||
|
||||
const auto& qkv_dims = qkv.dims();
|
||||
PADDLE_ENFORCE_EQ(qkv_dims.size(),
|
||||
2,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive query dims is "
|
||||
"[num_tokens, (num_heads+2*num_kv_heads)*head_dim]"));
|
||||
PADDLE_ENFORCE_EQ(out.dims().size(),
|
||||
2,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive out dims is "
|
||||
"[num_tokens, hidden_size]"));
|
||||
|
||||
const auto& kv_cache_dims = k_cache.dims();
|
||||
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
|
||||
4,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive kv cache dims is "
|
||||
"[num_blocks, kv_num_heads, block_size, head_dim]"));
|
||||
|
||||
const auto& block_table_dims = block_table.dims();
|
||||
PADDLE_ENFORCE_EQ(block_table_dims.size(),
|
||||
2,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive block_table dims is "
|
||||
"[batch_size, max_num_blocks_per_seq]"));
|
||||
|
||||
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
|
||||
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims.size(),
|
||||
1,
|
||||
common::errors::InvalidArgument(
|
||||
"paged_attn receive cu_seqlens_qkv dims is [batch_size]"));
|
||||
|
||||
int batch_size = block_table_dims[0];
|
||||
int num_tokens = qkv_dims[0];
|
||||
int num_total_heads = num_heads + 2 * num_kv_heads;
|
||||
int qkv_stride = qkv.strides()[0];
|
||||
int num_blocks = kv_cache_dims[0];
|
||||
|
||||
PADDLE_ENFORCE_EQ(kv_cache_dims[1],
|
||||
num_kv_heads,
|
||||
common::errors::InvalidArgument(
|
||||
"kv_cache_dims[1] must be equal to num_kv_head"));
|
||||
PADDLE_ENFORCE_EQ(kv_cache_dims[2],
|
||||
block_size,
|
||||
common::errors::InvalidArgument(
|
||||
"kv_cache_dims[2] must be equal to block_size"));
|
||||
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
|
||||
head_dim,
|
||||
common::errors::InvalidArgument(
|
||||
"kv_cache_dims[3] must be equal to head_dim"));
|
||||
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims[0],
|
||||
batch_size + 1,
|
||||
common::errors::InvalidArgument(
|
||||
"cu_seqlens_qkv_dims[0] must be equal to batch_size + 1"));
|
||||
|
||||
int block_table_stride = block_table.strides()[0];
|
||||
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
|
||||
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
|
||||
|
||||
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
|
||||
|
||||
size_t workspace_size = 0;
|
||||
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(num_tokens,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope,
|
||||
data_type,
|
||||
data_type,
|
||||
data_type,
|
||||
&workspace_size));
|
||||
auto* allocator = paddle::GetAllocator(qkv.place());
|
||||
phi::Allocator::AllocationPtr tmp_workspace = allocator->Allocate(workspace_size);
|
||||
void* workspace_ptr = tmp_workspace->ptr();
|
||||
|
||||
cuinferTensorDescriptor_t qkv_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
qkv_desc,
|
||||
data_type,
|
||||
3,
|
||||
std::vector<int>({num_tokens, num_total_heads, head_dim}).data(),
|
||||
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t qkv_seqlens_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
qkv_seqlens_desc,
|
||||
CUINFER_DATA_INT32,
|
||||
1,
|
||||
std::vector<int>({batch_size + 1}).data(),
|
||||
std::vector<int>({1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t block_table_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
block_table_desc,
|
||||
CUINFER_DATA_INT32,
|
||||
2,
|
||||
std::vector<int>({batch_size, block_table_stride}).data(),
|
||||
std::vector<int>({block_table_stride, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t o_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
o_desc,
|
||||
data_type,
|
||||
3,
|
||||
std::vector<int>({num_tokens, num_heads, head_dim}).data(),
|
||||
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t k_cache_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
k_cache_desc,
|
||||
data_type,
|
||||
4,
|
||||
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
|
||||
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t v_cache_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
v_cache_desc,
|
||||
data_type,
|
||||
4,
|
||||
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
|
||||
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t cos_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
cos_desc,
|
||||
CUINFER_DATA_FLOAT,
|
||||
2,
|
||||
std::vector<int>({max_seq_len, head_dim}).data(),
|
||||
std::vector<int>({head_dim, 1}).data()));
|
||||
|
||||
cuinferTensorDescriptor_t sin_desc;
|
||||
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
|
||||
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
|
||||
sin_desc,
|
||||
CUINFER_DATA_FLOAT,
|
||||
2,
|
||||
std::vector<int>({max_seq_len, head_dim}).data(),
|
||||
std::vector<int>({head_dim, 1}).data()));
|
||||
|
||||
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
|
||||
qkv_desc,
|
||||
qkv.data(),
|
||||
qkv_seqlens_desc,
|
||||
cu_seqlens_qkv.data<int32_t>(),
|
||||
block_table_desc,
|
||||
block_table.data<int32_t>(),
|
||||
o_desc,
|
||||
out.data(),
|
||||
k_cache_desc,
|
||||
k_cache.data(),
|
||||
v_cache_desc,
|
||||
v_cache.data(),
|
||||
workspace_ptr,
|
||||
workspace_size,
|
||||
cos_desc,
|
||||
rope_cos_ptr,
|
||||
sin_desc,
|
||||
rope_sin_ptr,
|
||||
batch_size,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
causal,
|
||||
scale,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope));
|
||||
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
|
||||
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
|
||||
}
|
||||
|
||||
std::vector<paddle::Tensor> PrefillFusedPagedAttn(const paddle::Tensor& qkv,
|
||||
paddle::Tensor& k_cache,
|
||||
paddle::Tensor& v_cache,
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& cu_seqlens_qkv,
|
||||
const paddle::optional<paddle::Tensor> &rope_sin,
|
||||
const paddle::optional<paddle::Tensor> &rope_cos,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
int block_size,
|
||||
int max_seq_len,
|
||||
float scale,
|
||||
bool causal,
|
||||
bool q_rope,
|
||||
bool k_rope,
|
||||
bool v_rope) {
|
||||
|
||||
const auto dtype = qkv.dtype();
|
||||
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
|
||||
|
||||
switch (dtype) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
PrefillFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
cu_seqlens_qkv,
|
||||
rope_sin,
|
||||
rope_cos,
|
||||
num_heads,
|
||||
head_dim,
|
||||
num_kv_heads,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
scale,
|
||||
causal,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope,
|
||||
out);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
PrefillFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
|
||||
k_cache,
|
||||
v_cache,
|
||||
block_table,
|
||||
cu_seqlens_qkv,
|
||||
rope_sin,
|
||||
rope_cos,
|
||||
num_heads,
|
||||
head_dim,
|
||||
num_kv_heads,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
scale,
|
||||
causal,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v_rope,
|
||||
out);
|
||||
break;
|
||||
default:
|
||||
PD_THROW("Unsupported data type for Paged attn");
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> PrefillFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
|
||||
const std::vector<int64_t>& k_cache_shape,
|
||||
const std::vector<int64_t>& v_cache_shape,
|
||||
const std::vector<int64_t>& block_table_shape,
|
||||
const std::vector<int64_t>& cu_seqlens_qkv_shape,
|
||||
const std::vector<int64_t>& rope_sin_shape,
|
||||
const std::vector<int64_t>& rope_cos_shape,
|
||||
int num_heads,
|
||||
int head_dim,
|
||||
int num_kv_heads,
|
||||
int block_size,
|
||||
int max_seq_len,
|
||||
float scale,
|
||||
bool causal,
|
||||
bool q_rope,
|
||||
bool k_rope,
|
||||
bool v_rope) {
|
||||
return {{qkv_shape[0], num_heads * head_dim}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> PrefillFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
|
||||
return {qkv_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(prefill_fused_paged_attn)
|
||||
.Inputs({"qkv", "k_cache", "v_cache", "block_table", "cu_seqlens_qkv",
|
||||
paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
|
||||
.Outputs({"out"})
|
||||
.Attrs({"num_heads:int",
|
||||
"head_dim:int",
|
||||
"num_kv_heads:int",
|
||||
"block_size:int",
|
||||
"max_seq_len:int",
|
||||
"scale:float",
|
||||
"causal:bool",
|
||||
"q_rope:bool",
|
||||
"k_rope:bool",
|
||||
"v_rope:bool"})
|
||||
.SetKernelFn(PD_KERNEL(PrefillFusedPagedAttn))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(PrefillFusedPagedAttnInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(PrefillFusedPagedAttnInferDtype));
|
@@ -1,181 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "helper.h"
|
||||
#include "mc_fused_moe_helper.h"
|
||||
#include "fused_moe_op.h"
|
||||
|
||||
__global__ void compute_total_rows_before_expert_kernel(
|
||||
int* sorted_experts,
|
||||
const int64_t sorted_experts_len,
|
||||
const int64_t num_experts,
|
||||
int32_t* total_rows_before_expert) {
|
||||
const int expert = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (expert >= num_experts) return;
|
||||
|
||||
total_rows_before_expert[expert] =
|
||||
find_total_elts_leq_target(sorted_experts, sorted_experts_len, expert);
|
||||
}
|
||||
|
||||
void compute_total_rows_before_expert(int* sorted_indices,
|
||||
const int64_t total_indices,
|
||||
const int64_t num_experts,
|
||||
int32_t* total_rows_before_expert,
|
||||
cudaStream_t stream) {
|
||||
const int threads = std::min(int64_t(1024), num_experts);
|
||||
const int blocks = (num_experts + threads - 1) / threads;
|
||||
|
||||
compute_total_rows_before_expert_kernel<<<blocks, threads, 0, stream>>>(
|
||||
sorted_indices, total_indices, num_experts, total_rows_before_expert);
|
||||
}
|
||||
|
||||
template <paddle::DataType T, typename ElementA, typename ElementB, typename ElementC>
|
||||
void FusedMoeKernel(const paddle::Tensor& input,
|
||||
const paddle::Tensor& gate_weight,
|
||||
const paddle::Tensor& ffn1_weight,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_scale,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_bias,
|
||||
const paddle::Tensor& ffn2_weight,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_scale,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_bias,
|
||||
const std::string& quant_method,
|
||||
const int moe_topk,
|
||||
const bool group_moe,
|
||||
const bool norm_topk_prob,
|
||||
paddle::Tensor* output) {
|
||||
typedef PDTraits<T> traits_;
|
||||
typedef typename traits_::DataType DataType_;
|
||||
typedef typename traits_::data_t data_t;
|
||||
|
||||
auto* output_data = output->data<data_t>();
|
||||
|
||||
auto moe_compute = McMoeHelper<data_t, ElementA, ElementB, ElementC>(quant_method);
|
||||
|
||||
moe_compute.computeFFN(
|
||||
&input,
|
||||
&gate_weight,
|
||||
&ffn1_weight,
|
||||
ffn1_scale ? ffn1_scale.get_ptr() : nullptr,
|
||||
ffn1_bias ? ffn1_bias.get_ptr() : nullptr,
|
||||
&ffn2_weight,
|
||||
ffn2_scale ? ffn2_scale.get_ptr() : nullptr,
|
||||
ffn2_bias ? ffn2_bias.get_ptr() : nullptr,
|
||||
nullptr,
|
||||
moe_topk,
|
||||
group_moe,
|
||||
norm_topk_prob,
|
||||
1.0, // ComputeFFN
|
||||
"ffn",
|
||||
output);
|
||||
}
|
||||
|
||||
|
||||
std::vector<paddle::Tensor> FusedExpertMoe(
|
||||
const paddle::Tensor& input,
|
||||
const paddle::Tensor& gate_weight,
|
||||
const paddle::Tensor& ffn1_weight,
|
||||
const paddle::Tensor& ffn2_weight,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_bias,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_scale,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_bias,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_scale,
|
||||
const std::string& quant_method,
|
||||
const int moe_topk,
|
||||
const bool norm_topk_prob,
|
||||
const bool group_moe) {
|
||||
const auto input_type = input.dtype();
|
||||
auto output = paddle::empty_like(input);
|
||||
|
||||
switch (input_type) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
FusedMoeKernel<paddle::DataType::BFLOAT16, maca_bfloat16, int8_t, maca_bfloat16>(input,
|
||||
gate_weight,
|
||||
ffn1_weight,
|
||||
ffn1_scale,
|
||||
ffn1_bias,
|
||||
ffn2_weight,
|
||||
ffn2_scale,
|
||||
ffn2_bias,
|
||||
quant_method,
|
||||
moe_topk,
|
||||
group_moe,
|
||||
norm_topk_prob,
|
||||
&output);
|
||||
break;
|
||||
// case paddle::DataType::FLOAT16:
|
||||
// FusedMoeKernel<paddle::DataType::FLOAT16>(input,
|
||||
// gate_weight,
|
||||
// ffn1_weight,
|
||||
// ffn1_scale,
|
||||
// ffn1_bias,
|
||||
// ffn2_weight,
|
||||
// ffn2_scale,
|
||||
// ffn2_bias,
|
||||
// quant_method,
|
||||
// moe_topk,
|
||||
// group_moe,
|
||||
// norm_topk_prob,
|
||||
// &output);
|
||||
// break;
|
||||
default:
|
||||
PD_THROW("Only support bf16 for FusedMoeKernel");
|
||||
}
|
||||
return {output};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> FusedExpertMoeInferShape(
|
||||
const std::vector<int64_t>& input_shape,
|
||||
const std::vector<int64_t>& gate_weight_shape,
|
||||
const std::vector<int64_t>& ffn1_weight_shape,
|
||||
const std::vector<int64_t>& ffn2_weight_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn1_bias_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn1_scale_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn2_bias_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn2_scale_shape) {
|
||||
return {input_shape};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> FusedExpertMoeInferDtype(
|
||||
const paddle::DataType& input_dtype,
|
||||
const paddle::DataType& gate_weight_dtype,
|
||||
const paddle::DataType& ffn1_weight_dtype,
|
||||
const paddle::DataType& ffn2_weight_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn1_bias_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn1_scale_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn2_bias_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn2_scale_dtype) {
|
||||
return {input_dtype};
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(fused_expert_moe)
|
||||
.Inputs({"input",
|
||||
"gate_weight",
|
||||
"ffn1_weight",
|
||||
"ffn2_weight",
|
||||
paddle::Optional("ffn1_bias"),
|
||||
paddle::Optional("ffn1_scale"),
|
||||
paddle::Optional("ffn2_bias"),
|
||||
paddle::Optional("ffn2_scale")})
|
||||
.Outputs({"output"})
|
||||
.Attrs({"quant_method:std::string",
|
||||
"moe_topk:int",
|
||||
"norm_topk_prob:bool",
|
||||
"group_moe:bool"})
|
||||
.SetKernelFn(PD_KERNEL(FusedExpertMoe))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(FusedExpertMoeInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(FusedExpertMoeInferDtype));
|
@@ -1,53 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h"
#include "fused_moe_op.h"

using namespace phi;

template <typename T, int VecSize>
__global__ void moe_token_type_ids_kernel(T *gating_output,
                                          const int *moe_token_type_ids_out,
                                          const int num_rows,
                                          const int num_experts,
                                          const int k) {
  const int moe_token_index = blockIdx.x * blockDim.x + threadIdx.x;

  if (moe_token_index >= num_rows) {
    return;
  }

  gating_output[moe_token_index * 2] =
      gating_output[moe_token_index * 2] +
      (moe_token_type_ids_out[moe_token_index]) * -1e10;
  gating_output[moe_token_index * 2 + 1] =
      gating_output[moe_token_index * 2 + 1] +
      (1 - moe_token_type_ids_out[moe_token_index]) * -1e10;
}

template <typename T>
void moe_token_type_ids_kernelLauncher(T *gating_output,
                                       const int *moe_token_type_ids_out,
                                       const int num_rows,
                                       const int num_experts,
                                       const int k,
                                       cudaStream_t stream) {
  const int blocks = num_rows * k / 512 + 1;
  const int threads = 512;
  moe_token_type_ids_kernel<T, 1><<<blocks, 512, 0, stream>>>(
      gating_output, moe_token_type_ids_out, num_rows, num_experts, k);
}
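// The launcher above sizes the grid as num_rows * k / 512 + 1, which over-allocates one
// block whenever the count is an exact multiple of 512; the out-of-range guard inside the
// kernel makes the extra threads harmless. The more common ceil-division form is sketched
// below as a standalone illustration, not code from this repository.
#include <cuda_runtime.h>

__global__ void touch(int* data, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = i;  // same out-of-range guard as moe_token_type_ids_kernel
}

void LaunchTouch(int* data, int n, cudaStream_t stream) {
  const int threads = 512;
  const int blocks = (n + threads - 1) / threads;  // ceil(n / threads)
  touch<<<blocks, threads, 0, stream>>>(data, n);
}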
@@ -1,123 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
|
||||
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include "cub/cub.cuh"
|
||||
|
||||
static const float HALF_FLT_MAX = 65504.F;
|
||||
static const float HALF_FLT_MIN = -65504.F;
|
||||
static inline size_t AlignTo16(const size_t& input) {
|
||||
static constexpr int ALIGNMENT = 16;
|
||||
return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT);
|
||||
}
|
||||
|
||||
class CubKeyValueSorter {
|
||||
public:
|
||||
CubKeyValueSorter() : num_experts_(0), num_bits_(sizeof(int) * 8) {}
|
||||
|
||||
explicit CubKeyValueSorter(const int num_experts)
|
||||
: num_experts_(num_experts),
|
||||
num_bits_(static_cast<int>(log2(num_experts)) + 1) {}
|
||||
|
||||
void update_num_experts(const int num_experts) {
|
||||
num_experts_ = num_experts;
|
||||
num_bits_ = static_cast<int>(log2(num_experts)) + 1;
|
||||
}
|
||||
|
||||
size_t getWorkspaceSize(const size_t num_key_value_pairs,
|
||||
bool descending = false) {
|
||||
num_key_value_pairs_ = num_key_value_pairs;
|
||||
size_t required_storage = 0;
|
||||
int* null_int = nullptr;
|
||||
if (descending) {
|
||||
cub::DeviceRadixSort::SortPairsDescending(NULL,
|
||||
required_storage,
|
||||
null_int,
|
||||
null_int,
|
||||
null_int,
|
||||
null_int,
|
||||
num_key_value_pairs,
|
||||
0,
|
||||
32);
|
||||
} else {
|
||||
cub::DeviceRadixSort::SortPairs(NULL,
|
||||
required_storage,
|
||||
null_int,
|
||||
null_int,
|
||||
null_int,
|
||||
null_int,
|
||||
num_key_value_pairs,
|
||||
0,
|
||||
num_bits_);
|
||||
}
|
||||
return required_storage;
|
||||
}
|
||||
|
||||
template <typename KeyT>
|
||||
void run(void* workspace,
|
||||
const size_t workspace_size,
|
||||
const KeyT* keys_in,
|
||||
KeyT* keys_out,
|
||||
const int* values_in,
|
||||
int* values_out,
|
||||
const size_t num_key_value_pairs,
|
||||
bool descending,
|
||||
cudaStream_t stream) {
|
||||
size_t expected_ws_size = getWorkspaceSize(num_key_value_pairs);
|
||||
size_t actual_ws_size = workspace_size;
|
||||
|
||||
if (expected_ws_size > workspace_size) {
|
||||
std::stringstream err_ss;
|
||||
err_ss << "[Error][CubKeyValueSorter::run]\n";
|
||||
err_ss << "Error. The allocated workspace is too small to run this "
|
||||
"problem.\n";
|
||||
err_ss << "Expected workspace size of at least " << expected_ws_size
|
||||
<< " but got problem size " << workspace_size << "\n";
|
||||
throw std::runtime_error(err_ss.str());
|
||||
}
|
||||
if (descending) {
|
||||
cub::DeviceRadixSort::SortPairsDescending(workspace,
|
||||
actual_ws_size,
|
||||
keys_in,
|
||||
keys_out,
|
||||
values_in,
|
||||
values_out,
|
||||
num_key_value_pairs,
|
||||
0,
|
||||
32,
|
||||
stream);
|
||||
} else {
|
||||
cub::DeviceRadixSort::SortPairs(workspace,
|
||||
actual_ws_size,
|
||||
keys_in,
|
||||
keys_out,
|
||||
values_in,
|
||||
values_out,
|
||||
num_key_value_pairs,
|
||||
0,
|
||||
num_bits_,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
size_t num_key_value_pairs_;
|
||||
int num_experts_;
|
||||
int num_bits_;
|
||||
};
|
@@ -1,990 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
|
||||
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include "fused_moe_imp_op.h"
|
||||
#include "fused_moe_helper.h"
|
||||
#include "mctlass/numeric_conversion.h" // BUILD_MARK
|
||||
// Ignore mctlass warnings about type punning
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
|
||||
#pragma GCC diagnostic ignored "-Wunused-function"
|
||||
|
||||
// #include "paddle/phi/backends/gpu/gpu_info.h"
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#include "helper.h"
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
struct GpuLaunchConfig {
|
||||
dim3 block_per_grid;
|
||||
dim3 thread_per_block;
|
||||
};
|
||||
|
||||
inline GpuLaunchConfig Get1DBlocksAnd2DGridsMoe(const int64_t cols) {
|
||||
int blocks_x = cols;
|
||||
int blocks_y = 1;
|
||||
int blocks_z = 1;
|
||||
if (blocks_x > 1024) {
|
||||
blocks_y = 256;
|
||||
blocks_x = (blocks_x + blocks_y - 1) / blocks_y;
|
||||
}
|
||||
|
||||
GpuLaunchConfig config;
|
||||
config.block_per_grid.x = blocks_x;
|
||||
config.block_per_grid.y = blocks_y;
|
||||
config.block_per_grid.z = blocks_z;
|
||||
return config;
|
||||
}
|
||||
|
||||
// ====================== Softmax things ===============================
|
||||
// We have our own implementation of softmax here so we can support transposing
|
||||
// the output in the softmax kernel when we extend this module to support
|
||||
// expert-choice routing.
|
||||
template <typename T, int TPB>
|
||||
__launch_bounds__(TPB) __global__
|
||||
void group_moe_softmax(const T* input,
|
||||
T* output,
|
||||
T* softmax_max_prob,
|
||||
const int64_t num_cols,
|
||||
const int64_t softmax_num_rows) {
|
||||
using BlockReduce = cub::BlockReduce<float, TPB>;
|
||||
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||
|
||||
__shared__ float normalizing_factor;
|
||||
__shared__ float float_max;
|
||||
__shared__ float max_out;
|
||||
|
||||
int globalIdx = blockIdx.x + blockIdx.y * gridDim.x;
|
||||
if (globalIdx >= softmax_num_rows) {
|
||||
return;
|
||||
}
|
||||
const int64_t thread_row_offset = globalIdx * num_cols;
|
||||
|
||||
cub::Sum sum;
|
||||
float threadData(-FLT_MAX);
|
||||
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||
}
|
||||
|
||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
|
||||
if (threadIdx.x == 0) {
|
||||
float_max = maxElem;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
threadData = 0;
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||
}
|
||||
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
normalizing_factor = 1.f / Z;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
threadData = 0;
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val =
|
||||
exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
|
||||
output[idx] = T(val);
|
||||
threadData = max(static_cast<float>(T(val)), threadData);
|
||||
}
|
||||
|
||||
const float maxOut = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
|
||||
if (threadIdx.x == 0) {
|
||||
// group max probs
|
||||
max_out = 1.f / maxOut;
|
||||
softmax_max_prob[globalIdx] = T(max_out);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
|
||||
const int idx = thread_row_offset + ii;
|
||||
// group softmax normalization
|
||||
output[idx] = output[idx] * static_cast<T>(max_out);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int TPB>
|
||||
__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
|
||||
T* output,
|
||||
int* indices,
|
||||
int* source_rows,
|
||||
T* softmax_max_prob,
|
||||
const int64_t num_experts,
|
||||
const int64_t k,
|
||||
const int64_t num_rows) {
|
||||
using cub_kvp = cub::KeyValuePair<int, T>;
|
||||
using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
|
||||
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||
|
||||
cub_kvp thread_kvp;
|
||||
cub::ArgMax arg_max;
|
||||
|
||||
const int block_row = blockIdx.x + blockIdx.y * gridDim.x;
|
||||
if (block_row >= num_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const bool should_process_row = true;
|
||||
const int thread_read_offset = block_row * num_experts;
|
||||
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
thread_kvp.key = 0;
|
||||
thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities
|
||||
|
||||
cub_kvp inp_kvp;
|
||||
for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
|
||||
const int idx = thread_read_offset + expert;
|
||||
inp_kvp.key = expert;
|
||||
inp_kvp.value = inputs_after_softmax[idx];
|
||||
|
||||
for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
|
||||
const int prior_winning_expert = indices[k * block_row + prior_k];
|
||||
|
||||
if (prior_winning_expert == expert) {
|
||||
inp_kvp = thread_kvp;
|
||||
}
|
||||
}
|
||||
|
||||
thread_kvp = arg_max(inp_kvp, thread_kvp);
|
||||
}
|
||||
|
||||
const cub_kvp result_kvp =
|
||||
BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
|
||||
if (threadIdx.x == 0) {
|
||||
const int idx = k * block_row + k_idx;
|
||||
// restore normalized probes
|
||||
output[idx] = result_kvp.value / T(softmax_max_prob[idx]);
|
||||
indices[idx] = should_process_row ? result_kvp.key : num_experts;
|
||||
source_rows[idx] = k_idx * num_rows + block_row;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int TPB>
|
||||
__launch_bounds__(TPB) __global__ void moe_softmax(const T* input,
|
||||
T* output,
|
||||
const int64_t num_cols,
|
||||
const int64_t num_rows) {
|
||||
using BlockReduce = cub::BlockReduce<float, TPB>;
|
||||
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||
|
||||
__shared__ float normalizing_factor;
|
||||
__shared__ float float_max;
|
||||
|
||||
int globalIdx = blockIdx.x + blockIdx.y * gridDim.x;
|
||||
if (globalIdx >= num_rows) {
|
||||
return;
|
||||
}
|
||||
const int64_t thread_row_offset = globalIdx * num_cols;
|
||||
|
||||
cub::Sum sum;
|
||||
float threadData(-FLT_MAX);
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||
}
|
||||
|
||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
|
||||
if (threadIdx.x == 0) {
|
||||
float_max = maxElem;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
threadData = 0;
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||
}
|
||||
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
normalizing_factor = 1.f / Z;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val =
|
||||
exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
|
||||
output[idx] = T(val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, int TPB>
|
||||
__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
|
||||
T* output,
|
||||
int* indices,
|
||||
int* source_rows,
|
||||
const int64_t num_experts,
|
||||
const int64_t k,
|
||||
const int64_t num_rows) {
|
||||
using cub_kvp = cub::KeyValuePair<int, T>;
|
||||
using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
|
||||
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||
|
||||
cub_kvp thread_kvp;
|
||||
cub::ArgMax arg_max;
|
||||
|
||||
const int block_row = blockIdx.x + blockIdx.y * gridDim.x;
|
||||
if (block_row >= num_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const bool should_process_row = true;
|
||||
const int thread_read_offset = block_row * num_experts;
|
||||
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
thread_kvp.key = 0;
|
||||
thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities
|
||||
|
||||
cub_kvp inp_kvp;
|
||||
for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
|
||||
const int idx = thread_read_offset + expert;
|
||||
inp_kvp.key = expert;
|
||||
inp_kvp.value = inputs_after_softmax[idx];
|
||||
|
||||
for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
|
||||
const int prior_winning_expert = indices[k * block_row + prior_k];
|
||||
|
||||
if (prior_winning_expert == expert) {
|
||||
inp_kvp = thread_kvp;
|
||||
}
|
||||
}
|
||||
|
||||
thread_kvp = arg_max(inp_kvp, thread_kvp);
|
||||
}
|
||||
|
||||
const cub_kvp result_kvp =
|
||||
BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
|
||||
if (threadIdx.x == 0) {
|
||||
const int idx = k * block_row + k_idx;
|
||||
output[idx] = result_kvp.value;
|
||||
indices[idx] = should_process_row ? result_kvp.key : num_experts;
|
||||
source_rows[idx] = k_idx * num_rows + block_row;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
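// Both moe_top_k variants above select the top-k experts per row in k serial
// passes: in each pass every thread scans its slice of experts, masking any
// expert already chosen in an earlier pass (its key/value pair is replaced by
// the running thread_kvp so it can never win again), and a block-wide cub
// ArgMax reduction picks the winner. Thread 0 then records the gate value, the
// expert id, and source_rows[idx] = k_idx * num_rows + block_row, the expanded
// row id consumed by the routing kernels further down.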
|
||||
|
||||
// ====================== TopK softmax things ===============================
|
||||
|
||||
/*
  A Top-K gating softmax written to exploit the case where the number of
  experts in the MoE layers is a small power of 2. This allows us to cleanly
  share the rows among the threads in a single warp and eliminate communication
  between warps (so there is no need to use shared memory).

  It fuses the softmax, max and argmax into a single kernel.

  Limitations:
  1) This implementation is intended for when the number of experts is a small
     power of 2.
  2) This implementation assumes k is small, but will work for any k.
*/
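/*
  Worked example (assuming WARP_SIZE == 32): for T = half and NUM_EXPERTS = 64,
  the launcher below picks BYTES_PER_LDG = 16, so ELTS_PER_LDG = 8 and VPT = 8.
  Each row of 64 gate logits is then owned by THREADS_PER_ROW = 64 / 8 = 8
  threads, each issuing LDG_PER_THREAD = 1 vectorized load, and every warp
  covers ROWS_PER_WARP = 32 / 8 = 4 rows with no shared-memory traffic.
*/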
|
||||
|
||||
template <typename T,
|
||||
int VPT,
|
||||
int NUM_EXPERTS,
|
||||
int WARPS_PER_CTA,
|
||||
int BYTES_PER_LDG>
|
||||
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
||||
void topk_gating_softmax(const T* input,
|
||||
T* output,
|
||||
const int64_t num_rows,
|
||||
int* indices,
|
||||
int* source_rows,
|
||||
const int64_t k) {
|
||||
// We begin by enforcing compile time assertions and setting up compile time
|
||||
// constants.
|
||||
static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
|
||||
static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS),
|
||||
"NUM_EXPERTS must be power of 2");
|
||||
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG),
|
||||
"BYTES_PER_LDG must be power of 2");
|
||||
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
|
||||
|
||||
// Number of bytes each thread pulls in per load
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
|
||||
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
|
||||
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
|
||||
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
|
||||
|
||||
// Restrictions based on previous section.
|
||||
static_assert(
|
||||
VPT % ELTS_PER_LDG == 0,
|
||||
"The elements per thread must be a multiple of the elements per ldg");
|
||||
static_assert(WARP_SIZE % THREADS_PER_ROW == 0,
|
||||
"The threads per row must cleanly divide the threads per warp");
|
||||
static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW),
|
||||
"THREADS_PER_ROW must be power of 2");
|
||||
static_assert(THREADS_PER_ROW <= WARP_SIZE,
|
||||
"THREADS_PER_ROW can be at most warp size");
|
||||
|
||||
// We have NUM_EXPERTS elements per row. We specialize for small #experts
|
||||
static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
|
||||
static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
|
||||
static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
|
||||
|
||||
// Restrictions for previous section.
|
||||
static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0,
|
||||
"The elts per row must cleanly divide the total elt per warp");
|
||||
|
||||
// ===================== From this point, we finally start computing run-time
|
||||
// variables. ========================
|
||||
|
||||
// Compute CTA and warp rows. We pack multiple rows into a single warp, and a
|
||||
// block contains WARPS_PER_CTA warps. Thus, each block processes a chunk of
|
||||
// rows. We start by computing the start row for each block.
|
||||
const int cta_base_row = blockIdx.x * ROWS_PER_CTA;
|
||||
|
||||
// Now, using the base row per thread block, we compute the base row per warp.
|
||||
const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP;
|
||||
|
||||
// The threads in a warp are split into sub-groups that will work on a row.
|
||||
// We compute row offset for each thread sub-group
|
||||
const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
|
||||
const int thread_row = warp_base_row + thread_row_in_warp;
|
||||
|
||||
// Threads with indices out of bounds should early exit here.
|
||||
if (thread_row >= num_rows) return;
|
||||
const bool should_process_row = true;
|
||||
|
||||
// We finally start setting up the read pointers for each thread. First, each
|
||||
// thread jumps to the start of the row it will read.
|
||||
const T* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
|
||||
|
||||
// Now, we compute the group each thread belongs to in order to determine the
|
||||
// first column to start loads.
|
||||
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
|
||||
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
|
||||
const T* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
|
||||
|
||||
// Determine the pointer type to use to read in the data depending on the
|
||||
// BYTES_PER_LDG template param. In theory, this can support all powers of 2
|
||||
// up to 16.
|
||||
using AccessType = mctlass::AlignedArray<T, ELTS_PER_LDG>;
|
||||
|
||||
// Finally, we pull in the data from global mem
|
||||
mctlass::Array<T, VPT> row_chunk_input;
|
||||
AccessType* row_chunk_vec_ptr =
|
||||
reinterpret_cast<AccessType*>(&row_chunk_input);
|
||||
const AccessType* vec_thread_read_ptr =
|
||||
reinterpret_cast<const AccessType*>(thread_read_ptr);
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
}
|
||||
|
||||
using ComputeType = float;
|
||||
using Converter = mctlass::NumericArrayConverter<ComputeType, T, VPT>;
|
||||
Converter compute_type_converter;
|
||||
mctlass::Array<ComputeType, VPT> row_chunk =
|
||||
compute_type_converter(row_chunk_input);
|
||||
|
||||
// First, we perform a max reduce within the thread. We can do the max in fp16
|
||||
// safely (I think) and just convert to float afterwards for the exp + sum
|
||||
// reduction.
|
||||
ComputeType thread_max = row_chunk[0];
|
||||
#pragma unroll
|
||||
for (int ii = 1; ii < VPT; ++ii) {
|
||||
thread_max = max(thread_max, row_chunk[ii]);
|
||||
}
|
||||
|
||||
// Now, we find the max within the thread group and distribute among the
|
||||
// threads. We use a butterfly reduce.
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
|
||||
thread_max =
|
||||
max(thread_max,
|
||||
__shfl_xor_sync(0xFFFFFFFF, thread_max, mask, THREADS_PER_ROW));
|
||||
}
|
||||
|
||||
// From this point, thread_max in every thread holds the max within the row.
|
||||
// Now, we subtract the max from each element in the thread and take the exp.
|
||||
// We also compute the thread local sum.
|
||||
float row_sum = 0;
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii) {
|
||||
row_chunk[ii] = expf(row_chunk[ii] - thread_max);
|
||||
row_sum += row_chunk[ii];
|
||||
}
|
||||
|
||||
// Now, we perform the sum reduce within each thread group. Similar to the max
|
||||
// reduce, we use a butterfly pattern.
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
|
||||
row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask, THREADS_PER_ROW);
|
||||
}
|
||||
|
||||
// From this point, all threads have the max and the sum for their rows in the
|
||||
// thread_max and row_sum variables respectively. Finally, we can scale the
|
||||
// rows for the softmax. Technically, for top-k gating we don't need to
|
||||
// compute the entire softmax row. We can likely look at the maxes and only
|
||||
// compute for the top-k values in the row. However, this kernel will likely
|
||||
// not be a bottleneck and it seems better to closely match torch and find the
|
||||
// argmax after computing the softmax.
|
||||
const float reciprocal_row_sum = 1.f / row_sum;
|
||||
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < VPT; ++ii) {
|
||||
row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
|
||||
}
|
||||
|
||||
// Now, row_chunk contains the softmax of the row chunk. Next, we want to find
|
||||
// the topk elements in each row, along with the max index.
|
||||
int start_col = first_elt_read_by_thread;
|
||||
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
// First, each thread does the local argmax
|
||||
float max_val = row_chunk[0];
|
||||
int expert = start_col;
|
||||
#pragma unroll
|
||||
for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD;
|
||||
++ldg, col += COLS_PER_GROUP_LDG) {
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
|
||||
float val = row_chunk[ldg * ELTS_PER_LDG + ii];
|
||||
|
||||
// No check on the experts here since columns with the smallest index
|
||||
// are processed first and only updated if > (not >=)
|
||||
if (val > max_val) {
|
||||
max_val = val;
|
||||
expert = col + ii;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now, we perform the argmax reduce. We use the butterfly pattern so threads
|
||||
// reach consensus about the max. This will be useful for K > 1 so that the
|
||||
// threads can agree on "who" had the max value. That thread can then blank out
|
||||
// their max with -inf and the warp can run more iterations...
|
||||
#pragma unroll
|
||||
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
|
||||
float other_max =
|
||||
__shfl_xor_sync(0xFFFFFFFF, max_val, mask, THREADS_PER_ROW);
|
||||
int other_expert =
|
||||
__shfl_xor_sync(0xFFFFFFFF, expert, mask, THREADS_PER_ROW);
|
||||
|
||||
// We want lower indices to "win" in every thread so we break ties this
|
||||
// way
|
||||
if (other_max > max_val ||
|
||||
(other_max == max_val && other_expert < expert)) {
|
||||
max_val = other_max;
|
||||
expert = other_expert;
|
||||
}
|
||||
}
|
||||
|
||||
// Write the max for this k iteration to global memory.
|
||||
if (thread_group_idx == 0) {
|
||||
// The lead thread from each sub-group will write out the final results to
// global memory. (This will be a single thread per row of the
// input/output matrices.)
|
||||
const int idx = k * thread_row + k_idx;
|
||||
output[idx] = T(max_val);
|
||||
indices[idx] = should_process_row ? expert : NUM_EXPERTS;
|
||||
source_rows[idx] = k_idx * num_rows + thread_row;
|
||||
}
|
||||
|
||||
// Finally, we clear the value in the thread with the current max if there
|
||||
// is another iteration to run.
|
||||
if (k_idx + 1 < k) {
|
||||
const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG;
|
||||
const int thread_to_clear_in_group =
|
||||
(expert / ELTS_PER_LDG) % THREADS_PER_ROW;
|
||||
|
||||
// Only the thread in the group which produced the max will reset the
|
||||
// "winning" value to -inf.
|
||||
if (thread_group_idx == thread_to_clear_in_group) {
|
||||
const int offset_for_expert = expert % ELTS_PER_LDG;
|
||||
// Safe to set to any negative value since row_chunk values must be
|
||||
// between 0 and 1.
|
||||
row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] =
|
||||
ComputeType(-10000.f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace detail {
// Constructs some constants needed to partition the work across threads at
// compile time.
template <typename T, int EXPERTS, int BYTES_PER_LDG>
struct TopkConstants {
  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
  static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 ||
                    EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0,
                "");
  static constexpr int VECs_PER_THREAD =
      std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
  static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
  static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
  static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
};
}  // namespace detail
|
||||
|
||||
template <typename T, int EXPERTS, int WARPS_PER_TB>
void topk_gating_softmax_launcher_helper(const T* input,
                                         T* output,
                                         int* indices,
                                         int* source_row,
                                         const int64_t num_rows,
                                         const int64_t num_experts,
                                         const int64_t k,
                                         cudaStream_t stream) {
  static constexpr uint64_t MAX_BYTES_PER_LDG = 16;
  static constexpr int BYTES_PER_LDG =
      std::min(MAX_BYTES_PER_LDG, sizeof(T) * EXPERTS);
  using Constants = detail::TopkConstants<T, EXPERTS, BYTES_PER_LDG>;
  static constexpr int VPT = Constants::VPT;
  static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
  const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
  const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

  dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
  topk_gating_softmax<T, VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG>
      <<<num_blocks, block_dim, 0, stream>>>(
          input, output, num_rows, indices, source_row, k);
}
|
||||
|
||||
template <typename T>
|
||||
void topk_gating_softmax_kernelLauncher(const T* input,
|
||||
T* output,
|
||||
T* softmax,
|
||||
int* indices,
|
||||
int* source_row,
|
||||
T* softmax_max_prob,
|
||||
const int64_t num_rows,
|
||||
const int64_t num_experts,
|
||||
const int64_t k,
|
||||
const bool group_moe,
|
||||
cudaStream_t stream,
|
||||
const bool topk_only_mode = false) {
|
||||
if (topk_only_mode) {
|
||||
static constexpr int TPB = 256;
|
||||
const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
|
||||
moe_top_k<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
|
||||
input, output, indices, source_row, num_experts, k, num_rows);
|
||||
return;
|
||||
}
|
||||
static constexpr int WARPS_PER_TB = 4;
|
||||
|
||||
#define LAUNCH_TOPK_GATING_SOFTMAX_HELPER(N) \
|
||||
case N: { \
|
||||
topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB>( \
|
||||
input, output, indices, source_row, num_rows, num_experts, k, stream); \
|
||||
break; \
|
||||
}
|
||||
switch (num_experts) {
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(2)
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(4)
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(8)
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(16)
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(32)
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(64)
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(128)
|
||||
LAUNCH_TOPK_GATING_SOFTMAX_HELPER(256)
|
||||
|
||||
default: {
|
||||
static constexpr int TPB = 256;
|
||||
if (group_moe) {
|
||||
const int group_experts = num_experts / k;
|
||||
const int softmax_num_rows = num_rows * k;
|
||||
const auto config_softmax = Get1DBlocksAnd2DGridsMoe(softmax_num_rows);
|
||||
group_moe_softmax<T, TPB>
|
||||
<<<config_softmax.block_per_grid, TPB, 0, stream>>>(
|
||||
input,
|
||||
softmax,
|
||||
softmax_max_prob,
|
||||
group_experts,
|
||||
softmax_num_rows);
|
||||
const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
|
||||
moe_top_k<T, TPB>
|
||||
<<<config_topk.block_per_grid, TPB, 0, stream>>>(softmax,
|
||||
output,
|
||||
indices,
|
||||
source_row,
|
||||
softmax_max_prob,
|
||||
num_experts,
|
||||
k,
|
||||
num_rows);
|
||||
} else {
|
||||
const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
|
||||
moe_softmax<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
|
||||
input, softmax, num_experts, num_rows);
|
||||
moe_top_k<T, TPB>
|
||||
<<<config_topk.block_per_grid, TPB, 0, stream>>>(softmax,
|
||||
output,
|
||||
indices,
|
||||
source_row,
|
||||
num_experts,
|
||||
k,
|
||||
num_rows);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ========================== Permutation things =======================================

// Duplicates and permutes rows for MoE. In addition, it reverses the
// permutation map to help with finalizing routing.

// "expanded_x_row" simply means that the number of values is num_rows x k. It
// is "expanded" since we will have to duplicate some rows in the input matrix
// to match the dimensions. Duplicates will always get routed to separate
// experts in the end.

// Note that the expanded_dest_row_to_expanded_source_row map referred to here
// has indices in the range (0, k*rows_in_input - 1). However, it is set up so
// that index 0, rows_in_input, 2*rows_in_input ... (k-1)*rows_in_input all map
// to row 0 in the original matrix. Thus, to know where to read in the source
// matrix, we simply take the modulus of the expanded index.
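// For example, with rows_in_input = 4 and k = 2 the expanded indices run from
// 0 to 7; expanded rows 1 and 5 both originate from source row 1, since
// 1 % 4 == 5 % 4 == 1. This matches the source_rows[idx] = k_idx * num_rows +
// block_row encoding produced by the top-k kernels above.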
|
||||
|
||||
template <typename T, int VecSize>
|
||||
__global__ void initialize_moe_routing_kernel(
|
||||
const T* unpermuted_input,
|
||||
T* permuted_output,
|
||||
const int* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row,
|
||||
const int64_t num_rows,
|
||||
const int64_t active_rows,
|
||||
const int64_t cols,
|
||||
const int64_t num_rows_k) {
|
||||
using LoadT = AlignedVector<T, VecSize>;
|
||||
LoadT src_vec;
|
||||
|
||||
// Reverse permutation map.
|
||||
// I do this so that later, we can use the source -> dest map to do the k-way
|
||||
// reduction and unpermuting. I need the reverse map for that reduction to
|
||||
// allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1
|
||||
// thread block will be responsible for all k summations.
|
||||
const int expanded_dest_row = blockIdx.x + blockIdx.y * gridDim.x;
|
||||
if (expanded_dest_row >= num_rows_k) return;
|
||||
const int expanded_source_row =
|
||||
expanded_dest_row_to_expanded_source_row[expanded_dest_row];
|
||||
if (threadIdx.x == 0) {
|
||||
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
|
||||
expanded_dest_row;
|
||||
}
|
||||
|
||||
if ((blockIdx.x + blockIdx.y * gridDim.x) < active_rows) {
|
||||
// Duplicate and permute rows
|
||||
const int source_row = expanded_source_row % num_rows;
|
||||
|
||||
const T* source_row_ptr = unpermuted_input + source_row * cols;
|
||||
T* dest_row_ptr = permuted_output + expanded_dest_row * cols;
|
||||
|
||||
for (int tid = threadIdx.x * VecSize; tid < cols;
|
||||
tid += blockDim.x * VecSize) {
|
||||
// dest_row_ptr[tid] = source_row_ptr[tid];
|
||||
Load<T, VecSize>(&source_row_ptr[tid], &src_vec);
|
||||
Store<T, VecSize>(src_vec, &dest_row_ptr[tid]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void initialize_moe_routing_kernelLauncher(
|
||||
const T* unpermuted_input,
|
||||
T* permuted_output,
|
||||
const int* expanded_dest_row_to_expanded_source_row,
|
||||
int* expanded_source_row_to_expanded_dest_row,
|
||||
const int64_t num_rows,
|
||||
const int64_t active_rows,
|
||||
const int64_t cols,
|
||||
const int64_t k,
|
||||
cudaStream_t stream) {
|
||||
const int threads = std::min(cols, int64_t(1024));
|
||||
constexpr int max_pack_size = 16 / sizeof(T);
|
||||
const auto config_initialize = Get1DBlocksAnd2DGridsMoe(num_rows * k);
|
||||
if (cols % max_pack_size == 0) {
|
||||
initialize_moe_routing_kernel<T, max_pack_size>
|
||||
<<<config_initialize.block_per_grid, threads, 0, stream>>>(
|
||||
unpermuted_input,
|
||||
permuted_output,
|
||||
expanded_dest_row_to_expanded_source_row,
|
||||
expanded_source_row_to_expanded_dest_row,
|
||||
num_rows,
|
||||
k * active_rows,
|
||||
cols,
|
||||
num_rows * k);
|
||||
} else {
|
||||
initialize_moe_routing_kernel<T, 1>
|
||||
<<<config_initialize.block_per_grid, threads, 0, stream>>>(
|
||||
unpermuted_input,
|
||||
permuted_output,
|
||||
expanded_dest_row_to_expanded_source_row,
|
||||
expanded_source_row_to_expanded_dest_row,
|
||||
num_rows,
|
||||
k * active_rows,
|
||||
cols,
|
||||
num_rows * k);
|
||||
}
|
||||
}
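// The launcher above picks a 16-byte vector width (max_pack_size = 16 /
// sizeof(T)) whenever cols is divisible by it, and otherwise falls back to
// element-wise copies (VecSize = 1); both paths run the same kernel, only the
// AlignedVector load/store width changes.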
|
||||
|
||||
// ============================== Infer GEMM sizes =================================
__device__ inline int find_total_elts_leq_target(int* sorted_indices,
                                                 const int64_t arr_length,
                                                 const int64_t target) {
  int64_t low = 0, high = arr_length - 1, target_location = -1;
  while (low <= high) {
    int64_t mid = (low + high) / 2;

    if (sorted_indices[mid] > target) {
      high = mid - 1;
    } else {
      low = mid + 1;
      target_location = mid;
    }
  }
  return target_location + 1;
}
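// find_total_elts_leq_target returns the number of entries in the sorted array
// that are <= target, e.g. for sorted_indices = {0, 0, 1, 3, 3} and target = 1
// it returns 3. Applied to the sorted expert ids, this is the number of
// expanded rows assigned to experts with id <= target.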
|
||||
|
||||
void compute_total_rows_before_expert(int* sorted_indices,
|
||||
const int64_t total_indices,
|
||||
const int64_t num_experts,
|
||||
int32_t* total_rows_before_expert,
|
||||
cudaStream_t stream);
|
||||
|
||||
// Final kernel to unpermute and scale
|
||||
// This kernel unpermutes the original data, does the k-way reduction and
|
||||
// performs the final skip connection.
|
||||
template <typename T, int RESIDUAL_NUM>
|
||||
__global__ void finalize_moe_routing_kernel(
|
||||
const T* expanded_permuted_rows,
|
||||
T* reduced_unpermuted_output,
|
||||
const T* bias,
|
||||
const float* scales,
|
||||
const int* expanded_source_row_to_expanded_dest_row,
|
||||
const int* expert_for_source_row,
|
||||
const int64_t cols,
|
||||
const int64_t k,
|
||||
const int64_t compute_bias,
|
||||
const bool norm_topk_prob,
|
||||
const float routed_scaling_factor,
|
||||
const int64_t num_rows) {
|
||||
const int original_row = blockIdx.x + blockIdx.y * gridDim.x;
|
||||
// const int original_row = blockIdx.x;
|
||||
// const int num_rows = gridDim.x;
|
||||
if (original_row >= num_rows) return;
|
||||
T* reduced_row_ptr = reduced_unpermuted_output + original_row * cols;
|
||||
|
||||
for (int tid = threadIdx.x; tid < cols; tid += blockDim.x) {
|
||||
T thread_output{0.f};
|
||||
float row_rescale{0.f};
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
const int expanded_original_row = original_row + k_idx * num_rows;
|
||||
const int expanded_permuted_row =
|
||||
expanded_source_row_to_expanded_dest_row[expanded_original_row];
|
||||
|
||||
const int64_t k_offset = original_row * k + k_idx;
|
||||
const float row_scale = scales[k_offset];
|
||||
row_rescale = row_rescale + row_scale;
|
||||
|
||||
const T* expanded_permuted_rows_row_ptr =
|
||||
expanded_permuted_rows + expanded_permuted_row * cols;
|
||||
|
||||
const int expert_idx = expert_for_source_row[k_offset];
|
||||
const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr;
|
||||
const T bias_value = bias_ptr ? bias_ptr[tid] : T{0.f};
|
||||
|
||||
thread_output =
|
||||
static_cast<float>(thread_output) +
|
||||
row_scale * static_cast<float>(
|
||||
expanded_permuted_rows_row_ptr[tid] +
|
||||
bias_value *
|
||||
static_cast<T>(static_cast<float>(compute_bias)));
|
||||
}
|
||||
|
||||
thread_output = static_cast<float>(thread_output) /
|
||||
(norm_topk_prob ? row_rescale : 1.0f) *
|
||||
routed_scaling_factor;
|
||||
reduced_row_ptr[tid] = thread_output;
|
||||
}
|
||||
}
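// Per output element the kernel above computes
//   out[col] = routed_scaling_factor *
//              sum_k(scale_k * (x_k[col] + compute_bias * bias_k[col])) /
//              (norm_topk_prob ? sum_k(scale_k) : 1)
// i.e. a k-way weighted reduction over the permuted expert outputs, optionally
// renormalized by the sum of the top-k gate scales.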
|
||||
|
||||
template <typename T>
|
||||
void finalize_moe_routing_kernelLauncher(
|
||||
const T* expanded_permuted_rows,
|
||||
T* reduced_unpermuted_output,
|
||||
const T* bias,
|
||||
const float* scales,
|
||||
const int* expanded_source_row_to_expanded_dest_row,
|
||||
const int* expert_for_source_row,
|
||||
const int64_t num_rows,
|
||||
const int64_t cols,
|
||||
const int64_t k,
|
||||
const int64_t compute_bias,
|
||||
const bool norm_topk_prob,
|
||||
const float routed_scaling_factor,
|
||||
cudaStream_t stream) {
|
||||
const int threads = std::min(cols, int64_t(1024));
|
||||
const auto config_final = Get1DBlocksAnd2DGridsMoe(num_rows);
|
||||
|
||||
finalize_moe_routing_kernel<T, 1>
|
||||
<<<config_final.block_per_grid, threads, 0, stream>>>(
|
||||
expanded_permuted_rows,
|
||||
reduced_unpermuted_output,
|
||||
bias,
|
||||
scales,
|
||||
expanded_source_row_to_expanded_dest_row,
|
||||
expert_for_source_row,
|
||||
cols,
|
||||
k,
|
||||
compute_bias,
|
||||
norm_topk_prob,
|
||||
routed_scaling_factor,
|
||||
num_rows);
|
||||
}
|
||||
|
||||
// ========================= TopK Softmax specializations
|
||||
// ===========================
|
||||
template void topk_gating_softmax_kernelLauncher(const float*,
|
||||
float*,
|
||||
float*,
|
||||
int*,
|
||||
int*,
|
||||
float*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const bool,
|
||||
cudaStream_t,
|
||||
const bool);
|
||||
template void topk_gating_softmax_kernelLauncher(const half*,
|
||||
half*,
|
||||
half*,
|
||||
int*,
|
||||
int*,
|
||||
half*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const bool,
|
||||
cudaStream_t,
|
||||
const bool);
|
||||
#ifdef PADDLE_CUDA_BF16
|
||||
template void topk_gating_softmax_kernelLauncher(const __nv_bfloat16*,
|
||||
__nv_bfloat16*,
|
||||
__nv_bfloat16*,
|
||||
int*,
|
||||
int*,
|
||||
__nv_bfloat16*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const bool,
|
||||
cudaStream_t,
|
||||
const bool);
|
||||
#endif
|
||||
// ===================== Specializations for init routing
|
||||
// =========================
|
||||
template void initialize_moe_routing_kernelLauncher(const float*,
|
||||
float*,
|
||||
const int*,
|
||||
int*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
cudaStream_t);
|
||||
template void initialize_moe_routing_kernelLauncher(const half*,
|
||||
half*,
|
||||
const int*,
|
||||
int*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
cudaStream_t);
|
||||
#ifdef PADDLE_CUDA_BF16
|
||||
template void initialize_moe_routing_kernelLauncher(const __nv_bfloat16*,
|
||||
__nv_bfloat16*,
|
||||
const int*,
|
||||
int*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
cudaStream_t);
|
||||
#endif
|
||||
// ==================== Specializations for final routing
|
||||
// ===================================
|
||||
template void finalize_moe_routing_kernelLauncher(const float*,
|
||||
float*,
|
||||
const float*,
|
||||
const float*,
|
||||
const int*,
|
||||
const int*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const bool,
|
||||
const float,
|
||||
cudaStream_t);
|
||||
template void finalize_moe_routing_kernelLauncher(const half*,
|
||||
half*,
|
||||
const half*,
|
||||
const float*,
|
||||
const int*,
|
||||
const int*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const bool,
|
||||
const float,
|
||||
cudaStream_t);
|
||||
#ifdef PADDLE_CUDA_BF16
|
||||
template void finalize_moe_routing_kernelLauncher(const __nv_bfloat16*,
|
||||
__nv_bfloat16*,
|
||||
const __nv_bfloat16*,
|
||||
const float*,
|
||||
const int*,
|
||||
const int*,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const int64_t,
|
||||
const bool,
|
||||
const float,
|
||||
cudaStream_t);
|
||||
#endif
|
@@ -1,417 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "mctlass/numeric_conversion.h"
|
||||
#include "mctlassEx/mctlassEx.h"
|
||||
#include "fused_moe_helper.h"
|
||||
|
||||
|
||||
template <typename ElementA, typename ElementB, typename ElementC>
|
||||
void mc_grouped_gemm_basic_kernel(
|
||||
const ElementA* ptrA,
|
||||
mctlassExOrder_t majorA,
|
||||
const ElementB* ptrB,
|
||||
mctlassExOrder_t majorB,
|
||||
const ElementA* ptrScale,
|
||||
const ElementA* ptrBias,
|
||||
ElementC* ptrC,
|
||||
mctlassExOrder_t majorC,
|
||||
const int *ptrSegInd,
|
||||
int numExperts,
|
||||
int m, // expanded_active_expert_rows
|
||||
int n, // inter_dim
|
||||
int k, // hidden_size
|
||||
mcStream_t stream) {
|
||||
mctlassExHandle_t handle;
|
||||
mctlassExHandleCreate(&handle);
|
||||
|
||||
int* ptrMNumTilesInd;
|
||||
mcMallocAsync((void**)&ptrMNumTilesInd, sizeof(int) * numExperts, stream);
|
||||
|
||||
mctlassExMatrixLayout_t matLayoutA;
|
||||
mctlassExMatrixLayout_t matLayoutB;
|
||||
mctlassExMatrixLayout_t matLayoutC;
|
||||
|
||||
// mat A: (m, k)
|
||||
mctlassExMatrixLayoutCreate(&matLayoutA, mctlassExDataType::MCTLASS_EX_BF16, m, k, k);
|
||||
mctlassExMatrixLayoutSetAttribute(matLayoutA, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
|
||||
&majorA, sizeof(mctlassExOrder_t));
|
||||
// mat B: (num_experts, n, k)
|
||||
mctlassExMatrixLayoutCreate(&matLayoutB, mctlassExDataType::MCTLASS_EX_INT8, k, n, k);
|
||||
mctlassExMatrixLayoutSetAttribute(matLayoutB, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
|
||||
&majorB, sizeof(mctlassExOrder_t));
|
||||
mctlassExMatrixLayoutSetAttribute(matLayoutB, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
|
||||
&numExperts, sizeof(int));
|
||||
// mat C: (m, n)
|
||||
mctlassExMatrixLayoutCreate(&matLayoutC, mctlassExDataType::MCTLASS_EX_BF16, m, n, n);
|
||||
mctlassExMatrixLayoutSetAttribute(matLayoutC, mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
|
||||
&majorC, sizeof(mctlassExOrder_t));
|
||||
// bias: (num_experts, n)
|
||||
// scale: (num_experts, n)
|
||||
|
||||
mctlassExDesc_t mctlass_desc;
|
||||
mctlassExCreateDesc(&mctlass_desc);
|
||||
mctlassExDataType input_type = mctlassExDataType::MCTLASS_EX_BF16;
|
||||
mctlassExDataType scale_type = mctlassExDataType::MCTLASS_EX_INT8;
|
||||
mctlassExDataType compute_type = mctlassExDataType::MCTLASS_EX_FP32;
|
||||
mctlassExEpilogueType epilogue_type = mctlassExEpilogueType::MCTLASS_EX_GEMM_DEFAULT;
|
||||
if (ptrBias) {
|
||||
epilogue_type = mctlassExEpilogueType::MCTLASS_EX_GEMM_BIAS_PERGROUP;
|
||||
}
|
||||
// set scale
|
||||
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_B_SCALE_POINTER,
|
||||
&ptrScale, sizeof(ptrScale));
|
||||
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_B_SCALE_TYPE,
|
||||
&scale_type, sizeof(mctlassExDataType));
|
||||
// set bias
|
||||
if (ptrBias) {
|
||||
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_BIAS_POINTER,
|
||||
&ptrBias, sizeof(ptrBias));
|
||||
}
|
||||
// set compute type
|
||||
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_COMPUTE_TYPE,
|
||||
&compute_type, sizeof(mctlassExDataType));
|
||||
// set epilogue type
|
||||
mctlassExDescSetAttribute(mctlass_desc, mctlassExDescAttributes_t::MCTLASS_EX_GEMM_DESC_EPILOGUE_TYPE,
|
||||
&epilogue_type, sizeof(mctlassExEpilogueType));
|
||||
|
||||
const mctlassExContiguousGroupedGemmAlgo_t algo = mctlassExContiguousGroupedGemmAlgo_t::MCTLASS_EX_CONTIGUOUS_GROUPED_ALGO_SEGPTR;
|
||||
int blocksizeM = mctlassExContiguousGroupedGemmGetBlocksizeM(handle, mctlass_desc, matLayoutA, matLayoutB, matLayoutC, &algo);
|
||||
mctlassExContiguousGroupedGemmComputeMNumTilesIndptr(handle, mctlass_desc, matLayoutA, matLayoutB, matLayoutC, &algo, ptrSegInd, ptrMNumTilesInd, numExperts, blocksizeM);
|
||||
|
||||
mctlassExContiguousGroupedGemmBasic(handle, mctlass_desc,
|
||||
ptrA, matLayoutA,
|
||||
ptrB, matLayoutB,
|
||||
ptrC, matLayoutC,
|
||||
ptrSegInd, nullptr, ptrMNumTilesInd,
|
||||
&algo, nullptr, 0, stream);
|
||||
|
||||
mctlassExHandleDestroy(handle);
|
||||
mctlassExMatrixLayoutDestroy(matLayoutA);
|
||||
mctlassExMatrixLayoutDestroy(matLayoutB);
|
||||
mctlassExMatrixLayoutDestroy(matLayoutC);
|
||||
mctlassExDestroyDesc(mctlass_desc);
|
||||
mcFreeAsync(ptrMNumTilesInd, stream);
|
||||
}
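// Notes on the grouped GEMM configured above: A is the bf16 activation of shape
// (m, k) with rows laid out contiguously per expert, B is the int8 expert
// weight batched num_experts times and dequantized with the per-group scale set
// via MCTLASS_EX_GEMM_DESC_B_SCALE_POINTER (plus an optional per-group bias),
// and C is the bf16 output of shape (m, n). ptrSegInd is the per-expert prefix
// sum of row counts; the SEGPTR algorithm uses it, together with the per-expert
// tile counts in ptrMNumTilesInd, to assign tiles of A to experts.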
|
||||
|
||||
template<typename T, typename ElementA, typename ElementB, typename ElementC>
|
||||
class McMoeHelper {
|
||||
public:
|
||||
McMoeHelper(const std::string gemm_method): gemm_method_(gemm_method) {}
|
||||
|
||||
// -------- getWorkspaceSize -------- //
|
||||
template <typename KeyT>
|
||||
size_t getWorkspaceSize(const int64_t num_rows,
|
||||
const int64_t hidden_size,
|
||||
const int64_t inter_size,
|
||||
const int64_t num_experts,
|
||||
const int64_t k) {
|
||||
const size_t buf_size = AlignTo16(k * num_rows * hidden_size);
|
||||
const size_t interbuf_size = AlignTo16(k * num_rows * inter_size);
|
||||
const size_t padded_experts = AlignTo16(num_experts);
|
||||
const size_t num_moe_inputs = AlignTo16(k * num_rows);
|
||||
// softmax output, permuted_rows and permuted_experts have moved to outside
|
||||
// of moe kernel, allocate them in Encoder or Decoder before invoking
|
||||
// FfnLayer forward.
|
||||
size_t total_ws_bytes =
|
||||
5 * num_moe_inputs *
|
||||
sizeof(int); // source_rows_, permuted_rows_, permuted_experts_
|
||||
total_ws_bytes += buf_size * sizeof(KeyT); // permuted_data
|
||||
total_ws_bytes +=
|
||||
padded_experts * sizeof(int32_t); // Hold total_rows_before_expert_
|
||||
|
||||
const size_t bytes_for_fc1_result = interbuf_size * sizeof(KeyT);
|
||||
const size_t sorter_ws_size_bytes =
|
||||
AlignTo16(sorter_.getWorkspaceSize(num_rows));
|
||||
sorter_.update_num_experts(num_experts);
|
||||
|
||||
int64_t bytes_for_intermediate_and_sorting = bytes_for_fc1_result;
|
||||
if (sorter_ws_size_bytes > bytes_for_fc1_result) {
|
||||
int64_t remaining_bytes =
|
||||
AlignTo16(sorter_ws_size_bytes - bytes_for_fc1_result);
|
||||
bytes_for_intermediate_and_sorting += remaining_bytes;
|
||||
}
|
||||
|
||||
total_ws_bytes +=
|
||||
bytes_for_intermediate_and_sorting; // intermediate (fc1) output + cub
|
||||
// sorting workspace
|
||||
|
||||
int64_t num_softmax_outs = 0;
|
||||
const bool is_pow_2 =
|
||||
(num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
|
||||
if (!is_pow_2 || num_experts > 256) {
|
||||
num_softmax_outs = AlignTo16(num_rows * num_experts);
|
||||
}
|
||||
|
||||
total_ws_bytes += num_softmax_outs * sizeof(float);
|
||||
|
||||
return total_ws_bytes;
|
||||
}
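// The byte count returned above is carved up by computeFFN in this order:
//   5 x num_moe_inputs ints   -> expert_for_source_row, source_rows_,
//                                permuted_rows_, permuted_experts_,
//                                expanded_source_row_to_expanded_dest_row
//   buf_size x KeyT           -> permuted_data_
//   padded_experts x int32    -> total_rows_before_expert_
//   max(fc1 bytes, sorter ws) -> fc1_result_ (shared with the cub sort space)
//   num_rows x num_experts floats (only when num_experts is not a power of two
//   <= 256)                   -> softmax_out_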
|
||||
|
||||
void computeFFN(const paddle::Tensor *input,
|
||||
const paddle::Tensor *gate_weight,
|
||||
const paddle::Tensor *ffn1_weight,
|
||||
const paddle::Tensor *ffn1_scale,
|
||||
const paddle::Tensor *ffn1_bias,
|
||||
const paddle::Tensor *ffn2_weight,
|
||||
const paddle::Tensor *ffn2_scale,
|
||||
const paddle::Tensor *ffn2_bias,
|
||||
const paddle::Tensor *moe_token_type_ids,
|
||||
const int moe_topk,
|
||||
const bool group_moe,
|
||||
const bool norm_topk_prob,
|
||||
const float routed_scaling_factor,
|
||||
const std::string moe_type,
|
||||
paddle::Tensor *output) {
|
||||
auto *input_activations = input->data<T>();
|
||||
auto *gating_weights = gate_weight->data<float>();
|
||||
const T *fc1_expert_biases = ffn1_bias ? ffn1_bias->data<T>() : nullptr;
|
||||
const T *fc2_expert_biases = ffn2_bias ? ffn2_bias->data<T>() : nullptr;
|
||||
|
||||
auto *output_ = output->data<T>();
|
||||
auto stream = input->stream();
|
||||
auto place = input->place();
|
||||
auto input_type = input->dtype();
|
||||
|
||||
auto input_dims = input->dims();
|
||||
auto ffn1_dims = ffn1_weight->dims();
|
||||
int64_t token_num = 0;
|
||||
if (input_dims.size() == 3) {
|
||||
token_num = input_dims[0] * input_dims[1];
|
||||
} else {
|
||||
token_num = input_dims[0];
|
||||
}
|
||||
const int64_t num_rows = token_num;
|
||||
|
||||
const int64_t hidden_size = ffn1_dims[2];
|
||||
int64_t inter_dim = 0;
|
||||
if (moe_type == "qkv") {
|
||||
inter_dim = ffn1_dims[2] * ffn1_dims[3] * ffn1_dims[4];
|
||||
} else {
|
||||
inter_dim = ffn1_dims[1];
|
||||
}
|
||||
|
||||
// if (gemm_method == "weight_only_int4") {
|
||||
// inter_dim = inter_dim * 2;
|
||||
// }
|
||||
|
||||
const int64_t inter_size = inter_dim;
|
||||
const int64_t num_experts = ffn1_dims[0];
|
||||
const int64_t k = moe_topk;
|
||||
|
||||
|
||||
int64_t bytes =
|
||||
getWorkspaceSize<T>(num_rows, hidden_size, inter_size, num_experts, k);
|
||||
|
||||
// Pointers
|
||||
int *expert_for_source_row;
|
||||
int *source_rows_;
|
||||
int *permuted_rows_;
|
||||
int *permuted_experts_;
|
||||
int *expanded_source_row_to_expanded_dest_row;
|
||||
|
||||
T *permuted_data_;
|
||||
int32_t *total_rows_before_expert_;
|
||||
T *fc1_result_;
|
||||
float *softmax_out_;
|
||||
|
||||
paddle::Tensor ws_ptr_tensor =
|
||||
GetEmptyTensor({bytes}, paddle::DataType::INT8, place);
|
||||
int8_t *ws_ptr = ws_ptr_tensor.data<int8_t>();
|
||||
|
||||
const int64_t buf_size = AlignTo16(k * num_rows * hidden_size);
|
||||
const int64_t interbuf_size = AlignTo16(k * num_rows * inter_size);
|
||||
const int64_t padded_experts = AlignTo16(num_experts);
|
||||
const int64_t num_moe_inputs = AlignTo16(k * num_rows);
|
||||
|
||||
expert_for_source_row = reinterpret_cast<int *>(ws_ptr);
|
||||
source_rows_ = expert_for_source_row + num_moe_inputs;
|
||||
permuted_rows_ = source_rows_ + num_moe_inputs;
|
||||
permuted_experts_ = permuted_rows_ + num_moe_inputs;
|
||||
expanded_source_row_to_expanded_dest_row =
|
||||
permuted_experts_ + num_moe_inputs;
|
||||
permuted_data_ = reinterpret_cast<T *>(
|
||||
expanded_source_row_to_expanded_dest_row + num_moe_inputs);
|
||||
total_rows_before_expert_ =
|
||||
reinterpret_cast<int32_t *>(permuted_data_ + buf_size);
|
||||
fc1_result_ =
|
||||
reinterpret_cast<T *>(total_rows_before_expert_ + padded_experts);
|
||||
|
||||
const bool is_pow_2 =
|
||||
(num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
|
||||
if (!is_pow_2 || num_experts > 256) {
|
||||
softmax_out_ = reinterpret_cast<float *>(fc1_result_ + interbuf_size);
|
||||
} else {
|
||||
softmax_out_ = nullptr;
|
||||
}
|
||||
|
||||
paddle::Tensor expert_scales_float_tensor =
|
||||
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
|
||||
float *expert_scales_float = expert_scales_float_tensor.data<float>();
|
||||
|
||||
float *softmax_max_prob = nullptr;
|
||||
if (group_moe) {
|
||||
paddle::Tensor softmax_max_prob_tensor = GetEmptyTensor(
|
||||
{num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
|
||||
// TODO: check that the fill succeeded.
|
||||
paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
|
||||
softmax_max_prob = softmax_max_prob_tensor.data<float>();
|
||||
}
|
||||
|
||||
paddle::Tensor fc1_out_tensor =
|
||||
GetEmptyTensor({num_rows * k, inter_size}, input_type, place);
|
||||
T *fc1_out = fc1_out_tensor.data<T>();
|
||||
|
||||
auto input_cast_tensor =
|
||||
paddle::experimental::cast(*input, paddle::DataType::FLOAT32);
|
||||
auto gate_tensor =
|
||||
paddle::experimental::matmul(input_cast_tensor, *gate_weight);
|
||||
float *gating_output = gate_tensor.data<float>();
|
||||
|
||||
if (moe_token_type_ids) {
|
||||
auto *moe_token_type_ids_out = moe_token_type_ids->data<int>();
|
||||
moe_token_type_ids_kernelLauncher<float>(gating_output,
|
||||
moe_token_type_ids_out,
|
||||
num_rows,
|
||||
num_experts,
|
||||
k,
|
||||
stream);
|
||||
}
|
||||
|
||||
topk_gating_softmax_kernelLauncher<float>(gating_output,
|
||||
expert_scales_float,
|
||||
softmax_out_,
|
||||
expert_for_source_row,
|
||||
source_rows_,
|
||||
softmax_max_prob,
|
||||
num_rows,
|
||||
num_experts,
|
||||
k,
|
||||
group_moe,
|
||||
stream);
|
||||
|
||||
const int64_t sorter_ws_size_bytes =
|
||||
AlignTo16(sorter_.getWorkspaceSize(int64_t(k * num_rows)));
|
||||
|
||||
sorter_.run(fc1_result_,
|
||||
sorter_ws_size_bytes,
|
||||
expert_for_source_row,
|
||||
permuted_experts_,
|
||||
source_rows_,
|
||||
permuted_rows_,
|
||||
k * num_rows,
|
||||
false,
|
||||
stream);
|
||||
|
||||
initialize_moe_routing_kernelLauncher(
|
||||
input_activations,
|
||||
permuted_data_,
|
||||
permuted_rows_,
|
||||
expanded_source_row_to_expanded_dest_row,
|
||||
num_rows,
|
||||
num_rows,
|
||||
hidden_size,
|
||||
k,
|
||||
stream);
|
||||
|
||||
const int64_t expanded_active_expert_rows = k * num_rows;
|
||||
|
||||
compute_total_rows_before_expert(permuted_experts_,
|
||||
expanded_active_expert_rows,
|
||||
num_experts,
|
||||
total_rows_before_expert_,
|
||||
stream);
|
||||
|
||||
mctlassExOrder_t row_major = mctlassExOrder_t::MCTLASS_EX_ROWMAJOR_ORDER;
|
||||
mctlassExOrder_t column_major = mctlassExOrder_t::MCTLASS_EX_COLUMNMAJOR_ORDER;
|
||||
|
||||
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
|
||||
reinterpret_cast<const ElementA *>(permuted_data_),
|
||||
row_major,
|
||||
reinterpret_cast<const ElementB *>(ffn1_weight->data<ElementB>()),
|
||||
column_major,
|
||||
reinterpret_cast<const ElementA *>(ffn1_scale->data<T>()),
|
||||
reinterpret_cast<const ElementA *>(fc1_expert_biases),
|
||||
reinterpret_cast<ElementC *>(fc1_out),
|
||||
row_major,
|
||||
total_rows_before_expert_,
|
||||
num_experts,
|
||||
expanded_active_expert_rows,
|
||||
inter_size,
|
||||
hidden_size,
|
||||
stream);
|
||||
|
||||
if (moe_type == "ffn") {
|
||||
auto act_out_tensor =
|
||||
paddle::experimental::swiglu(fc1_out_tensor, nullptr);
|
||||
auto act_out = act_out_tensor.data<T>();
|
||||
|
||||
paddle::Tensor fc2_output_tensor =
|
||||
GetEmptyTensor({k * num_rows, hidden_size}, input_type, place);
|
||||
T *fc2_result = fc2_output_tensor.data<T>();
|
||||
|
||||
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
|
||||
reinterpret_cast<const ElementA *>(act_out),
|
||||
row_major,
|
||||
reinterpret_cast<const ElementB *>(ffn2_weight->data<ElementB>()),
|
||||
column_major,
|
||||
reinterpret_cast<const ElementA *>(ffn2_scale->data<T>()),
|
||||
nullptr,
|
||||
reinterpret_cast<ElementC *>(fc2_result),
|
||||
row_major,
|
||||
total_rows_before_expert_,
|
||||
num_experts,
|
||||
expanded_active_expert_rows,
|
||||
hidden_size,
|
||||
inter_size / 2,
|
||||
stream);
|
||||
|
||||
finalize_moe_routing_kernelLauncher(
|
||||
fc2_result,
|
||||
output_,
|
||||
fc2_expert_biases,
|
||||
reinterpret_cast<float *>(expert_scales_float),
|
||||
expanded_source_row_to_expanded_dest_row,
|
||||
expert_for_source_row,
|
||||
num_rows,
|
||||
hidden_size,
|
||||
k,
|
||||
static_cast<int>(1),
|
||||
norm_topk_prob,
|
||||
routed_scaling_factor,
|
||||
stream);
|
||||
} else {
|
||||
finalize_moe_routing_kernelLauncher(
|
||||
// fc2_result,
|
||||
fc1_out,
|
||||
output_,
|
||||
fc1_expert_biases, // fc2_expert_biases,
|
||||
reinterpret_cast<float *>(expert_scales_float),
|
||||
expanded_source_row_to_expanded_dest_row,
|
||||
expert_for_source_row,
|
||||
num_rows,
|
||||
inter_size,
|
||||
k,
|
||||
static_cast<int>(0),
|
||||
norm_topk_prob,
|
||||
routed_scaling_factor,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::string gemm_method_;
|
||||
CubKeyValueSorter sorter_;
|
||||
};
|
@@ -1,274 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
|
||||
#pragma GCC diagnostic ignored "-Wunused-function"
|
||||
#pragma once
|
||||
|
||||
#include "fused_moe_helper.h"
|
||||
#include "fused_moe_op.h"
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#include "helper.h"
|
||||
|
||||
|
||||
template <paddle::DataType T>
|
||||
void MoeDispatchKernel(const paddle::Tensor& input,
|
||||
const paddle::Tensor& gating_output,
|
||||
const int moe_topk,
|
||||
const bool group_moe,
|
||||
const bool topk_only_mode,
|
||||
const int num_rows,
|
||||
const int hidden_size,
|
||||
const int expert_num,
|
||||
paddle::Tensor* permute_input,
|
||||
paddle::Tensor* tokens_expert_prefix_sum,
|
||||
paddle::Tensor* permute_indices_per_token,
|
||||
paddle::Tensor* top_k_weight,
|
||||
paddle::Tensor* top_k_indices) {
|
||||
typedef PDTraits<T> traits_;
|
||||
typedef typename traits_::DataType DataType_;
|
||||
typedef typename traits_::data_t data_t;
|
||||
|
||||
auto stream = input.stream();
|
||||
auto place = input.place();
|
||||
|
||||
if (group_moe) {
|
||||
// Check if expert_num is divisible by moe_topk, else throw an error
|
||||
PADDLE_ENFORCE_EQ(expert_num % moe_topk,
|
||||
0,
|
||||
common::errors::InvalidArgument(
|
||||
"The number of experts (expert_num) "
|
||||
"must be divisible by moe_topk. "
|
||||
"Got expert_num = %d and moe_topk = %d.",
|
||||
expert_num,
|
||||
moe_topk));
|
||||
}
|
||||
|
||||
const int num_moe_inputs = AlignTo16(num_rows * moe_topk);
|
||||
const int bytes = num_moe_inputs * sizeof(int);
|
||||
|
||||
CubKeyValueSorter sorter_;
|
||||
sorter_.update_num_experts(expert_num);
|
||||
|
||||
const int sorter_ws_size_bytes =
|
||||
AlignTo16(sorter_.getWorkspaceSize(moe_topk * num_rows));
|
||||
const int sort_tmp_in_out_size = num_moe_inputs * 2 * sizeof(int);
|
||||
|
||||
paddle::Tensor ws_ptr_tensor =
|
||||
GetEmptyTensor({bytes + sorter_ws_size_bytes + sort_tmp_in_out_size},
|
||||
paddle::DataType::INT8,
|
||||
place);
|
||||
|
||||
int8_t* ws_ptr = ws_ptr_tensor.data<int8_t>();
|
||||
int* source_rows_ = reinterpret_cast<int*>(ws_ptr);
|
||||
int8_t* sorter_ws_ptr = reinterpret_cast<int8_t*>(ws_ptr + bytes);
|
||||
int* permuted_experts_ =
|
||||
reinterpret_cast<int*>(sorter_ws_ptr + sorter_ws_size_bytes);
|
||||
int* permuted_rows_ = permuted_experts_ + num_moe_inputs;
|
||||
|
||||
int* expert_for_source_row = top_k_indices->data<int>();
|
||||
|
||||
float* softmax_max_prob = nullptr;
|
||||
if (group_moe) {
|
||||
paddle::Tensor softmax_max_prob_tensor =
|
||||
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
|
||||
paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
|
||||
softmax_max_prob = softmax_max_prob_tensor.data<float>();
|
||||
}
|
||||
|
||||
float* softmax_out_;
|
||||
|
||||
const bool is_pow_2 =
|
||||
(expert_num != 0) && ((expert_num & (expert_num - 1)) == 0);
|
||||
|
||||
paddle::Tensor softmax_buffer;
|
||||
|
||||
if (!is_pow_2 || expert_num > 256 || group_moe) {
|
||||
softmax_buffer = GetEmptyTensor(
|
||||
{num_rows * expert_num}, paddle::DataType::FLOAT32, place);
|
||||
softmax_out_ = softmax_buffer.data<float>();
|
||||
} else {
|
||||
softmax_out_ = nullptr;
|
||||
}
|
||||
|
||||
topk_gating_softmax_kernelLauncher<float>(gating_output.data<float>(),
|
||||
top_k_weight->data<float>(),
|
||||
softmax_out_,
|
||||
expert_for_source_row,
|
||||
source_rows_,
|
||||
softmax_max_prob,
|
||||
num_rows,
|
||||
expert_num,
|
||||
moe_topk,
|
||||
group_moe,
|
||||
stream,
|
||||
topk_only_mode);
|
||||
|
||||
sorter_.run(reinterpret_cast<void*>(sorter_ws_ptr),
|
||||
sorter_ws_size_bytes,
|
||||
expert_for_source_row,
|
||||
permuted_experts_,
|
||||
source_rows_,
|
||||
permuted_rows_,
|
||||
moe_topk * num_rows,
|
||||
false,
|
||||
stream);
|
||||
|
||||
|
||||
initialize_moe_routing_kernelLauncher(
|
||||
input.data<data_t>(),
|
||||
permute_input->data<data_t>(),
|
||||
permuted_rows_,
|
||||
permute_indices_per_token->data<int32_t>(),
|
||||
num_rows,
|
||||
num_rows,
|
||||
hidden_size,
|
||||
moe_topk,
|
||||
stream);
|
||||
|
||||
|
||||
compute_total_rows_before_expert(
|
||||
permuted_experts_,
|
||||
moe_topk * num_rows,
|
||||
expert_num,
|
||||
tokens_expert_prefix_sum->data<int32_t>(),
|
||||
stream);
|
||||
}
|
||||
|
||||
|
||||
std::vector<paddle::Tensor> MoeExpertDispatch(
|
||||
const paddle::Tensor& input,
|
||||
const paddle::Tensor& gating_output,
|
||||
const int moe_topk,
|
||||
const bool group_moe,
|
||||
const bool topk_only_mode) {
|
||||
const auto input_type = input.dtype();
|
||||
auto place = input.place();
|
||||
int token_rows = 0;
|
||||
auto input_dims = input.dims();
|
||||
auto gating_dims = gating_output.dims();
|
||||
const int expert_num = gating_dims[gating_dims.size() - 1];
|
||||
|
||||
if (input_dims.size() == 3) {
|
||||
token_rows = input_dims[0] * input_dims[1];
|
||||
} else {
|
||||
token_rows = input_dims[0];
|
||||
}
|
||||
const int num_rows = token_rows;
|
||||
const int hidden_size = input.dims()[input_dims.size() - 1];
|
||||
|
||||
auto permute_input =
|
||||
GetEmptyTensor({moe_topk * num_rows, hidden_size}, input_type, place);
|
||||
// top_k_weight: the weighting coefficients applied to each expert's result.
|
||||
auto top_k_weight =
|
||||
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
|
||||
auto top_k_indices =
|
||||
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::INT32, place);
|
||||
|
||||
auto tokens_expert_prefix_sum =
|
||||
GetEmptyTensor({expert_num}, paddle::DataType::INT32, place);
|
||||
auto permute_indices_per_token =
|
||||
GetEmptyTensor({moe_topk, num_rows}, paddle::DataType::INT32, place);
|
||||
|
||||
|
||||
switch (input_type) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
MoeDispatchKernel<paddle::DataType::BFLOAT16>(input,
|
||||
gating_output,
|
||||
moe_topk,
|
||||
group_moe,
|
||||
topk_only_mode,
|
||||
num_rows,
|
||||
hidden_size,
|
||||
expert_num,
|
||||
&permute_input,
|
||||
&tokens_expert_prefix_sum,
|
||||
&permute_indices_per_token,
|
||||
&top_k_weight,
|
||||
&top_k_indices);
|
||||
break;
|
||||
// case paddle::DataType::FLOAT16:
|
||||
// MoeDispatchKernel<paddle::DataType::FLOAT16>(input,
|
||||
// gating_output,
|
||||
// moe_topk,
|
||||
// group_moe,
|
||||
// topk_only_mode,
|
||||
// num_rows,
|
||||
// hidden_size,
|
||||
// expert_num,
|
||||
// &permute_input,
|
||||
// &tokens_expert_prefix_sum,
|
||||
// &permute_indices_per_token,
|
||||
// &top_k_weight,
|
||||
// &top_k_indices);
|
||||
// break;
|
||||
default:
|
||||
PD_THROW("Only support bf16 for MoeDispatchKernel");
|
||||
}
|
||||
return {permute_input,
|
||||
tokens_expert_prefix_sum,
|
||||
permute_indices_per_token,
|
||||
top_k_weight,
|
||||
top_k_indices};
|
||||
}
|
||||
|
||||
|
||||
std::vector<std::vector<int64_t>> MoeExpertDispatchInferShape(
|
||||
const std::vector<int64_t>& input_shape,
|
||||
const std::vector<int64_t>& gating_output_shape,
|
||||
const int moe_topk) {
|
||||
int token_rows = -1;
|
||||
|
||||
if (input_shape.size() == 3) {
|
||||
token_rows = input_shape[0] * input_shape[1];
|
||||
} else {
|
||||
token_rows = input_shape[0];
|
||||
}
|
||||
const int expert_num = gating_output_shape[gating_output_shape.size() - 1];
|
||||
const int num_rows = token_rows;
|
||||
const int hidden_size = input_shape[input_shape.size() - 1];
|
||||
|
||||
return {{moe_topk * num_rows, hidden_size},
|
||||
{expert_num},
|
||||
{moe_topk, num_rows},
|
||||
{num_rows, moe_topk},
|
||||
{num_rows, moe_topk}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> MoeExpertDispatchInferDtype(
|
||||
const paddle::DataType& input_dtype,
|
||||
const paddle::DataType& gating_output_dtype,
|
||||
const int moe_topk) {
|
||||
return {input_dtype,
|
||||
paddle::DataType::INT64,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::FLOAT32,
|
||||
paddle::DataType::INT32};
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(moe_expert_dispatch)
|
||||
.Inputs({"input", "gating_output"})
|
||||
.Outputs({"permute_input",
|
||||
"tokens_expert_prefix_sum",
|
||||
"permute_indices_per_token",
|
||||
"top_k_weight",
|
||||
"top_k_indices"})
|
||||
.Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
|
||||
.SetKernelFn(PD_KERNEL(MoeExpertDispatch))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));
|
@@ -1,173 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#pragma once
|
||||
#include "mc_fused_moe_helper.h"
|
||||
#include "helper.h"
|
||||
|
||||
template <paddle::DataType T, typename ElementA, typename ElementB, typename ElementC>
|
||||
void McMoeFFNKernel(const paddle::Tensor& permute_input,
|
||||
const paddle::Tensor& tokens_expert_prefix_sum,
|
||||
const paddle::Tensor& ffn1_weight,
|
||||
const paddle::Tensor& ffn2_weight,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_bias,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_scale,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_scale,
|
||||
const std::string& quant_method,
|
||||
paddle::Tensor ffn_out) {
|
||||
typedef PDTraits<T> traits_;
|
||||
typedef typename traits_::DataType DataType_;
|
||||
typedef typename traits_::data_t data_t;
|
||||
|
||||
auto ffn_out_ptr = ffn_out.data<data_t>();
|
||||
auto permuted_input_ptr = permute_input.data<data_t>();
|
||||
auto place = permute_input.place();
|
||||
auto input_type = permute_input.dtype();
|
||||
auto stream = permute_input.stream();
|
||||
|
||||
const int expanded_active_expert_rows = permute_input.dims()[0]; // permute_input.dims(): m, k
|
||||
const int num_experts = ffn1_weight.dims()[0]; // batchsize
|
||||
const int hidden_size = ffn1_weight.dims()[2]; // n
|
||||
int inter_dim = ffn1_weight.dims()[1]; // k
|
||||
|
||||
const int64_t inter_size = inter_dim; // since weight_only_int_8
|
||||
paddle::Tensor fc1_out_tensor = GetEmptyTensor(
|
||||
{expanded_active_expert_rows, inter_size}, input_type, place);
|
||||
auto fc1_out_ptr = fc1_out_tensor.data<data_t>();
|
||||
|
||||
mctlassExOrder_t row_major = mctlassExOrder_t::MCTLASS_EX_ROWMAJOR_ORDER;
|
||||
mctlassExOrder_t column_major = mctlassExOrder_t::MCTLASS_EX_COLUMNMAJOR_ORDER;
|
||||
|
||||
// ffn1
|
||||
auto fc1_expert_biases =
|
||||
ffn1_bias
|
||||
? const_cast<paddle::Tensor*>(ffn1_bias.get_ptr())->data<data_t>()
|
||||
: nullptr;
|
||||
auto fc1_expert_scales = const_cast<paddle::Tensor*>(ffn1_scale.get_ptr())->data<data_t>();
|
||||
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
|
||||
reinterpret_cast<const ElementA *>(permuted_input_ptr),
|
||||
row_major,
|
||||
reinterpret_cast<const ElementB *>(ffn1_weight.data<ElementB>()),
|
||||
column_major,
|
||||
reinterpret_cast<const ElementA *>(fc1_expert_scales),
|
||||
reinterpret_cast<const ElementA *>(fc1_expert_biases),
|
||||
reinterpret_cast<ElementC *>(fc1_out_ptr),
|
||||
row_major,
|
||||
tokens_expert_prefix_sum.data<int>(),
|
||||
num_experts,
|
||||
expanded_active_expert_rows,
|
||||
inter_dim,
|
||||
hidden_size,
|
||||
stream);
|
||||
|
||||
// swiglu
|
||||
auto act_out_tensor = paddle::experimental::swiglu(fc1_out_tensor, nullptr);
|
||||
auto act_out = act_out_tensor.data<data_t>();
|
||||
|
||||
auto fc2_expert_scales = const_cast<paddle::Tensor*>(ffn2_scale.get_ptr())->data<data_t>();
|
||||
mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
|
||||
reinterpret_cast<const ElementA *>(act_out),
|
||||
row_major,
|
||||
reinterpret_cast<const ElementB *>(ffn2_weight.data<ElementB>()),
|
||||
column_major,
|
||||
reinterpret_cast<const ElementA *>(fc2_expert_scales),
|
||||
nullptr,
|
||||
reinterpret_cast<ElementC *>(ffn_out_ptr),
|
||||
row_major,
|
||||
tokens_expert_prefix_sum.data<int>(),
|
||||
num_experts,
|
||||
expanded_active_expert_rows,
|
||||
hidden_size,
|
||||
inter_dim / 2,
|
||||
stream);
|
||||
}
|
||||
|
||||
std::vector<paddle::Tensor> MoeExpertFFN(
|
||||
const paddle::Tensor& permute_input,
|
||||
const paddle::Tensor& tokens_expert_prefix_sum,
|
||||
const paddle::Tensor& ffn1_weight,
|
||||
const paddle::Tensor& ffn2_weight,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_bias,
|
||||
const paddle::optional<paddle::Tensor>& ffn1_scale,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_scale,
|
||||
const std::string& quant_method) {
|
||||
assert(quant_method == "weight_only_int8");
|
||||
const auto input_type = permute_input.dtype();
|
||||
auto ffn_out = paddle::empty_like(permute_input);
|
||||
|
||||
switch (input_type) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
McMoeFFNKernel<paddle::DataType::BFLOAT16, maca_bfloat16, int8_t, maca_bfloat16>(permute_input,
|
||||
tokens_expert_prefix_sum,
|
||||
ffn1_weight,
|
||||
ffn2_weight,
|
||||
ffn1_bias,
|
||||
ffn1_scale,
|
||||
ffn2_scale,
|
||||
quant_method,
|
||||
ffn_out);
|
||||
break;
|
||||
// case paddle::DataType::FLOAT16:
|
||||
// MoeFFNKernel<paddle::DataType::FLOAT16>(permute_input,
|
||||
// tokens_expert_prefix_sum,
|
||||
// ffn1_weight,
|
||||
// ffn2_weight,
|
||||
// ffn1_bias,
|
||||
// ffn1_scale,
|
||||
// ffn2_scale,
|
||||
// quant_method,
|
||||
// ffn_out);
|
||||
// break;
|
||||
default:
|
||||
PD_THROW("Only support bf16 for MoeExpertFFN");
|
||||
}
|
||||
return {ffn_out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> MoeExpertFFNInferShape(
|
||||
const std::vector<int64_t>& permute_input_shape,
|
||||
const std::vector<int64_t>& tokens_expert_prefix_sum_shape,
|
||||
const std::vector<int64_t>& ffn1_weight_shape,
|
||||
const std::vector<int64_t>& ffn2_weight_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn1_bias_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn1_scale_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn2_scale_shape) {
|
||||
return {permute_input_shape};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> MoeExpertFFNInferDtype(
|
||||
const paddle::DataType& permute_input_dtype,
|
||||
const paddle::DataType& tokens_expert_prefix_sum_dtype,
|
||||
const paddle::DataType& ffn1_weight_dtype,
|
||||
const paddle::DataType& ffn2_weight_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn1_bias_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn1_scale_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn2_scale_dtype) {
|
||||
return {permute_input_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_OP(moe_expert_ffn)
|
||||
.Inputs({"permute_input",
|
||||
"tokens_expert_prefix_sum",
|
||||
"ffn1_weight",
|
||||
"ffn2_weight",
|
||||
paddle::Optional("ffn1_bias"),
|
||||
paddle::Optional("ffn1_scale"),
|
||||
paddle::Optional("ffn2_scale")})
|
||||
.Outputs({"output_tensor"})
|
||||
.Attrs({"quant_method:std::string"})
|
||||
.SetKernelFn(PD_KERNEL(MoeExpertFFN))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertFFNInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertFFNInferDtype));
|
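To make the data flow in `McMoeFFNKernel` easier to follow, here is a rough NumPy sketch of the same computation with the int8 weights assumed already dequantized. The weight layouts, the cumulative meaning of `tokens_expert_prefix_sum`, and the gate ordering inside SwiGLU are read off the code above and should be treated as assumptions, not a reference implementation.

```python
import numpy as np

def silu(x):
    return x / (1.0 + np.exp(-x))

def swiglu(x):
    a, b = np.split(x, 2, axis=-1)   # gate ordering is an assumption
    return silu(a) * b

def moe_ffn_reference(permute_input, tokens_expert_prefix_sum, ffn1_weight, ffn2_weight):
    # permute_input: [rows, hidden]; ffn1_weight: [experts, inter_dim, hidden]
    # ffn2_weight: [experts, hidden, inter_dim // 2]; prefix sum = cumulative row counts.
    out = np.empty_like(permute_input)
    start = 0
    for e, end in enumerate(tokens_expert_prefix_sum):
        rows = permute_input[start:end]
        h = swiglu(rows @ ffn1_weight[e].T)     # [n, inter_dim] -> [n, inter_dim // 2]
        out[start:end] = h @ ffn2_weight[e].T   # back to [n, hidden]
        start = end
    return out
```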
@@ -1,143 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "helper.h"
|
||||
#include "fused_moe_helper.h"
|
||||
#include "fused_moe_op.h"
|
||||
|
||||
template <paddle::DataType T>
|
||||
void MoeReduceKernel(const paddle::Tensor& ffn_out,
|
||||
const paddle::Tensor& top_k_weight,
|
||||
const paddle::Tensor& permute_indices_per_token,
|
||||
const paddle::Tensor& top_k_indices,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_bias,
|
||||
const bool norm_topk_prob,
|
||||
const float routed_scaling_factor,
|
||||
const int num_rows,
|
||||
const int hidden_size,
|
||||
const int topk,
|
||||
paddle::Tensor* output) {
|
||||
typedef PDTraits<T> traits_;
|
||||
typedef typename traits_::DataType DataType_;
|
||||
typedef typename traits_::data_t data_t;
|
||||
auto stream = ffn_out.stream();
|
||||
|
||||
finalize_moe_routing_kernelLauncher(
|
||||
ffn_out.data<data_t>(),
|
||||
output->data<data_t>(),
|
||||
ffn2_bias ? ffn2_bias->data<data_t>() : nullptr,
|
||||
top_k_weight.data<float>(),
|
||||
permute_indices_per_token.data<int32_t>(),
|
||||
top_k_indices.data<int>(),
|
||||
num_rows,
|
||||
hidden_size,
|
||||
topk,
|
||||
static_cast<int>(1),
|
||||
norm_topk_prob,
|
||||
routed_scaling_factor,
|
||||
stream);
|
||||
}
|
||||
|
||||
|
||||
std::vector<paddle::Tensor> MoeExpertReduce(
|
||||
const paddle::Tensor& ffn_out,
|
||||
const paddle::Tensor& top_k_weight,
|
||||
const paddle::Tensor& permute_indices_per_token,
|
||||
const paddle::Tensor& top_k_indices,
|
||||
const paddle::optional<paddle::Tensor>& ffn2_bias,
|
||||
const bool norm_topk_prob,
|
||||
const float routed_scaling_factor) {
|
||||
const auto input_type = ffn_out.dtype();
|
||||
auto place = ffn_out.place();
|
||||
|
||||
const int topk = top_k_indices.dims()[1];
|
||||
const int num_rows = ffn_out.dims()[0] / topk;
|
||||
const int hidden_size = ffn_out.dims()[1];
|
||||
|
||||
auto output = GetEmptyTensor({num_rows, hidden_size}, input_type, place);
|
||||
|
||||
// Avoids "invalid configuration argument" when we launch the kernel.

|
||||
if (ffn_out.dims()[0] == 0) return {output};
|
||||
|
||||
switch (input_type) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
MoeReduceKernel<paddle::DataType::BFLOAT16>(ffn_out,
|
||||
top_k_weight,
|
||||
permute_indices_per_token,
|
||||
top_k_indices,
|
||||
ffn2_bias,
|
||||
norm_topk_prob,
|
||||
routed_scaling_factor,
|
||||
num_rows,
|
||||
hidden_size,
|
||||
topk,
|
||||
&output);
|
||||
break;
|
||||
// case paddle::DataType::FLOAT16:
|
||||
// MoeReduceKernel<paddle::DataType::FLOAT16>(ffn_out,
|
||||
// top_k_weight,
|
||||
// permute_indices_per_token,
|
||||
// top_k_indices,
|
||||
// ffn2_bias,
|
||||
// norm_topk_prob,
|
||||
// routed_scaling_factor,
|
||||
// num_rows,
|
||||
// hidden_size,
|
||||
// topk,
|
||||
// &output);
|
||||
// break;
|
||||
default:
|
||||
PD_THROW("Only support bf16 for MoeDispatchKernel");
|
||||
}
|
||||
return {output};
|
||||
}
|
||||
|
||||
|
||||
std::vector<std::vector<int64_t>> MoeExpertReduceInferShape(
|
||||
const std::vector<int64_t>& ffn_out_shape,
|
||||
const std::vector<int64_t>& top_k_weight_shape,
|
||||
const std::vector<int64_t>& permute_indices_per_token_shape,
|
||||
const std::vector<int64_t>& top_k_indices_shape,
|
||||
const paddle::optional<std::vector<int64_t>>& ffn2_bias_shape) {
|
||||
const int topk = top_k_indices_shape[1];
|
||||
std::vector<int64_t> fused_moe_out_shape = {ffn_out_shape[0] / topk,
|
||||
ffn_out_shape[1]};
|
||||
|
||||
return {fused_moe_out_shape};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> MoeExpertReduceInferDtype(
|
||||
const paddle::DataType& ffn_out_dtype,
|
||||
const paddle::DataType& top_k_weight_dtype,
|
||||
const paddle::DataType& permute_indices_per_token_dtype,
|
||||
const paddle::DataType& top_k_indices_dtype,
|
||||
const paddle::optional<paddle::DataType>& ffn2_bias_dtype) {
|
||||
return {ffn_out_dtype};
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(moe_expert_reduce)
|
||||
.Inputs({"ffn_out",
|
||||
"top_k_weight",
|
||||
"permute_indices_per_token",
|
||||
"top_k_indices",
|
||||
paddle::Optional("ffn2_bias")})
|
||||
.Outputs({"output"})
|
||||
.Attrs({"norm_topk_prob:bool", "routed_scaling_factor:float"})
|
||||
.SetKernelFn(PD_KERNEL(MoeExpertReduce))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertReduceInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertReduceInferDtype));
|
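As a readability aid, the reduction performed by `finalize_moe_routing_kernelLauncher` can be written out in NumPy roughly as below. The indexing convention for `permute_indices_per_token` and the point at which `routed_scaling_factor` is applied are assumptions based on the surrounding shapes, since the launcher itself is defined elsewhere.

```python
import numpy as np

def moe_reduce_reference(ffn_out, top_k_weight, permute_indices_per_token,
                         norm_topk_prob=True, routed_scaling_factor=1.0):
    # ffn_out: [num_rows * topk, hidden]; top_k_weight: [num_rows, topk]
    # permute_indices_per_token: [topk, num_rows], row of ffn_out for (slot k, token i)
    num_rows, topk = top_k_weight.shape
    w = top_k_weight / top_k_weight.sum(-1, keepdims=True) if norm_topk_prob else top_k_weight
    out = np.zeros((num_rows, ffn_out.shape[1]), dtype=ffn_out.dtype)
    for i in range(num_rows):
        for k in range(topk):
            out[i] += w[i, k] * ffn_out[permute_indices_per_token[k, i]]
    return out * routed_scaling_factor
```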
@@ -208,7 +208,6 @@ if paddle.is_compiled_with_rocm():
|
||||
"gpu_ops/rebuild_padding.cu",
|
||||
"gpu_ops/step.cu",
|
||||
"gpu_ops/set_data_ipc.cu",
|
||||
"gpu_ops/unset_data_ipc.cu",
|
||||
"gpu_ops/moe/tritonmoe_preprocess.cu",
|
||||
"gpu_ops/step_system_cache.cu",
|
||||
"gpu_ops/get_output_ep.cc",
|
||||
@@ -279,7 +278,6 @@ elif paddle.is_compiled_with_cuda():
|
||||
"gpu_ops/beam_search_softmax.cu",
|
||||
"gpu_ops/rebuild_padding.cu",
|
||||
"gpu_ops/set_data_ipc.cu",
|
||||
"gpu_ops/unset_data_ipc.cu",
|
||||
"gpu_ops/read_data_ipc.cu",
|
||||
"gpu_ops/enforce_generation.cu",
|
||||
"gpu_ops/dequant_int8.cu",
|
||||
@@ -538,8 +536,6 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
|
||||
"iluvatar_ops/moe_dispatch.cu",
|
||||
"iluvatar_ops/moe_reduce.cu",
|
||||
"iluvatar_ops/paged_attn.cu",
|
||||
"iluvatar_ops/prefill_fused_attn.cu",
|
||||
"iluvatar_ops/mixed_fused_attn.cu",
|
||||
"iluvatar_ops/w8a16_group_gemm.cu",
|
||||
"iluvatar_ops/runtime/iluvatar_context.cc",
|
||||
],
|
||||
@@ -597,10 +593,6 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
|
||||
"gpu_ops/moe/tritonmoe_preprocess.cu",
|
||||
"gpu_ops/moe/moe_topk_select.cu",
|
||||
"gpu_ops/recover_decode_task.cu",
|
||||
"metax_ops/moe_dispatch.cu",
|
||||
"metax_ops/moe_ffn.cu",
|
||||
"metax_ops/moe_reduce.cu",
|
||||
"metax_ops/fused_moe.cu",
|
||||
]
|
||||
|
||||
sources += find_end_files("gpu_ops/speculate_decoding", ".cu")
|
||||
@@ -621,7 +613,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
|
||||
],
|
||||
},
|
||||
library_dirs=[os.path.join(maca_path, "lib")],
|
||||
extra_link_args=["-lruntime_cu", "-lmctlassEx"],
|
||||
extra_link_args=["-lruntime_cu"],
|
||||
include_dirs=[
|
||||
os.path.join(maca_path, "include"),
|
||||
os.path.join(maca_path, "include/mcr"),
|
||||
@@ -629,8 +621,6 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
|
||||
],
|
||||
),
|
||||
)
|
||||
elif paddle.is_compiled_with_custom_device("intel_hpu"):
|
||||
pass
|
||||
else:
|
||||
use_bf16 = envs.FD_CPU_USE_BF16 == "True"
|
||||
|
||||
|
@@ -41,9 +41,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
|
||||
const paddle::Tensor &encoder_seq_lod_cpu,
|
||||
const paddle::Tensor &encoder_batch_map_cpu,
|
||||
const paddle::Tensor &decoder_context_len_cpu,
|
||||
const paddle::Tensor &decoder_batch_map_cpu,
|
||||
const std::string &pos_emb_type="NORMAL",
|
||||
bool rope_3d=false) {
|
||||
const paddle::Tensor &decoder_batch_map_cpu) {
|
||||
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
|
||||
auto dev_ctx =
|
||||
paddle::experimental::DeviceContextPool::Instance().Get(place);
|
||||
@@ -74,14 +72,6 @@ std::vector<paddle::Tensor> BlockAttnKernel(
|
||||
int enc_batch = enc_batch_tensor.data<int32_t>()[0];
|
||||
int dec_batch = dec_batch_tensor.data<int32_t>()[0];
|
||||
int total_enc_len = total_enc_len_tensor.data<int32_t>()[0];
|
||||
int rope_max_seqlen = 0;
|
||||
int rope_3d_num_seqs = 1;
|
||||
if (rope_3d) {
|
||||
rope_max_seqlen = rotary_embs.dims()[3];
|
||||
rope_3d_num_seqs = rotary_embs.dims()[0];
|
||||
} else {
|
||||
rope_max_seqlen = rotary_embs.dims()[2];
|
||||
}
|
||||
|
||||
auto block_attn_out =
|
||||
paddle::full({token_num, hidden_dim}, -1, qkv.type(), qkv.place());
|
||||
@@ -161,10 +151,10 @@ std::vector<paddle::Tensor> BlockAttnKernel(
|
||||
prefix_lens_vp, // start_tokens
|
||||
param.batch_size, // batch_size
|
||||
1, // emb_batch_size
|
||||
rope_max_seqlen, // max_seqlen
|
||||
rotary_embs.dims()[2], // max_seqlen
|
||||
param.head_num, param.kv_head_num, param.head_dim,
|
||||
param.max_batch_size, block_size, max_block_per_seq, "BLHD",
|
||||
"HLD", pos_emb_type,
|
||||
"HLD", "NORMAL",
|
||||
!p_kcache_perhead_scale.defined()
|
||||
? nullptr
|
||||
: p_kcache_perhead_scale.data<float>() +
|
||||
@@ -256,10 +246,10 @@ std::vector<paddle::Tensor> BlockAttnKernel(
|
||||
vsl.slot_mapping_vp, // real_batch
|
||||
param.batch_size, // batch_size
|
||||
1, // emb_batch_size
|
||||
rope_max_seqlen, // max_seqlen
|
||||
rotary_embs.dims()[2], // max_seqlen TODO!!double check
|
||||
param.head_num, param.kv_head_num, param.head_dim,
|
||||
param.max_batch_size, block_size, max_block_per_seq, "BLHD", "HLD",
|
||||
pos_emb_type,
|
||||
"NORMAL",
|
||||
!p_kcache_perhead_scale.defined()
|
||||
? nullptr
|
||||
: p_kcache_perhead_scale.data<float>() +
|
||||
@@ -270,9 +260,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
|
||||
param.kv_head_num, // v_cache_scale_inv
|
||||
nullptr, // k_cache_zp
|
||||
nullptr, // v_cache_zp
|
||||
false, // b_c8_pc
|
||||
rope_3d, // rope_3d
|
||||
rope_3d_num_seqs);
|
||||
false); // b_c8_pc
|
||||
XFTBLOCK_CHECK_EQ(ret, api::SUCCESS);
|
||||
|
||||
// attn decode
|
||||
@@ -326,7 +314,6 @@ PD_BUILD_OP(block_attn)
|
||||
"decoder_context_len_cpu",
|
||||
"decoder_batch_map_cpu",
|
||||
})
|
||||
.Attrs({"pos_emb_type:std::string", "rope_3d:bool"})
|
||||
.Outputs({"block_attn_out"})
|
||||
.SetKernelFn(PD_KERNEL(BlockAttnKernel))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(BlockAttnInferShape))
|
||||
|
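A small sketch to clarify the `rope_3d` branch added above: with `rope_3d` enabled the rotary-embedding tensor is assumed to carry a leading per-sequence dimension, so the maximum sequence length is read from axis 3 instead of axis 2. The helper below merely restates that dimension logic and is not part of the kernel.

```python
def rope_dims(rotary_embs_shape, rope_3d):
    # rotary_embs_shape is the list of tensor dimensions (illustrative only).
    if rope_3d:
        return {"max_seqlen": rotary_embs_shape[3], "num_seqs": rotary_embs_shape[0]}
    return {"max_seqlen": rotary_embs_shape[2], "num_seqs": 1}
```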
@@ -1,60 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/extension.h"
|
||||
|
||||
std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_ids,
|
||||
const paddle::Tensor& grid_thw,
|
||||
const int64_t image_patch_id) {
|
||||
// All tensor in cpu
|
||||
auto input_ids_ptr = task_input_ids.data<int64_t>();
|
||||
int64_t seq_lens_origin = task_input_ids.numel();
|
||||
auto grid_thw_ptr = grid_thw.data<int64_t>();
|
||||
|
||||
int token_times = 4;
|
||||
int token_idx = 0;
|
||||
int image_idx = 0;
|
||||
std::vector<int> img_boundaries, img_nums;
|
||||
img_boundaries.emplace_back(0);
|
||||
img_nums.emplace_back(0);
|
||||
while (token_idx < seq_lens_origin) {
|
||||
if (input_ids_ptr[token_idx] != image_patch_id) {
|
||||
do {
|
||||
token_idx++;
|
||||
} while (token_idx < seq_lens_origin && input_ids_ptr[token_idx] != image_patch_id);
|
||||
} else {
|
||||
int cur_image_token_len = (grid_thw_ptr[image_idx * 3 + 1] * grid_thw_ptr[image_idx * 3 + 2]) / token_times;
|
||||
image_idx++;
|
||||
token_idx += cur_image_token_len;
|
||||
}
|
||||
img_boundaries.emplace_back(token_idx);
|
||||
img_nums.emplace_back(image_idx);
|
||||
}
|
||||
|
||||
int64_t num_img_boundaries = static_cast<int64_t>(img_boundaries.size());
|
||||
auto out = paddle::full({2, num_img_boundaries}, 0, paddle::DataType::INT64, paddle::CPUPlace());
|
||||
|
||||
for (int i = 0; i < num_img_boundaries; i++) {
|
||||
out.data<int64_t>()[i] = img_boundaries[i];
|
||||
out.data<int64_t>()[num_img_boundaries + i] = img_nums[i];
|
||||
}
|
||||
|
||||
return {out};
|
||||
}
|
||||
|
||||
PD_BUILD_OP(get_img_boundaries)
|
||||
.Inputs({"task_input_ids", "grid_thw"})
|
||||
.Attrs({"image_patch_id: int64_t"})
|
||||
.Outputs({"img_boundaries"})
|
||||
.SetKernelFn(PD_KERNEL(GetImgBoundaries));
|
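The boundary logic above is compact but easy to misread, so here is a direct Python transcription of the loop in `GetImgBoundaries` (the first row of the returned tensor holds boundaries, the second the running image count). It is a sketch for illustration only.

```python
def img_boundaries_reference(input_ids, grid_thw, image_patch_id, token_times=4):
    # grid_thw is flattened as [t, h, w] per image; each image spans (h * w) / token_times tokens.
    boundaries, nums = [0], [0]
    token_idx, image_idx, n = 0, 0, len(input_ids)
    while token_idx < n:
        if input_ids[token_idx] != image_patch_id:
            while token_idx < n and input_ids[token_idx] != image_patch_id:
                token_idx += 1
        else:
            token_idx += (grid_thw[image_idx * 3 + 1] * grid_thw[image_idx * 3 + 2]) // token_times
            image_idx += 1
        boundaries.append(token_idx)
        nums.append(image_idx)
    return boundaries, nums
```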
@@ -72,7 +72,6 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
|
||||
is_padding_input ? token_num_info : nullptr,
|
||||
expert_num,
|
||||
1, // moe_topk
|
||||
0, // group_size
|
||||
ffn1_out_shape.size() == 2 ? xftblock::MoeFCInputMode::DENSE
|
||||
: xftblock::MoeFCInputMode::SPARSE);
|
||||
PD_CHECK(ret == 0);
|
||||
@@ -135,7 +134,6 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
|
||||
is_padding_input ? token_num_info : nullptr,
|
||||
expert_num,
|
||||
1, // moe_topk
|
||||
0, // group_size
|
||||
ffn1_out_shape.size() == 2
|
||||
? xftblock::MoeFCInputMode::DENSE
|
||||
: xftblock::MoeFCInputMode::SPARSE); // bias_mode
|
||||
|
@@ -145,8 +145,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
|
||||
? up_gate_proj_weight_scale.get_ptr()->data<float>()
|
||||
: nullptr),
|
||||
xftblock_tw,
|
||||
std::vector<int64_t>{expert_num, inter_dim, hidden_dim}
|
||||
);
|
||||
std::vector<int64_t>{expert_num, inter_dim, hidden_dim});
|
||||
|
||||
xdown_proj_w = std::make_shared<xftblock::Tensor>(
|
||||
const_cast<TW *>(down_proj_weight.data<TW>()), nullptr,
|
||||
@@ -154,8 +153,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
|
||||
? down_proj_weight_scale.get_ptr()->data<float>()
|
||||
: nullptr),
|
||||
xftblock_tw,
|
||||
std::vector<int64_t>{expert_num, hidden_dim, outer_dim}
|
||||
);
|
||||
std::vector<int64_t>{expert_num, hidden_dim, outer_dim});
|
||||
}
|
||||
std::shared_ptr<xftblock::Tensor> xup_gate_proj_bias;
|
||||
std::shared_ptr<xftblock::Tensor> xdown_proj_bias;
|
||||
|
@@ -1,83 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <paddle/phi/backends/xpu/xpu_context.h>
|
||||
#include <xft/xdnn_plugin.h>
|
||||
#include "paddle/extension.h"
|
||||
#include "xpu/plugin.h"
|
||||
|
||||
void TextImageGatherScatter(
|
||||
paddle::Tensor& input,
|
||||
paddle::Tensor& text_input,
|
||||
paddle::Tensor& image_input,
|
||||
paddle::Tensor& token_type_ids,
|
||||
paddle::Tensor& text_index,
|
||||
paddle::Tensor& image_index,
|
||||
const bool is_scatter) {
|
||||
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
|
||||
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
|
||||
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
|
||||
|
||||
const int64_t token_num = input.dims()[0];
|
||||
const int64_t hidden_size = input.dims()[1];
|
||||
const int64_t text_token_num = text_input.dims()[0];
|
||||
const int64_t image_token_num = image_input.dims()[0];
|
||||
|
||||
switch (input.type()) {
|
||||
case paddle::DataType::BFLOAT16: {
|
||||
using XPUType = typename XPUTypeTrait<bfloat16>::Type;
|
||||
typedef paddle::bfloat16 data_t;
|
||||
int r = baidu::xpu::api::plugin::text_image_gather_scatter<XPUType>(
|
||||
xpu_ctx->x_context(),
|
||||
reinterpret_cast<XPUType*>(input.data<data_t>()),
|
||||
reinterpret_cast<XPUType*>(text_input.data<data_t>()),
|
||||
reinterpret_cast<XPUType*>(image_input.data<data_t>()),
|
||||
reinterpret_cast<int*>(token_type_ids.data<int>()),
|
||||
reinterpret_cast<int*>(text_index.data<int>()),
|
||||
reinterpret_cast<int*>(image_index.data<int>()),
|
||||
token_num,
|
||||
text_token_num,
|
||||
image_token_num,
|
||||
hidden_size,
|
||||
is_scatter
|
||||
);
|
||||
PADDLE_ENFORCE_XDNN_SUCCESS(r, "text_image_gather_scatter");
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
PD_THROW(
|
||||
"NOT supported data type. Only support BFLOAT16. ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(text_image_gather_scatter)
|
||||
.Inputs({"input",
|
||||
"text_input",
|
||||
"image_input",
|
||||
"token_type_ids",
|
||||
"text_index",
|
||||
"image_index"})
|
||||
.Outputs({"text_input_out",
|
||||
"image_input_out",
|
||||
"text_index_out",
|
||||
"image_index_out"})
|
||||
.Attrs({"is_scatter:bool"})
|
||||
.SetInplaceMap({{"text_input", "text_input_out"},
|
||||
{"image_input", "image_input_out"},
|
||||
{"text_index", "text_index_out"},
|
||||
{"image_index", "image_index_out"}})
|
||||
.SetKernelFn(PD_KERNEL(TextImageGatherScatter));
|
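The gather/scatter semantics of this op are symmetric and easiest to see in plain Python; the sketch below mirrors the CPU reference wrapper that appears further down in this diff and is illustrative only.

```python
def text_image_gather_scatter_ref(inp, text, image, token_type_ids,
                                  text_index, image_index, is_scatter):
    # token_type_ids[i] == 0 selects the text buffer, otherwise the image buffer;
    # the per-modality index gives the row inside that buffer.
    for i, tt in enumerate(token_type_ids):
        buf, idx = (text, text_index[i]) if tt == 0 else (image, image_index[i])
        if is_scatter:
            buf[idx] = inp[i]   # fused sequence -> per-modality buffer
        else:
            inp[i] = buf[idx]   # per-modality buffer -> fused sequence
```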
@@ -1,48 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <paddle/phi/backends/xpu/xpu_context.h>
|
||||
#include "paddle/extension.h"
|
||||
#include "xpu/plugin.h"
|
||||
|
||||
void TextImageIndexOut(
|
||||
const paddle::Tensor& token_type_ids,
|
||||
const paddle::Tensor& text_index,
|
||||
const paddle::Tensor& image_index) {
|
||||
if (token_type_ids.type() != paddle::DataType::INT32 || text_index.type()
|
||||
!= paddle::DataType::INT32 || image_index.type() != paddle::DataType::INT32) {
|
||||
PD_THROW("NOT supported data type. Only support BFLOAT16. ");
|
||||
}
|
||||
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
|
||||
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
|
||||
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
|
||||
const int64_t token_num = token_type_ids.shape()[0];
|
||||
int r = baidu::xpu::api::plugin::text_image_index_out(xpu_ctx->x_context(),
|
||||
token_type_ids.data<int32_t>(),
|
||||
const_cast<int32_t*>(text_index.data<int32_t>()),
|
||||
const_cast<int32_t*>(image_index.data<int32_t>()),
|
||||
token_num);
|
||||
PADDLE_ENFORCE_XDNN_SUCCESS(r, "text_image_index_out");
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(text_image_index_out)
|
||||
.Inputs({"token_type_ids",
|
||||
"text_index",
|
||||
"image_index"})
|
||||
.Outputs({"text_index_out",
|
||||
"image_index_out"})
|
||||
.SetInplaceMap({{"text_index", "text_index_out"},
|
||||
{"image_index", "image_index_out"}})
|
||||
.SetKernelFn(PD_KERNEL(TextImageIndexOut));
|
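For reference, the in-place index computation this op performs is just two running counters; the Python sketch below matches the CPU wrapper that appears later in this diff.

```python
def text_image_index_out_ref(token_type_ids):
    text_index = [0] * len(token_type_ids)
    image_index = [0] * len(token_type_ids)
    text_count = image_count = 0
    for i, tt in enumerate(token_type_ids):
        if tt == 0:
            text_index[i] = text_count    # consecutive slot among text tokens
            text_count += 1
        else:
            image_index[i] = image_count  # consecutive slot among image tokens
            image_count += 1
    return text_index, image_index
```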
@@ -140,25 +140,6 @@ DLL_EXPORT int quant2d_per_channel(api::Context *ctx, const TX *x,
|
||||
const TSCALE *scale_in, TY *y,
|
||||
TSCALE *scale_out, int64_t m, int64_t n);
|
||||
|
||||
DLL_EXPORT int text_image_index_out(Context* ctx,
|
||||
const int* token_type_ids, // x
|
||||
int* text_index, // y1
|
||||
int* image_index, // y2
|
||||
const int64_t token_num);
|
||||
|
||||
template <typename T>
|
||||
DLL_EXPORT int text_image_gather_scatter(api::Context* ctx,
|
||||
T* input,
|
||||
T* text_input,
|
||||
T* image_input,
|
||||
int* token_type_ids,
|
||||
int* text_index,
|
||||
int* image_index,
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
bool is_scatter);
|
||||
|
||||
/*--------------------------------------- MTP being --------------------------------------------*/
|
||||
|
||||
|
@@ -1,175 +0,0 @@
|
||||
#include "xpu/kernel/cluster.h"
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
|
||||
template <typename T>
|
||||
static __device__ inline void text_image_gather(
|
||||
__global_ptr__ T* input,
|
||||
__global_ptr__ T* text_input,
|
||||
__global_ptr__ T* image_input,
|
||||
__global_ptr__ int* token_type_ids,
|
||||
__global_ptr__ int* text_index,
|
||||
__global_ptr__ int* image_index,
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
T* input_lm) {
|
||||
int cid = core_id();
|
||||
int clusterid = cluster_id();
|
||||
int token_start_cluster;
|
||||
int token_end_cluster;
|
||||
int token_start_core;
|
||||
int token_end_core;
|
||||
|
||||
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
|
||||
// cluster partition
|
||||
partition(cluster_id(), cluster_num(), (int)token_num, 1, &token_start_cluster, &token_end_cluster);
|
||||
if (token_start_cluster >= token_end_cluster) {
|
||||
return;
|
||||
}
|
||||
int rows_cluster = token_end_cluster - token_start_cluster; // total rows for a cluster
|
||||
// core partition
|
||||
partition(core_id(), core_num(), rows_cluster, 1, &token_start_core, &token_end_core);
|
||||
int rows_core = token_end_core - token_start_core; // total rows for a core
|
||||
token_start_core += token_start_cluster;
|
||||
token_end_core += token_start_cluster;
|
||||
|
||||
int read_len;
|
||||
for (int i = token_start_core; i < token_end_core; i += 1) {
|
||||
int token_type, text_image_token_idx;
|
||||
__global_ptr__ T* text_image_input = nullptr;
|
||||
__global_ptr__ int* text_image_index = nullptr;
|
||||
|
||||
GM2LM(token_type_ids + i, &token_type, sizeof(int));
|
||||
if (token_type == 0) {
|
||||
text_image_input = text_input;
|
||||
text_image_index = text_index;
|
||||
} else {
|
||||
text_image_input = image_input;
|
||||
text_image_index = image_index;
|
||||
}
|
||||
GM2LM(text_image_index + i, &text_image_token_idx, sizeof(int));
|
||||
int input_offset = i * hidden_size;
|
||||
int text_image_offset = text_image_token_idx * hidden_size;
|
||||
|
||||
for (int j = 0; j < hidden_size; j += BUFSIZE) {
|
||||
read_len = min(hidden_size - j, BUFSIZE);
|
||||
GM2LM(text_image_input + text_image_offset + j, input_lm, sizeof(T) * read_len);
|
||||
LM2GM(input_lm, input + input_offset + j, sizeof(T) * read_len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __device__ inline void text_image_scatter(
|
||||
__global_ptr__ T* input,
|
||||
__global_ptr__ T* text_input,
|
||||
__global_ptr__ T* image_input,
|
||||
__global_ptr__ int* token_type_ids,
|
||||
__global_ptr__ int* text_index,
|
||||
__global_ptr__ int* image_index,
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
T* input_lm) {
|
||||
int cid = core_id();
|
||||
int clusterid = cluster_id();
|
||||
int token_start_cluster;
|
||||
int token_end_cluster;
|
||||
int token_start_core;
|
||||
int token_end_core;
|
||||
|
||||
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
|
||||
// cluster partition
|
||||
partition(cluster_id(), cluster_num(), (int)token_num, 1, &token_start_cluster, &token_end_cluster);
|
||||
if (token_start_cluster >= token_end_cluster) {
|
||||
return;
|
||||
}
|
||||
int rows_cluster = token_end_cluster - token_start_cluster; // total rows for a cluster
|
||||
// core partition
|
||||
partition(core_id(), core_num(), rows_cluster, 1, &token_start_core, &token_end_core);
|
||||
int rows_core = token_end_core - token_start_core; // total rows for a core
|
||||
token_start_core += token_start_cluster;
|
||||
token_end_core += token_start_cluster;
|
||||
|
||||
int read_len;
|
||||
for (int i = token_start_core; i < token_end_core; i += 1) {
|
||||
int token_type, text_image_token_idx;
|
||||
__global_ptr__ T* text_image_input = nullptr;
|
||||
__global_ptr__ int* text_image_index = nullptr;
|
||||
|
||||
GM2LM(token_type_ids + i, &token_type, sizeof(int));
|
||||
if (token_type == 0) {
|
||||
text_image_input = text_input;
|
||||
text_image_index = text_index;
|
||||
} else {
|
||||
text_image_input = image_input;
|
||||
text_image_index = image_index;
|
||||
}
|
||||
GM2LM(text_image_index + i, &text_image_token_idx, sizeof(int));
|
||||
int input_offset = i * hidden_size;
|
||||
int text_image_offset = text_image_token_idx * hidden_size;
|
||||
|
||||
for (int j = 0; j < hidden_size; j += BUFSIZE) {
|
||||
read_len = min(hidden_size - j, BUFSIZE);
|
||||
GM2LM(input + input_offset + j, input_lm, sizeof(T) * read_len);
|
||||
LM2GM(input_lm, text_image_input + text_image_offset + j, sizeof(T) * read_len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void text_image_gather_scatter(
|
||||
T* input,
|
||||
T* text_input,
|
||||
T* image_input,
|
||||
int* token_type_ids,
|
||||
int* text_index,
|
||||
int* image_index,
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
bool is_scatter) {
|
||||
int cid = core_id();
|
||||
int ncores = core_num();
|
||||
int clusterid = cluster_id();
|
||||
int nclusters = cluster_num();
|
||||
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
|
||||
__simd__ T input_lm[BUFSIZE]; // 2KB for bf16 and fp32
|
||||
if (is_scatter) {
|
||||
text_image_scatter(
|
||||
input, text_input, image_input, token_type_ids, text_index, image_index,
|
||||
token_num, text_token_num, image_token_num, hidden_size, input_lm);
|
||||
} else {
|
||||
text_image_gather(
|
||||
input, text_input, image_input, token_type_ids, text_index, image_index,
|
||||
token_num, text_token_num, image_token_num, hidden_size, input_lm);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define _XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(T) \
|
||||
template __global__ void text_image_gather_scatter<T>( \
|
||||
T* input, \
|
||||
T* text_input, \
|
||||
T* image_input, \
|
||||
int* token_type_ids, \
|
||||
int* text_index, \
|
||||
int* image_index, \
|
||||
int64_t token_num, \
|
||||
int64_t text_token_num, \
|
||||
int64_t image_token_num, \
|
||||
int64_t hidden_size, \
|
||||
bool is_scatter);
|
||||
|
||||
_XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(bfloat16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
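The kernel above splits work in two levels: tokens are first partitioned across clusters, and each cluster's share is then partitioned across its cores, with the hidden dimension copied through local memory in `BUFSIZE` chunks. The sketch below restates the row split in Python; the contiguous-chunk behaviour of `partition()` is an assumption, since that helper comes from the XPU toolkit headers.

```python
def partition(worker_id, num_workers, n):
    # Assumed behaviour: split [0, n) into near-equal contiguous chunks.
    base, rem = divmod(n, num_workers)
    start = worker_id * base + min(worker_id, rem)
    return start, start + base + (1 if worker_id < rem else 0)

def rows_for(cluster_id, num_clusters, core_id, num_cores, token_num):
    c_start, c_end = partition(cluster_id, num_clusters, token_num)
    k_start, k_end = partition(core_id, num_cores, c_end - c_start)
    return c_start + k_start, c_start + k_end
```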
@@ -1,97 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
/*
|
||||
* copyright (C) 2025 KUNLUNXIN, Inc
|
||||
*/
|
||||
|
||||
#include "xpu/kernel/cluster.h"
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
|
||||
static __device__ void do_calc(const _shared_ptr_ int* lm_x, int* lm_y1, int* lm_y2, int64_t size, int& text_count, int& images_count) {
|
||||
for (int j = 0; j < size; j++) {
|
||||
if (lm_x[j] == 0) {
|
||||
lm_y1[j] = text_count;
|
||||
text_count += 1;
|
||||
} else {
|
||||
lm_y2[j] = images_count;
|
||||
images_count += 1;
|
||||
}
|
||||
}
|
||||
mfence_lm_sm();
|
||||
}
|
||||
|
||||
__global__ void text_image_index_out_kernel(
|
||||
const int* token_type_ids, // x
|
||||
int* text_index, // y1
|
||||
int* image_index, // y2
|
||||
const int64_t token_num) {
|
||||
const int cid = core_id();
|
||||
const int tid = core_id() * cluster_num() + cluster_id();
|
||||
const int nthreads = core_num() * cluster_num();
|
||||
if (tid >= 1) return;
|
||||
constexpr int BUFSIZE = 1024;
|
||||
constexpr int READ_MAX_SIZE = BUFSIZE / sizeof(int);
|
||||
const int64_t len = token_num;
|
||||
|
||||
__simd__ char buffer0[BUFSIZE * 3];
|
||||
__simd__ char buffer1[BUFSIZE * 3];
|
||||
__simd__ __shared__ char buffer2[64][BUFSIZE * 2];
|
||||
|
||||
DoublePtr<READ_MAX_SIZE, SmPtr<int>> buffer_ptr_x((SmPtr<int>((_shared_ptr_ int*)buffer2[cid])));
|
||||
TriplePtr<READ_MAX_SIZE, LmPtr<int>> buffer_ptr_y1((LmPtr<int>((int*)buffer0)));
|
||||
TriplePtr<READ_MAX_SIZE, LmPtr<int>> buffer_ptr_y2((LmPtr<int>((int*)buffer1)));
|
||||
int64_t buflen = get_1d_buflen(len, nthreads, READ_MAX_SIZE, 64);
|
||||
int64_t i = tid * buflen;
|
||||
int read_size = 0;
|
||||
int offset = nthreads * buflen;
|
||||
|
||||
int text_count = 0;
|
||||
int images_count = 0;
|
||||
|
||||
if (i < len) {
|
||||
read_size = min<int64_t>(buflen, len - i);
|
||||
buffer_ptr_y1.gm_load_async(text_index + tid * buflen, read_size);
|
||||
buffer_ptr_y2.gm_load_async(image_index + tid * buflen, read_size);
|
||||
buffer_ptr_x.gm_load_async(token_type_ids + tid * buflen, read_size);
|
||||
mfence();
|
||||
}
|
||||
while (i < len && i + offset < len) {
|
||||
i = i + offset;
|
||||
int read_size_next = min<int64_t>(buflen, len - i);
|
||||
buffer_ptr_x.next().gm_load_async(token_type_ids + i, read_size_next);
|
||||
buffer_ptr_y1.next().gm_load_async(text_index + i, read_size_next);
|
||||
buffer_ptr_y2.next().gm_load_async(image_index + i, read_size_next);
|
||||
|
||||
do_calc(buffer_ptr_x.ptr, buffer_ptr_y1.ptr, buffer_ptr_y2.ptr, read_size, text_count, images_count);
|
||||
|
||||
buffer_ptr_y1.gm_store_async(text_index + i - offset, read_size);
|
||||
buffer_ptr_y2.gm_store_async(image_index + i - offset, read_size);
|
||||
buffer_ptr_x.toggle();
|
||||
buffer_ptr_y1.toggle();
|
||||
buffer_ptr_y2.toggle();
|
||||
read_size = read_size_next;
|
||||
}
|
||||
if (i < len) {
|
||||
do_calc(buffer_ptr_x.ptr, buffer_ptr_y1.ptr, buffer_ptr_y2.ptr, read_size, text_count, images_count);
|
||||
buffer_ptr_y1.gm_store_async(text_index + i, read_size);
|
||||
buffer_ptr_y2.gm_store(image_index + i, read_size);
|
||||
}
|
||||
}
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
@@ -1,182 +0,0 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "xpu/plugin.h"
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
template <typename T>
|
||||
__attribute__((global)) void text_image_gather_scatter(
|
||||
T* input,
|
||||
T* text_input,
|
||||
T* image_input,
|
||||
int* token_type_ids,
|
||||
int* text_index,
|
||||
int* image_index,
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
bool is_scatter);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace plugin {
|
||||
|
||||
template <typename T>
|
||||
static int cpu_wrapper(
|
||||
Context* ctx,
|
||||
T* input, // shape [token_num, hidden_size]
|
||||
T* text_input, // shape [text_token_num, hidden_size]
|
||||
T* image_input, // shape [image_token_num, hidden_size]
|
||||
int* token_type_ids,// shape [token_num], 0 for text, 1 for image
|
||||
int* text_index, // shape [token_num], mapping from input to text_input
|
||||
int* image_index, // shape [token_num], mapping from input to image_input
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
bool is_scatter) {
|
||||
|
||||
if (is_scatter) {
|
||||
// Scatter mode: input -> text_input/image_input
|
||||
for (int64_t i = 0; i < token_num; i++) {
|
||||
int token_type = token_type_ids[i];
|
||||
|
||||
T* text_image_input = nullptr;
|
||||
int* text_image_index = nullptr;
|
||||
if (token_type == 0) {
|
||||
text_image_input = text_input;
|
||||
text_image_index = text_index;
|
||||
} else { // token_type == 1
|
||||
text_image_input = image_input;
|
||||
text_image_index = image_index;
|
||||
}
|
||||
|
||||
int text_image_token_idx = text_image_index[i];
|
||||
int input_offset = i * hidden_size;
|
||||
int text_image_offset = text_image_token_idx * hidden_size;
|
||||
|
||||
for (int64_t j = 0; j < hidden_size; j++) {
|
||||
T value = input[input_offset + j];
|
||||
text_image_input[text_image_offset + j] = value;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Gather mode: text_input/image_input -> input
|
||||
for (int64_t i = 0; i < token_num; i++) {
|
||||
int token_type = token_type_ids[i];
|
||||
|
||||
T* text_image_input = nullptr;
|
||||
int* text_image_index = nullptr;
|
||||
if (token_type == 0) {
|
||||
text_image_input = text_input;
|
||||
text_image_index = text_index;
|
||||
} else { // token_type == 1
|
||||
text_image_input = image_input;
|
||||
text_image_index = image_index;
|
||||
}
|
||||
|
||||
int text_image_token_idx = text_image_index[i];
|
||||
int input_offset = i * hidden_size;
|
||||
int text_image_offset = text_image_token_idx * hidden_size;
|
||||
|
||||
for (int64_t j = 0; j < hidden_size; j++) {
|
||||
T value = text_image_input[text_image_offset + j];
|
||||
input[input_offset + j] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static int xpu3_wrapper(
|
||||
Context* ctx,
|
||||
T* input,
|
||||
T* text_input,
|
||||
T* image_input,
|
||||
int* token_type_ids,
|
||||
int* text_index,
|
||||
int* image_index,
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
bool is_scatter) {
|
||||
xpu3::plugin::text_image_gather_scatter<T> <<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
|
||||
input, text_input, image_input, token_type_ids, text_index, image_index,
|
||||
token_num, text_token_num, image_token_num, hidden_size, is_scatter
|
||||
);
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
int text_image_gather_scatter(
|
||||
Context* ctx,
|
||||
T* input, // shape [token_num, hidden_size]
|
||||
T* text_input, // shape [text_token_num, hidden_size]
|
||||
T* image_input, // shape [image_token_num, hidden_size]
|
||||
int* token_type_ids,// shape [token_num], 0 for text, 1 for image
|
||||
int* text_index, // shape [token_num], mapping from input to text_input
|
||||
int* image_index, // shape [token_num], mapping from input to image_input
|
||||
int64_t token_num,
|
||||
int64_t text_token_num,
|
||||
int64_t image_token_num,
|
||||
int64_t hidden_size,
|
||||
bool is_scatter) {
|
||||
WRAPPER_CHECK_CTX(ctx);
|
||||
WRAPPER_DUMP_FUNCTION_T1(ctx, "text_image_gather_scatter", T);
|
||||
WRAPPER_DUMP_PARAM6(ctx, input, text_input, image_input, token_type_ids, text_index, image_index);
|
||||
WRAPPER_DUMP_PARAM5(ctx, token_num, text_token_num, image_token_num, hidden_size, is_scatter);
|
||||
WRAPPER_DUMP(ctx);
|
||||
WRAPPER_CHECK_PTR(ctx, T, token_num * hidden_size, input);
|
||||
if (text_token_num != 0) { // avoiding text_input tensor with shape [0, hidden_size]
|
||||
WRAPPER_CHECK_PTR(ctx, T, text_token_num * hidden_size, text_input);
|
||||
}
|
||||
if (image_token_num != 0) { // avoiding image_input tensor with shape [0, hidden_size]
|
||||
WRAPPER_CHECK_PTR(ctx, T, image_token_num * hidden_size, image_input);
|
||||
}
|
||||
WRAPPER_CHECK_PTR(ctx, int, token_num, token_type_ids);
|
||||
WRAPPER_CHECK_PTR(ctx, int, token_num, text_index);
|
||||
WRAPPER_CHECK_PTR(ctx, int, token_num, image_index);
|
||||
WRAPPER_ASSERT_EQ(ctx, token_num, text_token_num + image_token_num);
|
||||
|
||||
if (ctx->dev().type() == api::kCPU) {
|
||||
return cpu_wrapper<T>(
|
||||
ctx, input, text_input, image_input, token_type_ids, text_index, image_index,
|
||||
token_num, text_token_num, image_token_num, hidden_size, is_scatter
|
||||
);
|
||||
}
|
||||
if (ctx->dev().type() == api::kXPU3) {
|
||||
return xpu3_wrapper<T>(
|
||||
ctx, input, text_input, image_input, token_type_ids, text_index, image_index,
|
||||
token_num, text_token_num, image_token_num, hidden_size, is_scatter
|
||||
);
|
||||
}
|
||||
WRAPPER_UNIMPLEMENTED(ctx);
|
||||
}
|
||||
|
||||
|
||||
template int text_image_gather_scatter(
|
||||
Context*, bfloat16*, bfloat16*, bfloat16*, int*, int*, int*, const int64_t, const int64_t, const int64_t, const int64_t, bool);
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
@@ -1,103 +0,0 @@
|
||||
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "xpu/plugin.h"
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
__attribute__((global)) void text_image_index_out_kernel(const int* token_type_ids, // x
|
||||
int* text_index, // y1
|
||||
int* image_index, // y2
|
||||
const int64_t token_num);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace plugin {
|
||||
|
||||
static int cpu_wrapper(Context* ctx,
|
||||
const int* token_type_ids, // x
|
||||
int* text_index, // y1
|
||||
int* image_index, // y2
|
||||
const int64_t token_num) {
|
||||
int text_count = 0;
|
||||
int image_count = 0;
|
||||
|
||||
for (int64_t i = 0; i < token_num; ++i) {
|
||||
if (token_type_ids[i] == 0) {
|
||||
text_index[i] = text_count;
|
||||
++text_count;
|
||||
} else {
|
||||
image_index[i] = image_count;
|
||||
++image_count;
|
||||
}
|
||||
}
|
||||
return api::SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
static int xpu3_wrapper(Context* ctx,
|
||||
const int* token_type_ids, // x
|
||||
int* text_index, // y1
|
||||
int* image_index, // y2
|
||||
const int64_t token_num) {
|
||||
|
||||
xpu3::plugin::text_image_index_out_kernel<<<1, 1, ctx->xpu_stream>>>(
|
||||
token_type_ids,
|
||||
text_index,
|
||||
image_index,
|
||||
token_num);
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
int text_image_index_out(Context* ctx,
|
||||
const int* token_type_ids, // x
|
||||
int* text_index, // y1
|
||||
int* image_index, // y2
|
||||
const int64_t token_num) {
|
||||
|
||||
WRAPPER_CHECK_CTX(ctx);
|
||||
WRAPPER_DUMP_FUNCTION_T1(ctx, "text_image_index_out", int);
|
||||
WRAPPER_DUMP_PARAM4(
|
||||
ctx, token_type_ids, text_index, image_index, token_num);
|
||||
WRAPPER_DUMP(ctx);
|
||||
WRAPPER_ASSERT_GT(ctx, token_num, 0);
|
||||
WRAPPER_CHECK_PTR(ctx, int, token_num, token_type_ids);
|
||||
WRAPPER_CHECK_PTR(ctx, int, token_num, text_index);
|
||||
WRAPPER_CHECK_PTR(ctx, int, token_num, image_index);
|
||||
|
||||
|
||||
if (ctx->dev().type() == api::kCPU) {
|
||||
return cpu_wrapper(ctx,
|
||||
token_type_ids,
|
||||
text_index,
|
||||
image_index,
|
||||
token_num);
|
||||
} else if (ctx->dev().type() == api::kXPU3) {
|
||||
return xpu3_wrapper(ctx,
|
||||
token_type_ids,
|
||||
text_index,
|
||||
image_index,
|
||||
token_num);
|
||||
}
|
||||
WRAPPER_UNIMPLEMENTED(ctx);
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
@@ -31,11 +31,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 128 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
- `--quantization`: Indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (requires Hopper GPUs).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.
- `--load_choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.

For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).

|
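Once the service above is up it exposes an OpenAI-compatible endpoint, so a quick smoke test can be done with the standard `openai` client. The port, API key, and model string below are placeholders; substitute the values from your own launch command.

```python
from openai import OpenAI

# Placeholder port / key / model name -- adjust to match your deployment.
client = OpenAI(base_url="http://localhost:8180/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
)
print(response.choices[0].message.content)
```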
@@ -31,11 +31,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 128 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
- `--quantization`: Indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (requires Hopper GPUs).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.
- `--load_choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.

For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).

|
@@ -27,7 +27,7 @@ Start the service by following command:
|
||||
```bash
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
|
||||
--load-choices "default_v1" \
|
||||
--load_choices "default_v1" \
|
||||
--tensor-parallel-size 1 \
|
||||
--max-model-len 131072 \
|
||||
--quantization wint8 \
|
||||
@@ -37,7 +37,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
```
|
||||
- `--quantization`: Indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (requires Hopper GPUs).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.
- `--load_choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.
- `--reasoning-parser`, `--tool-call-parser`: Indicate the corresponding reasoning-content and tool-call parsers.

For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).

|
@@ -28,11 +28,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 128 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
- `--quantization`: Indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8` (requires Hopper GPUs).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
- `--load-choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.
- `--load_choices`: Indicates the version of the loader. "default_v1" enables the v1 loader, which loads faster and uses less memory.

For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).

@@ -91,7 +91,7 @@ Just specify the corresponding model name in the startup command, `baidu/ERNIE-4
|
||||
```
|
||||
|
||||
Note:
|
||||
- W4A8C8 quantized models are not supported when loaded via `--load-choices "default_v1"`.
|
||||
- W4A8C8 quantized models are not supported when loaded via `--load_choices "default_v1"`.
|
||||
|
||||
#### 2.2.6 Rejection Sampling
|
||||
**Idea:**
|
||||
|
@@ -196,7 +196,7 @@ We selected a subset (longbook_sum_eng) from InfiniteBench as the performance ev
|
||||
## Usage
|
||||
|
||||
```
|
||||
export FD_ATTENTION_BACKEND="PLAS_ATTN"
|
||||
export FD_ATTENTION_BACKEND="MOBA_ATTN"
|
||||
|
||||
python -m fastdeploy.entrypoints.openai.api_server
|
||||
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
|
||||
@@ -207,13 +207,13 @@ python -m fastdeploy.entrypoints.openai.api_server
|
||||
--max-num-batched-tokens 8192 \
|
||||
--max-model-len 131072 \
|
||||
--max-num-seqs 32 \
|
||||
--plas-attention-config '{"plas_encoder_top_k_left": 50, "plas_encoder_top_k_right": 60, "plas_decoder_top_k_left": 100, "plas_decoder_top_k_right": 120}'
|
||||
--moba-attention-config '{"moba_encoder_top_k_left": 50, "moba_encoder_top_k_right": 60, "moba_decoder_top_k_left": 100, "moba_decoder_top_k_right": 120}'
|
||||
```
|
||||
|
||||
**Note**: If sparse attention is enabled, the system will automatically load the MLP weights from `plas_attention_mlp_weight.safetensors` in the weight directory. If the MLP weight file is not found, mean pooling will be applied to the key representations.
|
||||
**Note**: If sparse attention is enabled, the system will automatically load the MLP weights from `moba_mlp_weight.safetensors` in the weight directory. If the MLP weight file is not found, mean pooling will be applied to the key representations.
|
||||
|
||||
**Parameter Description:**
|
||||
|
||||
* Setting `FD_ATTENTION_BACKEND="PLAS_ATTN"` enables PLAS sparse attention.
|
||||
* `plas_encoder_top_k_left=50, plas_encoder_top_k_right=60` indicates that the range of top-k is between 50 and 60 when the encoder is sparse.
|
||||
* `plas_decoder_top_k_left=100, plas_decoder_top_k_right=120` indicates that the range of top-k is between 100 and 120 when the decoder is sparse.
|
||||
* Setting `FD_ATTENTION_BACKEND="MOBA_ATTN"` enables MOBA sparse attention.
|
||||
* `moba_encoder_top_k_left=50, moba_encoder_top_k_right=60` indicates that the range of top-k is between 50 and 60 when the encoder is sparse.
|
||||
* `moba_decoder_top_k_left=100, moba_decoder_top_k_right=120` indicates that the range of top-k is between 100 and 120 when the decoder is sparse.
|
||||
|
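Since the config is passed as a JSON string on the command line, a malformed value is easy to ship. The small check below is an illustrative snippet (not a FastDeploy utility) that validates the top-k ranges described above before launching.

```python
import json

cfg = json.loads(
    '{"plas_encoder_top_k_left": 50, "plas_encoder_top_k_right": 60, '
    '"plas_decoder_top_k_left": 100, "plas_decoder_top_k_right": 120}'
)
for stage in ("encoder", "decoder"):
    left = cfg[f"plas_{stage}_top_k_left"]
    right = cfg[f"plas_{stage}_top_k_right"]
    assert 0 < left <= right, f"{stage}: expected 0 < top_k_left <= top_k_right"
print("plas-attention-config looks consistent")
```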
@@ -18,7 +18,7 @@ Assuming you have a custom model class `MyModelForCasualLM` and a pretrained cla
|
||||
|
||||
```python
|
||||
# File: fd_add_dummy_model/__init__.py or fd_add_dummy_model/register.py
|
||||
from fastdeploy.model_executor.models.model_base import ModelRegistry
|
||||
from fastdeploy.model_registry import ModelRegistry
|
||||
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
|
||||
from fastdeploy.config import ErnieArchitectures
|
||||
|
||||
|
@@ -7,4 +7,3 @@ FastDeploy currently supports installation on the following hardware platforms:
- [Enflame S60 GCU Installation](Enflame_gcu.md)
- [Iluvatar GPU Installation](iluvatar_gpu.md)
- [Hygon DCU Installation](hygon_dcu.md)
- [Intel Gaudi Installation](intel_gaudi.md)

@@ -1,4 +1,5 @@
|
||||
# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine
|
||||
The current software version serves only as a demonstration of Iluvatar CoreX combined with the FastDeploy inference framework for large models. Running the latest ERNIE 4.5 300B model on the GSM8K dataset takes about 6.3 hours.
|
||||
|
||||
## Machine Preparation
|
||||
First, `TP=16` is required when running the ERNIE 4.5 300B model, so you need to prepare a machine with the following configuration:
|
||||
@@ -29,7 +30,7 @@ docker exec -it paddle_infer bash
|
||||
### Install paddle
|
||||
|
||||
```bash
|
||||
pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
|
||||
```
|
||||
For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
|
||||
@@ -77,7 +78,7 @@ prompts = [
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||
|
||||
# load the model
|
||||
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, block_size=16, quantization='wint8')
|
||||
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, block_size=16, quantization='wint8')
|
||||
|
||||
# Perform batch inference
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@@ -389,7 +390,7 @@ export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --static-decode-blocks 0 --quantization wint8
|
||||
```
|
||||
|
||||
4. Running the Script
|
||||
@@ -402,10 +403,10 @@ After the service is ready, open another terminal and run:
|
||||
```bash
|
||||
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
|
||||
```
|
||||
It takes about 4.8 hours to run the GSM8K dataset.
|
||||
It takes about 6.3 hours to run the GSM8K dataset.
|
||||
|
||||
```
|
||||
Accuracy: 0.962
|
||||
Accuracy: 0.964
|
||||
Invalid: 0.000
|
||||
Latency: 17332.728 s
|
||||
Latency: 22918.186 s
|
||||
```
|
||||
|
@@ -1,75 +0,0 @@
|
||||
# Intel Gaudi Installation for running ERNIE 4.5 Series Models
|
||||
|
||||
The following installation methods are available when your environment meets these requirements:
|
||||
|
||||
- Python 3.10
|
||||
- Intel Gaudi 2
|
||||
- Intel Gaudi software version 1.22.0
|
||||
- Linux X86_64
|
||||
|
||||
## 1. Run Docker Container
|
||||
|
||||
Use the following commands to run a Docker container. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):
|
||||
|
||||
```{.console}
|
||||
$ docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
|
||||
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
|
||||
```
|
||||
|
||||
### 2. Install PaddlePaddle
|
||||
|
||||
```bash
|
||||
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
```
|
||||
|
||||
### 3. Install PaddleCustomDevice
|
||||
```shell
|
||||
git clone https://github.com/PaddlePaddle/PaddleCustomDevice
|
||||
cd PaddleCustomDevice/backends/intel_hpu/
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j
|
||||
pip install --force-reinstall dist/paddle_intel_hpu*.whl
|
||||
cd PaddleCustomDevice/backends/intel_hpu/custom_ops
|
||||
python setup.py install
|
||||
```
|
||||
|
||||
### 4. Install FastDeploy
|
||||
|
||||
```shell
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy
|
||||
cd FastDeploy
|
||||
bash build.sh
|
||||
```
|
||||
|
||||
## Prepare the inference demo
|
||||
|
||||
### 1. Start inference service
|
||||
```shell
|
||||
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
|
||||
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
|
||||
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
export PADDLE_DISTRI_BACKEND=xccl
|
||||
export PADDLE_XCCL_BACKEND=intel_hpu
|
||||
export HABANA_PROFILE=0
|
||||
export HPU_VISIBLE_DEVICES=0
|
||||
|
||||
HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
|
||||
```
|
||||
|
||||
### 2. Launch the request
|
||||
```bash
|
||||
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"messages": [
|
||||
{"role": "user", "content": "What is AI?"}
|
||||
], "max_tokens": 24
|
||||
}'
|
||||
```
|
||||
|
||||
### 3. Successfully returns the result
|
||||
```json
|
||||
{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
|
||||
```
|
@@ -19,8 +19,8 @@ docker login --username=cr_temp_user --password=eyJpbnN0YW5jZUlkIjoiY3JpLXpxYTIz
|
||||
## 2. paddlepaddle and custom device installation
|
||||
|
||||
```shell
|
||||
1)pip install paddlepaddle==3.0.0.dev20250825 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
|
||||
2)pip install paddle-metax-gpu==3.0.0.dev20250826 -i https://www.paddlepaddle.org.cn/packages/nightly/maca/
|
||||
1)pip install paddlepaddle==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
|
||||
2)pip install paddle-metax-gpu==3.0.0.dev20250807 -i https://www.paddlepaddle.org.cn/packages/nightly/maca/
|
||||
```
|
||||
|
||||
## 3. Build Wheel from Source
|
||||
@@ -47,8 +47,6 @@ from fastdeploy.model_executor.ops.gpu import beam_search_softmax
|
||||
If the above code executes successfully, the environment is ready.
|
||||
|
||||
## 5. Demo
|
||||
|
||||
```python
|
||||
from fastdeploy import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
@@ -70,9 +68,7 @@ for output in outputs:
|
||||
print(prompt)
|
||||
print(generated_text)
|
||||
print("-" * 50)
|
||||
```
|
||||
|
||||
```
|
||||
Output:
|
||||
INFO 2025-08-18 10:54:18,455 416822 engine.py[line:202] Waiting worker processes ready...
|
||||
Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [03:33<00:00, 2.14s/it]
|
||||
@@ -85,4 +81,3 @@ Generated 1 outputs
|
||||
Hello. My name is
|
||||
Alice and I'm here to help you. What can I do for you today?
|
||||
Hello Alice! I'm trying to organize a small party
|
||||
```
|
||||
|
@@ -10,7 +10,7 @@ The following installation methods are available when your environment meets the
|
||||
|
||||
## 1. Pre-built Docker Installation (Recommended)
|
||||
|
||||
**Notice**: The pre-built image only supports SM80/90 GPUs (e.g. H800/A800). If you are deploying on SM86/89 GPUs (L40/4090/L20), please reinstall ```fastdeploy-gpu``` after you create the container.
|
||||
**Notice**: The pre-built image only supports SM80/90 GPU(e.g. H800/A800),if you are deploying on SM86/89GPU(L40/4090/L20), please reinstall ```fastdpeloy-gpu``` after you create the container.
|
||||
|
||||
```shell
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.2.0
|
||||
|
@@ -16,7 +16,7 @@ For more information about how to install FastDeploy, refer to the [installation
|
||||
After installing FastDeploy, execute the following command in the terminal to start the service. For the configuration method of the startup command, refer to [Parameter Description](../parameters.md)
|
||||
|
||||
> ⚠️ **Note:**
|
||||
> When using HuggingFace models (torch format), you need to enable `--load-choices "default_v1"`.
|
||||
> When using HuggingFace models (torch format), you need to enable `--load_choices "default_v1"`.
|
||||
|
||||
```
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
@@ -27,7 +27,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
|
||||
> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```Qwen/Qwen3-0.6B```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md).
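As a quick sanity check of the service started above, the short Python sketch below sends a single chat completion request with the `requests` library; the port 8188 follows the other examples in these docs, so adjust it to whatever `--port` you actually used.

```python
import json

import requests

# Minimal client sketch for the OpenAI-compatible server started above.
url = "http://0.0.0.0:8188/v1/chat/completions"  # adjust the port to your --port setting
payload = {
    "messages": [{"role": "user", "content": "What is AI?"}],
    "max_tokens": 64,
}
resp = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```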
|
||||
|
@@ -107,7 +107,7 @@ messages = [
|
||||
}
|
||||
]
|
||||
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
|
||||
images, videos = [], []
|
||||
for message in messages:
|
||||
content = message["content"]
|
||||
|
@@ -13,7 +13,7 @@ export FD_MODEL_SOURCE=AISTUDIO # "AISTUDIO", "MODELSCOPE" or "HUGGINGFACE"
|
||||
export FD_MODEL_CACHE=/ssd1/download_models
|
||||
```
|
||||
|
||||
> ⭐ **Note**: Models marked with an asterisk can directly use **HuggingFace Torch weights** and support **FP8/WINT8/WINT4** as well as **BF16**. When running inference, you need to enable **`--load-choices "default_v1"`**.
|
||||
> ⭐ **Note**: Models marked with an asterisk can directly use **HuggingFace Torch weights** and support **FP8/WINT8/WINT4** as well as **BF16**. When running inference, you need to enable **`--load_choices "default_v1"`**.
|
||||
|
||||
> Example launch Command using baidu/ERNIE-4.5-21B-A3B-PT:
|
||||
```
|
||||
@@ -24,7 +24,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
|
||||
## Large Language Models
|
||||
|
@@ -20,6 +20,6 @@ Below is an overview of the FastDeploy code structure and functionality organize
|
||||
- ```platforms```: Platform-specific modules for underlying hardware support.
|
||||
- ```scheduler```: Request scheduling module for large models.
|
||||
- ```metrics```: Core component for collecting, managing, and exporting Prometheus metrics, tracking key runtime performance data (e.g., request latency, resource utilization, successful request counts).
|
||||
- ```splitwise```: Modules related to PD disaggregation deployment.
|
||||
- ```splitwise```: Modules related to PD disaggragation deployment.
|
||||
- ```scripts```/```tools```: Utility scripts for FastDeploy operations (e.g., compilation, unit testing, code style fixes).
|
||||
- ```test```: Code for unit testing and validation.
|
||||
|
@@ -1,20 +1,20 @@
|
||||
## Supported Models
|
||||
|Model Name|Context Length|Quantization|XPUs Required|Deployment Commands|Minimum Version Required|
|
||||
|-|-|-|-|-|-|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95 \ <br> --load-choices "default"|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|
||||
## Quick start
|
||||
|
||||
@@ -28,7 +28,6 @@ Deploy an OpenAI API-compatible server using FastDeploy with the following comma
|
||||
|
||||
```bash
|
||||
export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1 is not supported
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--port 8188 \
|
||||
@@ -36,8 +35,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 64 \
|
||||
--quantization "wint4" \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--load-choices "default"
|
||||
--gpu-memory-utilization 0.9
|
||||
```
|
||||
|
||||
**Note:** When deploying on 4 XPUs, only the following two configurations are supported, due to hardware limitations such as the interconnect topology.
|
||||
@@ -51,7 +49,7 @@ All supported models can be found in the *Supported Models* section above.
|
||||
|
||||
#### Send requests
|
||||
|
||||
Send requests using either curl or Python.
|
||||
Send requests using either curl or Python
|
||||
|
||||
```bash
|
||||
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
|
||||
|
@@ -30,7 +30,7 @@ By default, logs are stored in the `log` directory under the execution path. To
|
||||
* `cache_transfer_manager.log` : Logs startup parameters and received request information.
|
||||
* `launch_cache_manager.log` : Records cache transfer startup parameters and error messages.
|
||||
|
||||
## PD Disaggregation Logs
|
||||
## PD Disaggragation Logs
|
||||
* `cache_messager.log` : Logs transmission protocols and messages used by the P instance.
|
||||
* `splitwise_connector.log` : Records data received from P/D instances and connection establishment details.
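To keep an eye on these files while debugging a PD disaggregated deployment, a small Python sketch like the one below prints the tail of each log; it assumes the default `log` directory mentioned above and skips files that have not been created yet.

```python
from pathlib import Path

# Sketch: print the last few lines of the PD disaggregation logs listed above.
LOG_DIR = Path("log")  # default log directory under the execution path
for name in ("cache_messager.log", "splitwise_connector.log"):
    path = LOG_DIR / name
    if not path.exists():  # the file appears only after the component starts
        print(f"{name}: not created yet")
        continue
    print(f"--- {name} (last 5 lines) ---")
    for line in path.read_text(errors="ignore").splitlines()[-5:]:
        print(line)
```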
|
||||
|
||||
|
@@ -31,12 +31,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 128 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
其中:
|
||||
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
|
||||
- `--max-model-len`:表示当前部署的服务所支持的最长Token数量。设置得越大,模型可支持的上下文长度也越大,但相应占用的显存也越多,可能影响并发数。
|
||||
- `--load-choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
- `--load_choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
|
||||
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
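To make the memory trade-off behind `--max-model-len` concrete, here is a rough, illustrative estimate of KV-cache size; the layer count, head configuration, and dtype below are assumptions chosen for easy arithmetic, not the real ERNIE configuration, and since the cache is managed in blocks and allocated as sequences grow, this is only an upper bound.

```python
# Rough KV-cache upper-bound estimate (illustrative numbers, not the real model config).
num_layers = 28          # assumed number of transformer layers
num_kv_heads = 4         # assumed number of KV heads (GQA)
head_dim = 128           # assumed head dimension
bytes_per_elem = 2       # bf16
max_model_len = 32768    # value of --max-model-len
max_num_seqs = 128       # value of --max-num-seqs

# 2x for K and V, per token, per layer
bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * bytes_per_elem
total_gib = bytes_per_token * max_model_len * max_num_seqs / 1024**3
print(f"~{total_gib:.0f} GiB of KV cache if every sequence reached the maximum length")
```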
|
||||
|
||||
|
@@ -31,12 +31,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 128 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
其中:
|
||||
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
|
||||
- `--max-model-len`:表示当前部署的服务所支持的最长Token数量。设置得越大,模型可支持的上下文长度也越大,但相应占用的显存也越多,可能影响并发数。
|
||||
- `--load-choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
- `--load_choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
|
||||
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
|
||||
|
||||
|
@@ -27,7 +27,7 @@ ERNIE-4.5-21B-A3B 各量化精度,在下列硬件上部署所需要的最小
|
||||
```bash
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
|
||||
--load-choices "default_v1" \
|
||||
--load_choices "default_v1" \
|
||||
--tensor-parallel-size 1 \
|
||||
--max-model-len 131072 \
|
||||
--quantization wint8 \
|
||||
@@ -38,7 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
其中:
|
||||
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
|
||||
- `--max-model-len`:表示当前部署的服务所支持的最长Token数量。设置得越大,模型可支持的上下文长度也越大,但相应占用的显存也越多,可能影响并发数。
|
||||
- `--load-choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
- `--load_choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
- `--reasoning-parser` 、 `--tool-call-parser`: 表示对应调用的思考内容和工具调用解析器
|
||||
|
||||
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
|
||||
|
@@ -28,12 +28,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 128 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
其中:
|
||||
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
|
||||
- `--max-model-len`:表示当前部署的服务所支持的最长Token数量。设置得越大,模型可支持的上下文长度也越大,但相应占用的显存也越多,可能影响并发数。
|
||||
- `--load-choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
- `--load_choices`: 表示loader的版本,"default_v1"表示启用v1版本的loader,具有更快的加载速度和更少的内存使用。
|
||||
|
||||
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
|
||||
|
||||
@@ -92,7 +92,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
```
|
||||
|
||||
注:
|
||||
- W4A8C8量化的模型不支持通过`--load-choices "default_v1"`载入。
|
||||
- W4A8C8量化的模型不支持通过`--load_choices "default_v1"`载入。
|
||||
|
||||
#### 2.2.6 拒绝采样
|
||||
**原理:**
|
||||
|
@@ -18,7 +18,7 @@
|
||||
<img src="images/plas_training_distill.png" alt="Attention Gate Module" width="60%">
|
||||
</div>
|
||||
|
||||
* **Attention Gate Module**: 如上图所示,为了以较低的计算开销估计每个块的重要性,我们设计了一个轻量级的注意力门模块。该模块首先通过一个MLP层压缩每个K个块,生成一个具有代表性的低维表示: $K_c^T=W_{kp}K^T$ ,其中 $W_{kp}$ 表示 MLP 层的权重。与直接应用均值池化相比,可学习的 MLP 可以更有效地捕捉不同 token 之间的语义关系和重要性分布,从而提供每个块的精细表示。在获得压缩表示 $K_c$ 之后,通过以下公式估计每个查询 token 相对于每个块的重要性:$Softmax(Q\cdot K_c^T)$。为了增强 MLP 层的判别能力,我们使用一维最大池化后的完整注意力结果 $1DMaxPooling(Softmax(Q \cdot K^T))$ 作为 ground truth。通过最小化两者之间的分布差异,引导 MLP 层学习更符合真实注意力分布的特征表示。
|
||||
* **Attention Gate Module**: 如上图所示,为了以较低的计算开销估计每个块的重要性,我们设计了一个轻量级的注意力门模块。该模块首先通过一个 MLP 层压缩每个 K 个块,生成一个具有代表性的低维表示:$K_c^T=W_{kp}K^T$,其中 $W_{kp}$ 表示 MLP 层的权重。与直接应用均值池化相比,可学习的 MLP 可以更有效地捕捉不同 token 之间的语义关系和重要性分布,从而提供每个块的精细表示。在获得压缩表示 $K_c$ 之后,通过以下公式估计每个查询 token 相对于每个块的重要性:$Softmax(Q\cdot K_c^T)$。为了增强 MLP 层的判别能力,我们使用一维最大池化后的完整注意力结果 $1DMaxPooling(Softmax(Q \cdot K^T))$ 作为 ground truth。通过最小化两者之间的分布差异,引导 MLP 层学习更符合真实注意力分布的特征表示。
|
||||
|
||||
* **Training Data**: 得益于模型架构和训练范式的高效性,我们的方法仅使用 10 亿个 token 进行训练,便实现了近乎无损的精度。训练数据源自内部构建的包含长文本和短文本的混合语料库,从而增强了模块对不同序列长度的适应性。
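As a concrete illustration of the Attention Gate computation described above, the NumPy sketch below compresses each key block with a learned weight, scores the blocks with softmax(Q·K_cᵀ), and builds the 1D-max-pooled full-attention target; all shapes and the random weights are assumptions made for readability, not the trained MLP.

```python
import numpy as np

# Illustrative sketch of the Attention Gate: block compression, block scoring,
# and the max-pooled full-attention target. Shapes and weights are made up.
rng = np.random.default_rng(0)
num_blocks, block_size, head_dim = 8, 128, 64

Q = rng.standard_normal((1, head_dim))                       # one query token
K = rng.standard_normal((num_blocks, block_size, head_dim))  # keys grouped into blocks
W_kp = rng.standard_normal((1, block_size))                  # stand-in for the MLP weight

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

# K_c: one representative vector per block (K_c^T = W_kp K^T)
K_c = np.matmul(W_kp, K).squeeze(1)                          # (num_blocks, head_dim)

# Estimated importance of each block for this query: softmax(Q . K_c^T)
block_importance = softmax(Q @ K_c.T)                        # (1, num_blocks)

# Training target: 1D max pooling of the full attention distribution within each block
full_attn = softmax(Q @ K.reshape(-1, head_dim).T)           # (1, num_blocks * block_size)
target = full_attn.reshape(num_blocks, block_size).max(axis=-1)
print(block_importance.round(3), target.round(3))
```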
|
||||
|
||||
@@ -36,7 +36,7 @@
|
||||
|
||||
* **Prefill Token Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。
|
||||
|
||||
* **Decode Head Union**: 鉴于GQA在现代模型中的广泛应用,我们发现同一组内的不同查询头经常选择重叠的关键块。因此,我们将同一组内所有查询头选择的关键块合并为一个统一的集合,并联合计算稀疏注意力机制。这种方式也减少了内存访问开销,并进一步提高了解码效率。
|
||||
* **Decode Head Union**: 鉴于 GQA 在现代模型中的广泛应用,我们发现同一组内的不同查询头经常选择重叠的关键块。因此,我们将同一组内所有查询头选择的关键块合并为一个统一的集合,并联合计算稀疏注意力机制。这种方式也减少了内存访问开销,并进一步提高了解码效率。
|
||||
|
||||
* **Top-K Selection**: 传统的 Top-k 算法基于排序或直接调用 Cub 库,会带来显著的运行时开销。为了缓解这个问题,我们实现了一个基于二分查找的近似 Top-k 选择算法,该算法在保持准确率的同时显著降低了延迟,最终实现了性能的显著提升。
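The NumPy sketch below shows the idea behind the binary-search-based approximate top-k mentioned above: rather than sorting the scores, it searches for a threshold whose selected count is close to k. It is a simplification of the actual kernel; the function name and iteration count are chosen only for illustration.

```python
import numpy as np

def approx_topk_mask(scores: np.ndarray, k: int, iters: int = 16) -> np.ndarray:
    """Approximate top-k: binary-search a threshold instead of sorting (sketch only)."""
    lo, hi = float(scores.min()), float(scores.max())
    mid = hi
    for _ in range(iters):
        mid = (lo + hi) / 2
        count = int((scores >= mid).sum())
        if count > k:    # too many selected, raise the threshold
            lo = mid
        elif count < k:  # too few selected, lower the threshold
            hi = mid
        else:
            break
    return scores >= mid

scores = np.random.default_rng(0).standard_normal(4096)  # per-block importance scores
mask = approx_topk_mask(scores, k=64)
print(int(mask.sum()))  # close to 64, without a full sort
```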
|
||||
|
||||
@@ -200,7 +200,7 @@
|
||||
## 使用方式
|
||||
|
||||
```
|
||||
export FD_ATTENTION_BACKEND="PLAS_ATTN"
|
||||
export FD_ATTENTION_BACKEND="MOBA_ATTN"
|
||||
|
||||
python -m fastdeploy.entrypoints.openai.api_server
|
||||
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
|
||||
@@ -211,13 +211,13 @@ python -m fastdeploy.entrypoints.openai.api_server
|
||||
--max-num-batched-tokens 8192 \
|
||||
--max-model-len 131072 \
|
||||
--max-num-seqs 32 \
|
||||
--plas-attention-config '{"plas_encoder_top_k_left": 50, "plas_encoder_top_k_right": 60, "plas_decoder_top_k_left": 100, "plas_decoder_top_k_right": 120}'
|
||||
--moba-attention-config '{"moba_encoder_top_k_left": 50, "moba_encoder_top_k_right": 60, "moba_decoder_top_k_left": 100, "moba_decoder_top_k_right": 120}'
|
||||
```
|
||||
|
||||
**Note**: 如果启用了稀疏注意力机制,系统将自动从权重目录中的`plas_attention_mlp_weight.safetensors`文件加载 MLP 权重。如果未找到 MLP 权重文件,则将对关键表示应用均值池化
|
||||
**Note**: 如果启用了稀疏注意力机制,系统将自动从权重目录中的`moba_mlp_weight.safetensors`文件加载 MLP 权重。如果未找到 MLP 权重文件,则将对关键表示应用均值池化
|
||||
|
||||
**Parameter Description:**
|
||||
|
||||
* `FD_ATTENTION_BACKEND="PLAS_ATTN"` 启用 PLAS sparse attention.
|
||||
* `plas_encoder_top_k_left=50, plas_encoder_top_k_right=60` 表示当encoder时,top-k的范围在50到60之间。
|
||||
* `plas_decoder_top_k_left=100, plas_decoder_top_k_right=120` 表示当decoder时,top-k的范围在100到120之间。
|
||||
* `FD_ATTENTION_BACKEND="MOBA_ATTN"` 启用 MOBA sparse attention.
|
||||
* `moba_encoder_top_k_left=50, moba_encoder_top_k_right=60` 表示当encoder时,top-k的范围在50到60之间。
|
||||
* `moba_decoder_top_k_left=100, moba_decoder_top_k_right=120` 表示当decoder时,top-k的范围在100到120之间。
|
||||
|
@@ -18,7 +18,7 @@ FastDeploy 利用 Python 的 `entry_points` 机制来发现并加载插件。开
|
||||
|
||||
```python
|
||||
# 文件:fd_add_dummy_model/__init__.py
|
||||
from fastdeploy.model_executor.models.model_base import ModelRegistry
|
||||
from fastdeploy.model_registry import ModelRegistry
|
||||
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
|
||||
|
||||
def register():
|
||||
|
@@ -7,4 +7,3 @@ FastDeploy支持如下硬件平台:
|
||||
- [Enflame S60 GCU Installation](Enflame_gcu.md)
|
||||
- [Iluvatar GPU Installation](iluvatar_gpu.md)
|
||||
- [Hygon DCU Installation](hygon_dcu.md)
|
||||
- [Intel Gaudi Installation](intel_gaudi.md)
|
||||
|
@@ -1,11 +1,12 @@
|
||||
# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B
|
||||
该软件的当前版本仅作为Iluvatar CoreX与大型模型的Fastdeploy推理框架相结合的演示。在GSM8K数据集上运行最新的ERNIE4.5 300B模型大约需要6.3小时。
|
||||
|
||||
## 准备机器
|
||||
首先运行ERNIE4.5 300B模型需要`TP=16`, 所以您需要准备以下配置的机器:
|
||||
首先您需要准备以下配置的机器
|
||||
|
||||
| CPU | 内存 | 天数 | 硬盘|
|
||||
|-----|------|-----|-----|
|
||||
| x86 | 1TB| 16xBI150| 1TB|
|
||||
| x86 | 1TB| 8xBI150| 1TB|
|
||||
|
||||
目前需要将完整模型 load 到 host memory 中,需要大于 600GB 的 host memory,后续版本会优化。
|
||||
|
||||
@@ -29,7 +30,7 @@ docker exec -it paddle_infer bash
|
||||
### 安装paddle
|
||||
|
||||
```bash
|
||||
pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
|
||||
```
|
||||
获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
|
||||
@@ -76,7 +77,7 @@ prompts = [
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||
|
||||
# 加载模型
|
||||
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, quantization='wint8')
|
||||
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
|
||||
|
||||
# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@@ -131,281 +132,3 @@ Now, let's break down each step:
|
||||
**Step 3: Drawing the
|
||||
The largest ocean is the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
|
||||
```
|
||||
|
||||
## 在GSM8K数据集上运行ernie4.5 300B模型
|
||||
|
||||
1. 下载GSM8K数据集
|
||||
|
||||
```bash
|
||||
wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
|
||||
```
|
||||
|
||||
2. 准备`bench_gsm8k.py`
|
||||
|
||||
```python
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
""" Fastdeploy + ERNIE-4.5-Turbo 的指标评估 """
|
||||
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
|
||||
import argparse
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
INVALID = -9999999
|
||||
|
||||
|
||||
def call_generate(prompt, **kwargs):
|
||||
"""
|
||||
Generates response based on the input prompt.
|
||||
|
||||
Args:
|
||||
prompt (str): The input prompt text.
|
||||
**kwargs: Keyword arguments, including server IP address and port number.
|
||||
|
||||
Returns:
|
||||
str: The response generated based on the prompt.
|
||||
|
||||
"""
|
||||
url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
"temperature": 0.6,
|
||||
"max_tokens": 2047,
|
||||
"top_p": 0.95,
|
||||
"do_sample": True,
|
||||
}
|
||||
|
||||
response = requests.post(url, headers=headers, data=json.dumps(data))
|
||||
out = response.json()
|
||||
return out["choices"][0]["message"]["content"]
|
||||
|
||||
|
||||
def get_one_example(lines, i, include_answer):
|
||||
"""
|
||||
Retrieves a question-answer example from the given list of text lines.
|
||||
|
||||
Args:
|
||||
lines (list of dict): A list of question-answer pairs.
|
||||
i (int): The index of the question-answer pair to retrieve from lines.
|
||||
include_answer (bool): Whether to include the answer in the returned string.
|
||||
|
||||
Returns:
|
||||
str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
|
||||
|
||||
"""
|
||||
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
|
||||
if include_answer:
|
||||
ret += " " + lines[i]["answer"]
|
||||
return ret
|
||||
|
||||
|
||||
def get_few_shot_examples(lines, k):
|
||||
"""
|
||||
Selects k examples from the given list of text lines and concatenates them into a single string.
|
||||
|
||||
Args:
|
||||
lines (list): A list containing text lines.
|
||||
k (int): The number of examples to select.
|
||||
|
||||
Returns:
|
||||
str: A string composed of k examples, separated by two newline characters.
|
||||
"""
|
||||
ret = ""
|
||||
for i in range(k):
|
||||
ret += get_one_example(lines, i, True) + "\n\n"
|
||||
return ret
|
||||
|
||||
|
||||
def get_answer_value(answer_str):
|
||||
"""
|
||||
Extracts numerical values from an answer string and returns them.
|
||||
|
||||
Args:
|
||||
answer_str (str): The string containing the answer.
|
||||
|
||||
Returns:
|
||||
The extracted numerical value; returns "INVALID" if extraction fails.
|
||||
"""
|
||||
answer_str = answer_str.replace(",", "")
|
||||
numbers = re.findall(r"\d+", answer_str)
|
||||
if len(numbers) < 1:
|
||||
return INVALID
|
||||
try:
|
||||
return ast.literal_eval(numbers[-1])
|
||||
except SyntaxError:
|
||||
return INVALID
|
||||
|
||||
|
||||
def read_jsonl(filename: str):
|
||||
"""
|
||||
Reads a JSONL file.
|
||||
|
||||
Args:
|
||||
filename (str): Path to the JSONL file.
|
||||
|
||||
Yields:
|
||||
dict: A dictionary object corresponding to each line in the JSONL file.
|
||||
"""
|
||||
with open(filename) as fin:
|
||||
for line in fin:
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
yield json.loads(line)
|
||||
|
||||
|
||||
def main(args):
|
||||
"""
|
||||
Process inputs and generate answers by calling the model in parallel using a thread pool.
|
||||
|
||||
Args:
|
||||
args (argparse.Namespace):
|
||||
- num_questions (int): Number of questions to process.
|
||||
- num_shots (int): Number of few-shot learning examples.
|
||||
- ip (str): IP address of the model service.
|
||||
- port (int): Port number of the model service.
|
||||
- parallel (int): Number of questions to process in parallel.
|
||||
- result_file (str): File path to store the results.
|
||||
|
||||
Returns:
|
||||
None
|
||||
|
||||
"""
|
||||
# Read data
|
||||
filename = "test.jsonl"
|
||||
|
||||
lines = list(read_jsonl(filename))
|
||||
|
||||
# Construct prompts
|
||||
num_questions = args.num_questions
|
||||
num_shots = args.num_shots
|
||||
few_shot_examples = get_few_shot_examples(lines, num_shots)
|
||||
|
||||
questions = []
|
||||
labels = []
|
||||
for i in range(len(lines[:num_questions])):
|
||||
questions.append(get_one_example(lines, i, False))
|
||||
labels.append(get_answer_value(lines[i]["answer"]))
|
||||
assert all(l != INVALID for l in labels)
|
||||
|
||||
states = [None] * len(labels)
|
||||
|
||||
# Use thread pool
|
||||
def get_one_answer(i):
|
||||
answer = call_generate(
|
||||
prompt=few_shot_examples + questions[i],
|
||||
# stop=["Question", "Assistant:", "<|separator|>"],
|
||||
ip=args.ip,
|
||||
port=args.port,
|
||||
)
|
||||
states[i] = answer
|
||||
|
||||
tic = time.time()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_one_answer(i)
|
||||
else:
|
||||
with ThreadPoolExecutor(args.parallel) as executor:
|
||||
list(
|
||||
tqdm(
|
||||
executor.map(get_one_answer, list(range(len(questions)))),
|
||||
total=len(questions),
|
||||
)
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
preds.append(get_answer_value(states[i]))
|
||||
|
||||
# Compute accuracy
|
||||
acc = np.mean(np.array(preds) == np.array(labels))
|
||||
invalid = np.mean(np.array(preds) == INVALID)
|
||||
|
||||
# Print results
|
||||
print(f"Accuracy: {acc:.3f}")
|
||||
print(f"Invalid: {invalid:.3f}")
|
||||
print(f"Latency: {latency:.3f} s")
|
||||
|
||||
with open(args.result_file, "a") as fout:
|
||||
value = {
|
||||
"task": "gsm8k",
|
||||
"backend": "paddlepaddle",
|
||||
"num_gpus": 1,
|
||||
"latency": round(latency, 3),
|
||||
"accuracy": round(acc, 3),
|
||||
"num_requests": args.num_questions,
|
||||
"other": {
|
||||
"num_questions": args.num_questions,
|
||||
"parallel": args.parallel,
|
||||
},
|
||||
}
|
||||
fout.write(json.dumps(value) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--ip", type=str, default="127.0.0.1")
|
||||
parser.add_argument("--port", type=str, default="8188")
|
||||
parser.add_argument("--num-shots", type=int, default=10)
|
||||
parser.add_argument("--data-path", type=str, default="test.jsonl")
|
||||
parser.add_argument("--num-questions", type=int, default=1319)
|
||||
parser.add_argument("--result-file", type=str, default="result.jsonl")
|
||||
parser.add_argument("--parallel", type=int, default=1)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
```
|
||||
|
||||
3. 准备`run_bench.sh`
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
|
||||
```
|
||||
|
||||
4. 运行脚本
|
||||
|
||||
首先打开一个终端执行服务端命令:
|
||||
```bash
|
||||
./run_bench.sh
|
||||
```
|
||||
等服务起好后,在打开另一个终端执行客户端命令:
|
||||
```bash
|
||||
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
|
||||
```
|
||||
推理整个GSM8K数据集大概需要4.8个小时。
|
||||
|
||||
```
|
||||
Accuracy: 0.962
|
||||
Invalid: 0.000
|
||||
Latency: 17332.728 s
|
||||
```
|
||||
|
@@ -1,75 +0,0 @@
|
||||
# 使用 Intel Gaudi 运行ERNIE 4.5 系列模型
|
||||
|
||||
在环境满足如下条件前提下
|
||||
|
||||
- Python 3.10
|
||||
- Intel Gaudi 2
|
||||
- Intel Gaudi software version 1.22.0
|
||||
- Linux X86_64
|
||||
|
||||
## 1. 运行Docker容器
|
||||
|
||||
使用下面命令运行Docker容器. 确保更新的版本在如下列表中 [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):
|
||||
|
||||
```{.console}
|
||||
$ docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
|
||||
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
|
||||
```
|
||||
|
||||
### 2. 安装 PaddlePaddle
|
||||
|
||||
```bash
|
||||
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
```
|
||||
|
||||
### 3. 安装 PaddleCustomDevice
|
||||
```shell
|
||||
git clone https://github.com/PaddlePaddle/PaddleCustomDevice
|
||||
cd PaddleCustomDevice/backends/intel_hpu/
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake ..
|
||||
make -j
|
||||
pip install --force-reinstall dist/paddle_intel_hpu*.whl
|
||||
cd PaddleCustomDevice/backends/intel_hpu/custom_ops
|
||||
python setup.py install
|
||||
```
|
||||
|
||||
### 4. 安装 FastDeploy
|
||||
|
||||
```shell
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy
|
||||
cd FastDeploy
|
||||
bash build.sh
|
||||
```
|
||||
|
||||
## 准备推理示例
|
||||
|
||||
### 1. 启动推理服务
|
||||
```shell
|
||||
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
|
||||
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
|
||||
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
export PADDLE_DISTRI_BACKEND=xccl
|
||||
export PADDLE_XCCL_BACKEND=intel_hpu
|
||||
export HABANA_PROFILE=0
|
||||
export HPU_VISIBLE_DEVICES=0
|
||||
|
||||
HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
|
||||
```
|
||||
|
||||
### 2. 发送请求
|
||||
```bash
|
||||
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"messages": [
|
||||
{"role": "user", "content": "What is AI?"}
|
||||
], "max_tokens": 24
|
||||
}'
|
||||
```
|
||||
|
||||
### 3. 成功返回结果
|
||||
```json
|
||||
{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
|
||||
```
|
@@ -15,7 +15,7 @@
|
||||
安装FastDeploy后,在终端执行如下命令,启动服务,其中启动命令配置方式参考[参数说明](../parameters.md)
|
||||
|
||||
> ⚠️ **注意:**
|
||||
> 当使用HuggingFace 模型(torch格式)时, 需要开启 `--load-choices "default_v1"`
|
||||
> 当使用HuggingFace 模型(torch格式)时, 需要开启 `--load_choices "default_v1"`
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
@@ -26,7 +26,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
|
||||
>💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```Qwen/Qwen3-0.6B```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。
|
||||
|
@@ -107,7 +107,7 @@ messages = [
|
||||
}
|
||||
]
|
||||
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
||||
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
|
||||
images, videos = [], []
|
||||
for message in messages:
|
||||
content = message["content"]
|
||||
|
@@ -13,7 +13,7 @@ export FD_MODEL_SOURCE=AISTUDIO # "AISTUDIO", "MODELSCOPE" or "HUGGINGFACE"
|
||||
export FD_MODEL_CACHE=/ssd1/download_models
|
||||
```
|
||||
|
||||
> ⭐ **说明**:带星号的模型可直接使用 **HuggingFace Torch 权重**,支持 **FP8/WINT8/WINT4 动态量化** 和 **BF16 精度** 推理,推理时需启用 **`--load-choices "default_v1"`**。
|
||||
> ⭐ **说明**:带星号的模型可直接使用 **HuggingFace Torch 权重**,支持 **FP8/WINT8/WINT4 动态量化** 和 **BF16 精度** 推理,推理时需启用 **`--load_choices "default_v1"`**。
|
||||
|
||||
> 以baidu/ERNIE-4.5-21B-A3B-PT为例启动命令如下
|
||||
```
|
||||
@@ -24,7 +24,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--load-choices "default_v1"
|
||||
--load_choices "default_v1"
|
||||
```
|
||||
|
||||
## 纯文本模型列表
|
||||
|
@@ -1,20 +1,20 @@
|
||||
## 支持的模型
|
||||
|模型名|上下文长度|量化|所需卡数|部署命令|最低版本要求|
|
||||
|-|-|-|-|-|-|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95 \ <br> --load-choices "default"|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # V1不支持<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.1.0|
|
||||
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # not supported by the V1 scheduler<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # use any single card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # not supported by the V1 scheduler<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # not supported by the V1 scheduler<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>export ENABLE_V1_KVCACHE_SCHEDULER=0 # not supported by the V1 scheduler<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9 \ <br> --load-choices "default"|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # use any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
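
Every row above launches an OpenAI-compatible HTTP service on the configured `--port`. The snippet below is a minimal client sketch for smoke-testing such a deployment; it assumes the server is reachable on `localhost:8188` and exposes the usual `/v1/chat/completions` route, and it uses only the Python standard library.

```python
import json
import urllib.request

# Assumed endpoint; adjust host/port to match the --port passed to api_server.
URL = "http://localhost:8188/v1/chat/completions"

payload = {
    "model": "PaddlePaddle/ERNIE-4.5-0.3B-Paddle",  # match the --model used at launch
    "messages": [{"role": "user", "content": "Introduce ERNIE in one sentence."}],
    "max_tokens": 64,
}

request = urllib.request.Request(
    URL,
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(request, timeout=60) as response:
    body = json.loads(response.read().decode("utf-8"))

# Print the text of the first returned choice.
print(body["choices"][0]["message"]["content"])
```

Swap the `model` field and port to match whichever row you launched.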
## Quick Start

@@ -28,7 +28,6 @@

```bash
export XPU_VISIBLE_DEVICES="0,1,2,3" # set which XPU cards to use
export ENABLE_V1_KVCACHE_SCHEDULER=0 # not supported by the V1 scheduler
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
    --port 8188 \
@@ -36,8 +35,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
    --max-model-len 32768 \
    --max-num-seqs 64 \
    --quantization "wint4" \
    --gpu-memory-utilization 0.9 \
    --load-choices "default"
    --gpu-memory-utilization 0.9
```

**Note:** When deploying on 4 XPUs with the P800, hardware constraints such as the inter-card interconnect topology mean that only the following two configurations are supported:
@@ -28,7 +28,7 @@ from paddleformers.utils.log import logger as pf_logger
|
||||
|
||||
from fastdeploy.engine.sampling_params import SamplingParams
|
||||
from fastdeploy.entrypoints.llm import LLM
|
||||
from fastdeploy.utils import current_package_version, envs
|
||||
from fastdeploy.utils import envs
|
||||
|
||||
if envs.FD_DEBUG != "1":
|
||||
import logging
|
||||
@@ -43,8 +43,6 @@ except ImportError:
|
||||
pass
|
||||
# TODO(tangbinhan): remove this code
|
||||
|
||||
__version__ = current_package_version()
|
||||
|
||||
|
||||
def _patch_fastsafetensors():
|
||||
try:
|
||||
|
@@ -14,10 +14,7 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
@@ -26,72 +23,16 @@ import numpy as np
|
||||
import paddle
|
||||
|
||||
from fastdeploy.cache_manager.transfer_factory import IPCCommManager, RDMACommManager
|
||||
from fastdeploy.config import SpeculativeConfig
|
||||
from fastdeploy.inter_communicator import (
|
||||
EngineWorkerQueue,
|
||||
IPCSignal,
|
||||
shared_memory_exists,
|
||||
)
|
||||
from fastdeploy.model_executor.ops.gpu import get_output_kv_signal, set_data_ipc
|
||||
from fastdeploy.utils import envs, get_logger
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
logger = get_logger("cache_messager", "cache_messager.log")
|
||||
|
||||
|
||||
def parse_args():
|
||||
"""
|
||||
Parse arguments from the command line.
|
||||
"""
|
||||
parser = argparse.ArgumentParser("Cache Messager")
|
||||
parser.add_argument(
|
||||
"--splitwise_role",
|
||||
type=str,
|
||||
default="mixed",
|
||||
help="splitwise role, can be decode, prefill or mixed",
|
||||
)
|
||||
parser.add_argument("--rank", type=int, default=0, help="current rank")
|
||||
parser.add_argument("--device_id", type=int, default=0, help="device id")
|
||||
parser.add_argument("--num_layers", type=int, default=1, help="model num layers")
|
||||
parser.add_argument("--head_dim", type=int, default=1, help="model head dim")
|
||||
parser.add_argument("--kv_num_head", type=int, default=1, help="model kv num head")
|
||||
parser.add_argument("--rdma_port", type=str, default="", help="rmda port")
|
||||
parser.add_argument("--mp_num", type=int, default=1, help="number of model parallel")
|
||||
parser.add_argument("--engine_pid", type=str, default=None, help="engine pid")
|
||||
parser.add_argument(
|
||||
"--protocol",
|
||||
type=str,
|
||||
default="ipc",
|
||||
help="cache transfer protocol, only surport ipc now",
|
||||
)
|
||||
parser.add_argument("--pod_ip", type=str, default="0.0.0.0", help="pod ip")
|
||||
parser.add_argument("--cache_queue_port", type=int, default=9924, help="cache queue port")
|
||||
parser.add_argument(
|
||||
"--engine_worker_queue_port",
|
||||
type=int,
|
||||
default=9923,
|
||||
help="engine worker queue port",
|
||||
)
|
||||
parser.add_argument("--num_gpu_blocks", type=int, default=1, help="gpu cache block number")
|
||||
parser.add_argument("--block_size", type=int, default=64, help="cache block size(tokens)")
|
||||
parser.add_argument(
|
||||
"--cache_dtype",
|
||||
type=str,
|
||||
default="bfloat16",
|
||||
choices=["uint8", "bfloat16"],
|
||||
help="cache dtype",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--speculative_config",
|
||||
type=json.loads,
|
||||
default="{}",
|
||||
help="speculative config",
|
||||
)
|
||||
parser.add_argument("--local_data_parallel_id", type=int, default=0)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
class CacheMessager:
|
||||
"""
|
||||
CacheMessager is used to send the cache data between the engine worker and the cache server.
|
||||
@@ -128,6 +69,11 @@ class CacheMessager:
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
assert splitwise_role in [
|
||||
"prefill",
|
||||
"decode",
|
||||
], "splitwise_role must be prefill or decode"
|
||||
self.splitwise_role = splitwise_role
|
||||
self.gpu_cache_kvs = gpu_cache_kvs
|
||||
self.rank = rank
|
||||
@@ -201,16 +147,15 @@ class CacheMessager:
|
||||
|
||||
self.gpu_id = gpu_id
|
||||
self.cache_info = dict()
|
||||
self.rank_id = self.rank + local_data_parallel_id * self.nranks
|
||||
self.dp_rank_id = self.rank + local_data_parallel_id * self.nranks
|
||||
|
||||
if self.splitwise_role != "mixed":
|
||||
connect_rdma_thread = threading.Thread(target=self._handle_connect_task)
|
||||
connect_rdma_thread.daemon = True
|
||||
connect_rdma_thread.start()
|
||||
layerwise_send_cache_thread = threading.Thread(target=self._prefill_layerwise_send_cache_thread)
|
||||
layerwise_send_cache_thread.daemon = True
|
||||
layerwise_send_cache_thread.start()
|
||||
|
||||
logger.info(f"cache messager init finished, use {transfer_protocol}")
|
||||
|
||||
def prefill_layerwise_send_cache_thread(self):
|
||||
def _prefill_layerwise_send_cache_thread(self):
|
||||
"""
|
||||
layerwise_send_cache_thread:
|
||||
send cache to other instance
|
||||
@@ -218,23 +163,23 @@ class CacheMessager:
|
||||
try:
|
||||
prefilled_step_idx_data = np.zeros(shape=[1], dtype=np.int32)
|
||||
prefilled_layer_idx_data = np.zeros(shape=[1], dtype=np.int32)
|
||||
prefilled_layer_name = f"splitwise_complete_prefilled_layer_{self.rank_id}.{self.gpu_id}"
|
||||
prefilled_step_name = f"splitwise_complete_prefilled_step_{self.rank_id}.{self.gpu_id}"
|
||||
prefilled_layer_name = f"splitwise_complete_prefilled_layer_{self.dp_rank_id}.{self.gpu_id}"
|
||||
prefilled_step_name = f"splitwise_complete_prefilled_step_{self.dp_rank_id}.{self.gpu_id}"
|
||||
step_shm_value = IPCSignal(
|
||||
name=f"splitwise_complete_prefilled_step_{self.rank_id}",
|
||||
name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}",
|
||||
array=prefilled_step_idx_data,
|
||||
dtype=np.int32,
|
||||
suffix=self.gpu_id,
|
||||
create=not shared_memory_exists(prefilled_step_name),
|
||||
)
|
||||
layer_shm_value = IPCSignal(
|
||||
name=f"splitwise_complete_prefilled_layer_{self.rank_id}",
|
||||
name=f"splitwise_complete_prefilled_layer_{self.dp_rank_id}",
|
||||
array=prefilled_layer_idx_data,
|
||||
dtype=np.int32,
|
||||
suffix=self.gpu_id,
|
||||
create=not shared_memory_exists(prefilled_layer_name),
|
||||
)
|
||||
logger.info(f"splitwise_complete_prefilled_step_{self.rank_id}, gpu_id: {self.gpu_id}")
|
||||
logger.info(f"splitwise_complete_prefilled_step_{self.dp_rank_id}, gpu_id: {self.gpu_id}")
|
||||
|
||||
step_shm_value.value[0] = -1
|
||||
layer_shm_value.value[0] = -1
|
||||
@@ -242,9 +187,6 @@ class CacheMessager:
|
||||
self.last_step_idx = -1
|
||||
self.last_layer_idx = -1 # int32
|
||||
|
||||
max_step_idx = 100003
|
||||
engine_recycled_count = 0
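# max_step_idx above is the wrap-around period of the engine's prefilled-step counter;
# engine_recycled_count records how many times that counter has wrapped, so the value
# read from shared memory can be remapped to a monotonically increasing step index below.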
|
||||
|
||||
while True:
|
||||
|
||||
cache_info = self.engine_worker_queue.get_cache_info()
|
||||
@@ -260,13 +202,16 @@ class CacheMessager:
|
||||
-len(current_info["dest_block_ids"]) :
|
||||
]
|
||||
current_info["src_block_ids"] = current_src_blocks
|
||||
current_info["current_layer_ids"] = 0
|
||||
current_info["status"] = "init"
|
||||
logger.info(f"start cache_infos: {current_info}")
|
||||
self.cache_info[info["request_id"]] = current_info
|
||||
self.last_step_idx = min(self.last_step_idx, current_info["current_id"])
|
||||
else:
|
||||
self.cache_info[info["request_id"]] = info
|
||||
prefilled_layer_idx = layer_shm_value.value[0]
|
||||
prefilled_step_idx = step_shm_value.value[0]
|
||||
logger.info(f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}")
|
||||
if prefilled_layer_idx == self.num_layers - 1:
|
||||
time.sleep(0.001)
|
||||
prefilled_layer_idx = layer_shm_value.value[0]
|
||||
@@ -278,18 +223,7 @@ class CacheMessager:
|
||||
if not self.cache_info:
|
||||
time.sleep(0.001)
|
||||
continue
|
||||
|
||||
if self.last_step_idx > prefilled_step_idx:
|
||||
engine_recycled_count += 1
|
||||
self.last_step_idx = prefilled_step_idx # only copy value read from shm memory
|
||||
prefilled_step_idx = (
|
||||
prefilled_step_idx + max_step_idx * engine_recycled_count
|
||||
) # remap prefilled_step_idx for comparison
|
||||
|
||||
logger.debug(
|
||||
f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx in shm: {self.last_step_idx},"
|
||||
f"prefilled_step_idx: {prefilled_step_idx} engine_recycled_count {engine_recycled_count}"
|
||||
)
|
||||
logger.debug(f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}")
|
||||
for req_id, item in list(self.cache_info.items()):
|
||||
if "status" not in item:
|
||||
continue
|
||||
@@ -360,493 +294,12 @@ class CacheMessager:
|
||||
logger.info(f"finish write cache {item['request_id']}")
|
||||
self.engine_worker_queue.finish_request_barrier.wait()
|
||||
if self.rank == 0:
|
||||
# TODO: make this robust under TP; we assume all ranks in the TP group share the same status (all fail or all succeed).
|
||||
self.engine_worker_queue.put_finished_req([(item["request_id"], "finished")])
|
||||
logger.info(f"put write cache {item['request_id']}")
|
||||
del self.cache_info[req_id]
|
||||
self.last_layer_idx = prefilled_layer_idx
|
||||
|
||||
self.last_step_idx = prefilled_step_idx
|
||||
self.last_layer_idx = prefilled_layer_idx
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"prefill layerwise send cache thread has exception: {e}, {str(traceback.format_exc())}")
|
||||
|
||||
def _handle_connect_task(self):
|
||||
while True:
|
||||
try:
|
||||
task = self.engine_worker_queue.get_connect_rdma_task()
|
||||
if task is None:
|
||||
time.sleep(0.001)
|
||||
continue
|
||||
logger.info(f"_handle_connect_task recv task: {task}")
|
||||
task_id = task["task_id"]
|
||||
ip, rdma_port = task["ip"], task["rdma_port"]
|
||||
status = self.messager["rdma"].connect(ip, rdma_port)
|
||||
if not status:
|
||||
response = {"task_id": task_id, "success": False}
|
||||
else:
|
||||
response = {"task_id": task_id, "success": True}
|
||||
self.engine_worker_queue.put_connect_rdma_task_response(response)
|
||||
except Exception as e:
|
||||
logger.error(f"handle_connect_task has exception: {e}")
|
||||
|
||||
|
||||
class CacheMessagerV1:
|
||||
"""
|
||||
CacheMessager is used to send the cache data between the engine worker and the cache server.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
splitwise_role,
|
||||
transfer_protocol,
|
||||
pod_ip,
|
||||
engine_worker_queue_port,
|
||||
local_data_parallel_id,
|
||||
gpu_cache_kvs,
|
||||
rank,
|
||||
nranks,
|
||||
num_layers,
|
||||
gpu_id=0,
|
||||
block_size=64,
|
||||
rdma_port=None,
|
||||
):
|
||||
"""
|
||||
Initialize the CacheMessager object.
|
||||
|
||||
Args:
|
||||
splitwise_role (str): splitwise_role only can be 'prefill' or 'decode'.
|
||||
transfer_protocol (str): support ipc and rdma
|
||||
engine_worker_queue_port (int): engine_worker_queue port
|
||||
gpu_cache_kvs (dict): GPU kv cache
|
||||
rank (int): current rank
|
||||
nranks (int): global rank number
|
||||
num_layers (int): model layer number
|
||||
gpu_id (int, optional): GPU ID
|
||||
rdma_port (int, optional): RDMA port
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
self.splitwise_role = splitwise_role
|
||||
self.gpu_cache_kvs = gpu_cache_kvs
|
||||
self.rank = rank
|
||||
self.nranks = nranks
|
||||
address = (pod_ip, engine_worker_queue_port)
|
||||
self.engine_worker_queue = EngineWorkerQueue(
|
||||
address=address,
|
||||
is_server=False,
|
||||
num_client=self.nranks,
|
||||
client_id=self.rank,
|
||||
local_data_parallel_id=local_data_parallel_id,
|
||||
)
|
||||
self.block_size = block_size
|
||||
transfer_protocol = transfer_protocol.split(",")
|
||||
|
||||
logger.info(f"splitwise role: {splitwise_role}, {transfer_protocol}" f"rank: {rank}")
|
||||
|
||||
# 1. initialize the cache_k_ptr_list and cache_v_ptr_list
|
||||
self.num_layers = num_layers
|
||||
cache_k_ptr_list = []
|
||||
cache_v_ptr_list = []
|
||||
cache_k = []
|
||||
cache_v = []
|
||||
self.messager = {}
|
||||
for layer_idx in range(self.num_layers):
|
||||
key_cache = self.gpu_cache_kvs[f"key_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
|
||||
val_cache = self.gpu_cache_kvs[f"value_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
|
||||
cache_k.append(key_cache)
|
||||
cache_v.append(val_cache)
|
||||
cache_k_ptr_list.append(key_cache.data_ptr())
|
||||
cache_v_ptr_list.append(val_cache.data_ptr())
|
||||
cache_k_ptr_list = np.array(cache_k_ptr_list)
|
||||
cache_v_ptr_list = np.array(cache_v_ptr_list)
|
||||
|
||||
# 2. initialize the block_bytes
|
||||
cache_shape = key_cache.shape
|
||||
max_block_num = cache_shape[0]
|
||||
block_bytes = math.prod(cache_shape[1:])
|
||||
if key_cache.dtype == paddle.bfloat16:
|
||||
block_bytes *= 2
|
||||
logger.info(
|
||||
f"layers {num_layers} cache_shape: {cache_shape}, max_block_num: {max_block_num}, "
|
||||
f"block_bytes: {block_bytes}, dtype: {key_cache.dtype}"
|
||||
)
|
||||
self.block_bytes = block_bytes
|
||||
|
||||
# 3. initialize the messager
|
||||
for protocol in transfer_protocol:
|
||||
if protocol == "ipc":
|
||||
self.messager[protocol] = IPCCommManager(
|
||||
self.rank,
|
||||
gpu_id,
|
||||
cache_k,
|
||||
cache_v,
|
||||
)
|
||||
local_device_id = int(str(cache_k[0].place)[-2])
|
||||
logger.info(f"done create ipc_comm with local_device_id:{local_device_id}, ")
|
||||
|
||||
elif protocol == "rdma":
|
||||
logger.info(f"splitwise_role rdma: {self.splitwise_role}, rank: {self.rank}, gpu_id: {gpu_id}")
|
||||
|
||||
self.messager[protocol] = RDMACommManager(
|
||||
splitwise_role,
|
||||
rank,
|
||||
gpu_id,
|
||||
cache_k_ptr_list,
|
||||
cache_v_ptr_list,
|
||||
max_block_num,
|
||||
block_bytes,
|
||||
rdma_port,
|
||||
)
|
||||
|
||||
self.gpu_id = gpu_id
|
||||
self.cache_info = dict()
|
||||
self.rank_id = self.rank + local_data_parallel_id * self.nranks
|
||||
self.engine_cache_task_thread_lock = threading.Lock()
|
||||
self.engine_cache_tasks = [dict() for _ in range(512)]
|
||||
self.idx_cache_task_dict = {}
|
||||
self.cache_prefilled_engine_ids_queue = queue.Queue() # keep batch slot index for each prefill step
|
||||
if splitwise_role == "prefill":
|
||||
consume_signals_thread = threading.Thread(target=self.consume_signals)
|
||||
consume_signals_thread.daemon = True
|
||||
consume_signals_thread.start()
|
||||
add_cache_task_thread = threading.Thread(target=self._add_cache_task_thread)
|
||||
add_cache_task_thread.daemon = True
|
||||
add_cache_task_thread.start()
|
||||
|
||||
if self.splitwise_role != "mixed":
|
||||
connect_rdma_thread = threading.Thread(target=self._handle_connect_task)
|
||||
connect_rdma_thread.daemon = True
|
||||
connect_rdma_thread.start()
|
||||
|
||||
logger.info(f"cache messager init finished, use {transfer_protocol}")
|
||||
|
||||
def _add_cache_task_thread(self):
|
||||
while True:
|
||||
try:
|
||||
cache_info = self.engine_worker_queue.get_cache_info()
|
||||
self.engine_worker_queue.finish_add_cache_task_barrier.wait()
|
||||
finished_add_cache_task_req_ids = []
|
||||
if cache_info:
|
||||
for info in cache_info:
|
||||
if info["request_id"] in self.cache_info:
|
||||
self.cache_info[info["request_id"]].update(info)
|
||||
current_info = self.cache_info[info["request_id"]]
|
||||
assert "dest_block_ids" in current_info and "src_block_ids" in current_info
|
||||
finished_add_cache_task_req_ids.append(info["request_id"])
|
||||
decode_cached_block_num = len(current_info["src_block_ids"]) - len(
|
||||
current_info["dest_block_ids"]
|
||||
)
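# The decode instance already holds `decode_cached_block_num` blocks of this request,
# so the destination block list is left-padded with -1 placeholders to keep it aligned
# index-by-index with src_block_ids.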
|
||||
padding_decode_block_ids = [-1 for i in range(decode_cached_block_num)] + current_info[
|
||||
"dest_block_ids"
|
||||
]
|
||||
current_info["dest_block_ids"] = padding_decode_block_ids
|
||||
current_info["decode_cached_tokens"] = decode_cached_block_num * self.block_size
|
||||
current_info["sended_layer_id"] = -1
|
||||
current_info["sended_block_num"] = current_info["decode_cached_tokens"] // self.block_size
|
||||
current_info["status"] = "init"
|
||||
logger.info(f"finish add cache task: {current_info}")
|
||||
self.cache_info[info["request_id"]] = current_info
|
||||
self.idx_cache_task_dict[current_info["current_id"]] = current_info
|
||||
else:
|
||||
self.cache_info[info["request_id"]] = info
|
||||
if self.rank == 0 and finished_add_cache_task_req_ids:
|
||||
self.engine_worker_queue.put_finished_add_cache_task_req(finished_add_cache_task_req_ids)
|
||||
else:
|
||||
time.sleep(0.001)
|
||||
except Exception as e:
|
||||
logger.info(f"add cache task occured error: {e}, {traceback.format_exc()!s}.")
|
||||
|
||||
def prefill_layerwise_send_cache_thread(self):
|
||||
"""
|
||||
layerwise_send_cache_thread:
|
||||
send cache to other instance
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
engine_indexes = self.cache_prefilled_engine_ids_queue.get()
|
||||
self.engine_worker_queue.finish_request_barrier.wait()
|
||||
block_start_end_list = []
|
||||
current_prefilled_token_num_list = []
|
||||
for engine_index in engine_indexes:
|
||||
assert engine_index in self.idx_cache_task_dict
|
||||
block_id_start = self.idx_cache_task_dict[engine_index]["sended_block_num"]
|
||||
prefilled_token_num = self.engine_cache_tasks[engine_index]["prefilled_token_num"]
|
||||
if (
|
||||
prefilled_token_num == self.idx_cache_task_dict[engine_index]["need_prefill_tokens"]
|
||||
): # all chunks have been prefilled
|
||||
block_id_end = len(self.idx_cache_task_dict[engine_index]["src_block_ids"])
|
||||
else:
|
||||
block_id_end = prefilled_token_num // self.block_size # [block_id_start, block_id_end)
|
||||
block_start_end_list.append((block_id_start, block_id_end))
|
||||
current_prefilled_token_num_list.append(prefilled_token_num)
|
||||
while True: # from layer0 to last layer
|
||||
sended_layer_idx = self.idx_cache_task_dict[engine_indexes[0]]["sended_layer_id"]
|
||||
start_layer_idx = sended_layer_idx + 1
|
||||
with self.engine_cache_task_thread_lock: # to check end_layer_idx
|
||||
prefilled_layer_idx = self.engine_cache_tasks[engine_indexes[0]]["prefilled_layer_idx"]
|
||||
if sended_layer_idx > prefilled_layer_idx: # computation must in next chunk
|
||||
logger.info(
|
||||
f"current_prefilled_token_num_list[0] {current_prefilled_token_num_list[0]} prefilled_token_num {self.engine_cache_tasks[engine_indexes[0]]['prefilled_token_num']}"
|
||||
)
|
||||
assert (
|
||||
current_prefilled_token_num_list[0]
|
||||
< self.engine_cache_tasks[engine_indexes[0]]["prefilled_token_num"]
|
||||
), "when sended_layer_idx > prefilled_layer_idx, must be in next chunk, but not, sth wrong"
|
||||
end_layer_idx = self.num_layers - 1 # [start_layer_idx, end_layer_idx)
|
||||
else:
|
||||
end_layer_idx = prefilled_layer_idx
|
||||
if sended_layer_idx == prefilled_layer_idx: # computation not in next layer
|
||||
time.sleep(0.01)
|
||||
for layer_idx in range(start_layer_idx, end_layer_idx + 1):
|
||||
for i, (block_id_start, block_id_end) in enumerate(block_start_end_list):
|
||||
engine_index = engine_indexes[i]
|
||||
task = self.idx_cache_task_dict[engine_index]
|
||||
req_id = task["request_id"]
|
||||
if (
|
||||
block_id_start >= block_id_end
|
||||
): # no blocks need to transfer for this request in this chunk
|
||||
task["sended_layer_id"] += 1
|
||||
assert task["sended_layer_id"] == layer_idx
|
||||
if task["sended_layer_id"] == self.num_layers - 1:
|
||||
task["sended_layer_id"] = -1
|
||||
continue
|
||||
else:
|
||||
current_transfer_protocol = task["transfer_protocol"]
|
||||
if task["transfer_protocol"] == "rdma":
|
||||
target_ip = task["ip"]
|
||||
target_id = int(task["rdma_ports"][self.rank])
|
||||
if task["status"] == "error":
|
||||
continue
|
||||
status = self.messager[current_transfer_protocol].connect(target_ip, target_id)
|
||||
if not status:
|
||||
logger.error(f"connect to {target_ip}:{target_id} failed")
|
||||
task["status"] = "connection error"
|
||||
continue
|
||||
elif task["transfer_protocol"] == "ipc":
|
||||
target_ip = "0.0.0.0"
|
||||
target_id = int(task["device_ids"][self.rank])
|
||||
|
||||
src_block_ids = task["src_block_ids"][block_id_start:block_id_end]
|
||||
dest_block_ids = task["dest_block_ids"][block_id_start:block_id_end]
|
||||
src_block_ids = paddle.to_tensor(src_block_ids, dtype="int32", place="cpu")
|
||||
dest_block_ids = paddle.to_tensor(dest_block_ids, dtype="int32", place="cpu")
|
||||
|
||||
logger.info(
|
||||
f"start write cache for a layer, {req_id}, {layer_idx}, {target_ip}, {target_id}, block_id_start {block_id_start} block_id_end {block_id_end}"
|
||||
)
|
||||
tic = time.time()
|
||||
return_code = self.messager[current_transfer_protocol].write_cache(
|
||||
target_ip,
|
||||
target_id,
|
||||
src_block_ids,
|
||||
dest_block_ids,
|
||||
layer_idx,
|
||||
)
|
||||
if return_code != 0:
|
||||
task["status"] = "write cache error"
|
||||
logger.error(
|
||||
f"write cache failed, layer_idx: {layer_idx}, req_id: {req_id}, dest_ip: {target_ip}, block_id_start {block_id_start} block_id_end {block_id_end}"
|
||||
)
|
||||
tok = time.time()
|
||||
cost_time = tok - tic
|
||||
block_num = len(src_block_ids)
|
||||
avg_time_per_block = cost_time * 1000 / block_num # ms
|
||||
send_cache_speed = block_num * self.block_bytes / 1073741824 / cost_time # GB/s
|
||||
logger.debug(
|
||||
f"finish write cache for a layer, {req_id}, {layer_idx}, {target_ip}, {target_id},"
|
||||
f"block_num: {block_num}, send_cache_speed(GB/s): {round(send_cache_speed, 5)},"
|
||||
f"avg_time per block(ms): {round(avg_time_per_block, 5)} block_id_start {block_id_start} block_id_end {block_id_end}"
|
||||
)
|
||||
|
||||
task["sended_layer_id"] += 1
|
||||
assert task["sended_layer_id"] == layer_idx
|
||||
if task["sended_layer_id"] == self.num_layers - 1:
|
||||
self.idx_cache_task_dict[engine_index]["sended_block_num"] += (
|
||||
block_id_end - block_id_start
|
||||
)
|
||||
if current_prefilled_token_num_list[i] == task["need_prefill_tokens"]:
|
||||
if task["status"] != "error":
|
||||
task["status"] = "finished"
|
||||
logger.info(
|
||||
f"finish write cache for all layers, req_id: {req_id}, block_id_end {block_id_end} need_prefill_tokens {task['need_prefill_tokens']}"
|
||||
)
|
||||
else:
|
||||
task["sended_layer_id"] = -1
|
||||
if end_layer_idx == self.num_layers - 1:
|
||||
with self.engine_cache_task_thread_lock:
|
||||
for engine_idx in engine_indexes:
|
||||
task = self.idx_cache_task_dict[engine_idx]
|
||||
if task["status"] == "finished" or ("error" in task["status"]):
|
||||
target_id = int(task["rdma_ports"][self.rank])
|
||||
if task["transfer_protocol"] == "ipc":
|
||||
self.messager["ipc"].write_block_by_sync(target_id)
|
||||
if self.rank == 0:
|
||||
# TODO: make this robust under TP; we assume all ranks in the TP group share the same status (all fail or all succeed).
|
||||
self.engine_worker_queue.put_finished_req(
|
||||
[(task["request_id"], task["status"])]
|
||||
)
|
||||
logger.info(f"put write cache {task['request_id']}, status {task['status']}")
|
||||
self.engine_cache_tasks[task["current_id"]] = dict()
|
||||
del self.cache_info[task["request_id"]]
|
||||
del self.idx_cache_task_dict[task["current_id"]]
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"prefill layerwise send cache thread has exception: {e} {traceback.format_exc()!s}")
|
||||
time.sleep(0.01)
|
||||
|
||||
def consume_signals(self):
|
||||
paddle.device.set_device("cpu")
|
||||
kv_signal_data = paddle.full(shape=[512 * 3 + 2], fill_value=-1, dtype="int32")
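# Layout of kv_signal_data as filled by get_output_kv_signal:
#   [0]       number of tasks in this signal (-1 means nothing to consume yet)
#   [1]       layer id that has just finished prefilling
#   [3*i+2]   engine (batch slot) index of task i
#   [3*i+3]   chunk token offset of task i
#   [3*i+4]   current sequence length of task i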
|
||||
while True:
|
||||
try:
|
||||
get_output_kv_signal(kv_signal_data, self.rank_id, 0) # wait_flag
|
||||
if not self.cache_info:
|
||||
time.sleep(0.01)
|
||||
continue
|
||||
tasks_count = kv_signal_data[0]
|
||||
if tasks_count == -1:
|
||||
time.sleep(0.001)
|
||||
continue
|
||||
layer_id = kv_signal_data[1].numpy().tolist()
|
||||
if layer_id == self.num_layers - 1:
|
||||
logger.info(f"tasks_count: {tasks_count}, layer_id: {layer_id}")
|
||||
batch_engine_ids = []
|
||||
with self.engine_cache_task_thread_lock:
|
||||
for bi in range(tasks_count):
|
||||
engine_idx = kv_signal_data[3 * bi + 2].numpy().tolist()
|
||||
chuck_token_offset = kv_signal_data[3 * bi + 3].numpy().tolist()
|
||||
current_seq_len = kv_signal_data[3 * bi + 4].numpy().tolist()
|
||||
self.engine_cache_tasks[engine_idx]["prefilled_layer_idx"] = layer_id
|
||||
self.engine_cache_tasks[engine_idx]["prefilled_token_num"] = (
|
||||
chuck_token_offset + current_seq_len
|
||||
)
|
||||
batch_engine_ids.append(engine_idx)
|
||||
if layer_id == 0:
|
||||
self.cache_prefilled_engine_ids_queue.put(batch_engine_ids)
|
||||
except Exception as e:
|
||||
logger.error(f"Consume signals get exception: {e}")
|
||||
|
||||
def _handle_connect_task(self):
|
||||
while True:
|
||||
try:
|
||||
task = self.engine_worker_queue.get_connect_rdma_task()
|
||||
if task is None:
|
||||
time.sleep(0.001)
|
||||
continue
|
||||
logger.info(f"_handle_connect_task recv task: {task}")
|
||||
task_id = task["task_id"]
|
||||
ip, rdma_port = task["ip"], task["rdma_port"]
|
||||
status = self.messager["rdma"].connect(ip, rdma_port)
|
||||
if not status:
|
||||
response = {"task_id": task_id, "success": False}
|
||||
else:
|
||||
response = {"task_id": task_id, "success": True}
|
||||
self.engine_worker_queue.put_connect_rdma_task_response(response)
|
||||
except Exception as e:
|
||||
logger.error(f"handle_connect_task has exception: {e}")
|
||||
|
||||
|
||||
def main():
|
||||
device = args.device_id
|
||||
rank = args.rank
|
||||
paddle.set_device(f"gpu:{device}")
|
||||
cache_type = args.cache_dtype
|
||||
speculative_config = SpeculativeConfig(args.speculative_config)
|
||||
num_extra_layers = speculative_config.num_extra_cache_layer
|
||||
num_extra_layer_gpu_blocks = int(args.num_gpu_blocks * speculative_config.num_gpu_block_expand_ratio)
|
||||
gpu_cache_kvs = {}
|
||||
gpu_cache_k_tensors = []
|
||||
gpu_cache_v_tensors = []
|
||||
|
||||
for i in range(args.num_layers + num_extra_layers):
|
||||
num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else num_extra_layer_gpu_blocks
|
||||
|
||||
gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"] = paddle.full(
|
||||
shape=[
|
||||
num_gpu_blocks,
|
||||
args.kv_num_head,
|
||||
args.block_size,
|
||||
args.head_dim,
|
||||
],
|
||||
fill_value=0,
|
||||
dtype=cache_type,
|
||||
)
|
||||
gpu_cache_k_tensors.append(gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"])
|
||||
gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"] = paddle.full(
|
||||
shape=[
|
||||
num_gpu_blocks,
|
||||
args.kv_num_head,
|
||||
args.block_size,
|
||||
args.head_dim,
|
||||
],
|
||||
fill_value=0,
|
||||
dtype=cache_type,
|
||||
)
|
||||
gpu_cache_v_tensors.append(gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"])
|
||||
|
||||
set_data_ipc(
|
||||
gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"],
|
||||
f"key_caches_{i}_rank{rank}.device{device}",
|
||||
)
|
||||
set_data_ipc(
|
||||
gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"],
|
||||
f"value_caches_{i}_rank{rank}.device{device}",
|
||||
)
|
||||
cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in gpu_cache_kvs.items()])
|
||||
logger.info(f"device :{device}")
|
||||
logger.info(f"cache_kv_size_byte : {cache_kv_size_byte}")
|
||||
logger.info(f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}")
|
||||
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
cache_messager = CacheMessagerV1(
|
||||
splitwise_role=args.splitwise_role,
|
||||
transfer_protocol=args.protocol,
|
||||
pod_ip=args.pod_ip,
|
||||
engine_worker_queue_port=args.engine_worker_queue_port,
|
||||
local_data_parallel_id=args.local_data_parallel_id,
|
||||
gpu_cache_kvs=gpu_cache_kvs,
|
||||
rank=rank,
|
||||
nranks=args.mp_num,
|
||||
num_layers=args.num_layers + num_extra_layers,
|
||||
gpu_id=device,
|
||||
rdma_port=args.rdma_port,
|
||||
)
|
||||
else:
|
||||
cache_messager = CacheMessager(
|
||||
splitwise_role=args.splitwise_role,
|
||||
transfer_protocol=args.protocol,
|
||||
pod_ip=args.pod_ip,
|
||||
engine_worker_queue_port=args.engine_worker_queue_port,
|
||||
local_data_parallel_id=args.local_data_parallel_id,
|
||||
gpu_cache_kvs=gpu_cache_kvs,
|
||||
rank=rank,
|
||||
nranks=args.mp_num,
|
||||
num_layers=args.num_layers + num_extra_layers,
|
||||
gpu_id=device,
|
||||
rdma_port=args.rdma_port,
|
||||
)
|
||||
|
||||
cache_ready_signal_data = np.zeros(shape=[args.mp_num], dtype=np.int32)
|
||||
cache_ready_signal = IPCSignal(
|
||||
name="cache_ready_signal",
|
||||
array=cache_ready_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=args.engine_pid,
|
||||
create=False,
|
||||
)
|
||||
cache_ready_signal.value[rank] = 1
|
||||
if args.splitwise_role == "mixed":
|
||||
while True:
|
||||
time.sleep(1)
|
||||
cache_messager.prefill_layerwise_send_cache_thread()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
args = parse_args()
|
||||
rank_id = args.rank + args.local_data_parallel_id * args.mp_num
|
||||
logger = get_logger("cache_messager", f"cache_messager_rank{rank_id}.log")
|
||||
|
||||
logger.info("create cache messager...")
|
||||
logger.info(f"{args}")
|
||||
main()
|
||||
|
@@ -16,27 +16,21 @@
|
||||
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import gc
|
||||
import json
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.cache_manager.cache_data import CacheStatus
|
||||
from fastdeploy.config import SpeculativeConfig
|
||||
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal, KVCacheStatus
|
||||
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
|
||||
from fastdeploy.model_executor.ops.gpu import (
|
||||
cuda_host_alloc,
|
||||
cuda_host_free,
|
||||
set_data_ipc,
|
||||
share_external_data,
|
||||
swap_cache_all_layers,
|
||||
unset_data_ipc,
|
||||
)
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
@@ -99,7 +93,6 @@ def parse_args():
|
||||
help="speculative config",
|
||||
)
|
||||
parser.add_argument("--local_data_parallel_id", type=int, default=0)
|
||||
parser.add_argument("--create_cache_tensor", action="store_true")
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
@@ -117,6 +110,7 @@ class CacheTransferManager:
|
||||
|
||||
device = args.device_id
|
||||
rank = args.rank
|
||||
paddle.set_device(f"gpu:{device}")
|
||||
self.gpu_cache_kvs = {}
|
||||
self.cpu_cache_kvs = {}
|
||||
self.gpu_cache_k_tensors = []
|
||||
@@ -132,7 +126,6 @@ class CacheTransferManager:
|
||||
self.n_ranks = args.mp_num
|
||||
self.rank = rank
|
||||
self.device = device
|
||||
self.engine_pid = args.engine_pid
|
||||
|
||||
address = (args.pod_ip, args.cache_queue_port)
|
||||
self.cache_task_queue = EngineCacheQueue(
|
||||
@@ -143,27 +136,92 @@ class CacheTransferManager:
|
||||
local_data_parallel_id=args.local_data_parallel_id,
|
||||
)
|
||||
|
||||
self.num_cpu_blocks = args.num_cpu_blocks
|
||||
|
||||
cache_type = args.cache_dtype
|
||||
for i in range(args.num_layers + self.num_extra_layers):
|
||||
num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks
|
||||
|
||||
self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"] = paddle.full(
|
||||
shape=[
|
||||
num_gpu_blocks,
|
||||
args.kv_num_head,
|
||||
args.block_size,
|
||||
args.head_dim,
|
||||
],
|
||||
fill_value=0,
|
||||
dtype=cache_type,
|
||||
)
|
||||
self.gpu_cache_k_tensors.append(self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"])
|
||||
self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"] = paddle.full(
|
||||
shape=[
|
||||
num_gpu_blocks,
|
||||
args.kv_num_head,
|
||||
args.block_size,
|
||||
args.head_dim,
|
||||
],
|
||||
fill_value=0,
|
||||
dtype=cache_type,
|
||||
)
|
||||
self.gpu_cache_v_tensors.append(self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"])
|
||||
|
||||
set_data_ipc(
|
||||
self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"],
|
||||
f"key_caches_{i}_rank{rank}.device{device}",
|
||||
)
|
||||
set_data_ipc(
|
||||
self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"],
|
||||
f"value_caches_{i}_rank{rank}.device{device}",
|
||||
)
|
||||
cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()])
|
||||
logger.info(f"device :{self.device}")
|
||||
logger.info(f"cache_kv_size_byte : {cache_kv_size_byte}")
|
||||
logger.info(f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}")
|
||||
|
||||
paddle.set_device("cpu")
|
||||
self.k_dst_ptrs = []
|
||||
self.v_dst_ptrs = []
|
||||
for i in range(args.num_layers + self.num_extra_layers):
|
||||
self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"] = cuda_host_alloc(
|
||||
args.num_cpu_blocks * args.bytes_per_layer_per_block
|
||||
)
|
||||
self.k_dst_ptrs.append(self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"])
|
||||
self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"] = cuda_host_alloc(
|
||||
args.num_cpu_blocks * args.bytes_per_layer_per_block
|
||||
)
|
||||
self.v_dst_ptrs.append(self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"])
|
||||
|
||||
cache_ready_signal_data = np.zeros(shape=[args.mp_num], dtype=np.int32)
|
||||
self.cache_ready_signal = IPCSignal(
|
||||
name="cache_ready_signal",
|
||||
array=cache_ready_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=self.engine_pid,
|
||||
create=False,
|
||||
)
|
||||
swap_space_ready_data = np.zeros(shape=[args.mp_num], dtype=np.int32)
|
||||
self.swap_space_ready_signal = IPCSignal(
|
||||
name="swap_space_ready_signal",
|
||||
array=swap_space_ready_data,
|
||||
dtype=np.int32,
|
||||
suffix=self.engine_pid,
|
||||
suffix=args.engine_pid,
|
||||
create=False,
|
||||
)
|
||||
self.cache_ready_signal.value[self.rank] = 1
|
||||
|
||||
self.num_cpu_blocks = args.num_cpu_blocks
|
||||
paddle.set_device(f"gpu:{device}")
|
||||
if args.enable_splitwise:
|
||||
logger.debug("create cache messager...")
|
||||
logger.info(f"{args}")
|
||||
from fastdeploy.cache_manager.cache_messager import CacheMessager
|
||||
|
||||
self._init_cpu_cache(args)
|
||||
self._init_gpu_cache(args)
|
||||
self.cache_messager = CacheMessager(
|
||||
splitwise_role=args.splitwise_role,
|
||||
transfer_protocol=args.protocol,
|
||||
pod_ip=args.pod_ip,
|
||||
engine_worker_queue_port=args.engine_worker_queue_port,
|
||||
local_data_parallel_id=args.local_data_parallel_id,
|
||||
gpu_cache_kvs=self.gpu_cache_kvs,
|
||||
rank=self.rank,
|
||||
nranks=args.mp_num,
|
||||
num_layers=args.num_layers + self.num_extra_layers,
|
||||
gpu_id=self.device,
|
||||
rdma_port=args.rdma_port,
|
||||
)
|
||||
logger.info("successfully create cache messager")
|
||||
logger.info(f"done init CacheMessager gmem alloc : {paddle.device.cuda.memory_allocated()}")
|
||||
|
||||
cache_task_broadcast_data = np.zeros(shape=[1], dtype=np.int32)
|
||||
self.cache_task_broadcast_signal = IPCSignal(
|
||||
@@ -174,76 +232,6 @@ class CacheTransferManager:
|
||||
create=False,
|
||||
)
|
||||
|
||||
threading.Thread(target=self.clear_or_update_caches, args=[args], daemon=True).start()
|
||||
|
||||
def _init_gpu_cache(self, args):
|
||||
|
||||
if not args.create_cache_tensor:
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] Waiting for runners to create kv cache.")
|
||||
while self.cache_ready_signal.value[self.rank] != 1:
|
||||
time.sleep(0.1)
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] OK! Stop waiting.")
|
||||
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] Initializing kv cache for all layers.")
|
||||
paddle.set_device(f"gpu:{self.device}")
|
||||
for i in range(args.num_layers + self.num_extra_layers):
|
||||
num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks
|
||||
cache_shape = [num_gpu_blocks, args.kv_num_head, args.block_size, args.head_dim]
|
||||
key_name = f"key_caches_{i}_rank{self.rank}.device{self.device}"
|
||||
val_name = f"value_caches_{i}_rank{self.rank}.device{self.device}"
|
||||
|
||||
if args.create_cache_tensor:
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] ..creating kv cache for layer {i}: {cache_shape}")
|
||||
key_cache = paddle.full(shape=cache_shape, fill_value=0, dtype=args.cache_dtype)
|
||||
val_cache = paddle.full(shape=cache_shape, fill_value=0, dtype=args.cache_dtype)
|
||||
set_data_ipc(key_cache, key_name)
|
||||
set_data_ipc(val_cache, val_name)
|
||||
else:
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] ..attaching kv cache for layer {i}: {cache_shape}")
|
||||
key_cache = paddle.empty(shape=[], dtype=args.cache_dtype)
|
||||
val_cache = paddle.empty(shape=[], dtype=args.cache_dtype)
|
||||
key_cache = share_external_data(key_cache, key_name, cache_shape)
|
||||
val_cache = share_external_data(val_cache, val_name, cache_shape)
|
||||
|
||||
self.gpu_cache_kvs[key_name] = key_cache
|
||||
self.gpu_cache_kvs[val_name] = val_cache
|
||||
self.gpu_cache_k_tensors.append(self.gpu_cache_kvs[key_name])
|
||||
self.gpu_cache_v_tensors.append(self.gpu_cache_kvs[val_name])
|
||||
|
||||
if args.create_cache_tensor:
|
||||
logger.info("[rank {self.rank}/{self.n_ranks}] ✅ kv cache is ready!")
|
||||
self.cache_ready_signal.value[self.rank] = 1
|
||||
|
||||
cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()])
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] device :{self.device}")
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] cache_kv_size_byte : {cache_kv_size_byte}")
|
||||
logger.info(
|
||||
f"[rank {self.rank}/{self.n_ranks}] done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}"
|
||||
)
|
||||
|
||||
def _init_cpu_cache(self, args):
|
||||
if args.num_cpu_blocks == 0:
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] 💡 no swap space (cpu cache) is specified.")
|
||||
self.swap_space_ready_signal.value[self.rank] = 1
|
||||
return
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] Initializing swap space (cpu cache) for all layers.")
|
||||
paddle.set_device("cpu")
|
||||
self.k_dst_ptrs = []
|
||||
self.v_dst_ptrs = []
|
||||
for i in range(args.num_layers + self.num_extra_layers):
|
||||
key_name = f"key_caches_{i}_rank{self.rank}"
|
||||
val_name = f"value_caches_{i}_rank{self.rank}"
|
||||
need_to_allocate_bytes = args.num_cpu_blocks * args.bytes_per_layer_per_block
|
||||
logger.info(
|
||||
f"[rank {self.rank}/{self.n_ranks}] ..creating cpu cache for layer {i}: {2 * need_to_allocate_bytes / 1024 ** 3:.2f}GB"
|
||||
)
|
||||
self.cpu_cache_kvs[key_name] = cuda_host_alloc(need_to_allocate_bytes)
|
||||
self.k_dst_ptrs.append(self.cpu_cache_kvs[key_name])
|
||||
self.cpu_cache_kvs[val_name] = cuda_host_alloc(need_to_allocate_bytes)
|
||||
self.v_dst_ptrs.append(self.cpu_cache_kvs[val_name])
|
||||
logger.info(f"[rank {self.rank}/{self.n_ranks}] ✅ swap space (cpu cache) is ready!")
|
||||
self.swap_space_ready_signal.value[self.rank] = 1
|
||||
|
||||
def _do_swap_to_cpu_task(
|
||||
self,
|
||||
swap_node_ids,
|
||||
@@ -441,92 +429,6 @@ class CacheTransferManager:
|
||||
transfer_task_id,
|
||||
)
|
||||
|
||||
def clear_or_update_caches(self, args):
|
||||
logger.info("Start a thread to clear/restore kv cache when model weights are cleared/updated.")
|
||||
logger.info(f"FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}")
|
||||
kv_cache_status = np.zeros([1], dtype=np.int32)
|
||||
kv_cache_status_signal = IPCSignal(
|
||||
name="kv_cache_status",
|
||||
array=kv_cache_status,
|
||||
dtype=np.int32,
|
||||
suffix=self.engine_pid,
|
||||
create=False,
|
||||
)
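# Simple state machine driven by kv_cache_status_signal:
#   CLEARING -> free the cpu swap space (if enabled) and the gpu caches, then mark CLEARED
#   UPDATING -> rebuild cpu/gpu caches via _init_cpu_cache/_init_gpu_cache, then mark NORMAL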
|
||||
while True:
|
||||
if kv_cache_status_signal.value[0] == KVCacheStatus.CLEARING:
|
||||
try:
|
||||
logger.info(
|
||||
f"[rank {self.rank}/{self.n_ranks}] Start clearing caches {self.cache_ready_signal.value}"
|
||||
)
|
||||
# clear cpu caches
|
||||
if envs.FD_ENABLE_SWAP_SPACE_CLEARING:
|
||||
paddle.set_device("cpu")
|
||||
for ptrs in self.k_dst_ptrs + self.v_dst_ptrs:
|
||||
cuda_host_free(ptrs)
|
||||
self.cpu_cache_kvs.clear()
|
||||
self.k_dst_ptrs.clear()
|
||||
self.v_dst_ptrs.clear()
|
||||
gc.collect()
|
||||
# reset swap_space_ready_signal
|
||||
self.swap_space_ready_signal.value[self.rank] = 0
|
||||
while np.sum(self.swap_space_ready_signal.value) != 0:
|
||||
time.sleep(0.1)
|
||||
|
||||
# clear gpu caches
|
||||
paddle.set_device(f"gpu:{self.device}")
|
||||
for name, tensor in self.gpu_cache_kvs.items():
|
||||
unset_data_ipc(tensor, name, True, False)
|
||||
self.gpu_cache_kvs.clear()
|
||||
self.gpu_cache_k_tensors.clear()
|
||||
self.gpu_cache_v_tensors.clear()
|
||||
|
||||
# reset cache_ready_signal
|
||||
self.cache_ready_signal.value[self.rank] = 0
|
||||
logger.info(
|
||||
f"[rank {self.rank}/{self.n_ranks}] Finish clearing caches {self.cache_ready_signal.value}"
|
||||
)
|
||||
|
||||
# wait for all ranks caches to be cleared
|
||||
if np.sum(self.cache_ready_signal.value) != 0:
|
||||
time.sleep(0.1)
|
||||
|
||||
# reset kv_cache_status_signal
|
||||
kv_cache_status_signal.value[0] = KVCacheStatus.CLEARED
|
||||
logger.info("All ranks finish clearing caches")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to clear caches: {e}")
|
||||
|
||||
elif kv_cache_status_signal.value[0] == KVCacheStatus.UPDATING:
|
||||
try:
|
||||
logger.info(
|
||||
f"[rank {self.rank}/{self.n_ranks}] Start restoring caches {self.cache_ready_signal.value}"
|
||||
)
|
||||
# restore cpu cache
|
||||
if envs.FD_ENABLE_SWAP_SPACE_CLEARING:
|
||||
self._init_cpu_cache(args)
|
||||
while np.sum(self.swap_space_ready_signal.value) != args.mp_num:
|
||||
time.sleep(0.1)
|
||||
|
||||
# restore gpu cache and set cache_ready_signal
|
||||
self._init_gpu_cache(args)
|
||||
logger.info(
|
||||
f"[rank {self.rank}/{self.n_ranks}] Finish restoring caches {self.cache_ready_signal.value}"
|
||||
)
|
||||
|
||||
# wait for all ranks caches to be ready
|
||||
while np.sum(self.cache_ready_signal.value) != args.mp_num:
|
||||
time.sleep(0.1)
|
||||
|
||||
# set kv_cache_status_signal
|
||||
logger.info("All ranks finish restoring caches")
|
||||
kv_cache_status_signal.value[0] = KVCacheStatus.NORMAL
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to restore caches: {e}")
|
||||
|
||||
time.sleep(0.1)
|
||||
|
||||
|
||||
def main():
|
||||
"""
|
||||
@@ -541,7 +443,5 @@ def main():
|
||||
if __name__ == "__main__":
|
||||
|
||||
args = parse_args()
|
||||
rank_id = args.rank + args.local_data_parallel_id * args.mp_num
|
||||
logger = get_logger("cache_transfer_manager", f"cache_transfer_manager_rank{rank_id}.log")
|
||||
paddle.set_device(f"gpu:{args.device_id}")
|
||||
logger = get_logger("cache_transfer_manager", "cache_transfer_manager.log")
|
||||
main()
|
||||
|
@@ -31,7 +31,7 @@ import numpy as np
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.cache_manager.cache_data import BlockNode, CacheStatus
|
||||
from fastdeploy.cache_manager.cache_metrics import CacheMetrics
|
||||
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal, PrefixTreeStatus
|
||||
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
|
||||
from fastdeploy.metrics.metrics import main_process_metrics
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
@@ -71,7 +71,6 @@ class PrefixCacheManager:
|
||||
else:
|
||||
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
|
||||
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
|
||||
|
||||
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
|
||||
if self.num_cpu_blocks > 0:
|
||||
self.cpu_free_block_list = list(range(self.num_cpu_blocks - 1, -1, -1))
|
||||
@@ -79,7 +78,6 @@ class PrefixCacheManager:
|
||||
self.cpu_free_block_list = []
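# heapify keeps both free-block-id lists as min-heaps, so the smallest available
# block id always sits at the top of each heap.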
|
||||
heapq.heapify(self.gpu_free_block_list)
|
||||
heapq.heapify(self.cpu_free_block_list)
|
||||
|
||||
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
|
||||
|
||||
self.radix_tree_root = BlockNode(-1, [], 0, 0, -1, 0, None, None, None)
|
||||
@@ -113,10 +111,6 @@ class PrefixCacheManager:
|
||||
+ f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}"
|
||||
)
|
||||
|
||||
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
|
||||
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
|
||||
main_process_metrics.available_gpu_resource.set(1.0)
|
||||
|
||||
@property
|
||||
def available_gpu_resource(self):
|
||||
return len(self.gpu_free_block_list) / self.num_gpu_blocks if self.num_gpu_blocks > 0 else 0.0
|
||||
@@ -129,7 +123,6 @@ class PrefixCacheManager:
|
||||
pod_ip,
|
||||
engine_worker_queue_port,
|
||||
pid_suffix,
|
||||
create_cache_tensor,
|
||||
):
|
||||
"""
|
||||
launch_cache_manager function used to initialize the cache manager.
|
||||
@@ -140,7 +133,7 @@ class PrefixCacheManager:
|
||||
name="cache_task_broadcast_signal",
|
||||
array=broadcast_cache_task_flag_array,
|
||||
dtype=np.int32,
|
||||
suffix=engine_worker_queue_port,
|
||||
suffix=pid_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
@@ -157,20 +150,6 @@ class PrefixCacheManager:
|
||||
filename = "cache_transfer_manager.py"
|
||||
py_path = os.path.join(current_dir_path, filename)
|
||||
|
||||
cache_messager_processes = []
|
||||
if self.enable_splitwise:
|
||||
cache_messager_processes = self.launch_cache_messager(
|
||||
cache_config,
|
||||
tensor_parallel_size,
|
||||
device_ids,
|
||||
pod_ip,
|
||||
engine_worker_queue_port,
|
||||
pid_suffix,
|
||||
)
|
||||
if cache_messager_processes is None:
|
||||
raise RuntimeError("Launch cache messager failed")
|
||||
return []
|
||||
|
||||
if (
|
||||
hasattr(cache_config.model_cfg, "num_key_value_heads")
|
||||
and hasattr(cache_config.model_cfg, "num_key_value_heads")
|
||||
@@ -181,41 +160,20 @@ class PrefixCacheManager:
|
||||
else:
|
||||
kv_num_head = cache_config.model_cfg.num_attention_heads // tensor_parallel_size
|
||||
kv_num_head = max(1, kv_num_head)
|
||||
|
||||
cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
|
||||
self.cache_ready_signal = IPCSignal(
|
||||
name="cache_ready_signal",
|
||||
array=cache_ready_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=engine_worker_queue_port,
|
||||
create=False,
|
||||
suffix=pid_suffix,
|
||||
create=True,
|
||||
)
|
||||
swap_space_ready_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
|
||||
self.swap_space_ready_signal = IPCSignal(
|
||||
name="swap_space_ready_signal",
|
||||
array=swap_space_ready_data,
|
||||
dtype=np.int32,
|
||||
suffix=engine_worker_queue_port,
|
||||
create=False,
|
||||
)
|
||||
prefix_tree_status = np.zeros([1], dtype=np.int32)
|
||||
self.prefix_tree_status_signal = IPCSignal(
|
||||
name="prefix_tree_status",
|
||||
array=prefix_tree_status,
|
||||
dtype=np.int32,
|
||||
suffix=engine_worker_queue_port,
|
||||
create=False,
|
||||
)
|
||||
|
||||
# Run command to launch cache transfer managers
|
||||
logger.info(f"create_cache_tensor: {create_cache_tensor}")
|
||||
log_dir = envs.FD_LOG_DIR
|
||||
cache_manager_processes = []
|
||||
for i in range(tensor_parallel_size):
|
||||
launch_cmd = (
|
||||
"FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
|
||||
+ " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0"
|
||||
+ f" FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}"
|
||||
+ f" {sys.executable} {py_path}"
|
||||
+ f" --device_id {int(device_ids[i])}"
|
||||
+ f" --rank {i}"
|
||||
@@ -238,104 +196,24 @@ class PrefixCacheManager:
|
||||
+ f" --local_data_parallel_id {self.local_data_parallel_id}"
|
||||
+ f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}"
|
||||
+ f" --speculative_config '{self.speculative_config.to_json_string()}'"
|
||||
+ (" --create_cache_tensor" if create_cache_tensor else "")
|
||||
+ f" >{log_dir}/launch_cache_manager_{int(device_ids[i])}.log 2>&1"
|
||||
)
|
||||
logger.info(f"Launch cache transfer manager, command:{launch_cmd}")
|
||||
cache_manager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid))
|
||||
|
||||
logger.info("PrefixCacheManager is waiting for kv cache to be initialized.")
|
||||
# Wait for the kv cache to finish initializing
|
||||
logger.info("Waiting for cache transfer manager ready...")
|
||||
while np.sum(self.cache_ready_signal.value) != tensor_parallel_size:
|
||||
time.sleep(1)
|
||||
|
||||
if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0:
|
||||
while np.sum(self.swap_space_ready_signal.value) != tensor_parallel_size:
|
||||
time.sleep(1)
|
||||
|
||||
exit_code = cache_manager_processes[-1].poll()
|
||||
if exit_code is None:
|
||||
logger.info("Launch cache transfer manager successful")
|
||||
else:
|
||||
logger.info("Launch cache transfer manager failed, see launch_cache_manager.log for more information")
|
||||
|
||||
# Start additional threads
|
||||
if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0:
|
||||
logger.info("Enable hierarchical cache.")
|
||||
threading.Thread(target=self.recv_data_transfer_result).start()
|
||||
if cache_config.enable_prefix_caching:
|
||||
threading.Thread(target=self.clear_prefix_cache, daemon=True).start()
|
||||
|
||||
all_cache_processes = cache_messager_processes + cache_manager_processes
|
||||
return all_cache_processes
|
||||
|
||||
def launch_cache_messager(
|
||||
self, cache_config, tensor_parallel_size, device_ids, pod_ip, engine_worker_queue_port, pid_suffix
|
||||
):
|
||||
"""
|
||||
launch_cache_messager function used to initialize the cache messager.
|
||||
"""
|
||||
current_dir_path = os.path.split(os.path.abspath(__file__))[0]
|
||||
filename = "cache_messager.py"
|
||||
if (
|
||||
hasattr(cache_config.model_cfg, "num_key_value_heads")
|
||||
and hasattr(cache_config.model_cfg, "num_key_value_heads")
|
||||
and cache_config.model_cfg.num_key_value_heads is not None
|
||||
and int(cache_config.model_cfg.num_key_value_heads) > 0
|
||||
):
|
||||
kv_num_head = int(cache_config.model_cfg.num_key_value_heads) // tensor_parallel_size
|
||||
else:
|
||||
kv_num_head = cache_config.model_cfg.num_attention_heads // tensor_parallel_size
|
||||
|
||||
cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
|
||||
self.cache_ready_signal = IPCSignal(
|
||||
name="cache_ready_signal",
|
||||
array=cache_ready_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=pid_suffix,
|
||||
create=False,
|
||||
)
|
||||
|
||||
py_path = os.path.join(current_dir_path, filename)
|
||||
log_dir = envs.FD_LOG_DIR
|
||||
cache_messager_processes = []
|
||||
for i in range(tensor_parallel_size):
|
||||
launch_cmd = (
|
||||
"FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
|
||||
+ " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0"
|
||||
+ f" {sys.executable} {py_path}"
|
||||
+ f" --device_id {int(device_ids[i])}"
|
||||
+ f" --rank {i}"
|
||||
+ f" --splitwise_role {self.splitwise_role}"
|
||||
+ f" --num_layers {cache_config.model_cfg.num_hidden_layers}"
|
||||
+ f" --head_dim {cache_config.model_cfg.head_dim}"
|
||||
+ f" --kv_num_head {kv_num_head}"
|
||||
+ f" --mp_num {tensor_parallel_size}"
|
||||
+ f" --cache_dtype {cache_config.cache_dtype}"
|
||||
+ f" --pod_ip {pod_ip}"
|
||||
+ f" --cache_queue_port {cache_config.cache_queue_port}"
|
||||
+ f" --engine_worker_queue_port {engine_worker_queue_port}"
|
||||
+ f" --num_gpu_blocks {cache_config.total_block_num}"
|
||||
+ f" --block_size {cache_config.block_size}"
|
||||
+ f" --protocol {cache_config.cache_transfer_protocol}"
|
||||
+ f" --local_data_parallel_id {self.local_data_parallel_id}"
|
||||
+ f" --engine_pid {pid_suffix}"
|
||||
+ f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}"
|
||||
+ f" --speculative_config '{self.speculative_config.to_json_string()}'"
|
||||
+ f" >{log_dir}/launch_cache_messager_{int(device_ids[i])}.log 2>&1"
|
||||
)
|
||||
logger.info(f"Launch cache messager, command:{launch_cmd}")
|
||||
cache_messager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid))
|
||||
|
||||
logger.info("Waiting for cache ready...")
|
||||
while np.sum(self.cache_ready_signal.value) != tensor_parallel_size:
|
||||
time.sleep(1)
|
||||
exit_code = cache_messager_processes[-1].poll()
|
||||
if exit_code is None:
|
||||
logger.info("Launch cache messager successful")
|
||||
else:
|
||||
logger.info("Launch cache messager failed, see launch_cache_messager.log for more information")
|
||||
cache_messager_processes = None
|
||||
return cache_messager_processes
|
||||
self._enable_cpu_cache()
|
||||
return cache_manager_processes
|
||||
|
||||
def update_cache_config(self, cache_config):
|
||||
"""
|
||||
@@ -357,9 +235,23 @@ class PrefixCacheManager:
|
||||
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
|
||||
|
||||
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
|
||||
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
|
||||
main_process_metrics.available_gpu_resource.set(1.0)
|
||||
|
||||
def _enable_cpu_cache(self):
|
||||
"""
|
||||
_enable_cpu_cache function used to enable cpu cache.
|
||||
"""
|
||||
|
||||
# ipc_cache_queue_port = self.cache_config.cache_queue_port
|
||||
# self.cache_task_queue = CacheQueueManager(
|
||||
# rank=0,
|
||||
# mp_num=tensor_parallel_size,
|
||||
# port=ipc_cache_queue_port,
|
||||
# )
|
||||
# 开启获取传输任务结果的监听线程
|
||||
self.transfer_recv_thread = threading.Thread(target=self.recv_data_transfer_result)
|
||||
self.transfer_recv_thread.start()
|
||||
|
||||
def can_allocate_gpu_blocks(self, num_blocks: int):
|
||||
"""
|
||||
Check if num_blocks gpu blocks can be allocated.
|
||||
@@ -1403,70 +1295,3 @@ class PrefixCacheManager:
|
||||
except Exception as e:
|
||||
logger.warning(f"recv_data_transfer_result: error: {e}, {str(traceback.format_exc())}")
|
||||
raise e
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset the RadixTree.
|
||||
"""
|
||||
|
||||
if len(self.node_map) == 0:
|
||||
return
|
||||
|
||||
logger.info("Resetting the RadixTree!")
|
||||
|
||||
# wait for swap tasks to finish
|
||||
if self.gpu_free_task_future is not None:
|
||||
self.gpu_free_task_future.result()
|
||||
self.gpu_free_task_future = None
|
||||
for event in list(self.task_swapping_event.values()):
|
||||
event.wait()
|
||||
self.task_swapping_event.clear()
|
||||
|
||||
# clear node map
|
||||
self.node_map.clear()
|
||||
self.req_leaf_map.clear()
|
||||
self.leaf_req_map.clear()
|
||||
self.unfilled_req_block_map.clear()
|
||||
self.cache_info.clear()
|
||||
|
||||
# reset gpu cache data structure
|
||||
self.gpu_lru_leaf_heap.clear()
|
||||
self.gpu_lru_leaf_set.clear()
|
||||
|
||||
# reset cpu cache data structure
|
||||
self.cpu_lru_leaf_heap.clear()
|
||||
self.cpu_lru_leaf_set.clear()
|
||||
|
||||
# reset gpu/cpu free block list
|
||||
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
|
||||
if self.num_cpu_blocks > 0:
|
||||
self.cpu_free_block_list = list(range(self.num_cpu_blocks - 1, -1, -1))
|
||||
else:
|
||||
self.cpu_free_block_list = []
|
||||
heapq.heapify(self.gpu_free_block_list)
|
||||
heapq.heapify(self.cpu_free_block_list)
|
||||
|
||||
# reset node/tree
|
||||
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
|
||||
self.radix_tree_root = BlockNode(-1, [], 0, 0, -1, 0, None, None, None)
|
||||
|
||||
# reset metrics
|
||||
self.metrics.reset_metrics()
|
||||
main_process_metrics.free_gpu_block_num.set(len(self.gpu_free_block_list))
|
||||
main_process_metrics.available_gpu_resource.set(self.available_gpu_resource)
|
||||
|
||||
def clear_prefix_cache(self):
|
||||
"""
|
||||
If the model weights status is updating or clearing, reset prefix cache tree
|
||||
"""
|
||||
logger.info("Start a thread to clear prefix cache when model weights are cleared.")
|
||||
prefix_tree_status_signal = self.prefix_tree_status_signal
|
||||
while True:
|
||||
if prefix_tree_status_signal.value[0] == PrefixTreeStatus.CLEARING:
|
||||
self.reset()
|
||||
prefix_tree_status_signal.value[0] = PrefixTreeStatus.CLEARED
|
||||
logger.info("Prefix cache tree is cleared.")
|
||||
if prefix_tree_status_signal.value[0] == PrefixTreeStatus.UPDATING:
|
||||
prefix_tree_status_signal.value[0] = PrefixTreeStatus.NORMAL
|
||||
logger.info("Prefix cache tree is updated.")
|
||||
time.sleep(0.01)
|
||||
|
@@ -61,12 +61,18 @@ class RDMACommManager:
        Connect to remote gpu and write cache.
        """
        assert self.splitwise_role == "prefill", "only prefill can call this method"
        addr = f"{ip}:{port!s}"
        if addr in self.connected_rdma:
            return True
        ret = self.messager.is_connected(ip, str(port))
        if ret:
            self.connected_rdma.add(addr)
            return True

        ret = self.messager.connect(ip, str(port))
        logger.info(f"connect to remote rdma address {ip}:{port} status is {ret}")
        if ret == 0:
            self.connected_rdma.add(addr)
        return ret == 0
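The connect path above is idempotent: already-connected peers are remembered, so repeated calls are cheap. A minimal sketch of that caching idea (the messager object here is a stand-in, not the real RDMA messager):

# Sketch of connection caching keyed on "ip:port"; `messager` is assumed to
# expose is_connected()/connect() with the same return conventions as above.
class ConnCache:
    def __init__(self, messager):
        self.messager = messager
        self.connected = set()

    def connect(self, ip: str, port: int) -> bool:
        addr = f"{ip}:{port}"
        if addr in self.connected:
            return True
        if self.messager.is_connected(ip, str(port)):
            self.connected.add(addr)
            return True
        ok = self.messager.connect(ip, str(port)) == 0  # 0 means success, as above
        if ok:
            self.connected.add(addr)
        return ok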
|
||||
|
||||
def write_cache(self, ip, port, local_block_ids, remote_block_ids, layer_idx):
|
||||
|
@@ -18,14 +18,12 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import field
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Literal, Optional, Union
|
||||
|
||||
import paddle
|
||||
import paddle.distributed as dist
|
||||
from paddleformers.transformers.configuration_utils import PretrainedConfig
|
||||
from typing_extensions import assert_never
|
||||
|
||||
import fastdeploy
|
||||
from fastdeploy import envs
|
||||
@@ -33,68 +31,11 @@ from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfig
|
||||
from fastdeploy.multimodal.registry import MultimodalRegistry
|
||||
from fastdeploy.platforms import current_platform
|
||||
from fastdeploy.scheduler import SchedulerConfig
|
||||
from fastdeploy.transformer_utils.config import get_pooling_config
|
||||
from fastdeploy.utils import ceil_div, check_unified_ckpt, get_host_ip, get_logger
|
||||
|
||||
logger = get_logger("config", "config.log")
|
||||
|
||||
TaskOption = Literal["auto", "generate", "embedding", "embed"]
|
||||
|
||||
RunnerType = Literal["generate", "pooling"]
|
||||
|
||||
RunnerOption = Literal["auto", "generate", "pooling"]
|
||||
|
||||
ConvertOption = Literal["auto", "none", "embed"]
|
||||
|
||||
ConvertType = Literal["none", "embed"]
|
||||
|
||||
_ResolvedTask = Literal["generate", "encode", "embed"]
|
||||
|
||||
_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
|
||||
"generate": [],
|
||||
"pooling": ["embed"],
|
||||
}
|
||||
|
||||
# Some model suffixes are based on auto classes from Transformers:
|
||||
# https://huggingface.co/docs/transformers/en/model_doc/auto
|
||||
# NOTE: Items higher on this list take priority over lower ones
|
||||
_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
|
||||
("ForCausalLM", ("generate", "none")),
|
||||
("ForConditionalGeneration", ("generate", "none")),
|
||||
("ChatModel", ("generate", "none")),
|
||||
("LMHeadModel", ("generate", "none")),
|
||||
("ForTextEncoding", ("pooling", "embed")),
|
||||
("EmbeddingModel", ("pooling", "embed")),
|
||||
("ForSequenceClassification", ("pooling", "classify")),
|
||||
("ForAudioClassification", ("pooling", "classify")),
|
||||
("ForImageClassification", ("pooling", "classify")),
|
||||
("ForVideoClassification", ("pooling", "classify")),
|
||||
("ClassificationModel", ("pooling", "classify")),
|
||||
("ForRewardModeling", ("pooling", "reward")),
|
||||
("RewardModel", ("pooling", "reward")),
|
||||
# Let other `*Model`s take priority
|
||||
("Model", ("pooling", "embed")),
|
||||
]
|
||||
|
||||
|
||||
def iter_architecture_defaults():
    yield from _SUFFIX_TO_DEFAULTS


def try_match_architecture_defaults(
    architecture: str,
    *,
    runner_type: Optional[RunnerType] = None,
    convert_type: Optional[ConvertType] = None,
):
    for suffix, (default_runner_type, default_convert_type) in iter_architecture_defaults():
        if (
            (runner_type is None or runner_type == default_runner_type)
            and (convert_type is None or convert_type == default_convert_type)
            and architecture.endswith(suffix)
        ):
            return suffix, (default_runner_type, default_convert_type)
    return None
TaskOption = Literal["generate"]
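A small, self-contained illustration of the suffix matching above, using a trimmed-down defaults table (the architecture names are made up; the real _SUFFIX_TO_DEFAULTS is longer):

_DEFAULTS = [
    ("ForCausalLM", ("generate", "none")),
    ("EmbeddingModel", ("pooling", "embed")),
    ("Model", ("pooling", "embed")),  # catch-all, kept last on purpose
]


def match(architecture: str):
    # first suffix in the list that matches wins, mirroring the loop above
    for suffix, defaults in _DEFAULTS:
        if architecture.endswith(suffix):
            return suffix, defaults
    return None


print(match("Ernie4_5_ForCausalLM"))  # ('ForCausalLM', ('generate', 'none'))
print(match("Qwen3EmbeddingModel"))   # ('EmbeddingModel', ('pooling', 'embed'))
print(match("SomethingElse"))         # None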
|
||||
|
||||
|
||||
class MoEPhase:
|
||||
@@ -192,12 +133,6 @@ class ModelConfig:
|
||||
self.eos_tokens_lens: int = 2
|
||||
self.lm_head_fp32: bool = False
|
||||
self.model_format = "auto"
|
||||
self.runner = "auto"
|
||||
self.convert = "auto"
|
||||
self.pooler_config: Optional["PoolerConfig"] = field(init=False)
|
||||
self.override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None
|
||||
self.revision = None
|
||||
|
||||
self.partial_rotary_factor: float = 1.0
|
||||
self.num_nextn_predict_layers = 0
|
||||
for key, value in args.items():
|
||||
@@ -224,10 +159,8 @@ class ModelConfig:
|
||||
self.vision_config = PretrainedConfig.from_dict(self.vision_config)
|
||||
|
||||
self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
|
||||
self.think_end_id = args.get("think_end_id", -1)
|
||||
|
||||
architectures = self.architectures[0]
|
||||
|
||||
if MultimodalRegistry.contains_model(architectures):
|
||||
self.enable_mm = True
|
||||
else:
|
||||
@@ -238,43 +171,6 @@ class ModelConfig:
|
||||
self.override_name_from_config()
|
||||
self.read_from_env()
|
||||
self.read_model_config()
|
||||
self.runner_type = self._get_runner_type(self.architectures, self.runner)
|
||||
self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert)
|
||||
|
||||
registry = self.registry
|
||||
is_generative_model = registry.is_text_generation_model(self.architectures, self)
|
||||
is_pooling_model = registry.is_pooling_model(self.architectures, self)
|
||||
is_multimodal_model = registry.is_multimodal_model(self.architectures, self)
|
||||
|
||||
if self.runner_type == "generate" and not is_generative_model:
|
||||
if is_multimodal_model:
|
||||
pass
|
||||
else:
|
||||
generate_converts = _RUNNER_CONVERTS["generate"]
|
||||
if self.convert_type not in generate_converts:
|
||||
raise ValueError("This model does not support '--runner generate.")
|
||||
if self.runner_type == "pooling" and not is_pooling_model:
|
||||
pooling_converts = _RUNNER_CONVERTS["pooling"]
|
||||
if self.convert_type not in pooling_converts:
|
||||
convert_option = "<" + "|".join(pooling_converts) + ">"
|
||||
raise ValueError(
|
||||
"This model does not support `--runner pooling`. "
|
||||
f"You can pass `--convert {convert_option} to adapt "
|
||||
"it into a pooling model."
|
||||
)
|
||||
|
||||
self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type)
|
||||
model_info, arch = registry.inspect_model_cls(self.architectures, self)
|
||||
self._model_info = model_info
|
||||
self._architecture = arch
|
||||
|
||||
self.pooler_config = self._init_pooler_config()
|
||||
|
||||
@property
|
||||
def registry(self):
|
||||
from fastdeploy.model_executor.models.model_base import ModelRegistry
|
||||
|
||||
return ModelRegistry()
|
||||
|
||||
def override_name_from_config(self):
|
||||
"""
|
||||
@@ -298,6 +194,7 @@ class ModelConfig:
|
||||
def read_from_env(self):
|
||||
"""
|
||||
Read configuration information from environment variables and update the object's attributes.
|
||||
|
||||
If an attribute is not present or is an empty string in the environment variables, use the default value.
|
||||
"""
|
||||
self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
|
||||
@@ -338,165 +235,6 @@ class ModelConfig:
|
||||
f"Config file path: {config_path}"
|
||||
)
|
||||
|
||||
def _get_default_runner_type(
|
||||
self,
|
||||
architectures: list[str],
|
||||
) -> RunnerType:
|
||||
registry = self.registry
|
||||
if get_pooling_config(self.model, self.revision):
|
||||
return "pooling"
|
||||
for arch in architectures:
|
||||
if arch in registry.get_supported_archs():
|
||||
if registry.is_pooling_model(architectures, self):
|
||||
return "pooling"
|
||||
if registry.is_text_generation_model(architectures, self):
|
||||
return "generate"
|
||||
match = try_match_architecture_defaults(arch)
|
||||
if match:
|
||||
_, (runner_type, _) = match
|
||||
return runner_type
|
||||
return "generate"
|
||||
|
||||
def _get_default_convert_type(
|
||||
self,
|
||||
architectures: list[str],
|
||||
runner_type: RunnerType,
|
||||
) -> ConvertType:
|
||||
registry = self.registry
|
||||
|
||||
for arch in architectures:
|
||||
if arch in registry.get_supported_archs():
|
||||
if runner_type == "generate" and registry.is_text_generation_model(architectures, self):
|
||||
return "none"
|
||||
if runner_type == "pooling" and registry.is_pooling_model(architectures, self):
|
||||
return "none"
|
||||
match = try_match_architecture_defaults(arch, runner_type=runner_type)
|
||||
if match:
|
||||
_, (_, convert_type) = match
|
||||
return convert_type
|
||||
|
||||
# This is to handle Sentence Transformers models that use *ForCausalLM
|
||||
# and also multi-modal pooling models which are not defined as
|
||||
# Sentence Transformers models
|
||||
if runner_type == "pooling":
|
||||
return "embed"
|
||||
|
||||
return "none"
|
||||
|
||||
def _get_runner_type(
|
||||
self,
|
||||
architectures: list[str],
|
||||
runner: RunnerOption,
|
||||
) -> RunnerType:
|
||||
if runner != "auto":
|
||||
return runner
|
||||
|
||||
runner_type = self._get_default_runner_type(architectures)
|
||||
if runner_type != "generate":
|
||||
logger.info(
|
||||
"Resolved `--runner auto` to `--runner %s`. " "Pass the value explicitly to silence this message.",
|
||||
runner_type,
|
||||
)
|
||||
|
||||
return runner_type
|
||||
|
||||
def _get_convert_type(
|
||||
self,
|
||||
architectures: list[str],
|
||||
runner_type: RunnerType,
|
||||
convert: ConvertOption,
|
||||
) -> ConvertType:
|
||||
if convert != "auto":
|
||||
return convert
|
||||
|
||||
convert_type = self._get_default_convert_type(architectures, runner_type)
|
||||
|
||||
if convert_type != "none":
|
||||
logger.info(
|
||||
"Resolved `--convert auto` to `--convert %s`. " "Pass the value explicitly to silence this message.",
|
||||
convert_type,
|
||||
)
|
||||
|
||||
return convert_type
|
||||
|
||||
def _get_supported_generation_tasks(
|
||||
self,
|
||||
architectures: list[str],
|
||||
convert_type: ConvertType,
|
||||
) -> list[_ResolvedTask]:
|
||||
registry = self.registry
|
||||
|
||||
supported_tasks = list[_ResolvedTask]()
|
||||
if registry.is_text_generation_model(architectures, self) or convert_type in _RUNNER_CONVERTS["generate"]:
|
||||
supported_tasks.append("generate")
|
||||
|
||||
# TODO:Temporarily does not support transcription.
|
||||
return supported_tasks
|
||||
|
||||
def _get_default_pooling_task(
|
||||
self,
|
||||
architectures: list[str],
|
||||
) -> Literal["embed"]:
|
||||
# Temporarily does not support classification and reward.
|
||||
for arch in architectures:
|
||||
match = try_match_architecture_defaults(arch, runner_type="pooling")
|
||||
if match:
|
||||
_, (_, convert_type) = match
|
||||
assert convert_type != "none"
|
||||
return convert_type
|
||||
|
||||
return "embed"
|
||||
|
||||
def _get_supported_pooling_tasks(
|
||||
self,
|
||||
architectures: list[str],
|
||||
convert_type: ConvertType,
|
||||
) -> list[_ResolvedTask]:
|
||||
registry = self.registry
|
||||
|
||||
supported_tasks = list[_ResolvedTask]()
|
||||
if registry.is_pooling_model(architectures, self) or convert_type in _RUNNER_CONVERTS["pooling"]:
|
||||
supported_tasks.append("encode")
|
||||
|
||||
extra_task = self._get_default_pooling_task(architectures) if convert_type == "none" else convert_type
|
||||
supported_tasks.append(extra_task)
|
||||
|
||||
return supported_tasks
|
||||
|
||||
def _get_supported_tasks(
|
||||
self,
|
||||
architectures: list[str],
|
||||
runner_type: RunnerType,
|
||||
convert_type: ConvertType,
|
||||
) -> list[_ResolvedTask]:
|
||||
if runner_type == "generate":
|
||||
return self._get_supported_generation_tasks(architectures, convert_type)
|
||||
if runner_type == "pooling":
|
||||
return self._get_supported_pooling_tasks(architectures, convert_type)
|
||||
|
||||
assert_never(runner_type)
|
||||
|
||||
def _init_pooler_config(self) -> Optional["PoolerConfig"]:
|
||||
if self.runner_type == "pooling":
|
||||
if isinstance(self.override_pooler_config, dict):
|
||||
self.override_pooler_config = PoolerConfig(**self.override_pooler_config)
|
||||
|
||||
pooler_config = self.override_pooler_config or PoolerConfig()
|
||||
|
||||
base_config = get_pooling_config(self.model, self.revision)
|
||||
if base_config is not None:
|
||||
for k, v in base_config.items():
|
||||
if getattr(pooler_config, k) is None:
|
||||
setattr(pooler_config, k, v)
|
||||
|
||||
default_pooling_type = self._model_info.default_pooling_type
|
||||
if pooler_config.pooling_type is None:
|
||||
pooler_config.pooling_type = default_pooling_type
|
||||
|
||||
return pooler_config
|
||||
|
||||
return None
|
||||
|
||||
def _get_download_model(self, model_name, model_type="default"):
|
||||
# TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
|
||||
pass
|
||||
@@ -520,6 +258,7 @@ class ParallelConfig:
|
||||
):
|
||||
self.sequence_parallel = False # Whether to enable sequence parallelism.
|
||||
self.use_ep = False # Whether to enable Expert Parallelism
|
||||
self.moe_phase = MoEPhase("prefill") # Generation phase
|
||||
self.msg_queue_id = 1 # message queue id
|
||||
|
||||
self.tensor_parallel_rank = 0 # TP rank ID
|
||||
@@ -557,6 +296,8 @@ class ParallelConfig:
|
||||
# Do profile or not
|
||||
self.do_profile: bool = False
|
||||
|
||||
# splitwise role
|
||||
self.splitwise_role: str = "mixed"
|
||||
# guided decoding backend
|
||||
self.guided_decoding_backend: str = None
|
||||
# disable any whitespace for guided decoding
|
||||
@@ -578,6 +319,14 @@ class ParallelConfig:
|
||||
else:
|
||||
self.expert_parallel_size = 1
|
||||
self.use_ep = self.expert_parallel_size > 1
|
||||
if self.splitwise_role == "mixed":
|
||||
self.moe_phase = MoEPhase(phase="prefill")
|
||||
elif self.splitwise_role == "prefill":
|
||||
self.moe_phase = MoEPhase(phase="prefill")
|
||||
elif self.splitwise_role == "decode":
|
||||
self.moe_phase = MoEPhase(phase="decode")
|
||||
else:
|
||||
raise NotImplementedError
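The branching above amounts to a small role-to-phase table; a hypothetical condensed restatement (phases shown as strings rather than MoEPhase instances):

_ROLE_TO_PHASE = {"mixed": "prefill", "prefill": "prefill", "decode": "decode"}


def resolve_moe_phase(splitwise_role: str) -> str:
    try:
        return _ROLE_TO_PHASE[splitwise_role]
    except KeyError:
        raise NotImplementedError(f"unknown splitwise_role: {splitwise_role}")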
|
||||
|
||||
# pd_disaggregation
|
||||
use_pd_disaggregation: int = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
|
||||
@@ -589,24 +338,20 @@ class ParallelConfig:
|
||||
else:
|
||||
self.pd_disaggregation_mode = "None"
|
||||
|
||||
def set_communicate_group(self):
|
||||
def set_tp_group(self):
|
||||
# different tp group id
|
||||
# prevent different tp_groups using the same group_id
|
||||
tp_gid_offset = envs.FD_TP_GROUP_GID_OFFSET
|
||||
dist.collective._set_custom_gid(self.data_parallel_rank + tp_gid_offset)
|
||||
|
||||
self.tp_group = dist.new_group(
|
||||
range(
|
||||
self.data_parallel_rank * self.tensor_parallel_size,
|
||||
(self.data_parallel_rank + 1) * self.tensor_parallel_size,
|
||||
)
|
||||
)
|
||||
dist.collective._set_custom_gid(None)
|
||||
# same ep group id
|
||||
if self.enable_expert_parallel:
|
||||
dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
|
||||
self.ep_group = dist.new_group(range(self.expert_parallel_size))
|
||||
dist.collective._set_custom_gid(None)
|
||||
dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset)
|
||||
self.ep_group = dist.new_group(range(self.expert_parallel_size))
|
||||
logger.info(
|
||||
f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}."
|
||||
)
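For concreteness, the TP group built above covers one contiguous rank range per data-parallel replica; a quick illustration of the arithmetic:

# Rank range used for the TP group above (illustrative helper).
def tp_group_ranks(data_parallel_rank: int, tensor_parallel_size: int) -> list:
    start = data_parallel_rank * tensor_parallel_size
    return list(range(start, start + tensor_parallel_size))


assert tp_group_ranks(0, 4) == [0, 1, 2, 3]
assert tp_group_ranks(1, 4) == [4, 5, 6, 7]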
|
||||
@@ -839,13 +584,8 @@ class GraphOptimizationConfig:
|
||||
Now don't support capture both decode-only and prefill-only"""
|
||||
self.full_cuda_graph: bool = True
|
||||
|
||||
""" Maximum CUDA Graph capture size """
|
||||
self.max_capture_size: int = None
|
||||
""" Record maps mapped from real shape to captured size to reduce runtime overhead """
|
||||
self.real_shape_to_captured_size: dict[int, int] = None
|
||||
""" Whether to use shared memory pool for multi capture_size """
|
||||
self.use_unique_memory_pool: bool = False
|
||||
|
||||
# CINN Config ...
|
||||
if args is not None:
|
||||
for key, value in args.items():
|
||||
@@ -948,63 +688,63 @@ class GraphOptimizationConfig:
|
||||
argument = self.use_cudagraph
|
||||
|
||||
|
||||
class PlasAttentionConfig:
|
||||
class MobaAttentionConfig:
|
||||
def __init__(
|
||||
self,
|
||||
args,
|
||||
):
|
||||
self.plas_encoder_top_k_left: int = None
self.plas_encoder_top_k_right: int = None
"The sparse topk of encoder attention is located at [plas_encoder_top_k_left, plas_encoder_top_k_right]"
self.plas_decoder_top_k_left: int = None
self.plas_decoder_top_k_right: int = None
"The sparse topk of decoder attention is located at [plas_decoder_top_k_left, plas_decoder_top_k_right]"
self.plas_use_encoder_seq_limit: int = None
"When the number of encoder tokens is less than plas_use_encoder_seq_limit, it is not sparse"
self.plas_use_decoder_seq_limit: int = None
"When the number of decoder tokens is less than plas_use_decoder_seq_limit, it is not sparse"
self.plas_block_size: int = 128
self.mlp_weight_name: str = "plas_attention_mlp_weight.safetensors"
self.plas_max_seq_length: int = 128 * 1024
self.moba_encoder_top_k_left: int = None
self.moba_encoder_top_k_right: int = None
"The sparse topk of encoder attention is located at [moba_encoder_top_k_left, moba_encoder_top_k_right]"
self.moba_decoder_top_k_left: int = None
self.moba_decoder_top_k_right: int = None
"The sparse topk of decoder attention is located at [moba_decoder_top_k_left, moba_decoder_top_k_right]"
self.moba_use_encoder_seq_limit: int = None
"When the number of encoder tokens is less than moba_use_encoder_seq_limit, it is not sparse"
self.moba_use_decoder_seq_limit: int = None
"When the number of decoder tokens is less than moba_use_decoder_seq_limit, it is not sparse"
self.moba_block_size: int = 128
self.mlp_weight_name: str = "moba_mlp_weight.safetensors"
self.moba_max_seq_length: int = 128 * 1024
|
||||
if args is not None:
|
||||
for key, value in args.items():
|
||||
if hasattr(self, key):
|
||||
setattr(self, key, value)
|
||||
if self.plas_use_encoder_seq_limit is None and self.plas_encoder_top_k_left is not None:
|
||||
self.plas_use_encoder_seq_limit = self.plas_encoder_top_k_left * self.plas_block_size
|
||||
if self.plas_use_decoder_seq_limit is None and self.plas_decoder_top_k_left is not None:
|
||||
self.plas_use_decoder_seq_limit = self.plas_decoder_top_k_left * self.plas_block_size
|
||||
if self.moba_use_encoder_seq_limit is None and self.moba_encoder_top_k_left is not None:
|
||||
self.moba_use_encoder_seq_limit = self.moba_encoder_top_k_left * self.moba_block_size
|
||||
if self.moba_use_decoder_seq_limit is None and self.moba_decoder_top_k_left is not None:
|
||||
self.moba_use_decoder_seq_limit = self.moba_decoder_top_k_left * self.moba_block_size
|
||||
self.check_legality_parameters()
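Worked example of the default derivation above, with made-up numbers: the sparse path is skipped for sequences shorter than top_k_left * block_size tokens.

moba_encoder_top_k_left = 8   # assumed value for illustration
moba_block_size = 128
moba_use_encoder_seq_limit = moba_encoder_top_k_left * moba_block_size
assert moba_use_encoder_seq_limit == 1024  # 8 * 128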
|
||||
|
||||
def check_legality_parameters(
|
||||
self,
|
||||
) -> None:
|
||||
if self.plas_encoder_top_k_left is not None:
|
||||
assert self.plas_encoder_top_k_left > 0, "plas_encoder_top_k_left must large than 0"
|
||||
if self.moba_encoder_top_k_left is not None:
|
||||
assert self.moba_encoder_top_k_left > 0, "moba_encoder_top_k_left must large than 0"
|
||||
|
||||
if self.plas_encoder_top_k_right is not None:
|
||||
assert self.plas_encoder_top_k_right > 0, "plas_encoder_top_k_right must large than 0"
|
||||
if self.moba_encoder_top_k_right is not None:
|
||||
assert self.moba_encoder_top_k_right > 0, "moba_encoder_top_k_right must large than 0"
|
||||
assert (
|
||||
self.plas_encoder_top_k_right >= self.plas_encoder_top_k_left
|
||||
), "plas_encoder_top_k_right must large than plas_encoder_top_k_left"
|
||||
self.moba_encoder_top_k_right >= self.moba_encoder_top_k_left
|
||||
), "moba_encoder_top_k_right must large than moba_encoder_top_k_left"
|
||||
|
||||
if self.plas_decoder_top_k_left is not None:
|
||||
assert self.plas_decoder_top_k_left > 0, "plas_decoder_top_k_left must large than 0"
|
||||
if self.moba_decoder_top_k_left is not None:
|
||||
assert self.moba_decoder_top_k_left > 0, "moba_decoder_top_k_left must large than 0"
|
||||
|
||||
if self.plas_decoder_top_k_right is not None:
|
||||
assert self.plas_decoder_top_k_right > 0, "plas_decoder_top_k_right must large than 0"
|
||||
if self.moba_decoder_top_k_right is not None:
|
||||
assert self.moba_decoder_top_k_right > 0, "moba_decoder_top_k_right must large than 0"
|
||||
assert (
|
||||
self.plas_decoder_top_k_right >= self.plas_decoder_top_k_left
|
||||
), "plas_decoder_top_k_right must large than plas_decoder_top_k_left"
|
||||
self.moba_decoder_top_k_right >= self.moba_decoder_top_k_left
|
||||
), "moba_decoder_top_k_right must large than moba_decoder_top_k_left"
|
||||
|
||||
if self.plas_use_encoder_seq_limit is not None and self.plas_encoder_top_k_left is not None:
|
||||
assert self.plas_use_encoder_seq_limit >= self.plas_encoder_top_k_left * self.plas_block_size
|
||||
if self.plas_use_decoder_seq_limit is not None and self.plas_decoder_top_k_left is not None:
|
||||
assert self.plas_use_decoder_seq_limit >= self.plas_decoder_top_k_left * self.plas_block_size
|
||||
if self.moba_use_encoder_seq_limit is not None and self.moba_encoder_top_k_left is not None:
|
||||
assert self.moba_use_encoder_seq_limit >= self.moba_encoder_top_k_left * self.moba_block_size
|
||||
if self.moba_use_decoder_seq_limit is not None and self.moba_decoder_top_k_left is not None:
|
||||
assert self.moba_use_decoder_seq_limit >= self.moba_decoder_top_k_left * self.moba_block_size
|
||||
|
||||
def to_json_string(self):
|
||||
"""
|
||||
Convert plas_attention_config to json string.
|
||||
Convert moba_attention_config to json string.
|
||||
"""
|
||||
return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
|
||||
|
||||
@@ -1093,7 +833,6 @@ class LoadConfig:
|
||||
load_strategy: Specifies the weight loading method when enabled:
|
||||
- 'ipc': Real-time IPC streaming with automatic resharding
|
||||
- 'ipc_snapshot': Load from disk snapshot of IPC weights
|
||||
- 'meta': Only model meta messages
|
||||
- None: No dynamic loading
|
||||
"""
|
||||
|
||||
@@ -1104,47 +843,12 @@ class LoadConfig:
|
||||
self.load_choices: Union[str, LoadChoices] = LoadChoices.DEFAULT.value
|
||||
self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
|
||||
self.dynamic_load_weight: bool = False
|
||||
self.load_strategy: Optional[Literal["ipc", "ipc_snapshot", "meta", "normal"]] = "normal"
|
||||
self.load_strategy: Optional[Literal["ipc", "ipc_snapshot"]] = None
|
||||
for key, value in args.items():
|
||||
if hasattr(self, key):
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
class PoolerConfig:
|
||||
"""Controls the behavior of output pooling in pooling models."""
|
||||
|
||||
pooling_type: Optional[str] = None
|
||||
"""
|
||||
The pooling method of the pooling model.
|
||||
"""
|
||||
# for embeddings models
|
||||
normalize: Optional[bool] = None
|
||||
"""
|
||||
Whether to normalize the embeddings outputs. Defaults to True.
|
||||
"""
|
||||
dimensions: Optional[int] = None
|
||||
"""
|
||||
Reduce the dimensions of embeddings if model
|
||||
support matryoshka representation. Defaults to None.
|
||||
"""
|
||||
enable_chunked_processing: Optional[bool] = None
|
||||
"""
|
||||
Whether to enable chunked processing for long inputs that exceed the model's
|
||||
maximum position embeddings. When enabled, long inputs will be split into
|
||||
chunks, processed separately, and then aggregated using weighted averaging.
|
||||
This allows embedding models to handle arbitrarily long text without CUDA
|
||||
errors. Defaults to False.
|
||||
"""
|
||||
max_embed_len: Optional[int] = None
|
||||
"""
|
||||
Maximum input length allowed for embedding generation. When set, allows
|
||||
inputs longer than max_embed_len to be accepted for embedding models.
|
||||
When an input exceeds max_embed_len, it will be handled according to
|
||||
the original max_model_len validation logic.
|
||||
Defaults to None (i.e. set to max_model_len).
|
||||
"""
|
||||
|
||||
|
||||
class LoRAConfig:
|
||||
"""LoRA Config"""
|
||||
|
||||
@@ -1189,7 +893,7 @@ class CacheConfig:
|
||||
self.kv_cache_ratio = 1.0
|
||||
else:
|
||||
self.kv_cache_ratio = 0.75
|
||||
self.enc_dec_block_num = 0 if current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
|
||||
self.enc_dec_block_num = 0 if current_platform.is_iluvatar() or current_platform.is_maca() else 2
|
||||
self.prealloc_dec_block_slot_num_threshold = 12
|
||||
self.cache_dtype = "bfloat16"
|
||||
self.model_cfg = None
|
||||
@@ -1399,14 +1103,16 @@ class FDConfig:
|
||||
decoding_config: DecodingConfig = None,
|
||||
quant_config: QuantConfigBase = None,
|
||||
graph_opt_config: GraphOptimizationConfig = None,
|
||||
plas_attention_config: PlasAttentionConfig = None,
|
||||
moba_attention_config: MobaAttentionConfig = None,
|
||||
speculative_config: SpeculativeConfig = None,
|
||||
tokenizer: str = None,
|
||||
max_model_len: int = 8192,
|
||||
ips: str = None,
|
||||
use_warmup: bool = False,
|
||||
engine_worker_queue_port: str = "8002",
|
||||
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
|
||||
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
|
||||
splitwise_role: str = "mixed",
|
||||
innode_prefill_ports: Optional[List[int]] = None,
|
||||
max_num_partial_prefills: int = 1,
|
||||
max_long_partial_prefills: int = 1,
|
||||
@@ -1430,7 +1136,7 @@ class FDConfig:
|
||||
self.early_stop_config: Optional[EarlyStopConfig] = early_stop_config
|
||||
self.decoding_config: DecodingConfig = decoding_config # type: ignore
|
||||
self.cache_config: CacheConfig = cache_config # type: ignore
|
||||
self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
|
||||
self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
|
||||
# Initialize cuda graph capture list
|
||||
if self.graph_opt_config.cudagraph_capture_sizes is None:
|
||||
self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
|
||||
@@ -1469,6 +1175,7 @@ class FDConfig:
|
||||
self.limit_mm_per_prompt = limit_mm_per_prompt
|
||||
self.mm_processor_kwargs = mm_processor_kwargs
|
||||
self.use_warmup = use_warmup
|
||||
self.splitwise_role = splitwise_role
|
||||
self.innode_prefill_ports = innode_prefill_ports
|
||||
self.max_num_partial_prefills = max_num_partial_prefills
|
||||
self.max_long_partial_prefills = max_long_partial_prefills
|
||||
@@ -1476,13 +1183,17 @@ class FDConfig:
|
||||
self.reasoning_parser = reasoning_parser
|
||||
self.guided_decoding_backend = guided_decoding_backend
|
||||
self.disable_any_whitespace = disable_any_whitespace
|
||||
self.engine_worker_queue_port = engine_worker_queue_port
|
||||
self._str_to_list("innode_prefill_ports", int)
|
||||
if isinstance(engine_worker_queue_port, int):
|
||||
self.engine_worker_queue_port = str(engine_worker_queue_port)
|
||||
self._str_to_list("engine_worker_queue_port", str)
|
||||
|
||||
if envs.FD_FOR_TORCH_MODEL_FORMAT:
|
||||
self.model_config.model_format = "torch"
|
||||
|
||||
# TODO
|
||||
self.max_prefill_batch = int(os.getenv("MAX_PREFILL_NUM", "3"))
|
||||
self.max_prefill_batch = 3
|
||||
if current_platform.is_xpu():
|
||||
self.max_prefill_batch = 1
|
||||
if self.model_config is not None and self.model_config.enable_mm:
|
||||
@@ -1490,10 +1201,12 @@ class FDConfig:
|
||||
|
||||
num_ranks = self.parallel_config.tensor_parallel_size * self.parallel_config.data_parallel_size
|
||||
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
|
||||
if num_ranks > self.max_chips_per_node and self.load_config.load_strategy != "meta":
|
||||
if num_ranks > self.max_chips_per_node:
|
||||
self.worker_num_per_node = self.max_chips_per_node
|
||||
nnode = ceil_div(num_ranks, self.worker_num_per_node)
|
||||
assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
|
||||
|
||||
# assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
|
||||
else:
|
||||
self.worker_num_per_node = num_ranks
|
||||
|
||||
@@ -1501,8 +1214,6 @@ class FDConfig:
|
||||
self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids)
|
||||
if current_platform.is_xpu():
|
||||
self.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.device_ids)
|
||||
if current_platform.is_intel_hpu():
|
||||
self.device_ids = os.getenv("HPU_VISIBLE_DEVICES", self.device_ids)
|
||||
|
||||
self.read_from_config()
|
||||
self.postprocess()
|
||||
@@ -1543,8 +1254,6 @@ class FDConfig:
|
||||
|
||||
self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
|
||||
self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
|
||||
if self.model_config is not None and self.model_config.enable_mm:
|
||||
self.cache_config.enable_prefix_caching = False
|
||||
|
||||
if self.guided_decoding_backend == "auto":
|
||||
if current_platform.is_xpu() or self.speculative_config.method is not None:
|
||||
@@ -1553,15 +1262,6 @@ class FDConfig:
|
||||
else:
|
||||
self.guided_decoding_backend = "xgrammar"
|
||||
|
||||
if self.scheduler_config.splitwise_role == "mixed":
|
||||
self.model_config.moe_phase = MoEPhase(phase="prefill")
|
||||
elif self.scheduler_config.splitwise_role == "prefill":
|
||||
self.model_config.moe_phase = MoEPhase(phase="prefill")
|
||||
elif self.scheduler_config.splitwise_role == "decode":
|
||||
self.model_config.moe_phase = MoEPhase(phase="decode")
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def check(self):
|
||||
"""
|
||||
check the legality of config
|
||||
@@ -1596,7 +1296,7 @@ class FDConfig:
|
||||
f"max_long_partial_prefills: {self.max_long_partial_prefills} should "
|
||||
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
|
||||
)
|
||||
assert self.scheduler_config.splitwise_role in ["mixed", "prefill", "decode"]
|
||||
assert self.splitwise_role in ["mixed", "prefill", "decode"]
|
||||
# TODO(@wufeisheng): TP and EP need to be supported simultaneously.
|
||||
assert (self.parallel_config.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or (
|
||||
self.parallel_config.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1
|
||||
@@ -1682,8 +1382,8 @@ class FDConfig:
|
||||
initialize cache info
|
||||
"""
|
||||
disaggregate_info = {}
|
||||
if self.scheduler_config.splitwise_role != "mixed":
|
||||
disaggregate_info["role"] = self.scheduler_config.splitwise_role
|
||||
if self.splitwise_role != "mixed":
|
||||
disaggregate_info["role"] = self.splitwise_role
|
||||
disaggregate_info["cache_info"] = dict()
|
||||
current_protocol = self.cache_config.cache_transfer_protocol.split(",")
|
||||
disaggregate_info["transfer_protocol"] = current_protocol
|
||||
@@ -1691,9 +1391,7 @@ class FDConfig:
|
||||
if protocol == "ipc":
|
||||
disaggregate_info["cache_info"][protocol] = {
|
||||
"ip": self.host_ip,
|
||||
"port": self.parallel_config.engine_worker_queue_port[
|
||||
self.parallel_config.local_data_parallel_id
|
||||
],
|
||||
"port": self.engine_worker_queue_port[self.parallel_config.local_data_parallel_id],
|
||||
"device_ids": self.local_device_ids,
|
||||
}
|
||||
elif protocol == "rdma":
|
||||
|
@@ -42,12 +42,6 @@ def use_custom_allreduce(custom_all_reduce_max_bytes: int = 8192 * 1024):
|
||||
_TP_AR = CustomAllreduce(model_parallel_group, custom_all_reduce_max_bytes)
|
||||
|
||||
|
||||
def custom_ar_clear_ipc_handles():
|
||||
global _TP_AR
|
||||
if _TP_AR is not None:
|
||||
_TP_AR.clear_ipc_handles()
|
||||
|
||||
|
||||
try:
|
||||
|
||||
@paddle.jit.marker.unified
|
||||
@@ -72,26 +66,3 @@ try:
|
||||
|
||||
except:
|
||||
tensor_model_parallel_all_reduce = None
|
||||
|
||||
from paddle.distributed.communication import stream
|
||||
from paddle.distributed.communication.reduce import ReduceOp
|
||||
|
||||
|
||||
def all_reduce(
|
||||
tensor,
|
||||
op,
|
||||
group,
|
||||
sync_op: bool = True,
|
||||
):
|
||||
return stream.all_reduce(tensor, op=op, group=group, sync_op=sync_op, use_calc_stream=True)
|
||||
|
||||
|
||||
@paddle.jit.marker.unified
|
||||
def tensor_model_parallel_all_reduce_custom(input_: paddle.Tensor) -> paddle.Tensor:
|
||||
"""All-reduce the input tensor across model parallel group on calc stream."""
|
||||
if paddle.in_dynamic_mode():
|
||||
hcg = dist.fleet.get_hybrid_communicate_group()
|
||||
mp_group = hcg.get_model_parallel_group()
|
||||
all_reduce(input_, op=ReduceOp.SUM, group=mp_group)
|
||||
else:
|
||||
dist.all_reduce(input_)
|
||||
|
@@ -25,7 +25,6 @@ from paddle.distributed.communication.group import Group
|
||||
from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
|
||||
from fastdeploy.model_executor.ops.gpu import (
|
||||
all_reduce,
|
||||
clear_ipc_handles,
|
||||
dispose,
|
||||
get_graph_buffer_ipc_meta,
|
||||
init_custom_all_reduce,
|
||||
@@ -221,9 +220,6 @@ class CustomAllreduce:
|
||||
else:
|
||||
return self.all_reduce(input, input, registered=False)
|
||||
|
||||
def clear_ipc_handles(self):
|
||||
clear_ipc_handles(self._ptr)
|
||||
|
||||
def close(self):
|
||||
if self._ptr:
|
||||
dispose(self._ptr)
|
||||
|
@@ -18,23 +18,20 @@ import argparse
import json
from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

import paddle

from fastdeploy import envs
from fastdeploy.config import (
    CacheConfig,
    ConvertOption,
    EarlyStopConfig,
    FDConfig,
    GraphOptimizationConfig,
    LoadConfig,
    MobaAttentionConfig,
    ModelConfig,
    ParallelConfig,
    PlasAttentionConfig,
    PoolerConfig,
    RunnerOption,
    SpeculativeConfig,
    TaskOption,
)
|
||||
@@ -98,20 +95,6 @@ class EngineArgs:
|
||||
"""
|
||||
The task to be executed by the model.
|
||||
"""
|
||||
runner: RunnerOption = "auto"
|
||||
"""
|
||||
The type of model runner to use. Each FD instance only supports one model runner,
even if the same model can be used for multiple types.
|
||||
"""
|
||||
convert: ConvertOption = "auto"
|
||||
"""
|
||||
Convert the model using adapters. The most common use case is to
|
||||
adapt a text generation model to be used for pooling tasks.
|
||||
"""
|
||||
override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
|
||||
"""
|
||||
Override configuration for the pooler.
|
||||
"""
|
||||
max_num_seqs: int = 8
|
||||
"""
|
||||
Maximum number of sequences per iteration.
|
||||
@@ -152,7 +135,7 @@ class EngineArgs:
|
||||
"""
|
||||
dynamic load weight
|
||||
"""
|
||||
load_strategy: str = "normal"
|
||||
load_strategy: str = "ipc_snapshot"
|
||||
"""
|
||||
dynamic load weight strategy
|
||||
"""
|
||||
@@ -361,9 +344,9 @@ class EngineArgs:
|
||||
"""
|
||||
Configuration for graph optimization backend execution.
|
||||
"""
|
||||
plas_attention_config: Optional[Dict[str, Any]] = None
|
||||
moba_attention_config: Optional[Dict[str, Any]] = None
|
||||
"""
|
||||
Configuration for plas attention.
|
||||
Configuration for moba attention.
|
||||
"""
|
||||
|
||||
enable_logprob: bool = False
|
||||
@@ -403,27 +386,30 @@ class EngineArgs:
|
||||
"""
|
||||
Post-initialization processing to set default tokenizer if not provided.
|
||||
"""
|
||||
|
||||
if not self.tokenizer:
|
||||
self.tokenizer = self.model
|
||||
if self.splitwise_role == "decode":
|
||||
self.enable_prefix_caching = False
|
||||
if self.speculative_config is not None:
|
||||
self.enable_prefix_caching = False
|
||||
if self.enable_mm:
|
||||
self.enable_prefix_caching = False
|
||||
if not current_platform.is_cuda():
|
||||
self.enable_prefix_caching = False
|
||||
# if self.dynamic_load_weight:
|
||||
# self.enable_prefix_caching = False
|
||||
if self.dynamic_load_weight:
|
||||
self.enable_prefix_caching = False
|
||||
if self.enable_logprob:
|
||||
if self.speculative_config is not None:
|
||||
raise NotImplementedError("Logprob does not support speculation_config.")
|
||||
if self.enable_expert_parallel:
|
||||
raise NotImplementedError("Logprob does not support enable_expert_parallel.")
|
||||
if not current_platform.is_cuda():
|
||||
raise NotImplementedError("Only CUDA platform supports logprob.")
|
||||
if self.speculative_config is not None:
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
|
||||
if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
|
||||
if self.splitwise_role != "mixed":
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
|
||||
if not current_platform.is_cuda() and not current_platform.is_xpu():
|
||||
if not current_platform.is_cuda():
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
|
||||
if self.guided_decoding_backend != "off":
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
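Read together, these branches fall back from the v1 KV-cache scheduler whenever a feature it does not yet cover is requested; a hypothetical condensed reading of one side of the diff (field names mirror EngineArgs, and is_cuda stands in for current_platform.is_cuda()):

def v1_kvcache_scheduler_supported(speculative_config, splitwise_role, is_cuda, guided_decoding_backend) -> bool:
    # Illustrative summary only; the real checks set envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 in place.
    return (
        speculative_config is None
        and splitwise_role == "mixed"
        and is_cuda
        and guided_decoding_backend == "off"
    )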
|
||||
@@ -489,21 +475,6 @@ class EngineArgs:
|
||||
default=EngineArgs.task,
|
||||
help="Task to be executed by the model.",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--runner",
|
||||
type=str,
|
||||
default=EngineArgs.runner,
|
||||
help="The type of model runner to use",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--convert", type=str, default=EngineArgs.convert, help="Convert the model using adapters"
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--override-pooler-config",
|
||||
type=json.loads,
|
||||
default=EngineArgs.override_pooler_config,
|
||||
help="Override the pooler configuration with a JSON string.",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--use-warmup",
|
||||
type=int,
|
||||
@@ -600,9 +571,9 @@ class EngineArgs:
|
||||
help="",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--plas-attention-config",
|
||||
"--moba-attention-config",
|
||||
type=json.loads,
|
||||
default=EngineArgs.plas_attention_config,
|
||||
default=EngineArgs.moba_attention_config,
|
||||
help="",
|
||||
)
|
||||
model_group.add_argument(
|
||||
@@ -711,7 +682,7 @@ class EngineArgs:
|
||||
# Load group
|
||||
load_group = parser.add_argument_group("Load Configuration")
|
||||
load_group.add_argument(
|
||||
"--load-choices",
|
||||
"--load_choices",
|
||||
type=str,
|
||||
default=EngineArgs.load_choices,
|
||||
help="The format of the model weights to load.\
|
||||
@@ -735,7 +706,7 @@ class EngineArgs:
|
||||
cache_group.add_argument(
|
||||
"--prealloc-dec-block-slot-num-threshold",
|
||||
type=int,
|
||||
default=EngineArgs.prealloc_dec_block_slot_num_threshold,
|
||||
default=12,
|
||||
help="Number of token slot threadshold to allocate next blocks for decoding.",
|
||||
)
|
||||
|
||||
@@ -992,17 +963,17 @@ class EngineArgs:
|
||||
graph_optimization_args[k] = v
|
||||
return GraphOptimizationConfig(graph_optimization_args)
|
||||
|
||||
def create_plas_attention_config(self) -> PlasAttentionConfig:
|
||||
def create_moba_attention_config(self) -> MobaAttentionConfig:
|
||||
"""
|
||||
Create and return a PlasAttentionConfig object based on the current settings.
Create and return a MobaAttentionConfig object based on the current settings.
|
||||
"""
|
||||
attention_args = asdict(self)
|
||||
if self.plas_attention_config is not None:
|
||||
for k, v in self.plas_attention_config.items():
|
||||
if self.moba_attention_config is not None:
|
||||
for k, v in self.moba_attention_config.items():
|
||||
attention_args[k] = v
|
||||
return PlasAttentionConfig(attention_args)
|
||||
return MobaAttentionConfig(attention_args)
|
||||
else:
|
||||
return PlasAttentionConfig(None)
|
||||
return MobaAttentionConfig(None)
|
||||
|
||||
def create_early_stop_config(self) -> EarlyStopConfig:
|
||||
"""
|
||||
@@ -1050,11 +1021,6 @@ class EngineArgs:
|
||||
else:
|
||||
self.max_num_batched_tokens = self.max_model_len
|
||||
|
||||
if isinstance(self.engine_worker_queue_port, int):
|
||||
self.engine_worker_queue_port = str(self.engine_worker_queue_port)
|
||||
if isinstance(self.engine_worker_queue_port, str):
|
||||
self.engine_worker_queue_port = self.engine_worker_queue_port.split(",")
|
||||
|
||||
all_dict = asdict(self)
|
||||
all_dict["model_cfg"] = model_cfg
|
||||
cache_cfg = CacheConfig(all_dict)
|
||||
@@ -1063,11 +1029,16 @@ class EngineArgs:
|
||||
scheduler_cfg = self.create_scheduler_config()
|
||||
graph_opt_cfg = self.create_graph_optimization_config()
|
||||
graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
|
||||
plas_attention_config = self.create_plas_attention_config()
|
||||
moba_attention_config = self.create_moba_attention_config()
|
||||
|
||||
early_stop_cfg = self.create_early_stop_config()
|
||||
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
|
||||
|
||||
if isinstance(self.engine_worker_queue_port, int):
|
||||
self.engine_worker_queue_port = str(self.engine_worker_queue_port)
|
||||
if isinstance(self.engine_worker_queue_port, str):
|
||||
self.engine_worker_queue_port = self.engine_worker_queue_port.split(",")
|
||||
|
||||
assert is_port_available(
|
||||
"0.0.0.0", int(self.engine_worker_queue_port[parallel_cfg.local_data_parallel_id])
|
||||
), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
|
||||
@@ -1083,16 +1054,18 @@ class EngineArgs:
|
||||
speculative_config=speculative_cfg,
|
||||
ips=self.ips,
|
||||
use_warmup=self.use_warmup,
|
||||
engine_worker_queue_port=self.engine_worker_queue_port,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser=self.reasoning_parser,
|
||||
tool_parser=self.tool_call_parser,
|
||||
splitwise_role=self.splitwise_role,
|
||||
innode_prefill_ports=self.innode_prefill_ports,
|
||||
max_num_partial_prefills=self.max_num_partial_prefills,
|
||||
max_long_partial_prefills=self.max_long_partial_prefills,
|
||||
long_prefill_token_threshold=self.long_prefill_token_threshold,
|
||||
graph_opt_config=graph_opt_cfg,
|
||||
plas_attention_config=plas_attention_config,
|
||||
moba_attention_config=moba_attention_config,
|
||||
guided_decoding_backend=self.guided_decoding_backend,
|
||||
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
|
||||
early_stop_config=early_stop_cfg,
|
||||
|
@@ -30,23 +30,21 @@ import paddle
import zmq
from opentelemetry import trace

from fastdeploy.engine.request import Request, RequestOutput, RequestType
from fastdeploy.engine.request import Request, RequestOutput
from fastdeploy.engine.resource_manager import ResourceManager
from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
from fastdeploy.inter_communicator import (
    EngineCacheQueue,
    EngineWorkerQueue,
    IPCSignal,
    ZmqIpcServer,
    ZmqTcpServer,
    ZmqClient,
)
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.metrics.trace_util import start_span, start_span_request
from fastdeploy.model_executor.guided_decoding import schema_checker
from fastdeploy.plugins.token_processor import load_token_processor_plugins
from fastdeploy.splitwise.internal_adapter_utils import InternalAdapter
from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector
from fastdeploy.utils import EngineError, envs, get_logger, llm_logger
from fastdeploy.utils import EngineError, envs, llm_logger

try:
    TokenProcessor = load_token_processor_plugins()
|
||||
@@ -69,36 +67,32 @@ class EngineService:
|
||||
"""
|
||||
self.cfg = cfg
|
||||
|
||||
if self.cfg.parallel_config.enable_expert_parallel:
|
||||
self.llm_logger = get_logger(
|
||||
"fastdeploy", f"fastdeploy_rank{self.cfg.parallel_config.local_data_parallel_id}.log"
|
||||
)
|
||||
else:
|
||||
self.llm_logger = llm_logger
|
||||
|
||||
self.scheduler = cfg.scheduler_config.scheduler()
|
||||
self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1"
|
||||
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
self.resource_manager = ResourceManagerV1(
|
||||
cfg.scheduler_config.max_num_seqs,
|
||||
cfg,
|
||||
cfg.parallel_config.tensor_parallel_size,
|
||||
cfg.scheduler_config.splitwise_role,
|
||||
cfg.splitwise_role,
|
||||
cfg.parallel_config.local_data_parallel_id,
|
||||
)
|
||||
if cfg.splitwise_role != "mixed":
|
||||
raise NotImplementedError(
|
||||
"Currently ENABLE_V1_KVCACHE_SCHEDULER=1 only supported in mixed sampling now."
|
||||
)
|
||||
else:
|
||||
self.resource_manager = ResourceManager(
|
||||
cfg.scheduler_config.max_num_seqs,
|
||||
cfg,
|
||||
cfg.parallel_config.tensor_parallel_size,
|
||||
cfg.scheduler_config.splitwise_role,
|
||||
cfg.splitwise_role,
|
||||
cfg.parallel_config.local_data_parallel_id,
|
||||
)
|
||||
|
||||
self.start_worker_queue_service(start_queue)
|
||||
|
||||
os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.parallel_config.engine_worker_queue_port[
|
||||
os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.engine_worker_queue_port[
|
||||
self.cfg.parallel_config.local_data_parallel_id
|
||||
]
|
||||
|
||||
@@ -139,14 +133,10 @@ class EngineService:
|
||||
self.insert_task_to_worker_thread.start()
|
||||
self.token_processor.tasks_queue = self.engine_worker_queue
|
||||
self.token_processor.run()
|
||||
if self.cfg.scheduler_config.splitwise_role != "mixed":
|
||||
self.split_mode_get_tasks()
|
||||
|
||||
def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进程感知是否有新Task需要处理
|
||||
current_suffix = int(
|
||||
self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
|
||||
)
|
||||
self.llm_logger.info(f"current_suffix: {current_suffix}")
|
||||
current_suffix = int(self.cfg.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id])
|
||||
llm_logger.info(f"current_suffix: {current_suffix}")
|
||||
exist_task_signal_data = np.zeros([1], dtype=np.int32)
|
||||
self.exist_task_signal = IPCSignal(
|
||||
name="exist_task_signal",
|
||||
@@ -188,24 +178,6 @@ class EngineService:
|
||||
create=True,
|
||||
)
|
||||
|
||||
cache_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32)
|
||||
self.cache_ready_signal = IPCSignal(
|
||||
name="cache_ready_signal",
|
||||
array=cache_ready_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=current_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
swap_space_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32)
|
||||
self.swap_space_ready_signal = IPCSignal(
|
||||
name="swap_space_ready_signal",
|
||||
array=swap_space_ready_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=current_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
model_weights_status = np.zeros([1], dtype=np.int32)
|
||||
self.model_weights_status_signal = IPCSignal(
|
||||
name="model_weights_status",
|
||||
@@ -215,35 +187,17 @@ class EngineService:
|
||||
create=True,
|
||||
)
|
||||
|
||||
prefix_tree_status = np.zeros([1], dtype=np.int32)
|
||||
self.prefix_tree_status_signal = IPCSignal(
|
||||
name="prefix_tree_status",
|
||||
array=prefix_tree_status,
|
||||
dtype=np.int32,
|
||||
suffix=current_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
kv_cache_status = np.zeros([1], dtype=np.int32)
|
||||
self.kv_cache_status_signal = IPCSignal(
|
||||
name="kv_cache_status",
|
||||
array=kv_cache_status,
|
||||
dtype=np.int32,
|
||||
suffix=current_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
def start_worker_queue_service(self, start_queue):
|
||||
"""
|
||||
start queue service for engine worker communication
|
||||
"""
|
||||
address = (
|
||||
self.cfg.master_ip,
|
||||
int(self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]),
|
||||
int(self.cfg.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]),
|
||||
)
|
||||
|
||||
if start_queue and (self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"):
|
||||
self.llm_logger.info(f"Starting engine worker queue server service at {address}")
|
||||
llm_logger.info(f"Starting engine worker queue server service at {address}")
|
||||
self.engine_worker_queue_server = EngineWorkerQueue(
|
||||
address=address,
|
||||
is_server=True,
|
||||
@@ -253,7 +207,7 @@ class EngineService:
|
||||
|
||||
if (
|
||||
self.cfg.cache_config.enable_prefix_caching
|
||||
or self.cfg.scheduler_config.splitwise_role != "mixed"
|
||||
or self.cfg.splitwise_role != "mixed"
|
||||
and self.cfg.parallel_config.local_data_parallel_id == 0
|
||||
):
|
||||
self.cache_task_queue = EngineCacheQueue(
|
||||
@@ -267,7 +221,7 @@ class EngineService:
|
||||
client_id=-1,
|
||||
local_data_parallel_size=self.cfg.parallel_config.data_parallel_size,
|
||||
)
|
||||
self.llm_logger.info(
|
||||
llm_logger.info(
|
||||
f"local {min(self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id,self.cfg.parallel_config.data_parallel_size - 1)}"
|
||||
)
|
||||
self.engine_worker_queue = EngineWorkerQueue(
|
||||
@@ -296,21 +250,8 @@ class EngineService:
|
||||
cur_task_idx = self.resource_manager.req_dict[task.request_id]
|
||||
del self.resource_manager.req_dict[task.request_id]
|
||||
cur_task = self.resource_manager.tasks_list[cur_task_idx]
|
||||
if envs.FD_ENABLE_INTERNAL_ADAPTER:
|
||||
if not task.outputs.token_ids: # first token is eos in Prefill, just recycle resource and continue
|
||||
self.resource_manager.stop_flags[cur_task_idx] = True
|
||||
self.resource_manager.tasks_list[cur_task_idx] = None
self.resource_manager._recycle_block_tables(cur_task)
if task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.llm_logger.warning(f"{task.request_id} need not decode after first token")
continue
cur_task.prompt_token_ids[0] = task.outputs.token_ids[0]
cur_task.num_cached_tokens = task.num_cached_tokens
if (
self.cfg.speculative_config.method in ["mtp"]
and self.cfg.scheduler_config.splitwise_role == "decode"
):
if self.cfg.speculative_config.method in ["mtp"] and self.cfg.splitwise_role == "decode":
cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids)
if task.error_code != 200:
self.resource_manager.stop_flags[cur_task_idx] = True
@@ -319,14 +260,13 @@ class EngineService:
if task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.scheduler.put_results([task])
self.llm_logger.warning(
llm_logger.warning(
f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource."
)
continue
self.token_processor.tokens_counter[task.request_id] = 1
current_tasks.append(cur_task)
if current_tasks:
self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz))
self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz))
return True

self.resource_manager.check_and_free_block_tables()
@@ -334,34 +274,13 @@ class EngineService:
if not isinstance(tasks, list):
tasks = [tasks]

need_delete_tasks = []
for task in tasks:
if self.cfg.scheduler_config.splitwise_role != "mixed":
status, msg = self.split_connector.check_decode_allocated(task)
if not status:
self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.")
self.scheduler.put_results(
[
RequestOutput(
request_id=task.request_id,
finished=True,
error_code=500,
error_msg=msg,
)
]
)
need_delete_tasks.append(task)
continue
for tmp_task in need_delete_tasks:
tasks.remove(tmp_task)

for item in tasks:
item.schedule_start_time = time.time()

available_batch = np.sum(self.resource_manager.stop_flags)
if len(tasks) > available_batch:
self.llm_logger.error(f"Inserting batch:{len(tasks)} exceeds the available batch:{available_batch}.")
self.llm_logger.error("The exceeded part will be ignored!")
llm_logger.error(f"Inserting batch:{len(tasks)} exceeds the available batch:{available_batch}.")
llm_logger.error("The exceeded part will be ignored!")
tasks = tasks[:available_batch]

req_ids = [t.request_id for t in tasks]
@@ -370,7 +289,7 @@ class EngineService:

if not tasks:
error_msg = f"The request required resources is exceed the limit, request id={req_ids}."
self.llm_logger.error(error_msg)
llm_logger.error(error_msg)
raise EngineError(error_msg, error_code=500)
return False

@@ -388,7 +307,7 @@ class EngineService:

self.split_connector.send_cache_infos(tasks, current_id)
if not is_decode:
self.llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}")
llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}")
for task in tasks:
task.inference_start_time = time.time()
if not is_prefill:
@@ -547,7 +466,7 @@ class EngineService:
Insert task to engine thread, monitor scheduler request queue.
if the engine has resource, insert task to engine
"""
current_id = 0
current_id = -1
while getattr(self, "running", True):
try:
if self.resource_manager.available_batch() == 0:
@@ -557,10 +476,7 @@ class EngineService:
time.sleep(0.001)
continue
if hasattr(self, "exist_prefill_task_signal") and self.exist_prefill_task_signal.value[0] > 0:
if (
self.cfg.scheduler_config.splitwise_role == "mixed"
or self.split_connector.has_splitwise_tasks()
):
if self.cfg.splitwise_role == "mixed" or self.split_connector.has_splitwise_tasks():
time.sleep(0.005)
continue
if self.engine_worker_queue.num_cache_infos() > 0:
@@ -588,21 +504,18 @@ class EngineService:
time.sleep(0.001)
continue

if self.cfg.scheduler_config.splitwise_role != "mixed":
self.llm_logger.info("Inserting splitwise tasks")
current_id = (current_id + 1) % 100003
if self.cfg.splitwise_role != "mixed":
llm_logger.info("Inserting splitwise tasks")
self.split_connector.send_splitwise_tasks(tasks, current_id)

insert_successful = self.insert_tasks(tasks, current_id)
if insert_successful:
current_id = current_id + 1
else:
continue
self.insert_tasks(tasks, current_id)

main_process_metrics.num_requests_waiting.dec(len(tasks))
main_process_metrics.num_requests_running.inc(len(tasks))
except Exception as e:
err_msg = f"Error happend while insert task to engine: {e}, {traceback.format_exc()!s}."
self.llm_logger.error(err_msg)
err_msg = f"Error happened while insert task to engine: {e}, {traceback.format_exc()!s}."
llm_logger.error(err_msg)

def _scheduler_task_to_worker_v1(self):
"""
@@ -612,145 +525,60 @@ class EngineService:
is_fetching = False

def _fetch_request():
try:
nonlocal is_fetching
is_fetching = True
num_prefill_batch = min(
int(self.resource_manager.available_batch()),
self.cfg.max_prefill_batch,
)
if self.cfg.model_config.enable_mm:
available_blocks = self.resource_manager.available_block_num()
else:
available_blocks = self.cfg.cache_config.max_block_num_per_seq
nonlocal is_fetching
is_fetching = True
num_prefill_batch = min(
int(self.resource_manager.available_batch()),
self.cfg.max_prefill_batch,
)
if self.cfg.model_config.enable_mm:
available_blocks = self.resource_manager.available_block_num()
else:
available_blocks = self.cfg.cache_config.max_block_num_per_seq

tasks = self.scheduler.get_requests(
available_blocks=available_blocks,
block_size=self.cfg.cache_config.block_size,
reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
max_num_batched_tokens=self.cfg.max_model_len,
batch=num_prefill_batch,
)
if self.cfg.scheduler_config.splitwise_role != "mixed":
for task in tasks:
# assure can allocate block ids in P
while not self.resource_manager.preallocate_resource_in_p(task):
time.sleep(0.005)
self.llm_logger.info(f"ask D resource for req_id: {task.request_id}")
self.split_connector.send_splitwise_tasks([task], task.idx)
need_delete_tasks = []
for task in tasks:
if self.cfg.scheduler_config.splitwise_role != "mixed":
# assure fetch block ids from D
status, msg = self.split_connector.check_decode_allocated(task)
if not status:
self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.")
self.scheduler.put_results(
[
RequestOutput(
request_id=task.request_id,
finished=True,
error_code=500,
error_msg=msg,
)
]
)
need_delete_tasks.append(task)
continue
for tmp_task in need_delete_tasks:
tasks.remove(tmp_task)
# release resource in P
self.resource_manager.prerelease_resource(tmp_task)
if self.cfg.scheduler_config.splitwise_role == "prefill":
# to send cache info to cache messager
if tasks:
self.split_connector.send_cache_infos(tasks, 0)
# ensure cache tasks has sent to cache_messager
need_check_req_ids = [task.request_id for task in tasks]
while need_check_req_ids:
req_ids = self.engine_worker_queue.get_finished_add_cache_task_req()
self.llm_logger.info(f"get_finished_add_cache_task_req: {req_ids}")
if req_ids:
for req_id in req_ids:
assert req_id in need_check_req_ids
need_check_req_ids.remove(req_id)
else:
time.sleep(0.001)
# Fetch requests and add them to the scheduling queue
if tasks:
if self.cfg.scheduler_config.splitwise_role == "prefill":
self.resource_manager.add_request_in_p(tasks)
else:
for task in tasks:
self.resource_manager.add_request(task)
is_fetching = False
except Exception as e:
self.llm_logger.error(f"fetching request error {e} {str(traceback.format_exc())}")
is_fetching = False
tasks = self.scheduler.get_requests(
available_blocks=available_blocks,
block_size=self.cfg.cache_config.block_size,
reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
max_num_batched_tokens=self.cfg.max_model_len,
batch=num_prefill_batch,
)
# Fetch requests and add them to the scheduling queue
for task in tasks:
self.resource_manager.add_request(task)
is_fetching = False

while self.running:
try:
if self.engine_worker_queue.num_tasks() > 0:
time.sleep(0.001)
continue
if self.cfg.scheduler_config.splitwise_role != "mixed":
if self.scheduler.get_unhandled_request_num() <= envs.FD_EP_MAX_PREFETCH_TASK_NUM and (
not is_fetching
):
get_request_pool.submit(_fetch_request)

else:
if (
len(self.resource_manager.waiting) == 0
and (not is_fetching)
and self.exist_prefill_task_signal.value[0] == 0
):
get_request_pool.submit(_fetch_request)
if (
len(self.resource_manager.waiting) == 0
and (not is_fetching)
and self.exist_prefill_task_signal.value[0] == 0
):
get_request_pool.submit(_fetch_request)
# 2. Schedule requests
tasks = self.resource_manager.schedule()
# 3. Send to engine
if tasks:
if self.cfg.scheduler_config.splitwise_role == "decode":
for task in tasks:
if task.task_type == RequestType.PREEMPTED:
msg = f"{task.request_id} decode not enough blocks, need to be rescheduled."
self.llm_logger.error(msg)
self.scheduler.put_results(
[
RequestOutput(
request_id=task.request_id,
finished=True,
error_code=500,
error_msg=msg,
)
]
)
self.resource_manager.get_real_bsz()
self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz))
else:
time.sleep(0.005)

except Exception as e:
err_msg = "Error happend while insert task to engine: {}, {}.".format(e, str(traceback.format_exc()))
self.llm_logger.error(err_msg)
err_msg = "Error happened while insert task to engine: {}, {}.".format(e, str(traceback.format_exc()))
llm_logger.error(err_msg)

def start_zmq_service(self, api_server_pid=None):
if api_server_pid is None:
return
self.api_server_pid = api_server_pid
if envs.FD_ENABLE_INTERNAL_ADAPTER:
self.recv_request_server = ZmqTcpServer(port=envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT, mode=zmq.PULL)
self.send_response_server = ZmqTcpServer(port=envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT, mode=zmq.ROUTER)
self.internal_adapter = InternalAdapter(
cfg=self.cfg, engine=self, dp_rank=self.cfg.node_rank * self.cfg.worker_num_per_node
)
else:
self.recv_request_server = ZmqIpcServer(name=api_server_pid, mode=zmq.PULL)
self.send_response_server = ZmqIpcServer(name=api_server_pid, mode=zmq.ROUTER)
self.recv_result_handle_thread = threading.Thread(
target=self.send_response_server.recv_result_handle, daemon=True
)
self.recv_result_handle_thread.start()
self.zmq_server = ZmqClient(name=api_server_pid, mode=zmq.PULL)
self.zmq_server.start_server()
self.zmq_server.create_router()
time.sleep(3)
self.insert_task_to_scheduler_thread = threading.Thread(target=self._insert_zmq_task_to_scheduler, daemon=True)
self.insert_task_to_scheduler_thread.start()
@@ -760,18 +588,15 @@ class EngineService:

def _insert_zmq_task_to_scheduler(self):
added_requests: Dict[str, int] = dict()
if envs.FD_ENABLE_INTERNAL_ADAPTER:
if self.cfg.scheduler_config.splitwise_role == "decode":
return
while self.running:
try:
block = True if len(added_requests) == 0 else False
if not self.cfg.model_config.enable_mm:
err, data = self.recv_request_server.receive_json_once(block)
err, data = self.zmq_server.receive_json_once(block)
else:
err, data = self.recv_request_server.receive_pyobj_once(block)
err, data = self.zmq_server.receive_pyobj_once(block)
if err is not None:
self.llm_logger.error(f"Engine stops inserting zmq task into scheduler, err:{err}")
llm_logger.error(f"Engine stops inserting zmq task into scheduler, err:{err}")
break

request, insert_task = None, []
@@ -782,16 +607,16 @@ class EngineService:
request = Request.from_dict(data)
start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER)
main_process_metrics.requests_number.inc()
self.llm_logger.debug(f"Receive request: {request}")
llm_logger.debug(f"Receive request: {request}")
except Exception as e:
self.llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")
llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")
err_msg = str(e)
results.append((data["request_id"], err_msg))

if self.guided_decoding_checker is not None and err_msg is None:
request, err_msg = self.guided_decoding_checker.schema_format(request)
if err_msg is not None:
self.llm_logger.error(f"Receive request error: {err_msg}")
llm_logger.error(f"Receive request error: {err_msg}")
results.append((request.request_id, err_msg))

if err_msg is None:
@@ -823,9 +648,9 @@ class EngineService:
)
# Since the request is not in scheduler
# Send result by zmq directly
self.send_response_server.send_response(request_id, [error_result])
self.zmq_server.send_multipart(request_id, [error_result])
except Exception as e:
self.llm_logger.error(
llm_logger.error(
f"Error happened while receiving new request from zmq, details={e}, "
f"traceback={traceback.format_exc()}"
)
@@ -841,10 +666,10 @@ class EngineService:
time.sleep(0.005)
continue
for request_id, contents in results.items():
self.send_response_server.send_response(request_id, contents)
self.zmq_server.send_multipart(request_id, contents)

except Exception as e:
self.llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}")
llm_logger.error(f"Unexcepted error happened: {e}, {traceback.format_exc()!s}")

def split_mode_get_tasks(self):
"""
@@ -857,22 +682,13 @@ class EngineService:

processed_indices = []
for idx, task in enumerate(self.waiting_requests):
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
if self.resource_manager.preallocate_resource_in_d(task):
self.llm_logger.info(f"Resource available, processing task {task.request_id}")
self.split_connector.send_cache_infos([task], -1)
processed_indices.append(idx)
else:
self.llm_logger.debug(f"Still waiting for resources {task.request_id}")
break
if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
self.insert_tasks([task])
llm_logger.info(f"Resource available, processing task {task.request_id}")
processed_indices.append(idx)
else:
if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
self.insert_tasks([task])
self.llm_logger.info(f"Resource available, processing task {task.request_id}")
processed_indices.append(idx)
else:
self.llm_logger.debug(f"Still waiting for resources {task.request_id}")
break
llm_logger.debug(f"Still waiting for resources {task.request_id}")
break

for idx in sorted(processed_indices, reverse=True):
self.waiting_requests.pop(idx)
@@ -894,111 +710,51 @@ class EngineService:
tasks = [tasks]
for task in tasks:
task.finished = False
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
for task in tasks:
if envs.FD_ENABLE_INTERNAL_ADAPTER:
if (
not task.outputs.token_ids
): # first token is eos in Prefill, just recycle resource and continue
cur_task = self.resource_manager.requests[task.request_id]
self.resource_manager.stop_flags[cur_task.idx] = True
self.resource_manager.tasks_list[cur_task.idx] = None
self.resource_manager._free_blocks(cur_task)
if cur_task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.llm_logger.warning(
f"{task.request_id} need not decode after first token"
)
del self.resource_manager.requests[task.request_id]
del self.resource_manager.req_dict[task.request_id]
continue
if task.error_code != 200:
cur_task = self.resource_manager.requests[task.request_id]
self.resource_manager.stop_flags[cur_task.idx] = True
self.resource_manager.tasks_list[cur_task.idx] = None
self.resource_manager._free_blocks(cur_task)
if cur_task.request_id in self.token_processor.tokens_counter:
del self.token_processor.tokens_counter[task.request_id]
self.scheduler.put_results([task])
self.llm_logger.warning(
f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource."
)
continue
self.resource_manager.insert_task_for_decoding(task)
self.insert_tasks(tasks, allocated=True)

if self.cfg.innode_prefill_ports is not None:
self.scheduler.put_results(tasks)

else:
self.insert_tasks(tasks, allocated=True)
if self.cfg.innode_prefill_ports is not None:
self.scheduler.put_results(tasks)
else:
if len(self.waiting_requests):
self.llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}")
llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}")
self.waiting_requests.extend(tasks)
else:
new_waiting = []
for task in tasks:
can_allocate_resource = False
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
if self.resource_manager.preallocate_resource_in_d(task):
self.split_connector.send_cache_infos([task], -1)
can_allocate_resource = True
if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
self.insert_tasks([task])
else:
if self.resource_manager.is_resource_sufficient(
task.prompt_token_ids_len
):
self.insert_tasks([task])
can_allocate_resource = True
if can_allocate_resource is False:
if not self.enable_decode_cache_task:
task.error_msg = "Not enough resources"
new_waiting.append(task)

if new_waiting:
if not self.enable_decode_cache_task:
self.split_connector.send_cache_infos(new_waiting, -1)
else:
self.waiting_requests.extend(new_waiting)
self.llm_logger.info(
f"Added {len(new_waiting)} tasks to waiting queue"
)
self.waiting_requests.extend(new_waiting)
llm_logger.info(f"Added {len(new_waiting)} tasks to waiting queue")

else:
time.sleep(0.001)

except Exception as e:
self.llm_logger.error(f"Error in main loop: {e}")
llm_logger.error(f"Error in main loop: {e}")
time.sleep(0.1)

threading.Thread(target=receiver_loop, daemon=True).start()

def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor):
def start_cache_service(self, device_ids, ipc_signal_suffix):
return self.resource_manager.cache_manager.launch_cache_manager(
cache_config=self.cfg.cache_config,
tensor_parallel_size=self.cfg.parallel_config.tensor_parallel_size,
device_ids=device_ids,
pod_ip=self.cfg.master_ip,
engine_worker_queue_port=int(
self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
self.cfg.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
),
pid_suffix=ipc_signal_suffix,
create_cache_tensor=create_cache_tensor,
)

def check_and_free_block_tables(self):
self.resource_manager.check_and_free_block_tables()

def clear_data(self):
try:
llm_logger.info("Clear Data: Start")
self.token_processor.clear_data()
self.engine_worker_queue.clear_data()
self.zmq_server.req_dict.clear()
llm_logger.info("Clear Data: Successfully")
return True
except Exception as e:
llm_logger.error(f"Clear data error: {e}")
return False

def _exit_sub_services(self):
"""
exit sub services
@@ -1008,15 +764,7 @@ class EngineService:
self.exist_task_signal.clear()
self.exist_swapped_task_signal.clear()
self.worker_healthy_live_signal.clear()
self.cache_ready_signal.clear()
self.swap_space_ready_signal.clear()
self.exist_prefill_task_signal.clear()
self.model_weights_status_signal.clear()
self.prefix_tree_status_signal.clear()
self.kv_cache_status_signal.clear()
if hasattr(self, "send_response_server") and self.send_response_server is not None:
self.send_response_server.close()
if hasattr(self, "recv_request_server") and self.recv_request_server is not None:
self.recv_request_server.close()
if hasattr(self, "recv_control_cmd_server") and self.recv_control_cmd_server is not None:
self.recv_control_cmd_server.close()
if hasattr(self, "zmq_server") and self.zmq_server is not None:
self.zmq_server.close()
@@ -34,7 +34,6 @@ import numpy as np
import paddle
from tqdm import tqdm

from fastdeploy.config import ErnieArchitectures
from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.engine.common_engine import EngineService
from fastdeploy.engine.expert_service import start_data_parallel_service
@@ -116,24 +115,25 @@ class LLMEngine:
start_time = time.time()

self.api_server_pid = api_server_pid
self.ipc_signal_suffix = self.cfg.parallel_config.engine_worker_queue_port[0]
self.ipc_signal_suffix = self.cfg.engine_worker_queue_port[0]
self._init_worker_signals()

self.data_processor = self.input_processor.create_processor()
self.engine.data_processor = self.data_processor
# Launch components: scheduler, cache_manager, expert_service et.al.
self.launch_components()

self.engine.start()
if api_server_pid is not None:
llm_logger.info(f"Start zmq server, api_server_pid: {api_server_pid}")
self.engine.start_zmq_service(api_server_pid)

# If block numer is specified and model is deployed in mixed mode, start cache manager first
if not self.do_profile and self.cfg.scheduler_config.splitwise_role != "mixed":
if self.do_profile == 0 and (
self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed"
):
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, True)
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)

# Start workers
self.worker_proc = self._start_worker_service()
console_logger.info("Waiting for worker processes to be ready...")
console_logger.info("Waiting worker processes ready...")
time.sleep(5)
self.worker_init_status = dict()

@@ -157,22 +157,13 @@ class LLMEngine:
return False
time.sleep(1)

# If block number is not specified, let workers do profiling to determine the block number,
# and then start the cache manager
if self.do_profile:
self._stop_profile()
elif self.cfg.cache_config.enable_prefix_caching:
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, False)

# Launch components: scheduler, cache_manager, expert_service et.al.
if self.cfg.scheduler_config.splitwise_role != "mixed":
self.launch_components()
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
self.launched_cache_manager_signal.value[0] = 1

if api_server_pid is not None:
llm_logger.info(f"Start zmq server, api_server_pid: {api_server_pid}")
self.engine.start_zmq_service(api_server_pid)

# Worker launched
self.check_worker_initialize_status_func_thread.join()
if not result_container["worker_is_alive"]:
@@ -180,24 +171,6 @@ class LLMEngine:
return False

console_logger.info(f"Worker processes are launched with {time.time() - start_time} seconds.")

# Print blocks number & max running requests to console
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
block_size = self.cfg.cache_config.block_size
num_gpu_blocks = self.cfg.cache_config.num_gpu_blocks_override or self.cfg.cache_config.total_block_num
num_cpu_blocks = self.cfg.cache_config.num_cpu_blocks
max_running_requests = min(
(num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.max_model_len,
self.cfg.scheduler_config.max_num_seqs,
)
console_logger.info(
f"Detected {num_gpu_blocks} gpu blocks and {num_cpu_blocks} cpu blocks in cache (block size: {block_size})."
)
console_logger.info(
f"FastDeploy will be serving {max_running_requests} running requests "
f"if each sequence reaches its maximum length: {self.cfg.max_model_len}"
)

return True

def _get_generated_result(self):
@@ -245,9 +218,7 @@ class LLMEngine:
if sampling_params is not None:
request.sampling_params = sampling_params
request.preprocess_start_time = time.time()
chat_template_kwargs = kwargs.get("chat_template_kwargs") or {}
chat_template_kwargs["chat_template"] = kwargs.get("chat_template")
kwargs["chat_template_kwargs"] = chat_template_kwargs

request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
request.prompt_token_ids_len = len(request.prompt_token_ids)
request.need_prefill_tokens = request.prompt_token_ids_len
@@ -259,6 +230,9 @@ class LLMEngine:
request.get("max_tokens"),
),
)
if request.get("reasoning_max_tokens") is None:
default_reasoning_max_tokens = max(int(request.get("max_tokens") * 0.8), 1)
request.set("reasoning_max_tokens", default_reasoning_max_tokens)
min_tokens = request.get("min_tokens")
if input_ids_len + min_tokens >= self.cfg.max_model_len:
error_msg = (
@@ -337,7 +311,7 @@ class LLMEngine:
)

# launched_cache_manager_signal is used to detect whether the engine has started the cache_manager
if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
launched_cache_manager_signal_data = np.zeros([1], dtype=np.int32)
self.launched_cache_manager_signal = IPCSignal(
name="launched_cache_manager_signal",
@@ -452,13 +426,10 @@ class LLMEngine:
}
)

if self.cfg.scheduler_config.splitwise_role != "mixed":
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
variables["FLAGS_use_pd_disaggregation_per_chunk"] = 1
else:
variables["FLAGS_use_pd_disaggregation"] = 1
if self.cfg.splitwise_role != "mixed":
variables["FLAGS_use_pd_disaggregation"] = 1
# TODO dynamic load environment variable
if self.cfg.scheduler_config.splitwise_role == "prefill":
if self.cfg.splitwise_role == "prefill":
variables["FLAGS_fmt_write_cache_completed_signal"] = 1

if self.cfg.model_config.enable_mm:
@@ -492,15 +463,7 @@ class LLMEngine:
else len(self.data_processor.tokenizer.vocab)
)

is_ernie = ErnieArchitectures.contains_ernie_arch(self.cfg.model_config.architectures)
if is_ernie:
self.cfg.model_config.think_end_id = self.data_processor.tokenizer.get_vocab().get("</think>", -1)
if self.cfg.model_config.think_end_id != -1:
llm_logger.info(f"Get think_end_id {self.cfg.model_config.think_end_id} from vocab.")
else:
llm_logger.info("No </think> token found in vocabulary, the model can not do reasoning.")

ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port)
ports = ",".join(self.cfg.engine_worker_queue_port)
ips = None
if self.cfg.ips is not None:
ips = ",".join(self.cfg.ips)
@@ -518,15 +481,14 @@ class LLMEngine:
f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
f" --eos_tokens_lens {self.data_processor.eos_token_id_len}"
f" --pad_token_id {self.data_processor.pad_token_id}"
f" --engine_pid {self.cfg.parallel_config.engine_worker_queue_port[0]}"
f" --engine_pid {self.cfg.engine_worker_queue_port[0]}"
f" --max_num_batched_tokens {self.cfg.scheduler_config.max_num_batched_tokens}"
f" --splitwise_role {self.cfg.scheduler_config.splitwise_role}"
f" --splitwise_role {self.cfg.splitwise_role}"
f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
f" --ori_vocab_size {ori_vocab_size}"
f" --think_end_id {self.cfg.model_config.think_end_id}"
f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
@@ -534,12 +496,8 @@ class LLMEngine:
f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
f" --reasoning_parser {self.cfg.reasoning_parser}"
f" --load_choices {self.cfg.load_config.load_choices}"
f" --plas_attention_config '{self.cfg.plas_attention_config.to_json_string()}'"
f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
f" --ips {ips}"
f" --cache-transfer-protocol {self.cfg.cache_config.cache_transfer_protocol}"
f" --runner {self.cfg.model_config.runner}"
f" --convert {self.cfg.model_config.convert}"
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
)

worker_append_flag = {
@@ -644,11 +602,9 @@ class LLMEngine:
num_gpu_blocks = self.get_profile_block_num_signal.value[0]
self.cfg.cache_config.reset(num_gpu_blocks)
self.engine.resource_manager.reset_cache_config(self.cfg.cache_config)
if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
device_ids = self.cfg.device_ids.split(",")
self.cache_manager_processes = self.engine.start_cache_service(
device_ids, self.ipc_signal_suffix, self.cfg.scheduler_config.splitwise_role != "mixed"
)
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)

def check_health(self, time_interval_threashold=30):
"""
@@ -663,32 +619,24 @@ class LLMEngine:
return True, ""

def launch_components(self):
if self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.splitwise_role != "mixed":
# single-machine logic
self.engine.engine_worker_queue.available_prefill_instances.put(1)
self.splitwise_receive_thread = threading.Thread(
target=self.engine.split_connector.start_receiver, args=()
)
self.splitwise_receive_thread.daemon = True
self.splitwise_receive_thread.start()
self.engine.split_mode_get_tasks()
if self.cfg.scheduler_config.name == "splitwise":
self.splitwise_receive_thread = threading.Thread(
target=self.engine.split_connector.start_receiver, args=()
)
self.splitwise_receive_thread.daemon = True
self.splitwise_receive_thread.start()

self.cfg.init_cache_info()

role = self.cfg.scheduler_config.splitwise_role
role = self.cfg.splitwise_role
host_ip = self.cfg.host_ip
disaggregate = self.cfg.disaggregate_info
request_queues_for_dp_ipc = None
result_queue_for_dp_ipc = None
if self.cfg.scheduler_config.name == "splitwise":
self.engine.scheduler.start(role, host_ip, disaggregate)
elif self.cfg.scheduler_config.name == "dp":
request_queues_for_dp_ipc = []
result_queue_for_dp_ipc = multiprocessing.Queue()
for i in range(self.cfg.parallel_config.data_parallel_size):
request_queues_for_dp_ipc.append(multiprocessing.Queue())
self.engine.scheduler.start(
self.cfg.node_rank * self.cfg.worker_num_per_node, request_queues_for_dp_ipc, result_queue_for_dp_ipc
)

if not envs.FD_ENABLE_MULTI_API_SERVER:
if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
@@ -701,7 +649,7 @@ class LLMEngine:
):
address = (
self.cfg.master_ip,
int(self.cfg.parallel_config.engine_worker_queue_port[i]),
int(self.cfg.engine_worker_queue_port[i]),
)
llm_logger.info(f"dp start queue service {address}")
self.dp_engine_worker_queue_server.append(
@@ -718,9 +666,6 @@ class LLMEngine:
args=(
self.cfg,
i,
None,
request_queues_for_dp_ipc,
result_queue_for_dp_ipc,
),
)
)
@@ -27,7 +27,6 @@ import numpy as np

from fastdeploy.engine.common_engine import EngineService
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.splitwise.internal_adapter_utils import InternalAdapter
from fastdeploy.utils import console_logger, envs, llm_logger


@@ -51,13 +50,13 @@ class ExpertService:
self.cfg = cfg
start_pos = (local_data_parallel_id * self.cfg.parallel_config.tensor_parallel_size) % cfg.worker_num_per_node
end_pos = start_pos + self.cfg.parallel_config.tensor_parallel_size
if cfg.scheduler_config.splitwise_role != "mixed":
if cfg.splitwise_role != "mixed":
self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos]
self.cfg.local_device_ids = self.cfg.device_ids.split(",")[start_pos:end_pos]
llm_logger.info(f"local_data_parallel_id: {local_data_parallel_id}")
self.cfg.disaggregate_info = None

if cfg.scheduler_config.splitwise_role != "mixed":
if cfg.splitwise_role != "mixed":
if len(self.cfg.cache_config.pd_comm_port) == 1:
self.cfg.cache_config.pd_comm_port[0] = (
int(self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id
@@ -70,12 +69,8 @@ class ExpertService:
self.engine.scheduler.reset_nodeid(f"{self.engine.scheduler.infer.nodeid}_{local_data_parallel_id!s}")

self._finalizer = weakref.finalize(self, self._exit_sub_services)
if envs.FD_ENABLE_INTERNAL_ADAPTER:
self.internal_adapter = InternalAdapter(cfg=self.cfg, engine=self.engine, dp_rank=local_data_parallel_id)

def start(
self, ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc=None, result_queue_for_dp_ipc=None
):
def start(self, ipc_signal_suffix, local_data_parallel_id):
"""
Initializes the engine and starts its sub-services.
If `api_server_pid` is defined, will launch a thread
@@ -85,30 +80,25 @@ class ExpertService:

start_time = time.time()
self.engine.start()
if self.cfg.scheduler_config.name == "dp":
self.cfg.init_cache_info()
assert (request_queues_for_dp_ipc is not None) and (result_queue_for_dp_ipc is not None)
self.engine.scheduler.start(local_data_parallel_id, request_queues_for_dp_ipc, result_queue_for_dp_ipc)

if ipc_signal_suffix is not None:
self.api_server_pid = ipc_signal_suffix
self.engine.start_zmq_service(ipc_signal_suffix)
else:
ipc_signal_suffix = self.cfg.parallel_config.engine_worker_queue_port[0]
ipc_signal_suffix = self.cfg.engine_worker_queue_port[0]

llm_logger.info(f"start expert service {local_data_parallel_id}")
if self.cfg.scheduler_config.splitwise_role != "mixed":
ipc_signal_suffix_cache = self.cfg.parallel_config.engine_worker_queue_port[local_data_parallel_id]
self.engine.start_cache_service(self.cfg.local_device_ids, ipc_signal_suffix_cache)
if self.cfg.splitwise_role != "mixed":
self.engine.start_cache_service(self.cfg.local_device_ids, ipc_signal_suffix)
self.engine.split_mode_get_tasks()

if self.cfg.scheduler_config.name == "splitwise":
self.cfg.init_cache_info()
role = self.cfg.scheduler_config.splitwise_role
role = self.cfg.splitwise_role
host_ip = self.cfg.host_ip
disaggregate = self.cfg.disaggregate_info
self.engine.scheduler.start(role, host_ip, disaggregate)

if self.cfg.scheduler_config.splitwise_role != "mixed":
if self.cfg.splitwise_role != "mixed":
self.splitwise_receive_thread = threading.Thread(
target=self.engine.split_connector.start_receiver, args=()
)
@@ -142,6 +132,7 @@ class ExpertService:

if hasattr(self, "cache_manager_processes"):
self.engine.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear()
self.engine.resource_manager.cache_manager.cache_ready_signal.clear()
for p in self.cache_manager_processes:
llm_logger.info(f"Killing cache manager process {p.pid}")
try:
@@ -153,18 +144,14 @@ class ExpertService:
self.zmq_server.close()


def start_data_parallel_service(
cfg, local_data_parallel_id, ipc_signal_suffix=None, request_queues_for_dp_ipc=None, result_queue_for_dp_ipc=None
):
def start_data_parallel_service(cfg, local_data_parallel_id, ipc_signal_suffix=None):
"""
Start expert service
"""
expert_service = ExpertService(cfg, local_data_parallel_id, start_queue=False)

try:
expert_service.start(
ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc, result_queue_for_dp_ipc
)
expert_service.start(ipc_signal_suffix, local_data_parallel_id)

def deamon_thread():
while True:
@@ -172,6 +159,5 @@ def start_data_parallel_service(

t_deamon = threading.Thread(target=deamon_thread, daemon=True)
t_deamon.start()
t_deamon.join()
except Exception as e:
llm_logger.exception(f"Expert service failed to start: {e}, {str(traceback.format_exc())}")
@@ -1,170 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from copy import deepcopy
from typing import TYPE_CHECKING, Annotated, Any, Optional

import msgspec

from fastdeploy.engine.sampling_params import RequestOutputKind
from fastdeploy.engine.tasks import PoolingTask

if TYPE_CHECKING:
from fastdeploy.config import ModelConfig


class PoolingParams:
"""API parameters for pooling models.

Attributes:
normalize: Whether to normalize the embeddings outputs.
dimensions: Reduce the dimensions of embeddings
if model support matryoshka representation.
activation: Whether to apply activation function to
the classification outputs.
softmax: Whether to apply softmax to the reward outputs.
step_tag_id: Step tag ID for process reward models to identify
specific steps in multi-step reasoning tasks.
returned_token_ids: List of token IDs to return rewards for,
used for fine-grained reward calculation.
task: Internal use only. Specifies the pooling task type
("embed" for embeddings, "encode" for reward models).
requires_token_ids: Internal use only. Whether token ID information
is required for processing.
extra_kwargs: Internal use only. Dictionary for storing additional
custom parameters for extended functionality.
output_kind: Output type specification, fixed to FINAL_ONLY
(only final outputs are returned).
"""

truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=-1)]] = None
"""If set to -1, will use the truncation size supported by the model. If
set to an integer k, will use only the last k tokens from the prompt
(i.e., left truncation). If set to `None`, truncation is disabled."""

# for embeddings models
dimensions: Optional[int] = None
normalize: Optional[bool] = None

# for reward models
softmax: Optional[bool] = None
step_tag_id: Optional[int] = None
returned_token_ids: Optional[list[int]] = None

task: Optional[PoolingTask] = None
"""Internal use only."""

requires_token_ids: bool = False
"""Internal use only."""

extra_kwargs: Optional[dict[str, Any]] = None
"""Internal use only."""

output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY

@property
def _all_parameters(self) -> list[str]:
return ["dimensions", "normalize", "softmax", "step_tag_id", "returned_token_ids"]

@property
def valid_parameters(self):
return {
"embed": ["dimensions", "normalize"],
"encode": ["softmax", "step_tag_id", "returned_token_ids"],
}

def clone(self) -> "PoolingParams":
"""Returns a deep copy of the PoolingParams instance."""
return deepcopy(self)

def verify(self, task: PoolingTask, model_config: Optional["ModelConfig"] = None) -> None:

if self.task is None:
self.task = task
elif self.task != task:
msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
raise ValueError(msg)

# NOTE: Task validation needs to done against the model instance,
# which is not available in model config. So, it's not included
# in this method

self._merge_default_parameters(model_config)
self._set_default_parameters(model_config)
self._verify_valid_parameters()

def _merge_default_parameters(self, model_config: Optional["ModelConfig"] = None) -> None:

if model_config is None:
return

pooler_config = model_config.pooler_config
if pooler_config is None:
return

assert self.task is not None, "task must be set"
valid_parameters = self.valid_parameters[self.task]

for k in valid_parameters:
if getattr(pooler_config, k, None) is None:
continue

if getattr(self, k, None) is None:
setattr(self, k, getattr(pooler_config, k))

def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
if self.task == "embed":
if self.normalize is None:
self.normalize = True
elif self.task == "encode":
if self.softmax is None:
self.softmax = True
else:
raise ValueError(f"Unknown pooling task: {self.task}")

def _verify_valid_parameters(self):
assert self.task is not None, "task must be set"
valid_parameters = self.valid_parameters[self.task]
invalid_parameters = []
for k in self._all_parameters:
if k in valid_parameters:
continue

if getattr(self, k, None) is not None:
invalid_parameters.append(k)

if invalid_parameters:
raise ValueError(
f"Task {self.task} only supports {valid_parameters} "
f"parameters, does not support "
f"{invalid_parameters} parameters"
)

def __repr__(self) -> str:
return (
f"PoolingParams("
f"task={self.task}, "
f"normalize={self.normalize}, "
f"dimensions={self.dimensions}, "
f"softmax={self.softmax}, "
f"step_tag_id={self.step_tag_id}, "
f"returned_token_ids={self.returned_token_ids}, "
f"requires_token_ids={self.requires_token_ids}, "
f"extra_kwargs={self.extra_kwargs})"
)

def __post_init__(self) -> None:
assert self.output_kind == RequestOutputKind.FINAL_ONLY, "For pooling output_kind has to be FINAL_ONLY"
@@ -73,7 +73,6 @@ class Request:
guided_json_object: Optional[bool] = None,
enable_thinking: Optional[bool] = True,
trace_carrier: dict = dict(),
dp_rank: Optional[int] = None,
chat_template: Optional[str] = None,
image_start: int = 0,
video_start: int = 0,
@@ -146,8 +145,6 @@ class Request:
# extend block tables
self.use_extend_tables = False
self.extend_block_tables = []
# dp
self.dp_rank = dp_rank

@classmethod
def from_dict(cls, d: dict):
@@ -190,7 +187,6 @@ class Request:
image_end=d.get("image_end", 0),
video_end=d.get("video_end", 0),
audio_end=d.get("audio_end", 0),
dp_rank=d.get("dp_rank", None),
)

@property
@@ -308,7 +304,6 @@ class CompletionOutput:
"index": self.index,
"send_idx": self.send_idx,
"token_ids": self.token_ids,
"decode_type": self.decode_type,
"logprob": self.logprob,
"top_logprobs": self.top_logprobs,
"logprobs": self.logprobs,
Some files were not shown because too many files have changed in this diff.