[Iluvatar GPU] Optimize attention performance and fix moe load ckpt error (#3651)

This commit is contained in:
yzwu
2025-09-22 21:13:59 +08:00
committed by GitHub
parent 5532e8a323
commit 504461b6b5
17 changed files with 1344 additions and 363 deletions

View File

@@ -28,18 +28,22 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git" REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}" FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}" REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting # Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \ -e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c ' ${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..." echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME} rm -rf ${REPO_NAME}
fi fi
' '
git config --global http.proxy "http://61.151.249.150:33128"
git config --global https.proxy "http://61.151.249.150:33128"
git config --global user.name "FastDeployCI" git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com" git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME} git clone --recursive ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }} git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}

View File

@@ -193,11 +193,13 @@ public:
typedef uint8_t data_t;
};
+#ifndef PADDLE_WITH_COREX
template <> class PDTraits<paddle::DataType::FLOAT8_E4M3FN> {
public:
typedef __nv_fp8_e4m3 DataType;
typedef paddle::float8_e4m3fn data_t;
};
+#endif
template <typename T, int Size> struct alignas(sizeof(T) * Size) AlignedVector {
T val[Size];

View File

@@ -0,0 +1,376 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "iluvatar_context.h"
template <paddle::DataType T>
void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& prefill_block_table,
const paddle::Tensor& decode_block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::Tensor& seq_lens,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int prefill_num_tokens,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
int window_left,
int window_right,
float softcap,
bool enable_cuda_graph,
bool use_sqrt_alibi,
paddle::Tensor& out) {
typedef PDTraits<T> traits_;
typedef typename traits_::data_t data_t;
const auto& dtype = qkv.dtype();
cuinferDataType_t cuinfer_data_type;
cudaDataType_t cu_data_type;
if (dtype == paddle::DataType::FLOAT16) {
cuinfer_data_type = CUINFER_DATA_HALF;
cu_data_type = CUDA_R_16F;
} else {
cuinfer_data_type = CUINFER_DATA_BFLOAT16;
cu_data_type = CUDA_R_16BF;
}
const auto& qkv_dims = qkv.dims();
const auto& kv_cache_dims = k_cache.dims();
const auto& prefill_block_table_dims = prefill_block_table.dims();
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
int prefill_batch_size = prefill_block_table_dims[0];
int num_tokens = qkv_dims[0];
int decode_num_tokens = num_tokens - prefill_num_tokens;
int num_total_heads = num_heads + 2 * num_kv_heads;
int max_num_blocks_per_seq = prefill_block_table_dims[1];
int qkv_stride = qkv.strides()[0];
int num_blocks = kv_cache_dims[0];
int kv_block_stride = k_cache.strides()[0];
int kv_head_stride = k_cache.strides()[1];
int block_table_stride = prefill_block_table.strides()[0];
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
cuinferTensorDescriptor_t qkv_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_desc,
cuinfer_data_type,
3,
std::vector<int>({prefill_num_tokens, num_total_heads, head_dim}).data(),
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t qkv_seqlens_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_seqlens_desc,
CUINFER_DATA_INT32,
1,
std::vector<int>({prefill_batch_size + 1}).data(),
std::vector<int>({1}).data()));
cuinferTensorDescriptor_t block_table_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
block_table_desc,
CUINFER_DATA_INT32,
2,
std::vector<int>({prefill_batch_size, block_table_stride}).data(),
std::vector<int>({block_table_stride, 1}).data()));
cuinferTensorDescriptor_t o_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
o_desc,
cuinfer_data_type,
3,
std::vector<int>({prefill_num_tokens, num_heads, head_dim}).data(),
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t k_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
k_cache_desc,
cuinfer_data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t v_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
v_cache_desc,
cuinfer_data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t cos_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
cos_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferTensorDescriptor_t sin_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
sin_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t prefill_workspace_size = 0;
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(prefill_num_tokens,
num_heads,
num_kv_heads,
head_dim,
q_rope,
k_rope,
v_rope,
cuinfer_data_type,
cuinfer_data_type,
cuinfer_data_type,
&prefill_workspace_size));
auto* allocator = paddle::GetAllocator(qkv.place());
phi::Allocator::AllocationPtr prefill_tmp_workspace = allocator->Allocate(prefill_workspace_size);
void* prefill_workspace_ptr = prefill_tmp_workspace->ptr();
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
qkv_desc,
qkv.data(),
qkv_seqlens_desc,
cu_seqlens_qkv.data<int32_t>(),
block_table_desc,
prefill_block_table.data<int32_t>(),
o_desc,
out.data(),
k_cache_desc,
k_cache.data(),
v_cache_desc,
v_cache.data(),
prefill_workspace_ptr,
prefill_workspace_size,
cos_desc,
rope_cos_ptr,
sin_desc,
rope_sin_ptr,
prefill_batch_size,
num_heads,
num_kv_heads,
head_dim,
causal,
scale,
q_rope,
k_rope,
v_rope));
size_t decode_workspace_size = 0;
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(decode_num_tokens,
num_heads,
num_kv_heads,
head_dim,
block_size,
max_seq_len,
&decode_workspace_size));
phi::Allocator::AllocationPtr decode_tmp_workspace = allocator->Allocate(decode_workspace_size);
void* decode_workspace_ptr = decode_tmp_workspace->ptr();
void* decode_qkv_ptr = (void*)(qkv.data<data_t>() + prefill_num_tokens * qkv_stride);
void* decode_out_ptr = (void*)(out.data<data_t>() + prefill_num_tokens * out.strides()[0]);
PageAttentionWithKVCacheArguments args{
static_cast<float>(scale), 1.0, 1.0, static_cast<float>(softcap), window_left, window_right,
causal, use_sqrt_alibi, enable_cuda_graph, false, nullptr, decode_qkv_ptr, decode_qkv_ptr,
decode_workspace_ptr, true, rope_sin_ptr, rope_cos_ptr};
CUINFER_CHECK(cuInferPageAttentionV7(cuinfer_handle,
decode_out_ptr,
cu_data_type,
decode_qkv_ptr,
cu_data_type,
decode_num_tokens,
num_heads,
num_kv_heads,
head_dim,
qkv_stride,
kv_block_stride,
kv_head_stride,
k_cache.data(),
cu_data_type,
v_cache.data(),
cu_data_type,
block_size,
max_num_blocks_per_seq,
max_seq_len,
decode_block_table.data<int32_t>(),
seq_lens.data<int32_t>(),
args));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}
std::vector<paddle::Tensor> MixedFusedPagedAttn(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& prefill_block_table,
const paddle::Tensor& decode_block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::Tensor& seq_lens,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int prefill_num_tokens,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
int window_left,
int window_right,
float softcap,
bool enable_cuda_graph,
bool use_sqrt_alibi) {
const auto dtype = qkv.dtype();
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
MixedFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
k_cache,
v_cache,
prefill_block_table,
decode_block_table,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
enable_cuda_graph,
use_sqrt_alibi,
out);
break;
case paddle::DataType::FLOAT16:
MixedFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
k_cache,
v_cache,
prefill_block_table,
decode_block_table,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
enable_cuda_graph,
use_sqrt_alibi,
out);
break;
default:
PD_THROW("Unsupported data type for mixed paged attn");
}
return {out};
}
std::vector<std::vector<int64_t>> MixedFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
int num_heads,
int head_dim) {
return {{qkv_shape[0], num_heads * head_dim}};
}
std::vector<paddle::DataType> MixedFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
return {qkv_dtype};
}
PD_BUILD_STATIC_OP(mixed_fused_paged_attn)
.Inputs({"qkv", "k_cache", "v_cache", "prefill_block_table", "decode_block_table",
"cu_seqlens_qkv", "seq_lens", paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
.Outputs({"out"})
.Attrs({"prefill_num_tokens:int",
"num_heads: int",
"head_dim:int",
"num_kv_heads:int",
"block_size:int",
"max_seq_len:int",
"scale:float",
"causal:bool",
"q_rope:bool",
"k_rope:bool",
"v_rope:bool",
"window_left:int",
"window_right:int",
"softcap:float",
"enable_cuda_graph:bool",
"use_sqrt_alibi:bool"})
.SetKernelFn(PD_KERNEL(MixedFusedPagedAttn))
.SetInferShapeFn(PD_INFER_SHAPE(MixedFusedPagedAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MixedFusedPagedAttnInferDtype));
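For orientation, the mixed op expects the prefill rows and the decode rows to be packed into a single `qkv` tensor with all prefill tokens first; `prefill_num_tokens` marks the split (the kernel offsets the decode pointer by `prefill_num_tokens * qkv_stride`), and the output comes back flattened as `[num_tokens, num_heads * head_dim]`. A minimal packing sketch follows; the sizes and dummy data are illustrative assumptions, not taken from any model:

```python
import paddle

# Illustrative sizes only; real values come from the model config.
num_heads, num_kv_heads, head_dim = 8, 8, 128
qkv_width = (num_heads + 2 * num_kv_heads) * head_dim

# 32 prefill tokens followed by 4 decode tokens (one per decoding sequence).
prefill_qkv = paddle.randn([32, qkv_width]).astype("float16")
decode_qkv = paddle.randn([4, qkv_width]).astype("float16")

# Prefill rows first, decode rows after them: this is the layout the mixed
# kernel assumes when it advances the decode pointer past the prefill tokens.
qkv = paddle.concat([prefill_qkv, decode_qkv], axis=0)
prefill_num_tokens = prefill_qkv.shape[0]  # passed as the first attribute

# The op's output shape is [num_tokens, num_heads * head_dim], per InferShape above.
```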

View File

@@ -53,6 +53,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
const paddle::optional<paddle::Tensor>& gating_correction_bias,
const int moe_topk,
const bool group_moe,
+const std::string &moe_quant_type,
const bool topk_only_mode,
const int num_rows,
const int hidden_size,
@@ -183,6 +184,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
const paddle::optional<paddle::Tensor>& w4a8_in_scale,
const int moe_topk,
const bool group_moe,
+const std::string &moe_quant_type,
const bool topk_only_mode) {
const auto input_type = input.dtype();
auto place = input.place();
@@ -220,6 +222,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
gating_correction_bias,
moe_topk,
group_moe,
+moe_quant_type,
topk_only_mode,
num_rows,
hidden_size,
@@ -236,6 +239,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
gating_correction_bias,
moe_topk,
group_moe,
+moe_quant_type,
topk_only_mode,
num_rows,
hidden_size,
@@ -305,7 +309,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch)
"top_k_weight",
"top_k_indices",
"expert_idx_per_token"})
-.Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
+.Attrs({"moe_topk:int", "group_moe:bool", "moe_quant_type:std::string", "topk_only_mode:bool"})
.SetKernelFn(PD_KERNEL(MoeExpertDispatch))
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));

View File

@@ -27,6 +27,8 @@ void PagedAttnKernel(const paddle::Tensor& q,
const paddle::optional<paddle::Tensor> &v,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
+int num_heads,
+int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -86,32 +88,36 @@ void PagedAttnKernel(const paddle::Tensor& q,
common::errors::InvalidArgument(
"paged_attention expects seq_lens is contiguous"));
// check dim and shape
-// k_cache: [num_blocks, kv_num_heads, block_size, head_size]
-// v_cache: [num_blocks, kv_num_heads, block_size, head_size]
+// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
+// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// block_table: [num_seqs, max_num_blocks_per_seq]
// seq_lens: [num_seqs]
// q and out:
-// merged_qkv = false: [num_seqs, num_heads, head_size]
-// merged_qkv = true: [num_seqs, num_heads+2*num_kv_heads, head_size]
+// if merged_qkv = false:
+//   q: [num_seqs, hidden_size]
+//   out: [num_seqs, hidden_size]
+// if merged_qkv = true:
+//   q: [num_seqs, (num_heads+2*num_kv_heads)*head_dim]
+//   out: [num_seqs, hidden_size]
const auto& q_dims = q.dims();
PADDLE_ENFORCE_EQ(q_dims.size(),
-3,
+2,
common::errors::InvalidArgument(
"paged_attn receive query dims is "
-"[num_seqs, num_heads, head_size]"));
+"[num_seqs, (num_heads+2*num_kv_heads)*head_dim]"));
PADDLE_ENFORCE_EQ(out.dims().size(),
-3,
+2,
common::errors::InvalidArgument(
"paged_attn receive out dims is "
-"[num_seqs, num_heads, head_size]"));
+"[num_seqs, hidden_size]"));
const auto& kv_cache_dims = k_cache.dims();
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
4,
common::errors::InvalidArgument(
"paged_attn receive kv cache dims is "
-"[num_blocks, kv_num_heads, block_size, head_size]"));
+"[num_blocks, kv_num_heads, block_size, head_dim]"));
const auto& block_table_dims = block_table.dims();
PADDLE_ENFORCE_EQ(block_table_dims.size(),
@@ -127,8 +133,6 @@ void PagedAttnKernel(const paddle::Tensor& q,
"paged_attn receive seq_lens dims is [num_seqs]"));
int num_seqs = q_dims[0];
-int num_heads = merged_qkv ? q_dims[1] - 2 * num_kv_heads : q_dims[1];
-int head_size = q_dims[2];
int max_num_blocks_per_seq = block_table_dims[1];
int q_stride = q.strides()[0];
int num_blocks = kv_cache_dims[0];
@@ -142,9 +146,9 @@ void PagedAttnKernel(const paddle::Tensor& q,
common::errors::InvalidArgument(
"kv_cache_dims[2] must be equal to block_size"));
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
-head_size,
+head_dim,
common::errors::InvalidArgument(
-"kv_cache_dims[3] must be equal to head_size"));
+"kv_cache_dims[3] must be equal to head_dim"));
PADDLE_ENFORCE_EQ(block_table_dims[0],
num_seqs,
common::errors::InvalidArgument(
@@ -162,14 +166,13 @@ void PagedAttnKernel(const paddle::Tensor& q,
const float *rope_sin_ptr = merged_qkv ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = merged_qkv ? rope_cos.get().data<float>() : nullptr;
-auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(q.place()));
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t workspace_size = 0;
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(num_seqs,
num_heads,
num_kv_heads,
-head_size,
+head_dim,
block_size,
max_context_len,
&workspace_size));
@@ -189,7 +192,7 @@ void PagedAttnKernel(const paddle::Tensor& q,
num_seqs,
num_heads,
num_kv_heads,
-head_size,
+head_dim,
q_stride,
kv_block_stride,
kv_head_stride,
@@ -215,6 +218,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
const paddle::optional<paddle::Tensor> &v,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
+int num_heads,
+int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -228,11 +233,7 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
bool merged_qkv) {
const auto dtype = q.dtype();
-auto out_shape = q.shape();
-if (merged_qkv) {
-  out_shape[1] -= 2 * num_kv_heads;
-}
-auto out = paddle::empty(out_shape, dtype, q.place());
+auto out = paddle::empty({q.shape()[0], num_heads * head_dim}, dtype, q.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
@@ -246,6 +247,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
v,
rope_sin,
rope_cos,
+num_heads,
+head_dim,
num_kv_heads,
scale,
block_size,
@@ -270,6 +273,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
v,
rope_sin,
rope_cos,
+num_heads,
+head_dim,
num_kv_heads,
scale,
block_size,
@@ -299,6 +304,8 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
const std::vector<int64_t>& v_shape,
const std::vector<int64_t>& rope_sin_shape,
const std::vector<int64_t>& rope_cos_shape,
+int num_heads,
+int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -311,36 +318,13 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
bool use_sqrt_alibi,
bool merged_qkv) {
if (merged_qkv) {
-int64_t num_tokens = q_shape[0];
-int64_t num_heads = q_shape[1] - 2 * num_kv_heads;
-int64_t head_dim = q_shape[2];
-return {{num_tokens, num_heads, head_dim}};
+return {{q_shape[0], num_heads * head_dim}};
} else {
return {q_shape};
}
}
-std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype,
-const paddle::DataType& k_cache_dtype,
-const paddle::DataType& v_cache_dtype,
-const paddle::DataType& block_table_dtype,
-const paddle::DataType& seq_lens_dtype,
-const paddle::DataType& alibi_slopes_dtype,
-const paddle::DataType& k_dtype,
-const paddle::DataType& v_dtype,
-const paddle::DataType& rope_sin_dtype,
-const paddle::DataType& rope_cos_dtype,
-int num_kv_heads,
-float scale,
-int block_size,
-int max_context_len,
-bool causal,
-int window_left,
-int window_right,
-float softcap,
-bool enable_cuda_graph,
-bool use_sqrt_alibi,
-bool merged_qkv) {
+std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype) {
return {q_dtype};
}
@@ -351,7 +335,9 @@ PD_BUILD_STATIC_OP(paged_attn)
paddle::Optional("v"), paddle::Optional("rope_sin"), paddle::Optional("v"), paddle::Optional("rope_sin"),
paddle::Optional("rope_cos")}) paddle::Optional("rope_cos")})
.Outputs({"out"}) .Outputs({"out"})
.Attrs({"num_kv_heads:int", .Attrs({"num_heads:int",
"head_dim:int",
"num_kv_heads:int",
"scale:float", "scale:float",
"block_size:int", "block_size:int",
"max_context_len:int", "max_context_len:int",

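One practical effect of this change is the output layout: `paged_attn` used to return a 3-D `[num_seqs, num_heads, head_dim]` tensor (with InferShape subtracting the KV heads when `merged_qkv` was set), whereas it now returns the flattened `[num_seqs, num_heads * head_dim]` layout directly. A small sketch of what callers no longer need to do (shapes are illustrative):

```python
import paddle

num_seqs, num_heads, head_dim = 4, 8, 128

# Old layout: a 3-D output that downstream layers flattened themselves.
out_old = paddle.randn([num_seqs, num_heads, head_dim])
hidden = out_old.reshape([num_seqs, num_heads * head_dim])

# New layout: the op already returns the flattened tensor, so the reshape
# above disappears from the calling code.
assert hidden.shape == [num_seqs, num_heads * head_dim]
```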
View File

@@ -0,0 +1,378 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "iluvatar_context.h"
template <paddle::DataType T>
void PrefillFusedPagedAttnKernel(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
paddle::Tensor& out) {
// check dtype and contiguous
const auto& dtype = qkv.dtype();
cuinferDataType_t data_type;
if (dtype == paddle::DataType::FLOAT16) {
data_type = CUINFER_DATA_HALF;
} else if (dtype == paddle::DataType::BFLOAT16) {
data_type = CUINFER_DATA_BFLOAT16;
} else {
common::errors::InvalidArgument("paged_attention support half and bfloat16 now");
}
PADDLE_ENFORCE_EQ(k_cache.dtype(),
dtype,
common::errors::InvalidArgument(
"k_cache dtype must be the same as query dtype"));
PADDLE_ENFORCE_EQ(k_cache.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects k_cache is contiguous"));
PADDLE_ENFORCE_EQ(block_table.dtype(),
paddle::DataType::INT32,
common::errors::InvalidArgument(
"block_table dtype must be int32"));
PADDLE_ENFORCE_EQ(block_table.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects block_table is contiguous"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.dtype(),
paddle::DataType::INT32,
common::errors::InvalidArgument(
"cu_seqlens_qkv dtype must be int32"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects cu_seqlens_qkv is contiguous"));
// check dim and shape
// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// block_table: [batch_size, max_num_blocks_per_seq]
// seq_lens: [batch_size]
// qkv: [num_tokens, (num_heads+2*num_kv_heads)*head_dim]
// out: [num_tokens, hidden_size]
const auto& qkv_dims = qkv.dims();
PADDLE_ENFORCE_EQ(qkv_dims.size(),
2,
common::errors::InvalidArgument(
"paged_attn receive query dims is "
"[num_tokens, (num_heads+2*num_kv_heads)*head_dim]"));
PADDLE_ENFORCE_EQ(out.dims().size(),
2,
common::errors::InvalidArgument(
"paged_attn receive out dims is "
"[num_tokens, hidden_size]"));
const auto& kv_cache_dims = k_cache.dims();
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
4,
common::errors::InvalidArgument(
"paged_attn receive kv cache dims is "
"[num_blocks, kv_num_heads, block_size, head_dim]"));
const auto& block_table_dims = block_table.dims();
PADDLE_ENFORCE_EQ(block_table_dims.size(),
2,
common::errors::InvalidArgument(
"paged_attn receive block_table dims is "
"[batch_size, max_num_blocks_per_seq]"));
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims.size(),
1,
common::errors::InvalidArgument(
"paged_attn receive cu_seqlens_qkv dims is [batch_size]"));
int batch_size = block_table_dims[0];
int num_tokens = qkv_dims[0];
int num_total_heads = num_heads + 2 * num_kv_heads;
int qkv_stride = qkv.strides()[0];
int num_blocks = kv_cache_dims[0];
PADDLE_ENFORCE_EQ(kv_cache_dims[1],
num_kv_heads,
common::errors::InvalidArgument(
"kv_cache_dims[1] must be equal to num_kv_head"));
PADDLE_ENFORCE_EQ(kv_cache_dims[2],
block_size,
common::errors::InvalidArgument(
"kv_cache_dims[2] must be equal to block_size"));
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
head_dim,
common::errors::InvalidArgument(
"kv_cache_dims[3] must be equal to head_dim"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims[0],
batch_size + 1,
common::errors::InvalidArgument(
"cu_seqlens_qkv_dims[0] must be equal to batch_size + 1"));
int block_table_stride = block_table.strides()[0];
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t workspace_size = 0;
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(num_tokens,
num_heads,
num_kv_heads,
head_dim,
q_rope,
k_rope,
v_rope,
data_type,
data_type,
data_type,
&workspace_size));
auto* allocator = paddle::GetAllocator(qkv.place());
phi::Allocator::AllocationPtr tmp_workspace = allocator->Allocate(workspace_size);
void* workspace_ptr = tmp_workspace->ptr();
cuinferTensorDescriptor_t qkv_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_desc,
data_type,
3,
std::vector<int>({num_tokens, num_total_heads, head_dim}).data(),
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t qkv_seqlens_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_seqlens_desc,
CUINFER_DATA_INT32,
1,
std::vector<int>({batch_size + 1}).data(),
std::vector<int>({1}).data()));
cuinferTensorDescriptor_t block_table_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
block_table_desc,
CUINFER_DATA_INT32,
2,
std::vector<int>({batch_size, block_table_stride}).data(),
std::vector<int>({block_table_stride, 1}).data()));
cuinferTensorDescriptor_t o_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
o_desc,
data_type,
3,
std::vector<int>({num_tokens, num_heads, head_dim}).data(),
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t k_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
k_cache_desc,
data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t v_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
v_cache_desc,
data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t cos_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
cos_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferTensorDescriptor_t sin_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
sin_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
qkv_desc,
qkv.data(),
qkv_seqlens_desc,
cu_seqlens_qkv.data<int32_t>(),
block_table_desc,
block_table.data<int32_t>(),
o_desc,
out.data(),
k_cache_desc,
k_cache.data(),
v_cache_desc,
v_cache.data(),
workspace_ptr,
workspace_size,
cos_desc,
rope_cos_ptr,
sin_desc,
rope_sin_ptr,
batch_size,
num_heads,
num_kv_heads,
head_dim,
causal,
scale,
q_rope,
k_rope,
v_rope));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}
std::vector<paddle::Tensor> PrefillFusedPagedAttn(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope) {
const auto dtype = qkv.dtype();
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
PrefillFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
k_cache,
v_cache,
block_table,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
out);
break;
case paddle::DataType::FLOAT16:
PrefillFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
k_cache,
v_cache,
block_table,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
out);
break;
default:
PD_THROW("Unsupported data type for Paged attn");
}
return {out};
}
std::vector<std::vector<int64_t>> PrefillFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
const std::vector<int64_t>& k_cache_shape,
const std::vector<int64_t>& v_cache_shape,
const std::vector<int64_t>& block_table_shape,
const std::vector<int64_t>& cu_seqlens_qkv_shape,
const std::vector<int64_t>& rope_sin_shape,
const std::vector<int64_t>& rope_cos_shape,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope) {
return {{qkv_shape[0], num_heads * head_dim}};
}
std::vector<paddle::DataType> PrefillFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
return {qkv_dtype};
}
PD_BUILD_STATIC_OP(prefill_fused_paged_attn)
.Inputs({"qkv", "k_cache", "v_cache", "block_table", "cu_seqlens_qkv",
paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
.Outputs({"out"})
.Attrs({"num_heads:int",
"head_dim:int",
"num_kv_heads:int",
"block_size:int",
"max_seq_len:int",
"scale:float",
"causal:bool",
"q_rope:bool",
"k_rope:bool",
"v_rope:bool"})
.SetKernelFn(PD_KERNEL(PrefillFusedPagedAttn))
.SetInferShapeFn(PD_INFER_SHAPE(PrefillFusedPagedAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(PrefillFusedPagedAttnInferDtype));
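A hedged end-to-end sketch of calling the new op from Python, assuming the generated binding takes the declared inputs followed by the attributes in declaration order (as Paddle custom ops normally expose them); the shapes, block-table contents, and rope tables below are illustrative, and the call only works where the iluvatar ops are actually built:

```python
import paddle
from fastdeploy.model_executor.ops.iluvatar import prefill_fused_paged_attention

# Illustrative configuration.
num_heads, num_kv_heads, head_dim = 8, 8, 128
block_size, max_seq_len, num_blocks = 16, 512, 64
seq_len = 32  # a single prefill sequence

qkv = paddle.randn([seq_len, (num_heads + 2 * num_kv_heads) * head_dim]).astype("float16")
k_cache = paddle.zeros([num_blocks, num_kv_heads, block_size, head_dim], dtype="float16")
v_cache = paddle.zeros_like(k_cache)
block_table = paddle.arange(4, dtype="int32").reshape([1, 4])   # enough blocks for 32 tokens
cu_seqlens_qkv = paddle.to_tensor([0, seq_len], dtype="int32")  # [batch_size + 1]
rope_cos = paddle.zeros([max_seq_len, head_dim], dtype="float32")
rope_sin = paddle.zeros([max_seq_len, head_dim], dtype="float32")

out = prefill_fused_paged_attention(
    qkv, k_cache, v_cache, block_table, cu_seqlens_qkv, rope_sin, rope_cos,
    num_heads, head_dim, num_kv_heads, block_size, max_seq_len,
    1.0 / head_dim**0.5,  # scale
    True,                 # causal
    True, True, False,    # q_rope, k_rope, v_rope
)
# out: [seq_len, num_heads * head_dim], with k_cache / v_cache filled in place.
```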

View File

@@ -536,6 +536,8 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
"iluvatar_ops/moe_dispatch.cu", "iluvatar_ops/moe_dispatch.cu",
"iluvatar_ops/moe_reduce.cu", "iluvatar_ops/moe_reduce.cu",
"iluvatar_ops/paged_attn.cu", "iluvatar_ops/paged_attn.cu",
"iluvatar_ops/prefill_fused_attn.cu",
"iluvatar_ops/mixed_fused_attn.cu",
"iluvatar_ops/w8a16_group_gemm.cu", "iluvatar_ops/w8a16_group_gemm.cu",
"iluvatar_ops/runtime/iluvatar_context.cc", "iluvatar_ops/runtime/iluvatar_context.cc",
], ],
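After rebuilding the wheel with these two extra sources, a quick import check (a sketch; it assumes the build succeeded on an iluvatar device) confirms the new kernels are exposed alongside the existing one:

```python
# The import path matches the one used by the attention backend later in this commit.
from fastdeploy.model_executor.ops.iluvatar import (
    mixed_fused_paged_attention,
    paged_attention,
    prefill_fused_paged_attention,
)

print(mixed_fused_paged_attention, paged_attention, prefill_fused_paged_attention)
```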

View File

@@ -1,5 +1,4 @@
# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine
-The current version of the software merely serves as a demonstration demo for the Iluvatar CoreX combined with the Fastdeploy inference framework for large models. Running the latest ERNIE4.5 300B model on the GSM8K dataset takes about 6.3 hours.
## Machine Preparation
First, the `TP=16` when running the ERNIE4.5 300B model and so you need to prepare a machine with the following configurations:
@@ -30,7 +29,7 @@ docker exec -it paddle_infer bash
### Install paddle
```bash
-pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
@@ -78,7 +77,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# load the model
-llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, block_size=16, quantization='wint8')
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, block_size=16, quantization='wint8')
# Perform batch inference
outputs = llm.generate(prompts, sampling_params)
@@ -390,7 +389,7 @@ export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
-python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --static-decode-blocks 0 --quantization wint8
+python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
```
4. Running the Script
@@ -403,10 +402,10 @@ After the service is ready, open another terminal and run:
```bash
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
```
-It takes about 6.3 hours to run the GSM8K dataset.
+It takes about 4.8 hours to run the GSM8K dataset.
```
-Accuracy: 0.964
+Accuracy: 0.962
Invaild: 0.000
-Latency: 22918.186 s
+Latency: 17332.728 s
```
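Before kicking off the full benchmark, it can help to smoke-test the server with a single request; the payload below mirrors what `bench_gsm8k.py` sends (the host and port are assumptions matching the examples above):

```python
import json

import requests

url = "http://127.0.0.1:8188/v1/chat/completions"
data = {
    "messages": [{"role": "user", "content": "Question: What is 12 * 7?\nAnswer:"}],
    "temperature": 0.6,
    "max_tokens": 128,
    "top_p": 0.95,
    "do_sample": True,
}
# Same request shape as bench_gsm8k.py's call_generate helper.
response = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(data))
print(response.json()["choices"][0]["message"]["content"])
```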

View File

@@ -1,12 +1,11 @@
# How to run ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B on iluvatar machines
-The current version of this software only serves as a demonstration of Iluvatar CoreX combined with the FastDeploy inference framework for large models. Running the latest ERNIE4.5 300B model on the GSM8K dataset takes about 6.3 hours.
## Machine Preparation
-First, you need to prepare a machine with the following configuration:
+First, running the ERNIE4.5 300B model requires `TP=16`, so you need to prepare a machine with the following configuration:
| CPU | Memory | Iluvatar GPU | Disk |
|-----|------|-----|-----|
-| x86 | 1TB| 8xBI150| 1TB|
+| x86 | 1TB| 16xBI150| 1TB|
Currently the full model has to be loaded into host memory, which requires more than 600GB of host memory; later versions will optimize this.
@@ -30,7 +29,7 @@ docker exec -it paddle_infer bash
### Install paddle
```bash
-pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
For the latest paddle version on iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
@@ -77,7 +76,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# Load the model
-llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, quantization='wint8')
# Perform batch inference (the LLM queues requests internally and handles them with dynamic insertion based on available resources)
outputs = llm.generate(prompts, sampling_params)
@@ -132,3 +131,281 @@ Now, let's break down each step:
**Step 3: Drawing the
The largest ocean is the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
```
## Run the ernie4.5 300B model on the GSM8K dataset
1. Download the GSM8K dataset
```bash
wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
```
2. Prepare `bench_gsm8k.py`
```python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fastdeploy + ERNIE-4.5-Turbo 的指标评估 """
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
import argparse
import ast
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import requests
from tqdm import tqdm
INVALID = -9999999
def call_generate(prompt, **kwargs):
"""
Generates response based on the input prompt.
Args:
prompt (str): The input prompt text.
**kwargs: Keyword arguments, including server IP address and port number.
Returns:
str: The response generated based on the prompt.
"""
url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
"messages": [
{
"role": "user",
"content": prompt,
}
],
"temperature": 0.6,
"max_tokens": 2047,
"top_p": 0.95,
"do_sample": True,
}
response = requests.post(url, headers=headers, data=json.dumps(data))
out = response.json()
return out["choices"][0]["message"]["content"]
def get_one_example(lines, i, include_answer):
"""
Retrieves a question-answer example from the given list of text lines.
Args:
lines (list of dict): A list of question-answer pairs.
i (int): The index of the question-answer pair to retrieve from lines.
include_answer (bool): Whether to include the answer in the returned string.
Returns:
str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
"""
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
"""
Selects k examples from the given list of text lines and concatenates them into a single string.
Args:
lines (list): A list containing text lines.
k (int): The number of examples to select.
Returns:
str: A string composed of k examples, separated by two newline characters.
"""
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def get_answer_value(answer_str):
"""
Extracts numerical values from an answer string and returns them.
Args:
answer_str (str): The string containing the answer.
Returns:
The extracted numerical value; returns "INVALID" if extraction fails.
"""
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def read_jsonl(filename: str):
"""
Reads a JSONL file.
Args:
filename (str): Path to the JSONL file.
Yields:
dict: A dictionary object corresponding to each line in the JSONL file.
"""
with open(filename) as fin:
for line in fin:
if line.startswith("#"):
continue
yield json.loads(line)
def main(args):
"""
Process inputs and generate answers by calling the model in parallel using a thread pool.
Args:
args (argparse.Namespace):
- num_questions (int): Number of questions to process.
- num_shots (int): Number of few-shot learning examples.
- ip (str): IP address of the model service.
- port (int): Port number of the model service.
- parallel (int): Number of questions to process in parallel.
- result_file (str): File path to store the results.
Returns:
None
"""
# Read data
filename = "test.jsonl"
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
states = [None] * len(labels)
# Use thread pool
def get_one_answer(i):
answer = call_generate(
prompt=few_shot_examples + questions[i],
# stop=["Question", "Assistant:", "<|separator|>"],
ip=args.ip,
port=args.port,
)
states[i] = answer
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
latency = time.time() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": "paddlepaddle",
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1")
parser.add_argument("--port", type=str, default="8188")
parser.add_argument("--num-shots", type=int, default=10)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=1319)
parser.add_argument("--result-file", type=str, default="result.jsonl")
parser.add_argument("--parallel", type=int, default=1)
args = parser.parse_args()
main(args)
```
3. Prepare `run_bench.sh`
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
```
4. Running the script
First, open a terminal and start the server:
```bash
./run_bench.sh
```
After the service is ready, open another terminal and run the client command:
```bash
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
```
Running inference on the whole GSM8K dataset takes about 4.8 hours.
```
Accuracy: 0.962
Invaild: 0.000
Latency: 17332.728 s
```

View File

@@ -1186,9 +1186,7 @@ class CacheConfig:
self.kv_cache_ratio = 1.0
else:
self.kv_cache_ratio = 0.75
-self.enc_dec_block_num = (
-    0 if current_platform.is_iluvatar() or current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
-)
+self.enc_dec_block_num = 0 if current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
self.prealloc_dec_block_slot_num_threshold = 12
self.cache_dtype = "bfloat16"
self.model_cfg = None
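With this change, iluvatar stops forcing `enc_dec_block_num` to 0 and follows the `FD_ENC_DEC_BLOCK_NUM` setting like other GPU platforms; only MACA still pins it to 0. A tiny sketch of the resulting behaviour (the fallback default of 2 is an assumption for illustration; the real default lives in `fastdeploy.envs`):

```python
import os

def resolve_enc_dec_block_num(is_maca: bool) -> int:
    # Mirrors the new expression above; only the MACA platform still gets 0.
    # The "2" fallback is assumed here, not taken from fastdeploy.envs.
    return 0 if is_maca else int(os.getenv("FD_ENC_DEC_BLOCK_NUM", "2"))

print(resolve_enc_dec_block_num(is_maca=False))  # iluvatar now uses the env-configured value
print(resolve_enc_dec_block_num(is_maca=True))   # 0
```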

View File

@@ -16,13 +16,11 @@
from __future__ import annotations
-import os
from dataclasses import dataclass
from math import sqrt
from typing import TYPE_CHECKING, Optional
import paddle
-from paddle.nn.functional.flash_attention import flash_attn_unpadded
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention.attention import Attention
@@ -30,7 +28,11 @@ from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend,
AttentionMetadata,
)
-from fastdeploy.model_executor.ops.iluvatar import paged_attention
+from fastdeploy.model_executor.ops.iluvatar import (
+    mixed_fused_paged_attention,
+    paged_attention,
+    prefill_fused_paged_attention,
+)
if TYPE_CHECKING:
from fastdeploy.model_executor.forward_meta import ForwardMeta
@@ -42,26 +44,7 @@ class IluvatarAttentionMetadata(AttentionMetadata):
IluvatarAttentionMetadata
"""
-# flash_attn metadata
-cu_seqlens_q: Optional[paddle.Tensor] = None
-cu_seqlens_k: Optional[paddle.Tensor] = None
-fixed_seed_offset: Optional[paddle.Tensor] = None
-attn_mask: Optional[paddle.Tensor] = None
-attn_mask_start_row_indices: Optional[paddle.Tensor] = None
-dropout: float = 0.0
-causal: bool = True
-return_softmax: bool = False
-rng_name: str = ""
-# paged_attn metadata
-block_tables: Optional[paddle.Tensor] = None
-seq_lens: Optional[paddle.Tensor] = None
-num_kv_heads: int = 1
-scale: float = 1.0
-block_size: int = 1
-max_context_len: int = 1
alibi_slopes: Optional[paddle.Tensor] = None
-# causal: bool = True
window_left: int = -1
window_right: int = -1
softcap: float = 0.0
@@ -88,55 +71,44 @@ class IluvatarAttnBackend(AttentionBackend):
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_dim: int):
super().__init__()
self.attention_metadata = IluvatarAttentionMetadata()
-self.attention_metadata.block_size = fd_config.parallel_config.block_size
-assert (
-    fd_config.parallel_config.enc_dec_block_num == 0
-), f"Iluvatar does not support yet, {fd_config.parallel_config.enc_dec_block_num}"
-assert self.attention_metadata.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
-self.attention_metadata.max_context_len = fd_config.parallel_config.max_model_len
-self.attention_metadata.causal = getattr(fd_config.model_config, "causal", True)
+self.block_size = fd_config.parallel_config.block_size
+assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
+self.max_context_len = fd_config.parallel_config.max_model_len
+self.causal = getattr(fd_config.model_config, "causal", True)
self.speculate_method = getattr(fd_config.parallel_config, "speculate_method", None)
self.use_speculate = self.speculate_method is not None
-self.attention_metadata.num_kv_heads = kv_num_heads
-self.attention_metadata.dropout = fd_config.model_config.hidden_dropout_prob
+self.num_kv_heads = kv_num_heads
self.num_heads = num_heads
self.total_num_heads = num_heads + 2 * kv_num_heads
self.head_dim = head_dim
-self.hidden_dim = num_heads * head_dim
-self.total_hidden_dim = self.total_num_heads * head_dim
+self.hidden_dim = fd_config.model_config.hidden_size
# note: scale need to change if using MLA
-self.attention_metadata.scale = 1.0 / sqrt(head_dim)
+self.scale = 1.0 / sqrt(head_dim)
self.num_layers = fd_config.model_config.num_hidden_layers
self.dtype = paddle.get_default_dtype()
-self.record_block_table_metadata = {}
-self.enable_fused_attention = int(os.getenv("FD_ILUVATAR_ENABLE_FUSED_ATTN", 1))
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
+self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
+self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
self.prefill_info_dict = {}
self.decode_info_dict = {}
-prefill_non_zeros_ids = forward_meta.seq_lens_this_time > 1
-decode_non_zeros_ids = forward_meta.seq_lens_this_time == 1
-self.prefill_info_dict["batch_ids"] = paddle.where(prefill_non_zeros_ids)[0]
-self.decode_info_dict["batch_ids"] = paddle.where(decode_non_zeros_ids)[0]
+self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0]
+self.decode_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_decoder)[0]
self.prefill_len = len(self.prefill_info_dict["batch_ids"])
self.decode_len = len(self.decode_info_dict["batch_ids"])
# only prefill
if self.decode_len == 0:
cu_seq_ids = list(range(self.prefill_len + 1))
self.prefill_info_dict["cu_seqlens_q"] = forward_meta.cu_seqlens_q[cu_seq_ids]
+self.mixed = False
# only decode
elif self.prefill_len == 0:
-pass
+self.mixed = False
# both prefill and decode
else:
-prefill_num_tokens = paddle.sum(forward_meta.seq_lens_this_time[prefill_non_zeros_ids])
-decode_num_tokens = paddle.sum(forward_meta.seq_lens_this_time[decode_non_zeros_ids])
+self.mixed = True
+self.prefill_num_tokens = paddle.sum(forward_meta.seq_lens_encoder).item()
self.prefill_info_dict["cu_seqlens_q"] = paddle.zeros(
[self.prefill_len + 1], dtype=forward_meta.cu_seqlens_q.dtype
)
@@ -145,36 +117,30 @@ class IluvatarAttnBackend(AttentionBackend):
]
self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(self.prefill_info_dict["cu_seqlens_q"])
-self.prefill_qkv = paddle.zeros([prefill_num_tokens, self.total_hidden_dim], dtype=self.dtype)
-self.decode_qkv = paddle.zeros([decode_num_tokens, self.total_hidden_dim], dtype=self.dtype)
-self.merged_output = paddle.zeros(
-    [prefill_num_tokens + decode_num_tokens, self.num_heads, self.head_dim], dtype=self.dtype
+self.tmp_buffer = paddle.zeros(
+    [self.prefill_num_tokens + self.decode_len, self.hidden_dim], dtype=self.dtype
)
-prefill_start, decode_start, start = 0, 0, 0
+prefill_start, decode_start, start = 0, self.prefill_num_tokens, 0
non_zeros_ids = forward_meta.seq_lens_this_time != 0
non_zeros_seq_lens = forward_meta.seq_lens_this_time[non_zeros_ids]
end = non_zeros_seq_lens[0]
if end > 1:
last_stage = "prefill"
prefill_end = end
-decode_end = 0
+decode_end = decode_start
else:
last_stage = "decode"
prefill_end = 0
-decode_end = end
+decode_end = decode_start + end
-self.prefill_info_dict["id_group"] = []
-self.prefill_info_dict["reverse_id_group"] = []
-self.decode_info_dict["id_group"] = []
-self.decode_info_dict["reverse_id_group"] = []
-self.record_stages = []
+self.id_group = []
+self.reverse_id_group = []
for seq_len in non_zeros_seq_lens[1:]:
if seq_len > 1:
if last_stage == "decode":
-self.record_stages.append((last_stage, len(self.decode_info_dict["id_group"])))
-self.decode_info_dict["id_group"].append((decode_start, decode_end))
-self.decode_info_dict["reverse_id_group"].append((start, end))
+self.id_group.append((decode_start, decode_end))
+self.reverse_id_group.append((start, end))
decode_start = decode_end
start = end
last_stage = "prefill"
@@ -182,9 +148,8 @@ class IluvatarAttnBackend(AttentionBackend):
end += seq_len
else:
if last_stage == "prefill":
- self.record_stages.append((last_stage, len(self.prefill_info_dict["id_group"])))
- self.prefill_info_dict["id_group"].append((prefill_start, prefill_end))
- self.prefill_info_dict["reverse_id_group"].append((start, end))
+ self.id_group.append((prefill_start, prefill_end))
+ self.reverse_id_group.append((start, end))
prefill_start = prefill_end
start = end
last_stage = "decode"
@@ -192,13 +157,11 @@ class IluvatarAttnBackend(AttentionBackend):
end += seq_len
if prefill_start < prefill_end:
- self.record_stages.append(("prefill", len(self.prefill_info_dict["id_group"])))
- self.prefill_info_dict["id_group"].append((prefill_start, prefill_end))
- self.prefill_info_dict["reverse_id_group"].append((start, end))
+ self.id_group.append((prefill_start, prefill_end))
+ self.reverse_id_group.append((start, end))
if decode_start < decode_end:
- self.record_stages.append(("decode", len(self.decode_info_dict["id_group"])))
- self.decode_info_dict["id_group"].append((decode_start, decode_end))
- self.decode_info_dict["reverse_id_group"].append((start, end))
+ self.id_group.append((decode_start, decode_end))
+ self.reverse_id_group.append((start, end))

def get_attntion_meta(self):
"""get_attntion_meta"""
@@ -214,206 +177,20 @@ class IluvatarAttnBackend(AttentionBackend):
""" """
return ( return (
max_num_blocks, max_num_blocks,
self.attention_metadata.num_kv_heads, self.num_kv_heads,
self.attention_metadata.block_size, self.block_size,
self.head_dim, self.head_dim,
) )
def prefill_update_kv_cache( def transpose(self, hidden_states):
self, k, v, k_cache_id: int, v_cache_id: int, layer_id: int, forward_meta: ForwardMeta, prefill_batch_ids: list for ids, reverse_ids in zip(self.id_group, self.reverse_id_group):
): self.tmp_buffer[ids[0] : ids[1], :] = hidden_states[reverse_ids[0] : reverse_ids[1], :]
# [num_tokens, num_kv_heads, head_dim] -> [num_kv_heads, num_tokens, head_dim] return self.tmp_buffer
trans_k = k.transpose([1, 0, 2]).contiguous()
trans_v = v.transpose([1, 0, 2]).contiguous()
tensor_start = 0
for batch_idx in prefill_batch_ids:
seq_len = forward_meta.seq_lens_this_time[batch_idx]
tensor_end = tensor_start + seq_len def reverse_transpose(self, hidden_states):
slice_trans_k = trans_k[:, tensor_start:tensor_end, :] for ids, reverse_ids in zip(self.id_group, self.reverse_id_group):
slice_trans_v = trans_v[:, tensor_start:tensor_end, :] self.tmp_buffer[reverse_ids[0] : reverse_ids[1], :] = hidden_states[ids[0] : ids[1], :]
return self.tmp_buffer
cur_block_tables = forward_meta.block_tables[batch_idx]
cur_used_block_tables = cur_block_tables[cur_block_tables != -1]
cache_start = 0
cur_used_num_blocks = cur_used_block_tables.shape[0]
for i, block_id in enumerate(cur_used_block_tables):
# last block: seq_len - cache_start <= block_size
if i == cur_used_num_blocks - 1:
cache_end = seq_len - cache_start
assert cache_end <= self.attention_metadata.block_size
paddle.assign(
slice_trans_k[:, cache_start:seq_len, :],
output=forward_meta.caches[k_cache_id][block_id, :, 0:cache_end, :],
)
paddle.assign(
slice_trans_v[:, cache_start:seq_len, :],
output=forward_meta.caches[v_cache_id][block_id, :, 0:cache_end, :],
)
if layer_id == self.num_layers - 1:
self.record_block_table_metadata[batch_idx] = {
"block_id": block_id.item(),
"cache_end": cache_end.item(),
}
# non last block: seq_lens_this_time > block_size
else:
assert seq_len > self.attention_metadata.block_size
cache_end = cache_start + self.attention_metadata.block_size
paddle.assign(
slice_trans_k[:, cache_start:cache_end, :], output=forward_meta.caches[k_cache_id][block_id]
)
paddle.assign(
slice_trans_v[:, cache_start:cache_end, :], output=forward_meta.caches[v_cache_id][block_id]
)
cache_start += self.attention_metadata.block_size
tensor_start = tensor_end
def get_splited_qkv(
self, qkv: paddle.Tensor, forward_meta: ForwardMeta, cu_seqlens_q: paddle.Tensor, batch_ids=None
):
q_end = self.hidden_dim
k_end = q_end + self.attention_metadata.num_kv_heads * self.head_dim
v_end = k_end + self.attention_metadata.num_kv_heads * self.head_dim
assert v_end == qkv.shape[-1], f"Shape mismatch: {v_end} vs {qkv.shape[-1]}"
assert qkv.shape[0] == cu_seqlens_q[-1], f"Shape mismatch: {qkv.shape[0]} vs {cu_seqlens_q[-1]}"
if batch_ids is None:
batch_ids = list(range(forward_meta.seq_lens_this_time.shape[0]))
q = qkv[..., 0:q_end]
k = qkv[..., q_end:k_end]
v = qkv[..., k_end:v_end]
q = q.view([-1, self.num_heads, self.head_dim])
k = k.view([-1, self.attention_metadata.num_kv_heads, self.head_dim])
v = v.view([-1, self.attention_metadata.num_kv_heads, self.head_dim])
for idx in range(len(cu_seqlens_q) - 1):
batch_idx = batch_ids[idx]
seq_len_i = forward_meta.seq_lens_this_time[batch_idx]
if seq_len_i == 0:
continue
cached_kv_len = forward_meta.seq_lens_decoder[batch_idx][0]
cu_seq_start_q = cu_seqlens_q[idx]
cu_seq_end_q = cu_seqlens_q[idx + 1]
# forward_meta.rotary_embs is [2, 1, S, 1, D]
if forward_meta.rotary_embs is not None:
cos = forward_meta.rotary_embs[0, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :]
sin = forward_meta.rotary_embs[1, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :]
q[cu_seq_start_q:cu_seq_end_q] = apply_rope(q[cu_seq_start_q:cu_seq_end_q], cos, sin)
k[cu_seq_start_q:cu_seq_end_q] = apply_rope(k[cu_seq_start_q:cu_seq_end_q], cos, sin)
return q, k, v
def split_pd_qkv(self, qkv):
for ids, reverse_ids in zip(self.prefill_info_dict["id_group"], self.prefill_info_dict["reverse_id_group"]):
self.prefill_qkv[ids[0] : ids[1], :] = qkv[reverse_ids[0] : reverse_ids[1], :]
for ids, reverse_ids in zip(self.decode_info_dict["id_group"], self.decode_info_dict["reverse_id_group"]):
self.decode_qkv[ids[0] : ids[1], :] = qkv[reverse_ids[0] : reverse_ids[1], :]
return self.prefill_qkv, self.decode_qkv
def merge_pd_output(self, prefill_out, decode_out):
for stage, idx in self.record_stages:
if stage == "prefill":
ids = self.prefill_info_dict["id_group"][idx]
reverse_ids = self.prefill_info_dict["reverse_id_group"][idx]
self.merged_output[reverse_ids[0] : reverse_ids[1], :, :] = prefill_out[ids[0] : ids[1], :, :]
else:
ids = self.decode_info_dict["id_group"][idx]
reverse_ids = self.decode_info_dict["reverse_id_group"][idx]
self.merged_output[reverse_ids[0] : reverse_ids[1], :, :] = decode_out[ids[0] : ids[1], :, :]
return self.merged_output
def forward_prefill(self, prefill_qkv, layer_id, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
prefill_q, prefill_k, prefill_v = self.get_splited_qkv(
prefill_qkv,
forward_meta,
self.prefill_info_dict["cu_seqlens_q"],
batch_ids=self.prefill_info_dict["batch_ids"],
)
prefill_out = flash_attn_unpadded(
prefill_q,
prefill_k,
prefill_v,
cu_seqlens_q=self.prefill_info_dict["cu_seqlens_q"],
cu_seqlens_k=self.prefill_info_dict["cu_seqlens_q"],
max_seqlen_q=self.attention_metadata.max_context_len,
max_seqlen_k=self.attention_metadata.max_context_len,
scale=self.attention_metadata.scale,
dropout=self.attention_metadata.dropout,
causal=self.attention_metadata.causal,
return_softmax=self.attention_metadata.return_softmax,
)[0]
self.prefill_update_kv_cache(
prefill_k, prefill_v, k_cache_id, v_cache_id, layer_id, forward_meta, self.prefill_info_dict["batch_ids"]
)
return prefill_out
def forward_decode(self, decode_qkv, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
k_cache = forward_meta.caches[k_cache_id]
v_cache = forward_meta.caches[v_cache_id]
if self.enable_fused_attention:
rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
decode_out = paged_attention(
decode_qkv.view([-1, self.total_num_heads, self.head_dim]),
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
num_kv_heads=self.attention_metadata.num_kv_heads,
scale=self.attention_metadata.scale,
block_size=self.attention_metadata.block_size,
max_context_len=self.attention_metadata.max_context_len,
alibi_slopes=self.attention_metadata.alibi_slopes,
causal=self.attention_metadata.causal,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
merged_qkv=True,
k=decode_qkv,
v=decode_qkv,
rope_sin=rope_sin,
rope_cos=rope_cos,
)
else:
decode_q, decode_k, decode_v = self.get_splited_qkv(
decode_qkv,
forward_meta,
self.decode_info_dict["cu_seqlens_q"],
batch_ids=self.decode_info_dict["batch_ids"],
)
decode_out = paged_attention(
decode_q,
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
num_kv_heads=self.attention_metadata.num_kv_heads,
scale=self.attention_metadata.scale,
block_size=self.attention_metadata.block_size,
max_context_len=self.attention_metadata.max_context_len,
alibi_slopes=self.attention_metadata.alibi_slopes,
causal=self.attention_metadata.causal,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
k=decode_k,
v=decode_v,
)
return decode_out
def forward_mixed(
self,
@@ -429,23 +206,84 @@ class IluvatarAttnBackend(AttentionBackend):
""" """
forward_mixed forward_mixed
""" """
assert not self.use_speculate, "IluvatarAttnBackend cannot support speculate now"
layer_id = layer.layer_id layer_id = layer.layer_id
k_cache_id = layer_id * 2 k_cache_id = layer_id * 2
v_cache_id = k_cache_id + 1 v_cache_id = k_cache_id + 1
q_dim = qkv.dim() k_cache = forward_meta.caches[k_cache_id]
assert q_dim == 2 v_cache = forward_meta.caches[v_cache_id]
if self.decode_len == 0: if self.decode_len == 0:
output = self.forward_prefill(qkv, layer_id, k_cache_id, v_cache_id, forward_meta) output = prefill_fused_paged_attention(
qkv,
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.prefill_info_dict["batch_ids"], :],
cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"],
num_heads=self.num_heads,
head_dim=self.head_dim,
num_kv_heads=self.num_kv_heads,
block_size=self.block_size,
max_seq_len=self.max_context_len,
scale=self.scale,
causal=self.causal,
q_rope=True,
k_rope=True,
v_rope=False,
rope_sin=self.rope_sin,
rope_cos=self.rope_cos,
)
elif self.prefill_len == 0:
- output = self.forward_decode(qkv, k_cache_id, v_cache_id, forward_meta)
output = paged_attention(
qkv,
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
num_heads=self.num_heads,
head_dim=self.head_dim,
num_kv_heads=self.num_kv_heads,
scale=self.scale,
block_size=self.block_size,
max_context_len=self.max_context_len,
alibi_slopes=self.attention_metadata.alibi_slopes,
causal=self.causal,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
merged_qkv=True,
k=qkv,
v=qkv,
rope_sin=self.rope_sin,
rope_cos=self.rope_cos,
)
else:
- prefill_qkv, decode_qkv = self.split_pd_qkv(qkv)
- prefill_output = self.forward_prefill(prefill_qkv, layer_id, k_cache_id, v_cache_id, forward_meta)
- decode_output = self.forward_decode(decode_qkv, k_cache_id, v_cache_id, forward_meta)
- output = self.merge_pd_output(prefill_output, decode_output)
output = mixed_fused_paged_attention(
qkv,
k_cache,
v_cache,
prefill_block_tables=forward_meta.block_tables[self.prefill_info_dict["batch_ids"], :],
decode_block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
prefill_num_tokens=self.prefill_num_tokens,
num_heads=self.num_heads,
head_dim=self.head_dim,
num_kv_heads=self.num_kv_heads,
block_size=self.block_size,
max_seq_len=self.max_context_len,
scale=self.scale,
causal=self.causal,
q_rope=True,
k_rope=True,
v_rope=False,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
rope_sin=self.rope_sin,
rope_cos=self.rope_cos,
)
output = output.view([-1, self.num_heads * self.head_dim])
return output
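To make the bookkeeping above easier to follow, here is a minimal, self-contained sketch (not part of the commit) of how `id_group` / `reverse_id_group` permute a mixed batch so that prefill tokens occupy the front of the buffer and decode tokens the tail. The batch sizes are invented, NumPy stands in for the Paddle tensors, and the sketch records one span per request instead of the merged runs the backend builds, which yields the same permutation.

# Illustrative sketch only -- not code from this commit.
import numpy as np

seq_lens_this_time = [3, 1, 2, 1]        # >1 => prefill request, ==1 => decode request
prefill_num_tokens = sum(s for s in seq_lens_this_time if s > 1)   # 5

id_group, reverse_id_group = [], []
prefill_start, decode_start, start = 0, prefill_num_tokens, 0
for s in seq_lens_this_time:
    if s > 1:                            # prefill span lands in the front region
        id_group.append((prefill_start, prefill_start + s))
        prefill_start += s
    else:                                # decode token lands in the tail region
        id_group.append((decode_start, decode_start + s))
        decode_start += s
    reverse_id_group.append((start, start + s))   # original position in the batch
    start += s

tokens = np.arange(sum(seq_lens_this_time))       # stand-in for hidden_states rows
buf = np.empty_like(tokens)
for (dst0, dst1), (src0, src1) in zip(id_group, reverse_id_group):
    buf[dst0:dst1] = tokens[src0:src1]            # what transpose() does
# buf == [0 1 2 4 5 3 6]: prefill tokens first, decode tokens last

restored = np.empty_like(buf)
for (src0, src1), (dst0, dst1) in zip(id_group, reverse_id_group):
    restored[dst0:dst1] = buf[src0:src1]          # what reverse_transpose() does
assert (restored == tokens).all()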


@@ -83,7 +83,6 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
expert_idx_per_token,
self.moe_quant_type,
used_in_ep_low_latency,
- estimate_total_token_nums,
)
return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
permute_input,


@@ -53,6 +53,7 @@ from fastdeploy.model_executor.models.model_base import (
from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid
from fastdeploy.model_executor.models.utils import WeightMeta
+ from fastdeploy.platforms import current_platform
from fastdeploy.worker.experts_manager import RedundantExpertManger

@@ -464,6 +465,9 @@ class Ernie4_5_Model(nn.Layer):
):
hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)
+ if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+ hidden_states = forward_meta.attn_backend.transpose(hidden_states)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual)

@@ -472,6 +476,9 @@ class Ernie4_5_Model(nn.Layer):
out = self.norm(hidden_states)
+ if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+ out = forward_meta.attn_backend.reverse_transpose(out)
return out
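For readability, here is a condensed view of the forward path with the new hook (an illustrative sketch, not the commit's code): `current_platform`, `forward_meta`, and the attention-backend methods are the names used in the diff above, while `model` and the wrapper function are hypothetical stand-ins for `Ernie4_5_Model`.

# Illustrative sketch only -- mirrors the hook added to Ernie4_5_Model.forward above.
from fastdeploy.platforms import current_platform

def forward_with_mixed_reordering(model, ids_remove_padding, forward_meta):
    hidden_states = model.embed_tokens(ids_remove_padding=ids_remove_padding)
    reorder = current_platform.is_iluvatar() and forward_meta.attn_backend.mixed
    if reorder:
        # Pack prefill tokens into [0, prefill_num_tokens) so the fused mixed
        # paged-attention kernel sees one contiguous prefill region.
        hidden_states = forward_meta.attn_backend.transpose(hidden_states)
    residual = None
    for layer in model.layers:
        hidden_states, residual = layer(forward_meta, hidden_states, residual)
    out = model.norm(hidden_states)
    if reorder:
        # Restore the original per-request token order for the rest of the stack.
        out = forward_meta.attn_backend.reverse_transpose(out)
    return out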


@@ -20,4 +20,8 @@ PACKAGE = "fastdeploy.model_executor.ops.iluvatar"
import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn  # noqa: F401
- from .paged_attention import paged_attention  # noqa: F401
+ from .paged_attention import (  # noqa: F401
+ mixed_fused_paged_attention,
+ paged_attention,
+ prefill_fused_paged_attention,
+ )


@@ -17,9 +17,15 @@
import paddle

try:
- from fastdeploy.model_executor.ops.iluvatar import paged_attn
+ from fastdeploy.model_executor.ops.iluvatar import (
+ mixed_fused_paged_attn,
+ paged_attn,
+ prefill_fused_paged_attn,
+ )
except ImportError:
paged_attn = None
+ prefill_fused_paged_attn = None
+ mixed_fused_paged_attn = None

def paged_attention(
@@ -28,6 +34,8 @@ def paged_attention(
v_cache: paddle.Tensor,
block_tables: paddle.Tensor,
seq_lens: paddle.Tensor,
+ num_heads: int,
+ head_dim: int,
num_kv_heads: int,
scale: float,
block_size: int,
@@ -45,7 +53,7 @@ def paged_attention(
rope_sin: paddle.Tensor = None,
rope_cos: paddle.Tensor = None,
):
- output = paged_attn(
+ return paged_attn(
q,
k_cache,
v_cache,
@@ -56,6 +64,8 @@ def paged_attention(
v,
rope_sin,
rope_cos,
+ num_heads,
+ head_dim,
num_kv_heads,
scale,
block_size,
@@ -68,4 +78,99 @@ def paged_attention(
use_sqrt_alibi,
merged_qkv,
)
- return output[0] if isinstance(output, list) else output
def prefill_fused_paged_attention(
qkv: paddle.Tensor,
k_cache: paddle.Tensor,
v_cache: paddle.Tensor,
block_tables: paddle.Tensor,
cu_seqlens_qkv: paddle.Tensor,
num_heads: int,
head_dim: int,
num_kv_heads: int,
block_size: int,
max_seq_len: int,
scale: float,
causal: bool = True,
q_rope: bool = True,
k_rope: bool = True,
v_rope: bool = False,
rope_sin: paddle.Tensor = None,
rope_cos: paddle.Tensor = None,
):
return prefill_fused_paged_attn(
qkv,
k_cache,
v_cache,
block_tables,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
)
def mixed_fused_paged_attention(
qkv: paddle.Tensor,
k_cache: paddle.Tensor,
v_cache: paddle.Tensor,
prefill_block_tables: paddle.Tensor,
decode_block_tables: paddle.Tensor,
cu_seqlens_qkv: paddle.Tensor,
seq_lens: paddle.Tensor,
prefill_num_tokens: int,
num_heads: int,
head_dim: int,
num_kv_heads: int,
block_size: int,
max_seq_len: int,
scale: float,
causal: bool = True,
q_rope: bool = True,
k_rope: bool = True,
v_rope: bool = False,
window_left: int = -1,
window_right: int = -1,
softcap: float = 0.0,
use_cuda_graph: bool = False,
use_sqrt_alibi: bool = False,
rope_sin: paddle.Tensor = None,
rope_cos: paddle.Tensor = None,
):
return mixed_fused_paged_attn(
qkv,
k_cache,
v_cache,
prefill_block_tables,
decode_block_tables,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
use_cuda_graph,
use_sqrt_alibi,
)
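As a rough guide to what these wrappers expect, the snippet below (illustrative only, with made-up lengths and flattened 1-D stand-ins for the 2-D metadata carried by `ForwardMeta`) derives `cu_seqlens_qkv`, `seq_lens`, and `prefill_num_tokens` the same way the Iluvatar backend does before calling them.

# Illustrative sketch only -- not code from this commit.
import paddle

seq_lens_this_time = paddle.to_tensor([3, 1, 2, 1])   # >1 => prefill, ==1 => decode
seq_lens_decoder = paddle.to_tensor([0, 17, 0, 42])   # tokens already cached per request

prefill_lens = seq_lens_this_time[seq_lens_this_time > 1]           # [3, 2]
# prefix sum over prefill lengths: token offsets of each prefill request in qkv
cu_seqlens_qkv = paddle.concat(
    [paddle.zeros([1], dtype=prefill_lens.dtype), paddle.cumsum(prefill_lens)]
)                                                                    # [0, 3, 5]
# kv length each decode request attends over, including the token being generated
seq_lens = seq_lens_decoder[seq_lens_this_time == 1] + 1             # [18, 43]
prefill_num_tokens = paddle.sum(prefill_lens).item()                 # 5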


@@ -13,10 +13,10 @@ python -m pip install -r requirements_iluvatar.txt
echo "uninstall org" echo "uninstall org"
python -m pip uninstall paddlepaddle -y python -m pip uninstall paddlepaddle -y
python -m pip uninstall paddle-iluvatar-gpu -y python -m pip uninstall paddle-iluvatar-gpu -y
python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ # python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
# TODO: Change to open access URL # python -m pip install --pre paddle-iluvatar-gpu==3.0.0.dev20250806 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
python -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
# python -m pip install /data1/fastdeploy/packages/paddle_iluvatar_gpu-0.0.0-cp310-cp310-linux_x86_64.whl python -m pip install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
# Patch, remove if image updated # Patch, remove if image updated
cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h
echo "build whl" echo "build whl"


@@ -1,4 +1,7 @@
from fastdeploy import LLM, SamplingParams
+ from fastdeploy.utils import set_random_seed

+ set_random_seed(123)

prompts = [
"Hello, my name is",
@@ -12,7 +15,6 @@ llm = LLM(
model="/data1/fastdeploy/ERNIE_300B_4L",
tensor_parallel_size=8,
max_model_len=8192,
- static_decode_blocks=0,
quantization="wint8",
block_size=16,
)
@@ -27,14 +29,14 @@ assert outputs[0].outputs.token_ids == [
59335,
68170,
183,
- 97404,
- 100088,
- 36310,
- 95633,
- 95913,
- 41459,
- 95049,
- 94970,
- 96840,
+ 49080,
+ 94717,
+ 82966,
+ 99140,
+ 31615,
+ 51497,
+ 94851,
+ 60764,
+ 10889,
2,
], f"{outputs[0].outputs.token_ids}"