Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 16:48:03 +08:00
[Iluvatar GPU] Optimize attention performance and fix moe load ckpt error (#3651)
.github/workflows/ci_iluvatar.yml (vendored): 6 changes
@@ -28,18 +28,22 @@ jobs:
         REPO="https://github.com/${{ github.repository }}.git"
         FULL_REPO="${{ github.repository }}"
         REPO_NAME="${FULL_REPO##*/}"
+        BASE_BRANCH="${{ github.base_ref }}"
         # Clean the repository directory before starting
         docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
           -e "REPO_NAME=${REPO_NAME}" \
+          -e "BASE_BRANCH=${BASE_BRANCH}" \
           ${docker_image} /bin/bash -c '
           if [ -d ${REPO_NAME} ]; then
             echo "Directory ${REPO_NAME} exists, removing it..."
             rm -rf ${REPO_NAME}
           fi
           '
+        git config --global http.proxy "http://61.151.249.150:33128"
+        git config --global https.proxy "http://61.151.249.150:33128"
         git config --global user.name "FastDeployCI"
         git config --global user.email "fastdeploy_ci@example.com"
-        git clone ${REPO} ${REPO_NAME}
+        git clone --recursive ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
         cd FastDeploy
         if [ "${{ github.event_name }}" = "pull_request" ]; then
           git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
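Note: the `REPO_NAME="${FULL_REPO##*/}"` expansion above strips everything up to and including the last `/`. A one-line Python equivalent, illustrative only and not part of the workflow:

```python
# Python equivalent of the shell expansion REPO_NAME="${FULL_REPO##*/}"
full_repo = "PaddlePaddle/FastDeploy"  # value of ${{ github.repository }}
repo_name = full_repo.rsplit("/", 1)[-1]
assert repo_name == "FastDeploy"
```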
@@ -193,11 +193,13 @@ public:
   typedef uint8_t data_t;
 };
 
+#ifndef PADDLE_WITH_COREX
 template <> class PDTraits<paddle::DataType::FLOAT8_E4M3FN> {
 public:
   typedef __nv_fp8_e4m3 DataType;
   typedef paddle::float8_e4m3fn data_t;
 };
+#endif
 
 template <typename T, int Size> struct alignas(sizeof(T) * Size) AlignedVector {
   T val[Size];
custom_ops/iluvatar_ops/mixed_fused_attn.cu (new file, 376 lines)
@@ -0,0 +1,376 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "helper.h"
#include "iluvatar_context.h"

template <paddle::DataType T>
void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv,
                               paddle::Tensor& k_cache,
                               paddle::Tensor& v_cache,
                               const paddle::Tensor& prefill_block_table,
                               const paddle::Tensor& decode_block_table,
                               const paddle::Tensor& cu_seqlens_qkv,
                               const paddle::Tensor& seq_lens,
                               const paddle::optional<paddle::Tensor>& rope_sin,
                               const paddle::optional<paddle::Tensor>& rope_cos,
                               int prefill_num_tokens,
                               int num_heads,
                               int head_dim,
                               int num_kv_heads,
                               int block_size,
                               int max_seq_len,
                               float scale,
                               bool causal,
                               bool q_rope,
                               bool k_rope,
                               bool v_rope,
                               int window_left,
                               int window_right,
                               float softcap,
                               bool enable_cuda_graph,
                               bool use_sqrt_alibi,
                               paddle::Tensor& out) {
  typedef PDTraits<T> traits_;
  typedef typename traits_::data_t data_t;

  const auto& dtype = qkv.dtype();
  cuinferDataType_t cuinfer_data_type;
  cudaDataType_t cu_data_type;
  if (dtype == paddle::DataType::FLOAT16) {
    cuinfer_data_type = CUINFER_DATA_HALF;
    cu_data_type = CUDA_R_16F;
  } else {
    cuinfer_data_type = CUINFER_DATA_BFLOAT16;
    cu_data_type = CUDA_R_16BF;
  }

  const auto& qkv_dims = qkv.dims();
  const auto& kv_cache_dims = k_cache.dims();
  const auto& prefill_block_table_dims = prefill_block_table.dims();
  const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();

  int prefill_batch_size = prefill_block_table_dims[0];
  int num_tokens = qkv_dims[0];
  int decode_num_tokens = num_tokens - prefill_num_tokens;
  int num_total_heads = num_heads + 2 * num_kv_heads;
  int max_num_blocks_per_seq = prefill_block_table_dims[1];
  int qkv_stride = qkv.strides()[0];
  int num_blocks = kv_cache_dims[0];

  int kv_block_stride = k_cache.strides()[0];
  int kv_head_stride = k_cache.strides()[1];
  int block_table_stride = prefill_block_table.strides()[0];
  const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
  const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;

  cuinferTensorDescriptor_t qkv_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      qkv_desc,
      cuinfer_data_type,
      3,
      std::vector<int>({prefill_num_tokens, num_total_heads, head_dim}).data(),
      std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t qkv_seqlens_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      qkv_seqlens_desc,
      CUINFER_DATA_INT32,
      1,
      std::vector<int>({prefill_batch_size + 1}).data(),
      std::vector<int>({1}).data()));

  cuinferTensorDescriptor_t block_table_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      block_table_desc,
      CUINFER_DATA_INT32,
      2,
      std::vector<int>({prefill_batch_size, block_table_stride}).data(),
      std::vector<int>({block_table_stride, 1}).data()));

  cuinferTensorDescriptor_t o_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      o_desc,
      cuinfer_data_type,
      3,
      std::vector<int>({prefill_num_tokens, num_heads, head_dim}).data(),
      std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t k_cache_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      k_cache_desc,
      cuinfer_data_type,
      4,
      std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
      std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t v_cache_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      v_cache_desc,
      cuinfer_data_type,
      4,
      std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
      std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t cos_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      cos_desc,
      CUINFER_DATA_FLOAT,
      2,
      std::vector<int>({max_seq_len, head_dim}).data(),
      std::vector<int>({head_dim, 1}).data()));

  cuinferTensorDescriptor_t sin_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      sin_desc,
      CUINFER_DATA_FLOAT,
      2,
      std::vector<int>({max_seq_len, head_dim}).data(),
      std::vector<int>({head_dim, 1}).data()));

  cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();

  size_t prefill_workspace_size = 0;
  CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(prefill_num_tokens,
                                                             num_heads,
                                                             num_kv_heads,
                                                             head_dim,
                                                             q_rope,
                                                             k_rope,
                                                             v_rope,
                                                             cuinfer_data_type,
                                                             cuinfer_data_type,
                                                             cuinfer_data_type,
                                                             &prefill_workspace_size));

  auto* allocator = paddle::GetAllocator(qkv.place());
  phi::Allocator::AllocationPtr prefill_tmp_workspace = allocator->Allocate(prefill_workspace_size);
  void* prefill_workspace_ptr = prefill_tmp_workspace->ptr();

  CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
                                                 qkv_desc,
                                                 qkv.data(),
                                                 qkv_seqlens_desc,
                                                 cu_seqlens_qkv.data<int32_t>(),
                                                 block_table_desc,
                                                 prefill_block_table.data<int32_t>(),
                                                 o_desc,
                                                 out.data(),
                                                 k_cache_desc,
                                                 k_cache.data(),
                                                 v_cache_desc,
                                                 v_cache.data(),
                                                 prefill_workspace_ptr,
                                                 prefill_workspace_size,
                                                 cos_desc,
                                                 rope_cos_ptr,
                                                 sin_desc,
                                                 rope_sin_ptr,
                                                 prefill_batch_size,
                                                 num_heads,
                                                 num_kv_heads,
                                                 head_dim,
                                                 causal,
                                                 scale,
                                                 q_rope,
                                                 k_rope,
                                                 v_rope));

  size_t decode_workspace_size = 0;
  CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(decode_num_tokens,
                                                   num_heads,
                                                   num_kv_heads,
                                                   head_dim,
                                                   block_size,
                                                   max_seq_len,
                                                   &decode_workspace_size));

  phi::Allocator::AllocationPtr decode_tmp_workspace = allocator->Allocate(decode_workspace_size);
  void* decode_workspace_ptr = decode_tmp_workspace->ptr();

  void* decode_qkv_ptr = (void*)(qkv.data<data_t>() + prefill_num_tokens * qkv_stride);
  void* decode_out_ptr = (void*)(out.data<data_t>() + prefill_num_tokens * out.strides()[0]);

  PageAttentionWithKVCacheArguments args{
      static_cast<float>(scale), 1.0, 1.0, static_cast<float>(softcap), window_left, window_right,
      causal, use_sqrt_alibi, enable_cuda_graph, false, nullptr, decode_qkv_ptr, decode_qkv_ptr,
      decode_workspace_ptr, true, rope_sin_ptr, rope_cos_ptr};

  CUINFER_CHECK(cuInferPageAttentionV7(cuinfer_handle,
                                       decode_out_ptr,
                                       cu_data_type,
                                       decode_qkv_ptr,
                                       cu_data_type,
                                       decode_num_tokens,
                                       num_heads,
                                       num_kv_heads,
                                       head_dim,
                                       qkv_stride,
                                       kv_block_stride,
                                       kv_head_stride,
                                       k_cache.data(),
                                       cu_data_type,
                                       v_cache.data(),
                                       cu_data_type,
                                       block_size,
                                       max_num_blocks_per_seq,
                                       max_seq_len,
                                       decode_block_table.data<int32_t>(),
                                       seq_lens.data<int32_t>(),
                                       args));

  CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}

std::vector<paddle::Tensor> MixedFusedPagedAttn(const paddle::Tensor& qkv,
                                                paddle::Tensor& k_cache,
                                                paddle::Tensor& v_cache,
                                                const paddle::Tensor& prefill_block_table,
                                                const paddle::Tensor& decode_block_table,
                                                const paddle::Tensor& cu_seqlens_qkv,
                                                const paddle::Tensor& seq_lens,
                                                const paddle::optional<paddle::Tensor>& rope_sin,
                                                const paddle::optional<paddle::Tensor>& rope_cos,
                                                int prefill_num_tokens,
                                                int num_heads,
                                                int head_dim,
                                                int num_kv_heads,
                                                int block_size,
                                                int max_seq_len,
                                                float scale,
                                                bool causal,
                                                bool q_rope,
                                                bool k_rope,
                                                bool v_rope,
                                                int window_left,
                                                int window_right,
                                                float softcap,
                                                bool enable_cuda_graph,
                                                bool use_sqrt_alibi) {
  const auto dtype = qkv.dtype();
  auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());

  switch (dtype) {
    case paddle::DataType::BFLOAT16:
      MixedFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(
          qkv, k_cache, v_cache, prefill_block_table, decode_block_table,
          cu_seqlens_qkv, seq_lens, rope_sin, rope_cos, prefill_num_tokens,
          num_heads, head_dim, num_kv_heads, block_size, max_seq_len, scale,
          causal, q_rope, k_rope, v_rope, window_left, window_right, softcap,
          enable_cuda_graph, use_sqrt_alibi, out);
      break;
    case paddle::DataType::FLOAT16:
      MixedFusedPagedAttnKernel<paddle::DataType::FLOAT16>(
          qkv, k_cache, v_cache, prefill_block_table, decode_block_table,
          cu_seqlens_qkv, seq_lens, rope_sin, rope_cos, prefill_num_tokens,
          num_heads, head_dim, num_kv_heads, block_size, max_seq_len, scale,
          causal, q_rope, k_rope, v_rope, window_left, window_right, softcap,
          enable_cuda_graph, use_sqrt_alibi, out);
      break;
    default:
      PD_THROW("Unsupported data type for mixed paged attn");
  }
  return {out};
}

std::vector<std::vector<int64_t>> MixedFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
                                                                int num_heads,
                                                                int head_dim) {
  return {{qkv_shape[0], num_heads * head_dim}};
}

std::vector<paddle::DataType> MixedFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
  return {qkv_dtype};
}

PD_BUILD_STATIC_OP(mixed_fused_paged_attn)
    .Inputs({"qkv", "k_cache", "v_cache", "prefill_block_table", "decode_block_table",
             "cu_seqlens_qkv", "seq_lens", paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
    .Outputs({"out"})
    .Attrs({"prefill_num_tokens:int",
            "num_heads:int",
            "head_dim:int",
            "num_kv_heads:int",
            "block_size:int",
            "max_seq_len:int",
            "scale:float",
            "causal:bool",
            "q_rope:bool",
            "k_rope:bool",
            "v_rope:bool",
            "window_left:int",
            "window_right:int",
            "softcap:float",
            "enable_cuda_graph:bool",
            "use_sqrt_alibi:bool"})
    .SetKernelFn(PD_KERNEL(MixedFusedPagedAttn))
    .SetInferShapeFn(PD_INFER_SHAPE(MixedFusedPagedAttnInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MixedFusedPagedAttnInferDtype));
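The kernel above assumes a specific row layout: prefill tokens occupy the leading rows of `qkv` and decode tokens the trailing rows, so the decode path can simply offset its pointers by `prefill_num_tokens` rows. A runnable shape sketch of that contract, using assumed example values and without calling the op itself:

```python
# Shape sketch for mixed_fused_paged_attn's input contract (illustrative values).
import numpy as np

num_heads, num_kv_heads, head_dim = 32, 8, 128
prefill_seq_lens = [5, 7]      # two prefill requests
decode_batch = 3               # three decode requests, one token each

prefill_num_tokens = sum(prefill_seq_lens)
num_tokens = prefill_num_tokens + decode_batch
qkv_hidden = (num_heads + 2 * num_kv_heads) * head_dim   # fused Q, K and V heads

qkv = np.zeros((num_tokens, qkv_hidden), dtype=np.float16)
out = np.zeros((num_tokens, num_heads * head_dim), dtype=np.float16)

# cu_seqlens_qkv is a prefix sum with a leading 0, giving
# prefill_batch_size + 1 entries (matches qkv_seqlens_desc above).
cu_seqlens_qkv = np.concatenate(([0], np.cumsum(prefill_seq_lens))).astype(np.int32)
assert cu_seqlens_qkv.shape[0] == len(prefill_seq_lens) + 1
assert cu_seqlens_qkv[-1] == prefill_num_tokens

# The decode sub-problem starts at row prefill_num_tokens, mirroring
# decode_qkv_ptr = qkv.data() + prefill_num_tokens * qkv_stride in the kernel.
decode_qkv, decode_out = qkv[prefill_num_tokens:], out[prefill_num_tokens:]
assert decode_qkv.shape == (decode_batch, qkv_hidden)
```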
@@ -53,6 +53,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
                        const paddle::optional<paddle::Tensor>& gating_correction_bias,
                        const int moe_topk,
                        const bool group_moe,
+                       const std::string& moe_quant_type,
                        const bool topk_only_mode,
                        const int num_rows,
                        const int hidden_size,
@@ -183,6 +184,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
     const paddle::optional<paddle::Tensor>& w4a8_in_scale,
     const int moe_topk,
     const bool group_moe,
+    const std::string& moe_quant_type,
     const bool topk_only_mode) {
   const auto input_type = input.dtype();
   auto place = input.place();
@@ -220,6 +222,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
         gating_correction_bias,
         moe_topk,
         group_moe,
+        moe_quant_type,
         topk_only_mode,
         num_rows,
         hidden_size,
@@ -236,6 +239,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
         gating_correction_bias,
         moe_topk,
         group_moe,
+        moe_quant_type,
         topk_only_mode,
         num_rows,
         hidden_size,
@@ -305,7 +309,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch)
              "top_k_weight",
              "top_k_indices",
              "expert_idx_per_token"})
-    .Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
+    .Attrs({"moe_topk:int", "group_moe:bool", "moe_quant_type:std::string", "topk_only_mode:bool"})
     .SetKernelFn(PD_KERNEL(MoeExpertDispatch))
     .SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
     .SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));
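The new `moe_quant_type` attribute sits between `group_moe` and `topk_only_mode`, and custom-op attributes bind to the kernel's trailing parameters in registration order. A minimal sketch of that positional binding; this is not FastDeploy's actual dispatcher, and `"w8a16"` is only an example value:

```python
# Registered attr specs after this change, in order.
ATTRS = ("moe_topk:int", "group_moe:bool", "moe_quant_type:std::string", "topk_only_mode:bool")

def bind_attrs(*values):
    """Pair positional attribute values with the registered attr specs."""
    assert len(values) == len(ATTRS), "every registered attr must be supplied"
    return {spec.split(":")[0]: v for spec, v in zip(ATTRS, values)}

print(bind_attrs(8, False, "w8a16", False))
# -> {'moe_topk': 8, 'group_moe': False, 'moe_quant_type': 'w8a16', 'topk_only_mode': False}
```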
@@ -27,6 +27,8 @@ void PagedAttnKernel(const paddle::Tensor& q,
                      const paddle::optional<paddle::Tensor> &v,
                      const paddle::optional<paddle::Tensor> &rope_sin,
                      const paddle::optional<paddle::Tensor> &rope_cos,
+                     int num_heads,
+                     int head_dim,
                      int num_kv_heads,
                      float scale,
                      int block_size,
@@ -86,32 +88,36 @@ void PagedAttnKernel(const paddle::Tensor& q,
                     common::errors::InvalidArgument(
                         "paged_attention expects seq_lens is contiguous"));
   // check dim and shape
-  // k_cache: [num_blocks, kv_num_heads, block_size, head_size]
-  // v_cache: [num_blocks, kv_num_heads, block_size, head_size]
+  // k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
+  // v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
   // block_table: [num_seqs, max_num_blocks_per_seq]
   // seq_lens: [num_seqs]
   // q and out:
-  // merged_qkv = false: [num_seqs, num_heads, head_size]
-  // merged_qkv = true: [num_seqs, num_heads+2*num_kv_heads, head_size]
+  // if merged_qkv = false:
+  //   q: [num_seqs, hidden_size]
+  //   out: [num_seqs, hidden_size]
+  // if merged_qkv = true:
+  //   q: [num_seqs, (num_heads+2*num_kv_heads)*head_dim]
+  //   out: [num_seqs, hidden_size]
 
   const auto& q_dims = q.dims();
   PADDLE_ENFORCE_EQ(q_dims.size(),
-                    3,
+                    2,
                     common::errors::InvalidArgument(
                         "paged_attn receive query dims is "
-                        "[num_seqs, num_heads, head_size]"));
+                        "[num_seqs, (num_heads+2*num_kv_heads)*head_dim]"));
   PADDLE_ENFORCE_EQ(out.dims().size(),
-                    3,
+                    2,
                     common::errors::InvalidArgument(
                         "paged_attn receive out dims is "
-                        "[num_seqs, num_heads, head_size]"));
+                        "[num_seqs, hidden_size]"));
 
   const auto& kv_cache_dims = k_cache.dims();
   PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
                     4,
                     common::errors::InvalidArgument(
                         "paged_attn receive kv cache dims is "
-                        "[num_blocks, kv_num_heads, block_size, head_size]"));
+                        "[num_blocks, kv_num_heads, block_size, head_dim]"));
 
   const auto& block_table_dims = block_table.dims();
   PADDLE_ENFORCE_EQ(block_table_dims.size(),
@@ -127,8 +133,6 @@ void PagedAttnKernel(const paddle::Tensor& q,
                         "paged_attn receive seq_lens dims is [num_seqs]"));
 
   int num_seqs = q_dims[0];
-  int num_heads = merged_qkv ? q_dims[1] - 2 * num_kv_heads : q_dims[1];
-  int head_size = q_dims[2];
   int max_num_blocks_per_seq = block_table_dims[1];
   int q_stride = q.strides()[0];
   int num_blocks = kv_cache_dims[0];
@@ -142,9 +146,9 @@ void PagedAttnKernel(const paddle::Tensor& q,
                     common::errors::InvalidArgument(
                         "kv_cache_dims[2] must be equal to block_size"));
   PADDLE_ENFORCE_EQ(kv_cache_dims[3],
-                    head_size,
+                    head_dim,
                     common::errors::InvalidArgument(
-                        "kv_cache_dims[3] must be equal to head_size"));
+                        "kv_cache_dims[3] must be equal to head_dim"));
   PADDLE_ENFORCE_EQ(block_table_dims[0],
                     num_seqs,
                     common::errors::InvalidArgument(
@@ -162,14 +166,13 @@ void PagedAttnKernel(const paddle::Tensor& q,
   const float *rope_sin_ptr = merged_qkv ? rope_sin.get().data<float>() : nullptr;
   const float *rope_cos_ptr = merged_qkv ? rope_cos.get().data<float>() : nullptr;
 
-  auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(q.place()));
   cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
 
   size_t workspace_size = 0;
   CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(num_seqs,
                                                    num_heads,
                                                    num_kv_heads,
-                                                   head_size,
+                                                   head_dim,
                                                    block_size,
                                                    max_context_len,
                                                    &workspace_size));
@@ -189,7 +192,7 @@ void PagedAttnKernel(const paddle::Tensor& q,
                                        num_seqs,
                                        num_heads,
                                        num_kv_heads,
-                                       head_size,
+                                       head_dim,
                                        q_stride,
                                        kv_block_stride,
                                        kv_head_stride,
@@ -215,6 +218,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                       const paddle::optional<paddle::Tensor> &v,
                                       const paddle::optional<paddle::Tensor> &rope_sin,
                                       const paddle::optional<paddle::Tensor> &rope_cos,
+                                      int num_heads,
+                                      int head_dim,
                                       int num_kv_heads,
                                       float scale,
                                       int block_size,
@@ -228,11 +233,7 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                       bool merged_qkv) {
 
   const auto dtype = q.dtype();
-  auto out_shape = q.shape();
-  if (merged_qkv) {
-    out_shape[1] -= 2 * num_kv_heads;
-  }
-  auto out = paddle::empty(out_shape, dtype, q.place());
+  auto out = paddle::empty({q.shape()[0], num_heads * head_dim}, dtype, q.place());
 
   switch (dtype) {
     case paddle::DataType::BFLOAT16:
@@ -246,6 +247,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                        v,
                                        rope_sin,
                                        rope_cos,
+                                       num_heads,
+                                       head_dim,
                                        num_kv_heads,
                                        scale,
                                        block_size,
@@ -270,6 +273,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                        v,
                                        rope_sin,
                                        rope_cos,
+                                       num_heads,
+                                       head_dim,
                                        num_kv_heads,
                                        scale,
                                        block_size,
@@ -299,6 +304,8 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>& q_shape,
                                                       const std::vector<int64_t>& v_shape,
                                                       const std::vector<int64_t>& rope_sin_shape,
                                                       const std::vector<int64_t>& rope_cos_shape,
+                                                      int num_heads,
+                                                      int head_dim,
                                                       int num_kv_heads,
                                                       float scale,
                                                       int block_size,
@@ -311,36 +318,13 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>& q_shape,
                                                       bool use_sqrt_alibi,
                                                       bool merged_qkv) {
   if (merged_qkv) {
-    int64_t num_tokens = q_shape[0];
-    int64_t num_heads = q_shape[1] - 2 * num_kv_heads;
-    int64_t head_dim = q_shape[2];
-    return {{num_tokens, num_heads, head_dim}};
+    return {{q_shape[0], num_heads * head_dim}};
   } else {
     return {q_shape};
   }
 }
 
-std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype,
-                                                  const paddle::DataType& k_cache_dtype,
-                                                  const paddle::DataType& v_cache_dtype,
-                                                  const paddle::DataType& block_table_dtype,
-                                                  const paddle::DataType& seq_lens_dtype,
-                                                  const paddle::DataType& alibi_slopes_dtype,
-                                                  const paddle::DataType& k_dtype,
-                                                  const paddle::DataType& v_dtype,
-                                                  const paddle::DataType& rope_sin_dtype,
-                                                  const paddle::DataType& rope_cos_dtype,
-                                                  int num_kv_heads,
-                                                  float scale,
-                                                  int block_size,
-                                                  int max_context_len,
-                                                  bool causal,
-                                                  int window_left,
-                                                  int window_right,
-                                                  float softcap,
-                                                  bool enable_cuda_graph,
-                                                  bool use_sqrt_alibi,
-                                                  bool merged_qkv) {
+std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype) {
   return {q_dtype};
 }
 
@@ -351,7 +335,9 @@ PD_BUILD_STATIC_OP(paged_attn)
              paddle::Optional("v"), paddle::Optional("rope_sin"),
              paddle::Optional("rope_cos")})
     .Outputs({"out"})
-    .Attrs({"num_kv_heads:int",
+    .Attrs({"num_heads:int",
+            "head_dim:int",
+            "num_kv_heads:int",
             "scale:float",
             "block_size:int",
             "max_context_len:int",
custom_ops/iluvatar_ops/prefill_fused_attn.cu (new file, 378 lines)
@@ -0,0 +1,378 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "helper.h"
#include "iluvatar_context.h"

template <paddle::DataType T>
void PrefillFusedPagedAttnKernel(const paddle::Tensor& qkv,
                                 paddle::Tensor& k_cache,
                                 paddle::Tensor& v_cache,
                                 const paddle::Tensor& block_table,
                                 const paddle::Tensor& cu_seqlens_qkv,
                                 const paddle::optional<paddle::Tensor>& rope_sin,
                                 const paddle::optional<paddle::Tensor>& rope_cos,
                                 int num_heads,
                                 int head_dim,
                                 int num_kv_heads,
                                 int block_size,
                                 int max_seq_len,
                                 float scale,
                                 bool causal,
                                 bool q_rope,
                                 bool k_rope,
                                 bool v_rope,
                                 paddle::Tensor& out) {
  // check dtype and contiguous
  const auto& dtype = qkv.dtype();
  cuinferDataType_t data_type;
  if (dtype == paddle::DataType::FLOAT16) {
    data_type = CUINFER_DATA_HALF;
  } else if (dtype == paddle::DataType::BFLOAT16) {
    data_type = CUINFER_DATA_BFLOAT16;
  } else {
    PD_THROW("paged_attention supports only half and bfloat16 now");
  }

  PADDLE_ENFORCE_EQ(k_cache.dtype(),
                    dtype,
                    common::errors::InvalidArgument(
                        "k_cache dtype must be the same as query dtype"));
  PADDLE_ENFORCE_EQ(k_cache.is_contiguous(),
                    true,
                    common::errors::InvalidArgument(
                        "paged_attention expects k_cache is contiguous"));
  PADDLE_ENFORCE_EQ(block_table.dtype(),
                    paddle::DataType::INT32,
                    common::errors::InvalidArgument(
                        "block_table dtype must be int32"));
  PADDLE_ENFORCE_EQ(block_table.is_contiguous(),
                    true,
                    common::errors::InvalidArgument(
                        "paged_attention expects block_table is contiguous"));
  PADDLE_ENFORCE_EQ(cu_seqlens_qkv.dtype(),
                    paddle::DataType::INT32,
                    common::errors::InvalidArgument(
                        "cu_seqlens_qkv dtype must be int32"));
  PADDLE_ENFORCE_EQ(cu_seqlens_qkv.is_contiguous(),
                    true,
                    common::errors::InvalidArgument(
                        "paged_attention expects cu_seqlens_qkv is contiguous"));
  // check dim and shape
  // k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
  // v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
  // block_table: [batch_size, max_num_blocks_per_seq]
  // seq_lens: [batch_size]
  // qkv: [num_tokens, (num_heads+2*num_kv_heads)*head_dim]
  // out: [num_tokens, hidden_size]

  const auto& qkv_dims = qkv.dims();
  PADDLE_ENFORCE_EQ(qkv_dims.size(),
                    2,
                    common::errors::InvalidArgument(
                        "paged_attn receive query dims is "
                        "[num_tokens, (num_heads+2*num_kv_heads)*head_dim]"));
  PADDLE_ENFORCE_EQ(out.dims().size(),
                    2,
                    common::errors::InvalidArgument(
                        "paged_attn receive out dims is "
                        "[num_tokens, hidden_size]"));

  const auto& kv_cache_dims = k_cache.dims();
  PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
                    4,
                    common::errors::InvalidArgument(
                        "paged_attn receive kv cache dims is "
                        "[num_blocks, kv_num_heads, block_size, head_dim]"));

  const auto& block_table_dims = block_table.dims();
  PADDLE_ENFORCE_EQ(block_table_dims.size(),
                    2,
                    common::errors::InvalidArgument(
                        "paged_attn receive block_table dims is "
                        "[batch_size, max_num_blocks_per_seq]"));

  const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
  PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims.size(),
                    1,
                    common::errors::InvalidArgument(
                        "paged_attn receive cu_seqlens_qkv dims is [batch_size]"));

  int batch_size = block_table_dims[0];
  int num_tokens = qkv_dims[0];
  int num_total_heads = num_heads + 2 * num_kv_heads;
  int qkv_stride = qkv.strides()[0];
  int num_blocks = kv_cache_dims[0];

  PADDLE_ENFORCE_EQ(kv_cache_dims[1],
                    num_kv_heads,
                    common::errors::InvalidArgument(
                        "kv_cache_dims[1] must be equal to num_kv_head"));
  PADDLE_ENFORCE_EQ(kv_cache_dims[2],
                    block_size,
                    common::errors::InvalidArgument(
                        "kv_cache_dims[2] must be equal to block_size"));
  PADDLE_ENFORCE_EQ(kv_cache_dims[3],
                    head_dim,
                    common::errors::InvalidArgument(
                        "kv_cache_dims[3] must be equal to head_dim"));
  PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims[0],
                    batch_size + 1,
                    common::errors::InvalidArgument(
                        "cu_seqlens_qkv_dims[0] must be equal to batch_size + 1"));

  int block_table_stride = block_table.strides()[0];
  const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
  const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;

  cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();

  size_t workspace_size = 0;
  CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(num_tokens,
                                                             num_heads,
                                                             num_kv_heads,
                                                             head_dim,
                                                             q_rope,
                                                             k_rope,
                                                             v_rope,
                                                             data_type,
                                                             data_type,
                                                             data_type,
                                                             &workspace_size));
  auto* allocator = paddle::GetAllocator(qkv.place());
  phi::Allocator::AllocationPtr tmp_workspace = allocator->Allocate(workspace_size);
  void* workspace_ptr = tmp_workspace->ptr();

  cuinferTensorDescriptor_t qkv_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      qkv_desc,
      data_type,
      3,
      std::vector<int>({num_tokens, num_total_heads, head_dim}).data(),
      std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t qkv_seqlens_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      qkv_seqlens_desc,
      CUINFER_DATA_INT32,
      1,
      std::vector<int>({batch_size + 1}).data(),
      std::vector<int>({1}).data()));

  cuinferTensorDescriptor_t block_table_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      block_table_desc,
      CUINFER_DATA_INT32,
      2,
      std::vector<int>({batch_size, block_table_stride}).data(),
      std::vector<int>({block_table_stride, 1}).data()));

  cuinferTensorDescriptor_t o_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      o_desc,
      data_type,
      3,
      std::vector<int>({num_tokens, num_heads, head_dim}).data(),
      std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t k_cache_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      k_cache_desc,
      data_type,
      4,
      std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
      std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t v_cache_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      v_cache_desc,
      data_type,
      4,
      std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
      std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));

  cuinferTensorDescriptor_t cos_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      cos_desc,
      CUINFER_DATA_FLOAT,
      2,
      std::vector<int>({max_seq_len, head_dim}).data(),
      std::vector<int>({head_dim, 1}).data()));

  cuinferTensorDescriptor_t sin_desc;
  CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
  CUINFER_CHECK(cuinferSetTensorNdDescriptor(
      sin_desc,
      CUINFER_DATA_FLOAT,
      2,
      std::vector<int>({max_seq_len, head_dim}).data(),
      std::vector<int>({head_dim, 1}).data()));

  CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
                                                 qkv_desc,
                                                 qkv.data(),
                                                 qkv_seqlens_desc,
                                                 cu_seqlens_qkv.data<int32_t>(),
                                                 block_table_desc,
                                                 block_table.data<int32_t>(),
                                                 o_desc,
                                                 out.data(),
                                                 k_cache_desc,
                                                 k_cache.data(),
                                                 v_cache_desc,
                                                 v_cache.data(),
                                                 workspace_ptr,
                                                 workspace_size,
                                                 cos_desc,
                                                 rope_cos_ptr,
                                                 sin_desc,
                                                 rope_sin_ptr,
                                                 batch_size,
                                                 num_heads,
                                                 num_kv_heads,
                                                 head_dim,
                                                 causal,
                                                 scale,
                                                 q_rope,
                                                 k_rope,
                                                 v_rope));

  CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
  CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}

std::vector<paddle::Tensor> PrefillFusedPagedAttn(const paddle::Tensor& qkv,
                                                  paddle::Tensor& k_cache,
                                                  paddle::Tensor& v_cache,
                                                  const paddle::Tensor& block_table,
                                                  const paddle::Tensor& cu_seqlens_qkv,
                                                  const paddle::optional<paddle::Tensor>& rope_sin,
                                                  const paddle::optional<paddle::Tensor>& rope_cos,
                                                  int num_heads,
                                                  int head_dim,
                                                  int num_kv_heads,
                                                  int block_size,
                                                  int max_seq_len,
                                                  float scale,
                                                  bool causal,
                                                  bool q_rope,
                                                  bool k_rope,
                                                  bool v_rope) {
  const auto dtype = qkv.dtype();
  auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());

  switch (dtype) {
    case paddle::DataType::BFLOAT16:
      PrefillFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(
          qkv, k_cache, v_cache, block_table, cu_seqlens_qkv, rope_sin, rope_cos,
          num_heads, head_dim, num_kv_heads, block_size, max_seq_len, scale,
          causal, q_rope, k_rope, v_rope, out);
      break;
    case paddle::DataType::FLOAT16:
      PrefillFusedPagedAttnKernel<paddle::DataType::FLOAT16>(
          qkv, k_cache, v_cache, block_table, cu_seqlens_qkv, rope_sin, rope_cos,
          num_heads, head_dim, num_kv_heads, block_size, max_seq_len, scale,
          causal, q_rope, k_rope, v_rope, out);
      break;
    default:
      PD_THROW("Unsupported data type for Paged attn");
  }
  return {out};
}

std::vector<std::vector<int64_t>> PrefillFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
                                                                  const std::vector<int64_t>& k_cache_shape,
                                                                  const std::vector<int64_t>& v_cache_shape,
                                                                  const std::vector<int64_t>& block_table_shape,
                                                                  const std::vector<int64_t>& cu_seqlens_qkv_shape,
                                                                  const std::vector<int64_t>& rope_sin_shape,
                                                                  const std::vector<int64_t>& rope_cos_shape,
                                                                  int num_heads,
                                                                  int head_dim) {
  return {{qkv_shape[0], num_heads * head_dim}};
}

std::vector<paddle::DataType> PrefillFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
  return {qkv_dtype};
}

PD_BUILD_STATIC_OP(prefill_fused_paged_attn)
    .Inputs({"qkv", "k_cache", "v_cache", "block_table", "cu_seqlens_qkv",
             paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
    .Outputs({"out"})
    .Attrs({"num_heads:int",
            "head_dim:int",
            "num_kv_heads:int",
            "block_size:int",
            "max_seq_len:int",
            "scale:float",
            "causal:bool",
            "q_rope:bool",
            "k_rope:bool",
            "v_rope:bool"})
    .SetKernelFn(PD_KERNEL(PrefillFusedPagedAttn))
    .SetInferShapeFn(PD_INFER_SHAPE(PrefillFusedPagedAttnInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(PrefillFusedPagedAttnInferDtype));
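For reference, the shape invariants the `PADDLE_ENFORCE_EQ` checks above assert can be reproduced in a few lines: `cu_seqlens_qkv` has `batch_size + 1` entries and `block_table` is `[batch_size, max_num_blocks_per_seq]`. A sketch under assumed values; the sequential block numbering and `-1` padding are illustrative, since any allocator mapping satisfies the kernel's checks:

```python
# Constructing prefill_fused_paged_attn-style metadata (assumed example values).
import math
import numpy as np

block_size = 16
seq_lens = [5, 33, 16]                     # three prefill sequences
batch_size = len(seq_lens)

cu_seqlens_qkv = np.concatenate(([0], np.cumsum(seq_lens))).astype(np.int32)
assert cu_seqlens_qkv.shape[0] == batch_size + 1   # checked by the kernel

blocks_per_seq = [math.ceil(s / block_size) for s in seq_lens]   # [1, 3, 1]
max_num_blocks_per_seq = max(blocks_per_seq)
block_table = np.full((batch_size, max_num_blocks_per_seq), -1, dtype=np.int32)

next_block = 0                             # hand out physical blocks sequentially
for i, n in enumerate(blocks_per_seq):
    block_table[i, :n] = np.arange(next_block, next_block + n)
    next_block += n
print(block_table)                         # int32, [batch_size, max_num_blocks_per_seq]
```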
@@ -536,6 +536,8 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                 "iluvatar_ops/moe_dispatch.cu",
                 "iluvatar_ops/moe_reduce.cu",
                 "iluvatar_ops/paged_attn.cu",
+                "iluvatar_ops/prefill_fused_attn.cu",
+                "iluvatar_ops/mixed_fused_attn.cu",
                 "iluvatar_ops/w8a16_group_gemm.cu",
                 "iluvatar_ops/runtime/iluvatar_context.cc",
             ],
@@ -1,5 +1,4 @@
 # Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine
-The current version of the software serves only as a demonstration of Iluvatar CoreX combined with the FastDeploy inference framework for large models. Running the latest ERNIE4.5 300B model on the GSM8K dataset takes about 6.3 hours.
 
 ## Machine Preparation
 First, `TP=16` is required when running the ERNIE4.5 300B model, so you need to prepare a machine with the following configurations:
@@ -30,7 +29,7 @@ docker exec -it paddle_infer bash
 ### Install paddle
 
 ```bash
-pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
 pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
 ```
 For the latest paddle version on iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
@@ -78,7 +77,7 @@ prompts = [
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
 
 # load the model
-llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, block_size=16, quantization='wint8')
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, block_size=16, quantization='wint8')
 
 # Perform batch inference
 outputs = llm.generate(prompts, sampling_params)
@@ -390,7 +389,7 @@ export INFERENCE_MSG_QUEUE_ID=232132
 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
 export FD_SAMPLING_CLASS=rejection
 
-python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --static-decode-blocks 0 --quantization wint8
+python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
 ```
 
 4. Running the Script
@@ -403,10 +402,10 @@ After the service is ready, open another terminal and run:
 ```bash
 python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
 ```
-It takes about 6.3 hours to run the GSM8K dataset.
+It takes about 4.8 hours to run the GSM8K dataset.
 
 ```
-Accuracy: 0.964
+Accuracy: 0.962
 Invaild: 0.000
-Latency: 22918.186 s
+Latency: 17332.728 s
 ```
@@ -1,12 +1,11 @@
|
|||||||
# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B
|
# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B
|
||||||
该软件的当前版本仅作为Iluvatar CoreX与大型模型的Fastdeploy推理框架相结合的演示。在GSM8K数据集上运行最新的ERNIE4.5 300B模型大约需要6.3小时。
|
|
||||||
|
|
||||||
## 准备机器
|
## 准备机器
|
||||||
首先您需要准备以下配置的机器
|
首先运行ERNIE4.5 300B模型需要`TP=16`, 所以您需要准备以下配置的机器:
|
||||||
|
|
||||||
| CPU | 内存 | 天数 | 硬盘|
|
| CPU | 内存 | 天数 | 硬盘|
|
||||||
|-----|------|-----|-----|
|
|-----|------|-----|-----|
|
||||||
| x86 | 1TB| 8xBI150| 1TB|
|
| x86 | 1TB| 16xBI150| 1TB|
|
||||||
|
|
||||||
目前需要将完整模型 load 到 host memory 中,需要需要大于 600GB 的 host memory,后续版本会优化。
|
目前需要将完整模型 load 到 host memory 中,需要需要大于 600GB 的 host memory,后续版本会优化。
|
||||||
|
|
||||||
@@ -30,7 +29,7 @@ docker exec -it paddle_infer bash
|
|||||||
### 安装paddle
|
### 安装paddle
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||||
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
|
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
|
||||||
```
|
```
|
||||||
获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
|
获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
|
||||||
@@ -77,7 +76,7 @@ prompts = [
|
|||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||||
|
|
||||||
# 加载模型
|
# 加载模型
|
||||||
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
|
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, quantization='wint8')
|
||||||
|
|
||||||
# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
|
# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
@@ -132,3 +131,281 @@ Now, let's break down each step:
|
|||||||
**Step 3: Drawing the
|
**Step 3: Drawing the
|
||||||
The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
|
The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 在GSM8K数据集上运行ernie4.5 300B模型
|
||||||
|
|
||||||
|
1. 下载GSM8K数据集
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
2. 准备`bench_gsm8k.py`
|
||||||
|
|
||||||
|
```python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Metric evaluation for FastDeploy + ERNIE-4.5-Turbo."""
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py

import argparse
import ast
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import requests
from tqdm import tqdm

INVALID = -9999999


def call_generate(prompt, **kwargs):
    """
    Generates a response based on the input prompt.

    Args:
        prompt (str): The input prompt text.
        **kwargs: Keyword arguments, including server IP address and port number.

    Returns:
        str: The response generated based on the prompt.
    """
    url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "temperature": 0.6,
        "max_tokens": 2047,
        "top_p": 0.95,
        "do_sample": True,
    }

    response = requests.post(url, headers=headers, data=json.dumps(data))
    out = response.json()
    return out["choices"][0]["message"]["content"]


def get_one_example(lines, i, include_answer):
    """
    Retrieves a question-answer example from the given list of text lines.

    Args:
        lines (list of dict): A list of question-answer pairs.
        i (int): The index of the question-answer pair to retrieve from lines.
        include_answer (bool): Whether to include the answer in the returned string.

    Returns:
        str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
    """
    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
    if include_answer:
        ret += " " + lines[i]["answer"]
    return ret


def get_few_shot_examples(lines, k):
    """
    Selects k examples from the given list of text lines and concatenates them into a single string.

    Args:
        lines (list): A list containing text lines.
        k (int): The number of examples to select.

    Returns:
        str: A string composed of k examples, separated by two newline characters.
    """
    ret = ""
    for i in range(k):
        ret += get_one_example(lines, i, True) + "\n\n"
    return ret


def get_answer_value(answer_str):
    """
    Extracts the numerical value from an answer string and returns it.

    Args:
        answer_str (str): The string containing the answer.

    Returns:
        The extracted numerical value; returns INVALID if extraction fails.
    """
    answer_str = answer_str.replace(",", "")
    numbers = re.findall(r"\d+", answer_str)
    if len(numbers) < 1:
        return INVALID
    try:
        return ast.literal_eval(numbers[-1])
    except SyntaxError:
        return INVALID


def read_jsonl(filename: str):
    """
    Reads a JSONL file.

    Args:
        filename (str): Path to the JSONL file.

    Yields:
        dict: A dictionary object corresponding to each line in the JSONL file.
    """
    with open(filename) as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            yield json.loads(line)


def main(args):
    """
    Process inputs and generate answers by calling the model in parallel using a thread pool.

    Args:
        args (argparse.Namespace):
            - num_questions (int): Number of questions to process.
            - num_shots (int): Number of few-shot learning examples.
            - ip (str): IP address of the model service.
            - port (int): Port number of the model service.
            - parallel (int): Number of questions to process in parallel.
            - result_file (str): File path to store the results.

    Returns:
        None
    """
    # Read data
    filename = args.data_path

    lines = list(read_jsonl(filename))

    # Construct prompts
    num_questions = args.num_questions
    num_shots = args.num_shots
    few_shot_examples = get_few_shot_examples(lines, num_shots)

    questions = []
    labels = []
    for i in range(len(lines[:num_questions])):
        questions.append(get_one_example(lines, i, False))
        labels.append(get_answer_value(lines[i]["answer"]))
    assert all(l != INVALID for l in labels)

    states = [None] * len(labels)

    # Use thread pool
    def get_one_answer(i):
        answer = call_generate(
            prompt=few_shot_examples + questions[i],
            # stop=["Question", "Assistant:", "<|separator|>"],
            ip=args.ip,
            port=args.port,
        )
        states[i] = answer

    tic = time.time()
    if args.parallel == 1:
        for i in tqdm(range(len(questions))):
            get_one_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
            list(
                tqdm(
                    executor.map(get_one_answer, list(range(len(questions)))),
                    total=len(questions),
                )
            )

    latency = time.time() - tic
    preds = []
    for i in range(len(states)):
        preds.append(get_answer_value(states[i]))

    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
    invalid = np.mean(np.array(preds) == INVALID)

    # Print results
    print(f"Accuracy: {acc:.3f}")
    print(f"Invalid: {invalid:.3f}")
    print(f"Latency: {latency:.3f} s")

    with open(args.result_file, "a") as fout:
        value = {
            "task": "gsm8k",
            "backend": "paddlepaddle",
            "num_gpus": 1,
            "latency": round(latency, 3),
            "accuracy": round(acc, 3),
            "num_requests": args.num_questions,
            "other": {
                "num_questions": args.num_questions,
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ip", type=str, default="127.0.0.1")
    parser.add_argument("--port", type=str, default="8188")
    parser.add_argument("--num-shots", type=int, default=10)
    parser.add_argument("--data-path", type=str, default="test.jsonl")
    parser.add_argument("--num-questions", type=int, default=1319)
    parser.add_argument("--result-file", type=str, default="result.jsonl")
    parser.add_argument("--parallel", type=int, default=1)
    args = parser.parse_args()
    main(args)
```
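
Note how `get_answer_value` scores a completion: commas are stripped and the last run of digits wins, so the model only has to end its answer with the final number. Two quick illustrations with hypothetical outputs, using the function defined above:

```python
get_answer_value("3 cars and 4 vans make 3 + 4 = 7 vehicles.")  # -> 7
get_answer_value("I am not sure.")                              # -> INVALID (-9999999)
```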
3. Prepare `run_bench.sh`
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection

python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
```
4. Run the benchmark
First, open a terminal and start the server:
```bash
./run_bench.sh
```
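
Before starting the benchmark you can confirm the endpoint responds; this minimal sketch reuses the request shape of `call_generate` from `bench_gsm8k.py` (host and port match the defaults above):

```python
import json

import requests

# One-off smoke test against the OpenAI-compatible server started by run_bench.sh
url = "http://127.0.0.1:8188/v1/chat/completions"
payload = {"messages": [{"role": "user", "content": "1+1="}], "max_tokens": 16}
resp = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
print(resp.json()["choices"][0]["message"]["content"])
```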
Once the server is up, open another terminal and run the client:
```bash
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
```
Inference over the full GSM8K dataset takes roughly 4.8 hours.
```
Accuracy: 0.962
Invalid: 0.000
Latency: 17332.728 s
```
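
Each run also appends one JSON line to `result.jsonl` (the `value` dict written at the end of `main` in `bench_gsm8k.py`). For the run above the record would look roughly like this (illustrative; values taken from the printed summary):

```json
{"task": "gsm8k", "backend": "paddlepaddle", "num_gpus": 1, "latency": 17332.728, "accuracy": 0.962, "num_requests": 1319, "other": {"num_questions": 1319, "parallel": 8}}
```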
@@ -1186,9 +1186,7 @@ class CacheConfig:
             self.kv_cache_ratio = 1.0
         else:
             self.kv_cache_ratio = 0.75
-        self.enc_dec_block_num = (
-            0 if current_platform.is_iluvatar() or current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
-        )
+        self.enc_dec_block_num = 0 if current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
         self.prealloc_dec_block_slot_num_threshold = 12
         self.cache_dtype = "bfloat16"
         self.model_cfg = None
@@ -16,13 +16,11 @@

 from __future__ import annotations

-import os
 from dataclasses import dataclass
 from math import sqrt
 from typing import TYPE_CHECKING, Optional

 import paddle
-from paddle.nn.functional.flash_attention import flash_attn_unpadded

 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
@@ -30,7 +28,11 @@ from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
     AttentionMetadata,
 )
-from fastdeploy.model_executor.ops.iluvatar import paged_attention
+from fastdeploy.model_executor.ops.iluvatar import (
+    mixed_fused_paged_attention,
+    paged_attention,
+    prefill_fused_paged_attention,
+)

 if TYPE_CHECKING:
     from fastdeploy.model_executor.forward_meta import ForwardMeta
@@ -42,26 +44,7 @@ class IluvatarAttentionMetadata(AttentionMetadata):
     IluvatarAttentionMetadata
     """

-    # flash_attn metadata
-    cu_seqlens_q: Optional[paddle.Tensor] = None
-    cu_seqlens_k: Optional[paddle.Tensor] = None
-    fixed_seed_offset: Optional[paddle.Tensor] = None
-    attn_mask: Optional[paddle.Tensor] = None
-    attn_mask_start_row_indices: Optional[paddle.Tensor] = None
-    dropout: float = 0.0
-    causal: bool = True
-    return_softmax: bool = False
-    rng_name: str = ""
-
-    # paged_attn metadata
-    block_tables: Optional[paddle.Tensor] = None
-    seq_lens: Optional[paddle.Tensor] = None
-    num_kv_heads: int = 1
-    scale: float = 1.0
-    block_size: int = 1
-    max_context_len: int = 1
     alibi_slopes: Optional[paddle.Tensor] = None
-    # causal: bool = True
     window_left: int = -1
     window_right: int = -1
     softcap: float = 0.0
@@ -88,55 +71,44 @@ class IluvatarAttnBackend(AttentionBackend):
     def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_dim: int):
         super().__init__()
         self.attention_metadata = IluvatarAttentionMetadata()
-        self.attention_metadata.block_size = fd_config.parallel_config.block_size
-        assert (
-            fd_config.parallel_config.enc_dec_block_num == 0
-        ), f"Iluvatar does not support yet, {fd_config.parallel_config.enc_dec_block_num}"
-        assert self.attention_metadata.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
-
-        self.attention_metadata.max_context_len = fd_config.parallel_config.max_model_len
-        self.attention_metadata.causal = getattr(fd_config.model_config, "causal", True)
+        self.block_size = fd_config.parallel_config.block_size
+        assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
+        self.max_context_len = fd_config.parallel_config.max_model_len
+        self.causal = getattr(fd_config.model_config, "causal", True)
         self.speculate_method = getattr(fd_config.parallel_config, "speculate_method", None)
         self.use_speculate = self.speculate_method is not None
-        self.attention_metadata.num_kv_heads = kv_num_heads
-        self.attention_metadata.dropout = fd_config.model_config.hidden_dropout_prob
+        self.num_kv_heads = kv_num_heads
         self.num_heads = num_heads
         self.total_num_heads = num_heads + 2 * kv_num_heads
         self.head_dim = head_dim
-        self.hidden_dim = num_heads * head_dim
-        self.total_hidden_dim = self.total_num_heads * head_dim
+        self.hidden_dim = fd_config.model_config.hidden_size
         # note: scale need to change if using MLA
-        self.attention_metadata.scale = 1.0 / sqrt(head_dim)
+        self.scale = 1.0 / sqrt(head_dim)
         self.num_layers = fd_config.model_config.num_hidden_layers
         self.dtype = paddle.get_default_dtype()

-        self.record_block_table_metadata = {}
-        self.enable_fused_attention = int(os.getenv("FD_ILUVATAR_ENABLE_FUSED_ATTN", 1))
-
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attntion metadata hence all layers in the forward pass can reuse it."""
+        self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
+        self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
         self.prefill_info_dict = {}
         self.decode_info_dict = {}
-        prefill_non_zeros_ids = forward_meta.seq_lens_this_time > 1
-        decode_non_zeros_ids = forward_meta.seq_lens_this_time == 1
-        self.prefill_info_dict["batch_ids"] = paddle.where(prefill_non_zeros_ids)[0]
-        self.decode_info_dict["batch_ids"] = paddle.where(decode_non_zeros_ids)[0]
+        self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0]
+        self.decode_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_decoder)[0]

         self.prefill_len = len(self.prefill_info_dict["batch_ids"])
         self.decode_len = len(self.decode_info_dict["batch_ids"])
         # only prefill
         if self.decode_len == 0:
             cu_seq_ids = list(range(self.prefill_len + 1))
             self.prefill_info_dict["cu_seqlens_q"] = forward_meta.cu_seqlens_q[cu_seq_ids]
+            self.mixed = False
         # only decode
         elif self.prefill_len == 0:
-            pass
+            self.mixed = False
         # both prefill and decode
         else:
-            prefill_num_tokens = paddle.sum(forward_meta.seq_lens_this_time[prefill_non_zeros_ids])
-            decode_num_tokens = paddle.sum(forward_meta.seq_lens_this_time[decode_non_zeros_ids])
+            self.mixed = True
+            self.prefill_num_tokens = paddle.sum(forward_meta.seq_lens_encoder).item()
             self.prefill_info_dict["cu_seqlens_q"] = paddle.zeros(
                 [self.prefill_len + 1], dtype=forward_meta.cu_seqlens_q.dtype
             )
@@ -145,36 +117,30 @@ class IluvatarAttnBackend(AttentionBackend):
             ]
             self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(self.prefill_info_dict["cu_seqlens_q"])

-            self.prefill_qkv = paddle.zeros([prefill_num_tokens, self.total_hidden_dim], dtype=self.dtype)
-            self.decode_qkv = paddle.zeros([decode_num_tokens, self.total_hidden_dim], dtype=self.dtype)
-            self.merged_output = paddle.zeros(
-                [prefill_num_tokens + decode_num_tokens, self.num_heads, self.head_dim], dtype=self.dtype
+            self.tmp_buffer = paddle.zeros(
+                [self.prefill_num_tokens + self.decode_len, self.hidden_dim], dtype=self.dtype
             )

-            prefill_start, decode_start, start = 0, 0, 0
+            prefill_start, decode_start, start = 0, self.prefill_num_tokens, 0
             non_zeros_ids = forward_meta.seq_lens_this_time != 0
             non_zeros_seq_lens = forward_meta.seq_lens_this_time[non_zeros_ids]
             end = non_zeros_seq_lens[0]
             if end > 1:
                 last_stage = "prefill"
                 prefill_end = end
-                decode_end = 0
+                decode_end = decode_start
             else:
                 last_stage = "decode"
                 prefill_end = 0
-                decode_end = end
+                decode_end = decode_start + end

-            self.prefill_info_dict["id_group"] = []
-            self.prefill_info_dict["reverse_id_group"] = []
-            self.decode_info_dict["id_group"] = []
-            self.decode_info_dict["reverse_id_group"] = []
-            self.record_stages = []
+            self.id_group = []
+            self.reverse_id_group = []
             for seq_len in non_zeros_seq_lens[1:]:
                 if seq_len > 1:
                     if last_stage == "decode":
-                        self.record_stages.append((last_stage, len(self.decode_info_dict["id_group"])))
-                        self.decode_info_dict["id_group"].append((decode_start, decode_end))
-                        self.decode_info_dict["reverse_id_group"].append((start, end))
+                        self.id_group.append((decode_start, decode_end))
+                        self.reverse_id_group.append((start, end))
                         decode_start = decode_end
                         start = end
                         last_stage = "prefill"
@@ -182,9 +148,8 @@ class IluvatarAttnBackend(AttentionBackend):
                     end += seq_len
                 else:
                     if last_stage == "prefill":
-                        self.record_stages.append((last_stage, len(self.prefill_info_dict["id_group"])))
-                        self.prefill_info_dict["id_group"].append((prefill_start, prefill_end))
-                        self.prefill_info_dict["reverse_id_group"].append((start, end))
+                        self.id_group.append((prefill_start, prefill_end))
+                        self.reverse_id_group.append((start, end))
                         prefill_start = prefill_end
                         start = end
                         last_stage = "decode"
@@ -192,13 +157,11 @@ class IluvatarAttnBackend(AttentionBackend):
                 end += seq_len

             if prefill_start < prefill_end:
-                self.record_stages.append(("prefill", len(self.prefill_info_dict["id_group"])))
-                self.prefill_info_dict["id_group"].append((prefill_start, prefill_end))
-                self.prefill_info_dict["reverse_id_group"].append((start, end))
+                self.id_group.append((prefill_start, prefill_end))
+                self.reverse_id_group.append((start, end))
             if decode_start < decode_end:
-                self.record_stages.append(("decode", len(self.decode_info_dict["id_group"])))
-                self.decode_info_dict["id_group"].append((decode_start, decode_end))
-                self.decode_info_dict["reverse_id_group"].append((start, end))
+                self.id_group.append((decode_start, decode_end))
+                self.reverse_id_group.append((start, end))

     def get_attntion_meta(self):
         """get_attntion_meta"""
@@ -214,206 +177,20 @@ class IluvatarAttnBackend(AttentionBackend):
         """
         return (
             max_num_blocks,
-            self.attention_metadata.num_kv_heads,
-            self.attention_metadata.block_size,
+            self.num_kv_heads,
+            self.block_size,
             self.head_dim,
         )

-    def prefill_update_kv_cache(
-        self, k, v, k_cache_id: int, v_cache_id: int, layer_id: int, forward_meta: ForwardMeta, prefill_batch_ids: list
-    ):
-        # [num_tokens, num_kv_heads, head_dim] -> [num_kv_heads, num_tokens, head_dim]
-        trans_k = k.transpose([1, 0, 2]).contiguous()
-        trans_v = v.transpose([1, 0, 2]).contiguous()
-        tensor_start = 0
-        for batch_idx in prefill_batch_ids:
-            seq_len = forward_meta.seq_lens_this_time[batch_idx]
-
-            tensor_end = tensor_start + seq_len
-            slice_trans_k = trans_k[:, tensor_start:tensor_end, :]
-            slice_trans_v = trans_v[:, tensor_start:tensor_end, :]
-
-            cur_block_tables = forward_meta.block_tables[batch_idx]
-            cur_used_block_tables = cur_block_tables[cur_block_tables != -1]
-
-            cache_start = 0
-            cur_used_num_blocks = cur_used_block_tables.shape[0]
-            for i, block_id in enumerate(cur_used_block_tables):
-                # last block: seq_len - cache_start <= block_size
-                if i == cur_used_num_blocks - 1:
-                    cache_end = seq_len - cache_start
-                    assert cache_end <= self.attention_metadata.block_size
-                    paddle.assign(
-                        slice_trans_k[:, cache_start:seq_len, :],
-                        output=forward_meta.caches[k_cache_id][block_id, :, 0:cache_end, :],
-                    )
-                    paddle.assign(
-                        slice_trans_v[:, cache_start:seq_len, :],
-                        output=forward_meta.caches[v_cache_id][block_id, :, 0:cache_end, :],
-                    )
-                    if layer_id == self.num_layers - 1:
-                        self.record_block_table_metadata[batch_idx] = {
-                            "block_id": block_id.item(),
-                            "cache_end": cache_end.item(),
-                        }
-                # non last block: seq_lens_this_time > block_size
-                else:
-                    assert seq_len > self.attention_metadata.block_size
-                    cache_end = cache_start + self.attention_metadata.block_size
-                    paddle.assign(
-                        slice_trans_k[:, cache_start:cache_end, :], output=forward_meta.caches[k_cache_id][block_id]
-                    )
-                    paddle.assign(
-                        slice_trans_v[:, cache_start:cache_end, :], output=forward_meta.caches[v_cache_id][block_id]
-                    )
-                    cache_start += self.attention_metadata.block_size
-
-            tensor_start = tensor_end
-
-    def get_splited_qkv(
-        self, qkv: paddle.Tensor, forward_meta: ForwardMeta, cu_seqlens_q: paddle.Tensor, batch_ids=None
-    ):
-        q_end = self.hidden_dim
-        k_end = q_end + self.attention_metadata.num_kv_heads * self.head_dim
-        v_end = k_end + self.attention_metadata.num_kv_heads * self.head_dim
-        assert v_end == qkv.shape[-1], f"Shape mismatch: {v_end} vs {qkv.shape[-1]}"
-        assert qkv.shape[0] == cu_seqlens_q[-1], f"Shape mismatch: {qkv.shape[0]} vs {cu_seqlens_q[-1]}"
-
-        if batch_ids is None:
-            batch_ids = list(range(forward_meta.seq_lens_this_time.shape[0]))
-
-        q = qkv[..., 0:q_end]
-        k = qkv[..., q_end:k_end]
-        v = qkv[..., k_end:v_end]
-        q = q.view([-1, self.num_heads, self.head_dim])
-        k = k.view([-1, self.attention_metadata.num_kv_heads, self.head_dim])
-        v = v.view([-1, self.attention_metadata.num_kv_heads, self.head_dim])
-
-        for idx in range(len(cu_seqlens_q) - 1):
-            batch_idx = batch_ids[idx]
-            seq_len_i = forward_meta.seq_lens_this_time[batch_idx]
-            if seq_len_i == 0:
-                continue
-            cached_kv_len = forward_meta.seq_lens_decoder[batch_idx][0]
-            cu_seq_start_q = cu_seqlens_q[idx]
-            cu_seq_end_q = cu_seqlens_q[idx + 1]
-            # forward_meta.rotary_embs is [2, 1, S, 1, D]
-            if forward_meta.rotary_embs is not None:
-                cos = forward_meta.rotary_embs[0, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :]
-                sin = forward_meta.rotary_embs[1, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :]
-                q[cu_seq_start_q:cu_seq_end_q] = apply_rope(q[cu_seq_start_q:cu_seq_end_q], cos, sin)
-                k[cu_seq_start_q:cu_seq_end_q] = apply_rope(k[cu_seq_start_q:cu_seq_end_q], cos, sin)
-
-        return q, k, v
-
-    def split_pd_qkv(self, qkv):
-
-        for ids, reverse_ids in zip(self.prefill_info_dict["id_group"], self.prefill_info_dict["reverse_id_group"]):
-            self.prefill_qkv[ids[0] : ids[1], :] = qkv[reverse_ids[0] : reverse_ids[1], :]
-
-        for ids, reverse_ids in zip(self.decode_info_dict["id_group"], self.decode_info_dict["reverse_id_group"]):
-            self.decode_qkv[ids[0] : ids[1], :] = qkv[reverse_ids[0] : reverse_ids[1], :]
-
-        return self.prefill_qkv, self.decode_qkv
-
-    def merge_pd_output(self, prefill_out, decode_out):
-        for stage, idx in self.record_stages:
-            if stage == "prefill":
-                ids = self.prefill_info_dict["id_group"][idx]
-                reverse_ids = self.prefill_info_dict["reverse_id_group"][idx]
-                self.merged_output[reverse_ids[0] : reverse_ids[1], :, :] = prefill_out[ids[0] : ids[1], :, :]
-            else:
-                ids = self.decode_info_dict["id_group"][idx]
-                reverse_ids = self.decode_info_dict["reverse_id_group"][idx]
-                self.merged_output[reverse_ids[0] : reverse_ids[1], :, :] = decode_out[ids[0] : ids[1], :, :]
-        return self.merged_output
-
-    def forward_prefill(self, prefill_qkv, layer_id, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
-        prefill_q, prefill_k, prefill_v = self.get_splited_qkv(
-            prefill_qkv,
-            forward_meta,
-            self.prefill_info_dict["cu_seqlens_q"],
-            batch_ids=self.prefill_info_dict["batch_ids"],
-        )
-
-        prefill_out = flash_attn_unpadded(
-            prefill_q,
-            prefill_k,
-            prefill_v,
-            cu_seqlens_q=self.prefill_info_dict["cu_seqlens_q"],
-            cu_seqlens_k=self.prefill_info_dict["cu_seqlens_q"],
-            max_seqlen_q=self.attention_metadata.max_context_len,
-            max_seqlen_k=self.attention_metadata.max_context_len,
-            scale=self.attention_metadata.scale,
-            dropout=self.attention_metadata.dropout,
-            causal=self.attention_metadata.causal,
-            return_softmax=self.attention_metadata.return_softmax,
-        )[0]
-        self.prefill_update_kv_cache(
-            prefill_k, prefill_v, k_cache_id, v_cache_id, layer_id, forward_meta, self.prefill_info_dict["batch_ids"]
-        )
-
-        return prefill_out
-
-    def forward_decode(self, decode_qkv, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
-        k_cache = forward_meta.caches[k_cache_id]
-        v_cache = forward_meta.caches[v_cache_id]
-        if self.enable_fused_attention:
-            rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
-            rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
-            decode_out = paged_attention(
-                decode_qkv.view([-1, self.total_num_heads, self.head_dim]),
-                k_cache,
-                v_cache,
-                block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
-                seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
-                num_kv_heads=self.attention_metadata.num_kv_heads,
-                scale=self.attention_metadata.scale,
-                block_size=self.attention_metadata.block_size,
-                max_context_len=self.attention_metadata.max_context_len,
-                alibi_slopes=self.attention_metadata.alibi_slopes,
-                causal=self.attention_metadata.causal,
-                window_left=self.attention_metadata.window_left,
-                window_right=self.attention_metadata.window_right,
-                softcap=self.attention_metadata.softcap,
-                use_cuda_graph=self.attention_metadata.use_cuda_graph,
-                use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
-                merged_qkv=True,
-                k=decode_qkv,
-                v=decode_qkv,
-                rope_sin=rope_sin,
-                rope_cos=rope_cos,
-            )
-        else:
-            decode_q, decode_k, decode_v = self.get_splited_qkv(
-                decode_qkv,
-                forward_meta,
-                self.decode_info_dict["cu_seqlens_q"],
-                batch_ids=self.decode_info_dict["batch_ids"],
-            )
-
-            decode_out = paged_attention(
-                decode_q,
-                k_cache,
-                v_cache,
-                block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
-                seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
-                num_kv_heads=self.attention_metadata.num_kv_heads,
-                scale=self.attention_metadata.scale,
-                block_size=self.attention_metadata.block_size,
-                max_context_len=self.attention_metadata.max_context_len,
-                alibi_slopes=self.attention_metadata.alibi_slopes,
-                causal=self.attention_metadata.causal,
-                window_left=self.attention_metadata.window_left,
-                window_right=self.attention_metadata.window_right,
-                softcap=self.attention_metadata.softcap,
-                use_cuda_graph=self.attention_metadata.use_cuda_graph,
-                use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
-                k=decode_k,
-                v=decode_v,
-            )
-
-        return decode_out
+    def transpose(self, hidden_states):
+        for ids, reverse_ids in zip(self.id_group, self.reverse_id_group):
+            self.tmp_buffer[ids[0] : ids[1], :] = hidden_states[reverse_ids[0] : reverse_ids[1], :]
+        return self.tmp_buffer
+
+    def reverse_transpose(self, hidden_states):
+        for ids, reverse_ids in zip(self.id_group, self.reverse_id_group):
+            self.tmp_buffer[reverse_ids[0] : reverse_ids[1], :] = hidden_states[ids[0] : ids[1], :]
+        return self.tmp_buffer

     def forward_mixed(
         self,
@@ -429,23 +206,84 @@ class IluvatarAttnBackend(AttentionBackend):
         """
         forward_mixed
         """
-        assert not self.use_speculate, "IluvatarAttnBackend cannot support speculate now"
         layer_id = layer.layer_id
         k_cache_id = layer_id * 2
         v_cache_id = k_cache_id + 1
-        q_dim = qkv.dim()
-        assert q_dim == 2
+        k_cache = forward_meta.caches[k_cache_id]
+        v_cache = forward_meta.caches[v_cache_id]

         if self.decode_len == 0:
-            output = self.forward_prefill(qkv, layer_id, k_cache_id, v_cache_id, forward_meta)
+            output = prefill_fused_paged_attention(
+                qkv,
+                k_cache,
+                v_cache,
+                block_tables=forward_meta.block_tables[self.prefill_info_dict["batch_ids"], :],
+                cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"],
+                num_heads=self.num_heads,
+                head_dim=self.head_dim,
+                num_kv_heads=self.num_kv_heads,
+                block_size=self.block_size,
+                max_seq_len=self.max_context_len,
+                scale=self.scale,
+                causal=self.causal,
+                q_rope=True,
+                k_rope=True,
+                v_rope=False,
+                rope_sin=self.rope_sin,
+                rope_cos=self.rope_cos,
+            )
         elif self.prefill_len == 0:
-            output = self.forward_decode(qkv, k_cache_id, v_cache_id, forward_meta)
+            output = paged_attention(
+                qkv,
+                k_cache,
+                v_cache,
+                block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
+                seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
+                num_heads=self.num_heads,
+                head_dim=self.head_dim,
+                num_kv_heads=self.num_kv_heads,
+                scale=self.scale,
+                block_size=self.block_size,
+                max_context_len=self.max_context_len,
+                alibi_slopes=self.attention_metadata.alibi_slopes,
+                causal=self.causal,
+                window_left=self.attention_metadata.window_left,
+                window_right=self.attention_metadata.window_right,
+                softcap=self.attention_metadata.softcap,
+                use_cuda_graph=self.attention_metadata.use_cuda_graph,
+                use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
+                merged_qkv=True,
+                k=qkv,
+                v=qkv,
+                rope_sin=self.rope_sin,
+                rope_cos=self.rope_cos,
+            )
         else:
-            prefill_qkv, decode_qkv = self.split_pd_qkv(qkv)
-            prefill_output = self.forward_prefill(prefill_qkv, layer_id, k_cache_id, v_cache_id, forward_meta)
-            decode_output = self.forward_decode(decode_qkv, k_cache_id, v_cache_id, forward_meta)
-            output = self.merge_pd_output(prefill_output, decode_output)
+            output = mixed_fused_paged_attention(
+                qkv,
+                k_cache,
+                v_cache,
+                prefill_block_tables=forward_meta.block_tables[self.prefill_info_dict["batch_ids"], :],
+                decode_block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
+                cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"],
+                seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
+                prefill_num_tokens=self.prefill_num_tokens,
+                num_heads=self.num_heads,
+                head_dim=self.head_dim,
+                num_kv_heads=self.num_kv_heads,
+                block_size=self.block_size,
+                max_seq_len=self.max_context_len,
+                scale=self.scale,
+                causal=self.causal,
+                q_rope=True,
+                k_rope=True,
+                v_rope=False,
+                window_left=self.attention_metadata.window_left,
+                window_right=self.attention_metadata.window_right,
+                softcap=self.attention_metadata.softcap,
+                use_cuda_graph=self.attention_metadata.use_cuda_graph,
+                use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
+                rope_sin=self.rope_sin,
+                rope_cos=self.rope_cos,
+            )

-        output = output.view([-1, self.num_heads * self.head_dim])
         return output
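Context for the attention-backend diff above: in a mixed step the new `transpose` copies every prefill token to the front of `tmp_buffer` and every decode token behind them (`id_group` holds destination ranges, `reverse_id_group` the matching source ranges), and `reverse_transpose` undoes the permutation after the final norm. A toy sketch of that bookkeeping in plain Python (hypothetical batch of four requests, no Paddle needed):

```python
# Mixed batch: seq_lens_this_time = [3, 1, 2, 1] -> 5 prefill tokens, 2 decode tokens
# Destination/source ranges as init_attention_metadata would record them
id_group = [(0, 3), (5, 6), (3, 5), (6, 7)]          # where each segment lands
reverse_id_group = [(0, 3), (3, 4), (4, 6), (6, 7)]  # where it comes from

src = list(range(7))  # stand-in for hidden_states rows
dst = [None] * 7      # stand-in for tmp_buffer
for (d0, d1), (s0, s1) in zip(id_group, reverse_id_group):
    dst[d0:d1] = src[s0:s1]
print(dst)  # [0, 1, 2, 4, 5, 3, 6] -- prefill rows first, then decode rows
```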
@@ -83,7 +83,6 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
             expert_idx_per_token,
             self.moe_quant_type,
             used_in_ep_low_latency,
-            estimate_total_token_nums,
         )
         return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
             permute_input,
@@ -53,6 +53,7 @@ from fastdeploy.model_executor.models.model_base import (
 from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
 from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid
 from fastdeploy.model_executor.models.utils import WeightMeta
+from fastdeploy.platforms import current_platform
 from fastdeploy.worker.experts_manager import RedundantExpertManger


@@ -464,6 +465,9 @@ class Ernie4_5_Model(nn.Layer):
     ):
         hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)

+        if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+            hidden_states = forward_meta.attn_backend.transpose(hidden_states)
+
         residual = None
         for i in range(self.num_layers):
             hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual)
@@ -472,6 +476,9 @@ class Ernie4_5_Model(nn.Layer):

         out = self.norm(hidden_states)

+        if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+            out = forward_meta.attn_backend.reverse_transpose(out)
+
         return out
@@ -20,4 +20,8 @@ PACKAGE = "fastdeploy.model_executor.ops.iluvatar"
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())

 from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn  # noqa: F401
-from .paged_attention import paged_attention  # noqa: F401
+from .paged_attention import (  # noqa: F401
+    mixed_fused_paged_attention,
+    paged_attention,
+    prefill_fused_paged_attention,
+)
@@ -17,9 +17,15 @@
 import paddle

 try:
-    from fastdeploy.model_executor.ops.iluvatar import paged_attn
+    from fastdeploy.model_executor.ops.iluvatar import (
+        mixed_fused_paged_attn,
+        paged_attn,
+        prefill_fused_paged_attn,
+    )
 except ImportError:
     paged_attn = None
+    prefill_fused_paged_attn = None
+    mixed_fused_paged_attn = None


 def paged_attention(
@@ -28,6 +34,8 @@ def paged_attention(
     v_cache: paddle.Tensor,
     block_tables: paddle.Tensor,
     seq_lens: paddle.Tensor,
+    num_heads: int,
+    head_dim: int,
     num_kv_heads: int,
     scale: float,
     block_size: int,
@@ -45,7 +53,7 @@ def paged_attention(
     rope_sin: paddle.Tensor = None,
     rope_cos: paddle.Tensor = None,
 ):
-    output = paged_attn(
+    return paged_attn(
         q,
         k_cache,
         v_cache,
@@ -56,6 +64,8 @@ def paged_attention(
         v,
         rope_sin,
         rope_cos,
+        num_heads,
+        head_dim,
         num_kv_heads,
         scale,
         block_size,
@@ -68,4 +78,99 @@ def paged_attention(
         use_sqrt_alibi,
         merged_qkv,
     )
-    return output[0] if isinstance(output, list) else output
+
+
+def prefill_fused_paged_attention(
+    qkv: paddle.Tensor,
+    k_cache: paddle.Tensor,
+    v_cache: paddle.Tensor,
+    block_tables: paddle.Tensor,
+    cu_seqlens_qkv: paddle.Tensor,
+    num_heads: int,
+    head_dim: int,
+    num_kv_heads: int,
+    block_size: int,
+    max_seq_len: int,
+    scale: float,
+    causal: bool = True,
+    q_rope: bool = True,
+    k_rope: bool = True,
+    v_rope: bool = False,
+    rope_sin: paddle.Tensor = None,
+    rope_cos: paddle.Tensor = None,
+):
+    return prefill_fused_paged_attn(
+        qkv,
+        k_cache,
+        v_cache,
+        block_tables,
+        cu_seqlens_qkv,
+        rope_sin,
+        rope_cos,
+        num_heads,
+        head_dim,
+        num_kv_heads,
+        block_size,
+        max_seq_len,
+        scale,
+        causal,
+        q_rope,
+        k_rope,
+        v_rope,
+    )
+
+
+def mixed_fused_paged_attention(
+    qkv: paddle.Tensor,
+    k_cache: paddle.Tensor,
+    v_cache: paddle.Tensor,
+    prefill_block_tables: paddle.Tensor,
+    decode_block_tables: paddle.Tensor,
+    cu_seqlens_qkv: paddle.Tensor,
+    seq_lens: paddle.Tensor,
+    prefill_num_tokens: int,
+    num_heads: int,
+    head_dim: int,
+    num_kv_heads: int,
+    block_size: int,
+    max_seq_len: int,
+    scale: float,
+    causal: bool = True,
+    q_rope: bool = True,
+    k_rope: bool = True,
+    v_rope: bool = False,
+    window_left: int = -1,
+    window_right: int = -1,
+    softcap: float = 0.0,
+    use_cuda_graph: bool = False,
+    use_sqrt_alibi: bool = False,
+    rope_sin: paddle.Tensor = None,
+    rope_cos: paddle.Tensor = None,
+):
+    return mixed_fused_paged_attn(
+        qkv,
+        k_cache,
+        v_cache,
+        prefill_block_tables,
+        decode_block_tables,
+        cu_seqlens_qkv,
+        seq_lens,
+        rope_sin,
+        rope_cos,
+        prefill_num_tokens,
+        num_heads,
+        head_dim,
+        num_kv_heads,
+        block_size,
+        max_seq_len,
+        scale,
+        causal,
+        q_rope,
+        k_rope,
+        v_rope,
+        window_left,
+        window_right,
+        softcap,
+        use_cuda_graph,
+        use_sqrt_alibi,
+    )
@@ -13,10 +13,10 @@ python -m pip install -r requirements_iluvatar.txt
 echo "uninstall org"
 python -m pip uninstall paddlepaddle -y
 python -m pip uninstall paddle-iluvatar-gpu -y
-python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
-# TODO: Change to open access URL
-python -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
-# python -m pip install /data1/fastdeploy/packages/paddle_iluvatar_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
+# python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+# python -m pip install --pre paddle-iluvatar-gpu==3.0.0.dev20250806 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+python -m pip install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
 # Patch, remove if image updated
 cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h
 echo "build whl"
@@ -1,4 +1,7 @@
 from fastdeploy import LLM, SamplingParams
+from fastdeploy.utils import set_random_seed
+
+set_random_seed(123)

 prompts = [
     "Hello, my name is",
@@ -12,7 +15,6 @@ llm = LLM(
     model="/data1/fastdeploy/ERNIE_300B_4L",
     tensor_parallel_size=8,
     max_model_len=8192,
-    static_decode_blocks=0,
     quantization="wint8",
     block_size=16,
 )
@@ -27,14 +29,14 @@ assert outputs[0].outputs.token_ids == [
     59335,
     68170,
     183,
-    97404,
-    100088,
-    36310,
-    95633,
-    95913,
-    41459,
-    95049,
-    94970,
-    96840,
+    49080,
+    94717,
+    82966,
+    99140,
+    31615,
+    51497,
+    94851,
+    60764,
+    10889,
     2,
 ], f"{outputs[0].outputs.token_ids}"