mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
enhance set_stop_value_multi_ends and standardize the registration of some operators (#4525)
* fix custom_ops * paddleformers>=0.3.1
This commit is contained in:
@@ -18,6 +18,10 @@
|
||||
#include <stdio.h>
|
||||
#include "paddle/extension.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
bool is_in_end(const int64_t id, const int64_t *end_ids, int length) {
|
||||
bool flag = false;
|
||||
for (int i = 0; i < length; i++) {
|
||||
@@ -49,6 +53,8 @@ void set_value_by_flags(bool *stop_flags,
|
||||
}
|
||||
if (!beam_search && is_in_end(topk_ids[bi], end_ids, end_length)) {
|
||||
stop_flags[bi] = true;
|
||||
topk_ids[bi] = end_ids[0];
|
||||
next_tokens[bi] = end_ids[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,10 @@
|
||||
#include "kernel_traits.h"
|
||||
#include "flash_mask_attn_kernel.hpp"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename paddle_type>
|
||||
struct cuteType;
|
||||
|
||||
@@ -142,7 +146,7 @@ std::vector<paddle::Tensor> FlashAttentionMask(
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(flash_attention_mask)
|
||||
PD_BUILD_STATIC_OP(flash_attention_mask)
|
||||
.Inputs({
|
||||
"q_input",
|
||||
"k_input",
|
||||
|
||||
@@ -14,6 +14,10 @@
|
||||
|
||||
#include "paddle/extension.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_ids,
|
||||
const paddle::Tensor& grid_thw,
|
||||
const int64_t image_patch_id) {
|
||||
@@ -53,7 +57,7 @@ std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_id
|
||||
return {out};
|
||||
}
|
||||
|
||||
PD_BUILD_OP(get_img_boundaries)
|
||||
PD_BUILD_STATIC_OP(get_img_boundaries)
|
||||
.Inputs({"task_input_ids", "grid_thw"})
|
||||
.Attrs({"image_patch_id: int64_t"})
|
||||
.Outputs({"img_boundaries"})
|
||||
|
||||
@@ -15,6 +15,10 @@
|
||||
#include "paddle/extension.h"
|
||||
#include <map>
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
std::vector<paddle::Tensor> GetMmSplitFuse(const paddle::Tensor& task_input_ids,
|
||||
const paddle::Tensor& task_image_type_ids,
|
||||
const paddle::Tensor& task_input_ids_image_token_count,
|
||||
@@ -133,7 +137,7 @@ std::vector<paddle::Tensor> GetMmSplitFuse(const paddle::Tensor& task_input_ids,
|
||||
return {image_chunk_selections_out, split_fuse_cur_seq_lens_out};
|
||||
}
|
||||
|
||||
PD_BUILD_OP(get_mm_split_fuse)
|
||||
PD_BUILD_STATIC_OP(get_mm_split_fuse)
|
||||
.Inputs({"task_input_ids", "task_image_type_ids", "task_input_ids_image_token_count", "grid_thw"})
|
||||
.Attrs({"image_token_id: int64_t", "img_total: int64_t", "batch_idx: int", "seq_lens_origin: int", "split_fuse_img_size: int", "split_fuse_text_size: int", "max_chunk_token_size: int"})
|
||||
.Outputs({"image_chunk_selections", "split_fuse_cur_seq_lens"})
|
||||
|
||||
@@ -80,7 +80,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens,
|
||||
batch_size);
|
||||
}
|
||||
|
||||
PD_BUILD_OP(limit_thinking_content_length_v1)
|
||||
PD_BUILD_STATIC_OP(limit_thinking_content_length_v1)
|
||||
.Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
|
||||
.Attrs({"think_end_id: int64_t"})
|
||||
.Outputs({"next_tokens_out"})
|
||||
|
||||
@@ -103,7 +103,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens,
|
||||
batch_size);
|
||||
}
|
||||
|
||||
PD_BUILD_OP(limit_thinking_content_length_v2)
|
||||
PD_BUILD_STATIC_OP(limit_thinking_content_length_v2)
|
||||
.Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"})
|
||||
.Attrs({"think_end_id: int64_t", "line_break_id: int64_t"})
|
||||
.Outputs({"next_tokens_out"})
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
#include "paddle/extension.h"
|
||||
#include "moba_attn.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
std::vector<paddle::Tensor> MobaAttention(
|
||||
const paddle::Tensor& qkv,
|
||||
@@ -272,7 +275,7 @@ std::vector<paddle::Tensor> MobaAttention(
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(moba_attention)
|
||||
PD_BUILD_STATIC_OP(moba_attention)
|
||||
.Inputs({
|
||||
"qkv",
|
||||
"q_input",
|
||||
|
||||
@@ -16,6 +16,9 @@
|
||||
#include "moba_attn/moba_attn_utils.hpp"
|
||||
#include "moba_attn/moba_attn.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename T, int knthreads, int moba_block_size, int kBlockMaxN, int searchtimes>
|
||||
__global__ void qk_gate_sort_decoder_kernel(
|
||||
@@ -221,7 +224,7 @@ std::vector<paddle::Tensor> QkSortDecoder(
|
||||
}
|
||||
}
|
||||
|
||||
PD_BUILD_OP(moba_qk_sort_decoder)
|
||||
PD_BUILD_STATIC_OP(moba_qk_sort_decoder)
|
||||
.Inputs({
|
||||
"qk_gate_weight",
|
||||
"seq_len_encoder",
|
||||
|
||||
@@ -27,6 +27,10 @@
|
||||
#include "softmax.hpp"
|
||||
#include "cutlass/arch/reg_reconfig.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <int kHeadDim>
|
||||
auto get_gmem_layout(int token_num, int head_num) {
|
||||
return make_layout(
|
||||
@@ -360,7 +364,7 @@ void MobaEncoderAttn(
|
||||
}
|
||||
|
||||
|
||||
PD_BUILD_OP(moba_encoder_attn)
|
||||
PD_BUILD_STATIC_OP(moba_encoder_attn)
|
||||
.Inputs({
|
||||
"q_input",
|
||||
"k_input",
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
#include "paddle/extension.h"
|
||||
#include "moba_attn/moba_attn.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename T, int kBlockSize, int kHeadDim>
|
||||
__global__ void write_encoder_cachekv_c16(
|
||||
@@ -135,7 +138,7 @@ void MobaEncoderAttnWriteCacheKv(
|
||||
}
|
||||
}
|
||||
|
||||
PD_BUILD_OP(moba_encoder_attn_write_cache_kv)
|
||||
PD_BUILD_STATIC_OP(moba_encoder_attn_write_cache_kv)
|
||||
.Inputs({
|
||||
"k_input",
|
||||
"v_input",
|
||||
|
||||
@@ -15,6 +15,10 @@
|
||||
#include "paddle/extension.h"
|
||||
#include "moba_attn/moba_attn_utils.hpp"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename T, int knthreads, int moba_block_size, int kBlockM, int kBlockMaxN, int searchtimes>
|
||||
__global__ void qk_gate_sort_encoder_kernel(
|
||||
const T* qk_gate_weight,
|
||||
@@ -320,7 +324,7 @@ std::vector<paddle::Tensor> QkSortEncoder(
|
||||
}
|
||||
}
|
||||
|
||||
PD_BUILD_OP(moba_qk_sort_encoder)
|
||||
PD_BUILD_STATIC_OP(moba_qk_sort_encoder)
|
||||
.Inputs({
|
||||
"qk_gate_weight",
|
||||
"seq_len_encoder",
|
||||
|
||||
@@ -16,6 +16,10 @@
|
||||
#include "moba_attn/moba_attn_utils.hpp"
|
||||
#include "moba_attn/moba_attn.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename T, int kBlockSize, int kHeadDim>
|
||||
__global__ void get_kv_from_cache_c16_kernel(
|
||||
T * k_input,
|
||||
@@ -251,7 +255,7 @@ std::vector<paddle::Tensor> GetCurCuSeqLenk(
|
||||
return {cu_seq_q_pack, cu_seqlens_k, q_pack_tokens_cpu};
|
||||
}
|
||||
|
||||
PD_BUILD_OP(get_kv_from_cache)
|
||||
PD_BUILD_STATIC_OP(get_kv_from_cache)
|
||||
.Inputs({
|
||||
"k_input",
|
||||
"v_input",
|
||||
@@ -277,7 +281,7 @@ PD_BUILD_OP(get_kv_from_cache)
|
||||
{"v_input", "v_input_out"}})
|
||||
.SetKernelFn(PD_KERNEL(GetKVFromCache));
|
||||
|
||||
PD_BUILD_OP(get_cur_cu_seq_len_k)
|
||||
PD_BUILD_STATIC_OP(get_cur_cu_seq_len_k)
|
||||
.Inputs({
|
||||
"seq_lens_encoder",
|
||||
"seq_lens_decoder",
|
||||
|
||||
@@ -16,6 +16,9 @@
|
||||
#include "moba_attn/moba_attn_utils.hpp"
|
||||
#include "moba_attn/moba_attn.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename T, int moba_block_size, int kHeadDim, int kMaxN>
|
||||
__global__ void moba_mlp_einsum_kernel(
|
||||
@@ -207,7 +210,7 @@ std::vector<paddle::Tensor> MobaMlpEinsum(
|
||||
return {k_gate_weight};
|
||||
}
|
||||
|
||||
PD_BUILD_OP(moba_mlp_einsum)
|
||||
PD_BUILD_STATIC_OP(moba_mlp_einsum)
|
||||
.Inputs({
|
||||
"k_input",
|
||||
"attn_gate_weight",
|
||||
|
||||
@@ -25,6 +25,10 @@
|
||||
#include "cutlass/cluster_launch.hpp"
|
||||
#include "cutlass/arch/reg_reconfig.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename input_type, int kBlockM, int kBlockN, int kMobaBlockSize, int kMaxN, int kHeadDim, bool is_split_kv>
|
||||
__global__ void qk_gemm_kernel(
|
||||
const input_type *q_input,
|
||||
@@ -446,7 +450,7 @@ std::vector<paddle::Tensor> MobaQKGemm(
|
||||
}
|
||||
}
|
||||
|
||||
PD_BUILD_OP(moba_qk_gemm)
|
||||
PD_BUILD_STATIC_OP(moba_qk_gemm)
|
||||
.Inputs({
|
||||
"q_input",
|
||||
"k_block_means",
|
||||
|
||||
@@ -16,6 +16,10 @@
|
||||
#include "moba_attn/moba_attn_utils.hpp"
|
||||
#include "moba_attn/moba_attn.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename input_type, int moba_block_size, int kBlockM, int kMaxN, int tokens_per_block, bool need_k_mean>
|
||||
__global__ void fused_block_mean_and_rope_kernel(
|
||||
const input_type *qkv_input,
|
||||
@@ -341,7 +345,7 @@ void FusedBlockMeanAndRope(
|
||||
|
||||
|
||||
|
||||
PD_BUILD_OP(fused_block_mean_and_rope)
|
||||
PD_BUILD_STATIC_OP(fused_block_mean_and_rope)
|
||||
.Inputs({
|
||||
"qkv_out",
|
||||
"k_block_means",
|
||||
|
||||
@@ -64,6 +64,8 @@ __global__ void set_value_by_flags(bool *stop_flags,
|
||||
}
|
||||
if (!beam_search && is_in_end(topk_ids[bid], end_ids, end_length)) {
|
||||
stop_flags[bid] = true;
|
||||
topk_ids[bid] = end_ids[0];
|
||||
next_tokens[bid] = end_ids[0];
|
||||
}
|
||||
}
|
||||
// dealing stop_seqs
|
||||
|
||||
@@ -426,7 +426,6 @@ class LLMEngine:
|
||||
"ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY": 0,
|
||||
"LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.parallel_config.device_ids.split(",")),
|
||||
"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
|
||||
"FLAGS_use_append_attn": 1,
|
||||
"NCCL_ALGO": "Ring",
|
||||
"FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ tqdm
|
||||
pynvml
|
||||
uvicorn==0.29.0
|
||||
fastapi
|
||||
paddleformers>=0.2
|
||||
paddleformers>=0.3.1
|
||||
redis
|
||||
etcd3
|
||||
httpx
|
||||
|
||||
Reference in New Issue
Block a user