[Feature] DeepseekV3 use pd_build_static_op (#2948)

Co-authored-by: K11OntheBoat <ruianmaidanglao@163.com>
K11OntheBoat
2025-07-22 15:03:41 +08:00
committed by GitHub
parent 2a8a2c06de
commit e991777757
6 changed files with 13 additions and 8 deletions
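Note: the change swaps Paddle's PD_BUILD_OP registration macro for PD_BUILD_STATIC_OP in every custom op on the DeepSeek-V3 path, presumably so these ops also get a registration usable from static-graph execution. PD_BUILD_STATIC_OP comes from helper.h, which is why that header appears in the touched files. A minimal sketch of how such a wrapper macro could look, assuming it only re-registers the op under a "static_op_" alias through the regular PD_BUILD_OP machinery (the authoritative definition is in the repo's helper.h):

// Sketch only; the real macro ships in helper.h.
// Assumption: it forwards to Paddle's PD_BUILD_OP under a prefixed name.
#include "paddle/extension.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif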

View File

@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include "helper.h"
#include "mla_cache_kernel.cuh"
template <paddle::DataType T>
@@ -259,7 +260,7 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
}
-PD_BUILD_OP(prefill_mla_write_cache)
+PD_BUILD_STATIC_OP(prefill_mla_write_cache)
.Inputs({"kv_nope",
"kv_pe",
"kv_cache",
@@ -274,7 +275,7 @@ PD_BUILD_OP(prefill_mla_write_cache)
"max_seq_len: int"})
.SetKernelFn(PD_KERNEL(PrefillMLAWriteCacheKernel));
-PD_BUILD_OP(decode_mla_write_cache)
+PD_BUILD_STATIC_OP(decode_mla_write_cache)
.Inputs({"kv_nope",
"kv_pe",
"kv_cache",

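Since the excerpts above truncate the builder chains, the following self-contained sketch shows the full registration pattern these files follow. The op name my_static_op, the kernel and the shape/dtype functions are made up for illustration and are not part of this commit:

#include "paddle/extension.h"

// Hypothetical example op: declares one input, one output and one attribute.
std::vector<paddle::Tensor> MyKernel(const paddle::Tensor& x, float scale) {
  (void)scale;  // a real kernel would launch device work here
  return {x};   // returned unchanged to keep the sketch API-free
}

std::vector<std::vector<int64_t>> MyInferShape(
    const std::vector<int64_t>& x_shape) {
  return {x_shape};  // output keeps the input shape
}

std::vector<paddle::DataType> MyInferDtype(const paddle::DataType& x_dtype) {
  return {x_dtype};  // output keeps the input dtype
}

PD_BUILD_STATIC_OP(my_static_op)
    .Inputs({"x"})
    .Outputs({"out"})
    .Attrs({"scale: float"})
    .SetKernelFn(PD_KERNEL(MyKernel))
    .SetInferShapeFn(PD_INFER_SHAPE(MyInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MyInferDtype));

Once the extension is built (e.g. via paddle.utils.cpp_extension or the repo's build scripts), the op is callable from Python under its registered name.
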
View File

@@ -15,6 +15,7 @@
#include "helper.h"
#include "paddle/extension.h"
template <typename T, bool IS_NEOX>
inline __device__ void apply_token_rotary_embedding_kernel(
T* __restrict__ arr,
@@ -138,7 +139,7 @@ void FusedRotaryPositionEncoding(
});
}
-PD_BUILD_OP(fused_rotary_position_encoding)
+PD_BUILD_STATIC_OP(fused_rotary_position_encoding)
.Inputs({"query", "key", "position_ids", "cos_sin_cache"})
.Outputs({"query_out", "key_out"})
.Attrs({"head_size: int", "is_neox: bool"})

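This file centers on apply_token_rotary_embedding_kernel, templated on IS_NEOX. As a reference for what the two layouts usually mean, here is a plain C++ sketch of rotary position embedding applied to one head vector; the names and indexing are illustrative assumptions, and the real kernel reads cos/sin from the cos_sin_cache input rather than recomputing them:

#include <cmath>
#include <vector>

// Illustrative CPU sketch of rotary embedding for a single head vector.
// Assumption: "NeoX" style rotates element i with element i + rot_dim/2,
// while the interleaved (GPT-J) style rotates adjacent pairs (2i, 2i+1).
void apply_rotary_reference(std::vector<float>& vec, int pos, int rot_dim,
                            bool is_neox) {
  const int half = rot_dim / 2;
  for (int i = 0; i < half; ++i) {
    const float inv_freq = std::pow(10000.0f, -2.0f * i / rot_dim);
    const float c = std::cos(pos * inv_freq);
    const float s = std::sin(pos * inv_freq);
    const int idx1 = is_neox ? i : 2 * i;
    const int idx2 = is_neox ? i + half : 2 * i + 1;
    const float x1 = vec[idx1];
    const float x2 = vec[idx2];
    vec[idx1] = x1 * c - x2 * s;  // standard 2D rotation of the pair
    vec[idx2] = x2 * c + x1 * s;
  }
}
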
View File

@@ -15,6 +15,7 @@
#include "helper.h"
#include "paddle/extension.h"
__global__ void GetPositionIdsAndMaskEncoderBatchKernel(
const int* seq_lens_encoder, // [bsz] encoder length of each batch entry
const int* seq_lens_decoder, // [bsz] decoder length of each batch entry
@@ -74,7 +75,7 @@ void GetPositionIdsAndMaskEncoderBatch(
bsz);
}
-PD_BUILD_OP(get_position_ids_and_mask_encoder_batch)
+PD_BUILD_STATIC_OP(get_position_ids_and_mask_encoder_batch)
.Inputs({"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",

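Based only on the parameter comments above (per-batch encoder and decoder lengths), one plausible CPU reference for what GetPositionIdsAndMaskEncoderBatch produces is sketched below; the kernel's actual output layout and mask semantics may differ:

#include <vector>

// Hypothetical CPU reference, NOT the kernel's actual logic.
// Assumption: prefill (encoder) requests contribute positions 0..len-1 and
// are flagged in the mask, while decode requests contribute a single
// position equal to their current decoded length.
void get_position_ids_reference(const std::vector<int>& seq_lens_encoder,
                                const std::vector<int>& seq_lens_decoder,
                                std::vector<int>& position_ids,
                                std::vector<int>& mask_encoder_batch) {
  const int bsz = static_cast<int>(seq_lens_encoder.size());
  for (int b = 0; b < bsz; ++b) {
    if (seq_lens_encoder[b] > 0) {  // prefill request
      for (int p = 0; p < seq_lens_encoder[b]; ++p) position_ids.push_back(p);
      mask_encoder_batch.push_back(1);
    } else {  // decode request
      position_ids.push_back(seq_lens_decoder[b]);
      mask_encoder_batch.push_back(0);
    }
  }
}
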
View File

@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "paddle/extension.h"
#define CEILDIV(a,b) (((a+b-1)/b))
template <typename scalar_t>
@@ -189,7 +189,7 @@ std::vector<paddle::Tensor> tritonmoe_preprocess_kernel(const paddle::Tensor& to
return {sorted_ids, expert_ids, num_tokens_post_pad};
}
-PD_BUILD_OP(tritonmoe_preprocess)
+PD_BUILD_STATIC_OP(tritonmoe_preprocess)
.Inputs({"topk_ids"})
.Attrs({"num_experts: int64_t", "GEMM_BLOCK_SIZE_M: int64_t"})
.Outputs({"sorted_ids", "expert_ids", "num_tokens_post_pad"})

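tritonmoe_preprocess takes the router's topk_ids and produces sorted_ids, expert_ids and num_tokens_post_pad, which is the usual MoE block-alignment step for a grouped GEMM: token slots are grouped by expert and each expert's slot count is padded up to a multiple of GEMM_BLOCK_SIZE_M. A rough CPU sketch of that idea, with simplified types and without the kernel's exact padding/sentinel conventions:

#include <cstdint>
#include <vector>

// Rough CPU sketch of MoE block alignment; illustrative only.
// Assumption: counts per expert are padded to a multiple of block_m so each
// GEMM tile processes tokens of a single expert.
void moe_preprocess_reference(const std::vector<int64_t>& topk_ids,
                              int64_t num_experts, int64_t block_m,
                              std::vector<int64_t>& sorted_ids,
                              std::vector<int64_t>& expert_ids,
                              int64_t& num_tokens_post_pad) {
  std::vector<std::vector<int64_t>> buckets(num_experts);
  for (int64_t i = 0; i < static_cast<int64_t>(topk_ids.size()); ++i)
    buckets[topk_ids[i]].push_back(i);  // token slot indices per expert

  num_tokens_post_pad = 0;
  for (int64_t e = 0; e < num_experts; ++e) {
    const int64_t count = static_cast<int64_t>(buckets[e].size());
    const int64_t padded = (count + block_m - 1) / block_m * block_m;
    for (int64_t j = 0; j < padded; ++j)
      sorted_ids.push_back(
          j < count ? buckets[e][j]
                    : static_cast<int64_t>(topk_ids.size()));  // pad slot
    for (int64_t j = 0; j < padded / block_m; ++j)
      expert_ids.push_back(e);  // one expert id per GEMM block
    num_tokens_post_pad += padded;
  }
}
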
View File

@@ -13,6 +13,7 @@
// limitations under the License.
#include "append_attn/multi_head_latent_attention_kernel.h"
#include "helper.h"
#include "mla_attn/batch_mla_with_paged_kv_cache.h"
template <paddle::DataType D>
@@ -410,7 +411,7 @@ std::vector<paddle::DataType> MultiHeadLatentAttentionInferDtype(
}
}
-PD_BUILD_OP(multi_head_latent_attention)
+PD_BUILD_STATIC_OP(multi_head_latent_attention)
.Inputs({"query",
"key_cache",
"value_cache",

View File

@@ -18,6 +18,7 @@
#include <algorithm>
#include <optional>
#include "helper.h"
#include "noauxtc_kernel.h"
std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
@@ -60,7 +61,7 @@ std::vector<std::vector<int64_t>> NoauxTcInferShape(
return {scores_shape};
}
-PD_BUILD_OP(noaux_tc)
+PD_BUILD_STATIC_OP(noaux_tc)
.Inputs({"scores", "scores_with_bias"})
.Outputs({"output_tensor"})
.Attrs({"n_group: int",