mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00)
fix typos (#3684)
@@ -980,7 +980,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

 m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"),
 py::arg("block_size"),
-"per token per block quant and padding tranpose scale");
+"per token per block quant and padding transpose scale");

 m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"),
 py::arg("recv_expert_count"), py::arg("block_size"),
@@ -89,11 +89,11 @@ public:
 GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN,
 Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations =
 (WarpGemm::kK / Operator::Policy::MmaShape::kK);

-/// Number of warp-level GEMM oeprations per load for B
+/// Number of warp-level GEMM operations per load for B
 static constexpr int kWarpGemmIterationsPerLoadForB =
 Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
 static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");
@@ -117,7 +117,7 @@ class LeftGELUAndMul {
 CUTLASS_HOST_DEVICE
 FragmentOutput operator()(FragmentAccumulator const &lhs,
 FragmentAccumulator const &rhs) const {
-// Convert source to interal compute numeric type
+// Convert source to internal compute numeric type
 NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
 accumulator_to_compute;

@@ -117,7 +117,7 @@ class LeftSiLUAndMul {
 CUTLASS_HOST_DEVICE
 FragmentOutput operator()(FragmentAccumulator const &lhs,
 FragmentAccumulator const &rhs) const {
-// Convert source to interal compute numeric type
+// Convert source to internal compute numeric type
 NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
 accumulator_to_compute;

@@ -92,7 +92,7 @@ class DualMmaBase {
 Shape::kN / WarpGemm::kN,
 Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations =
 (WarpGemm::kK / Operator0::Policy::MmaShape::kK);

@@ -219,7 +219,7 @@ class EpilogueVisitorPerRowPerColNf4 {
 iterator_C_.clear_mask();
 }
 // NOTE(wangbojun) Currently, this kernel don't hanve implantention for
-// adding elementwise beta, we keep this here for future useage beta_ =
+// adding elementwise beta, we keep this here for future usage beta_ =
 // (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr :
 // params.elementwise.beta); if (beta_ == ElementAccumulator()) {
 // iterator_C_.clear_mask();
@@ -176,7 +176,7 @@ struct Nf4DefaultIteratorsTensorOp<cutlass::bfloat16_t,
 ///
 /// Satisfies: ReadableTileIterator
 ///
-template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
+template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
 >
 class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
 public:
@@ -64,7 +64,7 @@ template <
 typename InstructionShape_,
 /// Number of stages used in the pipelined mainloop
 int Stages,
-/// Operation perfomed by GEMM
+/// Operation performed by GEMM
 typename Operator,
 /// Store the accumulators in row major or column major. Row major is used
 /// when output layout is interleaved.
@@ -133,7 +133,7 @@ public:
 /// Shape describing the number of warps filling the CTA
 using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
 static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
 static constexpr int kNumKIterationsPerWarpBLoad =
@@ -509,7 +509,7 @@ public:
 this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
 ++this->warp_tile_iterator_B_;
 }
-// TOOD(wangbojun) lds_converter can be remove for int8 B input
+// TODO(wangbojun) lds_converter can be remove for int8 B input
 typename TransformBAfterLDS::result_type converted_frag_B =
 lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);

@@ -96,7 +96,7 @@ public:
 /// Shape describing the number of warps filling the CTA
 using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
 static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
 static constexpr int kNumKIterationsPerWarpBLoad =
@@ -646,7 +646,7 @@ public:
 // );
 // }
 }
-// TOOD(wangbojun) lds_converter can be remove for int8 B input
+// TODO(wangbojun) lds_converter can be remove for int8 B input
 // int4
 // typename TransformBAfterLDS::result_type converted_frag_B =
 // lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);
@@ -171,7 +171,7 @@ struct DefaultIteratorsTensorOp<cutlass::bfloat16_t,
 ///
 /// Satisfies: ReadableTileIterator
 ///
-template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
+template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
 >
 class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
 public:
@@ -80,7 +80,7 @@ void MoeDispatchKernel(
 if (group_moe) {
 paddle::Tensor softmax_max_prob_tensor =
 GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-// (TODO: check fill sucess ?)
+// (TODO: check fill success ?)
 paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
 softmax_max_prob = softmax_max_prob_tensor.data<float>();
 }
@@ -75,7 +75,7 @@ void SaveOutMmsgTopK(const paddle::Tensor& x,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -45,7 +45,7 @@ void save_kernel(const paddle::Tensor& x,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -34,7 +34,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags,
 const int64_t *input_ids_now = input_ids + tid * length_input_ids;
 const int seq_len_dec = seq_lens_decoder[tid];
 const int seq_len_enc = seq_lens_encoder[tid];
-if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
+if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
 if (step_idx[tid] >= 0) {
 if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder
 pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1];
@@ -63,7 +63,7 @@ __global__ void ComputeOrderKernel(
 position_map[in_offset++] = out_offset++;
 }
 in_offset += cur_base_model_seq_lens_this_time - accept_num;
-// (liuzichang): Temperary Reserved for debug
+// (liuzichang): Temporary Reserved for debug
 // if (accept_num <= actual_draft_token_num) /*Accept partial draft tokens*/ {
 // #ifdef DEBUG_EAGLE_KERNEL
 // printf("batch %d: accept_num <= actual_draft_token_num \n", i);
@@ -35,7 +35,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
 accept_tokens + tid * max_draft_tokens;
 const int seq_len_dec = seq_lens_decoder[tid];
 const int seq_len_enc = seq_lens_encoder[tid];
-if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
+if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
 // printf("step_idx[tid] %d\n", step_idx[tid]);
 if (step_idx[tid] >= 0) {
 for (int i = 0; i < accept_num[tid]; i++) {
@@ -295,7 +295,7 @@ void SpeculateStepSchedule(const paddle::Tensor &stop_flags,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -283,7 +283,7 @@ void Schedule(const paddle::Tensor &stop_flags,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -58,7 +58,7 @@ class TokenTransfer {
 }

 // once copy: cpu --> cpu
-// arrary length should be (1 + MAX_BATCH)
+// array length should be (1 + MAX_BATCH)
 bool GetBatchToken(int64_t *array) {
 if (Empty()) {
 return false;
@@ -75,10 +75,10 @@ void UpdateSplitFuseInputes(const paddle::Tensor& split_fuse_seq_lens,
 const int max_seq_len,
 const int max_batch_size,
 const int split_fuse_size) {
-dim3 girds;
-girds.x = max_batch_size;
+dim3 grids;
+grids.x = max_batch_size;
 const int block_size = 128;
-update_split_fuse_inputs_kernel<<<girds,
+update_split_fuse_inputs_kernel<<<grids,
 block_size,
 0,
 input_ids.stream()>>>(
@@ -110,7 +110,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
 if (group_moe) {
 paddle::Tensor softmax_max_prob_tensor =
 GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-// (TODO: check fill sucess ?)
+// (TODO: check fill success ?)
 paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
 softmax_max_prob = softmax_max_prob_tensor.data<float>();
 }
@@ -507,7 +507,7 @@ elif paddle.is_compiled_with_cuda():
 sources += find_end_files(fp8_auto_gen_directory, ".cu")

 if cc >= 90 and nvcc_version >= 12.0:
-# Hopper optmized mla
+# Hopper optimized mla
 sources += find_end_files("gpu_ops/mla_attn", ".cu")
 sources += ["gpu_ops/flash_mask_attn/flash_mask_attn.cu"]
 sources += find_end_files("gpu_ops/moba_attn/moba_decoder_attn/", ".cu")
@@ -67,7 +67,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
 const auto xtype = x.dtype();
 auto x_dims = x.shape();
 auto up_gate_proj_dims = up_gate_proj_weight.shape();
-PD_CHECK(x_dims.size() == 2, "x_dims.size() shoud be 2.");
+PD_CHECK(x_dims.size() == 2, "x_dims.size() should be 2.");
 PD_CHECK(up_gate_proj_dims.size() == 3, "up_gate_proj_dims.size() should be 3.");
 PD_CHECK(down_proj_in_scale.get_ptr() == nullptr, "down_proj_in_scale not support.");
 if (quant_method == "weight_only_int4") {
@@ -122,7 +122,7 @@ void SpeculateStepSchedule(
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -59,7 +59,7 @@ void SaveOutMmsg(const paddle::Tensor &x, const paddle::Tensor &not_need_stop,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -4,7 +4,7 @@
 namespace xpu3 {
 namespace plugin {
 #define MAX_LM_SIZE 28672
-// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
+// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
 // the stack space
 #define MAX_BATCH 512
 #define ALIGNMENT 64
@@ -4,7 +4,7 @@
 namespace xpu3 {
 namespace plugin {
 #define MAX_LM_SIZE 28672
-// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
+// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
 // the stack space
 #define MAX_BATCH 512
 #define ALIGNMENT 64
@@ -8,7 +8,7 @@
 namespace xpu3 {
 namespace plugin {
 #define MAX_SM_SIZE 32768
-// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
+// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
 // the stack space
 #define MAX_BATCH 512
 #define BANK_CONFLICT_M 128
@@ -79,7 +79,7 @@ qw_pd_trans = paddle.transpose(qw_pd, [1, 0])
 # print("wscale_pd:\n{}".format(wscale_pd))
 # print("wscale_np:\n{}".format(wscale_np))

-# comparation
+# comparison
 print(f"wscale_pd, mean={wscale_pd.mean()}, std={wscale_pd.std()}")
 print(f"wscale_np, mean={wscale_np.mean()}, std={wscale_np.std()}")
 print(f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}")
@@ -44,7 +44,7 @@ CudaGrpah can be enabled by setting `--use-cudagraph` or `--graph-optimization-c
 The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options:
 + `0`: Use Dynamic compute graph, default to 0
 + `1`: Use Static compute graph, during the initialization phase, Paddle API will be used to convert the dynamic image into a static image
-+ `2`: Base on Static compute graph, use the complier(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize
++ `2`: Base on Static compute graph, use the compiler(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize

 In general, static graphs have lower Kernel Launch overhead than dynamic graphs, and it is recommended to use static graphs.
 For adapted models, FastDeploy's CudaGraph *can support both dynamic and static graphs* simultaneously.
@@ -62,7 +62,7 @@ python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/pac
 python -m pip install paddle-custom-gcu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/gcu/
 # For source compilation, refer to: https://github.com/PaddlePaddle/PaddleCustomDevice/blob/develop/backends/gcu/README_cn.md
 ```
-For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
+For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)

 6. Install FastDeploy and dependencies
 ```bash
@@ -89,4 +89,4 @@ for chunk in response:
 print('\n')
 ```

-For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).
+For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).
@@ -1,6 +1,6 @@
 # 采样策略

-采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Samping 多种采样策略。
+采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Sampling 多种采样策略。

 1. Top-p 采样

@@ -89,4 +89,4 @@ for chunk in response:
 print('\n')
 ```

-OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。
+OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。
@@ -57,7 +57,7 @@ def parse_args():
 "--protocol",
 type=str,
 default="ipc",
-help="cache transfer protocol, only surport ipc now",
+help="cache transfer protocol, only support ipc now",
 )
 parser.add_argument("--enable_splitwise", type=int, default=0, help="enable splitwise ")
 parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port")
@@ -257,7 +257,7 @@ class ParallelConfig:
 self.sequence_parallel = False # Whether to enable sequence parallelism.
 self.use_ep = False # Whether to enable Expert Parallelism
 self.moe_phase = MoEPhase("prefill") # Generation phase
-self.msg_queue_id = 1 # mesage queue id
+self.msg_queue_id = 1 # message queue id

 self.tensor_parallel_rank = 0 # TP rank ID
 self.tensor_parallel_size = 1 # TP degree
@@ -549,7 +549,7 @@ class GraphOptimizationConfig:
 It requires that all input buffers have fixed addresses, and all
 splitting ops write their outputs to input buffers.
 - With dyncmic graph backend: ...
-- With static grpah backend: WIP
+- With static graph backend: WIP
 """
 self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
 """ Number of warmup runs for SOT warmup. """
@@ -531,7 +531,7 @@ class EngineArgs:
 "--quantization",
 type=str,
 default=EngineArgs.quantization,
-help="Quantization name for the model, currentlly support "
+help="Quantization name for the model, currently support "
 "'wint8', 'wint4',"
 "default is None. The priority of this configuration "
 "is lower than that of the config file. "
@@ -829,7 +829,7 @@ class EngineArgs:
 scheduler_group.add_argument(
 "--scheduler-topic",
 default=EngineArgs.scheduler_topic,
-help=f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)",
+help=f"Topic of scheduler. Default is {EngineArgs.scheduler_topic}. (global)",
 )
 scheduler_group.add_argument(
 "--scheduler-min-load-score",
@@ -644,13 +644,13 @@ class EngineSevice:
 self.zmq_server.send_multipart(request_id, [error_result])
 except Exception as e:
 llm_logger.error(
-f"Error happend while receving new request from zmq, details={e}, "
+f"Error happend while receiving new request from zmq, details={e}, "
 f"traceback={traceback.format_exc()}"
 )

 def _zmq_send_generated_tokens(self):
 """
-Recieve output for zmq
+Receive output for zmq
 """
 while self.running:
 try:
@@ -458,7 +458,7 @@ class ResourceManagerV1(ResourceManager):

 def _free_blocks(self, request: Request):
 if self.config.cache_config.enable_prefix_caching:
-# TODO(chengyanfu): support cache ouput blocks for prefix caching
+# TODO(chengyanfu): support cache output blocks for prefix caching
 if request.get("prefill_block_num", None) is None:
 leaf_node = self.cache_manager.req_leaf_map[request.request_id]
 self.cache_manager.decrease_request_share_count(request.request_id)
@@ -112,7 +112,7 @@ class LLM:

 def _receive_output(self):
 """
-Recieve output from token processor and store them in cache
+Receive output from token processor and store them in cache
 """
 while True:
 try:
@@ -40,7 +40,7 @@ class ConcreteSizeEntry:
 # Has runtime-bs been captured before
 captured: bool = False

-# Need to be captured callable object(dynamic graph or static grpah backend)
+# Need to be captured callable object(dynamic graph or static graph backend)
 runnable: Callable = None # type: ignore
 # Number of completed warmups
 num_finished_warmup: int = 0
@@ -117,9 +117,9 @@ class GraphOptBackend:

 self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
 if self.fd_config.graph_opt_config.graph_opt_level > 0:
-# 1. Prepare cuda grpah input buffers (contain output of subgraphs)
+# 1. Prepare cuda graph input buffers (contain output of subgraphs)

-# 2. Convert dynamic grpah to static graph
+# 2. Convert dynamic graph to static graph

 backend = (
 ToStaticBackend.CINN if self.fd_config.graph_opt_config.graph_opt_level > 1 else ToStaticBackend.PHI
@@ -193,7 +193,7 @@ class AppendAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -114,7 +114,7 @@ class BlockAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -176,7 +176,7 @@ class FlashAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -210,7 +210,7 @@ class IluvatarAttnBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 return (
 max_num_blocks,
@@ -130,7 +130,7 @@ class XPUAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ) -> Tuple[int, int, int, int]:
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 return (
 max_num_blocks,
@@ -170,7 +170,7 @@ class GCUFlashAttnBackend(AttentionBackend):
 cache_len = 0
 elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
 cache_len = self.seq_lens_decoder_list[seq_idx][0]
-# else: doesnot have req in this seq_idx
+# else: doesn't have req in this seq_idx

 if cache_len is not None:
 lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -212,7 +212,7 @@ class GCUFlashAttnBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 # [total_tokens, kv_num_heads, head_dim]
 return (
@@ -171,7 +171,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
 cache_len = 0
 elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
 cache_len = self.seq_lens_decoder_list[seq_idx][0]
-# else: doesnot have req in this seq_idx
+# else: doesn't have req in this seq_idx

 if cache_len is not None:
 lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -224,7 +224,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 # [total_tokens, kv_num_heads, head_dim]
 return (
@@ -137,7 +137,7 @@ class FlashAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -114,7 +114,7 @@ class DeepEPEngine:
 low_latency_mode=True,
 num_qps_per_rank=24,
 )
-# In disaggregated mode on mutiple nodes, we either use
+# In disaggregated mode on multiple nodes, we either use
 # high throughput mode or low latency mode.
 else:
 if moe_phase.phase == "decode":
@@ -35,7 +35,7 @@ class EarlyStopper:
 @abstractmethod
 def process(self, probs: paddle.Tensor, next_tokens: paddle.Tensor, stop_flags: paddle.Tensor):
 """
-processs the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
+process the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
 args:
 - probs: [batch_size, vocab_size], the probs of every sample
 - next_tokens: [batch_size, 1], the token index of every chosen sample
@@ -267,7 +267,7 @@ class TokenProcessor:
 spec_logger.info(
 f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): {accept_ratio}"
 f" total step: {self.total_step}. total output token num: {self.number_of_output_tokens}"
-f" avarage accept len: {self.number_of_output_tokens / self.total_step}"
+f" average accept len: {self.number_of_output_tokens / self.total_step}"
 )

 if self.cfg.speculative_config.method in ["mtp"]:
@@ -72,7 +72,7 @@ class Proposer(ABC):
 @abstractmethod
 def _run_impl(self, *args, **kwargs) -> Any:
 """
-Implemention for different method
+Implementation for different method
 """
 raise NotImplementedError

@@ -14,7 +14,7 @@
 # limitations under the License.
 """

-"""redundant expert manger."""
+"""redundant expert manager."""
 from typing import Optional, Tuple

 import numpy as np
@@ -49,7 +49,7 @@ class GcuWorker(WorkerBase):
 def init_device(self):
 """Initialize device and Construct model runner"""
 if paddle.is_compiled_with_custom_device("gcu"):
-# Set evironment variable
+# Set environment variable
 self.device_ids = self.parallel_config.device_ids.split(",")
 self.device = f"gcu:{self.local_rank}"
 paddle.device.set_device(self.device)
@@ -127,7 +127,7 @@ class GcuWorker(WorkerBase):
 # NOTE(gongshaotian): may be not need warm_up at this place
 if self.model_runner.graph_opt_level >= 1:
 self.model_runner.sot_warmup()
-# 2. Triger cuda grpah capture
+# 2. Trigger cuda graph capture
 self.model_runner.capture_model()
 set_random_seed(self.fd_config.model_config.seed)

@@ -60,7 +60,7 @@ class GpuWorker(WorkerBase):
 """
 self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
 if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda():
-# Set evironment variable
+# Set environment variable
 self.device_ids = self.parallel_config.device_ids.split(",")
 self.device = f"gpu:{self.local_rank % self.max_chips_per_node}"
 paddle.device.set_device(self.device)
@@ -169,7 +169,7 @@ class GpuWorker(WorkerBase):
 )
 )

-return available_kv_cache_memory # return to caculate the block num in this device
+return available_kv_cache_memory # return to calculate the block num in this device

 def load_model(self) -> None:
 """Load model"""
@@ -209,7 +209,7 @@ class GpuWorker(WorkerBase):
 """
 if self.model_runner.graph_opt_level >= 1:
 self.model_runner.sot_warmup()
-# Triger cuda grpah capture
+# Trigger cuda graph capture
 self.model_runner.capture_model()

 def check_health(self) -> bool:
@@ -51,7 +51,7 @@ class IluvatarWorker(GpuWorker):
 Initialize device and construct model runner
 """
 if paddle.is_compiled_with_custom_device("iluvatar_gpu"):
-# Set evironment variable
+# Set environment variable
 self.device = f"iluvatar_gpu:{self.local_rank}"
 paddle.device.set_device(self.device)
 paddle.set_default_dtype(self.parallel_config.dtype)
@@ -54,7 +54,7 @@ class MetaxWorker(WorkerBase):
 """
 self.max_chips_per_node = 8
 if paddle.is_compiled_with_custom_device("metax_gpu"):
-# Set evironment variable
+# Set environment variable
 self.device_ids = self.parallel_config.device_ids.split(",")
 self.device = f"metax_gpu:{self.local_rank % self.max_chips_per_node}"
 paddle.device.set_device(self.device)
@@ -202,7 +202,7 @@ class MetaxWorker(WorkerBase):
 """
 if self.model_runner.graph_opt_level >= 1:
 self.model_runner.sot_warmup()
-# Todo Triger cuda grpah capture.
+# Todo Trigger cuda graph capture.

 def check_health(self) -> bool:
 """ """
@@ -21,7 +21,7 @@ import traceback
 def check_safetensors_model(model_dir: str):
 """
 model_dir : the directory of the model
-Check whther the model is safetensors format
+Check whether the model is safetensors format
 """
 model_files = list()
 all_files = os.listdir(model_dir)
@@ -27,7 +27,7 @@ from fastdeploy.worker.output import ModelRunnerOutput
 class WorkerBase(ABC):
 """
 Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model
-Worker interface that allows inference framwork to cleanly separate implementations for different harware.
+Worker interface that allows inference framework to cleanly separate implementations for different hardware.
 """

 def __init__(
@@ -89,7 +89,7 @@ class WorkerBase(ABC):

 @abstractmethod
 def graph_optimize_and_warm_up_model(self) -> None:
-"""Prepare model for execution through grpah optimizaiton(CudaGrpah/CINN) or warmup."""
+"""Prepare model for execution through graph optimizaiton(CudaGrpah/CINN) or warmup."""
 raise NotImplementedError

 @abstractmethod
@@ -249,7 +249,7 @@ class PaddleDisWorkerProc:
 )

 def event_loop_normal(self) -> None:
-"""Main event loop for Paddle Distrubuted Workers.
+"""Main event loop for Paddle Distributed Workers.
 TODO(gongshaotian): support remote calling of functions that control worker.
 """
 # Currently, only support single node
@@ -493,7 +493,7 @@ def parse_args():
 "--speculative_config",
 type=json.loads,
 default=None,
-help="Configation of SpeculativeConfig.",
+help="Configuration of SpeculativeConfig.",
 )
 parser.add_argument(
 "--max_num_batched_tokens",
@@ -542,7 +542,7 @@ def parse_args():
 "--quantization",
 type=str,
 default="None",
-help="Quantization name for the model, currentlly support "
+help="Quantization name for the model, currently support "
 "'wint4', 'wint8',"
 "default is None. The priority of this configuration "
 "is lower than that of the config file. "
@@ -552,7 +552,7 @@ def parse_args():
 "--graph_optimization_config",
 type=json.loads,
 default=None,
-help="Configation of Graph optimization backend.",
+help="Configuration of Graph optimization backend.",
 )
 parser.add_argument(
 "--moba_attention_config",
@@ -50,7 +50,7 @@ class XpuWorker(WorkerBase):
 def init_device(self):
 """Initialize device and Construct model runner"""
 if paddle.is_compiled_with_xpu():
-# Set evironment variable
+# Set environment variable
 self.device = f"xpu:{self.local_rank}"
 paddle.device.set_device(self.device)
 paddle.set_default_dtype(self.parallel_config.dtype)
@@ -107,7 +107,7 @@ class TestModel1(paddle.nn.Layer):
 sub_meta1 = forward_meta
 sublayer1_output = self.sublayer1(ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1)

-# sublayer2 not use cuda garph
+# sublayer2 not use cuda graph
 sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output)
 sublayer2_output = self.sublayer2(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2)
 self.sublayer2_output_buffer.copy_(sublayer2_output, False)
@@ -131,7 +131,7 @@ class TestModel1(paddle.nn.Layer):
 ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1
 )

-# sublayer2 not use cuda garph
+# sublayer2 not use cuda graph
 sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output)
 sublayer2_output = self.sublayer2.forward_correct(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2)
