Author: co63oc
Date: 2025-09-01 17:50:17 +08:00
Committed by: GitHub
Parent: 0513a78ecc
Commit: d6369b4d51
67 changed files with 85 additions and 85 deletions


@@ -980,7 +980,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"),
py::arg("block_size"),
"per token per block quant and padding tranpose scale");
"per token per block quant and padding transpose scale");
m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"),
py::arg("recv_expert_count"), py::arg("block_size"),


@@ -89,11 +89,11 @@ public:
GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN,
Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator::Policy::MmaShape::kK);
/// Number of warp-level GEMM oeprations per load for B
/// Number of warp-level GEMM operations per load for B
static constexpr int kWarpGemmIterationsPerLoadForB =
Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");


@@ -117,7 +117,7 @@ class LeftGELUAndMul {
CUTLASS_HOST_DEVICE
FragmentOutput operator()(FragmentAccumulator const &lhs,
FragmentAccumulator const &rhs) const {
// Convert source to interal compute numeric type
// Convert source to internal compute numeric type
NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
accumulator_to_compute;


@@ -117,7 +117,7 @@ class LeftSiLUAndMul {
CUTLASS_HOST_DEVICE
FragmentOutput operator()(FragmentAccumulator const &lhs,
FragmentAccumulator const &rhs) const {
// Convert source to interal compute numeric type
// Convert source to internal compute numeric type
NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
accumulator_to_compute;


@@ -92,7 +92,7 @@ class DualMmaBase {
Shape::kN / WarpGemm::kN,
Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator0::Policy::MmaShape::kK);


@@ -219,7 +219,7 @@ class EpilogueVisitorPerRowPerColNf4 {
iterator_C_.clear_mask();
}
// NOTE(wangbojun) Currently, this kernel don't hanve implantention for
// adding elementwise beta, we keep this here for future useage beta_ =
// adding elementwise beta, we keep this here for future usage beta_ =
// (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr :
// params.elementwise.beta); if (beta_ == ElementAccumulator()) {
// iterator_C_.clear_mask();


@@ -176,7 +176,7 @@ struct Nf4DefaultIteratorsTensorOp<cutlass::bfloat16_t,
///
/// Satisfies: ReadableTileIterator
///
template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
>
class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
public:


@@ -64,7 +64,7 @@ template <
typename InstructionShape_,
/// Number of stages used in the pipelined mainloop
int Stages,
/// Operation perfomed by GEMM
/// Operation performed by GEMM
typename Operator,
/// Store the accumulators in row major or column major. Row major is used
/// when output layout is interleaved.


@@ -133,7 +133,7 @@ public:
/// Shape describing the number of warps filling the CTA
using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
static constexpr int kNumKIterationsPerWarpBLoad =


@@ -509,7 +509,7 @@ public:
this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
++this->warp_tile_iterator_B_;
}
// TOOD(wangbojun) lds_converter can be remove for int8 B input
// TODO(wangbojun) lds_converter can be remove for int8 B input
typename TransformBAfterLDS::result_type converted_frag_B =
lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);


@@ -96,7 +96,7 @@ public:
/// Shape describing the number of warps filling the CTA
using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
static constexpr int kNumKIterationsPerWarpBLoad =


@@ -646,7 +646,7 @@ public:
// );
// }
}
// TOOD(wangbojun) lds_converter can be remove for int8 B input
// TODO(wangbojun) lds_converter can be remove for int8 B input
// int4
// typename TransformBAfterLDS::result_type converted_frag_B =
// lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);


@@ -171,7 +171,7 @@ struct DefaultIteratorsTensorOp<cutlass::bfloat16_t,
///
/// Satisfies: ReadableTileIterator
///
template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
>
class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
public:


@@ -80,7 +80,7 @@ void MoeDispatchKernel(
if (group_moe) {
paddle::Tensor softmax_max_prob_tensor =
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
// (TODO: check fill sucess ?)
// (TODO: check fill success ?)
paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
softmax_max_prob = softmax_max_prob_tensor.data<float>();
}


@@ -75,7 +75,7 @@ void SaveOutMmsgTopK(const paddle::Tensor& x,
std::string inference_msg_id_env_str(inference_msg_id_env_p);
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
if (inference_msg_id_from_env == 2) {
// 2 and -2 is perserve for no-output indication.
// 2 and -2 is preserve for no-output indication.
throw std::runtime_error(
" INFERENCE_MSG_ID cannot be 2, please use other number.");
}


@@ -45,7 +45,7 @@ void save_kernel(const paddle::Tensor& x,
std::string inference_msg_id_env_str(inference_msg_id_env_p);
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
if (inference_msg_id_from_env == 2) {
// 2 and -2 is perserve for no-output indication.
// 2 and -2 is preserve for no-output indication.
throw std::runtime_error(
" INFERENCE_MSG_ID cannot be 2, please use other number.");
}


@@ -34,7 +34,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags,
const int64_t *input_ids_now = input_ids + tid * length_input_ids;
const int seq_len_dec = seq_lens_decoder[tid];
const int seq_len_enc = seq_lens_encoder[tid];
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
if (step_idx[tid] >= 0) {
if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder
pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1];


@@ -63,7 +63,7 @@ __global__ void ComputeOrderKernel(
position_map[in_offset++] = out_offset++;
}
in_offset += cur_base_model_seq_lens_this_time - accept_num;
// (liuzichang): Temperary Reserved for debug
// (liuzichang): Temporary Reserved for debug
// if (accept_num <= actual_draft_token_num) /*Accept partial draft tokens*/ {
// #ifdef DEBUG_EAGLE_KERNEL
// printf("batch %d: accept_num <= actual_draft_token_num \n", i);


@@ -35,7 +35,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
accept_tokens + tid * max_draft_tokens;
const int seq_len_dec = seq_lens_decoder[tid];
const int seq_len_enc = seq_lens_encoder[tid];
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
// printf("step_idx[tid] %d\n", step_idx[tid]);
if (step_idx[tid] >= 0) {
for (int i = 0; i < accept_num[tid]; i++) {


@@ -295,7 +295,7 @@ void SpeculateStepSchedule(const paddle::Tensor &stop_flags,
std::string inference_msg_id_env_str(inference_msg_id_env_p);
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
if (inference_msg_id_from_env == 2) {
// 2 and -2 is perserve for no-output indication.
// 2 and -2 is preserve for no-output indication.
throw std::runtime_error(
" INFERENCE_MSG_ID cannot be 2, please use other number.");
}


@@ -283,7 +283,7 @@ void Schedule(const paddle::Tensor &stop_flags,
std::string inference_msg_id_env_str(inference_msg_id_env_p);
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
if (inference_msg_id_from_env == 2) {
// 2 and -2 is perserve for no-output indication.
// 2 and -2 is preserve for no-output indication.
throw std::runtime_error(
" INFERENCE_MSG_ID cannot be 2, please use other number.");
}


@@ -58,7 +58,7 @@ class TokenTransfer {
}
// once copy: cpu --> cpu
// arrary length should be (1 + MAX_BATCH)
// array length should be (1 + MAX_BATCH)
bool GetBatchToken(int64_t *array) {
if (Empty()) {
return false;


@@ -75,10 +75,10 @@ void UpdateSplitFuseInputes(const paddle::Tensor& split_fuse_seq_lens,
const int max_seq_len,
const int max_batch_size,
const int split_fuse_size) {
dim3 girds;
girds.x = max_batch_size;
dim3 grids;
grids.x = max_batch_size;
const int block_size = 128;
update_split_fuse_inputs_kernel<<<girds,
update_split_fuse_inputs_kernel<<<grids,
block_size,
0,
input_ids.stream()>>>(


@@ -110,7 +110,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
if (group_moe) {
paddle::Tensor softmax_max_prob_tensor =
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
// (TODO: check fill sucess ?)
// (TODO: check fill success ?)
paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
softmax_max_prob = softmax_max_prob_tensor.data<float>();
}


@@ -507,7 +507,7 @@ elif paddle.is_compiled_with_cuda():
sources += find_end_files(fp8_auto_gen_directory, ".cu")
if cc >= 90 and nvcc_version >= 12.0:
# Hopper optmized mla
# Hopper optimized mla
sources += find_end_files("gpu_ops/mla_attn", ".cu")
sources += ["gpu_ops/flash_mask_attn/flash_mask_attn.cu"]
sources += find_end_files("gpu_ops/moba_attn/moba_decoder_attn/", ".cu")


@@ -67,7 +67,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
const auto xtype = x.dtype();
auto x_dims = x.shape();
auto up_gate_proj_dims = up_gate_proj_weight.shape();
PD_CHECK(x_dims.size() == 2, "x_dims.size() shoud be 2.");
PD_CHECK(x_dims.size() == 2, "x_dims.size() should be 2.");
PD_CHECK(up_gate_proj_dims.size() == 3, "up_gate_proj_dims.size() should be 3.");
PD_CHECK(down_proj_in_scale.get_ptr() == nullptr, "down_proj_in_scale not support.");
if (quant_method == "weight_only_int4") {


@@ -122,7 +122,7 @@ void SpeculateStepSchedule(
std::string inference_msg_id_env_str(inference_msg_id_env_p);
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
if (inference_msg_id_from_env == 2) {
// 2 and -2 is perserve for no-output indication.
// 2 and -2 is preserve for no-output indication.
throw std::runtime_error(
" INFERENCE_MSG_ID cannot be 2, please use other number.");
}


@@ -59,7 +59,7 @@ void SaveOutMmsg(const paddle::Tensor &x, const paddle::Tensor &not_need_stop,
std::string inference_msg_id_env_str(inference_msg_id_env_p);
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
if (inference_msg_id_from_env == 2) {
// 2 and -2 is perserve for no-output indication.
// 2 and -2 is preserve for no-output indication.
throw std::runtime_error(
" INFERENCE_MSG_ID cannot be 2, please use other number.");
}


@@ -4,7 +4,7 @@
namespace xpu3 {
namespace plugin {
#define MAX_LM_SIZE 28672
// One core has 32KB LMgropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// One core has 32KB LMgroup LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
#define MAX_BATCH 512
#define ALIGNMENT 64


@@ -4,7 +4,7 @@
namespace xpu3 {
namespace plugin {
#define MAX_LM_SIZE 28672
// One core has 32KB LMgropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// One core has 32KB LMgroup LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
#define MAX_BATCH 512
#define ALIGNMENT 64


@@ -8,7 +8,7 @@
namespace xpu3 {
namespace plugin {
#define MAX_SM_SIZE 32768
// One core has 32KB LMgropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// One core has 32KB LMgroup LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
#define MAX_BATCH 512
#define BANK_CONFLICT_M 128


@@ -79,7 +79,7 @@ qw_pd_trans = paddle.transpose(qw_pd, [1, 0])
# print("wscale_pd:\n{}".format(wscale_pd))
# print("wscale_np:\n{}".format(wscale_np))
# comparation
# comparison
print(f"wscale_pd, mean={wscale_pd.mean()}, std={wscale_pd.std()}")
print(f"wscale_np, mean={wscale_np.mean()}, std={wscale_np.std()}")
print(f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}")


@@ -44,7 +44,7 @@ CudaGrpah can be enabled by setting `--use-cudagraph` or `--graph-optimization-c
The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options:
+ `0`: Use Dynamic compute graph, default to 0
+ `1`: Use Static compute graph, during the initialization phase, Paddle API will be used to convert the dynamic image into a static image
+ `2`: Base on Static compute graph, use the complier(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize
+ `2`: Base on Static compute graph, use the compiler(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize
In general, static graphs have lower Kernel Launch overhead than dynamic graphs, and it is recommended to use static graphs.
For adapted models, FastDeploy's CudaGraph *can support both dynamic and static graphs* simultaneously.
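To make the option concrete, a minimal launch sketch follows; the server module path and model path are assumptions for illustration only, while `--use-cudagraph`, `--graph-optimization-config`, and `graph_opt_level` are the names used in the text above.
```bash
# Sketch only: the module path and model path are assumed, not taken from this commit;
# the flags are the ones documented above. graph_opt_level=1 selects the static compute graph.
python -m fastdeploy.entrypoints.openai.api_server \
    --model /path/to/your_model \
    --use-cudagraph \
    --graph-optimization-config '{"graph_opt_level": 1}'
```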


@@ -62,7 +62,7 @@ python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/pac
python -m pip install paddle-custom-gcu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/gcu/
# For source compilation, refer to: https://github.com/PaddlePaddle/PaddleCustomDevice/blob/develop/backends/gcu/README_cn.md
```
For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
6. Install FastDeploy and dependencies
```bash


@@ -89,4 +89,4 @@ for chunk in response:
print('\n')
```
For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).
For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).


@@ -1,6 +1,6 @@
# 采样策略
采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Samping 多种采样策略。
采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Sampling 多种采样策略。
1. Top-p 采样


@@ -89,4 +89,4 @@ for chunk in response:
print('\n')
```
OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。
OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。


@@ -57,7 +57,7 @@ def parse_args():
"--protocol",
type=str,
default="ipc",
help="cache transfer protocol, only surport ipc now",
help="cache transfer protocol, only support ipc now",
)
parser.add_argument("--enable_splitwise", type=int, default=0, help="enable splitwise ")
parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port")


@@ -257,7 +257,7 @@ class ParallelConfig:
self.sequence_parallel = False # Whether to enable sequence parallelism.
self.use_ep = False # Whether to enable Expert Parallelism
self.moe_phase = MoEPhase("prefill") # Generation phase
self.msg_queue_id = 1 # mesage queue id
self.msg_queue_id = 1 # message queue id
self.tensor_parallel_rank = 0 # TP rank ID
self.tensor_parallel_size = 1 # TP degree
@@ -549,7 +549,7 @@ class GraphOptimizationConfig:
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dyncmic graph backend: ...
- With static grpah backend: WIP
- With static graph backend: WIP
"""
self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
""" Number of warmup runs for SOT warmup. """


@@ -531,7 +531,7 @@ class EngineArgs:
"--quantization",
type=str,
default=EngineArgs.quantization,
help="Quantization name for the model, currentlly support "
help="Quantization name for the model, currently support "
"'wint8', 'wint4',"
"default is None. The priority of this configuration "
"is lower than that of the config file. "
@@ -829,7 +829,7 @@ class EngineArgs:
scheduler_group.add_argument(
"--scheduler-topic",
default=EngineArgs.scheduler_topic,
help=f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)",
help=f"Topic of scheduler. Default is {EngineArgs.scheduler_topic}. (global)",
)
scheduler_group.add_argument(
"--scheduler-min-load-score",


@@ -644,13 +644,13 @@ class EngineSevice:
self.zmq_server.send_multipart(request_id, [error_result])
except Exception as e:
llm_logger.error(
f"Error happend while receving new request from zmq, details={e}, "
f"Error happend while receiving new request from zmq, details={e}, "
f"traceback={traceback.format_exc()}"
)
def _zmq_send_generated_tokens(self):
"""
Recieve output for zmq
Receive output for zmq
"""
while self.running:
try:


@@ -458,7 +458,7 @@ class ResourceManagerV1(ResourceManager):
def _free_blocks(self, request: Request):
if self.config.cache_config.enable_prefix_caching:
# TODO(chengyanfu): support cache ouput blocks for prefix caching
# TODO(chengyanfu): support cache output blocks for prefix caching
if request.get("prefill_block_num", None) is None:
leaf_node = self.cache_manager.req_leaf_map[request.request_id]
self.cache_manager.decrease_request_share_count(request.request_id)


@@ -112,7 +112,7 @@ class LLM:
def _receive_output(self):
"""
Recieve output from token processor and store them in cache
Receive output from token processor and store them in cache
"""
while True:
try:


@@ -40,7 +40,7 @@ class ConcreteSizeEntry:
# Has runtime-bs been captured before
captured: bool = False
# Need to be captured callable objectdynamic graph or static grpah backend
# Need to be captured callable objectdynamic graph or static graph backend
runnable: Callable = None # type: ignore
# Number of completed warmups
num_finished_warmup: int = 0


@@ -117,9 +117,9 @@ class GraphOptBackend:
self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
if self.fd_config.graph_opt_config.graph_opt_level > 0:
# 1. Prepare cuda grpah input buffers (contain output of subgraphs)
# 1. Prepare cuda graph input buffers (contain output of subgraphs)
# 2. Convert dynamic grpah to static graph
# 2. Convert dynamic graph to static graph
backend = (
ToStaticBackend.CINN if self.fd_config.graph_opt_config.graph_opt_level > 1 else ToStaticBackend.PHI


@@ -193,7 +193,7 @@ class AppendAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
Caculate kv cache shape
Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (


@@ -114,7 +114,7 @@ class BlockAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
Caculate kv cache shape
Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (


@@ -176,7 +176,7 @@ class FlashAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
Caculate kv cache shape
Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (


@@ -210,7 +210,7 @@ class IluvatarAttnBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
Caculate kv cache shape
Calculate kv cache shape
"""
return (
max_num_blocks,


@@ -130,7 +130,7 @@ class XPUAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
) -> Tuple[int, int, int, int]:
"""
Caculate kv cache shape
Calculate kv cache shape
"""
return (
max_num_blocks,


@@ -170,7 +170,7 @@ class GCUFlashAttnBackend(AttentionBackend):
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
# else: doesnot have req in this seq_idx
# else: doesn't have req in this seq_idx
if cache_len is not None:
lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -212,7 +212,7 @@ class GCUFlashAttnBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
Caculate kv cache shape
Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (


@@ -171,7 +171,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
# else: doesnot have req in this seq_idx
# else: doesn't have req in this seq_idx
if cache_len is not None:
lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -224,7 +224,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
Caculate kv cache shape
Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (


@@ -137,7 +137,7 @@ class FlashAttentionBackend(AttentionBackend):
kv_cache_quant_type: str = None,
):
"""
Caculate kv cache shape
Calculate kv cache shape
"""
if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
return (


@@ -114,7 +114,7 @@ class DeepEPEngine:
low_latency_mode=True,
num_qps_per_rank=24,
)
# In disaggregated mode on mutiple nodes, we either use
# In disaggregated mode on multiple nodes, we either use
# high throughput mode or low latency mode.
else:
if moe_phase.phase == "decode":


@@ -35,7 +35,7 @@ class EarlyStopper:
@abstractmethod
def process(self, probs: paddle.Tensor, next_tokens: paddle.Tensor, stop_flags: paddle.Tensor):
"""
processs the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
process the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
args:
- probs: [batch_size, vocab_size], the probs of every sample
- next_tokens: [batch_size, 1], the token index of every chosen sample


@@ -267,7 +267,7 @@ class TokenProcessor:
spec_logger.info(
f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): {accept_ratio}"
f" total step: {self.total_step}. total output token num: {self.number_of_output_tokens}"
f" avarage accept len: {self.number_of_output_tokens / self.total_step}"
f" average accept len: {self.number_of_output_tokens / self.total_step}"
)
if self.cfg.speculative_config.method in ["mtp"]:


@@ -72,7 +72,7 @@ class Proposer(ABC):
@abstractmethod
def _run_impl(self, *args, **kwargs) -> Any:
"""
Implemention for different method
Implementation for different method
"""
raise NotImplementedError


@@ -14,7 +14,7 @@
# limitations under the License.
"""
"""redundant expert manger."""
"""redundant expert manager."""
from typing import Optional, Tuple
import numpy as np


@@ -49,7 +49,7 @@ class GcuWorker(WorkerBase):
def init_device(self):
"""Initialize device and Construct model runner"""
if paddle.is_compiled_with_custom_device("gcu"):
# Set evironment variable
# Set environment variable
self.device_ids = self.parallel_config.device_ids.split(",")
self.device = f"gcu:{self.local_rank}"
paddle.device.set_device(self.device)
@@ -127,7 +127,7 @@ class GcuWorker(WorkerBase):
# NOTE(gongshaotian): may be not need warm_up at this place
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
# 2. Triger cuda grpah capture
# 2. Trigger cuda graph capture
self.model_runner.capture_model()
set_random_seed(self.fd_config.model_config.seed)


@@ -60,7 +60,7 @@ class GpuWorker(WorkerBase):
"""
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda():
# Set evironment variable
# Set environment variable
self.device_ids = self.parallel_config.device_ids.split(",")
self.device = f"gpu:{self.local_rank % self.max_chips_per_node}"
paddle.device.set_device(self.device)
@@ -169,7 +169,7 @@ class GpuWorker(WorkerBase):
)
)
return available_kv_cache_memory # return to caculate the block num in this device
return available_kv_cache_memory # return to calculate the block num in this device
def load_model(self) -> None:
"""Load model"""
@@ -209,7 +209,7 @@ class GpuWorker(WorkerBase):
"""
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
# Triger cuda grpah capture
# Trigger cuda graph capture
self.model_runner.capture_model()
def check_health(self) -> bool:


@@ -51,7 +51,7 @@ class IluvatarWorker(GpuWorker):
Initialize device and construct model runner
"""
if paddle.is_compiled_with_custom_device("iluvatar_gpu"):
# Set evironment variable
# Set environment variable
self.device = f"iluvatar_gpu:{self.local_rank}"
paddle.device.set_device(self.device)
paddle.set_default_dtype(self.parallel_config.dtype)


@@ -54,7 +54,7 @@ class MetaxWorker(WorkerBase):
"""
self.max_chips_per_node = 8
if paddle.is_compiled_with_custom_device("metax_gpu"):
# Set evironment variable
# Set environment variable
self.device_ids = self.parallel_config.device_ids.split(",")
self.device = f"metax_gpu:{self.local_rank % self.max_chips_per_node}"
paddle.device.set_device(self.device)
@@ -202,7 +202,7 @@ class MetaxWorker(WorkerBase):
"""
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
# Todo Triger cuda grpah capture.
# Todo Trigger cuda graph capture.
def check_health(self) -> bool:
""" """


@@ -21,7 +21,7 @@ import traceback
def check_safetensors_model(model_dir: str):
"""
model_dir : the directory of the model
Check whther the model is safetensors format
Check whether the model is safetensors format
"""
model_files = list()
all_files = os.listdir(model_dir)


@@ -27,7 +27,7 @@ from fastdeploy.worker.output import ModelRunnerOutput
class WorkerBase(ABC):
"""
Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model
Worker interface that allows inference framwork to cleanly separate implementations for different harware.
Worker interface that allows inference framework to cleanly separate implementations for different hardware.
"""
def __init__(
@@ -89,7 +89,7 @@ class WorkerBase(ABC):
@abstractmethod
def graph_optimize_and_warm_up_model(self) -> None:
"""Prepare model for execution through grpah optimizaiton(CudaGrpah/CINN) or warmup."""
"""Prepare model for execution through graph optimizaiton(CudaGrpah/CINN) or warmup."""
raise NotImplementedError
@abstractmethod


@@ -249,7 +249,7 @@ class PaddleDisWorkerProc:
)
def event_loop_normal(self) -> None:
"""Main event loop for Paddle Distrubuted Workers.
"""Main event loop for Paddle Distributed Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
# Currently, only support single node
@@ -493,7 +493,7 @@ def parse_args():
"--speculative_config",
type=json.loads,
default=None,
help="Configation of SpeculativeConfig.",
help="Configuration of SpeculativeConfig.",
)
parser.add_argument(
"--max_num_batched_tokens",
@@ -542,7 +542,7 @@ def parse_args():
"--quantization",
type=str,
default="None",
help="Quantization name for the model, currentlly support "
help="Quantization name for the model, currently support "
"'wint4', 'wint8',"
"default is None. The priority of this configuration "
"is lower than that of the config file. "
@@ -552,7 +552,7 @@ def parse_args():
"--graph_optimization_config",
type=json.loads,
default=None,
help="Configation of Graph optimization backend.",
help="Configuration of Graph optimization backend.",
)
parser.add_argument(
"--moba_attention_config",


@@ -50,7 +50,7 @@ class XpuWorker(WorkerBase):
def init_device(self):
"""Initialize device and Construct model runner"""
if paddle.is_compiled_with_xpu():
# Set evironment variable
# Set environment variable
self.device = f"xpu:{self.local_rank}"
paddle.device.set_device(self.device)
paddle.set_default_dtype(self.parallel_config.dtype)


@@ -107,7 +107,7 @@ class TestModel1(paddle.nn.Layer):
sub_meta1 = forward_meta
sublayer1_output = self.sublayer1(ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1)
# sublayer2 not use cuda garph
# sublayer2 not use cuda graph
sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output)
sublayer2_output = self.sublayer2(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2)
self.sublayer2_output_buffer.copy_(sublayer2_output, False)
@@ -131,7 +131,7 @@ class TestModel1(paddle.nn.Layer):
ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1
)
# sublayer2 not use cuda garph
# sublayer2 not use cuda graph
sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output)
sublayer2_output = self.sublayer2.forward_correct(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2)