Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-04 08:16:42 +08:00
[xpu] support ep (#4067)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
@@ -43,18 +43,20 @@ std::vector<paddle::Tensor> MoeTopkSelect(
  int32_t* block_statistic = nullptr;
  const float* bias_data =
      bias.get_ptr() != nullptr ? bias.get_ptr()->data<float>() : nullptr;
-  int ret = infer_ops::moe_softmax_topk_norm_fusion(
-      xpu_ctx->x_context(),
-      gating_logits.data<float>(),
-      topk_weights.mutable_data<float>(),
-      topk_ids.mutable_data<int>(),
-      block_statistic,
-      token_num,
-      expert_num,
-      moe_topk,
-      0,
-      bias_data);
-  PD_CHECK(ret == 0);
+  if (token_num > 0) {
+    int ret = infer_ops::moe_softmax_topk_norm_fusion(
+        xpu_ctx->x_context(),
+        gating_logits.data<float>(),
+        topk_weights.mutable_data<float>(),
+        topk_ids.mutable_data<int>(),
+        block_statistic,
+        token_num,
+        expert_num,
+        moe_topk,
+        0,
+        bias_data);
+    PD_CHECK(ret == 0);
+  }

  return {topk_ids, topk_weights};
}
@@ -416,7 +416,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
        py::arg("bias"),
        py::arg("weight_dtype"),
        py::arg("arch"),
-        py::arg("group_size"));
+        py::arg("group_size")=-1);

  m.def("ep_moe_expert_combine",
        &MoeEPCombine,
@@ -265,7 +265,7 @@ class Ernie4_5Processor(BaseDataProcessor):
            if tool_call_info.tools_called:
                response_dict.outputs.tool_calls = tool_call_info.tool_calls
                response_dict.outputs.text = tool_call_info.content
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
        if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
            return None
        return response_dict
@@ -377,7 +377,7 @@ class DataProcessor(BaseDataProcessor):
            if tool_call_info.tools_called:
                response_dict.outputs.tool_calls = tool_call_info.tool_calls
                response_dict.outputs.text = tool_call_info.content
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")

        return response_dict
@@ -16,6 +16,16 @@
 xpu backend methods
 """

+from .moe.fused_moe import (
+    XPUMoEMethod,
+    XPUWeightOnlyMoeEpMethod,
+    XPUWeightOnlyMoEMethod,
+)
 from .quantization.weight_only import XPUWeightOnlyLinearMethod

-__all__ = ["XPUWeightOnlyLinearMethod"]
+__all__ = [
+    "XPUWeightOnlyLinearMethod",
+    "XPUMoEMethod",
+    "XPUWeightOnlyMoEMethod",
+    "XPUWeightOnlyMoeEpMethod",
+]
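With the widened __all__, XPU callers can import the MoE methods directly from the backends package, as later hunks in this commit do. A minimal import sketch, valid only on an XPU build where these ops are available (the comments are editorial, not from the source):

    from fastdeploy.model_executor.layers.backends import (
        XPUMoEMethod,               # unquantized fused MoE
        XPUWeightOnlyMoEMethod,     # weight-only quantized MoE (tensor parallel)
        XPUWeightOnlyMoeEpMethod,   # weight-only quantized MoE (expert parallel)
        XPUWeightOnlyLinearMethod,
    )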
@@ -0,0 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+xpu fused moe methods
+"""
415  fastdeploy/model_executor/layers/backends/xpu/moe/ep.py  Normal file
@@ -0,0 +1,415 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from abc import abstractmethod
|
||||
|
||||
import deep_ep
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
import fastdeploy
|
||||
from fastdeploy.config import MoEPhase
|
||||
from fastdeploy.model_executor.layers.moe.ep import DeepEPEngineBase, EPRunner
|
||||
from fastdeploy.utils import singleton
|
||||
|
||||
|
||||
@singleton
|
||||
class DeepEPEngine(DeepEPEngineBase):
|
||||
"""
|
||||
A wrapper class for DeepEP engine.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_max_dispatch_tokens_per_rank: int,
|
||||
hidden: int,
|
||||
num_experts: int,
|
||||
ep_size: int,
|
||||
ep_rank: int,
|
||||
splitwise_role: str,
|
||||
moe_phase: MoEPhase,
|
||||
async_finish: bool = False,
|
||||
group=None,
|
||||
):
|
||||
"""
|
||||
Initialize the DeepEP engine.
|
||||
Args:
|
||||
group: The MPI group object.
|
||||
ep_size: The number of ranks.
|
||||
rank_id: The rank id.
|
||||
num_max_dispatch_tokens_per_rank: The maximum number of tokens per rank to dispatch.
|
||||
hidden: The hidden dimension of the model.
|
||||
num_experts: The number of experts.
|
||||
"""
|
||||
super().__init__(
|
||||
num_max_dispatch_tokens_per_rank,
|
||||
hidden,
|
||||
num_experts,
|
||||
ep_size,
|
||||
ep_rank,
|
||||
splitwise_role,
|
||||
moe_phase,
|
||||
async_finish,
|
||||
group,
|
||||
)
|
||||
|
||||
def init_deepep_engine(self):
|
||||
if self.splitwise_role == "mixed" or self.moe_phase.phase == "prefill":
|
||||
self.deepep_engine = deep_ep.Buffer(
|
||||
self.group,
|
||||
int(1e9),
|
||||
0,
|
||||
num_experts=self.num_experts,
|
||||
low_latency_mode=False,
|
||||
num_qps_per_rank=1,
|
||||
)
|
||||
elif self.moe_phase.phase == "decode":
|
||||
logger.info("Initializing Low Latency Buffer")
|
||||
self.get_low_latency_buffer()
|
||||
else:
|
||||
raise ValueError(f"Unknown generation phase {self.moe_phase}")
|
||||
|
||||
def get_low_latency_buffer(self):
|
||||
"""
|
||||
Get the DeepEP buffer.
|
||||
Args:
|
||||
group: The MPI group object.
|
||||
num_max_dispatch_tokens_per_rank: The maximum number of tokens per rank to dispatch.
|
||||
hidden: The hidden dimension of the model.
|
||||
"""
|
||||
# NOTES: the low-latency mode will consume much more space than the normal mode
|
||||
# So we recommend that `num_max_dispatch_tokens_per_rank`
|
||||
# (the actual batch size in the decoding engine) should be less than 256
|
||||
num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
|
||||
self.num_max_dispatch_tokens_per_rank,
|
||||
self.hidden,
|
||||
self.ep_size,
|
||||
self.num_experts,
|
||||
)
|
||||
# Allocate a buffer if not existed or not enough buffer size
|
||||
if (
|
||||
self.deepep_engine is None
|
||||
or self.deepep_engine.group != self.group
|
||||
or not self.deepep_engine.low_latency_mode
|
||||
or self.deepep_engine.num_rdma_bytes < num_rdma_bytes
|
||||
):
|
||||
# NOTES: for best performance, the QP number **must** be equal to the number of the local experts
|
||||
assert self.num_experts % self.ep_size == 0
|
||||
self.deepep_engine = deep_ep.Buffer(
|
||||
self.group,
|
||||
0,
|
||||
num_rdma_bytes,
|
||||
self.num_experts,
|
||||
low_latency_mode=True,
|
||||
num_qps_per_rank=self.num_experts // self.num_ranks,
|
||||
)
|
||||
|
||||
def low_latency_dispatch(
|
||||
self,
|
||||
hidden_states: paddle.Tensor,
|
||||
topk_idx: paddle.Tensor,
|
||||
expertwise_scale,
|
||||
use_fp8: bool = False,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
hidden_states: [token_num, hidden] 'bfloat16/int8'
|
||||
topk_idx: [token_num, num_topk] 'int64'
|
||||
|
||||
Returns:
|
||||
recv_hidden_states: [num_local_experts,
|
||||
num_max_dispatch_tokens_per_rank * ep_size, hidden]
|
||||
ep_size * num_local_experts = num_experts
|
||||
recv_count: [num_local_experts]
|
||||
recv_count: a tensor shaped `[num_local_experts]` with integer type, indicating how many tokens each
local expert receives. Not all positions in `recv_x` are valid.
|
||||
handle: the communication handle to be used in the `low_latency_combine` function.
|
||||
event: the event after executing the kernel (valid only if `async_finish` is set).
|
||||
hook: the receiving hook function (valid only if `return_recv_hook` is set).
|
||||
"""
|
||||
moe_in_w4a8_scale = None
|
||||
(
|
||||
packed_recv_x,
|
||||
recv_expert_count,
|
||||
handle,
|
||||
dispatch_hook,
|
||||
valid_token_num,
|
||||
) = self.deepep_engine.low_latency_dispatch(
|
||||
hidden_states,
|
||||
moe_in_w4a8_scale,
|
||||
topk_idx,
|
||||
self.num_max_dispatch_tokens_per_rank,
|
||||
self.num_experts,
|
||||
use_fp8=use_fp8,
|
||||
async_finish=False,
|
||||
return_recv_hook=True,
|
||||
)
|
||||
|
||||
return packed_recv_x, recv_expert_count, handle, dispatch_hook, valid_token_num
|
||||
|
||||
def low_latency_combine(
|
||||
self,
|
||||
hidden_states: paddle.Tensor,
|
||||
topk_idx: paddle.Tensor,
|
||||
topk_weights: paddle.Tensor,
|
||||
handle,
|
||||
):
|
||||
"""
|
||||
|
||||
Return:
|
||||
combined_hidden_states: [num_tokens, hidden]
|
||||
"""
|
||||
combined_hidden_states, combine_hook = self.deepep_engine.low_latency_combine(
|
||||
hidden_states,
|
||||
topk_idx,
|
||||
topk_weights,
|
||||
handle,
|
||||
async_finish=False,
|
||||
return_recv_hook=True,
|
||||
)
|
||||
return combined_hidden_states, combine_hook
|
||||
|
||||
def clean_low_latency_buffer(self):
|
||||
"""
|
||||
clean_low_latency_buffer
|
||||
"""
|
||||
pass
|
||||
|
||||
def barrier_all(self):
|
||||
"""
|
||||
barrier_all
|
||||
"""
|
||||
self.deepep_engine.barrier_all()
|
||||
|
||||
|
||||
class XPUEPRunner(EPRunner):
|
||||
"""
|
||||
EPRunnerBase
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
top_k: int,
|
||||
hidden: int,
|
||||
num_experts: int,
|
||||
splitwise_role: str,
|
||||
moe_phase: MoEPhase,
|
||||
num_max_dispatch_tokens_per_rank: int = 1,
|
||||
ep_size: int = 1,
|
||||
ep_rank: int = 0,
|
||||
redundant_experts_num: int = 0,
|
||||
ep_group=None,
|
||||
):
|
||||
super().__init__(
|
||||
top_k,
|
||||
hidden,
|
||||
num_experts,
|
||||
splitwise_role,
|
||||
moe_phase,
|
||||
num_max_dispatch_tokens_per_rank,
|
||||
ep_size,
|
||||
ep_rank,
|
||||
redundant_experts_num,
|
||||
ep_group,
|
||||
)
|
||||
|
||||
def init_ep_engine(self):
|
||||
self.ep_engine = DeepEPEngine(
|
||||
num_max_dispatch_tokens_per_rank=self.num_max_dispatch_tokens_per_rank,
|
||||
hidden=self.hidden,
|
||||
num_experts=self.num_experts + self.redundant_experts_num,
|
||||
ep_size=self.ep_size,
|
||||
ep_rank=self.ep_rank,
|
||||
splitwise_role=self.splitwise_role,
|
||||
moe_phase=self.moe_phase,
|
||||
group=self.ep_group,
|
||||
)
|
||||
|
||||
def moe_select(self, layer: nn.Layer, gate_out: paddle.Tensor):
|
||||
"""
|
||||
moe_select
|
||||
"""
|
||||
if layer.redundant_table_manger is not None:
|
||||
(
|
||||
ep_rank_to_expert_id_list,
|
||||
expert_id_to_ep_rank_array,
|
||||
expert_in_rank_num_list,
|
||||
tokens_per_expert_stats_list,
|
||||
) = layer.redundant_table_manger.get_ep_rank_to_expert_id_list_by_layer(layer.layer_idx)
|
||||
|
||||
topk_idx, topk_weights = fastdeploy.model_executor.ops.xpu.moe_redundant_topk_select(
|
||||
gating_logits=gate_out,
|
||||
expert_id_to_ep_rank_array=expert_id_to_ep_rank_array,
|
||||
expert_in_rank_num_list=expert_in_rank_num_list,
|
||||
tokens_per_expert_stats_list=tokens_per_expert_stats_list,
|
||||
bias=layer.gate_correction_bias,
|
||||
moe_topk=self.top_k,
|
||||
apply_norm_weight=True, # apply_norm_weight
|
||||
enable_softmax_top_k_fused=False,
|
||||
redundant_ep_rank_num_plus_one=layer.fd_config.model_config.redundant_experts_num + 1,
|
||||
)
|
||||
else:
|
||||
topk_idx, topk_weights = fastdeploy.model_executor.ops.xpu.moe_topk_select(
|
||||
gate_out,
|
||||
layer.gate_correction_bias,
|
||||
self.top_k,
|
||||
True, # apply_norm_weight,
|
||||
)
|
||||
return topk_idx, topk_weights
|
||||
|
||||
@abstractmethod
|
||||
def dispatch(self, *args, **kwargs):
|
||||
"""
|
||||
dispatch
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def combine(self, *args, **kwargs):
|
||||
"""
|
||||
combine
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class XPUEPPrefillRunner(XPUEPRunner):
|
||||
"""
|
||||
EPPrefillRunner
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
top_k: int,
|
||||
hidden: int,
|
||||
num_experts: int,
|
||||
splitwise_role: str,
|
||||
num_max_dispatch_tokens_per_rank: int,
|
||||
ep_size: int = 1,
|
||||
ep_rank: int = 0,
|
||||
redundant_experts_num: int = 0,
|
||||
ep_group=None,
|
||||
moe_phase: MoEPhase = MoEPhase("prefill"),
|
||||
):
|
||||
super().__init__(
|
||||
top_k,
|
||||
hidden,
|
||||
num_experts,
|
||||
splitwise_role,
|
||||
moe_phase,
|
||||
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank,
|
||||
ep_size=ep_size,
|
||||
ep_rank=ep_rank,
|
||||
redundant_experts_num=redundant_experts_num,
|
||||
ep_group=ep_group,
|
||||
)
|
||||
|
||||
def dispatch(
|
||||
self,
|
||||
x: paddle.Tensor,
|
||||
topk_idx: paddle.Tensor,
|
||||
topk_weights: paddle.Tensor,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
self.num_combined_tokens = x.shape[0]
|
||||
x_scale_tensor = kwargs.get("x_scale_tensor", None)
|
||||
dispatch_args = {
|
||||
"x": (x, x_scale_tensor) if x_scale_tensor is not None else x,
|
||||
"topk_idx": topk_idx,
|
||||
"topk_weights": topk_weights,
|
||||
}
|
||||
return self.ep_engine.deepep_engine.dispatch(**dispatch_args)
|
||||
|
||||
def combine(
|
||||
self,
|
||||
tmp_ffn_out: paddle.Tensor,
|
||||
handle: tuple,
|
||||
recv_topk_weights: paddle.Tensor,
|
||||
):
|
||||
combine_args = {
|
||||
"x": tmp_ffn_out,
|
||||
"topk_weights": recv_topk_weights,
|
||||
"num_combined_tokens": self.num_combined_tokens,
|
||||
}
|
||||
fused_moe_out, _, _ = self.ep_engine.deepep_engine.combine(**combine_args)
|
||||
|
||||
return fused_moe_out
|
||||
|
||||
|
||||
class XPUEPDecoderRunner(XPUEPRunner):
|
||||
"""
|
||||
EPDecoderRunner
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
top_k: int,
|
||||
hidden: int,
|
||||
num_experts: int,
|
||||
splitwise_role: str,
|
||||
num_max_dispatch_tokens_per_rank: int,
|
||||
ep_size: int = 1,
|
||||
ep_rank: int = 0,
|
||||
redundant_experts_num: int = 0,
|
||||
ep_group=None,
|
||||
moe_phase: MoEPhase = MoEPhase("decode"),
|
||||
):
|
||||
super().__init__(
|
||||
top_k,
|
||||
hidden,
|
||||
num_experts,
|
||||
splitwise_role,
|
||||
moe_phase,
|
||||
num_max_dispatch_tokens_per_rank,
|
||||
ep_size=ep_size,
|
||||
ep_rank=ep_rank,
|
||||
redundant_experts_num=redundant_experts_num,
|
||||
ep_group=ep_group,
|
||||
)
|
||||
|
||||
def dispatch(
|
||||
self,
|
||||
x: paddle.Tensor,
|
||||
topk_idx: paddle.Tensor,
|
||||
topk_weights: paddle.Tensor,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
expertwise_scale = kwargs.get("expertwise_scale", None)
|
||||
use_fp8 = expertwise_scale is not None
|
||||
|
||||
(
|
||||
recv_hidden_states,
|
||||
recv_expert_count,
|
||||
handle,
|
||||
dispatch_hook,
|
||||
valid_token_num,
|
||||
) = self.ep_engine.low_latency_dispatch(x, topk_idx, expertwise_scale, use_fp8)
|
||||
# no need to call dispatch_hook here, because it has already been done in xDeepEP
|
||||
# if dispatch_hook is not None:
|
||||
# dispatch_hook()
|
||||
|
||||
return recv_hidden_states, recv_expert_count, handle, valid_token_num
|
||||
|
||||
def combine(self, ffn_out, topk_idx, topk_weights, handle):
|
||||
combined_hidden_states, combine_hook = self.ep_engine.low_latency_combine(
|
||||
ffn_out, topk_idx, topk_weights, handle
|
||||
)
|
||||
if combine_hook is not None:
|
||||
combine_hook()
|
||||
|
||||
return combined_hidden_states
|
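For orientation, the low-latency dispatch/combine pair above routes each token to its top-k experts and later sums the expert outputs back per token with the top-k weights. A minimal standalone sketch of that computation on toy dense tensors (not part of this commit; the real deep_ep kernels do the same thing across ranks over RDMA buffers):

    import numpy as np

    def toy_dispatch(topk_idx, num_experts):
        """Group token indices under each expert, mirroring what dispatch does across ranks."""
        per_expert = [[] for _ in range(num_experts)]
        for t, experts in enumerate(topk_idx):
            for e in experts:
                per_expert[int(e)].append(t)
        return per_expert

    def toy_combine(ffn_out_per_expert, per_expert, topk_idx, topk_weights, hidden):
        """Weighted sum of each token's expert outputs, mirroring low_latency_combine."""
        out = np.zeros((topk_idx.shape[0], hidden), dtype=np.float32)
        for e, token_ids in enumerate(per_expert):
            for slot, t in enumerate(token_ids):
                k = list(topk_idx[t]).index(e)  # which top-k slot expert e occupies for token t
                out[t] += topk_weights[t, k] * ffn_out_per_expert[e][slot]
        return out

    x = np.random.randn(4, 8).astype(np.float32)            # 4 tokens, hidden=8
    topk_idx = np.array([[0, 1], [2, 3], [0, 2], [1, 3]])    # top-2 routing over 4 experts
    topk_weights = np.full((4, 2), 0.5, dtype=np.float32)
    per_expert = toy_dispatch(topk_idx, num_experts=4)
    ffn_out = [[x[t] * (e + 1) for t in toks] for e, toks in enumerate(per_expert)]  # stand-in expert FFN
    print(toy_combine(ffn_out, per_expert, topk_idx, topk_weights, hidden=8).shape)  # (4, 8)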
550  fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py  Normal file
@@ -0,0 +1,550 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
|
||||
UnquantizedFusedMoEMethod,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
|
||||
from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig
|
||||
from fastdeploy.model_executor.ops.xpu import (
|
||||
ep_moe_expert_combine,
|
||||
ep_moe_expert_dispatch,
|
||||
moe_expert_ffn,
|
||||
weight_quantize_xpu,
|
||||
)
|
||||
|
||||
|
||||
class XPUMoEMethod(UnquantizedFusedMoEMethod):
|
||||
"""
|
||||
XPU MOE
|
||||
"""
|
||||
|
||||
def process_loaded_weights(self, layer: nn.Layer, state_dict):
|
||||
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
|
||||
for weights in [up_gate_proj_weights, down_proj_weights]:
|
||||
for idx, weight in enumerate(weights):
|
||||
weights[idx] = weight.transpose([1, 0])
|
||||
stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
|
||||
stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
|
||||
|
||||
layer.up_gate_proj_weight.set_value(stacked_up_gate_proj_weights)
|
||||
layer.down_proj_weight.set_value(stacked_down_proj_weights)
|
||||
|
||||
def apply_tp(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Paddle Cutlass compute Fused MoE.
|
||||
"""
|
||||
from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
|
||||
|
||||
fused_moe_out = xpu_moe_layer(
|
||||
x,
|
||||
gate.weight.transpose([1, 0]),
|
||||
layer.gate_correction_bias,
|
||||
layer.up_gate_proj_weight,
|
||||
layer.down_proj_weight,
|
||||
None, # up_gate_proj bias
|
||||
None, # down_proj bias
|
||||
None, # up_gate_proj scale
|
||||
None, # down_proj scale
|
||||
None, # up_gate_proj_in_scale
|
||||
"", # moe_quant_type
|
||||
layer.top_k,
|
||||
False, # moe group, used in deepseek
|
||||
)
|
||||
if layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
|
||||
tensor_model_parallel_all_reduce(fused_moe_out)
|
||||
|
||||
return fused_moe_out
|
||||
|
||||
def apply_ep_prefill(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Apply the EP prefill method.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def apply_ep_decode(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Apply the EP decoder method.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class XPUWeightOnlyMoEMethod(QuantMethodBase):
|
||||
"""
|
||||
XPU Fused MoE Method.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
quant_config: WeightOnlyConfig,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.quant_config = quant_config
|
||||
self.moe_quant_type = self.quant_config.algo
|
||||
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
|
||||
self.added_scale_attrs = [
|
||||
"up_gate_proj_weight_scale",
|
||||
"down_proj_weight_scale",
|
||||
]
|
||||
|
||||
def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
|
||||
"""
|
||||
Paddle cutlass create weight process.
|
||||
"""
|
||||
self.default_dtype = "float32"
|
||||
self.weight_dtype = "int8"
|
||||
|
||||
if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
|
||||
self.up_gate_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.moe_intermediate_size * 2,
|
||||
layer.hidden_size // 2,
|
||||
]
|
||||
else:
|
||||
self.up_gate_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.moe_intermediate_size * 2,
|
||||
layer.hidden_size,
|
||||
]
|
||||
if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
|
||||
self.down_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size // 2,
|
||||
]
|
||||
else:
|
||||
self.down_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size,
|
||||
]
|
||||
|
||||
setattr(
|
||||
layer,
|
||||
self.added_weight_attrs[0],
|
||||
layer.create_parameter(
|
||||
shape=self.up_gate_proj_weight_shape,
|
||||
dtype=self.weight_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
setattr(
|
||||
layer,
|
||||
self.added_weight_attrs[1],
|
||||
layer.create_parameter(
|
||||
shape=self.down_proj_weight_shape,
|
||||
dtype=self.weight_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
# weight_scale
|
||||
setattr(
|
||||
layer,
|
||||
self.added_scale_attrs[0],
|
||||
layer.create_parameter(
|
||||
shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
|
||||
dtype=self.default_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
setattr(
|
||||
layer,
|
||||
self.added_scale_attrs[1],
|
||||
layer.create_parameter(
|
||||
shape=[layer.num_local_experts, layer.hidden_size],
|
||||
dtype=self.default_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
|
||||
def process_loaded_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
Paddle xpu load weight process.
|
||||
"""
|
||||
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
|
||||
assert len(up_gate_proj_weights) == layer.num_local_experts
|
||||
assert len(down_proj_weights) == layer.num_local_experts
|
||||
assert up_gate_proj_weights[0].shape == [
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size * 2,
|
||||
]
|
||||
assert down_proj_weights[0].shape == [
|
||||
layer.moe_intermediate_size,
|
||||
layer.hidden_size,
|
||||
]
|
||||
|
||||
for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
|
||||
weight_name = self.added_weight_attrs[idx]
|
||||
scale_name = self.added_scale_attrs[idx]
|
||||
|
||||
weight_list = []
|
||||
weight_scale_list = []
|
||||
for i in range(layer.num_local_experts):
|
||||
quant_weight, scale = weight_quantize_xpu(
|
||||
weight_tensor[i], self.moe_quant_type, -1, -1
|
||||
) # weight is [k,n]
|
||||
weight_list.append(quant_weight.transpose([1, 0])) # transpose weight to [n,k]
|
||||
weight_scale_list.append(scale)
|
||||
quanted_weight = paddle.stack(weight_list, axis=0)
|
||||
getattr(layer, weight_name).set_value(quanted_weight)
|
||||
|
||||
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
|
||||
getattr(layer, scale_name).set_value(quanted_weight_scale)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
XPU compute Fused MoE.
|
||||
"""
|
||||
from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
|
||||
|
||||
fused_moe_out = xpu_moe_layer(
|
||||
x,
|
||||
gate.weight.transpose([1, 0]),
|
||||
layer.gate_correction_bias,
|
||||
layer.up_gate_proj_weight,
|
||||
layer.down_proj_weight,
|
||||
None, # up_gate_proj bias
|
||||
None, # down_proj bias
|
||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
||||
self.moe_quant_type,
|
||||
layer.top_k,
|
||||
False, # moe group, used in deepseek
|
||||
)
|
||||
if layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
|
||||
tensor_model_parallel_all_reduce(fused_moe_out)
|
||||
|
||||
return fused_moe_out
|
||||
|
||||
|
||||
class XPUWeightOnlyMoeEpMethod(XPUMoEMethod):
|
||||
"""
|
||||
XPU Fused MoE EP Method.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
quant_config: WeightOnlyConfig,
|
||||
) -> None:
|
||||
super().__init__(quant_config)
|
||||
self.moe_quant_type = self.quant_config.algo
|
||||
self.weight_dtype = "int8"
|
||||
self.scale_dtype = "float32"
|
||||
|
||||
def import_backend_ep_runner(self) -> None:
|
||||
from .ep import XPUEPDecoderRunner, XPUEPPrefillRunner
|
||||
|
||||
self.EPPrefillRunner = XPUEPPrefillRunner
|
||||
self.EPDecoderRunner = XPUEPDecoderRunner
|
||||
|
||||
def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
|
||||
"""
|
||||
create weight process.
|
||||
"""
|
||||
if self.moe_quant_type in ["weight_only_int8"]:
|
||||
self.up_gate_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.moe_intermediate_size * 2,
|
||||
layer.hidden_size,
|
||||
]
|
||||
self.down_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size,
|
||||
]
|
||||
elif self.moe_quant_type in ["weight_only_int4"]:
|
||||
self.up_gate_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.moe_intermediate_size * 2,
|
||||
layer.hidden_size // 2,
|
||||
]
|
||||
self.down_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size // 2,
|
||||
]
|
||||
else:
|
||||
raise ValueError(f"Unsupported moe quant type: {self.moe_quant_type}")
|
||||
|
||||
self.up_gate_proj_scale_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.moe_intermediate_size * 2,
|
||||
]
|
||||
self.down_proj_scale_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.hidden_size,
|
||||
]
|
||||
|
||||
setattr(
|
||||
layer,
|
||||
self.added_weight_attrs[0],
|
||||
layer.create_parameter(
|
||||
shape=self.up_gate_proj_weight_shape,
|
||||
dtype=self.weight_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
setattr(
|
||||
layer,
|
||||
self.added_weight_attrs[1],
|
||||
layer.create_parameter(
|
||||
shape=self.down_proj_weight_shape,
|
||||
dtype=self.weight_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
setattr(
|
||||
layer,
|
||||
self.added_scale_attrs[0],
|
||||
layer.create_parameter(
|
||||
shape=self.up_gate_proj_scale_shape,
|
||||
dtype=self.scale_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
setattr(
|
||||
layer,
|
||||
self.added_scale_attrs[1],
|
||||
layer.create_parameter(
|
||||
shape=self.down_proj_scale_shape,
|
||||
dtype=self.scale_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
|
||||
def process_loaded_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
Paddle xpu load weight process.
|
||||
"""
|
||||
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
|
||||
assert len(up_gate_proj_weights) == layer.num_local_experts
|
||||
assert len(down_proj_weights) == layer.num_local_experts
|
||||
assert up_gate_proj_weights[0].shape == [
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size * 2,
|
||||
]
|
||||
assert down_proj_weights[0].shape == [
|
||||
layer.moe_intermediate_size,
|
||||
layer.hidden_size,
|
||||
]
|
||||
|
||||
for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
|
||||
weight_name = self.added_weight_attrs[idx]
|
||||
scale_name = self.added_scale_attrs[idx]
|
||||
|
||||
weight_list = []
|
||||
weight_scale_list = []
|
||||
for i in range(layer.num_local_experts):
|
||||
quant_weight, scale = weight_quantize_xpu(
|
||||
weight_tensor[i], self.moe_quant_type, -1, -1
|
||||
) # quant_weight is [k,n]
|
||||
# transpose quant_weight to [n,k]
|
||||
weight_list.append(quant_weight.transpose([1, 0]))
|
||||
weight_scale_list.append(scale)
|
||||
|
||||
quanted_weight = paddle.stack(weight_list, axis=0)
|
||||
getattr(layer, weight_name).set_value(quanted_weight)
|
||||
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
|
||||
getattr(layer, scale_name).set_value(quanted_weight_scale)
|
||||
|
||||
def apply_ep_prefill(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Apply the EP prefill method.
|
||||
"""
|
||||
gate_out = gate(x.cast("float32"))
|
||||
# 1. Select topk experts and weights
|
||||
topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out)
|
||||
# 2. Dynamic compute blockwise quantization scales
|
||||
# x, x_scale_tensor = fastdeploy.model_executor.ops.xpu.per_token_quant(x)
|
||||
x_scale_tensor = None
|
||||
# 3. EP Dispatch
|
||||
(
|
||||
recv_x,
|
||||
recv_x_scales,
|
||||
recv_topk_idx,
|
||||
recv_topk_weights,
|
||||
recv_num_tokens_per_expert_list,
|
||||
_,
|
||||
) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights, x_scale_tensor=x_scale_tensor)
|
||||
|
||||
token_num_per_expert = recv_num_tokens_per_expert_list.numpy().tolist()
|
||||
token_all_num = sum(token_num_per_expert)
|
||||
|
||||
# 4. Compute ffn
|
||||
if token_all_num > 0:
|
||||
moe_dispatch_scale = None
|
||||
(
|
||||
permute_input,
|
||||
permute_indices_per_token,
|
||||
token_num_lod,
|
||||
dst_weights,
|
||||
ffn1_act_scale_per_token,
|
||||
) = ep_moe_expert_dispatch(
|
||||
recv_x,
|
||||
recv_topk_idx,
|
||||
recv_topk_weights,
|
||||
moe_dispatch_scale,
|
||||
token_num_per_expert,
|
||||
token_all_num,
|
||||
self.moe_quant_type,
|
||||
)
|
||||
|
||||
moe_ffn1_scale = None
|
||||
moe_ffn2_scale = None
|
||||
ffn_out = moe_expert_ffn(
|
||||
permute_input,
|
||||
token_num_lod,
|
||||
getattr(layer, self.added_weight_attrs[0]),
|
||||
getattr(layer, self.added_weight_attrs[1]),
|
||||
None,
|
||||
None,
|
||||
moe_ffn1_scale,
|
||||
moe_ffn2_scale,
|
||||
getattr(layer, self.added_scale_attrs[0]),
|
||||
getattr(layer, self.added_scale_attrs[1]),
|
||||
None,
|
||||
None,
|
||||
self.moe_quant_type,
|
||||
-1,
|
||||
token_all_num,
|
||||
)
|
||||
|
||||
# prmt back per rank
|
||||
recv_topk_weights_bf16 = recv_topk_weights.astype("bfloat16")
|
||||
tmp_ffn_out = ep_moe_expert_combine(
|
||||
ffn_out,
|
||||
permute_indices_per_token,
|
||||
recv_topk_weights_bf16,
|
||||
permute_indices_per_token.shape[0],
|
||||
ffn_out.shape[0],
|
||||
ffn_out.shape[1],
|
||||
permute_indices_per_token.shape[1],
|
||||
)
|
||||
|
||||
else:
|
||||
tmp_ffn_out = paddle.empty(recv_x.shape, "bfloat16")
|
||||
|
||||
# 5. EP combine
|
||||
handle = None
|
||||
return self.ep_prefill_runner.combine(tmp_ffn_out, handle, recv_topk_weights)
|
||||
|
||||
def compute_ffn(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
valid_token_num=-1,
|
||||
extra_ffn1_in_scale=None,
|
||||
):
|
||||
"""
|
||||
Calculate moe
|
||||
"""
|
||||
# ffn1_in_scale = extra_ffn1_in_scale
|
||||
moe_ffn1_scale = None
|
||||
moe_ffn2_scale = None
|
||||
|
||||
ffn_out = moe_expert_ffn(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
getattr(layer, self.added_weight_attrs[0]),
|
||||
getattr(layer, self.added_weight_attrs[1]),
|
||||
None,
|
||||
None,
|
||||
moe_ffn1_scale,
|
||||
moe_ffn2_scale,
|
||||
getattr(layer, self.added_scale_attrs[0]),
|
||||
getattr(layer, self.added_scale_attrs[1]),
|
||||
None,
|
||||
None,
|
||||
self.moe_quant_type,
|
||||
-1,
|
||||
valid_token_num,
|
||||
)
|
||||
return ffn_out
|
||||
|
||||
def apply_ep_decode(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Apply the EP decoder method.
|
||||
"""
|
||||
gate_out = gate(x.cast("float32"))
|
||||
|
||||
# 1. Select topk experts and weights
|
||||
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)
|
||||
|
||||
# 2. EP Dispatch
|
||||
expertwise_scale = None
|
||||
use_fp8 = False
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
handle,
|
||||
valid_token_num,
|
||||
) = self.ep_decoder_runner.dispatch(
|
||||
x, topk_idx, topk_weights, expertwise_scale=expertwise_scale, use_fp8=use_fp8
|
||||
)
|
||||
|
||||
# 3. Compute ffn
|
||||
ffn_out = self.compute_ffn(
|
||||
layer,
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
valid_token_num,
|
||||
)
|
||||
|
||||
# 4. EP combine
|
||||
return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights, handle)
|
@@ -34,6 +34,7 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
        quant_config: WeightOnlyConfig,
    ) -> None:
        super().__init__(quant_config)
+        self.quant_config.weight_only_linear_arch = -1

    def create_weights(self, layer: nn.Layer, **extra_weight_attrs) -> None:
        """
@@ -61,6 +62,17 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
        """
        loaded_weights using xpu special quantization
        """
-        quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight, self.quant_config.algo, -1, -1)
+        k, n = weight.shape
+        quanted_weight_tensors = []
+        weight_scale_tensors = []
+        offset = 30720
+        for i in range(0, n, offset):
+            end_n = min(i + offset, n)
+            weight_i = weight[:, i:end_n]
+            quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight_i, self.quant_config.algo, -1, -1)
+            quanted_weight_tensors.append(quanted_weight_tensor)
+            weight_scale_tensors.append(weight_scale_tensor)
+        quanted_weight_tensor = paddle.concat(quanted_weight_tensors, axis=1)
+        weight_scale_tensor = paddle.concat(weight_scale_tensors, axis=0)
        layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
        layer.weight_scale.set_value(weight_scale_tensor)
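The new loop splits the weight into 30720-column chunks, quantizes each chunk separately, and concatenates the per-chunk results; presumably this keeps each weight_quantize_xpu call within a per-call width limit of the XPU kernel, though that rationale is an inference, not stated in the diff. A framework-free sketch of the same bookkeeping, with a toy per-channel int8 quantizer standing in for weight_quantize_xpu:

    import numpy as np

    def toy_quantize(w):                                             # stand-in quantizer; w is [k, n_chunk]
        scale = np.maximum(np.abs(w).max(axis=0), 1e-8) / 127.0      # one scale per output channel
        q = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
        return q, scale.astype(np.float32)

    def chunked_quantize(weight, chunk=30720):
        k, n = weight.shape
        q_parts, s_parts = [], []
        for i in range(0, n, chunk):                                 # at most `chunk` output channels per call
            q, s = toy_quantize(weight[:, i : min(i + chunk, n)])
            q_parts.append(q)
            s_parts.append(s)
        return np.concatenate(q_parts, axis=1), np.concatenate(s_parts, axis=0)

    w = np.random.randn(128, 70000).astype(np.float32)               # wider than 2 x 30720, so three chunks
    qw, scales = chunked_quantize(w)
    print(qw.shape, scales.shape)                                    # (128, 70000) (70000,)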
@@ -20,11 +20,13 @@ import paddle
 from paddle import nn
 from paddleformers.utils.log import logger

-try:
-    from paddle.distributed.communication import deep_ep
-except:
-    logger.warning("import deep_ep Failed!")
+from fastdeploy.platforms import current_platform
+
+if current_platform.is_cuda():
+    try:
+        from paddle.distributed.communication import deep_ep
+    except:
+        logger.warning("import deep_ep Failed!")

 import fastdeploy
 from fastdeploy.config import MoEPhase
@@ -32,8 +34,7 @@ from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
|
||||
from fastdeploy.utils import singleton
|
||||
|
||||
|
||||
@singleton
|
||||
class DeepEPEngine:
|
||||
class DeepEPEngineBase:
|
||||
"""
|
||||
A wrapper class for DeepEP engine.
|
||||
"""
|
||||
@@ -60,28 +61,76 @@ class DeepEPEngine:
|
||||
hidden: The hidden dimension of the model.
|
||||
num_experts: The number of experts.
|
||||
"""
|
||||
self.num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank
|
||||
self.hidden = hidden
|
||||
self.num_experts = num_experts
|
||||
self.ep_size = ep_size
|
||||
self.rank_id = ep_rank
|
||||
self.splitwise_role = splitwise_role
|
||||
self.moe_phase = moe_phase
|
||||
self.async_finish = async_finish
|
||||
# TODO(@wufeisheng): Support configurable EP size
|
||||
if group is None:
|
||||
group = paddle.distributed.new_group(range(ep_size))
|
||||
self.group = group
|
||||
self.ep_size = ep_size
|
||||
self.rank_id = ep_rank
|
||||
self.hidden = hidden
|
||||
self.num_experts = num_experts
|
||||
self.num_local_experts = num_experts // ep_size
|
||||
self.async_finish = async_finish
|
||||
|
||||
self.deepep_engine = None
|
||||
self.init_deepep_engine()
|
||||
|
||||
@abstractmethod
|
||||
def init_deepep_engine(self):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@singleton
|
||||
class DeepEPEngine(DeepEPEngineBase):
|
||||
"""
|
||||
A wrapper class for DeepEP engine.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_max_dispatch_tokens_per_rank: int,
|
||||
hidden: int,
|
||||
num_experts: int,
|
||||
ep_size: int,
|
||||
ep_rank: int,
|
||||
splitwise_role: str,
|
||||
moe_phase: MoEPhase,
|
||||
async_finish: bool = False,
|
||||
group=None,
|
||||
):
|
||||
"""
|
||||
Initialize the DeepEP engine.
|
||||
Args:
|
||||
group: The MPI group object.
|
||||
ep_size: The number of ranks.
|
||||
rank_id: The rank id.
|
||||
num_max_dispatch_tokens_per_rank: The maximum number of tokens per rank to dispatch.
|
||||
hidden: The hidden dimension of the model.
|
||||
num_experts: The number of experts.
|
||||
"""
|
||||
super().__init__(
|
||||
num_max_dispatch_tokens_per_rank,
|
||||
hidden,
|
||||
num_experts,
|
||||
ep_size,
|
||||
ep_rank,
|
||||
splitwise_role,
|
||||
moe_phase,
|
||||
async_finish,
|
||||
group,
|
||||
)
|
||||
|
||||
def init_deepep_engine(self):
|
||||
from paddle.base.core import Config
|
||||
|
||||
self.ep_config = Config(24, 6, 256)
|
||||
self.num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank
|
||||
|
||||
# In mixed EP mode on a single node, we dynamically switch between
|
||||
# high throughput and low latency modes.
|
||||
|
||||
if splitwise_role == "mixed":
|
||||
if self.splitwise_role == "mixed":
|
||||
self.deepep_engine = deep_ep.Buffer(
|
||||
self.group,
|
||||
int(2e9),
|
||||
@@ -92,10 +141,10 @@ class DeepEPEngine:
|
||||
# In disaggregated mode on multiple nodes, we either use
|
||||
# high throughput mode or low latency mode.
|
||||
else:
|
||||
if moe_phase.phase == "decode":
|
||||
if self.moe_phase.phase == "decode":
|
||||
logger.info("Initializing Low Latency Buffer")
|
||||
self.get_low_latency_buffer()
|
||||
elif moe_phase.phase == "prefill":
|
||||
elif self.moe_phase.phase == "prefill":
|
||||
self.deepep_engine = deep_ep.Buffer(
|
||||
self.group,
|
||||
int(5e8),
|
||||
@@ -104,7 +153,7 @@ class DeepEPEngine:
|
||||
num_qps_per_rank=1,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown generation phase {moe_phase}")
|
||||
raise ValueError(f"Unknown generation phase {self.moe_phase}")
|
||||
|
||||
def get_low_latency_buffer(self):
|
||||
"""
|
||||
@@ -255,17 +304,27 @@ class EPRunner:
|
||||
ep_group=None,
|
||||
):
|
||||
self.top_k = top_k
|
||||
self.hidden = hidden
|
||||
self.num_experts = num_experts
|
||||
self.splitwise_role = splitwise_role
|
||||
self.moe_phase = moe_phase
|
||||
self.num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank
|
||||
self.ep_size = ep_size
|
||||
self.ep_rank = ep_rank
|
||||
self.redundant_experts_num = redundant_experts_num
|
||||
self.ep_group = ep_group
|
||||
self.init_ep_engine()
|
||||
|
||||
def init_ep_engine(self):
|
||||
self.ep_engine = DeepEPEngine(
|
||||
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank,
|
||||
hidden=hidden,
|
||||
num_experts=num_experts + redundant_experts_num,
|
||||
ep_size=ep_size,
|
||||
ep_rank=ep_rank,
|
||||
splitwise_role=splitwise_role,
|
||||
moe_phase=moe_phase,
|
||||
group=ep_group,
|
||||
num_max_dispatch_tokens_per_rank=self.num_max_dispatch_tokens_per_rank,
|
||||
hidden=self.hidden,
|
||||
num_experts=self.num_experts + self.redundant_experts_num,
|
||||
ep_size=self.ep_size,
|
||||
ep_rank=self.ep_rank,
|
||||
splitwise_role=self.splitwise_role,
|
||||
moe_phase=self.moe_phase,
|
||||
group=self.ep_group,
|
||||
)
|
||||
|
||||
def moe_select(self, layer: nn.Layer, gate_out: paddle.Tensor):
|
||||
|
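Both the CUDA and XPU DeepEPEngine wrappers carry the @singleton decorator from fastdeploy.utils, so one engine (and one deep_ep buffer) is shared per process even though every MoE layer builds its own runner. A generic sketch of how such a decorator usually behaves; FastDeploy's actual implementation may differ in detail:

    # Hedged sketch of a "cache the first instance" singleton decorator.
    def singleton(cls):
        instances = {}

        def get_instance(*args, **kwargs):
            if cls not in instances:
                instances[cls] = cls(*args, **kwargs)
            return instances[cls]

        return get_instance

    @singleton
    class Engine:
        def __init__(self, size):
            self.size = size

    a = Engine(8)
    b = Engine(16)          # constructor args after the first call are ignored
    print(a is b, a.size)   # True 8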
@@ -41,15 +41,20 @@ class MoEMethodBase(QuantMethodBase):
|
||||
]
|
||||
self.pack_num = 1
|
||||
|
||||
def import_backend_ep_runner(self) -> None:
|
||||
from .ep import EPDecoderRunner, EPPrefillRunner
|
||||
|
||||
self.EPPrefillRunner = EPPrefillRunner
|
||||
self.EPDecoderRunner = EPDecoderRunner
|
||||
|
||||
def init_ep(self, layer: nn.Layer) -> None:
|
||||
"""
|
||||
Init EP related module
|
||||
"""
|
||||
self.import_backend_ep_runner()
|
||||
if layer.ep_size > 1:
|
||||
if layer.fd_config.parallel_config.splitwise_role == "mixed":
|
||||
from .ep import EPDecoderRunner, EPPrefillRunner
|
||||
|
||||
self.ep_prefill_runner = EPPrefillRunner(
|
||||
self.ep_prefill_runner = self.EPPrefillRunner(
|
||||
layer.top_k,
|
||||
layer.hidden_size,
|
||||
layer.num_experts,
|
||||
@@ -60,7 +65,7 @@ class MoEMethodBase(QuantMethodBase):
|
||||
layer.fd_config.model_config.redundant_experts_num,
|
||||
ep_group=layer.fd_config.parallel_config.ep_group,
|
||||
)
|
||||
self.ep_decoder_runner = EPDecoderRunner(
|
||||
self.ep_decoder_runner = self.EPDecoderRunner(
|
||||
layer.top_k,
|
||||
layer.hidden_size,
|
||||
layer.num_experts,
|
||||
@@ -73,9 +78,7 @@ class MoEMethodBase(QuantMethodBase):
|
||||
)
|
||||
else:
|
||||
if layer.fd_config.parallel_config.moe_phase.phase == "prefill":
|
||||
from .ep import EPPrefillRunner
|
||||
|
||||
self.ep_prefill_runner = EPPrefillRunner(
|
||||
self.ep_prefill_runner = self.EPPrefillRunner(
|
||||
layer.top_k,
|
||||
layer.hidden_size,
|
||||
layer.num_experts,
|
||||
@@ -87,9 +90,7 @@ class MoEMethodBase(QuantMethodBase):
|
||||
ep_group=layer.fd_config.parallel_config.ep_group,
|
||||
)
|
||||
else:
|
||||
from .ep import EPDecoderRunner
|
||||
|
||||
self.ep_decoder_runner = EPDecoderRunner(
|
||||
self.ep_decoder_runner = self.EPDecoderRunner(
|
||||
layer.top_k,
|
||||
layer.hidden_size,
|
||||
layer.num_experts,
|
||||
|
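The hunk above replaces hard-coded imports of EPPrefillRunner/EPDecoderRunner with class attributes bound in import_backend_ep_runner(), which the XPU EP method overrides to point at XPUEPPrefillRunner/XPUEPDecoderRunner. A stripped-down sketch of that hook pattern (class bodies are placeholders, not the real signatures):

    class EPPrefillRunner: ...
    class EPDecoderRunner: ...
    class XPUEPPrefillRunner(EPPrefillRunner): ...
    class XPUEPDecoderRunner(EPDecoderRunner): ...

    class MoEMethodBase:
        def import_backend_ep_runner(self) -> None:
            # default (CUDA) runners
            self.EPPrefillRunner = EPPrefillRunner
            self.EPDecoderRunner = EPDecoderRunner

        def init_ep(self) -> None:
            self.import_backend_ep_runner()
            # later: self.ep_prefill_runner = self.EPPrefillRunner(...)

    class XPUWeightOnlyMoeEpMethod(MoEMethodBase):
        def import_backend_ep_runner(self) -> None:
            # backend override: inject the XPU runners
            self.EPPrefillRunner = XPUEPPrefillRunner
            self.EPDecoderRunner = XPUEPDecoderRunner

    m = XPUWeightOnlyMoeEpMethod()
    m.init_ep()
    print(m.EPPrefillRunner.__name__)  # XPUEPPrefillRunner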
@@ -1,258 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
|
||||
UnquantizedFusedMoEMethod,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
|
||||
from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig
|
||||
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
|
||||
|
||||
|
||||
class XPUMoEMethod(UnquantizedFusedMoEMethod):
|
||||
"""
|
||||
XPU MOE
|
||||
"""
|
||||
|
||||
def process_loaded_weights(self, layer: nn.Layer, state_dict):
|
||||
|
||||
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
|
||||
for weights in [up_gate_proj_weights, down_proj_weights]:
|
||||
for idx, weight in enumerate(weights):
|
||||
weights[idx] = weight.transpose([1, 0])
|
||||
stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
|
||||
stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
|
||||
|
||||
layer.up_gate_proj_weight.set_value(stacked_up_gate_proj_weights)
|
||||
layer.down_proj_weight.set_value(stacked_down_proj_weights)
|
||||
|
||||
def apply_tp(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Paddle Cutlass compute Fused MoE.
|
||||
"""
|
||||
from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
|
||||
|
||||
fused_moe_out = xpu_moe_layer(
|
||||
x,
|
||||
gate.weight.transpose([1, 0]),
|
||||
layer.gate_correction_bias,
|
||||
layer.up_gate_proj_weight,
|
||||
layer.down_proj_weight,
|
||||
None, # up_gate_proj bias
|
||||
None, # down_proj bias
|
||||
None, # up_gate_proj scale
|
||||
None, # down_proj scale
|
||||
None, # up_gate_proj_in_scale
|
||||
"", # moe_quant_type
|
||||
layer.top_k,
|
||||
False, # moe group, used in deepseek
|
||||
)
|
||||
if layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
|
||||
tensor_model_parallel_all_reduce(fused_moe_out)
|
||||
|
||||
return fused_moe_out
|
||||
|
||||
def apply_ep_prefill(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Apply the EP prefill method.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def apply_ep_decode(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Apply the EP decoder method.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class XPUWeightOnlyMoEMethod(QuantMethodBase):
|
||||
"""
|
||||
XPU Fused MoE Method.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
quant_config: WeightOnlyConfig,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.quant_config = quant_config
|
||||
self.moe_quant_type = self.quant_config.algo
|
||||
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
|
||||
self.added_scale_attrs = [
|
||||
"up_gate_proj_weight_scale",
|
||||
"down_proj_weight_scale",
|
||||
]
|
||||
|
||||
def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
|
||||
"""
|
||||
Paddle cutlass create weight process.
|
||||
"""
|
||||
self.default_dtype = "float32"
|
||||
self.weight_dtype = "int8"
|
||||
|
||||
if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
|
||||
self.up_gate_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.moe_intermediate_size * 2,
|
||||
layer.hidden_size // 2,
|
||||
]
|
||||
else:
|
||||
self.up_gate_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.moe_intermediate_size * 2,
|
||||
layer.hidden_size,
|
||||
]
|
||||
if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
|
||||
self.down_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size // 2,
|
||||
]
|
||||
else:
|
||||
self.down_proj_weight_shape = [
|
||||
layer.num_local_experts,
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size,
|
||||
]
|
||||
|
||||
setattr(
|
||||
layer,
|
||||
self.added_weight_attrs[0],
|
||||
layer.create_parameter(
|
||||
shape=self.up_gate_proj_weight_shape,
|
||||
dtype=self.weight_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
setattr(
|
||||
layer,
|
||||
self.added_weight_attrs[1],
|
||||
layer.create_parameter(
|
||||
shape=self.down_proj_weight_shape,
|
||||
dtype=self.weight_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
# weight_scale
|
||||
setattr(
|
||||
layer,
|
||||
self.added_scale_attrs[0],
|
||||
layer.create_parameter(
|
||||
shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
|
||||
dtype=self.default_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
setattr(
|
||||
layer,
|
||||
self.added_scale_attrs[1],
|
||||
layer.create_parameter(
|
||||
shape=[layer.num_local_experts, layer.hidden_size],
|
||||
dtype=self.default_dtype,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
),
|
||||
)
|
||||
|
||||
def process_loaded_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
Paddle xpu load weight process.
|
||||
"""
|
||||
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
|
||||
assert len(up_gate_proj_weights) == layer.num_local_experts
|
||||
assert len(down_proj_weights) == layer.num_local_experts
|
||||
assert up_gate_proj_weights[0].shape == [
|
||||
layer.hidden_size,
|
||||
layer.moe_intermediate_size * 2,
|
||||
]
|
||||
assert down_proj_weights[0].shape == [
|
||||
layer.moe_intermediate_size,
|
||||
layer.hidden_size,
|
||||
]
|
||||
|
||||
for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
|
||||
weight_name = self.added_weight_attrs[idx]
|
||||
scale_name = self.added_scale_attrs[idx]
|
||||
|
||||
weight_list = []
|
||||
weight_scale_list = []
|
||||
for i in range(layer.num_local_experts):
|
||||
quant_weight, scale = weight_quantize_xpu(
|
||||
weight_tensor[i], self.moe_quant_type, -1, -1
|
||||
) # weight is [k,n]
|
||||
weight_list.append(quant_weight.transpose([1, 0])) # transpose weight to [n,k]
|
||||
weight_scale_list.append(scale)
|
||||
quanted_weight = paddle.stack(weight_list, axis=0)
|
||||
getattr(layer, weight_name).set_value(quanted_weight)
|
||||
|
||||
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
|
||||
getattr(layer, scale_name).set_value(quanted_weight_scale)
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate: nn.Layer,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
XPU compute Fused MoE.
|
||||
"""
|
||||
from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
|
||||
|
||||
fused_moe_out = xpu_moe_layer(
|
||||
x,
|
||||
gate.weight.transpose([1, 0]),
|
||||
layer.gate_correction_bias,
|
||||
layer.up_gate_proj_weight,
|
||||
layer.down_proj_weight,
|
||||
None, # up_gate_proj bias
|
||||
None, # down_proj bias
|
||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
||||
self.moe_quant_type,
|
||||
layer.top_k,
|
||||
False, # moe group, used in deepseek
|
||||
)
|
||||
if layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
|
||||
tensor_model_parallel_all_reduce(fused_moe_out)
|
||||
|
||||
return fused_moe_out
|
@@ -43,7 +43,7 @@ def get_moe_method():

        return CutlassMoEMethod(None)
    elif current_platform.is_xpu():
-        from .fused_moe_xpu_backend import XPUMoEMethod
+        from fastdeploy.model_executor.layers.backends import XPUMoEMethod

        return XPUMoEMethod(None)
    elif current_platform.is_gcu():
@@ -19,7 +19,7 @@ from abc import abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
import paddle
|
||||
from paddle.nn.quant import weight_only_linear, weight_quantize
|
||||
from paddle.nn.quant import weight_quantize
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.model_executor.layers.linear import (
|
||||
@@ -30,6 +30,13 @@ from fastdeploy.model_executor.layers.linear import (
|
||||
from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
if current_platform.is_xpu():
|
||||
from fastdeploy.model_executor.ops.xpu import (
|
||||
weight_only_linear_xpu as weight_only_linear,
|
||||
)
|
||||
else:
|
||||
from paddle.nn.quant import weight_only_linear
|
||||
|
||||
from ..moe import FusedMoE
|
||||
from ..utils import get_tensor
|
||||
from .quant_base import QuantConfigBase, QuantMethodBase
|
||||
@@ -70,16 +77,24 @@ class WeightOnlyConfig(QuantConfigBase):
|
||||
|
||||
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
|
||||
if current_platform.is_xpu():
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyLinearMethod,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.moe.fused_moe_xpu_backend import (
|
||||
XPUWeightOnlyMoEMethod,
|
||||
)
|
||||
|
||||
if isinstance(layer, FusedMoE):
|
||||
return XPUWeightOnlyMoEMethod(self)
|
||||
if layer.ep_size > 1:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyMoeEpMethod,
|
||||
)
|
||||
|
||||
return XPUWeightOnlyMoeEpMethod(self)
|
||||
else:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyMoEMethod,
|
||||
)
|
||||
|
||||
return XPUWeightOnlyMoEMethod(self)
|
||||
else:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyLinearMethod,
|
||||
)
|
||||
|
||||
return XPUWeightOnlyLinearMethod(self)
|
||||
elif current_platform.is_gcu():
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
|
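The selection logic introduced above can be read as a small decision table; the helper below paraphrases it for illustration (return values are just the class names, not real constructors):

    # FusedMoE layer, ep_size > 1  -> XPUWeightOnlyMoeEpMethod
    # FusedMoE layer, ep_size == 1 -> XPUWeightOnlyMoEMethod
    # any other layer              -> XPUWeightOnlyLinearMethod
    def pick_xpu_weight_only_method(is_fused_moe: bool, ep_size: int) -> str:
        if is_fused_moe:
            return "XPUWeightOnlyMoeEpMethod" if ep_size > 1 else "XPUWeightOnlyMoEMethod"
        return "XPUWeightOnlyLinearMethod"

    assert pick_xpu_weight_only_method(True, 8) == "XPUWeightOnlyMoeEpMethod"
    assert pick_xpu_weight_only_method(True, 1) == "XPUWeightOnlyMoEMethod"
    assert pick_xpu_weight_only_method(False, 8) == "XPUWeightOnlyLinearMethod"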
@@ -146,7 +146,7 @@ class TokenProcessor:
        """

        if current_platform.is_xpu():
-            from fastdeploy.model_executor.ops.xpu import get_output
+            from fastdeploy.model_executor.ops.xpu import get_output, get_output_ep
        elif current_platform.is_iluvatar():
            from fastdeploy.model_executor.ops.iluvatar import get_output
        elif current_platform.is_gcu():
@@ -872,16 +872,36 @@ class XPUModelRunner(ModelRunnerBase):
        self._dummy_prefill_inputs(num_tokens, batch_size)

        while True:
-            self.execute_model(None, True)
+            self.execute_model(is_dummy_run=True)

            if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
                break

+    def _set_debug_level(
+        self, debug_level: int = 0x1, model_forward_batch: Optional[List[Request]] = None, is_dummy_run: bool = False
+    ) -> None:
+        """
+        Set debug level for XPU: 0x1, 0xA1, 0x1B1
+        """
+        request_num = 0 if model_forward_batch is None else len(model_forward_batch)
+        if debug_level == 0 or request_num == 0 or is_dummy_run:
+            paddle.device.xpu.set_debug_level(0)
+            return
+
+        if self.parallel_config.use_ep:
+            request_num = paddle.to_tensor(request_num, dtype="int32")
+            paddle.distributed.all_reduce(request_num, group=self.parallel_config.ep_group)
+            logger.info(f"local_rank: {self.local_rank}, request_num: {request_num.item()}")
+            if request_num.item() > 0:
+                paddle.device.xpu.set_debug_level(debug_level)
+        else:
+            paddle.device.xpu.set_debug_level(debug_level)
+
    def execute_model(
        self,
        model_forward_batch: Optional[List[Request]] = None,
-        is_dummy_run: bool = False,
+        num_running_requests: int = None,
+        is_dummy_run: bool = False,
    ) -> Optional[ModelRunnerOutput]:
        """
        The Entrance of model execute.
@@ -892,6 +912,9 @@ class XPUModelRunner(ModelRunnerBase):
        num_running_requests: batch_size
        intermediate_tensors:
        """
+        # 0. set debug level
+        # self._set_debug_level(0x1, model_forward_batch, is_dummy_run)
+
        # 1. Prepare inputs of model and decoder.
        self._prepare_inputs(is_dummy_run=is_dummy_run)
@@ -110,7 +110,10 @@ class XpuWorker(WorkerBase):
        )

        self.model_runner.prepare_profile()
-        self.model_runner.profile_run()
+        if self.parallel_config.use_ep:
+            logger.warning("EP mode does not support profile run.")
+        else:
+            self.model_runner.profile_run()
        set_random_seed(self.fd_config.model_config.seed)

        total_available_memory = int(total_memory * self.cache_config.gpu_memory_utilization)
@@ -118,6 +121,8 @@ class XpuWorker(WorkerBase):
        available_kv_cache_memory = total_available_memory - used_memory
        model_block_memory_used = self.cal_theortical_kvcache()
        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
+        if self.parallel_config.use_ep:
+            available_kv_cache_memory = int(available_kv_cache_memory * 0.6)

        self.model_runner.clear_block_table()
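With the 0.6 factor, EP deployments keep only 60% of the computed KV-cache headroom, presumably leaving slack for the EP dispatch buffers. Illustrative arithmetic with made-up sizes (not measured values):

    # All byte counts below are hypothetical, chosen only to show the formula.
    total_memory = 96 * 1024**3                  # reported device memory
    gpu_memory_utilization = 0.9
    used_memory = 30 * 1024**3                   # weights + activations after profiling
    model_block_memory_used = 2 * 1024**2        # bytes per KV-cache block
    total_block_num = 1000                       # blocks reserved before re-profiling

    total_available_memory = int(total_memory * gpu_memory_utilization)
    available_kv_cache_memory = total_available_memory - used_memory
    available_kv_cache_memory += model_block_memory_used * total_block_num

    use_ep = True
    if use_ep:
        available_kv_cache_memory = int(available_kv_cache_memory * 0.6)

    print(available_kv_cache_memory / 1024**3)   # ~35.0 GiB instead of ~58.4 GiB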
@@ -147,14 +152,11 @@ class XpuWorker(WorkerBase):
    def execute_model(
        self,
        model_forward_batch: Optional[List[Request]] = None,
-        is_dummy_run: bool = False,
+        num_running_requests: Optional[int] = None,
+        is_dummy_run: bool = False,
    ) -> Optional[ModelRunnerOutput]:
        """ """
-
-        output = self.model_runner.execute_model(model_forward_batch)
-
-        return output
+        return self.model_runner.execute_model(model_forward_batch, num_running_requests, is_dummy_run)

    def exist_prefill(self):
        """
@@ -66,6 +66,9 @@ while true; do
        echo -e "\nService startup timed out: still not up after $((TIMEOUT/60)) minutes!"
        cat server.log
        cat log/workerlog.0
+       cat log/workerlog.1
+       cat log/workerlog.2
+       cat log/workerlog.3
        exit 1
    fi