Mirror of https://github.com/PaddlePaddle/FastDeploy.git (last synced 2025-10-02 23:32:48 +08:00).
[BugFix] fix real_bsz in ep (#3366)
* Your commit message here
* fix ep
* delete cuda_graph
This commit is contained in:
@@ -450,7 +450,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
|
||||
self.fd_config.model_config.moe_layer_start_index,
|
||||
self.fd_config.model_config.num_hidden_layers,
|
||||
):
|
||||
self.ernie.layers[i].mlp.expert(fake_hidden_states)
|
||||
self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
@@ -636,7 +636,9 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["max_length"] = paddle.full(
|
||||
[max_num_seqs, 1], self.model_config.max_model_len, dtype="int64"
|
||||
)
|
||||
self.seq_lens_this_time_buffer = paddle.full(max_num_seqs, 0, dtype="int32")
|
||||
self.seq_lens_this_time_buffer = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
if self.fd_config.parallel_config.enable_expert_parallel:
|
||||
self.share_inputs["seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
|
@@ -250,6 +250,7 @@ class PaddleDisWorkerProc:
|
||||
while True:
|
||||
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time())
|
||||
|
||||
num_running_requests = 0
|
||||
if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks() > 0:
|
||||
tasks, read_finish = self.task_queue.get_tasks()
|
||||
|
||||
@@ -276,6 +277,7 @@ class PaddleDisWorkerProc:
|
||||
self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8)
|
||||
mp_num_per_node = self.parallel_config.tensor_parallel_size // self.nnode
|
||||
req_ids = []
|
||||
num_running_requests = 0
|
||||
while True:
|
||||
if self.local_rank == 0:
|
||||
if self.model_weights_status.value[0] != 0:
|
||||
|
Reference in New Issue
Block a user