This reverts commit e0aeac58e1.
@@ -450,7 +450,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate)
+            self.ernie.layers[i].mlp.expert(fake_hidden_states)

     def forward(
         self,
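
Reading the hunk: the revert swaps the warm-up call from an experts(...) module that must be handed the gate explicitly to an expert(...) callable that routes internally. A minimal sketch with hypothetical classes (not FastDeploy's actual MoE API) contrasting the two call shapes:

class Gate:
    """Stand-in router: returns one score per expert."""
    def __call__(self, hidden):
        return [0.7, 0.3]

class ExpertsWithExternalGate:
    """Removed shape: mlp.experts(hidden, mlp.gate) -- the caller supplies the gate."""
    def __call__(self, hidden, gate):
        return gate(hidden)

class FusedExpert:
    """Restored shape: mlp.expert(hidden) -- the gate lives inside the block."""
    def __init__(self):
        self.gate = Gate()
    def __call__(self, hidden):
        return self.gate(hidden)

fake_hidden_states = [0.0] * 8  # warm-up input, mirroring fake_hidden_states above
print(ExpertsWithExternalGate()(fake_hidden_states, Gate()))
print(FusedExpert()(fake_hidden_states))
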
@@ -799,7 +799,7 @@ class GPUModelRunner(ModelRunnerBase):
             output_padding_offset,
         ) = pre_process(
             self.share_inputs["input_ids"],
-            getattr(self.share_inputs, "seq_lens_this_time", self.seq_lens_this_time_buffer),
+            self.share_inputs["seq_lens_this_time"],
             self.speculative_decoding,
             (self.share_inputs["draft_tokens"] if self.speculative_decoding else None),
             self.share_inputs["seq_lens_encoder"],
@@ -884,7 +884,7 @@ class GPUModelRunner(ModelRunnerBase):
             max_len_tensor_cpu=self.share_inputs["max_len_tensor_cpu"],
             seq_lens_encoder=self.share_inputs["seq_lens_encoder"],
             seq_lens_decoder=self.share_inputs["seq_lens_decoder"],
-            seq_lens_this_time=getattr(self.share_inputs, "seq_lens_this_time", self.seq_lens_this_time_buffer),
+            seq_lens_this_time=self.share_inputs["seq_lens_this_time"],
             batch_id_per_token=self.share_inputs["batch_id_per_token"],
             cu_seqlens_q=self.share_inputs["cu_seqlens_q"],
             cu_seqlens_k=self.share_inputs["cu_seqlens_k"],
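
Both GPUModelRunner hunks make the same functional change: seq_lens_this_time is read by key instead of through getattr. Assuming share_inputs is a plain dict-like mapping, as the subscript form suggests, getattr consults attributes rather than keys, so the fallback buffer wins on every call. A self-contained demonstration of that pitfall:

# Stand-ins for self.share_inputs and self.seq_lens_this_time_buffer.
share_inputs = {"seq_lens_this_time": [3, 1, 1]}
seq_lens_this_time_buffer = [0, 0, 0]

# getattr inspects attributes, not dict keys, so the default is always returned:
via_getattr = getattr(share_inputs, "seq_lens_this_time", seq_lens_this_time_buffer)
assert via_getattr is seq_lens_this_time_buffer

# Subscripting reads the actual entry:
assert share_inputs["seq_lens_this_time"] == [3, 1, 1]
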
@@ -244,7 +244,6 @@ class PaddleDisWorkerProc:
         """
         while True:
             self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time())
-            num_running_requests = 0

             if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks() > 0:
                 tasks, read_finish = self.task_queue.get_tasks()
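
This hunk and the next both drop an initialization of num_running_requests (here inside the loop, below it ahead of the loop). As a general Python caveat, sketched with made-up names rather than the worker's real control flow, removing such an initializer is only safe if every read is preceded by an assignment on the same path; otherwise an idle first iteration raises UnboundLocalError:

def event_loop(task_batches):
    num_running_requests = 0  # the initializer these hunks remove
    for tasks in task_batches:
        if tasks:
            num_running_requests = len(tasks)  # assigned once work arrives
        # Without the initializer, this read on an idle first iteration would
        # raise UnboundLocalError (the assignment above makes the name a local).
        print("running:", num_running_requests)

event_loop([[], ["req-0", "req-1"], []])
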
@@ -272,8 +271,6 @@ class PaddleDisWorkerProc:
         self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8)
         mp_num_per_node = self.parallel_config.tensor_parallel_size // self.nnode
         req_ids = []
-        num_running_requests = 0
-
         while True:
             if self.local_rank == 0:
                 if self.model_weights_status.value[0] != 0:
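
One context line above is worth a note: (tensor_parallel_size + 7) // 8 is the integer ceiling-division idiom, so self.nnode is the number of 8-chip nodes needed to host the tensor-parallel group. A quick check:

# ceil(n / 8) without floats; 8 is the assumed chips-per-node.
for tp_size in (1, 7, 8, 9, 16):
    print(tp_size, "->", (tp_size + 7) // 8)  # 1, 1, 1, 2, 2
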
|