diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 5a0ee3f57..4a0250028 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -450,7 +450,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate) + self.ernie.layers[i].mlp.expert(fake_hidden_states) def forward( self, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 9a5895742..0720493aa 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -799,7 +799,7 @@ class GPUModelRunner(ModelRunnerBase): output_padding_offset, ) = pre_process( self.share_inputs["input_ids"], - getattr(self.share_inputs, "seq_lens_this_time", self.seq_lens_this_time_buffer), + self.share_inputs["seq_lens_this_time"], self.speculative_decoding, (self.share_inputs["draft_tokens"] if self.speculative_decoding else None), self.share_inputs["seq_lens_encoder"], @@ -884,7 +884,7 @@ class GPUModelRunner(ModelRunnerBase): max_len_tensor_cpu=self.share_inputs["max_len_tensor_cpu"], seq_lens_encoder=self.share_inputs["seq_lens_encoder"], seq_lens_decoder=self.share_inputs["seq_lens_decoder"], - seq_lens_this_time=getattr(self.share_inputs, "seq_lens_this_time", self.seq_lens_this_time_buffer), + seq_lens_this_time=self.share_inputs["seq_lens_this_time"], batch_id_per_token=self.share_inputs["batch_id_per_token"], cu_seqlens_q=self.share_inputs["cu_seqlens_q"], cu_seqlens_k=self.share_inputs["cu_seqlens_k"], diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 8ddd4bc90..e3fca7d7a 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -244,7 +244,6 @@ class PaddleDisWorkerProc: """ while True: self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time()) - num_running_requests = 0 if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks() > 0: tasks, read_finish = self.task_queue.get_tasks() @@ -272,8 +271,6 @@ class PaddleDisWorkerProc: self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8) mp_num_per_node = self.parallel_config.tensor_parallel_size // self.nnode req_ids = [] - num_running_requests = 0 - while True: if self.local_rank == 0: if self.model_weights_status.value[0] != 0: