From dd877f38b15969d88f0520d8bcdfce3b410f4e01 Mon Sep 17 00:00:00 2001
From: begin2023 <131694384+begin2023@users.noreply.github.com>
Date: Mon, 28 Jul 2025 11:38:29 +0800
Subject: [PATCH] [Perf] Remove unnecessary operations in non-cuda_graph
 (#3010)

* [Perf] Remove unnecessary operations in non-cuda_graph

* fix code logic

* use suggestion comment

* reduce function call

* reduce function call

* reduce function call

* reduce function call
---
 fastdeploy/worker/gcu_model_runner.py |  5 +++--
 fastdeploy/worker/gpu_model_runner.py | 19 ++++++++++++-------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 433ec85cb..26522044f 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -576,8 +576,9 @@ class GCUModelRunner(ModelRunnerBase):
         )
 
         # Update Batch type for cuda graph
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"] > 1).sum() > 0)
-        self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
+        self.forward_meta.step_use_cudagraph = self.use_cudagraph and (
+            not ((self.share_inputs["seq_lens_this_time"] > 1).sum() > 0)
+        )
 
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 0be973530..3f9014dc2 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -793,16 +793,21 @@ class GPUModelRunner(ModelRunnerBase):
 
         # Update Batch type for cuda graph
         # TODO(gongshaotian): Use seq_lens_encoder to set is_decode_batch
-        is_decode_batch = not ((self.share_inputs["seq_lens_this_time"] > 1).sum() > 0)
-
+        only_decode_batch = True
+        prefill_exists = None
         # mix ep in single node
         if self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.splitwise_role == "mixed":
-            is_decode_batch_list = []
-            paddle.distributed.all_gather_object(is_decode_batch_list, is_decode_batch)
-            is_decode_batch = all(is_decode_batch_list)
-            self.fd_config.parallel_config.moe_phase.phase = "decode" if is_decode_batch else "prefill"
+            only_decode_batch_list = []
+            prefill_exists = self.exist_prefill()
+            paddle.distributed.all_gather_object(only_decode_batch_list, not prefill_exists)
+            only_decode_batch = all(only_decode_batch_list)
+            self.fd_config.parallel_config.moe_phase.phase = "decode" if only_decode_batch else "prefill"
 
-        self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
+        self.forward_meta.step_use_cudagraph = (
+            self.use_cudagraph
+            and only_decode_batch
+            and not (prefill_exists if prefill_exists is not None else self.exist_prefill())
+        )
 
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
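
Editor's note: below is a minimal, standalone sketch (not part of the patch) of the decision logic the gpu_model_runner.py hunk introduces. It mimics the control flow with plain Python lists in place of paddle tensors, a boolean stand-in for the distributed all_gather_object collective, and plain function arguments in place of the real FastDeploy config objects; only the flow of only_decode_batch / prefill_exists mirrors the diff, everything else is illustrative.

# Sketch of the batch-type decision: CUDA graph is only used for a step when
# every request is in decode (one token this step) and no prefill exists.

def exist_prefill(seq_lens_this_time):
    """A prefill request still has more than one token to process this step."""
    return any(n > 1 for n in seq_lens_this_time)


def decide_step_use_cudagraph(seq_lens_this_time, use_cudagraph, use_ep,
                              splitwise_role, all_gather_object=None):
    only_decode_batch = True
    prefill_exists = None

    # Mixed EP in a single node: every rank must agree that no prefill exists
    # before the whole step can be treated as a decode-only batch.
    if use_ep and splitwise_role == "mixed":
        prefill_exists = exist_prefill(seq_lens_this_time)
        # The real code calls paddle.distributed.all_gather_object(); here a
        # caller-supplied stub stands in for the collective (single rank by default).
        gathered = (all_gather_object(not prefill_exists)
                    if all_gather_object else [not prefill_exists])
        only_decode_batch = all(gathered)
        moe_phase = "decode" if only_decode_batch else "prefill"
    else:
        moe_phase = None

    # exist_prefill() is re-evaluated only when the EP branch did not already
    # compute it -- this is the "reduce function call" part of the patch.
    step_use_cudagraph = (
        use_cudagraph
        and only_decode_batch
        and not (prefill_exists if prefill_exists is not None
                 else exist_prefill(seq_lens_this_time))
    )
    return step_use_cudagraph, moe_phase


if __name__ == "__main__":
    # Decode-only batch (every request emits one token): CUDA graph is eligible.
    print(decide_step_use_cudagraph([1, 1, 1], use_cudagraph=True,
                                    use_ep=False, splitwise_role="decode"))
    # A prefill request (seq len > 1) disables CUDA graph for this step.
    print(decide_step_use_cudagraph([128, 1, 1], use_cudagraph=True,
                                    use_ep=False, splitwise_role="decode"))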