diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index a67ac75e4..139a809c6 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -105,9 +105,9 @@ class GCUModelRunner(ModelRunnerBase):
             self.local_rank + int(self.parallel_config.engine_worker_queue_port)
         )
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        check whether prefill stage finished
+        check whether prefill stage exists
         """
         if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0:
             return 1
diff --git a/fastdeploy/worker/gcu_worker.py b/fastdeploy/worker/gcu_worker.py
index 1b98e3b0c..004e0e801 100644
--- a/fastdeploy/worker/gcu_worker.py
+++ b/fastdeploy/worker/gcu_worker.py
@@ -69,11 +69,11 @@ class GcuWorker(WorkerBase):
             local_rank=self.local_rank,
         )
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        check whether prefill stage finished
+        check whether prefill stage exists
         """
-        return self.model_runner.prefill_finished()
+        return self.model_runner.exist_prefill()
 
     def determine_available_memory(self) -> int:
         """
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 45dd69d59..ecdc4bf37 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -148,9 +148,9 @@ class GPUModelRunner(ModelRunnerBase):
             self.local_rank + int(self.parallel_config.engine_worker_queue_port)
         )
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        Check whether prefill stage finished
+        check whether prefill stage exists
         """
         if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0:
             return 1
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index 50ac4d231..13270757b 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -78,11 +78,11 @@ class GpuWorker(WorkerBase):
             local_rank=self.local_rank,
         )
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        Check whether prefill stage finished
+        check whether prefill stage exists
         """
-        return self.model_runner.prefill_finished()
+        return self.model_runner.exist_prefill()
 
     def determine_available_memory(self) -> int:
         """
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index f110ee64d..f3ef3823c 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -105,9 +105,9 @@ class IluvatarModelRunner(ModelRunnerBase):
             self.local_rank + int(self.parallel_config.engine_worker_queue_port)
         )
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        check whether prefill stage finished
+        check whether prefill stage exists
         """
         if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0:
             return 1
diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py
index 76fcb558b..8ed74c6fe 100644
--- a/fastdeploy/worker/iluvatar_worker.py
+++ b/fastdeploy/worker/iluvatar_worker.py
@@ -69,11 +69,11 @@ class IluvatarWorker(WorkerBase):
             local_rank=self.local_rank,
         )
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        check whether prefill stage finished
+        check whether prefill stage exists
         """
-        return self.model_runner.prefill_finished()
+        return self.model_runner.exist_prefill()
 
     def determine_available_memory(self) -> int:
         """
diff --git a/fastdeploy/worker/worker_base.py b/fastdeploy/worker/worker_base.py
index 281776bb0..8ff5a659f 100644
--- a/fastdeploy/worker/worker_base.py
+++ b/fastdeploy/worker/worker_base.py
@@ -96,6 +96,6 @@ class WorkerBase(ABC):
         """Basic health check (override for device-specific checks)."""
         return NotImplementedError
 
-    def prefill_finished(self):
-        """check whether prefill stage finished."""
+    def exist_prefill(self):
+        """check whether prefill stage exists."""
         return True
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 8a12988c4..72cd2b8ed 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -286,7 +286,7 @@ class PaddleDisWorkerProc:
         if self.local_rank % mp_num_per_node == 0:
             if self.task_queue.num_tasks() > 0:
                 # VL only support 1 batch to prefill
-                if not self.fd_config.model_config.enable_mm or not self.worker.prefill_finished():
+                if not self.fd_config.model_config.enable_mm or not self.worker.exist_prefill():
                     if self.nnode > 1:
                         self.task_queue.read_finish_flag.set(1)
                     else:
@@ -346,7 +346,7 @@ class PaddleDisWorkerProc:
             # Execute model to generate token. The generated token will be written to the buffer.
             # These generated tokens can be obtained through get_output op.
             self.worker.execute_model(req_dicts)
-            self.exist_prefill_task_signal.value[0] = self.worker.prefill_finished()
+            self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill()
 
     def initialize_kv_cache(self) -> None:
         """Profiles the peak memory usage of the model to determine how many
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 3240b217e..0c39cf3e0 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -584,9 +584,9 @@ class XPUModelRunner(ModelRunnerBase):
         logger.warn("XPU not support cuda graph currently")
         pass
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        check whether prefill stage finished
+        check whether prefill stage exists
         """
         if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0:
             return 1
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
index 49b4a74ac..5e1dc0c53 100644
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -143,11 +143,11 @@ class XpuWorker(WorkerBase):
         output = self.model_runner.execute_model(model_forward_batch)
         return output
 
-    def prefill_finished(self):
+    def exist_prefill(self):
         """
-        check whether prefill stage finished
+        check whether prefill stage exists
         """
-        return self.model_runner.prefill_finished()
+        return self.model_runner.exist_prefill()
 
     def preprocess_new_task(self, req_dicts: List[Request]) -> None:
         """Process new requests and then start the decode loop
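
Note on the rename: the old name `prefill_finished()` was misleading, since the method returns 1 when prefill work is still *present* in the batch, not when it has finished; `exist_prefill()` also matches the `exist_prefill_task_signal` it feeds in `worker_process.py`. A minimal standalone sketch of the semantics, assuming `seq_lens_encoder` is a `[max_num_seqs, 1]` tensor of per-slot prompt lengths (the shape and the free-standing helper are illustrative, not part of this patch):

```python
import paddle

def exist_prefill(seq_lens_encoder: paddle.Tensor) -> int:
    """Return 1 if any slot in the batch still has prefill (encoder) work, else 0."""
    # A nonzero encoder length means that slot is still processing its prompt.
    return 1 if int(paddle.max(seq_lens_encoder)) != 0 else 0

# One slot mid-prefill among two decode-only slots -> prefill exists.
assert exist_prefill(paddle.to_tensor([[0], [17], [0]], dtype="int32")) == 1
# All slots decoding -> no prefill work in flight.
assert exist_prefill(paddle.zeros([3, 1], dtype="int32")) == 0
```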