diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 907508883..d1116980c 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -195,7 +195,6 @@ class ResourceManagerV1(ResourceManager):
                 )
                 request.num_image_end = img_num_per_boundary[new_boundary_idx]
-                request.num_image_end = img_num_per_boundary[new_boundary_idx]
                 request.image_type_ids_start = np.sum(grid_thw[: request.num_image_start, 0])
                 request.image_type_ids_end = np.sum(grid_thw[: request.num_image_end, 0])
                 request.image_start = np.sum(np.prod(grid_thw[: request.num_image_start], axis=1))
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index ebe08669d..fc5026bdb 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -24,6 +24,7 @@ import paddle
 import paddle.distributed as dist
 from paddle.distributed import fleet
 
+from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
     DecodingConfig,
@@ -289,8 +290,9 @@ class PaddleDisWorkerProc:
             if self.local_rank % mp_num_per_node == 0:
                 if self.task_queue.num_tasks() > 0:
                     # VL only support 1 batch to prefill
-
-                    if not self.fd_config.model_config.enable_mm or not self.worker.exist_prefill():
+                    if envs.ENABLE_V1_KVCACHE_SCHEDULER or not (
+                        self.fd_config.model_config.enable_mm and self.worker.exist_prefill()
+                    ):
                         if self.nnode > 1 and self.parallel_config.tensor_parallel_size > self.max_chips_per_node:
                             self.task_queue.read_finish_flag.set(1)
                         else:
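
The worker_process.py hunk rewrites the dispatch gate: by De Morgan's law, `not A or not B` is equivalent to `not (A and B)`, so the only behavioral change is the new short-circuit on ENABLE_V1_KVCACHE_SCHEDULER, which lifts the "VL only supports 1 prefill batch" restriction when the v1 KV-cache scheduler is on. A minimal standalone sketch of that equivalence follows; it is not part of the patch, and `v1_scheduler`, `enable_mm`, and `has_prefill` are stand-in names for envs.ENABLE_V1_KVCACHE_SCHEDULER, fd_config.model_config.enable_mm, and worker.exist_prefill().

# Sketch only: exhaustively checks that the old and new gate conditions
# agree whenever the v1 KV-cache scheduler flag is off, and that the new
# gate is unconditionally open when it is on. Names are hypothetical
# stand-ins for the real config fields (see lead-in above).
from itertools import product

def old_gate(enable_mm: bool, has_prefill: bool) -> bool:
    # Before: dispatch unless this is a multimodal (VL) prefill batch.
    return not enable_mm or not has_prefill

def new_gate(v1_scheduler: bool, enable_mm: bool, has_prefill: bool) -> bool:
    # After: the v1 scheduler bypasses the VL single-batch restriction;
    # otherwise the De Morgan form `not (A and B)` matches the old check.
    return v1_scheduler or not (enable_mm and has_prefill)

for v1, mm, prefill in product([False, True], repeat=3):
    if v1:
        assert new_gate(v1, mm, prefill)  # gate always open under v1 scheduler
    else:
        assert new_gate(v1, mm, prefill) == old_gate(mm, prefill)
print("gates agree whenever ENABLE_V1_KVCACHE_SCHEDULER is off")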