[XPU] Fix performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3897)

* fix bug * fix bug * update * update * update
2025-10-05 16:48:03 +08:00 · 2025-09-08 10:34:46 +08:00
parent b649494655
commit 7833f2f6cb
4 changed files with 20 additions and 4 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -19,6 +19,8 @@ from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional

+import paddle
+
 from fastdeploy import envs
 from fastdeploy.config import (
    CacheConfig,
@@ -1005,7 +1007,10 @@ class EngineArgs:

        if self.max_num_batched_tokens is None:
            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
-                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
            else:
                if self.enable_chunked_prefill:
                    self.max_num_batched_tokens = 2048
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -363,7 +363,9 @@ class ResourceManagerV1(ResourceManager):
                while self.waiting and token_budget > 0:
                    if len(self.running) == self.max_num_seqs:
                        break
-                    if self.config.model_config.enable_mm and self.exist_prefill(scheduled_reqs):
+                    if (self.config.model_config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(
+                        scheduled_reqs
+                    ):
                        break
                    request = self.waiting[0]
                    if request.status == RequestStatus.WAITING: