[XPU] Fix performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3897)

* fix bug

* fix bug

* update

* update

* update
This commit is contained in:
yinwei
2025-09-08 10:34:46 +08:00
committed by GitHub
parent b649494655
commit 7833f2f6cb
4 changed files with 20 additions and 4 deletions

View File

@@ -19,6 +19,8 @@ from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional
import paddle
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
@@ -1005,7 +1007,10 @@ class EngineArgs:
if self.max_num_batched_tokens is None:
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
if paddle.is_compiled_with_xpu():
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
else:
if self.enable_chunked_prefill:
self.max_num_batched_tokens = 2048

View File

@@ -363,7 +363,9 @@ class ResourceManagerV1(ResourceManager):
while self.waiting and token_budget > 0:
if len(self.running) == self.max_num_seqs:
break
if self.config.model_config.enable_mm and self.exist_prefill(scheduled_reqs):
if (self.config.model_config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(
scheduled_reqs
):
break
request = self.waiting[0]
if request.status == RequestStatus.WAITING: