mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-01 06:42:23 +08:00
[XPU] Fixed the issue of performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3900)
* fix bug * fix bug * update * update * update
This commit is contained in:
@@ -1236,7 +1236,10 @@ class FDConfig:
|
|||||||
|
|
||||||
if self.max_num_batched_tokens is None:
|
if self.max_num_batched_tokens is None:
|
||||||
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
|
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
|
||||||
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
|
if paddle.is_compiled_with_xpu():
|
||||||
|
self.max_num_batched_tokens = self.max_model_len
|
||||||
|
else:
|
||||||
|
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
|
||||||
else:
|
else:
|
||||||
if self.cache_config.enable_chunked_prefill:
|
if self.cache_config.enable_chunked_prefill:
|
||||||
self.max_num_batched_tokens = 2048
|
self.max_num_batched_tokens = 2048
|
||||||
|
@@ -19,6 +19,8 @@ from dataclasses import asdict, dataclass
|
|||||||
from dataclasses import fields as dataclass_fields
|
from dataclasses import fields as dataclass_fields
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.config import (
|
from fastdeploy.config import (
|
||||||
CacheConfig,
|
CacheConfig,
|
||||||
@@ -1006,7 +1008,10 @@ class EngineArgs:
|
|||||||
|
|
||||||
if self.max_num_batched_tokens is None:
|
if self.max_num_batched_tokens is None:
|
||||||
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
|
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
|
||||||
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
|
if paddle.is_compiled_with_xpu():
|
||||||
|
self.max_num_batched_tokens = self.max_model_len
|
||||||
|
else:
|
||||||
|
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
|
||||||
else:
|
else:
|
||||||
if self.enable_chunked_prefill:
|
if self.enable_chunked_prefill:
|
||||||
self.max_num_batched_tokens = 2048
|
self.max_num_batched_tokens = 2048
|
||||||
|
@@ -345,7 +345,9 @@ class ResourceManagerV1(ResourceManager):
|
|||||||
while self.waiting and token_budget > 0:
|
while self.waiting and token_budget > 0:
|
||||||
if len(self.running) == self.max_num_seqs:
|
if len(self.running) == self.max_num_seqs:
|
||||||
break
|
break
|
||||||
if self.config.model_config.enable_mm and self.exist_prefill(scheduled_reqs):
|
if (self.config.model_config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(
|
||||||
|
scheduled_reqs
|
||||||
|
):
|
||||||
break
|
break
|
||||||
request = self.waiting[0]
|
request = self.waiting[0]
|
||||||
if request.status == RequestStatus.WAITING:
|
if request.status == RequestStatus.WAITING:
|
||||||
|
@@ -383,6 +383,7 @@ class XPUModelRunner(ModelRunnerBase):
|
|||||||
|
|
||||||
req_len = len(req_dicts)
|
req_len = len(req_dicts)
|
||||||
has_prefill_task = False
|
has_prefill_task = False
|
||||||
|
has_decode_task = False
|
||||||
for i in range(req_len):
|
for i in range(req_len):
|
||||||
request = req_dicts[i]
|
request = req_dicts[i]
|
||||||
idx = request.idx
|
idx = request.idx
|
||||||
@@ -392,6 +393,9 @@ class XPUModelRunner(ModelRunnerBase):
|
|||||||
prefill_end_index = request.prefill_end_index
|
prefill_end_index = request.prefill_end_index
|
||||||
length = prefill_end_index - prefill_start_index
|
length = prefill_end_index - prefill_start_index
|
||||||
input_ids = request.prompt_token_ids + request.output_token_ids
|
input_ids = request.prompt_token_ids + request.output_token_ids
|
||||||
|
logger.debug(
|
||||||
|
f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}"
|
||||||
|
)
|
||||||
self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
|
self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
|
||||||
input_ids[prefill_start_index:prefill_end_index]
|
input_ids[prefill_start_index:prefill_end_index]
|
||||||
)
|
)
|
||||||
@@ -401,6 +405,8 @@ class XPUModelRunner(ModelRunnerBase):
|
|||||||
self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
|
self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
|
||||||
request.block_tables, dtype="int32"
|
request.block_tables, dtype="int32"
|
||||||
)
|
)
|
||||||
|
if self.share_inputs["is_block_step"][idx]: # has tasks to continue to decode
|
||||||
|
has_decode_task = True
|
||||||
self.share_inputs["stop_flags"][idx : idx + 1] = False
|
self.share_inputs["stop_flags"][idx : idx + 1] = False
|
||||||
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
|
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
|
||||||
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
|
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
|
||||||
@@ -474,7 +480,7 @@ class XPUModelRunner(ModelRunnerBase):
|
|||||||
self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
|
self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
|
||||||
request.get("stop_token_ids"), dtype="int64"
|
request.get("stop_token_ids"), dtype="int64"
|
||||||
)
|
)
|
||||||
if has_prefill_task:
|
if has_prefill_task or has_decode_task:
|
||||||
self.share_inputs["not_need_stop"][0] = True
|
self.share_inputs["not_need_stop"][0] = True
|
||||||
|
|
||||||
def process_prefill_inputs(self, req_dicts: List[Request]):
|
def process_prefill_inputs(self, req_dicts: List[Request]):
|
||||||
|
Reference in New Issue
Block a user