Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 16:48:03 +08:00)
Unify server-side and model-side Config (Part1) (#3018)
* move cache config
* fix mtp
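For orientation, below is a minimal sketch of the cache-side settings this change reads from self.cache_config in the hunks that follow. The field names are taken from the diff; the class shape and default values are illustrative assumptions, not the actual FastDeploy CacheConfig definition.

# Illustrative sketch only -- field names from the diff below, defaults assumed.
from dataclasses import dataclass

@dataclass
class CacheConfig:
    block_size: int = 64              # tokens held per KV-cache block
    enc_dec_block_num: int = 2        # extra blocks reserved per request
    kv_cache_ratio: float = 0.75      # share of blocks budgeted for prefill
    enable_chunked_prefill: bool = False
    enable_prefix_caching: bool = False

# Model-runner code now reads these knobs via self.cache_config instead of
# self.parallel_config, e.g. "if self.cache_config.enable_chunked_prefill: ...",
# while parallelism-only fields (tensor_parallel_size, splitwise_role, use_ep,
# max_model_len, total_block_num) stay on self.parallel_config.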
@@ -339,7 +339,7 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["prompt_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids)
 
             # Use chunked prefill
-            if self.parallel_config.enable_chunked_prefill:
+            if self.cache_config.enable_chunked_prefill:
                 request.set("chunk_idx", 1)
                 logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}")
                 token_chunk_size = request.prefill_chunk_info[0]
@@ -467,10 +467,10 @@ class GPUModelRunner(ModelRunnerBase):
             num_tokens // batch_size,
             self.parallel_config.max_model_len - max_dec_len,
         )
-        input_length = int(full_length * self.parallel_config.kv_cache_ratio)
+        input_length = int(full_length * self.cache_config.kv_cache_ratio)
         block_num = (
-            input_length + self.parallel_config.block_size - 1
-        ) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
+            input_length + self.cache_config.block_size - 1
+        ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
 
         for i in range(batch_size):
             idx = i
@@ -602,15 +602,15 @@ class GPUModelRunner(ModelRunnerBase):
 
         # Set block tables
         pre_max_block_num = (
-            self.parallel_config.max_model_len + self.parallel_config.block_size - 1
-        ) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
+            self.parallel_config.max_model_len + self.cache_config.block_size - 1
+        ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
         self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
 
         # Initialize free list
         free_list = list(
             range(
                 self.parallel_config.total_block_num - 1,
-                int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
+                int(self.parallel_config.total_block_num * self.cache_config.kv_cache_ratio) - 1,
                 -1,
             )
         )
@@ -689,7 +689,7 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["step_seq_lens_decoder"],
             self.share_inputs["block_tables"],
             self.share_inputs["is_block_step"],
-            self.parallel_config.block_size,
+            self.cache_config.block_size,
         )
 
         # Remove padding
@@ -833,9 +833,7 @@ class GPUModelRunner(ModelRunnerBase):
         )
         local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
 
-        if not profile and (
-            self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"
-        ):
+        if not profile and (self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"):
             cache_kvs_list = []
             for i in range(self.model_config.num_hidden_layers):
                 key_cache = paddle.empty(shape=[], dtype=cache_type)
@@ -1015,7 +1013,7 @@ class GPUModelRunner(ModelRunnerBase):
                 sampler_output=sampler_output,
                 model_output=model_output_data,
                 share_inputs=self.share_inputs,
-                block_size=self.parallel_config.block_size,
+                block_size=self.cache_config.block_size,
                 speculative_decoding=self.speculative_decoding,
                 skip_save_output=True,
             )
@@ -1031,10 +1029,10 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
             step_cuda(
                 self.share_inputs,
-                self.parallel_config.block_size,
-                self.parallel_config.enc_dec_block_num,
+                self.cache_config.block_size,
+                self.cache_config.enc_dec_block_num,
                 self.speculative_config,
-                self.parallel_config.enable_prefix_caching,
+                self.cache_config.enable_prefix_caching,
             )
 
             if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
@@ -1044,7 +1042,7 @@ class GPUModelRunner(ModelRunnerBase):
         """
        Update chunked prefill related parameters
         """
-        if not self.parallel_config.enable_chunked_prefill:
+        if not self.cache_config.enable_chunked_prefill:
             return
         for task in tasks:
             if task.get("prefill_chunk_info", None) is None:
@@ -1144,7 +1142,7 @@ class GPUModelRunner(ModelRunnerBase):
             A list of indices corresponding to the requests that need to be skipped.
         """
         skip_idx_list = []
-        if not self.parallel_config.enable_chunked_prefill or self.guided_backend is None:
+        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
             return skip_idx_list
 
         for task in model_forward_batch:
@@ -1283,7 +1281,7 @@ class GPUModelRunner(ModelRunnerBase):
             sampler_output=sampler_output,
             model_output=model_output_data,
             share_inputs=self.share_inputs,
-            block_size=self.parallel_config.block_size,
+            block_size=self.cache_config.block_size,
             save_each_rank=self.parallel_config.use_ep,
             speculative_decoding=self.speculative_decoding,
             skip_save_output=skip_save_output,
@@ -1302,10 +1300,10 @@ class GPUModelRunner(ModelRunnerBase):
         if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
             step_cuda(
                 self.share_inputs,
-                self.parallel_config.block_size,
-                self.parallel_config.enc_dec_block_num,
+                self.cache_config.block_size,
+                self.cache_config.enc_dec_block_num,
                 self.speculative_config,
-                self.parallel_config.enable_prefix_caching,
+                self.cache_config.enable_prefix_caching,
             )
 
         self._update_chunked_prefill(model_forward_batch)
@@ -1379,7 +1377,7 @@ class GPUModelRunner(ModelRunnerBase):
         free_list = list(
             range(
                 self.num_gpu_blocks - 1,
-                int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1,
+                int(self.num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
                 -1,
             )
         )
@@ -1425,7 +1423,7 @@ class GPUModelRunner(ModelRunnerBase):
             if self.speculative_method in ["mtp"]
             else self.model_config.num_hidden_layers
         )
-        required_memory = byte_of_dtype * 2 * (self.parallel_config.block_size * hidden_dim) * num_layers  # k + v
+        required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers  # k + v
         return required_memory
 
     def not_need_stop(self) -> bool:
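As a quick sanity check of the block arithmetic the hunks above keep intact, here are the same formulas evaluated with made-up numbers (block_size, kv_cache_ratio, hidden_dim, num_layers are illustrative, not FastDeploy defaults):

# Illustrative numbers only; the formulas mirror the hunks above.
block_size = 64
enc_dec_block_num = 2
kv_cache_ratio = 0.75

# Ceiling division: 1000 prompt tokens need 16 blocks of 64 tokens each,
# plus the reserved encoder/decoder blocks -> 18 blocks.
input_length = 1000
block_num = (input_length + block_size - 1) // block_size + enc_dec_block_num
assert block_num == 18

# Free list: block ids above the kv_cache_ratio cutoff, handed out top-down.
total_block_num = 100
free_list = list(range(total_block_num - 1, int(total_block_num * kv_cache_ratio) - 1, -1))
assert free_list[0] == 99 and free_list[-1] == 75 and len(free_list) == 25

# Theoretical KV-cache memory per block, matching required_memory (k + v -> factor 2).
byte_of_dtype = 2   # e.g. bfloat16
hidden_dim = 4096
num_layers = 32
required_memory = byte_of_dtype * 2 * (block_size * hidden_dim) * num_layers
assert required_memory == 32 * 1024 * 1024  # 32 MiB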