Unify server-side and model-side Config (Part 1) (#3018)

* move cache config

* fix MTP
This commit is contained in:
YuanRisheng
2025-07-28 10:51:52 +08:00
committed by GitHub
parent 8f426c1690
commit 6ccc10ad47
23 changed files with 243 additions and 289 deletions

View File

@@ -428,15 +428,15 @@ class XPUModelRunner(ModelRunnerBase):
# Set block tables
pre_max_block_num = (
self.parallel_config.max_model_len + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
self.parallel_config.max_model_len + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
# Initialize free list
free_list = list(
range(
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
int(self.parallel_config.total_block_num * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
@@ -598,8 +598,8 @@ class XPUModelRunner(ModelRunnerBase):
full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10)
input_length = int(full_length - 512)
block_num = (
input_length + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
input_length + self.cache_config.block_size - 1
) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
for i in range(batch_size):
idx = i
@@ -707,8 +707,8 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
step_paddle(
self.share_inputs,
self.parallel_config.block_size,
self.parallel_config.enc_dec_block_num,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
)
return None
@@ -764,7 +764,7 @@ class XPUModelRunner(ModelRunnerBase):
required_memory = (
byte_of_dtype
* 2 # k + v
* (self.parallel_config.block_size * hidden_dim)
* (self.cache_config.block_size * hidden_dim)
* self.model_config.num_hidden_layers
)
return required_memory
@@ -784,7 +784,7 @@ class XPUModelRunner(ModelRunnerBase):
free_list = list(
range(
self.num_gpu_blocks - 1,
int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1,
int(self.num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)