[BugFix] fix CPU prefix cache bug (#5544)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

* fix_dy_c8_bug

* add block_num check

* fix test case

* update ci case
This commit is contained in:
kevin
2025-12-16 14:21:42 +08:00
committed by GitHub
parent 5d2b16e6f3
commit c9b47f90ce
5 changed files with 23 additions and 8 deletions

View File

@@ -66,6 +66,13 @@ def parse_args():
choices=["uint8", "bfloat16", "block_wise_fp8"],
help="cache dtype",
)
parser.add_argument(
"--default_dtype",
type=str,
default="bfloat16",
choices=["float16", "bfloat16", "uint8"],
help="paddle default dtype, swap_cache_batch only support float16、bfloat16 and uint8 now",
)
parser.add_argument("--key_cache_shape", type=str, default="", help="key cache shape")
parser.add_argument("--value_cache_shape", type=str, default="", help="value cache shape")
parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port")
@@ -124,6 +131,7 @@ class CacheTransferManager:
self.num_gpu_blocks = self.key_cache_shape[0]
self.num_extra_layers = self.speculative_config.num_extra_cache_layer
self.num_extra_layer_gpu_blocks = int(self.num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio)
paddle.set_default_dtype(args.default_dtype)
self.swap_to_cpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
self.swap_to_gpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)

View File

@@ -279,6 +279,7 @@ class PrefixCacheManager:
+ f" --local_data_parallel_id {self.local_data_parallel_id}"
+ f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}"
+ f" --speculative_config '{self.speculative_config.to_json_string()}'"
+ f" --default_dtype '{self.config.model_config.dtype}'"
+ (" --create_cache_tensor" if create_cache_tensor else "")
+ f" >{log_dir}/launch_cache_transfer_manager_tprank{i}.log 2>&1"
)

View File

@@ -1355,9 +1355,11 @@ class CacheConfig:
self.prefill_kvcache_block_num = self.total_block_num
else:
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
assert (
self.prefill_kvcache_block_num >= self.max_block_num_per_seq
), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}"
assert self.prefill_kvcache_block_num >= self.max_block_num_per_seq + self.enc_dec_block_num, (
f"prefill_kvcache_block_num: {self.prefill_kvcache_block_num} should be larger "
f"than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce "
"the max_model_len or increase num_gpu_blocks_override"
)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -1378,9 +1380,11 @@ class CacheConfig:
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)
assert (
self.prefill_kvcache_block_num >= self.max_block_num_per_seq
), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}"
assert self.prefill_kvcache_block_num >= self.max_block_num_per_seq + self.enc_dec_block_num, (
f"current device block num: {self.prefill_kvcache_block_num} "
f"should be larger than or equal to {self.max_block_num_per_seq + self.enc_dec_block_num}, please reduce "
"the max_model_len or replace the machine with larger GPU cards"
)
def print(self):
"""

View File

@@ -40,6 +40,7 @@ class Args:
value_cache_shape = ""
create_cache_tensor = False
cache_dtype = "bfloat16"
default_dtype = "bfloat16"
# ==========================

View File

@@ -253,8 +253,9 @@ class TestCommonEngineAdditionalCoverage(unittest.TestCase):
with patch("fastdeploy.engine.args_utils.envs.ENABLE_V1_KVCACHE_SCHEDULER", 0):
cfg = self._make_cfg(
splitwise_role="prefill",
num_gpu_blocks_override=3,
num_gpu_blocks_override=4,
router="0.0.0.0:30000",
kv_cache_ratio=1,
)
# Patch EngineWorkerQueue before EngineService ctor to avoid real IPC
@@ -337,7 +338,7 @@ class TestCommonEngineAdditionalCoverage(unittest.TestCase):
def test_start_mixed_branch_cache_after_load_and_zmq(self):
"""Cover lines 215-217 and 231 in start()."""
cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=2)
cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4)
class DummyQ:
def __init__(self, *a, **k):