Unify server-side and model-side Config (Part1) (#3018)

* Move cache configuration into a dedicated CacheConfig

* Fix MTP (multi-token prediction)
Author: YuanRisheng
Date: 2025-07-28 10:51:52 +08:00
Committed by: GitHub
Parent: 8f426c1690
Commit: 6ccc10ad47

23 changed files with 243 additions and 289 deletions


@@ -25,6 +25,7 @@ import paddle.distributed as dist
 from paddle.distributed import fleet
 from fastdeploy.config import (
+    CacheConfig,
     DecodingConfig,
     DeviceConfig,
     ErnieArchitectures,
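
For readers following the diff: each of these config classes is constructed from the full CLI-argument dict (see the CacheConfig(vars(args)) call further down), which suggests every class picks out just the keys it owns. Below is a minimal sketch of that pattern; enable_prefix_caching is the only field confirmed by this diff, while block_size and gpu_memory_utilization are hypothetical placeholders, not the actual FastDeploy fields.

# Minimal sketch of the config-class pattern, not the actual FastDeploy class.
# enable_prefix_caching is confirmed by this diff; the other two fields are
# hypothetical placeholders.
class CacheConfig:
    def __init__(self, args: dict):
        self.enable_prefix_caching = bool(args.get("enable_prefix_caching", False))
        self.block_size = int(args.get("block_size", 64))  # hypothetical
        self.gpu_memory_utilization = float(args.get("gpu_memory_utilization", 0.9))  # hypothetical

# Mirrors the call site below: CacheConfig(vars(args)).
cfg = CacheConfig({"enable_prefix_caching": True})
print(cfg.enable_prefix_caching)  # True
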
@@ -140,6 +141,7 @@ class PaddleDisWorkerProc:
         self.local_rank = local_rank
         self.fd_config = fd_config
         self.parallel_config = fd_config.parallel_config
+        self.cache_config = fd_config.cache_config
         # TODO(gongshaotian): Use worker factory to get worker
         self.worker = get_worker(fd_config=fd_config, local_rank=self.local_rank, rank=self.ranks)
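
Storing fd_config.cache_config as a direct attribute mirrors the existing self.parallel_config alias, keeping later call sites short (the check in the next hunk reads self.cache_config.enable_prefix_caching). A small sketch of the aliasing pattern, with SimpleNamespace standing in for the real config objects:

from types import SimpleNamespace

# SimpleNamespace stand-ins; only the attributes this diff touches are modeled.
fd_config = SimpleNamespace(
    parallel_config=SimpleNamespace(splitwise_role="mixed"),
    cache_config=SimpleNamespace(enable_prefix_caching=False),
)

class WorkerProcSketch:
    def __init__(self, fd_config):
        self.fd_config = fd_config
        # Alias frequently used sub-configs, as the hunk above does.
        self.parallel_config = fd_config.parallel_config
        self.cache_config = fd_config.cache_config

proc = WorkerProcSketch(fd_config)
print(proc.cache_config.enable_prefix_caching)  # False
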
@@ -404,7 +406,7 @@ class PaddleDisWorkerProc:
         logger.info(f"------- num_blocks_global: {num_blocks_local} --------")
         # wait engine launch cache_manager
-        if self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed":
+        if self.cache_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed":
             launched_cache_manager_signal_data = np.zeros([1], dtype=np.int32)
             self.launched_cache_manager_signal = IPCSignal(
                 name="launched_cache_manager_signal",
@@ -607,6 +609,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     decoding_config = DecodingConfig(vars(args))
     speculative_config = SpeculativeConfig(vars(args))
     parallel_config = ParallelConfig(vars(args))
+    cache_config = CacheConfig(vars(args))
     parallel_config.tensor_parallel_size = args.tensor_parallel_size
     parallel_config.tensor_parallel_rank = local_rank % args.tensor_parallel_size
     parallel_config.expert_parallel_size = args.expert_parallel_size
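
vars(args) converts the argparse Namespace into a plain dict, so all of the config constructors above receive one common mapping and each takes what it needs. A self-contained illustration using two option names that appear in this diff (their exact CLI spellings in FastDeploy may differ):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable_prefix_caching", action="store_true")
parser.add_argument("--tensor_parallel_size", type=int, default=1)
args = parser.parse_args(["--enable_prefix_caching", "--tensor_parallel_size", "2"])

# vars() yields the dict shape that CacheConfig(vars(args)) and the other
# constructors above consume.
print(vars(args))  # {'enable_prefix_caching': True, 'tensor_parallel_size': 2}
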
@@ -707,6 +710,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         decoding_config=decoding_config,
         quant_config=quant_config,
         graph_opt_config=graph_opt_config,
+        cache_config=cache_config,
     )
     update_fd_config_for_mm(fd_config)
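
The net effect of the commit: cache settings now ride inside the single fd_config object instead of piggybacking on ParallelConfig, so server-side and model-side code read them the same way. An end-to-end sketch with stand-in dataclasses (the real FDConfig accepts the full set of sub-configs listed in the hunk above):

from dataclasses import dataclass, field

# Stand-in dataclasses; the real classes live in fastdeploy.config and
# carry many more fields.
@dataclass
class CacheConfig:
    enable_prefix_caching: bool = False

@dataclass
class FDConfig:
    cache_config: CacheConfig = field(default_factory=CacheConfig)

fd_config = FDConfig(cache_config=CacheConfig(enable_prefix_caching=True))
# Downstream code reads cache settings through the unified config, as
# PaddleDisWorkerProc does above:
print(fd_config.cache_config.enable_prefix_caching)  # True
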