[BugFix] fix total_block_num init error in worker_process (#4553)
* fix total_block_num init error in worker_process
* fix req and token client
* fix req and token client
* fix xpu xi
* fix xpu ci
@@ -503,7 +503,6 @@ class LLMEngine:
|
||||
f" --tensor_parallel_size {self.cfg.parallel_config.tensor_parallel_size}"
|
||||
f" --engine_worker_queue_port {ports}"
|
||||
f" --pod_ip {self.cfg.master_ip}"
|
||||
f" --total_block_num {self.cfg.cache_config.total_block_num}"
|
||||
f" --block_size {self.cfg.cache_config.block_size}"
|
||||
f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
|
||||
f" --eos_tokens_lens {self.engine.data_processor.eos_token_id_len}"
|
||||
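The bare f-strings in this hunk rely on Python's implicit concatenation of adjacent string literals to build one long worker argument string. A minimal, self-contained sketch of that pattern; the `SimpleNamespace` config and the values below are made up purely for illustration:

```python
from types import SimpleNamespace

# Hypothetical config object; field names mirror the flags above, values are invented.
cfg = SimpleNamespace(tensor_parallel_size=2, block_size=64, enc_dec_block_num=2)
ports = "9923"

# Adjacent f-string literals are merged at parse time, so this stays a single expression.
arguments = (
    f" --tensor_parallel_size {cfg.tensor_parallel_size}"
    f" --engine_worker_queue_port {ports}"
    f" --block_size {cfg.block_size}"
    f" --enc_dec_block_num {cfg.enc_dec_block_num}"
)
print(arguments)
# --tensor_parallel_size 2 --engine_worker_queue_port 9923 --block_size 64 --enc_dec_block_num 2
```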
@@ -538,7 +537,7 @@ class LLMEngine:

```python
        if self.cfg.structured_outputs_config.logits_processors is not None:
            arguments += f" --logits-processors {' '.join(self.cfg.structured_outputs_config.logits_processors)}"

        worker_append_flag = {
        worker_store_true_flag = {
            "enable_expert_parallel": self.cfg.parallel_config.enable_expert_parallel,
            "enable_prefix_caching": self.cfg.cache_config.enable_prefix_caching,
            "enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
```
@@ -549,9 +548,17 @@ class LLMEngine:

```python
            "enable_logprob": self.cfg.model_config.enable_logprob,
            "lm_head_fp32": self.cfg.model_config.lm_head_fp32,
        }
        for worker_flag, value in worker_append_flag.items():
        for worker_flag, value in worker_store_true_flag.items():
            if value:
                arguments = arguments + f" --{worker_flag}"

        worker_default_none_flag = {
            "num_gpu_blocks_override": self.cfg.cache_config.num_gpu_blocks_override,
        }
        for worker_flag, value in worker_default_none_flag.items():
            if value:
                arguments = arguments + f" --{worker_flag} {value}"

        if self.cfg.nnode > 1:
            pd_cmd = pd_cmd + f" --ips {ips} --nnodes {len(self.cfg.ips)}"
        pd_cmd = pd_cmd + arguments + f" 2>{log_dir}/launch_worker.log"
```
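The two dictionaries in this hunk encode two flag-emission rules for the worker command line: boolean config fields become bare `--flag` switches only when truthy, while value-bearing fields that default to `None` are emitted as `--flag value` only when a value was actually set. A minimal sketch of the pattern, with made-up flag names and values:

```python
arguments = ""

# Boolean switches: emit a bare flag only when the config value is truthy.
store_true_flags = {
    "enable_prefix_caching": True,
    "enable_chunked_prefill": False,
}
for flag, value in store_true_flags.items():
    if value:
        arguments += f" --{flag}"

# Value flags that default to None: emit "--flag value" only when a value was provided.
default_none_flags = {
    "num_gpu_blocks_override": 1024,
}
for flag, value in default_none_flags.items():
    if value:
        arguments += f" --{flag} {value}"

print(arguments)  # " --enable_prefix_caching --num_gpu_blocks_override 1024"
```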
@@ -480,7 +480,7 @@ def parse_args():

```python
        help="model dir",
    )
    parser.add_argument("-mbs", "--max_num_seqs", type=int, default=34, help="max batch size")
    parser.add_argument("--total_block_num", type=int, default=2000)
    parser.add_argument("--num_gpu_blocks_override", type=int, default=None)
    parser.add_argument("--block_size", type=int, default=64)
    parser.add_argument("--pod_ip", type=str, default="127.0.0.1")
    parser.add_argument("--engine_worker_queue_port", type=str, default="9923")
```
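A standalone sketch of how `argparse` resolves these defaults; the argument definitions are copied from the hunk above, while the sample command line is invented for illustration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-mbs", "--max_num_seqs", type=int, default=34, help="max batch size")
parser.add_argument("--total_block_num", type=int, default=2000)
parser.add_argument("--num_gpu_blocks_override", type=int, default=None)
parser.add_argument("--block_size", type=int, default=64)

# Made-up command line: only num_gpu_blocks_override is passed, the rest keep their defaults.
args = parser.parse_args(["--num_gpu_blocks_override", "1024"])
print(args.total_block_num)          # 2000 (default)
print(args.num_gpu_blocks_override)  # 1024
```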
@@ -715,6 +715,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:

```python
    parallel_config = ParallelConfig(vars(args))
    cache_config = CacheConfig(vars(args))
    scheduler_config = SchedulerConfig(vars(args))

    parallel_config.tensor_parallel_rank = local_rank % parallel_config.tensor_parallel_size
    parallel_config.data_parallel_rank = local_rank // parallel_config.tensor_parallel_size
    # config for EP
```
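The two rank assignments split a flat `local_rank` into a tensor-parallel coordinate and a data-parallel coordinate. A small worked example with illustrative sizes:

```python
tensor_parallel_size = 4  # illustrative value

# With 8 local workers and tensor_parallel_size=4, ranks 0-3 form data-parallel
# replica 0 and ranks 4-7 form data-parallel replica 1.
for local_rank in range(8):
    tensor_parallel_rank = local_rank % tensor_parallel_size
    data_parallel_rank = local_rank // tensor_parallel_size
    print(f"local_rank={local_rank} -> tp_rank={tensor_parallel_rank}, dp_rank={data_parallel_rank}")
```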