mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[FDConfig]Remove reasoning_parser/guided_decoding_backend/disable_any_whitespace/device_ids in FDConfig (#4362)
* remove devices id * fix unittest * fix ce --------- Co-authored-by: root <root@yqlcc01-sys-rpm12rzmwjd.yqlcc01.baidu.com>
This commit is contained in:
@@ -90,7 +90,7 @@ class LLMEngine:
|
||||
|
||||
self.input_processor = InputPreprocessor(
|
||||
cfg.tokenizer,
|
||||
cfg.reasoning_parser,
|
||||
cfg.structured_outputs_config.reasoning_parser,
|
||||
cfg.limit_mm_per_prompt,
|
||||
cfg.mm_processor_kwargs,
|
||||
cfg.model_config.enable_mm,
|
||||
@@ -128,7 +128,7 @@ class LLMEngine:
|
||||
|
||||
# If block numer is specified and model is deployed in mixed mode, start cache manager first
|
||||
if not self.do_profile and self.cfg.scheduler_config.splitwise_role != "mixed":
|
||||
device_ids = self.cfg.device_ids.split(",")
|
||||
device_ids = self.cfg.parallel_config.device_ids.split(",")
|
||||
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, True)
|
||||
|
||||
# Start workers
|
||||
@@ -162,7 +162,7 @@ class LLMEngine:
|
||||
if self.do_profile:
|
||||
self._stop_profile()
|
||||
elif self.cfg.cache_config.enable_prefix_caching:
|
||||
device_ids = self.cfg.device_ids.split(",")
|
||||
device_ids = self.cfg.parallel_config.device_ids.split(",")
|
||||
self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, False)
|
||||
|
||||
# Launch components: scheduler, cache_manager, expert_service et.al.
|
||||
@@ -426,7 +426,7 @@ class LLMEngine:
|
||||
"""
|
||||
variables = {
|
||||
"ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY": 0,
|
||||
"LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.device_ids.split(",")),
|
||||
"LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.parallel_config.device_ids.split(",")),
|
||||
"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
|
||||
"FLAGS_use_append_attn": 1,
|
||||
"NCCL_ALGO": "Ring",
|
||||
@@ -503,11 +503,11 @@ class LLMEngine:
|
||||
if self.cfg.ips is not None:
|
||||
ips = ",".join(self.cfg.ips)
|
||||
arguments = (
|
||||
f" --devices {self.cfg.device_ids} {py_script}"
|
||||
f" --devices {self.cfg.parallel_config.device_ids} {py_script}"
|
||||
f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.model_config.max_model_len}"
|
||||
f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
|
||||
f" --model {self.cfg.model_config.model!s}"
|
||||
f" --device_ids {self.cfg.device_ids}"
|
||||
f" --device_ids {self.cfg.parallel_config.device_ids}"
|
||||
f" --tensor_parallel_size {self.cfg.parallel_config.tensor_parallel_size}"
|
||||
f" --engine_worker_queue_port {ports}"
|
||||
f" --pod_ip {self.cfg.master_ip}"
|
||||
@@ -527,10 +527,10 @@ class LLMEngine:
|
||||
f" --think_end_id {self.cfg.model_config.think_end_id}"
|
||||
f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
|
||||
f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
|
||||
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
|
||||
f" --guided_decoding_backend {self.cfg.structured_outputs_config.guided_decoding_backend}"
|
||||
f" --load_strategy {self.cfg.load_config.load_strategy}"
|
||||
f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
|
||||
f" --reasoning_parser {self.cfg.reasoning_parser}"
|
||||
f" --reasoning_parser {self.cfg.structured_outputs_config.reasoning_parser}"
|
||||
f" --load_choices {self.cfg.load_config.load_choices}"
|
||||
f" --plas_attention_config '{self.cfg.plas_attention_config.to_json_string()}'"
|
||||
f" --ips {ips}"
|
||||
@@ -546,7 +546,7 @@ class LLMEngine:
|
||||
"enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
|
||||
"do_profile": self.do_profile,
|
||||
"dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
|
||||
"disable_any_whitespace": self.cfg.disable_any_whitespace,
|
||||
"disable_any_whitespace": self.cfg.structured_outputs_config.disable_any_whitespace,
|
||||
"disable_custom_all_reduce": self.cfg.parallel_config.disable_custom_all_reduce,
|
||||
"enable_logprob": self.cfg.model_config.enable_logprob,
|
||||
"lm_head_fp32": self.cfg.model_config.lm_head_fp32,
|
||||
@@ -643,7 +643,7 @@ class LLMEngine:
|
||||
self.cfg.cache_config.reset(num_gpu_blocks)
|
||||
self.engine.resource_manager.reset_cache_config(self.cfg.cache_config)
|
||||
if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
|
||||
device_ids = self.cfg.device_ids.split(",")
|
||||
device_ids = self.cfg.parallel_config.device_ids.split(",")
|
||||
self.cache_manager_processes = self.engine.start_cache_service(
|
||||
device_ids, self.ipc_signal_suffix, self.cfg.scheduler_config.splitwise_role != "mixed"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user