Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-04 16:22:57 +08:00
[FDConfig] Remove splitwise_role and engine_worker_queue_port in FDConfig (#4147)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* remove splitwise_role and engine_worker_queue_port
* fix xpu
* fix xpu
* fix xpu
* fix unittest
* resolve conflict
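In short, splitwise_role now lives on scheduler_config and engine_worker_queue_port on parallel_config, so call sites read them through those sub-configs instead of FDConfig directly. Below is a minimal sketch of the new access pattern; the classes here are hypothetical simplified stand-ins, not the real FastDeploy config classes, which carry many more fields.

# Sketch only: SchedulerConfig/ParallelConfig/FDConfig below are reduced
# stand-ins for illustration, assuming the field relocation in this commit.
from dataclasses import dataclass, field
from typing import List


@dataclass
class SchedulerConfig:
    # "mixed", "prefill", or "decode" -- previously read as FDConfig.splitwise_role
    splitwise_role: str = "mixed"


@dataclass
class ParallelConfig:
    # one queue port per data-parallel rank -- previously FDConfig.engine_worker_queue_port
    engine_worker_queue_port: List[str] = field(default_factory=lambda: ["8002"])
    local_data_parallel_id: int = 0


@dataclass
class FDConfig:
    scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)


cfg = FDConfig()
# Old call sites used cfg.splitwise_role / cfg.engine_worker_queue_port;
# after this change they go through the sub-configs:
assert cfg.scheduler_config.splitwise_role in ["mixed", "prefill", "decode"]
port = cfg.parallel_config.engine_worker_queue_port[cfg.parallel_config.local_data_parallel_id]
print(port)  # -> "8002"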
@@ -296,8 +296,6 @@ class ParallelConfig:
         # Do profile or not
         self.do_profile: bool = False
 
-        # splitwise role
-        self.splitwise_role: str = "mixed"
         # guided decoding backend
         self.guided_decoding_backend: str = None
         # disable any whitespace for guided decoding
@@ -319,14 +317,6 @@ class ParallelConfig:
         else:
             self.expert_parallel_size = 1
         self.use_ep = self.expert_parallel_size > 1
-        if self.splitwise_role == "mixed":
-            self.moe_phase = MoEPhase(phase="prefill")
-        elif self.splitwise_role == "prefill":
-            self.moe_phase = MoEPhase(phase="prefill")
-        elif self.splitwise_role == "decode":
-            self.moe_phase = MoEPhase(phase="decode")
-        else:
-            raise NotImplementedError
 
         # pd_disaggregation
         use_pd_disaggregation: int = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
@@ -1116,10 +1106,8 @@ class FDConfig:
         max_model_len: int = 8192,
         ips: str = None,
         use_warmup: bool = False,
-        engine_worker_queue_port: str = "8002",
         limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-        splitwise_role: str = "mixed",
         innode_prefill_ports: Optional[List[int]] = None,
         max_num_partial_prefills: int = 1,
         max_long_partial_prefills: int = 1,
@@ -1182,7 +1170,6 @@ class FDConfig:
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
-        self.splitwise_role = splitwise_role
         self.innode_prefill_ports = innode_prefill_ports
         self.max_num_partial_prefills = max_num_partial_prefills
         self.max_long_partial_prefills = max_long_partial_prefills
@@ -1190,11 +1177,7 @@ class FDConfig:
         self.reasoning_parser = reasoning_parser
         self.guided_decoding_backend = guided_decoding_backend
         self.disable_any_whitespace = disable_any_whitespace
-        self.engine_worker_queue_port = engine_worker_queue_port
         self._str_to_list("innode_prefill_ports", int)
-        if isinstance(engine_worker_queue_port, int):
-            self.engine_worker_queue_port = str(engine_worker_queue_port)
-        self._str_to_list("engine_worker_queue_port", str)
 
         if envs.FD_FOR_TORCH_MODEL_FORMAT:
             self.model_config.model_format = "torch"
@@ -1267,6 +1250,15 @@ class FDConfig:
         else:
             self.guided_decoding_backend = "xgrammar"
 
+        if self.scheduler_config.splitwise_role == "mixed":
+            self.model_config.moe_phase = MoEPhase(phase="prefill")
+        elif self.scheduler_config.splitwise_role == "prefill":
+            self.model_config.moe_phase = MoEPhase(phase="prefill")
+        elif self.scheduler_config.splitwise_role == "decode":
+            self.model_config.moe_phase = MoEPhase(phase="decode")
+        else:
+            raise NotImplementedError
+
     def check(self):
         """
         check the legality of config
@@ -1301,7 +1293,7 @@ class FDConfig:
             f"max_long_partial_prefills: {self.max_long_partial_prefills} should "
             f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
         )
-        assert self.splitwise_role in ["mixed", "prefill", "decode"]
+        assert self.scheduler_config.splitwise_role in ["mixed", "prefill", "decode"]
         # TODO(@wufeisheng): TP and EP need to be supported simultaneously.
         assert (self.parallel_config.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or (
             self.parallel_config.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1
@@ -1387,8 +1379,8 @@ class FDConfig:
         initialize cache info
         """
         disaggregate_info = {}
-        if self.splitwise_role != "mixed":
-            disaggregate_info["role"] = self.splitwise_role
+        if self.scheduler_config.splitwise_role != "mixed":
+            disaggregate_info["role"] = self.scheduler_config.splitwise_role
             disaggregate_info["cache_info"] = dict()
             current_protocol = self.cache_config.cache_transfer_protocol.split(",")
             disaggregate_info["transfer_protocol"] = current_protocol
@@ -1396,7 +1388,9 @@ class FDConfig:
             if protocol == "ipc":
                 disaggregate_info["cache_info"][protocol] = {
                     "ip": self.host_ip,
-                    "port": self.engine_worker_queue_port[self.parallel_config.local_data_parallel_id],
+                    "port": self.parallel_config.engine_worker_queue_port[
+                        self.parallel_config.local_data_parallel_id
+                    ],
                     "device_ids": self.local_device_ids,
                 }
             elif protocol == "rdma":
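The "ipc" branch in the last hunk now resolves the queue port per data-parallel rank through parallel_config rather than from FDConfig itself. A minimal sketch of that lookup follows; the function name and bare arguments are hypothetical stand-ins for the real config objects named in the comments.

# Sketch only: build_ipc_cache_info is not a FastDeploy API; it just isolates
# the per-rank port selection shown in the diff above.
from typing import Dict, List


def build_ipc_cache_info(
    host_ip: str,
    engine_worker_queue_ports: List[str],  # parallel_config.engine_worker_queue_port
    local_data_parallel_id: int,           # parallel_config.local_data_parallel_id
    device_ids: List[int],                 # self.local_device_ids
) -> Dict[str, object]:
    """Each data-parallel rank advertises only its own engine worker queue port."""
    return {
        "ip": host_ip,
        "port": engine_worker_queue_ports[local_data_parallel_id],
        "device_ids": device_ids,
    }


print(build_ipc_cache_info("10.0.0.1", ["8002", "8003"], 1, [0, 1]))
# -> {'ip': '10.0.0.1', 'port': '8003', 'device_ids': [0, 1]}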