mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[PD Disaggregation] support different tp_size for prefill and decode (#5296)
* up * up * up * fix
This commit is contained in:
@@ -199,7 +199,7 @@ class SplitwiseConnector:
|
||||
f"{task.disaggregate_info['cache_info']['rdma']['ip']}:"
|
||||
+ f"{task.disaggregate_info['cache_info']['rdma']['port']}"
|
||||
)
|
||||
- self.logger.info(f"send splitwise tasks to port {addr} decode")
|
||||
+ self.logger.info(f"send splitwise tasks to port {addr} decode, {task.request_id}")
|
||||
self.current_request_ids[task.request_id] = "init"
|
||||
decode_diagg = task.disaggregate_info["cache_info"]
|
||||
task.disaggregate_info["cache_info"] = self.cfg.disaggregate_info["cache_info"]
|
||||
@@ -271,6 +271,7 @@ class SplitwiseConnector:
|
||||
)
|
||||
|
||||
def check_decode_allocated(self, task):
|
||||
+ self.logger.debug(f"start check decode allocated: {task.request_id}")
|
||||
start_time = time.time()
|
||||
if task.disaggregate_info is None:
|
||||
return True, ""
|
||||
@@ -280,7 +281,7 @@ class SplitwiseConnector:
|
||||
return True, ""
|
||||
while self.current_request_ids[task.request_id] == "init":
|
||||
time.sleep(0.001)
|
||||
- if time.time() - start_time > 30:
|
||||
+ if time.time() - start_time > envs.FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS:
|
||||
del self.current_request_ids[task.request_id]
|
||||
return False, "timeout"
|
||||
msg = self.current_request_ids[task.request_id]
|
||||
@@ -363,6 +364,7 @@ class SplitwiseConnector:
|
||||
"rdma_ports": self.cfg.disaggregate_info["cache_info"]["rdma"]["rdma_port"],
|
||||
"transfer_protocol": "rdma",
|
||||
"dest_block_ids": dsg_info["block_tables"],
|
||||
+ "decode_tp_size": self.cfg.parallel_config.tensor_parallel_size,
|
||||
}
|
||||
|
||||
addr = f"{dsg_info['cache_info']['rdma']['ip']}:" + f"{dsg_info['cache_info']['rdma']['port']}"
|
||||
|
||||
Reference in New Issue
Block a user