[LLM] fix multinode bugs (#2945)

* [LLM] fix multinode bugs

* [LLM] fix multinode bugs

* [LLM] fix multinode bugs

* [LLM] fix ci bugs

* fix ci bugs

* fix ci bugs
This commit is contained in:
ltd0924
2025-07-22 20:23:37 +08:00
committed by GitHub
parent 69be77c8c0
commit b0f1e0eef4
9 changed files with 68 additions and 87 deletions

View File

@@ -24,6 +24,7 @@ from fastdeploy.input.preprocess import InputPreprocessor
from fastdeploy.engine.request import Request
from fastdeploy.inter_communicator import ZmqClient, IPCSignal
from fastdeploy.metrics.work_metrics import work_process_metrics
from fastdeploy.platforms import current_platform
from fastdeploy.utils import api_server_logger, EngineError
@@ -43,7 +44,8 @@ class EngineClient:
self.reasoning_parser = reasoning_parser
self.data_processor = input_processor.create_processor()
self.max_model_len = max_model_len
self.worker_healthy_live_recorded_time_array = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
self.worker_healthy_live_recorded_time_array = np.zeros(shape=[tensor_parallel_size % max_chips_per_node], dtype=np.int32)
self.worker_healthy_live_signal = IPCSignal(name="worker_healthy_live_signal",
array=self.worker_healthy_live_recorded_time_array,
dtype=np.int32,