[BugFix] support real batch_size (#3109)

* support real bsz

* fix

* fix xpu_model_runner.py,gpu_model_runner.py,gcu_model_runner.py,iluvatar_model_runner.py

* add event_loop_ep

* fix

* Add comments

* fix

* support mtp real_batch_size

* fix

* self.tmp_seq_lens_this_time->self.seq_lens_this_time_buffer

* fix

* fix VL real_seq_lens_this_time

* fix

* fix mtp

* fix

* fix mtp

* fix xpu

* fix
This commit is contained in:
lizexu123
2025-08-05 16:33:54 +08:00
committed by GitHub
parent 55939f7942
commit b01cfd6007
10 changed files with 110 additions and 58 deletions

View File

@@ -373,7 +373,7 @@ class XPUModelRunner(ModelRunnerBase):
# Forward meta store the global meta information of the forward
self.forward_meta: ForwardMeta = None
def insert_tasks_v1(self, req_dicts: List[Request]):
def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = None):
"""
Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1
"""
@@ -403,7 +403,7 @@ class XPUModelRunner(ModelRunnerBase):
)
self.share_inputs["stop_flags"][idx : idx + 1] = False
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
self.seq_lens_this_time_buffer[idx : idx + 1] = length
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length
self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = 0
self.share_inputs["prompt_lens"][idx : idx + 1] = len(input_ids)
@@ -425,7 +425,7 @@ class XPUModelRunner(ModelRunnerBase):
logger.debug(f"Handle preempted request {request} at idx {idx}")
self.share_inputs["block_tables"][idx : idx + 1, :] = -1
self.share_inputs["stop_flags"][idx : idx + 1] = True
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 0
self.seq_lens_this_time_buffer[idx : idx + 1] = 0
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
self.share_inputs["is_block_step"][idx : idx + 1] = False
@@ -462,8 +462,9 @@ class XPUModelRunner(ModelRunnerBase):
)
if has_prefill_task:
self.share_inputs["not_need_stop"][0] = True
self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
def process_prefill_inputs(self, req_dicts: List[Request]):
def process_prefill_inputs(self, req_dicts: List[Request], num_running_requests: int = None):
"""Process inputs for prefill tasks and update share_inputs buffer"""
req_len = len(req_dicts)
for i in range(req_len):
@@ -482,7 +483,7 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0)
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
self.seq_lens_this_time_buffer[idx : idx + 1] = length
self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
@@ -526,6 +527,7 @@ class XPUModelRunner(ModelRunnerBase):
)
self.share_inputs["not_need_stop"][0] = True
self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
def _init_share_inputs(self, max_num_seqs: int):
"""Initialize all share buffers for model inputs.
@@ -571,7 +573,7 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["max_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.max_model_len, dtype="int64"
)
self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32")
self.seq_lens_this_time_buffer = paddle.full(max_num_seqs, 0, dtype="int32")
self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
@@ -813,7 +815,7 @@ class XPUModelRunner(ModelRunnerBase):
idx = i
self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length
self.seq_lens_this_time_buffer[idx : idx + 1] = input_length
self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
@@ -829,6 +831,7 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
idx * block_num, (idx + 1) * block_num, 1
)
self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer
def _dummy_run(
self,
@@ -853,6 +856,7 @@ class XPUModelRunner(ModelRunnerBase):
self,
model_forward_batch: Optional[List[Request]] = None,
is_dummy_run: bool = False,
num_running_requests: int = None,
) -> Optional[ModelRunnerOutput]:
"""
The Entrance of model execute.
@@ -860,6 +864,7 @@ class XPUModelRunner(ModelRunnerBase):
model_forward_batch: 'Request' contains information related to prompt and is an abstract
class at the server level, which is too granular for ModelRunner.
We plan to replace it with 'ModelForwardBatch'.
num_running_requests: batch_size
intermediate_tensors:
"""
# 1. Prepare inputs of model and decoder.
@@ -920,6 +925,10 @@ class XPUModelRunner(ModelRunnerBase):
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
)
if num_running_requests is not None:
self.seq_lens_this_time_buffer[:num_running_requests].copy_(
self.share_inputs["seq_lens_this_time"][:num_running_requests], False
)
return None