mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
* support real bsz * fix * fix xpu_model_runner.py,gpu_model_runner.py,gcu_model_runner.py,iluvatar_model_runner.py * add event_loop_ep * fix * Add comments * fix * support mtp real_batch_size * fix * self.tmp_seq_lens_this_time->self.seq_lens_this_time_buffer * fix * fix VL real_seq_lens_this_time * fix * fix mtp * fix * fix mtp * fix xpu * fix
This commit is contained in:
@@ -142,9 +142,10 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
schemata_key,
|
||||
)
|
||||
|
||||
def insert_prefill_inputs(self, req_dicts: List[Request]):
|
||||
def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: int = None):
|
||||
"""
|
||||
Process inputs for prefill tasks and insert them into the share_inputs buffer
|
||||
num_running_requests: batch_size
|
||||
TODO(gongshaotian): Refactor this func
|
||||
"""
|
||||
|
||||
@@ -176,7 +177,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["input_ids"][idx : idx + 1, 0] = request.prompt_token_ids[0]
|
||||
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
|
||||
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = length
|
||||
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 1
|
||||
self.seq_lens_this_time_buffer[idx : idx + 1] = 1
|
||||
self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = 0
|
||||
self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = length
|
||||
self.share_inputs["prompt_lens"][idx : idx + 1] = length
|
||||
@@ -188,7 +189,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
request.draft_token_ids[0:num_prefill_send_token],
|
||||
dtype="int64",
|
||||
)
|
||||
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = num_prefill_send_token
|
||||
self.seq_lens_this_time_buffer[idx : idx + 1] = num_prefill_send_token
|
||||
else:
|
||||
self.share_inputs["pre_ids"][idx : idx + 1] = -1
|
||||
self.share_inputs["step_idx"][idx : idx + 1] = 0
|
||||
@@ -199,7 +200,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
request.set("chunk_idx", 1)
|
||||
logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}")
|
||||
token_chunk_size = request.prefill_chunk_info[0]
|
||||
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size
|
||||
self.seq_lens_this_time_buffer[idx : idx + 1] = token_chunk_size
|
||||
self.share_inputs["input_ids"][idx, :token_chunk_size] = np.array(
|
||||
request.prompt_token_ids[:token_chunk_size]
|
||||
)
|
||||
@@ -211,7 +212,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
else:
|
||||
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0)
|
||||
self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0)
|
||||
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
|
||||
self.seq_lens_this_time_buffer[idx : idx + 1] = length
|
||||
self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length
|
||||
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length
|
||||
self.share_inputs["prompt_lens"][idx : idx + 1] = length
|
||||
@@ -262,6 +263,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
self.sampler.apply_logits_processor(idx, request.get("logits_processor"), prefill_tokens)
|
||||
|
||||
self.share_inputs["not_need_stop"][0] = True
|
||||
self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
|
||||
|
||||
def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int):
|
||||
"""Set dummy prefill inputs to share_inputs"""
|
||||
@@ -281,7 +283,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
|
||||
self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
|
||||
self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
|
||||
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length
|
||||
self.seq_lens_this_time_buffer[idx : idx + 1] = input_length
|
||||
self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length
|
||||
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length
|
||||
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
|
||||
@@ -297,6 +299,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
|
||||
idx * block_num, (idx + 1) * block_num, 1
|
||||
)
|
||||
self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer
|
||||
|
||||
def _init_share_inputs(self, max_num_seqs: int):
|
||||
"""Initialize all share buffers for model inputs.
|
||||
@@ -342,7 +345,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["max_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.max_length, dtype="int64")
|
||||
self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64")
|
||||
self.share_inputs["max_length"] = paddle.full([max_num_seqs, 1], self.model_config.max_length, dtype="int64")
|
||||
self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32")
|
||||
self.seq_lens_this_time_buffer = paddle.full(max_num_seqs, 0, dtype="int32")
|
||||
self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
@@ -859,6 +862,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
def execute_model(
|
||||
self,
|
||||
model_forward_batch: Optional[List[Request]] = None,
|
||||
num_running_requests: int = None,
|
||||
) -> Optional[ModelRunnerOutput]:
|
||||
"""
|
||||
The entry point for model execution.
|
||||
@@ -866,6 +870,7 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
model_forward_batch: 'Request' contains information related to prompt and is an abstract
|
||||
class at the server level, which is too granular for ModelRunner.
|
||||
We plan to replace it with 'ModelForwardBatch'.
|
||||
num_running_requests: batch_size
|
||||
intermediate_tensors:
|
||||
"""
|
||||
# Note(@wufeisheng): If `not_need_stop` is False, it means the current worker is in an idle state.
|
||||
@@ -986,6 +991,9 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
|
||||
self._update_chunked_prefill(model_forward_batch)
|
||||
self._add_cache(model_forward_batch)
|
||||
self.seq_lens_this_time_buffer[:num_running_requests].copy_(
|
||||
self.share_inputs["seq_lens_this_time"][:num_running_requests], False
|
||||
)
|
||||
return None
|
||||
|
||||
def _add_cache(self, model_forward_batch) -> None:
|
||||
|
Reference in New Issue
Block a user