diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index f824b48a4..16f3b0a0e 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -646,7 +646,6 @@ class GPUModelRunner(ModelRunnerBase):
             )
 
             self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
-            self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length
 
             if request.get("seed") is not None:
                 self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed")
@@ -864,7 +863,6 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["stop_flags"][idx : idx + 1] = False
 
             self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
-            self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length
 
             if request.get("seed") is not None:
                 self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed")
@@ -1017,7 +1015,6 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["stop_flags"][idx : idx + 1] = False
             self.share_inputs["temperature"][idx : idx + 1] = 1
             self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
-            self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length
 
             self.share_inputs["encoder_block_lens"][idx : idx + 1] = block_num
             self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
@@ -1101,7 +1098,6 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32")
         self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu()
         self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
-        self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
 
         self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
         self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
@@ -1783,8 +1779,8 @@ class GPUModelRunner(ModelRunnerBase):
 
     def _dummy_run(
         self,
-        num_tokens: paddle.Tensor,
-        batch_size: paddle.Tensor,
+        num_tokens: int,
+        batch_size: int,
         expected_decode_len: int = 1,
         in_capturing: bool = False,
         capture_prefill: bool = False,
@@ -1830,21 +1826,20 @@ class GPUModelRunner(ModelRunnerBase):
            # 3. Run model
            if self.enable_mm:
                model_output = self.model(
-                    self.share_inputs["ids_remove_padding"],
+                    self.forward_meta.ids_remove_padding,
                    self.share_inputs["image_features"],
                    self.forward_meta,
                )
            else:
                model_output = self.model(
-                    ids_remove_padding=self.share_inputs["ids_remove_padding"],
-                    forward_meta=self.forward_meta,
+                    self.forward_meta.ids_remove_padding,
+                    self.forward_meta,
                )
            if self.use_cudagraph:
                model_output = model_output[: self.real_token_num]
 
            if self.is_pooling_model:
-                hidden_states = model_output
-                self._dummy_pooler_run(hidden_states, model_output)
+                self._dummy_pooler_run(model_output, model_output)
                break
            else:
                hidden_states = rebuild_padding(
@@ -2154,14 +2149,14 @@ class GPUModelRunner(ModelRunnerBase):
        # 3. Execute model
        if self.enable_mm:
            model_output = self.model(
-                self.share_inputs["ids_remove_padding"],
+                self.forward_meta.ids_remove_padding,
                self.share_inputs["image_features"],
                self.forward_meta,
            )
        else:
            model_output = self.model(
-                ids_remove_padding=self.share_inputs["ids_remove_padding"],
-                forward_meta=self.forward_meta,
+                self.forward_meta.ids_remove_padding,
+                self.forward_meta,
            )
        if self.use_cudagraph:
            model_output = model_output[: self.real_token_num]
@@ -2169,8 +2164,7 @@ class GPUModelRunner(ModelRunnerBase):
            prompt_logprobs_list = self._get_prompt_logprobs_list(model_output)
 
        if self.is_pooling_model:
-            hidden_states = model_output
-            pooler_output = self._pool(hidden_states, num_running_requests)
+            pooler_output = self._pool(model_output, num_running_requests)
 
        model_output_data = ModelOutputData(
            next_tokens=self.share_inputs["next_tokens"],
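
A minimal usage sketch of the call pattern after this patch, for illustration only (not part of the diff): `_dummy_run` now takes plain Python ints instead of paddle.Tensor, and both the multimodal and text-only branches read the de-padded token ids from `forward_meta` and pass the model its arguments positionally. The `runner` name below is a hypothetical, fully initialized GPUModelRunner instance assumed for the example.

# Hypothetical sketch; `runner` stands in for an initialized GPUModelRunner.
runner._dummy_run(num_tokens=2048, batch_size=8)  # plain ints, no paddle.Tensor wrappers

# Text-only branch after the patch: positional args, ids taken from forward_meta
# rather than from share_inputs["ids_remove_padding"].
model_output = runner.model(
    runner.forward_meta.ids_remove_padding,
    runner.forward_meta,
)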