diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 7ed470f31..a4461cc71 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -637,6 +637,7 @@ class GPUModelRunner(ModelRunnerBase):
         batch_pooling_params = []
         for i in range(req_len):
             request = req_dicts[i]
+            # assert isinstance(request, Request)
             idx = request.idx
 
             if hasattr(request, "pooling_params") and request.pooling_params is not None:
@@ -655,14 +656,14 @@ class GPUModelRunner(ModelRunnerBase):
                 logits_info, schemata_key = self._init_logits_processor(request)
                 request.schemata_key = schemata_key
 
-            if self.scheduler_config.splitwise_role == "decode":
-                if (
-                    hasattr(request, "prefill_end_index")
-                    and hasattr(request, "prompt_token_ids")
-                    and request.prefill_end_index > len(request.prompt_token_ids)
-                ):
-                    if hasattr(request, "output_token_ids"):
-                        prefill_tokens.extend(request.output_token_ids)
+            if (
+                self.scheduler_config.splitwise_role == "decode"
+                and hasattr(request, "prefill_end_index")
+                and hasattr(request, "prompt_token_ids")
+                and request.prefill_end_index > len(request.prompt_token_ids)
+                and hasattr(request, "output_token_ids")
+            ):
+                prefill_tokens.extend(request.output_token_ids)
 
             prefill_start_index = request.prefill_start_index
             prefill_end_index = request.prefill_end_index
@@ -784,12 +785,12 @@ class GPUModelRunner(ModelRunnerBase):
 
             if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
                 bad_words_len = len(request.get("bad_words_token_ids"))
-                self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+                self.share_inputs["bad_tokens_len"][idx] = bad_words_len
                 self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
                     request.get("bad_words_token_ids"), dtype="int64"
                 )
             else:
-                self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+                self.share_inputs["bad_tokens_len"][idx] = 1
                 self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
 
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
@@ -1007,12 +1008,12 @@ class GPUModelRunner(ModelRunnerBase):
 
             if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
                 bad_words_len = len(request.get("bad_words_token_ids"))
-                self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+                self.share_inputs["bad_tokens_len"][idx] = bad_words_len
                 self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
                     request.get("bad_words_token_ids"), dtype="int64"
                 )
             else:
-                self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+                self.share_inputs["bad_tokens_len"][idx] = 1
                 self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
 
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
@@ -1217,7 +1218,7 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64")
 
         self.share_inputs["bad_tokens"] = paddle.full([max_num_seqs, self.model_config.vocab_size], -1, dtype="int64")
-        self.share_inputs["bad_tokens_len"] = paddle.full([max_num_seqs], 1, dtype="int64")
+        self.share_inputs["bad_tokens_len"] = [-1] * max_num_seqs
        self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
         self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool")
         self.share_inputs["is_chunk_step"] = paddle.full([max_num_seqs], False, dtype="bool").cpu()
@@ -1447,7 +1448,7 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["output_padding_offset"].copy_(output_padding_offset, False)
 
         # Update bad tokens len
-        max_bad_tokens_len = np.max(self.share_inputs["bad_tokens_len"].numpy())
+        max_bad_tokens_len = max(self.share_inputs["bad_tokens_len"])
 
         # Initialize forward meta data
         self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)
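In `gpu_model_runner.py`, `bad_tokens_len` changes from a `paddle` tensor to a plain Python list, so the per-request write becomes a host-side assignment and the per-step maximum no longer needs `np.max(tensor.numpy())`, which forces a device-to-host copy. A minimal sketch of that pattern, assuming a standalone `share_inputs` dict and toy sizes rather than FastDeploy's actual runner state:

```python
# Sketch only: names mirror the diff (share_inputs, bad_tokens, bad_tokens_len),
# but the sizes and the surrounding code are illustrative assumptions.
import paddle

max_num_seqs, vocab_size = 4, 32000
share_inputs = {
    # Large per-token data stays on device.
    "bad_tokens": paddle.full([max_num_seqs, vocab_size], -1, dtype="int64"),
    # Per-request scalar lengths stay on host: writing one slot is a plain
    # list assignment, with no GPU kernel launch.
    "bad_tokens_len": [-1] * max_num_seqs,
}

idx, bad_words = 2, [7, 11, 13]
share_inputs["bad_tokens_len"][idx] = len(bad_words)
share_inputs["bad_tokens"][idx : idx + 1, : len(bad_words)] = paddle.to_tensor(bad_words, dtype="int64")

# Python max() over the host list replaces np.max(tensor.numpy()), which
# implicitly synchronized the device on every step.
max_bad_tokens_len = max(share_inputs["bad_tokens_len"])
print(max_bad_tokens_len)  # 3
```

The trade-off is that `bad_tokens_len` now lives only on the host, so any consumer needs a Python int (as `max()` provides here) rather than a device tensor.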
diff --git a/tests/layers/test_attention_layer.py b/tests/layers/test_attention_layer.py
index deffb5a73..f553b2877 100644
--- a/tests/layers/test_attention_layer.py
+++ b/tests/layers/test_attention_layer.py
@@ -270,7 +270,7 @@ class TestAttentionPerformance(unittest.TestCase):
             partial_rotary_factor=fd_config.model_config.partial_rotary_factor,
         )
 
-        input_ids = paddle.zeros([batch_size, seq_len if mode == ForwardMode.EXTEND else 1], dtype="int64")
+        input_ids = paddle.zeros([batch_size, fd_config.model_config.max_model_len], dtype="int64")
         token_num = np.sum(seq_lens_this_time)
         ids_remove_padding, batch_id_per_token, cu_seqlens_q, cu_seqlens_k = get_padding_offset(
             input_ids, seq_lens_this_time, token_num
@@ -302,27 +302,22 @@ class TestAttentionPerformance(unittest.TestCase):
 
         # Test parameters
         test_steps = 100
-        # prefill_batch_size = 1
-        # prefill_seq_len = 4096
+        prefill_batch_size = 1
+        prefill_seq_len = 2048
 
-        # prefill_hidden_states = paddle.randn(
-        #     [prefill_batch_size * prefill_seq_len, self.fd_config.model_config.hidden_size],
-        #     dtype=act_tensor_dtype,
-        # )
+        forward_meta, prefill_hidden_states = self.create_forward_meta(
+            batch_size=prefill_batch_size,
+            seq_len=prefill_seq_len,
+            mode=ForwardMode.EXTEND,
+            fd_config=self.fd_config,
+            attn_backend=self.attn_backend,
+            cache_quant_type_str=self.cache_quant_type_str,
+        )
 
-        # forward_meta = self.create_forward_meta(
-        #     batch_size=prefill_batch_size,
-        #     seq_len=prefill_seq_len,
-        #     mode=ForwardMode.EXTEND,
-        #     fd_config=self.fd_config,
-        #     attn_backend=self.attn_backend,
-        #     cache_quant_type_str=self.cache_quant_type_str,
-        # )
+        self.attn_backend.init_attention_metadata(forward_meta)
+        self.attn_forward(forward_meta, prefill_hidden_states)
 
-        # self.attn_backend.init_attention_metadata(forward_meta)
-        # self.attn_forward(forward_meta, prefill_hidden_states)
-
-        # paddle.device.synchronize()
+        paddle.device.synchronize()
 
         # import paddle.profiler as profiler
         # p = profiler.Profiler(
@@ -332,22 +327,22 @@ class TestAttentionPerformance(unittest.TestCase):
         # p.start()
         # p.step()
 
-        # start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        # end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        # for i in range(test_steps):
-        #     start_events[i].record()
+        start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+        end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+        for i in range(test_steps):
+            start_events[i].record()
 
-        #     self.attn_forward(forward_meta, prefill_hidden_states)
+            self.attn_forward(forward_meta, prefill_hidden_states)
 
-        #     end_events[i].record()
-        # paddle.device.synchronize()
-
-        # times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
-        # print(times[-5:])
-        # return
+            end_events[i].record()
+        paddle.device.synchronize()
+        times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
+        print(times[-5:])
 
         # p.stop()
 
+        return
+
         # p = profiler.Profiler(
         #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
         #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
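The benchmark the test re-enables follows the standard CUDA-event timing pattern. A self-contained sketch, with a hypothetical `op()` standing in for `self.attn_forward` and illustrative sizes (requires a CUDA build of Paddle):

```python
# Sketch only: op() is a stand-in for the attention forward pass being timed.
import numpy as np
import paddle

def op():
    # Hypothetical workload; the test times self.attn_forward instead.
    x = paddle.randn([1024, 1024], dtype="float16")
    return x @ x

test_steps = 100
start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]

for i in range(test_steps):
    start_events[i].record()
    op()
    end_events[i].record()

# One synchronize at the end: elapsed_time() reads device-side timestamps,
# so the host never blocks inside the loop and launches stay overlapped.
paddle.device.synchronize()

# Drop the first sample ([1:]) to exclude warm-up effects, as the test does.
times = np.array([s.elapsed_time(e) for s, e in zip(start_events, end_events)])[1:]
print(times.mean(), times[-5:])
```

Recording an event pair per step and synchronizing once at the end keeps host overhead out of the measurement; the `[1:]` slice discards the warm-up iteration, which typically includes kernel autotuning and cache effects.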