diff --git a/llm/server/server/engine/infer.py b/llm/server/server/engine/infer.py
index 9b1db3d8f..f00cd0506 100644
--- a/llm/server/server/engine/infer.py
+++ b/llm/server/server/engine/infer.py
@@ -29,7 +29,7 @@
 from paddlenlp.trl.llm_utils import get_rotary_position_embedding
 from paddlenlp_ops import step_paddle, speculate_step_paddle
 from server.data.processor import DataProcessor
 from server.engine.config import Config
-from server.engine.proposers import InferenceWithReferenceProposer
+from paddlenlp.experimental.transformers import InferenceWithReferenceProposer
 from server.utils import get_logger
 from task_queue_manager import TaskQueueManager
@@ -518,6 +518,7 @@ class ModelRunner:
         self.share_inputs['infer_seed'].add_(infer_seed_increment)
         self.share_inputs['infer_seed'][:] %= self.MAX_INFER_SEED
         if self.free_list_len > 0:
+            logger.info("Entering step_cuda")
             self.step_cuda(seq_lens_this_time)
diff --git a/llm/server/server/engine/proposers.py b/llm/server/server/engine/proposers.py
deleted file mode 100644
index f2a1d2b0a..000000000
--- a/llm/server/server/engine/proposers.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-from abc import ABC, abstractmethod
-
-import paddle
-
-
-class Proposer(ABC):
-    """
-    Abstract base class for all proposers that can be used in the speculative decoding framework.
-    The subclasses of this class must implement the run method to get the draft tokens that are
-    generated by the proposer.
-    """
-
-    def __init__(self, **kwargs):
-        pass
-
-    @abstractmethod
-    def run(self, model_inputs: dict[str, paddle.Tensor], **kargs):
-        """
-        Get the draft tokens that are generated by the proposer.
-        """
-        raise NotImplementedError()
-
-
-class InferenceWithReferenceProposer(Proposer):
-    """
-    InferenceWithReference (https://arxiv.org/pdf/2304.04487) is one of the speculative decoding methods.
-    It matches tokens in the input and output as draft tokens.
-    """
-
-    def __init__(self, max_draft_token_num: int, max_ngram_size: int, max_batch_size: int, max_seq_len: int, **kwargs):
-        """
-        Args:
-            max_draft_token_num (int):
-                Maximum number of tokens the proposer can generate at one time.
-                The hyperparameter k in the paper.
-            max_ngram_size (int):
-                The maximum size of the window used to match inputs and outputs.
-                The hyperparameter n in the paper.
-            max_batch_size (int):
-                The maximum batch size.
-            max_seq_len (int):
-                The maximum sequence length.
-        """
-        super().__init__()
-        self.max_ngram_size = max_ngram_size
-        self.input_ids_len = paddle.zeros(shape=[max_batch_size, 1], dtype="int64").cpu()
-        self.input_ids_cpu = paddle.zeros(shape=[max_batch_size, max_seq_len], dtype="int64").cpu()
-        self.max_batch_size = max_batch_size
-        self.max_draft_token_num = max_draft_token_num
-
-    def run(self, model_inputs: dict[str, paddle.Tensor], **kargs):
-        """
-        Use ngram_match to get draft tokens from the input and output.
-        """
-        draft_tokens = model_inputs["draft_tokens"].cpu()
-        seq_lens_this_time = kargs["seq_lens_this_time"].cpu()
-        seq_lens_encoder = model_inputs["seq_lens_encoder"].cpu()
-        seq_lens_decoder = model_inputs["seq_lens_decoder"].cpu()
-
-        from paddlenlp_ops import ngram_match
-
-        ngram_match(
-            self.input_ids_cpu,
-            self.input_ids_len.cpu(),
-            model_inputs["pre_ids"].cpu(),
-            model_inputs["step_idx"].cpu(),
-            model_inputs["actual_draft_token_num"].cpu(),
-            draft_tokens,
-            seq_lens_this_time,
-            seq_lens_encoder,
-            seq_lens_decoder,
-            kargs["real_batch_size"],
-            self.max_ngram_size,
-            self.max_draft_token_num,
-        )
-
-        model_inputs["draft_tokens"][:] = draft_tokens.cuda()
-        model_inputs["seq_lens_encoder"][:] = seq_lens_encoder.cuda()
-        kargs["seq_lens_this_time"][:] = seq_lens_this_time.cuda()