Sync v2.0 version of code to GitHub repo

Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

View File

@@ -23,11 +23,12 @@ import paddle
@dataclass
class SamplingMetadata:
    """
    Metadata for sampling.
    """
    temperature: paddle.Tensor
    prompt_token_ids: paddle.Tensor
    pre_token_ids: paddle.Tensor
    eos_token_ids: paddle.Tensor
    frequency_penalties: paddle.Tensor
    presence_penalties: paddle.Tensor
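For orientation, a minimal sketch of the tensors these fields hold; the batch size, vocab size, dtypes and the padded [batch, max_seq_len] layout are illustrative assumptions, not taken from this diff:

    import paddle

    batch_size, max_seq_len = 2, 8
    # per-request scalars, broadcast against [batch, vocab] logits downstream
    temperature = paddle.full([batch_size, 1], 0.8, dtype="float32")
    frequency_penalties = paddle.full([batch_size, 1], 0.1, dtype="float32")
    presence_penalties = paddle.zeros([batch_size, 1], dtype="float32")
    # token-id histories: the prompt, and what has been generated so far
    prompt_token_ids = paddle.randint(0, 32000, [batch_size, max_seq_len], dtype="int64")
    pre_token_ids = paddle.randint(0, 32000, [batch_size, max_seq_len], dtype="int64")
    eos_token_ids = paddle.to_tensor([2], dtype="int64")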

View File

@@ -14,8 +14,12 @@
# limitations under the License.
"""
-from .apply_penalty_multi_scores import apply_penalty_multi_scores
+from .apply_penalty_multi_scores import (
+    apply_penalty_multi_scores, apply_speculative_penalty_multi_scores)
+from .top_p_sampling import top_p_sampling

__all__ = [
    "apply_penalty_multi_scores",
    "apply_speculative_penalty_multi_scores",
    "top_p_sampling",
]

View File

@@ -20,7 +20,7 @@ from fastdeploy.platforms import current_platform
def apply_penalty_multi_scores(
    prompt_token_ids: paddle.Tensor,
    pre_token_ids: paddle.Tensor,
    logits: paddle.Tensor,
    repetition_penalties: paddle.Tensor,
    frequency_penalties: paddle.Tensor,
@@ -30,16 +30,30 @@ def apply_penalty_multi_scores(
    step_idx: paddle.Tensor,
    min_dec_lens: paddle.Tensor,
    eos_token_ids: paddle.Tensor,
-):
+) -> paddle.Tensor:
    """
    Apply the per-request penalties (repetition/frequency/presence), bad-words
    masking, min-length EOS suppression and temperature scaling to the logits,
    dispatching to the fused kernel of the current platform.
    """
    if current_platform.is_cuda():
        from fastdeploy.model_executor.ops.gpu import \
            get_token_penalty_multi_scores
        logits = get_token_penalty_multi_scores(
            prompt_token_ids,
            pre_token_ids,
            logits,
            repetition_penalties,
            frequency_penalties,
            presence_penalties,
            temperature,
            bad_words_token_ids,
            step_idx,
            min_dec_lens,
            eos_token_ids,
        )
    elif current_platform.is_xpu():
        from fastdeploy.model_executor.ops.xpu import \
            get_token_penalty_multi_scores
        logits = get_token_penalty_multi_scores(
            pre_token_ids,
            logits,
            repetition_penalties,
            frequency_penalties,
@@ -54,3 +68,48 @@ def apply_penalty_multi_scores(
        raise NotImplementedError()
    return logits


def apply_speculative_penalty_multi_scores(
    pre_token_ids: paddle.Tensor,
    logits: paddle.Tensor,
    repetition_penalties: paddle.Tensor,
    frequency_penalties: paddle.Tensor,
    presence_penalties: paddle.Tensor,
    temperature: paddle.Tensor,
    bad_words_token_ids: paddle.Tensor,
    step_idx: paddle.Tensor,
    min_dec_lens: paddle.Tensor,
    eos_token_ids: paddle.Tensor,
    seq_lens_this_time: paddle.Tensor,
    output_padding_offset: paddle.Tensor,
    output_cum_offsets: paddle.Tensor,
    max_len: int,
):
    """
    Speculative-decoding variant of apply_penalty_multi_scores: applies the
    same penalties over the padded draft-token layout described by
    seq_lens_this_time and the output offset tensors.
    """
    if current_platform.is_cuda():
        from fastdeploy.model_executor.ops.gpu import \
            speculate_get_token_penalty_multi_scores
        logits = speculate_get_token_penalty_multi_scores(
            pre_token_ids,
            logits,
            repetition_penalties,
            frequency_penalties,
            presence_penalties,
            temperature,
            bad_words_token_ids,
            step_idx,
            min_dec_lens,
            eos_token_ids,
            seq_lens_this_time,
            output_padding_offset,
            output_cum_offsets,
            max_len,
        )
    else:
        raise NotImplementedError()
    return logits
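The fused get_token_penalty_multi_scores kernels themselves are not part of this diff. As a reading aid, here is an unfused sketch of the standard penalty arithmetic suggested by the parameter names — an assumption about the kernels' semantics, and it omits the bad-words, min-length and EOS handling they also take arguments for:

    import paddle

    def penalty_reference(logits, pre_token_ids, repetition_penalties,
                          frequency_penalties, presence_penalties, temperature):
        # treat negative ids as padding
        valid = pre_token_ids >= 0
        safe_ids = paddle.where(valid, pre_token_ids,
                                paddle.zeros_like(pre_token_ids))
        # occurrence counts of each vocab id in the generated history (scatter-add)
        counts = paddle.put_along_axis(paddle.zeros_like(logits), safe_ids,
                                       valid.astype(logits.dtype),
                                       axis=1, reduce="add")
        appeared = (counts > 0).astype(logits.dtype)
        # repetition penalty: shrink positive logits, inflate negative ones
        penalized = paddle.where(logits > 0, logits / repetition_penalties,
                                 logits * repetition_penalties)
        logits = logits * (1 - appeared) + penalized * appeared
        # frequency/presence penalties, then temperature scaling
        logits = logits - counts * frequency_penalties - appeared * presence_penalties
        return logits / temperature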

View File

@@ -0,0 +1,97 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Literal, Optional
import paddle
from fastdeploy import envs
def top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    threshold: Optional[paddle.Tensor] = None,
    topp_seed: Optional[paddle.Tensor] = None,
    seed: int = -1,
    k: int = 0,
    mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
    """
    Top-p (nucleus) sampling, dispatched to the backend selected by the
    FD_SAMPLING_CLASS environment variable: "air", "rejection", or paddle's
    built-in paddle.tensor.top_p_sampling by default.
    """
    top_p_class = envs.FD_SAMPLING_CLASS.lower()
    if top_p_class == "air":
        _, ids = air_top_p_sampling(x,
                                    ps,
                                    threshold,
                                    topp_seed,
                                    seed=seed,
                                    k=k,
                                    mode=mode)
    elif top_p_class == "rejection":
        ids = rejection_top_p_sampling(x, ps, seed)
        _ = None  # the rejection backend returns ids only, no scores
    else:
        _, ids = paddle.tensor.top_p_sampling(x,
                                              ps,
                                              threshold=threshold,
                                              topp_seed=topp_seed,
                                              seed=seed,
                                              k=k,
                                              mode=mode)
    return _, ids


def air_top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    threshold: Optional[paddle.Tensor] = None,
    topp_seed: Optional[paddle.Tensor] = None,
    seed: int = -1,
    k: int = 0,
    mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
    """
    Top-p sampling backed by the custom air_top_p_sampling GPU op.
    """
    try:
        from fastdeploy.model_executor.ops.gpu import air_top_p_sampling
        out, ids = air_top_p_sampling(x, ps, threshold, topp_seed, seed, k,
                                      mode)
    except ImportError:
        raise RuntimeError("Cannot import air_top_p_sampling op.")
    return out, ids


def rejection_top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    seed: int = -1,
) -> paddle.Tensor:
    """
    Top-p sampling backed by the custom rejection_top_p_sampling GPU op.
    """
    try:
        from fastdeploy.model_executor.ops.gpu import rejection_top_p_sampling
        ids = rejection_top_p_sampling(
            x,
            ps,
            seed,
        )
    except ImportError:
        raise RuntimeError("Cannot import rejection_top_p_sampling op.")
    return ids
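A call sketch, assuming probs of shape [batch, vocab] and per-request top_p of shape [batch, 1]. The point at which fastdeploy's envs module reads FD_SAMPLING_CLASS is not shown in this diff, so setting it before importing fastdeploy is the safe assumption:

    import os
    os.environ["FD_SAMPLING_CLASS"] = "rejection"  # "air" | "rejection" | else paddle default

    import paddle
    import paddle.nn.functional as F
    from fastdeploy.model_executor.layers.sample.ops import top_p_sampling

    probs = F.softmax(paddle.randn([2, 32000]), axis=-1)
    top_p = paddle.full([2, 1], 0.8, dtype=probs.dtype)
    scores, next_tokens = top_p_sampling(probs, top_p)  # scores is None on "rejection"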

View File

@@ -13,43 +13,193 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

-from fastdeploy.distributed.parallel_state import \
-    get_tensor_model_parallel_world_size
+from fastdeploy.config import FDConfig
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \
    LogitsProcessorBase
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
-from fastdeploy.model_executor.layers.sample.ops import \
-    apply_penalty_multi_scores
+from fastdeploy.model_executor.layers.sample.ops import (
+    apply_penalty_multi_scores, apply_speculative_penalty_multi_scores,
+    top_p_sampling)
from fastdeploy.platforms import current_platform
class SamplerProcessor:
    """
    SamplingProcessor for guided decoding.
    """

    def __init__(self):
        self.async_step = None
        self.token_bitmask = None
        self.logits_processor: Dict[int, Optional[Any]] = dict()
        self.executor = ThreadPoolExecutor()
        self.logits_lock = threading.Lock()

    def add_logits_processor(self,
                             ids: int,
                             future: Optional[Any] = None,
                             prefill_tokens: List[int] = []):
        """ add logits processor to SamplerProcessor """
        with self.logits_lock:
            if future is None:
                if ids in self.logits_processor:
                    del self.logits_processor[ids]
                return

            if isinstance(future, LogitsProcessorBase):
                self.logits_processor[ids] = future
                for token in prefill_tokens:
                    self.logits_processor[ids].accept_token(token)
            elif future.done():
                self.logits_processor[ids] = future.result()
                for token in prefill_tokens:
                    self.logits_processor[ids].accept_token(token)
            else:
                self.logits_processor[ids] = [future, prefill_tokens]

    def update_vocab_mask(self, skip_idx_list: List[int] = []):
        """ update vocab mask. (cpu-heavy operation) """
        if len(self.logits_processor) == 0:
            return

        with self.logits_lock:
            # iterate over a snapshot: entries may be deleted while resolving
            for idx, processor in list(self.logits_processor.items()):
                if processor is None:
                    del self.logits_processor[idx]
                    continue

                if not isinstance(processor, LogitsProcessorBase):
                    future, prefill_tokens = self.logits_processor[idx]
                    self.logits_processor[idx] = future.result()
                    for token in prefill_tokens:
                        self.logits_processor[idx].accept_token(token)

        available_processors = None
        for processor in self.logits_processor.values():
            if processor.is_terminated():
                continue
            available_processors = processor
        if available_processors is None:
            return

        # allocate token bitmask
        self.token_bitmask = available_processors.allocate_token_bitmask()

        with self.logits_lock:
            # fill token bitmask
            for idx, processor in self.logits_processor.items():
                if processor.is_terminated() or idx in skip_idx_list:
                    continue
                processor.fill_token_bitmask(self.token_bitmask, idx)

    def apply_token_mask(self,
                         logits: paddle.Tensor,
                         skip_idx_list: List[int] = []):
        """ apply token mask to logits """
        if len(self.logits_processor) == 0 or self.token_bitmask is None:
            return logits

        # self.async_step.result()
        available_processors = None
        with self.logits_lock:
            for processor in self.logits_processor.values():
                if processor.is_terminated():
                    continue
                available_processors = processor
        if available_processors is None:
            return logits

        indices = list(self.logits_processor.keys())
        mask_idx = [i for i in indices if i not in skip_idx_list]
        return available_processors.apply_token_mask(logits,
                                                     self.token_bitmask,
                                                     indices=mask_idx)

    def _accept_token(self, idx: int, token: int):
        """ accept token """
        if idx not in self.logits_processor:
            raise ValueError(
                f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}"
            )

        if self.logits_processor[idx].is_terminated():
            return

        self.logits_processor[idx].accept_token(token)

    def update_output_tokens(self,
                             next_tokens: paddle.Tensor,
                             skip_idx_list: List[int] = []):
        """ update output tokens """
        if len(self.logits_processor) == 0:
            return

        token_ids = next_tokens.numpy().tolist()
        with self.logits_lock:
            for idx in self.logits_processor.keys():
                token = token_ids[idx][0]
                if token < 0 or self.logits_processor[
                        idx] is None or idx in skip_idx_list:
                    continue

                self._accept_token(idx, token)

    def pre_process(self, skip_idx_list: List[int] = []):
        """ pre process before running """
        # create async operation for guided decoding
        # TODO: support async
        self.update_vocab_mask(skip_idx_list)
        # self.async_step = self.executor.submit(self.update_vocab_mask)
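To make the call order concrete, a lifecycle sketch; my_processor is a hypothetical stand-in for a guided-decoding backend object implementing the LogitsProcessorBase interface used above (accept_token, allocate_token_bitmask, fill_token_bitmask, apply_token_mask, is_terminated):

    processor = SamplerProcessor()
    # register request 0's processor (or a Future resolving to one), replaying
    # its prefill tokens into the matcher state
    processor.add_logits_processor(ids=0, future=my_processor,
                                   prefill_tokens=[101, 102])
    processor.pre_process()                      # build the vocab bitmask (CPU-heavy)
    logits = processor.apply_token_mask(logits)  # mask disallowed tokens per request
    # ... sample next_tokens from the masked logits ...
    processor.update_output_tokens(next_tokens)  # advance each request's matcher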
class Sampler(nn.Layer):
    """
    Sampler for normal generation.
    """

    def __init__(self):
        """ pick the platform forward implementation """
        super().__init__()
-        if current_platform.is_cuda():
-            self.nranks = get_tensor_model_parallel_world_size()
+        if current_platform.is_cuda() or current_platform.is_xpu():
            self.forward = self.forward_cuda
        else:
            raise NotImplementedError()

        self.processor = SamplerProcessor()

    def apply_logits_processor(self,
                               ids: int,
                               future: Optional[Any] = None,
                               prefill_tokens: List[int] = []):
        """ apply logits processor to sampler """
        self.processor.add_logits_processor(ids, future, prefill_tokens)

    def pre_process(self, skip_idx_list: List[int] = []):
        """ pre process before running """
        self.processor.pre_process(skip_idx_list)

    def forward_cuda(
        self,
        logits: paddle.Tensor,
        sampling_metadata: SamplingMetadata,
        skip_idx_list: List[int] = [],
    ) -> paddle.Tensor:
        """ one step: guided-decoding mask, penalties, softmax, top-p sample """
        logits = self.processor.apply_token_mask(logits, skip_idx_list)

        logits = apply_penalty_multi_scores(
            sampling_metadata.prompt_token_ids,
            sampling_metadata.pre_token_ids,
            logits,
            sampling_metadata.repetition_penalties,
            sampling_metadata.frequency_penalties,
@@ -63,10 +213,156 @@ class Sampler(nn.Layer):
        probs = F.softmax(logits)
-        _, next_tokens = paddle.tensor.top_p_sampling(probs,
-                                                      sampling_metadata.top_p)
-        if self.nranks > 1:
-            paddle.distributed.broadcast(next_tokens, 0)
+        _, next_tokens = top_p_sampling(probs, sampling_metadata.top_p)

        self.processor.update_output_tokens(next_tokens, skip_idx_list)
        return next_tokens
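A minimal driver sketch for the normal path; logits of shape [batch, vocab], a populated SamplingMetadata, and the output shape are assumptions:

    sampler = Sampler()
    sampler.pre_process()     # refresh guided-decoding masks, if any are registered
    next_tokens = sampler(logits, sampling_metadata)  # int64 ids, assumed [batch, 1]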
class SpeculativeSampler(nn.Layer):
    """
    Sampler for speculative generation.
    """

    def __init__(self, fd_config: FDConfig):
        """ read the verify window and candidate length from the config """
        super().__init__()
        if current_platform.is_cuda():
            self.forward = self.forward_cuda
        else:
            raise NotImplementedError()
        self.speculative_verify_window = fd_config.speculative_config.verify_window
        self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len

    def pre_process(self, skip_idx_list: List[int] = []):
        """ pre process before running """
        pass

    def apply_logits_processor(self,
                               ids: int,
                               future: Optional[Any] = None,
                               prefill_tokens: List[int] = []):
        """ apply logits processor to sampler """
        pass

    def forward_cuda(
        self,
        logits: paddle.Tensor,
        sampling_metadata: SamplingMetadata,
        max_model_len: int,
        share_inputs: Dict[str, paddle.Tensor],
    ) -> paddle.Tensor:
        """ penalize draft logits, then verify drafts against top-p candidates """
        from fastdeploy.model_executor.ops.gpu import (speculate_verify,
                                                       top_p_candidates)

        logits = apply_speculative_penalty_multi_scores(
            sampling_metadata.pre_token_ids,
            logits,
            sampling_metadata.repetition_penalties,
            sampling_metadata.frequency_penalties,
            sampling_metadata.presence_penalties,
            sampling_metadata.temperature,
            sampling_metadata.bad_words_token_ids,
            sampling_metadata.step_idx,
            sampling_metadata.min_dec_lens,
            sampling_metadata.eos_token_ids,
            share_inputs["seq_lens_this_time"],
            share_inputs["output_padding_offset"],
            share_inputs["output_cum_offsets"],
            max_model_len,
        )

        probs = F.softmax(logits)

        verify_scores, verify_tokens, actual_candidate_len = top_p_candidates(
            probs,
            sampling_metadata.top_p,
            share_inputs["output_padding_offset"],
            self.speculative_max_candidate_len,
            max_model_len,
        )

        speculate_verify(
            share_inputs["accept_tokens"],
            share_inputs["accept_num"],
            share_inputs["step_idx"],
            share_inputs["stop_flags"],
            share_inputs["seq_lens_encoder"],
            share_inputs["seq_lens_decoder"],
            # both input and output: the last accepted token is written back to
            # position 0 of draft_tokens for the next step
            share_inputs["draft_tokens"],
            share_inputs["seq_lens_this_time"],
            verify_tokens,
            verify_scores,
            share_inputs["max_dec_len"],
            sampling_metadata.eos_token_ids,
            share_inputs["is_block_step"],
            share_inputs["output_cum_offsets"],
            actual_candidate_len,
            share_inputs["actual_draft_token_num"],
            sampling_metadata.top_p,
            max_model_len,
            self.speculative_verify_window,
            True,  # enable_topp
        )

        return None
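The speculate_verify and top_p_candidates kernels are not in this diff. As a much-simplified picture of the acceptance rule — a greedy variant, which is an assumption; the real kernel compares drafts against per-position top-p candidate sets and applies the verify window — the per-request loop is roughly:

    def verify_reference(draft_tokens, target_tokens):
        # keep draft tokens while they match the target model's pick; the first
        # mismatch is replaced by the target token and the rest are discarded
        accepted = []
        for draft, target in zip(draft_tokens, target_tokens):
            accepted.append(target)  # equals the draft token whenever they agree
            if draft != target:
                break
        return accepted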
class MTPSampler(nn.Layer):
    """
    Sampler for the MTP (multi-token prediction) draft model.
    """

    def __init__(self, fd_config: FDConfig):
        """ route forward to the CUDA implementation """
        super().__init__()
        if current_platform.is_cuda():
            self.forward = self.forward_cuda
        else:
            raise NotImplementedError()

    def pre_process(self, skip_idx_list: List[int] = []):
        """ pre process before running """
        pass

    def apply_logits_processor(self,
                               ids: int,
                               future: Optional[Any] = None,
                               prefill_tokens: List[int] = []):
        """ apply logits processor to sampler """
        pass

    def forward_cuda(
        self,
        logits: paddle.Tensor,
        sampling_metadata: SamplingMetadata,
        max_model_len: int,
        share_inputs: Dict[str, paddle.Tensor],
    ) -> paddle.Tensor:
        """ penalize, softmax and top-p sample the draft model's logits """
        logits = apply_speculative_penalty_multi_scores(
            sampling_metadata.pre_token_ids,
            logits,
            sampling_metadata.repetition_penalties,
            sampling_metadata.frequency_penalties,
            sampling_metadata.presence_penalties,
            sampling_metadata.temperature,
            sampling_metadata.bad_words_token_ids,
            sampling_metadata.step_idx,
            sampling_metadata.min_dec_lens,
            sampling_metadata.eos_token_ids,
            share_inputs["seq_lens_this_time"],
            share_inputs["seq_lens_encoder"],
            share_inputs["seq_lens_decoder"],
            max_model_len,
        )
        probs = F.softmax(logits)
        _, next_tokens = top_p_sampling(probs, sampling_metadata.top_p)
        return next_tokens