mirror of https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-21 15:49:31 +08:00
polish code with new pre-commit rule (#2923)
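The diff below is mechanical reformatting of the sampling modules driven by the new pre-commit rule: backslash-continued imports are rewritten as single-line or parenthesized imports, single-quoted Literal[...] values become double-quoted, hanging-indent argument lists are collapsed to one line or re-wrapped one argument per line, docstring whitespace is normalized, and raise NotImplementedError() loses its redundant call parentheses. The hook configuration itself is not part of this diff; the before/after sketch below (hypothetical snippet, black-style formatting assumed) just condenses the pattern repeated in the hunks that follow.

    # Before: line length held with a backslash continuation
    from fastdeploy.model_executor.ops.gpu import \
        get_token_penalty_multi_scores

    # After: parenthesized, one name per line, trailing comma
    from fastdeploy.model_executor.ops.gpu import (
        get_token_penalty_multi_scores,
    )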
@@ -11,6 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""""
+""" "
 sample
 """

@@ -15,7 +15,9 @@
 """
 
 from .apply_penalty_multi_scores import (
-    apply_penalty_multi_scores, apply_speculative_penalty_multi_scores)
+    apply_penalty_multi_scores,
+    apply_speculative_penalty_multi_scores,
+)
 from .top_k_top_p_sampling import top_k_top_p_sampling
 
 __all__ = [

@@ -37,8 +37,8 @@ def apply_penalty_multi_scores(
     apply_penalty_multi_scores
     """
     if current_platform.is_cuda():
-        from fastdeploy.model_executor.ops.gpu import \
-            get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores
+
         logits = get_token_penalty_multi_scores(
             pre_token_ids,
             prompt_ids,
@@ -54,8 +54,8 @@ def apply_penalty_multi_scores(
             eos_token_ids,
         )
     elif current_platform.is_xpu():
-        from fastdeploy.model_executor.ops.xpu import \
-            get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.xpu import get_token_penalty_multi_scores
+
         logits = get_token_penalty_multi_scores(
             pre_token_ids,
             logits,
@@ -69,8 +69,10 @@ def apply_penalty_multi_scores(
             eos_token_ids,
         )
     elif current_platform.is_iluvatar():
-        from fastdeploy.model_executor.ops.iluvatar import \
-            get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.iluvatar import (
+            get_token_penalty_multi_scores,
+        )
+
         logits = get_token_penalty_multi_scores(
             pre_token_ids,
             prompt_ids,
@@ -86,8 +88,8 @@ def apply_penalty_multi_scores(
             eos_token_ids,
         )
     elif current_platform.is_gcu():
-        from fastdeploy.model_executor.ops.gcu import \
-            get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.gcu import get_token_penalty_multi_scores
+
         logits = get_token_penalty_multi_scores(
             pre_token_ids,
             logits,
@@ -101,7 +103,7 @@ def apply_penalty_multi_scores(
             eos_token_ids,
         )
     else:
-        raise NotImplementedError()
+        raise NotImplementedError
 
     return logits
 
@@ -126,8 +128,9 @@ def apply_speculative_penalty_multi_scores(
     apply_speculative_penalty_multi_scores
     """
     if current_platform.is_cuda():
-        from fastdeploy.model_executor.ops.gpu import \
-            speculate_get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.gpu import (
+            speculate_get_token_penalty_multi_scores,
+        )
 
         speculate_get_token_penalty_multi_scores(
             pre_token_ids,
@@ -146,6 +149,6 @@ def apply_speculative_penalty_multi_scores(
             max_len,
         )
     else:
-        raise NotImplementedError()
+        raise NotImplementedError
     # inplace
     return logits
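A side note on the raise NotImplementedError() -> raise NotImplementedError changes in this file: the two spellings are equivalent at runtime, because a bare raise on an exception class instantiates it with no arguments. A minimal sketch demonstrating this:

    try:
        raise NotImplementedError  # class, not an instance
    except NotImplementedError as exc:
        print(type(exc).__name__)  # prints: NotImplementedError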
@@ -22,8 +22,8 @@ from fastdeploy import envs
 from fastdeploy.platforms import current_platform
 
 if current_platform.is_gcu():
-    from fastdeploy.model_executor.ops.gcu import \
-        top_p_sampling as gcu_top_p_sampling
+    from fastdeploy.model_executor.ops.gcu import top_p_sampling as gcu_top_p_sampling
+
 
 def top_k_top_p_sampling(
     x: paddle.Tensor,
@@ -33,8 +33,8 @@ def top_k_top_p_sampling(
     topp_seed: Optional[paddle.Tensor] = None,
     seed: int = -1,
     k: int = 0,
-    mode: Literal['truncated', 'non-truncated'] = "truncated",
-    order: Literal['top_k_first', 'joint'] = "top_k_first",
+    mode: Literal["truncated", "non-truncated"] = "truncated",
+    order: Literal["top_k_first", "joint"] = "top_k_first",
 ) -> tuple[paddle.Tensor, paddle.Tensor]:
     """
     x(Tensor): An input 2-D Tensor with type float32, float16 and bfloat16.
@@ -61,35 +61,33 @@ def top_k_top_p_sampling(
     """
     top_p_class = envs.FD_SAMPLING_CLASS.lower()
     if top_p_class == "air":
-        _, ids = air_top_p_sampling(x,
-                                    top_p,
-                                    threshold,
-                                    topp_seed,
-                                    seed=seed,
-                                    k=k,
-                                    mode=mode)
+        _, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
     elif top_p_class == "rejection":
         ids = rejection_top_p_sampling(x, top_p, top_k, seed, order)
         _ = None
     elif top_p_class == "base_non_truncated":
-        _, ids = paddle.tensor.top_p_sampling(x,
-                                              top_p,
-                                              threshold=threshold,
-                                              topp_seed=topp_seed,
-                                              seed=seed,
-                                              k=k,
-                                              mode="non-truncated")
+        _, ids = paddle.tensor.top_p_sampling(
+            x,
+            top_p,
+            threshold=threshold,
+            topp_seed=topp_seed,
+            seed=seed,
+            k=k,
+            mode="non-truncated",
+        )
     else:
         if current_platform.is_gcu():
             _, ids = gcu_top_p_sampling(x, top_p)
         else:
-            _, ids = paddle.tensor.top_p_sampling(x,
-                                                  top_p,
-                                                  threshold=threshold,
-                                                  topp_seed=topp_seed,
-                                                  seed=seed,
-                                                  k=k,
-                                                  mode="truncated")
+            _, ids = paddle.tensor.top_p_sampling(
+                x,
+                top_p,
+                threshold=threshold,
+                topp_seed=topp_seed,
+                seed=seed,
+                k=k,
+                mode="truncated",
+            )
     return _, ids
 
 
@@ -100,15 +98,15 @@ def air_top_p_sampling(
     topp_seed: Optional[paddle.Tensor] = None,
     seed: int = -1,
     k: int = 0,
-    mode: Literal['truncated', 'non-truncated'] = "truncated",
+    mode: Literal["truncated", "non-truncated"] = "truncated",
 ) -> tuple[paddle.Tensor, paddle.Tensor]:
     """
     air_top_p_sampling
     """
     try:
         from fastdeploy.model_executor.ops.gpu import air_top_p_sampling
-        out, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed, k,
-                                      mode)
+
+        out, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed, k, mode)
     except ImportError:
         raise RuntimeError("Cannot import air_top_p_sampling op.")
     return out, ids
@@ -119,14 +117,16 @@ def rejection_top_p_sampling(
     top_p: paddle.Tensor,
     top_k: paddle.Tensor,
     seed: int = -1,
-    order: Literal['top_k_first', 'joint'] = "top_k_first",
+    order: Literal["top_k_first", "joint"] = "top_k_first",
 ) -> paddle.Tensor:
     """
     rejection_top_p_sampling
     """
     try:
         from fastdeploy.model_executor.ops.gpu import (
-            rejection_top_p_sampling, top_k_renorm_probs)
+            rejection_top_p_sampling,
+            top_k_renorm_probs,
+        )
 
         if paddle.count_nonzero(top_k) == 0:
             ids = rejection_top_p_sampling(
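The hunks above only reformat top_k_top_p_sampling; behavior is unchanged, and the backend is still selected through envs.FD_SAMPLING_CLASS ("air", "rejection", "base_non_truncated", or the default truncated path). A hypothetical call sketch with assumed shapes and dtypes (the sampler below passes sampling_metadata.top_p and sampling_metadata.top_k in the same positions):

    import paddle

    from fastdeploy.model_executor.layers.sample.ops import top_k_top_p_sampling

    probs = paddle.rand([2, 32000])                    # [batch, vocab]; sizes assumed
    top_p = paddle.full([2, 1], 0.8, dtype="float32")  # per-request top-p; shape assumed
    top_k = paddle.full([2, 1], 20, dtype="int64")     # per-request top-k; dtype assumed

    _, next_tokens = top_k_top_p_sampling(probs, top_p, top_k)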
@@ -13,21 +13,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 
 import threading
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Dict, List, Optional
 
 import paddle
-import paddle.nn as nn
 import paddle.nn.functional as F
+from paddle import nn
 
 from fastdeploy.config import FDConfig
-from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \
-    LogitsProcessorBase
+from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
+    LogitsProcessorBase,
+)
 from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.ops import (
-    apply_penalty_multi_scores, apply_speculative_penalty_multi_scores,
-    top_k_top_p_sampling)
+    apply_penalty_multi_scores,
+    apply_speculative_penalty_multi_scores,
+    top_k_top_p_sampling,
+)
 from fastdeploy.platforms import current_platform
 from fastdeploy.worker.output import LogprobsTensors, SamplerOutput
+
@@ -44,11 +48,13 @@ class SamplerProcessor:
         self.executor = ThreadPoolExecutor()
         self.logits_lock = threading.Lock()
 
-    def add_logits_processor(self,
-                             ids: int,
-                             future: Optional[Any] = None,
-                             prefill_tokens: List[int] = []):
-        """ add logits processor to SamplerProcessor """
+    def add_logits_processor(
+        self,
+        ids: int,
+        future: Optional[Any] = None,
+        prefill_tokens: List[int] = [],
+    ):
+        """add logits processor to SamplerProcessor"""
         with self.logits_lock:
             if future is None:
                 if ids in self.logits_processor:
@@ -67,7 +73,7 @@ class SamplerProcessor:
                 self.logits_processor[ids] = [future, prefill_tokens]
 
     def update_vocab_mask(self, skip_idx_list: List[int] = []):
-        """ update vocab mask. (cpu-heavy operation) """
+        """update vocab mask. (cpu-heavy operation)"""
         if len(self.logits_processor) == 0:
             return
 
@@ -102,10 +108,8 @@ class SamplerProcessor:
 
                 processor.fill_token_bitmask(self.token_bitmask, idx)
 
-    def apply_token_mask(self,
-                         logits: paddle.Tensor,
-                         skip_idx_list: List[int] = []):
-        """ apply token mask to logits """
+    def apply_token_mask(self, logits: paddle.Tensor, skip_idx_list: List[int] = []):
+        """apply token mask to logits"""
         if len(self.logits_processor) == 0 or self.token_bitmask is None:
             return logits
 
@@ -121,26 +125,20 @@ class SamplerProcessor:
 
         indices = list(self.logits_processor.keys())
         mask_idx = [i for i in indices if i not in skip_idx_list]
-        return available_processors.apply_token_mask(logits,
-                                                     self.token_bitmask,
-                                                     indices=mask_idx)
+        return available_processors.apply_token_mask(logits, self.token_bitmask, indices=mask_idx)
 
     def _accept_token(self, idx: int, token: int):
-        """ accept token """
+        """accept token"""
         if idx not in self.logits_processor:
-            raise ValueError(
-                f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}"
-            )
+            raise ValueError(f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}")
 
         if self.logits_processor[idx].is_terminated():
             return
 
         self.logits_processor[idx].accept_token(token)
 
-    def update_output_tokens(self,
-                             next_tokens: paddle.Tensor,
-                             skip_idx_list: List[int] = []):
-        """ update output tokens """
+    def update_output_tokens(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []):
+        """update output tokens"""
         if len(self.logits_processor) == 0:
             return
 
@@ -148,14 +146,13 @@ class SamplerProcessor:
         with self.logits_lock:
             for idx in self.logits_processor.keys():
                 token = token_ids[idx][0]
-                if token < 0 or self.logits_processor[
-                        idx] is None or idx in skip_idx_list:
+                if token < 0 or self.logits_processor[idx] is None or idx in skip_idx_list:
                     continue
 
                 self._accept_token(idx, token)
 
     def pre_process(self, skip_idx_list: List[int] = []):
-        """ pre process before running """
+        """pre process before running"""
         # create async operation for guided decoding
         # TODO: support async
         self.update_vocab_mask(skip_idx_list)
@@ -168,31 +165,35 @@ class Sampler(nn.Layer):
     """
 
     def __init__(self):
-        """
-        """
+        """ """
         super().__init__()
-        if current_platform.is_cuda() or current_platform.is_xpu(
-        ) or current_platform.is_iluvatar() or current_platform.is_gcu():
+        if (
+            current_platform.is_cuda()
+            or current_platform.is_xpu()
+            or current_platform.is_iluvatar()
+            or current_platform.is_gcu()
+        ):
             self.forward = self.forward_cuda
         else:
-            raise NotImplementedError()
+            raise NotImplementedError
 
         self.processor = SamplerProcessor()
 
-    def apply_logits_processor(self,
-                               ids: int,
-                               future: Optional[Any] = None,
-                               prefill_tokens: List[int] = []):
-        """ apply logits processor to sampler """
+    def apply_logits_processor(
+        self,
+        ids: int,
+        future: Optional[Any] = None,
+        prefill_tokens: List[int] = [],
+    ):
+        """apply logits processor to sampler"""
         self.processor.add_logits_processor(ids, future, prefill_tokens)
 
     def pre_process(self, skip_idx_list: List[int] = []):
-        """ pre process before running """
+        """pre process before running"""
         self.processor.pre_process(skip_idx_list)
 
     def compute_logprobs(self, logits: paddle.Tensor) -> paddle.Tensor:
-        """
-        """
+        """ """
         return F.log_softmax(logits, axis=-1)
 
     def gather_logprobs(
@@ -226,9 +227,7 @@ class Sampler(nn.Layer):
 
         if num_logprobs >= 1:
             # Find the topK values.
-            topk_logprobs, topk_indices = paddle.topk(logprobs,
-                                                      num_logprobs,
-                                                      axis=-1)
+            topk_logprobs, topk_indices = paddle.topk(logprobs, num_logprobs, axis=-1)
             indices = paddle.concat([token_ids, topk_indices], axis=1)
             top_logprobs = paddle.concat([token_logprobs, topk_logprobs], axis=1)
         else:
@@ -243,8 +242,7 @@ class Sampler(nn.Layer):
         sampling_metadata: SamplingMetadata,
         skip_idx_list: List[int] = [],
     ) -> SamplerOutput:
-        """
-        """
+        """ """
         num_logprobs = sampling_metadata.max_num_logprobs
         if num_logprobs is not None:
             raw_logprobs = self.compute_logprobs(logits)
@@ -270,8 +268,9 @@ class Sampler(nn.Layer):
 
         _, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k)
 
-        logprobs_tensors = None if num_logprobs is None else \
-            self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens)
+        logprobs_tensors = (
+            None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens)
+        )
 
         self.processor.update_output_tokens(next_tokens, skip_idx_list)
 
@@ -291,26 +290,27 @@ class SpeculativeSampler(nn.Layer):
     """
 
     def __init__(self, fd_config: FDConfig):
-        """
-        """
+        """ """
         super().__init__()
         if current_platform.is_cuda():
             self.forward = self.forward_cuda
         else:
-            raise NotImplementedError()
+            raise NotImplementedError
         self.speculative_verify_window = fd_config.speculative_config.verify_window
         self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len
         self.speculative_benchmark_mode = fd_config.speculative_config.benchmark_mode
 
     def pre_process(self, skip_idx_list: List[int] = []):
-        """ pre process before running """
+        """pre process before running"""
         pass
 
-    def apply_logits_processor(self,
-                               ids: int,
-                               future: Optional[Any] = None,
-                               prefill_tokens: List[int] = []):
-        """ apply logits processor to sampler """
+    def apply_logits_processor(
+        self,
+        ids: int,
+        future: Optional[Any] = None,
+        prefill_tokens: List[int] = [],
+    ):
+        """apply logits processor to sampler"""
         pass
 
     def forward_cuda(
@@ -320,11 +320,9 @@ class SpeculativeSampler(nn.Layer):
         max_model_len: int,
         share_inputs: List[paddle.Tensor],
     ) -> paddle.Tensor:
-        """
-        """
+        """ """
 
-        from fastdeploy.model_executor.ops.gpu import (speculate_verify,
-                                                       top_p_candidates)
+        from fastdeploy.model_executor.ops.gpu import speculate_verify, top_p_candidates
 
         logits = apply_speculative_penalty_multi_scores(
             sampling_metadata.pre_token_ids,
@@ -361,7 +359,8 @@ class SpeculativeSampler(nn.Layer):
             share_inputs["seq_lens_encoder"],
             share_inputs["seq_lens_decoder"],
             share_inputs[
-                "draft_tokens"],  # Both input and output, need to write the last 1 token accepted to position 0.
+                "draft_tokens"
+            ],  # Both input and output, need to write the last 1 token accepted to position 0.
             share_inputs["seq_lens_this_time"],
             verify_tokens,
             verify_scores,
@@ -382,27 +381,27 @@ class SpeculativeSampler(nn.Layer):
 
 
 class MTPSampler(nn.Layer):
-    """
-    """
+    """ """
 
     def __init__(self, fd_config: FDConfig):
-        """
-        """
+        """ """
         super().__init__()
         if current_platform.is_cuda():
             self.forward = self.forward_cuda
         else:
-            raise NotImplementedError()
+            raise NotImplementedError
 
     def pre_process(self, skip_idx_list: List[int] = []):
-        """ pre process before running """
+        """pre process before running"""
         pass
 
-    def apply_logits_processor(self,
-                               ids: int,
-                               future: Optional[Any] = None,
-                               prefill_tokens: List[int] = []):
-        """ apply logits processor to sampler """
+    def apply_logits_processor(
+        self,
+        ids: int,
+        future: Optional[Any] = None,
+        prefill_tokens: List[int] = [],
+    ):
+        """apply logits processor to sampler"""
         pass
 
     def forward_cuda(
@@ -412,8 +411,7 @@ class MTPSampler(nn.Layer):
         max_model_len: int,
         share_inputs: List[paddle.Tensor],
     ) -> paddle.Tensor:
-        """
-        """
+        """ """
         logits = apply_speculative_penalty_multi_scores(
             sampling_metadata.pre_token_ids,
             logits,