[Feature] Entropy calculation support (#5692)

* support entropy
* fix bug

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-12-24 13:28:13 +08:00 · 2025-12-23 21:19:47 +08:00
parent d1c6e57341
commit 23d488c488
11 changed files with 369 additions and 0 deletions
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -93,6 +93,11 @@ else:
        speculate_limit_thinking_content_length_v2,
    )

+from fastdeploy.model_executor.entropy_utils import (
+    calculate_logits_entropy,
+    speculate_calculate_logits_entropy,
+)
+from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.output.pooler import PoolerOutput, PoolingSequenceGroupOutput
 from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData
 from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, SamplerOutput
@@ -307,12 +312,14 @@ def post_process_normal(
    sampler_output: SamplerOutput,
    model_output: ModelOutputData,
    share_inputs: Dict[str, paddle.Tensor],
+    sampling_metadata: SamplingMetadata,
    block_size: int = 64,
    save_each_rank: bool = False,
    skip_save_output: bool = False,
    async_output_queue: queue.Queue = None,
    think_end_id: int = -1,
    line_break_id: int = -1,
+    enable_entropy: bool = False,
 ):
    """Post-processing steps after completing a single token generation."""
    if think_end_id > 0:
@@ -371,6 +378,9 @@ def post_process_normal(
            False,
        )

+    if enable_entropy:
+        calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature)
+
    # 2. Update the input buffer of the model
    with paddle.framework._no_check_dy2st_diff():
        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
@@ -436,10 +446,12 @@ def post_process_specualate(
    sampler_output: SamplerOutput,
    model_output: ModelOutputData,
    share_inputs: Dict[str, paddle.Tensor],
+    sampling_metadata: SamplingMetadata,
    save_each_rank: bool = False,
    skip_save_output: bool = False,
    think_end_id: int = -1,
    line_break_id: int = -1,
+    enable_entropy: bool = False,
 ):
    if think_end_id > 0:
        speculate_limit_thinking_content_length(
@@ -464,6 +476,10 @@ def post_process_specualate(
        model_output.eos_token_id,
        model_output.min_tokens,
    )
+
+    if enable_entropy:
+        speculate_calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature)
+
    speculate_update(
        model_output.seq_lens_encoder,
        model_output.seq_lens_decoder,
@@ -525,6 +541,7 @@ def post_process(
    sampler_or_pooler_output: Union[SamplerOutput, PoolerOutput],
    model_output: ModelOutputData,
    share_inputs: Dict[str, paddle.Tensor],
+    sampling_metadata: SamplingMetadata = None,
    block_size: int = 64,
    save_each_rank: bool = False,
    speculative_decoding: bool = False,
@@ -532,6 +549,7 @@ def post_process(
    async_output_queue: queue.Queue = None,
    think_end_id: int = -1,
    line_break_id: int = -1,
+    enable_entropy: bool = False,
 ) -> None:
    """Post-processing steps after completing a single token generation."""

@@ -551,22 +569,26 @@ def post_process(
                sampler_or_pooler_output,
                model_output,
                share_inputs,
+                sampling_metadata,
                save_each_rank,
                skip_save_output,
                think_end_id,
                line_break_id,
+                enable_entropy,
            )
        else:
            post_process_normal(
                sampler_or_pooler_output,
                model_output,
                share_inputs,
+                sampling_metadata,
                block_size,
                save_each_rank,
                skip_save_output,
                async_output_queue,
                think_end_id,
                line_break_id,
+                enable_entropy,
            )