[Feature] Entropy calculation support (#5692)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled

* Support entropy calculation

* fix bug

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
GoldPancake
2025-12-23 21:19:47 +08:00
committed by GitHub
parent d1c6e57341
commit 23d488c488
11 changed files with 369 additions and 0 deletions

View File

@@ -93,6 +93,11 @@ else:
speculate_limit_thinking_content_length_v2,
)
from fastdeploy.model_executor.entropy_utils import (
calculate_logits_entropy,
speculate_calculate_logits_entropy,
)
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.output.pooler import PoolerOutput, PoolingSequenceGroupOutput
from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData
from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, SamplerOutput
@@ -307,12 +312,14 @@ def post_process_normal(
sampler_output: SamplerOutput,
model_output: ModelOutputData,
share_inputs: Dict[str, paddle.Tensor],
sampling_metadata: SamplingMetadata,
block_size: int = 64,
save_each_rank: bool = False,
skip_save_output: bool = False,
async_output_queue: queue.Queue = None,
think_end_id: int = -1,
line_break_id: int = -1,
enable_entropy: bool = False,
):
"""Post-processing steps after completing a single token generation."""
if think_end_id > 0:
@@ -371,6 +378,9 @@ def post_process_normal(
False,
)
if enable_entropy:
calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature)
# 2. Update the input buffer of the model
with paddle.framework._no_check_dy2st_diff():
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
@@ -436,10 +446,12 @@ def post_process_specualate(
sampler_output: SamplerOutput,
model_output: ModelOutputData,
share_inputs: Dict[str, paddle.Tensor],
sampling_metadata: SamplingMetadata,
save_each_rank: bool = False,
skip_save_output: bool = False,
think_end_id: int = -1,
line_break_id: int = -1,
enable_entropy: bool = False,
):
if think_end_id > 0:
speculate_limit_thinking_content_length(
@@ -464,6 +476,10 @@ def post_process_specualate(
model_output.eos_token_id,
model_output.min_tokens,
)
if enable_entropy:
speculate_calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature)
speculate_update(
model_output.seq_lens_encoder,
model_output.seq_lens_decoder,
@@ -525,6 +541,7 @@ def post_process(
sampler_or_pooler_output: Union[SamplerOutput, PoolerOutput],
model_output: ModelOutputData,
share_inputs: Dict[str, paddle.Tensor],
sampling_metadata: SamplingMetadata = None,
block_size: int = 64,
save_each_rank: bool = False,
speculative_decoding: bool = False,
@@ -532,6 +549,7 @@ def post_process(
async_output_queue: queue.Queue = None,
think_end_id: int = -1,
line_break_id: int = -1,
enable_entropy: bool = False,
) -> None:
"""Post-processing steps after completing a single token generation."""
@@ -551,22 +569,26 @@ def post_process(
sampler_or_pooler_output,
model_output,
share_inputs,
sampling_metadata,
save_each_rank,
skip_save_output,
think_end_id,
line_break_id,
enable_entropy,
)
else:
post_process_normal(
sampler_or_pooler_output,
model_output,
share_inputs,
sampling_metadata,
block_size,
save_each_rank,
skip_save_output,
async_output_queue,
think_end_id,
line_break_id,
enable_entropy,
)