Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[Feature] Entropy calculation support (#5692)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
* support entropy
* fix bug

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
@@ -93,6 +93,11 @@ else:
|
||||
speculate_limit_thinking_content_length_v2,
|
||||
)
|
||||
|
||||
from fastdeploy.model_executor.entropy_utils import (
|
||||
calculate_logits_entropy,
|
||||
speculate_calculate_logits_entropy,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
|
||||
from fastdeploy.output.pooler import PoolerOutput, PoolingSequenceGroupOutput
|
||||
from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData
|
||||
from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, SamplerOutput
|
||||
@@ -307,12 +312,14 @@ def post_process_normal(
|
||||
sampler_output: SamplerOutput,
|
||||
model_output: ModelOutputData,
|
||||
share_inputs: Dict[str, paddle.Tensor],
|
||||
sampling_metadata: SamplingMetadata,
|
||||
block_size: int = 64,
|
||||
save_each_rank: bool = False,
|
||||
skip_save_output: bool = False,
|
||||
async_output_queue: queue.Queue = None,
|
||||
think_end_id: int = -1,
|
||||
line_break_id: int = -1,
|
||||
enable_entropy: bool = False,
|
||||
):
|
||||
"""Post-processing steps after completing a single token generation."""
|
||||
if think_end_id > 0:
|
||||
@@ -371,6 +378,9 @@ def post_process_normal(
|
||||
False,
|
||||
)
|
||||
|
||||
if enable_entropy:
|
||||
calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature)
|
||||
|
||||
# 2. Update the input buffer of the model
|
||||
with paddle.framework._no_check_dy2st_diff():
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
@@ -436,10 +446,12 @@ def post_process_specualate(
|
||||
sampler_output: SamplerOutput,
|
||||
model_output: ModelOutputData,
|
||||
share_inputs: Dict[str, paddle.Tensor],
|
||||
sampling_metadata: SamplingMetadata,
|
||||
save_each_rank: bool = False,
|
||||
skip_save_output: bool = False,
|
||||
think_end_id: int = -1,
|
||||
line_break_id: int = -1,
|
||||
enable_entropy: bool = False,
|
||||
):
|
||||
if think_end_id > 0:
|
||||
speculate_limit_thinking_content_length(
|
||||
@@ -464,6 +476,10 @@ def post_process_specualate(
|
||||
model_output.eos_token_id,
|
||||
model_output.min_tokens,
|
||||
)
|
||||
|
||||
if enable_entropy:
|
||||
speculate_calculate_logits_entropy(sampler_output.logits, share_inputs, sampling_metadata.temperature)
|
||||
|
||||
speculate_update(
|
||||
model_output.seq_lens_encoder,
|
||||
model_output.seq_lens_decoder,
|
||||
@@ -525,6 +541,7 @@ def post_process(
|
||||
sampler_or_pooler_output: Union[SamplerOutput, PoolerOutput],
|
||||
model_output: ModelOutputData,
|
||||
share_inputs: Dict[str, paddle.Tensor],
|
||||
sampling_metadata: SamplingMetadata = None,
|
||||
block_size: int = 64,
|
||||
save_each_rank: bool = False,
|
||||
speculative_decoding: bool = False,
|
||||
@@ -532,6 +549,7 @@ def post_process(
|
||||
async_output_queue: queue.Queue = None,
|
||||
think_end_id: int = -1,
|
||||
line_break_id: int = -1,
|
||||
enable_entropy: bool = False,
|
||||
) -> None:
|
||||
"""Post-processing steps after completing a single token generation."""
|
||||
|
||||
@@ -551,22 +569,26 @@ def post_process(
|
||||
sampler_or_pooler_output,
|
||||
model_output,
|
||||
share_inputs,
|
||||
sampling_metadata,
|
||||
save_each_rank,
|
||||
skip_save_output,
|
||||
think_end_id,
|
||||
line_break_id,
|
||||
enable_entropy,
|
||||
)
|
||||
else:
|
||||
post_process_normal(
|
||||
sampler_or_pooler_output,
|
||||
model_output,
|
||||
share_inputs,
|
||||
sampling_metadata,
|
||||
block_size,
|
||||
save_each_rank,
|
||||
skip_save_output,
|
||||
async_output_queue,
|
||||
think_end_id,
|
||||
line_break_id,
|
||||
enable_entropy,
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user