[Feature 2.0.2] support top_k_top_p sampling (#2789)

* support top_k_top_p sampling

* fix

* add api param

* add api param

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* change func name
This commit is contained in:
Sunny-bot1
2025-07-10 12:01:51 +08:00
committed by GitHub
parent 1fe37cb7e8
commit 1107e08cd9
18 changed files with 524 additions and 134 deletions

View File

@@ -13,10 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import argparse
import json
import os
import random
import argparse
import numpy as np
import paddle
@@ -24,6 +24,9 @@ import paddle.distributed.fleet as fleet
from paddleformers.transformers.model_utils import load_tp_checkpoint
from safetensors import safe_open
from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig,
LoadConfig, ModelConfig, MoEConfig, MoEPhase,
ParallelConfig, SpeculativeConfig)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.mm_processor import DataProcessor
from fastdeploy.model_executor.layers.attention import get_attention_backend
@@ -44,9 +47,6 @@ from fastdeploy.platforms import current_platform
from fastdeploy.worker.forward_meta import ForwardMeta
from fastdeploy.worker.utils import check_safetensors_model
from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig,
LoadConfig, ModelConfig, MoEConfig,
MoEPhase, ParallelConfig, SpeculativeConfig)
if current_platform.is_cuda() and current_platform.available():
from fastdeploy.model_executor.layers.utils import (