Optimizing the performance of think length limit using custom operators (#4279)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled

* delete impl

* delete min_length&max_length

* support limit thinking content strategy

* fix

* fix

* fix

* update

* fix set_value_by_flags_and_idx

* fix

* fix

* fix

* fix

* update

* fix

* fix

* fix typo

* fix ci

* fix

* fix

* support mtp

* fix

* fix

* update

* update
This commit is contained in:
Yuanle Liu
2025-10-20 21:09:13 +08:00
committed by GitHub
parent 36af88ff3f
commit cef3164c3b
31 changed files with 747 additions and 1032 deletions

View File

@@ -28,7 +28,6 @@ from paddle.distributed import fleet
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
DecodingConfig,
DeviceConfig,
EarlyStopConfig,
ErnieArchitectures,
@@ -41,7 +40,6 @@ from fastdeploy.config import (
SpeculativeConfig,
StructuredOutputsConfig,
)
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus
from fastdeploy.model_executor.layers.quantization import parse_quant_config
@@ -117,25 +115,9 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]:
def update_fd_config_for_mm(fd_config: FDConfig) -> None:
architectures = fd_config.model_config.architectures
if fd_config.model_config.enable_mm and ErnieArchitectures.contains_ernie_arch(architectures):
tokenizer = Ernie4_5Tokenizer.from_pretrained(
fd_config.model_config.model,
model_max_length=fd_config.model_config.max_model_len,
padding_side="right",
use_fast=False,
)
tokenizer.ignored_index = -100
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.unk_token
fd_config.model_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size
fd_config.model_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank
vision_config = fd_config.model_config.vision_config
vision_config.dtype = fd_config.model_config.dtype
# vision_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size
# vision_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank
fd_config.model_config.im_patch_id = tokenizer.get_vocab()["<|IMAGE_PLACEHOLDER|>"]
fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"]
fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel
fd_config.model_config.vision_config.dtype = fd_config.model_config.dtype
class PaddleDisWorkerProc:
@@ -577,6 +559,8 @@ def parse_args():
)
parser.add_argument("--ori_vocab_size", type=int, default=None)
parser.add_argument("--think_end_id", type=int, default=-1)
parser.add_argument("--image_patch_id", type=int, default=-1)
parser.add_argument("--line_break_id", type=int, default=-1)
parser.add_argument(
"--quantization",
@@ -707,7 +691,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
paddle.set_default_dtype(args.dtype)
model_config = ModelConfig(vars(args))
device_config = DeviceConfig(vars(args))
decoding_config = DecodingConfig(vars(args))
speculative_config = SpeculativeConfig(args.speculative_config)
parallel_config = ParallelConfig(vars(args))
cache_config = CacheConfig(vars(args))
@@ -808,7 +791,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
speculative_config=speculative_config,
device_config=device_config,
load_config=load_config,
decoding_config=decoding_config,
quant_config=quant_config,
graph_opt_config=graph_opt_config,
early_stop_config=early_stop_config,