diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py
index 678819723..34ccf144c 100644
--- a/fastdeploy/cache_manager/cache_transfer_manager.py
+++ b/fastdeploy/cache_manager/cache_transfer_manager.py
@@ -24,7 +24,7 @@ import numpy as np
 import paddle
 
 from fastdeploy.cache_manager.cache_data import CacheStatus
-from fastdeploy.engine.config import SpeculativeConfig
+from fastdeploy.config import SpeculativeConfig
 from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
 from fastdeploy.model_executor.ops.gpu import (
     cuda_host_alloc,
@@ -114,7 +114,7 @@ class CacheTransferManager:
         self.cpu_cache_kvs = {}
         self.gpu_cache_k_tensors = []
         self.gpu_cache_v_tensors = []
-        self.speculative_config = SpeculativeConfig(**args.speculative_config)
+        self.speculative_config = SpeculativeConfig(args.speculative_config)
         self.num_extra_layers = self.speculative_config.num_extra_cache_layer
         self.num_extra_layer_gpu_blocks = int(args.num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio)
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 48a45f41e..09c419e42 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -254,7 +254,7 @@ class SpeculativeConfig:
         # ngram match
         self.max_ngram_size: int = 5
         # model for mtp/eagle/draft_model
-        self.model_name_or_path: Optional[str] = None
+        self.model: Optional[str] = None
         # quantization of model
         self.quantization: Optional[str] = None
         # allocate more blocks to prevent mtp from finishing the block earlier than the main model
@@ -273,21 +273,11 @@ class SpeculativeConfig:
         self.benchmark_mode: bool = False
         self.num_extra_cache_layer = 0
 
-        # TODO(YuanRisheng): The name of the server args is different from the name of the SpeculativeConfig.
-        # We temperately add the name map here and will delete it in future.
-        name_map = {
-            "speculative_method": "method",
-            "speculative_max_draft_token_num": "num_speculative_tokens",
-            "speculative_model_name_or_path": "model_name_or_path",
-            "speculative_model_quantization": "quantization",
-            "speculative_benchmark_mode": "benchmark_mode",
-        }
         for key, value in args.items():
-            if key in name_map.keys() and hasattr(self, name_map[key]):
-                if key == "speculative_benchmark_mode":
-                    value = True if value.lower() == "true" else False
-                setattr(self, name_map[key], value)
+            if hasattr(self, key):
+                setattr(self, key, value)
 
+        self.read_model_config()
         self.reset()
@@ -299,11 +289,11 @@ class SpeculativeConfig:
         if not self.enabled_speculative_decoding():
             return
 
-        self.is_unified_ckpt = check_unified_ckpt(self.model_name_or_path)
-        if self.model_name_or_path is None:
+        self.is_unified_ckpt = check_unified_ckpt(self.model)
+        if self.model is None:
             return
 
-        self.config_path = os.path.join(self.model_name_or_path, "config.json")
+        self.config_path = os.path.join(self.model, "config.json")
         if os.path.exists(self.config_path):
             self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index acea471a8..f59f6d0a5 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -1081,11 +1081,7 @@ class LLMEngine:
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
             f" --quantization {self.cfg.model_config.quantization}"
             f" --ori_vocab_size {ori_vocab_size}"
-            f" --speculative_method {self.cfg.speculative_config.method}"
-            f" --speculative_max_draft_token_num {self.cfg.speculative_config.num_speculative_tokens}"
-            f" --speculative_model_name_or_path {self.cfg.speculative_config.model_name_or_path}"
-            f" --speculative_model_quantization {self.cfg.speculative_config.quantization}"
-            f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
+            f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
             f" --load_strategy {self.cfg.model_config.load_strategy}"
diff --git a/fastdeploy/rl/rollout_config.py b/fastdeploy/rl/rollout_config.py
index 40be4d774..67c4fdd17 100644
--- a/fastdeploy/rl/rollout_config.py
+++ b/fastdeploy/rl/rollout_config.py
@@ -83,10 +83,11 @@ class RolloutModelConfig:
         self.pad_token_id = pad_token_id
         self.eos_tokens_lens = eos_tokens_lens
         self.enable_chunked_prefill = enable_chunked_prefill
-        self.speculative_method = speculative_method
-        self.speculative_max_draft_token_num = speculative_max_draft_token_num
-        self.speculative_model_name_or_path = speculative_model_name_or_path
-        self.speculative_model_quantization = speculative_model_quantization
+        self.speculative_config = {}
+        self.speculative_config["method"] = speculative_method
+        self.speculative_config["num_speculative_tokens"] = speculative_max_draft_token_num
+        self.speculative_config["model"] = speculative_model_name_or_path
+        self.speculative_config["quantization"] = speculative_model_quantization
         self.max_num_batched_tokens = max_num_batched_tokens
         self.enable_prefix_caching = enable_prefix_caching
         self.splitwise_role = splitwise_role
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 9c4b8c9dc..aa67aa857 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -73,7 +73,7 @@ class MTPProposer(Proposer):
         self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM"
= "Ernie4_5_MTPForCausalLM" self.speculative_config.sharing_model = main_model self.model_config.num_hidden_layers = 1 - self.parallel_config.model_name_or_path = self.speculative_config.model_name_or_path + self.parallel_config.model_name_or_path = self.speculative_config.model self.model_config.pretrained_config.prefix_name = "ernie.mtp_block" if self.speculative_config.quantization != "": self.model_config.quantization = self.speculative_config.quantization diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 108a0b8eb..c25de24f3 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -41,7 +41,7 @@ from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import IPCSignal from fastdeploy.model_executor.layers.quantization import get_quantization_config from fastdeploy.platforms import current_platform -from fastdeploy.utils import get_logger, none_or_str +from fastdeploy.utils import get_logger from fastdeploy.worker.worker_base import WorkerBase logger = get_logger("worker_process", "worker_process.log") @@ -476,34 +476,10 @@ def parse_args(): help="enable chunked prefill", ) parser.add_argument( - "--speculative_method", + "--speculative_config", + type=json.loads, default=None, - type=none_or_str, - choices=[ - None, - "ngram", - "mtp", - ], - ) - parser.add_argument( - "--speculative_max_draft_token_num", - default=1, - type=int, - ) - parser.add_argument( - "--speculative_model_name_or_path", - default="", - type=str, - ) - parser.add_argument( - "--speculative_model_quantization", - default="WINT8", - type=str, - ) - parser.add_argument( - "--speculative_benchmark_mode", - default="False", - type=str, + help="Configation of SpeculativeConfig.", ) parser.add_argument( "--max_num_batched_tokens", @@ -607,7 +583,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: model_config = ModelConfig(vars(args)) device_config = DeviceConfig(vars(args)) decoding_config = DecodingConfig(vars(args)) - speculative_config = SpeculativeConfig(vars(args)) + speculative_config = SpeculativeConfig(args.speculative_config) parallel_config = ParallelConfig(vars(args)) cache_config = CacheConfig(vars(args)) parallel_config.tensor_parallel_size = args.tensor_parallel_size