[Feature] Add speculative decoding simulation benchmark. (#2751)

* Add speculative decoding simulation benchmark
* Fix the name of the parameter
2025-10-24 00:53:22 +08:00 · 2025-07-09 12:08:43 +08:00
parent 6b10c19482
commit f7cad30a38
8 changed files with 246 additions and 7 deletions
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -337,6 +337,7 @@ class SpeculativeConfig:
        model_name_or_path (Optional[str]): Path of the model.
        quantization (str): Quantization method for draft model, default is WINT8.
        max_model_len: Optional[int]: Maximum model length for draft model.
+        benchmark_mode (bool): Whether to enable benchmark mode for speculative decoding, default is False.
    """

    def __init__(self,
@@ -345,12 +346,14 @@ class SpeculativeConfig:
                 model: Optional[str] = None,
                 quantization: Optional[str] = "WINT8",
                 max_model_len: Optional[int] = None,
+                 benchmark_mode: bool = False,
                 **kwargs):
        self.model_name_or_path = model
        self.method = method
        self.num_speculative_tokens = num_speculative_tokens
        self.quantization = quantization
        self.max_model_len = max_model_len
+        self.benchmark_mode = benchmark_mode
        # Fixed now
        self.num_gpu_block_expand_ratio = 1
        self.num_extra_cache_layer = 0
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -1030,6 +1030,7 @@ class LLMEngine(object):
            f" --speculative_max_draft_token_num {self.cfg.speculative_config.num_speculative_tokens}"
            f" --speculative_model_name_or_path {self.cfg.speculative_config.model_name_or_path}"
            f" --speculative_model_quantization {self.cfg.speculative_config.quantization}"
+            f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
            f" --max_capture_batch_size {self.cfg.max_capture_batch_size}"
            f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
            f" --load_strategy {self.cfg.model_config.load_strategy}")