[Feature][MTP] Support a new speculative decoding method named hybrid MTP with ngram (#3610)

2025-10-04 16:22:57 +08:00 · 2025-08-26 14:29:22 +08:00
parent 0a0d2959b9
commit 52eda7fdb3
20 changed files with 454 additions and 571 deletions
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -252,12 +252,13 @@ class TokenProcessor:

    def _compute_speculative_status(self):
        # TODO(liuzichang): Supplement more statistics
-        interval = 50
+        interval = 10
        if self.speculative_stats_step % interval == 0:
            accept_ratio = 1 - self.total_step * 1.0 / self.number_of_output_tokens
            spec_logger.info(
                f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): {accept_ratio}"
                f" total step: {self.total_step}. total output token num: {self.number_of_output_tokens}"
+                f" avarage accept len: {self.number_of_output_tokens / self.total_step}"
            )

            if self.cfg.speculative_config.method in ["mtp"]: