mirror of https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-09-27 04:46:16 +08:00

update hybrid-mtp-with-ngram (#3924)
@@ -18,6 +18,13 @@ This project implements an efficient **Speculative Decoding** inference framework
- ⏳ Coming Soon: Support Chunk-prefill
- ⏳ Coming Soon: Multi-layer MTP Layer

- **Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)**
  - Overview: A hybrid method combining MTP and Ngram. MTP first generates N draft tokens, then Ngram matching supplements additional draft tokens (see the sketch below).
  - Use Cases: Suitable when higher draft-token coverage is required, leveraging both MTP's generation capability and the efficiency of Ngram matching.
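Below is a minimal, framework-agnostic sketch of the hybrid idea described above; the helper names (`ngram_match`, `build_hybrid_draft`) and their parameters are illustrative assumptions, not FastDeploy APIs. MTP supplies the first draft tokens, and suffix n-gram matching over the prompt plus generated tokens tops the draft up to the configured budget.

```python
# Illustrative sketch only: names and parameters are assumptions, not FastDeploy APIs.
from typing import List


def ngram_match(history: List[int], max_ngram: int, max_new: int) -> List[int]:
    """Find the longest suffix of `history` that also occurs earlier, and return
    the tokens that followed that earlier occurrence as extra draft tokens."""
    for n in range(min(max_ngram, len(history) - 1), 0, -1):
        suffix = history[-n:]
        for start in range(len(history) - n - 1, -1, -1):
            if history[start : start + n] == suffix:
                follow = history[start + n : start + n + max_new]
                if follow:
                    return follow
    return []


def build_hybrid_draft(mtp_draft: List[int], history: List[int], num_speculative_tokens: int) -> List[int]:
    """MTP produces the first draft tokens; n-gram matching supplements the rest
    up to the total speculative-token budget."""
    draft = list(mtp_draft[:num_speculative_tokens])
    remaining = num_speculative_tokens - len(draft)
    if remaining > 0:
        draft += ngram_match(history + draft, max_ngram=5, max_new=remaining)
    return draft[:num_speculative_tokens]
```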

---

### Coming Soon
@@ -132,7 +139,13 @@ python -m fastdeploy.entrypoints.openai.api_server \
--scheduler-password "scheduler_mtp" \
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' &
```

## Decoding with Hybrid MTP and Ngram Methods

When starting the service, you only need to modify the `--speculative-config` option.
For example, use MTP to generate two draft tokens, and then append three additional draft tokens from Ngram matching:

```
--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
```
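As a rough sanity check of the split these document-given values imply (illustrative arithmetic only, not a FastDeploy API):

```python
num_model_steps = 2          # draft tokens produced by MTP, per the example above
num_speculative_tokens = 5   # total draft-token budget per round
ngram_supplement = num_speculative_tokens - num_model_steps
print(ngram_supplement)      # 3 additional draft tokens filled in by Ngram matching
```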

## 🧠 Using Ngram-Based Decoding

This method uses an n-gram sliding window to match the prompt and generated tokens to predict draft tokens. It is particularly effective in scenarios with high input-output overlap (e.g., code completion, document search).
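Reusing the illustrative `ngram_match` helper from the sketch earlier in this document, a toy example of why overlap-heavy inputs benefit (the token IDs are made up for the demonstration):

```python
# The generated suffix [44, 55] also occurs inside the prompt, so the matcher
# can propose the tokens that followed it there as draft tokens.
prompt = [11, 22, 33, 44, 55, 66]   # e.g. a tokenized code context
generated = [44, 55]                # output so far repeats part of the prompt
draft = ngram_match(prompt + generated, max_ngram=3, max_new=2)
print(draft)  # -> [66, 44]: the tokens that followed [44, 55] inside the prompt
```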
@@ -14,6 +14,9 @@
- ⏳ Coming Soon: Chunk Prefill compatibility
- ⏳ Coming Soon: Multi-layer MTP layers

- **Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)**
  - Overview: A hybrid of MTP and Ngram; MTP first produces N draft tokens, then Ngram matching supplements additional draft tokens.
  - Use Cases: Suitable when more draft tokens are needed, combining MTP's generation capability with the efficiency of Ngram matching.

---

### ⏳ Planned
@@ -110,7 +113,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
--scheduler-password "scheduler_mtp" \
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' &
```

## Decoding with Hybrid MTP and Ngram Methods

When starting the service, you only need to change the `--speculative-config` option. For example, use MTP to produce two draft tokens, then append three additional draft tokens from Ngram matching:

```
--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
```

## 🧠 Ngram-Based Decoding

This algorithm matches against the prompt and previously generated tokens through an n-gram window to produce draft tokens. It works well when the input and output overlap heavily, e.g., code completion and document queries.

> Using 4×H100; quantization: WINT4
@@ -268,7 +268,11 @@ class MTPProposer(Proposer):
        self.model_inputs["block_tables"] = paddle.clone(self.main_model_inputs["block_tables"])
        self.model_inputs["input_ids"] = paddle.clone(self.main_model_inputs["input_ids"])
        self.seq_lens_this_time_buffer = paddle.clone(self.main_model_inputs["seq_lens_this_time"])

        # CPU-side copy of the input ids, later consumed by the hybrid MTP + Ngram draft matching.
        self.model_inputs["input_ids_cpu"] = paddle.full(
            shape=[self.max_num_seqs, self.parallel_config.max_model_len],
            fill_value=-1,
            dtype="int64",
        ).cpu()
        self.model_inputs["seq_lens_encoder"] = paddle.clone(self.main_model_inputs["seq_lens_encoder"])
        self.model_inputs["seq_lens_decoder"] = paddle.clone(self.main_model_inputs["seq_lens_decoder"])
        self.model_inputs["step_idx"] = paddle.clone(self.main_model_inputs["step_idx"])
@@ -368,10 +372,17 @@ class MTPProposer(Proposer):
            request = req_dicts[i]
            idx = request.idx
            length = len(request.prompt_token_ids)
            self.input_ids_len[idx] = length
            self.input_ids_len[idx] = length - 1

            if req_dicts[i].disaggregate_info is not None and req_dicts[i].disaggregate_info["role"] == "decode":
                length = len(request.prompt_token_ids)
                if length > 1:
                    self.model_inputs["input_ids"][idx : idx + 1, : length - 1] = self.target_model_inputs[
                        "input_ids"
                    ][idx : idx + 1, 1:length]
                    self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = np.array(
                        request.prompt_token_ids
                    )[1:]
                self.model_inputs["pre_ids"][idx : idx + 1] = request.prompt_token_ids[-1]
                prefill_token_num = self.max_draft_token_num + 1
                self.model_inputs["draft_tokens"][idx : idx + 1, 0:1] = paddle.to_tensor(
@@ -400,6 +411,10 @@ class MTPProposer(Proposer):
                    self.model_inputs["input_ids"][idx : idx + 1, : length - 1] = self.main_model_inputs["input_ids"][
                        idx : idx + 1, 1:length
                    ]
                    self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = np.array(
                        request.prompt_token_ids
                    )[1:]

                self.model_inputs["pre_ids"][idx : idx + 1] = -1
                self.model_inputs["step_idx"][idx : idx + 1] = 0
                if self.cache_config.enable_chunked_prefill:
@@ -688,7 +703,7 @@ class MTPProposer(Proposer):
            seq_lens_this_time = self.main_model_inputs["seq_lens_this_time"].cpu()
            seq_lens_decoder = self.model_inputs["seq_lens_decoder"].cpu()
            hybrid_mtp_ngram(
                self.model_inputs["input_ids"]._copy_to(device, True),
                self.model_inputs["input_ids_cpu"],
                self.input_ids_len,
                self.model_inputs["pre_ids"]._copy_to(device, True),
                self.model_inputs["step_idx"].cpu(),