diff --git a/docs/features/speculative_decoding.md b/docs/features/speculative_decoding.md
index 4093dcca5..2239c8758 100644
--- a/docs/features/speculative_decoding.md
+++ b/docs/features/speculative_decoding.md
@@ -18,6 +18,13 @@ This project implements an efficient **Speculative Decoding** inference framewor
 - ⏳ Coming Soon: Support Chunk-prefill
 - ⏳ Coming Soon: Multi-layer MTP Layer
 
+- **Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)**
+
+  - Overview: A hybrid method combining MTP and Ngram: MTP first generates N draft tokens, and Ngram matching then supplements them with additional draft tokens.
+
+  - Use Cases: Suitable when higher draft-token coverage is required, leveraging both MTP’s generation capability and the efficiency of Ngram matching.
+
+
 ---
 
 ### Coming Soon
@@ -132,7 +139,13 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --scheduler-password "scheduler_mtp" \
     --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' &
 ```
 
+## Decoding with Hybrid MTP and Ngram Methods
+When starting the service, you only need to modify the `--speculative-config` option.
+For example, to have MTP generate two draft tokens and then append three additional draft tokens from Ngram matching:
+```
+--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
+```
 ## 🧠 Using Ngram-Based Decoding
 
 This method uses an n-gram sliding window to match the prompt and generated tokens to predict draft tokens. It is particularly effective in scenarios with high input-output overlap (e.g., code completion, document search).
diff --git a/docs/zh/features/speculative_decoding.md b/docs/zh/features/speculative_decoding.md
index eb898e873..58b70742c 100644
--- a/docs/zh/features/speculative_decoding.md
+++ b/docs/zh/features/speculative_decoding.md
@@ -14,6 +14,9 @@
 - ⏳ 即将支持:兼容 Chunk Prefill
 - ⏳ 即将支持:多层 MTP layer
 
+- **混合MTP、Ngram方法解码(Hybrid-MTP-with-Ngram)**
+  - 方法概述:混合MTP与Ngram方法,先使用MTP产出N个草稿Token,再使用Ngram匹配补充草稿Token。
+  - 使用场景:适合在需要更多草稿Token时使用,兼顾MTP生成能力与Ngram匹配的高效性。
 ---
 
 ### ⏳ 规划中
@@ -110,7 +113,12 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --scheduler-password "scheduler_mtp" \
     --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": ""${path_to_mtp_model}"}' &
 ```
+## 使用混合MTP、Ngram方法解码
+在启动服务时,只需改动 --speculative-config 即可。例如使用MTP产出两个Draft Token,再额外拼接三个Ngram匹配的Draft Token:
+```
+--speculative-config '{"method": "mtp", "num_model_steps": 2, "mtp_strategy": "with_ngram", "num_speculative_tokens": 5, "model": "'$model_path'/mtp"}'
+```
 ## 🧠 使用 Ngram 解码
 
 该算法通过 n-gram 窗口从 prompt 和已生成的 Token 中进行匹配生成草稿 Token,适合输入和输出有很大 overlap 的场景,如代码续写、文档查询等。
 > 使用 4×H100;量化方式选择 WINT4
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 481dbc13e..ab908a584 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -268,7 +268,11 @@ class MTPProposer(Proposer):
         self.model_inputs["block_tables"] = paddle.clone(self.main_model_inputs["block_tables"])
         self.model_inputs["input_ids"] = paddle.clone(self.main_model_inputs["input_ids"])
         self.seq_lens_this_time_buffer = paddle.clone(self.main_model_inputs["seq_lens_this_time"])
-
+        self.model_inputs["input_ids_cpu"] = paddle.full(
+            shape=[self.max_num_seqs, self.parallel_config.max_model_len],
+            fill_value=-1,
+            dtype="int64",
+        ).cpu()
         self.model_inputs["seq_lens_encoder"] = paddle.clone(self.main_model_inputs["seq_lens_encoder"])
         self.model_inputs["seq_lens_decoder"] = paddle.clone(self.main_model_inputs["seq_lens_decoder"])
         self.model_inputs["step_idx"] = paddle.clone(self.main_model_inputs["step_idx"])
@@ -368,10 +372,17 @@ class MTPProposer(Proposer):
             request = req_dicts[i]
             idx = request.idx
             length = len(request.prompt_token_ids)
-            self.input_ids_len[idx] = length
+            self.input_ids_len[idx] = length - 1
 
             if req_dicts[i].disaggregate_info is not None and req_dicts[i].disaggregate_info["role"] == "decode":
                 length = len(request.prompt_token_ids)
+                if length > 1:
+                    self.model_inputs["input_ids"][idx : idx + 1, : length - 1] = self.target_model_inputs[
+                        "input_ids"
+                    ][idx : idx + 1, 1:length]
+                    self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = np.array(
+                        request.prompt_token_ids
+                    )[1:]
                 self.model_inputs["pre_ids"][idx : idx + 1] = request.prompt_token_ids[-1]
                 prefill_token_num = self.max_draft_token_num + 1
                 self.model_inputs["draft_tokens"][idx : idx + 1, 0:1] = paddle.to_tensor(
@@ -400,6 +411,10 @@ class MTPProposer(Proposer):
                 self.model_inputs["input_ids"][idx : idx + 1, : length - 1] = self.main_model_inputs["input_ids"][
                     idx : idx + 1, 1:length
                 ]
+                self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = np.array(
+                    request.prompt_token_ids
+                )[1:]
+
                 self.model_inputs["pre_ids"][idx : idx + 1] = -1
                 self.model_inputs["step_idx"][idx : idx + 1] = 0
                 if self.cache_config.enable_chunked_prefill:
@@ -688,7 +703,7 @@ class MTPProposer(Proposer):
             seq_lens_this_time = self.main_model_inputs["seq_lens_this_time"].cpu()
             seq_lens_decoder = self.model_inputs["seq_lens_decoder"].cpu()
             hybrid_mtp_ngram(
-                self.model_inputs["input_ids"]._copy_to(device, True),
+                self.model_inputs["input_ids_cpu"],
                 self.input_ids_len,
                 self.model_inputs["pre_ids"]._copy_to(device, True),
                 self.model_inputs["step_idx"].cpu(),
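For readers skimming the patch: the docs above describe the hybrid flow (MTP proposes the first draft tokens, and Ngram matching then tops the list up to `num_speculative_tokens`), while the `mtp.py` changes keep a host-side copy of the token history (`input_ids_cpu`) so that the matching step can run on the CPU. The snippet below is only a conceptual sketch of that top-up step under those assumptions; the function name, signature, and matching policy are invented for illustration and are not FastDeploy's `hybrid_mtp_ngram` implementation.

```python
# Conceptual sketch only (NOT FastDeploy's hybrid_mtp_ngram): top up MTP draft tokens
# with n-gram matches against a CPU copy of the token history.
import numpy as np


def ngram_top_up(history: np.ndarray, mtp_drafts: list, budget: int, max_ngram: int = 4) -> list:
    """Extend `mtp_drafts` up to `budget` tokens by matching a suffix of
    (history + drafts) inside `history` and copying the tokens that followed it."""
    drafts = list(mtp_drafts)
    seq = np.concatenate([history, np.asarray(drafts, dtype=history.dtype)])
    for n in range(min(max_ngram, len(seq)), 0, -1):  # try longer suffixes first
        if len(drafts) >= budget:
            break
        suffix = seq[-n:]
        # scan the history (newest occurrence first) for this n-gram
        for start in range(len(history) - n - 1, -1, -1):
            if np.array_equal(history[start : start + n], suffix):
                follow = history[start + n : start + n + (budget - len(drafts))]
                drafts.extend(int(t) for t in follow)
                break
    return drafts[:budget]


# Toy usage: MTP proposed 2 draft tokens, the budget is 5, so up to 3 more come from matching.
history = np.array([11, 12, 13, 14, 15, 11, 12], dtype=np.int64)
print(ngram_top_up(history, mtp_drafts=[13, 14], budget=5))  # -> [13, 14, 15, 11, 12]
```

Trying longer n-grams first mirrors the usual intuition behind Ngram speculation: a longer suffix match is more likely to predict the tokens that actually follow, while shorter matches serve as a fallback when no long match exists.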