Files
FastDeploy/zh/features/speculative_decoding/index.html

2924 lines
51 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!doctype html>
<html lang="zh" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../load_balance/">
<link rel="next" href="../structured_outputs/">
<link rel="alternate" href="../../../features/speculative_decoding/" hreflang="en">
<link rel="alternate" href="./" hreflang="zh">
<link rel="icon" href="../../../assets/images/favicon.ico">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.1">
<title>投机解码 - 飞桨大语言模型推理部署工具包</title>
<link rel="stylesheet" href="../../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="stylesheet" href="../../../assets/stylesheets/palette.ab4e12ef.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#_1" class="md-skip">
跳转至
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="页眉">
<a href="../../" title="飞桨大语言模型推理部署工具包" class="md-header__button md-logo" aria-label="飞桨大语言模型推理部署工具包" data-md-component="logo">
<img src="../../../assets/images/logo.jpg" alt="logo">
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
飞桨大语言模型推理部署工具包
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
投机解码
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to system preference" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<div class="md-header__option">
<div class="md-select">
<button class="md-header__button md-icon" aria-label="选择当前语言">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m12.87 15.07-2.54-2.51.03-.03A17.5 17.5 0 0 0 14.07 6H17V4h-7V2H8v2H1v2h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2zm-2.62 7 1.62-4.33L19.12 17z"/></svg>
</button>
<div class="md-select__inner">
<ul class="md-select__list">
<li class="md-select__item">
<a href="../../../features/speculative_decoding/" hreflang="en" class="md-select__link">
English
</a>
</li>
<li class="md-select__item">
<a href="./" hreflang="zh" class="md-select__link">
简体中文
</a>
</li>
</ul>
</div>
</div>
</div>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="搜索" placeholder="搜索" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="查找">
<button type="reset" class="md-search__icon md-icon" title="清空当前内容" aria-label="清空当前内容" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
正在初始化搜索引擎
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="前往仓库" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="导航栏" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../../" title="飞桨大语言模型推理部署工具包" class="md-nav__button md-logo" aria-label="飞桨大语言模型推理部署工具包" data-md-component="logo">
<img src="../../../assets/images/logo.jpg" alt="logo">
</a>
飞桨大语言模型推理部署工具包
</label>
<div class="md-nav__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="前往仓库" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.1.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../" class="md-nav__link">
<span class="md-ellipsis">
FastDeploy
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
快速入门
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
快速入门
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2_1" >
<label class="md-nav__link" for="__nav_2_1" id="__nav_2_1_label" tabindex="0">
<span class="md-ellipsis">
安装
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_2_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2_1">
<span class="md-nav__icon md-icon"></span>
安装
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../get_started/installation/nvidia_gpu/" class="md-nav__link">
<span class="md-ellipsis">
英伟达 GPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/kunlunxin_xpu/" class="md-nav__link">
<span class="md-ellipsis">
昆仑芯 XPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/intel_gaudi/" class="md-nav__link">
<span class="md-ellipsis">
英特尔 Gaudi
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/hygon_dcu/" class="md-nav__link">
<span class="md-ellipsis">
海光 DCU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/Enflame_gcu/" class="md-nav__link">
<span class="md-ellipsis">
燧原 S60
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/iluvatar_gpu/" class="md-nav__link">
<span class="md-ellipsis">
天数 CoreX
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/metax_gpu/" class="md-nav__link">
<span class="md-ellipsis">
沐曦 C550
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-0.3B快速部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start_vl/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-28B-A3B快速部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/ernie-4.5/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/ernie-4.5-vl/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start_qwen/" class="md-nav__link">
<span class="md-ellipsis">
Qwen3-0.6b快速部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start_qwen25_vl/" class="md-nav__link">
<span class="md-ellipsis">
Qwen2.5-VL系列快速部署
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
在线服务
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
在线服务
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../online_serving/" class="md-nav__link">
<span class="md-ellipsis">
兼容 OpenAI 协议的服务化部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/metrics/" class="md-nav__link">
<span class="md-ellipsis">
监控Metrics
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/scheduler/" class="md-nav__link">
<span class="md-ellipsis">
调度器
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/graceful_shutdown_service/" class="md-nav__link">
<span class="md-ellipsis">
服务优雅关闭
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../offline_inference/" class="md-nav__link">
<span class="md-ellipsis">
离线推理
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
Best Practices
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Best Practices
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-0.3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-0.3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-21B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-300B-A47B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-21B-A3B-Thinking/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B-Thinking
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-VL-28B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-28B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-VL-424B-A47B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/PaddleOCR-VL-0.9B/" class="md-nav__link">
<span class="md-ellipsis">
PaddleOCR-VL-0.9B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/FAQ/" class="md-nav__link">
<span class="md-ellipsis">
常见问题
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-ellipsis">
量化
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
量化
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../quantization/" class="md-nav__link">
<span class="md-ellipsis">
概述
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../quantization/online_quantization/" class="md-nav__link">
<span class="md-ellipsis">
在线量化
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../quantization/wint2/" class="md-nav__link">
<span class="md-ellipsis">
WINT2量化
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" checked>
<label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">
<span class="md-ellipsis">
特性
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_7">
<span class="md-nav__icon md-icon"></span>
特性
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../prefix_caching/" class="md-nav__link">
<span class="md-ellipsis">
前缀缓存
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../disaggregated/" class="md-nav__link">
<span class="md-ellipsis">
分离式部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunked_prefill/" class="md-nav__link">
<span class="md-ellipsis">
分块预填充
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../load_balance/" class="md-nav__link">
<span class="md-ellipsis">
负载均衡
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
投机解码
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
投机解码
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="目录">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
目录
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#_2" class="md-nav__link">
<span class="md-ellipsis">
✅ 投机解码方法支持
</span>
</a>
<nav class="md-nav" aria-label="✅ 投机解码方法支持">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_3" class="md-nav__link">
<span class="md-ellipsis">
✅ 支持列表
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_4" class="md-nav__link">
<span class="md-ellipsis">
⏳ 规划中
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_5" class="md-nav__link">
<span class="md-ellipsis">
⚙️ 高效投机解码框架设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_6" class="md-nav__link">
<span class="md-ellipsis">
🔧 参数说明
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#multi-token-predictionmtp" class="md-nav__link">
<span class="md-ellipsis">
🚀 使用 Multi-Token-Prediction(MTP) 解码
</span>
</a>
<nav class="md-nav" aria-label="🚀 使用 Multi-Token-Prediction(MTP) 解码">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#tp" class="md-nav__link">
<span class="md-ellipsis">
TP 并行部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#pd-1p1d" class="md-nav__link">
<span class="md-ellipsis">
PD 分离式部署1P1D
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#mtpngram" class="md-nav__link">
<span class="md-ellipsis">
使用混合MTP、Ngram方法解码
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ngram" class="md-nav__link">
<span class="md-ellipsis">
🧠 使用 Ngram 解码
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../structured_outputs/" class="md-nav__link">
<span class="md-ellipsis">
结构化输出
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../reasoning_output/" class="md-nav__link">
<span class="md-ellipsis">
思考链内容
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../early_stop/" class="md-nav__link">
<span class="md-ellipsis">
早停功能
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../plugins/" class="md-nav__link">
<span class="md-ellipsis">
插件机制
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../sampling/" class="md-nav__link">
<span class="md-ellipsis">
采样策略
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../multi-node_deployment/" class="md-nav__link">
<span class="md-ellipsis">
多机部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../graph_optimization/" class="md-nav__link">
<span class="md-ellipsis">
图优化
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../data_parallel_service/" class="md-nav__link">
<span class="md-ellipsis">
数据并行
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../plas_attention/" class="md-nav__link">
<span class="md-ellipsis">
PLAS
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../supported_models/" class="md-nav__link">
<span class="md-ellipsis">
支持模型列表
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../benchmark/" class="md-nav__link">
<span class="md-ellipsis">
基准测试
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
<label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
<span class="md-ellipsis">
用法
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_10">
<span class="md-nav__icon md-icon"></span>
用法
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../usage/log/" class="md-nav__link">
<span class="md-ellipsis">
日志说明
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../usage/code_overview/" class="md-nav__link">
<span class="md-ellipsis">
代码概述
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../usage/environment_variables/" class="md-nav__link">
<span class="md-ellipsis">
环境变量
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_11" >
<label class="md-nav__link" for="__nav_11" id="__nav_11_label" tabindex="0">
<span class="md-ellipsis">
CLI 使用说明
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_11_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_11">
<span class="md-nav__icon md-icon"></span>
CLI 使用说明
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../cli/" class="md-nav__link">
<span class="md-ellipsis">
概述
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../cli/chat/" class="md-nav__link">
<span class="md-ellipsis">
Chat命令
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../cli/complete/" class="md-nav__link">
<span class="md-ellipsis">
Complete命令
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../cli/serve/" class="md-nav__link">
<span class="md-ellipsis">
Server命令
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../cli/collect-env/" class="md-nav__link">
<span class="md-ellipsis">
Collect Env命令
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../cli/bench/" class="md-nav__link">
<span class="md-ellipsis">
Bench命令
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../cli/run-batch/" class="md-nav__link">
<span class="md-ellipsis">
Run Batch命令
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../cli/tokenizer/" class="md-nav__link">
<span class="md-ellipsis">
Tokenizer命令
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_12" >
<label class="md-nav__link" for="__nav_12" id="__nav_12_label" tabindex="0">
<span class="md-ellipsis">
可观测性
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_12_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_12">
<span class="md-nav__icon md-icon"></span>
可观测性
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../observability/trace/" class="md-nav__link">
<span class="md-ellipsis">
Trace服务
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="目录">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
目录
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#_2" class="md-nav__link">
<span class="md-ellipsis">
✅ 投机解码方法支持
</span>
</a>
<nav class="md-nav" aria-label="✅ 投机解码方法支持">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#_3" class="md-nav__link">
<span class="md-ellipsis">
✅ 支持列表
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_4" class="md-nav__link">
<span class="md-ellipsis">
⏳ 规划中
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#_5" class="md-nav__link">
<span class="md-ellipsis">
⚙️ 高效投机解码框架设计
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#_6" class="md-nav__link">
<span class="md-ellipsis">
🔧 参数说明
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#multi-token-predictionmtp" class="md-nav__link">
<span class="md-ellipsis">
🚀 使用 Multi-Token-Prediction(MTP) 解码
</span>
</a>
<nav class="md-nav" aria-label="🚀 使用 Multi-Token-Prediction(MTP) 解码">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#tp" class="md-nav__link">
<span class="md-ellipsis">
TP 并行部署
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#pd-1p1d" class="md-nav__link">
<span class="md-ellipsis">
PD 分离式部署1P1D
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#mtpngram" class="md-nav__link">
<span class="md-ellipsis">
使用混合MTP、Ngram方法解码
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ngram" class="md-nav__link">
<span class="md-ellipsis">
🧠 使用 Ngram 解码
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<p><a href="./">English</a></p>
<h1 id="_1">🔮 投机解码</h1>
<p>本项目基于 PaddlePaddle 实现了高效的 <strong>投机解码Speculative Decoding</strong> 推理框架,支持多 Token 预测Multi-token Proposing, MTP用于加速大语言模型LLM的生成显著降低时延并提升吞吐量。</p>
<h2 id="_2">✅ 投机解码方法支持</h2>
<h3 id="_3">✅ 支持列表</h3>
<ul>
<li>
<p><strong>Ngram</strong></p>
</li>
<li>
<p><strong>MTP (Multi-Token Prediction)</strong></p>
</li>
<li>✅ 已支持TP 切分</li>
<li>✅ 已支持:共享前缀</li>
<li>✅ 已支持:单机 TP 切分 + PD 分离</li>
<li>⏳ 即将支持EP + DP + PD 分离</li>
<li>⏳ 即将支持:兼容 Chunk Prefill</li>
<li>
<p>⏳ 即将支持:多层 MTP layer</p>
</li>
<li>
<p><strong>混合MTP、Ngram方法解码(Hybrid-MTP-with-Ngram)</strong></p>
</li>
<li>方法概述混合MTP与Ngram方法先使用MTP产出N个草稿Token再使用Ngram匹配补充草稿Token。</li>
<li>使用场景适合在需要更多草稿Token时使用兼顾MTP生成能力与Ngram匹配的高效性。</li>
</ul>
<hr />
<h3 id="_4">⏳ 规划中</h3>
<ul>
<li>Draft Model</li>
<li>Eagle</li>
<li>Hydra</li>
<li>Medusa</li>
<li>...</li>
</ul>
<h2 id="_5">⚙️ 高效投机解码框架设计</h2>
<ul>
<li><strong>Attention机制</strong>:采用 <a href="https://flashinfer.ai/2024/02/02/cascade-inference.html">Cascade Append Attention</a> 的 Attention 机制,支持变长查询统一处理,一次前向推理即可完成所有验证。此外,我们对 Kernel 实现进行了深度定制,以最大化 Tensor Core 的利用率,并在高并发场景下仍然保持高吞吐。</li>
<li><strong>虚拟填充机制</strong>:采用虚拟填充快速定位输出 Token 的批次 ID避免了高开销的数据拷贝与切片操作。</li>
<li><strong>并行采样与验证</strong>:我们开发了多个融合 Cuda Kernel用于同时执行采样与验证操作。该 Kernel 支持对每个 batch 样本进行并行处理,避免了显式循环的开销。</li>
<li><strong>高效 DraftModel/MTP 框架</strong>:开发多个融合 Cuda Kernel统一完成模型类方法的前后处理相比传统的循环、切片方法性能高效且易维护</li>
</ul>
<h2 id="_6">🔧 参数说明</h2>
<ul>
<li><code>method</code>: 解码策略,可选值为 <code>"mtp"</code><code>"ngram"</code></li>
<li><code>num_speculative_tokens</code>: 每轮预测的 Token 数,最大支持 5当前 MTP 仅支持 1</li>
<li><code>model</code>: 若选择 MTP则需指定 MTP 模型路径</li>
<li><code>quantization</code>: 模型量化方式,推荐使用 <code>wint8</code></li>
<li><code>batch_size</code>: 当前支持最大值为 256</li>
</ul>
<h2 id="multi-token-predictionmtp">🚀 使用 Multi-Token-Prediction(MTP) 解码</h2>
<p>详见论文:<a href="https://arxiv.org/pdf/2412.19437">DeepSeek-V3</a></p>
<h3 id="tp">TP 并行部署</h3>
<blockquote>
<p>使用 4×H100量化方式选择 WINT4
配置文件:<code>benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml</code></p>
</blockquote>
<pre><code>python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--tensor-parallel-size 4 \
--config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \
--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;${path_to_mtp_model}&quot;}'
</code></pre>
<h3 id="pd-1p1d">PD 分离式部署1P1D</h3>
<blockquote>
<p>在8×H100上部署1P1DP、D节点 分别使用 4×H100量化方式选择 WINT4
与常规 PD 分离部署一致,仅需替换配置文件并新增 speculative_config
详情请参考<a href="../disaggregated/">PD分离式部署</a>
- P 节点Prefill</p>
<p>配置文件: <code>benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-prefill.yaml</code></p>
</blockquote>
<pre><code>export FD_LOG_DIR=&quot;log_prefill&quot;
rm -rf ${FD_LOG_DIR}
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--port 8180 \
--metrics-port 8181 \
--engine-worker-queue-port 8182 \
--cache-queue-port 8183 \
--workers 2 \
--tensor-parallel-size 4 \
--quantization wint4 \
--splitwise-role &quot;prefill&quot; \
--scheduler-name &quot;splitwise&quot; \
--scheduler-host &quot;127.0.0.1&quot; \
--scheduler-port 6379 \
--scheduler-ttl 9000 \
--scheduler-topic mtp \
--config ${path_to_FastDeploy}/benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-prefill.yaml \
--scheduler-password &quot;scheduler_mtp&quot; \
--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;&quot;${path_to_mtp_model}&quot;}' &amp;
</code></pre>
<ul>
<li>D 节点Decode</li>
</ul>
<blockquote>
<p>配置文件: <code>benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-decode.yaml</code></p>
</blockquote>
<pre><code>export FD_LOG_DIR=&quot;log_prefill&quot;
rm -rf ${FD_LOG_DIR}
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--port 8180 \
--metrics-port 8181 \
--engine-worker-queue-port 8182 \
--cache-queue-port 8183 \
--workers 2 \
--tensor-parallel-size 4 \
--quantization wint4 \
--splitwise-role &quot;prefill&quot; \
--scheduler-name &quot;splitwise&quot; \
--scheduler-host &quot;127.0.0.1&quot; \
--scheduler-port 6379 \
--scheduler-ttl 9000 \
--scheduler-topic mtp \
--config ${path_to_FastDeploy}/benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-prefill.yaml \
--scheduler-password &quot;scheduler_mtp&quot; \
--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;&quot;${path_to_mtp_model}&quot;}' &amp;
</code></pre>
<h2 id="mtpngram">使用混合MTP、Ngram方法解码</h2>
<p>在启动服务时,只需改动 --speculative-config 即可。例如使用MTP产出两个DraftToken再额外拼接三个Ngram匹配的DraftToken</p>
<pre><code>--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_model_steps&quot;: 2, &quot;mtp_strategy&quot;: &quot;with_ngram&quot; ,&quot;num_speculative_tokens&quot;: 5, &quot;model&quot;: &quot;'$model_path'/mtp&quot;}'
</code></pre>
<h2 id="ngram">🧠 使用 Ngram 解码</h2>
<p>该算法通过 n-gram 窗口从 prompt 和已生成的 Token 中进行匹配生成草稿 Token适合输入和输出有很大 overlap 的场景,如代码续写、文档查询等。</p>
<blockquote>
<p>使用 4×H100量化方式选择 WINT4
配置文件benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml</p>
</blockquote>
<pre><code>python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--tensor-parallel-size 4 \
--config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \
--speculative-config '{&quot;method&quot;: &quot;ngram&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;${mtp_model_path}&quot;}'
</code></pre>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
<div class="md-copyright__highlight">
Copyright &copy; 2025 Maintained by FastDeploy
</div>
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../../..", "features": [], "search": "../../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "\u5df2\u590d\u5236", "clipboard.copy": "\u590d\u5236", "search.result.more.one": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.more.other": "\u5728\u8be5\u9875\u4e0a\u8fd8\u6709 # \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.none": "\u6ca1\u6709\u627e\u5230\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.one": "\u627e\u5230 1 \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.other": "# \u4e2a\u7b26\u5408\u6761\u4ef6\u7684\u7ed3\u679c", "search.result.placeholder": "\u952e\u5165\u4ee5\u5f00\u59cb\u641c\u7d22", "search.result.term.missing": "\u7f3a\u5c11", "select.version": "\u9009\u62e9\u5f53\u524d\u7248\u672c"}, "version": null}</script>
<script src="../../../assets/javascripts/bundle.79ae519e.min.js"></script>
</body>
</html>