Files
FastDeploy/features/speculative_decoding/index.html

2153 lines
46 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../load_balance/">
<link rel="next" href="../structured_outputs/">
<link rel="icon" href="../../assets/images/favicon.ico">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.20">
<title>Speculative Decoding - FastDeploy: Large Language Model Deployment</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.e53b48f4.min.css">
<link rel="stylesheet" href="../../assets/stylesheets/palette.06af60db.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>
// Base URL two levels above this page; its pathname prefixes every storage key below.
__md_scope = new URL("../..", location);

// Fold the characters of `text` into an integer using a shift-and-subtract step per character.
__md_hash = text => {
  let hash = 0;
  for (const ch of text) {
    hash = (hash << 5) - hash + ch.charCodeAt(0);
  }
  return hash;
};

// Read and JSON-decode the value stored under "<scope pathname>.<key>".
// `storage` defaults to localStorage and `scope` to __md_scope.
__md_get = (key, storage = localStorage, scope = __md_scope) => {
  const raw = storage.getItem(scope.pathname + "." + key);
  return JSON.parse(raw);
};

// JSON-encode `value` and store it under "<scope pathname>.<key>".
// Any error raised while writing is deliberately swallowed.
__md_set = (key, value, storage = localStorage, scope = __md_scope) => {
  try {
    storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
  } catch (ignored) {
    // Intentionally empty: a failed write is ignored.
  }
};
</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#speculative-decoding" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="FastDeploy: Large Language Model Deployment" class="md-header__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">
<img src="../../assets/images/logo.jpg" alt="logo">
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
FastDeploy: Large Language Model Deployment
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Speculative Decoding
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to system preference" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>
// Re-apply the colour palette saved under the "__palette" key, if there is one.
var palette = __md_get("__palette");
if (palette && palette.color) {
  // A saved media value of "(prefers-color-scheme)" is resolved to a concrete
  // palette: take the settings from whichever palette input matches the
  // current light/dark media query.
  if (palette.color.media === "(prefers-color-scheme)") {
    var media = matchMedia("(prefers-color-scheme: light)"), input;
    if (media.matches) {
      input = document.querySelector("[data-md-color-media='(prefers-color-scheme: light)']");
    } else {
      input = document.querySelector("[data-md-color-media='(prefers-color-scheme: dark)']");
    }
    palette.color.media = input.getAttribute("data-md-color-media");
    palette.color.scheme = input.getAttribute("data-md-color-scheme");
    palette.color.primary = input.getAttribute("data-md-color-primary");
    palette.color.accent = input.getAttribute("data-md-color-accent");
  }
  // Copy every palette entry onto <body> as a data-md-color-* attribute.
  for (var [key, value] of Object.entries(palette.color)) {
    document.body.setAttribute("data-md-color-" + key, value);
  }
}
</script>
<div class="md-header__option">
<div class="md-select">
<button class="md-header__button md-icon" aria-label="Select language">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m12.87 15.07-2.54-2.51.03-.03A17.5 17.5 0 0 0 14.07 6H17V4h-7V2H8v2H1v2h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2zm-2.62 7 1.62-4.33L19.12 17z"/></svg>
</button>
<div class="md-select__inner">
<ul class="md-select__list">
<li class="md-select__item">
<a href="./" hreflang="en" class="md-select__link">
English
</a>
</li>
<li class="md-select__item">
<a href="../../zh/features/speculative_decoding/" hreflang="zh" class="md-select__link">
简体中文
</a>
</li>
</ul>
</div>
</div>
</div>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="FastDeploy: Large Language Model Deployment" class="md-nav__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">
<img src="../../assets/images/logo.jpg" alt="logo">
</a>
FastDeploy: Large Language Model Deployment
</label>
<div class="md-nav__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
FastDeploy
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Quick Start
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Quick Start
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2_1" >
<label class="md-nav__link" for="__nav_2_1" id="__nav_2_1_label" tabindex="0">
<span class="md-ellipsis">
Installation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_2_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2_1">
<span class="md-nav__icon md-icon"></span>
Installation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../get_started/installation/nvidia_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Nvidia GPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/kunlunxin_xpu/" class="md-nav__link">
<span class="md-ellipsis">
KunlunXin XPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/hygon_dcu/" class="md-nav__link">
<span class="md-ellipsis">
HYGON DCU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/Enflame_gcu/" class="md-nav__link">
<span class="md-ellipsis">
Enflame S60
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/iluvatar_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Iluvatar CoreX
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/metax_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Metax C550
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment For ERNIE-4.5-0.3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start_vl/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment for ERNIE-4.5-VL-28B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/ernie-4.5/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/ernie-4.5-vl/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start_qwen/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment For QWEN
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Online Serving
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Online Serving
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../online_serving/" class="md-nav__link">
<span class="md-ellipsis">
OpenAI-Compatible API Server
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/metrics/" class="md-nav__link">
<span class="md-ellipsis">
Monitor Metrics
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/scheduler/" class="md-nav__link">
<span class="md-ellipsis">
Scheduler
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/graceful_shutdown_service/" class="md-nav__link">
<span class="md-ellipsis">
Graceful Shutdown
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../offline_inference/" class="md-nav__link">
<span class="md-ellipsis">
Offline Inference
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
Best Practices
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Best Practices
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-0.3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-0.3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-21B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-300B-A47B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-21B-A3B-Thinking/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B-Thinking
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-VL-28B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-28B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/ERNIE-4.5-VL-424B-A47B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../best_practices/FAQ/" class="md-nav__link">
<span class="md-ellipsis">
FAQ
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-ellipsis">
Quantization
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
Quantization
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../quantization/" class="md-nav__link">
<span class="md-ellipsis">
Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../quantization/online_quantization/" class="md-nav__link">
<span class="md-ellipsis">
Online Quantization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../quantization/wint2/" class="md-nav__link">
<span class="md-ellipsis">
WINT2 Quantization
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" checked>
<label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">
<span class="md-ellipsis">
Features
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_7">
<span class="md-nav__icon md-icon"></span>
Features
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../prefix_caching/" class="md-nav__link">
<span class="md-ellipsis">
Prefix Caching
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../disaggregated/" class="md-nav__link">
<span class="md-ellipsis">
Disaggregation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunked_prefill/" class="md-nav__link">
<span class="md-ellipsis">
Chunked Prefill
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../load_balance/" class="md-nav__link">
<span class="md-ellipsis">
Load Balance
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Speculative Decoding
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Speculative Decoding
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#supported-speculative-decoding-methods" class="md-nav__link">
<span class="md-ellipsis">
✅ Supported Speculative Decoding Methods
</span>
</a>
<nav class="md-nav" aria-label="✅ Supported Speculative Decoding Methods">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#supported" class="md-nav__link">
<span class="md-ellipsis">
Supported
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#coming-soon" class="md-nav__link">
<span class="md-ellipsis">
Coming Soon
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#efficient-speculative-decoding-architecture" class="md-nav__link">
<span class="md-ellipsis">
⚙️ Efficient Speculative Decoding Architecture
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#configuration-parameters" class="md-nav__link">
<span class="md-ellipsis">
🔧 Configuration Parameters
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#using-multi-token-prediction-mtp" class="md-nav__link">
<span class="md-ellipsis">
🚀 Using Multi-Token Prediction (MTP)
</span>
</a>
<nav class="md-nav" aria-label="🚀 Using Multi-Token Prediction (MTP)">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#tp-sharding-mode" class="md-nav__link">
<span class="md-ellipsis">
TP Sharding Mode
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#pd-separated-deployment-1p1d-mode" class="md-nav__link">
<span class="md-ellipsis">
PD-Separated Deployment (1P1D Mode)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#decoding-with-hybrid-mtp-and-ngram-methods" class="md-nav__link">
<span class="md-ellipsis">
Decoding with Hybrid MTP and Ngram Methods
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#using-ngram-based-decoding" class="md-nav__link">
<span class="md-ellipsis">
🧠 Using Ngram-Based Decoding
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../structured_outputs/" class="md-nav__link">
<span class="md-ellipsis">
Structured Outputs
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../reasoning_output/" class="md-nav__link">
<span class="md-ellipsis">
Reasoning Output
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../early_stop/" class="md-nav__link">
<span class="md-ellipsis">
Early Stop
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../plugins/" class="md-nav__link">
<span class="md-ellipsis">
Plugins
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../sampling/" class="md-nav__link">
<span class="md-ellipsis">
Sampling
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../multi-node_deployment/" class="md-nav__link">
<span class="md-ellipsis">
MultiNode Deployment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../graph_optimization/" class="md-nav__link">
<span class="md-ellipsis">
Graph Optimization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../data_parallel_service/" class="md-nav__link">
<span class="md-ellipsis">
Data Parallelism
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../plas_attention/" class="md-nav__link">
<span class="md-ellipsis">
PLAS
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../supported_models/" class="md-nav__link">
<span class="md-ellipsis">
Supported Models
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../benchmark/" class="md-nav__link">
<span class="md-ellipsis">
Benchmark
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
<label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
<span class="md-ellipsis">
Usage
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_10">
<span class="md-nav__icon md-icon"></span>
Usage
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../usage/log/" class="md-nav__link">
<span class="md-ellipsis">
Log Description
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../usage/code_overview/" class="md-nav__link">
<span class="md-ellipsis">
Code Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../usage/environment_variables/" class="md-nav__link">
<span class="md-ellipsis">
Environment Variables
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#supported-speculative-decoding-methods" class="md-nav__link">
<span class="md-ellipsis">
✅ Supported Speculative Decoding Methods
</span>
</a>
<nav class="md-nav" aria-label="✅ Supported Speculative Decoding Methods">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#supported" class="md-nav__link">
<span class="md-ellipsis">
Supported
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#coming-soon" class="md-nav__link">
<span class="md-ellipsis">
Coming Soon
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#efficient-speculative-decoding-architecture" class="md-nav__link">
<span class="md-ellipsis">
⚙️ Efficient Speculative Decoding Architecture
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#configuration-parameters" class="md-nav__link">
<span class="md-ellipsis">
🔧 Configuration Parameters
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#using-multi-token-prediction-mtp" class="md-nav__link">
<span class="md-ellipsis">
🚀 Using Multi-Token Prediction (MTP)
</span>
</a>
<nav class="md-nav" aria-label="🚀 Using Multi-Token Prediction (MTP)">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#tp-sharding-mode" class="md-nav__link">
<span class="md-ellipsis">
TP Sharding Mode
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#pd-separated-deployment-1p1d-mode" class="md-nav__link">
<span class="md-ellipsis">
PD-Separated Deployment (1P1D Mode)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#decoding-with-hybrid-mtp-and-ngram-methods" class="md-nav__link">
<span class="md-ellipsis">
Decoding with Hybrid MTP and Ngram Methods
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#using-ngram-based-decoding" class="md-nav__link">
<span class="md-ellipsis">
🧠 Using Ngram-Based Decoding
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="speculative-decoding">🔮 Speculative Decoding</h1>
<p>This project implements an efficient <strong>Speculative Decoding</strong> inference framework based on PaddlePaddle. It supports <strong>Multi-Token Prediction (MTP)</strong> to accelerate large language model (LLM) generation, significantly reducing latency and improving throughput.</p>
<hr />
<h2 id="supported-speculative-decoding-methods">✅ Supported Speculative Decoding Methods</h2>
<h3 id="supported">Supported</h3>
<ul>
<li>
<p><strong>Ngram</strong></p>
</li>
<li>
<p><strong>MTP (Multi-Token Prediction)</strong></p>
</li>
<li>✅ Supported: TP Sharding</li>
<li>✅ Supported: Shared Prefix</li>
<li>✅ Supported: TP Sharding + PD Separation</li>
<li>⏳ Coming Soon: EP + DP + PD Separation</li>
<li>⏳ Coming Soon: Support Chunk-prefill</li>
<li>
<p>⏳ Coming Soon: Multi-layer MTP Layer</p>
</li>
<li>
<p><strong>Decoding with Hybrid MTP and Ngram Methods (Hybrid-MTP-with-Ngram)</strong></p>
</li>
<li>
<p>Overview: A hybrid method combining MTP and Ngram. First, MTP generates N draft tokens, then Ngram matching is used to supplement additional draft tokens.</p>
</li>
<li>
<p>Use Cases: Suitable when higher draft token coverage is required, leveraging both MTP's generation capability and the efficiency of Ngram matching.</p>
</li>
</ul>
<hr />
<h3 id="coming-soon">Coming Soon</h3>
<ul>
<li>Draft Model</li>
<li>Eagle</li>
<li>Hydra</li>
<li>Medusa</li>
<li>...</li>
</ul>
<hr />
<h2 id="efficient-speculative-decoding-architecture">⚙️ Efficient Speculative Decoding Architecture</h2>
<ul>
<li>
<p><strong>Attention Mechanism</strong>: We employ <a href="https://flashinfer.ai/2024/02/02/cascade-inference.html">Cascade Append Attention</a>, which allows unified processing of queries with varying token lengths, enabling efficient verification. All tokens can be verified in a single forward pass. We deeply customized the underlying kernels to fully leverage Tensor Cores and maintain high throughput even under heavy concurrency.</p>
</li>
<li>
<p><strong>Virtual Padding Mechanism</strong>: A virtual padding strategy is used to locate output token batch IDs, eliminating the overhead of data copying and slicing operations.</p>
</li>
<li>
<p><strong>Parallel Sampling and Verification</strong>: We developed multiple fused CUDA kernels for concurrent sampling and verification. These kernels allow parallel processing for each sample in a batch, avoiding explicit loop execution on the host side.</p>
</li>
<li>
<p><strong>Efficient Draft Model/MTP Framework</strong>: Multiple fused CUDA kernels are used to handle pre- and post-processing within the model class, replacing traditional loop-based and slicing-based methods with a more performant and maintainable structure.</p>
</li>
</ul>
<hr />
<h2 id="configuration-parameters">🔧 Configuration Parameters</h2>
<ul>
<li><code>method</code>: The speculative decoding strategy, currently supports <code>["mtp", "ngram"]</code>.</li>
<li><code>num_speculative_tokens</code>: Number of speculative tokens to generate; max is 5, currently MTP supports only 1.</li>
<li><code>model</code>: Path to the MTP draft model when using the <code>"mtp"</code> method.</li>
<li><code>quantization</code>: Quantization method of the MTP model (e.g., WINT4).</li>
<li>Max <code>batch_size</code>: 256</li>
</ul>
<hr />
<h2 id="using-multi-token-prediction-mtp">🚀 Using Multi-Token Prediction (MTP)</h2>
<p>For detailed theory, refer to:
📄 <a href="https://arxiv.org/pdf/2412.19437">DeepSeek-V3 Paper</a></p>
<h3 id="tp-sharding-mode">TP Sharding Mode</h3>
<p>Launch service on 4 × H100 GPUs using WINT4 quantization (Dense: WINT8, MoE: WINT4):</p>
<blockquote>
<p>Config file: <code>benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml</code></p>
</blockquote>
<pre><code class="language-bash">python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--tensor-parallel-size 4 \
--config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \
--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;${path_to_mtp_model}&quot;}'
</code></pre>
<h3 id="pd-separated-deployment-1p1d-mode">PD-Separated Deployment (1P1D Mode)</h3>
<p>Deploy 1P1D on H100 with both Prefill (P) and Decode (D) nodes using TP4 + WINT4 quantization.
This deployment only requires changing the config and adding speculative_config.
For details, refer to the <a href="../disaggregated/">PD Separation</a> documentation.</p>
<ul>
<li>P Node(Prefill)</li>
</ul>
<blockquote>
<p>Config file: <code>benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-prefill.yaml</code></p>
</blockquote>
<pre><code>export FD_LOG_DIR=&quot;log_prefill&quot;
rm -rf ${FD_LOG_DIR}
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--port 8180 \
--metrics-port 8181 \
--engine-worker-queue-port 8182 \
--cache-queue-port 8183 \
--workers 2 \
--tensor-parallel-size 4 \
--quantization wint4 \
--splitwise-role &quot;prefill&quot; \
--scheduler-name &quot;splitwise&quot; \
--scheduler-host &quot;127.0.0.1&quot; \
--scheduler-port 6379 \
--scheduler-ttl 9000 \
--scheduler-topic mtp \
--config ${path_to_FastDeploy}/benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-prefill.yaml \
--scheduler-password &quot;scheduler_mtp&quot; \
--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;${path_to_mtp_model}&quot;}' &amp;
</code></pre>
<ul>
<li>D Node(Decode)</li>
</ul>
<blockquote>
<p>Config file: <code>benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-decode.yaml</code></p>
</blockquote>
<pre><code>export FD_LOG_DIR=&quot;log_decode&quot;
rm -rf ${FD_LOG_DIR}
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--port 8190 \
--metrics-port 8191 \
--engine-worker-queue-port 8192 \
--cache-queue-port 8193 \
--workers 2 \
--tensor-parallel-size 4 \
--quantization wint4 \
--splitwise-role &quot;decode&quot; \
--scheduler-name &quot;splitwise&quot; \
--scheduler-host &quot;127.0.0.1&quot; \
--scheduler-port 6379 \
--scheduler-ttl 9000 \
--scheduler-topic mtp \
--config ${path_to_FastDeploy}/benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-decode.yaml \
--scheduler-password &quot;scheduler_mtp&quot; \
--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;${path_to_mtp_model}&quot;}' &amp;
</code></pre>
<h2 id="decoding-with-hybrid-mtp-and-ngram-methods">Decoding with Hybrid MTP and Ngram Methods</h2>
<p>When starting the service, you only need to modify the --speculative-config option.
For example, use MTP to generate two draft tokens, and then append three additional draft tokens from Ngram matching:</p>
<pre><code>--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_model_steps&quot;: 2, &quot;mtp_strategy&quot;: &quot;with_ngram&quot;, &quot;num_speculative_tokens&quot;: 5, &quot;model&quot;: &quot;'$model_path'/mtp&quot;}'
</code></pre>
<h2 id="using-ngram-based-decoding">🧠 Using Ngram-Based Decoding</h2>
<p>This method uses an n-gram sliding window to match the prompt and generated tokens to predict draft tokens. It is particularly effective in scenarios with high input-output overlap (e.g., code completion, document search).</p>
<p>Run on 4 × H100 GPUs with WINT4 quantization:</p>
<blockquote>
<p>Config file: <code>benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml</code></p>
</blockquote>
<pre><code>python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
--tensor-parallel-size 4 \
--config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \
--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;${mtp_model_path}&quot;}'
</code></pre>
</article>
</div>
<script>
// Look up the element whose id equals the URL fragment (without the leading "#").
// If it exists and has a non-empty `name`, set its `checked` state to whether
// that name starts with "__tabbed_".
var target = document.getElementById(location.hash.slice(1));
if (target && target.name) {
  target.checked = target.name.startsWith("__tabbed_");
}
</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
<div class="md-copyright__highlight">
Copyright &copy; 2025 Maintained by FastDeploy
</div>
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.f55a23d4.min.js"></script>
</body>
</html>