Files
FastDeploy/best_practices/ERNIE-4.5-300B-A47B-Paddle/index.html

2248 lines
50 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../ERNIE-4.5-21B-A3B-Paddle/">
<link rel="next" href="../ERNIE-4.5-21B-A3B-Thinking/">
<link rel="icon" href="../../assets/images/favicon.ico">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.20">
<title>ERNIE-4.5-300B-A47B - FastDeploy: Large Language Model Deployment</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.e53b48f4.min.css">
<link rel="stylesheet" href="../../assets/stylesheets/palette.06af60db.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>
  // Bootstrap helpers used by the inline scripts further down the page.
  // Each name is assigned without a declaration, exactly as in the minified
  // original, so it stays an implicit global.

  // Base URL two directory levels above the current page.
  __md_scope = new URL("../..", location);

  // Fold a string into an integer: for each character,
  // acc = (acc << 5) - acc + <first UTF-16 code unit of the character>.
  __md_hash = (text) =>
    [...text].reduce((acc, ch) => (acc << 5) - acc + ch.charCodeAt(0), 0);

  // Read and JSON-decode the value stored under "<scope.pathname>.<key>".
  __md_get = (key, storage = localStorage, scope = __md_scope) => {
    return JSON.parse(storage.getItem(scope.pathname + "." + key));
  };

  // JSON-encode and store a value under "<scope.pathname>.<key>".
  // Any exception raised while storing is deliberately swallowed.
  __md_set = (key, value, storage = localStorage, scope = __md_scope) => {
    try {
      storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
    } catch (err) {}
  };
</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#ernie-45-300b-a47b" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="FastDeploy: Large Language Model Deployment" class="md-header__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">
<img src="../../assets/images/logo.jpg" alt="logo">
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
FastDeploy: Large Language Model Deployment
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to system preference" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>
  // Apply the palette stored under "__palette" (read through __md_get) to
  // <body> as data-md-color-* attributes.
  var palette = __md_get("__palette");
  if (palette && palette.color) {
    // A stored media value of "(prefers-color-scheme)" is resolved against the
    // current light/dark match; the matching palette <input> then supplies the
    // concrete media, scheme, primary and accent values.
    if ("(prefers-color-scheme)" === palette.color.media) {
      var media = matchMedia("(prefers-color-scheme: light)"),
        input = document.querySelector(
          media.matches
            ? "[data-md-color-media='(prefers-color-scheme: light)']"
            : "[data-md-color-media='(prefers-color-scheme: dark)']"
        );
      palette.color.media = input.getAttribute("data-md-color-media");
      palette.color.scheme = input.getAttribute("data-md-color-scheme");
      palette.color.primary = input.getAttribute("data-md-color-primary");
      palette.color.accent = input.getAttribute("data-md-color-accent");
    }
    // Copy every palette entry onto <body> as data-md-color-<key>.
    for (var [key, value] of Object.entries(palette.color)) {
      document.body.setAttribute("data-md-color-" + key, value);
    }
  }
</script>
<div class="md-header__option">
<div class="md-select">
<!-- Language switcher. The button only opens the list below, so it is an explicit type="button"; its icon is decorative because the button already carries an aria-label. -->
<button type="button" class="md-header__button md-icon" aria-label="Select language">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true"><path d="m12.87 15.07-2.54-2.51.03-.03A17.5 17.5 0 0 0 14.07 6H17V4h-7V2H8v2H1v2h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2zm-2.62 7 1.62-4.33L19.12 17z"/></svg>
</button>
<div class="md-select__inner">
<ul class="md-select__list">
<li class="md-select__item">
<a href="./" hreflang="en" class="md-select__link">
English
</a>
</li>
<li class="md-select__item">
<!-- lang marks the link text itself as Chinese so screen readers pronounce it correctly on this English page. -->
<a href="../../zh/best_practices/ERNIE-4.5-300B-A47B-Paddle/" hreflang="zh" lang="zh" class="md-select__link">
简体中文
</a>
</li>
</ul>
</div>
</div>
</div>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="FastDeploy: Large Language Model Deployment" class="md-nav__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">
<img src="../../assets/images/logo.jpg" alt="logo">
</a>
FastDeploy: Large Language Model Deployment
</label>
<div class="md-nav__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
FastDeploy
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Quick Start
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Quick Start
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2_1" >
<label class="md-nav__link" for="__nav_2_1" id="__nav_2_1_label" tabindex="0">
<span class="md-ellipsis">
Installation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_2_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2_1">
<span class="md-nav__icon md-icon"></span>
Installation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../get_started/installation/nvidia_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Nvidia GPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/kunlunxin_xpu/" class="md-nav__link">
<span class="md-ellipsis">
KunlunXin XPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/hygon_dcu/" class="md-nav__link">
<span class="md-ellipsis">
HYGON DCU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/Enflame_gcu/" class="md-nav__link">
<span class="md-ellipsis">
Enflame S60
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/iluvatar_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Iluvatar CoreX
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/installation/metax_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Metax C550
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment For ERNIE-4.5-0.3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start_vl/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment for ERNIE-4.5-VL-28B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/ernie-4.5/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/ernie-4.5-vl/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../get_started/quick_start_qwen/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment For QWEN
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Online Serving
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Online Serving
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../online_serving/" class="md-nav__link">
<span class="md-ellipsis">
OpenAI-Compatible API Server
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/metrics/" class="md-nav__link">
<span class="md-ellipsis">
Monitor Metrics
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/scheduler/" class="md-nav__link">
<span class="md-ellipsis">
Scheduler
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../online_serving/graceful_shutdown_service/" class="md-nav__link">
<span class="md-ellipsis">
Graceful Shutdown
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../offline_inference/" class="md-nav__link">
<span class="md-ellipsis">
Offline Inference
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" checked>
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
Best Practices
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Best Practices
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../ERNIE-4.5-0.3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-0.3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../ERNIE-4.5-21B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#environmental-preparation" class="md-nav__link">
<span class="md-ellipsis">
Environmental Preparation
</span>
</a>
<nav class="md-nav" aria-label="Environmental Preparation">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#11-hardware-requirements" class="md-nav__link">
<span class="md-ellipsis">
1.1 Hardware requirements
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#12-install-fastdeploy" class="md-nav__link">
<span class="md-ellipsis">
1.2 Install fastdeploy
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#2how-to-use" class="md-nav__link">
<span class="md-ellipsis">
2.How to Use
</span>
</a>
<nav class="md-nav" aria-label="2.How to Use">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#21-basic-launching-the-service" class="md-nav__link">
<span class="md-ellipsis">
2.1 Basic: Launching the Service
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#22-advanced-how-to-get-better-performance" class="md-nav__link">
<span class="md-ellipsis">
2.2 Advanced: How to get better performance
</span>
</a>
<nav class="md-nav" aria-label="2.2 Advanced: How to get better performance">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#221-correctly-set-parameters-that-match-the-application-scenario" class="md-nav__link">
<span class="md-ellipsis">
2.2.1 Correctly set parameters that match the application scenario
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#222-prefix-caching" class="md-nav__link">
<span class="md-ellipsis">
2.2.2 Prefix Caching
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#223-chunked-prefill" class="md-nav__link">
<span class="md-ellipsis">
2.2.3 Chunked Prefill
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#224-mtp-multi-token-prediction" class="md-nav__link">
<span class="md-ellipsis">
2.2.4 MTP (Multi-Token Prediction)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#225-w4a8c8-quantization" class="md-nav__link">
<span class="md-ellipsis">
2.2.5 W4A8C8 Quantization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#226-rejection-sampling" class="md-nav__link">
<span class="md-ellipsis">
2.2.6 Rejection Sampling
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#227-disaggregated-deployment" class="md-nav__link">
<span class="md-ellipsis">
2.2.7 Disaggregated Deployment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#228-cudagraph" class="md-nav__link">
<span class="md-ellipsis">
2.2.8 CUDAGraph
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#faq" class="md-nav__link">
<span class="md-ellipsis">
FAQ
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../ERNIE-4.5-21B-A3B-Thinking/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B-Thinking
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../ERNIE-4.5-VL-28B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-28B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../ERNIE-4.5-VL-424B-A47B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../FAQ/" class="md-nav__link">
<span class="md-ellipsis">
FAQ
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-ellipsis">
Quantization
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
Quantization
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../quantization/" class="md-nav__link">
<span class="md-ellipsis">
Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../quantization/online_quantization/" class="md-nav__link">
<span class="md-ellipsis">
Online Quantization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../quantization/wint2/" class="md-nav__link">
<span class="md-ellipsis">
WINT2 Quantization
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" >
<label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">
<span class="md-ellipsis">
Features
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_7">
<span class="md-nav__icon md-icon"></span>
Features
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../features/prefix_caching/" class="md-nav__link">
<span class="md-ellipsis">
Prefix Caching
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/disaggregated/" class="md-nav__link">
<span class="md-ellipsis">
Disaggregation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/chunked_prefill/" class="md-nav__link">
<span class="md-ellipsis">
Chunked Prefill
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/load_balance/" class="md-nav__link">
<span class="md-ellipsis">
Load Balance
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/speculative_decoding/" class="md-nav__link">
<span class="md-ellipsis">
Speculative Decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/structured_outputs/" class="md-nav__link">
<span class="md-ellipsis">
Structured Outputs
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/reasoning_output/" class="md-nav__link">
<span class="md-ellipsis">
Reasoning Output
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/early_stop/" class="md-nav__link">
<span class="md-ellipsis">
Early Stop
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/plugins/" class="md-nav__link">
<span class="md-ellipsis">
Plugins
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/sampling/" class="md-nav__link">
<span class="md-ellipsis">
Sampling
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/multi-node_deployment/" class="md-nav__link">
<span class="md-ellipsis">
MultiNode Deployment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/graph_optimization/" class="md-nav__link">
<span class="md-ellipsis">
Graph Optimization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/data_parallel_service/" class="md-nav__link">
<span class="md-ellipsis">
Data Parallelism
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../features/plas_attention/" class="md-nav__link">
<span class="md-ellipsis">
PLAS
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../../supported_models/" class="md-nav__link">
<span class="md-ellipsis">
Supported Models
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../benchmark/" class="md-nav__link">
<span class="md-ellipsis">
Benchmark
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
<label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
<span class="md-ellipsis">
Usage
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_10">
<span class="md-nav__icon md-icon"></span>
Usage
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../usage/log/" class="md-nav__link">
<span class="md-ellipsis">
Log Description
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../usage/code_overview/" class="md-nav__link">
<span class="md-ellipsis">
Code Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../usage/environment_variables/" class="md-nav__link">
<span class="md-ellipsis">
Environment Variables
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#environmental-preparation" class="md-nav__link">
<span class="md-ellipsis">
Environmental Preparation
</span>
</a>
<nav class="md-nav" aria-label="Environmental Preparation">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#11-hardware-requirements" class="md-nav__link">
<span class="md-ellipsis">
1.1 Hardware requirements
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#12-install-fastdeploy" class="md-nav__link">
<span class="md-ellipsis">
1.2 Install fastdeploy
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#2how-to-use" class="md-nav__link">
<span class="md-ellipsis">
2.How to Use
</span>
</a>
<nav class="md-nav" aria-label="2.How to Use">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#21-basic-launching-the-service" class="md-nav__link">
<span class="md-ellipsis">
2.1 Basic: Launching the Service
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#22-advanced-how-to-get-better-performance" class="md-nav__link">
<span class="md-ellipsis">
2.2 Advanced: How to get better performance
</span>
</a>
<nav class="md-nav" aria-label="2.2 Advanced: How to get better performance">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#221-correctly-set-parameters-that-match-the-application-scenario" class="md-nav__link">
<span class="md-ellipsis">
2.2.1 Correctly set parameters that match the application scenario
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#222-prefix-caching" class="md-nav__link">
<span class="md-ellipsis">
2.2.2 Prefix Caching
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#223-chunked-prefill" class="md-nav__link">
<span class="md-ellipsis">
2.2.3 Chunked Prefill
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#224-mtp-multi-token-prediction" class="md-nav__link">
<span class="md-ellipsis">
2.2.4 MTP (Multi-Token Prediction)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#225-w4a8c8-quantization" class="md-nav__link">
<span class="md-ellipsis">
2.2.5 W4A8C8 Quantization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#226-rejection-sampling" class="md-nav__link">
<span class="md-ellipsis">
2.2.6 Rejection Sampling
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#227-disaggregated-deployment" class="md-nav__link">
<span class="md-ellipsis">
2.2.7 Disaggregated Deployment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#228-cudagraph" class="md-nav__link">
<span class="md-ellipsis">
2.2.8 CUDAGraph
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#faq" class="md-nav__link">
<span class="md-ellipsis">
FAQ
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="ernie-45-300b-a47b">ERNIE-4.5-300B-A47B</h1>
<h2 id="environmental-preparation">Environmental Preparation</h2>
<h3 id="11-hardware-requirements">1.1 Hardware requirements</h3>
<p>The minimum number of GPUs required to deploy <code>ERNIE-4.5-300B-A47B</code> on the following hardware for each quantization is as follows:</p>
<!-- Minimum GPU count per quantization scheme. Columns are quantization schemes, rows are GPU models. -->
<!-- NOTE(review): "/" presumably means the scheme is not supported on that GPU; confirm and consider spelling it out. -->
<table>
<thead>
<tr>
<th scope="col">GPU</th>
<th scope="col">WINT8</th>
<th scope="col">WINT4</th>
<th scope="col">FP8</th>
<th scope="col">WINT2</th>
<th scope="col">W4A8</th>
</tr>
</thead>
<tbody>
<tr>
<th scope="row">H800 80GB</th>
<td>8</td>
<td>4</td>
<td>8</td>
<td>2</td>
<td>4</td>
</tr>
<tr>
<th scope="row">A800 80GB</th>
<td>8</td>
<td>4</td>
<td>/</td>
<td>2</td>
<td>4</td>
</tr>
</tbody>
</table>
<p><strong>Tips:</strong></p>
<ol>
<li>To change the number of GPUs used for deployment, specify <code>--tensor-parallel-size 4</code> in the startup command.</li>
<li>Because quantization scales are provided only for the 4-GPU configuration, the W4A8 model must be deployed on 4 GPUs.</li>
<li>For hardware not listed in the table, you can estimate whether it can be deployed based on the GPU memory.</li>
</ol>
<h3 id="12-install-fastdeploy">1.2 Install fastdeploy</h3>
<ul>
<li>
<p>Installation: For details, please refer to <a href="../../get_started/installation/">FastDeploy Installation</a>.</p>
</li>
<li>
<p>Model Download: For details, please refer to <a href="../../supported_models/">Supported Models</a>.</p>
</li>
</ul>
<h2 id="2how-to-use">2.How to Use</h2>
<h3 id="21-basic-launching-the-service">2.1 Basic: Launching the Service</h3>
<p>Start the service with the following command:</p>
<pre><code class="language-bash">python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--tensor-parallel-size 8 \
--quantization wint4 \
--max-model-len 32768 \
--max-num-seqs 128 \
--load-choices &quot;default_v1&quot;
</code></pre>
<ul>
<li><code>--quantization</code>: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of <code>wint8</code> / <code>wint4</code> / <code>block_wise_fp8</code>(Hopper is needed).</li>
<li><code>--max-model-len</code>: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.</li>
<li><code>--load-choices</code>: indicates the version of the loader. "default_v1" means enabling the v1 version of the loader, which has faster loading speed and less memory usage.</li>
</ul>
<p>For more parameter meanings and default settings, see <a href="../../parameters/">FastDeploy Parameter Documentation</a></p>
<h3 id="22-advanced-how-to-get-better-performance">2.2 Advanced: How to get better performance</h3>
<h4 id="221-correctly-set-parameters-that-match-the-application-scenario">2.2.1 Correctly set parameters that match the application scenario</h4>
<p>Evaluate the average input length, average output length, and maximum context length of your workload.</p>
<ul>
<li>Set <code>--max-model-len</code> according to the maximum context length. For example, if the average input length is 1000 and the output length is 30000, it is recommended to set it to 32768.</li>
</ul>
<h4 id="222-prefix-caching">2.2.2 Prefix Caching</h4>
<p><strong>Idea:</strong> The core idea of Prefix Caching is to avoid repeated computation by caching the intermediate results (KV Cache) of the input sequence, thereby speeding up responses to multiple requests that share the same prefix. For details, refer to <a href="../../features/prefix_caching/">Prefix Caching</a>.</p>
<p><strong>How to enable:</strong>
Since version 2.2 (including the develop branch), Prefix Caching has been enabled by default.</p>
<p>For versions 2.1 and earlier, you need to enable it manually by adding the following lines to the startup parameters, where <code>--enable-prefix-caching</code> enables prefix caching and <code>--swap-space</code> enables a CPU cache in addition to the GPU cache. The size is specified in GB and should be adjusted according to the actual situation of the machine. The recommended value is <code>(total machine memory - model size) * 20%</code>. If the service fails to start because other programs are occupying memory, try reducing the <code>--swap-space</code> value.</p>
<pre><code>--enable-prefix-caching
--swap-space 50
</code></pre>
<h4 id="223-chunked-prefill">2.2.3 Chunked Prefill</h4>
<p><strong>Idea:</strong> This strategy is adopted to split the prefill stage request into small-scale sub-chunks, and execute them in batches mixed with the decode request. This can better balance the computation-intensive (Prefill) and memory-intensive (Decode) operations, optimize GPU resource utilization, reduce the computational workload and memory usage of a single Prefill, thereby reducing the peak memory usage and avoiding the problem of insufficient memory. For details, please refer to <a href="../../features/chunked_prefill/">Chunked Prefill</a></p>
<p><strong>How to enable:</strong>
Since version 2.2 (including the develop branch), Chunked Prefill has been enabled by default.</p>
<p>For versions 2.1 and earlier, you need to enable it manually by adding the following line to the startup parameters:</p>
<pre><code>--enable-chunked-prefill
</code></pre>
<h4 id="224-mtp-multi-token-prediction">2.2.4 MTP (Multi-Token Prediction)</h4>
<p><strong>Idea:</strong>
By predicting multiple tokens at once, the number of decoding steps is reduced to significantly speed up the generation speed, while maintaining the generation quality through certain strategies. For details, please refer to <a href="../../features/speculative_decoding/">Speculative Decoding</a></p>
<p><strong>How to enable:</strong>
Add the following lines to the startup parameters</p>
<pre><code>--speculative-config '{&quot;method&quot;: &quot;mtp&quot;, &quot;num_speculative_tokens&quot;: 1, &quot;model&quot;: &quot;${path_to_mtp_model}&quot;}'
</code></pre>
<p>Notes:</p>
<ol>
<li>MTP currently does not support simultaneous use with Prefix Caching, Chunked Prefill, and CUDAGraph.
<ul>
<li>Use <code>export FD_DISABLE_CHUNKED_PREFILL=1</code> to disable Chunked Prefill.</li>
<li>When setting <code>speculative-config</code>, Prefix Caching will be automatically disabled.</li>
</ul>
</li>
<li>MTP currently does not support service management global blocks. When setting <code>speculative-config</code>, service management global blocks will be automatically disabled.</li>
<li>MTP currently does not support rejection sampling, i.e. do not run with <code>export FD_SAMPLING_CLASS=rejection</code>.</li>
</ol>
<h4 id="225-w4a8c8-quantization">2.2.5 W4A8C8 Quantization</h4>
<p><strong>Idea:</strong>
Quantization can achieve model compression, reduce GPU memory usage and speed up inference. To achieve better inference results, per-channel symmetric 4-bit quantization is used for MoE weights, static per-tensor symmetric 8-bit quantization is used for activations, and static per-channel symmetric 8-bit quantization is used for the KV Cache.</p>
<p><strong>How to enable:</strong>
Just specify the corresponding model name in the startup command, <code>baidu/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle</code></p>
<pre><code>--model baidu/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle
</code></pre>
<p>Note:</p>
<ul>
<li>W4A8C8 quantized models cannot be loaded via <code>--load-choices "default_v1"</code>.</li>
</ul>
<h4 id="226-rejection-sampling">2.2.6 Rejection Sampling</h4>
<p><strong>Idea:</strong>
Rejection sampling generates samples from a proposal distribution that is easy to sample from, avoiding explicit sorting and thereby increasing the sampling speed. It brings a significant improvement for small-sized models.</p>
<p><strong>How to enable:</strong>
Add the following environment variables before starting</p>
<pre><code>export FD_SAMPLING_CLASS=rejection
</code></pre>
<h4 id="227-disaggregated-deployment">2.2.7 Disaggregated Deployment</h4>
<p><strong>Idea:</strong> Deploying Prefill and Decode separately in certain scenarios can improve hardware utilization, effectively increase throughput, and reduce overall sentence latency.</p>
<p><strong>How to enable:</strong> Take the deployment of a single machine with 8 GPUs and 1P1D (4 GPUs each) as an example. Compared with the default hybrid deployment method, <code>--splitwise-role</code> is required to specify the role of the node. The logs and GPUs of the two nodes are isolated through the environment variables <code>FD_LOG_DIR</code> and <code>CUDA_VISIBLE_DEVICES</code>, respectively.</p>
<pre><code>export FD_LOG_DIR=&quot;log_prefill&quot;
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8180 --metrics-port 8181 \
--engine-worker-queue-port 8182 \
--cache-queue-port 8183 \
--tensor-parallel-size 4 \
--quantization wint4 \
--splitwise-role &quot;prefill&quot;
</code></pre>
<pre><code>export FD_LOG_DIR=&quot;log_decode&quot;
export CUDA_VISIBLE_DEVICES=4,5,6,7
# Note that innode-prefill-ports must be set to the Prefill service's engine-worker-queue-port
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8184 --metrics-port 8185 \
--engine-worker-queue-port 8186 \
--cache-queue-port 8187 \
--tensor-parallel-size 4 \
--quantization wint4 \
--innode-prefill-ports 8182 \
--splitwise-role &quot;decode&quot;
</code></pre>
<h4 id="228-cudagraph">2.2.8 CUDAGraph</h4>
<p><strong>Idea:</strong>
CUDAGraph is a GPU computing acceleration technology provided by NVIDIA. It achieves efficient execution and optimization of GPU tasks by capturing CUDA operation sequences into a graph structure. The core idea of CUDAGraph is to encapsulate a series of GPU computing and memory operations into a re-executable graph, thereby reducing CPU-GPU communication overhead, reducing kernel startup latency, and improving overall computing performance.</p>
<p><strong>How to enable:</strong>
Add the following lines to the startup parameters</p>
<pre><code>--use-cudagraph
</code></pre>
<p>Notes:</p>
<ul>
<li>Usually, no additional parameters need to be set. However, CUDAGraph introduces some additional memory overhead, so parameters may need to be adjusted in scenarios with limited memory. For detailed parameter adjustments, please refer to <a href="../../features/graph_optimization/">GraphOptimizationBackend</a> for descriptions of the related configuration parameters.</li>
</ul>
<h2 id="faq">FAQ</h2>
<p>If you encounter any problems during use, you can refer to <a href="../FAQ/">FAQ</a>.</p>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
<div class="md-copyright__highlight">
Copyright &copy; 2025 Maintained by FastDeploy
</div>
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.f55a23d4.min.js"></script>
</body>
</html>