Files
FastDeploy/offline_inference/index.html

2217 lines
48 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../online_serving/graceful_shutdown_service/">
<link rel="next" href="../best_practices/ERNIE-4.5-0.3B-Paddle/">
<link rel="icon" href="../assets/images/favicon.ico">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.20">
<title>Offline Inference - FastDeploy: Large Language Model Deployment</title>
<link rel="stylesheet" href="../assets/stylesheets/main.e53b48f4.min.css">
<link rel="stylesheet" href="../assets/stylesheets/palette.06af60db.min.css">
<!-- Pre-open connections to both Google Fonts origins: the stylesheet below is
     served from fonts.googleapis.com, while the font files it references come
     from fonts.gstatic.com (a CORS fetch, hence crossorigin on that one). -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>/* Persistence helpers used by inline scripts on this page:
   __md_scope  - site root URL; its pathname prefixes every storage key so
                 multiple sites on one origin do not collide.
   __md_hash   - h = h*31 + charCode string hash ((e<<5)-e === 31*e).
   __md_get    - read + JSON-parse a scoped key (default store: localStorage).
   __md_set    - JSON-encode + write a scoped key; failures (e.g. storage
                 disabled or quota exceeded) are deliberately swallowed. */__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#offline-inference" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href=".." title="FastDeploy: Large Language Model Deployment" class="md-header__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">
<img src="../assets/images/logo.jpg" alt="logo">
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
FastDeploy: Large Language Model Deployment
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Offline Inference
</span>
</div>
</div>
</div>
<form class="md-header__option" data-md-component="palette">
<input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_0">
<label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo" aria-label="Switch to system preference" type="radio" name="__palette" id="__palette_1">
<label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
</label>
</form>
<script>/* Restore the persisted color palette (storage key "__palette", read via
   __md_get) and apply it to <body> as data-md-color-* attributes. Runs inline
   so the stored theme takes effect without waiting for the main script bundle.
   If the stored media is the generic "(prefers-color-scheme)" (follow-system
   choice), first resolve it against the current system setting by picking the
   matching light/dark radio input above and copying its concrete
   media/scheme/primary/accent values into the palette object. */var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}/* copy every color key onto <body> as a data-md-color-* attribute */for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
<div class="md-header__option">
<div class="md-select">
<button class="md-header__button md-icon" aria-label="Select language">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m12.87 15.07-2.54-2.51.03-.03A17.5 17.5 0 0 0 14.07 6H17V4h-7V2H8v2H1v2h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2zm-2.62 7 1.62-4.33L19.12 17z"/></svg>
</button>
<div class="md-select__inner">
<ul class="md-select__list">
<li class="md-select__item">
<a href="./" hreflang="en" class="md-select__link">
English
</a>
</li>
<li class="md-select__item">
<a href="../zh/offline_inference/" hreflang="zh" class="md-select__link">
简体中文
</a>
</li>
</ul>
</div>
</div>
</div>
<label class="md-header__button md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
</label>
<div class="md-search" data-md-component="search" role="dialog">
<label class="md-search__overlay" for="__search"></label>
<div class="md-search__inner" role="search">
<form class="md-search__form" name="search">
<input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
<label class="md-search__icon md-icon" for="__search">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
</label>
<nav class="md-search__options" aria-label="Search">
<button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
</button>
</nav>
</form>
<div class="md-search__output">
<div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
<div class="md-search-result" data-md-component="search-result">
<div class="md-search-result__meta">
Initializing search
</div>
<ol class="md-search-result__list" role="presentation"></ol>
</div>
</div>
</div>
</div>
</div>
<div class="md-header__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href=".." title="FastDeploy: Large Language Model Deployment" class="md-nav__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">
<img src="../assets/images/logo.jpg" alt="logo">
</a>
FastDeploy: Large Language Model Deployment
</label>
<div class="md-nav__source">
<a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
<div class="md-source__icon md-icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
</div>
<div class="md-source__repository">
FastDeploy
</div>
</a>
</div>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href=".." class="md-nav__link">
<span class="md-ellipsis">
FastDeploy
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Quick Start
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Quick Start
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2_1" >
<label class="md-nav__link" for="__nav_2_1" id="__nav_2_1_label" tabindex="0">
<span class="md-ellipsis">
Installation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="2" aria-labelledby="__nav_2_1_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2_1">
<span class="md-nav__icon md-icon"></span>
Installation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../get_started/installation/nvidia_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Nvidia GPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/installation/kunlunxin_xpu/" class="md-nav__link">
<span class="md-ellipsis">
KunlunXin XPU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/installation/hygon_dcu/" class="md-nav__link">
<span class="md-ellipsis">
HYGON DCU
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/installation/Enflame_gcu/" class="md-nav__link">
<span class="md-ellipsis">
Enflame S60
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/installation/iluvatar_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Iluvatar CoreX
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/installation/metax_gpu/" class="md-nav__link">
<span class="md-ellipsis">
Metax C550
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../get_started/quick_start/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment For ERNIE-4.5-0.3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/quick_start_vl/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment for ERNIE-4.5-VL-28B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/ernie-4.5/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/ernie-4.5-vl/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../get_started/quick_start_qwen/" class="md-nav__link">
<span class="md-ellipsis">
Quick Deployment For QWEN
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Online Serving
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Online Serving
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../online_serving/" class="md-nav__link">
<span class="md-ellipsis">
OpenAI-Compatible API Server
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../online_serving/metrics/" class="md-nav__link">
<span class="md-ellipsis">
Monitor Metrics
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../online_serving/scheduler/" class="md-nav__link">
<span class="md-ellipsis">
Scheduler
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../online_serving/graceful_shutdown_service/" class="md-nav__link">
<span class="md-ellipsis">
Graceful Shutdown
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Offline Inference
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Offline Inference
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#1-usage" class="md-nav__link">
<span class="md-ellipsis">
1. Usage
</span>
</a>
<nav class="md-nav" aria-label="1. Usage">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#chat-interface-llmchat" class="md-nav__link">
<span class="md-ellipsis">
Chat Interface (LLM.chat)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#text-completion-interface-llmgenerate" class="md-nav__link">
<span class="md-ellipsis">
Text Completion Interface (LLM.generate)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#2-api-documentation" class="md-nav__link">
<span class="md-ellipsis">
2. API Documentation
</span>
</a>
<nav class="md-nav" aria-label="2. API Documentation">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#21-fastdeployllm" class="md-nav__link">
<span class="md-ellipsis">
2.1 fastdeploy.LLM
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#22-fastdeployllmchat" class="md-nav__link">
<span class="md-ellipsis">
2.2 fastdeploy.LLM.chat
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#23-fastdeployllmgenerate" class="md-nav__link">
<span class="md-ellipsis">
2.3 fastdeploy.LLM.generate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#24-fastdeploysamplingparams" class="md-nav__link">
<span class="md-ellipsis">
2.4 fastdeploy.SamplingParams
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#25-fastdeployenginerequestrequestoutput" class="md-nav__link">
<span class="md-ellipsis">
2.5 fastdeploy.engine.request.RequestOutput
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#26-fastdeployenginerequestcompletionoutput" class="md-nav__link">
<span class="md-ellipsis">
2.6 fastdeploy.engine.request.CompletionOutput
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#27-fastdeployenginerequestrequestmetrics" class="md-nav__link">
<span class="md-ellipsis">
2.7 fastdeploy.engine.request.RequestMetrics
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >
<label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">
<span class="md-ellipsis">
Best Practices
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_5">
<span class="md-nav__icon md-icon"></span>
Best Practices
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../best_practices/ERNIE-4.5-0.3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-0.3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../best_practices/ERNIE-4.5-21B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../best_practices/ERNIE-4.5-300B-A47B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-300B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../best_practices/ERNIE-4.5-21B-A3B-Thinking/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-21B-A3B-Thinking
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../best_practices/ERNIE-4.5-VL-28B-A3B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-28B-A3B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../best_practices/ERNIE-4.5-VL-424B-A47B-Paddle/" class="md-nav__link">
<span class="md-ellipsis">
ERNIE-4.5-VL-424B-A47B
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../best_practices/FAQ/" class="md-nav__link">
<span class="md-ellipsis">
FAQ
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >
<label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">
<span class="md-ellipsis">
Quantization
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_6">
<span class="md-nav__icon md-icon"></span>
Quantization
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../quantization/" class="md-nav__link">
<span class="md-ellipsis">
Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../quantization/online_quantization/" class="md-nav__link">
<span class="md-ellipsis">
Online Quantization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../quantization/wint2/" class="md-nav__link">
<span class="md-ellipsis">
WINT2 Quantization
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" >
<label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">
<span class="md-ellipsis">
Features
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_7">
<span class="md-nav__icon md-icon"></span>
Features
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../features/prefix_caching/" class="md-nav__link">
<span class="md-ellipsis">
Prefix Caching
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/disaggregated/" class="md-nav__link">
<span class="md-ellipsis">
Disaggregation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/chunked_prefill/" class="md-nav__link">
<span class="md-ellipsis">
Chunked Prefill
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/load_balance/" class="md-nav__link">
<span class="md-ellipsis">
Load Balance
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/speculative_decoding/" class="md-nav__link">
<span class="md-ellipsis">
Speculative Decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/structured_outputs/" class="md-nav__link">
<span class="md-ellipsis">
Structured Outputs
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/reasoning_output/" class="md-nav__link">
<span class="md-ellipsis">
Reasoning Output
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/early_stop/" class="md-nav__link">
<span class="md-ellipsis">
Early Stop
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/plugins/" class="md-nav__link">
<span class="md-ellipsis">
Plugins
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/sampling/" class="md-nav__link">
<span class="md-ellipsis">
Sampling
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/multi-node_deployment/" class="md-nav__link">
<span class="md-ellipsis">
MultiNode Deployment
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/graph_optimization/" class="md-nav__link">
<span class="md-ellipsis">
Graph Optimization
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/data_parallel_service/" class="md-nav__link">
<span class="md-ellipsis">
Data Parallelism
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../features/plas_attention/" class="md-nav__link">
<span class="md-ellipsis">
PLAS
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../supported_models/" class="md-nav__link">
<span class="md-ellipsis">
Supported Models
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../benchmark/" class="md-nav__link">
<span class="md-ellipsis">
Benchmark
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >
<label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">
<span class="md-ellipsis">
Usage
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_10">
<span class="md-nav__icon md-icon"></span>
Usage
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../usage/log/" class="md-nav__link">
<span class="md-ellipsis">
Log Description
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../usage/code_overview/" class="md-nav__link">
<span class="md-ellipsis">
Code Overview
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../usage/environment_variables/" class="md-nav__link">
<span class="md-ellipsis">
Environment Variables
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#1-usage" class="md-nav__link">
<span class="md-ellipsis">
1. Usage
</span>
</a>
<nav class="md-nav" aria-label="1. Usage">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#chat-interface-llmchat" class="md-nav__link">
<span class="md-ellipsis">
Chat Interface (LLM.chat)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#text-completion-interface-llmgenerate" class="md-nav__link">
<span class="md-ellipsis">
Text Completion Interface (LLM.generate)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#2-api-documentation" class="md-nav__link">
<span class="md-ellipsis">
2. API Documentation
</span>
</a>
<nav class="md-nav" aria-label="2. API Documentation">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#21-fastdeployllm" class="md-nav__link">
<span class="md-ellipsis">
2.1 fastdeploy.LLM
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#22-fastdeployllmchat" class="md-nav__link">
<span class="md-ellipsis">
2.2 fastdeploy.LLM.chat
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#23-fastdeployllmgenerate" class="md-nav__link">
<span class="md-ellipsis">
2.3 fastdeploy.LLM.generate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#24-fastdeploysamplingparams" class="md-nav__link">
<span class="md-ellipsis">
2.4 fastdeploy.SamplingParams
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#25-fastdeployenginerequestrequestoutput" class="md-nav__link">
<span class="md-ellipsis">
2.5 fastdeploy.engine.request.RequestOutput
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#26-fastdeployenginerequestcompletionoutput" class="md-nav__link">
<span class="md-ellipsis">
2.6 fastdeploy.engine.request.CompletionOutput
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#27-fastdeployenginerequestrequestmetrics" class="md-nav__link">
<span class="md-ellipsis">
2.7 fastdeploy.engine.request.RequestMetrics
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="offline-inference">Offline Inference</h1>
<h2 id="1-usage">1. Usage</h2>
<p>FastDeploy supports offline inference by loading models locally and processing user data. Usage examples:</p>
<h3 id="chat-interface-llmchat">Chat Interface (LLM.chat)</h3>
<pre><code class="language-python">from fastdeploy import LLM, SamplingParams
msg1=[
{&quot;role&quot;: &quot;system&quot;, &quot;content&quot;: &quot;I'm a helpful AI assistant.&quot;},
{&quot;role&quot;: &quot;user&quot;, &quot;content&quot;: &quot;把李白的静夜思改写为现代诗&quot;},
]
msg2 = [
{&quot;role&quot;: &quot;system&quot;, &quot;content&quot;: &quot;I'm a helpful AI assistant.&quot;},
{&quot;role&quot;: &quot;user&quot;, &quot;content&quot;: &quot;Write me a poem about large language model.&quot;},
]
messages = [msg1, msg2]
# Sampling parameters
sampling_params = SamplingParams(top_p=0.95, max_tokens=6400)
# Load model
llm = LLM(model=&quot;ERNIE-4.5-0.3B&quot;, tensor_parallel_size=1, max_model_len=8192)
# Batch inference (internal request queuing and dynamic batching)
outputs = llm.chat(messages, sampling_params)
# Output results
for output in outputs:
prompt = output.prompt
generated_text = output.outputs.text
</code></pre>
<p>Documentation for <code>SamplingParams</code>, <code>LLM.generate</code>, <code>LLM.chat</code>, and output structure <code>RequestOutput</code> is provided below.</p>
<blockquote>
<p>Note: For reasoning models, when loading the model, you need to specify the reasoning_parser parameter. Additionally, during the request, you can toggle the reasoning feature on or off by configuring the <code>enable_thinking</code> parameter within <code>chat_template_kwargs</code>.</p>
</blockquote>
<pre><code class="language-python">from fastdeploy.entrypoints.llm import LLM
# Load the model
llm = LLM(model=&quot;baidu/ERNIE-4.5-VL-28B-A3B-Paddle&quot;, tensor_parallel_size=1, max_model_len=32768, limit_mm_per_prompt={&quot;image&quot;: 100}, reasoning_parser=&quot;ernie-45-vl&quot;)
outputs = llm.chat(
messages=[
{&quot;role&quot;: &quot;user&quot;, &quot;content&quot;: [ {&quot;type&quot;: &quot;image_url&quot;, &quot;image_url&quot;: {&quot;url&quot;: &quot;https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg&quot;}},
{&quot;type&quot;: &quot;text&quot;, &quot;text&quot;: &quot;图中的文物属于哪个年代&quot;}]}
],
chat_template_kwargs={&quot;enable_thinking&quot;: False})
# Output results
for output in outputs:
prompt = output.prompt
generated_text = output.outputs.text
reasoning_text = output.outputs.reasoning_content
</code></pre>
<h3 id="text-completion-interface-llmgenerate">Text Completion Interface (LLM.generate)</h3>
<pre><code class="language-python">from fastdeploy import LLM, SamplingParams
prompts = [
&quot;User: 帮我写一篇关于深圳文心公园的500字游记和赏析。\nAssistant: 好的。&quot;
]
# Sampling parameters
sampling_params = SamplingParams(top_p=0.95, max_tokens=6400)
# Load the model
llm = LLM(model=&quot;baidu/ERNIE-4.5-21B-A3B-Base-Paddle&quot;, tensor_parallel_size=1, max_model_len=8192)
# Batch inference (the LLM internally queues requests and performs dynamic batching based on available resources)
outputs = llm.generate(prompts, sampling_params)
# Output results
for output in outputs:
prompt = output.prompt
generated_text = output.outputs.text
</code></pre>
<blockquote>
<p>Note: The text completion interface is suited to scenarios where the user has already constructed the full context input and expects the model to output only the continuation. No additional <code>prompt</code> template is concatenated during inference.
For <code>chat</code> models, it is recommended to use the Chat Interface (<code>LLM.chat</code>) instead.</p>
</blockquote>
<p>For multimodal models, such as <code>baidu/ERNIE-4.5-VL-28B-A3B-Paddle</code>, when calling the <code>generate</code> interface, you need to provide a prompt that includes images. The usage is as follows:</p>
<pre><code class="language-python">import io
import requests
from PIL import Image
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
PATH = &quot;baidu/ERNIE-4.5-VL-28B-A3B-Paddle&quot;
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
messages = [
{
&quot;role&quot;: &quot;user&quot;,
&quot;content&quot;: [
{&quot;type&quot;:&quot;image_url&quot;, &quot;image_url&quot;: {&quot;url&quot;:&quot;https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg&quot;}},
{&quot;type&quot;:&quot;text&quot;, &quot;text&quot;:&quot;图中的文物属于哪个年代&quot;}
]
}
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
images, videos = [], []
for message in messages:
content = message[&quot;content&quot;]
if not isinstance(content, list):
continue
for part in content:
if part[&quot;type&quot;] == &quot;image_url&quot;:
url = part[&quot;image_url&quot;][&quot;url&quot;]
image_bytes = requests.get(url).content
img = Image.open(io.BytesIO(image_bytes))
images.append(img)
elif part[&quot;type&quot;] == &quot;video_url&quot;:
url = part[&quot;video_url&quot;][&quot;url&quot;]
video_bytes = requests.get(url).content
videos.append({
&quot;video&quot;: video_bytes,
&quot;max_frames&quot;: 30
})
sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
llm = LLM(model=PATH, tensor_parallel_size=1, max_model_len=32768, limit_mm_per_prompt={&quot;image&quot;: 100}, reasoning_parser=&quot;ernie-45-vl&quot;)
outputs = llm.generate(prompts={
&quot;prompt&quot;: prompt,
&quot;multimodal_data&quot;: {
&quot;image&quot;: images,
&quot;video&quot;: videos
}
}, sampling_params=sampling_params)
# Output results
for output in outputs:
prompt = output.prompt
generated_text = output.outputs.text
reasoning_text = output.outputs.reasoning_content
</code></pre>
<blockquote>
<p>Note: The <code>generate interface</code> does not currently support passing parameters to control the thinking function (on/off). It always uses the model's default parameters.</p>
</blockquote>
<h2 id="2-api-documentation">2. API Documentation</h2>
<h3 id="21-fastdeployllm">2.1 fastdeploy.LLM</h3>
<p>For <code>LLM</code> configuration, refer to <a href="../parameters/">Parameter Documentation</a>.</p>
<blockquote>
<p>Configuration Notes:</p>
<ol>
<li><code>port</code> and <code>metrics_port</code> are only used for online inference.</li>
<li>After startup, the service logs KV Cache block count (e.g. <code>total_block_num:640</code>). Multiply this by block_size (default 64) to get total cacheable tokens.</li>
<li>Calculate <code>max_num_seqs</code> based on cacheable tokens. Example: avg input=800 tokens, output=500 tokens, blocks=640 → <code>kv_cache_ratio = 800/(800+500)=0.6</code>, <code>max_num_seqs = 640*64/(800+500)=31</code>.</li>
</ol>
</blockquote>
<h3 id="22-fastdeployllmchat">2.2 fastdeploy.LLM.chat</h3>
<ul>
<li>messages(list[dict],list[list[dict]]): Input messages (batch supported)</li>
<li>sampling_params: See 2.4 for parameter details</li>
<li>use_tqdm: Enable progress visualization</li>
<li>chat_template_kwargs(dict): Extra template parameters (currently supports enable_thinking(bool))
<em>usage example: <code>chat_template_kwargs={"enable_thinking": False}</code></em></li>
</ul>
<h3 id="23-fastdeployllmgenerate">2.3 fastdeploy.LLM.generate</h3>
<ul>
<li>prompts(str, list[str], list[int], list[list[int]], dict[str, Any], list[dict[str, Any]]): Input prompts (batch supported), accepts decoded token ids
<em>example of using a dict-type parameter: <code>prompts={"prompt": prompt, "multimodal_data": {"image": images}}</code></em></li>
<li>sampling_params: See 2.4 for parameter details</li>
<li>use_tqdm: Enable progress visualization</li>
</ul>
<h3 id="24-fastdeploysamplingparams">2.4 fastdeploy.SamplingParams</h3>
<ul>
<li>presence_penalty(float): Penalizes repeated topics (positive values reduce repetition)</li>
<li>frequency_penalty(float): Strict penalty for repeated tokens</li>
<li>repetition_penalty(float): Direct penalty for repeated tokens (&gt;1 penalizes, &lt;1 encourages)</li>
<li>temperature(float): Controls randomness (higher = more random)</li>
<li>top_p(float): Probability threshold for token selection</li>
<li>top_k(int): Number of tokens considered for sampling</li>
<li>min_p(float): Minimum probability relative to the maximum probability for a token to be considered (&gt;0 filters low-probability tokens to improve quality)</li>
<li>max_tokens(int): Maximum generated tokens (input + output)</li>
<li>min_tokens(int): Minimum forced generation length</li>
<li>bad_words(list[str]): Prohibited words</li>
</ul>
<h3 id="25-fastdeployenginerequestrequestoutput">2.5 fastdeploy.engine.request.RequestOutput</h3>
<ul>
<li>request_id(str): Request identifier</li>
<li>prompt(str): Input content</li>
<li>prompt_token_ids(list[int]): Tokenized input</li>
<li>outputs(fastdeploy.engine.request.CompletionOutput): Results</li>
<li>finished(bool): Completion status</li>
<li>metrics(fastdeploy.engine.request.RequestMetrics): Performance metrics</li>
<li>num_cached_tokens(int): Cached token count (only valid when <code>enable_prefix_caching</code> is enabled)</li>
<li>error_code(int): Error code</li>
<li>error_msg(str): Error message</li>
</ul>
<h3 id="26-fastdeployenginerequestcompletionoutput">2.6 fastdeploy.engine.request.CompletionOutput</h3>
<ul>
<li>index(int): Batch index</li>
<li>send_idx(int): Request token index</li>
<li>token_ids(list[int]): Output tokens</li>
<li>text(str): Decoded text</li>
<li>reasoning_content(str): (X1 model only) Chain-of-thought output</li>
</ul>
<h3 id="27-fastdeployenginerequestrequestmetrics">2.7 fastdeploy.engine.request.RequestMetrics</h3>
<ul>
<li>arrival_time(float): Request receipt time</li>
<li>inference_start_time(float): Inference start time</li>
<li>first_token_time(float): First token latency</li>
<li>time_in_queue(float): Queuing time</li>
<li>model_forward_time(float): Forward pass duration</li>
<li>model_execute_time(float): Total execution time (including preprocessing)</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
<div class="md-copyright__highlight">
Copyright &copy; 2025 Maintained by FastDeploy
</div>
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"base": "..", "features": [], "search": "../assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../assets/javascripts/bundle.f55a23d4.min.js"></script>
</body>
</html>