FastDeploy/get_started/installation/iluvatar_gpu/index.html


<!doctype html>
<html lang="en" class="no-js">
  <head>

      <meta charset="utf-8">
      <meta name="viewport" content="width=device-width,initial-scale=1">


        <link rel="prev" href="../Enflame_gcu/">


        <link rel="next" href="../metax_gpu/">


      <link rel="icon" href="../../../assets/images/favicon.ico">
      <meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.6.20">


        <title>Iluvatar CoreX - FastDeploy: Large Language Model Deployment</title>


      <link rel="stylesheet" href="../../../assets/stylesheets/main.e53b48f4.min.css">


        <link rel="stylesheet" href="../../../assets/stylesheets/palette.06af60db.min.css">


        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>


    <!--
      Theme bootstrap helpers, assigned as undeclared (global) names:
      - __md_scope: URL object for "../../.." resolved against the current location.
      - __md_hash(str): folds the characters of str into a number, starting from 0 and
        applying (h << 5) - h + charCode for each character.
      - __md_get(key, storage = localStorage, scope = __md_scope): returns the JSON-parsed
        storage item saved under scope.pathname + "." + key.
      - __md_set(key, value, storage = localStorage, scope = __md_scope): saves value as JSON
        under the same key scheme; any exception thrown while saving is caught and ignored.
    -->
    <script>__md_scope=new URL("../../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>


  </head>


    <body dir="ltr" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo">


    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
    <label class="md-overlay" for="__drawer"></label>
    <div data-md-component="skip">


        <a href="#run-ernie-45-300b-a47b-ernie-45-21b-a3b-model-on-iluvatar-machine" class="md-skip">
          Skip to content
        </a>

    </div>
    <div data-md-component="announce">

    </div>


<header class="md-header md-header--shadow" data-md-component="header">
  <nav class="md-header__inner md-grid" aria-label="Header">
    <a href="../../.." title="FastDeploy: Large Language Model Deployment" class="md-header__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">

  <img src="../../../assets/images/logo.jpg" alt="logo">

    </a>
    <label class="md-header__button md-icon" for="__drawer">

      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
    </label>
    <div class="md-header__title" data-md-component="header-title">
      <div class="md-header__ellipsis">
        <div class="md-header__topic">
          <span class="md-ellipsis">
            FastDeploy: Large Language Model Deployment
          </span>
        </div>
        <div class="md-header__topic" data-md-component="header-topic">
          <span class="md-ellipsis">

              Iluvatar CoreX

          </span>
        </div>
      </div>
    </div>


        <form class="md-header__option" data-md-component="palette">


    <input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="indigo" data-md-color-accent="indigo"  aria-label="Switch to dark mode"  type="radio" name="__palette" id="__palette_0">

      <label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden>
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a4 4 0 0 0-4 4 4 4 0 0 0 4 4 4 4 0 0 0 4-4 4 4 0 0 0-4-4m0 10a6 6 0 0 1-6-6 6 6 0 0 1 6-6 6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
      </label>


    <input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="black" data-md-color-accent="indigo"  aria-label="Switch to system preference"  type="radio" name="__palette" id="__palette_1">

      <label class="md-header__button md-icon" title="Switch to system preference" for="__palette_0" hidden>
        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 18c-.89 0-1.74-.2-2.5-.55C11.56 16.5 13 14.42 13 12s-1.44-4.5-3.5-5.45C10.26 6.2 11.11 6 12 6a6 6 0 0 1 6 6 6 6 0 0 1-6 6m8-9.31V4h-4.69L12 .69 8.69 4H4v4.69L.69 12 4 15.31V20h4.69L12 23.31 15.31 20H20v-4.69L23.31 12z"/></svg>
      </label>


</form>


      <!--
        Restores the stored colour palette onto the body element.
        1. Reads the "__palette" entry through __md_get; nothing happens unless it has a color object.
        2. If the stored media is "(prefers-color-scheme)", selects the palette input whose
           data-md-color-media matches the current result of the "(prefers-color-scheme: light)"
           media query (light when it matches, dark otherwise) and copies that input's media,
           scheme, primary and accent attributes into palette.color.
        3. Sets every key/value pair of palette.color on document.body as an attribute named
           "data-md-color-" + key.
      -->
      <script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>


      <div class="md-header__option">
  <div class="md-select">

    <button class="md-header__button md-icon" aria-label="Select language">
      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="m12.87 15.07-2.54-2.51.03-.03A17.5 17.5 0 0 0 14.07 6H17V4h-7V2H8v2H1v2h11.17C11.5 7.92 10.44 9.75 9 11.35 8.07 10.32 7.3 9.19 6.69 8h-2c.73 1.63 1.73 3.17 2.98 4.56l-5.09 5.02L4 19l5-5 3.11 3.11zM18.5 10h-2L12 22h2l1.12-3h4.75L21 22h2zm-2.62 7 1.62-4.33L19.12 17z"/></svg>
    </button>
    <div class="md-select__inner">
      <ul class="md-select__list">

          <li class="md-select__item">
            <a href="./" hreflang="en" class="md-select__link">
              English
            </a>
          </li>

          <li class="md-select__item">
            <a href="../../../zh/get_started/installation/iluvatar_gpu/" hreflang="zh" class="md-select__link">
              简体中文
            </a>
          </li>

      </ul>
    </div>
  </div>
</div>


        <label class="md-header__button md-icon" for="__search">

          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>
        </label>
        <div class="md-search" data-md-component="search" role="dialog">
  <label class="md-search__overlay" for="__search"></label>
  <div class="md-search__inner" role="search">
    <form class="md-search__form" name="search">
      <input type="text" class="md-search__input" name="query" aria-label="Search" placeholder="Search" autocapitalize="off" autocorrect="off" autocomplete="off" spellcheck="false" data-md-component="search-query" required>
      <label class="md-search__icon md-icon" for="__search">

        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M9.5 3A6.5 6.5 0 0 1 16 9.5c0 1.61-.59 3.09-1.56 4.23l.27.27h.79l5 5-1.5 1.5-5-5v-.79l-.27-.27A6.52 6.52 0 0 1 9.5 16 6.5 6.5 0 0 1 3 9.5 6.5 6.5 0 0 1 9.5 3m0 2C7 5 5 7 5 9.5S7 14 9.5 14 14 12 14 9.5 12 5 9.5 5"/></svg>

        <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20 11v2H8l5.5 5.5-1.42 1.42L4.16 12l7.92-7.92L13.5 5.5 8 11z"/></svg>
      </label>
      <nav class="md-search__options" aria-label="Search">

        <button type="reset" class="md-search__icon md-icon" title="Clear" aria-label="Clear" tabindex="-1">

          <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M19 6.41 17.59 5 12 10.59 6.41 5 5 6.41 10.59 12 5 17.59 6.41 19 12 13.41 17.59 19 19 17.59 13.41 12z"/></svg>
        </button>
      </nav>

    </form>
    <div class="md-search__output">
      <div class="md-search__scrollwrap" tabindex="0" data-md-scrollfix>
        <div class="md-search-result" data-md-component="search-result">
          <div class="md-search-result__meta">
            Initializing search
          </div>
          <ol class="md-search-result__list" role="presentation"></ol>
        </div>
      </div>
    </div>
  </div>
</div>


      <div class="md-header__source">
        <a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
  <div class="md-source__icon md-icon">

    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
  </div>
  <div class="md-source__repository">
    FastDeploy
  </div>
</a>
      </div>

  </nav>

</header>

    <div class="md-container" data-md-component="container">


      <main class="md-main" data-md-component="main">
        <div class="md-main__inner md-grid">


              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
                <div class="md-sidebar__scrollwrap">
                  <div class="md-sidebar__inner">


<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
  <label class="md-nav__title" for="__drawer">
    <a href="../../.." title="FastDeploy: Large Language Model Deployment" class="md-nav__button md-logo" aria-label="FastDeploy: Large Language Model Deployment" data-md-component="logo">

  <img src="../../../assets/images/logo.jpg" alt="logo">

    </a>
    FastDeploy: Large Language Model Deployment
  </label>

    <div class="md-nav__source">
      <a href="https://github.com/PaddlePaddle/FastDeploy" title="Go to repository" class="md-source" data-md-component="source">
  <div class="md-source__icon md-icon">

    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 7.0.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2025 Fonticons, Inc.--><path d="M439.6 236.1 244 40.5c-5.4-5.5-12.8-8.5-20.4-8.5s-15 3-20.4 8.4L162.5 81l51.5 51.5c27.1-9.1 52.7 16.8 43.4 43.7l49.7 49.7c34.2-11.8 61.2 31 35.5 56.7-26.5 26.5-70.2-2.9-56-37.3L240.3 199v121.9c25.3 12.5 22.3 41.8 9.1 55-6.4 6.4-15.2 10.1-24.3 10.1s-17.8-3.6-24.3-10.1c-17.6-17.6-11.1-46.9 11.2-56v-123c-20.8-8.5-24.6-30.7-18.6-45L142.6 101 8.5 235.1C3 240.6 0 247.9 0 255.5s3 15 8.5 20.4l195.6 195.7c5.4 5.4 12.7 8.4 20.4 8.4s15-3 20.4-8.4l194.7-194.7c5.4-5.4 8.4-12.8 8.4-20.4s-3-15-8.4-20.4"/></svg>
  </div>
  <div class="md-source__repository">
    FastDeploy
  </div>
</a>
    </div>

  <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item">
      <a href="../../.." class="md-nav__link">


  <span class="md-ellipsis">
    FastDeploy

  </span>


      </a>
    </li>


    <li class="md-nav__item md-nav__item--active md-nav__item--nested">


        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>


          <label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">


  <span class="md-ellipsis">
    Quick Start

  </span>


            <span class="md-nav__icon md-icon"></span>
          </label>

        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
          <label class="md-nav__title" for="__nav_2">
            <span class="md-nav__icon md-icon"></span>
            Quick Start
          </label>
          <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item md-nav__item--active md-nav__item--nested">


        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2_1" checked>


          <label class="md-nav__link" for="__nav_2_1" id="__nav_2_1_label" tabindex="0">


  <span class="md-ellipsis">
    Installation

  </span>


            <span class="md-nav__icon md-icon"></span>
          </label>

        <nav class="md-nav" data-md-level="2" aria-labelledby="__nav_2_1_label" aria-expanded="true">
          <label class="md-nav__title" for="__nav_2_1">
            <span class="md-nav__icon md-icon"></span>
            Installation
          </label>
          <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item">
      <a href="../nvidia_gpu/" class="md-nav__link">


  <span class="md-ellipsis">
    Nvidia GPU

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../kunlunxin_xpu/" class="md-nav__link">


  <span class="md-ellipsis">
    KunlunXin XPU

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../hygon_dcu/" class="md-nav__link">


  <span class="md-ellipsis">
    HYGON DCU

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../Enflame_gcu/" class="md-nav__link">


  <span class="md-ellipsis">
    Enflame S60

  </span>


      </a>
    </li>


    <li class="md-nav__item md-nav__item--active">

      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">


        <label class="md-nav__link md-nav__link--active" for="__toc">


  <span class="md-ellipsis">
    Iluvatar CoreX

  </span>


          <span class="md-nav__icon md-icon"></span>
        </label>

      <a href="./" class="md-nav__link md-nav__link--active">


  <span class="md-ellipsis">
    Iluvatar CoreX

  </span>


      </a>


<nav class="md-nav md-nav--secondary" aria-label="Table of contents">


    <label class="md-nav__title" for="__toc">
      <span class="md-nav__icon md-icon"></span>
      Table of contents
    </label>
    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>

        <li class="md-nav__item">
  <a href="#machine-preparation" class="md-nav__link">
    <span class="md-ellipsis">
      Machine Preparation
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#image-preparation" class="md-nav__link">
    <span class="md-ellipsis">
      Image Preparation
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#container-preparation" class="md-nav__link">
    <span class="md-ellipsis">
      Container Preparation
    </span>
  </a>

    <nav class="md-nav" aria-label="Container Preparation">
      <ul class="md-nav__list">

          <li class="md-nav__item">
  <a href="#start-container" class="md-nav__link">
    <span class="md-ellipsis">
      Start Container
    </span>
  </a>

</li>

          <li class="md-nav__item">
  <a href="#install-paddle" class="md-nav__link">
    <span class="md-ellipsis">
      Install paddle
    </span>
  </a>

</li>

          <li class="md-nav__item">
  <a href="#install-or-build-fastdeploy" class="md-nav__link">
    <span class="md-ellipsis">
      Install or build FastDeploy
    </span>
  </a>

</li>

      </ul>
    </nav>

</li>

        <li class="md-nav__item">
  <a href="#prepare-the-inference-demo-script" class="md-nav__link">
    <span class="md-ellipsis">
      Prepare the inference demo script
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#run-demo" class="md-nav__link">
    <span class="md-ellipsis">
      run demo
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#run-ernie45-300b-model-with-the-gsm8k-dataset" class="md-nav__link">
    <span class="md-ellipsis">
      Run ernie4.5 300B model with the GSM8K dataset
    </span>
  </a>

</li>

    </ul>

</nav>

    </li>


    <li class="md-nav__item">
      <a href="../metax_gpu/" class="md-nav__link">


  <span class="md-ellipsis">
    Metax C550

  </span>


      </a>
    </li>


          </ul>
        </nav>

    </li>


    <li class="md-nav__item">
      <a href="../../quick_start/" class="md-nav__link">


  <span class="md-ellipsis">
    Quick Deployment For ERNIE-4.5-0.3B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../quick_start_vl/" class="md-nav__link">


  <span class="md-ellipsis">
    Quick Deployment for ERNIE-4.5-VL-28B-A3B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../ernie-4.5/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-300B-A47B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../ernie-4.5-vl/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-VL-424B-A47B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../quick_start_qwen/" class="md-nav__link">


  <span class="md-ellipsis">
    Quick Deployment For QWEN

  </span>


      </a>
    </li>


          </ul>
        </nav>

    </li>


    <li class="md-nav__item md-nav__item--nested">


        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >


          <label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">


  <span class="md-ellipsis">
    Online Serving

  </span>


            <span class="md-nav__icon md-icon"></span>
          </label>

        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_3">
            <span class="md-nav__icon md-icon"></span>
            Online Serving
          </label>
          <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item">
      <a href="../../../online_serving/" class="md-nav__link">


  <span class="md-ellipsis">
    OpenAI-Compatible API Server

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../online_serving/metrics/" class="md-nav__link">


  <span class="md-ellipsis">
    Monitor Metrics

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../online_serving/scheduler/" class="md-nav__link">


  <span class="md-ellipsis">
    Scheduler

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../online_serving/graceful_shutdown_service/" class="md-nav__link">


  <span class="md-ellipsis">
    Graceful Shutdown

  </span>


      </a>
    </li>


          </ul>
        </nav>

    </li>


    <li class="md-nav__item">
      <a href="../../../offline_inference/" class="md-nav__link">


  <span class="md-ellipsis">
    Offline Inference

  </span>


      </a>
    </li>


    <li class="md-nav__item md-nav__item--nested">


        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_5" >


          <label class="md-nav__link" for="__nav_5" id="__nav_5_label" tabindex="0">


  <span class="md-ellipsis">
    Best Practices

  </span>


            <span class="md-nav__icon md-icon"></span>
          </label>

        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_5_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_5">
            <span class="md-nav__icon md-icon"></span>
            Best Practices
          </label>
          <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item">
      <a href="../../../best_practices/ERNIE-4.5-0.3B-Paddle/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-0.3B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../best_practices/ERNIE-4.5-21B-A3B-Paddle/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-21B-A3B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../best_practices/ERNIE-4.5-300B-A47B-Paddle/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-300B-A47B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../best_practices/ERNIE-4.5-21B-A3B-Thinking/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-21B-A3B-Thinking

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../best_practices/ERNIE-4.5-VL-28B-A3B-Paddle/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-VL-28B-A3B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../best_practices/ERNIE-4.5-VL-424B-A47B-Paddle/" class="md-nav__link">


  <span class="md-ellipsis">
    ERNIE-4.5-VL-424B-A47B

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../best_practices/FAQ/" class="md-nav__link">


  <span class="md-ellipsis">
    FAQ

  </span>


      </a>
    </li>


          </ul>
        </nav>

    </li>


    <li class="md-nav__item md-nav__item--nested">


        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_6" >


          <label class="md-nav__link" for="__nav_6" id="__nav_6_label" tabindex="0">


  <span class="md-ellipsis">
    Quantization

  </span>


            <span class="md-nav__icon md-icon"></span>
          </label>

        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_6_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_6">
            <span class="md-nav__icon md-icon"></span>
            Quantization
          </label>
          <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item">
      <a href="../../../quantization/" class="md-nav__link">


  <span class="md-ellipsis">
    Overview

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../quantization/online_quantization/" class="md-nav__link">


  <span class="md-ellipsis">
    Online Quantization

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../quantization/wint2/" class="md-nav__link">


  <span class="md-ellipsis">
    WINT2 Quantization

  </span>


      </a>
    </li>


          </ul>
        </nav>

    </li>


    <li class="md-nav__item md-nav__item--nested">


        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_7" >


          <label class="md-nav__link" for="__nav_7" id="__nav_7_label" tabindex="0">


  <span class="md-ellipsis">
    Features

  </span>


            <span class="md-nav__icon md-icon"></span>
          </label>

        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_7_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_7">
            <span class="md-nav__icon md-icon"></span>
            Features
          </label>
          <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item">
      <a href="../../../features/prefix_caching/" class="md-nav__link">


  <span class="md-ellipsis">
    Prefix Caching

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/disaggregated/" class="md-nav__link">


  <span class="md-ellipsis">
    Disaggregation

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/chunked_prefill/" class="md-nav__link">


  <span class="md-ellipsis">
    Chunked Prefill

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/load_balance/" class="md-nav__link">


  <span class="md-ellipsis">
    Load Balance

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/speculative_decoding/" class="md-nav__link">


  <span class="md-ellipsis">
    Speculative Decoding

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/structured_outputs/" class="md-nav__link">


  <span class="md-ellipsis">
    Structured Outputs

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/reasoning_output/" class="md-nav__link">


  <span class="md-ellipsis">
    Reasoning Output

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/early_stop/" class="md-nav__link">


  <span class="md-ellipsis">
    Early Stop

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/plugins/" class="md-nav__link">


  <span class="md-ellipsis">
    Plugins

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/sampling/" class="md-nav__link">


  <span class="md-ellipsis">
    Sampling

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/multi-node_deployment/" class="md-nav__link">


  <span class="md-ellipsis">
    MultiNode Deployment

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/graph_optimization/" class="md-nav__link">


  <span class="md-ellipsis">
    Graph Optimization

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/data_parallel_service/" class="md-nav__link">


  <span class="md-ellipsis">
    Data Parallelism

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../features/plas_attention/" class="md-nav__link">


  <span class="md-ellipsis">
    PLAS

  </span>


      </a>
    </li>


          </ul>
        </nav>

    </li>


    <li class="md-nav__item">
      <a href="../../../supported_models/" class="md-nav__link">


  <span class="md-ellipsis">
    Supported Models

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../benchmark/" class="md-nav__link">


  <span class="md-ellipsis">
    Benchmark

  </span>


      </a>
    </li>


    <li class="md-nav__item md-nav__item--nested">


        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_10" >


          <label class="md-nav__link" for="__nav_10" id="__nav_10_label" tabindex="0">


  <span class="md-ellipsis">
    Usage

  </span>


            <span class="md-nav__icon md-icon"></span>
          </label>

        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_10_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_10">
            <span class="md-nav__icon md-icon"></span>
            Usage
          </label>
          <ul class="md-nav__list" data-md-scrollfix>


    <li class="md-nav__item">
      <a href="../../../usage/log/" class="md-nav__link">


  <span class="md-ellipsis">
    Log Description

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../usage/code_overview/" class="md-nav__link">


  <span class="md-ellipsis">
    Code Overview

  </span>


      </a>
    </li>


    <li class="md-nav__item">
      <a href="../../../usage/environment_variables/" class="md-nav__link">


  <span class="md-ellipsis">
    Environment Variables

  </span>


      </a>
    </li>


          </ul>
        </nav>

    </li>


  </ul>
</nav>
                  </div>
                </div>
              </div>


              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
                <div class="md-sidebar__scrollwrap">
                  <div class="md-sidebar__inner">


<nav class="md-nav md-nav--secondary" aria-label="Table of contents">


    <label class="md-nav__title" for="__toc">
      <span class="md-nav__icon md-icon"></span>
      Table of contents
    </label>
    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>

        <li class="md-nav__item">
  <a href="#machine-preparation" class="md-nav__link">
    <span class="md-ellipsis">
      Machine Preparation
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#image-preparation" class="md-nav__link">
    <span class="md-ellipsis">
      Image Preparation
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#container-preparation" class="md-nav__link">
    <span class="md-ellipsis">
      Container Preparation
    </span>
  </a>

    <nav class="md-nav" aria-label="Container Preparation">
      <ul class="md-nav__list">

          <li class="md-nav__item">
  <a href="#start-container" class="md-nav__link">
    <span class="md-ellipsis">
      Start Container
    </span>
  </a>

</li>

          <li class="md-nav__item">
  <a href="#install-paddle" class="md-nav__link">
    <span class="md-ellipsis">
      Install paddle
    </span>
  </a>

</li>

          <li class="md-nav__item">
  <a href="#install-or-build-fastdeploy" class="md-nav__link">
    <span class="md-ellipsis">
      Install or build FastDeploy
    </span>
  </a>

</li>

      </ul>
    </nav>

</li>

        <li class="md-nav__item">
  <a href="#prepare-the-inference-demo-script" class="md-nav__link">
    <span class="md-ellipsis">
      Prepare the inference demo script
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#run-demo" class="md-nav__link">
    <span class="md-ellipsis">
      run demo
    </span>
  </a>

</li>

        <li class="md-nav__item">
  <a href="#run-ernie45-300b-model-with-the-gsm8k-dataset" class="md-nav__link">
    <span class="md-ellipsis">
      Run ernie4.5 300B model with the GSM8K dataset
    </span>
  </a>

</li>

    </ul>

</nav>
                  </div>
                </div>
              </div>


            <div class="md-content" data-md-component="content">
              <article class="md-content__inner md-typeset">


<h1 id="run-ernie-45-300b-a47b-ernie-45-21b-a3b-model-on-iluvatar-machine">Run ERNIE-4.5-300B-A47B &amp; ERNIE-4.5-21B-A3B model on iluvatar machine</h1>
<h2 id="machine-preparation">Machine Preparation</h2>
<p>First, running the ERNIE4.5 300B model requires <code>TP=16</code>, so you need to prepare a machine with the following configuration:</p>
<table>
<thead>
<tr>
<th style="text-align: center;">CPU</th>
<th style="text-align: center;">Memory</th>
<th style="text-align: center;">Card</th>
<th style="text-align: center;">Hard Disk</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: center;">x86</td>
<td style="text-align: center;">1TB</td>
<td style="text-align: center;">16xBI150</td>
<td style="text-align: center;">1TB</td>
</tr>
</tbody>
</table>
<p>Currently, the entire model needs to be loaded into the host memory, which requires more than 600GB of host memory. This issue will be optimized in subsequent versions.</p>
<h2 id="image-preparation">Image Preparation</h2>
<p>Pull the Docker image</p>
<pre><code class="language-bash">docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
</code></pre>
<h2 id="container-preparation">Container Preparation</h2>
<h3 id="start-container">Start Container</h3>
<pre><code class="language-bash">docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
</code></pre>
<p>/home/paddle contains the model files, *.whl packages, and scripts.</p>
<h3 id="install-paddle">Install paddle</h3>
<pre><code class="language-bash">pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
</code></pre>
<p>For the latest Paddle version on Iluvatar, refer to <a href="https://www.paddlepaddle.org.cn/">PaddlePaddle Installation</a>.</p>
<h3 id="install-or-build-fastdeploy">Install or build FastDeploy</h3>
<pre><code class="language-bash">pip3 install fastdeploy_iluvatar_gpu==2.1.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
</code></pre>
<p>You can build FastDeploy from source if you need the <code>latest version</code>.</p>
<pre><code class="language-bash">git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
pip install -r requirements_iluvatar.txt
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
bash build.sh
</code></pre>
<h2 id="prepare-the-inference-demo-script">Prepare the inference demo script</h2>
<p>The scripts are listed below:</p>
<p><code>run_demo.sh</code>:</p>
<pre><code class="language-bash">#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
export FD_DEBUG=1
python3 run_demo.py
</code></pre>
<p><code>run_demo.py</code>:</p>
<pre><code class="language-python">from fastdeploy import LLM, SamplingParams

prompts = [
    &quot;Hello, my name is&quot;,
    &quot;The largest ocean is&quot;,
]

# sampling parameters
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)

# load the model
llm = LLM(model=&quot;/home/paddle/ernie-4_5-21b-a3b-bf16-paddle&quot;, tensor_parallel_size=4, max_model_len=8192, block_size=16, quantization='wint8')

# Perform batch inference
outputs = llm.generate(prompts, sampling_params)
# Note：Replace `/home/paddle/ernie-4_5-21b-a3b-bf16-paddle` in it with the path to the ERNIE model you have downloaded.

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(prompt, generated_text)
</code></pre>
<h2 id="run-demo">run demo</h2>
<pre><code class="language-bash">./run_demo.sh
</code></pre>
<p>The following logs will be printed. Loading the model took approximately 74 seconds, and running the demo took approximately 240 seconds.</p>
<pre><code>/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
  warnings.warn(warning_message)
/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
  warnings.warn(
[2025-07-02 11:07:42,393] [    INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy=&quot;greedy_search&quot; ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
  warnings.warn(
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy=&quot;greedy_search&quot; ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
  warnings.warn(
INFO     2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready...
Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57&lt;00:00,  1.75it/s]
Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08&lt;00:00, 11.73it/s]
INFO     2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds.
Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59&lt;00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Hello, my name is  Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started!
 (1) First, draw a big circle for the ghost's head.
 (2) Then, add two small circles for the eyes, making sure they're not too big.
 (3) Next, draw a wide, open mouth that looks like a big &quot;U&quot;.
 (4) After that, create the body by drawing a slightly smaller circle below the head.
 (5) Now, let's add some arms. Draw two short, curly lines on each side of the body.
 (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance.

Now, let's break down each step:

**Step 1: Drawing the Head**
- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing.

**Step 2: Adding Eyes**
- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look.

**Step 3: Drawing the
The largest ocean is  the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word &quot;ocean&quot; is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
</code></pre>
<h2 id="run-ernie45-300b-model-with-the-gsm8k-dataset">Run ernie4.5 300B model with the GSM8K dataset</h2>
<ol>
<li>Download GSM8K dataset</li>
</ol>
<pre><code class="language-bash">wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
</code></pre>
<ol start="2">
<li>Prepare <code>bench_gsm8k.py</code></li>
</ol>
<pre><code class="language-python"># Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the &quot;License&quot;);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an &quot;AS IS&quot; BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

&quot;&quot;&quot; Fastdeploy + ERNIE-4.5-Turbo 的指标评估 &quot;&quot;&quot;
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
import argparse
import ast
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import requests
from tqdm import tqdm

INVALID = -9999999


def call_generate(prompt, **kwargs):
    &quot;&quot;&quot;
    Generates response based on the input prompt.

    Args:
        prompt (str): The input prompt text.
        **kwargs: Keyword arguments, including server IP address and port number.

    Returns:
        str: The response generated based on the prompt.

    &quot;&quot;&quot;
    url = f&quot;http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions&quot;
    headers = {&quot;Content-Type&quot;: &quot;application/json&quot;}
    data = {
        &quot;messages&quot;: [
            {
                &quot;role&quot;: &quot;user&quot;,
                &quot;content&quot;: prompt,
            }
        ],
        &quot;temperature&quot;: 0.6,
        &quot;max_tokens&quot;: 2047,
        &quot;top_p&quot;: 0.95,
        &quot;do_sample&quot;: True,
    }

    response = requests.post(url, headers=headers, data=json.dumps(data))
    out = response.json()
    return out[&quot;choices&quot;][0][&quot;message&quot;][&quot;content&quot;]


def get_one_example(lines, i, include_answer):
    &quot;&quot;&quot;
    Retrieves a question-answer example from the given list of text lines.

    Args:
        lines (list of dict): A list of question-answer pairs.
        i (int): The index of the question-answer pair to retrieve from lines.
        include_answer (bool): Whether to include the answer in the returned string.

    Returns:
        str: A formatted question-answer string in the format &quot;Question: &lt;question&gt;\nAnswer: &lt;answer&gt;&quot;.

    &quot;&quot;&quot;
    ret = &quot;Question: &quot; + lines[i][&quot;question&quot;] + &quot;\nAnswer:&quot;
    if include_answer:
        ret += &quot; &quot; + lines[i][&quot;answer&quot;]
    return ret


def get_few_shot_examples(lines, k):
    &quot;&quot;&quot;
    Selects k examples from the given list of text lines and concatenates them into a single string.

    Args:
        lines (list): A list containing text lines.
        k (int): The number of examples to select.

    Returns:
        str: A string composed of k examples, separated by two newline characters.
    &quot;&quot;&quot;
    ret = &quot;&quot;
    for i in range(k):
        ret += get_one_example(lines, i, True) + &quot;\n\n&quot;
    return ret


def get_answer_value(answer_str):
    &quot;&quot;&quot;
    Extracts numerical values from an answer string and returns them.

    Args:
        answer_str (str): The string containing the answer.

    Returns:
        The extracted numerical value; returns &quot;INVALID&quot; if extraction fails.
    &quot;&quot;&quot;
    answer_str = answer_str.replace(&quot;,&quot;, &quot;&quot;)
    numbers = re.findall(r&quot;\d+&quot;, answer_str)
    if len(numbers) &lt; 1:
        return INVALID
    try:
        return ast.literal_eval(numbers[-1])
    except SyntaxError:
        return INVALID


def read_jsonl(filename: str):
    &quot;&quot;&quot;
    Reads a JSONL file.

    Args:
        filename (str): Path to the JSONL file.

    Yields:
        dict: A dictionary object corresponding to each line in the JSONL file.
    &quot;&quot;&quot;
    with open(filename) as fin:
        for line in fin:
            if line.startswith(&quot;#&quot;):
                continue
            yield json.loads(line)


def main(args):
    &quot;&quot;&quot;
    Process inputs and generate answers by calling the model in parallel using a thread pool.

    Args:
        args (argparse.Namespace):
            - num_questions (int): Number of questions to process.
            - num_shots (int): Number of few-shot learning examples.
            - ip (str): IP address of the model service.
            - port (int): Port number of the model service.
            - parallel (int): Number of questions to process in parallel.
            - result_file (str): File path to store the results.

    Returns:
        None

    &quot;&quot;&quot;
    # Read data
    filename = &quot;test.jsonl&quot;

    lines = list(read_jsonl(filename))

    # Construct prompts
    num_questions = args.num_questions
    num_shots = args.num_shots
    few_shot_examples = get_few_shot_examples(lines, num_shots)

    questions = []
    labels = []
    for i in range(len(lines[:num_questions])):
        questions.append(get_one_example(lines, i, False))
        labels.append(get_answer_value(lines[i][&quot;answer&quot;]))
    assert all(l != INVALID for l in labels)

    states = [None] * len(labels)

    # Use thread pool
    def get_one_answer(i):
        answer = call_generate(
            prompt=few_shot_examples + questions[i],
            # stop=[&quot;Question&quot;, &quot;Assistant:&quot;, &quot;&lt;|separator|&gt;&quot;],
            ip=args.ip,
            port=args.port,
        )
        states[i] = answer

    tic = time.time()
    if args.parallel == 1:
        for i in tqdm(range(len(questions))):
            get_one_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
            list(
                tqdm(
                    executor.map(get_one_answer, list(range(len(questions)))),
                    total=len(questions),
                )
            )

    latency = time.time() - tic
    preds = []
    for i in range(len(states)):
        preds.append(get_answer_value(states[i]))

    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
    invalid = np.mean(np.array(preds) == INVALID)

    # Print results
    print(f&quot;Accuracy: {acc:.3f}&quot;)
    print(f&quot;Invalid: {invalid:.3f}&quot;)
    print(f&quot;Latency: {latency:.3f} s&quot;)

    with open(args.result_file, &quot;a&quot;) as fout:
        value = {
            &quot;task&quot;: &quot;gsm8k&quot;,
            &quot;backend&quot;: &quot;paddlepaddle&quot;,
            &quot;num_gpus&quot;: 1,
            &quot;latency&quot;: round(latency, 3),
            &quot;accuracy&quot;: round(acc, 3),
            &quot;num_requests&quot;: args.num_questions,
            &quot;other&quot;: {
                &quot;num_questions&quot;: args.num_questions,
                &quot;parallel&quot;: args.parallel,
            },
        }
        fout.write(json.dumps(value) + &quot;\n&quot;)


if __name__ == &quot;__main__&quot;:
    parser = argparse.ArgumentParser()
    parser.add_argument(&quot;--ip&quot;, type=str, default=&quot;127.0.0.1&quot;)
    parser.add_argument(&quot;--port&quot;, type=str, default=&quot;8188&quot;)
    parser.add_argument(&quot;--num-shots&quot;, type=int, default=10)
    parser.add_argument(&quot;--data-path&quot;, type=str, default=&quot;test.jsonl&quot;)
    parser.add_argument(&quot;--num-questions&quot;, type=int, default=1319)
    parser.add_argument(&quot;--result-file&quot;, type=str, default=&quot;result.jsonl&quot;)
    parser.add_argument(&quot;--parallel&quot;, type=int, default=1)
    args = parser.parse_args()
    main(args)
</code></pre>
<ol start="3">
<li>Prepare <code>run_bench.sh</code></li>
</ol>
<pre><code class="language-bash">#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection

python3 -m fastdeploy.entrypoints.openai.api_server --model &quot;/home/paddle/ernie-45t&quot; --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
</code></pre>
<ol start="4">
<li>Running the Script</li>
</ol>
<p>First, open a terminal and run:</p>
<pre><code class="language-bash">./run_bench.sh
</code></pre>
<p>After the service is ready, open another terminal and run:</p>
<pre><code class="language-bash">python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
</code></pre>
<p>It takes about 4.8 hours to run the GSM8K dataset.</p>
<pre><code>Accuracy: 0.962
Invalid: 0.000
Latency: 17332.728 s
</code></pre>


              </article>
            </div>


<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
        </div>

      </main>

        <footer class="md-footer">

  <div class="md-footer-meta md-typeset">
    <div class="md-footer-meta__inner md-grid">
      <div class="md-copyright">

    <div class="md-copyright__highlight">
      Copyright &copy; 2025 Maintained by FastDeploy
    </div>


    Made with
    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
      Material for MkDocs
    </a>

</div>

    </div>
  </div>
</footer>

    </div>
    <div class="md-dialog" data-md-component="dialog">
      <div class="md-dialog__inner md-typeset"></div>
    </div>


      <script id="__config" type="application/json">{"base": "../../..", "features": [], "search": "../../../assets/javascripts/workers/search.973d3a69.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>


      <script src="../../../assets/javascripts/bundle.f55a23d4.min.js"></script>


  </body>
</html>