Sync v2.0 version of code to github repo

Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions


@@ -1,192 +0,0 @@
"""
This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py
"""
"""Expert Parallelism Load Balancer (EPLB)"""
from typing import Tuple
import numpy as np
def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np.ndarray]:
"""
Pack n weighted objects to m packs, such that each pack contains exactly n/m objects and the weights of all packs
are as balanced as possible.
Parameters:
weight: [X, n], the weight of each item
num_packs: number of packs
Returns:
pack_index: [X, n], the pack index of each item
rank_in_pack: [X, n], the rank of the item in the pack
"""
num_layers, num_groups = weight.shape
assert num_groups % num_packs == 0
groups_per_pack = num_groups // num_packs
if groups_per_pack == 1:
pack_index = np.arange(weight.shape[-1], dtype=np.int32).reshape(1, -1).repeat(num_layers, axis=0)
rank_in_pack = np.zeros_like(weight, dtype=np.int32)
return pack_index, rank_in_pack
indices = np.argsort(-weight.astype(np.float32), axis=-1)
pack_index = np.full_like(weight, fill_value=-1, dtype=np.int32)
rank_in_pack = np.full_like(pack_index, fill_value=-1)
for i in range(num_layers):
pack_weights = [0] * num_packs
pack_items = [0] * num_packs
for group in indices[i]:
pack = min((i for i in range(num_packs) if pack_items[i] < groups_per_pack),
key=pack_weights.__getitem__)
assert pack_items[pack] < groups_per_pack
pack_index[i, group] = pack
rank_in_pack[i, group] = pack_items[pack]
pack_weights[pack] += weight[i, group]
pack_items[pack] += 1
return pack_index, rank_in_pack
def replicate_experts(weight: np.ndarray, num_phy: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
Parameters:
weight: [X, num_log]
num_phy: total number of experts after replication
Returns:
phy2log: [X, num_phy], logical expert id of each physical expert
rank: [X, num_phy], the replica rank
logcnt: [X, num_log], number of replicas for each logical expert
"""
n, num_log = weight.shape
num_redundant = num_phy - num_log
assert num_redundant >= 0
phy2log = np.arange(num_phy, dtype=np.int32).reshape(1, -1).repeat(n, axis=0)
rank = np.zeros((n, num_phy), dtype=np.int32)
logcnt = np.ones((n, num_log), dtype=np.int32)
arangen = np.arange(n, dtype=np.int32)
for i in range(num_log, num_phy):
redundant_indices = np.argmax(weight / logcnt, axis=-1)
phy2log[:, i] = redundant_indices
rank[:, i] = logcnt[arangen, redundant_indices]
logcnt[arangen, redundant_indices] += 1
return phy2log, rank, logcnt
def rebalance_experts_hierarchical(weight: np.ndarray, num_physical_experts: int,
num_groups: int, num_nodes: int, num_gpus: int):
"""
Parameters:
weight: [num_moe_layers, num_logical_experts]
num_physical_experts: number of physical experts after replication
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g., NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [num_moe_layers, num_physical_experts]
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
logical_count: [num_moe_layers, num_logical_experts]
"""
num_layers, num_logical_experts = weight.shape
assert num_logical_experts % num_groups == 0
group_size = num_logical_experts // num_groups
assert num_groups % num_nodes == 0
groups_per_node = num_groups // num_nodes
assert num_gpus % num_nodes == 0
assert num_physical_experts % num_gpus == 0
phy_experts_per_gpu = num_physical_experts // num_gpus
def inverse(perm: np.ndarray) -> np.ndarray:
inv = np.empty_like(perm)
inv[np.arange(perm.shape[0])[:, None], perm] = np.arange(perm.shape[1], dtype=np.int32).reshape(1, -1)
return inv
# Step 1: pack groups to nodes
tokens_per_group = weight.reshape(num_layers, num_groups, group_size).sum(axis=-1)
group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) * group_size)[:, :, None] +
np.arange(group_size, dtype=np.int32)).reshape(num_layers, -1)
mlog2log = inverse(log2mlog)
# Step 2: construct redundant experts within nodes
tokens_per_mlog = np.take_along_axis(weight, mlog2log, axis=-1).reshape(-1, num_logical_experts // num_nodes)
phy2mlog, phyrank, mlogcnt = replicate_experts(tokens_per_mlog, num_physical_experts // num_nodes)
# Step 3: pack physical_experts to GPUs
tokens_per_phy = np.take_along_axis(tokens_per_mlog / mlogcnt, phy2mlog, axis=-1)
pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
pphy2phy = inverse(phy2pphy)
pphy2mlog = np.take_along_axis(phy2mlog, pphy2phy, axis=-1) # [num_layers * num_nodes, num_log_per_nodes]
pphy2mlog = (pphy2mlog.reshape(num_layers, num_nodes, -1) +
np.arange(0, num_logical_experts, num_logical_experts // num_nodes, dtype=np.int32)
.reshape(1, -1, 1)).reshape(num_layers, -1)
pphy2log = np.take_along_axis(mlog2log, pphy2mlog, axis=-1)
pphyrank = np.take_along_axis(phyrank, pphy2phy, axis=-1).reshape(num_layers, -1)
logcnt = np.take_along_axis(mlogcnt.reshape(num_layers, -1), log2mlog, axis=-1)
return pphy2log, pphyrank, logcnt
def rebalance_experts(weight: np.ndarray, num_replicas: int, num_groups: int,
num_nodes: int, num_gpus: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Entry point for expert-parallelism load balancer.
Parameters:
weight: [layers, num_logical_experts], the load statistics for all logical experts
num_replicas: number of physical experts, must be a multiple of `num_gpus`
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g., NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [layers, num_replicas], the expert index of each replica
logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
"""
num_layers, num_logical_experts = weight.shape
weight = weight.astype(np.float32)
if num_groups % num_nodes == 0:
# use hierarchical load-balance policy
phy2log, phyrank, logcnt = rebalance_experts_hierarchical(weight, num_replicas,
num_groups, num_nodes, num_gpus)
else:
# use global load-balance policy
phy2log, phyrank, logcnt = replicate_experts(weight, num_replicas)
maxlogcnt = logcnt.max()
log2phy = np.full((num_layers, num_logical_experts, maxlogcnt), -1, dtype=np.int32)
np.put_along_axis(log2phy.reshape(num_layers, -1)[:, :, None],
(phy2log * maxlogcnt + phyrank)[:, :, None],
np.arange(num_replicas, dtype=np.int32).reshape(1, -1).repeat(num_layers, axis=0)[:, :, None],
axis=1)
return phy2log, log2phy, logcnt
__all__ = ['rebalance_experts']
def main():
""" """
num_hidden_layers = 3
num_expert = 64
num_groups = 8
num_replicas = 64
num_nodes = 4
num_gpus = 4 * 8
model_tokens_per_expert_stats_list = np.random.randint(
low=1, high=10, size=(num_hidden_layers, num_expert))
phy2log, phyrank, logcnt = rebalance_experts(model_tokens_per_expert_stats_list,
num_replicas, num_groups, num_nodes, num_gpus)
print(phy2log)
print(phyrank)
print(logcnt)
if __name__ == '__main__':
main()
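
If balanced_packing above is importable, a quick sanity check on made-up weights (one layer, four groups, two packs) behaves like this; the values are hypothetical and only illustrate the greedy packing order:

import numpy as np

weight = np.array([[9, 1, 5, 5]], dtype=np.float32)      # one layer, four groups
pack_index, rank_in_pack = balanced_packing(weight, num_packs=2)
print(pack_index)     # [[0 0 1 1]]: the heaviest and the lightest group share pack 0
print(rank_in_pack)   # [[0 1 0 1]]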


@@ -1,155 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""
redundant expert manager
"""
import paddle
import numpy as np
from paddlenlp.utils.log import logger
from fastdeploy.model_executor.eplb.eplb import rebalance_experts
class RedundantExpertManger:
"""
RedundantExpertManger
"""
def __init__(self,
n_routed_experts,
num_hidden_layers,
redundant_experts_num,
ep_size):
self.num_expert = n_routed_experts
self.redundant_experts_num = redundant_experts_num
self.num_hidden_layers = num_hidden_layers
self.num_replicas = self.num_expert + self.redundant_experts_num
self.num_nodes = max(ep_size // 8, 1)
self.num_gpus = ep_size
self.num_groups = 1
self.export_per_rank = self.num_replicas // ep_size
assert self.num_replicas % ep_size == 0, \
f"num_replicas must be divisible by ep_size, \
but got num_replicas = {self.num_replicas}, ep_size = {ep_size}"
self.model_ep_rank_to_expert_id_list = paddle.full(
shape=[self.num_hidden_layers, self.num_expert + self.redundant_experts_num],
fill_value=-1,
dtype="int32")
self.model_expert_id_to_ep_rank_array = paddle.full(
shape=[self.num_hidden_layers, self.num_expert, self.redundant_experts_num + 1],
fill_value=-1,
dtype="int32")
self.model_expert_in_rank_num_list = paddle.full(
shape=[self.num_hidden_layers, self.num_expert],
fill_value=0,
dtype="int32")
# self.model_ep_rank_to_expert_id_list = paddle.arange(
# self.num_expert + self.redundant_experts_num,
# dtype="int32").tile([self.num_hidden_layers, 1])
# self.model_expert_id_to_ep_rank_array = paddle.arange(
# self.num_expert,
# dtype="int32").reshape([self.num_expert, 1]).tile([self.num_hidden_layers, 1, 1])
# self.model_expert_in_rank_num_list = paddle.full(
# shape=[self.num_hidden_layers, self.num_expert],
# fill_value=1,
# dtype="int32")
self.model_tokens_per_expert_stats_list = paddle.ones(
shape=[self.num_hidden_layers, self.num_expert],
dtype="int32")
rank_expert_list, \
logical_to_physical_map, \
expert_count = rebalance_experts(
self.model_tokens_per_expert_stats_list.cpu().numpy(),
self.num_replicas,
self.num_groups,
self.num_nodes,
self.num_gpus)
self.update_expert_rank_table(rank_expert_list, logical_to_physical_map, expert_count, False)
logger.info(f"moe experts table manager init successfully, ep_size {ep_size} \
num_replicas {self.num_replicas} export_per_rank {self.export_per_rank}")
def get_ep_rank_to_expert_id_list_by_layer(self, layer_id):
"""
get_ep_rank_to_expert_id_list_by_layer
"""
return self.model_ep_rank_to_expert_id_list[layer_id], \
self.model_expert_id_to_ep_rank_array[layer_id], \
self.model_expert_in_rank_num_list[layer_id], \
self.model_tokens_per_expert_stats_list[layer_id]
def get_ep_rank_to_expert_id_list(self, layer_id):
"""
get_ep_rank_to_expert_id_list
"""
return self.model_ep_rank_to_expert_id_list[layer_id], \
self.model_expert_id_to_ep_rank_array[layer_id], \
self.model_expert_in_rank_num_list[layer_id], \
self.model_tokens_per_expert_stats_list[layer_id]
def get_expert_tokens_stats(self, verbose: bool = False, clear_stat: bool = False):
"""
get_per_expert_tokens_stats
"""
try:
if verbose:
return self.model_tokens_per_expert_stats_list.cpu().numpy(), \
self.model_expert_id_to_ep_rank_array.cpu().numpy(), \
self.model_ep_rank_to_expert_id_list.cpu().numpy(), \
self.model_expert_in_rank_num_list.cpu().numpy()
return self.model_tokens_per_expert_stats_list.cpu().numpy(), None, None, None
finally:
if clear_stat:
self.model_tokens_per_expert_stats_list.zero_()
def get_expert_id_to_ep_rank_array(self):
"""
get_expert_id_to_ep_rank_array
"""
return self.model_expert_id_to_ep_rank_array.cpu().numpy()
def update_expert_rank_table(self,
rank_expert_list: np.ndarray,
logical_to_physical_map: np.ndarray,
expert_count: np.ndarray,
clear_stat: bool = True
):
"""
update_expert_rank_table
"""
#update model info
self.model_ep_rank_to_expert_id_list.copy_(paddle.to_tensor(rank_expert_list), True)
self.model_expert_id_to_ep_rank_array.fill_(-1)
self.model_expert_id_to_ep_rank_array[:, :, :logical_to_physical_map.shape[-1]] = \
paddle.to_tensor(logical_to_physical_map)
self.model_expert_in_rank_num_list.copy_(paddle.to_tensor(expert_count), True)
# reset
if clear_stat:
self.model_tokens_per_expert_stats_list.zero_()
if __name__ == '__main__':
print(RedundantExpertManger(64, 2, 8, 8).model_expert_id_to_ep_rank_array)
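
For intuition, a numpy-only sketch (made-up sizes) of the two per-layer tables the manager maintains: the physical-slot-to-expert list and the -1-padded expert-to-physical-slots array, mirroring model_ep_rank_to_expert_id_list and model_expert_id_to_ep_rank_array:

import numpy as np

num_experts, redundant = 4, 2                  # hypothetical: 4 logical experts, 2 extra replicas
phy2log = np.array([0, 1, 2, 3, 0, 2])         # 6 physical slots; experts 0 and 2 are duplicated
log2phy = np.full((num_experts, redundant + 1), -1, dtype=np.int32)
counts = np.zeros(num_experts, dtype=np.int32)
for slot, expert in enumerate(phy2log):
    log2phy[expert, counts[expert]] = slot
    counts[expert] += 1
print(log2phy)    # expert 0 -> slots [0, 4, -1], expert 2 -> slots [2, 5, -1]
print(counts)     # [2 1 2 1]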


@@ -20,7 +20,7 @@ from typing import Callable, Dict, Optional
import paddle.device.cuda.graphs as graphs
import paddle.nn.layer
from fastdeploy.config import LLMConfig
from fastdeploy.config import FDConfig
from fastdeploy.utils import get_logger
logger = get_logger("cudagrpah_piecewise_backend",
@@ -33,7 +33,7 @@ class ConcreteSizeEntry:
# Concrete batch size
runtime_bs: int
# The size is in cudagraph_capture_sizes
use_cuda_graph: bool = True
use_cudagraph: bool = True
# Has runtime-bs been captured before
captured: bool = False
@@ -56,45 +56,56 @@ class CudaGraphPiecewiseBackend:
def __init__(
self,
llm_config: LLMConfig,
fd_config: FDConfig,
runnable: Callable,
):
self.llm_config = llm_config
self.fd_config = fd_config
self.runnable = runnable
self.cuda_graph_capture_size = llm_config.graph_opt_config.cudagraph_capture_sizes
self.cudagraph_capture_sizes = fd_config.graph_opt_config.cudagraph_capture_sizes
self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size
# runtime_bs -> ConcreteSizeEntry
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
for shape in self.cuda_graph_capture_size:
for shape in self.cudagraph_capture_sizes:
self.concrete_size_entries[shape] = ConcreteSizeEntry(
runtime_bs=shape)
print("create all batch size entry")
print("[CUDA GRAPH] Created all batch size entry ")
def __call__(self, **kwargs):
# Get batch size
input_ids: paddle.Tensor = kwargs['input_ids']
batch_size = input_ids.shape[0]
entry = self.concrete_size_entries.get(batch_size)
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
batch_size = ids_remove_padding.shape[0]
padding_batch_size = self.batch_size_to_captured_size[batch_size]
# print(
# f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
# f"The padded batch size is :{padding_batch_size}"
# )
entry = self.concrete_size_entries.get(padding_batch_size)
assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
if entry.runnable is None:
entry.runnable = self.runnable
print(
f"[CUDA GRAPH] new entry lazy initialize with batch size {batch_size}"
)
# print(
# f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
# )
if not entry.use_cuda_graph:
if not entry.use_cudagraph:
return entry.runnable(**kwargs)
# Capture a new cuda graph
if entry.cuda_graph is None:
# Warmup the model
for n in range(entry.num_finished_warmup):
for n in range(entry.num_finished_warmup, self.warm_up_size):
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
print(
f"[CUDA GRAPH] warm up for batch size "
f"{batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
)
# print(
# "[CUDA GRAPH] Warm up for batch size ",
# f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
# )
# Store input addresses for debug
input_addresses = [
@@ -118,11 +129,11 @@ class CudaGraphPiecewiseBackend:
output._clear
paddle.device.synchronize()
print(
f"[CUDA GRAPH] cuda graph captured for batch size {batch_size}"
)
# print(
# f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
# )
# Replay
entry.cuda_graph.replay()
print(f"[CUDA GRAPH] cuda graph replayed for batch size {batch_size}")
# print(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
return entry.output_buffer
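
The lookup above relies on batch_size_to_captured_size to pad a runtime batch to a captured one. A sketch of one plausible way such a mapping could be built (the real table comes from graph_opt_config, so treat this as an assumption):

def build_batch_size_to_captured_size(capture_sizes, max_batch_size):
    """Map every runtime batch size to the smallest captured size that covers it."""
    capture_sizes = sorted(capture_sizes)
    mapping = {}
    for bs in range(1, max_batch_size + 1):
        padded = next((s for s in capture_sizes if s >= bs), None)
        if padded is not None:
            mapping[bs] = padded
    return mapping

print(build_batch_size_to_captured_size([1, 2, 4, 8], 8))
# {1: 1, 2: 2, 3: 4, 4: 4, 5: 8, 6: 8, 7: 8, 8: 8}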


@@ -19,14 +19,14 @@ from typing import Callable, Optional, TypeVar
import paddle.nn.layer
from fastdeploy.config import LLMConfig
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.graph_optimization.graph_optimization_backend import \
GraphOptBackend
_T = TypeVar("_T", bound=type[paddle.nn.Layer])
def support_graph_opt(cls: Optional[_T] = None) -> _T:
def support_graph_optimization(cls: Optional[_T] = None) -> _T:
"""
A decorator for wrapping models or layers with CUDA graph support.
This enables efficient kernel launch sequencing for improved GPU performance.
@@ -34,7 +34,7 @@ def support_graph_opt(cls: Optional[_T] = None) -> _T:
Example usage:
'''
@support_graph_opt
@support_graph_optimization
class ErnieBot(paddle.nn.Layer):
def __init__(**kwargs):
...
@@ -49,15 +49,13 @@ def support_graph_opt(cls: Optional[_T] = None) -> _T:
cls.__bases__ = cls.__bases__ + (GraphOptWrapper, )
origin_init = cls.__init__
def __init__(self, llm_config: LLMConfig, **kwargs):
def __init__(self, fd_config: FDConfig, **kwargs):
""" Decorator model.__init__() func """
origin_init(self, llm_config=llm_config, **kwargs)
self.use_graph_opt = (
not (llm_config.graph_opt_config.graph_opt_level == 0
and not llm_config.graph_opt_config.use_cudagraph))
origin_init(self, fd_config=fd_config, **kwargs)
self.use_graph_opt = fd_config.graph_opt_config.graph_opt_level > 0 or fd_config.graph_opt_config.use_cudagraph
if self.use_graph_opt:
GraphOptWrapper.__init__(self,
llm_config=llm_config,
fd_config=fd_config,
graph_opt_backend=None)
else:
# Not use graph optimization
@@ -81,10 +79,10 @@ class GraphOptWrapper:
def __init__(
self,
graph_opt_backend: Optional[Callable] = None,
llm_config: LLMConfig = None,
fd_config: FDConfig = None,
):
if graph_opt_backend is None:
graph_opt_backend = GraphOptBackend(self.forward, llm_config)
graph_opt_backend = GraphOptBackend(self.forward, fd_config)
self.graph_opt_backend = graph_opt_backend
@abstractmethod
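
A simplified, self-contained sketch of the decorator idea behind support_graph_optimization: wrap the class's __init__ so that, when enabled, the instance also gets a backend wrapping its forward. The names here (support_opt, Backend, use_opt) are illustrative, not the FastDeploy API:

from typing import Callable

class Backend:
    def __init__(self, runnable: Callable):
        self.runnable = runnable
    def __call__(self, **kwargs):
        return self.runnable(**kwargs)        # a real backend would capture/replay here

def support_opt(cls):
    origin_init = cls.__init__
    def __init__(self, use_opt: bool = False, **kwargs):
        origin_init(self, **kwargs)
        self.backend = Backend(self.forward) if use_opt else None
    cls.__init__ = __init__
    return cls

@support_opt
class ToyModel:
    def __init__(self):
        pass
    def forward(self, **kwargs):
        return kwargs

model = ToyModel(use_opt=True)
print(model.backend(x=1))                     # {'x': 1}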


@@ -16,7 +16,9 @@
from typing import Callable, Optional
from fastdeploy.config import LLMConfig
from paddle.jit.dy2static.utils import Backend
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend import \
CudaGraphPiecewiseBackend
@@ -24,38 +26,39 @@ from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend im
class GraphOptBackend:
""" """
llm_config: LLMConfig
fd_config: FDConfig
cudagraph_piecewise_backend: Optional[CudaGraphPiecewiseBackend] = None
def __init__(self, runnable: Callable, llm_config: LLMConfig):
def __init__(self, runnable: Callable, fd_config: FDConfig):
self.runnable = runnable
self.llm_config = llm_config
self.fd_config = fd_config
def __call__(self, **kwargs):
# 1. TODO(gongshaotian): Static graph
if self.llm_config.graph_opt_config.graph_opt_level > 0:
self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[
0]
if self.fd_config.graph_opt_config.graph_opt_level > 0:
# 1. Prepare cuda graph input buffers (contain output of subgraphs)
# 2. Convert dynamic graph to static graph
if self.llm_config.graph_opt_config.graph_opt_level > 1:
# with cinn
pass
else:
# not use cinn
pass
from paddle.jit import sot
backend = (Backend.CINN
if self.fd_config.graph_opt_config.graph_opt_level > 1
else Backend.PHI)
self.runnable = sot.symbolic_translate(self.runnable,
training=False,
backend=backend)
# 3. Split the static graph and get a list of callable obj
def __call__(self, **kwargs):
if not self.fd_config.graph_opt_config.use_cudagraph:
return self.runnable(**kwargs)
if self.cudagraph_piecewise_backend is None:
self.cudagraph_piecewise_backend = CudaGraphPiecewiseBackend(
fd_config=self.fd_config, runnable=self.runnable)
# 4. Get piecewise cuda graph backend list
assert kwargs["forward_meta"].ids_remove_padding is not None
batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0]
return self.runnable # Fake return value
# 2. Dynamic graph
if ((not kwargs["forward_meta"].step_use_cudagraph)
or (batch_size > self.max_captre_batch)):
return self.runnable(**kwargs)
else:
print(self.cudagraph_piecewise_backend is None)
if self.cudagraph_piecewise_backend is None:
self.cudagraph_piecewise_backend = CudaGraphPiecewiseBackend(
llm_config=self.llm_config, runnable=self.runnable)
# TODO(gongshaotian): handling kwargs
assert kwargs["input_ids"] is not None
return self.cudagraph_piecewise_backend.__call__(**kwargs)
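
Putting the dispatch above into one place, a sketch of the assumed control flow: run the plain callable unless CUDA graphs are enabled, the step opts in, and the batch fits under the largest captured size; otherwise lazily build and reuse the piecewise backend (the stand-in below is not the real CudaGraphPiecewiseBackend):

class ToyGraphOptBackend:
    def __init__(self, runnable, use_cudagraph, max_capture_batch):
        self.runnable = runnable
        self.use_cudagraph = use_cudagraph
        self.max_capture_batch = max_capture_batch
        self.cudagraph_backend = None

    def __call__(self, batch_size, step_use_cudagraph, **kwargs):
        if (not self.use_cudagraph or not step_use_cudagraph
                or batch_size > self.max_capture_batch):
            return self.runnable(**kwargs)
        if self.cudagraph_backend is None:
            self.cudagraph_backend = self.runnable    # stand-in for CudaGraphPiecewiseBackend
        return self.cudagraph_backend(**kwargs)

backend = ToyGraphOptBackend(lambda **kw: kw, use_cudagraph=True, max_capture_batch=4)
print(backend(batch_size=2, step_use_cudagraph=True, x=1))    # {'x': 1}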


@@ -0,0 +1,73 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# from fastdeploy.config import FDConfig
__all__ = ['get_guided_backend', 'schema_checker']
def get_guided_backend(
fd_config,
**kwargs,
):
"""
Get the guided decoding backend instance based on configuration.
Args:
fd_config (FDConfig): FastDeploy configuration object containing backend settings
**kwargs: Additional arguments passed to the backend constructor
Returns:
BaseBackend: An instance of the specified guided decoding backend
Raises:
ValueError: If the specified backend is not supported
"""
if fd_config.parallel_config.guided_decoding_backend.lower() == "xgrammar":
from fastdeploy.model_executor.guided_decoding.xgrammar_backend import \
XGrammarBackend
return XGrammarBackend(
fd_config=fd_config,
**kwargs,
)
else:
raise ValueError(
f"Get unsupported backend {fd_config.parallel_config.guided_decoding_backend},"
f" please check your configuration.")
def schema_checker(backend_name: str, **kwargs):
"""
Get the schema checker instance for the specified backend.
Args:
backend_name (str): Name of the backend (e.g. "xgrammar")
**kwargs: Additional arguments passed to the checker constructor
Returns:
BaseChecker: An instance of the specified schema checker
Raises:
ValueError: If the specified backend is not supported
"""
if backend_name.lower() == "xgrammar":
from fastdeploy.model_executor.guided_decoding.xgrammar_backend import \
XGrammarChecker
return XGrammarChecker(**kwargs)
else:
raise ValueError(
f"Get unsupported backend {backend_name}, please check your configuration."
)
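
An illustrative alternative to the if/else dispatch above: a small registry keyed by backend name with lazily registered factories. The placeholder factory below stands in for the lazy XGrammarBackend import; it is not the FastDeploy implementation:

from typing import Callable, Dict

_BACKENDS: Dict[str, Callable[..., object]] = {}

def register_backend(name: str):
    def deco(factory: Callable[..., object]):
        _BACKENDS[name.lower()] = factory
        return factory
    return deco

def get_backend(name: str, **kwargs):
    try:
        return _BACKENDS[name.lower()](**kwargs)
    except KeyError:
        raise ValueError(
            f"Get unsupported backend {name}, please check your configuration.")

@register_backend("xgrammar")
def _make_xgrammar(**kwargs):
    return ("xgrammar-backend", kwargs)       # placeholder for XGrammarBackend(...)

print(get_backend("XGrammar", vocab_size=100))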


@@ -0,0 +1,347 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from concurrent.futures import ThreadPoolExecutor
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.utils import llm_logger
class LogitsProcessorBase:
"""
Abstract base class for logits processors in guided decoding.
This class defines the interface for logits processors that modify token probabilities
during generation to enforce schema constraints. Subclasses should implement all
abstract methods to provide specific constraint enforcement logic.
Attributes:
None (all state should be managed by subclasses)
"""
def __init__(self):
pass
def fill_token_bitmask(self, token_bitmask, idx):
"""
Fill the vocabulary mask.
Args:
token_bitmask (tensor): The vocabulary mask tensor.
idx (tensor): The tensor index.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def apply_token_mask(self, logits, token_bitmask):
"""
Apply the vocabulary mask to logits.
Args:
logits (tensor): The logits tensor.
token_bitmask (tensor): The vocabulary mask tensor.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def allocate_token_bitmask(self, batch_size, vocab_size):
"""
Allocate a token bitmask for the given batch size and vocabulary size.
Args:
batch_size (int): The batch size.
vocab_size (int): The vocabulary size.
Returns:
tensor: The allocated token bitmask.
"""
raise NotImplementedError()
def accept_token(self, token):
"""
Accept tokens based on the token bitmask
Args:
token (int): The token id.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def is_terminated(self):
"""
Check if the processor has been terminated.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def reset(self):
"""
Reset the matcher state.
"""
raise NotImplementedError()
def copy(self):
"""
Create a copy of the backend instance.
Returns:
BackendBase: A copy of the backend instance.
"""
raise NotImplementedError()
class BackendBase:
"""
Abstract base class for guided decoding backends.
This class provides the core infrastructure for managing schema processors and
their caching. It handles:
- Processor creation and caching
- Tokenizer initialization
- Thread pool management for async operations
Attributes:
cache (dict): Cache of schema processors
fd_config (FDConfig): FastDeploy configuration
executor (ThreadPoolExecutor): Thread pool for async operations
max_cache_size (int): Maximum number of processors to cache
hf_tokenizer: HuggingFace tokenizer instance
"""
def __init__(self, fd_config: FDConfig):
self.cache = {}
self.fd_config = fd_config
self.executor = ThreadPoolExecutor()
self.max_cache_size = 2048
self.hf_tokenizer = self._get_tokenizer_hf()
def _create_processor(self):
"""
Create a specific logits processor instance.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _json_processor(self, schemata):
"""
Process JSON schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _regex_processor(self, schemata):
"""
Process regular expression schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _grammar_processor(self, schemata):
"""
Process grammar schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _structural_tag_processor(self, schemata):
"""
Process structural tag schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _unsupported_processor_type(self, key_type, schemata):
"""
Process unsupported type.
Args:
key_type (str): The key type string.
schemata (str): The schemata string.
"""
raise Exception(f"Unsupported processor type {key_type}.")
def _init_logits_processor(
self, schemata_key: tuple[str, str]) -> LogitsProcessorBase:
"""
init logits processor by type and schemata.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
Returns:
LogitsProcessorBase: Initialized logits processor instance
Raises:
ValueError: If processor type is not supported
"""
key_type, schemata = schemata_key
if key_type == "json":
return self._json_processor(schemata)
elif key_type == "regex":
return self._regex_processor(schemata)
elif key_type == "grammar":
return self._grammar_processor(schemata)
elif key_type == "structural_tag":
return self._structural_tag_processor(schemata)
else:
llm_logger.error(f"Unsupported processor type {key_type}.")
return None
def get_logits_processor(
self,
schemata_key: tuple[str, str]) -> tuple[LogitsProcessorBase, bool]:
"""
get logits processor by key from cache or create new one.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
Returns:
tuple[LogitsProcessorBase, bool]: Tuple containing:
- LogitsProcessorBase: The logits processor instance
- bool: True if processor was from cache, False if newly created
"""
value = self.cache.get(schemata_key, None)
if value:
return value.copy(), True
value = self.executor.submit(self._init_logits_processor, schemata_key)
return value, False
def _get_tokenizer_hf(self):
"""
Initialize and return a HuggingFace tokenizer instance.
This method handles special cases for Ernie models and falls back to standard
AutoTokenizer for other models. It also ensures fast tokenizer is used when possible.
Returns:
Tokenizer: Initialized HuggingFace tokenizer instance
Raises:
Exception: If tokenizer initialization fails
"""
try:
architectures = self.fd_config.model_config.architectures
if "Ernie4_5_MoeForCausalLM" not in architectures \
and "Ernie4_5_ForCausalLM" not in architectures:
from transformers import AutoTokenizer, PreTrainedTokenizerFast
tokenizer = AutoTokenizer.from_pretrained(
self.fd_config.parallel_config.model_name_or_path,
use_fast=False,
)
if not isinstance(tokenizer, PreTrainedTokenizerFast):
tokenizer = PreTrainedTokenizerFast(
__slow_tokenizer=tokenizer)
else:
from fastdeploy.model_executor.guided_decoding.ernie_tokenizer import \
ErnieBotTokenizer
vocab_file_names = [
"tokenizer.model", "spm.model", "ernie_token_100k.model"
]
for i in range(len(vocab_file_names)):
if os.path.exists(
os.path.join(
self.fd_config.parallel_config.
model_name_or_path, vocab_file_names[i])):
ErnieBotTokenizer.vocab_files_names[
"vocab_file"] = vocab_file_names[i]
break
tokenizer = ErnieBotTokenizer.from_pretrained(
self.fd_config.parallel_config.model_name_or_path)
return tokenizer
except Exception as e:
raise Exception(f"Fail to initialize hf tokenizer: {e}")
def add_cache(self, schemata_key: tuple[str, str],
processor: LogitsProcessorBase) -> None:
"""
add logits processor to cache.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
processor (LogitsProcessorBase): Logits processor instance to cache
Returns:
None: No return value
"""
if len(self.cache) >= self.max_cache_size:
return
self.cache[schemata_key] = processor.copy()
class BaseChecker:
"""
Abstract base class for schema checkers.
This class defines the interface for validating and formatting schemas
before they are used by logits processors. Subclasses should implement
schema-specific validation and formatting logic.
Attributes:
None (all state should be managed by subclasses)
"""
def __init__(self):
pass
def schema_format(self, request: Request):
"""
format schema to backend specific format.
Args:
request (Request): request object.
Returns:
request (Request): request object with formatted schema.
"""
raise NotImplementedError()
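
A runnable sketch of the cache-or-submit pattern in get_logits_processor/add_cache: a cache hit returns a copy immediately, a miss returns a Future that the caller resolves and then caches. The Processor class below is a stand-in for a real logits processor:

from concurrent.futures import ThreadPoolExecutor

class Processor:
    def __init__(self, key):
        self.key = key
    def copy(self):
        return Processor(self.key)

cache = {}
executor = ThreadPoolExecutor()

def build(key):
    return Processor(key)                     # stands in for _init_logits_processor

def get(key):
    hit = cache.get(key)
    if hit:
        return hit.copy(), True
    return executor.submit(build, key), False

value, cached = get(("json", "{}"))
if not cached:
    cache[("json", "{}")] = value.result().copy()    # resolve the Future, then cache
print(get(("json", "{}"))[1])                 # True: the second lookup hits the cache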


@@ -0,0 +1,266 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {},
"tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
class ErnieBotTokenizer(PreTrainedTokenizer):
"""
Construct a ErnieBot tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
resource_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
bos_token = AddedToken(bos_token,
lstrip=False, rstrip=False) if isinstance(
bos_token, str) else bos_token
eos_token = AddedToken(eos_token,
lstrip=False, rstrip=False) if isinstance(
eos_token, str) else eos_token
unk_token = AddedToken(unk_token,
lstrip=False, rstrip=False) if isinstance(
unk_token, str) else unk_token
pad_token = AddedToken(pad_token,
lstrip=False, rstrip=False) if isinstance(
pad_token, str) else pad_token
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
# for eb35 reader
self.bos_id = self.bos_token_id
self.eos_id = self.eos_token_id
self.sep_id = self.sep_token_id
self.pad_id = self.pad_token_id
self.unk_id = self.unk_token_id
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
def tokenize(self, text):
"""Returns a tokenized string."""
return self._tokenize(text)
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
def decode(self,
tokens,
skip_special_tokens=False,
clean_up_tokenization_spaces=False):
"""Returns a tokenized string."""
return self.sp_model.decode(tokens)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
build inputs with special tokens
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
def create_token_type_ids_from_sequences(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
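
A standalone sketch (no sentencepiece required) of the bos/eos list logic behind build_inputs_with_special_tokens above, with made-up token ids bos=1 and eos=2:

def build_inputs(ids_0, ids_1=None, bos=1, eos=2, add_bos=True, add_eos=False):
    bos_ids = [bos] if add_bos else []
    eos_ids = [eos] if add_eos else []
    out = bos_ids + ids_0 + eos_ids
    if ids_1 is not None:
        out += bos_ids + ids_1 + eos_ids
    return out

print(build_inputs([10, 11, 12]))                      # [1, 10, 11, 12]
print(build_inputs([10, 11], [20], add_eos=True))      # [1, 10, 11, 2, 1, 20, 2]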


@@ -0,0 +1,457 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import json
import re
from typing import Any, List, Optional
import paddle
import torch
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
BackendBase, BaseChecker, LogitsProcessorBase)
from fastdeploy.utils import llm_logger
try:
from xgrammar import (CompiledGrammar, Grammar, GrammarCompiler,
GrammarMatcher, StructuralTagItem, TokenizerInfo,
allocate_token_bitmask, apply_token_bitmask_inplace)
except Exception as e:
raise Exception(
f"import XGrammar failed, please check your environment:\n\t {e}")
class XGrammarProcessor(LogitsProcessorBase):
"""
XGrammar-specific implementation of LogitsProcessorBase.
This processor enforces grammar constraints during token generation using XGrammar.
It manages the grammar matching state and applies token masks to logits.
Attributes:
max_rollback_tokens (int): Maximum number of tokens to rollback on mismatch
vocab_size (int): Size of the vocabulary
batch_size (int): Batch size for processing
splitwise_role (str): Role for splitwise processing
compiled_grammar (CompiledGrammar): Compiled grammar rules
terminate_without_stop_token (bool): Whether to terminate without stop token
override_stop_tokens (Optional[List[int]]): Custom stop tokens
matcher (GrammarMatcher): Grammar matching engine
"""
def __init__(
self,
compiled_grammar: CompiledGrammar,
terminate_without_stop_token: bool = False,
override_stop_tokens: Optional[List[int]] = None,
vocab_size: Optional[int] = None,
batch_size: Optional[int] = None,
splitwise_role: str = "mixed",
):
super().__init__()
self.max_rollback_tokens = 200
self.vocab_size = vocab_size
self.batch_size = batch_size
self.splitwise_role = splitwise_role
self.compiled_grammar = compiled_grammar
self.terminate_without_stop_token = terminate_without_stop_token
self.override_stop_tokens = override_stop_tokens
self.matcher = GrammarMatcher(
compiled_grammar=compiled_grammar,
max_rollback_tokens=self.max_rollback_tokens,
terminate_without_stop_token=terminate_without_stop_token,
override_stop_tokens=override_stop_tokens,
)
def allocate_token_bitmask(self) -> torch.Tensor:
"""
Allocate a token bitmask tensor for grammar constraints.
Returns:
torch.Tensor: A tensor of shape (batch_size, vocab_size) initialized to 0
"""
return allocate_token_bitmask(self.batch_size, self.vocab_size)
def fill_token_bitmask(self, token_bitmask: torch.Tensor,
idx: int) -> None:
"""
Fill the token bitmask with allowed tokens for the given index.
Args:
token_bitmask (torch.Tensor): The token bitmask tensor to fill
idx (int): The batch index to fill the mask for
Returns:
None: Modifies the token_bitmask in-place
"""
self.matcher.fill_next_token_bitmask(token_bitmask, idx)
def apply_token_mask(
self,
logits: paddle.Tensor,
token_bitmask: torch.Tensor,
indices: Optional[List[int]] = None,
) -> paddle.Tensor:
"""
Apply the token mask to the logits, modifying probabilities of invalid tokens.
Args:
logits (paddle.Tensor): The logits tensor to modify
token_bitmask (torch.Tensor): The token bitmask indicating allowed tokens
indices (Optional[List[int]]): Optional list of batch indices to apply mask to
Returns:
paddle.Tensor: The modified logits tensor
"""
origin_place = logits.place
origin_dtype = logits.dtype
logits = torch.from_numpy(logits.numpy())
logits = logits.float() # cpu
apply_token_bitmask_inplace(
logits=logits,
bitmask=token_bitmask.to(logits.device, non_blocking=True),
indices=indices,
)
return paddle.to_tensor(
logits.numpy(),
dtype=origin_dtype,
place=origin_place,
)
def reset(self) -> None:
"""
Reset the grammar matcher state to initial conditions.
Returns:
None: No return value
"""
self.matcher.reset()
def accept_token(self, token: int) -> None:
"""
Validate and accept a generated token against the grammar constraints.
Args:
token (int): The token ID to validate
Raises:
AssertionError: If token is not allowed by the grammar
"""
assert self.matcher.accept_token(
token), f"Failed to accept token {token}"
def is_terminated(self) -> bool:
"""
Check if the grammar matching process has terminated.
Returns:
bool: True if matching has terminated, False otherwise
"""
return self.matcher.is_terminated()
def copy(self) -> "XGrammarProcessor":
"""
Create a deep copy of this processor instance.
Returns:
XGrammarProcessor: A new processor instance with identical state
"""
return XGrammarProcessor(
compiled_grammar=self.compiled_grammar,
terminate_without_stop_token=self.terminate_without_stop_token,
override_stop_tokens=self.override_stop_tokens,
vocab_size=self.vocab_size,
batch_size=self.batch_size,
splitwise_role=self.splitwise_role,
)
class XGrammarBackend(BackendBase):
"""
XGrammar-specific implementation of BackendBase.
This backend handles compilation of various schema types (JSON, regex, grammar)
into XGrammar processors. It manages the grammar compiler and tokenizer info.
Attributes:
vocab_size (int): Size of the vocabulary from config
batch_size (int): Maximum batch size from config
any_whitespace (bool): Whether to allow any whitespace in JSON
splitwise_role (str): Role for splitwise processing
grammar_compiler (GrammarCompiler): Grammar compilation engine
"""
def __init__(
self,
fd_config: FDConfig,
**kwargs,
):
super().__init__(fd_config=fd_config)
self.vocab_size = fd_config.model_config.vocab_size
self.batch_size = fd_config.parallel_config.max_num_seqs
self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace
self.splitwise_role = fd_config.parallel_config.splitwise_role
try:
tokenizer_info = TokenizerInfo.from_huggingface(
self.hf_tokenizer, vocab_size=self.vocab_size)
self.grammar_compiler = GrammarCompiler(
tokenizer_info=tokenizer_info)
except Exception as e:
raise Exception(f"Failed to load XGrammar tokenizer: {e}")
def _create_processor(
self,
compiled_grammar: CompiledGrammar,
terminate_without_stop_token: bool = False,
override_stop_tokens: Optional[List[int]] = None,
) -> XGrammarProcessor:
"""
Create a logits processor instance for the given compiled grammar.
Args:
compiled_grammar (CompiledGrammar): Compiled grammar rules
terminate_without_stop_token (bool): Whether to terminate without stop token
override_stop_tokens (Optional[List[int]]): Custom stop tokens to override defaults
Returns:
XGrammarProcessor: Configured grammar processor instance
"""
return XGrammarProcessor(
compiled_grammar=compiled_grammar,
terminate_without_stop_token=terminate_without_stop_token,
override_stop_tokens=override_stop_tokens,
vocab_size=self.vocab_size,
batch_size=self.batch_size,
splitwise_role=self.splitwise_role,
)
def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile JSON schema into a grammar processor.
Args:
schemata (str): JSON schema string to compile
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
compiled_grammar = self.grammar_compiler.compile_json_schema(
schemata, any_whitespace=self.any_whitespace)
except Exception as e:
llm_logger.error(f"Failed to compile json schema: {e}")
return None
return self._create_processor(compiled_grammar)
def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile regex pattern into a grammar processor.
Args:
schemata (str): Regex pattern string to compile
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
compiled_grammar = self.grammar_compiler.compile_regex(schemata)
except Exception as e:
llm_logger.error(f"Failed to compile regex schema: {e}")
return None
return self._create_processor(compiled_grammar)
def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile grammar (EBNF) into a grammar processor.
Args:
schemata (str): Grammar string in EBNF format
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
compiled_grammar = self.grammar_compiler.compile_grammar(schemata)
except Exception as e:
llm_logger.error(f"Failed to compile ebnf schema: {e}")
return None
return self._create_processor(compiled_grammar)
def _structural_tag_processor(
self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile structural tags into a grammar processor.
Args:
schemata (str): JSON string containing structural tag definitions
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
structural_tag = json.loads(schemata)
tags = [
StructuralTagItem(
begin=structure["begin"],
schema=json.dumps(structure["schema"]),
end=structure["end"],
) for structure in structural_tag["structures"]
]
compiled_grammar = self.grammar_compiler.compile_structural_tag(
tags, structural_tag["triggers"])
except Exception as e:
llm_logger.error(f"Failed to compile structural tags schema: {e}")
return None
return self._create_processor(compiled_grammar)
class XGrammarChecker(BaseChecker):
"""
XGrammar-specific implementation of BaseChecker.
This validator checks and formats various schema types (JSON, regex, grammar)
for compatibility with XGrammar before processing.
Attributes:
any_whitespace (bool): Whether to allow any whitespace in JSON
"""
def __init__(self, **kwargs):
super().__init__()
self.any_whitespace = not kwargs.get("disable_any_whitespace", True)
def _unsupported_json_schema(self, schema: dict[str, Any]) -> bool:
"""
Check if JSON schema contains unsupported features.
Args:
schema (dict[str, Any]): JSON schema to validate
Returns:
bool: True if schema contains unsupported features, False otherwise
"""
def check_object(obj: dict[str, Any]) -> bool:
if not isinstance(obj, dict):
return False
if obj.get("type") in ("integer", "number") and ("multipleOf"
in obj):
return True
if obj.get("type") == "array" and any(
key in obj for key in ("uniqueItems", "contains",
"minContains", "maxContains")):
return True
if obj.get("type") == "string" and "format" in obj:
return True
if obj.get("type") == "object" and any(
key in obj
for key in ("minProperties", "maxProperties",
"propertyNames", "patternProperties")):
return True
for value in obj.values():
if isinstance(value, dict):
if check_object(value):
return True
elif isinstance(value, list):
for item in value:
if isinstance(item, dict) and check_object(item):
return True
return False
return check_object(schema)
def schema_format(self, request: Request):
"""
format schema to backend specific format.
"""
if request.guided_json:
try:
if not isinstance(request.guided_json, str):
guided_json = json.dumps(request.guided_json)
else:
guided_json = request.guided_json
Grammar.from_json_schema(guided_json,
any_whitespace=self.any_whitespace)
except RuntimeError as e:
err_msg = f"Invalid JSON format: {guided_json}, error message: {str(e)}"
return request, err_msg
if self._unsupported_json_schema(guided_json):
err_msg = f"unsupported JSON schema: {guided_json}"
return request, err_msg
request.guided_json = guided_json
return request, None
elif request.guided_grammar:
# TODO: XGrammar only supports GBNF grammars, convert Lark to GBNF
guided_grammar = request.guided_grammar
try:
Grammar.from_ebnf(guided_grammar)
except RuntimeError as e:
err_msg = f"Invalid grammar format: {guided_grammar}, error message: {str(e)}"
return request, err_msg
request.guided_grammar = guided_grammar
return request, None
elif request.guided_json_object:
request.guided_json = '{"type": "object"}'
return request, None
elif request.guided_choice:
try:
escaped_choices = (re.sub(r'(["\\])', r'\\\1', c)
for c in request.guided_choice)
guided_choice = ('root ::= ' +
' | '.join(f'"{c}"' for c in escaped_choices))
Grammar.from_ebnf(guided_choice)
except RuntimeError as e:
err_msg = f"Invalid choice format: {guided_choice}, error message: {str(e)}"
return request, err_msg
request.guided_grammar = guided_choice
return request, None
elif request.structural_tag:
try:
structural_tag = json.loads(request.structural_tag)
tags = [
StructuralTagItem(
begin=s["begin"],
schema=json.dumps(s["schema"]),
end=s["end"],
) for s in structural_tag["structures"]
]
Grammar.from_structural_tag(tags, structural_tag["triggers"])
except RuntimeError as e:
err_msg = f"Invalid structural_tag format: {structural_tag}, error message: {str(e)}"
return request, err_msg
return request, None
else:
# regex is passed through without extra formatting
return request, None
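
A standalone sketch of how schema_format above turns a guided_choice list into a GBNF root rule (escape quotes and backslashes, then join the alternatives); the choices are made up:

import re

choices = ['yes', 'no', 'say "maybe"']
escaped = (re.sub(r'(["\\])', r'\\\1', c) for c in choices)
rule = 'root ::= ' + ' | '.join(f'"{c}"' for c in escaped)
print(rule)   # root ::= "yes" | "no" | "say \"maybe\""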


@@ -15,10 +15,13 @@
"""
# cipher_token=WjI1fQOvhN # do not edit this line
from typing import Optional
import paddle
from paddle import nn
from paddle.incubate.nn.functional import fused_bias_act
from fastdeploy.config import LLMConfig
from fastdeploy.config import FDConfig
from fastdeploy.platforms import current_platform
@@ -29,28 +32,27 @@ class SiluAndMul(nn.Layer):
def __init__(
self,
llm_config: LLMConfig,
bias=None,
act_method="gelu",
dequant_scales=None,
shift=None,
smooth=None,
quant_scale=-1,
fd_config: FDConfig,
bias: paddle.Tensor = None,
act_method: str = "gelu",
dequant_scales: Optional[paddle.Tensor] = None,
shift: Optional[paddle.Tensor] = None,
smooth: Optional[paddle.Tensor] = None,
quant_scale: float = -1,
):
"""
Initialize the activation layer with optional parameters for quantization, bias,
activation method, and more.
Args:
llm_config (Any): Arguments related to inference, including quantization
fd_config (Any): Arguments related to inference, including quantization
settings.
bias (Optional[Tensor]): Optional bias term to be added to the output.
act_method (str, optional): Activation method to be applied.
Defaults to "gelu".
dequant_scales (Optional[List[float]]): Dequantization scales, used in
act_method (str): Activation method to be applied. Defaults to "gelu".
dequant_scales (Optional[Tensor]): Dequantization scales, used in
quantization scenarios.
shift (Optional[float]): Shift factor, used in quantization scenarios.
smooth (Optional[float]): Smoothing factor, used for specific activation
shift (Optional[Tensor]): Shift factor, used in quantization scenarios.
smooth (Optional[Tensor]): Smoothing factor, used for specific activation
functions.
quant_scale (float, optional): Quantization scale, used in quantization
scenarios. Defaults to -1, indicating no quantization.
@@ -61,12 +63,13 @@ class SiluAndMul(nn.Layer):
"""
super().__init__()
if current_platform.is_cuda():
if current_platform.is_cuda() or current_platform.is_xpu():
self.forward = self.forward_cuda
else:
raise NotImplementedError
self.bias = bias
act_method = act_method.lower()
if act_method == "silu":
act_method = "swiglu"
@@ -75,9 +78,9 @@ class SiluAndMul(nn.Layer):
self.shift = shift
self.smooth = smooth
self.quant_scale = quant_scale
self.quant_round_type = llm_config.quant_config.quant_round_type
self.quant_max_bound = llm_config.quant_config.quant_max_bound
self.quant_min_bound = llm_config.quant_config.quant_min_bound
self.quant_round_type = fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
self.quant_max_bound = fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
self.quant_min_bound = fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
self._dtype = self._helper.get_default_dtype()
if self._dtype == "bfloat16":
@@ -91,12 +94,12 @@ class SiluAndMul(nn.Layer):
bfloat16 as default dtype, but received {self._dtype}")
# fp8 is not support smooth quantization
if "float8" in llm_config.model_config.act_dtype:
if fd_config.quant_config and "fp8" in fd_config.quant_config.name():
self.dequant_scales = None
self.shift = None
self.smooth = None
def forward_cuda(self, x):
def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward propagation of the custom activation layer.


@@ -13,15 +13,13 @@
# limitations under the License.
from .attention import Attention
from .append_attn_backend import AppendAttentionBackend
from .attention_selecter import get_attention_backend
from .base_attention_backend import AttentionBackend
from .native_paddle_backend import PaddleNativeAttnBackend
from .attention_selecter import get_attention_backend
from .append_attn_backend import AppendAttentionBackend
from .xpu_attn_backend import XPUAttentionBackend
__all__ = [
"Attention",
"AttentionBackend",
"PaddleNativeAttnBackend",
"get_attention_backend",
"AppendAttentionBackend",
"Attention", "AttentionBackend", "PaddleNativeAttnBackend",
"get_attention_backend", "AppendAttentionBackend", "XPUAttentionBackend"
]


@@ -16,25 +16,28 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional, Tuple
import paddle
from fastdeploy.model_executor.layers.attention.ops import (
append_attention, get_block_shape_and_split_kv_block)
append_attention, get_block_shape_and_split_kv_block,
init_signal_layerwise, open_shm_and_get_meta_signal)
if TYPE_CHECKING:
from paddle._typing.dtype_like import _DTypeLiteral
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
AttentionBackend
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
from fastdeploy.worker.forward_meta import ForwardMeta
@dataclass
class AppendAttentionMetadata:
class AppendAttentionMetadata(AttentionMetadata):
"""
AppendAttentionMetadata
"""
@@ -60,40 +63,65 @@ class AppendAttentionMetadata:
decoder_block_shape_q: Optional[paddle.Tensor] = None
_fuse_kernel_compute_dtype: str = "bf16"
# pd_disaggregation
kv_signal_metadata: Optional[paddle.Tensor] = None
kv_signal_data_list: List[paddle.Tensor] = field(default_factory=list)
class AppendAttentionBackend(AttentionBackend):
"""
AppendAttentionBackend backend implementation.
"""
def __init__(
self,
model_runner: "ModelRunner",
):
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int) -> None:
"""
AppendAttentionBackend __init__
"""
super().__init__()
self.attention_metadata: AppendAttentionMetadata = None
self.block_size = model_runner.args.block_size
self.max_seq_len = model_runner.args.max_model_len
self.rope_theta = (10000.0 if model_runner.model_cfg.rope_theta is None
else model_runner.model_cfg.rope_theta)
self.rope_3d = getattr(model_runner.model_cfg, "rope_3d", False)
self.causal = getattr(model_runner.model_cfg, "causal", True)
self.speculate_method = model_runner.args.speculate_method
self.speculate_max_draft_token_num = model_runner.args.speculate_max_draft_tokens
self.num_heads = model_runner.model_cfg.num_attention_heads // model_runner.nranks
self.kv_num_heads = int(
model_runner.model_cfg.num_key_value_heads) // model_runner.nranks
self.block_size: int = fd_config.parallel_config.block_size
self.max_seq_len: int = fd_config.parallel_config.max_model_len
self.rope_theta: float = (10000.0
if fd_config.model_config.rope_theta is None
else fd_config.model_config.rope_theta)
self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)
self.causal: bool = getattr(fd_config.model_config, "causal", True)
self.speculative_method: str = fd_config.speculative_config.method
self.use_speculate: bool = self.speculative_method is not None
self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
self.rank: int = fd_config.parallel_config.tensor_parallel_rank
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
self.num_layers: int = fd_config.model_config.num_layers
self.max_partition_size: int = int(
os.getenv("FLAGS_max_partition_size", 32768))
# pd_disaggregation
self.use_pd_disaggregation: int = int(
os.getenv("FLAGS_use_pd_disaggregation", 0))
self.start_layer_index: int = fd_config.model_config.start_layer_index
self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
if fd_config.parallel_config.expert_parallel_rank is None:
fd_config.parallel_config.expert_parallel_rank = 0
device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
fd_config.parallel_config.expert_parallel_rank
if self.device_id is None:
self.device_id = device_id
else:
self.device_id = self.device_id.split(",")[device_id]
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attention metadata so that all layers in the forward pass can reuse it."""
metadata = AppendAttentionMetadata()
metadata.encoder_block_shape_q = 64
metadata.decoder_block_shape_q = 16
metadata.max_partition_size = 32768
metadata.encoder_max_partition_size = 32768
metadata.max_partition_size = self.max_partition_size
metadata.encoder_max_partition_size = self.max_seq_len
metadata._dtype = paddle.get_default_dtype()
if metadata._dtype == "bfloat16":
metadata._fuse_kernel_compute_dtype = "bf16"
@@ -128,38 +156,51 @@ class AppendAttentionBackend(AttentionBackend):
self.block_size,
self.speculate_max_draft_token_num + 1,
)
self.attention_metadata = metadata
def get_attntion_meta(self):
# pd_disaggregation
metadata.kv_signal_data_list = [None] * self.num_layers
if self.use_pd_disaggregation:
metadata.kv_signal_metadata = open_shm_and_get_meta_signal(
self.rank, int(self.device_id), self.keep_pd_step_flag)
self.attention_metadata: AttentionMetadata = metadata
forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False)
forward_meta.decoder_tile_ids_per_batch.copy_(
metadata.decoder_tile_ids_per_batch, False)
def get_attntion_meta(self) -> AttentionMetadata:
"""get_attntion_meta"""
return self.attention_metadata
@staticmethod
def get_kv_cache_shape(
self,
max_num_blocks: int,
block_size: int,
kv_num_head: int,
head_dim: int,
):
) -> Tuple[int, int, int, int]:
"""
get_kv_cache_shape
Calculate kv cache shape
"""
return (max_num_blocks, kv_num_head, block_size, head_dim)
return (max_num_blocks, self.kv_num_heads, self.block_size,
self.head_dim)
def forward_mixed(
self,
q,
k,
v,
qkv,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
qkv: paddle.Tensor,
layer: Attention,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""
forward_mixed
"""
metadata = self.attention_metadata
if self.use_pd_disaggregation:
metadata.kv_signal_data_list[
layer.layer_id] = init_signal_layerwise(
metadata.kv_signal_metadata,
layer.layer_id + self.start_layer_index)
res = append_attention(
qkv,
forward_meta.caches[2 * layer.layer_id],
@@ -176,8 +217,8 @@ class AppendAttentionBackend(AttentionBackend):
metadata.kv_batch_ids,
metadata.kv_tile_ids_per_batch,
metadata.kv_num_blocks,
metadata.decoder_batch_ids,
metadata.decoder_tile_ids_per_batch,
forward_meta.decoder_batch_ids, # from buffer
forward_meta.decoder_tile_ids_per_batch, # from buffer
metadata.decoder_num_blocks,
metadata.set_max_lengths,
metadata.max_len_kv,
@@ -193,7 +234,7 @@ class AppendAttentionBackend(AttentionBackend):
getattr(layer, "cache_v_zp", None),
layer.linear_shift,
layer.linear_smooth,
None, # kv_signal_data,
metadata.kv_signal_data_list[layer.layer_id],
metadata._fuse_kernel_compute_dtype,
getattr(layer, "cache_quant_type_str", "none"),
layer.use_neox_rotary_style,
@@ -208,7 +249,6 @@ class AppendAttentionBackend(AttentionBackend):
metadata.encoder_max_partition_size,
self.speculate_max_draft_token_num + 1,
self.causal,
self.speculate_method is not None,
self.speculative_method is not None,
)[0]
return res
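
The device-id bookkeeping in __init__ above maps a (tensor-parallel rank, expert-parallel rank) pair onto an entry of CUDA_VISIBLE_DEVICES for the pd-disaggregation signal path. A standalone sketch of that resolution (helper name is illustrative, not part of the module):

import os

def resolve_device_id(tp_rank: int, tp_degree: int, ep_rank: int) -> str:
    # Logical index across the tensor-parallel x expert-parallel grid.
    logical_id = tp_rank + tp_degree * ep_rank
    visible = os.getenv("CUDA_VISIBLE_DEVICES")
    if visible is None:
        # No masking: the logical index is the physical device id.
        return str(logical_id)
    # Otherwise pick the logical_id-th entry of the visible-device list.
    return visible.split(",")[logical_id]

# With CUDA_VISIBLE_DEVICES="4,5,6,7", tp_rank=1, tp_degree=2, ep_rank=1 -> "7".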

View File

@@ -14,12 +14,17 @@
# limitations under the License.
"""
from typing import Optional
from typing import Dict, Optional
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
from fastdeploy.worker.forward_meta import ForwardMeta
class Attention(nn.Layer):
@@ -29,26 +34,24 @@ class Attention(nn.Layer):
def __init__(
self,
llm_config,
fd_config: FDConfig,
layer_id: int,
logit_cap: float = 0.0,
v_head_dim: int = -1,
rope_type: str = "",
qkv_bias: Optional[paddle.Tensor] = None,
qkv_scale: Optional[paddle.Tensor] = None,
prefix: str = "",
out_scale: float = -1.,
linear_shift=None,
linear_smooth=None,
use_neox_rotary_style=False,
out_scale: float = -1.0,
linear_shift: paddle.Tensor = None,
linear_smooth: paddle.Tensor = None,
use_neox_rotary_style: bool = False,
) -> None:
"""
Initializes the attention layer with the given parameters.
Args:
llm_config (dict): The config of LM model.
fd_config (FDConfig): The config of the LM model.
layer_id (int): The id of current layer.
logit_cap (float, optional): The cap for logits. Defaults to 0.0.
v_head_dim (int, optional): The head dim of value. Defaults to -1.
rope_type (str, optional): The type of RoPE. Defaults to "".
qkv_bias (Optional[paddle.Tensor], optional): The bias of QKV. Defaults to None.
@@ -61,34 +64,46 @@ class Attention(nn.Layer):
ValueError: If the `v_head_dim` is less than 0.
"""
super().__init__()
self.num_heads = llm_config.model_config.num_attention_heads // llm_config.parallel_config.mp_size
self.head_dim = llm_config.model_config.hidden_size // llm_config.model_config.num_attention_heads
self.kv_num_heads = llm_config.model_config.num_key_value_heads // llm_config.parallel_config.mp_size
self.layer_id = layer_id
self.logit_cap = logit_cap
self.v_head_dim = v_head_dim if v_head_dim > 0 else self.head_dim
self.rope_type = rope_type
self.qk_head_dim = self.head_dim
self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree
self.head_dim: int = fd_config.model_config.head_dim
self.kv_num_heads: int = \
fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree
self.layer_id: int = layer_id
self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim
self.rope_type: str = rope_type
self.qk_head_dim: int = self.head_dim
self.prefix: str = prefix
# not use
self.tp_q_head_num = self.num_heads
self.tp_k_head_num = self.num_heads
self.tp_v_head_num = self.num_heads
# not use
self.scaling = 1.0 / (self.head_dim**0.5)
self.linear_shift = linear_shift
self.linear_smooth = linear_smooth
self.qkv_bias = qkv_bias
self.qkv_scale = qkv_scale
self.linear_shift: paddle.Tensor | None = linear_shift
self.linear_smooth: paddle.Tensor | None = linear_smooth
self.qkv_bias: paddle.Tensor | None = qkv_bias
self.qkv_scale: paddle.Tensor | None = qkv_scale
self._dtype = self._helper.get_default_dtype()
self.out_scale = out_scale
self.use_neox_rotary_style = use_neox_rotary_style
if llm_config.kvcache_config is not None:
self.kvcache_quant_method = llm_config.kvcache_config.kvcache_quant_config.get_quant_method(
self.out_scale: float = out_scale
self.use_neox_rotary_style: bool = use_neox_rotary_style
if fd_config.quant_config and hasattr(fd_config.quant_config,
"kv_cache_quant_type"):
self.kvcache_quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(
self)
self.kvcache_quant_method.create_weights(self)
if llm_config.quant_config is not None:
self.quant_max_bound = llm_config.quant_config.quant_max_bound
self.quant_min_bound = llm_config.quant_config.quant_min_bound
else:
self.kvcache_quant_method = None
if self.kvcache_quant_method is None:
logger.info(f"Attention is running in cache kv {self._dtype} mode")
else:
logger.info(
f"Attention is running in cache kv {self.kvcache_quant_method.cache_quant_config.quant_type} mode"
)
def load_state_dict(self, state_dict: Dict[str,
paddle.Tensor | np.ndarray]):
'''
Attention only has quant-related scales, no other parameters.
'''
if self.kvcache_quant_method is not None:
self.kvcache_quant_method.create_weights(self, state_dict)
def forward(
self,
@@ -97,7 +112,7 @@ class Attention(nn.Layer):
v: paddle.Tensor = None,
qkv: paddle.Tensor = None,
forward_meta: ForwardMeta = None,
):
) -> paddle.Tensor:
"""
The forward function of attention layer.
args:

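
As the constructor above shows, query and KV heads are split evenly across tensor-parallel ranks. A small sketch of that partitioning (hypothetical helper, not part of the layer):

from typing import Tuple

def per_rank_head_counts(num_attention_heads: int,
                         num_key_value_heads: int,
                         tensor_parallel_degree: int) -> Tuple[int, int]:
    # Both head counts must divide evenly across ranks, mirroring Attention.__init__.
    assert num_attention_heads % tensor_parallel_degree == 0
    assert num_key_value_heads % tensor_parallel_degree == 0
    return (num_attention_heads // tensor_parallel_degree,
            num_key_value_heads // tensor_parallel_degree)

# e.g. 64 query heads and 8 KV heads on 8 ranks -> (8, 1) heads per rank (GQA).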
View File

@@ -14,26 +14,20 @@
# limitations under the License.
"""
"""
attention backend selector
"""
from fastdeploy.model_executor.layers.attention.base_attention_backend import AttentionBackend
from fastdeploy.platforms import current_platform
from fastdeploy.utils import resolve_obj_from_strname
from functools import cache
from fastdeploy.platforms import _Backend
from fastdeploy.platforms import _Backend, current_platform
from fastdeploy.utils import resolve_obj_from_strname
def backend_name_to_enum(backend_name: str):
def backend_name_to_enum(backend_name: str) -> _Backend:
"""backend_name_to_enum """
assert backend_name is not None
return _Backend.__members__.get(backend_name)
@cache
def _get_attn_backend(
selected_backend
):
def _get_attn_backend(selected_backend: str) -> object:
"""_get_attn_backend """
if isinstance(selected_backend, str):
selected_backend = backend_name_to_enum(selected_backend)
@@ -46,10 +40,6 @@ def _get_attn_backend(
return resolve_obj_from_strname(attention_cls)
def get_attention_backend(
selected_backend
):
def get_attention_backend(selected_backend):
"""Selects which attention backend to use."""
return _get_attn_backend(
selected_backend
)
return _get_attn_backend(selected_backend)
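
The selector above turns a backend name into a member of the _Backend enum, resolves the backend's dotted class path, and caches the lookup. A generic sketch of that pattern, assuming resolve_obj_from_strname imports a dotted "module.ClassName" path; the enum members and paths below are placeholders, not the real platform table:

from enum import Enum
from functools import cache
from importlib import import_module

class ToyBackend(Enum):
    # Placeholder entries; the real _Backend/attention_cls mapping lives in the platform layer.
    NATIVE_ATTN = "my_pkg.backends.NativeAttnBackend"
    APPEND_ATTN = "my_pkg.backends.AppendAttnBackend"

def resolve_obj_from_dotted_name(strname: str):
    # Import "pkg.module.ClassName" and return the class object.
    module_name, obj_name = strname.rsplit(".", 1)
    return getattr(import_module(module_name), obj_name)

@cache
def get_backend_cls(backend_name: str):
    backend = ToyBackend.__members__.get(backend_name)
    assert backend is not None, f"unknown attention backend: {backend_name}"
    return resolve_obj_from_dotted_name(backend.value)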

View File

@@ -1,395 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
from paddle import nn
import fastdeploy
class Attention(nn.Layer):
"""
Attention Layer
"""
def __init__(
self,
inference_args,
prefix,
out_scale=-1,
use_neox_rotary_style=False,
rope_theta=10000.0,
rope_3d=False,
qkv_scale=None,
qkv_bias=None,
linear_shift=None,
linear_smooth=None,
):
"""
Initialize the attention layer with various parameters.
Args:
inference_args (dict or object): Contains arguments for inference, including
number of key-value heads, weight data type, activation data type, etc.
prefix (str): The name of the attention layer for identification purposes.
out_scale (float, optional): Output scale factor. Defaults to -1.
use_neox_rotary_style (bool, optional): Whether to use the NeoX rotary position
encoding style. Defaults to False.
rope_theta (float, optional): Theta value for the rope position encoding. Defaults to 10000.0.
qkv_scale (float or None, optional): Quantization scale for QKV weights.
Used only for certain quantization configurations. Defaults to None.
qkv_bias (Tensor or None, optional): Bias for QKV linear layer. Defaults to None.
linear_shift (float or None, optional): Linear shift factor used in
quantization. Used only for certain quantization configurations.
Defaults to None.
linear_smooth (float or None, optional): Linear smooth factor used in
quantization. Used only for certain quantization configurations.
Defaults to None.
"""
super().__init__()
self.inference_args = inference_args
self.nranks = inference_args.mp_size
self.kv_num_heads = inference_args.num_key_value_heads // self.nranks
self.head_dim = self.inference_args.head_dim
self.prefix = prefix
self.cache_k_scale_name = prefix + ".cachek_matmul.activation_quanter"
self.cache_v_scale_name = prefix + ".cachev_matmul.activation_quanter"
self.out_scale = out_scale
self.cache_k_zp_name = self.cache_k_scale_name + ".zero_point"
self.cache_v_zp_name = self.cache_v_scale_name + ".zero_point"
self.use_neox_rotary_style = use_neox_rotary_style
self.rope_theta = rope_theta
self.rope_3d = rope_3d
self._dtype = self._helper.get_default_dtype()
if self._dtype == "bfloat16":
self._fuse_kernel_compute_dtype = "bf16"
elif self._dtype == "float16":
self._fuse_kernel_compute_dtype = "fp16"
elif self._dtype == "float32":
self._fuse_kernel_compute_dtype = "fp32"
else:
raise ValueError(f"Just support float32, float16 and \
bfloat16 as default dtype, but received {self._dtype}")
self.cache_scale_dtype = (
self._dtype if self.inference_args.use_append_attn else "float32")
self.qkv_bias = qkv_bias
if inference_args.weight_dtype == "int8" and inference_args.act_dtype == "int8":
self.qkv_scale = qkv_scale
self.linear_shift = linear_shift
self.linear_smooth = linear_smooth
if (inference_args.cachekv_dtype == "int8"
or inference_args.cachekv_dtype == "int4"
or inference_args.cachekv_dtype == "float8_e4m3fn"):
self.set_cachekv_scale()
# qkv_bias fused with attention only when W8A8
if not (inference_args.weight_dtype == "int8"
and inference_args.act_dtype == "int8"):
self.qkv_bias = None
def set_cachekv_scale(self):
"""
Set cache key (K) and value (V) scaling factors.
This method initializes and sets the scaling factors for cache key (K) and value (V)
tensors, which are used in attention mechanisms to adjust the scale of the cache
representations. Additionally, it calculates and sets the inverse of these scaling
factors for the output cache K and V tensors.
Args:
None - This method does not take any explicit arguments as it relies on the
instance variables of the class, such as `self.kv_num_heads`,
`self.cache_k_scale_name`, `self.cache_v_scale_name`, and
`self.inference_args.cachekv_scale_dict` for its functionality.
Returns:
None - This method modifies the instance variables directly and does not return
any values.
"""
self.cache_k_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_v_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_k_out_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_v_out_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
if self.cache_k_scale_name in self.inference_args.cachekv_scale_dict:
cache_k_scale = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_k_scale_name]),
self.cache_scale_dtype,
)
cache_k_out_scale = 1.0 / cache_k_scale
else:
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
cache_k_scale = paddle.zeros(self.cache_k_scale.shape,
self.cache_k_scale.dtype)
cache_k_out_scale = paddle.zeros(self.cache_k_out_scale.shape,
self.cache_k_out_scale.dtype)
else:
raise KeyError(
f"{self.cache_k_scale_name} not found in scale dict")
if self.cache_v_scale_name in self.inference_args.cachekv_scale_dict:
cache_v_scale = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_v_scale_name]),
self.cache_scale_dtype,
)
cache_v_out_scale = 1.0 / cache_v_scale
else:
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
cache_v_scale = paddle.zeros(self.cache_v_scale.shape,
self.cache_v_scale.dtype)
cache_v_out_scale = paddle.zeros(self.cache_v_out_scale.shape,
self.cache_v_out_scale.dtype)
else:
raise KeyError(
f"{self.cache_v_scale_name} not found in scale dict")
self.cache_k_scale.set_value(cache_k_scale)
self.cache_v_scale.set_value(cache_v_scale)
self.cache_k_out_scale.set_value(cache_k_out_scale)
self.cache_v_out_scale.set_value(cache_v_out_scale)
if self.inference_args.has_zero_point:
self.cache_k_zp = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_v_zp = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
if self.cache_k_zp_name in self.inference_args.cachekv_scale_dict:
cache_k_zp = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_k_zp_name]),
self.cache_scale_dtype,
)
else:
cache_k_zp = paddle.zeros(
([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
)
if self.cache_v_zp_name in self.inference_args.cachekv_scale_dict:
cache_v_zp = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_v_zp_name]),
self.cache_scale_dtype,
)
else:
cache_v_zp = paddle.zeros(
([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
)
self.cache_k_zp.set_value(cache_k_zp)
self.cache_v_zp.set_value(cache_v_zp)
def forward(
self,
qkv,
input_ids,
rotary_embs,
rotary_emb_dims,
key_cache,
value_cache,
pre_key_cache,
pre_value_cache,
pre_caches_length,
attn_mask,
kv_signal_data,
**kwargs,
):
"""
Compute the attention for a single time step.
Args:
qkv (Tensor): The output of the linear transformation of query, key and value.
Shape: [batch_size, num_heads, seq_len, embed_dim // num_heads].
padding_offset (Tensor): The offset to be added to the sequence length when computing
the attention mask. Shape: [batch_size, 1].
input_ids (Tensor, optional): The input ids of the batch. Used for computing the
attention mask. Default: None. Shape: [batch_size, max_sequence_length].
rotary_embs (Tensor, optional): The rotary position embeddings. Default: None.
Shape: [num_heads, rotary_emb_dims].
rotary_emb_dims (int, optional): The dimension of the rotary position embeddings.
Default: None.
caches (List[Tensor], optional): The cache tensors used in the computation of the
attention. Default: None.
pre_caches (List[Tensor], optional): The pre-computed cache tensors used in the
computation of the attention. Default: None.
pre_caches_length (int, optional): The length of the pre-computed cache tensors.
Default: None.
attn_mask (Tensor, optional): The attention mask. Default: None.
Shape: [batch_size, max_sequence_length].
**kwargs (dict, optional): Additional keyword arguments passed along.
Returns:
Tensor: The output of the linear transformation after applying the attention.
Shape: [batch_size, embed_dim // num_heads].
Raises:
None.
"""
k_quant_scale = kwargs.get("k_quant_scale", None)
v_quant_scale = kwargs.get("v_quant_scale", None)
k_dequant_scale = kwargs.get("k_dequant_scale", None)
v_dequant_scale = kwargs.get("v_dequant_scale", None)
if not self.inference_args.use_dynamic_cachekv_quant:
k_quant_scale = getattr(self, "cache_k_scale", None)
v_quant_scale = getattr(self, "cache_v_scale", None)
k_dequant_scale = getattr(self, "cache_k_out_scale", None)
v_dequant_scale = getattr(self, "cache_v_out_scale", None)
cache_quant_type_str = self.inference_args.cache_quant_type
else:
cache_quant_type_str = "none"
if self.inference_args.use_append_attn:
out = fastdeploy.model_executor.ops.gpu.append_attention(
qkv,
key_cache,
value_cache,
kwargs.get("seq_lens_encoder", None),
kwargs.get("seq_lens_decoder", None),
kwargs.get("seq_lens_this_time", None),
kwargs.get("padding_offsets", None),
kwargs.get("cum_offsets", None),
kwargs.get("block_tables", None),
kwargs.get("encoder_batch_ids", None),
kwargs.get("encoder_tile_ids_per_batch", None),
kwargs.get("encoder_num_blocks", None),
kwargs.get("kv_batch_ids", None),
kwargs.get("kv_tile_ids_per_batch", None),
kwargs.get("kv_num_blocks", None),
kwargs.get("decoder_batch_ids", None),
kwargs.get("decoder_tile_ids_per_batch", None),
kwargs.get("decoder_num_blocks", None),
kwargs.get("set_max_lengths", None),
kwargs.get("max_len_kv", None),
rotary_embs,
attn_mask,
getattr(self, "qkv_bias", None),
getattr(self, "qkv_scale", None),
k_quant_scale,
v_quant_scale,
k_dequant_scale,
v_dequant_scale,
getattr(self, "cache_k_zp", None), # cache_k_zp
getattr(self, "cache_v_zp", None), # cache_v_zp
getattr(self, "linear_shift", None), # out_shifts
getattr(self, "linear_smooth", None), # out_smooths
kv_signal_data,
self._fuse_kernel_compute_dtype,
cache_quant_type_str, # cache_quant_type
self.use_neox_rotary_style,
self.rope_3d,
kwargs.get("max_input_length", -1),
self.inference_args.quant_max_bound,
self.inference_args.quant_min_bound,
self.out_scale, # out_linear_in_scale
kwargs.get("encoder_block_shape_q", 64),
kwargs.get("decoder_block_shape_q", 16),
kwargs.get("max_partition_size", 32768),
kwargs.get("encoder_max_partition_size", 32768),
self.inference_args.speculate_max_draft_token_num +
1, # speculate_max_draft_token_num
True, # causal
self.inference_args.speculate_method
is not None, # speculate_decoder
)[0]
else:
out = paddle.incubate.nn.functional.block_multihead_attention(
qkv,
key_cache,
value_cache,
kwargs.get("seq_lens_encoder", None),
kwargs.get("seq_lens_decoder", None),
kwargs.get("seq_lens_this_time", None),
kwargs.get("padding_offsets", None),
kwargs.get("cum_offsets", None),
kwargs.get("cu_seqlens_q", None),
kwargs.get("cu_seqlens_k", None),
kwargs.get("block_tables", None),
pre_key_cache,
pre_value_cache,
k_quant_scale,
v_quant_scale,
k_dequant_scale,
v_dequant_scale,
getattr(self, "qkv_scale", None),
getattr(self, "qkv_bias", None),
getattr(self, "linear_shift", None),
getattr(self, "linear_smooth", None),
kwargs.get("max_enc_len_this_time", None),
kwargs.get("max_dec_len_this_time", None),
rotary_embs,
attn_mask,
None, # tgt_mask
kwargs.get("max_input_length", -1),
kwargs.get("block_size", 64),
self.use_neox_rotary_style,
self.inference_args.use_dynamic_cachekv_quant,
quant_round_type=self.inference_args.quant_round_type,
quant_max_bound=self.inference_args.quant_max_bound,
quant_min_bound=self.inference_args.quant_min_bound,
out_scale=self.out_scale,
compute_dtype=self._fuse_kernel_compute_dtype,
rope_theta=self.rope_theta,
)[0]
return out
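
The removed layer above stores both a cache quantization scale and its reciprocal (cache_k_out_scale = 1 / cache_k_scale) so the kernel can quantize KV on write and dequantize on read. A numpy sketch of that round trip, assuming the usual symmetric int8 convention (the exact convention of the fused kernel is not shown here):

import numpy as np

def quantize_kv(x: np.ndarray, scale: np.ndarray) -> np.ndarray:
    # Write path: scale into the int8 range, round and clip.
    return np.clip(np.around(x * scale), -127, 127).astype("int8")

def dequantize_kv(q: np.ndarray, out_scale: np.ndarray) -> np.ndarray:
    # Read path: multiply by the precomputed reciprocal scale.
    return q.astype("float32") * out_scale

scale = np.array([127.0 / 4.0], dtype="float32")   # assume |k| <= 4 for this head
out_scale = (1.0 / scale).astype("float32")        # what set_cachekv_scale precomputes
k = np.array([[0.5, -3.2, 1.7]], dtype="float32")
assert np.allclose(dequantize_kv(quantize_kv(k, scale), out_scale), k, atol=4.0 / 127)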

View File

@@ -20,10 +20,16 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
import paddle
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.worker.forward_meta import ForwardMeta
@dataclass
class AttentionMetadata(ABC):
pass
class AttentionBackend(ABC):
@@ -42,7 +48,7 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""
Run a forward.
args:
@@ -88,7 +94,7 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""Run a forward for mix."""
raise NotImplementedError()
@@ -100,7 +106,7 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""Run a forward for decode."""
raise NotImplementedError()
@@ -112,6 +118,6 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""Run a forward for extend."""
raise NotImplementedError()
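
The abstract backend above fixes the surface every concrete backend implements: build metadata once per step, expose it, and provide the forward_* entry points. A toy, dependency-light sketch of that contract (it deliberately does not subclass the real ABC so it stays self-contained; the echo in forward_mixed stands in for a fused kernel call):

from dataclasses import dataclass
import paddle

@dataclass
class ToyAttentionMetadata:
    # Per-step state shared by all layers (dtypes, block tables, ... in the real backends).
    compute_dtype: str = "bf16"

class ToyAttentionBackend:
    def __init__(self) -> None:
        self.attention_metadata = None

    def init_attention_metadata(self, forward_meta) -> None:
        # Built once per forward pass; every layer then reuses it.
        self.attention_metadata = ToyAttentionMetadata()

    def get_attntion_meta(self):  # name kept as in the interface above
        return self.attention_metadata

    def forward_mixed(self, q, k, v, qkv, layer, forward_meta) -> paddle.Tensor:
        # Real backends hand qkv plus the KV caches to a fused kernel here.
        return qkv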

View File

@@ -1,4 +1,3 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
@@ -16,15 +15,14 @@
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import paddle
from paddle.nn.functional import scaled_dot_product_attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import AttentionBackend
from fastdeploy.worker.model_runner import ForwardMeta, ForwardMode
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
AttentionBackend
from fastdeploy.worker.forward_meta import ForwardMeta
class PaddleNativeAttnBackend(AttentionBackend):
@@ -33,10 +31,8 @@ class PaddleNativeAttnBackend(AttentionBackend):
Which is used only for testing purpose.
"""
def __init__(self, device):
def __init__(self) -> None:
super().__init__()
self.forward_metadata = None
self.device = device
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Init the metadata for a forward pass."""
@@ -53,8 +49,8 @@ class PaddleNativeAttnBackend(AttentionBackend):
seq_lens: paddle.Tensor,
extend_prefix_lens: paddle.Tensor,
extend_seq_lens: paddle.Tensor,
causal=False,
):
causal: bool = False,
) -> paddle.Tensor:
"""Run the extend forward by using paddle native sdpa op.
Args:
@@ -111,18 +107,14 @@ class PaddleNativeAttnBackend(AttentionBackend):
per_req_value = v_cache[per_req_tokens].transpose(
[query.dim() - 2, 0])
per_req_out_redudant = (
scaled_dot_product_attention(
per_req_query_redudant.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
)
.squeeze(0)
.transpose([query.dim() - 2, 0])
)
output[start_q:end_q, :,
:] = per_req_out_redudant[prefill_seq_len_q:, :, :]
per_req_out_redudant = (scaled_dot_product_attention(
per_req_query_redudant.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
).squeeze(0).transpose([query.dim() - 2, 0]))
output[start_q:end_q, :, :] = per_req_out_redudant[
prefill_seq_len_q:, :, :]
start_q, start_kv = end_q, end_kv
return output
@@ -132,7 +124,7 @@ class PaddleNativeAttnBackend(AttentionBackend):
key: paddle.Tensor,
value: paddle.Tensor,
is_causal: bool = False,
):
) -> paddle.Tensor:
"""Paddle implementation of scaled dot-product attention."""
# query, key, value shape: [batch_size, num_heads, seq_len, head_size]
d_k = query.shape[-1]
@@ -159,8 +151,8 @@ class PaddleNativeAttnBackend(AttentionBackend):
req_to_token: paddle.Tensor,
req_pool_indices: paddle.Tensor,
seq_lens: paddle.Tensor,
causal=False,
):
causal: bool = False,
) -> paddle.Tensor:
"""Run the decode forward by using paddle native sdpa op.
Args:
@@ -203,16 +195,12 @@ class PaddleNativeAttnBackend(AttentionBackend):
per_req_value = v_cache[per_req_tokens].transpose(
[query.dim() - 2, 0])
per_req_out = (
self._scaled_dot_product_attention(
per_req_query.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
)
.squeeze(0)
.transpose([query.dim() - 2, 0])
)
per_req_out = (self._scaled_dot_product_attention(
per_req_query.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
).squeeze(0).transpose([query.dim() - 2, 0]))
output[start_q:end_q, :, :] = per_req_out
start_q, start_kv = end_q, end_kv
@@ -220,31 +208,28 @@ class PaddleNativeAttnBackend(AttentionBackend):
def forward_extend(
self,
q,
k,
v,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
save_kv_cache=True,
):
save_kv_cache: bool = True,
) -> paddle.Tensor:
"""
Run the prefill and extend (prompt cache) attention forward by using paddle native sdpa op.
"""
if layer.qk_head_dim != layer.v_head_dim:
o = q.new_empty(
(q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
(q.shape[0], layer.num_heads * layer.v_head_dim))
else:
o = paddle.empty_like(q)
if save_kv_cache:
forward_meta.token_to_kv_pool.set_kv_buffer(
layer, forward_meta.out_cache_loc, k, v
)
layer, forward_meta.out_cache_loc, k, v)
use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
q_ = q.view([-1, layer.tp_q_head_num, layer.qk_head_dim])
o_ = o.view([-1, layer.tp_q_head_num, layer.v_head_dim])
q_ = q.view([-1, layer.num_heads, layer.qk_head_dim])
o_ = o.view([-1, layer.num_heads, layer.v_head_dim])
causal = True
@@ -264,31 +249,29 @@ class PaddleNativeAttnBackend(AttentionBackend):
def forward_decode(
self,
q,
k,
v,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""
Run the decoding attention forward by using paddle native sdpa op.
"""
q = q.reshape([-1, layer.tp_q_head_num * layer.qk_head_dim])
q = q.reshape([-1, layer.num_heads * layer.qk_head_dim])
if layer.qk_head_dim != layer.v_head_dim:
o = q.new_empty(
(q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
(q.shape[0], layer.num_heads * layer.v_head_dim))
else:
o = paddle.empty_like(q)
forward_meta.token_to_kv_pool.set_kv_buffer(
layer, forward_meta.out_cache_loc, k, v
)
forward_meta.token_to_kv_pool.set_kv_buffer(layer,
forward_meta.out_cache_loc,
k, v)
use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
q_ = q.view([-1, layer.tp_q_head_num, layer.qk_head_dim])
o_ = o.view([-1, layer.tp_q_head_num, layer.v_head_dim])
q_ = q.view([-1, layer.num_heads, layer.qk_head_dim])
o_ = o.view([-1, layer.num_heads, layer.v_head_dim])
self._run_sdpa_forward_decode(
q_,

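
The native backend's _scaled_dot_product_attention reduces to the textbook formulation on [batch, heads, seq, head_size] tensors. A compact paddle sketch of that computation (a square causal mask is assumed; the backend methods above additionally handle KV-cache gathering and GQA):

import math
import paddle
import paddle.nn.functional as F

def naive_sdpa(query: paddle.Tensor, key: paddle.Tensor, value: paddle.Tensor,
               is_causal: bool = False) -> paddle.Tensor:
    # query/key/value: [batch_size, num_heads, seq_len, head_size]
    d_k = query.shape[-1]
    scores = paddle.matmul(query, key, transpose_y=True) / math.sqrt(d_k)
    if is_causal:
        seq_len = query.shape[-2]  # assumes seq_q == seq_k
        mask = paddle.triu(paddle.full([seq_len, seq_len], float("-inf")), diagonal=1)
        scores = scores + mask
    return paddle.matmul(F.softmax(scores, axis=-1), value)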
View File

@@ -14,10 +14,13 @@
# limitations under the License.
"""
from .get_block_shape_and_split_kv_block import get_block_shape_and_split_kv_block
from .append_attention import append_attention
from .get_block_shape_and_split_kv_block import \
get_block_shape_and_split_kv_block
from .init_signal_layerwise import init_signal_layerwise
from .open_shm_and_get_meta_signal import open_shm_and_get_meta_signal
__all__ = [
"get_block_shape_and_split_kv_block",
"append_attention"
]
"get_block_shape_and_split_kv_block", "append_attention",
"open_shm_and_get_meta_signal", "init_signal_layerwise"
]

View File

@@ -14,10 +14,16 @@
# limitations under the License.
"""
import paddle
from typing import Optional
import paddle
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
append_attention as append_attention_gpu
def append_attention(
qkv: paddle.Tensor,
@@ -68,14 +74,12 @@ def append_attention(
speculate_max_draft_token_num: int = 1,
causal: bool = True,
speculate_decoder: bool = False,
):
) -> paddle.Tensor:
"""
Args:
Returns:
append_attention
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import append_attention
out = append_attention(
out = append_attention_gpu(
qkv,
key_cache,
value_cache,

View File

@@ -0,0 +1,34 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from fastdeploy.platforms import current_platform
def init_signal_layerwise(
kv_signal_metadata: paddle.Tensor,
layer_id: int = 0,
) -> paddle.Tensor:
"""
init_signal_layerwise
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import init_signal_layerwise
out = init_signal_layerwise(kv_signal_metadata, layer_id)
return out
else:
raise NotImplementedError()

View File

@@ -0,0 +1,35 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from fastdeploy.platforms import current_platform
def open_shm_and_get_meta_signal(
rank: int = 0,
device_id: int = 0,
keep_pd_step_flag: bool = False,
) -> paddle.Tensor:
"""
open_shm_and_get_meta_signal
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
open_shm_and_get_meta_signal
out = open_shm_and_get_meta_signal(rank, device_id, keep_pd_step_flag)
return out
else:
raise NotImplementedError()

View File

@@ -0,0 +1,188 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional, Tuple
import paddle
from fastdeploy.model_executor.layers.attention.ops import (
init_signal_layerwise, open_shm_and_get_meta_signal)
if TYPE_CHECKING:
from paddle._typing.dtype_like import _DTypeLiteral
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
from fastdeploy.worker.forward_meta import ForwardMeta
@dataclass
class XPUAttentionMetadata(AttentionMetadata):
"""
XPUAttentionMetadata
"""
max_len_kv: paddle.Tensor = None
set_max_lengths: int = -1
encoder_batch_ids: paddle.Tensor = None
encoder_tile_ids_per_batch: paddle.Tensor = None
encoder_num_blocks: paddle.Tensor = None
kv_batch_ids: paddle.Tensor = None
kv_tile_ids_per_batch: paddle.Tensor = None
kv_num_blocks: paddle.Tensor = None
decoder_batch_ids: paddle.Tensor = None
decoder_tile_ids_per_batch: paddle.Tensor = None
decoder_num_blocks: paddle.Tensor = None
_dtype: _DTypeLiteral = paddle.bfloat16
encoder_max_partition_size: int = 32768
max_partition_size: int = 32768
block_tables: Optional[paddle.Tensor] = None
rotary_embs: Optional[paddle.Tensor] = None
attn_mask: Optional[paddle.Tensor] = None
encoder_block_shape_q: Optional[paddle.Tensor] = None
decoder_block_shape_q: Optional[paddle.Tensor] = None
_fuse_kernel_compute_dtype: str = "bf16"
# pd_disaggregation
kv_signal_metadata: Optional[paddle.Tensor] = None
kv_signal_data_list: List[paddle.Tensor] = field(default_factory=list)
class XPUAttentionBackend(AttentionBackend):
"""
XPUAttentionBackend backend implementation.
"""
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int):
"""
XPUAttentionBackend __init__
"""
super().__init__()
self.attention_metadata: XPUAttentionMetadata = None
# TODO(gongshaotian): Use fd_config parameters in the correct location
self.block_size: int = fd_config.parallel_config.block_size
self.max_seq_len: int = fd_config.parallel_config.max_model_len
self.rope_theta: float = (10000.0
if fd_config.model_config.rope_theta is None
else fd_config.model_config.rope_theta)
self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)
self.causal: bool = getattr(fd_config.model_config, "causal", True)
# self.speculate_method = fd_config.parallel_config.speculate_method
# self.use_speculate = self.speculate_method is not None
# self.speculate_max_draft_token_num = fd_config.parallel_config.speculate_max_draft_tokens
self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
self.rank: int = fd_config.parallel_config.tensor_parallel_rank
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = head_dim
self.num_layers: int = fd_config.model_config.num_layers
# pd_disaggregation
self.use_pd_disaggregation: int = int(
os.getenv("FLAGS_use_pd_disaggregation", 0))
self.start_layer_index: int = fd_config.model_config.start_layer_index
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attention metadata so that all layers in the forward pass can reuse it."""
metadata = XPUAttentionMetadata()
metadata.encoder_block_shape_q = 64
metadata.decoder_block_shape_q = 16
metadata.max_partition_size = 32768
metadata.encoder_max_partition_size = 32768
metadata._dtype = paddle.get_default_dtype()
if metadata._dtype == "bfloat16":
metadata._fuse_kernel_compute_dtype = "bf16"
elif metadata._dtype == "float16":
metadata._fuse_kernel_compute_dtype = "fp16"
elif metadata._dtype == "float32":
metadata._fuse_kernel_compute_dtype = "fp32"
metadata.block_tables = forward_meta.block_tables
metadata.rotary_embs = forward_meta.rotary_embs
metadata.attn_mask = forward_meta.attn_mask
metadata.pre_caches_length = forward_meta.pre_caches_length
# pd_disaggregation
metadata.kv_signal_data_list = [None] * self.num_layers
if self.use_pd_disaggregation:
metadata.kv_signal_metadata = open_shm_and_get_meta_signal(
self.rank, self.keep_pd_step_flag)
self.attention_metadata: AttentionMetadata = metadata
def get_attntion_meta(self) -> AttentionMetadata:
"""get_attntion_meta"""
return self.attention_metadata
def get_kv_cache_shape(
self,
max_num_blocks: int,
) -> Tuple[int, int, int, int]:
"""
Calculate kv cache shape
"""
return (max_num_blocks, self.kv_num_heads, self.block_size,
self.head_dim)
def forward_mixed(
self,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
qkv: paddle.Tensor,
layer: Attention,
forward_meta: ForwardMeta,
) -> paddle.Tensor:
"""
forward_mixed
"""
metadata = self.attention_metadata
if self.use_pd_disaggregation:
metadata.kv_signal_data_list[
layer.layer_id] = init_signal_layerwise(
metadata.kv_signal_metadata,
layer.layer_id + self.start_layer_index)
k_quant_scale = getattr(layer, "cache_k_scale", None)
v_quant_scale = getattr(layer, "cache_v_scale", None)
from fastdeploy.model_executor.ops.xpu import block_attn
res = block_attn(
qkv,
forward_meta.caches[2 * layer.layer_id],
forward_meta.caches[2 * layer.layer_id + 1],
forward_meta.cum_offsets,
metadata.rotary_embs,
metadata.block_tables,
None,
k_quant_scale,
v_quant_scale,
forward_meta.enc_batch,
forward_meta.dec_batch,
forward_meta.total_enc_len,
forward_meta.encoder_seq_lod_cpu,
forward_meta.encoder_batch_map_cpu,
forward_meta.decoder_context_len_cpu,
forward_meta.decoder_batch_map_cpu,
)
return res
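
get_kv_cache_shape above fixes the paged layout (max_num_blocks, kv_num_heads, block_size, head_dim), and forward_mixed indexes the cache list as caches[2 * layer_id] for keys and caches[2 * layer_id + 1] for values. A hypothetical allocation sketch that matches that layout:

import paddle

def allocate_paged_kv_caches(num_layers: int, max_num_blocks: int, kv_num_heads: int,
                             block_size: int, head_dim: int,
                             dtype: str = "bfloat16") -> list:
    # One key cache and one value cache per layer, in the order forward_mixed expects.
    shape = [max_num_blocks, kv_num_heads, block_size, head_dim]
    caches = []
    for _ in range(num_layers):
        caches.append(paddle.zeros(shape, dtype=dtype))  # caches[2 * layer_id]
        caches.append(paddle.zeros(shape, dtype=dtype))  # caches[2 * layer_id + 1]
    return caches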

View File

@@ -16,6 +16,6 @@
xpu backend methods
"""
from .quantization.weight_only import XPUWeightOnlyLinearMethod
from .quantization.weight_only import XPUWeightOnlyLinearMethod, XPUWeightOnlyMoEMethod
__all__ = ['XPUWeightOnlyLinearMethod']
__all__ = ['XPUWeightOnlyLinearMethod', 'XPUWeightOnlyMoEMethod']

View File

@@ -1,5 +1,4 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,7 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
""""
Expert Parallelism Load Balancer (EPLB)
"""
xpu quantization methods
"""

View File

@@ -13,15 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
from typing import Optional
from typing import Dict
import paddle
from paddle import nn
from .utils import xpu_quant_weight
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig, WeightOnlyLinearMethod
class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
"""
@@ -34,12 +37,133 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
) -> None:
super().__init__(quant_config)
def process_loaded_weights(self, layer, weight) -> None:
def create_weights(self, layer: nn.Layer) -> None:
"""
Create weights for linear layer on XPU
"""
layer.linear_weight_shape.reverse()
if self.quant_config.name() == "weight_only_int4":
layer.linear_weight_shape[0] //= 2
layer.weight_dtype = "int8"
linear_weight_scale_shape = [layer.embed_dim]
if hasattr(layer, "linear_weight_shape"):
if isinstance(layer.linear_weight_shape, list):
layer_weight_shape = layer.linear_weight_shape
linear_weight_scale_shape = layer_weight_shape[:1]
layer.linear_weight_scale = layer.create_parameter(
shape=linear_weight_scale_shape,
dtype="float32",
is_bias=False,
)
def process_loaded_weights(self, layer: nn.Layer,
weight: paddle.Tensor) -> None:
"""
loaded_weights using xpu special quantization
"""
quanted_weight_tensor, weight_scale_tensor = xpu_quant_weight(
weight.cpu().numpy())
layer.linear_weight.set_value(quanted_weight_tensor)
layer.linear_weight_scale.set_value(
weight_scale_tensor.astype(paddle.get_default_dtype()))
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(
weight, self.quant_config.algo, -1, -1)
layer.linear_weight.set_value(
paddle.transpose(quanted_weight_tensor, [1, 0]))
layer.linear_weight_scale.set_value(weight_scale_tensor)
class XPUWeightOnlyMoEMethod(QuantMethodBase):
"""
XPU Fused MoE Method.
"""
def __init__(
self,
quant_config: WeightOnlyConfig,
) -> None:
super().__init__()
self.quant_config = quant_config
self.moe_quant_type = self.quant_config.algo
def create_weights(self, layer: nn.Layer, state_dict: Dict[str,
paddle.Tensor]):
"""
Create and quantize the MoE FFN weights on XPU.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
assert len(ffn1_weights) == layer.num_local_experts
assert len(ffn2_weights) == layer.num_local_experts
assert ffn1_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = added_weight_attrs[idx]
scale_name = added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(layer.num_local_experts):
quant_weight, scale = weight_quantize_xpu(
weight_tensor[i], self.moe_quant_type, -1,
-1) # weight is [k,n]
weight_list.append(quant_weight.transpose(
[1, 0])) # transpose weight to [n,k]
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
setattr(
layer, weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
setattr(
layer, scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
getattr(layer, scale_name).set_value(quanted_weight_scale)
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
XPU compute Fused MoE.
"""
from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
fused_moe_out = xpu_moe_layer(
x,
layer.gate_weight.transpose([1, 0]),
layer.gate_correction_bias,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None, # ffn1 bias
None, # ffn2 bias
(layer.moe_ffn1_weight_scale
if hasattr(layer, "moe_ffn1_weight_scale") else None),
(layer.moe_ffn2_weight_scale
if hasattr(layer, "moe_ffn2_weight_scale") else None),
(layer.moe_ffn2_in_scale
if hasattr(layer, "moe_ffn2_in_scale") else None),
self.moe_quant_type,
layer.top_k,
False, # moe group, used in deepseek
)
if layer.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out
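
The MoE weight path above quantizes each expert's [k, n] FFN weight, stores it transposed as [n, k], and stacks the per-expert results. A numpy sketch of that packing; fake_weight_quantize is only a stand-in for weight_quantize_xpu (symmetric per-output-channel int8 assumed, which may differ from the real kernel's packing):

import numpy as np

def fake_weight_quantize(weight: np.ndarray):
    # Stand-in quantizer: one scale per output column of a [k, n] weight.
    scale = np.abs(weight).max(axis=0) / 127.0
    quant = np.clip(np.around(weight / scale), -127, 127).astype("int8")
    return quant, scale.astype("float32")

def quantize_and_stack_experts(expert_weights):
    quants, scales = [], []
    for w in expert_weights:                 # each w is [k, n]
        q, s = fake_weight_quantize(w)
        quants.append(q.T)                   # store as [n, k], matching the transpose above
        scales.append(s)
    # Shapes: [num_experts, n, k] and [num_experts, n].
    return np.stack(quants, axis=0), np.stack(scales, axis=0)

experts = [np.random.randn(16, 32).astype("float32") for _ in range(4)]
packed_weight, packed_scale = quantize_and_stack_experts(experts)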

View File

@@ -16,11 +16,13 @@
!! This file will be deleted after the platform is fully functional
"""
from typing import Tuple
import numpy as np
import paddle
def xpu_clip_and_round(x):
def xpu_clip_and_round(x: np.ndarray) -> np.ndarray:
"""
Clip and round the input array to the range [-127, 127] and convert to int8.
@@ -33,7 +35,8 @@ def xpu_clip_and_round(x):
return np.clip(np.around(x), -127, 127).astype("int8")
def xpu_quant_qkv_weight(weight_np):
def xpu_quant_qkv_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the query, key, and value weights for the Transformer model.
@@ -61,7 +64,8 @@ def xpu_quant_qkv_weight(weight_np):
return quanted_weight, weight_scales
def xpu_quant_weight(weight_np):
def xpu_quant_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the weight tensor for XPU devices.

View File

@@ -28,7 +28,7 @@ class VocabParallelEmbedding(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
num_embeddings,
embedding_dim=768,
params_dtype="bfloat16",
@@ -38,7 +38,7 @@ class VocabParallelEmbedding(nn.Layer):
Initialize the VocabParallelEmbedding layer for the model.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
num_embeddings : vocabulary size.
@@ -48,21 +48,21 @@ class VocabParallelEmbedding(nn.Layer):
you can give it any name you like.
"""
super().__init__()
self.fd_config = fd_config
hcg = fleet.get_hybrid_communicate_group()
self.mp_rank = hcg.get_model_parallel_rank()
self.column_cut = llm_config.parallel_config.column_cut
self.column_cut = fd_config.parallel_config.column_cut
self.world_size = hcg.get_model_parallel_world_size()
self.ring_id = hcg.get_model_parallel_group().id
self.use_rope = llm_config.model_config.use_rope
self.rope_head_dim = llm_config.model_config.rope_head_dim
self.use_ep = llm_config.parallel_config.use_ep
self.hidden_dropout_prob = llm_config.model_config.hidden_dropout_prob
self.initializer_range = llm_config.model_config.initializer_range
self.weight_sharing = llm_config.model_config.weight_sharing
self.sequence_parallel = llm_config.parallel_config.sequence_parallel
self.weight_sharing_add_bias = llm_config.model_config.weight_sharing_add_bias
self.max_position_embeddings = llm_config.model_config.max_position_embeddings
self.freeze_embedding = llm_config.model_config.freeze_embedding
self.use_rope = fd_config.model_config.use_rope
self.rope_head_dim = fd_config.model_config.rope_head_dim
self.use_ep = fd_config.parallel_config.use_ep
self.hidden_dropout_prob = fd_config.model_config.hidden_dropout_prob
self.initializer_range = fd_config.model_config.initializer_range
self.sequence_parallel = fd_config.parallel_config.sequence_parallel
self.max_position_embeddings = fd_config.model_config.max_position_embeddings
self.freeze_embedding = fd_config.model_config.freeze_embedding
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
if self.use_ep:
self.word_embeddings = nn.Embedding(
@@ -78,8 +78,7 @@ class VocabParallelEmbedding(nn.Layer):
get_model_parallel_group(),
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Normal(
mean=0.0, std=self.initializer_range),
),
mean=0.0, std=self.initializer_range), ),
)
else:
# column cut embedding
@@ -87,6 +86,7 @@ class VocabParallelEmbedding(nn.Layer):
num_embeddings,
embedding_dim // self.world_size,
)
self.word_embeddings.weight.is_distributed = True
self.word_embeddings.weight.split_axis = 1
@@ -94,34 +94,12 @@ class VocabParallelEmbedding(nn.Layer):
self.position_embeddings = nn.Embedding(
self.max_position_embeddings,
embedding_dim,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Normal(
mean=0.0, std=self.initializer_range),
),
weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(
mean=0.0, std=self.initializer_range), ),
)
self.prefix = prefix
if self.weight_sharing and self.weight_sharing_add_bias:
assert num_embeddings % self.world_size == 0
if self.use_ep:
self.bias = self.create_parameter(
shape=[num_embeddings],
dtype=paddle.get_default_dtype(),
attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Constant(value=0.0),
),
is_bias=True,
)
else:
self.bias = self.create_parameter(
shape=[num_embeddings // self.world_size],
dtype=paddle.get_default_dtype(),
attr=mask_lm_out_bias_attr,
is_bias=True,
)
self.bias.is_distributed = True
if self.freeze_embedding:
self.word_embeddings.weight.learning_rate = 0.0
if not self.use_rope:
@@ -138,9 +116,14 @@ class VocabParallelEmbedding(nn.Layer):
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
self.word_embeddings.weight.set_value(
get_tensor(state_dict.pop(self.prefix + ".weight")).astype(
paddle.get_default_dtype()))
if self.tie_word_embeddings:
self.word_embeddings.weight.set_value(
get_tensor(state_dict[self.prefix + ".weight"]).astype(
paddle.get_default_dtype()))
else:
self.word_embeddings.weight.set_value(
get_tensor(state_dict.pop(self.prefix + ".weight")).astype(
paddle.get_default_dtype()))
def forward(self, ids_remove_padding=None):
"""

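
The load_state_dict change above keeps the embedding weight in state_dict when tie_word_embeddings is set, so the output projection can later reuse the same tensor; otherwise the weight is popped to release memory. A condensed sketch of that branch (get_tensor is replaced by paddle.to_tensor here to keep the sketch self-contained):

import paddle

def load_embedding_weight(word_embeddings, state_dict: dict, prefix: str,
                          tie_word_embeddings: bool) -> None:
    key = prefix + ".weight"
    # Tied embeddings: leave the tensor in state_dict for the output head to consume later.
    raw = state_dict[key] if tie_word_embeddings else state_dict.pop(key)
    weight = paddle.to_tensor(raw).astype(paddle.get_default_dtype())
    word_embeddings.weight.set_value(weight)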
View File

@@ -14,7 +14,7 @@
# limitations under the License.
"""
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
import paddle
import paddle.nn.functional as F

View File

@@ -14,29 +14,25 @@
# limitations under the License.
"""
import os
import fastdeploy
from paddlenlp.utils.log import logger
import paddle
from paddle import nn
from fastdeploy.config import FDConfig
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.platforms import current_platform
from .utils import _set_var_distributed, divide, get_tensor
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
class LinearBase(nn.Layer):
"""
LinearBase Layer
LinearBase Layer.
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -48,31 +44,26 @@ class LinearBase(nn.Layer):
Initializes a linear layer and provides additional parameters required for inference and quantization.
Args:
llm_config (LLMConfig): Inference-related parameters containing attributes such as
weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int, optional): Number of input features. Defaults to None.
output_size (int, optional): Number of output features. Defaults to None.
weight_key (Any, optional): Key for weights. Defaults to None.
bias_key (Any, optional): Key for biases. Defaults to None.
skip_quant (bool, optional): Whether to skip quantization. Defaults to False.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
Raises:
NotImplementedError: Raised if the current platform is not a CUDA platform.
"""
super().__init__()
if current_platform.is_cuda():
if current_platform.is_cuda() or current_platform.is_xpu():
self.forward = self.forward_cuda
else:
raise NotImplementedError
self.llm_config = llm_config
self.fd_config = fd_config
self.skip_quant = skip_quant
self.use_smooth_quant = llm_config.model_config.use_smooth_quant
self.weight_dtype = llm_config.model_config.weight_dtype
self.act_dtype = llm_config.model_config.act_dtype
self.input_size = input_size
self.output_size = output_size
self.with_bias = with_bias
@@ -86,61 +77,27 @@ class LinearBase(nn.Layer):
self.out_scale_key = f"{prefix}.out_scale"
self._dtype = self._helper.get_default_dtype()
if llm_config.quant_config:
self.quant_method = llm_config.quant_config.get_quant_method(self)
self.use_offline_quant = llm_config.tmp_config.use_offline_quant
def is_y_transposed(self):
"""
Returns whether the y tensor should be transposed for inference.
Args:
None.
Returns:
bool, whether the y tensor should be transposed for inference.
"""
if self.weight_dtype == "int4":
return True
if self.weight_dtype == "int8":
return True
if "float8" in self.weight_dtype:
return True
# bf16/fp16/fp32 y is not transposed
return False
def init_weight_shape(self, trans=False):
"""
Initialize the weight shape for the first feedforward network layer.
Args:
trans (bool, optional): Whether to transpose the weight shape.
Defaults to False. If True, the shape will be reversed.
Returns:
None.
"""
self.weight_dtype = self._dtype
self.linear_weight_shape = [
self.input_size,
self.output_size,
]
if trans:
self.linear_weight_shape.reverse()
if self.use_smooth_quant:
self.linear_shift_shape = [self.output_size]
self.linear_smooth_shape = [self.output_size]
if self.weight_dtype == "int4":
self.linear_weight_shape[0] //= 2
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
if fd_config.model_config.is_quantized:
self.weight_key = f"{prefix}.quant_weight"
self.weight_scale_key = f"{prefix}.weight_scale"
self.act_scale_key = f"{prefix}.activation_scale"
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
if self.skip_quant:
self.weight_dtype = self._dtype
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
@@ -156,117 +113,57 @@ class LinearBase(nn.Layer):
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
def get_weight_create_dtype(self):
def load_prequant_weight(self, state_dict: dict):
"""
Get the data type for creating weights based on quantization settings.
Load the prequantized weight from the state dictionary.
Args:
self (object): The instance of the class where this method is defined.
Returns:
str: The data type for creating weights. It depends on the quantization settings:
- If `self.skip_quant` is True, returns the original data type `self._dtype`.
- If `self.weight_dtype` is "int4", returns "int8" to ensure compatibility or optimization.
- Otherwise, returns the specified weight data type `self.weight_dtype`.
state_dict (dict): A dictionary containing the prequantized weights and scales.
"""
if self.skip_quant:
return self._dtype
if self.weight_dtype == "int4":
return "int8"
# TODO(wangzhe24): create_parameter does not support FP8
if "float8" in self.weight_dtype:
return self._dtype
return self.weight_dtype
self.quant_method.process_prequanted_weights(self, state_dict)
def load_weight(self, state_dict: dict):
"""
Load the weight from the state dictionary.
def load_offline_quant_state_dict(self, quant_weight, quant_scale=None):
Args:
state_dict (dict): A dictionary containing the weights
"""
Load the offline-quantized checkpoint state dictionary into the layer.
"""
if quant_scale is None:
if "float8" in self.weight_dtype:
self.linear_weight.copy_(quant_weight, False)
else:
self.linear_weight.set_value(quant_weight)
weight_tensor = get_tensor(state_dict.pop(self.weight_key))
if self.fd_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
if self.inference_args.weight_block_size[0] != -1:
self.linear_weight.copy_(quant_weight.view(paddle.float8_e4m3fn), False)
else:
self.linear_weight.set_value(quant_weight)
self.linear_weight_scale.set_value(quant_scale)
self.linear_weight.set_value(weight_tensor)
def load_state_dict(self, state_dict):
def load_state_dict(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
if self.use_offline_quant:
self.load_offline_quant_state_dict(
quant_weight=get_tensor(
state_dict.pop(self.weight_key + ".quant_weight")
),
quant_scale=get_tensor(
state_dict.pop(self.weight_key + ".quant_scale")
),
)
# weight
self.state_dict = state_dict
assert self.weight_key is not None, 'weight_key should not be None.'
if self.fd_config.model_config.is_quantized:
self.load_prequant_weight(state_dict)
else:
# weight
assert self.weight_key is not None, 'weight_key should not be None.'
weight_tensor = get_tensor(state_dict.pop(self.weight_key))
if self.llm_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
self.linear_weight.set_value(weight_tensor)
self.load_weight(state_dict)
# bias
if self.with_bias:
bias_tensor = paddle.to_tensor(get_tensor(state_dict.pop(self.bias_key)))
bias_tensor = paddle.to_tensor(
get_tensor(state_dict.pop(self.bias_key)))
self.linear_bias.set_value(bias_tensor)
# smooth quant
if self.use_smooth_quant:
if self.shift_key in state_dict:
shift_tensor = get_tensor(state_dict.pop(self.shift_key)).astype(
paddle.get_default_dtype()
)
else:
shift_tensor = paddle.zeros(
shape=self.linear_shift_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_shift.set_value(shift_tensor)
if self.smooth_key in state_dict:
smooth_tensor = get_tensor(state_dict.pop(self.smooth_key)).astype(
paddle.get_default_dtype()
)
else:
smooth_tensor = paddle.ones(
shape=self.linear_smooth_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_smooth.set_value(smooth_tensor)
def forward_cuda(self, x):
def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward function for ColumnParallelLinear.
Forward function for Linear.
Args:
x (Tensor): Input tensor to the ColumnParallelLinear layer.
x (Tensor): Input tensor to the Linear.
Returns:
Tensor: Output tensor.
@@ -274,22 +171,24 @@ class LinearBase(nn.Layer):
Raises:
NotImplementedError: If the weight dtype is not float8 or act dtype is not equal to weight dtype.
"""
if self.llm_config.quant_config:
if self.fd_config.quant_config:
linear_out = self.quant_method.apply(self, x)
else:
linear_out = paddle.matmul(x, self.linear_weight)
if self.with_bias:
linear_out = paddle.add(linear_out, self.linear_bias)
return linear_out
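# Minimal sketch of the unquantized fallback path in forward_cuda above, assuming no
# quant_config is set; shapes and values are toy placeholders, not the layer's real state.
import paddle

x = paddle.randn([2, 8])                   # [batch, input_size]
weight = paddle.randn([8, 16])             # [input_size, output_size]
bias = paddle.zeros([16])

linear_out = paddle.matmul(x, weight)      # Y = XW
linear_out = paddle.add(linear_out, bias)  # applied only when with_bias=True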
class ReplicatedLinear(LinearBase):
"""
ReplicatedLinear Layer
ReplicatedLinear Layer.
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -298,74 +197,39 @@ class ReplicatedLinear(LinearBase):
skip_quant: bool = False,
):
"""
Initialize a linear layer with additional parameters for inference and quantization.
Initializes a replicated linear layer.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
you can give it any name you like.
layer_index (int): The index of the linear layer in the model
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
self.nranks = llm_config.parallel_config.mp_size
self.input_size = input_size
self.init_weight()
self.quant_method.create_weights(self)
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
self.linear_bias = None
if self.with_bias:
self.linear_bias = self.create_parameter(
shape=[self.output_size],
dtype=self._dtype,
is_bias=True,
)
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
class ColumnParallelLinear(LinearBase):
"""
ColumnParallelLinear Layer
ColumnParallelLinear Layer.
The linear layer is defined as Y = XA + b. A is parallelized along
its second dimension as A = [A_1, ..., A_p].
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -374,40 +238,45 @@ class ColumnParallelLinear(LinearBase):
skip_quant: bool = False,
):
"""
Initialize a linear layer with additional parameters for inference and quantization.
Initializes a linear layer and provides additional parameters required for inference and quantization.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
you can give it any name you like.
layer_index (int): The index of the linear layer in the model
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
self.nranks = llm_config.parallel_config.mp_size
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.input_size = input_size
self.output_size = divide(output_size, self.nranks)
self.linear_weight_shape = [
self.input_size,
self.output_size,
]
if fd_config.quant_config:
self.quant_method.create_weights(self)
self.init_weight()
self.quant_method.create_weights(self)
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
if self.skip_quant:
self.weight_dtype = self._dtype
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
@@ -429,62 +298,51 @@ class ColumnParallelLinear(LinearBase):
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
class MergedColumnParallelLinear(ColumnParallelLinear):
"""
MergedColumnParallelLinear Layer.
Similar to ColumnParallelLinear, but the weight matrix is concatenated
along the output dimension. When the weight matrix is loaded, the
different partitions are sharded separately.
"""
def __init__(
self,
llm_config,
prefix,
with_bias=False,
add_bias=False,
activation="gelu",
use_fast_ffn=False,
skip_quant=False,
fd_config: FDConfig,
prefix: str,
input_size: int = None,
output_size: int = None,
with_bias: bool = False,
add_bias: bool = False,
activation: str = "gelu",
use_fast_ffn: bool = False,
skip_quant: bool = False,
):
"""Packed linear layers with column parallelism.
"""
Initialize the fused ffn1 Linear layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming weights and biases.
weight_key (str): Key name of weight in the pdparams state dict.
bias_key (str): Key name of bias in the pdparams state dict. Defaults to None, means no bias.
with_bias (bool, optional): Whether to include bias term. Defaults to True.
activation (str, optional): Activation function to use. Defaults to "gelu".
use_fast_ffn (bool, optional): Whether to use a faster FFN implementation.
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
activation (str): Activation function to use. Defaults to "gelu".
use_fast_ffn (bool): Whether to use a faster FFN implementation.
Defaults to False.
skip_quant (bool, optional): Whether to skip quantization steps. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
self.use_fast_ffn = use_fast_ffn
self.activation = activation
self.embed_dim = llm_config.model_config.hidden_size
self.dim_feedforward = llm_config.model_config.ffn_hidden_size
self.nranks = llm_config.parallel_config.mp_size
self.dim_feedforward_per_rank = divide(self.dim_feedforward,
self.nranks)
input_size = self.embed_dim
output_size = self.dim_feedforward * 2
super().__init__(llm_config=llm_config,
self.embed_dim = fd_config.model_config.hidden_size
self.nranks = fd_config.parallel_config.tensor_parallel_degree
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
@@ -492,7 +350,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
add_bias=add_bias,
skip_quant=skip_quant)
def load_state_dict(self, state_dict):
def load_state_dict(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
@@ -542,47 +400,40 @@ class QKVParallelLinear(ColumnParallelLinear):
QKVParallelLinear Layer.
"""
def __init__(self, llm_config, prefix, with_bias=False, add_bias=True):
def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
"""
Initialize the QKV Linear layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming weights and biases.
weight_key (str): Key name of weight in the pdparams state dict.
bias_key (str): Key name of bias in the pdparams state dict. Defaults to None, means no bias.
with_bias (bool, optional): Whether to include bias term. Defaults to True.
skip_quant (bool, optional): Whether to skip quantization steps. Defaults to False.
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to True.
"""
self.num_heads = llm_config.model_config.num_attention_heads
self.kv_num_heads = llm_config.model_config.num_key_value_heads
self.embed_dim = llm_config.model_config.hidden_size
self.head_dim = llm_config.model_config.head_dim
self.nranks = llm_config.parallel_config.mp_size
self.num_heads = fd_config.model_config.num_attention_heads
self.kv_num_heads = fd_config.model_config.num_key_value_heads
self.embed_dim = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.num_heads_per_rank = divide(self.num_heads, self.nranks)
self.kv_num_heads_per_rank = divide(self.kv_num_heads, self.nranks)
input_size = self.embed_dim
output_size = (self.num_heads + 2 * self.kv_num_heads) * self.head_dim
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias)
def load_state_dict(self, state_dict):
def load_weight(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
Load the weight from the state dictionary.
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
state_dict (dict): A dictionary containing the weights
"""
# weight
assert self.weight_key is not None, 'weight_key should not be None.'
# QKV weights are fused in the on-disk checkpoint
if self.weight_key in state_dict.keys():
weight_tensor = get_tensor(state_dict.pop(self.weight_key))
else:
@@ -601,11 +452,27 @@ class QKVParallelLinear(ColumnParallelLinear):
])
weight_tensor = paddle.transpose(weight_tensor, perm=[1, 0])
if self.llm_config.quant_config:
if self.fd_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
self.linear_weight.set_value(weight_tensor)
def load_state_dict(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
# weight
assert self.weight_key is not None, 'weight_key should not be None.'
# QKV weights are fused in the on-disk checkpoint
if self.fd_config.model_config.is_quantized:
self.load_prequant_weight(state_dict)
else:
self.load_weight(state_dict)
# bias
if self.with_bias:
if self.bias_key in state_dict.keys():
@@ -622,38 +489,25 @@ class QKVParallelLinear(ColumnParallelLinear):
qkv_bias = paddle.concat([q_bias, k_bias, v_bias], axis=-1)
self.linear_bias.set_value(qkv_bias)
# smooth quant
if self.use_smooth_quant:
if self.shift_key in state_dict:
shift_tensor = get_tensor(state_dict.pop(self.shift_key)).astype(
paddle.get_default_dtype()
)
else:
shift_tensor = paddle.zeros(
shape=self.linear_shift_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_shift.set_value(shift_tensor)
if self.smooth_key in state_dict:
smooth_tensor = get_tensor(state_dict.pop(self.smooth_key)).astype(
paddle.get_default_dtype()
)
else:
smooth_tensor = paddle.ones(
shape=self.linear_smooth_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_smooth.set_value(smooth_tensor)
class RowParallelLinear(LinearBase):
"""
RowParallelLinear Layer
RowParallelLinear Layer.
The linear layer is defined as Y = XA + b. A is parallelized along
its first dimension and X along its second dimension as:
- -
| A_1 |
| . |
A = | . | X = [X_1, ..., X_p]
| . |
| A_p |
- -
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -665,57 +519,50 @@ class RowParallelLinear(LinearBase):
Initialize a linear layer with additional parameters for inference and quantization.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
you can give it any name you like.
layer_index (int): The index of the linear layer in the model
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
self.llm_config = llm_config
self.fd_config = fd_config
self.skip_quant = False
self.use_smooth_quant = llm_config.model_config.use_smooth_quant
self.weight_dtype = llm_config.model_config.weight_dtype
self.act_dtype = llm_config.model_config.act_dtype
self.nranks = llm_config.parallel_config.mp_size
self.embed_dim = llm_config.model_config.hidden_size
self.head_dim = llm_config.model_config.hidden_size // llm_config.model_config.num_attention_heads
self.num_heads = llm_config.model_config.num_attention_heads // self.nranks
self.dim_feedforward = llm_config.model_config.ffn_hidden_size // self.nranks
self.with_bias = with_bias
self.prefix = prefix
self.shift_key = f"{prefix}.shift_bias"
self.smooth_key = f"{prefix}.smooth_weight"
self.weight_key = f"{prefix}.weight"
self.bias_key = f"{prefix}.bias"
self.weight_only_scale_key = f"{prefix}.weight_only_scale"
self.out_scale_key = f"{prefix}.out_scale"
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.embed_dim = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.num_heads = fd_config.model_config.num_attention_heads // self.nranks
self.linear_weight_shape = [
self.input_size,
self.output_size,
]
self._dtype = self._helper.get_default_dtype()
if llm_config.quant_config:
self.quant_method = llm_config.quant_config.get_quant_method(self)
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
self.quant_method.create_weights(self)
self.init_weight()
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
if self.skip_quant:
self.weight_dtype = self._dtype
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
@@ -735,27 +582,159 @@ class RowParallelLinear(LinearBase):
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
def forward_cuda(self, x):
if self.llm_config.quant_config:
def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
if self.fd_config.quant_config:
out = self.quant_method.apply(self, x)
else:
out = paddle.matmul(x, self.linear_weight)
if self.nranks > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(out)
return out
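# Single-process sketch of what the all-reduce above achieves: each rank computes a partial
# matmul over its input/weight shard and the partials are summed. Shapes are hypothetical toy values.
import paddle

x_shards = [paddle.ones([2, 4]), paddle.ones([2, 4])]   # X split along its feature dimension
w_shards = [paddle.ones([4, 6]), paddle.ones([4, 6])]   # A split along its first dimension

partials = [paddle.matmul(x, w) for x, w in zip(x_shards, w_shards)]
out = partials[0] + partials[1]   # what tensor_model_parallel_all_reduce does across ranks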
class KVBatchLinear(LinearBase):
"""
KVBatchLinear Layer for handling combined KV projections with bmm.
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
kv_lora_rank: int = None,
num_attention_heads: int = None,
qk_nope_head_dim: int = None,
v_head_dim: int = None,
with_bias: bool = False,
skip_quant: bool = False,
):
"""
Initializes a KV batch linear layer that internally splits into K and V projections.
Args:
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
kv_lora_rank (int): LoRA rank for KV projection. Defaults to None.
num_attention_heads (int): Number of attention heads. Defaults to None.
qk_nope_head_dim (int): Dimension for Q/K projection (nope part). Defaults to None.
v_head_dim (int): Dimension for V projection. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.kv_lora_rank = kv_lora_rank
self.num_attention_heads = num_attention_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
# Split num_attention_heads when using TP inference.
self.num_heads_per_partition = divide(num_attention_heads, self.nranks)
# Initialize parent with combined dimensions
super().__init__(
fd_config=fd_config,
prefix=prefix,
input_size=None, # Will be determined from weight shape
output_size=None, # Will be determined from weight shape
with_bias=with_bias,
add_bias=False,
skip_quant=skip_quant,
)
self.weight_dtype = self._dtype
# Override weight keys to use the combined kv_b_proj
self.weight_key = f"{prefix}.weight" # e.g., "kv_b_proj.weight"
self.k_weight_key = f"{prefix.replace('kv_b_proj', 'k_b_proj')}.weight"
self.v_weight_key = f"{prefix.replace('kv_b_proj', 'v_b_proj')}.weight"
def load_state_dict(self, state_dict: dict):
"""
Load the combined KV weight and split it into K and V projections
"""
# Get the combined KV weight
# NOTE(Ryan): Do not pop weight_key here; it will be popped in another class.
kv_weight_tensor = get_tensor(state_dict[self.weight_key])
# Reshape and split the weight
w = kv_weight_tensor.reshape([
self.kv_lora_rank,
self.num_heads_per_partition,
-1,
]).transpose(perm=[1, 2, 0])
# Split into K and V weights
# wk_b: [num_heads, qk_nope_head_dim, kv_lora_rank]
wk_b = w[:, :self.qk_nope_head_dim, :]
if self.v_head_dim is None:
raise ValueError("self.v_head_dim should not be None")
# wv_b: [num_heads, kv_lora_rank, v_head_dim]
wv_b = w[:, -self.v_head_dim:, :].transpose(perm=[0, 2, 1])
# Create K projection weight
self.k_b_proj_weight = self.create_parameter(
shape=wk_b.shape,
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
# Create V projection weight
self.v_b_proj_weight = self.create_parameter(
shape=wv_b.shape,
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
self.k_b_proj_weight.set_value(wk_b)
self.v_b_proj_weight.set_value(wv_b)
def forward_k_b(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward pass for K_b projection using bmm
Args:
x: Input tensor (e.g., query_nope.transpose([1, 0, 2]))
Returns:
K_b projection output
"""
out = paddle.bmm(x, self.k_b_proj_weight)
return out
def forward_v_b(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward pass for V_b projection using bmm
Args:
x: Input tensor (e.g., fmha_out_decode)
Returns:
V_b projection output
"""
out = paddle.bmm(x, self.v_b_proj_weight)
return out
def forward_cuda(self,
x: paddle.Tensor,
proj_type: str = 'k') -> paddle.Tensor:
"""
Forward function that can handle both K and V projections
Args:
x: Input tensor
proj_type: 'k' or 'v' to select which projection to use
Returns:
Projection output
"""
if proj_type == 'k':
return self.forward_k_b(x)
elif proj_type == 'v':
return self.forward_v_b(x)
else:
raise ValueError(f"proj_type must be 'k' or 'v', got {proj_type}")
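# Shape-only sketch of the kv_b_proj split performed in load_state_dict above, using
# hypothetical small dimensions (kv_lora_rank=4, heads=2, qk_nope_head_dim=3, v_head_dim=2);
# it is not the layer's actual loading code.
import paddle

kv_lora_rank, num_heads, qk_nope, v_dim = 4, 2, 3, 2
kv_weight = paddle.randn([kv_lora_rank, num_heads * (qk_nope + v_dim)])

w = kv_weight.reshape([kv_lora_rank, num_heads, -1]).transpose(perm=[1, 2, 0])
wk_b = w[:, :qk_nope, :]                           # [heads, qk_nope, kv_lora_rank]
wv_b = w[:, -v_dim:, :].transpose(perm=[0, 2, 1])  # [heads, kv_lora_rank, v_dim]

q_nope = paddle.randn([num_heads, 5, qk_nope])     # e.g. 5 query tokens per head
k_proj = paddle.bmm(q_nope, wk_b)                  # [heads, 5, kv_lora_rank]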


@@ -21,48 +21,6 @@ from paddle.distributed import fleet
from .utils import get_tensor
def parallel_matmul(lm_output, logit_weights, parallel_output):
"""
Performs parallel matrix multiplication for large-scale language models.
Args:
lm_output (Tensor): The output tensor from the language model layers,
which will be multiplied with the logit weights.
logit_weights (Tensor): The weights used in the matrix multiplication,
typically the weights of the output layer.
parallel_output (bool): A flag indicating whether to return the parallel
outputs or concatenate them. If True, returns the outputs from the
parallel computation directly. If False, concatenates the outputs
across the model parallel group before returning.
Returns:
Tensor: The result of the matrix multiplication. If `parallel_output` is True,
returns the parallel outputs. If `parallel_output` is False and
model parallel world size is greater than 1, returns the concatenated
outputs across the model parallel group. Otherwise, returns the direct
matrix multiplication result.
"""
hcg = fleet.get_hybrid_communicate_group()
model_parallel_group = hcg.get_model_parallel_group()
world_size = hcg.get_model_parallel_world_size()
# rank = hcg.get_model_parallel_rank()
if world_size > 1:
input_parallel = paddle.distributed.collective._c_identity(
lm_output, group=model_parallel_group)
logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True)
if parallel_output:
return logits
return paddle.distributed.collective._c_concat(
logits, group=model_parallel_group)
else:
logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)
return logits
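# Single-process sketch of the world_size == 1 branch of parallel_matmul above: logits are
# simply lm_output multiplied by the transposed logit weights. Shapes are toy placeholders.
import paddle

lm_output = paddle.randn([2, 8])          # [num_tokens, hidden_size]
logit_weights = paddle.randn([100, 8])    # [vocab_size, hidden_size]
logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)  # [num_tokens, vocab_size]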
class ParallelLMHead(nn.Layer):
"""
"Parallelized LM head.
@@ -70,75 +28,69 @@ class ParallelLMHead(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
num_embeddings,
embedding_dim,
prefix="",
with_bias=False,
tie_word_embeddings=None,
):
"""
Parallelized LMhead.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
num_embeddings (int): vocabulary size.
embedding_dim (int): size of hidden state.
tie_word_embeddings (bool, optional): Whether to tie the LM head weight to the
input embeddings. Defaults to None.
prefix (str): full name of the layer in the state dict
"""
super(ParallelLMHead, self).__init__()
self.use_moe = llm_config.model_config.use_moe
self.linear_weight_key = prefix + ".weight"
if with_bias:
self.linear_bias_key = prefix + ".bias"
else:
self.linear_bias_key = None
self.use_ep = llm_config.parallel_config.use_ep
self.use_ep = fd_config.parallel_config.use_ep
self.column_cut = True
self.fused_linear = True
hcg = fleet.get_hybrid_communicate_group()
mp_rank = hcg.get_model_parallel_rank()
ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear
RowParallelLinear = fleet.meta_parallel.RowParallelLinear
self.tie_word_embeddings = tie_word_embeddings
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
if self.tie_word_embeddings is None:
if self.use_ep:
self.weight = self.create_parameter(
shape=[embedding_dim, num_embeddings],
dtype=paddle.get_default_dtype(),
is_bias=False,
if self.use_ep:
self.weight = self.create_parameter(
shape=[embedding_dim, num_embeddings],
dtype=paddle.get_default_dtype(),
is_bias=False,
)
else:
if self.column_cut:
need_gather = True
self.out_linear = ColumnParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True
if self.linear_bias_key is not None else False,
gather_output=need_gather,
fuse_matmul_bias=False,  # False gives a smaller numerical diff
)
else:
if self.column_cut:
need_gather = True
self.out_linear = ColumnParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True,
gather_output=need_gather,
fuse_matmul_bias=self.fused_linear,  # False gives a smaller numerical diff
)
else:
self.out_linear = RowParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True,
input_is_parallel=False,
fuse_matmul_bias=self.fused_linear,  # False gives a smaller numerical diff
)
self.out_linear = RowParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True
if self.linear_bias_key is not None else False,
input_is_parallel=False,
fuse_matmul_bias=False,  # False gives a smaller numerical diff
)
def load_state_dict(self, state_dict):
"""
@@ -148,25 +100,26 @@ class ParallelLMHead(nn.Layer):
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
if self.tie_word_embeddings is None:
if self.use_ep:
self.weight.set_value(
get_tensor(state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype()))
else:
if self.use_ep:
self.weight.set_value(
get_tensor(state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype()))
else:
if self.tie_word_embeddings:
self.out_linear.weight.set_value(
get_tensor(state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype()))
paddle.get_default_dtype()).transpose([1, 0]))
else:
weight_tensor = get_tensor(
state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype())
if self.out_linear.weight.shape != weight_tensor.shape:
weight_tensor = weight_tensor.transpose([1, 0])
self.out_linear.weight.set_value(weight_tensor)
bias = (
get_tensor(state_dict.pop(self.linear_bias_key)).astype(
paddle.get_default_dtype()
)
if self.linear_bias_key is not None
else paddle.zeros(
self.out_linear.bias.shape, dtype=paddle.get_default_dtype()
)
)
if self.linear_bias_key is not None:
bias = get_tensor(state_dict.pop(self.linear_bias_key)).astype(
paddle.get_default_dtype())
self.out_linear.bias.set_value(bias)
def forward(self, input):
@@ -180,11 +133,8 @@ class ParallelLMHead(nn.Layer):
Tensor: The output tensor after processing through the layer.
"""
logits = input
if self.tie_word_embeddings is not None:
logits = parallel_matmul(logits, self.tie_word_embeddings, False)
if self.use_ep:
logits = paddle.matmul(logits, self.weight)
else:
if self.use_ep:
logits = paddle.matmul(logits, self.weight)
else:
logits = self.out_linear(logits)
logits = self.out_linear(logits)
return logits


@@ -11,3 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fused_moe_cutlass_backend import (CutlassW4A8MoEMethod,
CutlassWeightOnlyMoEMethod)
from .fused_moe_triton_backend import TritonWeightOnlyMoEMethod
from .moe import FusedMoE
__all__ = [
    "CutlassWeightOnlyMoEMethod", "CutlassW4A8MoEMethod", "FusedMoE",
    "TritonWeightOnlyMoEMethod"
]


@@ -1,222 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.distributed import fleet
from paddle.framework import in_dynamic_or_pir_mode
from paddle.nn.quant import weight_quantize
from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch,
moe_expert_ffn,
moe_expert_reduce)
from .fused_moe_method_base import FusedMoEMethodBase
class CutlassFusedMoeMethod(FusedMoEMethodBase):
"""
Use Cutlass Group Gemm to compute Fused MoE.
This method is the oldest way to compute MoE in Paddle.
"""
def create_weights(
self,
layer: nn.Layer,
moe_compute_params,
ffn1_tensor,
ffn2_tensor,
ffn1_bias=None,
ffn2_bias=None,
# belows only used in w4a8.
moe_ffn1_weight_scale=None,
moe_ffn2_weight_scale=None,
moe_ffn1_in_scale=None,
moe_ffn2_in_scale=None):
"""
Paddle cutlass create weight process.
"""
num_local_experts = moe_compute_params.num_local_experts
moe_quant_type = moe_compute_params.moe_quant_type
assert len(ffn1_tensor) == num_local_experts
assert len(ffn2_tensor) == num_local_experts
assert ffn1_tensor[0].shape == [
moe_compute_params.hidden_size,
moe_compute_params.moe_intermediate_size * 2
]
assert ffn2_tensor[0].shape == [
moe_compute_params.moe_intermediate_size,
moe_compute_params.hidden_size
]
added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]
if moe_quant_type == "w4a8":
moe_ffn1_in_scale = paddle.concat(moe_ffn1_in_scale)
moe_ffn2_in_scale = paddle.concat(moe_ffn2_in_scale)
moe_ffn1_in_scale = 1 / moe_ffn1_in_scale
moe_ffn2_in_scale = 1 / moe_ffn2_in_scale
moe_ffn1_weight_scale = paddle.stack(moe_ffn1_weight_scale, axis=0)
moe_ffn2_weight_scale = paddle.stack(moe_ffn2_weight_scale, axis=0)
moe_ffn1_weight_scale = moe_ffn1_weight_scale / (127 * 112)
moe_ffn2_weight_scale = moe_ffn2_weight_scale / (127 * 112)
moe_ffn1_weight_scale = moe_ffn1_weight_scale / moe_ffn1_in_scale[:,
None]
moe_ffn2_weight_scale = moe_ffn2_weight_scale / moe_ffn2_in_scale[:,
None]
moe_ffn1_weight_scale = moe_ffn1_weight_scale.cast(
paddle.get_default_dtype())
moe_ffn2_weight_scale = moe_ffn2_weight_scale.cast(
paddle.get_default_dtype())
if moe_quant_type in ["weight_only_int4", "weight_only_int8", "w4a8"]:
for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
weight_name = added_weight_attrs[idx]
scale_name = added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(num_local_experts):
quant_weight, scale = weight_quantize(weight_tensor[i],
algo=moe_quant_type,
arch=80)
weight_list.append(quant_weight)
if moe_quant_type != "w4a8":
# scale holds no memory in w4a8, don't touch it!
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
setattr(
layer, weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(quanted_weight)
# this scale only useful for wint8/4.
if moe_quant_type != "w4a8":
quanted_weight_scale = paddle.stack(weight_scale_list,
axis=0)
setattr(
layer, scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
getattr(layer, scale_name).set_value(quanted_weight_scale)
if moe_quant_type == "w4a8":
assert moe_ffn1_weight_scale is not None
assert moe_ffn2_weight_scale is not None
assert moe_ffn1_in_scale is not None
assert moe_ffn2_in_scale is not None
added_w4a8_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale",
"moe_ffn1_in_scale", "moe_ffn2_in_scale"
]
for idx, weight_tensor in enumerate([
moe_ffn1_weight_scale, moe_ffn2_weight_scale,
moe_ffn1_in_scale, moe_ffn2_in_scale
]):
name = added_w4a8_attrs[idx]
setattr(
layer, name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(weight_tensor)
def apply(
self,
layer: nn.Layer,
moe_compute_params,
x: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
(
permute_input,
token_nums_per_expert,
permute_indices_per_token,
topk_weights,
topk_idx,
expert_idx_per_token,
) = moe_expert_dispatch(
x,
gate_out,
layer.gate_correction_bias,
(layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
else None), # if set, permute_input will be int8_t
moe_compute_params.top_k,
False,
topk_only_mode=False,
)
if moe_compute_params.moe_quant_type != "w4a8":
# only w4a8 needs expert_idx_per_token;
# other quant types do not need this tensor, so we set it to None.
expert_idx_per_token = None
else:
expert_idx_per_token = expert_idx_per_token.cast("int64")
ffn_out = moe_expert_ffn(
permute_input,
token_nums_per_expert,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None,
(layer.moe_ffn1_weight_scale
if hasattr(layer, "moe_ffn1_weight_scale") else None),
(layer.moe_ffn2_weight_scale
if hasattr(layer, "moe_ffn2_weight_scale") else None),
(layer.moe_ffn2_in_scale
if hasattr(layer, "moe_ffn2_in_scale") else None),
expert_idx_per_token,
moe_compute_params.moe_quant_type,
False, # used_in_ep_low_latency
)
if False:
if in_dynamic_or_pir_mode():
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
paddle.distributed.all_reduce(ffn_out, group=mp_group)
else:
paddle.distributed.all_reduce(ffn_out, group=mp_group)
# moe_expert_reduce normalizes the top-k weights and applies routed_scaling_factor
fused_moe_out = moe_expert_reduce(
ffn_out,
topk_weights,
permute_indices_per_token,
topk_idx,
None,
norm_topk_prob=True,
routed_scaling_factor=1.0,
)
return fused_moe_out

File diff suppressed because it is too large


@@ -0,0 +1,135 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.config import MoEPhase
from ..quantization.quant_base import QuantMethodBase
class MoEMethodBase(QuantMethodBase):
"""
"""
def __init__(self, quant_config):
super().__init__()
if quant_config is None:
self.moe_quant_type = "w16a16"
else:
self.quant_config = quant_config
self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
self.added_scale_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
]
self.pack_num = 1
def init_ep(self, layer: nn.Layer) -> None:
"""
Init EP related module
"""
if layer.ep_size > 1:
if layer.fd_config.parallel_config.moe_phase == MoEPhase.DECODER:
from .ep import EPDecoderRunner
self.ep_decoder_runner = EPDecoderRunner(
layer.top_k, layer.hidden_size, layer.num_experts,
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.ep_size, layer.ep_rank)
else:
from .ep import EPPrefillRunner
self.ep_prefill_runner = EPPrefillRunner(
layer.top_k, layer.hidden_size, layer.num_experts,
layer.ep_size, layer.ep_rank)
def process_loaded_weights(self, layer, weights) -> None:
"""
process_loaded_weights
"""
pass
def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
"""
Check that the layer's FFN weight shapes are valid for this method.
"""
assert ffn1_weights[0].shape == [
layer.hidden_size // self.pack_num, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size // self.pack_num, layer.hidden_size
]
@abstractmethod
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
raise NotImplementedError
@abstractmethod
def apply_ep_prefill(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP prefill method.
"""
raise NotImplementedError
@abstractmethod
def apply_ep_decode(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP decoder method.
"""
raise NotImplementedError
@abstractmethod
def apply_tp(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
raise NotImplementedError
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
if layer.ep_size > 1:
if layer.fd_config.parallel_config.moe_phase == MoEPhase.PREFILL:
return self.apply_ep_prefill(layer, x, gate_out)
else:
return self.apply_ep_decode(layer, x, gate_out)
else:
return self.apply_tp(layer, x, gate_out)
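# Minimal stand-in sketch of the routing in apply() above: expert-parallel layers take the
# EP prefill or EP decode path, tensor-parallel layers take the TP path. The helper and its
# arguments are hypothetical, for illustration only.
def _route(ep_size: int, is_prefill: bool) -> str:
    if ep_size > 1:
        return "apply_ep_prefill" if is_prefill else "apply_ep_decode"
    return "apply_tp"

assert _route(ep_size=8, is_prefill=True) == "apply_ep_prefill"
assert _route(ep_size=1, is_prefill=False) == "apply_tp"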


@@ -0,0 +1,431 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.nn.quant import weight_quantize
from paddleformers.utils.log import logger
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from ..utils import get_tensor, create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
class CutlassMoEMethod(MoEMethodBase):
"""
Use Cutlass Group Gemm to compute Fused MoE.
This method is the oldest way to compute MoE in Paddle.
"""
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
# bf16
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
stacked_ffn1_weights = paddle.stack(ffn1_weights, axis=0)
stacked_ffn2_weights = paddle.stack(ffn2_weights, axis=0)
for idx, weight_tensor in enumerate(
[stacked_ffn1_weights, stacked_ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
setattr(
layer, weight_name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(weight_tensor)
def compute_ffn(
self,
layer: nn.Layer,
permute_input: paddle.Tensor,
token_nums_per_expert: paddle.Tensor,
expert_idx_per_token: paddle.Tensor,
used_in_ep_low_latency: bool = False,
):
"""
Paddle Cutlass compute Fused MoE.
"""
return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
permute_input,
token_nums_per_expert,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None,
(layer.moe_ffn1_weight_scale
if hasattr(layer, "moe_ffn1_weight_scale") else None),
(layer.moe_ffn2_weight_scale
if hasattr(layer, "moe_ffn2_weight_scale") else None),
(layer.moe_ffn2_in_scale
if hasattr(layer, "moe_ffn2_in_scale") else None),
expert_idx_per_token,
self.moe_quant_type,
used_in_ep_low_latency,
)
def apply_ep_prefill(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP prefill method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
layer, gate_out)
# 2. EP Dispatch
(
recv_x,
recv_topk_idx,
recv_topk_weights,
recv_num_tokens_per_expert_list,
handle,
_,
) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights)
token_all_num = sum(recv_num_tokens_per_expert_list)
# 3. Compute ffn
if token_all_num > 0:
logger.info(f"token_all_num {token_all_num}")
(
permute_input,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
expert_idx_per_token,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch(
recv_x,
recv_topk_idx,
recv_topk_weights,
(self.moe_ffn1_in_scale
if hasattr(self, "moe_ffn1_in_scale") else None),
recv_num_tokens_per_expert_list,
token_all_num,
self.moe_quant_type,
)
if self.moe_quant_type != "w4a8":
# only w4a8 needs expert_idx_per_token;
# other quant types do not need this tensor, so we set it to None.
expert_idx_per_token = None
else:
expert_idx_per_token = expert_idx_per_token.cast("int64")
ffn_out = self.compute_ffn(layer, permute_input,
recv_num_tokens_per_expert_list_cumsum,
expert_idx_per_token)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None, # moe_ffn2_bias,
False, # norm_topk_prob
1.0,
)[0]
else:
tmp_ffn_out = recv_x
# 4. EP combine
return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
recv_topk_weights)
def apply_ep_decode(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP decoder method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
layer, gate_out)
# 2. EP Dispatch
permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
x, topk_idx, topk_weights)
# 3. Compute ffn
if self.moe_quant_type == "w4a8":
num_local_experts, max_num, _ = permute_input.shape
expert_idx_per_token = paddle.arange(
num_local_experts)[:, None].tile([1, max_num])
elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]:
expert_idx_per_token = None
else:
raise NotImplementedError
ffn_out = self.compute_ffn(layer, permute_input,
token_nums_per_expert.cast("int64"),
expert_idx_per_token, True)
# 4. EP combine
return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
handle)
def apply_tp(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
(
permute_input,
token_nums_per_expert,
permute_indices_per_token,
topk_weights,
topk_idx,
expert_idx_per_token,
) = moe_expert_dispatch(
x,
gate_out,
layer.gate_correction_bias,
(layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
else None), # if set, permute_input will be int8_t
layer.top_k,
False,
topk_only_mode=False,
)
if self.moe_quant_type != "w4a8":
# only w4a8 needs expert_idx_per_token;
# other quant types do not need this tensor, so we set it to None.
expert_idx_per_token = None
else:
expert_idx_per_token = expert_idx_per_token.cast("int64")
ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert,
expert_idx_per_token)
# moe_expert_reduce normalizes the top-k weights and applies routed_scaling_factor
fused_moe_out = moe_expert_reduce(
ffn_out,
topk_weights,
permute_indices_per_token,
topk_idx,
None,
norm_topk_prob=True,
routed_scaling_factor=1.0,
)
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out
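# Naive reference sketch (pure dygraph, no fused kernels) of the dispatch -> per-expert FFN
# -> weighted reduce pattern that moe_expert_dispatch, compute_ffn and moe_expert_reduce
# implement above; all shapes and the identity-sized expert weights are toy values.
import paddle

tokens, hidden, num_experts, top_k = 4, 8, 3, 2
x = paddle.randn([tokens, hidden])
gate_out = paddle.randn([tokens, num_experts])
probs = paddle.nn.functional.softmax(gate_out, axis=-1)
topk_weights, topk_idx = paddle.topk(probs, k=top_k, axis=-1)

expert_w = [paddle.randn([hidden, hidden]) for _ in range(num_experts)]
rows = []
for t in range(tokens):
    acc = paddle.zeros([hidden])
    for j in range(top_k):
        e = int(topk_idx[t, j])
        acc = acc + topk_weights[t, j] * paddle.matmul(x[t], expert_w[e])
    rows.append(acc)
fused_out = paddle.stack(rows, axis=0)   # [tokens, hidden]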
class CutlassW4A8MoEMethod(CutlassMoEMethod):
"""
w4a8 MoE Method
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.quant_config = quant_config
self.moe_quant_type = "w4a8"
self.pack_num = 2
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, ffn1_weights, ffn2_weights)
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
weight_list = []
for i in range(layer.num_local_experts):
quant_weight, scale = weight_quantize(weight_tensor[i],
algo=self.moe_quant_type,
arch=80)
weight_list.append(quant_weight)
quanted_weight = paddle.stack(weight_list, axis=0)
create_and_set_parameter(layer, weight_name, quanted_weight)
self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict)
def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict,
state_dict: dict):
"""
Get w4a8 weights from state dict and process them.
Args:
layer (nn.Layer): The layer to add parameters to.
weight_key_map (dict): The weight key map.
state_dict (dict): The state dict.
"""
def _extract_scale_tensor(state_dict, key_template, expert_idx):
return get_tensor(state_dict.pop(key_template.format(expert_idx)))
def _process_in_scale(name: str, in_scales: list[paddle.Tensor]):
processed_in_scale = 1 / paddle.concat(in_scales)
create_and_set_parameter(layer, name, processed_in_scale)
return processed_in_scale
def _process_weight_scale(name: str,
weight_scales: list[paddle.Tensor],
processed_in_scale: paddle.Tensor):
processed_weight_scale = (paddle.stack(weight_scales, axis=0) /
(127 * 112) /
processed_in_scale[:, None]).cast(
paddle.get_default_dtype())
create_and_set_parameter(layer, name, processed_weight_scale)
# 1. Init scale containers and maps
moe_ffn1_weight_scales = []
moe_ffn2_weight_scales = []
moe_ffn1_in_scales = []
moe_ffn2_in_scales = []
scale_weight_map = {
"moe_ffn1_weight_scale": moe_ffn1_weight_scales,
"moe_ffn2_weight_scale": moe_ffn2_weight_scales,
"moe_ffn1_in_scale": moe_ffn1_in_scales,
"moe_ffn2_in_scale": moe_ffn2_in_scales,
}
scale_key_map = {
"moe_ffn1_weight_scale":
weight_key_map.get("ffn1_expert_weight_scale_key", None),
"moe_ffn2_weight_scale":
weight_key_map.get("ffn2_expert_weight_scale_key", None),
"moe_ffn1_in_scale":
weight_key_map.get("ffn1_expert_in_scale_key", None),
"moe_ffn2_in_scale":
weight_key_map.get("ffn2_expert_in_scale_key", None),
}
for name, value in scale_key_map.items():
if value is None:
raise ValueError(
f"scale {name} should not be none in w4a8 mode.")
# 2. Extract scale tensor from state dict
for local_expert_idx in range(layer.num_local_experts):
expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts
for name, scale_key_template in scale_key_map.items():
scale_tensor = _extract_scale_tensor(state_dict,
scale_key_template,
expert_idx)
scale_weight_map[name].append(scale_tensor)
# 3. Process scale tensor and set to layer
in_scales = []
for in_scale_name in ["moe_ffn1_in_scale", "moe_ffn2_in_scale"]:
in_scales.append(
_process_in_scale(in_scale_name,
scale_weight_map[in_scale_name]))
for i, weight_scale_name in enumerate(
["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]):
_process_weight_scale(weight_scale_name,
scale_weight_map[weight_scale_name],
in_scales[i])
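# Numeric sketch (toy values, hypothetical shapes) of the w4a8 scale post-processing above:
# in-scales are inverted, and each expert's weight scale is divided by 127 * 112 and by its
# processed in-scale.
import paddle

in_scales = [paddle.to_tensor([0.5]), paddle.to_tensor([0.25])]   # one per expert
weight_scales = [paddle.ones([8]), paddle.ones([8])]              # one per expert

processed_in_scale = 1 / paddle.concat(in_scales)                 # [num_experts]
processed_weight_scale = (paddle.stack(weight_scales, axis=0) /
                          (127 * 112) /
                          processed_in_scale[:, None])            # [num_experts, 8]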
class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
"""
weight only for moe
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.quant_config = quant_config
self.moe_quant_type = self.quant_config.algo
self.pack_num = 1
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass process prequanted weights.
"""
ffn1_expert_weight_key = layer.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = layer.weight_key_map.get(
"ffn2_expert_weight_key", None)
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
"ffn1_expert_weight_scale_key", None)
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
"ffn2_expert_weight_scale_key", None)
ffn1_weights, ffn2_weights = layer.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
# self.check(layer, ffn1_weights, ffn2_weights)
ffn1_weight_scale = []
ffn2_weight_scale = []
for i in range(layer.num_local_experts):
expert_idx = layer.expert_id_offset + i
ffn1_weight_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_scale_key.format(expert_idx))))
ffn2_weight_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_scale_key.format(expert_idx))))
ffn1_weight = paddle.stack(ffn1_weights, axis=0)
ffn2_weight = paddle.stack(ffn2_weights, axis=0)
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0)
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0)
name_tensor_map = {
"moe_ffn1_weight": ffn1_weight,
"moe_ffn2_weight": ffn2_weight,
"moe_ffn1_weight_scale": ffn1_weight_scale,
"moe_ffn2_weight_scale": ffn2_weight_scale
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, ffn1_weights, ffn2_weights)
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(layer.num_local_experts):
quant_weight, scale = weight_quantize(weight_tensor[i],
algo=self.moe_quant_type)
weight_list.append(quant_weight)
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
create_and_set_parameter(layer, weight_name, quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
create_and_set_parameter(layer, scale_name, quanted_weight_scale)


@@ -0,0 +1,380 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from fastdeploy.model_executor.layers.utils import get_tensor
from ..utils import create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
class DeepGemmFusedMoeMethod(MoEMethodBase):
"""
DeepGemmFusedMoeMethod is a class that implements the MoEMethodBase interface for DeepGemm backend.
"""
def create_weights(self, layer: nn.Layer, state_dict):
"""
deepgemm create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, ffn1_weights, ffn2_weights)
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(layer.num_local_experts):
from fastdeploy.model_executor.layers.utils import \
per_block_cast_to_fp8
quant_weight, scale = per_block_cast_to_fp8(
weight_tensor[i], self.quant_config.weight_block_size)
weight_list.append(quant_weight)
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
quanted_weight = quanted_weight.transpose([0, 2, 1]).contiguous()
create_and_set_parameter(layer, weight_name, quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
quanted_weight_scale = quanted_weight_scale.transpose(
[0, 2, 1]).contiguous()
create_and_set_parameter(layer, scale_name, quanted_weight_scale)
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass process prequanted weights.
"""
ffn1_expert_weight_key = layer.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = layer.weight_key_map.get(
"ffn2_expert_weight_key", None)
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
"ffn1_expert_weight_scale_key", None)
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
"ffn2_expert_weight_scale_key", None)
ffn1_weights, ffn2_weights = layer.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
# self.check(layer, ffn1_weights, ffn2_weights)
ffn1_weight_scale = []
ffn2_weight_scale = []
for i in range(layer.num_local_experts):
expert_idx = layer.expert_id_offset + i
ffn1_weight_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_scale_key.format(expert_idx))))
ffn2_weight_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_scale_key.format(expert_idx))))
ffn1_weight = paddle.stack(ffn1_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
ffn2_weight = paddle.stack(ffn2_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
name_tensor_map = {
"moe_ffn1_weight": ffn1_weight,
"moe_ffn2_weight": ffn2_weight,
"moe_ffn1_weight_scale": ffn1_weight_scale,
"moe_ffn2_weight_scale": ffn2_weight_scale
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
def apply_ep_prefill(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP prefill method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
layer, gate_out)
# 2. Dynamic compute blockwise quantization scales
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
x, self.quant_config.weight_block_size[0])
# 3. EP Dispatch
(
recv_x,
recv_topk_idx,
recv_topk_weights,
recv_num_tokens_per_expert_list,
handle,
_,
) = self.ep_prefill_runner.dispatch(x,
topk_idx,
topk_weights,
x_scale_tensor=x_scale_tensor)
token_all_num = sum(recv_num_tokens_per_expert_list)
# 4. Compute ffn
if token_all_num > 0:
logger.info(f"token_all_num {token_all_num}")
(recv_x, recv_x_scale) = recv_x
tmp = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
(
permute_input,
permute_scale,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
recv_num_tokens_per_expert_list_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x,
recv_x_scale,
recv_topk_idx,
recv_topk_weights,
tmp[0],
tmp[1]
)
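# the transpose/contiguous/transpose pair below keeps the logical shape of the
# scales but makes them column-major in memory, the layout the grouped FP8 GEMM expects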
permute_scale = permute_scale.transpose([1, 0]).contiguous()
permute_scale = permute_scale.transpose([1, 0])
# ffn1
ffn_out = paddle.empty(
(permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
dtype=paddle.bfloat16,
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(permute_input, permute_scale),
(layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
ffn_out,
m_indices,
)
# swiglu
ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None)
# ffn2
ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
ffn_out, self.quant_config.weight_block_size[0])
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
[1, 0]).contiguous()
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
ffn_out = paddle.empty(
(ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
dtype=paddle.bfloat16)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(ffn_in_x, ffn_in_x_scale_tensor),
(layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
ffn_out,
m_indices,
)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None, # moe_ffn2_bias
False, # norm_topk_prob
1.0,
)[0]
else:
tmp_ffn_out = paddle.cast(recv_x[0], paddle.bfloat16)
# 5. EP combine
return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
recv_topk_weights)
def apply_ep_decode(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP decoder method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
layer, gate_out)
# 2. EP Dispatch
permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
x, topk_idx, topk_weights, use_fp8=True)
# 3. Compute ffn
assert isinstance(permute_input, tuple)
ffn1_out = paddle.empty(
[
layer.num_local_experts,
layer.ep_size *
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.moe_intermediate_size * 2,
],
dtype=paddle.bfloat16,
)
ffn_out = paddle.empty(
[
layer.num_local_experts,
layer.ep_size *
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.hidden_size,
],
dtype=paddle.bfloat16,
)
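# expected_m is a hint to the masked grouped GEMM about the typical number of
# valid rows per expert, used when picking its tile configuration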
expected_m = 128
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
permute_input,
(
layer.moe_ffn1_weight,
layer.moe_ffn1_weight_scale,
),
ffn1_out,
token_nums_per_expert,
expected_m,
)
act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
ffn1_out, token_nums_per_expert)
act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
act_out, token_nums_per_expert,
self.quant_config.weight_block_size[0])
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(act_out_fp8, scale),
(
layer.moe_ffn2_weight,
layer.moe_ffn2_weight_scale,
),
ffn_out,
token_nums_per_expert,
expected_m,
)
# 4. EP combine
return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
handle)
def apply_tp(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Use DeepGemm to compute Fused MoE; this is the TP compute path.
"""
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
layer.gate_correction_bias,
layer.top_k,
True, # apply_norm_weight
False,
)
tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)
recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
x, 128)
(
permute_input,
permute_scale,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
recv_num_tokens_per_expert_list_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x,
recv_x_scale,
topk_ids,
topk_weights,
tmp[0],
tmp[1],
)
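# same layout trick as in apply_ep_prefill: keep the logical shape of the scales
# but store them column-major for the grouped FP8 GEMM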
permute_scale = permute_scale.transpose([1, 0]).contiguous()
permute_scale = permute_scale.transpose([1, 0])
# ffn1
ffn_out = paddle.empty(
(permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
dtype=paddle.bfloat16,
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(permute_input, permute_scale),
(layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
ffn_out,
m_indices,
)
# swiglu
ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out)
# ffn2
ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
ffn_out, self.quant_config.weight_block_size[0])
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
[1, 0]).contiguous()
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
ffn_out = paddle.empty(
(ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
dtype=paddle.bfloat16)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(ffn_in_x, ffn_in_x_scale_tensor),
(layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
ffn_out,
m_indices,
)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None,
False, # norm_topk_prob
1.0,
)[0]
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(tmp_ffn_out)
return tmp_ffn_out
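For intuition, a self-contained numpy sketch of the block-wise scaling idea behind per_block_cast_to_fp8 and the 128x128 weight_block_size used above; the padding, rounding, and FP8 cast details here are simplified assumptions, not FastDeploy code.
import numpy as np

def per_block_scale_sketch(w, block=128, fp8_max=448.0):
    # Scale each block x block tile of `w` so its max abs value maps to the
    # FP8 E4M3 maximum, keeping one float scale per tile for dequantization
    # inside the grouped GEMM. Illustrative only.
    k, n = w.shape
    w_pad = np.pad(w, ((0, -k % block), (0, -n % block)))
    kb, nb = w_pad.shape[0] // block, w_pad.shape[1] // block
    tiles = w_pad.reshape(kb, block, nb, block)
    amax = np.abs(tiles).max(axis=(1, 3), keepdims=True)
    scale = np.maximum(amax, 1e-4) / fp8_max
    q = tiles / scale  # would be cast to float8_e4m3fn on GPU
    return q.reshape(w_pad.shape)[:k, :n], scale[:, 0, :, 0]

q, s = per_block_scale_sketch(np.random.randn(256, 384).astype(np.float32))
print(q.shape, s.shape)  # (256, 384) (2, 3)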

View File

@@ -0,0 +1,285 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import (MoeWna16MarlinGemmApi,
tritonmoe_preprocess_func)
from ..quantization.quant_base import QuantMethodBase
def gptq_marlin_moe_repack(b_q_weight: paddle.Tensor, perm: paddle.Tensor,
size_k: int, size_n: int,
num_bits: int) -> paddle.Tensor:
"""
Util function.
"""
from fastdeploy.model_executor.ops.gpu import gptq_marlin_repack
num_experts = b_q_weight.shape[0]
assert size_k % 16 == 0
output = paddle.empty(
[num_experts, size_k // 16, size_n * (num_bits // 2)],
dtype=b_q_weight.dtype)
for e in range(num_experts):
output[e] = gptq_marlin_repack(b_q_weight[e], perm[e], size_k, size_n,
num_bits)
return output
def get_scale_perms():
"""
Util function.
"""
scale_perm: list[int] = []
for i in range(8):
scale_perm.extend([i + 8 * j for j in range(8)])
scale_perm_single: list[int] = []
for i in range(4):
scale_perm_single.extend(
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
return scale_perm, scale_perm_single
def marlin_permute_scales(s: paddle.Tensor, size_k: int, size_n: int,
group_size: int) -> paddle.Tensor:
"""
Util function.
"""
scale_perm, scale_perm_single = get_scale_perms()
if group_size < size_k and group_size != -1:
s = s.reshape([-1, len(scale_perm)])[:, scale_perm]
else:
s = s.reshape([-1, len(scale_perm_single)])[:, scale_perm_single]
s = s.reshape((-1, size_n)).contiguous()
return s
def marlin_moe_permute_scales(
s: paddle.Tensor,
size_k: int,
size_n: int,
group_size: int,
):
"""
Util function.
"""
num_experts = s.shape[0]
output = paddle.empty(
[num_experts, s.shape[1], s.shape[2]],
dtype=s.dtype,
)
for e in range(num_experts):
output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
return output
class MarlinWeightOnlyMoEMethod(QuantMethodBase):
"""
Use Marlin Group Gemm to compute Fused MoE.
"""
def __init__(self, quant_method=None):
"""
Marlin Group Gemm to compute Fused MoE.
"""
self.quant_method = quant_method
self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
self.added_scale_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
]
self.added_zeros_attrs = ["zeros0", "zeros1"]
def create_weights(self, layer: nn.Layer, state_dict):
"""
Marlin MoE create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
assert len(ffn1_weights) == layer.num_local_experts
assert len(ffn2_weights) == layer.num_local_experts
assert ffn1_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
ffn2_tensor = paddle.stack(ffn2_weights, axis=0)
max_bound = 7
for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
weight_scale = weight_tensor.abs().max(axis=1)
quanted_weight = weight_tensor / weight_scale[:,
None, :] * max_bound
quanted_weight = paddle.round(quanted_weight).astype("int32")
quanted_weight[quanted_weight > 7] = 7
quanted_weight[quanted_weight < -7] = -7
quanted_weight += 8
E, K, N = quanted_weight.shape
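# pack eight consecutive 4-bit values along K into one int32, lowest nibble first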
quanted_weight = quanted_weight.reshape([0, K // 8, 8, N])
res = paddle.zeros([E, K // 8, N], dtype='int32')
for j in range(8):
tmp = quanted_weight[:, :, j, :]
res = res | (tmp << (j * 4))
quanted_weight = paddle.assign(res)
weight_scale = weight_scale / max_bound
weight_scale = weight_scale[:, None, :]
group_size = -1 # means per_channel
g_idx_sort_indices = paddle.empty([E, 0], dtype="int32")
quanted_weight = gptq_marlin_moe_repack(
quanted_weight,
g_idx_sort_indices,
K,
N,
4,
)
weight_scale = marlin_moe_permute_scales(
weight_scale,
size_k=layer.moe_intermediate_size,  # unused with per-channel scales (group_size == -1)
size_n=N,
group_size=group_size)
for (name, tensor) in [(weight_name, quanted_weight),
(scale_name, weight_scale)]:
setattr(
layer, name,
layer.create_parameter(
shape=tensor.shape,
dtype=tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(tensor)
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Marlin compute Fused MoE.
"""
token_num = x.shape[0]
top_k = layer.top_k
moe_intermediate_size = layer.moe_intermediate_size
hidden_size = layer.hidden_size
num_experts = layer.num_experts
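# note: gate_out is recomputed from the layer's own gate weight, overriding the argument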
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
layer.gate_correction_bias,
top_k,
True, # apply_norm_weight,
False,
)
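# pick the smallest tile size m that covers the average number of tokens per
# expert (token_num * top_k / num_experts), falling back to 64 for large batches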
block_size_m = 64
for m in [8, 16, 32, 48, 64]:
if token_num * top_k / num_experts / m < 0.9:
block_size_m = m
break
topk = top_k
# workspace sized for the 132 SMs of an H100
workspace = paddle.empty([528], dtype="int32")
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
topk_ids, num_experts, block_size_m)
ffn_out = MoeWna16MarlinGemmApi(
x,
c_or_none=None,
b_q_weight=layer.moe_ffn1_weight,
b_scales=layer.moe_ffn1_weight_scale,
global_scale_or_none=None,
b_zeros_or_none=None,
g_idx_or_none=None,
perm_or_none=None,
workspace=workspace,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
topk_weights=topk_weights,
moe_block_size=block_size_m,
top_k=topk,
mul_topk_weights=False,
is_ep=False,
b_q_type_str="uint4b8",
size_m=token_num,
size_n=moe_intermediate_size * 2,
size_k=hidden_size,
is_k_full=True,
use_atomic_add=True,
use_fp32_reduce=True,
is_zp_float=False)[0]
swiglu_out = paddle.incubate.nn.functional.swiglu(ffn_out)
ffn_out = MoeWna16MarlinGemmApi(
swiglu_out,
c_or_none=None,
b_q_weight=layer.moe_ffn2_weight,
b_scales=layer.moe_ffn2_weight_scale,
global_scale_or_none=None,
b_zeros_or_none=None,
g_idx_or_none=None,
perm_or_none=None,
workspace=workspace,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
topk_weights=topk_weights,
moe_block_size=block_size_m,
top_k=1,
mul_topk_weights=True,
is_ep=False,
b_q_type_str="uint4b8",
size_m=token_num * topk,
size_n=hidden_size,
size_k=moe_intermediate_size,
is_k_full=True,
use_atomic_add=True,
use_fp32_reduce=True,
is_zp_float=False)[0]
ffn_out.reshape_([token_num, -1, hidden_size])
ffn_out = ffn_out.sum(axis=1)
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(ffn_out)
return ffn_out
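As a self-contained illustration of the nibble packing built by the `res |= tmp << (j * 4)` loop in create_weights above (the Marlin repack and scale permutation that follow it are not reproduced); numpy stand-in, not FastDeploy code.
import numpy as np

def pack_int4_along_k(q):
    # q: [E, K, N] unsigned 4-bit values in [0, 15]; pack eight consecutive
    # K entries into one int32, lowest nibble first, as the loop above does.
    e, k, n = q.shape
    assert k % 8 == 0
    q = q.reshape(e, k // 8, 8, n).astype(np.int32)
    packed = np.zeros((e, k // 8, n), dtype=np.int32)
    for j in range(8):
        packed |= q[:, :, j, :] << (j * 4)
    return packed

q = np.random.randint(0, 16, size=(2, 16, 4))
packed = pack_int4_along_k(q)
# recover the second nibble of each packed word to verify the layout
assert np.all(((packed >> 4) & 0xF) == q.reshape(2, 2, 8, 4)[:, :, 1, :])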

View File

@@ -1,57 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
class FusedMoEMethodBase(QuantMethodBase):
"""
All MoE methods must inherit this class
and implement the following methods.
"""
@abstractmethod
def create_weights(self,
layer: nn.Layer,
moe_compute_params,
ffn1_tensor,
ffn2_tensor,
ffn1_bias=None,
ffn2_bias=None):
"""
Create weights; subclasses must implement this method.
"""
raise NotImplementedError
@abstractmethod
def apply(
self,
layer: nn.Layer,
moe_compute_params,
x: paddle.Tensor,
) -> paddle.Tensor:
"""
Compute method; subclasses must implement this method.
"""
raise NotImplementedError

View File

@@ -0,0 +1,479 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.utils import (create_hadamard_matrix_map,
get_tensor)
from fastdeploy.utils import ceil_div
from ..quantization.quant_base import QuantMethodBase
class TritonWeightOnlyMoEMethod(QuantMethodBase):
"""
Use Triton Group Gemm to compute Fused MoE.
"""
def __init__(self, quant_method=None):
"""
Triton Group Gemm to compute Fused MoE.
"""
self.quant_method = quant_method
self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
self.added_scale_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
]
def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
"""process_prequanted_weights"""
pass
def create_weights(self, layer: nn.Layer, state_dict):
"""
Triton MoE create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
assert len(ffn1_weights) == layer.num_local_experts
assert len(ffn2_weights) == layer.num_local_experts
assert layer.quant_method.quant_config.name() == "wint8"
assert ffn1_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
ffn2_tensor = paddle.stack(ffn2_weights, axis=0)
if self.quant_config.name() == "wint8":
max_bound = 127
elif self.quant_config.name() == "wint4":
max_bound = 7
for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
quanted_weight_scale = weight_tensor.abs().max(axis=1)
quanted_weight = weight_tensor / quanted_weight_scale[:,
None, :] * max_bound
quanted_weight = paddle.round(quanted_weight).astype("int8")
quanted_weight_scale = quanted_weight_scale / max_bound
setattr(
layer, weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(quanted_weight)
setattr(
layer, scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
getattr(layer, scale_name).set_value(quanted_weight_scale)
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Triton compute Fused MoE.
"""
token_num = x.shape[0]
top_k = layer.top_k
num_local_experts = layer.num_local_experts
moe_intermediate_size = layer.moe_intermediate_size
hidden_size = layer.hidden_size
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
scores = paddle.nn.functional.softmax(gate_out, axis=-1)
topk_weights, topk_ids = paddle.topk(scores,
k=top_k,
axis=-1,
sorted=False)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
intermediate_cache1 = paddle.empty(
[token_num * top_k, moe_intermediate_size * 2],
dtype=x.dtype,
)
intermediate_cache2 = paddle.empty(
(token_num * top_k, moe_intermediate_size),
dtype=x.dtype,
)
intermediate_cache3 = paddle.empty(
(token_num * top_k, hidden_size),
dtype=x.dtype,
)
config = {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
}
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
from .triton_moe_kernels import fused_moe_kernel_paddle
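# group the (token, expert) assignments by expert id and pad every expert's
# segment to a multiple of BLOCK_SIZE_M for the grouped Triton GEMM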
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
max_num_tokens_padded = sorted_token_ids.shape[0]
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )
fused_moe_kernel_paddle[grid](
x,
layer.moe_ffn1_weight,
intermediate_cache1,
None,
layer.moe_ffn1_weight_scale,
None,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
moe_intermediate_size * 2,
hidden_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=x.strides[0],
stride_ak=x.strides[1],
stride_be=layer.moe_ffn1_weight.strides[0],
stride_bk=layer.moe_ffn1_weight.strides[1],
stride_bn=layer.moe_ffn1_weight.strides[2],
stride_cm=intermediate_cache1.strides[0],
stride_cn=intermediate_cache1.strides[1],
#
stride_asm=-1,
stride_ask=-1,
stride_bse=layer.moe_ffn1_weight_scale.strides[0],
stride_bsk=-1,
stride_bsn=layer.moe_ffn1_weight_scale.strides[1],
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=False,
top_k=top_k,
compute_type_enum=1,
use_fp8_w8a8=False,
use_int8_w8a16=True,
even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
fused_moe_kernel_paddle[grid](
intermediate_cache2,
layer.moe_ffn2_weight,
intermediate_cache3,
None,
layer.moe_ffn2_weight_scale,
topk_weights,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
hidden_size,
moe_intermediate_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=intermediate_cache2.strides[0],
stride_ak=intermediate_cache2.strides[1],
stride_be=layer.moe_ffn2_weight.strides[0],
stride_bk=layer.moe_ffn2_weight.strides[1],
stride_bn=layer.moe_ffn2_weight.strides[2],
stride_cm=intermediate_cache3.strides[0],
stride_cn=intermediate_cache3.strides[1],
stride_asm=-1,
stride_ask=-1,
stride_bse=layer.moe_ffn2_weight_scale.strides[0],
stride_bsk=-1,
stride_bsn=layer.moe_ffn2_weight_scale.strides[1],
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=True,
top_k=1,
compute_type_enum=1,
use_fp8_w8a8=False,
use_int8_w8a16=True,
even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache3.reshape_([token_num, top_k, hidden_size])
out = intermediate_cache3.sum(axis=1)
return out
class TensorWiseFP8MoEMethod(QuantMethodBase):
"""
Use Triton Group Gemm with tensor-wise FP8 quantization to compute Fused MoE.
"""
def __init__(self, quant_method=None):
"""
Triton Group Gemm to compute Fused MoE.
"""
self.quant_method = quant_method
def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
"""process_prequanted_weights"""
ffn1_tensor, ffn2_tensor = layer.extract_moe_ffn_weights(state_dict)
assert ffn1_tensor[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_tensor[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
ffn1_tensor = paddle.stack(ffn1_tensor, axis=0)
ffn2_tensor = paddle.stack(ffn2_tensor, axis=0)
added_wfp8afp8_attrs = [
"moe_ffn1_weight", "moe_ffn2_weight", "moe_ffn1_weight_scale",
"moe_ffn2_weight_scale", "moe_ffn1_in_scale", "moe_ffn2_in_scale"
]
def _extract_scale_tensor(key_template):
result = []
for i in range(layer.num_experts):
result.append(
get_tensor(state_dict.pop(key_template.format(i))))
return paddle.concat(result).cast("float32")
weight_key_map = layer.weight_key_map
moe_ffn1_weight_scale = _extract_scale_tensor(
weight_key_map["ffn1_expert_weight_scale_key"])
moe_ffn2_weight_scale = _extract_scale_tensor(
weight_key_map["ffn2_expert_weight_scale_key"])
moe_ffn1_in_scale = _extract_scale_tensor(
weight_key_map["ffn1_expert_in_scale_key"])
moe_ffn2_in_scale = _extract_scale_tensor(
weight_key_map["ffn2_expert_in_scale_key"])
for idx, weight_tensor in enumerate([
ffn1_tensor, ffn2_tensor, moe_ffn1_weight_scale,
moe_ffn2_weight_scale, moe_ffn1_in_scale, moe_ffn2_in_scale
]):
name = added_wfp8afp8_attrs[idx]
setattr(
layer, name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(weight_tensor)
def create_weights(self, layer: nn.Layer, state_dict):
"""
Triton MoE create weight process.
"""
pass
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Triton compute Fused MoE.
"""
token_num = x.shape[0]
top_k = layer.top_k
num_local_experts = layer.num_local_experts
moe_intermediate_size = layer.moe_intermediate_size
hidden_size = layer.hidden_size
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
scores = paddle.nn.functional.softmax(gate_out, axis=-1)
topk_weights, topk_ids = paddle.topk(scores,
k=top_k,
axis=-1,
sorted=False)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
intermediate_cache1 = paddle.empty(
[token_num * top_k, moe_intermediate_size * 2],
dtype=x.dtype,
)
intermediate_cache2 = paddle.empty(
(token_num * top_k, moe_intermediate_size),
dtype=x.dtype,
)
intermediate_cache3 = paddle.empty(
(token_num * top_k, hidden_size),
dtype=x.dtype,
)
config = {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
}
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
max_num_tokens_padded = sorted_token_ids.shape[0]
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )
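# rotate activations with a Hadamard matrix before FP8 quantization to spread
# outliers across channels (paired with the per-expert input scales below)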
hadamard_matrix = create_hadamard_matrix_map[hidden_size]
x = paddle.matmul(x.cast("float32"), hadamard_matrix)
permute_x = x[:, None, :].tile([1, top_k, 1])
permute_x = permute_x.reshape([-1, hidden_size])
quant_activation_scale = layer.moe_ffn1_in_scale[topk_ids].reshape(
[-1, 1])
permute_x = permute_x / quant_activation_scale
permute_x = permute_x.astype("float8_e4m3fn")
from .triton_moe_kernels import fused_moe_kernel_paddle
fused_moe_kernel_paddle[grid](
permute_x,
layer.moe_ffn1_weight.view(paddle.float8_e4m3fn),
intermediate_cache1,
layer.moe_ffn1_in_scale,
layer.moe_ffn1_weight_scale,
None,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
moe_intermediate_size * 2,
hidden_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=permute_x.strides[0],
stride_ak=permute_x.strides[1],
stride_be=layer.moe_ffn1_weight.strides[0],
stride_bk=layer.moe_ffn1_weight.strides[1],
stride_bn=layer.moe_ffn1_weight.strides[2],
stride_cm=intermediate_cache1.strides[0],
stride_cn=intermediate_cache1.strides[1],
#
stride_asm=-1, # only used in blockwise fp8
stride_ask=-1, # only used in blockwise fp8
stride_bse=-1,
stride_bsk=-1,
stride_bsn=-1,
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=False,
top_k=1,
compute_type_enum=1,
use_fp8_w8a8=True,
use_int8_w8a16=False,
even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
hadamard_matrix = create_hadamard_matrix_map[moe_intermediate_size]
intermediate_cache2 = paddle.matmul(
intermediate_cache2.cast("float32"), hadamard_matrix)
quant_activation_scale = layer.moe_ffn2_in_scale[topk_ids].reshape(
[-1, 1])
intermediate_cache2 = intermediate_cache2 / quant_activation_scale
intermediate_cache2 = intermediate_cache2.astype("float8_e4m3fn")
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
fused_moe_kernel_paddle[grid](
intermediate_cache2,
layer.moe_ffn2_weight.view(paddle.float8_e4m3fn),
intermediate_cache3,
layer.moe_ffn2_in_scale,
layer.moe_ffn2_weight_scale,
topk_weights,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
hidden_size,
moe_intermediate_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=intermediate_cache2.strides[0],
stride_ak=intermediate_cache2.strides[1],
stride_be=layer.moe_ffn2_weight.strides[0],
stride_bk=layer.moe_ffn2_weight.strides[1],
stride_bn=layer.moe_ffn2_weight.strides[2],
stride_cm=intermediate_cache3.strides[0],
stride_cn=intermediate_cache3.strides[1],
stride_asm=-1,
stride_ask=-1,
stride_bse=-1,
stride_bsk=-1,
stride_bsn=-1,
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=True,
top_k=1,
compute_type_enum=1,
use_fp8_w8a8=True,
use_int8_w8a16=False,
even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache3.reshape_([token_num, top_k, hidden_size])
out = intermediate_cache3.sum(axis=1)
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(out)
return out
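The routing used by both Triton paths above (softmax over experts, top-k, renormalize the selected weights) in a minimal numpy form, for reference only:
import numpy as np

def route_topk(gate_logits, top_k):
    # Mirrors the softmax -> paddle.topk -> renormalize sequence above.
    z = gate_logits - gate_logits.max(axis=-1, keepdims=True)
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    topk_ids = np.argsort(-probs, axis=-1)[:, :top_k]
    topk_w = np.take_along_axis(probs, topk_ids, axis=-1)
    return topk_ids, topk_w / topk_w.sum(axis=-1, keepdims=True)

ids, w = route_topk(np.random.randn(4, 8), top_k=2)
print(ids.shape, w.sum(axis=-1))  # (4, 2) [1. 1. 1. 1.]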

View File

@@ -0,0 +1,236 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from ..quantization.quant_base import QuantMethodBase
from ..utils import create_and_set_parameter, get_tensor
class Wint2MoeMethod(QuantMethodBase):
"""
Wint2 method for computing Fused MoE.
"""
def __init__(self, quant_config):
super().__init__()
self.moe_quant_type = quant_config.moe_quant_type
def process_loaded_weights(self, layer, weights) -> None:
"""
process_loaded_weights
"""
pass
def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
"""
check layer is valid for this method
"""
assert len(
ffn1_weights
) == layer.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
assert len(
ffn2_weights
) == layer.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
def create_weights(self, layer: nn.Layer, state_dict):
"""
Wint2 create weight process (no-op here).
"""
pass
class TritonWint2FusedMoeMethod(Wint2MoeMethod):
"""
Use Triton Group Gemm to compute Fused MoE.
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.moe_quant_type = quant_config.moe_quant_type
def process_loaded_weights(self, layer, weights) -> None:
"""
process_loaded_weights
"""
pass
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Process pre-quantized weights for the Wint2 backend.
"""
ffn1_expert_weight_key = layer.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = layer.weight_key_map.get(
"ffn2_expert_weight_key", None)
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
"ffn1_expert_weight_scale_key", None)
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
"ffn2_expert_weight_scale_key", None)
ffn1_expert_super_scales_key = layer.weight_key_map.get(
"ffn1_expert_super_scales_key", None)
ffn2_expert_super_scales_key = layer.weight_key_map.get(
"ffn2_expert_super_scales_key", None)
ffn1_expert_code_scale_key = layer.weight_key_map.get(
"ffn1_expert_code_scale_key", None)
ffn2_expert_code_scale_key = layer.weight_key_map.get(
"ffn2_expert_code_scale_key", None)
ffn1_expert_code_zp_key = layer.weight_key_map.get(
"ffn1_expert_code_zp_key", None)
ffn2_expert_code_zp_key = layer.weight_key_map.get(
"ffn2_expert_code_zp_key", None)
ffn1_weights, ffn2_weights = layer.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
# self.check(layer, ffn1_weights, ffn2_weights)
ffn1_weight_scale = []
ffn2_weight_scale = []
ffn1_super_scales = []
ffn2_super_scales = []
ffn1_code_scale = []
ffn2_code_scale = []
ffn1_code_zp = []
ffn2_code_zp = []
for i in range(layer.num_experts):
expert_idx = layer.expert_id_offset + i
ffn1_weight_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_scale_key.format(expert_idx))))
ffn2_weight_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_scale_key.format(expert_idx))))
ffn1_super_scales.append(
get_tensor(
state_dict.pop(
ffn1_expert_super_scales_key.format(expert_idx))))
ffn2_super_scales.append(
get_tensor(
state_dict.pop(
ffn2_expert_super_scales_key.format(expert_idx))))
ffn1_code_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_code_scale_key.format(expert_idx))))
ffn2_code_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_code_scale_key.format(expert_idx))))
ffn1_code_zp.append(
get_tensor(
state_dict.pop(
ffn1_expert_code_zp_key.format(expert_idx))))
ffn2_code_zp.append(
get_tensor(
state_dict.pop(
ffn2_expert_code_zp_key.format(expert_idx))))
ffn1_weight = paddle.stack(ffn1_weights, axis=0)
ffn2_weight = paddle.stack(ffn2_weights, axis=0)
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0)
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0)
ffn1_super_scales = paddle.stack(ffn1_super_scales, axis=0)
ffn2_super_scales = paddle.stack(ffn2_super_scales, axis=0)
ffn1_code_scale = paddle.stack(ffn1_code_scale, axis=0)
ffn2_code_scale = paddle.stack(ffn2_code_scale, axis=0)
ffn1_code_zp = paddle.stack(ffn1_code_zp, axis=0)
ffn2_code_zp = paddle.stack(ffn2_code_zp, axis=0)
name_tensor_map = {
"moe_ffn1_weight": ffn1_weight,
"moe_ffn2_weight": ffn2_weight,
"moe_ffn1_weight_scale": ffn1_weight_scale,
"moe_ffn2_weight_scale": ffn2_weight_scale,
"moe_ffn1_super_scales": ffn1_super_scales,
"moe_ffn2_super_scales": ffn2_super_scales,
"moe_ffn1_code_scale": ffn1_code_scale,
"moe_ffn2_code_scale": ffn2_code_scale,
"moe_ffn1_code_zp": ffn1_code_zp,
"moe_ffn2_code_zp": ffn2_code_zp
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
def create_weights(self, layer: nn.Layer, state_dict):
"""
Wint2 create weight process (no-op; weights are loaded via process_prequanted_weights).
"""
pass
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Use Wint2 Triton Fusedmoe compute Fused MoE.
"""
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
(
permute_input,
token_nums_per_expert,
permute_indices_per_token,
topk_weights,
topk_idx,
expert_idx_per_token,
) = moe_expert_dispatch(
x,
gate_out,
layer.gate_correction_bias,
(layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
else None), # if set, permute_input will be int8_t
layer.top_k,
False,
topk_only_mode=False,
)
ffn_out = fastdeploy.model_executor.ops.gpu.moe_expert_ffn_wint2(
permute_input,
token_nums_per_expert,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None,
layer.moe_ffn1_super_scales,
layer.moe_ffn2_super_scales,
layer.moe_ffn1_weight_scale,
layer.moe_ffn1_code_scale,
layer.moe_ffn1_code_zp,
layer.moe_ffn2_weight_scale,
layer.moe_ffn2_code_scale,
layer.moe_ffn2_code_zp,
False,
)
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
fused_moe_out = moe_expert_reduce(
ffn_out,
topk_weights,
permute_indices_per_token,
topk_idx,
None,
norm_topk_prob=True,
routed_scaling_factor=1.0,
)
return fused_moe_out
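Conceptually, moe_expert_dispatch and moe_expert_reduce implement the permute/compute/unpermute pattern sketched below; this numpy skeleton ignores the quantization, padding, and wint2 decoding handled by the CUDA ops above.
import numpy as np

def dispatch_combine(x, topk_ids, topk_w, expert_fn):
    # Gather each token once per selected expert, run experts on contiguous
    # per-expert slices, then scatter-add the results back with router weights.
    t = x.shape[0]
    flat_expert = topk_ids.reshape(-1)
    flat_token = np.repeat(np.arange(t), topk_ids.shape[1])
    order = np.argsort(flat_expert, kind="stable")
    permuted = x[flat_token[order]]
    out_rows = np.empty_like(permuted)
    for e in np.unique(flat_expert):
        sel = flat_expert[order] == e
        out_rows[sel] = expert_fn(e, permuted[sel])
    y = np.zeros_like(x)
    np.add.at(y, flat_token[order],
              out_rows * topk_w.reshape(-1)[order][:, None])
    return y

y = dispatch_combine(np.random.randn(6, 4),
                     np.random.randint(0, 3, size=(6, 2)),
                     np.full((6, 2), 0.5),
                     lambda e, h: h * (e + 1))
print(y.shape)  # (6, 4)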

View File

@@ -1,273 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.moe.moe import MoELayer
from fastdeploy.model_executor.layers.utils import get_tensor
class TextMoELayer(MoELayer):
"""
MoELayer is a layer that performs MoE (Mixture of Experts) computation.
"""
def __init__(
self,
*args,
**kwargs,
):
"""
Initialization function that sets up the layer's attributes.
Args:
- args (tuple, optional): variable-length positional arguments, empty tuple by default.
- kwargs (dict, optional): keyword-argument dict, empty dict by default.
Returns:
None; the layer's attributes are modified in place.
"""
kwargs["moe_tag"] = "Text"
super().__init__(*args, **kwargs)
def load_gate_state_dict(self, state_dict):
"""
Load the gate state dict, used to initialize the network parameters.
Pops the expert projection weights from the given state dict and returns them.
Args:
state_dict (OrderedDict): dict containing the network's expert parameters.
Returns:
tuple: the up-gate projection weights, the down projection weights, and their
(empty) scale lists; each weight list has one entry per expert.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(0, self.num_experts):
up_gate_proj_weight.append(
get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
)
down_proj_weight.append(
get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
)
return (
up_gate_proj_weight,
down_proj_weight,
up_gate_proj_weight_scale,
down_proj_weight_scale,
)
def load_gate_correction_bias(self, state_dict):
"""
Load the gate correction bias. If the gate correction bias is enabled, fetch the
corresponding tensor from state_dict and assign it to the gate correction bias.
Args:
state_dict (OrderedDict): dict containing the model parameters and state.
Returns:
None; the gate correction bias is updated in place.
"""
if self.moe_config.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict[self.gate_correction_bias_key]
)
self.gate_correction_bias.set_value(
gate_correction_bias_tensor[0].unsqueeze(0)
)
class ImageMoELayer(MoELayer):
"""
MoELayer is a layer that performs MoE (Mixture of Experts) computation.
"""
def __init__(
self,
*args,
**kwargs,
):
"""
Initialization function that sets up the layer's attributes.
Args:
- args (tuple, optional): variable-length positional arguments, empty tuple by default.
- kwargs (dict, optional): keyword-argument dict, empty dict by default.
Returns:
None; the layer's attributes are modified in place.
"""
moe_quant_type = os.getenv("ELLM_MM_IMAGE_QUANT_TYPE", None)
if moe_quant_type is not None:
kwargs["moe_quant_type"] = moe_quant_type
kwargs["moe_tag"] = "Image"
super().__init__(*args, **kwargs)
def load_gate_state_dict(self, state_dict):
"""
Load the gate state dict.
Pops and returns the up-gate projection weights and down projection weights of the
image experts (expert indices num_experts to 2 * num_experts) from the given state dict.
Args:
state_dict (OrderedDict): ordered dict containing the network parameters.
Returns:
tuple: the up-gate projection weights, the down projection weights, and their
(empty) scale lists; each weight list has one entry per expert.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(self.num_experts, self.num_experts + self.num_experts):
up_gate_proj_weight.append(
get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
)
down_proj_weight.append(
get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
)
return (
up_gate_proj_weight,
down_proj_weight,
up_gate_proj_weight_scale,
down_proj_weight_scale,
)
def load_gate_correction_bias(self, state_dict):
"""
Load the gate correction bias. If the gate correction bias is enabled, fetch it
from state_dict and assign it to gate_correction_bias.
Args:
state_dict (OrderedDict): model state dict containing all parameters to be loaded.
Returns:
None; gate_correction_bias is updated in place.
"""
if self.moe_config.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict[self.gate_correction_bias_key]
)
self.gate_correction_bias.set_value(
gate_correction_bias_tensor[1].unsqueeze(0)
)
class MultimodalityMoeLayer(nn.Layer):
"""
Multimodality MOE Layer
"""
def __init__(
self,
inference_args,
layer_name,
layer_idx,
):
"""
Initialize the MoE layer.
Args:
inference_args (InferenceArgs): inference arguments containing all required configuration.
layer_name (str): name of the current MoE layer.
layer_idx (int): index of the current MoE layer in the model.
Returns:
None.
"""
super().__init__()
self.text_moe_layer = TextMoELayer(
inference_args=inference_args,
moe_config=inference_args.moe_config,
layer_name=layer_name + ".text",
gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight",
ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.up_gate_proj.weight",
ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.down_proj.weight",
gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
ffn1_bias_key=None,
ffn2_bias_key=None,
ffn1_shared_weight_key=None,
ffn1_shared_bias_key=None,
ffn2_shared_weight_key=None,
ffn2_shared_bias_key=None,
layer_idx=layer_idx,
)
self.image_moe_layer = ImageMoELayer(
inference_args=inference_args,
moe_config=inference_args.moe_config_1,
layer_name=layer_name + ".image",
gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight_1",
ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.up_gate_proj.weight",
ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.down_proj.weight",
gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
ffn1_bias_key=None,
ffn2_bias_key=None,
ffn1_shared_weight_key=None,
ffn1_shared_bias_key=None,
ffn2_shared_weight_key=None,
ffn2_shared_bias_key=None,
layer_idx=layer_idx,
)
def load_state_dict(self, state_dict):
"""
Load the model parameters.
Loads the text and image sub-layers from the given dict; consumed keys are popped
from state_dict.
Args:
state_dict (dict): dict containing the model parameters to load.
Returns:
None.
"""
self.text_moe_layer.load_state_dict(state_dict)
self.image_moe_layer.load_state_dict(state_dict)
state_dict.pop(self.text_moe_layer.gate_correction_bias_key)
def forward(self, x, **kwargs):
"""
Forward computation: route each input token to the text or image MoE layer and
return the combined result.
Args:
x (Tensor): input tensor of shape [token_num, hidden_size], where token_num is the
sequence length and hidden_size is the hidden dimension.
kwargs (dict, optional): keyword arguments; must contain:
- token_type_ids (Tensor): tensor marking each token as text (0) or image (1).
Returns:
Tensor: a tensor with the same shape as the input, containing the processed result.
Raises:
AssertionError: raised when token_type_ids is not provided.
"""
token_type_ids = kwargs.get("token_type_ids", None)
assert token_type_ids is not None
# x.shape is [token_num, hidden_size]
fused_moe_out = paddle.zeros_like(x)
text_mask = token_type_ids == 0 # [token_num]
image_mask = token_type_ids == 1
if text_mask.any():
text_out = self.text_moe_layer(x[text_mask])
fused_moe_out[text_mask] = text_out
if image_mask.any():
image_out = self.image_moe_layer(x[image_mask])
fused_moe_out[image_mask] = image_out
return fused_moe_out

View File

@@ -1,5 +1,5 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,34 +14,13 @@
# limitations under the License.
"""
from dataclasses import dataclass
import paddle
from paddle import nn
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.model_executor.layers.utils import get_tensor
from .cutlass_fused_moe import CutlassFusedMoeMethod
@dataclass
class MoEComputeParams:
"""
some params for computing MoE.
it is given to different compute methods.
"""
global_num_experts: int = -1
top_k: int = -1
hidden_size: int = -1
num_local_experts: int = -1
moe_intermediate_size: int = -1
tp_size: int = -1
ep_size: int = -1
dp_size: int = -1
moe_quant_type: str = ""
class FusedMoE(nn.Layer):
"""
@@ -50,174 +29,195 @@ class FusedMoE(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
moe_intermediate_size: int = -1,
num_experts: int = -1,
expert_id_offset: int = 0,
top_k: int = -1,
moe_use_gate_correction_bias: bool = False,
moe_quant_type: str = "weight_only_int4",
layer_idx: int = -1,
gate_weight_key=None,
gate_correction_bias_key=None,
ffn1_expert_weight_key=None,
ffn2_expert_weight_key=None,
moe_ffn1_bias_keys=None,
moe_ffn2_bias_keys=None,
moe_ffn1_weight_scale_keys=None,
moe_ffn2_weight_scale_keys=None,
moe_ffn1_in_scale_keys=None,
moe_ffn2_in_scale_keys=None,
moe_tag: str = "",
weight_key_map: dict = {},
):
"""
Initialize the Moe layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.layer_idx = layer_idx
self.tp_size = llm_config.parallel_config.mp_size
self.ep_size = llm_config.parallel_config.ep_size
self.moe_use_gate_correction_bias = moe_use_gate_correction_bias
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
self.ep_size = fd_config.parallel_config.expert_parallel_degree
self.ep_rank = fd_config.parallel_config.expert_parallel_rank
assert (self.tp_size >= 1 and self.ep_size == 1) or \
(self.tp_size == 1 and self.ep_size > 1), \
'MoE only support parallelism on TP or EP dimension.'
self.hidden_size = fd_config.model_config.hidden_size
self.moe_config = fd_config.moe_config
self.hidden_size = llm_config.model_config.hidden_size
self.moe_config = llm_config.moe_config
self.use_offline_quant = llm_config.tmp_config.use_offline_quant
moe_tag = self.llm_config.moe_config.moe_tag
logger.info(f"{moe_tag}MoE is running in {moe_quant_type} mode")
self.moe_quant_type = moe_quant_type
self.num_experts = num_experts
self.num_local_experts = self.num_experts // self.ep_size
logger.info(f'''MoE config is num_experts:{num_experts},
top_k:{top_k},
hidden_size:{self.hidden_size},
moe_intermediate_size:{moe_intermediate_size}''')
logger.info(
f"MoE is running on moe_quant_type: {self.moe_quant_type}, ep:{self.ep_size}, tp:{self.tp_size} mode"
)
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.gate_weight_key = gate_weight_key
self.gate_correction_bias_key = gate_correction_bias_key
self.top_k = top_k
self.hidden_size = self.hidden_size
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.weight_key_map = weight_key_map
self.ffn1_expert_weight_key = ffn1_expert_weight_key
self.ffn2_expert_weight_key = ffn2_expert_weight_key
self.ffn1_bias_key = moe_ffn1_bias_keys
self.ffn2_bias_key = moe_ffn2_bias_keys
self.use_method = envs.FD_MOE_BACKEND.lower()
self.gate_correction_bias = None
self.moe_tag = moe_tag
if self.moe_quant_type == "w4a8":
# below keys are only used in MoE W4A8!
self.ffn1_expert_weight_scale_key = moe_ffn1_weight_scale_keys
self.ffn2_expert_weight_scale_key = moe_ffn2_weight_scale_keys
self.ffn1_expert_in_scale_key = moe_ffn1_in_scale_keys
self.ffn2_expert_in_scale_key = moe_ffn2_in_scale_keys
if self.ep_size > 1:
expert_id_offset = expert_id_offset + self.ep_rank * self.num_local_experts
self.compute_method = CutlassFusedMoeMethod()
self.expert_id_offset = expert_id_offset
self.moe_compute_params = MoEComputeParams()
self.moe_compute_params.global_num_experts = self.num_experts
self.moe_compute_params.top_k = top_k
self.moe_compute_params.hidden_size = self.hidden_size
self.moe_compute_params.num_local_experts = self.num_local_experts
self.moe_compute_params.moe_quant_type = self.moe_quant_type
self.moe_compute_params.moe_intermediate_size = self.moe_intermediate_size
self.moe_compute_params.ep_size = self.ep_size
self.moe_compute_params.tp_size = self.tp_size
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
else:
# now, no quant method(w_fp16 a_fp16) can't get from quant_config, we will optimize it in future
from .fused_moe_cutlass_backend import CutlassMoEMethod
self.quant_method = CutlassMoEMethod(None)
def load_gate_state_dict(self, state_dict):
if self.ep_size > 1:
self.quant_method.init_ep(self)
logger.info(
f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
{top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
, ep_size={self.ep_size}, \
tp_size={self.tp_size}.")
def load_experts_weight(self, state_dict: dict,
ffn1_expert_weight_key: str,
ffn2_expert_weight_key: str):
"""
load_gate_state_dict function.
Load experts weight from state_dict.
Args:
state_dict (dict): The state_dict of model.
ffn1_expert_weight_key (str): The key of ffn1 expert weight.
ffn2_expert_weight_key (str): The key of ffn2 expert weight.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(self.num_experts):
up_gate_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn1_expert_weight_key.format(j))))
down_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn2_expert_weight_key.format(j))))
return up_gate_proj_weight, down_proj_weight
ffn1_weights = []
ffn2_weights = []
is_ffn_merged = ffn1_expert_weight_key.format(
self.expert_id_offset) in state_dict
if is_ffn_merged:
for i in range(self.num_local_experts):
expert_idx = self.expert_id_offset + i
ffn1_weights.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_key.format(expert_idx))))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
else:
gate_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "gate_proj")
up_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "up_proj")
for j in range(self.num_local_experts):
expert_idx = self.expert_id_offset + j
gate = get_tensor(
state_dict.pop(gate_expert_weight_key.format(expert_idx)))
up = get_tensor(
state_dict.pop(up_expert_weight_key.format(expert_idx)))
ffn1_weights.append(paddle.concat([gate, up], axis=-1))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
return ffn1_weights, ffn2_weights
def load_state_dict(self, state_dict, is_update: bool = False):
def extract_moe_ffn_weights(self, state_dict: dict):
"""
Extract MoE FFN weights from state dict based on weight key mapping.
Args:
state_dict (dict): Model state dictionary containing the weights.
Returns:
tuple: A tuple containing two lists:
- ffn1_weights: List of tensors for first FFN layer weights
- ffn2_weights: List of tensors for second FFN layer weights
Raises:
AssertionError: If required weight keys are missing or number of weights
doesn't match number of local experts.
"""
ffn1_expert_weight_key = self.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = self.weight_key_map.get(
"ffn2_expert_weight_key", None)
assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
ffn1_weights, ffn2_weights = self.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
assert len(
ffn1_weights
) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
assert len(
ffn2_weights
) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
return ffn1_weights, ffn2_weights
def extract_gate_correction_bias(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict.pop(gate_correction_bias_key)).astype("float32")
return gate_correction_bias_tensor
def load_state_dict(self, state_dict):
"""
load_state_dict function.
"""
# gate
if not is_update:
gate_weight_tensor = get_tensor(state_dict.pop(self.gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor)
# gate_correction_bias
self.gate_correction_bias_key = self.weight_key_map.get(
"gate_correction_bias_key", None)
if self.gate_correction_bias_key is not None and self.gate_correction_bias_key in state_dict:
self.moe_use_gate_correction_bias = True
else:
self.moe_use_gate_correction_bias = False
if self.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict.pop(self.gate_correction_bias_key))
gate_correction_bias_tensor = self.extract_gate_correction_bias(
self.gate_correction_bias_key, state_dict)
self.gate_correction_bias = self.create_parameter(
shape=gate_correction_bias_tensor.shape,
dtype="float32",
)
self.gate_correction_bias.set_value(gate_correction_bias_tensor)
gate_weight_key = self.weight_key_map.get("gate_weight_key", None)
assert gate_weight_key is not None, "gate_weight_key should not be None, please check model checkpoints"
gate_weight_tensor = get_tensor(state_dict.pop(gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
if self.fd_config.model_config.is_quantized:
self.quant_method.process_prequanted_weights(self, state_dict)
else:
self.gate_correction_bias = None
self.quant_method.create_weights(self, state_dict)
up_gate_proj_weight, down_proj_weight = self.load_gate_state_dict(
state_dict)
weight1_scale = None
weight2_scale = None
ffn1_in_scale = None
ffn2_in_scale = None
if self.moe_quant_type == "w4a8":
weight1_scale = []
weight2_scale = []
ffn1_in_scale = []
ffn2_in_scale = []
for j in range(self.num_experts):
weight1_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_weight_scale_key.format(
self.layer_idx, j))))
weight2_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_weight_scale_key.format(
self.layer_idx, j))))
ffn1_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_in_scale_key.format(
self.layer_idx, j))))
ffn2_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_in_scale_key.format(
self.layer_idx, j))))
# other weight is with compute_method
# different method may have different way to create weights
self.compute_method.create_weights(self, self.moe_compute_params,
up_gate_proj_weight,
down_proj_weight, None, None,
weight1_scale, weight2_scale,
ffn1_in_scale, ffn2_in_scale)
def forward(self, x, **kwargs):
def forward(self, x: paddle.Tensor):
"""
Defines the forward computation of the moe layer.
@@ -225,13 +225,9 @@ class FusedMoE(nn.Layer):
x (Tensor): Input tensor to the moe layer.
Returns:
Tensor: Output tensor.
Tensor: Output tensor.s
"""
out = self.compute_method.apply(self, self.moe_compute_params, x)
if self.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(out)
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
out = self.quant_method.apply(self, x, gate_out)
return out

View File

@@ -1,126 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.model_executor.layers.moe.moe import MoELayer
class MoeTPDecoerDeepDeepGEMMLayer(MoELayer):
"""
MoeTPDecoerDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
gate_out = paddle.rand(shape=gate_out.shape, dtype=gate_out.dtype)
ffn1_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.moe_intermediate_size * 2,
],
dtype=self._dtype,
)
ffn_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.embed_dim,
],
dtype=self._dtype,
)
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
(
self.gate_correction_bias
if self.moe_config.moe_use_gate_correction_bias
else None
),
self.top_k,
True, # apply_norm_weight
False,
)
permute_input, token_nums_per_expert, permute_indices_per_token = (
fastdeploy.model_executor.ops.gpu.moe_deepgemm_permute(
x, topk_idx, self.num_local_experts, self.max_batch_size
)
)
expected_m = 128
permute_input_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
permute_input, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(permute_input_fp8, scale),
(
self.moe_ffn1_weight,
self.moe_ffn1_weight_scale,
),
ffn1_out,
token_nums_per_expert,
expected_m,
)
act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
ffn1_out, token_nums_per_expert
)
act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
act_out, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(act_out_fp8, scale),
(
self.moe_ffn2_weight,
self.moe_ffn2_weight_scale,
),
ffn_out,
token_nums_per_expert,
expected_m,
)
fused_moe_out = fastdeploy.model_executor.ops.gpu.moe_deepgemm_depermute(
ffn_out, permute_indices_per_token, topk_idx, topk_weights
)[0]
return fused_moe_out
class MoeTPPrefillDeepDeepGEMMLayer(MoELayer):
"""
MoeTPPrefillDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
raise NotImplementedError("Prefill is comming soon...")

View File

@@ -0,0 +1,198 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import triton
import triton.language as tl
@triton.jit
def fused_moe_kernel_paddle(
a_ptr,
b_ptr,
c_ptr,
a_scale_ptr,
b_scale_ptr,
topk_weights_ptr,
sorted_token_ids_ptr,
expert_ids_ptr,
num_tokens_post_padded_ptr,
# Matrix dimensions
N,
K,
num_tokens_post_padded,
num_valid_tokens,
stride_am,
stride_ak,
stride_be,
stride_bk,
stride_bn,
stride_cm,
stride_cn,
stride_asm,
stride_ask,
stride_bse,
stride_bsk,
stride_bsn,
# Block size for block-wise fp8 quantization
group_n: tl.constexpr,
group_k: tl.constexpr,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr,
MUL_ROUTED_WEIGHT: tl.constexpr,
top_k: tl.constexpr,
compute_type_enum: tl.constexpr,
use_fp8_w8a8: tl.constexpr,
use_int8_w8a16: tl.constexpr,
even_Ks: tl.constexpr,
):
"""
Key Parameters:
- A: The input tensor representing tokens with shape (*, K), where '*' can
be any shape representing batches and K is the feature dimension of
each token.
- B: The stacked MOE weight tensor with shape (E, N, K), where E is
the number of experts, K is the input feature dimension, and N is
the output feature dimension.
- C: The output cache tensor with shape (M, topk, N), where M is the
total number of tokens post padding, topk is the number of times
each token is repeated, and N is the output feature dimension.
- sorted_token_ids: A tensor containing the sorted indices of tokens,
repeated topk times and arranged by the expert index they are
assigned to.
- expert_ids: A tensor containing the indices of the expert for each
block. It determines which expert matrix from B should be used for
each block in A.
This kernel performs the multiplication of a token by its corresponding
expert matrix as determined by `expert_ids`. The sorting of
`sorted_token_ids` by expert index and padding ensures divisibility by
BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
multiplication across different blocks processed by the same expert.
"""
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
assert compute_type_enum == 1
compute_type = tl.bfloat16
num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
return
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
token_mask = offs_token < num_valid_tokens
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
offs_k[None, :] * stride_ak)
off_experts = tl.load(expert_ids_ptr + pid_m)
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
offs_bn[None, :] * stride_bn)
if use_int8_w8a16:
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
None, :] * stride_bsn
b_scale = tl.load(b_scale_ptrs)
if use_fp8_w8a8:
if group_k > 0 and group_n > 0:
a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
offs_bsn = offs_bn // group_n
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
else:
# (Zkk): every expert has one activation scale and weight scale.
a_scale = tl.load(a_scale_ptr + off_experts)
b_scale = tl.load(b_scale_ptr + off_experts)
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
if even_Ks:
a = tl.load(
a_ptrs,
mask=token_mask[:, None],
other=0.0,
)
b = tl.load(b_ptrs,
cache_modifier=".cv",
eviction_policy='evict_first')
else:
a = tl.load(
a_ptrs,
mask=token_mask[:, None] &
(offs_k[None, :] < K - k * BLOCK_SIZE_K),
other=0.0,
)
b = tl.load(b_ptrs,
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
other=0.0)
# We accumulate along the K dimension.
if use_int8_w8a16:
accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
elif use_fp8_w8a8:
if group_k > 0 and group_n > 0:
k_start = k * BLOCK_SIZE_K
offs_ks = k_start // group_k
a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
mask=token_mask,
other=0.0)
b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
accumulator += tl.dot(a, b) * a_scale[:,
None] * b_scale[None, :]
else:
accumulator = tl.dot(a, b, acc=accumulator)
else:
accumulator += tl.dot(a, b)
a_ptrs += BLOCK_SIZE_K * stride_ak
b_ptrs += BLOCK_SIZE_K * stride_bk
if MUL_ROUTED_WEIGHT:
moe_weight = tl.load(topk_weights_ptr + offs_token,
mask=token_mask,
other=0)
accumulator = accumulator * moe_weight[:, None]
if use_int8_w8a16:
accumulator = (accumulator * b_scale).to(compute_type)
elif use_fp8_w8a8:
if group_k > 0 and group_n > 0:
accumulator = accumulator.to(compute_type)
else:
accumulator = (accumulator * a_scale * b_scale).to(compute_type)
else:
accumulator = accumulator.to(compute_type)
# Write back the block of the output
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
None, :]
c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
tl.store(c_ptrs, accumulator, mask=c_mask)
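To make the token layout described in the kernel docstring concrete, here is a small NumPy sketch of how top-k routed tokens could be grouped per expert and padded to BLOCK_SIZE_M. align_tokens_by_expert is a hypothetical helper for illustration only, not the actual alignment op used with this kernel.

import numpy as np

def align_tokens_by_expert(topk_ids: np.ndarray, num_experts: int, block_m: int):
    # topk_ids: [num_tokens, top_k] expert assignment per (token, slot) pair.
    num_tokens, top_k = topk_ids.shape
    flat = topk_ids.reshape(-1)                      # row i*top_k + j -> token i, slot j
    order = np.argsort(flat, kind="stable")          # group (token, slot) pairs by expert
    pad_id = num_tokens * top_k                      # == num_valid_tokens, masked out in the kernel
    sorted_token_ids, expert_ids = [], []
    for e in range(num_experts):
        ids = order[flat[order] == e]
        pad = (-len(ids)) % block_m                  # pad each expert group up to BLOCK_SIZE_M
        ids = np.concatenate([ids, np.full(pad, pad_id, dtype=np.int64)])
        sorted_token_ids.append(ids)
        expert_ids += [e] * (len(ids) // block_m)    # one expert id per M-block
    return np.concatenate(sorted_token_ids), np.array(expert_ids)

topk_ids = np.array([[0, 1], [1, 1], [0, 1]])        # 3 tokens, top_k = 2, 2 experts
tokens, experts = align_tokens_by_expert(topk_ids, num_experts=2, block_m=4)
print(tokens)    # [0 4 6 6 1 2 3 5] -> entries equal to 6 are padding, rejected by token_mask
print(experts)   # [0 1] -> block 0 multiplies with expert 0, block 1 with expert 1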

View File

@@ -28,18 +28,19 @@ class RMSNorm(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
hidden_size,
eps=1e-5,
prefix="",
linear_bias=None,
quant_scale=None,
begin_norm_axis=1,
):
"""
Initializes the normalization layer.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
hidden_size (int) : size of hidden state.
@@ -52,7 +53,7 @@ class RMSNorm(nn.Layer):
NotImplementedError: If the specified norm_type is not supported.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.prefix = prefix
self.hidden_size = hidden_size
if len(prefix) == 0:
@@ -66,6 +67,11 @@ class RMSNorm(nn.Layer):
self.quant_scale = quant_scale
self._dtype = self._helper.get_default_dtype()
self._norm_weight_dtype = self._dtype
self.begin_norm_axis = begin_norm_axis
self.quant_round_type = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
self.quant_max_bound = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
self.quant_min_bound = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
self.begin_norm_axis = begin_norm_axis
self.init_weight()
@@ -118,13 +124,13 @@ class RMSNorm(nn.Layer):
norm_weight=self.ln_weight,
norm_bias=None,
epsilon=self.eps,
begin_norm_axis=1,
begin_norm_axis=self.begin_norm_axis,
bias=self.linear_bias,
residual=residual_input,
quant_scale=-1 if self.quant_scale is None else self.quant_scale,
quant_round_type=self.llm_config.quant_config.quant_round_type,
quant_max_bound=self.llm_config.quant_config.quant_max_bound,
quant_min_bound=self.llm_config.quant_config.quant_min_bound,
quant_round_type=self.quant_round_type,
quant_max_bound=self.quant_max_bound,
quant_min_bound=self.quant_min_bound,
)
if residual_input is not None:
return norm_out[0], norm_out[1]
@@ -139,7 +145,7 @@ class LayerNorm(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
hidden_size,
eps=1e-5,
prefix="",
@@ -151,7 +157,7 @@ class LayerNorm(nn.Layer):
Initializes the normalization layer.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
@@ -163,7 +169,7 @@ class LayerNorm(nn.Layer):
NotImplementedError: If the specified norm_type is not supported.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.prefix = prefix
self.hidden_size = hidden_size
if len(prefix) == 0:
@@ -180,6 +186,10 @@ class LayerNorm(nn.Layer):
self._dtype = self._helper.get_default_dtype()
self._norm_weight_dtype = "float32"
self.quant_round_type = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
self.quant_max_bound = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
self.quant_min_bound = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
self.init_weight()
def init_weight(self):
@@ -240,6 +250,7 @@ class LayerNorm(nn.Layer):
The `residual_output` is the result of applying the normalization and possibly other
operations (like linear transformation) on the `residual_input`.
"""
norm_out = self.norm_func(
x,
norm_weight=self.ln_weight,
@@ -249,9 +260,9 @@ class LayerNorm(nn.Layer):
bias=self.linear_bias,
residual=residual_input,
quant_scale=-1,
quant_round_type=self.llm_config.quant_config.quant_round_type,
quant_max_bound=self.llm_config.quant_config.quant_max_bound,
quant_min_bound=self.llm_config.quant_config.quant_min_bound,
quant_round_type=self.quant_round_type,
quant_max_bound=self.quant_max_bound,
quant_min_bound=self.quant_min_bound,
)
if residual_input is not None:
return norm_out[0], norm_out[1]

View File

@@ -19,11 +19,18 @@ from typing import Dict, List, Type
from .quant_base import QuantConfigBase
QUANTIZATION_METHODS: List[str] = [
"wint2",
"wint4",
"wint8",
"weight_only",
"block_wise",
"block_wise_fp8",
"w4afp8",
"w8a8",
"w4a8",
"wfp8afp8",
"mix_quant",
"tensor_wise_fp8",
"kvcache",
]
@@ -34,20 +41,30 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
if quantization not in QUANTIZATION_METHODS:
raise ValueError(f"Invalid quantization method: {quantization}")
from .block_wise import BlockWiseConfig
from .block_wise_fp8 import BlockWiseFP8Config
from .kv_cache import KvCacheQuantConfig
from .mix_quant import MixQuantConfig
from .tensor_wise_fp8 import TensorWiseFP8Config
from .w4a8 import W4A8Config
from .w4afp8 import W4AFP8Config
from .w8a8 import W8A8Config
from .weight_only import WeightOnlyConfig
from .weight_only import WeightOnlyConfig, WINT4Config, WINT8Config
from .wfp8afp8 import WFP8AFP8Config
from .kv_cache import KvCacheQuantConfig
from .wint2 import WINT2Config
method_to_config: Dict[str, Type[QuantConfigBase]] = {
"wint2": WINT2Config,
"wint4": WINT4Config,
"wint8": WINT8Config,
"weight_only": WeightOnlyConfig,
"block_wise": BlockWiseConfig,
"block_wise_fp8": BlockWiseFP8Config,
"w4afp8": W4AFP8Config,
"w8a8": W8A8Config,
"w4a8": W4A8Config,
"wfp8afp8": WFP8AFP8Config,
"kvcache": KvCacheQuantConfig
"tensor_wise_fp8": TensorWiseFP8Config,
"kvcache": KvCacheQuantConfig,
"mix_quant": MixQuantConfig,
}
return method_to_config[quantization]
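A short usage sketch of the registry above, assuming the mapped config classes are importable as shown in the diff; everything besides the method name is illustrative.

cfg_cls = get_quantization_config("wint8")    # -> WINT8Config
quant_cfg = cfg_cls.from_config({})           # WINT8Config needs no extra fields
print(quant_cfg.name())                       # "wint8"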

View File

@@ -18,16 +18,13 @@ from typing import Optional
import paddle
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.model_executor.layers.moe import FusedMoE
from ..utils import per_block_cast_to_fp8
from ..utils import per_block_cast_to_fp8, get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
QUANT_ALIGNMENT_OFFSET = 127
QUANT_BLOCK_SIZE = 128
class BlockWiseConfig(QuantConfigBase):
class BlockWiseFP8Config(QuantConfigBase):
"""
block wise quantization config, only supports fp8 quant and only supports loading weights in BF16 format.
After loading the weights, it will automatically compute quantization sparsity and dynamically perform
@@ -37,41 +34,55 @@ class BlockWiseConfig(QuantConfigBase):
def __init__(self, weight_block_size: list = [-1, -1]) -> None:
super().__init__()
self.weight_block_size = weight_block_size
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
def get_name(self) -> str:
return "block_wise"
def name(self) -> str:
return "block_wise_fp8"
@classmethod
def from_config(cls, config: dict) -> "BlockWiseConfig":
weight_block_size = config["weight_block_size"]
def from_config(cls, config: dict) -> "BlockWiseFP8Config":
weight_block_size = config.get("weight_block_size", [128, 128])
return cls(weight_block_size)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
return BlockWiseLinearMethod(self)
'''
Get quantization method.
'''
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import \
DeepGemmFusedMoeMethod
return DeepGemmFusedMoeMethod(self)
else:
return BlockWiseFP8LinearMethod(self)
class BlockWiseLinearMethod(QuantMethodBase):
class BlockWiseFP8LinearMethod(QuantMethodBase):
"""
block wise quantization method for linear
"""
def __init__(
self,
quant_config: BlockWiseConfig,
quant_config: BlockWiseFP8Config,
) -> None:
super().__init__()
self.quant_config = quant_config
def create_weights(self, layer):
layer.linear_weight_scale = self.create_parameter(
layer.linear_weight_shape.reverse()
layer.linear_weight_scale = layer.create_parameter(
shape=[
(layer.embed_dim + QUANT_ALIGNMENT_OFFSET) // QUANT_BLOCK_SIZE,
(layer.num_heads * layer.head_dim + QUANT_ALIGNMENT_OFFSET) //
QUANT_BLOCK_SIZE,
(layer.output_size + self.quant_config.weight_block_size[0] -
1) // self.quant_config.weight_block_size[0],
(layer.input_size + self.quant_config.weight_block_size[1] - 1)
// self.quant_config.weight_block_size[1],
],
dtype="float32",
is_bias=False,
)
layer.weight_dtype = "float8_e4m3fn"
def process_loaded_weights(self, layer, weights) -> None:
weight_tensor = weights.transpose([1, 0])
@@ -80,15 +91,30 @@ class BlockWiseLinearMethod(QuantMethodBase):
layer.linear_weight.copy_(quanted_weight_tensor, False)
layer.linear_weight_scale.set_value(weight_block_scale_tensor)
def process_prequanted_weights(self, layer, state_dict):
"""
process_prequanted_weights
"""
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
quant_weight = quant_weight.transpose([1, 0]).contiguous()
layer.linear_weight.copy_(quant_weight.view("float8_e4m3fn"), False)
weight_scale = weight_scale.transpose([1, 0])
layer.linear_weight_scale.set_value(weight_scale)
def apply(self, layer, x):
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
x, self.quant_config.weight_block_size[0])
linear_out = paddle.empty(
(x.shape[0], layer.llm_config.model_config.hidden_size),
dtype=paddle.bfloat16)
linear_out = paddle.empty((x.shape[0], layer.output_size),
dtype=paddle.bfloat16)
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
deep_gemm.gemm_fp8_fp8_bf16_nt(
(x, x_scale_tensor),
(layer.linear_weight, layer.linear_weight_scale),
linear_out,
)
if layer.with_bias:
linear_out = paddle.add(linear_out, layer.linear_bias)
return linear_out
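A worked example of the scale-parameter shape built in create_weights above, assuming a hypothetical [output_size, input_size] = [4096, 8192] weight and the default weight_block_size of [128, 128]:

output_size, input_size = 4096, 8192
block_n, block_k = 128, 128
scale_shape = [(output_size + block_n - 1) // block_n,
               (input_size + block_k - 1) // block_k]
print(scale_shape)   # [32, 64] -> one float32 scale per 128x128 block of the fp8 weight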

View File

@@ -13,38 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from paddle import nn
import os
import paddle
from .quant_base import QuantConfigBase, QuantMethodBase
from enum import Enum
from typing import Optional
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.utils import get_tensor
from ..utils import create_and_set_parameter
from .quant_base import QuantConfigBase, QuantMethodBase
class KvCacheQuantzationTypes(str, Enum):
"""
KvCacheQuantzationTypes
"""
INT8 = "int8"
FP8 = "float8_e4m3fn"
INT8_ZP = "int8_zp"
FP8_ZP = "float8_e4m3fn_zp"
class KvCacheQuantConfig(QuantConfigBase):
"""
quantization config for the KV cache
"""
def __init__(self, cachekv_scale_dict) -> None:
def __init__(self, kv_cache_quant_type: str) -> None:
"""
__init__
"""
super().__init__()
self.cachekv_scale_dict = cachekv_scale_dict
self.kv_cache_quant_type = kv_cache_quant_type
def get_name(self) -> str:
try:
self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type)
except ValueError:
raise ValueError(f'Invalid Kvcache type: {kv_cache_quant_type}')
self.has_zero_point = "zp" in kv_cache_quant_type
if self.quant_type == KvCacheQuantzationTypes.INT8 or self.quant_type == KvCacheQuantzationTypes.INT8_ZP:
self.max_bound = 127.0
elif self.quant_type == KvCacheQuantzationTypes.FP8 or self.quant_type == KvCacheQuantzationTypes.FP8_ZP:
self.max_bound = 448.0
else:
raise ValueError(f'Invalid Kvcache type: {kv_cache_quant_type}')
def name(self) -> str:
"""
get_name
"""
return "kvcache"
@classmethod
def from_config(cls, config: dict) -> "KvCacheQuantConfig":
def from_config(cls, kv_cache_quant_type: str) -> "KvCacheQuantConfig":
"""
from_config
"""
cachekv_scale_dict = config["cachekv_scale_dict"]
return cls(cachekv_scale_dict)
return cls(kv_cache_quant_type)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
@@ -66,197 +94,63 @@ class KVCacheMethodBase(QuantMethodBase):
KVCacheMethodBase __init__
"""
super().__init__()
self.quant_config = quant_config
self.cache_quant_config = quant_config
def load_zp(self, layer: nn.Layer):
def load_zp(self, layer: nn.Layer, state_dict):
"""
load_zp
"""
if self.cache_k_zp_name in self.quant_config.cachekv_scale_dict:
cache_k_zp = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_k_zp_name]
),
self.cache_scale_dtype,
)
else:
cache_k_zp = paddle.zeros(
(
[self.kv_num_heads * self.head_dim]
if self.quant_config.is_channel_wise
else [self.kv_num_heads]
),
dtype=self.cache_scale_dtype,
)
if self.cache_v_zp_name in self.quant_config.cachekv_scale_dict:
cache_v_zp = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_v_zp_name]
),
self.cache_scale_dtype,
)
else:
cache_v_zp = paddle.zeros(
(
[self.kv_num_heads * self.head_dim]
if self.quant_config.is_channel_wise
else [self.kv_num_heads]
),
dtype=self.cache_scale_dtype,
)
layer.cache_k_zp.set_value(cache_k_zp)
layer.cache_v_zp.set_value(cache_v_zp)
cache_k_zeropoint = get_tensor(state_dict.pop(self.cache_k_zp_name))
cache_v_zeropoint = get_tensor(state_dict.pop(self.cache_v_zp_name))
def load_scale(self, layer: nn.Layer):
create_and_set_parameter(layer, "cache_k_zp", cache_k_zeropoint)
create_and_set_parameter(layer, "cache_v_zp", cache_v_zeropoint)
def load_scale(self, layer: nn.Layer, state_dict):
"""
load_scale
"""
if self.cache_k_scale_name in self.quant_config.cachekv_scale_dict:
cache_k_scale = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_k_scale_name]
),
self.cache_scale_dtype,
)
cache_k_out_scale = 1.0 / cache_k_scale
else:
raise KeyError(
f"{self.cache_k_scale_name} not found in scale dict")
cache_k_scale_tensor = get_tensor(
state_dict.pop(self.cache_k_scale_name)).cast(
paddle.get_default_dtype()).reshape_([-1])
cache_v_scale_tensor = get_tensor(
state_dict.pop(self.cache_v_scale_name)).cast(
paddle.get_default_dtype()).reshape_([-1])
if self.cache_v_scale_name in self.quant_config.cachekv_scale_dict:
cache_v_scale = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_v_scale_name]
),
self.cache_scale_dtype,
)
cache_v_out_scale = 1.0 / cache_v_scale
else:
raise KeyError(
f"{self.cache_v_scale_name} not found in scale dict")
cache_k_scale = self.cache_quant_config.max_bound / cache_k_scale_tensor
cache_v_scale = self.cache_quant_config.max_bound / cache_v_scale_tensor
cache_k_out_scale = cache_k_scale_tensor / self.cache_quant_config.max_bound
cache_v_out_scale = cache_v_scale_tensor / self.cache_quant_config.max_bound
if self.cache_v_scale_name in self.quant_config.cachekv_scale_dict:
cache_v_scale = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_v_scale_name]
),
self.cache_scale_dtype,
)
cache_v_out_scale = 1.0 / cache_v_scale
else:
raise KeyError(
f"{self.cache_v_scale_name} not found in scale dict")
create_and_set_parameter(layer, "cache_k_scale", cache_k_scale)
create_and_set_parameter(layer, "cache_v_scale", cache_v_scale)
create_and_set_parameter(layer, "cache_k_out_scale", cache_k_out_scale)
create_and_set_parameter(layer, "cache_v_out_scale", cache_v_out_scale)
layer.cache_k_scale.set_value(cache_k_scale)
layer.cache_v_scale.set_value(cache_v_scale)
layer.cache_k_out_scale.set_value(cache_k_out_scale)
layer.cache_v_out_scale.set_value(cache_v_out_scale)
def create_scale(self, layer: nn.Layer):
"""
create_scale
"""
layer.cache_k_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_v_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_k_out_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_v_out_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
def create_zp(self, layer: nn.Layer):
"""
create_zp
"""
layer.cache_k_zp = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_v_zp = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
def create_weights(self, layer: nn.Layer):
def create_weights(self, layer: nn.Layer, state_dict):
"""
create_weights
"""
self.prefix = layer.prefix
self.cache_k_scale_name = layer.prefix + ".cachek_matmul.activation_quanter"
self.cache_v_scale_name = layer.prefix + ".cachev_matmul.activation_quanter"
self.cache_k_zp_name = layer.cache_k_scale_name + ".zero_point"
self.cache_v_zp_name = layer.cache_v_scale_name + ".zero_point"
self.cache_k_scale_name = layer.prefix + ".cachek_matmul.activation_scale"
self.cache_v_scale_name = layer.prefix + ".cachev_matmul.activation_scale"
self.cache_k_zp_name = layer.prefix + ".cachek_matmul.activation_zero_point"
self.cache_v_zp_name = layer.prefix + ".cachev_matmul.activation_zero_point"
layer.cache_k_zp = None
layer.cache_v_zp = None
layer.cache_k_scale = None
layer.cache_v_scale = None
layer.cache_k_out_scale = None
layer.cache_v_out_scale = None
if self.cache_quant_config.quant_type == KvCacheQuantzationTypes.INT8:
setattr(layer, "cache_quant_type_str", "cache_int8")
setattr(layer, "quant_max_bound", 127.0)
setattr(layer, "quant_min_bound", -127.0)
elif self.cache_quant_config.quant_type == KvCacheQuantzationTypes.FP8:
setattr(layer, "cache_quant_type_str", "cache_fp8")
setattr(layer, "quant_max_bound", 448.0)
setattr(layer, "quant_min_bound", -448.0)
else:
raise NotImplementedError(f"{self.cache_quant_config.quant_type} is not implemented")
self._dtype = layer._dtype
if self._dtype != "bfloat16" and self._dtype != "float16" and self._dtype == "float32":
raise ValueError(
f"Just support float32, float16 and \
bfloat16 as default dtype, but received {self._dtype}"
)
self.cache_scale_dtype = (
self._dtype if self.quant_config.use_append_attn else "float32"
)
if not self.quant_config.use_dynamic_cachekv_quant:
if (
self.quant_config.cachekv_dtype == "int8"
or self.quant_config.cachekv_dtype == "int4"
or self.quant_config.cachekv_dtype == "float8_e4m3fn"
):
self.create_scale(layer)
self.load_scale(layer)
if self.quant_config.has_zero_point:
self.create_zp(layer)
self.load_zp(layer)
layer.cache_quant_type_str = self.quant_config.cache_quant_type
self.load_scale(layer, state_dict)
if self.cache_quant_config.has_zero_point:
self.load_zp(layer, state_dict)
def apply(self, layer):
"""
@@ -264,4 +158,3 @@ class KVCacheMethodBase(QuantMethodBase):
"""
raise RuntimeError(
f"{self.__class__.__name__}.apply should not be called.")

View File

@@ -0,0 +1,75 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
from ..attention import Attention
from ..moe import FusedMoE
from . import get_quantization_config
from .quant_base import QuantConfigBase, QuantMethodBase
class MixQuantConfig(QuantConfigBase):
"""
Quantization config for layers that have different quantization methods.
"""
def __init__(
self,
dense_quant_type: str,
moe_quant_type: str,
kv_cache_quant_type: str = None,
image_moe_quant_type: str = None,
) -> None:
super().__init__()
self.dense_quant_type = dense_quant_type
self.moe_quant_type = moe_quant_type
self.kv_cache_quant_type = kv_cache_quant_type
if image_moe_quant_type is None:
self.image_moe_quant_type = moe_quant_type
else:
self.image_moe_quant_type = image_moe_quant_type
self.quant_max_bound = 0
self.quant_min_bound = 0
self.quant_round_type = 0
def name(self) -> str:
return "mix_quant"
@classmethod
def from_config(cls, config: dict) -> "MixQuantConfig":
return cls(config['dense_quant_type'], config['moe_quant_type'],
config.get('kv_cache_quant_type', None),
config.get('image_moe_quant_type', None))
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if isinstance(layer, FusedMoE):
if layer.moe_tag == "Image":
return get_quantization_config(
self.image_moe_quant_type).from_config(
{}).get_quant_method(layer)
else:
return get_quantization_config(
self.moe_quant_type).from_config(
{}).get_quant_method(layer)
elif isinstance(layer, Attention):
if self.kv_cache_quant_type is not None:
return (get_quantization_config("kvcache").from_config(
self.kv_cache_quant_type).get_quant_method(layer))
else:
return None
else:
return get_quantization_config(self.dense_quant_type).from_config(
{}).get_quant_method(layer)
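An illustrative config dict of the shape MixQuantConfig.from_config expects; the particular method names are placeholders picked from the registry above.

mix_cfg = MixQuantConfig.from_config({
    "dense_quant_type": "block_wise_fp8",
    "moe_quant_type": "wint4",
    "kv_cache_quant_type": "int8",    # optional; omit or set None to disable KV-cache quant
})
# get_quant_method then dispatches per layer type:
#   FusedMoE  -> the "wint4" method (or image_moe_quant_type for MoE layers tagged "Image")
#   Attention -> KvCacheQuantConfig built from "int8"
#   others    -> the "block_wise_fp8" method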

View File

@@ -0,0 +1,22 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .cutlass_scaled_mm import cutlass_scaled_mm
from .scaled_fp8_quant import scaled_fp8_quant
__all__ = [
"cutlass_scaled_mm",
"scaled_fp8_quant",
]

View File

@@ -0,0 +1,126 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
import fastdeploy
def cutlass_scaled_mm(a: paddle.Tensor,
b: paddle.Tensor,
scale_a: paddle.Tensor,
scale_b: paddle.Tensor,
out_dtype: paddle.dtype,
bias: Optional[paddle.Tensor] = None) -> paddle.Tensor:
"""
`cutlass_scaled_mm` implements a fused version of
`output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
where scale_a * a and scale_b * b are implemented using numpy-style
broadcasting.
In order to support blockwise scaling like found in DeepSeek V3 we also
support extended "group" broadcast rules. We extend the numpy-style
broadcasting rules with the following rule:
"if the extent of a dimension in the source shape is between 1 and
corresponding extent in the target shape we repeat each element along
that dimension src_shape[dim] // target_shape[dim] times consecutively"
example if we have:
a = [[1, 2], and target_shape = (2, 4)
[3, 4]]
then we would expand a to:
a = [[1, 1, 2, 2],
[3, 3, 4, 4]]
currently we only support the case:
scale_a.shape * [1, 128] == a.shape
scale_b.shape * [128, 128] == b.shape
"""
assert (out_dtype == paddle.bfloat16 or out_dtype == paddle.float16)
assert bias is None or bias.shape[0] == b.shape[
0] and bias.dtype == out_dtype
# Ensure input tensors have valid shapes
# assert a.numel() > 0, "Input tensor 'a' must not be empty"
# assert b.numel() > 0, "Input tensor 'b' must not be empty"
# assert scale_a.numel() > 0, "Scale tensor 'scale_a' must not be empty"
# assert scale_b.numel() > 0, "Scale tensor 'scale_b' must not be empty"
m = a.shape[0]
n = b.shape[0]
cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
assert cutlass_compatible_b
out = paddle.empty([m, n], dtype=out_dtype)
fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(
out, a, b, scale_a, scale_b, bias)
return out
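A hypothetical NumPy reference for the group broadcast rule described in the docstring above; expand_group_scale is not part of the diff and only illustrates how a scale tensor is repeated up to the target shape.

import numpy as np

def expand_group_scale(scale: np.ndarray, target_shape) -> np.ndarray:
    out = scale
    for dim, (src, tgt) in enumerate(zip(scale.shape, target_shape)):
        out = np.repeat(out, tgt // src, axis=dim)   # repeat each element tgt // src times
    return out

a = np.array([[1, 2], [3, 4]])
print(expand_group_scale(a, (2, 4)))
# [[1 1 2 2]
#  [3 3 4 4]]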
def scaled_fp8_quant(
input: paddle.Tensor,
scale: Optional[paddle.Tensor] = None,
num_token_padding: Optional[int] = None,
scale_ub: float = 0,
use_per_token_if_dynamic: bool = False,
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize input tensor to FP8 and return quantized tensor and scale.
This function supports both static and dynamic quantization: If you
provide the scale, it will use static scaling and if you omit it,
the scale will be determined dynamically. The function also allows
optional padding of the output tensors for downstream kernels that
will benefit from padding.
Args:
input: The input tensor to be quantized to FP8
scale: Optional scaling factor for the FP8 quantization
scale_ub: Optional upper bound for scaling factor in dynamic
per token case
num_token_padding: If specified, pad the first dimension
of the output to at least this value.
use_per_token_if_dynamic: Whether to do per_tensor or per_token
in the dynamic quantization case.
Returns:
tuple[paddle.Tensor, paddle.Tensor]: The output tensor in FP8 and
scaling factor.
"""
# This code assumes batch_dim and num_tokens are flattened
assert (input.ndim == 2)
shape = input.shape
if num_token_padding:
shape = (max(num_token_padding, input.shape[0]), shape[1])
output = paddle.empty(shape, dtype=paddle.float8_e4m3fn)
if scale is None:
if use_per_token_if_dynamic:
scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_per_token_scaled_fp8_quant
dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
else:
scale = paddle.zeros([1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_scaled_fp8_quant
dynamic_scaled_fp8_quant(output, input, scale)
else:
# num_token_padding not implemented for this case
# assert (scale.numel() == 1 or num_token_padding is None)
from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
static_scaled_fp8_quant(output, input, scale)
return output, scale
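A usage sketch for scaled_fp8_quant, assuming a CUDA build of FastDeploy that exposes the custom fp8 quant ops; the shapes, dtype, and the 0.02 static scale are placeholders.

import paddle

x = paddle.randn([8, 4096]).cast("bfloat16")

q, s = scaled_fp8_quant(x)                                         # dynamic per-tensor scale, s has shape [1]
q_tok, s_tok = scaled_fp8_quant(x, use_per_token_if_dynamic=True)  # dynamic per-token scales, shape [8, 1]

static_scale = paddle.to_tensor([0.02], dtype="float32")
q_static, _ = scaled_fp8_quant(x, scale=static_scale)              # static path: the given scale is reused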

View File

@@ -0,0 +1,75 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
def scaled_fp8_quant(
input: paddle.Tensor,
scale: Optional[paddle.Tensor] = None,
num_token_padding: Optional[int] = None,
scale_ub: float = 0,
use_per_token_if_dynamic: bool = False,
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize input tensor to FP8 and return quantized tensor and scale.
This function supports both static and dynamic quantization: If you
provide the scale, it will use static scaling and if you omit it,
the scale will be determined dynamically. The function also allows
optional padding of the output tensors for downstream kernels that
will benefit from padding.
Args:
input: The input tensor to be quantized to FP8
scale: Optional scaling factor for the FP8 quantization
scale_ub: Optional upper bound for scaling factor in dynamic
per token case
num_token_padding: If specified, pad the first dimension
of the output to at least this value.
use_per_token_if_dynamic: Whether to do per_tensor or per_token
in the dynamic quantization case.
Returns:
tuple[paddle.Tensor, paddle.Tensor]: The output tensor in FP8 and
scaling factor.
"""
# This code assumes batch_dim and num_tokens are flattened
assert (input.ndim == 2)
shape = input.shape
if num_token_padding:
shape = (max(num_token_padding, input.shape[0]), shape[1])
output = paddle.empty(shape, dtype=paddle.float8_e4m3fn)
if scale is None:
if use_per_token_if_dynamic:
scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_per_token_scaled_fp8_quant
dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
else:
scale = paddle.zeros([1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_scaled_fp8_quant
dynamic_scaled_fp8_quant(output, input, scale)
else:
# num_token_padding not implemented for this case
# assert (scale.numel() == 1 or num_token_padding is None)
from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
static_scaled_fp8_quant(output, input, scale)
return output, scale

View File

@@ -47,12 +47,9 @@ class QuantConfigBase(ABC):
def __init__(self):
super().__init__()
self.quant_round_type = None
self.quant_max_bound = None
self.quant_min_bound = None
@abstractmethod
def get_name(self) -> str:
def name(self) -> str:
"""Name of the quantization method."""
raise NotImplementedError

View File

@@ -0,0 +1,135 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
from fastdeploy.model_executor.layers.moe import FusedMoE
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
class TensorWiseFP8Config(QuantConfigBase):
"""
Quantization config for weight and activation with FP8.
"""
def __init__(self) -> None:
"""
Nothing else to do!
"""
super().__init__()
def name(self) -> str:
"""
Nothing else to do!
"""
return "tensor_wise_fp8"
@classmethod
def from_config(cls, config: dict) -> "TensorWiseFP8Config":
"""
Nothing else to do!
"""
return cls()
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
return method according to this config!
"""
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \
TensorWiseFP8MoEMethod
return TensorWiseFP8MoEMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
class TensorWiseFP8LinearMethod(QuantMethodBase):
"""
Weight and activation quantization method for linear layer with per tensor FP8
"""
def __init__(
self,
quant_config: TensorWiseFP8Config,
) -> None:
"""
Nothing special to do!
"""
super().__init__()
self.quant_config = quant_config
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
self.weight_dtype = "float8_e4m3fn"
def create_weights(self, layer):
"""
Nothing to do!
"""
pass
def process_prequanted_weights(self, layer, state_dict) -> None:
"""
Process pre-quantized weights before applying them to the model
Args:
layer: The layer that owns the weights
state_dict: The checkpoint state dict holding the quantized weight, its weight scale, and the activation scale
"""
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
act_scale = get_tensor(state_dict.pop(layer.act_scale_key))
quant_weight = quant_weight.transpose([1, 0]).contiguous()
layer.linear_weight.copy_(quant_weight.view("float8_e4m3fn"), False)
self.act_scale = act_scale.item()
self.total_scale = (act_scale * weight_scale).item()
def process_loaded_weights(self, layer, weights, state_dict) -> None:
"""
Read fp8 weight, act scale, weight scale
"""
pass
def apply(self, layer, x):
"""
compute!
"""
from fastdeploy.model_executor.ops.gpu import \
cutlass_fp8_fp8_half_gemm_fused
from ..utils import create_hadamard_matrix_map
hadamard_matrix = create_hadamard_matrix_map[x.shape[-1]]
new_x = paddle.matmul(x.cast("float32"), hadamard_matrix)
fp8_x = new_x / self.act_scale
fp8_x = fp8_x.astype("float8_e4m3fn")
linear_out = cutlass_fp8_fp8_half_gemm_fused(
fp8_x,
layer.linear_weight,
transpose_x=False,
transpose_y=True,
bias=None,
scale=self.total_scale,
output_dtype="bfloat16",
activation_type="identity")
return linear_out
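A brief reference on the scale bookkeeping in the two methods above; the claim about the checkpoint keeping the weight in the Hadamard-rotated domain is an assumption, not something stated in the diff.

# process_prequanted_weights stores total_scale = act_scale * weight_scale, so apply computes:
#   x_q = (x @ H) / act_scale                      # FP8 activation after the Hadamard rotation
#   y   = (x_q @ W_q.T) * total_scale              # fused GEMM epilogue scale
#     ≈ (x @ H) @ (weight_scale * W_q).T           # act_scale cancels; weight_scale dequantizes W_q
# which only matches x @ W.T if the stored FP8 weight W_q was itself produced
# in the Hadamard-rotated domain (assumed here).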

View File

@@ -0,0 +1,42 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
from ..moe import FusedMoE
from .quant_base import QuantConfigBase, QuantMethodBase
class W4A8Config(QuantConfigBase):
"""
quantization config for weight 4bits and activation 8bits
"""
def __init__(self) -> None:
super().__init__()
def name(self) -> str:
return "w4a8"
@classmethod
def from_config(cls, config: dict) -> "W4A8Config":
return cls()
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import CutlassW4A8MoEMethod
return CutlassW4A8MoEMethod(self)
else:
raise ValueError(f"Unsupported layer type {type(layer)} for w4a8")

View File

@@ -23,16 +23,21 @@ from .quant_base import QuantConfigBase, QuantMethodBase
QUANT_SCALING_FACTOR = 448
class W4AFP8Config(QuantConfigBase):
"""
quantization config for weight 4bits and activation fp8
"""
def __init__(self, weight_scale_dict, act_scale_dict) -> None:
super().__init__()
self.weight_scale_dict = weight_scale_dict
self.act_scale_dict = act_scale_dict
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
def get_name(self) -> str:
def name(self) -> str:
return "w4afp8"
@classmethod
@@ -49,6 +54,7 @@ class W4AFP8LinearMethod(QuantMethodBase):
"""
W4 AFP8 quant method for linear
"""
def __init__(
self,
quant_config: W4AFP8Config,
@@ -57,6 +63,9 @@ class W4AFP8LinearMethod(QuantMethodBase):
self.quant_config = quant_config
def create_weights(self, layer):
layer.linear_weight_shape.reverse()
layer.linear_weight_shape[0] //= 2
layer.weight_dtype = "int8"
pass
def process_loaded_weights(self, layer, weights) -> None:
@@ -78,11 +87,11 @@ class W4AFP8LinearMethod(QuantMethodBase):
layer.linear_weight_scale,
zero_points=None,
bias=layer.linear_bias if layer.add_bias else None,
out_scale=self.quant_config.weight_scale_dict.get(
layer.prefix + ".weight_quanter") /
(self.quant_config.act_scale_dict.get(layer.prefix +
".activation_quanter") *
QUANT_SCALING_FACTOR * QUANT_SCALING_FACTOR),
out_scale=self.quant_config.weight_scale_dict.get(layer.prefix +
".weight_scale")
/ (self.quant_config.act_scale_dict.get(layer.prefix +
".activation_scale") *
QUANT_SCALING_FACTOR * QUANT_SCALING_FACTOR),
groupsize=0,
out_dtype=layer._dtype,
)

View File

@@ -16,11 +16,12 @@
from typing import Optional
import paddle
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
import fastdeploy
from fastdeploy.platforms.utils import convert_to_npu_dequant_scale
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -29,14 +30,18 @@ class W8A8Config(QuantConfigBase):
quantization config for weight 8bits and activation 8bits
"""
def __init__(self, weight_scale_dict, act_scale_dict,
use_gemm_dequant) -> None:
def __init__(self, weight_scale_dict, act_scale_dict, use_gemm_dequant,
use_smooth_quant) -> None:
super().__init__()
self.weight_scale_dict = weight_scale_dict
self.act_scale_dict = act_scale_dict
self.use_gemm_dequant = use_gemm_dequant
self.use_smooth_quant = use_smooth_quant
self.quant_max_bound = 127
self.quant_min_bound = -127
self.quant_round_type = 0
def get_name(self) -> str:
def name(self) -> str:
return "w8a8"
@classmethod
@@ -61,12 +66,17 @@ class W8A8LinearMethod(QuantMethodBase):
) -> None:
super().__init__()
self.quant_config = quant_config
self.smooth_quant_method = SmoothQuantLinearMethod(quant_config)
def create_weights(self, layer):
weight_scale = self.quant_config.weight_scale_dict.get(
layer.prefix + ".weight_quanter")
layer.linear_weight_shape.reverse()
layer.weight_dtype = "int8"
if self.quant_config.use_smooth_quant:
self.smooth_quant_method.create_weights(layer)
weight_scale = self.quant_config.weight_scale_dict.get(layer.prefix +
".weight_scale")
in_scale = self.quant_config.act_scale_dict.get(layer.prefix +
".activation_quanter")
".activation_scale")
self.skip_quant = False
if weight_scale is None or in_scale is None:
self.skip_quant = True
@@ -86,13 +96,15 @@ class W8A8LinearMethod(QuantMethodBase):
convert_to_npu_dequant_scale(linear_out_scale))
def process_loaded_weights(self, layer, weights) -> None:
if self.quant_config.use_smooth_quant:
self.smooth_quant_method.process_loaded_weights(layer, weights)
if self.skip_quant:
logger.debug(f"{layer.prefix} skip quant")
weight_tensor = weights.cast(layer._dtype)
layer.linear_weight.set_value(weight_tensor)
else:
weight_tensor = weights.transpose([1, 0])
weight_tensor = paddle.cast(weight_tensor, layer.weight_dtype)
weight_tensor = paddle.cast(weight_tensor, "int8")
layer.linear_weight.set_value(weight_tensor)
def apply(self, layer, x):
@@ -107,3 +119,53 @@ class W8A8LinearMethod(QuantMethodBase):
linear_out = fastdeploy.model_executor.ops.gpu.dequant_int8(
linear_out, layer.linear_out_scale, layer._dtype)
return linear_out
class SmoothQuantLinearMethod(QuantMethodBase):
"""
SmoothQuant Method
"""
def __init__(
self,
quant_config: QuantConfigBase,
) -> None:
super().__init__()
self.quant_config = quant_config
def create_weights(self, layer):
linear_shift_shape = [layer.output_size]
linear_smooth_shape = [layer.output_size]
layer.linear_shift = layer.create_parameter(
shape=linear_shift_shape,
dtype=layer._dtype,
is_bias=False,
)
layer.linear_smooth = layer.create_parameter(
shape=linear_smooth_shape,
dtype=layer._dtype,
is_bias=False,
)
def process_loaded_weights(self, layer, weights) -> None:
if layer.shift_key in layer.state_dict:
shift_tensor = get_tensor(layer.state_dict.pop(
layer.shift_key)).astype(paddle.get_default_dtype())
else:
shift_tensor = paddle.zeros(
shape=layer.linear_shift_shape,
dtype=paddle.get_default_dtype(),
)
layer.linear_shift.set_value(shift_tensor)
if layer.smooth_key in layer.state_dict:
smooth_tensor = get_tensor(layer.state_dict.pop(
layer.smooth_key)).astype(paddle.get_default_dtype())
else:
smooth_tensor = paddle.ones(
shape=[layer.linear_smooth_shape],
dtype=paddle.get_default_dtype(),
)
layer.linear_smooth.set_value(smooth_tensor)
def apply(self, layer, x):
pass

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from abc import abstractmethod
from typing import Optional
@@ -21,6 +22,8 @@ from paddle.nn.quant import weight_only_linear, weight_quantize
from fastdeploy.platforms import current_platform
from ..moe import FusedMoE
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -28,34 +31,92 @@ class WeightOnlyConfig(QuantConfigBase):
"""
Quantization config for weight only
Args:
weight_only_linear_arch: The architecture of weight only linear layer
algo: The quant algorithm("weight_only_int8" or "weight_only_int4") used for weight only linear layer
"""
def __init__(
self,
weight_only_linear_arch: int,
algo: str,
) -> None:
super().__init__()
self.weight_only_linear_arch = weight_only_linear_arch
self.algo = algo
# arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70,
# if you do not assign arch, we will get arch from your device, default: None.
self.weight_only_linear_arch = os.getenv(
"FLAGS_weight_only_linear_arch")
if self.weight_only_linear_arch is not None:
self.weight_only_linear_arch = int(self.weight_only_linear_arch)
self.quant_max_bound = 0
self.quant_min_bound = 0
self.quant_round_type = 0
def get_name(self) -> str:
def name(self) -> str:
return "weight_only"
@classmethod
def from_config(cls, config: dict) -> "WeightOnlyConfig":
weight_only_linear_arch = config["weight_only_linear_arch"]
algo = config["algo"]
return cls(weight_only_linear_arch, algo)
return cls(algo)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if current_platform.is_xpu():
from fastdeploy.model_executor.layers.backends import XPUWeightOnlyLinearMethod
return XPUWeightOnlyLinearMethod(self)
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyLinearMethod, XPUWeightOnlyMoEMethod)
if isinstance(layer, FusedMoE):
return XPUWeightOnlyMoEMethod(self)
else:
return XPUWeightOnlyLinearMethod(self)
else:
return GPUWeightOnlyLinearMethod(self)
if isinstance(layer, FusedMoE):
if layer.use_method == "cutlass":
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import \
CutlassWeightOnlyMoEMethod
return CutlassWeightOnlyMoEMethod(self)
elif layer.use_method == "triton":
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \
TritonWeightOnlyMoEMethod
return TritonWeightOnlyMoEMethod(self)
elif layer.use_method == "marlin":
from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import \
MarlinWeightOnlyMoEMethod
return MarlinWeightOnlyMoEMethod(self)
else:
raise ValueError(
f"Unsupported MOE backend {layer.use_method}")
else:
return GPUWeightOnlyLinearMethod(self)
class WINT8Config(WeightOnlyConfig):
"""
weight only int8 config
"""
def __init__(self, ) -> None:
super().__init__("weight_only_int8")
@classmethod
def from_config(cls, config: dict) -> "WINT8Config":
return cls()
def name(self) -> str:
return "wint8"
class WINT4Config(WeightOnlyConfig):
"""
weight only int4 config
"""
def __init__(self, ) -> None:
super().__init__("weight_only_int4")
@classmethod
def from_config(cls, config: dict) -> "WINT4Config":
return cls()
def name(self) -> str:
return "wint4"
class WeightOnlyLinearMethod(QuantMethodBase):
@@ -71,12 +132,17 @@ class WeightOnlyLinearMethod(QuantMethodBase):
self.quant_config = quant_config
def create_weights(self, layer):
weight_only_scale_name = layer.prefix + ".weight_only_scale"
layer.linear_weight_shape.reverse()
if self.quant_config.name() == "wint4":
layer.linear_weight_shape[0] //= 2
layer.weight_dtype = "int8"
linear_weight_scale_shape = [layer.embed_dim]
if hasattr(layer, "linear_weight_shape"):
if isinstance(layer.linear_weight_shape, list):
layer_weight_shape = layer.linear_weight_shape
linear_weight_scale_shape = layer_weight_shape[:1]
if self.quant_config.name() == "wint4":
linear_weight_scale_shape[0] *= 2
layer.linear_weight_scale = layer.create_parameter(
shape=linear_weight_scale_shape,
@@ -94,7 +160,8 @@ class WeightOnlyLinearMethod(QuantMethodBase):
weight=layer.linear_weight,
bias=layer.linear_bias if layer.add_bias else None,
weight_scale=layer.linear_weight_scale,
weight_dtype=layer.weight_dtype,
weight_dtype="int8"
if self.quant_config.name() == "wint8" else "int4",
arch=self.quant_config.weight_only_linear_arch,
)
return linear_out
@@ -113,6 +180,20 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
) -> None:
super().__init__(quant_config)
def process_prequanted_weights(self, layer, state_dict) -> None:
"""
Process pre-quantized weights before applying them to the model
Args:
layer: The layer that owns the weights
state_dict: The checkpoint state dict holding the quantized weight and its scale
"""
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
layer.linear_weight.set_value(quant_weight)
layer.linear_weight_scale.set_value(
weight_scale.astype(paddle.get_default_dtype()))
def process_loaded_weights(self, layer, weight) -> None:
quanted_weight_tensor, weight_scale_tensor = weight_quantize(
weight,

View File

@@ -17,10 +17,10 @@ from typing import Optional
import paddle
import fastdeploy
from fastdeploy.platforms.utils import convert_to_npu_dequant_scale
from .quant_base import QuantConfigBase, QuantMethodBase
from fastdeploy.model_executor.layers.quantization.ops import (
cutlass_scaled_mm, scaled_fp8_quant)
from fastdeploy.model_executor.layers.quantization.quant_base import (
QuantConfigBase, QuantMethodBase)
class WFP8AFP8Config(QuantConfigBase):
@@ -32,17 +32,26 @@ class WFP8AFP8Config(QuantConfigBase):
super().__init__()
self.weight_scale_dict = weight_scale_dict
self.act_scale_dict = act_scale_dict
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
def get_name(self) -> str:
def name(self) -> str:
"""
"""
return "wfp8afp8"
@classmethod
def from_config(cls, config: dict) -> "WFP8AFP8Config":
weight_scale_dict = config["weight_scale_dict"]
act_scale_dict = config["act_scale_dict"]
"""
"""
weight_scale_dict = config.get("weight_scale_dict", None)
act_scale_dict = config.get("act_scale_dict", None)
return cls(weight_scale_dict, act_scale_dict)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
"""
return WFP8AFP8LinearMethod(self)
@@ -59,58 +68,49 @@ class WFP8AFP8LinearMethod(QuantMethodBase):
self.quant_config = quant_config
def create_weights(self, layer):
"""
"""
layer.linear_weight_shape.reverse()
layer.weight_dtype = "float8_e4m3fn"
# TODO(YuanRisheng): set weight logic should be moved to process_loaded_weights func
weight_scale = self.quant_config.weight_scale_dict.get(
layer.prefix + ".weight_quanter")
in_scale = self.quant_config.act_scale_dict.get(layer.prefix +
".activation_quanter")
self.skip_quant = False
# we will skip quant if weight_scale is not found or in_scale is not found
if weight_scale is None or in_scale is None:
self.skip_quant = True
else:
max_range = 448.0
layer.scalar_scale_name = layer.prefix + ".scalar_weight_quanter"
layer.scalar_scale = layer.create_parameter(
shape=([1]),
dtype="float32",
)
layer.scalar_scale.set_value(
paddle.to_tensor([1.0 / (max_range * in_scale)],
dtype="float32"))
linear_out_scale = paddle.to_tensor(weight_scale /
max_range).astype("float32")
layer.linear_out_scale = layer.create_parameter(
shape=[layer.embed_dim],
dtype="float32",
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
layer.linear_out_scale.set_value(
convert_to_npu_dequant_scale(linear_out_scale))
layer.linear_weight_scale = layer.create_parameter(
shape=[1],
dtype="float32",
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
def process_loaded_weights(self, layer, weights) -> None:
# TODO(YuanRisheng): We should abstract the skip_quant logic to adapt to more quant methods
"""
"""
if self.skip_quant:
weight_tensor = weights.cast(layer._dtype)
layer.linear_weight.set_value(weight_tensor)
return
weight_tensor = weights.transpose([1, 0])
weight_tensor = paddle.cast(weight_tensor, self.weight_dtype)
self.linear_weight.copy_(weight_tensor, False)
if weights.dtype != paddle.float8_e4m3fn:
self.use_per_token_if_dynamic = True
weight_tensor = weights.transpose([1, 0]).contiguous()
qweight, weight_scale = scaled_fp8_quant(
weight_tensor,
use_per_token_if_dynamic=False,
)
layer.linear_weight.copy_(qweight, False)
layer.linear_weight_scale.set_value(weight_scale)
def apply(self, layer, x):
"""
"""
if self.skip_quant:
linear_out = paddle.matmul(x, layer.linear_weight, False, True)
return linear_out
linear_out = fastdeploy.model_executor.ops.gpu.per_channel_fp8_fp8_half_gemm_fused(
x,
layer.linear_weight,
bias=layer.linear_bias if layer.add_bias else None,
scalar_scale=layer.scalar_scale,
channel_scale=layer.linear_out_scale,
transpose_x=False,
transpose_y=True,
output_dtype=layer._dtype,
)
if self.use_per_token_if_dynamic:
out_type = x.dtype
a_q, a_scales = scaled_fp8_quant(
x, use_per_token_if_dynamic=self.use_per_token_if_dynamic)
linear_out = cutlass_scaled_mm(a_q, layer.linear_weight, a_scales,
layer.linear_weight_scale, out_type,
layer.linear_bias)
else:
raise NotImplementedError
return linear_out

View File

@@ -0,0 +1,142 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
from ..moe import FusedMoE
from . import get_quantization_config
from .quant_base import QuantConfigBase, QuantMethodBase
class WINT2Config(QuantConfigBase):
"""
Quantization config for wint8 linear and w4w2 MoE.
"""
def __init__(
self,
dense_quant_type: str,
dense_quant_granularity: str,
moe_quant_type: str,
moe_w4_quant_type: str,
moe_w4_quant_granularity: str,
moe_w4_quant_start_layer: int,
moe_w4_quant_end_layer: int,
moe_w2_quant_type: str,
moe_w2_quant_granularity: str,
moe_w2_quant_group_size: int,
moe_w2_quant_start_layer: int,
moe_w2_quant_end_layer: int,
) -> None:
super().__init__()
self.quant_max_bound = 0
self.quant_min_bound = 0
self.quant_round_type = 0
# wint2 quantization config
self.dense_quant_type = dense_quant_type
self.dense_quant_granularity = dense_quant_granularity
self.moe_quant_type = moe_quant_type
self.moe_w4_quant_type = moe_w4_quant_type
self.moe_w4_quant_granularity = moe_w4_quant_granularity
self.moe_w4_quant_start_layer = moe_w4_quant_start_layer
self.moe_w4_quant_end_layer = moe_w4_quant_end_layer
self.moe_w2_quant_type = moe_w2_quant_type
self.moe_w2_quant_granularity = moe_w2_quant_granularity
self.moe_w2_quant_group_size = moe_w2_quant_group_size
self.moe_w2_quant_start_layer = moe_w2_quant_start_layer
self.moe_w2_quant_end_layer = moe_w2_quant_end_layer
def name(self) -> str:
"""
Get the name of the quantization configuration.
Returns:
str: The name of the quantization configuration.
"""
return "wint2"
@classmethod
def from_config(cls, config: dict) -> "WINT2Config":
"""
Create a new instance of `WINT2Config` using the provided configuration dictionary.
Args:
config (dict): A dictionary containing the configuration parameters for the new instance.
Returns:
WINT2Config: The newly created instance of `WINT2Config`.
"""
dense_quant_type = config.get("dense_quant_config", "wint8")
dense_quant_granularity = config.get("dense_quant_granularity",
"per_channel")
moe_quant_config = config.get("moe_quant_config", {})
moe_quant_type = moe_quant_config.get("quant_type", "w4w2")
moe_w4_quant_config = moe_quant_config.get("moe_w4_quant_config", {})
moe_w4_quant_type = moe_w4_quant_config.get("quant_type",
"wint4")
moe_w4_quant_granularity = moe_w4_quant_config.get(
"quant_granularity", "per_channel")
moe_w4_quant_start_layer = moe_w4_quant_config.get(
"quant_start_layer", 0)
moe_w4_quant_end_layer = moe_w4_quant_config.get("quant_end_layer", 6)
moe_w2_quant_config = moe_quant_config.get("moe_w2_quant_config", {})
moe_w2_quant_type = moe_w2_quant_config.get("quant_type", "wint2")
moe_w2_quant_granularity = moe_w2_quant_config.get(
"quant_granularity", "pp_acc")
moe_w2_quant_group_size = moe_w2_quant_config.get(
"quant_group_size", 0)
moe_w2_quant_start_layer = moe_w2_quant_config.get(
"quant_start_layer", 0)
moe_w2_quant_end_layer = moe_w2_quant_config.get("quant_end_layer", 0)
return cls(
dense_quant_type,
dense_quant_granularity,
moe_quant_type,
moe_w4_quant_type,
moe_w4_quant_granularity,
moe_w4_quant_start_layer,
moe_w4_quant_end_layer,
moe_w2_quant_type,
moe_w2_quant_granularity,
moe_w2_quant_group_size,
moe_w2_quant_start_layer,
moe_w2_quant_end_layer,
)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
Get the quantization method associated with the given layer based on the current quantization configuration.
Args:
layer (Layer): The layer for which the quantization method should be retrieved.
Returns:
QuantMethodBase: The quantization method associated with the given layer.
"""
if isinstance(layer, FusedMoE):
if layer.layer_idx <= self.moe_w4_quant_end_layer:
return get_quantization_config(
self.moe_w4_quant_type).from_config(
{}).get_quant_method(layer)
else:
from fastdeploy.model_executor.layers.moe.fused_moe_wint2_backend import \
TritonWint2FusedMoeMethod
return TritonWint2FusedMoeMethod(self)
else:
return get_quantization_config(self.dense_quant_type).from_config(
{}).get_quant_method(layer)
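
For orientation, here is a hypothetical example of the nested quantization config dict that `WINT2Config.from_config` above expects. The key names mirror the parser; the concrete values (group size, layer ranges) are made up for illustration.

```python
# Hypothetical wint2 quantization config; keys mirror WINT2Config.from_config above,
# values are illustrative only.
wint2_cfg = {
    "dense_quant_config": "wint8",
    "dense_quant_granularity": "per_channel",
    "moe_quant_config": {
        "quant_type": "w4w2",
        "moe_w4_quant_config": {
            "quant_type": "wint4",
            "quant_granularity": "per_channel",
            "quant_start_layer": 0,
            "quant_end_layer": 6,
        },
        "moe_w2_quant_config": {
            "quant_type": "wint2",
            "quant_granularity": "pp_acc",
            "quant_group_size": 64,   # made-up value
            "quant_start_layer": 7,
            "quant_end_layer": 53,    # made-up value
        },
    },
}
# WINT2Config.from_config(wint2_cfg) would then route MoE layers 0-6 to the wint4
# method and later MoE layers to the Triton wint2 backend in get_quant_method.
```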

View File

@@ -14,25 +14,25 @@
# limitations under the License.
"""
from typing import Any, Optional
from typing import Optional
import paddle
from fastdeploy.config import ModelConfig
from fastdeploy.platforms import current_platform
from .utils import CpuGuard
class ErnieRotaryEmbedding:
def __init__(self,
rotary_dim,
base,
partial_rotary_factor,
rope_scaling=None):
def __init__(self, rotary_dim, base, partial_rotary_factor):
"""
Pre-calculate rotary position embedding for position_ids.
"""
self.rotary_dim = rotary_dim
self.base = base
self.partial_rotary_factor = partial_rotary_factor
self.rope_scaling = rope_scaling
def __call__(self, position_ids):
bsz, max_seq_len = position_ids.shape[:2]
@@ -70,18 +70,13 @@ class ErnieRotaryEmbedding:
class QwenRotaryEmbedding:
def __init__(self,
rotary_dim,
base,
partial_rotary_factor,
rope_scaling=None):
def __init__(self, rotary_dim, base, partial_rotary_factor):
"""
Pre-calculate rotary position embedding for position_ids.
"""
self.rotary_dim = rotary_dim
self.base = base
self.partial_rotary_factor = partial_rotary_factor
self.rope_scaling = rope_scaling
def __call__(self, position_ids):
bsz, max_seq_len = position_ids.shape[:2]
@@ -104,35 +99,72 @@ class QwenRotaryEmbedding:
return rot_emb
def get_rope_impl(
rotary_dim: int,
base: float,  # typically 10000.0
position_ids,
model_config: Optional[ModelConfig] = None,
partial_rotary_factor=1,
):
"""
The real implementation of get_rope
"""
architecture = model_config.architectures[0]
if architecture.startswith("Qwen"):
rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base,
partial_rotary_factor)
rotary_emb = rotary_emb_layer(position_ids)
else:
rotary_emb_layer = ErnieRotaryEmbedding(rotary_dim, base,
partial_rotary_factor)
rotary_emb = rotary_emb_layer(position_ids)
return rotary_emb
def get_rope_xpu(
rotary_dim: int,
base: float,  # typically 10000.0
position_ids,
model_config: ModelConfig,
partial_rotary_factor=1,
):
"""
In XPU, cos and sin compute must be done on cpu
"""
with CpuGuard():
position_ids = position_ids.cpu()
rotary_emb = get_rope_impl(rotary_dim, base, position_ids,
model_config, partial_rotary_factor)
return rotary_emb.to('xpu')
def get_rope(
rotary_dim: int,
base: float,  # typically 10000.0
position_ids,
model_config: ModelConfig,
partial_rotary_factor=1,
rope_scaling: Optional[dict[str, Any]] = None,
):
rope_type = rope_scaling.get("architectures", None)
if "Qwen2ForCausalLM" in rope_type:
rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base,
partial_rotary_factor,
rope_scaling)
rotary_emb = rotary_emb_layer(position_ids)
"""
The wrapper of get_rope: dispatches to the XPU or default implementation.
"""
if current_platform.is_xpu():
return get_rope_xpu(rotary_dim, base, position_ids, model_config,
partial_rotary_factor)
else:
rotary_emb_layer = ErnieRotaryEmbedding(rotary_dim, base,
partial_rotary_factor,
rope_scaling)
rotary_emb = rotary_emb_layer(position_ids)
return rotary_emb
return get_rope_impl(rotary_dim, base, position_ids, model_config,
partial_rotary_factor)
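
As a quick reference for what these rotary-embedding classes precompute, here is a minimal NumPy sketch of the standard RoPE cos/sin tables; the exact tensor layout produced by ErnieRotaryEmbedding / QwenRotaryEmbedding may differ.

```python
import numpy as np

def rope_cos_sin(position_ids: np.ndarray, rotary_dim: int, base: float = 10000.0):
    """Standard RoPE tables: cos/sin of position times inverse frequency."""
    inv_freq = 1.0 / (base ** (np.arange(0, rotary_dim, 2, dtype=np.float32) / rotary_dim))
    # position_ids: [bsz, seq_len] -> freqs: [bsz, seq_len, rotary_dim // 2]
    freqs = position_ids[..., None].astype(np.float32) * inv_freq
    return np.cos(freqs), np.sin(freqs)

cos, sin = rope_cos_sin(np.arange(8)[None, :], rotary_dim=64)
print(cos.shape)  # (1, 8, 32)
```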
class ErnieVlRotaryEmbedding3D:
def __init__(self, rotary_dim, base, partial_rotary_factor, max_position,
freq_allocation, rope_scaling):
freq_allocation):
self.rotary_dim = rotary_dim
self.base = base
self.paritial_rotary_factor = partial_rotary_factor
self.rope_scaling = rope_scaling
self.max_position = max_position
self.freq_allocation = freq_allocation
@@ -223,12 +255,10 @@ def get_rope_3d(
paritial_rotary_factor: float = 1,
max_position: int = 131072,
freq_allocation: int = 2,
rope_scaling: Optional[dict[str, Any]] = None,
):
rotary_emb3d_layer = ErnieVlRotaryEmbedding3D(rotary_dim, base,
paritial_rotary_factor,
max_position,
freq_allocation,
rope_scaling)
freq_allocation)
rotary_emb_3d = rotary_emb3d_layer(position_ids)
return rotary_emb_3d

View File

@@ -23,11 +23,12 @@ import paddle
@dataclass
class SamplingMetadata:
"""
metadata for sampling.
"""
temperature: paddle.Tensor
prompt_token_ids: paddle.Tensor
pre_token_ids: paddle.Tensor
eos_token_ids: paddle.Tensor
frequency_penalties: paddle.Tensor
presence_penalties: paddle.Tensor

View File

@@ -14,8 +14,12 @@
# limitations under the License.
"""
from .apply_penalty_multi_scores import apply_penalty_multi_scores
from .apply_penalty_multi_scores import (
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores)
from .top_p_sampling import top_p_sampling
__all__ = [
"apply_penalty_multi_scores",
"apply_speculative_penalty_multi_scores",
"top_p_sampling",
]

View File

@@ -20,7 +20,7 @@ from fastdeploy.platforms import current_platform
def apply_penalty_multi_scores(
prompt_token_ids: paddle.Tensor,
pre_token_ids: paddle.Tensor,
logits: paddle.Tensor,
repetition_penalties: paddle.Tensor,
frequency_penalties: paddle.Tensor,
@@ -30,16 +30,30 @@ def apply_penalty_multi_scores(
step_idx: paddle.Tensor,
min_dec_lens: paddle.Tensor,
eos_token_ids: paddle.Tensor,
):
) -> paddle.Tensor:
"""
Args:
Returns:
apply_penalty_multi_scores
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
prompt_token_ids,
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
)
elif current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import \
get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
@@ -54,3 +68,48 @@ def apply_penalty_multi_scores(
raise NotImplementedError()
return logits
def apply_speculative_penalty_multi_scores(
pre_token_ids: paddle.Tensor,
logits: paddle.Tensor,
repetition_penalties: paddle.Tensor,
frequency_penalties: paddle.Tensor,
presence_penalties: paddle.Tensor,
temperature: paddle.Tensor,
bad_words_token_ids: paddle.Tensor,
step_idx: paddle.Tensor,
min_dec_lens: paddle.Tensor,
eos_token_ids: paddle.Tensor,
seq_lens_this_time: paddle.Tensor,
output_padding_offset: paddle.Tensor,
output_cum_offsets: paddle.Tensor,
max_len: int,
):
"""
Apply the same token penalties for speculative decoding, taking draft-token offsets into account.
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
speculate_get_token_penalty_multi_scores
logits = speculate_get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
seq_lens_this_time,
output_padding_offset,
output_cum_offsets,
max_len,
)
else:
raise NotImplementedError()
return logits

View File

@@ -0,0 +1,97 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Literal, Optional
import paddle
from fastdeploy import envs
def top_p_sampling(
x: paddle.Tensor,
ps: paddle.Tensor,
threshold: Optional[paddle.Tensor] = None,
topp_seed: Optional[paddle.Tensor] = None,
seed: int = -1,
k: int = 0,
mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
Dispatch top-p sampling to the backend selected by FD_SAMPLING_CLASS: "air", "rejection", or Paddle's built-in implementation.
"""
top_p_class = envs.FD_SAMPLING_CLASS.lower()
if top_p_class == "air":
_, ids = air_top_p_sampling(x,
ps,
threshold,
topp_seed,
seed=seed,
k=k,
mode=mode)
elif top_p_class == "rejection":
ids = rejection_top_p_sampling(x, ps, seed)
_ = None
else:
_, ids = paddle.tensor.top_p_sampling(x,
ps,
threshold=threshold,
topp_seed=topp_seed,
seed=seed,
k=k,
mode=mode)
return _, ids
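
All three branches implement the same nucleus (top-p) rule; below is a self-contained NumPy sketch of that rule for reference only. The fused ops above run on device and differ in implementation.

```python
import numpy as np

def top_p_sample(probs: np.ndarray, p: float, rng=np.random.default_rng(0)) -> int:
    """Nucleus (top-p) sampling over one probability row -- reference sketch only."""
    order = np.argsort(-probs)                     # tokens by descending probability
    sorted_p = probs[order]
    keep = np.cumsum(sorted_p) - sorted_p < p      # cumulative mass before each token
    keep[0] = True                                 # always keep the most likely token
    trimmed = sorted_p * keep
    trimmed /= trimmed.sum()
    return int(order[rng.choice(len(probs), p=trimmed)])

print(top_p_sample(np.array([0.5, 0.3, 0.15, 0.05]), p=0.8))   # samples from {0, 1}
```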
def air_top_p_sampling(
x: paddle.Tensor,
ps: paddle.Tensor,
threshold: Optional[paddle.Tensor] = None,
topp_seed: Optional[paddle.Tensor] = None,
seed: int = -1,
k: int = 0,
mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
air_top_p_sampling
"""
try:
from fastdeploy.model_executor.ops.gpu import air_top_p_sampling
out, ids = air_top_p_sampling(x, ps, threshold, topp_seed, seed, k,
mode)
except ImportError:
raise RuntimeError("Cannot import air_top_p_sampling op.")
return out, ids
def rejection_top_p_sampling(
x: paddle.Tensor,
ps: paddle.Tensor,
seed: int = -1,
) -> paddle.Tensor:
"""
rejection_top_p_sampling
"""
try:
from fastdeploy.model_executor.ops.gpu import rejection_top_p_sampling
ids = rejection_top_p_sampling(
x,
ps,
seed,
)
except ImportError:
raise RuntimeError("Cannot import rejection_top_p_sampling op.")
return ids

View File

@@ -13,43 +13,193 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from fastdeploy.distributed.parallel_state import \
get_tensor_model_parallel_world_size
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \
LogitsProcessorBase
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.ops import \
apply_penalty_multi_scores
from fastdeploy.model_executor.layers.sample.ops import (
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores,
top_p_sampling)
from fastdeploy.platforms import current_platform
class SamplerProcessor:
"""
SamplerProcessor for guided decoding.
"""
def __init__(self):
self.async_step = None
self.token_bitmask = None
self.logits_processor: Dict[int, Optional[Any]] = dict()
self.executor = ThreadPoolExecutor()
self.logits_lock = threading.Lock()
def add_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" add logits processor to SamplerProcessor """
with self.logits_lock:
if future is None:
if ids in self.logits_processor:
del self.logits_processor[ids]
return
if isinstance(future, LogitsProcessorBase):
self.logits_processor[ids] = future
for token in prefill_tokens:
self.logits_processor[ids].accept_token(token)
elif future.done():
self.logits_processor[ids] = future.result()
for token in prefill_tokens:
self.logits_processor[ids].accept_token(token)
else:
self.logits_processor[ids] = [future, prefill_tokens]
def update_vocab_mask(self, skip_idx_list: List[int] = []):
""" update vocab mask. (cpu-heavy operation) """
if len(self.logits_processor) == 0:
return
with self.logits_lock:
# iterate over a copy so entries can be deleted while looping
for idx, processor in list(self.logits_processor.items()):
if processor is None:
del self.logits_processor[idx]
continue
if not isinstance(processor, LogitsProcessorBase):
future, prefill_tokens = self.logits_processor[idx]
self.logits_processor[idx] = future.result()
for token in prefill_tokens:
self.logits_processor[idx].accept_token(token)
available_processors = None
for processor in self.logits_processor.values():
if processor.is_terminated():
continue
available_processors = processor
if available_processors is None:
return
# allocate token bitmask
self.token_bitmask = available_processors.allocate_token_bitmask()
with self.logits_lock:
# fill token bitmask
for idx, processor in self.logits_processor.items():
if processor.is_terminated() or idx in skip_idx_list:
continue
processor.fill_token_bitmask(self.token_bitmask, idx)
def apply_token_mask(self,
logits: paddle.Tensor,
skip_idx_list: List[int] = []):
""" apply token mask to logits """
if len(self.logits_processor) == 0 or self.token_bitmask is None:
return logits
# self.async_step.result()
available_processors = None
with self.logits_lock:
for processor in self.logits_processor.values():
if processor.is_terminated():
continue
available_processors = processor
if available_processors is None:
return logits
indices = list(self.logits_processor.keys())
mask_idx = [i for i in indices if i not in skip_idx_list]
return available_processors.apply_token_mask(logits,
self.token_bitmask,
indices=mask_idx)
def _accept_token(self, idx: int, token: int):
""" accept token """
if idx not in self.logits_processor:
raise ValueError(
f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}"
)
if self.logits_processor[idx].is_terminated():
return
self.logits_processor[idx].accept_token(token)
def update_output_tokens(self,
next_tokens: paddle.Tensor,
skip_idx_list: List[int] = []):
""" update output tokens """
if len(self.logits_processor) == 0:
return
token_ids = next_tokens.numpy().tolist()
with self.logits_lock:
for idx in self.logits_processor.keys():
token = token_ids[idx][0]
if token < 0 or self.logits_processor[
idx] is None or idx in skip_idx_list:
continue
self._accept_token(idx, token)
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
# create async operation for guided decoding
# TODO: support async
self.update_vocab_mask(skip_idx_list)
# self.async_step = self.executor.submit(self.update_vocab_mask)
class Sampler(nn.Layer):
"""
Sampler for normal generation.
"""
def __init__(self):
"""
"""
super().__init__()
if current_platform.is_cuda():
self.nranks = get_tensor_model_parallel_world_size()
if current_platform.is_cuda() or current_platform.is_xpu():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
self.processor = SamplerProcessor()
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
self.processor.add_logits_processor(ids, future, prefill_tokens)
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
self.processor.pre_process(skip_idx_list)
def forward_cuda(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
skip_idx_list: List[int] = [],
) -> paddle.Tensor:
"""
"""
logits = self.processor.apply_token_mask(logits, skip_idx_list)
logits = apply_penalty_multi_scores(
sampling_metadata.prompt_token_ids,
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
@@ -63,10 +213,156 @@ class Sampler(nn.Layer):
probs = F.softmax(logits)
_, next_tokens = paddle.tensor.top_p_sampling(probs,
sampling_metadata.top_p)
if self.nranks > 1:
paddle.distributed.broadcast(next_tokens, 0)
_, next_tokens = top_p_sampling(probs, sampling_metadata.top_p)
self.processor.update_output_tokens(next_tokens, skip_idx_list)
return next_tokens
class SpeculativeSampler(nn.Layer):
"""
Sampler for speculative generation.
"""
def __init__(self, fd_config: FDConfig):
"""
"""
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
self.speculative_verify_window = fd_config.speculative_config.verify_window
self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
pass
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
pass
def forward_cuda(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
max_model_len: int,
share_inputs: List[paddle.Tensor],
) -> paddle.Tensor:
"""
"""
from fastdeploy.model_executor.ops.gpu import (speculate_verify,
top_p_candidates)
logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
sampling_metadata.presence_penalties,
sampling_metadata.temperature,
sampling_metadata.bad_words_token_ids,
sampling_metadata.step_idx,
sampling_metadata.min_dec_lens,
sampling_metadata.eos_token_ids,
share_inputs["seq_lens_this_time"],
share_inputs["output_padding_offset"],
share_inputs["output_cum_offsets"],
max_model_len,
)
probs = F.softmax(logits)
verify_scores, verify_tokens, actual_candidate_len = top_p_candidates(
probs,
sampling_metadata.top_p,
share_inputs["output_padding_offset"],
self.speculative_max_candidate_len,
max_model_len,
)
speculate_verify(
share_inputs["accept_tokens"],
share_inputs["accept_num"],
share_inputs["step_idx"],
share_inputs["stop_flags"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs[
"draft_tokens"], # Both input and output, need to write the last 1 token accepted to position 0.
share_inputs["seq_lens_this_time"],
verify_tokens,
verify_scores,
share_inputs["max_dec_len"],
sampling_metadata.eos_token_ids,
share_inputs["is_block_step"],
share_inputs["output_cum_offsets"],
actual_candidate_len,
share_inputs["actual_draft_token_num"],
sampling_metadata.top_p,
max_model_len,
self.speculative_verify_window,
True, # enable_topp
)
return None
class MTPSampler(nn.Layer):
"""
"""
def __init__(self, fd_config: FDConfig):
"""
"""
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
pass
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
pass
def forward_cuda(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
max_model_len: int,
share_inputs: List[paddle.Tensor],
) -> paddle.Tensor:
"""
"""
logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
sampling_metadata.presence_penalties,
sampling_metadata.temperature,
sampling_metadata.bad_words_token_ids,
sampling_metadata.step_idx,
sampling_metadata.min_dec_lens,
sampling_metadata.eos_token_ids,
share_inputs["seq_lens_this_time"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
max_model_len,
)
probs = F.softmax(logits)
_, next_tokens = top_p_sampling(probs, sampling_metadata.top_p)
return next_tokens

View File

@@ -14,32 +14,37 @@
# limitations under the License.
"""
from typing import Tuple
from typing import Tuple, Union
import numpy as np
import paddle
from paddle import Tensor
from paddle import Tensor, nn
from paddle.framework import in_dynamic_mode
from scipy.linalg import block_diag
from fastdeploy.platforms import current_platform
if current_platform.is_cuda() and current_platform.available():
try:
from fastdeploy.model_executor.ops.gpu import (
get_padding_offset,
speculate_get_padding_offset,
)
get_padding_offset, speculate_get_padding_offset)
except Exception:
raise ImportError(
f"Verify environment consistency between compilation and FastDeploy installation. "
f"And ensure the Paddle version supports FastDeploy's custom operators"
"Verify environment consistency between compilation and FastDeploy installation. "
"And ensure the Paddle version supports FastDeploy's custom operators"
)
import re
import os
cache_params = os.getenv("CACHE_PARAMS", "none")
from fastdeploy import envs
cache_params = envs.FD_CACHE_PARAMS
if cache_params != "none":
c8_state_dict = paddle.load(cache_params, return_numpy=True)
def per_block_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]:
def per_block_cast_to_fp8(x: Tensor,
block_size: list = [128,
128]) -> Tuple[Tensor, Tensor]:
"""
Only used in deep_gemm block wise quant weight.
copy from FastDeploy/custom_ops/gpu_ops/fp8_deep_gemm/tests/test_core.py.
@@ -48,10 +53,13 @@ def per_block_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = paddle.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
x_padded = paddle.zeros((ceil_div(m, block_size[0]) * block_size[0],
ceil_div(n, block_size[1]) * block_size[1]),
dtype=x.dtype)
x_padded[:m, :n] = x
x_view = paddle.view(x_padded, (-1, 128, x_padded.shape[1] // 128, 128))
x_view = paddle.view(
x_padded,
(-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]))
x_abs = paddle.abs(x_view).astype(paddle.float32)
x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True)
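
Between the two hunks, a quick NumPy check of the padding and tiling arithmetic used above (128x128 blocks by default); this is illustrative only and independent of Paddle.

```python
import numpy as np

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

# Padded shape used by per_block_cast_to_fp8 for a 300x500 weight with 128x128 blocks.
m, n, block = 300, 500, (128, 128)
padded = (ceil_div(m, block[0]) * block[0], ceil_div(n, block[1]) * block[1])
print(padded)            # (384, 512)
x_padded = np.zeros(padded, dtype=np.float32)
x_padded[:m, :n] = np.random.rand(m, n)
# One amax per 128x128 tile drives the FP8 scale for that tile.
tiles = x_padded.reshape(-1, block[0], padded[1] // block[1], block[1])
print(np.abs(tiles).max(axis=(1, 3)).shape)  # (3, 4)
```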
@@ -63,15 +71,15 @@ def per_block_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]:
# for distributed tensor model parallel
def _set_var_distributed(var, split_axis):
def _set_var_distributed(var: Tensor, split_axis: int):
"""
Set whether the variable is distributed. If the variable is None, no operation will be performed.
Args:
var (Variable, Optional): A Variable object, which can be None. The default value is None.
The Variable object should have an attribute 'is_distributed' to indicate whether
the variable has been processed in a distributed manner.
split_axis (Integer): the sharding dimension of dist tensors
var (Tensor): A Variable object, which can be None. The default value is None.
The Variable object should have an attribute 'is_distributed' to indicate whether
the variable has been processed in a distributed manner.
split_axis (int): the sharding dimension of dist tensors.
Returns:
None. No return value.
@@ -91,10 +99,16 @@ def _set_var_distributed(var, split_axis):
main_block._find_var_recursive(var.name).is_distributed = True
def get_tensor(input):
def get_tensor(input: Union[paddle.Tensor, np.ndarray, str]) -> paddle.Tensor:
"""
In EP (expert parallel) mode, weights are stored distributed by layer; to save peak
GPU memory the state_dict stage only keeps layer names and the paths of the
corresponding weights, so the weight type has to be converted to paddle.Tensor here.
Return a corresponding PaddlePaddle tensor based on the type and content of the input.
Args:
input (Union[paddle.Tensor, np.ndarray, str]): The input data.
Returns:
paddle.Tensor: Returns a PaddlePaddle tensor.
"""
if isinstance(input, paddle.Tensor):
if input.place.is_cpu_place():
@@ -104,7 +118,6 @@ def get_tensor(input):
return paddle.to_tensor(input)
elif isinstance(input, str):
if ".safetensors" in input:
match = re.match(r"\[(.*?)\](.*)", input)
if match:
key_name = match.group(1)
@@ -116,12 +129,11 @@ def get_tensor(input):
weight = f.get_tensor(key_name)
weight = paddle.Tensor(weight, zero_copy=True)
weight = weight._copy_to(
paddle.framework._current_expected_place(), False
)
paddle.framework._current_expected_place(), False)
return weight
else:
return None
else:
else:
if cache_params != "none":
tmp_key = input.split("/")[-1]
if tmp_key in c8_state_dict:
@@ -129,25 +141,134 @@ def get_tensor(input):
return paddle.to_tensor(c8_state_dict.pop(tmp_key))
return paddle.load(input)
else:
# In theory this branch is never hit.
return input
def matmul_hadU(X: Tensor) -> paddle.Tensor:
"""
Perform matrix multiplication using the Hadamard matrix.
Args:
X (Tensor): The tensor to be multiplied.
Returns:
Tensor: The tensor after Hadamard matrix multiplication, with the same shape as the input tensor X.
"""
input = X.clone().reshape((-1, X.shape[-1], 1))
output = input.clone()
while input.shape[1] > 1:
input = input.reshape(
(input.shape[0], input.shape[1] // 2, 2, input.shape[2]))
output = output.reshape(input.shape)
output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
output = output.reshape((input.shape[0], input.shape[1], -1))
(input, output) = (output, input)
del output
return input.reshape(X.shape)
def random_hadamard_matrix(block_size: int,
dtype: Union[paddle.dtype, str]) -> paddle.Tensor:
"""
Generate a random Hadamard matrix.
Args:
block_size (int): The size of the block, i.e., the number of rows and columns of the matrix.
dtype (str): The data type, for example 'float32'.
Returns:
paddle.Tensor: The generated random Hadamard matrix.
"""
Q = paddle.diag(paddle.ones((block_size), dtype=dtype))
block = matmul_hadU(Q)
return block
def create_hadamard_matrix(hidden_size: int) -> paddle.Tensor:
"""
Generate a Hadamard matrix.
Args:
hidden_size (int): The size of the hidden layer.
Returns:
paddle.Tensor: The generated Hadamard matrix.
"""
hadamard_block_size = 32
h = random_hadamard_matrix(hadamard_block_size, "float32")
block_num = hidden_size // hadamard_block_size
hadamard_matrix = paddle.to_tensor(
block_diag(*[h for i in range(block_num)]))
return hadamard_matrix
create_hadamard_matrix_map = {}
# Zkk: the keys below are used by the 4.5T fp8 model.
create_hadamard_matrix_map[8192] = create_hadamard_matrix(8192)
create_hadamard_matrix_map[448] = create_hadamard_matrix(448)
create_hadamard_matrix_map[1024] = create_hadamard_matrix(1024)
create_hadamard_matrix_map[3584] = create_hadamard_matrix(3584)
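
The block-diagonal rotation above relies on the defining property of Hadamard matrices, H @ H.T = n * I. A small sketch using the Sylvester construction follows; the butterfly in matmul_hadU computes an essentially equivalent unnormalized transform.

```python
import numpy as np

def hadamard(n: int) -> np.ndarray:
    """Sylvester construction; n must be a power of two (block size 32 above)."""
    h = np.array([[1.0]])
    while h.shape[0] < n:
        h = np.block([[h, h], [h, -h]])
    return h

h32 = hadamard(32)
# Rows are mutually orthogonal: H @ H.T == 32 * I
assert np.allclose(h32 @ h32.T, 32 * np.eye(32))
```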
def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
"""
Ensure the numerator is divisible by the denominator.
Args:
numerator (int): The numerator.
denominator (int): The denominator.
Returns:
None
Raises:
AssertionError: If the numerator cannot be evenly divided by the denominator, an assertion error is raised.
"""
assert numerator % denominator == 0, "{} is not divisible by {}".format(
numerator, denominator)
def divide(numerator, denominator):
"""Ensure that numerator is divisible by the denominator and return
the division value."""
def divide(numerator: int, denominator: int):
"""
Calculate the division result of two numbers.
Args:
numerator (int): The dividend.
denominator (int): The divisor.
Returns:
int: The result of the division, which is the quotient of the dividend divided by the divisor.
"""
ensure_divisibility(numerator, denominator)
return numerator // denominator
def remove_padding(max_len, input_ids, seq_lens_this_time):
def remove_padding(
max_len: paddle.Tensor, input_ids: paddle.Tensor,
seq_lens_this_time: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
paddle.Tensor]:
"""
Remove padding tokens from the batched input.
Args:
max_len (paddle.Tensor): The maximum length of the input sequences.
input_ids (paddle.Tensor): The IDs of the input sequences.
seq_lens_this_time (paddle.Tensor): The actual length of each sequence.
Returns:
tuple: A tuple containing:
- The sequence IDs with padding removed (paddle.Tensor).
- The padding offsets (paddle.Tensor).
- The cumulative offsets (paddle.Tensor).
- The query sequence lengths (paddle.Tensor).
- The key sequence lengths (paddle.Tensor).
"""
if current_platform.is_cuda():
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
@@ -159,7 +280,7 @@ def remove_padding(max_len, input_ids, seq_lens_this_time):
cu_seqlens_q,
cu_seqlens_k,
) = get_padding_offset(input_ids, cum_offsets_now, token_num,
seq_lens_this_time)
seq_lens_this_time)
return (
ids_remove_padding,
padding_offset,
@@ -168,10 +289,30 @@ def remove_padding(max_len, input_ids, seq_lens_this_time):
cu_seqlens_k,
)
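
A toy illustration of the offset bookkeeping this function returns; the numbers are invented, and the real padding offsets and cu_seqlens come from the get_padding_offset GPU op, which may lay them out differently.

```python
import numpy as np

max_len = 6
seq_lens = np.array([3, 5, 2])                      # seq_lens_this_time
cum_offsets_now = np.cumsum(max_len - seq_lens)     # [3, 4, 8]: padding dropped so far
token_num = seq_lens.sum()                          # 10 unpadded tokens in the batch
cu_seqlens_q = np.concatenate([[0], np.cumsum(seq_lens)])   # usual prefix-sum form: [0, 3, 8, 10]
print(cum_offsets_now, token_num, cu_seqlens_q)
```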
def speculate_remove_padding(max_len, input_ids, seq_lens_this_time,
draft_tokens, seq_lens_encoder):
def speculate_remove_padding(
max_len: paddle.Tensor, input_ids: paddle.Tensor,
seq_lens_this_time: paddle.Tensor, draft_tokens: paddle.Tensor,
seq_lens_encoder: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
paddle.Tensor]:
"""
Remove padding from sequences for speculative decoding (draft tokens included).
Args:
max_len (paddle.Tensor): The maximum length of the sequences.
input_ids (paddle.Tensor): The IDs of the input sequences.
seq_lens_this_time (paddle.Tensor): The lengths of the sequences in the current batch.
draft_tokens (paddle.Tensor): The draft tokens.
seq_lens_encoder (paddle.Tensor): The lengths of the encoder sequences.
Returns:
tuple: A tuple containing:
- The input sequence IDs with padding removed (paddle.Tensor).
- Padding offsets (paddle.Tensor).
- Cumulative offsets (paddle.Tensor).
- Query sequence lengths (paddle.Tensor).
- Key sequence lengths (paddle.Tensor).
"""
if current_platform.is_cuda():
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
@@ -197,3 +338,43 @@ def speculate_remove_padding(max_len, input_ids, seq_lens_this_time,
cu_seqlens_q,
cu_seqlens_k,
)
class CpuGuard:
"""CpuGuard"""
def __init__(self):
"""init"""
pass
def __enter__(self):
"""enter"""
self.ori_device = paddle.device.get_device()
paddle.device.set_device("cpu")
def __exit__(self, exc_type, exc_val, exc_tb):
"""exit"""
paddle.device.set_device(self.ori_device)
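
A minimal usage sketch of CpuGuard, assuming the class above is in scope: it temporarily switches Paddle's active device to CPU and restores the previous device on exit.

```python
import paddle

print(paddle.device.get_device())   # e.g. "gpu:0", "xpu:0" or "cpu"
with CpuGuard():
    # everything created here lives on CPU
    freqs = paddle.arange(8, dtype="float32") * 0.5
print(paddle.device.get_device())   # original device restored
```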
def create_and_set_parameter(layer: nn.Layer, name: str,
tensor: paddle.Tensor):
"""
Create a parameter for a specified layer and set its value to the given tensor.
Args:
layer (nn.Layer): The layer object to which the parameter will be added.
name (str): The name of the parameter to be created.
tensor (paddle.Tensor): The tensor to set as the value of the parameter.
Returns:
None
"""
setattr(
layer, name,
layer.create_parameter(
shape=tensor.shape,
dtype=tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(tensor)
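
And a small usage sketch of create_and_set_parameter, again assuming the helper above is in scope; it registers a zero-initialized parameter on the layer and then overwrites it with the given tensor.

```python
import paddle
import paddle.nn as nn

layer = nn.Linear(4, 4)
scale = paddle.full([4], 0.5, dtype="float32")
create_and_set_parameter(layer, "weight_scale", scale)
print(layer.weight_scale.shape)   # [4]
```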

View File

@@ -19,14 +19,31 @@ from abc import ABC, abstractmethod
import paddle
from paddle import nn
from fastdeploy.config import LLMConfig, LoadConfig, ModelConfig
from fastdeploy.config import FDConfig, LoadConfig, ModelConfig
from fastdeploy.model_executor.models.ernie4_5_moe import \
Ernie4_5_PretrainedModel
from fastdeploy.model_executor.models.ernie4_5_mtp import \
Ernie4_5_MTPPretrainedModel
from fastdeploy.model_executor.models.model_base import ModelRegistry
from fastdeploy.model_executor.models.qwen2 import Qwen2PretrainedModel
from fastdeploy.model_executor.models.qwen3 import Qwen3PretrainedModel
from fastdeploy.model_executor.models.qwen3moe import Qwen3MoePretrainedModel
from fastdeploy.model_executor.models.utils import load_checkpoint
MODEL_CLASSES = {
"Ernie4_5_MoeForCausalLM": Ernie4_5_PretrainedModel,
"Ernie4_5_MTPForCausalLM": Ernie4_5_MTPPretrainedModel,
"Qwen2ForCausalLM": Qwen2PretrainedModel,
"Qwen3ForCausalLM": Qwen3PretrainedModel,
"Qwen3MoeForCausalLM": Qwen3MoePretrainedModel,
"Ernie4_5_ForCausalLM": Ernie4_5_PretrainedModel
}
# TODO(gongshaotian): implement real interface to replace this
def get_model(llm_config: LLMConfig) -> nn.Layer:
def get_model_from_loader(fd_config: FDConfig) -> nn.Layer:
""" load or download model """
model_path = llm_config.load_config.model_path
model = paddle.load(model_path, return_numpy=True)
model_loader = DefaultModelLoader(fd_config.load_config)
model = model_loader.load_model(fd_config)
return model
@@ -42,7 +59,7 @@ class BaseModelLoader(ABC):
raise NotImplementedError
@abstractmethod
def load_model(self, llm_config: LLMConfig) -> nn.Layer:
def load_model(self, fd_config: FDConfig) -> nn.Layer:
""" Load a model with the given configurations."""
raise NotImplementedError
@@ -56,5 +73,23 @@ class DefaultModelLoader(BaseModelLoader):
def download_model(self, model_config: ModelConfig) -> None:
pass
def load_model(self, llm_config: LLMConfig) -> nn.Layer:
pass
def load_model(self, fd_config: FDConfig) -> nn.Layer:
context = paddle.LazyGuard()
architectures = fd_config.model_config.architectures[0]
# TODO(gongshaotian): Now, only support safetensor
model_class = MODEL_CLASSES[architectures]
state_dict = load_checkpoint(
fd_config.parallel_config.model_name_or_path,
model_class,
fd_config.model_config,
return_numpy=True)
with context:
model_cls = ModelRegistry.get_class(architectures)
model = model_cls(fd_config)
model.eval()
model.set_state_dict(state_dict)
return model

View File

@@ -16,30 +16,50 @@
import importlib
import inspect
import os
from pathlib import Path
from .model_base import ModelForCasualLM, ModelRegistry
inference_runner_supported_models = ["Qwen2ForCausalLM"]
inference_runner_supported_models = [
"Ernie4_5_MoeForCausalLM",
"Ernie4_5_MTPForCausalLM",
"Qwen2ForCausalLM",
"Qwen3MoeForCausalLM",
"Ernie4_5_ForCausalLM",
"Qwen3ForCausalLM",
]
def _find_py_files(root_dir):
root_path = Path(root_dir)
py_files = []
for py_file in root_path.rglob("*.py"):
rel_path = py_file.relative_to(root_dir)
if "__init__" in str(py_file):
continue
dotted_path = str(rel_path).replace("/", ".").replace("\\",
".").replace(
".py", "")
py_files.append(dotted_path)
return py_files
def auto_models_registry():
"""
auto registry all models in this folder
"""
for module_file in os.listdir(os.path.dirname(__file__)):
if module_file.endswith('.py') and module_file != '__init__.py':
module_name = module_file[:-3]
try:
module = importlib.import_module(
f'fastdeploy.model_executor.models.{module_name}')
for attr_name in dir(module):
attr = getattr(module, attr_name)
if inspect.isclass(attr) and issubclass(
attr,
ModelForCasualLM) and attr is not ModelForCasualLM:
ModelRegistry.register(attr)
except ImportError:
raise ImportError(f"{module_name=} import error")
for module_file in _find_py_files(os.path.dirname(__file__)):
try:
module = importlib.import_module(
f'fastdeploy.model_executor.models.{module_file}')
for attr_name in dir(module):
attr = getattr(module, attr_name)
if inspect.isclass(attr) and issubclass(
attr,
ModelForCasualLM) and attr is not ModelForCasualLM:
ModelRegistry.register(attr)
except ImportError:
raise ImportError(f"{module_file=} import error")
auto_models_registry()

View File

@@ -0,0 +1,774 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
from typing import Dict, Union
import numpy as np
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.forward_meta import ForwardMeta
class Ernie4_5_PretrainedModel(PretrainedModel):
"""
Ernie4_5_PretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
"""
get_tensor_parallel_mappings
"""
logger.info("erine inference model _get_tensor_parallel_mappings")
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def gqa_qkv_split_func(
weight,
tensor_parallel_degree,
tensor_parallel_rank,
num_attention_heads,
num_key_value_heads,
head_dim,
):
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
shape = get_shape(tensor)
if len(shape) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
def split_tensor(tensor, degree):
shape = get_shape(tensor)
size = shape[-1]
block_size = size // degree
if hasattr(tensor, "get_shape"):
return [
slice_tensor(tensor, i * block_size,
(i + 1) * block_size)
for i in range(degree)
]
else:
return np.split(tensor, degree, axis=-1)
q_list = split_tensor(q, tensor_parallel_degree)
k_list = split_tensor(k, tensor_parallel_degree)
v_list = split_tensor(v, tensor_parallel_degree)
if tensor_parallel_rank is None:
return [
np.concatenate([q_i, k_i, v_i], axis=-1)
for q_i, k_i, v_i in zip(q_list, k_list, v_list)
]
else:
return np.concatenate(
[
q_list[tensor_parallel_rank],
k_list[tensor_parallel_rank],
v_list[tensor_parallel_rank],
],
axis=-1,
)
def gqa_qkv_merge_func(weight_list, num_attention_heads,
num_key_value_heads, head_dim):
tensor_parallel_degree = len(weight_list)
num_attention_heads = num_attention_heads // tensor_parallel_degree
num_key_value_heads = num_key_value_heads // tensor_parallel_degree
is_paddle_tensor = not isinstance(weight_list[0], np.ndarray)
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
if len(get_shape(tensor)) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_list, k_list, v_list = [], [], []
for weight in weight_list:
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
q_list.append(q)
k_list.append(k)
v_list.append(v)
merged = q_list + k_list + v_list
if is_paddle_tensor:
tensor = paddle.concat(merged, axis=-1)
if tensor.place.is_gpu_place():
tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
return tensor
else:
return np.concatenate(merged, axis=-1)
if (config.num_key_value_heads is not None
and config.num_key_value_heads != config.num_attention_heads):
if is_split:
qkv_fn = partial(
gqa_qkv_split_func,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.head_dim,
)
else:
qkv_fn = partial(
gqa_qkv_merge_func,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.head_dim,
)
else:
qkv_fn = partial(fn, is_column=True)
def get_tensor_parallel_split_mappings(num_layers, moe_num_experts,
moe_num_shared_experts,
moe_layer_start_index):
final_actions = {}
base_model_prefix = "ernie"
base_actions = {
"lm_head.weight":
partial(fn, is_column=True),
# "eh_proj.weight": partial(fn, is_column=True),
f"{base_model_prefix}.embed_tokens.weight":
partial(fn, is_column=False),
}
base_actions[
f"{base_model_prefix}.layers.0.self_attn.qkv_proj.weight"] = qkv_fn
base_actions[
f"{base_model_prefix}.layers.0.self_attn.qkv_proj.quant_weight"] = qkv_fn
base_actions[
f"{base_model_prefix}.layers.0.self_attn.o_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.0.self_attn.o_proj.quant_weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.0.mlp.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.0.mlp.up_gate_proj.quant_weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.0.mlp.down_proj.weight"] = (
partial(fn, is_column=False))
base_actions[
f"{base_model_prefix}.layers.0.mlp.down_proj.quant_weight"] = partial(
fn, is_column=False)
for expert_idx in range(moe_num_experts):
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.up_gate_proj.quant_weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.down_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.down_proj.quant_weight"] = partial(
fn, is_column=False)
if moe_num_shared_experts > 0:
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.up_gate_proj.quant_weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.down_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.up_gate_proj.quant_weight"] = partial(
fn, is_column=False, is_naive_2fuse=True)
for key, action in base_actions.items():
if (f"{base_model_prefix}.layers.0.mlp.up_gate_proj.weight"
in key or
f"{base_model_prefix}.layers.0.mlp.up_gate_proj.quant_weight"
in key
or f"{base_model_prefix}.layers.0.mlp.down_proj.weight"
in key or
f"{base_model_prefix}.layers.0.mlp.down_proj.quant_weight"
in key):
for i in range(moe_layer_start_index):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
elif f"layers.{moe_layer_start_index}.mlp.experts." in key:
for i in range(moe_layer_start_index, num_layers):
final_actions[key.replace(
f"layers.{moe_layer_start_index}.",
f"layers.{i}.")] = action
elif f"layers.{moe_layer_start_index}.mlp.shared_experts." in key:
for i in range(moe_layer_start_index, num_layers):
final_actions[key.replace(
f"layers.{moe_layer_start_index}.",
f"layers.{i}.")] = action
elif f"{base_model_prefix}.layers.0." in key:
for i in range(num_layers):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
final_actions[key] = action
return final_actions
moe_num_experts = 0
moe_num_shared_experts = 0
if isinstance(config.moe_num_experts, list):
moe_num_experts = sum(config.moe_num_experts)
elif isinstance(config.moe_num_experts, int):
moe_num_experts = config.moe_num_experts
if hasattr(config, 'moe_num_shared_experts'):
moe_num_shared_experts = config.moe_num_shared_experts
moe_layer_start_index = -1
if isinstance(config.moe_layer_start_index, list):
moe_layer_start_index = min(config.moe_layer_start_index)
elif isinstance(config.moe_layer_start_index, int):
moe_layer_start_index = config.moe_layer_start_index
mappings = get_tensor_parallel_split_mappings(
config.num_layers,
moe_num_experts,
moe_num_shared_experts,
moe_layer_start_index,
)
return mappings
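
To make the GQA split above concrete, a toy NumPy example with 8 query heads, 2 key/value heads, head_dim 4 and tensor-parallel degree 2: each rank keeps a contiguous slice of q, k and v and re-fuses them in qkv order (values are arbitrary).

```python
import numpy as np

heads, kv_heads, head_dim, tp = 8, 2, 4, 2
hidden = 16
qkv = np.arange(hidden * (heads + 2 * kv_heads) * head_dim, dtype=np.float32)
qkv = qkv.reshape(hidden, -1)                       # fused [hidden, (q + k + v) * head_dim]
q_end = heads * head_dim                            # 32
k_end = q_end + kv_heads * head_dim                 # 40
q, k, v = qkv[:, :q_end], qkv[:, q_end:k_end], qkv[:, k_end:]
# Each rank keeps a contiguous 1/tp slice of q, k and v, re-fused in qkv order.
rank0 = np.concatenate([np.split(q, tp, axis=-1)[0],
                        np.split(k, tp, axis=-1)[0],
                        np.split(v, tp, axis=-1)[0]], axis=-1)
print(rank0.shape)   # (16, 24) == hidden x ((heads + 2 * kv_heads) // tp * head_dim)
```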
class Ernie4_5_MLP(nn.Layer):
def __init__(
self,
fd_config: FDConfig,
intermediate_size: int,
prefix: str = "",
) -> None:
super().__init__()
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.gate_up_proj = MergedColumnParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=intermediate_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
use_fast_ffn=True,
)
self.down_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.down_proj",
input_size=(intermediate_size // self.nranks),
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
self.act_fn = SiluAndMul(
fd_config=fd_config,
bias=None,
act_method=fd_config.model_config.hidden_act,
)
def load_state_dict(self, state_dict):
self.gate_up_proj.load_state_dict(state_dict)
self.down_proj.load_state_dict(state_dict)
def forward(self, hidden_states: paddle.Tensor):
gate_up_out = self.gate_up_proj(hidden_states)
act_out = self.act_fn(gate_up_out)
down_out = self.down_proj(act_out)
return down_out
class Ernie4_5_MoE(nn.Layer):
def __init__(self, fd_config: FDConfig, layer_id: int,
prefix: str) -> None:
super().__init__()
moe_quant_type = ""
if hasattr(fd_config.quant_config, 'moe_quant_type'):
moe_quant_type = fd_config.quant_config.moe_quant_type
if moe_quant_type == "w4a8":
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.quant_weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.quant_weight",
"ffn1_expert_weight_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight_scale",
"ffn2_expert_weight_scale_key":
f"{prefix}.experts.{{}}.down_proj.weight_scale",
"ffn1_expert_in_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.activation_scale",
"ffn2_expert_in_scale_key":
f"{prefix}.experts.{{}}.down_proj.activation_scale",
}
elif moe_quant_type == "w4w2":
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.quant_weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.quant_weight",
"ffn1_expert_weight_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight_scale",
"ffn2_expert_weight_scale_key":
f"{prefix}.experts.{{}}.down_proj.weight_scale",
"ffn1_expert_super_scales_key":
f"{prefix}.experts.{{}}.up_gate_proj.super_scales",
"ffn2_expert_super_scales_key":
f"{prefix}.experts.{{}}.down_proj.super_scales",
"ffn1_expert_code_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.code_scale",
"ffn2_expert_code_scale_key":
f"{prefix}.experts.{{}}.down_proj.code_scale",
"ffn1_expert_code_zp_key":
f"{prefix}.experts.{{}}.up_gate_proj.code_zp",
"ffn2_expert_code_zp_key":
f"{prefix}.experts.{{}}.down_proj.code_zp",
}
elif moe_quant_type == "tensor_wise_fp8" or (
moe_quant_type == "block_wise_fp8" and
fd_config.model_config.is_quantized):
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.quant_weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.quant_weight",
"ffn1_expert_weight_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight_scale",
"ffn2_expert_weight_scale_key":
f"{prefix}.experts.{{}}.down_proj.weight_scale",
"ffn1_expert_in_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.activation_scale",
"ffn2_expert_in_scale_key":
f"{prefix}.experts.{{}}.down_proj.activation_scale",
}
else:
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.weight",
}
self.fused_moe = FusedMoE(
fd_config=fd_config,
moe_intermediate_size=fd_config.moe_config.moe_intermediate_size,
num_experts=fd_config.moe_config.num_experts,
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
weight_key_map=weight_key_map,
)
self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
if self.num_shared_experts > 0:
shared_experts_hidden_dim = self.num_shared_experts * fd_config.moe_config.moe_intermediate_size
self.shared_experts = Ernie4_5_MLP(
fd_config=fd_config,
intermediate_size=shared_experts_hidden_dim,
prefix=f"{prefix}.shared_experts",
)
def load_state_dict(self, state_dict):
self.fused_moe.load_state_dict(state_dict)
if self.num_shared_experts > 0:
self.shared_experts.load_state_dict(state_dict)
def forward(self, hidden_states: paddle.Tensor):
out = self.fused_moe(hidden_states)
if self.num_shared_experts > 0:
s_x = self.shared_experts(hidden_states)
out = out + s_x
return out
class Ernie4_5_Attention(nn.Layer):
def __init__(self, fd_config: FDConfig, layer_id: int,
prefix: str) -> None:
super().__init__()
nranks = fd_config.parallel_config.tensor_parallel_degree
self.qkv_proj = QKVParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.qkv_proj",
)
self.o_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.o_proj",
input_size=(fd_config.model_config.head_dim *
fd_config.model_config.num_attention_heads // nranks),
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(
fd_config=fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=False,
)
def load_state_dict(self, state_dict):
self.qkv_proj.load_state_dict(state_dict)
self.o_proj.load_state_dict(state_dict)
self.attn.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
):
qkv_out = self.qkv_proj(hidden_states)
attn_out = self.attn(
qkv=qkv_out,
forward_meta=forward_meta,
)
output = self.o_proj(attn_out)
return output
class Ernie4_5_DecoderLayer(nn.Layer):
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Ernie4_5_Attention(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
if (fd_config.moe_config.num_experts is not None
and layer_id >= fd_config.moe_config.moe_layer_start_index):
self.mlp = Ernie4_5_MoE(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.mlp",
)
else:
self.mlp = Ernie4_5_MLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.post_attention_layernorm",
)
def load_state_dict(self, state_dict):
self.self_attn.load_state_dict(state_dict)
self.mlp.load_state_dict(state_dict)
self.input_layernorm.load_state_dict(state_dict)
self.post_attention_layernorm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
residual: paddle.Tensor = None,
):
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
hidden_states=hidden_states,
forward_meta=forward_meta,
)
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
@support_graph_optimization
class Ernie4_5_Model(nn.Layer):
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Ernie4_5_Model class.
Args:
    fd_config (FDConfig): Configurations for the model and inference runtime.
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "ernie"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype(),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"))
self.hidden_layers = [
Ernie4_5_DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
]
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start loading layer {i}")
self.hidden_layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.hidden_layers[i](forward_meta,
hidden_states,
residual)
hidden_states = hidden_states + residual
out = self.norm(hidden_states)
return out
class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
"""
Ernie4_5_MoeForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Ernie4_5_MoeForCausalLM, self).__init__(fd_config)
self.fd_config = fd_config
self.model = Ernie4_5_Model(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
return "Ernie4_5_MoeForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray,
paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
if self.tie_word_embeddings:
self.lm_head.out_linear.weight.set_value(
self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
else:
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def empty_input_forward(self):
"""
empty_input_forward
"""
fake_hidden_states = paddle.empty(
shape=[0, self.fd_config.model_config.hidden_size],
dtype=paddle.get_default_dtype(),
)
for i in range(self.fd_config.moe_config.moe_layer_start_index,
self.fd_config.model_config.num_layers):
self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
class Ernie4_5_ForCausalLM(Ernie4_5_MoeForCausalLM):
"""
Ernie4_5_ForCausalLM
"""
@classmethod
def name(cls):
"""
Model Architecture Name
"""
return "Ernie4_5_ForCausalLM"

View File

@@ -0,0 +1,417 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
from typing import Dict, Union
import numpy as np
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.forward_meta import ForwardMeta
class Ernie4_5_MTPPretrainedModel(PretrainedModel):
"""
Ernie4_5_MTPPretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
"""
get_tensor_parallel_mappings
"""
logger.info("erine inference model _get_tensor_parallel_mappings")
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def gqa_qkv_split_func(
weight,
tensor_parallel_degree,
tensor_parallel_rank,
num_attention_heads,
num_key_value_heads,
head_dim,
):
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
shape = get_shape(tensor)
if len(shape) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
def split_tensor(tensor, degree):
shape = get_shape(tensor)
size = shape[-1]
block_size = size // degree
if hasattr(tensor, "get_shape"):
return [
slice_tensor(tensor, i * block_size,
(i + 1) * block_size)
for i in range(degree)
]
else:
return np.split(tensor, degree, axis=-1)
q_list = split_tensor(q, tensor_parallel_degree)
k_list = split_tensor(k, tensor_parallel_degree)
v_list = split_tensor(v, tensor_parallel_degree)
if tensor_parallel_rank is None:
return [
np.concatenate([q_i, k_i, v_i], axis=-1)
for q_i, k_i, v_i in zip(q_list, k_list, v_list)
]
else:
return np.concatenate(
[
q_list[tensor_parallel_rank],
k_list[tensor_parallel_rank],
v_list[tensor_parallel_rank],
],
axis=-1,
)
def gqa_qkv_merge_func(weight_list, num_attention_heads,
num_key_value_heads, head_dim):
tensor_parallel_degree = len(weight_list)
num_attention_heads = num_attention_heads // tensor_parallel_degree
num_key_value_heads = num_key_value_heads // tensor_parallel_degree
is_paddle_tensor = not isinstance(weight_list[0], np.ndarray)
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
if len(get_shape(tensor)) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_list, k_list, v_list = [], [], []
for weight in weight_list:
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
q_list.append(q)
k_list.append(k)
v_list.append(v)
merged = q_list + k_list + v_list
if is_paddle_tensor:
tensor = paddle.concat(merged, axis=-1)
if tensor.place.is_gpu_place():
tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
return tensor
else:
return np.concatenate(merged, axis=-1)
if (config.num_key_value_heads is not None
and config.num_key_value_heads != config.num_attention_heads):
if is_split:
qkv_fn = partial(
gqa_qkv_split_func,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.hidden_size // config.num_attention_heads,
)
else:
qkv_fn = partial(
gqa_qkv_merge_func,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.hidden_size // config.num_attention_heads,
)
else:
qkv_fn = partial(fn, is_column=True)
def get_tensor_parallel_split_mappings(num_layers, moe_num_experts,
moe_layer_start_index):
"""
Get the tensor parallel split mappings for all layers.
"""
final_actions = {}
base_model_prefix = "ernie.mtp_block"
base_actions = {}
base_actions["ernie.mtp_linear_proj.0.weight"] = partial(
fn, is_column=True)
base_actions[
f"{base_model_prefix}.0.self_attn.qkv_proj.weight"] = qkv_fn
base_actions[
f"{base_model_prefix}.0.self_attn.o_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.0.mlp.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[f"{base_model_prefix}.0.mlp.down_proj.weight"] = (
partial(fn, is_column=False))
for expert_idx in range(moe_num_experts):
base_actions[
f"{base_model_prefix}.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.down_proj.weight"] = partial(
fn, is_column=False)
for key, action in base_actions.items():
if (f"{base_model_prefix}.0.mlp.up_gate_proj.weight" in key or
f"{base_model_prefix}.0.mlp.down_proj.weight" in key):
for i in range(moe_layer_start_index):
final_actions[key.replace("0.", f"{i}.")] = action
elif f"{moe_layer_start_index}.mlp.experts." in key:
for i in range(moe_layer_start_index, num_layers):
final_actions[key.replace(f"{moe_layer_start_index}.",
f"{i}.")] = action
elif f"{base_model_prefix}.0." in key:
for i in range(num_layers):
final_actions[key.replace("0.", f"{i}.")] = action
final_actions[key] = action
return final_actions
moe_num_experts = 0
mappings = get_tensor_parallel_split_mappings(
config.num_layers,
moe_num_experts,
config.moe_layer_start_index,
)
return mappings
class Ernie4_5_MTPModel(nn.Layer):
"""
Ernie4_5_MTPModel
"""
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Ernie4_5_MTPModel class.
Args:
    fd_config (FDConfig): Configurations for the LLM model.
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
self.embeddings = fd_config.speculative_config.sharing_model.model.embeddings
self.hidden_layers = [
Ernie4_5_DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.{i}")
for i in range(self.num_layers)
]
self.enorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix="ernie.mtp_emb_norm.0",
)
self.hnorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix="ernie.mtp_hidden_norm.0",
)
self.eh_proj = ParallelLMHead(
fd_config=fd_config,
num_embeddings=fd_config.model_config.hidden_size,
embedding_dim=fd_config.model_config.hidden_size * 2,
prefix="ernie.mtp_linear_proj.0",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
# self.embeddings.load_state_dict(state_dict)
self.enorm.load_state_dict(state_dict)
self.hnorm.load_state_dict(state_dict)
self.eh_proj.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.hidden_layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
previous_hidden_states: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
forward
"""
inputs_embedding = self.embeddings(
ids_remove_padding=ids_remove_padding)
inputs_embedding = paddle.concat(
[self.enorm(inputs_embedding),
self.hnorm(previous_hidden_states)],
axis=-1)
hidden_states = self.eh_proj(inputs_embedding)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.hidden_layers[i](forward_meta,
hidden_states,
residual)
hidden_states = hidden_states + residual
return hidden_states
class Ernie4_5_MTPForCausalLM(ModelForCasualLM):
"""
Ernie4_5_MTPForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Ernie4_5_MTPForCausalLM, self).__init__(fd_config)
self.fd_config = fd_config
self.model = Ernie4_5_MTPModel(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = fd_config.speculative_config.sharing_model.lm_head
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
"""
Model Architecture Name
"""
return "Ernie4_5_MTPForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray,
paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
# if self.tie_word_embeddings:
# self.lm_head.out_linear.weight.set_value(
# self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
# else:
# self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
"""
compute logits
"""
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def empty_input_forward(self):
"""
empty_input_forward
"""
fake_hidden_states = paddle.empty(
shape=[0, self.fd_config.model_config.hidden_size],
dtype=paddle.get_default_dtype(),
)
for i in range(self.fd_config.moe_config.moe_layer_start_index,
self.fd_config.model_config.num_layers):
self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
def forward(
self,
ids_remove_padding: paddle.Tensor,
previous_hidden_states: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
forward
"""
hidden_states = self.model(ids_remove_padding, previous_hidden_states,
forward_meta)
return hidden_states

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,167 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import copy
from fastdeploy.config import ModelConfig
from .dfnrope.modeling import DFNRopeVisionTransformerConfig
__all__ = [
"Ernie4_5_VLMoeConfig",
]
class Ernie4_5_VLMoeConfig(ModelConfig):
r"""
This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Ernie-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
Example:
```python
>>> from paddleformers.transformers import ErnieModel, ErnieConfig
>>> # Initializing an Ernie ernie-7b style configuration
>>> configuration = ErnieConfig()
>>> # Initializing a model from the ernie-7b style configuration
>>> model = ErnieModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "erniemoevl"
attribute_map = {
"n_positions": "max_position_embeddings",
"n_embd": "hidden_size",
"n_layer": "num_hidden_layers",
"n_head": "num_attention_heads",
"n_inner": "intermediate_size",
"activation_function": "hidden_act",
}
def __init__(
self,
vision_config=None,
im_patch_id=None,
pixel_hidden_size=None, # None for fuyu
modality_detach=False,
temporal_conv_size=2,
spatial_conv_size=2,
mm_vocab_size=0,  # vocab size for multimodal special tokens
max_text_id=None,
use_temporal_conv=True,
moe_use_size_all2all=False,
moe_num_attn_experts=False,
moe_dense_experts_token_type_id: int = 3,
moe_use_hard_gate: bool = True,
moe_fuse_experts: bool = False,
moe_use_token_type_bias: bool = False,
disable_ffn_model_parallel=False,
fuse_attn_ffn=True,
rope_3d=True,
freq_allocation=20,
using_precision_check=False,
use_recompute_resampler=False,
resampler_fuse_rms_norm=False,
moe_layer_feed_fake_token=False,
moe_num_experts=0,
**kwargs,
):
super().__init__(**kwargs)
self.vision_config = DFNRopeVisionTransformerConfig(
**vision_config) if vision_config else None
self.im_patch_id = im_patch_id
self.pixel_hidden_size = pixel_hidden_size
self.modality_detach = modality_detach
self.temporal_conv_size = temporal_conv_size
self.spatial_conv_size = spatial_conv_size
self.mm_vocab_size = mm_vocab_size
self.max_text_id = max_text_id
self.use_temporal_conv = use_temporal_conv
self.moe_use_size_all2all = moe_use_size_all2all
self.moe_num_attn_experts = moe_num_attn_experts
self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
self.moe_use_hard_gate = moe_use_hard_gate
self.moe_fuse_experts = moe_fuse_experts
self.moe_use_token_type_bias = moe_use_token_type_bias
self.disable_ffn_model_parallel = disable_ffn_model_parallel
self.fuse_attn_ffn = fuse_attn_ffn
self.rope_3d = rope_3d
self.freq_allocation = freq_allocation
self.using_precision_check = using_precision_check
self.use_recompute_resampler = use_recompute_resampler
self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
self.moe_num_experts = moe_num_experts
@property
def multimodel_experts(self) -> bool:
"""是否有多种类型的experts."""
return isinstance(self.moe_num_experts,
(tuple, list)) and len(self.moe_num_experts) > 1
@property
def use_moe(self) -> bool:
"""
Check if model is using MoE architecture.
Returns:
bool: True if moe_num_experts > 0, False otherwise
"""
return sum(
self.moe_num_experts
) > 0 if self.multimodel_experts else self.moe_num_experts > 0
def to_dict(self, saving_file=False):
"""to_dict"""
output = copy.deepcopy(self.__dict__)
if self.vision_config:
output["vision_config"] = (
self.vision_config.to_diff_dict() if isinstance(
self.vision_config,
(DFNRopeVisionTransformerConfig)) else self.vision_config)
output["model_type"] = self.__class__.model_type
return output
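# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how `multimodel_experts`
# and `use_moe` evaluate for a scalar versus a per-modality list of expert
# counts. The two helpers below re-implement the properties standalone,
# purely for demonstration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    def multimodel_experts(moe_num_experts):
        return isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1

    def use_moe(moe_num_experts):
        return (sum(moe_num_experts) > 0
                if multimodel_experts(moe_num_experts) else moe_num_experts > 0)

    print(multimodel_experts(0), use_moe(0))                 # False False (dense model)
    print(multimodel_experts(64), use_moe(64))               # False True  (single expert pool)
    print(multimodel_experts([64, 64]), use_moe([64, 64]))   # True True   (text + image experts)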

View File

@@ -0,0 +1,22 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .configuration import DFNRopeVisionTransformerConfig
from .modeling import DFNRopeVisionTransformerPretrainedModel
__all__ = [
'DFNRopeVisionTransformerConfig', 'DFNRopeVisionTransformerPretrainedModel'
]

View File

@@ -0,0 +1,287 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import math
from collections import OrderedDict
import paddle
import paddle.nn.functional as F
from paddle import Tensor, nn
class NewGELUActivation(nn.Layer):
"""
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return (0.5 * input * (1.0 + paddle.tanh(
math.sqrt(2.0 / math.pi) *
(input + 0.044715 * paddle.pow(input, 3.0)))))
class GELUActivation(nn.Layer):
"""
Original Implementation of the GELU activation function in Google BERT repo when initially created. For
information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
def __init__(self, use_gelu_python: bool = False):
"""_summary_
Args:
use_gelu_python (bool, optional): _description_. Defaults to False.
"""
super().__init__()
if use_gelu_python:
self.act = self._gelu_python
else:
self.act = nn.functional.gelu
def _gelu_python(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return self.act(input)
class FastGELUActivation(nn.Layer):
"""
Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return 0.5 * input * (1.0 +
paddle.tanh(input * 0.7978845608 *
(1.0 + 0.044715 * input * input)))
class QuickGELUActivation(nn.Layer):
"""
Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return input * F.sigmoid(1.702 * input)
class ClippedGELUActivation(nn.Layer):
"""
Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes, as
it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to
https://arxiv.org/abs/2004.09602.
Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
"""
def __init__(self, min: float, max: float):
if min > max:
raise ValueError(
f"min should be < max (got min: {min}, max: {max})")
super().__init__()
self.min = min
self.max = max
def forward(self, x: Tensor) -> Tensor:
"""_summary_
Args:
x (Tensor): _description_
Returns:
Tensor: _description_
"""
return paddle.clip(gelu(x), self.min, self.max)
class SiLUActivation(nn.Layer):
"""
See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
later.
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return F.silu(input)
class MishActivation(nn.Layer):
"""
See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
visit the official repository for the paper: https://github.com/digantamisra98/Mish
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return F.mish(input)
class LinearActivation(nn.Layer):
"""
Applies the linear activation function, i.e. forwarding input directly to output.
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return input
class ClassInstantier(OrderedDict):
"""_summary_
Args:
OrderedDict (_type_): _description_
"""
def __getitem__(self, key):
"""_summary_
Args:
key (_type_): _description_
Returns:
_type_: _description_
"""
content = super().__getitem__(key)
cls, kwargs = content if isinstance(content, tuple) else (content, {})
return cls(**kwargs)
ACT2CLS = {
"gelu": GELUActivation,
"gelu_10": (ClippedGELUActivation, {
"min": -10,
"max": 10
}),
"gelu_fast": FastGELUActivation,
"gelu_new": NewGELUActivation,
"gelu_python": (GELUActivation, {
"use_gelu_python": True
}),
"linear": LinearActivation,
"mish": MishActivation,
"quick_gelu": QuickGELUActivation,
"relu": nn.ReLU,
"relu6": nn.ReLU6,
"sigmoid": nn.Sigmoid,
"silu": SiLUActivation,
"swish": SiLUActivation,
"tanh": nn.Tanh,
}
ACT2FN = ClassInstantier(ACT2CLS)
def get_activation(activation_string):
"""_summary_
Args:
activation_string (_type_): _description_
Raises:
KeyError: _description_
Returns:
_type_: _description_
"""
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError(
f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
)
# For backwards compatibility with: from activations import gelu_python
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")

View File

@@ -0,0 +1,70 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from paddleformers.transformers.configuration_utils import PretrainedConfig
__all__ = [
"DFNRopeVisionTransformerConfig",
]
class DFNRopeVisionTransformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Ernie-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "DFNRope_vision_transformer"
def __init__(
self,
depth=32,
embed_dim=1280,
hidden_size=3584,
hidden_act="quick_gelu",
mlp_ratio=4,
num_heads=16,
in_channels=3,
patch_size=14,
spatial_merge_size=2,
attn_implementation="eager", # new added
pp_data_balance=False,
recompute=False,
attn_sep=False,
vit_first_fwd_bsz=128,
vit_num_recompute_layers=10000,
**kwargs,
):
super().__init__(**kwargs)
self.depth = depth
self.embed_dim = embed_dim
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.mlp_ratio = mlp_ratio
self.num_heads = num_heads
self.in_channels = in_channels
self.patch_size = patch_size
self.spatial_merge_size = spatial_merge_size
self.attn_implementation = attn_implementation
self.pp_data_balance = pp_data_balance
self.recompute = recompute
self.attn_sep = attn_sep
self.vit_first_fwd_bsz = vit_first_fwd_bsz
self.vit_num_recompute_layers = vit_num_recompute_layers
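# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file): instantiate the vision
# config with its defaults and override a couple of fields. The overridden
# values here are arbitrary examples.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    cfg = DFNRopeVisionTransformerConfig(depth=4, num_heads=8)
    print(cfg.model_type, cfg.depth, cfg.num_heads, cfg.patch_size)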

View File

@@ -0,0 +1,732 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from functools import partial
import numpy as np
import paddle
import paddle.distributed as dist
import paddle.nn.functional as F
from paddle import nn
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel import (ColumnParallelLinear,
RowParallelLinear)
from paddle.distributed.fleet.utils import recompute
from paddle.nn.functional.flash_attention import \
flash_attn_unpadded as flash_attn_varlen_func
from paddleformers.transformers.model_utils import PretrainedModel
from .activation import ACT2FN
from .configuration import DFNRopeVisionTransformerConfig
def get_hcg():
"""
Get the hybrid communication group.
Args:
    None
Returns:
    The hybrid communication group.
"""
return fleet.get_hybrid_communicate_group()
class _AllToAll(paddle.autograd.PyLayer):
@staticmethod
def forward(
ctx,
input,
group,
output_split_sizes=None,
input_split_sizes=None,
):
"""
All-to-all communication in the group.
Args:
ctx (Any): Context object.
input (Tensor): Input tensor.
group (Group): The group object.
Returns:
Tensor: Output tensor.
"""
ctx.group = group
ctx.input_split_sizes = input_split_sizes
ctx.output_split_sizes = output_split_sizes
# return input
if dist.get_world_size(group) <= 1:
return input
if input_split_sizes is None and output_split_sizes is None:
output = paddle.empty_like(input)
task = dist.stream.alltoall_single(output, input, None, None,
group, True, True)
task.wait()
else:
out_sizes = [sum(output_split_sizes)]
out_sizes.extend(input.shape[1:])
output = paddle.empty(out_sizes, dtype=input.dtype)
task = dist.stream.alltoall_single(output,
input,
output_split_sizes,
input_split_sizes,
group,
sync_op=False)
task.wait()
return output
@staticmethod
def backward(ctx, *grad_output):
"""
all-to-all backward
"""
# return grad_output
if ctx.input_split_sizes is None and ctx.output_split_sizes is None:
return _AllToAll.apply(*grad_output, ctx.group)
else:
return _AllToAll.apply(*grad_output, ctx.group,
ctx.input_split_sizes,
ctx.output_split_sizes)
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return paddle.concat([-x2, x1], axis=-1) # shape is the same as x
def apply_rotary_pos_emb_vision(tensor: paddle.Tensor,
freqs: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
tensor (paddle.Tensor): _description_
freqs (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
orig_dtype = tensor.dtype
with paddle.amp.auto_cast(False):
tensor = tensor.astype(dtype="float32")
cos = freqs.cos()
sin = freqs.sin()
cos = cos.unsqueeze(1).tile(
repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32")
sin = sin.unsqueeze(1).tile(
repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32")
output = tensor * cos + rotate_half(tensor) * sin
output = paddle.cast(output, orig_dtype)
return output
def qkv_reshard_head(tensor, group):
"""
将qkv在seq维度拼接后一起做切分维度的转换
"""
parallelism = group.nranks
qkv_seqlen, head_num, head_dim = tensor.shape
tensor = tensor.transpose(perm=[1, 0, 2]).contiguous()
out = _AllToAll.apply(tensor, group)
out = paddle.split(out, parallelism, axis=0)
output_q = []
output_k = []
output_v = []
for output_i in out:
outout = output_i.transpose(perm=[1, 0, 2]).contiguous()
output = paddle.split(outout, 3, axis=0)
output_q.append(output[0])
output_k.append(output[1])
output_v.append(output[2])
q = paddle.concat(output_q, axis=0)
k = paddle.concat(output_k, axis=0)
v = paddle.concat(output_v, axis=0)
return q, k, v
class VisionFlashAttention2(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self,
dim: int,
num_heads: int = 16,
tensor_parallel_degree: int = 1) -> None:
super().__init__()
self.num_heads = num_heads
self.tensor_parallel_degree = tensor_parallel_degree
if tensor_parallel_degree > 1:
self.qkv = ColumnParallelLinear(
dim,
dim * 3,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True,
fuse_matmul_bias=True,
gather_output=False,
)
self.proj = RowParallelLinear(
dim,
dim,
mp_group=fleet.get_hybrid_communicate_group(
).get_model_parallel_group(),
input_is_parallel=True,
has_bias=True)
else:
self.qkv = nn.Linear(dim, dim * 3, bias_attr=True)
self.proj = nn.Linear(dim, dim)
self.head_dim = dim // num_heads  # must be added manually (used for the softmax scale)
def forward(
self,
hidden_states: paddle.Tensor,
cu_seqlens: paddle.Tensor,
rotary_pos_emb: paddle.Tensor = None,
attn_sep=False,
) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
cu_seqlens (paddle.Tensor): _description_
rotary_pos_emb (paddle.Tensor, optional): _description_. Defaults to None.
Returns:
paddle.Tensor: _description_
"""
seq_length = hidden_states.shape[0]
qkv = self.qkv(hidden_states).reshape(
[seq_length, 3, self.num_heads // self.tensor_parallel_degree,
-1]).transpose(perm=[1, 0, 2, 3])
q, k, v = qkv.unbind(axis=0)
if attn_sep:
hcg = get_hcg()
mp_group = hcg.get_model_parallel_group()
qkv = paddle.concat([q, k, v], axis=0)
q, k, v = qkv_reshard_head(qkv, mp_group)
seq_length = q.shape[0]
q = apply_rotary_pos_emb_vision(q.unsqueeze(axis=0),
rotary_pos_emb).squeeze(axis=0)
k = apply_rotary_pos_emb_vision(k.unsqueeze(axis=0),
rotary_pos_emb).squeeze(axis=0)
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
softmax_scale = self.head_dim**-0.5  # TODO: needs to be set manually
attn_output = (
flash_attn_varlen_func( # flash_attn_unpadded
q,  # float32 is not supported
k,
v,
cu_seqlens,
cu_seqlens,
max_seqlen,
max_seqlen,
scale=softmax_scale,  # TODO: needs to be set manually
)[0].squeeze(0).reshape([seq_length, -1]))
if attn_sep:
out = _AllToAll.apply(attn_output, mp_group)
out = paddle.split(out, mp_group.nranks, axis=0)
attn_output = paddle.concat(out, axis=1)
attn_output = attn_output.astype(paddle.float32)
attn_output = self.proj(attn_output)
return attn_output
class PatchEmbed(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(
self,
patch_size: int = 14,
in_channels: int = 3,
embed_dim: int = 1152,
) -> None:
super().__init__()
self.patch_size = patch_size
self.in_channels = in_channels
self.embed_dim = embed_dim
self.proj = nn.Linear(in_channels * patch_size * patch_size,
embed_dim,
bias_attr=False)
def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
target_dtype = self.proj.weight.dtype
hidden_states = self.proj(
paddle.cast(hidden_states, dtype=target_dtype))
return hidden_states
class VisionMlp(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self,
dim: int,
hidden_dim: int,
hidden_act: str,
tensor_parallel_degree: int = 1) -> None:
super().__init__()
self.tensor_parallel_degree = tensor_parallel_degree
if self.tensor_parallel_degree > 1:
self.fc1 = ColumnParallelLinear(
dim,
hidden_dim,
mp_group=fleet.get_hybrid_communicate_group(
).get_model_parallel_group(),
gather_output=False,
has_bias=True)
self.fc2 = RowParallelLinear(
hidden_dim,
dim,
mp_group=fleet.get_hybrid_communicate_group(
).get_model_parallel_group(),
input_is_parallel=True,
has_bias=True)
else:
self.fc1 = nn.Linear(dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, dim)
self.act = ACT2FN[hidden_act]
def forward(self, x) -> paddle.Tensor:
"""_summary_
Args:
x (_type_): _description_
Returns:
paddle.Tensor: _description_
"""
return self.fc2(self.act(self.fc1(x)))
class VisionRotaryEmbedding(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self, dim: int, theta: float = 10000.0) -> None:
"""_summary_
Args:
dim (int): _description_
theta (float, optional): _description_. Defaults to 10000.0.
"""
super().__init__()
self.inv_freq = 1.0 / theta**(
paddle.arange(start=0, end=dim, step=2, dtype="float32") / dim)
def forward(self, seqlen: int) -> paddle.Tensor:
"""_summary_
Args:
seqlen (int): _description_
Returns:
paddle.Tensor: _description_
"""
seq = paddle.arange(seqlen).cast(self.inv_freq.dtype)
freqs = paddle.outer(x=seq, y=self.inv_freq)
return freqs
class DFNRopeVisionBlock(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self, config, attn_implementation: str = "sdpa") -> None:
"""_summary_
Args:
config (_type_): _description_
attn_implementation (str, optional): _description_. Defaults to "sdpa".
"""
super().__init__()
self.norm1 = nn.LayerNorm(config.embed_dim, epsilon=1e-6)
self.norm2 = nn.LayerNorm(config.embed_dim, epsilon=1e-6)
mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
self.attn = VisionFlashAttention2(
config.embed_dim,
num_heads=config.num_heads,
tensor_parallel_degree=config.tensor_parallel_degree)
self.mlp = VisionMlp(
dim=config.embed_dim,
hidden_dim=mlp_hidden_dim,
hidden_act=config.hidden_act,
tensor_parallel_degree=config.tensor_parallel_degree)
self.config = config
def forward(self,
hidden_states,
cu_seqlens,
rotary_pos_emb,
attn_sep=False) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (_type_): _description_
cu_seqlens (_type_): _description_
rotary_pos_emb (_type_): _description_
Returns:
paddle.Tensor: _description_
"""
hidden_states = hidden_states + self.attn(
self.norm1(hidden_states),
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
attn_sep=attn_sep,
)
hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
return hidden_states
class PatchMerger(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self,
dim: int,
context_dim: int,
spatial_merge_size: int = 2) -> None:
"""_summary_
Args:
dim (int): _description_
context_dim (int): _description_
spatial_merge_size (int, optional): _description_. Defaults to 2.
"""
super().__init__()
self.hidden_size = context_dim * (spatial_merge_size**2)
self.ln_q = nn.LayerNorm(context_dim, epsilon=1e-6)
self.mlp = nn.Sequential(
nn.Linear(self.hidden_size, self.hidden_size),
nn.GELU(),
nn.Linear(self.hidden_size, dim),
)
def forward(self, x: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
x (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
x = self.mlp(self.ln_q(x).reshape([-1, self.hidden_size]))
return x
class DFNRopeVisionTransformerPretrainedModel(PretrainedModel):
"""_summary_
Args:
PretrainedModel (_type_): _description_
Returns:
_type_: _description_
"""
config_class = DFNRopeVisionTransformerConfig
def __init__(self, config) -> None:
super().__init__(config)
self.spatial_merge_size = config.spatial_merge_size
self.patch_embed = PatchEmbed(
patch_size=config.patch_size,
in_channels=config.in_channels,
embed_dim=config.embed_dim,
)
head_dim = config.embed_dim // config.num_heads
self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
self.blocks = nn.LayerList(
[DFNRopeVisionBlock(config) for _ in range(config.depth)])
assert (
config.hidden_size == config.embed_dim
), "in DFNRope, vit's config.hidden must be equal to config.embed_dim"
# self.merger = PatchMerger(dim=config.hidden_size, context_dim=config.embed_dim)
self.ln = nn.LayerNorm(config.hidden_size, epsilon=1e-6)
def get_dtype(self) -> paddle.dtype:
"""_summary_
Returns:
paddle.dtype: _description_
"""
return self.blocks[0].mlp.fc2.weight.dtype
def get_name_mappings_to_training(self, ):
""" get_name_mappings_to_training """
infer_to_train = {}
# vit train names
vit_names = [
"vision_model.patch_embed.proj.weight", "vision_model.ln.weight",
"vision_model.ln.bias"
]
vit_layer = 32
for layer_idx in range(vit_layer):
vit_names.append(f"vision_model.blocks.{layer_idx}.norm1.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.norm1.bias")
vit_names.append(f"vision_model.blocks.{layer_idx}.norm2.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.norm2.bias")
vit_names.append(
f"vision_model.blocks.{layer_idx}.attn.qkv.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.attn.qkv.bias")
vit_names.append(
f"vision_model.blocks.{layer_idx}.attn.proj.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.attn.proj.bias")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc1.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc1.bias")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc2.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc2.bias")
for train_name in vit_names:
infer_to_train[train_name] = train_name
return infer_to_train
def rot_pos_emb(self, grid_thw, num_pad=0):
"""_summary_
Args:
grid_thw (_type_): _description_
Returns:
_type_: _description_
"""
pos_ids = []
grid_hw_array = np.array(grid_thw, dtype=np.int64)
for t, h, w in grid_hw_array:
hpos_ids = np.arange(h).reshape(-1, 1)
hpos_ids = np.tile(hpos_ids, (1, w))
hpos_ids = hpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
hpos_ids = np.transpose(hpos_ids, (0, 2, 1, 3))
hpos_ids = hpos_ids.flatten()
wpos_ids = np.arange(w).reshape(1, -1)
wpos_ids = np.tile(wpos_ids, (h, 1))
wpos_ids = wpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
wpos_ids = np.transpose(wpos_ids, (0, 2, 1, 3))
wpos_ids = wpos_ids.flatten()
stacked_ids = np.stack([hpos_ids, wpos_ids], axis=-1)
tiled_ids = np.tile(stacked_ids, (t, 1))
pos_ids.append(tiled_ids)
pos_ids = np.concatenate(pos_ids, axis=0)
if num_pad > 0:
pos_ids = np.concatenate(
[pos_ids, np.zeros((num_pad, 2), dtype=pos_ids.dtype)])
max_grid_size = np.amax(grid_hw_array[:, 1:])
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(start_axis=1)
return rotary_pos_emb
def forward(self,
hidden_states: paddle.Tensor,
grid_thw: paddle.Tensor,
num_pad=0) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
grid_thw (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
hidden_states = self.patch_embed(hidden_states)
rotary_pos_emb = self.rot_pos_emb(grid_thw, num_pad=num_pad)
cu_seqlens = paddle.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
grid_thw[:, 0]).cumsum(
axis=0, dtype="int32")
if num_pad > 0:
cu_seqlens = F.pad(cu_seqlens, (1, 1), value=0)
cu_seqlens[-1] = cu_seqlens[-2] + num_pad
else:
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
attn_sep = getattr(self.config, "attn_sep", False)
vit_num_recompute_layers = getattr(self.config,
"vit_num_recompute_layers",
self.config.depth)
for idx, blk in enumerate(self.blocks):
if self.config.recompute and self.training and idx < vit_num_recompute_layers:
hidden_states = recompute(blk, hidden_states, cu_seqlens,
rotary_pos_emb, attn_sep)
else:
hidden_states = blk(
hidden_states,
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
attn_sep=attn_sep,
)
# ret = self.merger(hidden_states)
# ret = hidden_states
ret = self.ln(hidden_states)  # apply the final layer norm
return ret
def extract_feature(self, hidden_states: paddle.Tensor,
grid_thw: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
grid_thw (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
return self.forward(hidden_states, grid_thw)
@classmethod
def _get_tensor_parallel_mappings(cls, config, is_split=True):
"""
Build the tensor parallel split/merge mappings for the vision transformer.
"""
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
)
vision_config = config.vision_config
def split_qkv_weight(x):
head_dim = vision_config.hidden_size // vision_config.num_heads
x = x.reshape([
vision_config.hidden_size, 3, vision_config.num_heads, head_dim
])
x = np.split(x, vision_config.tensor_parallel_degree,
axis=-2)[vision_config.tensor_parallel_rank]
x = x.reshape([vision_config.hidden_size, -1])
return x
def split_qkv_bias(x):
head_dim = vision_config.hidden_size // vision_config.num_heads
x = x.reshape([3, vision_config.num_heads, head_dim])
x = np.split(x, vision_config.tensor_parallel_degree,
axis=-2)[vision_config.tensor_parallel_rank]
x = x.reshape([-1])
return x
def get_tensor_parallel_split_mappings(depth):
final_actions = {}
base_actions = {
"vision_model.blocks.0.attn.proj.weight":
partial(fn, is_column=False),
"vision_model.blocks.0.fc1.weight":
partial(fn, is_column=True),
"vision_model.blocks.0.fc1.bias":
partial(fn, is_column=True),
"vision_model.blocks.0.fc2.weight":
partial(fn, is_column=False),
"vision_model.blocks.0.qkv.weight":
split_qkv_weight,
"vision_model.blocks.0.qkv.bias":
split_qkv_bias,
}
for key, action in base_actions.items():
if "blocks.0." in key:
for i in range(depth):
newkey = key.replace("blocks.0.", f"blocks.{i}.")
final_actions[newkey] = action
return final_actions
mappings = get_tensor_parallel_split_mappings(vision_config.depth)
return mappings
def set_state_dict(self, state_dict, *args, **kwargs):
"""_summary_
Args:
state_dict (_type_): _description_
"""
super().set_state_dict(state_dict, *args, **kwargs)
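# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the rotary table built
# by VisionRotaryEmbedding has shape [seqlen, dim // 2], and rotate_half
# negates and swaps the two halves of the last dimension.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rope = VisionRotaryEmbedding(dim=8)
    freqs = rope(seqlen=4)
    print(freqs.shape)                           # [4, 4]
    x = paddle.arange(8, dtype="float32").reshape([1, 8])
    print(rotate_half(x).numpy())                # [[-4. -5. -6. -7.  0.  1.  2.  3.]]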

View File

@@ -0,0 +1,130 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import distributed as dist
from paddle.distributed import fleet
from paddle.distributed.fleet.utils.sequence_parallel_utils import \
RowSequenceParallelLinear
__all__ = [
"scatter_axis", "all_gather_group", "reduce_scatter_group",
"RowSequenceParallelLinear"
]
def scatter_axis(input, group=None, axis=0):
"""
Evenly slice `input` along `axis` (dimension 0 by default) across the
model-parallel group. Note that this API is unrelated to `distributed.scatter`.
"""
if group is None:
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_model_parallel_group()
parallelism = group.nranks
if parallelism == 1:
return input.clone()
rank = group.rank
seq_len = input.shape[axis]
assert seq_len % parallelism == 0, (
f"Input sequence length {seq_len} can't be divided exactly"
f" by sequence parallelism {parallelism}")
interval = seq_len // parallelism
input = paddle.slice(input,
axes=[axis],
starts=[interval * rank],
ends=[interval * (rank + 1)])
# slice use stride, so we maintain the memory of whole input, use assign to free the whole input
# which can avoid OOM.
input = paddle.assign(input)
return input
def all_gather_group(input, group=None, axis=0):
"""Perform collective all-gather operation across a process group with axis control.
Functional Behavior:
- Aggregates input tensors from all processes in the specified group
- Supports concatenation along arbitrary dimensions (axis parameter)
- Optimizes for axis=0 via direct shape expansion to avoid concatenation overhead
Args:
input (Tensor): Local tensor to be gathered (shape: [..., D, ...])
group (ProcessGroup): Communication group (defaults to model parallel group)
axis (int): Concatenation dimension (default=0)
Returns:
Tensor: Concatenated tensor combining inputs from all processes:
- When axis=0: shape [D*N, ...] (N = group size)
- Otherwise: shape [..., D*N, ...] along specified axis
"""
if group is None:
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_model_parallel_group()
parallelism = group.nranks
if parallelism == 1:
return input.clone()
output_shape = input.shape
if axis == 0:
output_shape[axis] = output_shape[axis] * parallelism
output = paddle.empty(shape=output_shape, dtype=input.dtype)
dist.stream.all_gather(output,
input,
group=group,
use_calc_stream=True)
return output
outputs = [
paddle.empty(output_shape, dtype=input.dtype)
for _ in range(parallelism)
]
dist.stream.all_gather(outputs, input, group=group, use_calc_stream=True)
output = paddle.concat(outputs, axis=axis)
return output
def reduce_scatter_group(input, group=None):
"""Perform reduce-scatter collective operation across a process group.
Functional Behavior:
- Aggregates (sums) input tensors across all processes in the group
- Scatters the reduced result equally to all participants
- Operates along the first dimension (axis=0) of the input tensor
Args:
input (Tensor): Local tensor to reduce (shape: [N*K, ...] where N=group_size)
group (ProcessGroup): Communication group (defaults to model parallel group)
Returns:
Tensor: Scattered portion of reduced tensor with shape [K, ...]
"""
if group is None:
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_model_parallel_group()
parallelism = group.nranks
if parallelism == 1:
return input.clone()
output_shape = input.shape
assert (
input.shape[0] % parallelism == 0
), f"Input sequence length {input.shape[0]} can't be divided exactly by sequence parallelism {parallelism}"
output_shape[0] = output_shape[0] // parallelism
output = paddle.empty(shape=output_shape, dtype=input.dtype)
dist.stream.reduce_scatter(output,
input,
op=dist.ReduceOp.SUM,
group=group,
use_calc_stream=True)
return output
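# ---------------------------------------------------------------------------
# Single-process illustration (not part of the original file) of what
# `scatter_axis` computes for one rank: an even slice of the input along the
# chosen axis. A real run would derive `parallelism` and `rank` from the
# model-parallel group instead of hard-coding them.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    x = paddle.arange(12, dtype="float32").reshape([4, 3])
    parallelism, rank, axis = 2, 1, 0    # hypothetical group of 2, this is rank 1
    interval = x.shape[axis] // parallelism
    shard = paddle.slice(x, axes=[axis], starts=[interval * rank],
                         ends=[interval * (rank + 1)])
    print(shard.numpy())                 # rows 2 and 3 of x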

View File

@@ -0,0 +1,511 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Optional, Union
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.models.ernie4_5_moe import (Ernie4_5_Attention,
Ernie4_5_MLP)
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import (extract_text_token_output,
text_image_gather_scatter,
text_image_index_out)
from fastdeploy.worker.forward_meta import ForwardMeta
class Ernie4_5_VLMLP(Ernie4_5_MLP):
pass
class Ernie4_5_VLAttention(Ernie4_5_Attention):
pass
@dataclass
class VLMoEMeta:
image_input: Optional[paddle.Tensor] = None
text_input: Optional[paddle.Tensor] = None
text_index: Optional[paddle.Tensor] = None
image_index: Optional[paddle.Tensor] = None
token_type_ids: Optional[paddle.Tensor] = None
class Ernie4_5_VLMoE(nn.Layer):
def __init__(self, fd_config: FDConfig, layer_id: int,
prefix: str) -> None:
super().__init__()
moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
if isinstance(moe_layer_start_index, int):
text_moe_layer_start_index = moe_layer_start_index
image_moe_layer_start_index = moe_layer_start_index
else:
text_moe_layer_start_index = moe_layer_start_index[0]
image_moe_layer_start_index = moe_layer_start_index[1]
moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
if moe_layer_end_index is None:
text_moe_layer_end_index = fd_config.model_config.num_layers
image_moe_layer_end_index = fd_config.model_config.num_layers
elif isinstance(moe_layer_end_index, int):
text_moe_layer_end_index = moe_layer_end_index
image_moe_layer_end_index = moe_layer_end_index
else:
text_moe_layer_end_index = moe_layer_end_index[0]
image_moe_layer_end_index = moe_layer_end_index[1]
assert text_moe_layer_start_index <= text_moe_layer_end_index
if layer_id >= text_moe_layer_start_index and layer_id <= text_moe_layer_end_index:
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.weight",
}
self.mlp_text = FusedMoE(
fd_config=fd_config,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size[0],
num_experts=fd_config.moe_config.num_experts[0],
expert_id_offset=0,
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
moe_tag="Text",
weight_key_map=weight_key_map,
)
self.mlp_text.extract_gate_correction_bias = self.extract_gate_correction_bias_text
else:
self.mlp_text = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}",
)
assert image_moe_layer_start_index <= image_moe_layer_end_index
if layer_id >= image_moe_layer_start_index and layer_id <= image_moe_layer_end_index:
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight_1",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.weight",
}
self.mlp_image = FusedMoE(
fd_config=fd_config,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size[1],
num_experts=fd_config.moe_config.num_experts[1],
expert_id_offset=fd_config.moe_config.num_experts[0],
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
moe_tag="Image",
weight_key_map=weight_key_map,
)
self.mlp_image.extract_gate_correction_bias = self.extract_gate_correction_bias_image
else:
self.mlp_image = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}",
)
self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
if self.num_shared_experts > 0:
self.share_experts = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=self.num_shared_experts *
fd_config.moe_config.moe_intermediate_size[0],
prefix=f"{prefix}.shared_experts",
)
def extract_gate_correction_bias_text(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict[gate_correction_bias_key]).astype("float32")
return gate_correction_bias_tensor[0].unsqueeze(0)
def extract_gate_correction_bias_image(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict[gate_correction_bias_key]).astype("float32")
return gate_correction_bias_tensor[1].unsqueeze(0)
def load_state_dict(self, state_dict):
self.mlp_text.load_state_dict(state_dict)
self.mlp_image.load_state_dict(state_dict)
if self.mlp_text.moe_use_gate_correction_bias:
state_dict.pop(self.mlp_text.gate_correction_bias_key)
if self.num_shared_experts > 0:
self.share_experts.load_state_dict(state_dict)
def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta):
if self.num_shared_experts > 0:
share_experts_out = self.share_experts(hidden_states)
if vl_moe_meta.image_input is not None:
text_image_gather_scatter(
hidden_states,
vl_moe_meta.text_input,
vl_moe_meta.image_input,
vl_moe_meta.token_type_ids,
vl_moe_meta.text_index,
vl_moe_meta.image_index,
True,
)
text_out = self.mlp_text(vl_moe_meta.text_input)
image_out = self.mlp_image(vl_moe_meta.image_input)
text_image_gather_scatter(
hidden_states,
text_out,
image_out,
vl_moe_meta.token_type_ids,
vl_moe_meta.text_index,
vl_moe_meta.image_index,
False,
)
else:
hidden_states = self.mlp_text(hidden_states)
if self.num_shared_experts > 0:
hidden_states += share_experts_out
return hidden_states
class Ernie4_5_VLDecoderLayer(nn.Layer):
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
if isinstance(moe_layer_start_index, list):
min_moe_layer_start_index = min(moe_layer_start_index)
else:
min_moe_layer_start_index = moe_layer_start_index
max_moe_layer_end_index = fd_config.model_config.num_layers
if fd_config.moe_config.moe_layer_end_index is not None:
moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
if isinstance(moe_layer_start_index, list):
max_moe_layer_end_index = max(moe_layer_end_index)
else:
max_moe_layer_end_index = moe_layer_end_index
self.self_attn = Ernie4_5_VLAttention(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
assert min_moe_layer_start_index <= max_moe_layer_end_index
if (fd_config.moe_config.num_experts is not None
and layer_id >= min_moe_layer_start_index
and layer_id <= max_moe_layer_end_index):
self.mlp = Ernie4_5_VLMoE(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.mlp",
)
else:
self.mlp = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.post_attention_layernorm",
)
def load_state_dict(self, state_dict):
self.self_attn.load_state_dict(state_dict)
self.mlp.load_state_dict(state_dict)
self.input_layernorm.load_state_dict(state_dict)
self.post_attention_layernorm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
residual: paddle.Tensor = None,
vl_moe_meta: VLMoEMeta = None,
):
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
hidden_states=hidden_states,
forward_meta=forward_meta,
)
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
if isinstance(self.mlp, Ernie4_5_VLMoE):
hidden_states = self.mlp(hidden_states, vl_moe_meta)
else:
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
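# A small sketch of the pre-norm residual convention used by this decoder
# layer, assuming the fused RMSNorm takes (hidden_states, residual), adds them,
# and returns (normalized, new_residual); illustration only, with made-up names:
import numpy as np

def _toy_add_rms_norm(hidden_states, residual, weight, eps=1e-5):
    residual = hidden_states if residual is None else hidden_states + residual
    variance = np.mean(np.square(residual), axis=-1, keepdims=True)
    return residual / np.sqrt(variance + eps) * weight, residual

_h = np.random.randn(4, 8).astype("float32")
_w = np.ones(8, dtype="float32")
_normed, _residual = _toy_add_rms_norm(_h, None, _w)        # input_layernorm step
_normed2, _residual = _toy_add_rms_norm(_h, _residual, _w)  # post_attention_layernorm step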
class Ernie4_5_VLModel(nn.Layer):
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Ernie4_5_VLModel class.
Args:
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
self.im_patch_id = fd_config.moe_config.im_patch_id
self._dtype = fd_config.model_config.dtype
fd_config.model_config.prefix_name = "ernie"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype(),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.hidden_layers = [
Ernie4_5_VLDecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
]
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.hidden_layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
image_features: paddle.Tensor,
forward_meta: ForwardMeta,
):
text_input = None
image_input = None
text_index = None
image_index = None
image_token_num = 0
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
# -----------------------
image_mask = ids_remove_padding == self.im_patch_id
token_type_ids = image_mask.cast("int32")
token_num = hidden_states.shape[0]
image_token_num = paddle.count_nonzero(token_type_ids).cast("int32")
text_token_num = ((token_num - image_token_num) if
(token_num - image_token_num) > 0 else 1)
if image_mask.any():
hidden_states[image_mask] = image_features.cast(self._dtype)
text_input = paddle.full(
shape=[text_token_num, hidden_states.shape[1]],
fill_value=1,
dtype=self._dtype)
image_input = paddle.full(
shape=[image_token_num, hidden_states.shape[1]],
fill_value=1,
dtype=self._dtype)
text_index = paddle.zeros_like(token_type_ids)
image_index = paddle.zeros_like(token_type_ids)
text_image_index_out(token_type_ids, text_index, image_index)
vl_moe_meta = VLMoEMeta(
text_input=text_input,
image_input=image_input,
text_index=text_index,
image_index=image_index,
token_type_ids=token_type_ids,
)
# -----------------------
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.hidden_layers[i](
forward_meta,
hidden_states,
residual,
vl_moe_meta,
)
hidden_states = hidden_states + residual
# -----------------------
hidden_states = hidden_states.cast("float32")
score_text = hidden_states
if image_input is not None:
token_type_ids = token_type_ids.reshape([-1])
text_pos_shifted = token_type_ids[:token_num] == 0
score_text = hidden_states[text_pos_shifted.reshape([-1])]
max_seq_len, max_seq_len_index = paddle.topk(
forward_meta.seq_lens_this_time.squeeze(-1), k=1)
hidden_states = extract_text_token_output(
max_seq_len,
max_seq_len_index.cast("int32"),
image_token_num,
forward_meta.seq_lens_this_time,
forward_meta.cu_seqlens_q,
score_text,
)[0].cast(self._dtype)
# -----------------------
out = self.norm(hidden_states)
return out
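# A minimal NumPy sketch of the image-token substitution done at the top of
# forward(): placeholder embeddings at positions equal to `im_patch_id` are
# overwritten with the projected visual features (ids and shapes below are
# made up for illustration):
import numpy as np

def _toy_fill_image_positions(ids, embeddings, image_features, im_patch_id):
    image_mask = ids == im_patch_id               # True where the id is the image placeholder
    token_type_ids = image_mask.astype(np.int32)  # 0 = text token, 1 = image token
    out = embeddings.copy()
    out[image_mask] = image_features              # one visual feature per image token
    return out, token_type_ids

_ids = np.array([5, 7, 7, 9])                     # with im_patch_id == 7, rows 1 and 2 are image tokens
_emb = np.zeros((4, 3), dtype="float32")
_img = np.ones((2, 3), dtype="float32")
_filled, _tt = _toy_fill_image_positions(_ids, _emb, _img, im_patch_id=7)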
class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
"""
Ernie4_5_VLMoeForConditionalGeneration
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Ernie4_5_VLMoeForConditionalGeneration, self).__init__(fd_config)
self.model = Ernie4_5_VLModel(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
return "Ernie4_5_VLMoeForConditionalGeneration"
@paddle.no_grad()
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray,
paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
if self.tie_word_embeddings:
self.lm_head.out_linear.weight.set_value(
self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
else:
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def forward(
self,
ids_remove_padding: paddle.Tensor,
image_features: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.model(ids_remove_padding, image_features,
forward_meta)
return hidden_states
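# A minimal sketch of the vocabulary masking in compute_logits above: ids in
# the padded tail of the vocabulary (>= ori_vocab_size) are forced to -inf so
# they can never be sampled (shapes below are made up for illustration):
import numpy as np

_logits = np.zeros((2, 10), dtype="float32")   # [batch, padded_vocab_size]
_ori_vocab_size = 8
_logits[:, _ori_vocab_size:] = -np.inf         # padding ids get zero probability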

View File

@@ -0,0 +1,399 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from copy import deepcopy
from functools import partial
import numpy as np
import paddle
from paddle import nn
from paddle.autograd import PyLayer
from paddle.distributed.fleet.utils import recompute
from fastdeploy.model_executor.layers.utils import _set_var_distributed
from fastdeploy.model_executor.models.ernie4_5_vl.dist_utils import (
RowSequenceParallelLinear, all_gather_group, reduce_scatter_group,
scatter_axis)
class ScatterOp(PyLayer):
"""
Each rank slices its own part (an even split) out of the **same** sequence.
The backward pass aggregates the gradients from all ranks, restoring the
synchronized model-parallel state. The inverse operation is `GatherOp`.
input: Tensor [S, *]
Note: this is unrelated to `distributed.scatter`.
"""
@staticmethod
def forward(ctx, input, axis=0, group=None):
"""fwd"""
ctx.axis = axis
ctx.group = group
return scatter_axis(input, axis=axis, group=ctx.group)
@staticmethod
def backward(ctx, grad):
return all_gather_group(grad, axis=ctx.axis, group=ctx.group)
class AllGatherOp(PyLayer):
"""
input shape: [s/n, b, h], n is mp parallelism
after forward shape: [s, b, h]
Behaves like `AllGather`: the backward pass aggregates gradients (via reduce-scatter), and the tensor is still in the model-parallel asynchronous state after the AllGather.
"""
@staticmethod
def forward(ctx, input, group=None):
"""fwd"""
ctx.group = group
return all_gather_group(input, group=group)
# grad shape: [s, b, h], n is mp parallelism
# after backward (reduce_scatter) shape: [s/n, b, h]
@staticmethod
def backward(ctx, grad):
return reduce_scatter_group(grad, group=ctx.group)
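# A single-process NumPy sketch of the ScatterOp / AllGatherOp round trip:
# scatter slices one shared sequence evenly across ranks, all-gather
# re-assembles it, and the backward passes mirror each other (no real
# communication group here; names are made up for illustration):
import numpy as np

def _toy_scatter(x, rank, world_size, axis=0):
    return np.split(x, world_size, axis=axis)[rank]

def _toy_all_gather(shards, axis=0):
    return np.concatenate(shards, axis=axis)

_x = np.arange(8).reshape(8, 1)                        # [S, H] with S divisible by world_size
_shards = [_toy_scatter(_x, r, 4) for r in range(4)]   # each rank keeps S/4 rows
assert np.array_equal(_toy_all_gather(_shards), _x)    # gather restores the full sequence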
def mark_as_sequence_parallel_parameter(parameter):
parameter.sequence_parallel = True
class RMSNorm(nn.Layer):
"""
Root Mean Square Layer Normalization (RMSNorm) implementation.
RMSNorm is a simplified version of LayerNorm that focuses on the root mean square of inputs,
omitting the mean-centering operation. This provides computational efficiency while maintaining
good performance.
"""
def __init__(self, config):
"""
Initialize RMSNorm layer.
Args:
config (ErnieConfig): Model configuration.
"""
super().__init__()
self.hidden_size = config.hidden_size
self.weight = paddle.create_parameter(
shape=[self.hidden_size],
dtype=paddle.get_default_dtype(),
default_initializer=nn.initializer.Constant(1.0),
)
self.variance_epsilon = config.rms_norm_eps
self.config = config
if config.sequence_parallel:
mark_as_sequence_parallel_parameter(self.weight)
def forward(self, hidden_states):
"""
Apply RMS normalization to input hidden states.
Args:
hidden_states (Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
Returns:
Tensor: Normalized output tensor of same shape as input
Note:
- Uses fused kernel if config.fuse_rms_norm is True for better performance
- Otherwise computes RMSNorm manually:
1. Compute variance of features
2. Apply reciprocal square root normalization
3. Scale by learned weight parameter
- Maintains original dtype for numerical stability during computation
"""
with paddle.amp.auto_cast(False):
variance = hidden_states.astype("float32").pow(2).mean(
-1, keepdim=True)
hidden_states = paddle.rsqrt(variance +
self.variance_epsilon) * hidden_states
return hidden_states.astype(self.weight.dtype) * self.weight
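# A NumPy reference for the manual branch above, which computes
# y = x / sqrt(mean(x**2) + eps) * weight in float32 before casting back
# (eps and shapes below are illustrative):
import numpy as np

def _toy_rms_norm(x, weight, eps=1e-6):
    variance = np.mean(np.square(x.astype("float32")), axis=-1, keepdims=True)
    return (x * (1.0 / np.sqrt(variance + eps))).astype(weight.dtype) * weight

_x = np.random.randn(4, 8).astype("float32")
_w = np.ones(8, dtype="float32")
_expected = _x / np.sqrt((_x * _x).mean(-1, keepdims=True) + 1e-6)
np.testing.assert_allclose(_toy_rms_norm(_x, _w), _expected, rtol=1e-5)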
class VariableResolutionResamplerModel(nn.Layer):
"""
VariableResolutionResamplerModel: supports variable resolution and folds the spatial and temporal dimensions of the visual features.
"""
def __init__(self, in_dim, out_dim, spatial_conv_size, temporal_conv_size,
config):
super().__init__()
self.in_dim = in_dim
self.out_dim = out_dim
self.config = config
self.spatial_conv_size = spatial_conv_size
self.temporal_conv_size = temporal_conv_size
self.use_recompute_resampler = config.use_recompute_resampler
self.use_temporal_conv = config.use_temporal_conv
self.tensor_parallel_degree = config.tensor_parallel_degree
# for spatial 4-in-1 merging (spatial_conv_size x spatial_conv_size patches folded into one token)
self.spatial_dim = self.in_dim * self.spatial_conv_size * self.spatial_conv_size
# for temporal 2-in-1 merging (temporal_conv_size frames folded into one token)
self.temporal_dim = self.in_dim * self.spatial_conv_size * self.spatial_conv_size * self.temporal_conv_size
with paddle.utils.unique_name.guard("mm_resampler_"):
self.spatial_linear = nn.Sequential(
(RowSequenceParallelLinear(
self.spatial_dim,
self.spatial_dim,
input_is_parallel=True,
has_bias=True,
fuse_matmul_bias=True,
) if config.tensor_parallel_degree > 1 else nn.Linear(
self.spatial_dim, self.spatial_dim)),
nn.GELU(),
nn.Linear(self.spatial_dim, self.spatial_dim),
nn.LayerNorm(self.spatial_dim, epsilon=1e-6),
)
if self.use_temporal_conv:
self.temporal_linear = nn.Sequential(
nn.Linear(self.temporal_dim, self.spatial_dim),
nn.GELU(),
nn.Linear(self.spatial_dim, self.spatial_dim),
nn.LayerNorm(self.spatial_dim, epsilon=1e-6),
)
self.mlp = nn.Linear(self.spatial_dim, self.out_dim)
out_config = deepcopy(config)
out_config.hidden_size = out_dim
# Note(GuoxiaWang): fuse can reduce gpu peak memory
out_config.fuse_rms_norm = out_config.resampler_fuse_rms_norm
self.after_norm = RMSNorm(out_config)
if config.tensor_parallel_degree > 1:
for idx in [2, 3]:
mark_as_sequence_parallel_parameter(
self.spatial_linear[idx].weight)
mark_as_sequence_parallel_parameter(
self.spatial_linear[idx].bias)
_set_var_distributed(self.spatial_linear[idx].weight,
split_axis=0)
_set_var_distributed(self.spatial_linear[idx].bias,
split_axis=0)
if self.use_temporal_conv:
for idx in [0, 2, 3]:
mark_as_sequence_parallel_parameter(
self.temporal_linear[idx].weight)
mark_as_sequence_parallel_parameter(
self.temporal_linear[idx].bias)
mark_as_sequence_parallel_parameter(self.mlp.weight)
mark_as_sequence_parallel_parameter(self.mlp.bias)
mark_as_sequence_parallel_parameter(self.after_norm.weight)
def get_name_mappings_to_training(self, ):
""" get_name_mappings_to_training """
infer_to_train = {}
resampler_names = [
"ernie.resampler_model.spatial_linear.0.weight",
"ernie.resampler_model.spatial_linear.0.bias",
"ernie.resampler_model.spatial_linear.2.weight",
"ernie.resampler_model.spatial_linear.2.bias",
"ernie.resampler_model.spatial_linear.3.weight",
"ernie.resampler_model.spatial_linear.3.bias",
"ernie.resampler_model.temporal_linear.0.weight",
"ernie.resampler_model.temporal_linear.0.bias",
"ernie.resampler_model.temporal_linear.2.weight",
"ernie.resampler_model.temporal_linear.2.bias",
"ernie.resampler_model.temporal_linear.3.weight",
"ernie.resampler_model.temporal_linear.3.bias",
"ernie.resampler_model.mlp.weight",
"ernie.resampler_model.mlp.bias",
"ernie.resampler_model.after_norm.weight",
]
for train_name in resampler_names:
infer_to_train[train_name[len("ernie."):]] = train_name
return infer_to_train
def spatial_conv_reshape(self, x, spatial_conv_size):
"""
Reshape before the Linear layer so that the Linear can mimic the receptive field of a conv.
"""
S, C = x.shape
x = x.reshape([-1, C * (spatial_conv_size**2)])
return x
def forward(self, x, image_mask, token_type_ids, image_type_ids, grid_thw):
"""
x: image_features
image_mask: [B]
token_types_ids: [B]
image_type_ids: [B_image]
grid_thw: [B_image, 3]
"""
assert image_type_ids is not None
def fwd_spatial(x):
"""
x in the shape of [S, H]
S is ordered in the following way: [ [patch_h*patch_w (row-major traversal)] * patch_time]
H is simply hidden
"""
x = self.spatial_conv_reshape(x, self.spatial_conv_size)
num_pad = 0
if self.tensor_parallel_degree > 1:
num_pad = (
x.shape[0] + self.tensor_parallel_degree - 1
) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[
0]
if num_pad > 0:
x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0])
x = self.spatial_linear(x)
if self.tensor_parallel_degree > 1:
x = AllGatherOp.apply(x)
if num_pad > 0:
x = x[:-num_pad]
return x
def fwd_placeholder(x, grid_thw, to_tensor=False):
"""
x: [S, H]
grid_thw: [S, 3]
where the second dimension is [t, h, w]
"""
grid_thw_cpu = grid_thw.numpy()
grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:]
grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size**
2)
tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // (
self.spatial_conv_size**2)
batch_offset = np.empty(tokens_per_img_or_vid.size,
dtype=tokens_per_img_or_vid.dtype)
batch_offset[0] = 0
batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1]
assert self.temporal_conv_size == 2, f"Hard Code: temporal_conv_size==2, got:{self.temporal_conv_size}"
# TODO: support any temporal conv size
slice_offsets = []
for temporal_size, spatial_size, b_offset in zip(
grid_t, grid_hw_after_conv, batch_offset):
for temp_offset in range(0, temporal_size, 2):
slice_offsets.append(
np.arange(b_offset + (temp_offset) * spatial_size,
b_offset + (temp_offset + 1) * spatial_size))
slice_offsets = paddle.to_tensor(
np.concatenate(slice_offsets, axis=-1))
slice_offsets2 = []
for temporal_size, spatial_size, b_offset in zip(
grid_t, grid_hw_after_conv, batch_offset):
for temp_offset in range(1 if temporal_size > 1 else 0,
temporal_size, 2):
slice_offsets2.append(
np.arange(b_offset + (temp_offset) * spatial_size,
b_offset + (temp_offset + 1) * spatial_size))
slice_offsets2 = paddle.to_tensor(
np.concatenate(slice_offsets2, axis=-1))
x_timestep_1 = paddle.gather(x, slice_offsets, axis=0)
x_timestep_2 = paddle.gather(x, slice_offsets2, axis=0)
x = paddle.concat([x_timestep_1, x_timestep_2], axis=-1)
return x
def fwd_temporal(x):
num_pad = 0
if self.tensor_parallel_degree > 1:
num_pad = (
x.shape[0] + self.tensor_parallel_degree - 1
) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[
0]
if num_pad > 0:
x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0])
if self.tensor_parallel_degree > 1:
x = ScatterOp.apply(x, axis=0)
x = self.temporal_linear(x)
if self.use_recompute_resampler:
num_pad = paddle.to_tensor(num_pad)
return x, num_pad
def fwd_mlp(x):
x = self.mlp(x)
x = self.after_norm(x)
if self.tensor_parallel_degree > 1:
x = AllGatherOp.apply(x)
return x
num_pad = 0
if self.use_recompute_resampler:
x = recompute(fwd_spatial, x)
if self.use_temporal_conv:
x = recompute(fwd_placeholder, x, grid_thw)
x, num_pad = recompute(fwd_temporal, x)
x = recompute(fwd_mlp, x)
else:
x = fwd_spatial(x)
if self.use_temporal_conv:
x = fwd_placeholder(x, grid_thw)
x, num_pad = fwd_temporal(x)
x = fwd_mlp(x)
if num_pad is not None and num_pad > 0:
x = x[:-num_pad]
return x
@classmethod
def _get_tensor_parallel_mappings(cls, config, is_split=True):
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
res = {"spatial_linear.0.weight": partial(fn, is_column=False)}
for k in (
"spatial_linear.0.bias", # row linear bias
"spatial_linear.2.weight",
"spatial_linear.2.bias", # linear
"spatial_linear.3.weight",
"spatial_linear.3.bias", # layernorm
"temporal_linear.0.weight",
"temporal_linear.0.weight", # linear
"temporal_linear.2.weight",
"temporal_linear.2.bias", # linear
"temporal_linear.3.weight",
"temporal_linear.3.bias", # bias
):
res.update({k: lambda x: x})
return res
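# A NumPy sketch of the two data-movement tricks used by the resampler above
# (sizes are made up for illustration): spatial_conv_reshape merges
# spatial_conv_size**2 consecutive patch tokens into one row so a plain Linear
# can imitate a conv receptive field, and fwd_placeholder pairs timestep t
# with timestep t+1 by gathering two interleaved row blocks and concatenating
# them along the feature axis.
import numpy as np

S, C, k = 16, 4, 2
_x = np.arange(S * C).reshape(S, C)
_spatial = _x.reshape(-1, C * k * k)                  # [S/(k*k), C*k*k] == [4, 16]

_t, _hw = 2, 2                                        # one clip: 2 timesteps, 2 merged tokens each
_rows_t0 = _spatial[0:_hw]                            # rows of timestep 0
_rows_t1 = _spatial[_hw:2 * _hw]                      # rows of timestep 1
_temporal = np.concatenate([_rows_t0, _rows_t1], axis=-1)   # [_hw, 2*C*k*k] == [2, 32]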

View File

@@ -1,652 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import contextlib
import json
import os
import sys
import threading
import paddle
import paddle.distributed as dist
from paddle.common_ops_import import convert_dtype
from fastdeploy.model_executor.models.utils import convert_ndarray_dtype
from paddlenlp.trainer import RuntimeTimer
from fastdeploy.inference_args import GenerationPhase
from .utils import (
_vocab_size_with_padding,
generate_rank_mapping,
get_infer_model_path,
model_convert_fp8,
)
from paddlenlp.transformers import AutoTokenizer
from paddle.distributed import fleet
from paddlenlp.utils.env import USE_FAST_TOKENIZER
from paddlenlp.utils.log import logger
from fastdeploy.model_executor.models.utils import load_checkpoint
from fastdeploy.config import (AdditionalConfig, DecodingConfig, DeviceConfig,
LLMConfig, LoadConfig, ModelConfig, MoEConfig,
ParallelConfig, SpeculativeConfig, TmpConfig)
from fastdeploy.inference_args import GenerationPhase
from ..layers.quantization import get_quantization_config
from .model_base import ModelRegistry
from .qwen2 import Qwen2PretrainedModel
from .utils import (_vocab_size_with_padding, convert_ndarray_dtype,
load_checkpoint, parser_quant_type)
from paddlenlp.transformers.configuration_utils import PretrainedConfig
from paddlenlp.trl import llm_utils
model_classes_mapping = {
"Qwen2ForCausalLM": Qwen2PretrainedModel,
}
current_dir = os.path.dirname(os.path.abspath(__file__))
grandparent_dir = os.path.abspath(
os.path.join(current_dir, os.pardir, os.pardir))
sys.path.append(grandparent_dir)
def offload_model(model):
"""
Offload the model to CUDAPinnedPlace.
"""
device = paddle.CUDAPinnedPlace()
for name, src in model.named_parameters():
if src._is_initialized() and not isinstance(src.place,
paddle.CUDAPinnedPlace):
dst = src._copy_to(device, True)
dst_tensor = dst.value().get_tensor()
src_tensor = src.value().get_tensor()
src_tensor._clear()
src_tensor._share_data_with(dst_tensor)
def reload_model(model):
"""
Reload the model from CUDAPinnedPlace to GPU.
"""
model.to(paddle.device.get_device())
def reconstruct_memory(model):
"""
reconstruct_memory to avoid memory chunks
"""
offload_model(model)
paddle.device.cuda.empty_cache()
reload_model(model)
def load_tensor_from_ipc_meta(state_dict):
"""
convert ipc_meta to tensor, but keep keys unchanged
{ 'key': ipc_meta } --> { 'key': tensor }
example:
state_dict = load_tensor_from_ipc_meta(state_dict)
"""
for k, v in state_dict.items():
# for pickling, we have to convert bytes object before save
v[0] = v[0].encode("latin-1")
state_dict[k] = paddle.to_tensor(
paddle.base.core.LoDTensor._new_shared_cuda(tuple(v)))
return state_dict
def build_stream_line_model(
config_path,
model_path,
dtype,
block_size,
max_len,
stage_flag,
min_dec_len=1,
max_dec_len=128,
temperature=1,
top_k=8,
top_p=0.8,
pre_caches_length=0,
export_model_type="default",
use_stop_seqs=False,
use_fake_parameter=False,
show_topk: int = 0,
msg_queue_id=None,
pad_vocab=True,
tokenizer=None,
cache_quant_dtype="default",
use_beam_search: bool = False,
enf_gen: bool = False,
speculate_method=None,
speculate_max_draft_token_num: int = 1,
speculate_max_candidate_len: int = 5,
speculate_verify_window: int = 2,
return_all_hidden_states: bool = False,
draft_type: str = "None",
start_layer_index: int = 0,
moe_quant_type: str = "default",
use_ep: bool = False,
ep_just_for_test: bool = False,
generation_phase: GenerationPhase = GenerationPhase.PREFILL,
use_micro_batch: bool = False,
fake_server_p: bool = False,
scale_dir: str = "None",
output_via_mq: bool = True,
use_safetensors: bool = False,
enable_redundant_experts: bool = False,
redundant_experts_num: int = 0,
max_batch_size: int = 128,
use_offline_quant: bool = False,
return_state_dicts: bool = False,
sharing_model=None,
sharing_state_dicts=None,
):
"""
Build a fused inference model
Args:
config_path (str): Path to the configuration file
model_path (str): Path to the model file
dtype (str): Data type of the model
block_size (int): Block size
max_len (int): Maximum sequence length
stage_flag (str): Qianfan requirement, stage flag, used to identify different stages in \
time-consuming statistics logs, such as prediction ("msgid-1 predict") or export ("convert").
min_dec_len (int, optional): Minimum decoding length. Default is 1.
max_dec_len (int, optional): Maximum decoding length. Default is 128.
temperature (float, optional): Temperature coefficient. Default is 1.
top_k (int, optional): k value in top-k sampling. Default is 0.
top_p (float, optional): p value in top-p sampling. Default is 0.8.
pre_caches_length (int, optional): Pre-cache length. Default is 0.
export_model_type (str, optional): Type of model to export. Default is "default".
use_stop_seqs (bool, optional): Whether to use stop sequences. Default is False.
use_fake_parameter (bool, optional): Whether to use fake parameters. Default is False.
show_topk (int, optional): Whether to show top-k results. Default is 0.
msg_queue_id (int, optional): Message queue ID. Default is None.
pad_vocab (bool, optional): Whether to pad the vocabulary. Default is True.
cache_quant_dtype (str, optional): Cache quantization data type. Default is "default".
use_beam_search (bool, optional): Whether to use beam search . Defaults is False.
enf_gen (bool, optional): Whether to use enforce generation. Defaults is False.
Returns:
tuple[dict, Tokenizer, CausalLM]:
A tuple containing the configuration, tokenizer, and model.
"""
runtime_timer = RuntimeTimer("build_model")
runtime_timer.start(f"{stage_flag} stage model loading time")
# config_path = os.path.join(model_path,"config.json")
with open(config_path, "r") as fin:
config = json.load(fin)
architectures = config.get("architectures")
if tokenizer is None:
tokenizer = AutoTokenizer.from_pretrained(
model_path,
padding_side="left",
use_fast=USE_FAST_TOKENIZER,
)
config, _ = PretrainedConfig.get_config_dict(model_path)
model_config = ModelConfig.from_dict(config)
parallel_config = ParallelConfig()
speculative_config = SpeculativeConfig()
device_config = DeviceConfig()
additional_config = AdditionalConfig()
load_config = LoadConfig()
tmp_config = TmpConfig()
moe_config = MoEConfig()
decoding_config = DecodingConfig()
tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env()
parallel_config.tensor_parallel_rank = tensor_parallel_rank
parallel_config.tensor_parallel_degree = tensor_parallel_degree
parallel_config.mp_size = tensor_parallel_degree
parallel_config.ep_size = 1
parallel_config.column_cut = False
speculative_config.is_mtp = draft_type in ["eagle", "mtp"]
speculative_config.draft_type = draft_type
# Note(tangbinhan): used for load_checkpoint
model_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank
model_config.tensor_parallel_degree = parallel_config.tensor_parallel_degree
model_config.use_ep = use_ep
model_config.is_mtp = speculative_config.is_mtp
additional_config.use_fake_parameter = use_fake_parameter
additional_config.ep_just_for_test = ep_just_for_test
tmp_config.use_offline_quant = use_offline_quant
if use_ep:
if isinstance(model_config.moe_num_experts, list):
model_config.has_multimodality = True
moe_config.num_experts = model_config.moe_num_experts[0]
else:
moe_config.num_experts = model_config.moe_num_experts
moe_config.num_experts_per_rank = (
moe_config.num_experts // parallel_config.tensor_parallel_degree
)
moe_config.num_experts_start_offset = (
moe_config.num_experts_per_rank * parallel_config.tensor_parallel_rank
)
# use the length of tokenizer as the origin vocab size
ori_vocab_size = len(tokenizer)
moe_intermediate_size = (config.get("moe_intermediate_size", None),)
if isinstance(moe_intermediate_size, list) or isinstance(
moe_intermediate_size, tuple
):
moe_intermediate_size = moe_intermediate_size[0]
if not use_ep and pad_vocab:
config["vocab_size"] = _vocab_size_with_padding(
config.get("vocab_size", tokenizer.vocab_size),
config.pop("vocab_size_divisible_unit", 128),
paddle.distributed.get_world_size(),
)
group_size = config.get("group_size", -1)
num_key_value_heads = config.get("num_key_value_heads", -1)
if num_key_value_heads is None:
num_key_value_heads = -1
if config.get("ffn_hidden_size", None) is not None:
ffn_hidden_size = config["ffn_hidden_size"]
elif config.get("intermediate_size", None) is not None:
ffn_hidden_size = config["intermediate_size"]
else:
ffn_hidden_size = 4 * config["hidden_size"]
if config["hidden_act"].lower() == "swiglu":
if paddle.distributed.get_world_size() > 1:
multiple_of = 8 * config["num_attention_heads"]
else:
multiple_of = 4 * config["num_attention_heads"]
ffn_hidden_size = multiple_of * (
(int(2 * ffn_hidden_size / 3) + multiple_of - 1) //
multiple_of)
if draft_type in ["mtp", "eagle"]:
num_layers = 1
else:
num_layers = config.get("num_layers", None) or config.get(
"num_hidden_layers", None
)
if num_layers is None:
raise ValueError(f"num_layers<{num_layers}> is invalid")
use_moe = config.get(
"moe_layer_start_index", num_layers
) < num_layers or draft_type in ["mtp", "eagle"]
if not sharing_state_dicts:
if use_fake_parameter:
context = contextlib.nullcontext()
elif use_safetensors:
context = paddle.LazyGuard()
model_class = model_classes_mapping[architectures[0]]
state_dict = load_checkpoint(model_path,
model_class,
model_config,
return_numpy=True)
elif use_moe:
tensor_parallel_degree = dist.get_world_size()
if tensor_parallel_degree > 1:
hcg = fleet.get_hybrid_communicate_group()
mp_id = hcg.get_model_parallel_rank()
# count the number of pipeline-parallel (pp*) sub-directories under model_path
subdir_count = 0
for entry in os.listdir(model_path):
if "pp" in entry:
full_path = os.path.join(model_path, entry)
if os.path.isdir(full_path):
subdir_count += 1
pp_num = subdir_count
rank_model_paths = [
os.path.join(model_path, f"pp{i}/model_state.tp0{mp_id}.pdparams")
for i in range(pp_num)
]
context = paddle.LazyGuard()
if not use_ep:
logger.info(f"start to loading weight: {rank_model_paths}")
state_dicts = [None for _ in rank_model_paths]
def load_ckpt(i):
state_dicts[i] = paddle.load(rank_model_paths[i], return_numpy=True)
threads = []
for i in range(len(rank_model_paths)):
thread = threading.Thread(target=load_ckpt, args=(i,))
threads.append(thread)
thread.start()
for t in threads:
t.join()
logger.info("Loading finished")
else:
# for EP loading state_dicts
import glob
state_dicts = []
files = glob.glob(model_path + "/merged_tp1_state_split/*")
for file_name in files:
try:
state_dicts += [
{file_name.split("/")[-1]: file_name}
] # save {layer_name: weight_file_name}
except Exception:
pass
need_reset_moe_intermediate_size = False
if not use_ep:
logger.info(f"moe_intermediate_size is: {moe_intermediate_size}")
need_reset_moe_intermediate_size = (
(not use_ep)
and (moe_quant_type == "fp8")
and (moe_intermediate_size // 8 % 128 != 0)
)
ori_up_size = moe_intermediate_size // 8 * 2
ori_down_size = ori_up_size // 2
if need_reset_moe_intermediate_size:
moe_intermediate_size = (
128 - moe_intermediate_size // 8 % 128
) * 8 + moe_intermediate_size
logger.info(
f"moe_intermediate_size reset to {moe_intermediate_size}!"
)
up_size = moe_intermediate_size // 8 * 2
down_size = up_size // 2
new_state_dict = {}
def padding(key, value):
import numpy as np
# logger.info(f"deal {key}")
if ("experts" in key) and ("up_gate_proj" in key):
# logger.info("up_gate_proj")
v_new = np.zeros(shape=[value.shape[0], up_size], dtype=value.dtype)
v_new[:, :ori_down_size] = value[:, :ori_down_size]
v_new[:, down_size : (down_size + ori_down_size)] = value[
:, ori_down_size:
]
elif ("experts" in key) and ("down_proj" in key):
# logger.info("down_proj")
v_new = np.zeros(
shape=[down_size, value.shape[1]], dtype=value.dtype
)
v_new[:ori_down_size, :] = value
else:
v_new = value
new_state_dict[key] = v_new
if ("experts" in key) and ("up_gate_proj" in key or "down_proj" in key):
pass
# logger.info(f"padding {key}: {value.shape}->{v_new.shape}")
threads = []
for state_dict in state_dicts:
for key, value in state_dict.items():
if need_reset_moe_intermediate_size:
thread = threading.Thread(target=padding, args=(key, value))
threads.append(thread)
thread.start()
else:
new_state_dict[key] = value
for t in threads:
t.join()
logger.info("Finish padding")
state_dict = new_state_dict
elif config.get("quant_type", None) is not None:
# TODO(@wangbojun) currently, we use paddle.load for ptq model.
tensor_parallel_degree = dist.get_world_size()
if tensor_parallel_degree > 1:
hcg = fleet.get_hybrid_communicate_group()
mp_id = hcg.get_model_parallel_rank()
rank_model_path = os.path.join(
model_path, f"model_state.tp0{mp_id}.pdparams"
)
if not os.path.exists(rank_model_path):
full_model_path = os.path.join(model_path, "model_state.pdparams")
if not os.path.exists(full_model_path):
raise ValueError(
f"can not find <model_state.tp0{mp_id}.pdparams> "
+ f"and model_state.pdparams under dir<{model_path}>"
)
raise ValueError(
"please run `split_weights.py` to gen weights for multi-gpu inference."
)
if not os.path.exists(rank_model_path):
full_model_path = os.path.join(model_path, "model_state.pdparams")
if not os.path.exists(full_model_path):
raise ValueError(
f"can not find <model_state.tp0{mp_id}.pdparams> "
+ f"and model_state.pdparams under dir<{model_path}>"
)
raise ValueError(
"please run `split_weights.py` to gen weights for multi-gpu inference."
)
model_state_path = rank_model_path
if num_key_value_heads > 0:
assert (
num_key_value_heads % tensor_parallel_degree == 0
), "num_key_value_heads must be an integer multiple of tensor_parallel_degree"
else:
model_state_path = os.path.join(model_path, "model_state.pdparams")
context = paddle.LazyGuard()
logger.info(f"start to loading weight: {model_state_path}")
if os.path.exists(model_state_path):
state_dict = paddle.load(model_state_path, return_numpy=True)
else:
state_dict = sharing_state_dicts
context = paddle.LazyGuard()
use_rmsnorm = config.get("use_rmsnorm", True)
if use_beam_search:
decode_strategy = "beam_search"
elif speculate_method is not None:
if draft_type in ["draft_model", "eagle", "mtp"]:
decode_strategy = "draft_model_sampling"
else:
decode_strategy = "speculate_decoding"
else:
decode_strategy = "sampling"
logger.info(f"{runtime_timer.log()}")
runtime_timer.start(f"{stage_flag} stage set parameters time")
if config["hidden_act"].lower() == "swiglu":
model_config.hidden_act = "swiglu"
model_config.ffn_hidden_size = ffn_hidden_size
model_config.max_seq_len = max_len
model_config.num_layers = num_layers
model_config.dtype = dtype
model_config.export_model_type = export_model_type
parallel_config.block_size = block_size
model_config.group_size = group_size
load_config.model_path = model_path
model_config.use_rmsnorm = use_rmsnorm
parallel_config.msg_queue_id = msg_queue_id
additional_config.use_fake_parameter = use_fake_parameter
model_config.num_key_value_heads = num_key_value_heads
model_config.use_stop_seqs = use_stop_seqs
tmp_config.cache_quant_dtype = cache_quant_dtype
tmp_config.has_zero_point = config.get("has_zero_point", False)
tmp_config.is_channel_wise = config.get("is_channel_wise", False)
speculative_config.speculate_method = speculate_method
speculative_config.speculate_max_draft_token_num = speculate_max_draft_token_num
model_config.return_all_hidden_states = return_all_hidden_states
speculative_config.draft_type = draft_type
model_config.start_layer_index = start_layer_index
model_config.use_moe = use_moe
if use_moe:
moe_config.use_moe = use_moe
moe_config.num_experts = config.get("moe_num_experts", None)
moe_config.moe_intermediate_size = config.get("moe_intermediate_size",
None)
moe_config.moe_use_gate_correction_bias = config.get(
"moe_use_gate_correction_bias", True)
moe_config.moe_every2 = config.get("moe_every2", False)
moe_config.moe_topk = config.get("moe_topk", 8)
moe_config.moe_num_shared_experts = config.get("moe_num_shared_experts", 0)
moe_config.moe_layer_start_index = config.get("moe_layer_start_index", 0)
moe_config.moe_use_ffn_shared_weight_and_bias = config.get(
"moe_use_ffn_shared_weight_and_bias", False)
moe_config.use_moe = use_moe
moe_config.moe_group = config.get("moe_group", False)
moe_config.moe_quant_type = moe_quant_type
if top_k > 0:
moe_config.top_k = top_k
parallel_config.use_ep = use_ep
additional_config.ep_just_for_test = ep_just_for_test
model_config.generation_phase = generation_phase
parallel_config.use_micro_batch = use_micro_batch
tmp_config.weight_block_size = config.get("weight_block_size", [-1, -1])
load_config.scale_dir = scale_dir
model_config.output_via_mq = output_via_mq
decoding_config.bos_token_id = tokenizer.bos_token_id
decoding_config.pad_token_id = tokenizer.pad_token_id
decoding_config.temperature = temperature
decoding_config.forced_eos_token_id = tokenizer.eos_token_id
model_config.ori_vocab_size = ori_vocab_size
decoding_config.max_dec_len = max_dec_len
decoding_config.min_dec_len = min_dec_len
additional_config.fake_server_p = fake_server_p
decoding_config.decode_strategy = decode_strategy
speculative_config.speculate_max_candidate_len = speculate_max_candidate_len
speculative_config.speculate_verify_window = speculate_verify_window
weight_dtype, act_dtype, cachekv_dtype = parser_quant_type(
export_model_type)
logger.info(
f"quant_type: weight[{weight_dtype}], act[{act_dtype}], cachekv[{cachekv_dtype}]"
)
model_config.weight_dtype = weight_dtype
model_config.act_dtype = act_dtype
if weight_dtype == "int8" and act_dtype in ["bfloat16", "float16"]:
quant_cls = get_quantization_config("weight_only")
quant_config = quant_cls.from_config({
"weight_only_linear_arch": None,
"algo": "weight_only_int8"
})
quant_config.quant_max_bound = 0
quant_config.quant_min_bound = 0
quant_config.quant_round_type = 0
model_config.use_smooth_quant = False
elif weight_dtype == "int4" and act_dtype in ["bfloat16", "float16"]:
quant_cls = get_quantization_config("weight_only")
quant_config = quant_cls.from_config({
"weight_only_linear_arch": None,
"algo": "weight_only_int4"
})
quant_config.quant_max_bound = 0
quant_config.quant_min_bound = 0
quant_config.quant_round_type = 0
model_config.use_smooth_quant = False
elif tmp_config.weight_block_size[0] != -1:
quant_cls = get_quantization_config("block_wise")
quant_config = quant_cls.from_config(
{"weight_block_size": tmp_config.weight_block_size})
quant_config.quant_max_bound = 448
quant_config.quant_min_bound = -448
quant_config.quant_round_type = 1
model_config.use_smooth_quant = False
elif weight_dtype == "int4" and act_dtype == "float8_e4m3fn":
quant_cls = get_quantization_config("w4afp8")
quant_config = quant_cls.from_config({
"weight_scale_dict": {},
"act_scale_dict": {}
})
quant_config.quant_max_bound = 448
quant_config.quant_min_bound = -448
quant_config.quant_round_type = 1
model_config.use_smooth_quant = False
elif weight_dtype == "int8" and act_dtype == weight_dtype:
quant_cls = get_quantization_config("w8a8")
quant_config = quant_cls.from_config({
"weight_scale_dict": {},
"act_scale_dict": {},
"use_gemm_dequant": False
})
quant_config.quant_max_bound = 127
quant_config.quant_min_bound = -127
quant_config.quant_round_type = 0
model_config.use_smooth_quant = True
elif weight_dtype == "float8_e4m3fn" and act_dtype == weight_dtype:
quant_cls = get_quantization_config("wfp8afp8")
quant_config = quant_cls.from_config({
"weight_scale_dict": {},
"act_scale_dict": {}
})
quant_config.quant_max_bound = 448
quant_config.quant_min_bound = -448
quant_config.quant_round_type = 1
model_config.use_smooth_quant = False
else:
quant_config = None
llm_config = LLMConfig(
model_config=model_config,
parallel_config=parallel_config,
speculative_config=speculative_config,
device_config=device_config,
additional_config=additional_config,
load_config=load_config,
tmp_config=tmp_config,
moe_config=moe_config,
decoding_config=decoding_config,
quant_config=quant_config,
)
with context:
model_cls = ModelRegistry.get_class(model_config.architectures[0])
model = model_cls(llm_config)
model.eval()
if use_fake_parameter:
return config, tokenizer, model
elif not use_moe:
for k, v in state_dict.items():
if convert_dtype(v.dtype) == dtype:
continue
elif convert_dtype(v.dtype) == "float32":
continue
state_dict[k] = convert_ndarray_dtype(v, dtype)
paddle.device.cuda.empty_cache()
assert state_dict is not None
model.set_state_dict(state_dict)
if use_ep and generation_phase == GenerationPhase.DECODER:
logger.info("Reloading model...")
reconstruct_memory(model)
logger.info(f"{runtime_timer.log()}")
if sharing_state_dicts is not None:
for k in list(sharing_state_dicts):
sharing_state_dicts.pop(k)
possible_state_dict = state_dict if return_state_dicts else None
return config, tokenizer, model, possible_state_dict
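# A NumPy sketch of the `padding` helper above, which zero-pads expert weights
# so the per-rank MoE intermediate size becomes a multiple of 128 (an alignment
# requirement of the fp8 kernels). All sizes below are made up for illustration:
import numpy as np

_hidden, _ori_inter, _target_inter = 8, 96, 128                 # per-rank sizes
_up_gate = np.ones((_hidden, 2 * _ori_inter), dtype="float32")  # [hidden, 2*inter]: gate half + up half
_down = np.ones((_ori_inter, _hidden), dtype="float32")         # [inter, hidden]

_up_gate_pad = np.zeros((_hidden, 2 * _target_inter), dtype=_up_gate.dtype)
_up_gate_pad[:, :_ori_inter] = _up_gate[:, :_ori_inter]                                # gate half
_up_gate_pad[:, _target_inter:_target_inter + _ori_inter] = _up_gate[:, _ori_inter:]   # up half

_down_pad = np.zeros((_target_inter, _hidden), dtype=_down.dtype)
_down_pad[:_ori_inter, :] = _down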

View File

@@ -20,9 +20,12 @@ from functools import partial
import paddle
from paddle import nn
from paddlenlp.transformers import PretrainedModel
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import LLMConfig, ModelConfig
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
@@ -31,7 +34,7 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.worker.forward_meta import ForwardMeta
class Qwen2MLP(nn.Layer):
@@ -40,32 +43,33 @@ class Qwen2MLP(nn.Layer):
def __init__(
self,
llm_config: LLMConfig,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
self.nranks = llm_config.parallel_config.mp_size
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.gate_up_proj = MergedColumnParallelLinear(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.ffn_hidden_size * 2,
with_bias=False,
activation=llm_config.model_config.hidden_act,
activation=fd_config.model_config.hidden_act,
use_fast_ffn=True,
)
self.down_proj = RowParallelLinear(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.down_proj",
input_size=(llm_config.model_config.ffn_hidden_size //
self.nranks),
output_size=llm_config.model_config.hidden_size,
input_size=(fd_config.model_config.ffn_hidden_size // self.nranks),
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
self.act_fn = SiluAndMul(
llm_config=llm_config,
fd_config=fd_config,
bias=getattr(self.gate_up_proj, "linear_bias", None),
act_method=llm_config.model_config.hidden_act,
act_method=fd_config.model_config.hidden_act,
)
def load_state_dict(self, state_dict):
@@ -88,25 +92,25 @@ class Qwen2Attention(nn.Layer):
"""
def __init__(self,
llm_config: LLMConfig,
fd_config: FDConfig,
layer_id: int,
prefix: str = "") -> None:
super().__init__()
nranks = llm_config.parallel_config.mp_size
nranks = fd_config.parallel_config.tensor_parallel_degree
self.qkv_proj = QKVParallelLinear(llm_config=llm_config,
self.qkv_proj = QKVParallelLinear(fd_config=fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=True)
self.o_proj = RowParallelLinear(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.o_proj",
input_size=(llm_config.model_config.hidden_size // nranks),
output_size=llm_config.model_config.hidden_size,
input_size=(fd_config.model_config.hidden_size // nranks),
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(llm_config=llm_config,
self.attn = Attention(fd_config=fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=True)
@@ -140,33 +144,33 @@ class Qwen2DecoderLayer(nn.Layer):
def __init__(
self,
llm_config: LLMConfig,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Qwen2Attention(
llm_config=llm_config,
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
self.mlp = Qwen2MLP(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
llm_config,
hidden_size=llm_config.model_config.hidden_size,
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
llm_config,
hidden_size=llm_config.model_config.hidden_size,
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.post_attention_layernorm",
)
@@ -209,13 +213,14 @@ class Qwen2DecoderLayer(nn.Layer):
return hidden_states, residual
@support_graph_optimization
class Qwen2Model(nn.Layer):
"""
"""
def __init__(
self,
llm_config: LLMConfig = None,
fd_config: FDConfig = None,
):
"""
Initializer for the Qwen2Model class.
@@ -225,29 +230,29 @@ class Qwen2Model(nn.Layer):
"""
super().__init__()
self.num_layers = llm_config.model_config.num_layers
llm_config.model_config.prefix_name = "qwen2"
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "qwen2"
self.embeddings = VocabParallelEmbedding(
llm_config=llm_config,
num_embeddings=llm_config.model_config.vocab_size,
embedding_dim=llm_config.model_config.hidden_size,
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{llm_config.model_config.prefix_name}.embed_tokens"),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen2DecoderLayer(
llm_config=llm_config,
prefix=f"{llm_config.model_config.prefix_name}.layers.{i}")
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
self.norm = RMSNorm(
llm_config,
hidden_size=llm_config.model_config.hidden_size,
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{llm_config.model_config.prefix_name}.norm",
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -262,6 +267,7 @@ class Qwen2Model(nn.Layer):
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)
def forward(
@@ -292,21 +298,21 @@ class Qwen2ForCausalLM(ModelForCasualLM):
Qwen2ForCausalLM
"""
def __init__(self, llm_config: LLMConfig):
def __init__(self, fd_config: FDConfig):
"""
Args:
llm_config (LLMConfig): Configurations for the LLM model.
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Qwen2ForCausalLM, self).__init__(llm_config)
super(Qwen2ForCausalLM, self).__init__(fd_config)
self.model = Qwen2Model(llm_config=llm_config)
self.model = Qwen2Model(fd_config=fd_config)
self.ori_vocab_size = llm_config.model_config.ori_vocab_size
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
llm_config=llm_config,
embedding_dim=llm_config.model_config.hidden_size,
num_embeddings=llm_config.model_config.vocab_size,
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
@@ -345,7 +351,8 @@ class Qwen2ForCausalLM(ModelForCasualLM):
):
"""
"""
hidden_states = self.model(ids_remove_padding, forward_meta)
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
@@ -355,7 +362,7 @@ class Qwen2PretrainedModel(PretrainedModel):
Qwen2PretrainedModel
"""
config_class = LLMConfig
config_class = FDConfig
def _init_weight(self, layer):
"""
@@ -366,7 +373,8 @@ class Qwen2PretrainedModel(PretrainedModel):
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
from paddlenlp.transformers.conversion_utils import split_or_merge_func
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,

View File

@@ -0,0 +1,361 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
from fastdeploy.worker.forward_meta import ForwardMeta
class Qwen3MLP(Qwen2MLP):
"""
"""
pass
class Qwen3Attention(nn.Layer):
"""
"""
def __init__(self,
fd_config: FDConfig,
layer_id: int,
prefix: str = "") -> None:
super().__init__()
self.fd_config = fd_config
self.head_dim = fd_config.model_config.head_dim
nranks = fd_config.parallel_config.tensor_parallel_degree
self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks
self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim // nranks
self.qkv_proj = QKVParallelLinear(fd_config=fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=False)
self.o_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.o_proj",
input_size=fd_config.model_config.head_dim *
fd_config.model_config.num_attention_heads // nranks,
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(fd_config=fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=True)
self.q_norm = RMSNorm(fd_config=fd_config,
hidden_size=fd_config.model_config.head_dim,
eps=1e-6,
prefix=f"{prefix}.q_norm",
begin_norm_axis=2)
self.k_norm = RMSNorm(fd_config=fd_config,
hidden_size=fd_config.model_config.head_dim,
eps=1e-6,
prefix=f"{prefix}.k_norm",
begin_norm_axis=2)
def load_state_dict(self, state_dict):
"""
"""
self.qkv_proj.load_state_dict(state_dict)
self.o_proj.load_state_dict(state_dict)
self.q_norm.load_state_dict(state_dict)
self.k_norm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
):
"""
"""
qkv_out = self.qkv_proj(hidden_states)
# origin_qkv_out = qkv_out
q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size],
axis=-1)
q_by_head = q.reshape(
[*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim])
q_by_head = self.q_norm(q_by_head)
q = q_by_head.reshape(q.shape)
k_by_head = k.reshape(
[*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim])
k_by_head = self.k_norm(k_by_head)
k = k_by_head.reshape(k.shape)
qkv_out = paddle.concat([q, k, v], axis=-1)
atten_out = self.attn(
qkv=qkv_out,
forward_meta=forward_meta,
)
output = self.o_proj(atten_out)
return output
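# A NumPy sketch of the per-head q/k normalization above: the flat
# [*, num_heads * head_dim] projection is viewed as [*, num_heads, head_dim],
# RMS-normalized over the last axis, then flattened back (sizes below are
# made up for illustration; the learned norm weight is omitted):
import numpy as np

def _toy_per_head_rms_norm(x, head_dim, eps=1e-6):
    shaped = x.reshape(*x.shape[:-1], x.shape[-1] // head_dim, head_dim)
    rms = np.sqrt(np.mean(np.square(shaped), axis=-1, keepdims=True) + eps)
    return (shaped / rms).reshape(x.shape)

_q = np.random.randn(2, 4 * 16).astype("float32")   # 4 heads of head_dim 16
_q_normed = _toy_per_head_rms_norm(_q, head_dim=16)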
class Qwen3DecoderLayer(Qwen2DecoderLayer):
"""
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__(fd_config, prefix)
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Qwen3Attention(fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn")
@support_graph_optimization
class Qwen3Model(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Qwen3Model class.
Args:
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "model"
fd_config.model_config.tie_word_embeddings = True
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype(),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.layers[i](forward_meta,
hidden_states, residual)
hidden_states = hidden_states + residual
out = self.norm(hidden_states)
return out
class Qwen3ForCausalLM(ModelForCasualLM):
"""
Qwen3ForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Qwen3ForCausalLM, self).__init__(fd_config)
self.model = Qwen3Model(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
"""
"""
return "Qwen3ForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
if self.tie_word_embeddings:
self.lm_head.out_linear.weight.set_value(
self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
"""
"""
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
class Qwen3PretrainedModel(PretrainedModel):
"""
Qwen3PretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def get_tensor_parallel_split_mappings(num_layers):
final_actions = {}
base_actions = {
# Row Linear
"embed_tokens.weight": partial(fn, is_column=False),
"layers.0.self_attn.o_proj.weight": partial(fn,
is_column=False),
"layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
}
# Column Linear
base_actions["layers.0.self_attn.q_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.q_proj.bias"] = partial(
fn, is_column=True)
# if we have enough num_key_value_heads to split, then split it.
if config.num_key_value_heads % config.tensor_parallel_degree == 0:
base_actions["layers.0.self_attn.k_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.v_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.mlp.gate_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.mlp.up_proj.weight"] = partial(
fn, is_column=True)
for key, action in base_actions.items():
if "layers.0." in key:
for i in range(num_layers):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
final_actions[key] = action
return final_actions
mappings = get_tensor_parallel_split_mappings(config.num_layers)
return mappings
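# A minimal sketch of how get_tensor_parallel_split_mappings expands the
# "layers.0.*" split rules to every layer index (the action values here are
# placeholder strings instead of real split functions):
_base_actions = {"layers.0.self_attn.q_proj.weight": "split_column",
                 "embed_tokens.weight": "split_row"}
_final_actions = {}
for _key, _action in _base_actions.items():
    if "layers.0." in _key:
        for _i in range(3):                                        # e.g. a 3-layer model
            _final_actions[_key.replace("layers.0.", f"layers.{_i}.")] = _action
    _final_actions[_key] = _action
# _final_actions now holds q_proj entries for layers 0..2 plus embed_tokens.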

View File

@@ -0,0 +1,509 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.forward_meta import ForwardMeta
class Qwen3MLP(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.gate_up_proj = MergedColumnParallelLinear(
fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.ffn_hidden_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
use_fast_ffn=True,
)
self.down_proj = RowParallelLinear(
fd_config,
prefix=f"{prefix}.down_proj",
input_size=(fd_config.model_config.ffn_hidden_size // self.nranks),
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
self.act_fn = SiluAndMul(
fd_config,
bias=getattr(self.gate_up_proj, "linear_bias", None),
act_method=fd_config.model_config.hidden_act,
)
def load_state_dict(self, state_dict):
"""
"""
self.gate_up_proj.load_state_dict(state_dict)
self.down_proj.load_state_dict(state_dict)
def forward(self, x):
"""
"""
gate_up_out = self.gate_up_proj(x)
act_out = self.act_fn(gate_up_out)
down_out = self.down_proj(act_out)
return down_out
class Qwen3Attention(nn.Layer):
"""
"""
def __init__(self,
fd_config: FDConfig,
layer_id: int,
prefix: str = "") -> None:
super().__init__()
self.fd_config = fd_config
self.head_dim = fd_config.model_config.head_dim
self.qkv_proj = QKVParallelLinear(fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=False)
nranks = fd_config.parallel_config.tensor_parallel_degree
self.o_proj = RowParallelLinear(
fd_config,
prefix=f"{prefix}.o_proj",
input_size=fd_config.model_config.head_dim *
fd_config.model_config.num_attention_heads // nranks,
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=True)
self.q_norm = RMSNorm(fd_config,
hidden_size=self.head_dim,
eps=1e-6,
prefix=f"{prefix}.q_norm",
begin_norm_axis=2)
self.k_norm = RMSNorm(fd_config,
hidden_size=self.head_dim,
eps=1e-6,
prefix=f"{prefix}.k_norm",
begin_norm_axis=2)
self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks
self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim // nranks
def load_state_dict(self, state_dict):
"""
"""
self.qkv_proj.load_state_dict(state_dict)
self.o_proj.load_state_dict(state_dict)
self.q_norm.load_state_dict(state_dict)
self.k_norm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
):
"""
"""
qkv_out = self.qkv_proj(hidden_states)
# origin_qkv_out = qkv_out
q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size],
axis=-1)
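# Qwen3 applies RMSNorm per attention head (QK-Norm): reshape q/k to
# [..., num_heads, head_dim], normalize the last dimension, then restore
# the flat layout before re-concatenating with v.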
q_by_head = q.reshape(
[*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim])
q_by_head = self.q_norm(q_by_head)
q = q_by_head.reshape(q.shape)
k_by_head = k.reshape(
[*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim])
k_by_head = self.k_norm(k_by_head)
k = k_by_head.reshape(k.shape)
qkv_out = paddle.concat([q, k, v], axis=-1)
atten_out = self.attn(
qkv=qkv_out,
forward_meta=forward_meta,
)
output = self.o_proj(atten_out)
return output
class Qwen3DecoderLayer(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Qwen3Attention(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
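# Checkpoint key templates for the fused MoE experts; the "{}" placeholder
# is filled with the expert index when the weights are loaded.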
weight_key_map = {
"gate_weight_key":
f"{prefix}.mlp.gate.weight",
"ffn1_expert_weight_key":
f"{prefix}.mlp.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.mlp.experts.{{}}.down_proj.weight",
}
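# Layers at or beyond moe_layer_start_index use the fused MoE block; earlier
# layers (or models without experts configured) fall back to the dense Qwen3MLP.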
if (fd_config.moe_config.num_experts is not None
and layer_id >= fd_config.moe_config.moe_layer_start_index):
self.mlp = FusedMoE(fd_config,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size,
num_experts=fd_config.moe_config.num_experts,
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
weight_key_map=weight_key_map)
else:
self.mlp = Qwen3MLP(
fd_config,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.post_attention_layernorm",
)
def load_state_dict(self, state_dict):
"""
"""
self.self_attn.load_state_dict(state_dict)
self.mlp.load_state_dict(state_dict)
self.input_layernorm.load_state_dict(state_dict)
self.post_attention_layernorm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
residual: paddle.Tensor = None,
):
"""
"""
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
hidden_states=hidden_states,
forward_meta=forward_meta,
)
# Fully Connected
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
@support_graph_optimization
class Qwen3MoeModel(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Qwen3MoeModel class.
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "model"
self.embeddings = VocabParallelEmbedding(
fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
residual = None
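# Each decoder layer returns (hidden_states, residual) so the residual add
# can be fused with the next layer's RMSNorm; the final add happens here
# before the last norm.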
for i in range(self.num_layers):
hidden_states, residual = self.layers[i](forward_meta,
hidden_states, residual)
hidden_states = hidden_states + residual
out = self.norm(hidden_states)
return out
class Qwen3MoeForCausalLM(ModelForCasualLM):
"""
Qwen3MoeForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Qwen3MoeForCausalLM, self).__init__(fd_config)
self.model = Qwen3MoeModel(fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
@classmethod
def name(cls):
"""
"""
return "Qwen3MoeForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
"""
"""
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
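# Mask logits for vocab positions beyond ori_vocab_size (padding slots) so
# they can never be sampled.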
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
class Qwen3MoePretrainedModel(PretrainedModel):
"""
Qwen3MoePretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
# TODO not support TP split now, next PR will support TP.
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def get_tensor_parallel_split_mappings(num_layers, moe_num_experts):
final_actions = {}
base_actions = {
"lm_head.weight": partial(fn, is_column=True),
# Row Linear
"embed_tokens.weight": partial(fn, is_column=False),
"layers.0.self_attn.o_proj.weight": partial(fn,
is_column=False),
}
# Column Linear
config.fuse_attention_qkv = False
if config.fuse_attention_qkv:
base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(
fn, is_column=True)
else:
base_actions["layers.0.self_attn.q_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.q_proj.bias"] = partial(
fn, is_column=True)
# if we have enough num_key_value_heads to split, then split it.
if config.num_key_value_heads % config.tensor_parallel_degree == 0:
base_actions["layers.0.self_attn.k_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.v_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.k_proj.bias"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.v_proj.bias"] = partial(
fn, is_column=True)
for key, action in base_actions.items():
if "layers.0." in key:
for i in range(num_layers):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
final_actions[key] = action
base_actions = {
"layers.0.mlp.experts.0.gate_proj.weight":
partial(fn, is_column=True),
"layers.0.mlp.experts.0.down_proj.weight":
partial(fn, is_column=False),
"layers.0.mlp.experts.0.up_proj.weight":
partial(fn, is_column=True),
}
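# Expand the per-expert split actions across every layer and every expert index.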
for key, action in base_actions.items():
for i in range(num_layers):
newkey = key.replace("layers.0.", f"layers.{i}.")
for j in range(moe_num_experts):
newkey2 = newkey.replace("experts.0.", f"experts.{j}.")
final_actions[newkey2] = action
return final_actions
moe_num_experts = 0
if isinstance(config.moe_num_experts, list):
moe_num_experts = sum(config.moe_num_experts)
elif isinstance(config.moe_num_experts, int):
moe_num_experts = config.moe_num_experts
else:
raise ValueError(
f"Not support type of moe_num_experts [{type(config.moe_num_experts)}]"
)
mappings = get_tensor_parallel_split_mappings(config.num_layers,
moe_num_experts)
return mappings

View File

@@ -1,382 +0,0 @@
"""
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import re
from shutil import copyfile
from typing import Dict, List, Optional, Tuple
import numpy as np
import paddle
import sentencepiece as spm
from paddlenlp.transformers import PretrainedTokenizer
from paddlenlp.transformers.tokenizer_utils_base import (PaddingStrategy,
TextInput)
from paddlenlp.utils.log import logger
__all__ = ["ErnieBotTokenizer"]
# copy from ernie_core.tokenizers
class ErnieBotTokenizer(PretrainedTokenizer):
"""
A more convenient `ErnieBotTokenizer` that can encode the special tokens used
in the current SFT/PPO stages and also supports multimodal inputs.
"""
resource_files_names = {
"vocab_file": "spm.model",
}
pretrained_resource_files_map = {"vocab_file": {"ernie-bot-10b": None}}
pretrained_init_configuration = {
"ernie-bot-10b": {},
}
model_input_names = [
"input_ids", "position_ids", "attention_mask", "labels"
]
padding_side = "right"
def __init__(
self,
vocab_file,
bos_token="<s>",
cls_token="<cls>",
eos_token="</s>",
mask_token="<mask:0>",
pad_token="<pad>",
sep_token="<sep>",
unk_token="<unk>",
additional_special_tokens=None,
**kwargs,
):
"""doc"""
if additional_special_tokens is None:
additional_special_tokens = ["<mask:1>", "<mask:7>"]
super().__init__(
bos_token=bos_token,
cls_token=cls_token,
eos_token=eos_token,
mask_token=mask_token,
pad_token=pad_token,
sep_token=sep_token,
unk_token=unk_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
@property
def space_token(self):
"""doc"""
return "<mask:1>"
@property
def space_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:1>")
@property
def gend_token(self):
"""doc"""
return "<mask:7>"
@property
def gend_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:7>")
@property
def im_start_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_start|>")
@property
def im_end_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_end|>")
@property
def vocab_size(self):
"""doc"""
return self.sp_model.vocab_size()
def get_vocab(self):
"""doc"""
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""doc"""
return self.sp_model.encode_as_pieces(text)
def _convert_token_to_id(self, token):
"""doc"""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, id):
"""doc"""
return self.sp_model.id_to_piece(id)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
# prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
# if not prev_is_special:
# out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
# prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
# prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string # .strip()
def prepare_for_model(self, *args, **kwargs):
"""doc"""
if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(
f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
self.resource_files_names["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file, )
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
"""
Converts a string in a sequence of tokens, using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
Args:
text (`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
Returns:
`List[str]`: The list of tokens.
"""
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
# all_special_tokens_extended = dict(
# (str(t), t)
# for t in self.all_special_tokens_extended
# if isinstance(t, AddedToken)
# )
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
# TODO: should this be in the base class?
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens +
self.all_special_tokens)
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern,
lambda m: m.groups()[0] or m.groups()[1].lower(),
text)
no_split_token = set(self.unique_no_split_tokens)
tokens = self.tokens_trie.split(text)
tokenized_text = []
for token in tokens:
# Need to skip eventual empty (fully stripped) tokens
if not token:
continue
if token in no_split_token:
tokenized_text.append(token)
else:
tokenized_text.extend(self._tokenize(token))
# ["This", " is", " something", "<special_token_1>", "else"]
return tokenized_text
def _decode(self, *args, **kwargs):
"""doc"""
kwargs.pop("clean_up_tokenization_spaces", None)
kwargs.pop("spaces_between_special_tokens", None)
return super()._decode(
*args,
**kwargs,
clean_up_tokenization_spaces=False,
spaces_between_special_tokens=False,
)
def _pad(
self,
encoded_inputs: Dict,
max_length: Optional[int] = None,
padding_strategy=PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""doc"""
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_attention_mask:
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if (max_length is not None and pad_to_multiple_of is not None
and (max_length % pad_to_multiple_of != 0)):
max_length = ((max_length // pad_to_multiple_of) +
1) * pad_to_multiple_of
needs_to_be_padded = (padding_strategy
!= PaddingStrategy.DO_NOT_PAD
and len(required_input) != max_length)
if ("attention_mask" in encoded_inputs
and encoded_inputs["attention_mask"] is not None):
attention_mask = encoded_inputs.pop("attention_mask")
if isinstance(attention_mask, paddle.Tensor):
attention_mask = attention_mask.numpy()
elif isinstance(attention_mask, list):
attention_mask = np.array(attention_mask)
elif not isinstance(attention_mask, np.ndarray):
raise ValueError(
f"Unexpected type {type(attention_mask)} of attention_mask, "
)
else:
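# No attention mask provided: build a causal (lower-triangular) mask of
# shape [1, seq_len, seq_len].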
attention_mask = np.tril(
np.ones((len(required_input), len(required_input)),
dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
if attention_mask.ndim == 1:
pad_width = [(0, difference)]
else:
pad_width = [(0, 0), (0, difference), (0, difference)]
elif self.padding_side == "left":
if attention_mask.ndim == 1:
pad_width = [(difference, 0)]
else:
pad_width = [(0, 0), (difference, 0), (difference, 0)]
else:
raise ValueError("Invalid padding strategy:" +
str(self.padding_side))
attention_mask = np.pad(
attention_mask,
pad_width=pad_width,
mode="constant",
constant_values=0,
)
encoded_inputs = super()._pad(
encoded_inputs,
max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=False,
)
if return_attention_mask:
encoded_inputs["attention_mask"] = attention_mask.tolist()
return encoded_inputs
def add_special_tokens(
tokenizer,
special_tokens_info,
use_ocr_specialtoken=False,
use_crop_specialtoken=False,
special_token_ids_start=254208,
special_token_ids_end=256256,
):
"""
Add special tokens.
Placeholders: [<|IMAGE_PLACEHOLDER|>, <|AUDIO_PLACEHOLDER|>, <|VIDEO_PLACEHOLDER|>] (3 tokens)
Modality begin/end special tokens: [<|BOI|> <|EOI|> <|BOA|> <|EOA|> <|BOV|> <|EOV|>]
OCR special tokens: [<|LOC_0|> <|LOC_1|> ... <|LOC_1000|>] (1001 tokens)
Crop special tokens: [<|CROP_COL_SEP|>, <|CROP_ROW_SEP|>, <|CROP_IMAGE_SEP|>] (3 tokens)
<|CROP_COL_SEP|>: column-wise split separator along the image width (replaces the plain-text comma)
<|CROP_ROW_SEP|>: row-wise split separator along the image height (replaces the plain-text newline)
<|CROP_IMAGE_SEP|>: separates the original image from its crops (replaces the plain-text double newline)
2048 unused tokens in total
Args:
tokenizer (ErnieTokenizer): tokenizer
special_token_ids_start (int, optional): starting id for special tokens. Defaults to 254208.
special_token_ids_end (int, optional): maximum supported vocabulary size. Defaults to 256256.
"""
special_tokens = [
special_tokens_info["image_placeholder"],
special_tokens_info["audio_placeholder"],
]
if use_ocr_specialtoken:
special_tokens.extend(special_tokens_info["ocr_coor"])
special_tokens.extend(special_tokens_info["ocr_begin_end"])
if use_crop_specialtoken:
special_tokens.extend(special_tokens_info["crop"])
# add special_tokens
additional_special_tokens = {"additional_special_tokens": special_tokens}
tokenizer.add_special_tokens(additional_special_tokens)
# check
first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
assert (first_special_tokens[0] == special_token_ids_start
), f"[ERROR] first_special_tokens={first_special_tokens}"
assert (
len(tokenizer.get_vocab()) < special_token_ids_end
), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!"

File diff suppressed because it is too large

View File

@@ -0,0 +1,22 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
try:
from .wint2_fused_moe import fused_moe_wint2_triton
__all__ = ["fused_moe_wint2_triton"]
except Exception:  # the optional Triton-based kernels may be unavailable
pass

View File

@@ -0,0 +1,804 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import inspect
import os
import re
import sys
import paddle
import triton
from paddle.base.framework import OpProtoHolder
from fastdeploy import envs
compile_file = triton.__path__[0] + "/tools/compile.py"
link_file = triton.__path__[0] + "/tools/link.py"
python_path = sys.executable
def SubstituteTemplate(template, values):
"""
Substitute all variables in the given template string using the provided values dictionary.
"""
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
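# Illustrative example: SubstituteTemplate("${op}_func", {"op": "gemm"})
# returns "gemm_func". Substitution repeats until no "${...}" placeholder
# changes, so values may themselves contain placeholders.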
def find_so_path(generated_dir, python_package_name):
"""
Find the specified .so in generated_dir; return None if it is not found.
"""
so_path = []
for root, dirs, files in os.walk(generated_dir):
for file in files:
if file.endswith(python_package_name + ".so"):
so_path.append(os.path.join(root, file))
if len(so_path) == 0:
return None
else:
assert len(so_path) == 1
return so_path[0]
def multi_process_do(commands):
"""
Multi-threaded execution of commands.
"""
THREADS = 40
import multiprocessing
process = []
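# Worker thread_id executes commands[thread_id::THREADS]; a nonzero exit
# status from any command aborts via the assert below.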
def one_process_work(commands, thread_id):
i = thread_id
while i < len(commands):
re = os.system(commands[i])
assert re == 0
i += THREADS
for i in range(THREADS):
p = multiprocessing.Process(target=one_process_work,
args=(commands, i))
process.append(p)
for p in process:
p.start()
for p in process:
p.join()
def extract_triton_kernel(kernel, file_name):
"""
Extract the triton kernel and write it to the specified file_name.
Args:
kernel: the triton kernel name.
file_name: the file name you want to write.
"""
import inspect
import re
import textwrap
fn = kernel
if type(kernel) == triton.runtime.jit.JITFunction:
fn = kernel.fn
elif type(kernel) == triton.runtime.autotuner.Autotuner:
fn = kernel.fn.fn
else:
raise AssertionError(f"unsupported kernel type: {type(kernel)}")
py_script = textwrap.dedent(inspect.getsource(fn))
# @triton.jit must only appear once
# assert len(re.findall("@triton.jit", py_script)) == 1
assert len(re.findall("def ", py_script)) == 1
# assert len(re.findall("@haha()", py_script)) == 1
# py_script = py_script.replace("@haha()", "@triton.jit")
py_script = py_script[py_script.find("def "):]
py_script = "import triton\nimport triton.language as tl\n\n\n@triton.jit\n" + py_script
py_script = py_script.replace("if bias_ptr is not None", "if bias_ptr")
with open(file_name, "w") as f:
f.write(py_script)
f.close()
template_install = """
import os
generated_cu = []
for root, dirs, files in os.walk("./"):
for file in files:
if file.endswith(".c") or file.endswith(".cu"):
generated_cu.append(os.path.join(root, file))
import paddle
from paddle.utils.cpp_extension import CUDAExtension, setup
def get_gencode_flags():
prop = paddle.device.cuda.get_device_properties()
cc = prop.major * 10 + prop.minor
return ["-gencode", "arch=compute_{{0}},code=sm_{{0}}".format(cc)]
gencode_flags = get_gencode_flags()
setup(
name="{python_package_name}",
ext_modules=CUDAExtension(
sources = generated_cu,
extra_compile_args={{
"cc": ["-lcuda"],
"nvcc": [
"-O3",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
]
+ gencode_flags,
}},
extra_link_args = ["-lcuda"]
),
)
"""
def get_op_name_with_suffix(op_name, x_list):
"""
Get the operator name with suffix.
"""
suffix = []
for x in x_list:
if x % 16 == 0:
suffix.append(16)
elif x == 1:
suffix.append(1)
else:
suffix.append(0)
return op_name + "_".join([str(i) for i in suffix])
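# Illustrative example: get_op_name_with_suffix("moe_ffn", [64, 1, 5])
# returns "moe_ffn16_1_0" (16 for multiples of 16, 1 for unit dims, 0 otherwise).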
def get_value_hint(x):
"""
Get the value hint from input list.
"""
hint = ""
for ele in x:
if type(ele) == int:
if ele % 16 == 0 and ele > 0:
hint += "i64:16,"
elif ele == 1:
hint += "i64:1,"
else:
hint += "i64,"
if type(ele) == float:
hint += "fp32,"
return hint
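# Illustrative example: get_value_hint([32, 1, 3, 0.5]) returns
# "i64:16,i64:1,i64,fp32,".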
def get_dtype_str(dtype):
"""
Get the dtype str.
"""
if dtype == paddle.float16:
return "_fp16"
if dtype == paddle.float8_e4m3fn:
return "_float8_e4m3fn"
elif dtype == paddle.uint8:
return "_u8"
elif dtype == paddle.int8:
return "_i8"
elif dtype == paddle.int16:
return "_i16"
elif dtype == paddle.int32:
return "_i32"
elif dtype == paddle.int64:
return "_i64"
elif dtype == paddle.float32:
return "_fp32"
elif dtype == paddle.bfloat16:
return "_bf16"
else:
raise ValueError("Not support this dtype.")
def build_package(generated_dir, python_package_name):
"""
Build the package, not install it.
Args:
generated_dir: the source cu file dir.
python_package_name: the python package name.
"""
setup_file_path = generated_dir + "/setup_cuda.py"
python_path = sys.executable
with open(setup_file_path, "w") as f:
f.write(
template_install.format(python_package_name=python_package_name))
f.close()
install_command = f"cd {generated_dir} && {python_path} setup_cuda.py build"
re = os.system(install_command)
assert re == 0
def rename_c_to_cu(generated_dir):
"""
Rename the .c files in generated_dir to .cu files, because the Triton AOT tool generates .c files.
"""
# rename the .c file to .cu
for filename in os.listdir(generated_dir):
if filename.endswith(".c"):
old_path = os.path.join(generated_dir, filename)
new_path = os.path.join(generated_dir, filename + "u")
os.rename(old_path, new_path)
def get_pointer_hint(dtypes):
"""
Get the pointer hint from input list.
"""
hint = ""
for ele in dtypes:
if ele == paddle.float16:
hint += "*fp16:16,"
elif ele == paddle.uint8:
hint += "*u8:16,"
elif ele == paddle.int8:
hint += "*i8:16,"
elif ele == paddle.int16:
hint += "*i16:16,"
elif ele == paddle.float32:
hint += "*fp32:16,"
elif ele == paddle.bfloat16:
hint += "*bf16:16,"
elif ele == paddle.int32:
hint += "*i32:16,"
elif ele == paddle.int64:
hint += "*i64,"
elif ele == paddle.float8_e4m3fn:
hint += "*fp8e4nv:16,"
return hint
paddle_custom_op_head_part = """#include <vector>
#include <map>
#include "${op_name}_kernel.h"
#include "paddle/extension.h"
std::map<std::vector<int>, int> map_problem_${op_name};
CUdeviceptr get_tensor_ptr(const paddle::Tensor& input){
if (input.type() == paddle::DataType::FLOAT16) {
return (CUdeviceptr)(input.data<phi::dtype::float16>());
} else if (input.type() == paddle::DataType::BFLOAT16) {
return (CUdeviceptr)(input.data<phi::dtype::bfloat16>());
} else if (input.type() == paddle::DataType::INT32) {
return (CUdeviceptr)(input.data<int>());
} else if (input.type() == paddle::DataType::FLOAT32) {
return (CUdeviceptr)(input.data<float>());
} else if (input.type() == paddle::DataType::UINT8) {
return (CUdeviceptr)(input.data<uint8_t>());
} else if (input.type() == paddle::DataType::INT8) {
return (CUdeviceptr)(input.data<int8_t>());
} else if (input.type() == paddle::DataType::INT64) {
return (CUdeviceptr)(input.data<int64_t>());
} else if (input.type() == paddle::DataType::INT32) {
return (CUdeviceptr)(input.data<int32_t>());
} else if (input.type() == paddle::DataType::INT16) {
return (CUdeviceptr)(input.data<int16_t>());
} else if (input.type() == paddle::DataType::FLOAT8_E4M3FN) {
return (CUdeviceptr)(input.data<phi::dtype::float8_e4m3fn>());
} else {
assert(false);
return (CUdeviceptr)(nullptr);
}
}
int triton_cdiv(int x, int y) {
int result = (x + y - 1) / y;
return (int)(result);
}
"""
tune_and_invoke_part = """
std::vector<int> problem_size = {${key}};
auto run_triton_kernel = [&](int algo_id) -> CUresult{
return ${op_name}_kernel(run_stream,
${triton_kernel_args},
algo_id);
};
map_problem_${op_name}[problem_size] = 0;
if (!map_problem_${op_name}.count(problem_size)) {
std::cout << "we are tuning for ${op_name} which key is: {";
for (int i = 0; i < problem_size.size(); i++) {
std::cout << problem_size[i] << ", ";
}
std::cout << "}" << std::endl;
float min_time = 10000.f;
int select_id = -1;
constexpr int WARMUP = 5;
constexpr int REPEAT = 10;
for (int algo_id = 0; algo_id < ${op_name}_kernel_get_num_algos(); ++algo_id) {
cudaEvent_t beg[REPEAT];
cudaEvent_t end[REPEAT];
float elapsed_times[REPEAT];
auto status = CUDA_SUCCESS;
for (int ii = 0; ii < WARMUP + REPEAT; ii++) {
int repeat_id = ii - WARMUP;
if (repeat_id >= 0) {
(cudaEventCreate(beg + repeat_id));
(cudaEventCreate(end + repeat_id));
(cudaEventRecord(beg[repeat_id]));
}
auto flush_l2_cache = paddle::full(
{10 * 1024 * 1024}, 0, paddle::DataType::INT32, ${arbitary_output_name}.place());
// std::cout << &flush_l2_cache << std::endl;
// this is used when the output needs to be reset to zero, such as in split-k gemm.
${reset_zero_when_tune};
status = run_triton_kernel(algo_id);
// assert(status == CUDA_SUCCESS);
if (repeat_id >= 0) {
(cudaEventRecord(end[repeat_id]));
(cudaEventSynchronize(end[repeat_id]));
(cudaEventElapsedTime(
elapsed_times + repeat_id, beg[repeat_id], end[repeat_id]));
}
}
float avg_elapsed_time = 0.f;
for (int ii = 0; ii < REPEAT; ++ii) {
avg_elapsed_time += elapsed_times[ii];
}
std::cout << "algo id " << algo_id << " costs " << avg_elapsed_time << " ms" << std::endl;
if (avg_elapsed_time < min_time && status == CUDA_SUCCESS) {
min_time = avg_elapsed_time;
select_id = algo_id;
}
}
map_problem_${op_name}[problem_size] = select_id;
std::cout << "select algo id: " << select_id << std::endl;
${reset_zero_when_tune};
}
if (map_problem_${op_name}.count(problem_size)) {
int algo_id = map_problem_${op_name}[problem_size];
auto status = run_triton_kernel(algo_id);
assert(status == CUDA_SUCCESS);
}
"""
common_template = ("""
std::vector<paddle::Tensor> ${op_name}_func(${input_and_attr}) {
${prepare_attr_for_triton_kernel}
${prepare_ptr_for_triton_kernel}
auto run_stream = ${arbitary_output_name}.stream();
""" + tune_and_invoke_part + """
return {${return_tensor_names}};
}
${d2s_infer_code}
PD_BUILD_OP(${op_name})
.Inputs({${paddle_input_sig}})
.Outputs({${paddle_output_sig}})
.Attrs({${paddle_attr_sig}})
.SetKernelFn(PD_KERNEL(${op_name}_func))
.SetInferDtypeFn(PD_INFER_DTYPE(${op_name}_InferDtype))
.SetInferShapeFn(PD_INFER_SHAPE(${op_name}_InferShape));
""")
def rendering_common_template(
func,
prepare_attr_for_triton_kernel,
prepare_ptr_for_triton_kernel,
return_tensor_names=None,
d2s_infer_code="",
):
"""
Render a template with given function and its arguments.
Args:
func: The function to render.
prepare_attr_for_triton_kernel: The code snippet that prepares attributes for Triton kernel.
prepare_ptr_for_triton_kernel: The code snippet that prepares pointers for Triton kernel.
return_tensor_names: The names of the returned tensors. Default is None.
"""
signature = inspect.signature(func)
arg_names = [v.name for v in signature.parameters.values()]
arg_defaults = [v.default for v in signature.parameters.values()]
input_and_attr = ""
paddle_input_sig = ""
paddle_attr_sig = ""
if return_tensor_names is None:
return_tensor_names = "useless"
prepare_ptr_for_triton_kernel += (
"auto useless = paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());"
)
for i in range(len(arg_names)):
if arg_defaults[i] is None:
input_and_attr += f"paddle::optional<paddle::Tensor> & {arg_names[i]},"
paddle_input_sig += f"""paddle::Optional("{arg_names[i]}"),"""
elif type(arg_defaults[i]) == float:
input_and_attr += f"float {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: float","""
elif type(arg_defaults[i]) == bool:
input_and_attr += f"bool {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: bool","""
elif type(arg_defaults[i]) == int:
input_and_attr += f"int64_t {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: int64_t","""
elif type(arg_defaults[i]) == str:
input_and_attr += f"std::string {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: std::string","""
elif arg_names[i] == "config":
continue
else:
input_and_attr += f"const paddle::Tensor & {arg_names[i]},"
paddle_input_sig += f""""{arg_names[i]}","""
input_and_attr = input_and_attr[:-1]
paddle_input_sig = paddle_input_sig[:-1]
if len(paddle_attr_sig) > 1:
paddle_attr_sig = paddle_attr_sig[:-1]
paddle_output_sig = ""
arbitary_output_name = ""
for name in return_tensor_names.split(","):
name = name.strip()
arbitary_output_name = name
paddle_output_sig += f""""{name}","""
paddle_output_sig = paddle_output_sig[:-1]
if "${op_name}_InferShape" not in d2s_infer_code:
d2s_infer_shape_part = (
"std::vector<std::vector<int64_t>> ${op_name}_InferShape("
"const std::vector<int64_t>& A_shape) {"
"return {${tmp}};"
"}\n ")
tmp = ",".join(["A_shape"] * len(return_tensor_names.split(",")))
tmp_dict = {"tmp": tmp}
d2s_infer_shape_part = SubstituteTemplate(d2s_infer_shape_part,
tmp_dict)
d2s_infer_code += d2s_infer_shape_part
if "${op_name}_InferDtype" not in d2s_infer_code:
d2s_infer_dtype_part = (
"std::vector<paddle::DataType> ${op_name}_InferDtype("
"const paddle::DataType& A_dtype) {"
"return {${tmp}};"
"}\n ")
tmp = ",".join(["A_dtype"] * len(return_tensor_names.split(",")))
tmp_dict = {"tmp": tmp}
d2s_infer_dtype_part = SubstituteTemplate(d2s_infer_dtype_part,
tmp_dict)
d2s_infer_code += d2s_infer_dtype_part
result_str = SubstituteTemplate(
common_template,
{
"input_and_attr": input_and_attr,
"prepare_attr_for_triton_kernel": prepare_attr_for_triton_kernel,
"prepare_ptr_for_triton_kernel": prepare_ptr_for_triton_kernel,
"return_tensor_names": return_tensor_names,
"arbitary_output_name": arbitary_output_name,
"d2s_infer_code": d2s_infer_code,
"paddle_input_sig": paddle_input_sig,
"paddle_output_sig": paddle_output_sig,
"paddle_attr_sig": paddle_attr_sig,
},
)
return paddle_custom_op_head_part + result_str
class KernelInterface:
"""
triton kernel interface.
"""
def __init__(
self,
func,
other_config,
key_args=["1"],
):
"""
triton kernel interface.
"""
self.func = func
self.key_args = key_args
signature = inspect.signature(func)
self.arg_names = [v.name for v in signature.parameters.values()]
for ele in self.arg_names:
assert self.arg_names.count(ele) == 1
# arg_defaults = [v.default for v in signature.parameters.values()]
# self.annotations = {
# name: ty for name, ty in func.__annotations__.items()
# }
self.annotations = dict(func.__annotations__)
self.constexprs = [
self.arg_names.index(name) for name in self.arg_names
if self.annotations.get(name) == triton.language.core.constexpr
]
self.arg_exclude_constexpr = [
self.arg_names[i] for i in range(len(self.arg_names))
if i not in self.constexprs
]
import textwrap
py_script = textwrap.dedent(inspect.getsource(func))
import re
pat = r"def\s" + func.__name__
func_begin = re.findall(pat, py_script)
assert len(func_begin) == 1
func_begin = func_begin[0]
py_script = py_script[py_script.find(func_begin):]
def decorator(*args, **kwargs):
"""
decorator for triton kernels.
Args:
*args: positional arguments
**kwargs: keyword arguments
"""
all_input = []
for i in range(len(args)):
all_input.append(args[i])
position_arguments_num = len(all_input)
for i in range(position_arguments_num, len(self.arg_names)):
if self.arg_names[i] in kwargs.keys():
all_input.append(kwargs[self.arg_names[i]])
else:
# means this input is not specified, so it must be a tl.constexpr.
assert i in self.constexprs
all_input.append(None)
dtypes = []
x_list = []
const_args = [self.arg_names[i] for i in self.constexprs]
# we don't allow two strings in const_args where one is a substring of the other.
for i in const_args:
for j in const_args:
if i != j and i.find(j) != -1:
raise ValueError(
f"We find {i}, {j} in tl.constexpr args, and {j} is a substring of {i}, "
"please modify your triton kernel arguments names to avoid this."
)
modified_arg_exclude_constexpr = self.arg_exclude_constexpr
const_hint_dict = {}
for i in range(len(all_input)):
ele = all_input[i]
if (type(ele) == paddle.Tensor
or type(ele) == paddle.base.framework.EagerParamBase
or type(ele) == paddle.base.framework.Parameter
or type(ele) == paddle.base.framework.Variable
or type(ele) == paddle.base.libpaddle.pir.Value):
dtypes.append(ele.dtype)
modified_arg_exclude_constexpr[i] = f"input_ptrs[{i}]"
elif i in self.constexprs:
const_hint_dict[self.arg_names[i]] = ele
else:
x_list.append(ele)
op_name = self.op_name
python_package_name = f"{op_name}_package"
tp_rank = paddle.distributed.get_rank()
generated_dir = envs.FD_TRITON_KERNEL_CACHE_DIR
if generated_dir is None:
generated_dir = f"/tmp/triton_cache/rank{tp_rank}"
print("the kernel cache dir is:", generated_dir)
assert (generated_dir is not None), (
"TRITON_KERNEL_CACHE_DIR is None, please set it such as "
"export TRITON_KERNEL_CACHE_DIR=/tmp/triton_cache ")
generated_dir = f"{generated_dir}/{op_name}"
os.makedirs(generated_dir, exist_ok=True)
py_script_file = f"{generated_dir}/triton_kernels.py"
extract_triton_kernel(func, py_script_file)
address_hint = get_pointer_hint(dtypes)
value_hint = get_value_hint(x_list)
const_args = [f"{{{ele}}}" for ele in const_args]
const_args = ",".join(const_args)
lanuch_grid = list(self.grid)
for i in range(len(lanuch_grid)):
ele = lanuch_grid[i]
if type(ele) == str:
for key in const_hint_dict.keys():
if key in ele:
ele = ele.replace(key, f"{{{key}}}")
else:
ele = str(ele)
lanuch_grid[i] = ele
if len(lanuch_grid) < 3:
lanuch_grid += ["1"] * (3 - len(lanuch_grid))
lanuch_grid = ",".join(lanuch_grid)
op_dict = {"op_name": op_name, "reset_zero_when_tune": ""}
op_dict["triton_kernel_args"] = ",".join(
modified_arg_exclude_constexpr)
op_dict["key"] = ",".join(self.key_args)
# when tuning, we need to reset the output to zero.
if "reset_zero_when_tune" in other_config.keys():
op_dict["reset_zero_when_tune"] = other_config[
"reset_zero_when_tune"]
paddle_custom_op_file_path = f"{generated_dir}/{op_name}.cu"
so_path = find_so_path(generated_dir, python_package_name)
if so_path is None:
print("== we do not find so_path, we need to compile it")
with open(paddle_custom_op_file_path, "w") as f:
f.write(
SubstituteTemplate(
self.custom_op_template,
op_dict,
))
f.close()
# ahead of time compile command.
aot_template = (
f"""{python_path} {compile_file} {py_script_file} """ +
f""" -n {func.__name__} -o {generated_dir}/{op_name}_kernel """
+ f"""--out-name {op_name}_kernel """ +
""" -w {num_warps} -ns {num_stages} """ +
f""" -s"{address_hint} {value_hint} {const_args}" """ +
f""" -g "{lanuch_grid}" """)
all_tune_config = list(self.tune_config)
if len(all_tune_config) == 0:
# when user do not specify config, we use const_hint_dict as config.
all_tune_config = [const_hint_dict]
# reset const_hint_dict as empty.
const_hint_dict = {}
codegen_commands = []
for config in all_tune_config:
for key in const_hint_dict.keys():
if const_hint_dict[key] is not None:
if key not in config.keys():
config[key] = const_hint_dict[key]
else:
if config[key] == const_hint_dict[key]:
pass
else:
message = (
f"you specify {key} both in arguments and config, "
"and they are not same, this is wrong."
)
raise ValueError(message)
else:
assert key in config.keys(
), f"you must specify {key} in your config."
if "num_warps" not in config.keys():
config["num_warps"] = 4
if "num_stages" not in config.keys():
config["num_stages"] = 4
for key in config:
assert config[
key] is not None, f"{key} must be specified."
codegen_command = aot_template.format(**config, )
print(codegen_command)
codegen_commands.append(codegen_command)
multi_process_do(codegen_commands)
link_command = (
f"{python_path} {link_file} "
f"{generated_dir}/*.h -o {generated_dir}/{op_name}_kernel")
re = os.system(link_command)
assert re == 0
# rename the .c file to .cu
rename_c_to_cu(generated_dir)
# build the package to so, not install
build_package(generated_dir, python_package_name)
if op_name not in OpProtoHolder.instance().op_proto_map.keys():
so_path = find_so_path(generated_dir, python_package_name)
print("== we find so_path: ", so_path)
assert so_path is not None
paddle.utils.cpp_extension.load_op_meta_info_and_register_op(
so_path)
self.decorator = decorator
def __getitem__(self, op_name_and_grid):
"""
override the operator [], which will call the decorator function.
Args:
op_name_and_grid: the name of the operator and the grid size.
Returns:
the decorator function.
"""
assert len(op_name_and_grid) >= 3, "len(op_name_and_grid) must >= 3."
self.op_name = op_name_and_grid[0]
self.custom_op_template = op_name_and_grid[1]
self.grid = op_name_and_grid[2]
if len(op_name_and_grid) == 3:
self.tune_config = {}
else:
self.tune_config = op_name_and_grid[3]
return self.decorator
def paddle_use_triton(other_config={}, key=[]):
"""
Decorator factory that registers a Triton kernel as a Paddle custom operator.
Args:
other_config: extra options (e.g. reset_zero_when_tune).
key: expressions used to build the tuning cache key.
Returns:
a decorator that wraps the kernel in a KernelInterface.
"""
def decorator(func):
"""
The decorator function that wraps the original function.
Args:
func: the original function.
Returns:
the wrapped function.
"""
return KernelInterface(func, other_config, key)
return decorator

View File

@@ -0,0 +1,549 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
import triton.language as tl
from paddle import _C_ops
from paddle.base.framework import OpProtoHolder
from paddle.framework import in_dynamic_or_pir_mode
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
get_dtype_str, paddle_use_triton, rendering_common_template)
BLOCK_SIZE_M = 16
def invoke_fused_moe_kernel(
A,
B,
C,
B_scale,
B_super_scale,
B_code_scale,
B_code_zp,
topk_weights,
topk_ids,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
mul_routed_weight=False,
top_k=-1,
group_size=-1,
):
"""
Invoke Fused Moe Kernel
"""
KK = A.shape[-1]
NN = B.shape[-1]
sstride_am, sstride_ak = A.shape[1], 1
sstride_be, sstride_bk, sstride_bn = B.shape[1] * B.shape[2], B.shape[2], 1
sstride_cm, sstride_cn = C.shape[-1], 1
sstride_bse, sstride_bsk, sstride_bsn = B_scale.shape[1] * B_scale.shape[
2], B_scale.shape[2], 1
sstride_bce, sstride_bck, sstride_bcn = B_code_scale.shape[1], 1, 1
ddouble_quant = B_super_scale is not None
prepare_attr_for_triton_kernel = """
auto N = B.shape()[2];
auto K = A.shape()[1];
auto EM = sorted_token_ids.shape()[0];
auto num_valid_tokens = (topk_ids.shape()[0]) * (topk_ids.shape()[1]);
auto stride_am = A.strides()[0];
auto stride_ak = A.strides()[1];
auto stride_be = B.strides()[0];
auto stride_bk = B.strides()[1];
auto stride_bn = B.strides()[2];
auto stride_cm = C.strides()[1];
auto stride_cn = C.strides()[2];
auto stride_bse = B_scale.strides()[0];
auto stride_bsk = B_scale.strides()[1];
auto stride_bsn = 1;
auto stride_bce = B_code_scale.strides()[0];
auto stride_bck = 1;
auto stride_bcn = 1;
auto double_quant = true;
"""
if mul_routed_weight:
config = {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"GROUP_SIZE_M": 2,
"num_warps": 4,
"num_stages": 8,
}
else:
config = {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 512,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 12,
}
configs = []
configs.append(dict(config))
op_name = "wint2_moe_ffn"
op_name += f"{get_dtype_str(A.dtype)}"
op_name += f"{B.shape[0]}"
op_name += f"{B.shape[1]}"
op_name += f"{B.shape[2]}"
if op_name not in OpProtoHolder.instance().op_proto_map.keys():
prepare_ptr_for_triton_kernel = """
CUdeviceptr input_ptrs[11] = {
get_tensor_ptr(A),
get_tensor_ptr(B),
get_tensor_ptr(C),
get_tensor_ptr(B_scale),
get_tensor_ptr(B_super_scale),
get_tensor_ptr(B_code_scale),
get_tensor_ptr(B_code_zp),
get_tensor_ptr(topk_weights),
get_tensor_ptr(sorted_token_ids),
get_tensor_ptr(expert_ids),
get_tensor_ptr(num_tokens_post_padded),
};
"""
template_used = rendering_common_template(
invoke_fused_moe_kernel,
prepare_attr_for_triton_kernel,
prepare_ptr_for_triton_kernel,
)
grid = (
"(EM+BLOCK_SIZE_M-1)/BLOCK_SIZE_M * ((N+BLOCK_SIZE_N-1)/BLOCK_SIZE_N)",
)
moe_wint2_ffn_kernel[(op_name, template_used, grid, configs)](
A,
B,
C,
B_scale,
B_super_scale,
B_code_scale,
B_code_zp,
topk_weights,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
NN,
KK,
-1, #EEM,
-1, #nnum_valid_tokens,
sstride_am,
sstride_ak,
sstride_be,
sstride_bk,
sstride_bn,
sstride_cm,
sstride_cn,
sstride_bse,
sstride_bsk,
sstride_bsn,
sstride_bce,
sstride_bck,
sstride_bcn,
MUL_ROUTED_WEIGHT=(int)(mul_routed_weight),
USE_DOUBLE_QUANT=(int)(ddouble_quant),
top_k=top_k,
BLOCK_SIZE_K=group_size,
)
if in_dynamic_or_pir_mode():
outs = _C_ops._run_custom_op(
op_name,
A,
B,
C,
B_scale,
B_super_scale,
B_code_scale,
B_code_zp,
topk_weights,
topk_ids,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
mul_routed_weight,
top_k,
group_size,
)
return outs[0]
@paddle_use_triton(key=["1"], )
def moe_wint2_ffn_kernel(
# Pointers to matrices
a_ptr,
b_ptr,
c_ptr,
bs_ptr,
superbs_ptr,
codebs_ptr,
codebzp_ptr,
topk_weights_ptr,
sorted_token_ids_ptr,
expert_ids_ptr,
num_tokens_post_padded_ptr,
# Matrix dimensions
N,
K,
EM,
num_valid_tokens,
# The stride variables represent how much to increase the ptr by when
# moving by 1 element in a particular dimension. E.g. `stride_am` is
# how much to increase `a_ptr` by to get the element one row down
# (A has M rows).
stride_am,
stride_ak,
stride_be,
stride_bk,
stride_bn,
stride_cm,
stride_cn,
stride_bse,
stride_bsk,
stride_bsn,
stride_bce,
stride_bck,
stride_bcn,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr,
MUL_ROUTED_WEIGHT: tl.constexpr,
USE_DOUBLE_QUANT: tl.constexpr,
top_k: tl.constexpr,
):
"""
Implements the fused computation for a Mixture of Experts (MOE) using
token and expert matrices.
Key Parameters:
- A: The input tensor representing tokens with shape (*, K), where '*' can
be any shape representing batches and K is the feature dimension of
each token.
- B: The stacked MOE weight tensor with shape (E, N, K), where E is
the number of experts, K is the input feature dimension, and N is
the output feature dimension.
- C: The output cache tensor with shape (M, topk, N), where M is the
total number of tokens post padding, topk is the number of times
each token is repeated, and N is the output feature dimension.
- sorted_token_ids: A tensor containing the sorted indices of tokens,
repeated topk times and arranged by the expert index they are
assigned to.
- expert_ids: A tensor containing the indices of the expert for each
block. It determines which expert matrix from B should be used for
each block in A.
This kernel performs the multiplication of a token by its corresponding
expert matrix as determined by `expert_ids`. The sorting of
`sorted_token_ids` by expert index and padding ensures divisibility by
BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
multiplication across different blocks processed by the same expert.
"""
if USE_DOUBLE_QUANT:
# INT4 scale
s_packnums: tl.constexpr = 2
bzp: tl.constexpr = 32
w_mask: tl.constexpr = 0x3F
pack_num: tl.constexpr = 4
real_k_size: tl.constexpr = (BLOCK_SIZE_K - 1) // pack_num + 1
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
compute_type = c_ptr.dtype.element_ty
num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
return
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
token_mask = offs_token < num_valid_tokens
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
# offs_k = tl.arange(0, BLOCK_SIZE_K)
offs_bk = tl.arange(0, real_k_size)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
offs_bk[None, :] * pack_num * stride_ak)
off_experts = tl.load(expert_ids_ptr + pid_m)
b_ptrs = b_ptr + off_experts * stride_be + (offs_bk[:, None] * stride_bk +
offs_bn[None, :] * stride_bn)
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
bs_ptrs = bs_ptr + off_experts * stride_bse + offs_bn[
None, :] * stride_bsn # group-wise, need advanced
off_set = off_experts * stride_bce + offs_bn[None, :] * stride_bcn
# load channel-wise scale & zero-point
if USE_DOUBLE_QUANT:
superbs_ptrs = superbs_ptr + off_set # channel-wise
super_bs = tl.load(superbs_ptrs) # super scale
codebs_ptrs = codebs_ptr + off_set # channel-wise
code_bs = tl.load(codebs_ptrs) # code scale
codebzp_ptrs = codebzp_ptr + off_set # channel-wise
code_bzp = tl.load(codebzp_ptrs) # code zp
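# Main reduction over K: each packed element of b is decoded into four
# sub-weights (bit shifts 9/6/3/0 with a 6-bit mask), each of which is
# dequantized and multiplied against one of four consecutive K positions of A.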
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
b = tl.load(b_ptrs)
bs = tl.load(bs_ptrs)
if USE_DOUBLE_QUANT:
s_shift_bits = (1 - k % s_packnums) * 4
bs = ((bs >> s_shift_bits) & 0xF) * super_bs
# reverse to int16
b = tl.floor((b.to(tl.float32) * code_bs + code_bzp) + 0.5).to(
tl.int16)
# dequant
b1 = (((b >> 9) & w_mask) - bzp) * bs
a = tl.load(
a_ptrs,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b1.to(a.dtype))
b1 = (((b >> 6) & w_mask) - bzp) * bs
a = tl.load(
a_ptrs + 1,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b1.to(a.dtype))
b1 = (((b >> 3) & w_mask) - bzp) * bs
a = tl.load(
a_ptrs + 2,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b1.to(a.dtype))
b = ((b & w_mask) - bzp) * bs
a = tl.load(
a_ptrs + 3,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b.to(a.dtype))
b_ptrs += real_k_size * stride_bk
a_ptrs += BLOCK_SIZE_K * stride_ak
# advance scale ptr
if USE_DOUBLE_QUANT:
bs_ptrs += stride_bsk * (k % s_packnums)
else:
bs_ptrs += stride_bsk
if MUL_ROUTED_WEIGHT:
moe_weight = tl.load(topk_weights_ptr + offs_token,
mask=token_mask,
other=0)
accumulator = accumulator * moe_weight[:, None]
accumulator = accumulator.to(compute_type)
# -----------------------------------------------------------
# Write back the block of the output
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
None, :]
c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
tl.store(c_ptrs, accumulator, mask=c_mask)
def fused_moe_wint2_impl(
hidden_states,
ffn1_quant_weight,
ffn2_quant_weight,
topk_weights,
topk_ids,
# inplace: bool = False,
ffn1_weight_scale=None,
ffn2_weight_scale=None,
ffn1_super_scales=None,
ffn2_super_scales=None,
ffn1_code_scale=None,
ffn2_code_scale=None,
ffn1_code_zp=None,
ffn2_code_zp=None,
group_size=64,
bit="wint2",
):
"""
Implementation of Fused MoE kernels on GPU.
"""
# Check constraints.
# A: [M, K]
# B: [E, K, N]
# assert hidden_states.shape[1] == ffn1_weight_scale.shape[1],
# f"Hidden size mismatch, {hidden_states.shape[1]} != {ffn1_quant_weight.shape[1]}"
assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
assert ffn1_quant_weight.is_contiguous(
), "Expert weights1 must be contiguous"
assert ffn2_quant_weight.is_contiguous(
), "Expert weights2 must be contiguous"
assert group_size > 0, "Group size must be greater than 0"
num_tokens, K = hidden_states.shape
E, _, N = ffn1_quant_weight.shape
M = num_tokens
if group_size < 0:
group_size = K // ffn1_weight_scale.shape[1]
top_k = topk_ids.shape[1]
intermediate_cache1 = paddle.empty(
[M, top_k, N],
dtype=hidden_states.dtype,
)
intermediate_cache2 = paddle.empty(
(M * top_k, N // 2),
dtype=hidden_states.dtype,
)
intermediate_cache3 = paddle.empty(
(M, top_k, K),
dtype=hidden_states.dtype,
)
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, E, BLOCK_SIZE_M)
invoke_fused_moe_kernel(
A=hidden_states,
B=ffn1_quant_weight,
C=intermediate_cache1,
B_scale=ffn1_weight_scale,
B_super_scale=ffn1_super_scales,
B_code_scale=ffn1_code_scale,
B_code_zp=ffn1_code_zp,
topk_weights=topk_weights,
topk_ids=topk_ids,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
mul_routed_weight=False,
top_k=top_k,
group_size=group_size,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1.reshape([-1, N]))
invoke_fused_moe_kernel(
A=intermediate_cache2,
B=ffn2_quant_weight,
C=intermediate_cache3,
B_scale=ffn2_weight_scale,
B_super_scale=ffn2_super_scales,
B_code_scale=ffn2_code_scale,
B_code_zp=ffn2_code_zp,
topk_weights=topk_weights,
topk_ids=topk_ids,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
mul_routed_weight=True,
top_k=1,
group_size=group_size,
)
out_hidden_states = paddle.sum(intermediate_cache3, axis=1)
return out_hidden_states
def fused_moe_wint2_triton(
hidden_states,
ffn1_quant_weight,
ffn2_quant_weight,
scores,
gate_correction_bias,
topk,
ffn1_weight_scale,
ffn2_weight_scale,
ffn1_super_scales,
ffn2_super_scales,
ffn1_code_scale,
ffn2_code_scale,
ffn1_code_zp,
ffn2_code_zp,
):
"""
Fuse MoE with WINT2 quantization scheme and Triton backend.
Args:
hidden_states: input tensor.
ffn1_quant_weight: ffn1 weight matrix for experts.
ffn2_quant_weight: ffn2 weight matrix for experts.
scores: gate scores.
gate_correction_bias: bias correction for gates.
topk: number of experts to use.
ffn1_weight_scale: scaling factor for ffn1_quant_weight.
ffn2_weight_scale: scaling factor for ffn2_quant_weight.
ffn1_super_scales: super scaling factor for ffn1_weight_scale.
ffn2_super_scales: super scaling factor for ffn2_weight_scale.
ffn1_code_scale: code scaling factor for ffn1_quant_weight.
ffn2_code_scale: code scaling factor for ffn2_quant_weight.
ffn1_code_zp: code zero point for ffn1_quant_weight.
ffn2_code_zp: code zero point for ffn2_quant_weight.
Returns:
output tensor.
"""
score = gate_correction_bias + scores
_, topk_ids = paddle.topk(score, k=topk, axis=-1)
topk_weights, _ = paddle.topk(scores, k=topk, axis=-1)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
return fused_moe_wint2_impl(
hidden_states,
ffn1_quant_weight,
ffn2_quant_weight,
topk_weights,
topk_ids,
ffn1_weight_scale,
ffn2_weight_scale,
ffn1_super_scales,
ffn2_super_scales,
ffn1_code_scale,
ffn2_code_scale,
ffn1_code_zp,
ffn2_code_zp,
bit="wint2",
)

View File

@@ -17,26 +17,34 @@ from typing import Dict, Optional
import paddle
from fastdeploy.model_executor.ops.gpu import (get_padding_offset, save_output,
save_output_dynamic,
set_stop_value_multi_ends,
set_stop_value_multi_seqs,
speculate_get_padding_offset,
step_paddle, update_inputs)
from fastdeploy.engine.config import SpeculativeConfig
from fastdeploy.model_executor.ops.gpu import (
get_padding_offset, save_output, set_stop_value_multi_ends,
speculate_clear_accept_nums, speculate_get_output_padding_offset,
speculate_get_padding_offset, speculate_get_seq_lens_output,
speculate_save_output, speculate_set_value_by_flags_and_idx,
speculate_step_paddle, speculate_step_system_cache, speculate_update_v3,
step_paddle, step_system_cache, update_inputs)
from fastdeploy.platforms import current_platform
from fastdeploy.worker.output import ModelOutputData
def pre_process(max_len: int, input_ids: paddle.Tensor,
seq_lens_this_time: int, use_speculate_method: bool,
draft_tokens: Optional[paddle.Tensor],
seq_lens_encoder: Optional[paddle.Tensor]):
def pre_process(
max_len: int,
input_ids: paddle.Tensor,
seq_lens_this_time: int,
speculative_decoding: bool,
draft_tokens: Optional[paddle.Tensor] = None,
seq_lens_encoder: Optional[paddle.Tensor] = None,
seq_lens_decoder: Optional[paddle.Tensor] = None,
):
"""
Preprocessing before embedding.
Args:
max_len: maximum padded sequence length in the batch
input_ids: input token ids of the batch
seq_lens_this_time: number of tokens to process for each sequence in this step
use_speculate_method:
speculative_decoding: whether speculative decoding is enabled
draft_tokens: draft tokens proposed by the speculative decoder
seq_lens_encoder: per-sequence encoder (prefill) lengths
seq_lens_decoder: per-sequence decoder lengths
Return:
@@ -49,7 +57,9 @@ def pre_process(max_len: int, input_ids: paddle.Tensor,
# Remove padding
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
if use_speculate_method:
output_padding_offset = None
output_cum_offsets = None
if speculative_decoding:
(
ids_remove_padding,
cum_offsets,
@@ -64,6 +74,19 @@ def pre_process(max_len: int, input_ids: paddle.Tensor,
seq_lens_this_time,
seq_lens_encoder,
)
seq_lens_output = speculate_get_seq_lens_output(
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
)
output_token_num = paddle.sum(seq_lens_output)
output_cum_offsets_tmp = paddle.cumsum(max_len - seq_lens_output)
output_padding_offset, output_cum_offsets = speculate_get_output_padding_offset(
output_cum_offsets_tmp,
output_token_num,
seq_lens_output,
max_len,
)
else:
(
ids_remove_padding,
@@ -73,16 +96,14 @@ def pre_process(max_len: int, input_ids: paddle.Tensor,
cu_seqlens_k,
) = get_padding_offset(input_ids, cum_offsets_now, token_num,
seq_lens_this_time)
return (
ids_remove_padding,
cum_offsets,
padding_offset,
cu_seqlens_q,
cu_seqlens_k,
)
return (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q,
cu_seqlens_k, output_cum_offsets, output_padding_offset)
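# Editor's sketch, not part of the original file: the offset math used in pre_process, with
# toy values. For a batch padded to max_len, cumsum(max_len - seq_lens) counts the padding
# tokens to skip when packing the batch into a padding-free token stream.
def _demo_padding_offsets():
    import paddle
    max_len = 8
    seq_lens_this_time = paddle.to_tensor([3, 5, 2])
    cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)   # [5, 8, 14]
    token_num = paddle.sum(seq_lens_this_time)                      # 10 packed tokens
    return cum_offsets_now, token_num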
def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
def post_process_normal(sampled_token_ids: paddle.Tensor,
model_output: ModelOutputData,
save_each_rank: bool = False,
skip_save_output: bool = False) -> None:
""" Post-processing steps after completing a single token generation. """
# 1. Set stop value
paddle.assign(
@@ -99,27 +120,11 @@ def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
paddle.logical_or(model_output.stop_flags, length_cond),
model_output.stop_flags,
)
if model_output.use_stop_seqs:
set_stop_value_multi_seqs(
tokens,
model_output.pre_ids,
model_output.step_idx,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.stop_seqs,
model_output.stop_seqs_len,
model_output.eos_token_id,
)
else:
set_stop_value_multi_ends(
tokens,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.eos_token_id,
model_output.next_tokens,
False,
) # multi ends
# TODO(gongshaotian): Add use_stop_seqs
set_stop_value_multi_ends(sampled_token_ids, model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.eos_token_id,
model_output.next_tokens, False) # multi ends
# 2. Update the input buffer of the model
with paddle.framework._no_check_dy2st_diff():
@@ -131,57 +136,223 @@ def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
model_output.seq_lens_decoder,
model_output.input_ids,
model_output.stop_nums,
tokens,
sampled_token_ids,
model_output.is_block_step,
)
# 3. Transmit the model's output and stop generation signal via message queue.
# In the future, we will abandon this approach.
if model_output.output_via_mq:
if model_output.msg_queue_id is None:
save_output(
tokens,
model_output.not_need_stop,
model_output.mp_rank,
model_output.use_ep,
)
else:
save_output_dynamic(
tokens,
model_output.not_need_stop,
model_output.mp_rank,
model_output.msg_queue_id,
model_output.gpt.use_ep,
)
if not skip_save_output:
save_output(
sampled_token_ids,
model_output.not_need_stop,
model_output.mp_rank,
save_each_rank, # save_each_rank
)
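# Editor's sketch, not part of the original file: post_process_normal combines two stop
# conditions -- the length limit (length_cond above) and, roughly, hitting an EOS token
# (set_stop_value_multi_ends). A plain-paddle picture of that logic with toy values:
def _demo_stop_flags():
    import paddle
    sampled = paddle.to_tensor([2, 7, 2])            # next token per sequence
    eos_token_id = 2
    step_idx = paddle.to_tensor([5, 9, 10])
    max_dec_len = paddle.to_tensor([10, 10, 10])
    stop_flags = paddle.to_tensor([False, False, False])
    length_cond = paddle.greater_equal(step_idx, max_dec_len)
    stop_flags = paddle.logical_or(stop_flags, length_cond)
    stop_flags = paddle.logical_or(stop_flags, sampled == eos_token_id)
    return stop_flags                                 # [True, False, True]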
def post_process_speculate(model_output, skip_save_output: bool = False):
""" Post-processing for speculative decoding: update accept/draft state and save accepted tokens. """
speculate_update_v3(
model_output.seq_lens_encoder,
model_output.seq_lens_decoder,
model_output.not_need_stop,
model_output.draft_tokens,
model_output.actual_draft_token_num,
model_output.accept_tokens,
model_output.accept_num,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.is_block_step,
model_output.stop_nums,
)
if not skip_save_output:
speculate_save_output(
model_output.accept_tokens,
model_output.accept_num,
model_output.not_need_stop,
model_output.mp_rank,
False,
)
speculate_clear_accept_nums(model_output.accept_num,
model_output.seq_lens_decoder)
# Update pre_ids through accept tokens
speculate_set_value_by_flags_and_idx(
model_output.pre_ids,
model_output.accept_tokens,
model_output.accept_num,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.seq_lens_encoder,
model_output.seq_lens_decoder,
model_output.step_idx,
)
def step_cuda(share_inputs: Dict[str, paddle.Tensor], block_size: int,
enc_dec_block_num: int) -> None:
def post_process(sampled_token_ids: paddle.Tensor,
model_output: ModelOutputData,
save_each_rank: bool = False,
speculative_decoding: bool = False,
skip_save_output: bool = False) -> None:
""" Post-processing steps after completing a single token generation. """
if speculative_decoding:
post_process_speculate(model_output, skip_save_output)
else:
post_process_normal(sampled_token_ids, model_output, save_each_rank,
skip_save_output)
def step_cuda(
share_inputs: Dict[str, paddle.Tensor],
block_size: int,
enc_dec_block_num: int,
speculative_config: SpeculativeConfig,
enable_prefix_caching: bool = False,
) -> None:
"""
TODO(gongshaotian): normalization name
"""
step_paddle(
share_inputs["stop_flags"],
share_inputs["seq_lens_this_time"],
share_inputs["step_seq_lens_encoder"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs["block_tables"],
share_inputs["encoder_block_lens"],
share_inputs["is_block_step"],
share_inputs["step_block_list"],
share_inputs["step_lens"],
share_inputs["recover_block_list"],
share_inputs["recover_lens"],
share_inputs["need_block_list"],
share_inputs["need_block_len"],
share_inputs["used_list_len"],
share_inputs["free_list"],
share_inputs["free_list_len"],
share_inputs["input_ids"],
share_inputs["pre_ids"],
share_inputs["step_idx"],
share_inputs["next_tokens"],
share_inputs["first_token_ids"],
block_size,
enc_dec_block_num,
)
if speculative_config.method is not None:
if enable_prefix_caching:
speculate_step_system_cache(
share_inputs['stop_flags'],
share_inputs["seq_lens_this_time"],
share_inputs['step_seq_lens_encoder'],
share_inputs['step_seq_lens_decoder'],
share_inputs['seq_lens_encoder'],
share_inputs['seq_lens_decoder'],
share_inputs["block_tables"],
share_inputs['encoder_block_lens'],
share_inputs["is_block_step"],
share_inputs['step_block_list'],
share_inputs['step_lens'],
share_inputs['recover_block_list'],
share_inputs['recover_lens'],
share_inputs['need_block_list'],
share_inputs['need_block_len'],
share_inputs['used_list_len'],
share_inputs['free_list'],
share_inputs['free_list_len'],
share_inputs['input_ids'],
share_inputs['pre_ids'],
share_inputs['step_idx'],
share_inputs['next_tokens'],
share_inputs['first_token_ids'],
share_inputs["accept_num"],
block_size,
enc_dec_block_num,
speculative_config.num_speculative_tokens,
)
else:
speculate_step_paddle(
share_inputs['stop_flags'],
share_inputs["seq_lens_this_time"],
share_inputs['step_seq_lens_encoder'],
share_inputs['seq_lens_encoder'],
share_inputs['seq_lens_decoder'],
share_inputs["block_tables"],
share_inputs['encoder_block_lens'],
share_inputs["is_block_step"],
share_inputs['step_block_list'],
share_inputs['step_lens'],
share_inputs['recover_block_list'],
share_inputs['recover_lens'],
share_inputs['need_block_list'],
share_inputs['need_block_len'],
share_inputs['used_list_len'],
share_inputs['free_list'],
share_inputs['free_list_len'],
share_inputs['input_ids'],
share_inputs['pre_ids'],
share_inputs['step_idx'],
share_inputs['next_tokens'],
share_inputs['first_token_ids'],
share_inputs["accept_num"],
block_size,
enc_dec_block_num,
speculative_config.num_speculative_tokens,
)
else:
if enable_prefix_caching:
step_system_cache(
share_inputs["stop_flags"], share_inputs["seq_lens_this_time"],
share_inputs["step_seq_lens_encoder"],
share_inputs["step_seq_lens_decoder"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"], share_inputs["block_tables"],
share_inputs["encoder_block_lens"],
share_inputs["is_block_step"], share_inputs["step_block_list"],
share_inputs["step_lens"], share_inputs["recover_block_list"],
share_inputs["recover_lens"], share_inputs["need_block_list"],
share_inputs["need_block_len"], share_inputs["used_list_len"],
share_inputs["free_list"], share_inputs["free_list_len"],
share_inputs["input_ids"], share_inputs["pre_ids"],
share_inputs["step_idx"], share_inputs["next_tokens"],
share_inputs["first_token_ids"], block_size, enc_dec_block_num)
else:
step_paddle(
share_inputs["stop_flags"],
share_inputs["seq_lens_this_time"],
share_inputs["step_seq_lens_encoder"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs["block_tables"],
share_inputs["encoder_block_lens"],
share_inputs["is_block_step"],
share_inputs["step_block_list"],
share_inputs["step_lens"],
share_inputs["recover_block_list"],
share_inputs["recover_lens"],
share_inputs["need_block_list"],
share_inputs["need_block_len"],
share_inputs["used_list_len"],
share_inputs["free_list"],
share_inputs["free_list_len"],
share_inputs["input_ids"],
share_inputs["pre_ids"],
share_inputs["step_idx"],
share_inputs["next_tokens"],
share_inputs["first_token_ids"],
block_size,
enc_dec_block_num,
)
def rebuild_padding(tmp_out: paddle.Tensor,
cum_offsets: paddle.Tensor,
seq_len_this_time: paddle.Tensor,
seq_lens_decoder: paddle.Tensor,
seq_lens_encoder: paddle.Tensor,
output_padding_offset: Optional[paddle.Tensor] = None,
max_input_length: Optional[int] = None):
"""
Args:
Returns:
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import rebuild_padding
hidden_states = rebuild_padding(
tmp_out,
cum_offsets,
seq_len_this_time,
seq_lens_decoder,
seq_lens_encoder,
output_padding_offset,
max_input_length,
)
elif current_platform.is_cpu():
from fastdeploy.model_executor.ops.cpu import rebuild_padding_cpu
hidden_states = rebuild_padding_cpu(
tmp_out,
cum_offsets,
seq_len_this_time,
seq_lens_decoder,
seq_lens_encoder,
output_padding_offset,
max_input_length,
)
else:
raise RuntimeError("Unsupported platform")
return hidden_states
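# Editor's sketch, not part of the original file: a guess at the intent of rebuild_padding
# based on its argument names -- recovering one hidden state per sequence from the packed
# [token_num, hidden] tensor. The real op is a custom GPU/CPU kernel; this is only a
# conceptual stand-in using plain paddle ops.
def _demo_rebuild_padding_concept():
    import paddle
    hidden = 4
    seq_lens = paddle.to_tensor([3, 5, 2], dtype='int64')
    packed = paddle.rand([int(seq_lens.sum()), hidden])           # packed forward output
    starts = paddle.concat([paddle.zeros([1], dtype='int64'),
                            paddle.cumsum(seq_lens)[:-1]])        # [0, 3, 8]
    last_idx = starts + seq_lens - 1                               # [2, 7, 9]
    return paddle.gather(packed, last_idx, axis=0)                 # [3, hidden]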