Sync v2.0 version of code to github repo

Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions


@@ -1,192 +0,0 @@
"""
This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py
"""
"""Expert Parallelism Load Balancer (EPLB)"""
from typing import Tuple
import numpy as np
def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np.ndarray]:
"""
Pack n weighted objects to m packs, such that each pack contains exactly n/m objects and the weights of all packs
are as balanced as possible.
Parameters:
weight: [X, n], the weight of each item
num_packs: number of packs
Returns:
pack_index: [X, n], the pack index of each item
rank_in_pack: [X, n], the rank of the item in the pack
"""
num_layers, num_groups = weight.shape
assert num_groups % num_packs == 0
groups_per_pack = num_groups // num_packs
if groups_per_pack == 1:
pack_index = np.arange(weight.shape[-1], dtype=np.int32).reshape(1, -1).repeat(num_layers, axis=0)
rank_in_pack = np.zeros_like(weight, dtype=np.int32)
return pack_index, rank_in_pack
indices = np.argsort(-weight.astype(np.float32), axis=-1)
pack_index = np.full_like(weight, fill_value=-1, dtype=np.int32)
rank_in_pack = np.full_like(pack_index, fill_value=-1)
for i in range(num_layers):
pack_weights = [0] * num_packs
pack_items = [0] * num_packs
for group in indices[i]:
pack = min((i for i in range(num_packs) if pack_items[i] < groups_per_pack),
key=pack_weights.__getitem__)
assert pack_items[pack] < groups_per_pack
pack_index[i, group] = pack
rank_in_pack[i, group] = pack_items[pack]
pack_weights[pack] += weight[i, group]
pack_items[pack] += 1
return pack_index, rank_in_pack
def replicate_experts(weight: np.ndarray, num_phy: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
Parameters:
weight: [X, num_log]
num_phy: total number of experts after replication
Returns:
phy2log: [X, num_phy], logical expert id of each physical expert
rank: [X, num_phy], the replica rank
logcnt: [X, num_log], number of replicas for each logical expert
"""
n, num_log = weight.shape
num_redundant = num_phy - num_log
assert num_redundant >= 0
phy2log = np.arange(num_phy, dtype=np.int32).reshape(1, -1).repeat(n, axis=0)
rank = np.zeros((n, num_phy), dtype=np.int32)
logcnt = np.ones((n, num_log), dtype=np.int32)
arangen = np.arange(n, dtype=np.int32)
for i in range(num_log, num_phy):
redundant_indices = np.argmax(weight / logcnt, axis=-1)
phy2log[:, i] = redundant_indices
rank[:, i] = logcnt[arangen, redundant_indices]
logcnt[arangen, redundant_indices] += 1
return phy2log, rank, logcnt
def rebalance_experts_hierarchical(weight: np.ndarray, num_physical_experts: int,
num_groups: int, num_nodes: int, num_gpus: int):
"""
Parameters:
weight: [num_moe_layers, num_logical_experts]
num_physical_experts: number of physical experts after replication
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g., NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [num_moe_layers, num_physical_experts]
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
logical_count: [num_moe_layers, num_logical_experts]
"""
num_layers, num_logical_experts = weight.shape
assert num_logical_experts % num_groups == 0
group_size = num_logical_experts // num_groups
assert num_groups % num_nodes == 0
groups_per_node = num_groups // num_nodes
assert num_gpus % num_nodes == 0
assert num_physical_experts % num_gpus == 0
phy_experts_per_gpu = num_physical_experts // num_gpus
def inverse(perm: np.ndarray) -> np.ndarray:
inv = np.empty_like(perm)
inv[np.arange(perm.shape[0])[:, None], perm] = np.arange(perm.shape[1], dtype=np.int32).reshape(1, -1)
return inv
# Step 1: pack groups to nodes
tokens_per_group = weight.reshape(num_layers, num_groups, group_size).sum(axis=-1)
group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) * group_size)[:, :, None] +
np.arange(group_size, dtype=np.int32)).reshape(num_layers, -1)
mlog2log = inverse(log2mlog)
# Step 2: construct redundant experts within nodes
tokens_per_mlog = np.take_along_axis(weight, mlog2log, axis=-1).reshape(-1, num_logical_experts // num_nodes)
phy2mlog, phyrank, mlogcnt = replicate_experts(tokens_per_mlog, num_physical_experts // num_nodes)
# Step 3: pack physical_experts to GPUs
tokens_per_phy = np.take_along_axis(tokens_per_mlog / mlogcnt, phy2mlog, axis=-1)
pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
pphy2phy = inverse(phy2pphy)
pphy2mlog = np.take_along_axis(phy2mlog, pphy2phy, axis=-1) # [num_layers * num_nodes, num_log_per_nodes]
pphy2mlog = (pphy2mlog.reshape(num_layers, num_nodes, -1) +
np.arange(0, num_logical_experts, num_logical_experts // num_nodes, dtype=np.int32)
.reshape(1, -1, 1)).reshape(num_layers, -1)
pphy2log = np.take_along_axis(mlog2log, pphy2mlog, axis=-1)
pphyrank = np.take_along_axis(phyrank, pphy2phy, axis=-1).reshape(num_layers, -1)
logcnt = np.take_along_axis(mlogcnt.reshape(num_layers, -1), log2mlog, axis=-1)
return pphy2log, pphyrank, logcnt
def rebalance_experts(weight: np.ndarray, num_replicas: int, num_groups: int,
num_nodes: int, num_gpus: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Entry point for expert-parallelism load balancer.
Parameters:
weight: [layers, num_logical_experts], the load statistics for all logical experts
num_replicas: number of physical experts, must be a multiple of `num_gpus`
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g., NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [layers, num_replicas], the expert index of each replica
logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
"""
num_layers, num_logical_experts = weight.shape
weight = weight.astype(np.float32)
if num_groups % num_nodes == 0:
# use hierarchical load-balance policy
phy2log, phyrank, logcnt = rebalance_experts_hierarchical(weight, num_replicas,
num_groups, num_nodes, num_gpus)
else:
# use global load-balance policy
phy2log, phyrank, logcnt = replicate_experts(weight, num_replicas)
maxlogcnt = logcnt.max()
log2phy = np.full((num_layers, num_logical_experts, maxlogcnt), -1, dtype=np.int32)
np.put_along_axis(log2phy.reshape(num_layers, -1)[:, :, None],
(phy2log * maxlogcnt + phyrank)[:, :, None],
np.arange(num_replicas, dtype=np.int32).reshape(1, -1).repeat(num_layers, axis=0)[:, :, None],
axis=1)
return phy2log, log2phy, logcnt
__all__ = ['rebalance_experts']
def main():
""" """
num_hidden_layers = 3
num_expert = 64
num_groups = 8
num_replicas = 64
num_nodes = 4
num_gpus = 4 * 8
model_tokens_per_expert_stats_list = np.random.randint(
low=1, high=10, size=(num_hidden_layers, num_expert))
phy2log, phyrank, logcnt = rebalance_experts(model_tokens_per_expert_stats_list,
num_replicas, num_groups, num_nodes, num_gpus)
print(phy2log)
print(phyrank)
print(logcnt)
if __name__ == '__main__':
main()
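
If balanced_packing above is importable, a quick sanity check on made-up weights (one layer, four groups, two packs) behaves like this; the values are hypothetical and only illustrate the greedy packing order:

import numpy as np

weight = np.array([[9, 1, 5, 5]], dtype=np.float32)      # one layer, four groups
pack_index, rank_in_pack = balanced_packing(weight, num_packs=2)
print(pack_index)     # [[0 0 1 1]]: the heaviest and the lightest group share pack 0
print(rank_in_pack)   # [[0 1 0 1]]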


@@ -1,155 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""
redundant expert manager
"""
import paddle
import numpy as np
from paddlenlp.utils.log import logger
from fastdeploy.model_executor.eplb.eplb import rebalance_experts
class RedundantExpertManger:
"""
RedundantExpertManger
"""
def __init__(self,
n_routed_experts,
num_hidden_layers,
redundant_experts_num,
ep_size):
self.num_expert = n_routed_experts
self.redundant_experts_num = redundant_experts_num
self.num_hidden_layers = num_hidden_layers
self.num_replicas = self.num_expert + self.redundant_experts_num
self.num_nodes = max(ep_size // 8, 1)
self.num_gpus = ep_size
self.num_groups = 1
self.export_per_rank = self.num_replicas // ep_size
assert self.num_replicas % ep_size == 0, \
f"num_replicas must be divisible by ep_size, \
but got num_replicas = {self.num_replicas}, ep_size = {ep_size}"
self.model_ep_rank_to_expert_id_list = paddle.full(
shape=[self.num_hidden_layers, self.num_expert + self.redundant_experts_num],
fill_value=-1,
dtype="int32")
self.model_expert_id_to_ep_rank_array = paddle.full(
shape=[self.num_hidden_layers, self.num_expert, self.redundant_experts_num + 1],
fill_value=-1,
dtype="int32")
self.model_expert_in_rank_num_list = paddle.full(
shape=[self.num_hidden_layers, self.num_expert],
fill_value=0,
dtype="int32")
# self.model_ep_rank_to_expert_id_list = paddle.arange(
# self.num_expert + self.redundant_experts_num,
# dtype="int32").tile([self.num_hidden_layers, 1])
# self.model_expert_id_to_ep_rank_array = paddle.arange(
# self.num_expert,
# dtype="int32").reshape([self.num_expert, 1]).tile([self.num_hidden_layers, 1, 1])
# self.model_expert_in_rank_num_list = paddle.full(
# shape=[self.num_hidden_layers, self.num_expert],
# fill_value=1,
# dtype="int32")
self.model_tokens_per_expert_stats_list = paddle.ones(
shape=[self.num_hidden_layers, self.num_expert],
dtype="int32")
rank_expert_list, \
logical_to_physical_map, \
expert_count = rebalance_experts(
self.model_tokens_per_expert_stats_list.cpu().numpy(),
self.num_replicas,
self.num_groups,
self.num_nodes,
self.num_gpus)
self.update_expert_rank_table(rank_expert_list, logical_to_physical_map, expert_count, False)
logger.info(f"moe experts table manager init successfully, ep_size {ep_size} \
num_replicas {self.num_replicas} export_per_rank {self.export_per_rank}")
def get_ep_rank_to_expert_id_list_by_layer(self, layer_id):
"""
get_ep_rank_to_expert_id_list_by_layer
"""
return self.model_ep_rank_to_expert_id_list[layer_id], \
self.model_expert_id_to_ep_rank_array[layer_id], \
self.model_expert_in_rank_num_list[layer_id], \
self.model_tokens_per_expert_stats_list[layer_id]
def get_ep_rank_to_expert_id_list(self, layer_id):
"""
get_ep_rank_to_expert_id_list
"""
return self.model_ep_rank_to_expert_id_list[layer_id], \
self.model_expert_id_to_ep_rank_array[layer_id], \
self.model_expert_in_rank_num_list[layer_id], \
self.model_tokens_per_expert_stats_list[layer_id]
def get_expert_tokens_stats(self, verbose: bool = False, clear_stat: bool = False):
"""
get_per_expert_tokens_stats
"""
try:
if verbose:
return self.model_tokens_per_expert_stats_list.cpu().numpy(), \
self.model_expert_id_to_ep_rank_array.cpu().numpy(), \
self.model_ep_rank_to_expert_id_list.cpu().numpy(), \
self.model_expert_in_rank_num_list.cpu().numpy()
return self.model_tokens_per_expert_stats_list.cpu().numpy(), None, None, None
finally:
if clear_stat:
self.model_tokens_per_expert_stats_list.zero_()
def get_expert_id_to_ep_rank_array(self):
"""
get_expert_id_to_ep_rank_array
"""
return self.model_expert_id_to_ep_rank_array.cpu().numpy()
def update_expert_rank_table(self,
rank_expert_list: np.ndarray,
logical_to_physical_map: np.ndarray,
expert_count: np.ndarray,
clear_stat: bool = True
):
"""
update_expert_rank_table
"""
#update model info
self.model_ep_rank_to_expert_id_list.copy_(paddle.to_tensor(rank_expert_list), True)
self.model_expert_id_to_ep_rank_array.fill_(-1)
self.model_expert_id_to_ep_rank_array[:, :, :logical_to_physical_map.shape[-1]] = \
paddle.to_tensor(logical_to_physical_map)
self.model_expert_in_rank_num_list.copy_(paddle.to_tensor(expert_count), True)
# reset
if clear_stat:
self.model_tokens_per_expert_stats_list.zero_()
if __name__ == '__main__':
print(RedundantExpertManger(64, 2, 8, 8).model_expert_id_to_ep_rank_array)
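
For intuition, a numpy-only sketch (made-up sizes) of the two per-layer tables the manager maintains: the physical-slot-to-expert list and the -1-padded expert-to-physical-slots array, mirroring model_ep_rank_to_expert_id_list and model_expert_id_to_ep_rank_array:

import numpy as np

num_experts, redundant = 4, 2                  # hypothetical: 4 logical experts, 2 extra replicas
phy2log = np.array([0, 1, 2, 3, 0, 2])         # 6 physical slots; experts 0 and 2 are duplicated
log2phy = np.full((num_experts, redundant + 1), -1, dtype=np.int32)
counts = np.zeros(num_experts, dtype=np.int32)
for slot, expert in enumerate(phy2log):
    log2phy[expert, counts[expert]] = slot
    counts[expert] += 1
print(log2phy)    # expert 0 -> slots [0, 4, -1], expert 2 -> slots [2, 5, -1]
print(counts)     # [2 1 2 1]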


@@ -20,7 +20,7 @@ from typing import Callable, Dict, Optional
import paddle.device.cuda.graphs as graphs
import paddle.nn.layer
from fastdeploy.config import LLMConfig
from fastdeploy.config import FDConfig
from fastdeploy.utils import get_logger
logger = get_logger("cudagrpah_piecewise_backend",
@@ -33,7 +33,7 @@ class ConcreteSizeEntry:
# Concrete batch size
runtime_bs: int
# The size is in cudagraph_capture_sizes
use_cuda_graph: bool = True
use_cudagraph: bool = True
# Has runtime-bs been captured before
captured: bool = False
@@ -56,45 +56,56 @@ class CudaGraphPiecewiseBackend:
def __init__(
self,
llm_config: LLMConfig,
fd_config: FDConfig,
runnable: Callable,
):
self.llm_config = llm_config
self.fd_config = fd_config
self.runnable = runnable
self.cuda_graph_capture_size = llm_config.graph_opt_config.cudagraph_capture_sizes
self.cudagraph_capture_sizes = fd_config.graph_opt_config.cudagraph_capture_sizes
self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size
# runtime_bs -> ConcreteSizeEntry
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
for shape in self.cuda_graph_capture_size:
for shape in self.cudagraph_capture_sizes:
self.concrete_size_entries[shape] = ConcreteSizeEntry(
runtime_bs=shape)
print("create all batch size entry")
print("[CUDA GRAPH] Created all batch size entry ")
def __call__(self, **kwargs):
# Get batch size
input_ids: paddle.Tensor = kwargs['input_ids']
batch_size = input_ids.shape[0]
entry = self.concrete_size_entries.get(batch_size)
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
batch_size = ids_remove_padding.shape[0]
padding_batch_size = self.batch_size_to_captured_size[batch_size]
# print(
# f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
# f"The padded batch size is :{padding_batch_size}"
# )
entry = self.concrete_size_entries.get(padding_batch_size)
assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
if entry.runnable is None:
entry.runnable = self.runnable
print(
f"[CUDA GRAPH] new entry lazy initialize with batch size {batch_size}"
)
# print(
# f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
# )
if not entry.use_cuda_graph:
if not entry.use_cudagraph:
return entry.runnable(**kwargs)
# Capture a new cuda graph
if entry.cuda_graph is None:
# Warmup the model
for n in range(entry.num_finished_warmup):
for n in range(entry.num_finished_warmup, self.warm_up_size):
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
print(
f"[CUDA GRAPH] warm up for batch size "
f"{batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
)
# print(
# "[CUDA GRAPH] Warm up for batch size ",
# f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
# )
# Store input addresses for debug
input_addresses = [
@@ -118,11 +129,11 @@ class CudaGraphPiecewiseBackend:
output._clear
paddle.device.synchronize()
print(
f"[CUDA GRAPH] cuda graph captured for batch size {batch_size}"
)
# print(
# f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
# )
# Replay
entry.cuda_graph.replay()
print(f"[CUDA GRAPH] cuda graph replayed for batch size {batch_size}")
# print(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
return entry.output_buffer
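
The lookup above relies on batch_size_to_captured_size to pad a runtime batch to a captured one. A sketch of one plausible way such a mapping could be built (the real table comes from graph_opt_config, so treat this as an assumption):

def build_batch_size_to_captured_size(capture_sizes, max_batch_size):
    """Map every runtime batch size to the smallest captured size that covers it."""
    capture_sizes = sorted(capture_sizes)
    mapping = {}
    for bs in range(1, max_batch_size + 1):
        padded = next((s for s in capture_sizes if s >= bs), None)
        if padded is not None:
            mapping[bs] = padded
    return mapping

print(build_batch_size_to_captured_size([1, 2, 4, 8], 8))
# {1: 1, 2: 2, 3: 4, 4: 4, 5: 8, 6: 8, 7: 8, 8: 8}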


@@ -19,14 +19,14 @@ from typing import Callable, Optional, TypeVar
import paddle.nn.layer
from fastdeploy.config import LLMConfig
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.graph_optimization.graph_optimization_backend import \
GraphOptBackend
_T = TypeVar("_T", bound=type[paddle.nn.Layer])
def support_graph_opt(cls: Optional[_T] = None) -> _T:
def support_graph_optimization(cls: Optional[_T] = None) -> _T:
"""
A decorator for wrapping models or layers with CUDA graph support.
This enables efficient kernel launch sequencing for improved GPU performance.
@@ -34,7 +34,7 @@ def support_graph_opt(cls: Optional[_T] = None) -> _T:
Example usage:
'''
@support_graph_opt
@support_graph_optimization
class ErnieBot(paddle.nn.Layer):
def __init__(**kwargs):
...
@@ -49,15 +49,13 @@ def support_graph_opt(cls: Optional[_T] = None) -> _T:
cls.__bases__ = cls.__bases__ + (GraphOptWrapper, )
origin_init = cls.__init__
def __init__(self, llm_config: LLMConfig, **kwargs):
def __init__(self, fd_config: FDConfig, **kwargs):
""" Decorator model.__init__() func """
origin_init(self, llm_config=llm_config, **kwargs)
self.use_graph_opt = (
not (llm_config.graph_opt_config.graph_opt_level == 0
and not llm_config.graph_opt_config.use_cudagraph))
origin_init(self, fd_config=fd_config, **kwargs)
self.use_graph_opt = fd_config.graph_opt_config.graph_opt_level > 0 or fd_config.graph_opt_config.use_cudagraph
if self.use_graph_opt:
GraphOptWrapper.__init__(self,
llm_config=llm_config,
fd_config=fd_config,
graph_opt_backend=None)
else:
# Not use graph optimization
@@ -81,10 +79,10 @@ class GraphOptWrapper:
def __init__(
self,
graph_opt_backend: Optional[Callable] = None,
llm_config: LLMConfig = None,
fd_config: FDConfig = None,
):
if graph_opt_backend is None:
graph_opt_backend = GraphOptBackend(self.forward, llm_config)
graph_opt_backend = GraphOptBackend(self.forward, fd_config)
self.graph_opt_backend = graph_opt_backend
@abstractmethod
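
A simplified, self-contained sketch of the decorator idea behind support_graph_optimization: wrap the class's __init__ so that, when enabled, the instance also gets a backend wrapping its forward. The names here (support_opt, Backend, use_opt) are illustrative, not the FastDeploy API:

from typing import Callable

class Backend:
    def __init__(self, runnable: Callable):
        self.runnable = runnable
    def __call__(self, **kwargs):
        return self.runnable(**kwargs)        # a real backend would capture/replay here

def support_opt(cls):
    origin_init = cls.__init__
    def __init__(self, use_opt: bool = False, **kwargs):
        origin_init(self, **kwargs)
        self.backend = Backend(self.forward) if use_opt else None
    cls.__init__ = __init__
    return cls

@support_opt
class ToyModel:
    def __init__(self):
        pass
    def forward(self, **kwargs):
        return kwargs

model = ToyModel(use_opt=True)
print(model.backend(x=1))                     # {'x': 1}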


@@ -16,7 +16,9 @@
from typing import Callable, Optional
from fastdeploy.config import LLMConfig
from paddle.jit.dy2static.utils import Backend
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend import \
CudaGraphPiecewiseBackend
@@ -24,38 +26,39 @@ from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend im
class GraphOptBackend:
""" """
llm_config: LLMConfig
fd_config: FDConfig
cudagraph_piecewise_backend: Optional[CudaGraphPiecewiseBackend] = None
def __init__(self, runnable: Callable, llm_config: LLMConfig):
def __init__(self, runnable: Callable, fd_config: FDConfig):
self.runnable = runnable
self.llm_config = llm_config
self.fd_config = fd_config
def __call__(self, **kwargs):
# 1. TODO(gongshaotian): Static graph
if self.llm_config.graph_opt_config.graph_opt_level > 0:
self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[
0]
if self.fd_config.graph_opt_config.graph_opt_level > 0:
# 1. Prepare cuda graph input buffers (contain output of subgraphs)
# 2. Convert dynamic graph to static graph
if self.llm_config.graph_opt_config.graph_opt_level > 1:
# with cinn
pass
else:
# not use cinn
pass
from paddle.jit import sot
backend = (Backend.CINN
if self.fd_config.graph_opt_config.graph_opt_level > 1
else Backend.PHI)
self.runnable = sot.symbolic_translate(self.runnable,
training=False,
backend=backend)
# 3. Split the static graph and get a list of callable obj
def __call__(self, **kwargs):
if not self.fd_config.graph_opt_config.use_cudagraph:
return self.runnable(**kwargs)
if self.cudagraph_piecewise_backend is None:
self.cudagraph_piecewise_backend = CudaGraphPiecewiseBackend(
fd_config=self.fd_config, runnable=self.runnable)
# 4. Get piecewise cuda graph backend list
assert kwargs["forward_meta"].ids_remove_padding is not None
batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0]
return self.runnable # Fake return value
# 2. Dynamic graph
if ((not kwargs["forward_meta"].step_use_cudagraph)
or (batch_size > self.max_captre_batch)):
return self.runnable(**kwargs)
else:
print(self.cudagraph_piecewise_backend is None)
if self.cudagraph_piecewise_backend is None:
self.cudagraph_piecewise_backend = CudaGraphPiecewiseBackend(
llm_config=self.llm_config, runnable=self.runnable)
# TODO(gongshaotian): handling kwargs
assert kwargs["input_ids"] is not None
return self.cudagraph_piecewise_backend.__call__(**kwargs)
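
Putting the dispatch above into one place, a sketch of the assumed control flow: run the plain callable unless CUDA graphs are enabled, the step opts in, and the batch fits under the largest captured size; otherwise lazily build and reuse the piecewise backend (the stand-in below is not the real CudaGraphPiecewiseBackend):

class ToyGraphOptBackend:
    def __init__(self, runnable, use_cudagraph, max_capture_batch):
        self.runnable = runnable
        self.use_cudagraph = use_cudagraph
        self.max_capture_batch = max_capture_batch
        self.cudagraph_backend = None

    def __call__(self, batch_size, step_use_cudagraph, **kwargs):
        if (not self.use_cudagraph or not step_use_cudagraph
                or batch_size > self.max_capture_batch):
            return self.runnable(**kwargs)
        if self.cudagraph_backend is None:
            self.cudagraph_backend = self.runnable    # stand-in for CudaGraphPiecewiseBackend
        return self.cudagraph_backend(**kwargs)

backend = ToyGraphOptBackend(lambda **kw: kw, use_cudagraph=True, max_capture_batch=4)
print(backend(batch_size=2, step_use_cudagraph=True, x=1))    # {'x': 1}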


@@ -0,0 +1,73 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# from fastdeploy.config import FDConfig
__all__ = ['get_guided_backend', 'schema_checker']
def get_guided_backend(
fd_config,
**kwargs,
):
"""
Get the guided decoding backend instance based on configuration.
Args:
fd_config (FDConfig): FastDeploy configuration object containing backend settings
**kwargs: Additional arguments passed to the backend constructor
Returns:
BaseBackend: An instance of the specified guided decoding backend
Raises:
ValueError: If the specified backend is not supported
"""
if fd_config.parallel_config.guided_decoding_backend.lower() == "xgrammar":
from fastdeploy.model_executor.guided_decoding.xgrammar_backend import \
XGrammarBackend
return XGrammarBackend(
fd_config=fd_config,
**kwargs,
)
else:
raise ValueError(
f"Get unsupported backend {fd_config.parallel_config.guided_decoding_backend},"
f" please check your configuration.")
def schema_checker(backend_name: str, **kwargs):
"""
Get the schema checker instance for the specified backend.
Args:
backend_name (str): Name of the backend (e.g. "xgrammar")
**kwargs: Additional arguments passed to the checker constructor
Returns:
BaseChecker: An instance of the specified schema checker
Raises:
ValueError: If the specified backend is not supported
"""
if backend_name.lower() == "xgrammar":
from fastdeploy.model_executor.guided_decoding.xgrammar_backend import \
XGrammarChecker
return XGrammarChecker(**kwargs)
else:
raise ValueError(
f"Get unsupported backend {backend_name}, please check your configuration."
)
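
An illustrative alternative to the if/else dispatch above: a small registry keyed by backend name with lazily registered factories. The placeholder factory below stands in for the lazy XGrammarBackend import; it is not the FastDeploy implementation:

from typing import Callable, Dict

_BACKENDS: Dict[str, Callable[..., object]] = {}

def register_backend(name: str):
    def deco(factory: Callable[..., object]):
        _BACKENDS[name.lower()] = factory
        return factory
    return deco

def get_backend(name: str, **kwargs):
    try:
        return _BACKENDS[name.lower()](**kwargs)
    except KeyError:
        raise ValueError(
            f"Get unsupported backend {name}, please check your configuration.")

@register_backend("xgrammar")
def _make_xgrammar(**kwargs):
    return ("xgrammar-backend", kwargs)       # placeholder for XGrammarBackend(...)

print(get_backend("XGrammar", vocab_size=100))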


@@ -0,0 +1,347 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from concurrent.futures import ThreadPoolExecutor
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.utils import llm_logger
class LogitsProcessorBase:
"""
Abstract base class for logits processors in guided decoding.
This class defines the interface for logits processors that modify token probabilities
during generation to enforce schema constraints. Subclasses should implement all
abstract methods to provide specific constraint enforcement logic.
Attributes:
None (all state should be managed by subclasses)
"""
def __init__(self):
pass
def fill_token_bitmask(self, token_bitmask, idx):
"""
Fill the vocabulary mask.
Args:
token_bitmask (tensor): The vocabulary mask tensor.
idx (tensor): The tensor index.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def apply_token_mask(self, logits, token_bitmask):
"""
Apply the vocabulary mask to logits.
Args:
logits (tensor): The logits tensor.
token_bitmask (tensor): The vocabulary mask tensor.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def allocate_token_bitmask(self, batch_size, vocab_size):
"""
Allocate a token bitmask for the given batch size and vocabulary size.
Args:
batch_size (int): The batch size.
vocab_size (int): The vocabulary size.
Returns:
tensor: The allocated token bitmask.
"""
raise NotImplementedError()
def accept_token(self, token):
"""
Accept tokens based on the token bitmask
Args:
token (int): The token id.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def is_terminated(self):
"""
Check if the processor has been terminated.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def reset(self):
"""
Reset the matcher state.
"""
raise NotImplementedError()
def copy(self):
"""
Create a copy of the backend instance.
Returns:
BackendBase: A copy of the backend instance.
"""
raise NotImplementedError()
class BackendBase:
"""
Abstract base class for guided decoding backends.
This class provides the core infrastructure for managing schema processors and
their caching. It handles:
- Processor creation and caching
- Tokenizer initialization
- Thread pool management for async operations
Attributes:
cache (dict): Cache of schema processors
fd_config (FDConfig): FastDeploy configuration
executor (ThreadPoolExecutor): Thread pool for async operations
max_cache_size (int): Maximum number of processors to cache
hf_tokenizer: HuggingFace tokenizer instance
"""
def __init__(self, fd_config: FDConfig):
self.cache = {}
self.fd_config = fd_config
self.executor = ThreadPoolExecutor()
self.max_cache_size = 2048
self.hf_tokenizer = self._get_tokenizer_hf()
def _create_processor(self):
"""
Create a specific logits processor instance.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _json_processor(self, schemata):
"""
Process JSON schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _regex_processor(self, schemata):
"""
Process regular expression schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _grammar_processor(self, schemata):
"""
Process grammar schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _structural_tag_processor(self, schemata):
"""
Process structural tag schemata.
Args:
schemata (str): The schemata string.
Raises:
NotImplementedError: This method should be implemented in subclasses.
"""
raise NotImplementedError()
def _unsupported_processor_type(self, key_type, schemata):
"""
Process unsupported type.
Args:
key_type (str): The key type string.
schemata (str): The schemata string.
"""
raise Exception(f"Unsupported processor type {key_type}.")
def _init_logits_processor(
self, schemata_key: tuple[str, str]) -> LogitsProcessorBase:
"""
init logits processor by type and schemata.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
Returns:
LogitsProcessorBase: Initialized logits processor instance
Raises:
ValueError: If processor type is not supported
"""
key_type, schemata = schemata_key
if key_type == "json":
return self._json_processor(schemata)
elif key_type == "regex":
return self._regex_processor(schemata)
elif key_type == "grammar":
return self._grammar_processor(schemata)
elif key_type == "structural_tag":
return self._structural_tag_processor(schemata)
else:
llm_logger.error(f"Unsupported processor type {key_type}.")
return None
def get_logits_processor(
self,
schemata_key: tuple[str, str]) -> tuple[LogitsProcessorBase, bool]:
"""
get logits processor by key from cache or create new one.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
Returns:
tuple[LogitsProcessorBase, bool]: Tuple containing:
- LogitsProcessorBase: The logits processor instance
- bool: True if processor was from cache, False if newly created
"""
value = self.cache.get(schemata_key, None)
if value:
return value.copy(), True
value = self.executor.submit(self._init_logits_processor, schemata_key)
return value, False
def _get_tokenizer_hf(self):
"""
Initialize and return a HuggingFace tokenizer instance.
This method handles special cases for Ernie models and falls back to standard
AutoTokenizer for other models. It also ensures fast tokenizer is used when possible.
Returns:
Tokenizer: Initialized HuggingFace tokenizer instance
Raises:
Exception: If tokenizer initialization fails
"""
try:
architectures = self.fd_config.model_config.architectures
if "Ernie4_5_MoeForCausalLM" not in architectures \
and "Ernie4_5_ForCausalLM" not in architectures:
from transformers import AutoTokenizer, PreTrainedTokenizerFast
tokenizer = AutoTokenizer.from_pretrained(
self.fd_config.parallel_config.model_name_or_path,
use_fast=False,
)
if not isinstance(tokenizer, PreTrainedTokenizerFast):
tokenizer = PreTrainedTokenizerFast(
__slow_tokenizer=tokenizer)
else:
from fastdeploy.model_executor.guided_decoding.ernie_tokenizer import \
ErnieBotTokenizer
vocab_file_names = [
"tokenizer.model", "spm.model", "ernie_token_100k.model"
]
for i in range(len(vocab_file_names)):
if os.path.exists(
os.path.join(
self.fd_config.parallel_config.
model_name_or_path, vocab_file_names[i])):
ErnieBotTokenizer.vocab_files_names[
"vocab_file"] = vocab_file_names[i]
break
tokenizer = ErnieBotTokenizer.from_pretrained(
self.fd_config.parallel_config.model_name_or_path)
return tokenizer
except Exception as e:
raise Exception(f"Fail to initialize hf tokenizer: {e}")
def add_cache(self, schemata_key: tuple[str, str],
processor: LogitsProcessorBase) -> None:
"""
add logits processor to cache.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
processor (LogitsProcessorBase): Logits processor instance to cache
Returns:
None: No return value
"""
if len(self.cache) >= self.max_cache_size:
return
self.cache[schemata_key] = processor.copy()
class BaseChecker:
"""
Abstract base class for schema checkers.
This class defines the interface for validating and formatting schemas
before they are used by logits processors. Subclasses should implement
schema-specific validation and formatting logic.
Attributes:
None (all state should be managed by subclasses)
"""
def __init__(self):
pass
def schema_format(self, request: Request):
"""
format schema to backend specific format.
Args:
request (Request): request object.
Returns:
request (Request): request object with formatted schema.
"""
raise NotImplementedError()
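
A runnable sketch of the cache-or-submit pattern in get_logits_processor/add_cache: a cache hit returns a copy immediately, a miss returns a Future that the caller resolves and then caches. The Processor class below is a stand-in for a real logits processor:

from concurrent.futures import ThreadPoolExecutor

class Processor:
    def __init__(self, key):
        self.key = key
    def copy(self):
        return Processor(self.key)

cache = {}
executor = ThreadPoolExecutor()

def build(key):
    return Processor(key)                     # stands in for _init_logits_processor

def get(key):
    hit = cache.get(key)
    if hit:
        return hit.copy(), True
    return executor.submit(build, key), False

value, cached = get(("json", "{}"))
if not cached:
    cache[("json", "{}")] = value.result().copy()    # resolve the Future, then cache
print(get(("json", "{}"))[1])                 # True: the second lookup hits the cache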


@@ -0,0 +1,266 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {},
"tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
class ErnieBotTokenizer(PreTrainedTokenizer):
"""
Construct a ErnieBot tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
resource_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="<pad>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
bos_token = AddedToken(bos_token,
lstrip=False, rstrip=False) if isinstance(
bos_token, str) else bos_token
eos_token = AddedToken(eos_token,
lstrip=False, rstrip=False) if isinstance(
eos_token, str) else eos_token
unk_token = AddedToken(unk_token,
lstrip=False, rstrip=False) if isinstance(
unk_token, str) else unk_token
pad_token = AddedToken(pad_token,
lstrip=False, rstrip=False) if isinstance(
pad_token, str) else pad_token
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
# for eb35 reader
self.bos_id = self.bos_token_id
self.eos_id = self.eos_token_id
self.sep_id = self.sep_token_id
self.pad_id = self.pad_token_id
self.unk_id = self.unk_token_id
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
def tokenize(self, text):
"""Returns a tokenized string."""
return self._tokenize(text)
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
def decode(self,
tokens,
skip_special_tokens=False,
clean_up_tokenization_spaces=False):
"""Returns a tokenized string."""
return self.sp_model.decode(tokens)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
build inputs with special tokens
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
def create_token_type_ids_from_sequences(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
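
A standalone sketch (no sentencepiece required) of the bos/eos list logic behind build_inputs_with_special_tokens above, with made-up token ids bos=1 and eos=2:

def build_inputs(ids_0, ids_1=None, bos=1, eos=2, add_bos=True, add_eos=False):
    bos_ids = [bos] if add_bos else []
    eos_ids = [eos] if add_eos else []
    out = bos_ids + ids_0 + eos_ids
    if ids_1 is not None:
        out += bos_ids + ids_1 + eos_ids
    return out

print(build_inputs([10, 11, 12]))                      # [1, 10, 11, 12]
print(build_inputs([10, 11], [20], add_eos=True))      # [1, 10, 11, 2, 1, 20, 2]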


@@ -0,0 +1,457 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import json
import re
from typing import Any, List, Optional
import paddle
import torch
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
BackendBase, BaseChecker, LogitsProcessorBase)
from fastdeploy.utils import llm_logger
try:
from xgrammar import (CompiledGrammar, Grammar, GrammarCompiler,
GrammarMatcher, StructuralTagItem, TokenizerInfo,
allocate_token_bitmask, apply_token_bitmask_inplace)
except Exception as e:
raise Exception(
f"import XGrammar failed, please check your environment:\n\t {e}")
class XGrammarProcessor(LogitsProcessorBase):
"""
XGrammar-specific implementation of LogitsProcessorBase.
This processor enforces grammar constraints during token generation using XGrammar.
It manages the grammar matching state and applies token masks to logits.
Attributes:
max_rollback_tokens (int): Maximum number of tokens to rollback on mismatch
vocab_size (int): Size of the vocabulary
batch_size (int): Batch size for processing
splitwise_role (str): Role for splitwise processing
compiled_grammar (CompiledGrammar): Compiled grammar rules
terminate_without_stop_token (bool): Whether to terminate without stop token
override_stop_tokens (Optional[List[int]]): Custom stop tokens
matcher (GrammarMatcher): Grammar matching engine
"""
def __init__(
self,
compiled_grammar: CompiledGrammar,
terminate_without_stop_token: bool = False,
override_stop_tokens: Optional[List[int]] = None,
vocab_size: Optional[int] = None,
batch_size: Optional[int] = None,
splitwise_role: str = "mixed",
):
super().__init__()
self.max_rollback_tokens = 200
self.vocab_size = vocab_size
self.batch_size = batch_size
self.splitwise_role = splitwise_role
self.compiled_grammar = compiled_grammar
self.terminate_without_stop_token = terminate_without_stop_token
self.override_stop_tokens = override_stop_tokens
self.matcher = GrammarMatcher(
compiled_grammar=compiled_grammar,
max_rollback_tokens=self.max_rollback_tokens,
terminate_without_stop_token=terminate_without_stop_token,
override_stop_tokens=override_stop_tokens,
)
def allocate_token_bitmask(self) -> torch.Tensor:
"""
Allocate a token bitmask tensor for grammar constraints.
Returns:
torch.Tensor: A tensor of shape (batch_size, vocab_size) initialized to 0
"""
return allocate_token_bitmask(self.batch_size, self.vocab_size)
def fill_token_bitmask(self, token_bitmask: torch.Tensor,
idx: int) -> None:
"""
Fill the token bitmask with allowed tokens for the given index.
Args:
token_bitmask (torch.Tensor): The token bitmask tensor to fill
idx (int): The batch index to fill the mask for
Returns:
None: Modifies the token_bitmask in-place
"""
self.matcher.fill_next_token_bitmask(token_bitmask, idx)
def apply_token_mask(
self,
logits: paddle.Tensor,
token_bitmask: torch.Tensor,
indices: Optional[List[int]] = None,
) -> paddle.Tensor:
"""
Apply the token mask to the logits, modifying probabilities of invalid tokens.
Args:
logits (paddle.Tensor): The logits tensor to modify
token_bitmask (torch.Tensor): The token bitmask indicating allowed tokens
indices (Optional[List[int]]): Optional list of batch indices to apply mask to
Returns:
paddle.Tensor: The modified logits tensor
"""
origin_place = logits.place
origin_dtype = logits.dtype
logits = torch.from_numpy(logits.numpy())
logits = logits.float() # cpu
apply_token_bitmask_inplace(
logits=logits,
bitmask=token_bitmask.to(logits.device, non_blocking=True),
indices=indices,
)
return paddle.to_tensor(
logits.numpy(),
dtype=origin_dtype,
place=origin_place,
)
def reset(self) -> None:
"""
Reset the grammar matcher state to initial conditions.
Returns:
None: No return value
"""
self.matcher.reset()
def accept_token(self, token: int) -> None:
"""
Validate and accept a generated token against the grammar constraints.
Args:
token (int): The token ID to validate
Raises:
AssertionError: If token is not allowed by the grammar
"""
assert self.matcher.accept_token(
token), f"Failed to accept token {token}"
def is_terminated(self) -> bool:
"""
Check if the grammar matching process has terminated.
Returns:
bool: True if matching has terminated, False otherwise
"""
return self.matcher.is_terminated()
def copy(self) -> "XGrammarProcessor":
"""
Create a deep copy of this processor instance.
Returns:
XGrammarProcessor: A new processor instance with identical state
"""
return XGrammarProcessor(
compiled_grammar=self.compiled_grammar,
terminate_without_stop_token=self.terminate_without_stop_token,
override_stop_tokens=self.override_stop_tokens,
vocab_size=self.vocab_size,
batch_size=self.batch_size,
splitwise_role=self.splitwise_role,
)
class XGrammarBackend(BackendBase):
"""
XGrammar-specific implementation of BackendBase.
This backend handles compilation of various schema types (JSON, regex, grammar)
into XGrammar processors. It manages the grammar compiler and tokenizer info.
Attributes:
vocab_size (int): Size of the vocabulary from config
batch_size (int): Maximum batch size from config
any_whitespace (bool): Whether to allow any whitespace in JSON
splitwise_role (str): Role for splitwise processing
grammar_compiler (GrammarCompiler): Grammar compilation engine
"""
def __init__(
self,
fd_config: FDConfig,
**kwargs,
):
super().__init__(fd_config=fd_config)
self.vocab_size = fd_config.model_config.vocab_size
self.batch_size = fd_config.parallel_config.max_num_seqs
self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace
self.splitwise_role = fd_config.parallel_config.splitwise_role
try:
tokenizer_info = TokenizerInfo.from_huggingface(
self.hf_tokenizer, vocab_size=self.vocab_size)
self.grammar_compiler = GrammarCompiler(
tokenizer_info=tokenizer_info)
except Exception as e:
raise Exception(f"Failed to load XGrammar tokenizer: {e}")
def _create_processor(
self,
compiled_grammar: CompiledGrammar,
terminate_without_stop_token: bool = False,
override_stop_tokens: Optional[List[int]] = None,
) -> XGrammarProcessor:
"""
Create a logits processor instance for the given compiled grammar.
Args:
compiled_grammar (CompiledGrammar): Compiled grammar rules
terminate_without_stop_token (bool): Whether to terminate without stop token
override_stop_tokens (Optional[List[int]]): Custom stop tokens to override defaults
Returns:
XGrammarProcessor: Configured grammar processor instance
"""
return XGrammarProcessor(
compiled_grammar=compiled_grammar,
terminate_without_stop_token=terminate_without_stop_token,
override_stop_tokens=override_stop_tokens,
vocab_size=self.vocab_size,
batch_size=self.batch_size,
splitwise_role=self.splitwise_role,
)
def _json_processor(self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile JSON schema into a grammar processor.
Args:
schemata (str): JSON schema string to compile
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
compiled_grammar = self.grammar_compiler.compile_json_schema(
schemata, any_whitespace=self.any_whitespace)
except Exception as e:
llm_logger.error(f"Failed to compile json schema: {e}")
return None
return self._create_processor(compiled_grammar)
def _regex_processor(self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile regex pattern into a grammar processor.
Args:
schemata (str): Regex pattern string to compile
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
compiled_grammar = self.grammar_compiler.compile_regex(schemata)
except Exception as e:
llm_logger.error(f"Failed to compile regex schema: {e}")
return None
return self._create_processor(compiled_grammar)
def _grammar_processor(self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile grammar (EBNF) into a grammar processor.
Args:
schemata (str): Grammar string in EBNF format
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
compiled_grammar = self.grammar_compiler.compile_grammar(schemata)
except Exception as e:
llm_logger.error(f"Failed to compile ebnf schema: {e}")
return None
return self._create_processor(compiled_grammar)
def _structural_tag_processor(
self, schemata: str) -> Optional[XGrammarProcessor]:
"""
Compile structural tags into a grammar processor.
Args:
schemata (str): JSON string containing structural tag definitions
Returns:
Optional[XGrammarProcessor]: Configured processor if successful, None on failure
"""
try:
structural_tag = json.loads(schemata)
tags = [
StructuralTagItem(
begin=structure["begin"],
schema=json.dumps(structure["schema"]),
end=structure["end"],
) for structure in structural_tag["structures"]
]
compiled_grammar = self.grammar_compiler.compile_structural_tag(
tags, structural_tag["triggers"])
except Exception as e:
llm_logger.error(f"Failed to compile structural tags schema: {e}")
return None
return self._create_processor(compiled_grammar)
class XGrammarChecker(BaseChecker):
"""
XGrammar-specific implementation of BaseChecker.
This validator checks and formats various schema types (JSON, regex, grammar)
for compatibility with XGrammar before processing.
Attributes:
any_whitespace (bool): Whether to allow any whitespace in JSON
"""
def __init__(self, **kwargs):
super().__init__()
self.any_whitespace = not kwargs.get("disable_any_whitespace", True)
def _unsupported_json_schema(self, schema: dict[str, Any]) -> bool:
"""
Check if JSON schema contains unsupported features.
Args:
schema (dict[str, Any]): JSON schema to validate
Returns:
bool: True if schema contains unsupported features, False otherwise
"""
def check_object(obj: dict[str, Any]) -> bool:
if not isinstance(obj, dict):
return False
if obj.get("type") in ("integer", "number") and ("multipleOf"
in obj):
return True
if obj.get("type") == "array" and any(
key in obj for key in ("uniqueItems", "contains",
"minContains", "maxContains")):
return True
if obj.get("type") == "string" and "format" in obj:
return True
if obj.get("type") == "object" and any(
key in obj
for key in ("minProperties", "maxProperties",
"propertyNames", "patternProperties")):
return True
for value in obj.values():
if isinstance(value, dict):
if check_object(value):
return True
elif isinstance(value, list):
for item in value:
if isinstance(item, dict) and check_object(item):
return True
return False
return check_object(schema)
def schema_format(self, request: Request):
"""
format schema to backend specific format.
"""
if request.guided_json:
try:
if not isinstance(request.guided_json, str):
guided_json = json.dumps(request.guided_json)
else:
guided_json = request.guided_json
Grammar.from_json_schema(guided_json,
any_whitespace=self.any_whitespace)
except RuntimeError as e:
err_msg = f"Invalid JSON format: {guided_json}, error message: {str(e)}"
return request, err_msg
if self._unsupported_json_schema(guided_json):
err_msg = f"unsupported JSON schema: {guided_json}"
return request, err_msg
request.guided_json = guided_json
return request, None
elif request.guided_grammar:
# TODO: XGrammar only supports GBNF grammars, convert Lark to GBNF
guided_grammar = request.guided_grammar
try:
Grammar.from_ebnf(guided_grammar)
except RuntimeError as e:
err_msg = f"Invalid grammar format: {guided_grammar}, error message: {str(e)}"
return request, err_msg
request.guided_grammar = guided_grammar
return request, None
elif request.guided_json_object:
request.guided_json = '{"type": "object"}'
return request, None
elif request.guided_choice:
try:
escaped_choices = (re.sub(r'(["\\])', r'\\\1', c)
for c in request.guided_choice)
guided_choice = ('root ::= ' +
' | '.join(f'"{c}"' for c in escaped_choices))
Grammar.from_ebnf(guided_choice)
except RuntimeError as e:
err_msg = f"Invalid choice format: {guided_choice}, error message: {str(e)}"
return request, err_msg
request.guided_grammar = guided_choice
return request, None
elif request.structural_tag:
try:
structural_tag = json.loads(request.structural_tag)
tags = [
StructuralTagItem(
begin=s["begin"],
schema=json.dumps(s["schema"]),
end=s["end"],
) for s in structural_tag["structures"]
]
Grammar.from_structural_tag(tags, structural_tag["triggers"])
except RuntimeError as e:
err_msg = f"Invalid structural_tag format: {structural_tag}, error message: {str(e)}"
return request, err_msg
return request, None
else:
# regex is passed through without extra formatting
return request, None
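
A standalone sketch of how schema_format above turns a guided_choice list into a GBNF root rule (escape quotes and backslashes, then join the alternatives); the choices are made up:

import re

choices = ['yes', 'no', 'say "maybe"']
escaped = (re.sub(r'(["\\])', r'\\\1', c) for c in choices)
rule = 'root ::= ' + ' | '.join(f'"{c}"' for c in escaped)
print(rule)   # root ::= "yes" | "no" | "say \"maybe\""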


@@ -15,10 +15,13 @@
"""
# cipher_token=WjI1fQOvhN # do not edit this line
from typing import Optional
import paddle
from paddle import nn
from paddle.incubate.nn.functional import fused_bias_act
from fastdeploy.config import LLMConfig
from fastdeploy.config import FDConfig
from fastdeploy.platforms import current_platform
@@ -29,28 +32,27 @@ class SiluAndMul(nn.Layer):
def __init__(
self,
llm_config: LLMConfig,
bias=None,
act_method="gelu",
dequant_scales=None,
shift=None,
smooth=None,
quant_scale=-1,
fd_config: FDConfig,
bias: paddle.Tensor = None,
act_method: str = "gelu",
dequant_scales: Optional[paddle.Tensor] = None,
shift: Optional[paddle.Tensor] = None,
smooth: Optional[paddle.Tensor] = None,
quant_scale: float = -1,
):
"""
Initialize the activation layer with optional parameters for quantization, bias,
activation method, and more.
Args:
llm_config (Any): Arguments related to inference, including quantization
fd_config (Any): Arguments related to inference, including quantization
settings.
bias (Optional[Tensor]): Optional bias term to be added to the output.
act_method (str, optional): Activation method to be applied.
Defaults to "gelu".
dequant_scales (Optional[List[float]]): Dequantization scales, used in
act_method (str): Activation method to be applied. Defaults to "gelu".
dequant_scales (Optional[Tensor]): Dequantization scales, used in
quantization scenarios.
shift (Optional[float]): Shift factor, used in quantization scenarios.
smooth (Optional[float]): Smoothing factor, used for specific activation
shift (Optional[Tensor]): Shift factor, used in quantization scenarios.
smooth (Optional[Tensor]): Smoothing factor, used for specific activation
functions.
quant_scale (float, optional): Quantization scale, used in quantization
scenarios. Defaults to -1, indicating no quantization.
@@ -61,12 +63,13 @@ class SiluAndMul(nn.Layer):
"""
super().__init__()
if current_platform.is_cuda():
if current_platform.is_cuda() or current_platform.is_xpu():
self.forward = self.forward_cuda
else:
raise NotImplementedError
self.bias = bias
act_method = act_method.lower()
if act_method == "silu":
act_method = "swiglu"
@@ -75,9 +78,9 @@ class SiluAndMul(nn.Layer):
self.shift = shift
self.smooth = smooth
self.quant_scale = quant_scale
self.quant_round_type = llm_config.quant_config.quant_round_type
self.quant_max_bound = llm_config.quant_config.quant_max_bound
self.quant_min_bound = llm_config.quant_config.quant_min_bound
self.quant_round_type = fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
self.quant_max_bound = fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
self.quant_min_bound = fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
self._dtype = self._helper.get_default_dtype()
if self._dtype == "bfloat16":
@@ -91,12 +94,12 @@ class SiluAndMul(nn.Layer):
bfloat16 as default dtype, but received {self._dtype}")
# fp8 is not support smooth quantization
if "float8" in llm_config.model_config.act_dtype:
if fd_config.quant_config and "fp8" in fd_config.quant_config.name():
self.dequant_scales = None
self.shift = None
self.smooth = None
def forward_cuda(self, x):
def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward propagation of the custom activation layer.


@@ -13,15 +13,13 @@
# limitations under the License.
from .attention import Attention
from .append_attn_backend import AppendAttentionBackend
from .attention_selecter import get_attention_backend
from .base_attention_backend import AttentionBackend
from .native_paddle_backend import PaddleNativeAttnBackend
from .attention_selecter import get_attention_backend
from .append_attn_backend import AppendAttentionBackend
from .xpu_attn_backend import XPUAttentionBackend
__all__ = [
"Attention",
"AttentionBackend",
"PaddleNativeAttnBackend",
"get_attention_backend",
"AppendAttentionBackend",
"Attention", "AttentionBackend", "PaddleNativeAttnBackend",
"get_attention_backend", "AppendAttentionBackend", "XPUAttentionBackend"
]


@@ -16,25 +16,28 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional, Tuple
import paddle
from fastdeploy.model_executor.layers.attention.ops import (
append_attention, get_block_shape_and_split_kv_block)
append_attention, get_block_shape_and_split_kv_block,
init_signal_layerwise, open_shm_and_get_meta_signal)
if TYPE_CHECKING:
from paddle._typing.dtype_like import _DTypeLiteral
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
AttentionBackend
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
from fastdeploy.worker.forward_meta import ForwardMeta
@dataclass
class AppendAttentionMetadata:
class AppendAttentionMetadata(AttentionMetadata):
"""
AppendAttentionMetadata
"""
@@ -60,40 +63,65 @@ class AppendAttentionMetadata:
decoder_block_shape_q: Optional[paddle.Tensor] = None
_fuse_kernel_compute_dtype: str = "bf16"
# pd_disaggregation
kv_signal_metadata: Optional[paddle.Tensor] = None
kv_signal_data_list: List[paddle.Tensor] = field(default_factory=list)
class AppendAttentionBackend(AttentionBackend):
"""
AppendAttentionBackend backend implementation.
"""
def __init__(
self,
model_runner: "ModelRunner",
):
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int) -> None:
"""
AppendAttentionBackend __init__
"""
super().__init__()
self.attention_metadata: AppendAttentionMetadata = None
self.block_size = model_runner.args.block_size
self.max_seq_len = model_runner.args.max_model_len
self.rope_theta = (10000.0 if model_runner.model_cfg.rope_theta is None
else model_runner.model_cfg.rope_theta)
self.rope_3d = getattr(model_runner.model_cfg, "rope_3d", False)
self.causal = getattr(model_runner.model_cfg, "causal", True)
self.speculate_method = model_runner.args.speculate_method
self.speculate_max_draft_token_num = model_runner.args.speculate_max_draft_tokens
self.num_heads = model_runner.model_cfg.num_attention_heads // model_runner.nranks
self.kv_num_heads = int(
model_runner.model_cfg.num_key_value_heads) // model_runner.nranks
self.block_size: int = fd_config.parallel_config.block_size
self.max_seq_len: int = fd_config.parallel_config.max_model_len
self.rope_theta: float = (10000.0
if fd_config.model_config.rope_theta is None
else fd_config.model_config.rope_theta)
self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)
self.causal: bool = getattr(fd_config.model_config, "causal", True)
self.speculative_method: str = fd_config.speculative_config.method
self.use_speculate: bool = self.speculative_method is not None
self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
self.rank: int = fd_config.parallel_config.tensor_parallel_rank
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
self.num_layers: int = fd_config.model_config.num_layers
self.max_partition_size: int = int(
os.getenv("FLAGS_max_partition_size", 32768))
# pd_disaggregation
self.use_pd_disaggregation: int = int(
os.getenv("FLAGS_use_pd_disaggregation", 0))
self.start_layer_index: int = fd_config.model_config.start_layer_index
self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
if fd_config.parallel_config.expert_parallel_rank is None:
fd_config.parallel_config.expert_parallel_rank = 0
device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
fd_config.parallel_config.expert_parallel_rank
if self.device_id is None:
self.device_id = device_id
else:
self.device_id = self.device_id.split(",")[device_id]
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attention metadata so that all layers in the forward pass can reuse it."""
metadata = AppendAttentionMetadata()
metadata.encoder_block_shape_q = 64
metadata.decoder_block_shape_q = 16
metadata.max_partition_size = 32768
metadata.encoder_max_partition_size = 32768
metadata.max_partition_size = self.max_partition_size
metadata.encoder_max_partition_size = self.max_seq_len
metadata._dtype = paddle.get_default_dtype()
if metadata._dtype == "bfloat16":
metadata._fuse_kernel_compute_dtype = "bf16"
@@ -128,38 +156,51 @@ class AppendAttentionBackend(AttentionBackend):
self.block_size,
self.speculate_max_draft_token_num + 1,
)
self.attention_metadata = metadata
def get_attntion_meta(self):
# pd_disaggregation
metadata.kv_signal_data_list = [None] * self.num_layers
if self.use_pd_disaggregation:
metadata.kv_signal_metadata = open_shm_and_get_meta_signal(
self.rank, int(self.device_id), self.keep_pd_step_flag)
self.attention_metadata: AttentionMetadata = metadata
forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False)
forward_meta.decoder_tile_ids_per_batch.copy_(
metadata.decoder_tile_ids_per_batch, False)
def get_attntion_meta(self) -> AttentionMetadata:
"""get_attntion_meta"""
return self.attention_metadata
@staticmethod
def get_kv_cache_shape(
self,
max_num_blocks: int,
block_size: int,
kv_num_head: int,
head_dim: int,
):
) -> Tuple[int, int, int, int]:
"""
get_kv_cache_shape
Calculate kv cache shape
"""
return (max_num_blocks, kv_num_head, block_size, head_dim)
return (max_num_blocks, self.kv_num_heads, self.block_size,
self.head_dim)
def forward_mixed(
self,
q,
k,
v,
qkv,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
qkv: paddle.Tensor,
layer: Attention,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""
forward_mixed
"""
metadata = self.attention_metadata
if self.use_pd_disaggregation:
metadata.kv_signal_data_list[
layer.layer_id] = init_signal_layerwise(
metadata.kv_signal_metadata,
layer.layer_id + self.start_layer_index)
res = append_attention(
qkv,
forward_meta.caches[2 * layer.layer_id],
@@ -176,8 +217,8 @@ class AppendAttentionBackend(AttentionBackend):
metadata.kv_batch_ids,
metadata.kv_tile_ids_per_batch,
metadata.kv_num_blocks,
metadata.decoder_batch_ids,
metadata.decoder_tile_ids_per_batch,
forward_meta.decoder_batch_ids, # from buffer
forward_meta.decoder_tile_ids_per_batch, # from buffer
metadata.decoder_num_blocks,
metadata.set_max_lengths,
metadata.max_len_kv,
@@ -193,7 +234,7 @@ class AppendAttentionBackend(AttentionBackend):
getattr(layer, "cache_v_zp", None),
layer.linear_shift,
layer.linear_smooth,
None, # kv_signal_data,
metadata.kv_signal_data_list[layer.layer_id],
metadata._fuse_kernel_compute_dtype,
getattr(layer, "cache_quant_type_str", "none"),
layer.use_neox_rotary_style,
@@ -208,7 +249,6 @@ class AppendAttentionBackend(AttentionBackend):
metadata.encoder_max_partition_size,
self.speculate_max_draft_token_num + 1,
self.causal,
self.speculate_method is not None,
self.speculative_method is not None,
)[0]
return res
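
The device-id bookkeeping in __init__ above maps a (tensor-parallel rank, expert-parallel rank) pair onto an entry of CUDA_VISIBLE_DEVICES for the pd-disaggregation signal path. A standalone sketch of that resolution (helper name is illustrative, not part of the module):

import os

def resolve_device_id(tp_rank: int, tp_degree: int, ep_rank: int) -> str:
    # Logical index across the tensor-parallel x expert-parallel grid.
    logical_id = tp_rank + tp_degree * ep_rank
    visible = os.getenv("CUDA_VISIBLE_DEVICES")
    if visible is None:
        # No masking: the logical index is the physical device id.
        return str(logical_id)
    # Otherwise pick the logical_id-th entry of the visible-device list.
    return visible.split(",")[logical_id]

# With CUDA_VISIBLE_DEVICES="4,5,6,7", tp_rank=1, tp_degree=2, ep_rank=1 -> "7".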

View File

@@ -14,12 +14,17 @@
# limitations under the License.
"""
from typing import Optional
from typing import Dict, Optional
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
from fastdeploy.worker.forward_meta import ForwardMeta
class Attention(nn.Layer):
@@ -29,26 +34,24 @@ class Attention(nn.Layer):
def __init__(
self,
llm_config,
fd_config: FDConfig,
layer_id: int,
logit_cap: float = 0.0,
v_head_dim: int = -1,
rope_type: str = "",
qkv_bias: Optional[paddle.Tensor] = None,
qkv_scale: Optional[paddle.Tensor] = None,
prefix: str = "",
out_scale: float = -1.,
linear_shift=None,
linear_smooth=None,
use_neox_rotary_style=False,
out_scale: float = -1.0,
linear_shift: paddle.Tensor = None,
linear_smooth: paddle.Tensor = None,
use_neox_rotary_style: bool = False,
) -> None:
"""
Initializes the attention layer with the given parameters.
Args:
llm_config (dict): The config of LM model.
fd_config (FDConfig): The config of the LM model.
layer_id (int): The id of current layer.
logit_cap (float, optional): The cap for logits. Defaults to 0.0.
v_head_dim (int, optional): The head dim of value. Defaults to -1.
rope_type (str, optional): The type of RoPE. Defaults to "".
qkv_bias (Optional[paddle.Tensor], optional): The bias of QKV. Defaults to None.
@@ -61,34 +64,46 @@ class Attention(nn.Layer):
ValueError: If the `v_head_dim` is less than 0.
"""
super().__init__()
self.num_heads = llm_config.model_config.num_attention_heads // llm_config.parallel_config.mp_size
self.head_dim = llm_config.model_config.hidden_size // llm_config.model_config.num_attention_heads
self.kv_num_heads = llm_config.model_config.num_key_value_heads // llm_config.parallel_config.mp_size
self.layer_id = layer_id
self.logit_cap = logit_cap
self.v_head_dim = v_head_dim if v_head_dim > 0 else self.head_dim
self.rope_type = rope_type
self.qk_head_dim = self.head_dim
self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree
self.head_dim: int = fd_config.model_config.head_dim
self.kv_num_heads: int = \
fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree
self.layer_id: int = layer_id
self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim
self.rope_type: str = rope_type
self.qk_head_dim: int = self.head_dim
self.prefix: str = prefix
# not use
self.tp_q_head_num = self.num_heads
self.tp_k_head_num = self.num_heads
self.tp_v_head_num = self.num_heads
# not use
self.scaling = 1.0 / (self.head_dim**0.5)
self.linear_shift = linear_shift
self.linear_smooth = linear_smooth
self.qkv_bias = qkv_bias
self.qkv_scale = qkv_scale
self.linear_shift: paddle.Tensor | None = linear_shift
self.linear_smooth: paddle.Tensor | None = linear_smooth
self.qkv_bias: paddle.Tensor | None = qkv_bias
self.qkv_scale: paddle.Tensor | None = qkv_scale
self._dtype = self._helper.get_default_dtype()
self.out_scale = out_scale
self.use_neox_rotary_style = use_neox_rotary_style
if llm_config.kvcache_config is not None:
self.kvcache_quant_method = llm_config.kvcache_config.kvcache_quant_config.get_quant_method(
self.out_scale: float = out_scale
self.use_neox_rotary_style: bool = use_neox_rotary_style
if fd_config.quant_config and hasattr(fd_config.quant_config,
"kv_cache_quant_type"):
self.kvcache_quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(
self)
self.kvcache_quant_method.create_weights(self)
if llm_config.quant_config is not None:
self.quant_max_bound = llm_config.quant_config.quant_max_bound
self.quant_min_bound = llm_config.quant_config.quant_min_bound
else:
self.kvcache_quant_method = None
if self.kvcache_quant_method is None:
logger.info(f"Attention is running in cache kv {self._dtype} mode")
else:
logger.info(
f"Attention is running in cache kv {self.kvcache_quant_method.cache_quant_config.quant_type} mode"
)
def load_state_dict(self, state_dict: Dict[str,
paddle.Tensor | np.ndarray]):
'''
Attention only has quant-related scales, no other parameters.
'''
if self.kvcache_quant_method is not None:
self.kvcache_quant_method.create_weights(self, state_dict)
def forward(
self,
@@ -97,7 +112,7 @@ class Attention(nn.Layer):
v: paddle.Tensor = None,
qkv: paddle.Tensor = None,
forward_meta: ForwardMeta = None,
):
) -> paddle.Tensor:
"""
The forward function of attention layer.
args:

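
As the constructor above shows, query and KV heads are split evenly across tensor-parallel ranks. A small sketch of that partitioning (hypothetical helper, not part of the layer):

from typing import Tuple

def per_rank_head_counts(num_attention_heads: int,
                         num_key_value_heads: int,
                         tensor_parallel_degree: int) -> Tuple[int, int]:
    # Both head counts must divide evenly across ranks, mirroring Attention.__init__.
    assert num_attention_heads % tensor_parallel_degree == 0
    assert num_key_value_heads % tensor_parallel_degree == 0
    return (num_attention_heads // tensor_parallel_degree,
            num_key_value_heads // tensor_parallel_degree)

# e.g. 64 query heads and 8 KV heads on 8 ranks -> (8, 1) heads per rank (GQA).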
View File

@@ -14,26 +14,20 @@
# limitations under the License.
"""
"""
attention backend selector
"""
from fastdeploy.model_executor.layers.attention.base_attention_backend import AttentionBackend
from fastdeploy.platforms import current_platform
from fastdeploy.utils import resolve_obj_from_strname
from functools import cache
from fastdeploy.platforms import _Backend
from fastdeploy.platforms import _Backend, current_platform
from fastdeploy.utils import resolve_obj_from_strname
def backend_name_to_enum(backend_name: str):
def backend_name_to_enum(backend_name: str) -> _Backend:
"""backend_name_to_enum """
assert backend_name is not None
return _Backend.__members__.get(backend_name)
@cache
def _get_attn_backend(
selected_backend
):
def _get_attn_backend(selected_backend: str) -> object:
"""_get_attn_backend """
if isinstance(selected_backend, str):
selected_backend = backend_name_to_enum(selected_backend)
@@ -46,10 +40,6 @@ def _get_attn_backend(
return resolve_obj_from_strname(attention_cls)
def get_attention_backend(
selected_backend
):
def get_attention_backend(selected_backend):
"""Selects which attention backend to use."""
return _get_attn_backend(
selected_backend
)
return _get_attn_backend(selected_backend)
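
The selector above turns a backend name into a member of the _Backend enum, resolves the backend's dotted class path, and caches the lookup. A generic sketch of that pattern, assuming resolve_obj_from_strname imports a dotted "module.ClassName" path; the enum members and paths below are placeholders, not the real platform table:

from enum import Enum
from functools import cache
from importlib import import_module

class ToyBackend(Enum):
    # Placeholder entries; the real _Backend/attention_cls mapping lives in the platform layer.
    NATIVE_ATTN = "my_pkg.backends.NativeAttnBackend"
    APPEND_ATTN = "my_pkg.backends.AppendAttnBackend"

def resolve_obj_from_dotted_name(strname: str):
    # Import "pkg.module.ClassName" and return the class object.
    module_name, obj_name = strname.rsplit(".", 1)
    return getattr(import_module(module_name), obj_name)

@cache
def get_backend_cls(backend_name: str):
    backend = ToyBackend.__members__.get(backend_name)
    assert backend is not None, f"unknown attention backend: {backend_name}"
    return resolve_obj_from_dotted_name(backend.value)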

View File

@@ -1,395 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
from paddle import nn
import fastdeploy
class Attention(nn.Layer):
"""
Attention Layer
"""
def __init__(
self,
inference_args,
prefix,
out_scale=-1,
use_neox_rotary_style=False,
rope_theta=10000.0,
rope_3d=False,
qkv_scale=None,
qkv_bias=None,
linear_shift=None,
linear_smooth=None,
):
"""
Initialize the attention layer with various parameters.
Args:
inference_args (dict or object): Contains arguments for inference, including
number of key-value heads, weight data type, activation data type, etc.
prefix (str): The name of the attention layer for identification purposes.
out_scale (float, optional): Output scale factor. Defaults to -1.
use_neox_rotary_style (bool, optional): Whether to use the NeoX rotary position
encoding style. Defaults to False.
rope_theta (float, optional): Theta value for the rope position encoding. Defaults to 10000.0.
qkv_scale (float or None, optional): Quantization scale for QKV weights.
Used only for certain quantization configurations. Defaults to None.
qkv_bias (Tensor or None, optional): Bias for QKV linear layer. Defaults to None.
linear_shift (float or None, optional): Linear shift factor used in
quantization. Used only for certain quantization configurations.
Defaults to None.
linear_smooth (float or None, optional): Linear smooth factor used in
quantization. Used only for certain quantization configurations.
Defaults to None.
"""
super().__init__()
self.inference_args = inference_args
self.nranks = inference_args.mp_size
self.kv_num_heads = inference_args.num_key_value_heads // self.nranks
self.head_dim = self.inference_args.head_dim
self.prefix = prefix
self.cache_k_scale_name = prefix + ".cachek_matmul.activation_quanter"
self.cache_v_scale_name = prefix + ".cachev_matmul.activation_quanter"
self.out_scale = out_scale
self.cache_k_zp_name = self.cache_k_scale_name + ".zero_point"
self.cache_v_zp_name = self.cache_v_scale_name + ".zero_point"
self.use_neox_rotary_style = use_neox_rotary_style
self.rope_theta = rope_theta
self.rope_3d = rope_3d
self._dtype = self._helper.get_default_dtype()
if self._dtype == "bfloat16":
self._fuse_kernel_compute_dtype = "bf16"
elif self._dtype == "float16":
self._fuse_kernel_compute_dtype = "fp16"
elif self._dtype == "float32":
self._fuse_kernel_compute_dtype = "fp32"
else:
raise ValueError(f"Just support float32, float16 and \
bfloat16 as default dtype, but received {self._dtype}")
self.cache_scale_dtype = (
self._dtype if self.inference_args.use_append_attn else "float32")
self.qkv_bias = qkv_bias
if inference_args.weight_dtype == "int8" and inference_args.act_dtype == "int8":
self.qkv_scale = qkv_scale
self.linear_shift = linear_shift
self.linear_smooth = linear_smooth
if (inference_args.cachekv_dtype == "int8"
or inference_args.cachekv_dtype == "int4"
or inference_args.cachekv_dtype == "float8_e4m3fn"):
self.set_cachekv_scale()
# qkv_bias fused with attention only when W8A8
if not (inference_args.weight_dtype == "int8"
and inference_args.act_dtype == "int8"):
self.qkv_bias = None
def set_cachekv_scale(self):
"""
Set cache key (K) and value (V) scaling factors.
This method initializes and sets the scaling factors for cache key (K) and value (V)
tensors, which are used in attention mechanisms to adjust the scale of the cache
representations. Additionally, it calculates and sets the inverse of these scaling
factors for the output cache K and V tensors.
Args:
None - This method does not take any explicit arguments as it relies on the
instance variables of the class, such as `self.kv_num_heads`,
`self.cache_k_scale_name`, `self.cache_v_scale_name`, and
`self.inference_args.cachekv_scale_dict` for its functionality.
Returns:
None - This method modifies the instance variables directly and does not return
any values.
"""
self.cache_k_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_v_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_k_out_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_v_out_scale = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise else
[self.kv_num_heads]),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
if self.cache_k_scale_name in self.inference_args.cachekv_scale_dict:
cache_k_scale = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_k_scale_name]),
self.cache_scale_dtype,
)
cache_k_out_scale = 1.0 / cache_k_scale
else:
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
cache_k_scale = paddle.zeros(self.cache_k_scale.shape,
self.cache_k_scale.dtype)
cache_k_out_scale = paddle.zeros(self.cache_k_out_scale.shape,
self.cache_k_out_scale.dtype)
else:
raise KeyError(
f"{self.cache_k_scale_name} not found in scale dict")
if self.cache_v_scale_name in self.inference_args.cachekv_scale_dict:
cache_v_scale = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_v_scale_name]),
self.cache_scale_dtype,
)
cache_v_out_scale = 1.0 / cache_v_scale
else:
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
cache_v_scale = paddle.zeros(self.cache_v_scale.shape,
self.cache_v_scale.dtype)
cache_v_out_scale = paddle.zeros(self.cache_v_out_scale.shape,
self.cache_v_out_scale.dtype)
else:
raise KeyError(
f"{self.cache_v_scale_name} not found in scale dict")
self.cache_k_scale.set_value(cache_k_scale)
self.cache_v_scale.set_value(cache_v_scale)
self.cache_k_out_scale.set_value(cache_k_out_scale)
self.cache_v_out_scale.set_value(cache_v_out_scale)
if self.inference_args.has_zero_point:
self.cache_k_zp = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
self.cache_v_zp = self.create_parameter(
shape=([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
is_bias=False,
)
if self.cache_k_zp_name in self.inference_args.cachekv_scale_dict:
cache_k_zp = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_k_zp_name]),
self.cache_scale_dtype,
)
else:
cache_k_zp = paddle.zeros(
([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
)
if self.cache_v_zp_name in self.inference_args.cachekv_scale_dict:
cache_v_zp = paddle.cast(
paddle.to_tensor(self.inference_args.cachekv_scale_dict[
self.cache_v_zp_name]),
self.cache_scale_dtype,
)
else:
cache_v_zp = paddle.zeros(
([self.kv_num_heads *
self.head_dim] if self.inference_args.is_channel_wise
else [self.kv_num_heads]),
dtype=self.cache_scale_dtype,
)
self.cache_k_zp.set_value(cache_k_zp)
self.cache_v_zp.set_value(cache_v_zp)
def forward(
self,
qkv,
input_ids,
rotary_embs,
rotary_emb_dims,
key_cache,
value_cache,
pre_key_cache,
pre_value_cache,
pre_caches_length,
attn_mask,
kv_signal_data,
**kwargs,
):
"""
Compute the attention for a single time step.
Args:
qkv (Tensor): The output of the linear transformation of query, key and value.
Shape: [batch_size, num_heads, seq_len, embed_dim // num_heads].
padding_offset (Tensor): The offset to be added to the sequence length when computing
the attention mask. Shape: [batch_size, 1].
input_ids (Tensor, optional): The input ids of the batch. Used for computing the
attention mask. Default: None. Shape: [batch_size, max_sequence_length].
rotary_embs (Tensor, optional): The rotary position embeddings. Default: None.
Shape: [num_heads, rotary_emb_dims].
rotary_emb_dims (int, optional): The dimension of the rotary position embeddings.
Default: None.
caches (List[Tensor], optional): The cache tensors used in the computation of the
attention. Default: None.
pre_caches (List[Tensor], optional): The pre-computed cache tensors used in the
computation of the attention. Default: None.
pre_caches_length (int, optional): The length of the pre-computed cache tensors.
Default: None.
attn_mask (Tensor, optional): The attention mask. Default: None.
Shape: [batch_size, max_sequence_length].
**kwargs (dict, optional): Additional keyword arguments passed along.
Returns:
Tensor: The output of the linear transformation after applying the attention.
Shape: [batch_size, embed_dim // num_heads].
Raises:
None.
"""
k_quant_scale = kwargs.get("k_quant_scale", None)
v_quant_scale = kwargs.get("v_quant_scale", None)
k_dequant_scale = kwargs.get("k_dequant_scale", None)
v_dequant_scale = kwargs.get("v_dequant_scale", None)
if not self.inference_args.use_dynamic_cachekv_quant:
k_quant_scale = getattr(self, "cache_k_scale", None)
v_quant_scale = getattr(self, "cache_v_scale", None)
k_dequant_scale = getattr(self, "cache_k_out_scale", None)
v_dequant_scale = getattr(self, "cache_v_out_scale", None)
cache_quant_type_str = self.inference_args.cache_quant_type
else:
cache_quant_type_str = "none"
if self.inference_args.use_append_attn:
out = fastdeploy.model_executor.ops.gpu.append_attention(
qkv,
key_cache,
value_cache,
kwargs.get("seq_lens_encoder", None),
kwargs.get("seq_lens_decoder", None),
kwargs.get("seq_lens_this_time", None),
kwargs.get("padding_offsets", None),
kwargs.get("cum_offsets", None),
kwargs.get("block_tables", None),
kwargs.get("encoder_batch_ids", None),
kwargs.get("encoder_tile_ids_per_batch", None),
kwargs.get("encoder_num_blocks", None),
kwargs.get("kv_batch_ids", None),
kwargs.get("kv_tile_ids_per_batch", None),
kwargs.get("kv_num_blocks", None),
kwargs.get("decoder_batch_ids", None),
kwargs.get("decoder_tile_ids_per_batch", None),
kwargs.get("decoder_num_blocks", None),
kwargs.get("set_max_lengths", None),
kwargs.get("max_len_kv", None),
rotary_embs,
attn_mask,
getattr(self, "qkv_bias", None),
getattr(self, "qkv_scale", None),
k_quant_scale,
v_quant_scale,
k_dequant_scale,
v_dequant_scale,
getattr(self, "cache_k_zp", None), # cache_k_zp
getattr(self, "cache_v_zp", None), # cache_v_zp
getattr(self, "linear_shift", None), # out_shifts
getattr(self, "linear_smooth", None), # out_smooths
kv_signal_data,
self._fuse_kernel_compute_dtype,
cache_quant_type_str, # cache_quant_type
self.use_neox_rotary_style,
self.rope_3d,
kwargs.get("max_input_length", -1),
self.inference_args.quant_max_bound,
self.inference_args.quant_min_bound,
self.out_scale, # out_linear_in_scale
kwargs.get("encoder_block_shape_q", 64),
kwargs.get("decoder_block_shape_q", 16),
kwargs.get("max_partition_size", 32768),
kwargs.get("encoder_max_partition_size", 32768),
self.inference_args.speculate_max_draft_token_num +
1, # speculate_max_draft_token_num
True, # causal
self.inference_args.speculate_method
is not None, # speculate_decoder
)[0]
else:
out = paddle.incubate.nn.functional.block_multihead_attention(
qkv,
key_cache,
value_cache,
kwargs.get("seq_lens_encoder", None),
kwargs.get("seq_lens_decoder", None),
kwargs.get("seq_lens_this_time", None),
kwargs.get("padding_offsets", None),
kwargs.get("cum_offsets", None),
kwargs.get("cu_seqlens_q", None),
kwargs.get("cu_seqlens_k", None),
kwargs.get("block_tables", None),
pre_key_cache,
pre_value_cache,
k_quant_scale,
v_quant_scale,
k_dequant_scale,
v_dequant_scale,
getattr(self, "qkv_scale", None),
getattr(self, "qkv_bias", None),
getattr(self, "linear_shift", None),
getattr(self, "linear_smooth", None),
kwargs.get("max_enc_len_this_time", None),
kwargs.get("max_dec_len_this_time", None),
rotary_embs,
attn_mask,
None, # tgt_mask
kwargs.get("max_input_length", -1),
kwargs.get("block_size", 64),
self.use_neox_rotary_style,
self.inference_args.use_dynamic_cachekv_quant,
quant_round_type=self.inference_args.quant_round_type,
quant_max_bound=self.inference_args.quant_max_bound,
quant_min_bound=self.inference_args.quant_min_bound,
out_scale=self.out_scale,
compute_dtype=self._fuse_kernel_compute_dtype,
rope_theta=self.rope_theta,
)[0]
return out
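
The removed layer above stores both a cache quantization scale and its reciprocal (cache_k_out_scale = 1 / cache_k_scale) so the kernel can quantize KV on write and dequantize on read. A numpy sketch of that round trip, assuming the usual symmetric int8 convention (the exact convention of the fused kernel is not shown here):

import numpy as np

def quantize_kv(x: np.ndarray, scale: np.ndarray) -> np.ndarray:
    # Write path: scale into the int8 range, round and clip.
    return np.clip(np.around(x * scale), -127, 127).astype("int8")

def dequantize_kv(q: np.ndarray, out_scale: np.ndarray) -> np.ndarray:
    # Read path: multiply by the precomputed reciprocal scale.
    return q.astype("float32") * out_scale

scale = np.array([127.0 / 4.0], dtype="float32")   # assume |k| <= 4 for this head
out_scale = (1.0 / scale).astype("float32")        # what set_cachekv_scale precomputes
k = np.array([[0.5, -3.2, 1.7]], dtype="float32")
assert np.allclose(dequantize_kv(quantize_kv(k, scale), out_scale), k, atol=4.0 / 127)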

View File

@@ -20,10 +20,16 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
import paddle
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.worker.forward_meta import ForwardMeta
@dataclass
class AttentionMetadata(ABC):
pass
class AttentionBackend(ABC):
@@ -42,7 +48,7 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""
Run a forward.
args:
@@ -88,7 +94,7 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""Run a forward for mix."""
raise NotImplementedError()
@@ -100,7 +106,7 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""Run a forward for decode."""
raise NotImplementedError()
@@ -112,6 +118,6 @@ class AttentionBackend(ABC):
qkv: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""Run a forward for extend."""
raise NotImplementedError()
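
The abstract backend above fixes the surface every concrete backend implements: build metadata once per step, expose it, and provide the forward_* entry points. A toy, dependency-light sketch of that contract (it deliberately does not subclass the real ABC so it stays self-contained; the echo in forward_mixed stands in for a fused kernel call):

from dataclasses import dataclass
import paddle

@dataclass
class ToyAttentionMetadata:
    # Per-step state shared by all layers (dtypes, block tables, ... in the real backends).
    compute_dtype: str = "bf16"

class ToyAttentionBackend:
    def __init__(self) -> None:
        self.attention_metadata = None

    def init_attention_metadata(self, forward_meta) -> None:
        # Built once per forward pass; every layer then reuses it.
        self.attention_metadata = ToyAttentionMetadata()

    def get_attntion_meta(self):  # name kept as in the interface above
        return self.attention_metadata

    def forward_mixed(self, q, k, v, qkv, layer, forward_meta) -> paddle.Tensor:
        # Real backends hand qkv plus the KV caches to a fused kernel here.
        return qkv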

View File

@@ -1,4 +1,3 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
@@ -16,15 +15,14 @@
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import paddle
from paddle.nn.functional import scaled_dot_product_attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import AttentionBackend
from fastdeploy.worker.model_runner import ForwardMeta, ForwardMode
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
AttentionBackend
from fastdeploy.worker.forward_meta import ForwardMeta
class PaddleNativeAttnBackend(AttentionBackend):
@@ -33,10 +31,8 @@ class PaddleNativeAttnBackend(AttentionBackend):
Which is used only for testing purpose.
"""
def __init__(self, device):
def __init__(self) -> None:
super().__init__()
self.forward_metadata = None
self.device = device
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Init the metadata for a forward pass."""
@@ -53,8 +49,8 @@ class PaddleNativeAttnBackend(AttentionBackend):
seq_lens: paddle.Tensor,
extend_prefix_lens: paddle.Tensor,
extend_seq_lens: paddle.Tensor,
causal=False,
):
causal: bool = False,
) -> paddle.Tensor:
"""Run the extend forward by using paddle native sdpa op.
Args:
@@ -111,18 +107,14 @@ class PaddleNativeAttnBackend(AttentionBackend):
per_req_value = v_cache[per_req_tokens].transpose(
[query.dim() - 2, 0])
per_req_out_redudant = (
scaled_dot_product_attention(
per_req_query_redudant.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
)
.squeeze(0)
.transpose([query.dim() - 2, 0])
)
output[start_q:end_q, :,
:] = per_req_out_redudant[prefill_seq_len_q:, :, :]
per_req_out_redudant = (scaled_dot_product_attention(
per_req_query_redudant.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
).squeeze(0).transpose([query.dim() - 2, 0]))
output[start_q:end_q, :, :] = per_req_out_redudant[
prefill_seq_len_q:, :, :]
start_q, start_kv = end_q, end_kv
return output
@@ -132,7 +124,7 @@ class PaddleNativeAttnBackend(AttentionBackend):
key: paddle.Tensor,
value: paddle.Tensor,
is_causal: bool = False,
):
) -> paddle.Tensor:
"""Paddle implementation of scaled dot-product attention."""
# query, key, value shape: [batch_size, num_heads, seq_len, head_size]
d_k = query.shape[-1]
@@ -159,8 +151,8 @@ class PaddleNativeAttnBackend(AttentionBackend):
req_to_token: paddle.Tensor,
req_pool_indices: paddle.Tensor,
seq_lens: paddle.Tensor,
causal=False,
):
causal: bool = False,
) -> paddle.Tensor:
"""Run the decode forward by using paddle native sdpa op.
Args:
@@ -203,16 +195,12 @@ class PaddleNativeAttnBackend(AttentionBackend):
per_req_value = v_cache[per_req_tokens].transpose(
[query.dim() - 2, 0])
per_req_out = (
self._scaled_dot_product_attention(
per_req_query.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
)
.squeeze(0)
.transpose([query.dim() - 2, 0])
)
per_req_out = (self._scaled_dot_product_attention(
per_req_query.unsqueeze(0),
per_req_key.unsqueeze(0),
per_req_value.unsqueeze(0),
is_causal=causal,
).squeeze(0).transpose([query.dim() - 2, 0]))
output[start_q:end_q, :, :] = per_req_out
start_q, start_kv = end_q, end_kv
@@ -220,31 +208,28 @@ class PaddleNativeAttnBackend(AttentionBackend):
def forward_extend(
self,
q,
k,
v,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
save_kv_cache=True,
):
save_kv_cache: bool = True,
) -> paddle.Tensor:
"""
Run the prefill and extend (prompt cache) attention forward by using paddle native sdpa op.
"""
if layer.qk_head_dim != layer.v_head_dim:
o = q.new_empty(
(q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
(q.shape[0], layer.num_heads * layer.v_head_dim))
else:
o = paddle.empty_like(q)
if save_kv_cache:
forward_meta.token_to_kv_pool.set_kv_buffer(
layer, forward_meta.out_cache_loc, k, v
)
layer, forward_meta.out_cache_loc, k, v)
use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
q_ = q.view([-1, layer.tp_q_head_num, layer.qk_head_dim])
o_ = o.view([-1, layer.tp_q_head_num, layer.v_head_dim])
q_ = q.view([-1, layer.num_heads, layer.qk_head_dim])
o_ = o.view([-1, layer.num_heads, layer.v_head_dim])
causal = True
@@ -264,31 +249,29 @@ class PaddleNativeAttnBackend(AttentionBackend):
def forward_decode(
self,
q,
k,
v,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
layer: paddle.nn.Layer,
forward_meta: ForwardMeta,
):
) -> paddle.Tensor:
"""
Run the decoding attention forward by using paddle native sdpa op.
"""
q = q.reshape([-1, layer.tp_q_head_num * layer.qk_head_dim])
q = q.reshape([-1, layer.num_heads * layer.qk_head_dim])
if layer.qk_head_dim != layer.v_head_dim:
o = q.new_empty(
(q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
(q.shape[0], layer.num_heads * layer.v_head_dim))
else:
o = paddle.empty_like(q)
forward_meta.token_to_kv_pool.set_kv_buffer(
layer, forward_meta.out_cache_loc, k, v
)
forward_meta.token_to_kv_pool.set_kv_buffer(layer,
forward_meta.out_cache_loc,
k, v)
use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
q_ = q.view([-1, layer.tp_q_head_num, layer.qk_head_dim])
o_ = o.view([-1, layer.tp_q_head_num, layer.v_head_dim])
q_ = q.view([-1, layer.num_heads, layer.qk_head_dim])
o_ = o.view([-1, layer.num_heads, layer.v_head_dim])
self._run_sdpa_forward_decode(
q_,

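
The native backend's _scaled_dot_product_attention reduces to the textbook formulation on [batch, heads, seq, head_size] tensors. A compact paddle sketch of that computation (a square causal mask is assumed; the backend methods above additionally handle KV-cache gathering and GQA):

import math
import paddle
import paddle.nn.functional as F

def naive_sdpa(query: paddle.Tensor, key: paddle.Tensor, value: paddle.Tensor,
               is_causal: bool = False) -> paddle.Tensor:
    # query/key/value: [batch_size, num_heads, seq_len, head_size]
    d_k = query.shape[-1]
    scores = paddle.matmul(query, key, transpose_y=True) / math.sqrt(d_k)
    if is_causal:
        seq_len = query.shape[-2]  # assumes seq_q == seq_k
        mask = paddle.triu(paddle.full([seq_len, seq_len], float("-inf")), diagonal=1)
        scores = scores + mask
    return paddle.matmul(F.softmax(scores, axis=-1), value)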
View File

@@ -14,10 +14,13 @@
# limitations under the License.
"""
from .get_block_shape_and_split_kv_block import get_block_shape_and_split_kv_block
from .append_attention import append_attention
from .get_block_shape_and_split_kv_block import \
get_block_shape_and_split_kv_block
from .init_signal_layerwise import init_signal_layerwise
from .open_shm_and_get_meta_signal import open_shm_and_get_meta_signal
__all__ = [
"get_block_shape_and_split_kv_block",
"append_attention"
]
"get_block_shape_and_split_kv_block", "append_attention",
"open_shm_and_get_meta_signal", "init_signal_layerwise"
]

View File

@@ -14,10 +14,16 @@
# limitations under the License.
"""
import paddle
from typing import Optional
import paddle
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
append_attention as append_attention_gpu
def append_attention(
qkv: paddle.Tensor,
@@ -68,14 +74,12 @@ def append_attention(
speculate_max_draft_token_num: int = 1,
causal: bool = True,
speculate_decoder: bool = False,
):
) -> paddle.Tensor:
"""
Args:
Returns:
append_attention
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import append_attention
out = append_attention(
out = append_attention_gpu(
qkv,
key_cache,
value_cache,

View File

@@ -0,0 +1,34 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from fastdeploy.platforms import current_platform
def init_signal_layerwise(
kv_signal_metadata: paddle.Tensor,
layer_id: int = 0,
) -> paddle.Tensor:
"""
init_signal_layerwise
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import init_signal_layerwise
out = init_signal_layerwise(kv_signal_metadata, layer_id)
return out
else:
raise NotImplementedError()

View File

@@ -0,0 +1,35 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from fastdeploy.platforms import current_platform
def open_shm_and_get_meta_signal(
rank: int = 0,
device_id: int = 0,
keep_pd_step_flag: bool = False,
) -> paddle.Tensor:
"""
open_shm_and_get_meta_signal
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
open_shm_and_get_meta_signal
out = open_shm_and_get_meta_signal(rank, device_id, keep_pd_step_flag)
return out
else:
raise NotImplementedError()

View File

@@ -0,0 +1,188 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional, Tuple
import paddle
from fastdeploy.model_executor.layers.attention.ops import (
init_signal_layerwise, open_shm_and_get_meta_signal)
if TYPE_CHECKING:
from paddle._typing.dtype_like import _DTypeLiteral
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
from fastdeploy.worker.forward_meta import ForwardMeta
@dataclass
class XPUAttentionMetadata(AttentionMetadata):
"""
XPUAttentionMetadata
"""
max_len_kv: paddle.Tensor = None
set_max_lengths: int = -1
encoder_batch_ids: paddle.Tensor = None
encoder_tile_ids_per_batch: paddle.Tensor = None
encoder_num_blocks: paddle.Tensor = None
kv_batch_ids: paddle.Tensor = None
kv_tile_ids_per_batch: paddle.Tensor = None
kv_num_blocks: paddle.Tensor = None
decoder_batch_ids: paddle.Tensor = None
decoder_tile_ids_per_batch: paddle.Tensor = None
decoder_num_blocks: paddle.Tensor = None
_dtype: _DTypeLiteral = paddle.bfloat16
encoder_max_partition_size: int = 32768
max_partition_size: int = 32768
block_tables: Optional[paddle.Tensor] = None
rotary_embs: Optional[paddle.Tensor] = None
attn_mask: Optional[paddle.Tensor] = None
encoder_block_shape_q: Optional[paddle.Tensor] = None
decoder_block_shape_q: Optional[paddle.Tensor] = None
_fuse_kernel_compute_dtype: str = "bf16"
# pd_disaggregation
kv_signal_metadata: Optional[paddle.Tensor] = None
kv_signal_data_list: List[paddle.Tensor] = field(default_factory=list)
class XPUAttentionBackend(AttentionBackend):
"""
XPUAttentionBackend backend implementation.
"""
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int):
"""
XPUAttentionBackend __init__
"""
super().__init__()
self.attention_metadata: XPUAttentionMetadata = None
# TODO(gongshaotian): Use fd_config parameters in the correct location
self.block_size: int = fd_config.parallel_config.block_size
self.max_seq_len: int = fd_config.parallel_config.max_model_len
self.rope_theta: float = (10000.0
if fd_config.model_config.rope_theta is None
else fd_config.model_config.rope_theta)
self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)
self.causal: bool = getattr(fd_config.model_config, "causal", True)
# self.speculate_method = fd_config.parallel_config.speculate_method
# self.use_speculate = self.speculate_method is not None
# self.speculate_max_draft_token_num = fd_config.parallel_config.speculate_max_draft_tokens
self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
self.rank: int = fd_config.parallel_config.tensor_parallel_rank
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = head_dim
self.num_layers: int = fd_config.model_config.num_layers
# pd_disaggregation
self.use_pd_disaggregation: int = int(
os.getenv("FLAGS_use_pd_disaggregation", 0))
self.start_layer_index: int = fd_config.model_config.start_layer_index
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attention metadata so that all layers in the forward pass can reuse it."""
metadata = XPUAttentionMetadata()
metadata.encoder_block_shape_q = 64
metadata.decoder_block_shape_q = 16
metadata.max_partition_size = 32768
metadata.encoder_max_partition_size = 32768
metadata._dtype = paddle.get_default_dtype()
if metadata._dtype == "bfloat16":
metadata._fuse_kernel_compute_dtype = "bf16"
elif metadata._dtype == "float16":
metadata._fuse_kernel_compute_dtype = "fp16"
elif metadata._dtype == "float32":
metadata._fuse_kernel_compute_dtype = "fp32"
metadata.block_tables = forward_meta.block_tables
metadata.rotary_embs = forward_meta.rotary_embs
metadata.attn_mask = forward_meta.attn_mask
metadata.pre_caches_length = forward_meta.pre_caches_length
# pd_disaggregation
metadata.kv_signal_data_list = [None] * self.num_layers
if self.use_pd_disaggregation:
metadata.kv_signal_metadata = open_shm_and_get_meta_signal(
self.rank, self.keep_pd_step_flag)
self.attention_metadata: AttentionMetadata = metadata
def get_attntion_meta(self) -> AttentionMetadata:
"""get_attntion_meta"""
return self.attention_metadata
def get_kv_cache_shape(
self,
max_num_blocks: int,
) -> Tuple[int, int, int, int]:
"""
Calculate kv cache shape
"""
return (max_num_blocks, self.kv_num_heads, self.block_size,
self.head_dim)
def forward_mixed(
self,
q: paddle.Tensor,
k: paddle.Tensor,
v: paddle.Tensor,
qkv: paddle.Tensor,
layer: Attention,
forward_meta: ForwardMeta,
) -> paddle.Tensor:
"""
forward_mixed
"""
metadata = self.attention_metadata
if self.use_pd_disaggregation:
metadata.kv_signal_data_list[
layer.layer_id] = init_signal_layerwise(
metadata.kv_signal_metadata,
layer.layer_id + self.start_layer_index)
k_quant_scale = getattr(layer, "cache_k_scale", None)
v_quant_scale = getattr(layer, "cache_v_scale", None)
from fastdeploy.model_executor.ops.xpu import block_attn
res = block_attn(
qkv,
forward_meta.caches[2 * layer.layer_id],
forward_meta.caches[2 * layer.layer_id + 1],
forward_meta.cum_offsets,
metadata.rotary_embs,
metadata.block_tables,
None,
k_quant_scale,
v_quant_scale,
forward_meta.enc_batch,
forward_meta.dec_batch,
forward_meta.total_enc_len,
forward_meta.encoder_seq_lod_cpu,
forward_meta.encoder_batch_map_cpu,
forward_meta.decoder_context_len_cpu,
forward_meta.decoder_batch_map_cpu,
)
return res
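
get_kv_cache_shape above fixes the paged layout (max_num_blocks, kv_num_heads, block_size, head_dim), and forward_mixed indexes the cache list as caches[2 * layer_id] for keys and caches[2 * layer_id + 1] for values. A hypothetical allocation sketch that matches that layout:

import paddle

def allocate_paged_kv_caches(num_layers: int, max_num_blocks: int, kv_num_heads: int,
                             block_size: int, head_dim: int,
                             dtype: str = "bfloat16") -> list:
    # One key cache and one value cache per layer, in the order forward_mixed expects.
    shape = [max_num_blocks, kv_num_heads, block_size, head_dim]
    caches = []
    for _ in range(num_layers):
        caches.append(paddle.zeros(shape, dtype=dtype))  # caches[2 * layer_id]
        caches.append(paddle.zeros(shape, dtype=dtype))  # caches[2 * layer_id + 1]
    return caches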

View File

@@ -16,6 +16,6 @@
xpu backend methods
"""
from .quantization.weight_only import XPUWeightOnlyLinearMethod
from .quantization.weight_only import XPUWeightOnlyLinearMethod, XPUWeightOnlyMoEMethod
__all__ = ['XPUWeightOnlyLinearMethod']
__all__ = ['XPUWeightOnlyLinearMethod', 'XPUWeightOnlyMoEMethod']

View File

@@ -1,5 +1,4 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,7 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
""""
Expert Parallelism Load Balancer (EPLB)
"""
xpu quantization methods
"""

View File

@@ -13,15 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
from typing import Optional
from typing import Dict
import paddle
from paddle import nn
from .utils import xpu_quant_weight
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig, WeightOnlyLinearMethod
class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
"""
@@ -34,12 +37,133 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
) -> None:
super().__init__(quant_config)
def process_loaded_weights(self, layer, weight) -> None:
def create_weights(self, layer: nn.Layer) -> None:
"""
Create weights for linear layer on XPU
"""
layer.linear_weight_shape.reverse()
if self.quant_config.name() == "weight_only_int4":
layer.linear_weight_shape[0] //= 2
layer.weight_dtype = "int8"
linear_weight_scale_shape = [layer.embed_dim]
if hasattr(layer, "linear_weight_shape"):
if isinstance(layer.linear_weight_shape, list):
layer_weight_shape = layer.linear_weight_shape
linear_weight_scale_shape = layer_weight_shape[:1]
layer.linear_weight_scale = layer.create_parameter(
shape=linear_weight_scale_shape,
dtype="float32",
is_bias=False,
)
def process_loaded_weights(self, layer: nn.Layer,
weight: paddle.Tensor) -> None:
"""
loaded_weights using xpu special quantization
"""
quanted_weight_tensor, weight_scale_tensor = xpu_quant_weight(
weight.cpu().numpy())
layer.linear_weight.set_value(quanted_weight_tensor)
layer.linear_weight_scale.set_value(
weight_scale_tensor.astype(paddle.get_default_dtype()))
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(
weight, self.quant_config.algo, -1, -1)
layer.linear_weight.set_value(
paddle.transpose(quanted_weight_tensor, [1, 0]))
layer.linear_weight_scale.set_value(weight_scale_tensor)
class XPUWeightOnlyMoEMethod(QuantMethodBase):
"""
XPU Fused MoE Method.
"""
def __init__(
self,
quant_config: WeightOnlyConfig,
) -> None:
super().__init__()
self.quant_config = quant_config
self.moe_quant_type = self.quant_config.algo
def create_weights(self, layer: nn.Layer, state_dict: Dict[str,
paddle.Tensor]):
"""
Create and quantize the MoE FFN weights on XPU.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
assert len(ffn1_weights) == layer.num_local_experts
assert len(ffn2_weights) == layer.num_local_experts
assert ffn1_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = added_weight_attrs[idx]
scale_name = added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(layer.num_local_experts):
quant_weight, scale = weight_quantize_xpu(
weight_tensor[i], self.moe_quant_type, -1,
-1) # weight is [k,n]
weight_list.append(quant_weight.transpose(
[1, 0])) # transpose weight to [n,k]
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
setattr(
layer, weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
setattr(
layer, scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
getattr(layer, scale_name).set_value(quanted_weight_scale)
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
XPU compute Fused MoE.
"""
from fastdeploy.model_executor.ops.xpu import xpu_moe_layer
fused_moe_out = xpu_moe_layer(
x,
layer.gate_weight.transpose([1, 0]),
layer.gate_correction_bias,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None, # ffn1 bias
None, # ffn2 bias
(layer.moe_ffn1_weight_scale
if hasattr(layer, "moe_ffn1_weight_scale") else None),
(layer.moe_ffn2_weight_scale
if hasattr(layer, "moe_ffn2_weight_scale") else None),
(layer.moe_ffn2_in_scale
if hasattr(layer, "moe_ffn2_in_scale") else None),
self.moe_quant_type,
layer.top_k,
False, # moe group, used in deepseek
)
if layer.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out
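
The MoE weight path above quantizes each expert's [k, n] FFN weight, stores it transposed as [n, k], and stacks the per-expert results. A numpy sketch of that packing; fake_weight_quantize is only a stand-in for weight_quantize_xpu (symmetric per-output-channel int8 assumed, which may differ from the real kernel's packing):

import numpy as np

def fake_weight_quantize(weight: np.ndarray):
    # Stand-in quantizer: one scale per output column of a [k, n] weight.
    scale = np.abs(weight).max(axis=0) / 127.0
    quant = np.clip(np.around(weight / scale), -127, 127).astype("int8")
    return quant, scale.astype("float32")

def quantize_and_stack_experts(expert_weights):
    quants, scales = [], []
    for w in expert_weights:                 # each w is [k, n]
        q, s = fake_weight_quantize(w)
        quants.append(q.T)                   # store as [n, k], matching the transpose above
        scales.append(s)
    # Shapes: [num_experts, n, k] and [num_experts, n].
    return np.stack(quants, axis=0), np.stack(scales, axis=0)

experts = [np.random.randn(16, 32).astype("float32") for _ in range(4)]
packed_weight, packed_scale = quantize_and_stack_experts(experts)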

View File

@@ -16,11 +16,13 @@
!! This file will be deleted after the platform is fully functional
"""
from typing import Tuple
import numpy as np
import paddle
def xpu_clip_and_round(x):
def xpu_clip_and_round(x: np.ndarray) -> np.ndarray:
"""
Clip and round the input array to the range [-127, 127] and convert to int8.
@@ -33,7 +35,8 @@ def xpu_clip_and_round(x):
return np.clip(np.around(x), -127, 127).astype("int8")
def xpu_quant_qkv_weight(weight_np):
def xpu_quant_qkv_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the query, key, and value weights for the Transformer model.
@@ -61,7 +64,8 @@ def xpu_quant_qkv_weight(weight_np):
return quanted_weight, weight_scales
def xpu_quant_weight(weight_np):
def xpu_quant_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the weight tensor for XPU devices.

View File

@@ -28,7 +28,7 @@ class VocabParallelEmbedding(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
num_embeddings,
embedding_dim=768,
params_dtype="bfloat16",
@@ -38,7 +38,7 @@ class VocabParallelEmbedding(nn.Layer):
Initialize the VocabParallelEmbedding layer for the model.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
num_embeddings : vocabulary size.
@@ -48,21 +48,21 @@ class VocabParallelEmbedding(nn.Layer):
you can give it any name you like.
"""
super().__init__()
self.fd_config = fd_config
hcg = fleet.get_hybrid_communicate_group()
self.mp_rank = hcg.get_model_parallel_rank()
self.column_cut = llm_config.parallel_config.column_cut
self.column_cut = fd_config.parallel_config.column_cut
self.world_size = hcg.get_model_parallel_world_size()
self.ring_id = hcg.get_model_parallel_group().id
self.use_rope = llm_config.model_config.use_rope
self.rope_head_dim = llm_config.model_config.rope_head_dim
self.use_ep = llm_config.parallel_config.use_ep
self.hidden_dropout_prob = llm_config.model_config.hidden_dropout_prob
self.initializer_range = llm_config.model_config.initializer_range
self.weight_sharing = llm_config.model_config.weight_sharing
self.sequence_parallel = llm_config.parallel_config.sequence_parallel
self.weight_sharing_add_bias = llm_config.model_config.weight_sharing_add_bias
self.max_position_embeddings = llm_config.model_config.max_position_embeddings
self.freeze_embedding = llm_config.model_config.freeze_embedding
self.use_rope = fd_config.model_config.use_rope
self.rope_head_dim = fd_config.model_config.rope_head_dim
self.use_ep = fd_config.parallel_config.use_ep
self.hidden_dropout_prob = fd_config.model_config.hidden_dropout_prob
self.initializer_range = fd_config.model_config.initializer_range
self.sequence_parallel = fd_config.parallel_config.sequence_parallel
self.max_position_embeddings = fd_config.model_config.max_position_embeddings
self.freeze_embedding = fd_config.model_config.freeze_embedding
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
if self.use_ep:
self.word_embeddings = nn.Embedding(
@@ -78,8 +78,7 @@ class VocabParallelEmbedding(nn.Layer):
get_model_parallel_group(),
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Normal(
mean=0.0, std=self.initializer_range),
),
mean=0.0, std=self.initializer_range), ),
)
else:
# column cut embedding
@@ -87,6 +86,7 @@ class VocabParallelEmbedding(nn.Layer):
num_embeddings,
embedding_dim // self.world_size,
)
self.word_embeddings.weight.is_distributed = True
self.word_embeddings.weight.split_axis = 1
@@ -94,34 +94,12 @@ class VocabParallelEmbedding(nn.Layer):
self.position_embeddings = nn.Embedding(
self.max_position_embeddings,
embedding_dim,
weight_attr=paddle.ParamAttr(
initializer=nn.initializer.Normal(
mean=0.0, std=self.initializer_range),
),
weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(
mean=0.0, std=self.initializer_range), ),
)
self.prefix = prefix
if self.weight_sharing and self.weight_sharing_add_bias:
assert num_embeddings % self.world_size == 0
if self.use_ep:
self.bias = self.create_parameter(
shape=[num_embeddings],
dtype=paddle.get_default_dtype(),
attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Constant(value=0.0),
),
is_bias=True,
)
else:
self.bias = self.create_parameter(
shape=[num_embeddings // self.world_size],
dtype=paddle.get_default_dtype(),
attr=mask_lm_out_bias_attr,
is_bias=True,
)
self.bias.is_distributed = True
if self.freeze_embedding:
self.word_embeddings.weight.learning_rate = 0.0
if not self.use_rope:
@@ -138,9 +116,14 @@ class VocabParallelEmbedding(nn.Layer):
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
self.word_embeddings.weight.set_value(
get_tensor(state_dict.pop(self.prefix + ".weight")).astype(
paddle.get_default_dtype()))
if self.tie_word_embeddings:
self.word_embeddings.weight.set_value(
get_tensor(state_dict[self.prefix + ".weight"]).astype(
paddle.get_default_dtype()))
else:
self.word_embeddings.weight.set_value(
get_tensor(state_dict.pop(self.prefix + ".weight")).astype(
paddle.get_default_dtype()))
def forward(self, ids_remove_padding=None):
"""

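
The load_state_dict change above keeps the embedding weight in state_dict when tie_word_embeddings is set, so the output projection can later reuse the same tensor; otherwise the weight is popped to release memory. A condensed sketch of that branch (get_tensor is replaced by paddle.to_tensor here to keep the sketch self-contained):

import paddle

def load_embedding_weight(word_embeddings, state_dict: dict, prefix: str,
                          tie_word_embeddings: bool) -> None:
    key = prefix + ".weight"
    # Tied embeddings: leave the tensor in state_dict for the output head to consume later.
    raw = state_dict[key] if tie_word_embeddings else state_dict.pop(key)
    weight = paddle.to_tensor(raw).astype(paddle.get_default_dtype())
    word_embeddings.weight.set_value(weight)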
View File

@@ -14,7 +14,7 @@
# limitations under the License.
"""
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
import paddle
import paddle.nn.functional as F

View File

@@ -14,29 +14,25 @@
# limitations under the License.
"""
import os
import fastdeploy
from paddlenlp.utils.log import logger
import paddle
from paddle import nn
from fastdeploy.config import FDConfig
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.platforms import current_platform
from .utils import _set_var_distributed, divide, get_tensor
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
class LinearBase(nn.Layer):
"""
LinearBase Layer
LinearBase Layer.
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -48,31 +44,26 @@ class LinearBase(nn.Layer):
Initializes a linear layer and provides additional parameters required for inference and quantization.
Args:
llm_config (LLMConfig): Inference-related parameters containing attributes such as
weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int, optional): Number of input features. Defaults to None.
output_size (int, optional): Number of output features. Defaults to None.
weight_key (Any, optional): Key for weights. Defaults to None.
bias_key (Any, optional): Key for biases. Defaults to None.
skip_quant (bool, optional): Whether to skip quantization. Defaults to False.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
Raises:
NotImplementedError: Raised if the current platform is not a CUDA platform.
"""
super().__init__()
if current_platform.is_cuda():
if current_platform.is_cuda() or current_platform.is_xpu():
self.forward = self.forward_cuda
else:
raise NotImplementedError
self.llm_config = llm_config
self.fd_config = fd_config
self.skip_quant = skip_quant
self.use_smooth_quant = llm_config.model_config.use_smooth_quant
self.weight_dtype = llm_config.model_config.weight_dtype
self.act_dtype = llm_config.model_config.act_dtype
self.input_size = input_size
self.output_size = output_size
self.with_bias = with_bias
@@ -86,61 +77,27 @@ class LinearBase(nn.Layer):
self.out_scale_key = f"{prefix}.out_scale"
self._dtype = self._helper.get_default_dtype()
if llm_config.quant_config:
self.quant_method = llm_config.quant_config.get_quant_method(self)
self.use_offline_quant = llm_config.tmp_config.use_offline_quant
def is_y_transposed(self):
"""
Returns whether the y tensor should be transposed for inference.
Args:
None.
Returns:
bool, whether the y tensor should be transposed for inference.
"""
if self.weight_dtype == "int4":
return True
if self.weight_dtype == "int8":
return True
if "float8" in self.weight_dtype:
return True
# bf16/fp16/fp32 y is not transposed
return False
def init_weight_shape(self, trans=False):
"""
Initialize the weight shape for the first feedforward network layer.
Args:
trans (bool, optional): Whether to transpose the weight shape.
Defaults to False. If True, the shape will be reversed.
Returns:
None.
"""
self.weight_dtype = self._dtype
self.linear_weight_shape = [
self.input_size,
self.output_size,
]
if trans:
self.linear_weight_shape.reverse()
if self.use_smooth_quant:
self.linear_shift_shape = [self.output_size]
self.linear_smooth_shape = [self.output_size]
if self.weight_dtype == "int4":
self.linear_weight_shape[0] //= 2
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
if fd_config.model_config.is_quantized:
self.weight_key = f"{prefix}.quant_weight"
self.weight_scale_key = f"{prefix}.weight_scale"
self.act_scale_key = f"{prefix}.activation_scale"
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
if self.skip_quant:
self.weight_dtype = self._dtype
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
@@ -156,117 +113,57 @@ class LinearBase(nn.Layer):
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
def get_weight_create_dtype(self):
def load_prequant_weight(self, state_dict: dict):
"""
Get the data type for creating weights based on quantization settings.
Load the prequantized weight from the state dictionary.
Args:
self (object): The instance of the class where this method is defined.
Returns:
str: The data type for creating weights. It depends on the quantization settings:
- If `self.skip_quant` is True, returns the original data type `self._dtype`.
- If `self.weight_dtype` is "int4", returns "int8" to ensure compatibility or optimization.
- Otherwise, returns the specified weight data type `self.weight_dtype`.
state_dict (dict): A dictionary containing the prequantized weights and scales.
"""
if self.skip_quant:
return self._dtype
if self.weight_dtype == "int4":
return "int8"
# TODO(wangzhe24): create_parameter does not support FP8
if "float8" in self.weight_dtype:
return self._dtype
return self.weight_dtype
self.quant_method.process_prequanted_weights(self, state_dict)
def load_weight(self, state_dict: dict):
"""
Load the weight from the state dictionary.
def load_offline_quant_state_dict(self, quant_weight, quant_scale=None):
Args:
state_dict (dict): A dictionary containing the weights
"""
Load the offline-quantized checkpoint state dictionary into the layer.
"""
if quant_scale is None:
if "float8" in self.weight_dtype:
self.linear_weight.copy_(quant_weight, False)
else:
self.linear_weight.set_value(quant_weight)
weight_tensor = get_tensor(state_dict.pop(self.weight_key))
if self.fd_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
if self.inference_args.weight_block_size[0] != -1:
self.linear_weight.copy_(quant_weight.view(paddle.float8_e4m3fn), False)
else:
self.linear_weight.set_value(quant_weight)
self.linear_weight_scale.set_value(quant_scale)
self.linear_weight.set_value(weight_tensor)
def load_state_dict(self, state_dict):
def load_state_dict(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
if self.use_offline_quant:
self.load_offline_quant_state_dict(
quant_weight=get_tensor(
state_dict.pop(self.weight_key + ".quant_weight")
),
quant_scale=get_tensor(
state_dict.pop(self.weight_key + ".quant_scale")
),
)
# weight
self.state_dict = state_dict
assert self.weight_key is not None, 'weight_key should not be None.'
if self.fd_config.model_config.is_quantized:
self.load_prequant_weight(state_dict)
else:
# weight
assert self.weight_key is not None, 'weight_key should not be None.'
weight_tensor = get_tensor(state_dict.pop(self.weight_key))
if self.llm_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
self.linear_weight.set_value(weight_tensor)
self.load_weight(state_dict)
# bias
if self.with_bias:
bias_tensor = paddle.to_tensor(get_tensor(state_dict.pop(self.bias_key)))
bias_tensor = paddle.to_tensor(
get_tensor(state_dict.pop(self.bias_key)))
self.linear_bias.set_value(bias_tensor)
# smooth quant
if self.use_smooth_quant:
if self.shift_key in state_dict:
shift_tensor = get_tensor(state_dict.pop(self.shift_key)).astype(
paddle.get_default_dtype()
)
else:
shift_tensor = paddle.zeros(
shape=self.linear_shift_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_shift.set_value(shift_tensor)
if self.smooth_key in state_dict:
smooth_tensor = get_tensor(state_dict.pop(self.smooth_key)).astype(
paddle.get_default_dtype()
)
else:
smooth_tensor = paddle.ones(
shape=self.linear_smooth_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_smooth.set_value(smooth_tensor)
def forward_cuda(self, x):
def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward function for ColumnParallelLinear.
Forward function for Linear.
Args:
x (Tensor): Input tensor to the ColumnParallelLinear layer.
x (Tensor): Input tensor to the Linear.
Returns:
Tensor: Output tensor.
@@ -274,22 +171,24 @@ class LinearBase(nn.Layer):
Raises:
NotImplementedError: If the weight dtype is not float8 or act dtype is not equal to weight dtype.
"""
if self.llm_config.quant_config:
if self.fd_config.quant_config:
linear_out = self.quant_method.apply(self, x)
else:
linear_out = paddle.matmul(x, self.linear_weight)
if self.with_bias:
linear_out = paddle.add(linear_out, self.linear_bias)
return linear_out
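# Minimal sketch of the unquantized fallback path in forward_cuda above, assuming no
# quant_config is set; shapes and values are toy placeholders, not the layer's real state.
import paddle

x = paddle.randn([2, 8])                   # [batch, input_size]
weight = paddle.randn([8, 16])             # [input_size, output_size]
bias = paddle.zeros([16])

linear_out = paddle.matmul(x, weight)      # Y = XW
linear_out = paddle.add(linear_out, bias)  # applied only when with_bias=True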
class ReplicatedLinear(LinearBase):
"""
ReplicatedLinear Layer
ReplicatedLinear Layer.
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -298,74 +197,39 @@ class ReplicatedLinear(LinearBase):
skip_quant: bool = False,
):
"""
Initialize a linear layer with additional parameters for inference and quantization.
Initializes a replicated linear layer.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
you can give it any name you like.
layer_index (int): The index of the linear layer in the model
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
self.nranks = llm_config.parallel_config.mp_size
self.input_size = input_size
self.init_weight()
self.quant_method.create_weights(self)
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
self.linear_bias = None
if self.with_bias:
self.linear_bias = self.create_parameter(
shape=[self.output_size],
dtype=self._dtype,
is_bias=True,
)
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
class ColumnParallelLinear(LinearBase):
"""
ColumnParallelLinear Layer
ColumnParallelLinear Layer.
The linear layer is defined as Y = XA + b. A is parallelized along
its second dimension as A = [A_1, ..., A_p].
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -374,40 +238,45 @@ class ColumnParallelLinear(LinearBase):
skip_quant: bool = False,
):
"""
Initialize a linear layer with additional parameters for inference and quantization.
Initializes a linear layer and provides additional parameters required for inference and quantization.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
you can give it any name you like.
layer_index (int): The index of the linear layer in the model
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
self.nranks = llm_config.parallel_config.mp_size
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.input_size = input_size
self.output_size = divide(output_size, self.nranks)
self.linear_weight_shape = [
self.input_size,
self.output_size,
]
if fd_config.quant_config:
self.quant_method.create_weights(self)
self.init_weight()
self.quant_method.create_weights(self)
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
if self.skip_quant:
self.weight_dtype = self._dtype
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
@@ -429,62 +298,51 @@ class ColumnParallelLinear(LinearBase):
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
class MergedColumnParallelLinear(ColumnParallelLinear):
"""
MergedColumnParallelLinear Layer.
Similar to ColumnParallelLinear, but the weight matrix is concatenated
along the output dimension. When the weight matrix is loaded, the
different partitions are sharded separately.
"""
def __init__(
self,
llm_config,
prefix,
with_bias=False,
add_bias=False,
activation="gelu",
use_fast_ffn=False,
skip_quant=False,
fd_config: FDConfig,
prefix: str,
input_size: int = None,
output_size: int = None,
with_bias: bool = False,
add_bias: bool = False,
activation: str = "gelu",
use_fast_ffn: bool = False,
skip_quant: bool = False,
):
"""Packed linear layers with column parallelism.
"""
Initialize the fused ffn1 Linear layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming weights and biases.
weight_key (str): Key name of weight in the pdparams state dict.
bias_key (str): Key name of bias in the pdparams state dict. Defaults to None, means no bias.
with_bias (bool, optional): Whether to include bias term. Defaults to True.
activation (str, optional): Activation function to use. Defaults to "gelu".
use_fast_ffn (bool, optional): Whether to use a faster FFN implementation.
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
activation (str): Activation function to use. Defaults to "gelu".
use_fast_ffn (bool): Whether to use a faster FFN implementation.
Defaults to False.
skip_quant (bool, optional): Whether to skip quantization steps. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
self.use_fast_ffn = use_fast_ffn
self.activation = activation
self.embed_dim = llm_config.model_config.hidden_size
self.dim_feedforward = llm_config.model_config.ffn_hidden_size
self.nranks = llm_config.parallel_config.mp_size
self.dim_feedforward_per_rank = divide(self.dim_feedforward,
self.nranks)
input_size = self.embed_dim
output_size = self.dim_feedforward * 2
super().__init__(llm_config=llm_config,
self.embed_dim = fd_config.model_config.hidden_size
self.nranks = fd_config.parallel_config.tensor_parallel_degree
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
@@ -492,7 +350,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
add_bias=add_bias,
skip_quant=skip_quant)
def load_state_dict(self, state_dict):
def load_state_dict(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
@@ -542,47 +400,40 @@ class QKVParallelLinear(ColumnParallelLinear):
QKVParallelLinear Layer.
"""
def __init__(self, llm_config, prefix, with_bias=False, add_bias=True):
def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
"""
Initialize the QKV Linear layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming weights and biases.
weight_key (str): Key name of weight in the pdparams state dict.
bias_key (str): Key name of bias in the pdparams state dict. Defaults to None, means no bias.
with_bias (bool, optional): Whether to include bias term. Defaults to True.
skip_quant (bool, optional): Whether to skip quantization steps. Defaults to False.
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to True.
"""
self.num_heads = llm_config.model_config.num_attention_heads
self.kv_num_heads = llm_config.model_config.num_key_value_heads
self.embed_dim = llm_config.model_config.hidden_size
self.head_dim = llm_config.model_config.head_dim
self.nranks = llm_config.parallel_config.mp_size
self.num_heads = fd_config.model_config.num_attention_heads
self.kv_num_heads = fd_config.model_config.num_key_value_heads
self.embed_dim = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.num_heads_per_rank = divide(self.num_heads, self.nranks)
self.kv_num_heads_per_rank = divide(self.kv_num_heads, self.nranks)
input_size = self.embed_dim
output_size = (self.num_heads + 2 * self.kv_num_heads) * self.head_dim
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias)
def load_state_dict(self, state_dict):
def load_weight(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
Load the weight from the state dictionary.
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
state_dict (dict): A dictionary containing the weights
"""
# weight
assert self.weight_key is not None, 'weight_key should not be None.'
# QKV weights are fused in the on-disk checkpoint
if self.weight_key in state_dict.keys():
weight_tensor = get_tensor(state_dict.pop(self.weight_key))
else:
@@ -601,11 +452,27 @@ class QKVParallelLinear(ColumnParallelLinear):
])
weight_tensor = paddle.transpose(weight_tensor, perm=[1, 0])
if self.llm_config.quant_config:
if self.fd_config.quant_config:
self.quant_method.process_loaded_weights(self, weight_tensor)
else:
self.linear_weight.set_value(weight_tensor)
def load_state_dict(self, state_dict: dict):
"""
Load the checkpoint state dictionary into the layer.
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
# weight
assert self.weight_key is not None, 'weight_key should not be None.'
# QKV weights are fused in the on-disk checkpoint
if self.fd_config.model_config.is_quantized:
self.load_prequant_weight(state_dict)
else:
self.load_weight(state_dict)
# bias
if self.with_bias:
if self.bias_key in state_dict.keys():
@@ -622,38 +489,25 @@ class QKVParallelLinear(ColumnParallelLinear):
qkv_bias = paddle.concat([q_bias, k_bias, v_bias], axis=-1)
self.linear_bias.set_value(qkv_bias)
# smooth quant
if self.use_smooth_quant:
if self.shift_key in state_dict:
shift_tensor = get_tensor(state_dict.pop(self.shift_key)).astype(
paddle.get_default_dtype()
)
else:
shift_tensor = paddle.zeros(
shape=self.linear_shift_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_shift.set_value(shift_tensor)
if self.smooth_key in state_dict:
smooth_tensor = get_tensor(state_dict.pop(self.smooth_key)).astype(
paddle.get_default_dtype()
)
else:
smooth_tensor = paddle.ones(
shape=self.linear_smooth_shape,
dtype=paddle.get_default_dtype(),
)
self.linear_smooth.set_value(smooth_tensor)
class RowParallelLinear(LinearBase):
"""
RowParallelLinear Layer
RowParallelLinear Layer.
The linear layer is defined as Y = XA + b. A is parallelized along
its first dimension and X along its second dimension as:
- -
| A_1 |
| . |
A = | . | X = [X_1, ..., X_p]
| . |
| A_p |
- -
"""
def __init__(
self,
llm_config,
fd_config: FDConfig,
prefix: str = "",
input_size: int = None,
output_size: int = None,
@@ -665,57 +519,50 @@ class RowParallelLinear(LinearBase):
Initialize a linear layer with additional parameters for inference and quantization.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
you can give it any name you like.
layer_index (int): The index of the linear layer in the model
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
Can be arbitrarily named.
input_size (int): Number of input features. Defaults to None.
output_size (int): Number of output features. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
super().__init__(llm_config=llm_config,
super().__init__(fd_config=fd_config,
prefix=prefix,
input_size=input_size,
output_size=output_size,
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
self.llm_config = llm_config
self.fd_config = fd_config
self.skip_quant = False
self.use_smooth_quant = llm_config.model_config.use_smooth_quant
self.weight_dtype = llm_config.model_config.weight_dtype
self.act_dtype = llm_config.model_config.act_dtype
self.nranks = llm_config.parallel_config.mp_size
self.embed_dim = llm_config.model_config.hidden_size
self.head_dim = llm_config.model_config.hidden_size // llm_config.model_config.num_attention_heads
self.num_heads = llm_config.model_config.num_attention_heads // self.nranks
self.dim_feedforward = llm_config.model_config.ffn_hidden_size // self.nranks
self.with_bias = with_bias
self.prefix = prefix
self.shift_key = f"{prefix}.shift_bias"
self.smooth_key = f"{prefix}.smooth_weight"
self.weight_key = f"{prefix}.weight"
self.bias_key = f"{prefix}.bias"
self.weight_only_scale_key = f"{prefix}.weight_only_scale"
self.out_scale_key = f"{prefix}.out_scale"
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.embed_dim = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.num_heads = fd_config.model_config.num_attention_heads // self.nranks
self.linear_weight_shape = [
self.input_size,
self.output_size,
]
self._dtype = self._helper.get_default_dtype()
if llm_config.quant_config:
self.quant_method = llm_config.quant_config.get_quant_method(self)
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
self.quant_method.create_weights(self)
self.init_weight()
def init_weight(self):
"""
Initialize the weights and biases.
"""
self.init_weight_shape(self.is_y_transposed())
if self.skip_quant:
self.weight_dtype = self._dtype
self.linear_weight = self.create_parameter(
shape=self.linear_weight_shape,
dtype=self.get_weight_create_dtype(),
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
@@ -735,27 +582,159 @@ class RowParallelLinear(LinearBase):
# smooth quant
self.linear_shift = None
self.linear_smooth = None
if self.use_smooth_quant:
self.linear_shift = self.create_parameter(
shape=self.linear_shift_shape,
dtype=self._dtype,
is_bias=False,
)
self.linear_smooth = self.create_parameter(
shape=self.linear_smooth_shape,
dtype=self._dtype,
is_bias=False,
)
def forward_cuda(self, x):
if self.llm_config.quant_config:
def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
if self.fd_config.quant_config:
out = self.quant_method.apply(self, x)
else:
out = paddle.matmul(x, self.linear_weight)
if self.nranks > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(out)
return out
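# Single-process sketch of what the all-reduce above achieves: each rank computes a partial
# matmul over its input/weight shard and the partials are summed. Shapes are hypothetical toy values.
import paddle

x_shards = [paddle.ones([2, 4]), paddle.ones([2, 4])]   # X split along its feature dimension
w_shards = [paddle.ones([4, 6]), paddle.ones([4, 6])]   # A split along its first dimension

partials = [paddle.matmul(x, w) for x, w in zip(x_shards, w_shards)]
out = partials[0] + partials[1]   # what tensor_model_parallel_all_reduce does across ranks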
class KVBatchLinear(LinearBase):
"""
KVBatchLinear Layer for handling combined KV projections with bmm.
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
kv_lora_rank: int = None,
num_attention_heads: int = None,
qk_nope_head_dim: int = None,
v_head_dim: int = None,
with_bias: bool = False,
skip_quant: bool = False,
):
"""
Initializes a KV batch linear layer that internally splits into K and V projections.
Args:
fd_config (FDConfig): Inference-related parameters.
prefix (str): Unique name of the layer, used to name internal attributes.
kv_lora_rank (int): LoRA rank for KV projection. Defaults to None.
num_attention_heads (int): Number of attention heads. Defaults to None.
qk_nope_head_dim (int): Dimension for Q/K projection (nope part). Defaults to None.
v_head_dim (int): Dimension for V projection. Defaults to None.
with_bias (bool): Whether to include bias or not. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.kv_lora_rank = kv_lora_rank
self.num_attention_heads = num_attention_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
# Split num_attention_heads when using TP inference.
self.num_heads_per_partition = divide(num_attention_heads, self.nranks)
# Initialize parent with combined dimensions
super().__init__(
fd_config=fd_config,
prefix=prefix,
input_size=None, # Will be determined from weight shape
output_size=None, # Will be determined from weight shape
with_bias=with_bias,
add_bias=False,
skip_quant=skip_quant,
)
self.weight_dtype = self._dtype
# Override weight keys to use the combined kv_b_proj
self.weight_key = f"{prefix}.weight" # e.g., "kv_b_proj.weight"
self.k_weight_key = f"{prefix.replace('kv_b_proj', 'k_b_proj')}.weight"
self.v_weight_key = f"{prefix.replace('kv_b_proj', 'v_b_proj')}.weight"
def load_state_dict(self, state_dict: dict):
"""
Load the combined KV weight and split it into K and V projections
"""
# Get the combined KV weight
# NOTE(Ryan): Do not pop weight_key here; it will be popped in another class.
kv_weight_tensor = get_tensor(state_dict[self.weight_key])
# Reshape and split the weight
w = kv_weight_tensor.reshape([
self.kv_lora_rank,
self.num_heads_per_partition,
-1,
]).transpose(perm=[1, 2, 0])
# Split into K and V weights
# wk_b: [num_heads, qk_nope_head_dim, kv_lora_rank]
wk_b = w[:, :self.qk_nope_head_dim, :]
if self.v_head_dim is None:
raise ValueError("self.v_head_dim should not be None")
# wv_b: [num_heads, kv_lora_rank, v_head_dim]
wv_b = w[:, -self.v_head_dim:, :].transpose(perm=[0, 2, 1])
# Create K projection weight
self.k_b_proj_weight = self.create_parameter(
shape=wk_b.shape,
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
# Create V projection weight
self.v_b_proj_weight = self.create_parameter(
shape=wv_b.shape,
dtype=self.weight_dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
self.k_b_proj_weight.set_value(wk_b)
self.v_b_proj_weight.set_value(wv_b)
def forward_k_b(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward pass for K_b projection using bmm
Args:
x: Input tensor (e.g., query_nope.transpose([1, 0, 2]))
Returns:
K_b projection output
"""
out = paddle.bmm(x, self.k_b_proj_weight)
return out
def forward_v_b(self, x: paddle.Tensor) -> paddle.Tensor:
"""
Forward pass for V_b projection using bmm
Args:
x: Input tensor (e.g., fmha_out_decode)
Returns:
V_b projection output
"""
out = paddle.bmm(x, self.v_b_proj_weight)
return out
def forward_cuda(self,
x: paddle.Tensor,
proj_type: str = 'k') -> paddle.Tensor:
"""
Forward function that can handle both K and V projections
Args:
x: Input tensor
proj_type: 'k' or 'v' to select which projection to use
Returns:
Projection output
"""
if proj_type == 'k':
return self.forward_k_b(x)
elif proj_type == 'v':
return self.forward_v_b(x)
else:
raise ValueError(f"proj_type must be 'k' or 'v', got {proj_type}")
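# Shape-only sketch of the kv_b_proj split performed in load_state_dict above, using
# hypothetical small dimensions (kv_lora_rank=4, heads=2, qk_nope_head_dim=3, v_head_dim=2);
# it is not the layer's actual loading code.
import paddle

kv_lora_rank, num_heads, qk_nope, v_dim = 4, 2, 3, 2
kv_weight = paddle.randn([kv_lora_rank, num_heads * (qk_nope + v_dim)])

w = kv_weight.reshape([kv_lora_rank, num_heads, -1]).transpose(perm=[1, 2, 0])
wk_b = w[:, :qk_nope, :]                           # [heads, qk_nope, kv_lora_rank]
wv_b = w[:, -v_dim:, :].transpose(perm=[0, 2, 1])  # [heads, kv_lora_rank, v_dim]

q_nope = paddle.randn([num_heads, 5, qk_nope])     # e.g. 5 query tokens per head
k_proj = paddle.bmm(q_nope, wk_b)                  # [heads, 5, kv_lora_rank]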


@@ -21,48 +21,6 @@ from paddle.distributed import fleet
from .utils import get_tensor
def parallel_matmul(lm_output, logit_weights, parallel_output):
"""
Performs parallel matrix multiplication for large-scale language models.
Args:
lm_output (Tensor): The output tensor from the language model layers,
which will be multiplied with the logit weights.
logit_weights (Tensor): The weights used in the matrix multiplication,
typically the weights of the output layer.
parallel_output (bool): A flag indicating whether to return the parallel
outputs or concatenate them. If True, returns the outputs from the
parallel computation directly. If False, concatenates the outputs
across the model parallel group before returning.
Returns:
Tensor: The result of the matrix multiplication. If `parallel_output` is True,
returns the parallel outputs. If `parallel_output` is False and
model parallel world size is greater than 1, returns the concatenated
outputs across the model parallel group. Otherwise, returns the direct
matrix multiplication result.
"""
hcg = fleet.get_hybrid_communicate_group()
model_parallel_group = hcg.get_model_parallel_group()
world_size = hcg.get_model_parallel_world_size()
# rank = hcg.get_model_parallel_rank()
if world_size > 1:
input_parallel = paddle.distributed.collective._c_identity(
lm_output, group=model_parallel_group)
logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True)
if parallel_output:
return logits
return paddle.distributed.collective._c_concat(
logits, group=model_parallel_group)
else:
logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)
return logits
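# Single-process sketch of the world_size == 1 branch of parallel_matmul above: logits are
# simply lm_output multiplied by the transposed logit weights. Shapes are toy placeholders.
import paddle

lm_output = paddle.randn([2, 8])          # [num_tokens, hidden_size]
logit_weights = paddle.randn([100, 8])    # [vocab_size, hidden_size]
logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)  # [num_tokens, vocab_size]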
class ParallelLMHead(nn.Layer):
"""
"Parallelized LM head.
@@ -70,75 +28,69 @@ class ParallelLMHead(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
num_embeddings,
embedding_dim,
prefix="",
with_bias=False,
tie_word_embeddings=None,
):
"""
Parallelized LMhead.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
num_embeddings (int): vocabulary size.
embedding_dim (int): size of hidden state.
tie_word_embeddings (bool, optional): Whether to tie the LM head weight to the
input embeddings. Defaults to None.
prefix (str): full name of the layer in the state dict
"""
super(ParallelLMHead, self).__init__()
self.use_moe = llm_config.model_config.use_moe
self.linear_weight_key = prefix + ".weight"
if with_bias:
self.linear_bias_key = prefix + ".bias"
else:
self.linear_bias_key = None
self.use_ep = llm_config.parallel_config.use_ep
self.use_ep = fd_config.parallel_config.use_ep
self.column_cut = True
self.fused_linear = True
hcg = fleet.get_hybrid_communicate_group()
mp_rank = hcg.get_model_parallel_rank()
ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear
RowParallelLinear = fleet.meta_parallel.RowParallelLinear
self.tie_word_embeddings = tie_word_embeddings
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
if self.tie_word_embeddings is None:
if self.use_ep:
self.weight = self.create_parameter(
shape=[embedding_dim, num_embeddings],
dtype=paddle.get_default_dtype(),
is_bias=False,
if self.use_ep:
self.weight = self.create_parameter(
shape=[embedding_dim, num_embeddings],
dtype=paddle.get_default_dtype(),
is_bias=False,
)
else:
if self.column_cut:
need_gather = True
self.out_linear = ColumnParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True
if self.linear_bias_key is not None else False,
gather_output=need_gather,
fuse_matmul_bias=False,  # False gives a smaller numerical diff
)
else:
if self.column_cut:
need_gather = True
self.out_linear = ColumnParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True,
gather_output=need_gather,
fuse_matmul_bias=self.fused_linear,  # False gives a smaller numerical diff
)
else:
self.out_linear = RowParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True,
input_is_parallel=False,
fuse_matmul_bias=self.fused_linear,  # False gives a smaller numerical diff
)
self.out_linear = RowParallelLinear(
embedding_dim,
num_embeddings,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True
if self.linear_bias_key is not None else False,
input_is_parallel=False,
fuse_matmul_bias=False,  # False gives a smaller numerical diff
)
def load_state_dict(self, state_dict):
"""
@@ -148,25 +100,26 @@ class ParallelLMHead(nn.Layer):
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
if self.tie_word_embeddings is None:
if self.use_ep:
self.weight.set_value(
get_tensor(state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype()))
else:
if self.use_ep:
self.weight.set_value(
get_tensor(state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype()))
else:
if self.tie_word_embeddings:
self.out_linear.weight.set_value(
get_tensor(state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype()))
paddle.get_default_dtype()).transpose([1, 0]))
else:
weight_tensor = get_tensor(
state_dict.pop(self.linear_weight_key)).astype(
paddle.get_default_dtype())
if self.out_linear.weight.shape != weight_tensor.shape:
weight_tensor = weight_tensor.transpose([1, 0])
self.out_linear.weight.set_value(weight_tensor)
bias = (
get_tensor(state_dict.pop(self.linear_bias_key)).astype(
paddle.get_default_dtype()
)
if self.linear_bias_key is not None
else paddle.zeros(
self.out_linear.bias.shape, dtype=paddle.get_default_dtype()
)
)
if self.linear_bias_key is not None:
bias = get_tensor(state_dict.pop(self.linear_bias_key)).astype(
paddle.get_default_dtype())
self.out_linear.bias.set_value(bias)
def forward(self, input):
@@ -180,11 +133,8 @@ class ParallelLMHead(nn.Layer):
Tensor: The output tensor after processing through the layer.
"""
logits = input
if self.tie_word_embeddings is not None:
logits = parallel_matmul(logits, self.tie_word_embeddings, False)
if self.use_ep:
logits = paddle.matmul(logits, self.weight)
else:
if self.use_ep:
logits = paddle.matmul(logits, self.weight)
else:
logits = self.out_linear(logits)
logits = self.out_linear(logits)
return logits


@@ -11,3 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fused_moe_cutlass_backend import (CutlassW4A8MoEMethod,
CutlassWeightOnlyMoEMethod)
from .fused_moe_triton_backend import TritonWeightOnlyMoEMethod
from .moe import FusedMoE
__all__ = [
    "CutlassWeightOnlyMoEMethod", "CutlassW4A8MoEMethod", "FusedMoE",
    "TritonWeightOnlyMoEMethod"
]


@@ -1,222 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.distributed import fleet
from paddle.framework import in_dynamic_or_pir_mode
from paddle.nn.quant import weight_quantize
from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch,
moe_expert_ffn,
moe_expert_reduce)
from .fused_moe_method_base import FusedMoEMethodBase
class CutlassFusedMoeMethod(FusedMoEMethodBase):
"""
Use Cutlass Group Gemm to compute Fused MoE.
This method is the oldest way to compute MoE in Paddle.
"""
def create_weights(
self,
layer: nn.Layer,
moe_compute_params,
ffn1_tensor,
ffn2_tensor,
ffn1_bias=None,
ffn2_bias=None,
# belows only used in w4a8.
moe_ffn1_weight_scale=None,
moe_ffn2_weight_scale=None,
moe_ffn1_in_scale=None,
moe_ffn2_in_scale=None):
"""
Paddle cutlass create weight process.
"""
num_local_experts = moe_compute_params.num_local_experts
moe_quant_type = moe_compute_params.moe_quant_type
assert len(ffn1_tensor) == num_local_experts
assert len(ffn2_tensor) == num_local_experts
assert ffn1_tensor[0].shape == [
moe_compute_params.hidden_size,
moe_compute_params.moe_intermediate_size * 2
]
assert ffn2_tensor[0].shape == [
moe_compute_params.moe_intermediate_size,
moe_compute_params.hidden_size
]
added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]
if moe_quant_type == "w4a8":
moe_ffn1_in_scale = paddle.concat(moe_ffn1_in_scale)
moe_ffn2_in_scale = paddle.concat(moe_ffn2_in_scale)
moe_ffn1_in_scale = 1 / moe_ffn1_in_scale
moe_ffn2_in_scale = 1 / moe_ffn2_in_scale
moe_ffn1_weight_scale = paddle.stack(moe_ffn1_weight_scale, axis=0)
moe_ffn2_weight_scale = paddle.stack(moe_ffn2_weight_scale, axis=0)
moe_ffn1_weight_scale = moe_ffn1_weight_scale / (127 * 112)
moe_ffn2_weight_scale = moe_ffn2_weight_scale / (127 * 112)
moe_ffn1_weight_scale = moe_ffn1_weight_scale / moe_ffn1_in_scale[:,
None]
moe_ffn2_weight_scale = moe_ffn2_weight_scale / moe_ffn2_in_scale[:,
None]
moe_ffn1_weight_scale = moe_ffn1_weight_scale.cast(
paddle.get_default_dtype())
moe_ffn2_weight_scale = moe_ffn2_weight_scale.cast(
paddle.get_default_dtype())
if moe_quant_type in ["weight_only_int4", "weight_only_int8", "w4a8"]:
for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
weight_name = added_weight_attrs[idx]
scale_name = added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(num_local_experts):
quant_weight, scale = weight_quantize(weight_tensor[i],
algo=moe_quant_type,
arch=80)
weight_list.append(quant_weight)
if moe_quant_type != "w4a8":
# scale holds no memory in w4a8, don't touch it!
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
setattr(
layer, weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(quanted_weight)
# this scale only useful for wint8/4.
if moe_quant_type != "w4a8":
quanted_weight_scale = paddle.stack(weight_scale_list,
axis=0)
setattr(
layer, scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
getattr(layer, scale_name).set_value(quanted_weight_scale)
if moe_quant_type == "w4a8":
assert moe_ffn1_weight_scale is not None
assert moe_ffn2_weight_scale is not None
assert moe_ffn1_in_scale is not None
assert moe_ffn2_in_scale is not None
added_w4a8_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale",
"moe_ffn1_in_scale", "moe_ffn2_in_scale"
]
for idx, weight_tensor in enumerate([
moe_ffn1_weight_scale, moe_ffn2_weight_scale,
moe_ffn1_in_scale, moe_ffn2_in_scale
]):
name = added_w4a8_attrs[idx]
setattr(
layer, name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(weight_tensor)
def apply(
self,
layer: nn.Layer,
moe_compute_params,
x: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
(
permute_input,
token_nums_per_expert,
permute_indices_per_token,
topk_weights,
topk_idx,
expert_idx_per_token,
) = moe_expert_dispatch(
x,
gate_out,
layer.gate_correction_bias,
(layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
else None), # if set, permute_input will be int8_t
moe_compute_params.top_k,
False,
topk_only_mode=False,
)
if moe_compute_params.moe_quant_type != "w4a8":
# only w4a8 needs expert_idx_per_token;
# other quant types do not need this tensor, so we set it to None.
expert_idx_per_token = None
else:
expert_idx_per_token = expert_idx_per_token.cast("int64")
ffn_out = moe_expert_ffn(
permute_input,
token_nums_per_expert,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None,
(layer.moe_ffn1_weight_scale
if hasattr(layer, "moe_ffn1_weight_scale") else None),
(layer.moe_ffn2_weight_scale
if hasattr(layer, "moe_ffn2_weight_scale") else None),
(layer.moe_ffn2_in_scale
if hasattr(layer, "moe_ffn2_in_scale") else None),
expert_idx_per_token,
moe_compute_params.moe_quant_type,
False, # used_in_ep_low_latency
)
if False:
if in_dynamic_or_pir_mode():
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
paddle.distributed.all_reduce(ffn_out, group=mp_group)
else:
paddle.distributed.all_reduce(ffn_out, group=mp_group)
# moe_expert_reduce normalizes the top-k weights and applies routed_scaling_factor
fused_moe_out = moe_expert_reduce(
ffn_out,
topk_weights,
permute_indices_per_token,
topk_idx,
None,
norm_topk_prob=True,
routed_scaling_factor=1.0,
)
return fused_moe_out

File diff suppressed because it is too large


@@ -0,0 +1,135 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.config import MoEPhase
from ..quantization.quant_base import QuantMethodBase
class MoEMethodBase(QuantMethodBase):
"""
"""
def __init__(self, quant_config):
super().__init__()
if quant_config is None:
self.moe_quant_type = "w16a16"
else:
self.quant_config = quant_config
self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
self.added_scale_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
]
self.pack_num = 1
def init_ep(self, layer: nn.Layer) -> None:
"""
Init EP related module
"""
if layer.ep_size > 1:
if layer.fd_config.parallel_config.moe_phase == MoEPhase.DECODER:
from .ep import EPDecoderRunner
self.ep_decoder_runner = EPDecoderRunner(
layer.top_k, layer.hidden_size, layer.num_experts,
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.ep_size, layer.ep_rank)
else:
from .ep import EPPrefillRunner
self.ep_prefill_runner = EPPrefillRunner(
layer.top_k, layer.hidden_size, layer.num_experts,
layer.ep_size, layer.ep_rank)
def process_loaded_weights(self, layer, weights) -> None:
"""
process_loaded_weights
"""
pass
def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
"""
Check that the layer's FFN weight shapes are valid for this method.
"""
assert ffn1_weights[0].shape == [
layer.hidden_size // self.pack_num, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size // self.pack_num, layer.hidden_size
]
@abstractmethod
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
raise NotImplementedError
@abstractmethod
def apply_ep_prefill(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP prefill method.
"""
raise NotImplementedError
@abstractmethod
def apply_ep_decode(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP decoder method.
"""
raise NotImplementedError
@abstractmethod
def apply_tp(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
raise NotImplementedError
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
if layer.ep_size > 1:
if layer.fd_config.parallel_config.moe_phase == MoEPhase.PREFILL:
return self.apply_ep_prefill(layer, x, gate_out)
else:
return self.apply_ep_decode(layer, x, gate_out)
else:
return self.apply_tp(layer, x, gate_out)
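# Minimal stand-in sketch of the routing in apply() above: expert-parallel layers take the
# EP prefill or EP decode path, tensor-parallel layers take the TP path. The helper and its
# arguments are hypothetical, for illustration only.
def _route(ep_size: int, is_prefill: bool) -> str:
    if ep_size > 1:
        return "apply_ep_prefill" if is_prefill else "apply_ep_decode"
    return "apply_tp"

assert _route(ep_size=8, is_prefill=True) == "apply_ep_prefill"
assert _route(ep_size=1, is_prefill=False) == "apply_tp"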


@@ -0,0 +1,431 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.nn.quant import weight_quantize
from paddleformers.utils.log import logger
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from ..utils import get_tensor, create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
class CutlassMoEMethod(MoEMethodBase):
"""
Use Cutlass Group Gemm to compute Fused MoE.
This method is the oldest way to compute MoE in Paddle.
"""
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
# bf16
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
stacked_ffn1_weights = paddle.stack(ffn1_weights, axis=0)
stacked_ffn2_weights = paddle.stack(ffn2_weights, axis=0)
for idx, weight_tensor in enumerate(
[stacked_ffn1_weights, stacked_ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
setattr(
layer, weight_name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(weight_tensor)
def compute_ffn(
self,
layer: nn.Layer,
permute_input: paddle.Tensor,
token_nums_per_expert: paddle.Tensor,
expert_idx_per_token: paddle.Tensor,
used_in_ep_low_latency: bool = False,
):
"""
Paddle Cutlass compute Fused MoE.
"""
return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
permute_input,
token_nums_per_expert,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None,
(layer.moe_ffn1_weight_scale
if hasattr(layer, "moe_ffn1_weight_scale") else None),
(layer.moe_ffn2_weight_scale
if hasattr(layer, "moe_ffn2_weight_scale") else None),
(layer.moe_ffn2_in_scale
if hasattr(layer, "moe_ffn2_in_scale") else None),
expert_idx_per_token,
self.moe_quant_type,
used_in_ep_low_latency,
)
def apply_ep_prefill(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP prefill method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
layer, gate_out)
# 2. EP Dispatch
(
recv_x,
recv_topk_idx,
recv_topk_weights,
recv_num_tokens_per_expert_list,
handle,
_,
) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights)
token_all_num = sum(recv_num_tokens_per_expert_list)
# 3. Compute ffn
if token_all_num > 0:
logger.info(f"token_all_num {token_all_num}")
(
permute_input,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
expert_idx_per_token,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch(
recv_x,
recv_topk_idx,
recv_topk_weights,
(self.moe_ffn1_in_scale
if hasattr(self, "moe_ffn1_in_scale") else None),
recv_num_tokens_per_expert_list,
token_all_num,
self.moe_quant_type,
)
if self.moe_quant_type != "w4a8":
# only w4a8 needs expert_idx_per_token;
# other quant types do not need this tensor, so we set it to None.
expert_idx_per_token = None
else:
expert_idx_per_token = expert_idx_per_token.cast("int64")
ffn_out = self.compute_ffn(layer, permute_input,
recv_num_tokens_per_expert_list_cumsum,
expert_idx_per_token)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None, # moe_ffn2_bias,
False, # norm_topk_prob
1.0,
)[0]
else:
tmp_ffn_out = recv_x
# 4. EP combine
return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
recv_topk_weights)
def apply_ep_decode(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP decoder method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
layer, gate_out)
# 2. EP Dispatch
permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
x, topk_idx, topk_weights)
# 3. Compute ffn
if self.moe_quant_type == "w4a8":
num_local_experts, max_num, _ = permute_input.shape
expert_idx_per_token = paddle.arange(
num_local_experts)[:, None].tile([1, max_num])
elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]:
expert_idx_per_token = None
else:
raise NotImplementedError
ffn_out = self.compute_ffn(layer, permute_input,
token_nums_per_expert.cast("int64"),
expert_idx_per_token, True)
# 4. EP combine
return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
handle)
def apply_tp(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Paddle Cutlass compute Fused MoE.
"""
(
permute_input,
token_nums_per_expert,
permute_indices_per_token,
topk_weights,
topk_idx,
expert_idx_per_token,
) = moe_expert_dispatch(
x,
gate_out,
layer.gate_correction_bias,
(layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
else None), # if set, permute_input will be int8_t
layer.top_k,
False,
topk_only_mode=False,
)
if self.moe_quant_type != "w4a8":
# only w4a8 needs expert_idx_per_token;
# other quant types do not need this tensor, so we set it to None.
expert_idx_per_token = None
else:
expert_idx_per_token = expert_idx_per_token.cast("int64")
ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert,
expert_idx_per_token)
# moe_expert_reduce normalizes the top-k weights and applies routed_scaling_factor
fused_moe_out = moe_expert_reduce(
ffn_out,
topk_weights,
permute_indices_per_token,
topk_idx,
None,
norm_topk_prob=True,
routed_scaling_factor=1.0,
)
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out
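# Naive reference sketch (pure dygraph, no fused kernels) of the dispatch -> per-expert FFN
# -> weighted reduce pattern that moe_expert_dispatch, compute_ffn and moe_expert_reduce
# implement above; all shapes and the identity-sized expert weights are toy values.
import paddle

tokens, hidden, num_experts, top_k = 4, 8, 3, 2
x = paddle.randn([tokens, hidden])
gate_out = paddle.randn([tokens, num_experts])
probs = paddle.nn.functional.softmax(gate_out, axis=-1)
topk_weights, topk_idx = paddle.topk(probs, k=top_k, axis=-1)

expert_w = [paddle.randn([hidden, hidden]) for _ in range(num_experts)]
rows = []
for t in range(tokens):
    acc = paddle.zeros([hidden])
    for j in range(top_k):
        e = int(topk_idx[t, j])
        acc = acc + topk_weights[t, j] * paddle.matmul(x[t], expert_w[e])
    rows.append(acc)
fused_out = paddle.stack(rows, axis=0)   # [tokens, hidden]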
class CutlassW4A8MoEMethod(CutlassMoEMethod):
"""
w4a8 MoE Method
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.quant_config = quant_config
self.moe_quant_type = "w4a8"
self.pack_num = 2
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, ffn1_weights, ffn2_weights)
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
weight_list = []
for i in range(layer.num_local_experts):
quant_weight, scale = weight_quantize(weight_tensor[i],
algo=self.moe_quant_type,
arch=80)
weight_list.append(quant_weight)
quanted_weight = paddle.stack(weight_list, axis=0)
create_and_set_parameter(layer, weight_name, quanted_weight)
self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict)
def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict,
state_dict: dict):
"""
Get w4a8 weights from state dict and process them.
Args:
layer (nn.Layer): The layer to add parameters to.
weight_key_map (dict): The weight key map.
state_dict (dict): The state dict.
"""
def _extract_scale_tensor(state_dict, key_template, expert_idx):
return get_tensor(state_dict.pop(key_template.format(expert_idx)))
def _process_in_scale(name: str, in_scales: list[paddle.Tensor]):
processed_in_scale = 1 / paddle.concat(in_scales)
create_and_set_parameter(layer, name, processed_in_scale)
return processed_in_scale
def _process_weight_scale(name: str,
weight_scales: list[paddle.Tensor],
processed_in_scale: paddle.Tensor):
processed_weight_scale = (paddle.stack(weight_scales, axis=0) /
(127 * 112) /
processed_in_scale[:, None]).cast(
paddle.get_default_dtype())
create_and_set_parameter(layer, name, processed_weight_scale)
# 1. Init scale containers and maps
moe_ffn1_weight_scales = []
moe_ffn2_weight_scales = []
moe_ffn1_in_scales = []
moe_ffn2_in_scales = []
scale_weight_map = {
"moe_ffn1_weight_scale": moe_ffn1_weight_scales,
"moe_ffn2_weight_scale": moe_ffn2_weight_scales,
"moe_ffn1_in_scale": moe_ffn1_in_scales,
"moe_ffn2_in_scale": moe_ffn2_in_scales,
}
scale_key_map = {
"moe_ffn1_weight_scale":
weight_key_map.get("ffn1_expert_weight_scale_key", None),
"moe_ffn2_weight_scale":
weight_key_map.get("ffn2_expert_weight_scale_key", None),
"moe_ffn1_in_scale":
weight_key_map.get("ffn1_expert_in_scale_key", None),
"moe_ffn2_in_scale":
weight_key_map.get("ffn2_expert_in_scale_key", None),
}
for name, value in scale_key_map.items():
if value is None:
raise ValueError(
f"scale {name} should not be none in w4a8 mode.")
# 2. Extract scale tensor from state dict
for local_expert_idx in range(layer.num_local_experts):
expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts
for name, scale_key_template in scale_key_map.items():
scale_tensor = _extract_scale_tensor(state_dict,
scale_key_template,
expert_idx)
scale_weight_map[name].append(scale_tensor)
# 3. Process scale tensor and set to layer
in_scales = []
for in_scale_name in ["moe_ffn1_in_scale", "moe_ffn2_in_scale"]:
in_scales.append(
_process_in_scale(in_scale_name,
scale_weight_map[in_scale_name]))
for i, weight_scale_name in enumerate(
["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]):
_process_weight_scale(weight_scale_name,
scale_weight_map[weight_scale_name],
in_scales[i])
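# Numeric sketch (toy values, hypothetical shapes) of the w4a8 scale post-processing above:
# in-scales are inverted, and each expert's weight scale is divided by 127 * 112 and by its
# processed in-scale.
import paddle

in_scales = [paddle.to_tensor([0.5]), paddle.to_tensor([0.25])]   # one per expert
weight_scales = [paddle.ones([8]), paddle.ones([8])]              # one per expert

processed_in_scale = 1 / paddle.concat(in_scales)                 # [num_experts]
processed_weight_scale = (paddle.stack(weight_scales, axis=0) /
                          (127 * 112) /
                          processed_in_scale[:, None])            # [num_experts, 8]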
class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
"""
weight only for moe
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.quant_config = quant_config
self.moe_quant_type = self.quant_config.algo
self.pack_num = 1
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass process prequanted weights.
"""
ffn1_expert_weight_key = layer.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = layer.weight_key_map.get(
"ffn2_expert_weight_key", None)
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
"ffn1_expert_weight_scale_key", None)
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
"ffn2_expert_weight_scale_key", None)
ffn1_weights, ffn2_weights = layer.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
# self.check(layer, ffn1_weights, ffn2_weights)
ffn1_weight_scale = []
ffn2_weight_scale = []
for i in range(layer.num_local_experts):
expert_idx = layer.expert_id_offset + i
ffn1_weight_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_scale_key.format(expert_idx))))
ffn2_weight_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_scale_key.format(expert_idx))))
ffn1_weight = paddle.stack(ffn1_weights, axis=0)
ffn2_weight = paddle.stack(ffn2_weights, axis=0)
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0)
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0)
name_tensor_map = {
"moe_ffn1_weight": ffn1_weight,
"moe_ffn2_weight": ffn2_weight,
"moe_ffn1_weight_scale": ffn1_weight_scale,
"moe_ffn2_weight_scale": ffn2_weight_scale
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, ffn1_weights, ffn2_weights)
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(layer.num_local_experts):
quant_weight, scale = weight_quantize(weight_tensor[i],
algo=self.moe_quant_type)
weight_list.append(quant_weight)
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
create_and_set_parameter(layer, weight_name, quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
create_and_set_parameter(layer, scale_name, quanted_weight_scale)


@@ -0,0 +1,380 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from fastdeploy.model_executor.layers.utils import get_tensor
from ..utils import create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
class DeepGemmFusedMoeMethod(MoEMethodBase):
"""
DeepGemmFusedMoeMethod is a class that implements the MoEMethodBase interface for DeepGemm backend.
"""
def create_weights(self, layer: nn.Layer, state_dict):
"""
deepgemm create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, ffn1_weights, ffn2_weights)
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
for i in range(layer.num_local_experts):
from fastdeploy.model_executor.layers.utils import \
per_block_cast_to_fp8
quant_weight, scale = per_block_cast_to_fp8(
weight_tensor[i], self.quant_config.weight_block_size)
weight_list.append(quant_weight)
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
quanted_weight = quanted_weight.transpose([0, 2, 1]).contiguous()
create_and_set_parameter(layer, weight_name, quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
quanted_weight_scale = quanted_weight_scale.transpose(
[0, 2, 1]).contiguous()
create_and_set_parameter(layer, scale_name, quanted_weight_scale)
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass process prequanted weights.
"""
ffn1_expert_weight_key = layer.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = layer.weight_key_map.get(
"ffn2_expert_weight_key", None)
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
"ffn1_expert_weight_scale_key", None)
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
"ffn2_expert_weight_scale_key", None)
ffn1_weights, ffn2_weights = layer.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
# self.check(layer, ffn1_weights, ffn2_weights)
ffn1_weight_scale = []
ffn2_weight_scale = []
for i in range(layer.num_local_experts):
expert_idx = layer.expert_id_offset + i
ffn1_weight_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_scale_key.format(expert_idx))))
ffn2_weight_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_scale_key.format(expert_idx))))
ffn1_weight = paddle.stack(ffn1_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
ffn2_weight = paddle.stack(ffn2_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
name_tensor_map = {
"moe_ffn1_weight": ffn1_weight,
"moe_ffn2_weight": ffn2_weight,
"moe_ffn1_weight_scale": ffn1_weight_scale,
"moe_ffn2_weight_scale": ffn2_weight_scale
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
def apply_ep_prefill(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP prefill method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
layer, gate_out)
# 2. Dynamic compute blockwise quantization scales
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
x, self.quant_config.weight_block_size[0])
# 3. EP Dispatch
(
recv_x,
recv_topk_idx,
recv_topk_weights,
recv_num_tokens_per_expert_list,
handle,
_,
) = self.ep_prefill_runner.dispatch(x,
topk_idx,
topk_weights,
x_scale_tensor=x_scale_tensor)
token_all_num = sum(recv_num_tokens_per_expert_list)
# 4. Compute ffn
if token_all_num > 0:
logger.info(f"token_all_num {token_all_num}")
(recv_x, recv_x_scale) = recv_x
tmp = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
(
permute_input,
permute_scale,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
recv_num_tokens_per_expert_list_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x,
recv_x_scale,
recv_topk_idx,
recv_topk_weights,
tmp[0],
tmp[1]
)
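# the transpose/contiguous/transpose pair below keeps the logical shape of the
# scales but makes them column-major in memory, the layout the grouped FP8 GEMM expects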
permute_scale = permute_scale.transpose([1, 0]).contiguous()
permute_scale = permute_scale.transpose([1, 0])
# ffn1
ffn_out = paddle.empty(
(permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
dtype=paddle.bfloat16,
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(permute_input, permute_scale),
(layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
ffn_out,
m_indices,
)
# swiglu
ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None)
# ffn2
ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
ffn_out, self.quant_config.weight_block_size[0])
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
[1, 0]).contiguous()
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
ffn_out = paddle.empty(
(ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
dtype=paddle.bfloat16)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(ffn_in_x, ffn_in_x_scale_tensor),
(layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
ffn_out,
m_indices,
)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None, # moe_ffn2_bias
False, # norm_topk_prob
1.0,
)[0]
else:
tmp_ffn_out = paddle.cast(recv_x[0], paddle.bfloat16)
# 5. EP combine
return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
recv_topk_weights)
def apply_ep_decode(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Apply the EP decoder method.
"""
# 1. Select topk experts and weights
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
layer, gate_out)
# 2. EP Dispatch
permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
x, topk_idx, topk_weights, use_fp8=True)
# 3. Compute ffn
assert isinstance(permute_input, tuple)
ffn1_out = paddle.empty(
[
layer.num_local_experts,
layer.ep_size *
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.moe_intermediate_size * 2,
],
dtype=paddle.bfloat16,
)
ffn_out = paddle.empty(
[
layer.num_local_experts,
layer.ep_size *
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.hidden_size,
],
dtype=paddle.bfloat16,
)
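# expected_m is a hint to the masked grouped GEMM about the typical number of
# valid rows per expert, used when picking its tile configuration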
expected_m = 128
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
permute_input,
(
layer.moe_ffn1_weight,
layer.moe_ffn1_weight_scale,
),
ffn1_out,
token_nums_per_expert,
expected_m,
)
act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
ffn1_out, token_nums_per_expert)
act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
act_out, token_nums_per_expert,
self.quant_config.weight_block_size[0])
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(act_out_fp8, scale),
(
layer.moe_ffn2_weight,
layer.moe_ffn2_weight_scale,
),
ffn_out,
token_nums_per_expert,
expected_m,
)
# 4. EP combine
return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
handle)
def apply_tp(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Use DeepGemm to compute Fused MoE; this is the TP compute path.
"""
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
layer.gate_correction_bias,
layer.top_k,
True, # apply_norm_weight
False,
)
tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)
recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
x, 128)
(
permute_input,
permute_scale,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
recv_num_tokens_per_expert_list_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x,
recv_x_scale,
topk_ids,
topk_weights,
tmp[0],
tmp[1],
)
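# same layout trick as in apply_ep_prefill: keep the logical shape of the scales
# but store them column-major for the grouped FP8 GEMM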
permute_scale = permute_scale.transpose([1, 0]).contiguous()
permute_scale = permute_scale.transpose([1, 0])
# ffn1
ffn_out = paddle.empty(
(permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
dtype=paddle.bfloat16,
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(permute_input, permute_scale),
(layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
ffn_out,
m_indices,
)
# swiglu
ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out)
# ffn2
ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
ffn_out, self.quant_config.weight_block_size[0])
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
[1, 0]).contiguous()
ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
ffn_out = paddle.empty(
(ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
dtype=paddle.bfloat16)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
(ffn_in_x, ffn_in_x_scale_tensor),
(layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
ffn_out,
m_indices,
)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None,
False, # norm_topk_prob
1.0,
)[0]
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(tmp_ffn_out)
return tmp_ffn_out
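For intuition, a self-contained numpy sketch of the block-wise scaling idea behind per_block_cast_to_fp8 and the 128x128 weight_block_size used above; the padding, rounding, and FP8 cast details here are simplified assumptions, not FastDeploy code.
import numpy as np

def per_block_scale_sketch(w, block=128, fp8_max=448.0):
    # Scale each block x block tile of `w` so its max abs value maps to the
    # FP8 E4M3 maximum, keeping one float scale per tile for dequantization
    # inside the grouped GEMM. Illustrative only.
    k, n = w.shape
    w_pad = np.pad(w, ((0, -k % block), (0, -n % block)))
    kb, nb = w_pad.shape[0] // block, w_pad.shape[1] // block
    tiles = w_pad.reshape(kb, block, nb, block)
    amax = np.abs(tiles).max(axis=(1, 3), keepdims=True)
    scale = np.maximum(amax, 1e-4) / fp8_max
    q = tiles / scale  # would be cast to float8_e4m3fn on GPU
    return q.reshape(w_pad.shape)[:k, :n], scale[:, 0, :, 0]

q, s = per_block_scale_sketch(np.random.randn(256, 384).astype(np.float32))
print(q.shape, s.shape)  # (256, 384) (2, 3)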

View File

@@ -0,0 +1,285 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import (MoeWna16MarlinGemmApi,
tritonmoe_preprocess_func)
from ..quantization.quant_base import QuantMethodBase
def gptq_marlin_moe_repack(b_q_weight: paddle.Tensor, perm: paddle.Tensor,
size_k: int, size_n: int,
num_bits: int) -> paddle.Tensor:
"""
Util function.
"""
from fastdeploy.model_executor.ops.gpu import gptq_marlin_repack
num_experts = b_q_weight.shape[0]
assert size_k % 16 == 0
output = paddle.empty(
[num_experts, size_k // 16, size_n * (num_bits // 2)],
dtype=b_q_weight.dtype)
for e in range(num_experts):
output[e] = gptq_marlin_repack(b_q_weight[e], perm[e], size_k, size_n,
num_bits)
return output
def get_scale_perms():
"""
Util function.
"""
scale_perm: list[int] = []
for i in range(8):
scale_perm.extend([i + 8 * j for j in range(8)])
scale_perm_single: list[int] = []
for i in range(4):
scale_perm_single.extend(
[2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
return scale_perm, scale_perm_single
def marlin_permute_scales(s: paddle.Tensor, size_k: int, size_n: int,
group_size: int) -> paddle.Tensor:
"""
Util function.
"""
scale_perm, scale_perm_single = get_scale_perms()
if group_size < size_k and group_size != -1:
s = s.reshape([-1, len(scale_perm)])[:, scale_perm]
else:
s = s.reshape([-1, len(scale_perm_single)])[:, scale_perm_single]
s = s.reshape((-1, size_n)).contiguous()
return s
def marlin_moe_permute_scales(
s: paddle.Tensor,
size_k: int,
size_n: int,
group_size: int,
):
"""
Util function.
"""
num_experts = s.shape[0]
output = paddle.empty(
[num_experts, s.shape[1], s.shape[2]],
dtype=s.dtype,
)
for e in range(num_experts):
output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
return output
class MarlinWeightOnlyMoEMethod(QuantMethodBase):
"""
Use Marlin Group Gemm to compute Fused MoE.
"""
def __init__(self, quant_method=None):
"""
Marlin Group Gemm to compute Fused MoE.
"""
self.quant_method = quant_method
self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
self.added_scale_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
]
self.added_zeros_attrs = ["zeros0", "zeros1"]
def create_weights(self, layer: nn.Layer, state_dict):
"""
Marlin MoE create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
assert len(ffn1_weights) == layer.num_local_experts
assert len(ffn2_weights) == layer.num_local_experts
assert ffn1_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
ffn2_tensor = paddle.stack(ffn2_weights, axis=0)
max_bound = 7
for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
weight_scale = weight_tensor.abs().max(axis=1)
quanted_weight = weight_tensor / weight_scale[:,
None, :] * max_bound
quanted_weight = paddle.round(quanted_weight).astype("int32")
quanted_weight[quanted_weight > 7] = 7
quanted_weight[quanted_weight < -7] = -7
quanted_weight += 8
E, K, N = quanted_weight.shape
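# pack eight consecutive 4-bit values along K into one int32, lowest nibble first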
quanted_weight = quanted_weight.reshape([0, K // 8, 8, N])
res = paddle.zeros([E, K // 8, N], dtype='int32')
for j in range(8):
tmp = quanted_weight[:, :, j, :]
res = res | (tmp << (j * 4))
quanted_weight = paddle.assign(res)
weight_scale = weight_scale / max_bound
weight_scale = weight_scale[:, None, :]
group_size = -1 # means per_channel
g_idx_sort_indices = paddle.empty([E, 0], dtype="int32")
quanted_weight = gptq_marlin_moe_repack(
quanted_weight,
g_idx_sort_indices,
K,
N,
4,
)
weight_scale = marlin_moe_permute_scales(
weight_scale,
size_k=layer.moe_intermediate_size,  # unused with per-channel scales (group_size == -1)
size_n=N,
group_size=group_size)
for (name, tensor) in [(weight_name, quanted_weight),
(scale_name, weight_scale)]:
setattr(
layer, name,
layer.create_parameter(
shape=tensor.shape,
dtype=tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(tensor)
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Marlin compute Fused MoE.
"""
token_num = x.shape[0]
top_k = layer.top_k
moe_intermediate_size = layer.moe_intermediate_size
hidden_size = layer.hidden_size
num_experts = layer.num_experts
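# note: gate_out is recomputed from the layer's own gate weight, overriding the argument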
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
layer.gate_correction_bias,
top_k,
True, # apply_norm_weight,
False,
)
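# pick the smallest tile size m that covers the average number of tokens per
# expert (token_num * top_k / num_experts), falling back to 64 for large batches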
block_size_m = 64
for m in [8, 16, 32, 48, 64]:
if token_num * top_k / num_experts / m < 0.9:
block_size_m = m
break
topk = top_k
# workspace sized for the 132 SMs of an H100
workspace = paddle.empty([528], dtype="int32")
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
topk_ids, num_experts, block_size_m)
ffn_out = MoeWna16MarlinGemmApi(
x,
c_or_none=None,
b_q_weight=layer.moe_ffn1_weight,
b_scales=layer.moe_ffn1_weight_scale,
global_scale_or_none=None,
b_zeros_or_none=None,
g_idx_or_none=None,
perm_or_none=None,
workspace=workspace,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
topk_weights=topk_weights,
moe_block_size=block_size_m,
top_k=topk,
mul_topk_weights=False,
is_ep=False,
b_q_type_str="uint4b8",
size_m=token_num,
size_n=moe_intermediate_size * 2,
size_k=hidden_size,
is_k_full=True,
use_atomic_add=True,
use_fp32_reduce=True,
is_zp_float=False)[0]
swiglu_out = paddle.incubate.nn.functional.swiglu(ffn_out)
ffn_out = MoeWna16MarlinGemmApi(
swiglu_out,
c_or_none=None,
b_q_weight=layer.moe_ffn2_weight,
b_scales=layer.moe_ffn2_weight_scale,
global_scale_or_none=None,
b_zeros_or_none=None,
g_idx_or_none=None,
perm_or_none=None,
workspace=workspace,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
topk_weights=topk_weights,
moe_block_size=block_size_m,
top_k=1,
mul_topk_weights=True,
is_ep=False,
b_q_type_str="uint4b8",
size_m=token_num * topk,
size_n=hidden_size,
size_k=moe_intermediate_size,
is_k_full=True,
use_atomic_add=True,
use_fp32_reduce=True,
is_zp_float=False)[0]
ffn_out.reshape_([token_num, -1, hidden_size])
ffn_out = ffn_out.sum(axis=1)
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(ffn_out)
return ffn_out
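As a self-contained illustration of the nibble packing built by the `res |= tmp << (j * 4)` loop in create_weights above (the Marlin repack and scale permutation that follow it are not reproduced); numpy stand-in, not FastDeploy code.
import numpy as np

def pack_int4_along_k(q):
    # q: [E, K, N] unsigned 4-bit values in [0, 15]; pack eight consecutive
    # K entries into one int32, lowest nibble first, as the loop above does.
    e, k, n = q.shape
    assert k % 8 == 0
    q = q.reshape(e, k // 8, 8, n).astype(np.int32)
    packed = np.zeros((e, k // 8, n), dtype=np.int32)
    for j in range(8):
        packed |= q[:, :, j, :] << (j * 4)
    return packed

q = np.random.randint(0, 16, size=(2, 16, 4))
packed = pack_int4_along_k(q)
# recover the second nibble of each packed word to verify the layout
assert np.all(((packed >> 4) & 0xF) == q.reshape(2, 2, 8, 4)[:, :, 1, :])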

View File

@@ -1,57 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
class FusedMoEMethodBase(QuantMethodBase):
"""
All MoE methods must inherit this class
and implement the following methods.
"""
@abstractmethod
def create_weights(self,
layer: nn.Layer,
moe_compute_params,
ffn1_tensor,
ffn2_tensor,
ffn1_bias=None,
ffn2_bias=None):
"""
Create weights; subclasses must implement this method.
"""
raise NotImplementedError
@abstractmethod
def apply(
self,
layer: nn.Layer,
moe_compute_params,
x: paddle.Tensor,
) -> paddle.Tensor:
"""
Compute method; subclasses must implement this method.
"""
raise NotImplementedError

View File

@@ -0,0 +1,479 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.utils import (create_hadamard_matrix_map,
get_tensor)
from fastdeploy.utils import ceil_div
from ..quantization.quant_base import QuantMethodBase
class TritonWeightOnlyMoEMethod(QuantMethodBase):
"""
Use Triton Group Gemm to compute Fused MoE.
"""
def __init__(self, quant_method=None):
"""
Triton Group Gemm to compute Fused MoE.
"""
self.quant_method = quant_method
self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
self.added_scale_attrs = [
"moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
]
def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
"""process_prequanted_weights"""
pass
def create_weights(self, layer: nn.Layer, state_dict):
"""
Triton MoE create weight process.
"""
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
assert len(ffn1_weights) == layer.num_local_experts
assert len(ffn2_weights) == layer.num_local_experts
assert layer.quant_method.quant_config.name() == "wint8"
assert ffn1_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
ffn2_tensor = paddle.stack(ffn2_weights, axis=0)
if self.quant_config.name() == "wint8":
max_bound = 127
elif self.quant_config.name() == "wint4":
max_bound = 7
for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
quanted_weight_scale = weight_tensor.abs().max(axis=1)
quanted_weight = weight_tensor / quanted_weight_scale[:,
None, :] * max_bound
quanted_weight = paddle.round(quanted_weight).astype("int8")
quanted_weight_scale = quanted_weight_scale / max_bound
setattr(
layer, weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, weight_name).set_value(quanted_weight)
setattr(
layer, scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
getattr(layer, scale_name).set_value(quanted_weight_scale)
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Triton compute Fused MoE.
"""
token_num = x.shape[0]
top_k = layer.top_k
num_local_experts = layer.num_local_experts
moe_intermediate_size = layer.moe_intermediate_size
hidden_size = layer.hidden_size
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
scores = paddle.nn.functional.softmax(gate_out, axis=-1)
topk_weights, topk_ids = paddle.topk(scores,
k=top_k,
axis=-1,
sorted=False)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
intermediate_cache1 = paddle.empty(
[token_num * top_k, moe_intermediate_size * 2],
dtype=x.dtype,
)
intermediate_cache2 = paddle.empty(
(token_num * top_k, moe_intermediate_size),
dtype=x.dtype,
)
intermediate_cache3 = paddle.empty(
(token_num * top_k, hidden_size),
dtype=x.dtype,
)
config = {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
}
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
from .triton_moe_kernels import fused_moe_kernel_paddle
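# group the (token, expert) assignments by expert id and pad every expert's
# segment to a multiple of BLOCK_SIZE_M for the grouped Triton GEMM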
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
max_num_tokens_padded = sorted_token_ids.shape[0]
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )
fused_moe_kernel_paddle[grid](
x,
layer.moe_ffn1_weight,
intermediate_cache1,
None,
layer.moe_ffn1_weight_scale,
None,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
moe_intermediate_size * 2,
hidden_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=x.strides[0],
stride_ak=x.strides[1],
stride_be=layer.moe_ffn1_weight.strides[0],
stride_bk=layer.moe_ffn1_weight.strides[1],
stride_bn=layer.moe_ffn1_weight.strides[2],
stride_cm=intermediate_cache1.strides[0],
stride_cn=intermediate_cache1.strides[1],
#
stride_asm=-1,
stride_ask=-1,
stride_bse=layer.moe_ffn1_weight_scale.strides[0],
stride_bsk=-1,
stride_bsn=layer.moe_ffn1_weight_scale.strides[1],
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=False,
top_k=top_k,
compute_type_enum=1,
use_fp8_w8a8=False,
use_int8_w8a16=True,
even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
fused_moe_kernel_paddle[grid](
intermediate_cache2,
layer.moe_ffn2_weight,
intermediate_cache3,
None,
layer.moe_ffn2_weight_scale,
topk_weights,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
hidden_size,
moe_intermediate_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=intermediate_cache2.strides[0],
stride_ak=intermediate_cache2.strides[1],
stride_be=layer.moe_ffn2_weight.strides[0],
stride_bk=layer.moe_ffn2_weight.strides[1],
stride_bn=layer.moe_ffn2_weight.strides[2],
stride_cm=intermediate_cache3.strides[0],
stride_cn=intermediate_cache3.strides[1],
stride_asm=-1,
stride_ask=-1,
stride_bse=layer.moe_ffn2_weight_scale.strides[0],
stride_bsk=-1,
stride_bsn=layer.moe_ffn2_weight_scale.strides[1],
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=True,
top_k=1,
compute_type_enum=1,
use_fp8_w8a8=False,
use_int8_w8a16=True,
even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache3.reshape_([token_num, top_k, hidden_size])
out = intermediate_cache3.sum(axis=1)
return out
class TensorWiseFP8MoEMethod(QuantMethodBase):
"""
Use Triton Group Gemm with tensor-wise FP8 quantization to compute Fused MoE.
"""
def __init__(self, quant_method=None):
"""
Triton Group Gemm to compute Fused MoE.
"""
self.quant_method = quant_method
def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
"""process_prequanted_weights"""
ffn1_tensor, ffn2_tensor = layer.extract_moe_ffn_weights(state_dict)
assert ffn1_tensor[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
]
assert ffn2_tensor[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
]
ffn1_tensor = paddle.stack(ffn1_tensor, axis=0)
ffn2_tensor = paddle.stack(ffn2_tensor, axis=0)
added_wfp8afp8_attrs = [
"moe_ffn1_weight", "moe_ffn2_weight", "moe_ffn1_weight_scale",
"moe_ffn2_weight_scale", "moe_ffn1_in_scale", "moe_ffn2_in_scale"
]
def _extract_scale_tensor(key_template):
result = []
for i in range(layer.num_experts):
result.append(
get_tensor(state_dict.pop(key_template.format(i))))
return paddle.concat(result).cast("float32")
weight_key_map = layer.weight_key_map
moe_ffn1_weight_scale = _extract_scale_tensor(
weight_key_map["ffn1_expert_weight_scale_key"])
moe_ffn2_weight_scale = _extract_scale_tensor(
weight_key_map["ffn2_expert_weight_scale_key"])
moe_ffn1_in_scale = _extract_scale_tensor(
weight_key_map["ffn1_expert_in_scale_key"])
moe_ffn2_in_scale = _extract_scale_tensor(
weight_key_map["ffn2_expert_in_scale_key"])
for idx, weight_tensor in enumerate([
ffn1_tensor, ffn2_tensor, moe_ffn1_weight_scale,
moe_ffn2_weight_scale, moe_ffn1_in_scale, moe_ffn2_in_scale
]):
name = added_wfp8afp8_attrs[idx]
setattr(
layer, name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(weight_tensor)
def create_weights(self, layer: nn.Layer, state_dict):
"""
Triton MoE create weight process.
"""
pass
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Triton compute Fused MoE.
"""
token_num = x.shape[0]
top_k = layer.top_k
num_local_experts = layer.num_local_experts
moe_intermediate_size = layer.moe_intermediate_size
hidden_size = layer.hidden_size
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
scores = paddle.nn.functional.softmax(gate_out, axis=-1)
topk_weights, topk_ids = paddle.topk(scores,
k=top_k,
axis=-1,
sorted=False)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
intermediate_cache1 = paddle.empty(
[token_num * top_k, moe_intermediate_size * 2],
dtype=x.dtype,
)
intermediate_cache2 = paddle.empty(
(token_num * top_k, moe_intermediate_size),
dtype=x.dtype,
)
intermediate_cache3 = paddle.empty(
(token_num * top_k, hidden_size),
dtype=x.dtype,
)
config = {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
}
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
max_num_tokens_padded = sorted_token_ids.shape[0]
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )
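# rotate activations with a Hadamard matrix before FP8 quantization to spread
# outliers across channels (paired with the per-expert input scales below)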
hadamard_matrix = create_hadamard_matrix_map[hidden_size]
x = paddle.matmul(x.cast("float32"), hadamard_matrix)
permute_x = x[:, None, :].tile([1, top_k, 1])
permute_x = permute_x.reshape([-1, hidden_size])
quant_activation_scale = layer.moe_ffn1_in_scale[topk_ids].reshape(
[-1, 1])
permute_x = permute_x / quant_activation_scale
permute_x = permute_x.astype("float8_e4m3fn")
from .triton_moe_kernels import fused_moe_kernel_paddle
fused_moe_kernel_paddle[grid](
permute_x,
layer.moe_ffn1_weight.view(paddle.float8_e4m3fn),
intermediate_cache1,
layer.moe_ffn1_in_scale,
layer.moe_ffn1_weight_scale,
None,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
moe_intermediate_size * 2,
hidden_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=permute_x.strides[0],
stride_ak=permute_x.strides[1],
stride_be=layer.moe_ffn1_weight.strides[0],
stride_bk=layer.moe_ffn1_weight.strides[1],
stride_bn=layer.moe_ffn1_weight.strides[2],
stride_cm=intermediate_cache1.strides[0],
stride_cn=intermediate_cache1.strides[1],
#
stride_asm=-1, # only used in blockwise fp8
stride_ask=-1, # only used in blockwise fp8
stride_bse=-1,
stride_bsk=-1,
stride_bsn=-1,
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=False,
top_k=1,
compute_type_enum=1,
use_fp8_w8a8=True,
use_int8_w8a16=False,
even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
hadamard_matrix = create_hadamard_matrix_map[moe_intermediate_size]
intermediate_cache2 = paddle.matmul(
intermediate_cache2.cast("float32"), hadamard_matrix)
quant_activation_scale = layer.moe_ffn2_in_scale[topk_ids].reshape(
[-1, 1])
intermediate_cache2 = intermediate_cache2 / quant_activation_scale
intermediate_cache2 = intermediate_cache2.astype("float8_e4m3fn")
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
fused_moe_kernel_paddle[grid](
intermediate_cache2,
layer.moe_ffn2_weight.view(paddle.float8_e4m3fn),
intermediate_cache3,
layer.moe_ffn2_in_scale,
layer.moe_ffn2_weight_scale,
topk_weights,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
hidden_size,
moe_intermediate_size,
max_num_tokens_padded,
token_num * top_k,
stride_am=intermediate_cache2.strides[0],
stride_ak=intermediate_cache2.strides[1],
stride_be=layer.moe_ffn2_weight.strides[0],
stride_bk=layer.moe_ffn2_weight.strides[1],
stride_bn=layer.moe_ffn2_weight.strides[2],
stride_cm=intermediate_cache3.strides[0],
stride_cn=intermediate_cache3.strides[1],
stride_asm=-1,
stride_ask=-1,
stride_bse=-1,
stride_bsk=-1,
stride_bsn=-1,
group_n=-1,
group_k=-1,
# Meta-parameters
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
GROUP_SIZE_M=config["GROUP_SIZE_M"],
MUL_ROUTED_WEIGHT=True,
top_k=1,
compute_type_enum=1,
use_fp8_w8a8=True,
use_int8_w8a16=False,
even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache3.reshape_([token_num, top_k, hidden_size])
out = intermediate_cache3.sum(axis=1)
if layer.tp_size > 1:
tensor_model_parallel_all_reduce(out)
return out
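The routing used by both Triton paths above (softmax over experts, top-k, renormalize the selected weights) in a minimal numpy form, for reference only:
import numpy as np

def route_topk(gate_logits, top_k):
    # Mirrors the softmax -> paddle.topk -> renormalize sequence above.
    z = gate_logits - gate_logits.max(axis=-1, keepdims=True)
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    topk_ids = np.argsort(-probs, axis=-1)[:, :top_k]
    topk_w = np.take_along_axis(probs, topk_ids, axis=-1)
    return topk_ids, topk_w / topk_w.sum(axis=-1, keepdims=True)

ids, w = route_topk(np.random.randn(4, 8), top_k=2)
print(ids.shape, w.sum(axis=-1))  # (4, 2) [1. 1. 1. 1.]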

View File

@@ -0,0 +1,236 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from ..quantization.quant_base import QuantMethodBase
from ..utils import create_and_set_parameter, get_tensor
class Wint2MoeMethod(QuantMethodBase):
"""
Wint2 method for computing Fused MoE.
"""
def __init__(self, quant_config):
super().__init__()
self.moe_quant_type = quant_config.moe_quant_type
def process_loaded_weights(self, layer, weights) -> None:
"""
process_loaded_weights
"""
pass
def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
"""
check layer is valid for this method
"""
assert len(
ffn1_weights
) == layer.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
assert len(
ffn2_weights
) == layer.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
def create_weights(self, layer: nn.Layer, state_dict):
"""
Wint2 create weight process (no-op here).
"""
pass
class TritonWint2FusedMoeMethod(Wint2MoeMethod):
"""
Use Triton Group Gemm to compute Fused MoE.
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.moe_quant_type = quant_config.moe_quant_type
def process_loaded_weights(self, layer, weights) -> None:
"""
process_loaded_weights
"""
pass
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Process pre-quantized weights for the Wint2 backend.
"""
ffn1_expert_weight_key = layer.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = layer.weight_key_map.get(
"ffn2_expert_weight_key", None)
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
"ffn1_expert_weight_scale_key", None)
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
"ffn2_expert_weight_scale_key", None)
ffn1_expert_super_scales_key = layer.weight_key_map.get(
"ffn1_expert_super_scales_key", None)
ffn2_expert_super_scales_key = layer.weight_key_map.get(
"ffn2_expert_super_scales_key", None)
ffn1_expert_code_scale_key = layer.weight_key_map.get(
"ffn1_expert_code_scale_key", None)
ffn2_expert_code_scale_key = layer.weight_key_map.get(
"ffn2_expert_code_scale_key", None)
ffn1_expert_code_zp_key = layer.weight_key_map.get(
"ffn1_expert_code_zp_key", None)
ffn2_expert_code_zp_key = layer.weight_key_map.get(
"ffn2_expert_code_zp_key", None)
ffn1_weights, ffn2_weights = layer.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
# self.check(layer, ffn1_weights, ffn2_weights)
ffn1_weight_scale = []
ffn2_weight_scale = []
ffn1_super_scales = []
ffn2_super_scales = []
ffn1_code_scale = []
ffn2_code_scale = []
ffn1_code_zp = []
ffn2_code_zp = []
for i in range(layer.num_experts):
expert_idx = layer.expert_id_offset + i
ffn1_weight_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_scale_key.format(expert_idx))))
ffn2_weight_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_scale_key.format(expert_idx))))
ffn1_super_scales.append(
get_tensor(
state_dict.pop(
ffn1_expert_super_scales_key.format(expert_idx))))
ffn2_super_scales.append(
get_tensor(
state_dict.pop(
ffn2_expert_super_scales_key.format(expert_idx))))
ffn1_code_scale.append(
get_tensor(
state_dict.pop(
ffn1_expert_code_scale_key.format(expert_idx))))
ffn2_code_scale.append(
get_tensor(
state_dict.pop(
ffn2_expert_code_scale_key.format(expert_idx))))
ffn1_code_zp.append(
get_tensor(
state_dict.pop(
ffn1_expert_code_zp_key.format(expert_idx))))
ffn2_code_zp.append(
get_tensor(
state_dict.pop(
ffn2_expert_code_zp_key.format(expert_idx))))
ffn1_weight = paddle.stack(ffn1_weights, axis=0)
ffn2_weight = paddle.stack(ffn2_weights, axis=0)
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0)
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0)
ffn1_super_scales = paddle.stack(ffn1_super_scales, axis=0)
ffn2_super_scales = paddle.stack(ffn2_super_scales, axis=0)
ffn1_code_scale = paddle.stack(ffn1_code_scale, axis=0)
ffn2_code_scale = paddle.stack(ffn2_code_scale, axis=0)
ffn1_code_zp = paddle.stack(ffn1_code_zp, axis=0)
ffn2_code_zp = paddle.stack(ffn2_code_zp, axis=0)
name_tensor_map = {
"moe_ffn1_weight": ffn1_weight,
"moe_ffn2_weight": ffn2_weight,
"moe_ffn1_weight_scale": ffn1_weight_scale,
"moe_ffn2_weight_scale": ffn2_weight_scale,
"moe_ffn1_super_scales": ffn1_super_scales,
"moe_ffn2_super_scales": ffn2_super_scales,
"moe_ffn1_code_scale": ffn1_code_scale,
"moe_ffn2_code_scale": ffn2_code_scale,
"moe_ffn1_code_zp": ffn1_code_zp,
"moe_ffn2_code_zp": ffn2_code_zp
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
def create_weights(self, layer: nn.Layer, state_dict):
"""
Wint2 create weight process (no-op; weights are loaded via process_prequanted_weights).
"""
pass
def apply(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
) -> paddle.Tensor:
"""
Use Wint2 Triton Fusedmoe compute Fused MoE.
"""
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
(
permute_input,
token_nums_per_expert,
permute_indices_per_token,
topk_weights,
topk_idx,
expert_idx_per_token,
) = moe_expert_dispatch(
x,
gate_out,
layer.gate_correction_bias,
(layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
else None), # if set, permute_input will be int8_t
layer.top_k,
False,
topk_only_mode=False,
)
ffn_out = fastdeploy.model_executor.ops.gpu.moe_expert_ffn_wint2(
permute_input,
token_nums_per_expert,
layer.moe_ffn1_weight,
layer.moe_ffn2_weight,
None,
layer.moe_ffn1_super_scales,
layer.moe_ffn2_super_scales,
layer.moe_ffn1_weight_scale,
layer.moe_ffn1_code_scale,
layer.moe_ffn1_code_zp,
layer.moe_ffn2_weight_scale,
layer.moe_ffn2_code_scale,
layer.moe_ffn2_code_zp,
False,
)
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
fused_moe_out = moe_expert_reduce(
ffn_out,
topk_weights,
permute_indices_per_token,
topk_idx,
None,
norm_topk_prob=True,
routed_scaling_factor=1.0,
)
return fused_moe_out
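Conceptually, moe_expert_dispatch and moe_expert_reduce implement the permute/compute/unpermute pattern sketched below; this numpy skeleton ignores the quantization, padding, and wint2 decoding handled by the CUDA ops above.
import numpy as np

def dispatch_combine(x, topk_ids, topk_w, expert_fn):
    # Gather each token once per selected expert, run experts on contiguous
    # per-expert slices, then scatter-add the results back with router weights.
    t = x.shape[0]
    flat_expert = topk_ids.reshape(-1)
    flat_token = np.repeat(np.arange(t), topk_ids.shape[1])
    order = np.argsort(flat_expert, kind="stable")
    permuted = x[flat_token[order]]
    out_rows = np.empty_like(permuted)
    for e in np.unique(flat_expert):
        sel = flat_expert[order] == e
        out_rows[sel] = expert_fn(e, permuted[sel])
    y = np.zeros_like(x)
    np.add.at(y, flat_token[order],
              out_rows * topk_w.reshape(-1)[order][:, None])
    return y

y = dispatch_combine(np.random.randn(6, 4),
                     np.random.randint(0, 3, size=(6, 2)),
                     np.full((6, 2), 0.5),
                     lambda e, h: h * (e + 1))
print(y.shape)  # (6, 4)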

View File

@@ -1,273 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.moe.moe import MoELayer
from fastdeploy.model_executor.layers.utils import get_tensor
class TextMoELayer(MoELayer):
"""
MoELayer is a layer that performs MoE (Mixture of Experts) computation.
"""
def __init__(
self,
*args,
**kwargs,
):
"""
Initialization function that sets up the layer's attributes.
Args:
- args (tuple, optional): variable-length positional arguments, empty tuple by default.
- kwargs (dict, optional): keyword-argument dict, empty dict by default.
Returns:
None; the layer's attributes are modified in place.
"""
kwargs["moe_tag"] = "Text"
super().__init__(*args, **kwargs)
def load_gate_state_dict(self, state_dict):
"""
Load the gate state dict, used to initialize the network parameters.
Pops the expert projection weights from the given state dict and returns them.
Args:
state_dict (OrderedDict): dict containing the network's expert parameters.
Returns:
tuple: the up-gate projection weights, the down projection weights, and their
(empty) scale lists; each weight list has one entry per expert.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(0, self.num_experts):
up_gate_proj_weight.append(
get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
)
down_proj_weight.append(
get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
)
return (
up_gate_proj_weight,
down_proj_weight,
up_gate_proj_weight_scale,
down_proj_weight_scale,
)
def load_gate_correction_bias(self, state_dict):
"""
Load the gate correction bias. If the gate correction bias is enabled, fetch the
corresponding tensor from state_dict and assign it to the gate correction bias.
Args:
state_dict (OrderedDict): dict containing the model parameters and state.
Returns:
None; the gate correction bias is updated in place.
"""
if self.moe_config.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict[self.gate_correction_bias_key]
)
self.gate_correction_bias.set_value(
gate_correction_bias_tensor[0].unsqueeze(0)
)
class ImageMoELayer(MoELayer):
"""
MoELayer is a layer that performs MoE (Mixture of Experts) computation.
"""
def __init__(
self,
*args,
**kwargs,
):
"""
Initialization function that sets up the layer's attributes.
Args:
- args (tuple, optional): variable-length positional arguments, empty tuple by default.
- kwargs (dict, optional): keyword-argument dict, empty dict by default.
Returns:
None; the layer's attributes are modified in place.
"""
moe_quant_type = os.getenv("ELLM_MM_IMAGE_QUANT_TYPE", None)
if moe_quant_type is not None:
kwargs["moe_quant_type"] = moe_quant_type
kwargs["moe_tag"] = "Image"
super().__init__(*args, **kwargs)
def load_gate_state_dict(self, state_dict):
"""
Load the gate state dict.
Pops and returns the up-gate projection weights and down projection weights of the
image experts (expert indices num_experts to 2 * num_experts) from the given state dict.
Args:
state_dict (OrderedDict): ordered dict containing the network parameters.
Returns:
tuple: the up-gate projection weights, the down projection weights, and their
(empty) scale lists; each weight list has one entry per expert.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(self.num_experts, self.num_experts + self.num_experts):
up_gate_proj_weight.append(
get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
)
down_proj_weight.append(
get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
)
return (
up_gate_proj_weight,
down_proj_weight,
up_gate_proj_weight_scale,
down_proj_weight_scale,
)
def load_gate_correction_bias(self, state_dict):
"""
Load the gate correction bias. If the gate correction bias is enabled, fetch it
from state_dict and assign it to gate_correction_bias.
Args:
state_dict (OrderedDict): model state dict containing all parameters to be loaded.
Returns:
None; gate_correction_bias is updated in place.
"""
if self.moe_config.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict[self.gate_correction_bias_key]
)
self.gate_correction_bias.set_value(
gate_correction_bias_tensor[1].unsqueeze(0)
)
class MultimodalityMoeLayer(nn.Layer):
"""
Multimodality MOE Layer
"""
def __init__(
self,
inference_args,
layer_name,
layer_idx,
):
"""
Initialize the MoE layer.
Args:
inference_args (InferenceArgs): inference arguments containing all required configuration.
layer_name (str): name of the current MoE layer.
layer_idx (int): index of the current MoE layer in the model.
Returns:
None.
"""
super().__init__()
self.text_moe_layer = TextMoELayer(
inference_args=inference_args,
moe_config=inference_args.moe_config,
layer_name=layer_name + ".text",
gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight",
ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.up_gate_proj.weight",
ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.down_proj.weight",
gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
ffn1_bias_key=None,
ffn2_bias_key=None,
ffn1_shared_weight_key=None,
ffn1_shared_bias_key=None,
ffn2_shared_weight_key=None,
ffn2_shared_bias_key=None,
layer_idx=layer_idx,
)
self.image_moe_layer = ImageMoELayer(
inference_args=inference_args,
moe_config=inference_args.moe_config_1,
layer_name=layer_name + ".image",
gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight_1",
ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.up_gate_proj.weight",
ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
+ ".{}.down_proj.weight",
gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
ffn1_bias_key=None,
ffn2_bias_key=None,
ffn1_shared_weight_key=None,
ffn1_shared_bias_key=None,
ffn2_shared_weight_key=None,
ffn2_shared_bias_key=None,
layer_idx=layer_idx,
)
def load_state_dict(self, state_dict):
"""
Load the model parameters.
Loads the text and image sub-layers from the given dict; consumed keys are popped
from state_dict.
Args:
state_dict (dict): dict containing the model parameters to load.
Returns:
None.
"""
self.text_moe_layer.load_state_dict(state_dict)
self.image_moe_layer.load_state_dict(state_dict)
state_dict.pop(self.text_moe_layer.gate_correction_bias_key)
def forward(self, x, **kwargs):
"""
Forward computation: route each input token to the text or image MoE layer and
return the combined result.
Args:
x (Tensor): input tensor of shape [token_num, hidden_size], where token_num is the
sequence length and hidden_size is the hidden dimension.
kwargs (dict, optional): keyword arguments; must contain:
- token_type_ids (Tensor): tensor marking each token as text (0) or image (1).
Returns:
Tensor: a tensor with the same shape as the input, containing the processed result.
Raises:
AssertionError: raised when token_type_ids is not provided.
"""
token_type_ids = kwargs.get("token_type_ids", None)
assert token_type_ids is not None
# x.shape is [token_num, hidden_size]
fused_moe_out = paddle.zeros_like(x)
text_mask = token_type_ids == 0 # [token_num]
image_mask = token_type_ids == 1
if text_mask.any():
text_out = self.text_moe_layer(x[text_mask])
fused_moe_out[text_mask] = text_out
if image_mask.any():
image_out = self.image_moe_layer(x[image_mask])
fused_moe_out[image_mask] = image_out
return fused_moe_out

View File

@@ -1,5 +1,5 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,34 +14,13 @@
# limitations under the License.
"""
from dataclasses import dataclass
import paddle
from paddle import nn
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.model_executor.layers.utils import get_tensor
from .cutlass_fused_moe import CutlassFusedMoeMethod
@dataclass
class MoEComputeParams:
"""
some params for computing MoE.
it is given to different compute methods.
"""
global_num_experts: int = -1
top_k: int = -1
hidden_size: int = -1
num_local_experts: int = -1
moe_intermediate_size: int = -1
tp_size: int = -1
ep_size: int = -1
dp_size: int = -1
moe_quant_type: str = ""
class FusedMoE(nn.Layer):
"""
@@ -50,174 +29,195 @@ class FusedMoE(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
moe_intermediate_size: int = -1,
num_experts: int = -1,
expert_id_offset: int = 0,
top_k: int = -1,
moe_use_gate_correction_bias: bool = False,
moe_quant_type: str = "weight_only_int4",
layer_idx: int = -1,
gate_weight_key=None,
gate_correction_bias_key=None,
ffn1_expert_weight_key=None,
ffn2_expert_weight_key=None,
moe_ffn1_bias_keys=None,
moe_ffn2_bias_keys=None,
moe_ffn1_weight_scale_keys=None,
moe_ffn2_weight_scale_keys=None,
moe_ffn1_in_scale_keys=None,
moe_ffn2_in_scale_keys=None,
moe_tag: str = "",
weight_key_map: dict = {},
):
"""
Initialize the Moe layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.layer_idx = layer_idx
self.tp_size = llm_config.parallel_config.mp_size
self.ep_size = llm_config.parallel_config.ep_size
self.moe_use_gate_correction_bias = moe_use_gate_correction_bias
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
self.ep_size = fd_config.parallel_config.expert_parallel_degree
self.ep_rank = fd_config.parallel_config.expert_parallel_rank
assert (self.tp_size >= 1 and self.ep_size == 1) or \
(self.tp_size == 1 and self.ep_size > 1), \
'MoE only support parallelism on TP or EP dimension.'
self.hidden_size = fd_config.model_config.hidden_size
self.moe_config = fd_config.moe_config
self.hidden_size = llm_config.model_config.hidden_size
self.moe_config = llm_config.moe_config
self.use_offline_quant = llm_config.tmp_config.use_offline_quant
moe_tag = self.llm_config.moe_config.moe_tag
logger.info(f"{moe_tag}MoE is running in {moe_quant_type} mode")
self.moe_quant_type = moe_quant_type
self.num_experts = num_experts
self.num_local_experts = self.num_experts // self.ep_size
logger.info(f'''MoE config is num_experts:{num_experts},
top_k:{top_k},
hidden_size:{self.hidden_size},
moe_intermediate_size:{moe_intermediate_size}''')
logger.info(
f"MoE is running on moe_quant_type: {self.moe_quant_type}, ep:{self.ep_size}, tp:{self.tp_size} mode"
)
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.gate_weight_key = gate_weight_key
self.gate_correction_bias_key = gate_correction_bias_key
self.top_k = top_k
self.hidden_size = self.hidden_size
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.weight_key_map = weight_key_map
self.ffn1_expert_weight_key = ffn1_expert_weight_key
self.ffn2_expert_weight_key = ffn2_expert_weight_key
self.ffn1_bias_key = moe_ffn1_bias_keys
self.ffn2_bias_key = moe_ffn2_bias_keys
self.use_method = envs.FD_MOE_BACKEND.lower()
self.gate_correction_bias = None
self.moe_tag = moe_tag
if self.moe_quant_type == "w4a8":
# below keys are only used in MoE W4A8!
self.ffn1_expert_weight_scale_key = moe_ffn1_weight_scale_keys
self.ffn2_expert_weight_scale_key = moe_ffn2_weight_scale_keys
self.ffn1_expert_in_scale_key = moe_ffn1_in_scale_keys
self.ffn2_expert_in_scale_key = moe_ffn2_in_scale_keys
if self.ep_size > 1:
expert_id_offset = expert_id_offset + self.ep_rank * self.num_local_experts
self.compute_method = CutlassFusedMoeMethod()
self.expert_id_offset = expert_id_offset
self.moe_compute_params = MoEComputeParams()
self.moe_compute_params.global_num_experts = self.num_experts
self.moe_compute_params.top_k = top_k
self.moe_compute_params.hidden_size = self.hidden_size
self.moe_compute_params.num_local_experts = self.num_local_experts
self.moe_compute_params.moe_quant_type = self.moe_quant_type
self.moe_compute_params.moe_intermediate_size = self.moe_intermediate_size
self.moe_compute_params.ep_size = self.ep_size
self.moe_compute_params.tp_size = self.tp_size
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
else:
# now, no quant method(w_fp16 a_fp16) can't get from quant_config, we will optimize it in future
from .fused_moe_cutlass_backend import CutlassMoEMethod
self.quant_method = CutlassMoEMethod(None)
def load_gate_state_dict(self, state_dict):
if self.ep_size > 1:
self.quant_method.init_ep(self)
logger.info(
f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
{top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
, ep_size={self.ep_size}, \
tp_size={self.tp_size}.")
def load_experts_weight(self, state_dict: dict,
ffn1_expert_weight_key: str,
ffn2_expert_weight_key: str):
"""
load_gate_state_dict function.
Load experts weight from state_dict.
Args:
state_dict (dict): The state_dict of model.
ffn1_expert_weight_key (str): The key of ffn1 expert weight.
ffn2_expert_weight_key (str): The key of ffn2 expert weight.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(self.num_experts):
up_gate_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn1_expert_weight_key.format(j))))
down_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn2_expert_weight_key.format(j))))
return up_gate_proj_weight, down_proj_weight
ffn1_weights = []
ffn2_weights = []
is_ffn_merged = ffn1_expert_weight_key.format(
self.expert_id_offset) in state_dict
if is_ffn_merged:
for i in range(self.num_local_experts):
expert_idx = self.expert_id_offset + i
ffn1_weights.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_key.format(expert_idx))))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
else:
gate_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "gate_proj")
up_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "up_proj")
for j in range(self.num_local_experts):
expert_idx = self.expert_id_offset + j
gate = get_tensor(
state_dict.pop(gate_expert_weight_key.format(expert_idx)))
up = get_tensor(
state_dict.pop(up_expert_weight_key.format(expert_idx)))
ffn1_weights.append(paddle.concat([gate, up], axis=-1))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
return ffn1_weights, ffn2_weights
def load_state_dict(self, state_dict, is_update: bool = False):
def extract_moe_ffn_weights(self, state_dict: dict):
"""
Extract MoE FFN weights from state dict based on weight key mapping.
Args:
state_dict (dict): Model state dictionary containing the weights.
Returns:
tuple: A tuple containing two lists:
- ffn1_weights: List of tensors for first FFN layer weights
- ffn2_weights: List of tensors for second FFN layer weights
Raises:
AssertionError: If required weight keys are missing or number of weights
doesn't match number of local experts.
"""
ffn1_expert_weight_key = self.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = self.weight_key_map.get(
"ffn2_expert_weight_key", None)
assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
ffn1_weights, ffn2_weights = self.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
assert len(
ffn1_weights
) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
assert len(
ffn2_weights
) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
return ffn1_weights, ffn2_weights
def extract_gate_correction_bias(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict.pop(gate_correction_bias_key)).astype("float32")
return gate_correction_bias_tensor
def load_state_dict(self, state_dict):
"""
load_state_dict function.
"""
# gate
if not is_update:
gate_weight_tensor = get_tensor(state_dict.pop(self.gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor)
# gate_correction_bias
self.gate_correction_bias_key = self.weight_key_map.get(
"gate_correction_bias_key", None)
if self.gate_correction_bias_key is not None and self.gate_correction_bias_key in state_dict:
self.moe_use_gate_correction_bias = True
else:
self.moe_use_gate_correction_bias = False
if self.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict.pop(self.gate_correction_bias_key))
gate_correction_bias_tensor = self.extract_gate_correction_bias(
self.gate_correction_bias_key, state_dict)
self.gate_correction_bias = self.create_parameter(
shape=gate_correction_bias_tensor.shape,
dtype="float32",
)
self.gate_correction_bias.set_value(gate_correction_bias_tensor)
gate_weight_key = self.weight_key_map.get("gate_weight_key", None)
assert gate_weight_key is not None, "gate_weight_key should not be None, please check model checkpoints"
gate_weight_tensor = get_tensor(state_dict.pop(gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
if self.fd_config.model_config.is_quantized:
self.quant_method.process_prequanted_weights(self, state_dict)
else:
self.gate_correction_bias = None
self.quant_method.create_weights(self, state_dict)
up_gate_proj_weight, down_proj_weight = self.load_gate_state_dict(
state_dict)
weight1_scale = None
weight2_scale = None
ffn1_in_scale = None
ffn2_in_scale = None
if self.moe_quant_type == "w4a8":
weight1_scale = []
weight2_scale = []
ffn1_in_scale = []
ffn2_in_scale = []
for j in range(self.num_experts):
weight1_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_weight_scale_key.format(
self.layer_idx, j))))
weight2_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_weight_scale_key.format(
self.layer_idx, j))))
ffn1_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_in_scale_key.format(
self.layer_idx, j))))
ffn2_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_in_scale_key.format(
self.layer_idx, j))))
# other weight is with compute_method
# different method may have different way to create weights
self.compute_method.create_weights(self, self.moe_compute_params,
up_gate_proj_weight,
down_proj_weight, None, None,
weight1_scale, weight2_scale,
ffn1_in_scale, ffn2_in_scale)
def forward(self, x, **kwargs):
def forward(self, x: paddle.Tensor):
"""
Defines the forward computation of the moe layer.
@@ -225,13 +225,9 @@ class FusedMoE(nn.Layer):
x (Tensor): Input tensor to the moe layer.
Returns:
Tensor: Output tensor.
Tensor: Output tensor.s
"""
out = self.compute_method.apply(self, self.moe_compute_params, x)
if self.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(out)
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
out = self.quant_method.apply(self, x, gate_out)
return out

View File

@@ -1,126 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.model_executor.layers.moe.moe import MoELayer
class MoeTPDecoerDeepDeepGEMMLayer(MoELayer):
"""
MoeTPDecoerDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
gate_out = paddle.rand(shape=gate_out.shape, dtype=gate_out.dtype)
ffn1_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.moe_intermediate_size * 2,
],
dtype=self._dtype,
)
ffn_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.embed_dim,
],
dtype=self._dtype,
)
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
(
self.gate_correction_bias
if self.moe_config.moe_use_gate_correction_bias
else None
),
self.top_k,
True, # apply_norm_weight
False,
)
permute_input, token_nums_per_expert, permute_indices_per_token = (
fastdeploy.model_executor.ops.gpu.moe_deepgemm_permute(
x, topk_idx, self.num_local_experts, self.max_batch_size
)
)
expected_m = 128
permute_input_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
permute_input, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(permute_input_fp8, scale),
(
self.moe_ffn1_weight,
self.moe_ffn1_weight_scale,
),
ffn1_out,
token_nums_per_expert,
expected_m,
)
act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
ffn1_out, token_nums_per_expert
)
act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
act_out, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(act_out_fp8, scale),
(
self.moe_ffn2_weight,
self.moe_ffn2_weight_scale,
),
ffn_out,
token_nums_per_expert,
expected_m,
)
fused_moe_out = fastdeploy.model_executor.ops.gpu.moe_deepgemm_depermute(
ffn_out, permute_indices_per_token, topk_idx, topk_weights
)[0]
return fused_moe_out
class MoeTPPrefillDeepDeepGEMMLayer(MoELayer):
"""
MoeTPPrefillDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
raise NotImplementedError("Prefill is comming soon...")

View File

@@ -0,0 +1,198 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import triton
import triton.language as tl
@triton.jit
def fused_moe_kernel_paddle(
a_ptr,
b_ptr,
c_ptr,
a_scale_ptr,
b_scale_ptr,
topk_weights_ptr,
sorted_token_ids_ptr,
expert_ids_ptr,
num_tokens_post_padded_ptr,
# Matrix dimensions
N,
K,
num_tokens_post_padded,
num_valid_tokens,
stride_am,
stride_ak,
stride_be,
stride_bk,
stride_bn,
stride_cm,
stride_cn,
stride_asm,
stride_ask,
stride_bse,
stride_bsk,
stride_bsn,
# Block size for block-wise fp8 quantization
group_n: tl.constexpr,
group_k: tl.constexpr,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr,
MUL_ROUTED_WEIGHT: tl.constexpr,
top_k: tl.constexpr,
compute_type_enum: tl.constexpr,
use_fp8_w8a8: tl.constexpr,
use_int8_w8a16: tl.constexpr,
even_Ks: tl.constexpr,
):
"""
Key Parameters:
- A: The input tensor representing tokens with shape (*, K), where '*' can
be any shape representing batches and K is the feature dimension of
each token.
- B: The stacked MOE weight tensor with shape (E, N, K), where E is
the number of experts, K is the input feature dimension, and N is
the output feature dimension.
- C: The output cache tensor with shape (M, topk, N), where M is the
total number of tokens post padding, topk is the number of times
each token is repeated, and N is the output feature dimension.
- sorted_token_ids: A tensor containing the sorted indices of tokens,
repeated topk times and arranged by the expert index they are
assigned to.
- expert_ids: A tensor containing the indices of the expert for each
block. It determines which expert matrix from B should be used for
each block in A.
This kernel performs the multiplication of a token by its corresponding
expert matrix as determined by `expert_ids`. The sorting of
`sorted_token_ids` by expert index and padding ensures divisibility by
BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
multiplication across different blocks processed by the same expert.
"""
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
assert compute_type_enum == 1
compute_type = tl.bfloat16
num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
return
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
token_mask = offs_token < num_valid_tokens
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
offs_k[None, :] * stride_ak)
off_experts = tl.load(expert_ids_ptr + pid_m)
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
offs_bn[None, :] * stride_bn)
if use_int8_w8a16:
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
None, :] * stride_bsn
b_scale = tl.load(b_scale_ptrs)
if use_fp8_w8a8:
if group_k > 0 and group_n > 0:
a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
offs_bsn = offs_bn // group_n
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
else:
# (Zkk): every expert has one activation scale and weight scale.
a_scale = tl.load(a_scale_ptr + off_experts)
b_scale = tl.load(b_scale_ptr + off_experts)
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
if even_Ks:
a = tl.load(
a_ptrs,
mask=token_mask[:, None],
other=0.0,
)
b = tl.load(b_ptrs,
cache_modifier=".cv",
eviction_policy='evict_first')
else:
a = tl.load(
a_ptrs,
mask=token_mask[:, None] &
(offs_k[None, :] < K - k * BLOCK_SIZE_K),
other=0.0,
)
b = tl.load(b_ptrs,
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
other=0.0)
# We accumulate along the K dimension.
if use_int8_w8a16:
accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
elif use_fp8_w8a8:
if group_k > 0 and group_n > 0:
k_start = k * BLOCK_SIZE_K
offs_ks = k_start // group_k
a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
mask=token_mask,
other=0.0)
b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
accumulator += tl.dot(a, b) * a_scale[:,
None] * b_scale[None, :]
else:
accumulator = tl.dot(a, b, acc=accumulator)
else:
accumulator += tl.dot(a, b)
a_ptrs += BLOCK_SIZE_K * stride_ak
b_ptrs += BLOCK_SIZE_K * stride_bk
if MUL_ROUTED_WEIGHT:
moe_weight = tl.load(topk_weights_ptr + offs_token,
mask=token_mask,
other=0)
accumulator = accumulator * moe_weight[:, None]
if use_int8_w8a16:
accumulator = (accumulator * b_scale).to(compute_type)
elif use_fp8_w8a8:
if group_k > 0 and group_n > 0:
accumulator = accumulator.to(compute_type)
else:
accumulator = (accumulator * a_scale * b_scale).to(compute_type)
else:
accumulator = accumulator.to(compute_type)
# Write back the block of the output
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
None, :]
c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
tl.store(c_ptrs, accumulator, mask=c_mask)
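To make the token layout described in the kernel docstring concrete, here is a small NumPy sketch of how top-k routed tokens could be grouped per expert and padded to BLOCK_SIZE_M. align_tokens_by_expert is a hypothetical helper for illustration only, not the actual alignment op used with this kernel.

import numpy as np

def align_tokens_by_expert(topk_ids: np.ndarray, num_experts: int, block_m: int):
    # topk_ids: [num_tokens, top_k] expert assignment per (token, slot) pair.
    num_tokens, top_k = topk_ids.shape
    flat = topk_ids.reshape(-1)                      # row i*top_k + j -> token i, slot j
    order = np.argsort(flat, kind="stable")          # group (token, slot) pairs by expert
    pad_id = num_tokens * top_k                      # == num_valid_tokens, masked out in the kernel
    sorted_token_ids, expert_ids = [], []
    for e in range(num_experts):
        ids = order[flat[order] == e]
        pad = (-len(ids)) % block_m                  # pad each expert group up to BLOCK_SIZE_M
        ids = np.concatenate([ids, np.full(pad, pad_id, dtype=np.int64)])
        sorted_token_ids.append(ids)
        expert_ids += [e] * (len(ids) // block_m)    # one expert id per M-block
    return np.concatenate(sorted_token_ids), np.array(expert_ids)

topk_ids = np.array([[0, 1], [1, 1], [0, 1]])        # 3 tokens, top_k = 2, 2 experts
tokens, experts = align_tokens_by_expert(topk_ids, num_experts=2, block_m=4)
print(tokens)    # [0 4 6 6 1 2 3 5] -> entries equal to 6 are padding, rejected by token_mask
print(experts)   # [0 1] -> block 0 multiplies with expert 0, block 1 with expert 1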

View File

@@ -28,18 +28,19 @@ class RMSNorm(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
hidden_size,
eps=1e-5,
prefix="",
linear_bias=None,
quant_scale=None,
begin_norm_axis=1,
):
"""
Initializes the normalization layer.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
hidden_size (int) : size of hidden state.
@@ -52,7 +53,7 @@ class RMSNorm(nn.Layer):
NotImplementedError: If the specified norm_type is not supported.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.prefix = prefix
self.hidden_size = hidden_size
if len(prefix) == 0:
@@ -66,6 +67,11 @@ class RMSNorm(nn.Layer):
self.quant_scale = quant_scale
self._dtype = self._helper.get_default_dtype()
self._norm_weight_dtype = self._dtype
self.begin_norm_axis = begin_norm_axis
self.quant_round_type = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
self.quant_max_bound = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
self.quant_min_bound = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
self.begin_norm_axis = begin_norm_axis
self.init_weight()
@@ -118,13 +124,13 @@ class RMSNorm(nn.Layer):
norm_weight=self.ln_weight,
norm_bias=None,
epsilon=self.eps,
begin_norm_axis=1,
begin_norm_axis=self.begin_norm_axis,
bias=self.linear_bias,
residual=residual_input,
quant_scale=-1 if self.quant_scale is None else self.quant_scale,
quant_round_type=self.llm_config.quant_config.quant_round_type,
quant_max_bound=self.llm_config.quant_config.quant_max_bound,
quant_min_bound=self.llm_config.quant_config.quant_min_bound,
quant_round_type=self.quant_round_type,
quant_max_bound=self.quant_max_bound,
quant_min_bound=self.quant_min_bound,
)
if residual_input is not None:
return norm_out[0], norm_out[1]
@@ -139,7 +145,7 @@ class LayerNorm(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
hidden_size,
eps=1e-5,
prefix="",
@@ -151,7 +157,7 @@ class LayerNorm(nn.Layer):
Initializes the normalization layer.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
prefix (str): Unique name of the layer, used for naming internal attributes,
@@ -163,7 +169,7 @@ class LayerNorm(nn.Layer):
NotImplementedError: If the specified norm_type is not supported.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.prefix = prefix
self.hidden_size = hidden_size
if len(prefix) == 0:
@@ -180,6 +186,10 @@ class LayerNorm(nn.Layer):
self._dtype = self._helper.get_default_dtype()
self._norm_weight_dtype = "float32"
self.quant_round_type = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
self.quant_max_bound = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
self.quant_min_bound = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
self.init_weight()
def init_weight(self):
@@ -240,6 +250,7 @@ class LayerNorm(nn.Layer):
The `residual_output` is the result of applying the normalization and possibly other
operations (like linear transformation) on the `residual_input`.
"""
norm_out = self.norm_func(
x,
norm_weight=self.ln_weight,
@@ -249,9 +260,9 @@ class LayerNorm(nn.Layer):
bias=self.linear_bias,
residual=residual_input,
quant_scale=-1,
quant_round_type=self.llm_config.quant_config.quant_round_type,
quant_max_bound=self.llm_config.quant_config.quant_max_bound,
quant_min_bound=self.llm_config.quant_config.quant_min_bound,
quant_round_type=self.quant_round_type,
quant_max_bound=self.quant_max_bound,
quant_min_bound=self.quant_min_bound,
)
if residual_input is not None:
return norm_out[0], norm_out[1]

View File

@@ -19,11 +19,18 @@ from typing import Dict, List, Type
from .quant_base import QuantConfigBase
QUANTIZATION_METHODS: List[str] = [
"wint2",
"wint4",
"wint8",
"weight_only",
"block_wise",
"block_wise_fp8",
"w4afp8",
"w8a8",
"w4a8",
"wfp8afp8",
"mix_quant",
"tensor_wise_fp8",
"kvcache",
]
@@ -34,20 +41,30 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
if quantization not in QUANTIZATION_METHODS:
raise ValueError(f"Invalid quantization method: {quantization}")
from .block_wise import BlockWiseConfig
from .block_wise_fp8 import BlockWiseFP8Config
from .kv_cache import KvCacheQuantConfig
from .mix_quant import MixQuantConfig
from .tensor_wise_fp8 import TensorWiseFP8Config
from .w4a8 import W4A8Config
from .w4afp8 import W4AFP8Config
from .w8a8 import W8A8Config
from .weight_only import WeightOnlyConfig
from .weight_only import WeightOnlyConfig, WINT4Config, WINT8Config
from .wfp8afp8 import WFP8AFP8Config
from .kv_cache import KvCacheQuantConfig
from .wint2 import WINT2Config
method_to_config: Dict[str, Type[QuantConfigBase]] = {
"wint2": WINT2Config,
"wint4": WINT4Config,
"wint8": WINT8Config,
"weight_only": WeightOnlyConfig,
"block_wise": BlockWiseConfig,
"block_wise_fp8": BlockWiseFP8Config,
"w4afp8": W4AFP8Config,
"w8a8": W8A8Config,
"w4a8": W4A8Config,
"wfp8afp8": WFP8AFP8Config,
"kvcache": KvCacheQuantConfig
"tensor_wise_fp8": TensorWiseFP8Config,
"kvcache": KvCacheQuantConfig,
"mix_quant": MixQuantConfig,
}
return method_to_config[quantization]
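A short usage sketch of the registry above, assuming the mapped config classes are importable as shown in the diff; everything besides the method name is illustrative.

cfg_cls = get_quantization_config("wint8")    # -> WINT8Config
quant_cfg = cfg_cls.from_config({})           # WINT8Config needs no extra fields
print(quant_cfg.name())                       # "wint8"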

View File

@@ -18,16 +18,13 @@ from typing import Optional
import paddle
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.model_executor.layers.moe import FusedMoE
from ..utils import per_block_cast_to_fp8
from ..utils import per_block_cast_to_fp8, get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
QUANT_ALIGNMENT_OFFSET = 127
QUANT_BLOCK_SIZE = 128
class BlockWiseConfig(QuantConfigBase):
class BlockWiseFP8Config(QuantConfigBase):
"""
block wise quantization config, only supports fp8 quant and only supports loading weights in BF16 format.
After loading the weights, it will automatically compute quantization sparsity and dynamically perform
@@ -37,41 +34,55 @@ class BlockWiseConfig(QuantConfigBase):
def __init__(self, weight_block_size: list = [-1, -1]) -> None:
super().__init__()
self.weight_block_size = weight_block_size
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
def get_name(self) -> str:
return "block_wise"
def name(self) -> str:
return "block_wise_fp8"
@classmethod
def from_config(cls, config: dict) -> "BlockWiseConfig":
weight_block_size = config["weight_block_size"]
def from_config(cls, config: dict) -> "BlockWiseFP8Config":
weight_block_size = config.get("weight_block_size", [128, 128])
return cls(weight_block_size)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
return BlockWiseLinearMethod(self)
'''
Get quantization method.
'''
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import \
DeepGemmFusedMoeMethod
return DeepGemmFusedMoeMethod(self)
else:
return BlockWiseFP8LinearMethod(self)
class BlockWiseLinearMethod(QuantMethodBase):
class BlockWiseFP8LinearMethod(QuantMethodBase):
"""
block wise quantization method for linear
"""
def __init__(
self,
quant_config: BlockWiseConfig,
quant_config: BlockWiseFP8Config,
) -> None:
super().__init__()
self.quant_config = quant_config
def create_weights(self, layer):
layer.linear_weight_scale = self.create_parameter(
layer.linear_weight_shape.reverse()
layer.linear_weight_scale = layer.create_parameter(
shape=[
(layer.embed_dim + QUANT_ALIGNMENT_OFFSET) // QUANT_BLOCK_SIZE,
(layer.num_heads * layer.head_dim + QUANT_ALIGNMENT_OFFSET) //
QUANT_BLOCK_SIZE,
(layer.output_size + self.quant_config.weight_block_size[0] -
1) // self.quant_config.weight_block_size[0],
(layer.input_size + self.quant_config.weight_block_size[1] - 1)
// self.quant_config.weight_block_size[1],
],
dtype="float32",
is_bias=False,
)
layer.weight_dtype = "float8_e4m3fn"
def process_loaded_weights(self, layer, weights) -> None:
weight_tensor = weights.transpose([1, 0])
@@ -80,15 +91,30 @@ class BlockWiseLinearMethod(QuantMethodBase):
layer.linear_weight.copy_(quanted_weight_tensor, False)
layer.linear_weight_scale.set_value(weight_block_scale_tensor)
def process_prequanted_weights(self, layer, state_dict):
"""
process_prequanted_weights
"""
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
quant_weight = quant_weight.transpose([1, 0]).contiguous()
layer.linear_weight.copy_(quant_weight.view("float8_e4m3fn"), False)
weight_scale = weight_scale.transpose([1, 0])
layer.linear_weight_scale.set_value(weight_scale)
def apply(self, layer, x):
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
x, self.quant_config.weight_block_size[0])
linear_out = paddle.empty(
(x.shape[0], layer.llm_config.model_config.hidden_size),
dtype=paddle.bfloat16)
linear_out = paddle.empty((x.shape[0], layer.output_size),
dtype=paddle.bfloat16)
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
deep_gemm.gemm_fp8_fp8_bf16_nt(
(x, x_scale_tensor),
(layer.linear_weight, layer.linear_weight_scale),
linear_out,
)
if layer.with_bias:
linear_out = paddle.add(linear_out, layer.linear_bias)
return linear_out
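A worked example of the scale-parameter shape built in create_weights above, assuming a hypothetical [output_size, input_size] = [4096, 8192] weight and the default weight_block_size of [128, 128]:

output_size, input_size = 4096, 8192
block_n, block_k = 128, 128
scale_shape = [(output_size + block_n - 1) // block_n,
               (input_size + block_k - 1) // block_k]
print(scale_shape)   # [32, 64] -> one float32 scale per 128x128 block of the fp8 weight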

View File

@@ -13,38 +13,66 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from paddle import nn
import os
import paddle
from .quant_base import QuantConfigBase, QuantMethodBase
from enum import Enum
from typing import Optional
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.utils import get_tensor
from ..utils import create_and_set_parameter
from .quant_base import QuantConfigBase, QuantMethodBase
class KvCacheQuantzationTypes(str, Enum):
"""
KvCacheQuantzationTypes
"""
INT8 = "int8"
FP8 = "float8_e4m3fn"
INT8_ZP = "int8_zp"
FP8_ZP = "float8_e4m3fn_zp"
class KvCacheQuantConfig(QuantConfigBase):
"""
quantization config for the KV cache
"""
def __init__(self, cachekv_scale_dict) -> None:
def __init__(self, kv_cache_quant_type: str) -> None:
"""
__init__
"""
super().__init__()
self.cachekv_scale_dict = cachekv_scale_dict
self.kv_cache_quant_type = kv_cache_quant_type
def get_name(self) -> str:
try:
self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type)
except ValueError:
raise ValueError(f'Invalid Kvcache type: {kv_cache_quant_type}')
self.has_zero_point = "zp" in kv_cache_quant_type
if self.quant_type == KvCacheQuantzationTypes.INT8 or self.quant_type == KvCacheQuantzationTypes.INT8_ZP:
self.max_bound = 127.0
elif self.quant_type == KvCacheQuantzationTypes.FP8 or self.quant_type == KvCacheQuantzationTypes.FP8_ZP:
self.max_bound = 448.0
else:
raise ValueError(f'Invalid Kvcache type: {kv_cache_quant_type}')
def name(self) -> str:
"""
get_name
"""
return "kvcache"
@classmethod
def from_config(cls, config: dict) -> "KvCacheQuantConfig":
def from_config(cls, kv_cache_quant_type: str) -> "KvCacheQuantConfig":
"""
from_config
"""
cachekv_scale_dict = config["cachekv_scale_dict"]
return cls(cachekv_scale_dict)
return cls(kv_cache_quant_type)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
@@ -66,197 +94,63 @@ class KVCacheMethodBase(QuantMethodBase):
KVCacheMethodBase __init__
"""
super().__init__()
self.quant_config = quant_config
self.cache_quant_config = quant_config
def load_zp(self, layer: nn.Layer):
def load_zp(self, layer: nn.Layer, state_dict):
"""
load_zp
"""
if self.cache_k_zp_name in self.quant_config.cachekv_scale_dict:
cache_k_zp = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_k_zp_name]
),
self.cache_scale_dtype,
)
else:
cache_k_zp = paddle.zeros(
(
[self.kv_num_heads * self.head_dim]
if self.quant_config.is_channel_wise
else [self.kv_num_heads]
),
dtype=self.cache_scale_dtype,
)
if self.cache_v_zp_name in self.quant_config.cachekv_scale_dict:
cache_v_zp = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_v_zp_name]
),
self.cache_scale_dtype,
)
else:
cache_v_zp = paddle.zeros(
(
[self.kv_num_heads * self.head_dim]
if self.quant_config.is_channel_wise
else [self.kv_num_heads]
),
dtype=self.cache_scale_dtype,
)
layer.cache_k_zp.set_value(cache_k_zp)
layer.cache_v_zp.set_value(cache_v_zp)
cache_k_zeropoint = get_tensor(state_dict.pop(self.cache_k_zp_name))
cache_v_zeropoint = get_tensor(state_dict.pop(self.cache_v_zp_name))
def load_scale(self, layer: nn.Layer):
create_and_set_parameter(layer, "cache_k_zp", cache_k_zeropoint)
create_and_set_parameter(layer, "cache_v_zp", cache_v_zeropoint)
def load_scale(self, layer: nn.Layer, state_dict):
"""
load_scale
"""
if self.cache_k_scale_name in self.quant_config.cachekv_scale_dict:
cache_k_scale = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_k_scale_name]
),
self.cache_scale_dtype,
)
cache_k_out_scale = 1.0 / cache_k_scale
else:
raise KeyError(
f"{self.cache_k_scale_name} not found in scale dict")
cache_k_scale_tensor = get_tensor(
state_dict.pop(self.cache_k_scale_name)).cast(
paddle.get_default_dtype()).reshape_([-1])
cache_v_scale_tensor = get_tensor(
state_dict.pop(self.cache_v_scale_name)).cast(
paddle.get_default_dtype()).reshape_([-1])
if self.cache_v_scale_name in self.quant_config.cachekv_scale_dict:
cache_v_scale = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_v_scale_name]
),
self.cache_scale_dtype,
)
cache_v_out_scale = 1.0 / cache_v_scale
else:
raise KeyError(
f"{self.cache_v_scale_name} not found in scale dict")
cache_k_scale = self.cache_quant_config.max_bound / cache_k_scale_tensor
cache_v_scale = self.cache_quant_config.max_bound / cache_v_scale_tensor
cache_k_out_scale = cache_k_scale_tensor / self.cache_quant_config.max_bound
cache_v_out_scale = cache_v_scale_tensor / self.cache_quant_config.max_bound
if self.cache_v_scale_name in self.quant_config.cachekv_scale_dict:
cache_v_scale = paddle.cast(
paddle.to_tensor(
self.quant_config.cachekv_scale_dict[self.cache_v_scale_name]
),
self.cache_scale_dtype,
)
cache_v_out_scale = 1.0 / cache_v_scale
else:
raise KeyError(
f"{self.cache_v_scale_name} not found in scale dict")
create_and_set_parameter(layer, "cache_k_scale", cache_k_scale)
create_and_set_parameter(layer, "cache_v_scale", cache_v_scale)
create_and_set_parameter(layer, "cache_k_out_scale", cache_k_out_scale)
create_and_set_parameter(layer, "cache_v_out_scale", cache_v_out_scale)
layer.cache_k_scale.set_value(cache_k_scale)
layer.cache_v_scale.set_value(cache_v_scale)
layer.cache_k_out_scale.set_value(cache_k_out_scale)
layer.cache_v_out_scale.set_value(cache_v_out_scale)
def create_scale(self, layer: nn.Layer):
"""
create_scale
"""
layer.cache_k_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_v_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_k_out_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_v_out_scale = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
attr=None,
dtype=self.cache_scale_dtype,
is_bias=False,
)
def create_zp(self, layer: nn.Layer):
"""
create_zp
"""
layer.cache_k_zp = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
layer.cache_v_zp = layer.create_parameter(
shape=(
[layer.kv_num_heads * layer.head_dim]
if self.quant_config.is_channel_wise
else [layer.kv_num_heads]
),
dtype=self.cache_scale_dtype,
is_bias=False,
)
def create_weights(self, layer: nn.Layer):
def create_weights(self, layer: nn.Layer, state_dict):
"""
create_weights
"""
self.prefix = layer.prefix
self.cache_k_scale_name = layer.prefix + ".cachek_matmul.activation_quanter"
self.cache_v_scale_name = layer.prefix + ".cachev_matmul.activation_quanter"
self.cache_k_zp_name = layer.cache_k_scale_name + ".zero_point"
self.cache_v_zp_name = layer.cache_v_scale_name + ".zero_point"
self.cache_k_scale_name = layer.prefix + ".cachek_matmul.activation_scale"
self.cache_v_scale_name = layer.prefix + ".cachev_matmul.activation_scale"
self.cache_k_zp_name = layer.prefix + ".cachek_matmul.activation_zero_point"
self.cache_v_zp_name = layer.prefix + ".cachev_matmul.activation_zero_point"
layer.cache_k_zp = None
layer.cache_v_zp = None
layer.cache_k_scale = None
layer.cache_v_scale = None
layer.cache_k_out_scale = None
layer.cache_v_out_scale = None
if self.cache_quant_config.quant_type == KvCacheQuantzationTypes.INT8:
setattr(layer, "cache_quant_type_str", "cache_int8")
setattr(layer, "quant_max_bound", 127.0)
setattr(layer, "quant_min_bound", -127.0)
elif self.cache_quant_config.quant_type == KvCacheQuantzationTypes.FP8:
setattr(layer, "cache_quant_type_str", "cache_fp8")
setattr(layer, "quant_max_bound", 448.0)
setattr(layer, "quant_min_bound", -448.0)
else:
raise NotImplementedError(f"{self.cache_quant_config.quant_type} is not implemented")
self._dtype = layer._dtype
if self._dtype != "bfloat16" and self._dtype != "float16" and self._dtype == "float32":
raise ValueError(
f"Just support float32, float16 and \
bfloat16 as default dtype, but received {self._dtype}"
)
self.cache_scale_dtype = (
self._dtype if self.quant_config.use_append_attn else "float32"
)
if not self.quant_config.use_dynamic_cachekv_quant:
if (
self.quant_config.cachekv_dtype == "int8"
or self.quant_config.cachekv_dtype == "int4"
or self.quant_config.cachekv_dtype == "float8_e4m3fn"
):
self.create_scale(layer)
self.load_scale(layer)
if self.quant_config.has_zero_point:
self.create_zp(layer)
self.load_zp(layer)
layer.cache_quant_type_str = self.quant_config.cache_quant_type
self.load_scale(layer, state_dict)
if self.cache_quant_config.has_zero_point:
self.load_zp(layer, state_dict)
def apply(self, layer):
"""
@@ -264,4 +158,3 @@ class KVCacheMethodBase(QuantMethodBase):
"""
raise RuntimeError(
f"{self.__class__.__name__}.apply should not be called.")

View File

@@ -0,0 +1,75 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
from ..attention import Attention
from ..moe import FusedMoE
from . import get_quantization_config
from .quant_base import QuantConfigBase, QuantMethodBase
class MixQuantConfig(QuantConfigBase):
"""
Quantization config for layers that have different quantization methods.
"""
def __init__(
self,
dense_quant_type: str,
moe_quant_type: str,
kv_cache_quant_type: str = None,
image_moe_quant_type: str = None,
) -> None:
super().__init__()
self.dense_quant_type = dense_quant_type
self.moe_quant_type = moe_quant_type
self.kv_cache_quant_type = kv_cache_quant_type
if image_moe_quant_type is None:
self.image_moe_quant_type = moe_quant_type
else:
self.image_moe_quant_type = image_moe_quant_type
self.quant_max_bound = 0
self.quant_min_bound = 0
self.quant_round_type = 0
def name(self) -> str:
return "mix_quant"
@classmethod
def from_config(cls, config: dict) -> "MixQuantConfig":
return cls(config['dense_quant_type'], config['moe_quant_type'],
config.get('kv_cache_quant_type', None),
config.get('image_moe_quant_type', None))
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if isinstance(layer, FusedMoE):
if layer.moe_tag == "Image":
return get_quantization_config(
self.image_moe_quant_type).from_config(
{}).get_quant_method(layer)
else:
return get_quantization_config(
self.moe_quant_type).from_config(
{}).get_quant_method(layer)
elif isinstance(layer, Attention):
if self.kv_cache_quant_type is not None:
return (get_quantization_config("kvcache").from_config(
self.kv_cache_quant_type).get_quant_method(layer))
else:
return None
else:
return get_quantization_config(self.dense_quant_type).from_config(
{}).get_quant_method(layer)
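An illustrative config dict of the shape MixQuantConfig.from_config expects; the particular method names are placeholders picked from the registry above.

mix_cfg = MixQuantConfig.from_config({
    "dense_quant_type": "block_wise_fp8",
    "moe_quant_type": "wint4",
    "kv_cache_quant_type": "int8",    # optional; omit or set None to disable KV-cache quant
})
# get_quant_method then dispatches per layer type:
#   FusedMoE  -> the "wint4" method (or image_moe_quant_type for MoE layers tagged "Image")
#   Attention -> KvCacheQuantConfig built from "int8"
#   others    -> the "block_wise_fp8" method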

View File

@@ -0,0 +1,22 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .cutlass_scaled_mm import cutlass_scaled_mm
from .scaled_fp8_quant import scaled_fp8_quant
__all__ = [
"cutlass_scaled_mm",
"scaled_fp8_quant",
]

View File

@@ -0,0 +1,126 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
import fastdeploy
def cutlass_scaled_mm(a: paddle.Tensor,
b: paddle.Tensor,
scale_a: paddle.Tensor,
scale_b: paddle.Tensor,
out_dtype: paddle.dtype,
bias: Optional[paddle.Tensor] = None) -> paddle.Tensor:
"""
`cutlass_scaled_mm` implements a fused version of
`output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
where scale_a * a and scale_b * b are implemented using numpy-style
broadcasting.
In order to support blockwise scaling like found in DeepSeek V3 we also
support extended "group" broadcast rules. We extend the numpy-style
broadcasting rules with the following rule:
"if the extent of a dimension in the source shape is between 1 and
corresponding extent in the target shape we repeat each element along
that dimension src_shape[dim] // target_shape[dim] times consecutively"
example if we have:
a = [[1, 2], and target_shape = (2, 4)
[3, 4]]
then we would expand a to:
a = [[1, 1, 2, 2],
[3, 3, 4, 4]]
currently we only support the case:
scale_a.shape * [1, 128] == a.shape
scale_b.shape * [128, 128] == b.shape
"""
assert (out_dtype == paddle.bfloat16 or out_dtype == paddle.float16)
assert bias is None or bias.shape[0] == b.shape[
0] and bias.dtype == out_dtype
# Ensure input tensors have valid shapes
# assert a.numel() > 0, "Input tensor 'a' must not be empty"
# assert b.numel() > 0, "Input tensor 'b' must not be empty"
# assert scale_a.numel() > 0, "Scale tensor 'scale_a' must not be empty"
# assert scale_b.numel() > 0, "Scale tensor 'scale_b' must not be empty"
m = a.shape[0]
n = b.shape[0]
cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
assert cutlass_compatible_b
out = paddle.empty([m, n], dtype=out_dtype)
fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(
out, a, b, scale_a, scale_b, bias)
return out
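A hypothetical NumPy reference for the group broadcast rule described in the docstring above; expand_group_scale is not part of the diff and only illustrates how a scale tensor is repeated up to the target shape.

import numpy as np

def expand_group_scale(scale: np.ndarray, target_shape) -> np.ndarray:
    out = scale
    for dim, (src, tgt) in enumerate(zip(scale.shape, target_shape)):
        out = np.repeat(out, tgt // src, axis=dim)   # repeat each element tgt // src times
    return out

a = np.array([[1, 2], [3, 4]])
print(expand_group_scale(a, (2, 4)))
# [[1 1 2 2]
#  [3 3 4 4]]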
def scaled_fp8_quant(
input: paddle.Tensor,
scale: Optional[paddle.Tensor] = None,
num_token_padding: Optional[int] = None,
scale_ub: float = 0,
use_per_token_if_dynamic: bool = False,
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize input tensor to FP8 and return quantized tensor and scale.
This function supports both static and dynamic quantization: If you
provide the scale, it will use static scaling and if you omit it,
the scale will be determined dynamically. The function also allows
optional padding of the output tensors for downstream kernels that
will benefit from padding.
Args:
input: The input tensor to be quantized to FP8
scale: Optional scaling factor for the FP8 quantization
scale_ub: Optional upper bound for scaling factor in dynamic
per token case
num_token_padding: If specified, pad the first dimension
of the output to at least this value.
use_per_token_if_dynamic: Whether to do per_tensor or per_token
in the dynamic quantization case.
Returns:
tuple[paddle.Tensor, paddle.Tensor]: The output tensor in FP8 and
scaling factor.
"""
# This code assumes batch_dim and num_tokens are flattened
assert (input.ndim == 2)
shape = input.shape
if num_token_padding:
shape = (max(num_token_padding, input.shape[0]), shape[1])
output = paddle.empty(shape, dtype=paddle.float8_e4m3fn)
if scale is None:
if use_per_token_if_dynamic:
scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_per_token_scaled_fp8_quant
dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
else:
scale = paddle.zeros([1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_scaled_fp8_quant
dynamic_scaled_fp8_quant(output, input, scale)
else:
# num_token_padding not implemented for this case
# assert (scale.numel() == 1 or num_token_padding is None)
from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
static_scaled_fp8_quant(output, input, scale)
return output, scale
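A usage sketch for scaled_fp8_quant, assuming a CUDA build of FastDeploy that exposes the custom fp8 quant ops; the shapes, dtype, and the 0.02 static scale are placeholders.

import paddle

x = paddle.randn([8, 4096]).cast("bfloat16")

q, s = scaled_fp8_quant(x)                                         # dynamic per-tensor scale, s has shape [1]
q_tok, s_tok = scaled_fp8_quant(x, use_per_token_if_dynamic=True)  # dynamic per-token scales, shape [8, 1]

static_scale = paddle.to_tensor([0.02], dtype="float32")
q_static, _ = scaled_fp8_quant(x, scale=static_scale)              # static path: the given scale is reused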

View File

@@ -0,0 +1,75 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
def scaled_fp8_quant(
input: paddle.Tensor,
scale: Optional[paddle.Tensor] = None,
num_token_padding: Optional[int] = None,
scale_ub: float = 0,
use_per_token_if_dynamic: bool = False,
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize input tensor to FP8 and return quantized tensor and scale.
This function supports both static and dynamic quantization: If you
provide the scale, it will use static scaling and if you omit it,
the scale will be determined dynamically. The function also allows
optional padding of the output tensors for downstream kernels that
will benefit from padding.
Args:
input: The input tensor to be quantized to FP8
scale: Optional scaling factor for the FP8 quantization
scale_ub: Optional upper bound for scaling factor in dynamic
per token case
num_token_padding: If specified, pad the first dimension
of the output to at least this value.
use_per_token_if_dynamic: Whether to do per_tensor or per_token
in the dynamic quantization case.
Returns:
tuple[paddle.Tensor, paddle.Tensor]: The output tensor in FP8 and
scaling factor.
"""
# This code assumes batch_dim and num_tokens are flattened
assert (input.ndim == 2)
shape = input.shape
if num_token_padding:
shape = (max(num_token_padding, input.shape[0]), shape[1])
output = paddle.empty(shape, dtype=paddle.float8_e4m3fn)
if scale is None:
if use_per_token_if_dynamic:
scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_per_token_scaled_fp8_quant
dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
else:
scale = paddle.zeros([1], dtype=paddle.float32)
from fastdeploy.model_executor.ops.gpu import \
dynamic_scaled_fp8_quant
dynamic_scaled_fp8_quant(output, input, scale)
else:
# num_token_padding not implemented for this case
# assert (scale.numel() == 1 or num_token_padding is None)
from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
static_scaled_fp8_quant(output, input, scale)
return output, scale

View File

@@ -47,12 +47,9 @@ class QuantConfigBase(ABC):
def __init__(self):
super().__init__()
self.quant_round_type = None
self.quant_max_bound = None
self.quant_min_bound = None
@abstractmethod
def get_name(self) -> str:
def name(self) -> str:
"""Name of the quantization method."""
raise NotImplementedError

View File

@@ -0,0 +1,135 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
from fastdeploy.model_executor.layers.moe import FusedMoE
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
class TensorWiseFP8Config(QuantConfigBase):
"""
Quantization config for weight and activation with FP8.
"""
def __init__(self) -> None:
"""
Nothing else to do!
"""
super().__init__()
def name(self) -> str:
"""
Nothing else to do!
"""
return "tensor_wise_fp8"
@classmethod
def from_config(cls, config: dict) -> "TensorWiseFP8Config":
"""
Nothing else to do!
"""
return cls()
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
return method according to this config!
"""
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \
TensorWiseFP8MoEMethod
return TensorWiseFP8MoEMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
class TensorWiseFP8LinearMethod(QuantMethodBase):
"""
Weight and activation quantization method for linear layer with per tensor FP8
"""
def __init__(
self,
quant_config: TensorWiseFP8Config,
) -> None:
"""
Nothing special to do!
"""
super().__init__()
self.quant_config = quant_config
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
self.weight_dtype = "float8_e4m3fn"
def create_weights(self, layer):
"""
Nothing to do!
"""
pass
def process_prequanted_weights(self, layer, state_dict) -> None:
"""
Process pre-quantized weights before applying them to the model
Args:
layer: The layer that owns the weights
state_dict: The checkpoint state dict holding the quantized weight, its weight scale, and the activation scale
"""
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
act_scale = get_tensor(state_dict.pop(layer.act_scale_key))
quant_weight = quant_weight.transpose([1, 0]).contiguous()
layer.linear_weight.copy_(quant_weight.view("float8_e4m3fn"), False)
self.act_scale = act_scale.item()
self.total_scale = (act_scale * weight_scale).item()
def process_loaded_weights(self, layer, weights, state_dict) -> None:
"""
Read fp8 weight, act scale, weight scale
"""
pass
def apply(self, layer, x):
"""
compute!
"""
from fastdeploy.model_executor.ops.gpu import \
cutlass_fp8_fp8_half_gemm_fused
from ..utils import create_hadamard_matrix_map
hadamard_matrix = create_hadamard_matrix_map[x.shape[-1]]
new_x = paddle.matmul(x.cast("float32"), hadamard_matrix)
fp8_x = new_x / self.act_scale
fp8_x = fp8_x.astype("float8_e4m3fn")
linear_out = cutlass_fp8_fp8_half_gemm_fused(
fp8_x,
layer.linear_weight,
transpose_x=False,
transpose_y=True,
bias=None,
scale=self.total_scale,
output_dtype="bfloat16",
activation_type="identity")
return linear_out
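A brief reference on the scale bookkeeping in the two methods above; the claim about the checkpoint keeping the weight in the Hadamard-rotated domain is an assumption, not something stated in the diff.

# process_prequanted_weights stores total_scale = act_scale * weight_scale, so apply computes:
#   x_q = (x @ H) / act_scale                      # FP8 activation after the Hadamard rotation
#   y   = (x_q @ W_q.T) * total_scale              # fused GEMM epilogue scale
#     ≈ (x @ H) @ (weight_scale * W_q).T           # act_scale cancels; weight_scale dequantizes W_q
# which only matches x @ W.T if the stored FP8 weight W_q was itself produced
# in the Hadamard-rotated domain (assumed here).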

View File

@@ -0,0 +1,42 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
from ..moe import FusedMoE
from .quant_base import QuantConfigBase, QuantMethodBase
class W4A8Config(QuantConfigBase):
"""
quantization config for weight 4bits and activation 8bits
"""
def __init__(self) -> None:
super().__init__()
def name(self) -> str:
return "w4a8"
@classmethod
def from_config(cls, config: dict) -> "W4A8Config":
return cls()
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import CutlassW4A8MoEMethod
return CutlassW4A8MoEMethod(self)
else:
raise ValueError(f"Unsupported layer type {type(layer)} for w4a8")

View File

@@ -23,16 +23,21 @@ from .quant_base import QuantConfigBase, QuantMethodBase
QUANT_SCALING_FACTOR = 448
class W4AFP8Config(QuantConfigBase):
"""
quantization config for weight 4bits and activation fp8
"""
def __init__(self, weight_scale_dict, act_scale_dict) -> None:
super().__init__()
self.weight_scale_dict = weight_scale_dict
self.act_scale_dict = act_scale_dict
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
def get_name(self) -> str:
def name(self) -> str:
return "w4afp8"
@classmethod
@@ -49,6 +54,7 @@ class W4AFP8LinearMethod(QuantMethodBase):
"""
W4 AFP8 quant method for linear
"""
def __init__(
self,
quant_config: W4AFP8Config,
@@ -57,6 +63,9 @@ class W4AFP8LinearMethod(QuantMethodBase):
self.quant_config = quant_config
def create_weights(self, layer):
layer.linear_weight_shape.reverse()
layer.linear_weight_shape[0] //= 2
layer.weight_dtype = "int8"
pass
def process_loaded_weights(self, layer, weights) -> None:
@@ -78,11 +87,11 @@ class W4AFP8LinearMethod(QuantMethodBase):
layer.linear_weight_scale,
zero_points=None,
bias=layer.linear_bias if layer.add_bias else None,
out_scale=self.quant_config.weight_scale_dict.get(
layer.prefix + ".weight_quanter") /
(self.quant_config.act_scale_dict.get(layer.prefix +
".activation_quanter") *
QUANT_SCALING_FACTOR * QUANT_SCALING_FACTOR),
out_scale=self.quant_config.weight_scale_dict.get(layer.prefix +
".weight_scale")
/ (self.quant_config.act_scale_dict.get(layer.prefix +
".activation_scale") *
QUANT_SCALING_FACTOR * QUANT_SCALING_FACTOR),
groupsize=0,
out_dtype=layer._dtype,
)

View File

@@ -16,11 +16,12 @@
from typing import Optional
import paddle
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
import fastdeploy
from fastdeploy.platforms.utils import convert_to_npu_dequant_scale
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -29,14 +30,18 @@ class W8A8Config(QuantConfigBase):
quantization config for weight 8bits and activation 8bits
"""
def __init__(self, weight_scale_dict, act_scale_dict,
use_gemm_dequant) -> None:
def __init__(self, weight_scale_dict, act_scale_dict, use_gemm_dequant,
use_smooth_quant) -> None:
super().__init__()
self.weight_scale_dict = weight_scale_dict
self.act_scale_dict = act_scale_dict
self.use_gemm_dequant = use_gemm_dequant
self.use_smooth_quant = use_smooth_quant
self.quant_max_bound = 127
self.quant_min_bound = -127
self.quant_round_type = 0
def get_name(self) -> str:
def name(self) -> str:
return "w8a8"
@classmethod
@@ -61,12 +66,17 @@ class W8A8LinearMethod(QuantMethodBase):
) -> None:
super().__init__()
self.quant_config = quant_config
self.smooth_quant_method = SmoothQuantLinearMethod(quant_config)
def create_weights(self, layer):
weight_scale = self.quant_config.weight_scale_dict.get(
layer.prefix + ".weight_quanter")
layer.linear_weight_shape.reverse()
layer.weight_dtype = "int8"
if self.quant_config.use_smooth_quant:
self.smooth_quant_method.create_weights(layer)
weight_scale = self.quant_config.weight_scale_dict.get(layer.prefix +
".weight_scale")
in_scale = self.quant_config.act_scale_dict.get(layer.prefix +
".activation_quanter")
".activation_scale")
self.skip_quant = False
if weight_scale is None or in_scale is None:
self.skip_quant = True
@@ -86,13 +96,15 @@ class W8A8LinearMethod(QuantMethodBase):
convert_to_npu_dequant_scale(linear_out_scale))
def process_loaded_weights(self, layer, weights) -> None:
if self.quant_config.use_smooth_quant:
self.smooth_quant_method.process_loaded_weights(layer, weights)
if self.skip_quant:
logger.debug(f"{layer.prefix} skip quant")
weight_tensor = weights.cast(layer._dtype)
layer.linear_weight.set_value(weight_tensor)
else:
weight_tensor = weights.transpose([1, 0])
weight_tensor = paddle.cast(weight_tensor, layer.weight_dtype)
weight_tensor = paddle.cast(weight_tensor, "int8")
layer.linear_weight.set_value(weight_tensor)
def apply(self, layer, x):
@@ -107,3 +119,53 @@ class W8A8LinearMethod(QuantMethodBase):
linear_out = fastdeploy.model_executor.ops.gpu.dequant_int8(
linear_out, layer.linear_out_scale, layer._dtype)
return linear_out
class SmoothQuantLinearMethod(QuantMethodBase):
"""
SmoothQuant Method
"""
def __init__(
self,
quant_config: QuantConfigBase,
) -> None:
super().__init__()
self.quant_config = quant_config
def create_weights(self, layer):
linear_shift_shape = [layer.output_size]
linear_smooth_shape = [layer.output_size]
layer.linear_shift = layer.create_parameter(
shape=linear_shift_shape,
dtype=layer._dtype,
is_bias=False,
)
layer.linear_smooth = layer.create_parameter(
shape=linear_smooth_shape,
dtype=layer._dtype,
is_bias=False,
)
def process_loaded_weights(self, layer, weights) -> None:
if layer.shift_key in layer.state_dict:
shift_tensor = get_tensor(layer.state_dict.pop(
layer.shift_key)).astype(paddle.get_default_dtype())
else:
shift_tensor = paddle.zeros(
shape=layer.linear_shift_shape,
dtype=paddle.get_default_dtype(),
)
layer.linear_shift.set_value(shift_tensor)
if layer.smooth_key in layer.state_dict:
smooth_tensor = get_tensor(layer.state_dict.pop(
layer.smooth_key)).astype(paddle.get_default_dtype())
else:
smooth_tensor = paddle.ones(
shape=[layer.linear_smooth_shape],
dtype=paddle.get_default_dtype(),
)
layer.linear_smooth.set_value(smooth_tensor)
def apply(self, layer, x):
pass

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
from abc import abstractmethod
from typing import Optional
@@ -21,6 +22,8 @@ from paddle.nn.quant import weight_only_linear, weight_quantize
from fastdeploy.platforms import current_platform
from ..moe import FusedMoE
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -28,34 +31,92 @@ class WeightOnlyConfig(QuantConfigBase):
"""
Quantization config for weight only
Args:
weight_only_linear_arch: The architecture of weight only linear layer
algo: The quant algorithm("weight_only_int8" or "weight_only_int4") used for weight only linear layer
"""
def __init__(
self,
weight_only_linear_arch: int,
algo: str,
) -> None:
super().__init__()
self.weight_only_linear_arch = weight_only_linear_arch
self.algo = algo
# arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70,
# if you do not assign arch, we will get arch from your device, default: None.
self.weight_only_linear_arch = os.getenv(
"FLAGS_weight_only_linear_arch")
if self.weight_only_linear_arch is not None:
self.weight_only_linear_arch = int(self.weight_only_linear_arch)
self.quant_max_bound = 0
self.quant_min_bound = 0
self.quant_round_type = 0
def get_name(self) -> str:
def name(self) -> str:
return "weight_only"
@classmethod
def from_config(cls, config: dict) -> "WeightOnlyConfig":
weight_only_linear_arch = config["weight_only_linear_arch"]
algo = config["algo"]
return cls(weight_only_linear_arch, algo)
return cls(algo)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if current_platform.is_xpu():
from fastdeploy.model_executor.layers.backends import XPUWeightOnlyLinearMethod
return XPUWeightOnlyLinearMethod(self)
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyLinearMethod, XPUWeightOnlyMoEMethod)
if isinstance(layer, FusedMoE):
return XPUWeightOnlyMoEMethod(self)
else:
return XPUWeightOnlyLinearMethod(self)
else:
return GPUWeightOnlyLinearMethod(self)
if isinstance(layer, FusedMoE):
if layer.use_method == "cutlass":
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import \
CutlassWeightOnlyMoEMethod
return CutlassWeightOnlyMoEMethod(self)
elif layer.use_method == "triton":
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \
TritonWeightOnlyMoEMethod
return TritonWeightOnlyMoEMethod(self)
elif layer.use_method == "marlin":
from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import \
MarlinWeightOnlyMoEMethod
return MarlinWeightOnlyMoEMethod(self)
else:
raise ValueError(
f"Unsupported MOE backend {layer.use_method}")
else:
return GPUWeightOnlyLinearMethod(self)
class WINT8Config(WeightOnlyConfig):
"""
weight only int8 config
"""
def __init__(self, ) -> None:
super().__init__("weight_only_int8")
@classmethod
def from_config(cls, config: dict) -> "WINT8Config":
return cls()
def name(self) -> str:
return "wint8"
class WINT4Config(WeightOnlyConfig):
"""
weight only int4 config
"""
def __init__(self, ) -> None:
super().__init__("weight_only_int4")
@classmethod
def from_config(cls, config: dict) -> "WINT4Config":
return cls()
def name(self) -> str:
return "wint4"
class WeightOnlyLinearMethod(QuantMethodBase):
@@ -71,12 +132,17 @@ class WeightOnlyLinearMethod(QuantMethodBase):
self.quant_config = quant_config
def create_weights(self, layer):
weight_only_scale_name = layer.prefix + ".weight_only_scale"
layer.linear_weight_shape.reverse()
if self.quant_config.name() == "wint4":
layer.linear_weight_shape[0] //= 2
layer.weight_dtype = "int8"
linear_weight_scale_shape = [layer.embed_dim]
if hasattr(layer, "linear_weight_shape"):
if isinstance(layer.linear_weight_shape, list):
layer_weight_shape = layer.linear_weight_shape
linear_weight_scale_shape = layer_weight_shape[:1]
if self.quant_config.name() == "wint4":
linear_weight_scale_shape[0] *= 2
layer.linear_weight_scale = layer.create_parameter(
shape=linear_weight_scale_shape,
@@ -94,7 +160,8 @@ class WeightOnlyLinearMethod(QuantMethodBase):
weight=layer.linear_weight,
bias=layer.linear_bias if layer.add_bias else None,
weight_scale=layer.linear_weight_scale,
weight_dtype=layer.weight_dtype,
weight_dtype="int8"
if self.quant_config.name() == "wint8" else "int4",
arch=self.quant_config.weight_only_linear_arch,
)
return linear_out
@@ -113,6 +180,20 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
) -> None:
super().__init__(quant_config)
def process_prequanted_weights(self, layer, state_dict) -> None:
"""
Process pre-quantized weights before applying them to the model
Args:
layer: The layer that owns the weights
state_dict: The checkpoint state dict holding the quantized weight and its scale
"""
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
layer.linear_weight.set_value(quant_weight)
layer.linear_weight_scale.set_value(
weight_scale.astype(paddle.get_default_dtype()))
def process_loaded_weights(self, layer, weight) -> None:
quanted_weight_tensor, weight_scale_tensor = weight_quantize(
weight,

View File

@@ -17,10 +17,10 @@ from typing import Optional
import paddle
import fastdeploy
from fastdeploy.platforms.utils import convert_to_npu_dequant_scale
from .quant_base import QuantConfigBase, QuantMethodBase
from fastdeploy.model_executor.layers.quantization.ops import (
cutlass_scaled_mm, scaled_fp8_quant)
from fastdeploy.model_executor.layers.quantization.quant_base import (
QuantConfigBase, QuantMethodBase)
class WFP8AFP8Config(QuantConfigBase):
@@ -32,17 +32,26 @@ class WFP8AFP8Config(QuantConfigBase):
super().__init__()
self.weight_scale_dict = weight_scale_dict
self.act_scale_dict = act_scale_dict
self.quant_max_bound = 448
self.quant_min_bound = -448
self.quant_round_type = 1
def get_name(self) -> str:
def name(self) -> str:
"""
"""
return "wfp8afp8"
@classmethod
def from_config(cls, config: dict) -> "WFP8AFP8Config":
weight_scale_dict = config["weight_scale_dict"]
act_scale_dict = config["act_scale_dict"]
"""
"""
weight_scale_dict = config.get("weight_scale_dict", None)
act_scale_dict = config.get("act_scale_dict", None)
return cls(weight_scale_dict, act_scale_dict)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
"""
return WFP8AFP8LinearMethod(self)
@@ -59,58 +68,49 @@ class WFP8AFP8LinearMethod(QuantMethodBase):
self.quant_config = quant_config
def create_weights(self, layer):
"""
"""
layer.linear_weight_shape.reverse()
layer.weight_dtype = "float8_e4m3fn"
# TODO(YuanRisheng): set weight logic should be moved to process_loaded_weights func
weight_scale = self.quant_config.weight_scale_dict.get(
layer.prefix + ".weight_quanter")
in_scale = self.quant_config.act_scale_dict.get(layer.prefix +
".activation_quanter")
self.skip_quant = False
# we will skip quant if weight_scale is not found or in_scale is not found
if weight_scale is None or in_scale is None:
self.skip_quant = True
else:
max_range = 448.0
layer.scalar_scale_name = layer.prefix + ".scalar_weight_quanter"
layer.scalar_scale = layer.create_parameter(
shape=([1]),
dtype="float32",
)
layer.scalar_scale.set_value(
paddle.to_tensor([1.0 / (max_range * in_scale)],
dtype="float32"))
linear_out_scale = paddle.to_tensor(weight_scale /
max_range).astype("float32")
layer.linear_out_scale = layer.create_parameter(
shape=[layer.embed_dim],
dtype="float32",
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
layer.linear_out_scale.set_value(
convert_to_npu_dequant_scale(linear_out_scale))
layer.linear_weight_scale = layer.create_parameter(
shape=[1],
dtype="float32",
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
def process_loaded_weights(self, layer, weights) -> None:
# TODO(YuanRisheng): We should abstract the skip_quant logic to adapt to more quant methods
"""
"""
if self.skip_quant:
weight_tensor = weights.cast(layer._dtype)
layer.linear_weight.set_value(weight_tensor)
return
weight_tensor = weights.transpose([1, 0])
weight_tensor = paddle.cast(weight_tensor, self.weight_dtype)
self.linear_weight.copy_(weight_tensor, False)
if weights.dtype != paddle.float8_e4m3fn:
self.use_per_token_if_dynamic = True
weight_tensor = weights.transpose([1, 0]).contiguous()
qweight, weight_scale = scaled_fp8_quant(
weight_tensor,
use_per_token_if_dynamic=False,
)
layer.linear_weight.copy_(qweight, False)
layer.linear_weight_scale.set_value(weight_scale)
def apply(self, layer, x):
"""
"""
if self.skip_quant:
linear_out = paddle.matmul(x, layer.linear_weight, False, True)
return linear_out
linear_out = fastdeploy.model_executor.ops.gpu.per_channel_fp8_fp8_half_gemm_fused(
x,
layer.linear_weight,
bias=layer.linear_bias if layer.add_bias else None,
scalar_scale=layer.scalar_scale,
channel_scale=layer.linear_out_scale,
transpose_x=False,
transpose_y=True,
output_dtype=layer._dtype,
)
if self.use_per_token_if_dynamic:
out_type = x.dtype
a_q, a_scales = scaled_fp8_quant(
x, use_per_token_if_dynamic=self.use_per_token_if_dynamic)
linear_out = cutlass_scaled_mm(a_q, layer.linear_weight, a_scales,
layer.linear_weight_scale, out_type,
layer.linear_bias)
else:
raise NotImplementedError
return linear_out

View File

@@ -0,0 +1,142 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
from ..moe import FusedMoE
from . import get_quantization_config
from .quant_base import QuantConfigBase, QuantMethodBase
class WINT2Config(QuantConfigBase):
"""
Quantization config for wint8 linear and w4w2 MoE.
"""
def __init__(
self,
dense_quant_type: str,
dense_quant_granularity: str,
moe_quant_type: str,
moe_w4_quant_type: str,
moe_w4_quant_granularity: str,
moe_w4_quant_start_layer: int,
moe_w4_quant_end_layer: int,
moe_w2_quant_type: str,
moe_w2_quant_granularity: str,
moe_w2_quant_group_size: int,
moe_w2_quant_start_layer: int,
moe_w2_quant_end_layer: int,
) -> None:
super().__init__()
self.quant_max_bound = 0
self.quant_min_bound = 0
self.quant_round_type = 0
# wint2 quantization config
self.dense_quant_type = dense_quant_type
self.dense_quant_granularity = dense_quant_granularity
self.moe_quant_type = moe_quant_type
self.moe_w4_quant_type = moe_w4_quant_type
self.moe_w4_quant_granularity = moe_w4_quant_granularity
self.moe_w4_quant_start_layer = moe_w4_quant_start_layer
self.moe_w4_quant_end_layer = moe_w4_quant_end_layer
self.moe_w2_quant_type = moe_w2_quant_type
self.moe_w2_quant_granularity = moe_w2_quant_granularity
self.moe_w2_quant_group_size = moe_w2_quant_group_size
self.moe_w2_quant_start_layer = moe_w2_quant_start_layer
self.moe_w2_quant_end_layer = moe_w2_quant_end_layer
def name(self) -> str:
"""
Get the name of the quantization configuration.
Returns:
str: The name of the quantization configuration.
"""
return "wint2"
@classmethod
def from_config(cls, config: dict) -> "WINT2Config":
"""
Create a new instance of `WINT2Config` using the provided configuration dictionary.
Args:
config (dict): A dictionary containing the configuration parameters for the new instance.
Returns:
WINT2Config: The newly created instance of `WINT2Config`.
"""
dense_quant_type = config.get("dense_quant_config", "wint8")
dense_quant_granularity = config.get("dense_quant_granularity",
"per_channel")
moe_quant_config = config.get("moe_quant_config", {})
moe_quant_type = moe_quant_config.get("quant_type", "w4w2")
moe_w4_quant_config = moe_quant_config.get("moe_w4_quant_config", {})
moe_w4_quant_type = moe_w4_quant_config.get("quant_type",
"wint4")
moe_w4_quant_granularity = moe_w4_quant_config.get(
"quant_granularity", "per_channel")
moe_w4_quant_start_layer = moe_w4_quant_config.get(
"quant_start_layer", 0)
moe_w4_quant_end_layer = moe_w4_quant_config.get("quant_end_layer", 6)
moe_w2_quant_config = moe_quant_config.get("moe_w2_quant_config", {})
moe_w2_quant_type = moe_w2_quant_config.get("quant_type", "wint2")
moe_w2_quant_granularity = moe_w2_quant_config.get(
"quant_granularity", "pp_acc")
moe_w2_quant_group_size = moe_w2_quant_config.get(
"quant_group_size", 0)
moe_w2_quant_start_layer = moe_w2_quant_config.get(
"quant_start_layer", 0)
moe_w2_quant_end_layer = moe_w2_quant_config.get("quant_end_layer", 0)
return cls(
dense_quant_type,
dense_quant_granularity,
moe_quant_type,
moe_w4_quant_type,
moe_w4_quant_granularity,
moe_w4_quant_start_layer,
moe_w4_quant_end_layer,
moe_w2_quant_type,
moe_w2_quant_granularity,
moe_w2_quant_group_size,
moe_w2_quant_start_layer,
moe_w2_quant_end_layer,
)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
"""
Get the quantization method associated with the given layer based on the current quantization configuration.
Args:
layer (Layer): The layer for which the quantization method should be retrieved.
Returns:
QuantMethodBase: The quantization method associated with the given layer.
"""
if isinstance(layer, FusedMoE):
if layer.layer_idx <= self.moe_w4_quant_end_layer:
return get_quantization_config(
self.moe_w4_quant_type).from_config(
{}).get_quant_method(layer)
else:
from fastdeploy.model_executor.layers.moe.fused_moe_wint2_backend import \
TritonWint2FusedMoeMethod
return TritonWint2FusedMoeMethod(self)
else:
return get_quantization_config(self.dense_quant_type).from_config(
{}).get_quant_method(layer)
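
For orientation, here is a hypothetical example of the nested quantization config dict that `WINT2Config.from_config` above expects. The key names mirror the parser; the concrete values (group size, layer ranges) are made up for illustration.

```python
# Hypothetical wint2 quantization config; keys mirror WINT2Config.from_config above,
# values are illustrative only.
wint2_cfg = {
    "dense_quant_config": "wint8",
    "dense_quant_granularity": "per_channel",
    "moe_quant_config": {
        "quant_type": "w4w2",
        "moe_w4_quant_config": {
            "quant_type": "wint4",
            "quant_granularity": "per_channel",
            "quant_start_layer": 0,
            "quant_end_layer": 6,
        },
        "moe_w2_quant_config": {
            "quant_type": "wint2",
            "quant_granularity": "pp_acc",
            "quant_group_size": 64,   # made-up value
            "quant_start_layer": 7,
            "quant_end_layer": 53,    # made-up value
        },
    },
}
# WINT2Config.from_config(wint2_cfg) would then route MoE layers 0-6 to the wint4
# method and later MoE layers to the Triton wint2 backend in get_quant_method.
```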

View File

@@ -14,25 +14,25 @@
# limitations under the License.
"""
from typing import Any, Optional
from typing import Optional
import paddle
from fastdeploy.config import ModelConfig
from fastdeploy.platforms import current_platform
from .utils import CpuGuard
class ErnieRotaryEmbedding:
def __init__(self,
rotary_dim,
base,
partial_rotary_factor,
rope_scaling=None):
def __init__(self, rotary_dim, base, partial_rotary_factor):
"""
Pre-calculate rotary position embedding for position_ids.
"""
self.rotary_dim = rotary_dim
self.base = base
self.partial_rotary_factor = partial_rotary_factor
self.rope_scaling = rope_scaling
def __call__(self, position_ids):
bsz, max_seq_len = position_ids.shape[:2]
@@ -70,18 +70,13 @@ class ErnieRotaryEmbedding:
class QwenRotaryEmbedding:
def __init__(self,
rotary_dim,
base,
partial_rotary_factor,
rope_scaling=None):
def __init__(self, rotary_dim, base, partial_rotary_factor):
"""
Pre-calculate rotary position embedding for position_ids.
"""
self.rotary_dim = rotary_dim
self.base = base
self.partial_rotary_factor = partial_rotary_factor
self.rope_scaling = rope_scaling
def __call__(self, position_ids):
bsz, max_seq_len = position_ids.shape[:2]
@@ -104,35 +99,72 @@ class QwenRotaryEmbedding:
return rot_emb
def get_rope_impl(
rotary_dim: int,
base: float,  # typically 10000.0
position_ids,
model_config: Optional[ModelConfig] = None,
partial_rotary_factor=1,
):
"""
The real implementation of get_rope
"""
architecture = model_config.architectures[0]
if architecture.startswith("Qwen"):
rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base,
partial_rotary_factor)
rotary_emb = rotary_emb_layer(position_ids)
else:
rotary_emb_layer = ErnieRotaryEmbedding(rotary_dim, base,
partial_rotary_factor)
rotary_emb = rotary_emb_layer(position_ids)
return rotary_emb
def get_rope_xpu(
rotary_dim: int,
base: float,  # typically 10000.0
position_ids,
model_config: ModelConfig,
partial_rotary_factor=1,
):
"""
In XPU, cos and sin compute must be done on cpu
"""
with CpuGuard():
position_ids = position_ids.cpu()
rotary_emb = get_rope_impl(rotary_dim, base, position_ids,
model_config, partial_rotary_factor)
return rotary_emb.to('xpu')
def get_rope(
rotary_dim: int,
base: float,  # typically 10000.0
position_ids,
model_config: ModelConfig,
partial_rotary_factor=1,
rope_scaling: Optional[dict[str, Any]] = None,
):
rope_type = rope_scaling.get("architectures", None)
if "Qwen2ForCausalLM" in rope_type:
rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base,
partial_rotary_factor,
rope_scaling)
rotary_emb = rotary_emb_layer(position_ids)
"""
The wrapper of get_rope: dispatches to the XPU or default implementation.
"""
if current_platform.is_xpu():
return get_rope_xpu(rotary_dim, base, position_ids, model_config,
partial_rotary_factor)
else:
rotary_emb_layer = ErnieRotaryEmbedding(rotary_dim, base,
partial_rotary_factor,
rope_scaling)
rotary_emb = rotary_emb_layer(position_ids)
return rotary_emb
return get_rope_impl(rotary_dim, base, position_ids, model_config,
partial_rotary_factor)
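
As a quick reference for what these rotary-embedding classes precompute, here is a minimal NumPy sketch of the standard RoPE cos/sin tables; the exact tensor layout produced by ErnieRotaryEmbedding / QwenRotaryEmbedding may differ.

```python
import numpy as np

def rope_cos_sin(position_ids: np.ndarray, rotary_dim: int, base: float = 10000.0):
    """Standard RoPE tables: cos/sin of position times inverse frequency."""
    inv_freq = 1.0 / (base ** (np.arange(0, rotary_dim, 2, dtype=np.float32) / rotary_dim))
    # position_ids: [bsz, seq_len] -> freqs: [bsz, seq_len, rotary_dim // 2]
    freqs = position_ids[..., None].astype(np.float32) * inv_freq
    return np.cos(freqs), np.sin(freqs)

cos, sin = rope_cos_sin(np.arange(8)[None, :], rotary_dim=64)
print(cos.shape)  # (1, 8, 32)
```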
class ErnieVlRotaryEmbedding3D:
def __init__(self, rotary_dim, base, partial_rotary_factor, max_position,
freq_allocation, rope_scaling):
freq_allocation):
self.rotary_dim = rotary_dim
self.base = base
self.paritial_rotary_factor = partial_rotary_factor
self.rope_scaling = rope_scaling
self.max_position = max_position
self.freq_allocation = freq_allocation
@@ -223,12 +255,10 @@ def get_rope_3d(
paritial_rotary_factor: float = 1,
max_position: int = 131072,
freq_allocation: int = 2,
rope_scaling: Optional[dict[str, Any]] = None,
):
rotary_emb3d_layer = ErnieVlRotaryEmbedding3D(rotary_dim, base,
paritial_rotary_factor,
max_position,
freq_allocation,
rope_scaling)
freq_allocation)
rotary_emb_3d = rotary_emb3d_layer(position_ids)
return rotary_emb_3d

View File

@@ -23,11 +23,12 @@ import paddle
@dataclass
class SamplingMetadata:
"""
metadata for sampling.
"""
temperature: paddle.Tensor
prompt_token_ids: paddle.Tensor
pre_token_ids: paddle.Tensor
eos_token_ids: paddle.Tensor
frequency_penalties: paddle.Tensor
presence_penalties: paddle.Tensor

View File

@@ -14,8 +14,12 @@
# limitations under the License.
"""
from .apply_penalty_multi_scores import apply_penalty_multi_scores
from .apply_penalty_multi_scores import (
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores)
from .top_p_sampling import top_p_sampling
__all__ = [
"apply_penalty_multi_scores",
"apply_speculative_penalty_multi_scores",
"top_p_sampling",
]

View File

@@ -20,7 +20,7 @@ from fastdeploy.platforms import current_platform
def apply_penalty_multi_scores(
prompt_token_ids: paddle.Tensor,
pre_token_ids: paddle.Tensor,
logits: paddle.Tensor,
repetition_penalties: paddle.Tensor,
frequency_penalties: paddle.Tensor,
@@ -30,16 +30,30 @@ def apply_penalty_multi_scores(
step_idx: paddle.Tensor,
min_dec_lens: paddle.Tensor,
eos_token_ids: paddle.Tensor,
):
) -> paddle.Tensor:
"""
Args:
Returns:
apply_penalty_multi_scores
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
prompt_token_ids,
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
)
elif current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import \
get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
@@ -54,3 +68,48 @@ def apply_penalty_multi_scores(
raise NotImplementedError()
return logits
def apply_speculative_penalty_multi_scores(
pre_token_ids: paddle.Tensor,
logits: paddle.Tensor,
repetition_penalties: paddle.Tensor,
frequency_penalties: paddle.Tensor,
presence_penalties: paddle.Tensor,
temperature: paddle.Tensor,
bad_words_token_ids: paddle.Tensor,
step_idx: paddle.Tensor,
min_dec_lens: paddle.Tensor,
eos_token_ids: paddle.Tensor,
seq_lens_this_time: paddle.Tensor,
output_padding_offset: paddle.Tensor,
output_cum_offsets: paddle.Tensor,
max_len: int,
):
"""
Apply the same token penalties for speculative decoding, taking draft-token offsets into account.
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
speculate_get_token_penalty_multi_scores
logits = speculate_get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
seq_lens_this_time,
output_padding_offset,
output_cum_offsets,
max_len,
)
else:
raise NotImplementedError()
return logits

View File

@@ -0,0 +1,97 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Literal, Optional
import paddle
from fastdeploy import envs
def top_p_sampling(
x: paddle.Tensor,
ps: paddle.Tensor,
threshold: Optional[paddle.Tensor] = None,
topp_seed: Optional[paddle.Tensor] = None,
seed: int = -1,
k: int = 0,
mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
Dispatch top-p sampling to the backend selected by FD_SAMPLING_CLASS: "air", "rejection", or Paddle's built-in implementation.
"""
top_p_class = envs.FD_SAMPLING_CLASS.lower()
if top_p_class == "air":
_, ids = air_top_p_sampling(x,
ps,
threshold,
topp_seed,
seed=seed,
k=k,
mode=mode)
elif top_p_class == "rejection":
ids = rejection_top_p_sampling(x, ps, seed)
_ = None
else:
_, ids = paddle.tensor.top_p_sampling(x,
ps,
threshold=threshold,
topp_seed=topp_seed,
seed=seed,
k=k,
mode=mode)
return _, ids
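
All three branches implement the same nucleus (top-p) rule; below is a self-contained NumPy sketch of that rule for reference only. The fused ops above run on device and differ in implementation.

```python
import numpy as np

def top_p_sample(probs: np.ndarray, p: float, rng=np.random.default_rng(0)) -> int:
    """Nucleus (top-p) sampling over one probability row -- reference sketch only."""
    order = np.argsort(-probs)                     # tokens by descending probability
    sorted_p = probs[order]
    keep = np.cumsum(sorted_p) - sorted_p < p      # cumulative mass before each token
    keep[0] = True                                 # always keep the most likely token
    trimmed = sorted_p * keep
    trimmed /= trimmed.sum()
    return int(order[rng.choice(len(probs), p=trimmed)])

print(top_p_sample(np.array([0.5, 0.3, 0.15, 0.05]), p=0.8))   # samples from {0, 1}
```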
def air_top_p_sampling(
x: paddle.Tensor,
ps: paddle.Tensor,
threshold: Optional[paddle.Tensor] = None,
topp_seed: Optional[paddle.Tensor] = None,
seed: int = -1,
k: int = 0,
mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
air_top_p_sampling
"""
try:
from fastdeploy.model_executor.ops.gpu import air_top_p_sampling
out, ids = air_top_p_sampling(x, ps, threshold, topp_seed, seed, k,
mode)
except ImportError:
raise RuntimeError("Cannot import air_top_p_sampling op.")
return out, ids
def rejection_top_p_sampling(
x: paddle.Tensor,
ps: paddle.Tensor,
seed: int = -1,
) -> paddle.Tensor:
"""
rejection_top_p_sampling
"""
try:
from fastdeploy.model_executor.ops.gpu import rejection_top_p_sampling
ids = rejection_top_p_sampling(
x,
ps,
seed,
)
except ImportError:
raise RuntimeError("Cannot import rejection_top_p_sampling op.")
return ids

View File

@@ -13,43 +13,193 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from fastdeploy.distributed.parallel_state import \
get_tensor_model_parallel_world_size
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \
LogitsProcessorBase
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.ops import \
apply_penalty_multi_scores
from fastdeploy.model_executor.layers.sample.ops import (
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores,
top_p_sampling)
from fastdeploy.platforms import current_platform
class SamplerProcessor:
"""
SamplerProcessor for guided decoding.
"""
def __init__(self):
self.async_step = None
self.token_bitmask = None
self.logits_processor: Dict[int, Optional[Any]] = dict()
self.executor = ThreadPoolExecutor()
self.logits_lock = threading.Lock()
def add_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" add logits processor to SamplerProcessor """
with self.logits_lock:
if future is None:
if ids in self.logits_processor:
del self.logits_processor[ids]
return
if isinstance(future, LogitsProcessorBase):
self.logits_processor[ids] = future
for token in prefill_tokens:
self.logits_processor[ids].accept_token(token)
elif future.done():
self.logits_processor[ids] = future.result()
for token in prefill_tokens:
self.logits_processor[ids].accept_token(token)
else:
self.logits_processor[ids] = [future, prefill_tokens]
def update_vocab_mask(self, skip_idx_list: List[int] = []):
""" update vocab mask. (cpu-heavy operation) """
if len(self.logits_processor) == 0:
return
with self.logits_lock:
# iterate over a copy so entries can be deleted while looping
for idx, processor in list(self.logits_processor.items()):
if processor is None:
del self.logits_processor[idx]
continue
if not isinstance(processor, LogitsProcessorBase):
future, prefill_tokens = self.logits_processor[idx]
self.logits_processor[idx] = future.result()
for token in prefill_tokens:
self.logits_processor[idx].accept_token(token)
available_processors = None
for processor in self.logits_processor.values():
if processor.is_terminated():
continue
available_processors = processor
if available_processors is None:
return
# allocate token bitmask
self.token_bitmask = available_processors.allocate_token_bitmask()
with self.logits_lock:
# fill token bitmask
for idx, processor in self.logits_processor.items():
if processor.is_terminated() or idx in skip_idx_list:
continue
processor.fill_token_bitmask(self.token_bitmask, idx)
def apply_token_mask(self,
logits: paddle.Tensor,
skip_idx_list: List[int] = []):
""" apply token mask to logits """
if len(self.logits_processor) == 0 or self.token_bitmask is None:
return logits
# self.async_step.result()
available_processors = None
with self.logits_lock:
for processor in self.logits_processor.values():
if processor.is_terminated():
continue
available_processors = processor
if available_processors is None:
return logits
indices = list(self.logits_processor.keys())
mask_idx = [i for i in indices if i not in skip_idx_list]
return available_processors.apply_token_mask(logits,
self.token_bitmask,
indices=mask_idx)
def _accept_token(self, idx: int, token: int):
""" accept token """
if idx not in self.logits_processor:
raise ValueError(
f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}"
)
if self.logits_processor[idx].is_terminated():
return
self.logits_processor[idx].accept_token(token)
def update_output_tokens(self,
next_tokens: paddle.Tensor,
skip_idx_list: List[int] = []):
""" update output tokens """
if len(self.logits_processor) == 0:
return
token_ids = next_tokens.numpy().tolist()
with self.logits_lock:
for idx in self.logits_processor.keys():
token = token_ids[idx][0]
if token < 0 or self.logits_processor[
idx] is None or idx in skip_idx_list:
continue
self._accept_token(idx, token)
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
# create async operation for guided decoding
# TODO: support async
self.update_vocab_mask(skip_idx_list)
# self.async_step = self.executor.submit(self.update_vocab_mask)
class Sampler(nn.Layer):
"""
Sampler for normal generation.
"""
def __init__(self):
"""
"""
super().__init__()
if current_platform.is_cuda():
self.nranks = get_tensor_model_parallel_world_size()
if current_platform.is_cuda() or current_platform.is_xpu():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
self.processor = SamplerProcessor()
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
self.processor.add_logits_processor(ids, future, prefill_tokens)
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
self.processor.pre_process(skip_idx_list)
def forward_cuda(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
skip_idx_list: List[int] = [],
) -> paddle.Tensor:
"""
"""
logits = self.processor.apply_token_mask(logits, skip_idx_list)
logits = apply_penalty_multi_scores(
sampling_metadata.prompt_token_ids,
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
@@ -63,10 +213,156 @@ class Sampler(nn.Layer):
probs = F.softmax(logits)
_, next_tokens = paddle.tensor.top_p_sampling(probs,
sampling_metadata.top_p)
if self.nranks > 1:
paddle.distributed.broadcast(next_tokens, 0)
_, next_tokens = top_p_sampling(probs, sampling_metadata.top_p)
self.processor.update_output_tokens(next_tokens, skip_idx_list)
return next_tokens
class SpeculativeSampler(nn.Layer):
"""
Sampler for speculative generation.
"""
def __init__(self, fd_config: FDConfig):
"""
"""
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
self.speculative_verify_window = fd_config.speculative_config.verify_window
self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
pass
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
pass
def forward_cuda(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
max_model_len: int,
share_inputs: List[paddle.Tensor],
) -> paddle.Tensor:
"""
"""
from fastdeploy.model_executor.ops.gpu import (speculate_verify,
top_p_candidates)
logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
sampling_metadata.presence_penalties,
sampling_metadata.temperature,
sampling_metadata.bad_words_token_ids,
sampling_metadata.step_idx,
sampling_metadata.min_dec_lens,
sampling_metadata.eos_token_ids,
share_inputs["seq_lens_this_time"],
share_inputs["output_padding_offset"],
share_inputs["output_cum_offsets"],
max_model_len,
)
probs = F.softmax(logits)
verify_scores, verify_tokens, actual_candidate_len = top_p_candidates(
probs,
sampling_metadata.top_p,
share_inputs["output_padding_offset"],
self.speculative_max_candidate_len,
max_model_len,
)
speculate_verify(
share_inputs["accept_tokens"],
share_inputs["accept_num"],
share_inputs["step_idx"],
share_inputs["stop_flags"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs[
"draft_tokens"], # Both input and output, need to write the last 1 token accepted to position 0.
share_inputs["seq_lens_this_time"],
verify_tokens,
verify_scores,
share_inputs["max_dec_len"],
sampling_metadata.eos_token_ids,
share_inputs["is_block_step"],
share_inputs["output_cum_offsets"],
actual_candidate_len,
share_inputs["actual_draft_token_num"],
sampling_metadata.top_p,
max_model_len,
self.speculative_verify_window,
True, # enable_topp
)
return None
class MTPSampler(nn.Layer):
"""
"""
def __init__(self, fd_config: FDConfig):
"""
"""
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
pass
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
pass
def forward_cuda(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
max_model_len: int,
share_inputs: List[paddle.Tensor],
) -> paddle.Tensor:
"""
"""
logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
logits,
sampling_metadata.repetition_penalties,
sampling_metadata.frequency_penalties,
sampling_metadata.presence_penalties,
sampling_metadata.temperature,
sampling_metadata.bad_words_token_ids,
sampling_metadata.step_idx,
sampling_metadata.min_dec_lens,
sampling_metadata.eos_token_ids,
share_inputs["seq_lens_this_time"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
max_model_len,
)
probs = F.softmax(logits)
_, next_tokens = top_p_sampling(probs, sampling_metadata.top_p)
return next_tokens

View File

@@ -14,32 +14,37 @@
# limitations under the License.
"""
from typing import Tuple
from typing import Tuple, Union
import numpy as np
import paddle
from paddle import Tensor
from paddle import Tensor, nn
from paddle.framework import in_dynamic_mode
from scipy.linalg import block_diag
from fastdeploy.platforms import current_platform
if current_platform.is_cuda() and current_platform.available():
try:
from fastdeploy.model_executor.ops.gpu import (
get_padding_offset,
speculate_get_padding_offset,
)
get_padding_offset, speculate_get_padding_offset)
except Exception:
raise ImportError(
f"Verify environment consistency between compilation and FastDeploy installation. "
f"And ensure the Paddle version supports FastDeploy's custom operators"
"Verify environment consistency between compilation and FastDeploy installation. "
"And ensure the Paddle version supports FastDeploy's custom operators"
)
import re
import os
cache_params = os.getenv("CACHE_PARAMS", "none")
from fastdeploy import envs
cache_params = envs.FD_CACHE_PARAMS
if cache_params != "none":
c8_state_dict = paddle.load(cache_params, return_numpy=True)
def per_block_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]:
def per_block_cast_to_fp8(x: Tensor,
block_size: list = [128,
128]) -> Tuple[Tensor, Tensor]:
"""
Only used in deep_gemm block wise quant weight.
copy from FastDeploy/custom_ops/gpu_ops/fp8_deep_gemm/tests/test_core.py.
@@ -48,10 +53,13 @@ def per_block_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = paddle.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
x_padded = paddle.zeros((ceil_div(m, block_size[0]) * block_size[0],
ceil_div(n, block_size[1]) * block_size[1]),
dtype=x.dtype)
x_padded[:m, :n] = x
x_view = paddle.view(x_padded, (-1, 128, x_padded.shape[1] // 128, 128))
x_view = paddle.view(
x_padded,
(-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]))
x_abs = paddle.abs(x_view).astype(paddle.float32)
x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True)
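
Between the two hunks, a quick NumPy check of the padding and tiling arithmetic used above (128x128 blocks by default); this is illustrative only and independent of Paddle.

```python
import numpy as np

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

# Padded shape used by per_block_cast_to_fp8 for a 300x500 weight with 128x128 blocks.
m, n, block = 300, 500, (128, 128)
padded = (ceil_div(m, block[0]) * block[0], ceil_div(n, block[1]) * block[1])
print(padded)            # (384, 512)
x_padded = np.zeros(padded, dtype=np.float32)
x_padded[:m, :n] = np.random.rand(m, n)
# One amax per 128x128 tile drives the FP8 scale for that tile.
tiles = x_padded.reshape(-1, block[0], padded[1] // block[1], block[1])
print(np.abs(tiles).max(axis=(1, 3)).shape)  # (3, 4)
```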
@@ -63,15 +71,15 @@ def per_block_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]:
# for distributed tensor model parallel
def _set_var_distributed(var, split_axis):
def _set_var_distributed(var: Tensor, split_axis: int):
"""
Set whether the variable is distributed. If the variable is None, no operation will be performed.
Args:
var (Variable, Optional): A Variable object, which can be None. The default value is None.
The Variable object should have an attribute 'is_distributed' to indicate whether
the variable has been processed in a distributed manner.
split_axis (Integer): the sharding dimension of dist tensors
var (Tensor): A Variable object, which can be None. The default value is None.
The Variable object should have an attribute 'is_distributed' to indicate whether
the variable has been processed in a distributed manner.
split_axis (int): the sharding dimension of dist tensors.
Returns:
None. No return value.
@@ -91,10 +99,16 @@ def _set_var_distributed(var, split_axis):
main_block._find_var_recursive(var.name).is_distributed = True
def get_tensor(input):
def get_tensor(input: Union[paddle.Tensor, np.ndarray, str]) -> paddle.Tensor:
"""
In EP (expert parallel) mode, weights are stored distributed by layer; to save peak
GPU memory the state_dict stage only keeps layer names and the paths of the
corresponding weights, so the weight type has to be converted to paddle.Tensor here.
Return a corresponding PaddlePaddle tensor based on the type and content of the input.
Args:
input (Union[paddle.Tensor, np.ndarray, str]): The input data.
Returns:
paddle.Tensor: Returns a PaddlePaddle tensor.
"""
if isinstance(input, paddle.Tensor):
if input.place.is_cpu_place():
@@ -104,7 +118,6 @@ def get_tensor(input):
return paddle.to_tensor(input)
elif isinstance(input, str):
if ".safetensors" in input:
match = re.match(r"\[(.*?)\](.*)", input)
if match:
key_name = match.group(1)
@@ -116,12 +129,11 @@ def get_tensor(input):
weight = f.get_tensor(key_name)
weight = paddle.Tensor(weight, zero_copy=True)
weight = weight._copy_to(
paddle.framework._current_expected_place(), False
)
paddle.framework._current_expected_place(), False)
return weight
else:
return None
else:
else:
if cache_params != "none":
tmp_key = input.split("/")[-1]
if tmp_key in c8_state_dict:
@@ -129,25 +141,134 @@ def get_tensor(input):
return paddle.to_tensor(c8_state_dict.pop(tmp_key))
return paddle.load(input)
else:
# In theory this branch is never hit.
return input
def matmul_hadU(X: Tensor) -> paddle.Tensor:
"""
Perform matrix multiplication using the Hadamard matrix.
Args:
X (Tensor): The tensor to be multiplied.
Returns:
Tensor: The tensor after Hadamard matrix multiplication, with the same shape as the input tensor X.
"""
input = X.clone().reshape((-1, X.shape[-1], 1))
output = input.clone()
while input.shape[1] > 1:
input = input.reshape(
(input.shape[0], input.shape[1] // 2, 2, input.shape[2]))
output = output.reshape(input.shape)
output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
output = output.reshape((input.shape[0], input.shape[1], -1))
(input, output) = (output, input)
del output
return input.reshape(X.shape)
def random_hadamard_matrix(block_size: int,
dtype: Union[paddle.dtype, str]) -> paddle.Tensor:
"""
Generate a random Hadamard matrix.
Args:
block_size (int): The size of the block, i.e., the number of rows and columns of the matrix.
dtype (str): The data type, for example 'float32'.
Returns:
paddle.Tensor: The generated random Hadamard matrix.
"""
Q = paddle.diag(paddle.ones((block_size), dtype=dtype))
block = matmul_hadU(Q)
return block
def create_hadamard_matrix(hidden_size: int) -> paddle.Tensor:
"""
Generate a Hadamard matrix.
Args:
hidden_size (int): The size of the hidden layer.
Returns:
paddle.Tensor: The generated Hadamard matrix.
"""
hadamard_block_size = 32
h = random_hadamard_matrix(hadamard_block_size, "float32")
block_num = hidden_size // hadamard_block_size
hadamard_matrix = paddle.to_tensor(
block_diag(*[h for i in range(block_num)]))
return hadamard_matrix
create_hadamard_matrix_map = {}
# Zkk: the keys below are used by the 4.5T fp8 model.
create_hadamard_matrix_map[8192] = create_hadamard_matrix(8192)
create_hadamard_matrix_map[448] = create_hadamard_matrix(448)
create_hadamard_matrix_map[1024] = create_hadamard_matrix(1024)
create_hadamard_matrix_map[3584] = create_hadamard_matrix(3584)
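
The block-diagonal rotation above relies on the defining property of Hadamard matrices, H @ H.T = n * I. A small sketch using the Sylvester construction follows; the butterfly in matmul_hadU computes an essentially equivalent unnormalized transform.

```python
import numpy as np

def hadamard(n: int) -> np.ndarray:
    """Sylvester construction; n must be a power of two (block size 32 above)."""
    h = np.array([[1.0]])
    while h.shape[0] < n:
        h = np.block([[h, h], [h, -h]])
    return h

h32 = hadamard(32)
# Rows are mutually orthogonal: H @ H.T == 32 * I
assert np.allclose(h32 @ h32.T, 32 * np.eye(32))
```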
def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
"""
Ensure the numerator is divisible by the denominator.
Args:
numerator (int): The numerator.
denominator (int): The denominator.
Returns:
None
Raises:
AssertionError: If the numerator cannot be evenly divided by the denominator, an assertion error is raised.
"""
assert numerator % denominator == 0, "{} is not divisible by {}".format(
numerator, denominator)
def divide(numerator, denominator):
"""Ensure that numerator is divisible by the denominator and return
the division value."""
def divide(numerator: int, denominator: int):
"""
Calculate the division result of two numbers.
Args:
numerator (int): The dividend.
denominator (int): The divisor.
Returns:
int: The result of the division, which is the quotient of the dividend divided by the divisor.
"""
ensure_divisibility(numerator, denominator)
return numerator // denominator
def remove_padding(max_len, input_ids, seq_lens_this_time):
def remove_padding(
max_len: paddle.Tensor, input_ids: paddle.Tensor,
seq_lens_this_time: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
paddle.Tensor]:
"""
Remove padding tokens from the batched input.
Args:
max_len (paddle.Tensor): The maximum length of the input sequences.
input_ids (paddle.Tensor): The IDs of the input sequences.
seq_lens_this_time (paddle.Tensor): The actual length of each sequence.
Returns:
tuple: A tuple containing:
- The sequence IDs with padding removed (paddle.Tensor).
- The padding offsets (paddle.Tensor).
- The cumulative offsets (paddle.Tensor).
- The query sequence lengths (paddle.Tensor).
- The key sequence lengths (paddle.Tensor).
"""
if current_platform.is_cuda():
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
@@ -159,7 +280,7 @@ def remove_padding(max_len, input_ids, seq_lens_this_time):
cu_seqlens_q,
cu_seqlens_k,
) = get_padding_offset(input_ids, cum_offsets_now, token_num,
seq_lens_this_time)
seq_lens_this_time)
return (
ids_remove_padding,
padding_offset,
@@ -168,10 +289,30 @@ def remove_padding(max_len, input_ids, seq_lens_this_time):
cu_seqlens_k,
)
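
A toy illustration of the offset bookkeeping this function returns; the numbers are invented, and the real padding offsets and cu_seqlens come from the get_padding_offset GPU op, which may lay them out differently.

```python
import numpy as np

max_len = 6
seq_lens = np.array([3, 5, 2])                      # seq_lens_this_time
cum_offsets_now = np.cumsum(max_len - seq_lens)     # [3, 4, 8]: padding dropped so far
token_num = seq_lens.sum()                          # 10 unpadded tokens in the batch
cu_seqlens_q = np.concatenate([[0], np.cumsum(seq_lens)])   # usual prefix-sum form: [0, 3, 8, 10]
print(cum_offsets_now, token_num, cu_seqlens_q)
```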
def speculate_remove_padding(max_len, input_ids, seq_lens_this_time,
draft_tokens, seq_lens_encoder):
def speculate_remove_padding(
max_len: paddle.Tensor, input_ids: paddle.Tensor,
seq_lens_this_time: paddle.Tensor, draft_tokens: paddle.Tensor,
seq_lens_encoder: paddle.Tensor
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
paddle.Tensor]:
"""
Remove padding from sequences for speculative decoding (draft tokens included).
Args:
max_len (paddle.Tensor): The maximum length of the sequences.
input_ids (paddle.Tensor): The IDs of the input sequences.
seq_lens_this_time (paddle.Tensor): The lengths of the sequences in the current batch.
draft_tokens (paddle.Tensor): The draft tokens.
seq_lens_encoder (paddle.Tensor): The lengths of the encoder sequences.
Returns:
tuple: A tuple containing:
- The input sequence IDs with padding removed (paddle.Tensor).
- Padding offsets (paddle.Tensor).
- Cumulative offsets (paddle.Tensor).
- Query sequence lengths (paddle.Tensor).
- Key sequence lengths (paddle.Tensor).
"""
if current_platform.is_cuda():
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
@@ -197,3 +338,43 @@ def speculate_remove_padding(max_len, input_ids, seq_lens_this_time,
cu_seqlens_q,
cu_seqlens_k,
)
class CpuGuard:
"""CpuGuard"""
def __init__(self):
"""init"""
pass
def __enter__(self):
"""enter"""
self.ori_device = paddle.device.get_device()
paddle.device.set_device("cpu")
def __exit__(self, exc_type, exc_val, exc_tb):
"""exit"""
paddle.device.set_device(self.ori_device)
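
A minimal usage sketch of CpuGuard, assuming the class above is in scope: it temporarily switches Paddle's active device to CPU and restores the previous device on exit.

```python
import paddle

print(paddle.device.get_device())   # e.g. "gpu:0", "xpu:0" or "cpu"
with CpuGuard():
    # everything created here lives on CPU
    freqs = paddle.arange(8, dtype="float32") * 0.5
print(paddle.device.get_device())   # original device restored
```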
def create_and_set_parameter(layer: nn.Layer, name: str,
tensor: paddle.Tensor):
"""
Create a parameter for a specified layer and set its value to the given tensor.
Args:
layer (nn.Layer): The layer object to which the parameter will be added.
name (str): The name of the parameter to be created.
tensor (paddle.Tensor): The tensor to set as the value of the parameter.
Returns:
None
"""
setattr(
layer, name,
layer.create_parameter(
shape=tensor.shape,
dtype=tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
getattr(layer, name).set_value(tensor)
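
And a small usage sketch of create_and_set_parameter, again assuming the helper above is in scope; it registers a zero-initialized parameter on the layer and then overwrites it with the given tensor.

```python
import paddle
import paddle.nn as nn

layer = nn.Linear(4, 4)
scale = paddle.full([4], 0.5, dtype="float32")
create_and_set_parameter(layer, "weight_scale", scale)
print(layer.weight_scale.shape)   # [4]
```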

View File

@@ -19,14 +19,31 @@ from abc import ABC, abstractmethod
import paddle
from paddle import nn
from fastdeploy.config import LLMConfig, LoadConfig, ModelConfig
from fastdeploy.config import FDConfig, LoadConfig, ModelConfig
from fastdeploy.model_executor.models.ernie4_5_moe import \
Ernie4_5_PretrainedModel
from fastdeploy.model_executor.models.ernie4_5_mtp import \
Ernie4_5_MTPPretrainedModel
from fastdeploy.model_executor.models.model_base import ModelRegistry
from fastdeploy.model_executor.models.qwen2 import Qwen2PretrainedModel
from fastdeploy.model_executor.models.qwen3 import Qwen3PretrainedModel
from fastdeploy.model_executor.models.qwen3moe import Qwen3MoePretrainedModel
from fastdeploy.model_executor.models.utils import load_checkpoint
MODEL_CLASSES = {
"Ernie4_5_MoeForCausalLM": Ernie4_5_PretrainedModel,
"Ernie4_5_MTPForCausalLM": Ernie4_5_MTPPretrainedModel,
"Qwen2ForCausalLM": Qwen2PretrainedModel,
"Qwen3ForCausalLM": Qwen3PretrainedModel,
"Qwen3MoeForCausalLM": Qwen3MoePretrainedModel,
"Ernie4_5_ForCausalLM": Ernie4_5_PretrainedModel
}
# TODO(gongshaotian): implement real interface to replace this
def get_model(llm_config: LLMConfig) -> nn.Layer:
def get_model_from_loader(fd_config: FDConfig) -> nn.Layer:
""" load or download model """
model_path = llm_config.load_config.model_path
model = paddle.load(model_path, return_numpy=True)
model_loader = DefaultModelLoader(fd_config.load_config)
model = model_loader.load_model(fd_config)
return model
@@ -42,7 +59,7 @@ class BaseModelLoader(ABC):
raise NotImplementedError
@abstractmethod
def load_model(self, llm_config: LLMConfig) -> nn.Layer:
def load_model(self, fd_config: FDConfig) -> nn.Layer:
""" Load a model with the given configurations."""
raise NotImplementedError
@@ -56,5 +73,23 @@ class DefaultModelLoader(BaseModelLoader):
def download_model(self, model_config: ModelConfig) -> None:
pass
def load_model(self, llm_config: LLMConfig) -> nn.Layer:
pass
def load_model(self, fd_config: FDConfig) -> nn.Layer:
context = paddle.LazyGuard()
architectures = fd_config.model_config.architectures[0]
# TODO(gongshaotian): Now, only support safetensor
model_class = MODEL_CLASSES[architectures]
state_dict = load_checkpoint(
fd_config.parallel_config.model_name_or_path,
model_class,
fd_config.model_config,
return_numpy=True)
with context:
model_cls = ModelRegistry.get_class(architectures)
model = model_cls(fd_config)
model.eval()
model.set_state_dict(state_dict)
return model

View File

@@ -16,30 +16,50 @@
import importlib
import inspect
import os
from pathlib import Path
from .model_base import ModelForCasualLM, ModelRegistry
inference_runner_supported_models = ["Qwen2ForCausalLM"]
inference_runner_supported_models = [
"Ernie4_5_MoeForCausalLM",
"Ernie4_5_MTPForCausalLM",
"Qwen2ForCausalLM",
"Qwen3MoeForCausalLM",
"Ernie4_5_ForCausalLM",
"Qwen3ForCausalLM",
]
def _find_py_files(root_dir):
root_path = Path(root_dir)
py_files = []
for py_file in root_path.rglob("*.py"):
rel_path = py_file.relative_to(root_dir)
if "__init__" in str(py_file):
continue
dotted_path = str(rel_path).replace("/", ".").replace("\\",
".").replace(
".py", "")
py_files.append(dotted_path)
return py_files
def auto_models_registry():
"""
auto registry all models in this folder
"""
for module_file in os.listdir(os.path.dirname(__file__)):
if module_file.endswith('.py') and module_file != '__init__.py':
module_name = module_file[:-3]
try:
module = importlib.import_module(
f'fastdeploy.model_executor.models.{module_name}')
for attr_name in dir(module):
attr = getattr(module, attr_name)
if inspect.isclass(attr) and issubclass(
attr,
ModelForCasualLM) and attr is not ModelForCasualLM:
ModelRegistry.register(attr)
except ImportError:
raise ImportError(f"{module_name=} import error")
for module_file in _find_py_files(os.path.dirname(__file__)):
try:
module = importlib.import_module(
f'fastdeploy.model_executor.models.{module_file}')
for attr_name in dir(module):
attr = getattr(module, attr_name)
if inspect.isclass(attr) and issubclass(
attr,
ModelForCasualLM) and attr is not ModelForCasualLM:
ModelRegistry.register(attr)
except ImportError:
raise ImportError(f"{module_file=} import error")
auto_models_registry()

View File

@@ -0,0 +1,774 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
from typing import Dict, Union
import numpy as np
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.forward_meta import ForwardMeta
class Ernie4_5_PretrainedModel(PretrainedModel):
"""
Ernie4_5_PretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
"""
get_tensor_parallel_mappings
"""
logger.info("erine inference model _get_tensor_parallel_mappings")
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def gqa_qkv_split_func(
weight,
tensor_parallel_degree,
tensor_parallel_rank,
num_attention_heads,
num_key_value_heads,
head_dim,
):
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
shape = get_shape(tensor)
if len(shape) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
def split_tensor(tensor, degree):
shape = get_shape(tensor)
size = shape[-1]
block_size = size // degree
if hasattr(tensor, "get_shape"):
return [
slice_tensor(tensor, i * block_size,
(i + 1) * block_size)
for i in range(degree)
]
else:
return np.split(tensor, degree, axis=-1)
q_list = split_tensor(q, tensor_parallel_degree)
k_list = split_tensor(k, tensor_parallel_degree)
v_list = split_tensor(v, tensor_parallel_degree)
if tensor_parallel_rank is None:
return [
np.concatenate([q_i, k_i, v_i], axis=-1)
for q_i, k_i, v_i in zip(q_list, k_list, v_list)
]
else:
return np.concatenate(
[
q_list[tensor_parallel_rank],
k_list[tensor_parallel_rank],
v_list[tensor_parallel_rank],
],
axis=-1,
)
def gqa_qkv_merge_func(weight_list, num_attention_heads,
num_key_value_heads, head_dim):
tensor_parallel_degree = len(weight_list)
num_attention_heads = num_attention_heads // tensor_parallel_degree
num_key_value_heads = num_key_value_heads // tensor_parallel_degree
is_paddle_tensor = not isinstance(weight_list[0], np.ndarray)
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
if len(get_shape(tensor)) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_list, k_list, v_list = [], [], []
for weight in weight_list:
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
q_list.append(q)
k_list.append(k)
v_list.append(v)
merged = q_list + k_list + v_list
if is_paddle_tensor:
tensor = paddle.concat(merged, axis=-1)
if tensor.place.is_gpu_place():
tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
return tensor
else:
return np.concatenate(merged, axis=-1)
if (config.num_key_value_heads is not None
and config.num_key_value_heads != config.num_attention_heads):
if is_split:
qkv_fn = partial(
gqa_qkv_split_func,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.head_dim,
)
else:
qkv_fn = partial(
gqa_qkv_merge_func,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.head_dim,
)
else:
qkv_fn = partial(fn, is_column=True)
def get_tensor_parallel_split_mappings(num_layers, moe_num_experts,
moe_num_shared_experts,
moe_layer_start_index):
final_actions = {}
base_model_prefix = "ernie"
base_actions = {
"lm_head.weight":
partial(fn, is_column=True),
# "eh_proj.weight": partial(fn, is_column=True),
f"{base_model_prefix}.embed_tokens.weight":
partial(fn, is_column=False),
}
base_actions[
f"{base_model_prefix}.layers.0.self_attn.qkv_proj.weight"] = qkv_fn
base_actions[
f"{base_model_prefix}.layers.0.self_attn.qkv_proj.quant_weight"] = qkv_fn
base_actions[
f"{base_model_prefix}.layers.0.self_attn.o_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.0.self_attn.o_proj.quant_weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.0.mlp.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.0.mlp.up_gate_proj.quant_weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.0.mlp.down_proj.weight"] = (
partial(fn, is_column=False))
base_actions[
f"{base_model_prefix}.layers.0.mlp.down_proj.quant_weight"] = partial(
fn, is_column=False)
for expert_idx in range(moe_num_experts):
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.up_gate_proj.quant_weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.down_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.down_proj.quant_weight"] = partial(
fn, is_column=False)
if moe_num_shared_experts > 0:
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.up_gate_proj.quant_weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.down_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.layers.{moe_layer_start_index}"
f".mlp.shared_experts.up_gate_proj.quant_weight"] = partial(
fn, is_column=False, is_naive_2fuse=True)
for key, action in base_actions.items():
if (f"{base_model_prefix}.layers.0.mlp.up_gate_proj.weight"
in key or
f"{base_model_prefix}.layers.0.mlp.up_gate_proj.quant_weight"
in key
or f"{base_model_prefix}.layers.0.mlp.down_proj.weight"
in key or
f"{base_model_prefix}.layers.0.mlp.down_proj.quant_weight"
in key):
for i in range(moe_layer_start_index):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
elif f"layers.{moe_layer_start_index}.mlp.experts." in key:
for i in range(moe_layer_start_index, num_layers):
final_actions[key.replace(
f"layers.{moe_layer_start_index}.",
f"layers.{i}.")] = action
elif f"layers.{moe_layer_start_index}.mlp.shared_experts." in key:
for i in range(moe_layer_start_index, num_layers):
final_actions[key.replace(
f"layers.{moe_layer_start_index}.",
f"layers.{i}.")] = action
elif f"{base_model_prefix}.layers.0." in key:
for i in range(num_layers):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
final_actions[key] = action
return final_actions
moe_num_experts = 0
moe_num_shared_experts = 0
if isinstance(config.moe_num_experts, list):
moe_num_experts = sum(config.moe_num_experts)
elif isinstance(config.moe_num_experts, int):
moe_num_experts = config.moe_num_experts
if hasattr(config, 'moe_num_shared_experts'):
moe_num_shared_experts = config.moe_num_shared_experts
moe_layer_start_index = -1
if isinstance(config.moe_layer_start_index, list):
moe_layer_start_index = min(config.moe_layer_start_index)
elif isinstance(config.moe_layer_start_index, int):
moe_layer_start_index = config.moe_layer_start_index
mappings = get_tensor_parallel_split_mappings(
config.num_layers,
moe_num_experts,
moe_num_shared_experts,
moe_layer_start_index,
)
return mappings
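
To make the GQA split above concrete, a toy NumPy example with 8 query heads, 2 key/value heads, head_dim 4 and tensor-parallel degree 2: each rank keeps a contiguous slice of q, k and v and re-fuses them in qkv order (values are arbitrary).

```python
import numpy as np

heads, kv_heads, head_dim, tp = 8, 2, 4, 2
hidden = 16
qkv = np.arange(hidden * (heads + 2 * kv_heads) * head_dim, dtype=np.float32)
qkv = qkv.reshape(hidden, -1)                       # fused [hidden, (q + k + v) * head_dim]
q_end = heads * head_dim                            # 32
k_end = q_end + kv_heads * head_dim                 # 40
q, k, v = qkv[:, :q_end], qkv[:, q_end:k_end], qkv[:, k_end:]
# Each rank keeps a contiguous 1/tp slice of q, k and v, re-fused in qkv order.
rank0 = np.concatenate([np.split(q, tp, axis=-1)[0],
                        np.split(k, tp, axis=-1)[0],
                        np.split(v, tp, axis=-1)[0]], axis=-1)
print(rank0.shape)   # (16, 24) == hidden x ((heads + 2 * kv_heads) // tp * head_dim)
```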
class Ernie4_5_MLP(nn.Layer):
def __init__(
self,
fd_config: FDConfig,
intermediate_size: int,
prefix: str = "",
) -> None:
super().__init__()
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.gate_up_proj = MergedColumnParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=intermediate_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
use_fast_ffn=True,
)
self.down_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.down_proj",
input_size=(intermediate_size // self.nranks),
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
self.act_fn = SiluAndMul(
fd_config=fd_config,
bias=None,
act_method=fd_config.model_config.hidden_act,
)
def load_state_dict(self, state_dict):
self.gate_up_proj.load_state_dict(state_dict)
self.down_proj.load_state_dict(state_dict)
def forward(self, hidden_states: paddle.Tensor):
gate_up_out = self.gate_up_proj(hidden_states)
act_out = self.act_fn(gate_up_out)
down_out = self.down_proj(act_out)
return down_out
class Ernie4_5_MoE(nn.Layer):
def __init__(self, fd_config: FDConfig, layer_id: int,
prefix: str) -> None:
super().__init__()
moe_quant_type = ""
if hasattr(fd_config.quant_config, 'moe_quant_type'):
moe_quant_type = fd_config.quant_config.moe_quant_type
if moe_quant_type == "w4a8":
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.quant_weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.quant_weight",
"ffn1_expert_weight_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight_scale",
"ffn2_expert_weight_scale_key":
f"{prefix}.experts.{{}}.down_proj.weight_scale",
"ffn1_expert_in_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.activation_scale",
"ffn2_expert_in_scale_key":
f"{prefix}.experts.{{}}.down_proj.activation_scale",
}
elif moe_quant_type == "w4w2":
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.quant_weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.quant_weight",
"ffn1_expert_weight_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight_scale",
"ffn2_expert_weight_scale_key":
f"{prefix}.experts.{{}}.down_proj.weight_scale",
"ffn1_expert_super_scales_key":
f"{prefix}.experts.{{}}.up_gate_proj.super_scales",
"ffn2_expert_super_scales_key":
f"{prefix}.experts.{{}}.down_proj.super_scales",
"ffn1_expert_code_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.code_scale",
"ffn2_expert_code_scale_key":
f"{prefix}.experts.{{}}.down_proj.code_scale",
"ffn1_expert_code_zp_key":
f"{prefix}.experts.{{}}.up_gate_proj.code_zp",
"ffn2_expert_code_zp_key":
f"{prefix}.experts.{{}}.down_proj.code_zp",
}
elif moe_quant_type == "tensor_wise_fp8" or (
moe_quant_type == "block_wise_fp8" and
fd_config.model_config.is_quantized):
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.quant_weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.quant_weight",
"ffn1_expert_weight_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight_scale",
"ffn2_expert_weight_scale_key":
f"{prefix}.experts.{{}}.down_proj.weight_scale",
"ffn1_expert_in_scale_key":
f"{prefix}.experts.{{}}.up_gate_proj.activation_scale",
"ffn2_expert_in_scale_key":
f"{prefix}.experts.{{}}.down_proj.activation_scale",
}
else:
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.weight",
}
self.fused_moe = FusedMoE(
fd_config=fd_config,
moe_intermediate_size=fd_config.moe_config.moe_intermediate_size,
num_experts=fd_config.moe_config.num_experts,
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
weight_key_map=weight_key_map,
)
self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
if self.num_shared_experts > 0:
shared_experts_hidden_dim = self.num_shared_experts * fd_config.moe_config.moe_intermediate_size
self.shared_experts = Ernie4_5_MLP(
fd_config=fd_config,
intermediate_size=shared_experts_hidden_dim,
prefix=f"{prefix}.shared_experts",
)
def load_state_dict(self, state_dict):
self.fused_moe.load_state_dict(state_dict)
if self.num_shared_experts > 0:
self.shared_experts.load_state_dict(state_dict)
def forward(self, hidden_states: paddle.Tensor):
out = self.fused_moe(hidden_states)
if self.num_shared_experts > 0:
s_x = self.shared_experts(hidden_states)
out = out + s_x
return out
class Ernie4_5_Attention(nn.Layer):
def __init__(self, fd_config: FDConfig, layer_id: int,
prefix: str) -> None:
super().__init__()
nranks = fd_config.parallel_config.tensor_parallel_degree
self.qkv_proj = QKVParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.qkv_proj",
)
self.o_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.o_proj",
input_size=(fd_config.model_config.head_dim *
fd_config.model_config.num_attention_heads // nranks),
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(
fd_config=fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=False,
)
def load_state_dict(self, state_dict):
self.qkv_proj.load_state_dict(state_dict)
self.o_proj.load_state_dict(state_dict)
self.attn.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
):
qkv_out = self.qkv_proj(hidden_states)
attn_out = self.attn(
qkv=qkv_out,
forward_meta=forward_meta,
)
output = self.o_proj(attn_out)
return output
class Ernie4_5_DecoderLayer(nn.Layer):
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Ernie4_5_Attention(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
if (fd_config.moe_config.num_experts is not None
and layer_id >= fd_config.moe_config.moe_layer_start_index):
self.mlp = Ernie4_5_MoE(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.mlp",
)
else:
self.mlp = Ernie4_5_MLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.post_attention_layernorm",
)
def load_state_dict(self, state_dict):
self.self_attn.load_state_dict(state_dict)
self.mlp.load_state_dict(state_dict)
self.input_layernorm.load_state_dict(state_dict)
self.post_attention_layernorm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
residual: paddle.Tensor = None,
):
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
hidden_states=hidden_states,
forward_meta=forward_meta,
)
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
@support_graph_optimization
class Ernie4_5_Model(nn.Layer):
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Ernie4_5_Model class.
Args:
    fd_config (FDConfig): Configurations for the model and inference runtime.
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "ernie"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype(),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"))
self.hidden_layers = [
Ernie4_5_DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
]
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start loading layer {i}")
self.hidden_layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.hidden_layers[i](forward_meta,
hidden_states,
residual)
hidden_states = hidden_states + residual
out = self.norm(hidden_states)
return out
class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
"""
Ernie4_5_MoeForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Ernie4_5_MoeForCausalLM, self).__init__(fd_config)
self.fd_config = fd_config
self.model = Ernie4_5_Model(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
return "Ernie4_5_MoeForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray,
paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
if self.tie_word_embeddings:
self.lm_head.out_linear.weight.set_value(
self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
else:
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def empty_input_forward(self):
"""
empty_input_forward
"""
fake_hidden_states = paddle.empty(
shape=[0, self.fd_config.model_config.hidden_size],
dtype=paddle.get_default_dtype(),
)
for i in range(self.fd_config.moe_config.moe_layer_start_index,
self.fd_config.model_config.num_layers):
self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
class Ernie4_5_ForCausalLM(Ernie4_5_MoeForCausalLM):
"""
Ernie4_5_ForCausalLM
"""
@classmethod
def name(cls):
"""
Model Architecture Name
"""
return "Ernie4_5_ForCausalLM"

View File

@@ -0,0 +1,417 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
from typing import Dict, Union
import numpy as np
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.forward_meta import ForwardMeta
class Ernie4_5_MTPPretrainedModel(PretrainedModel):
"""
Ernie4_5_MTPPretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
"""
get_tensor_parallel_mappings
"""
logger.info("erine inference model _get_tensor_parallel_mappings")
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def gqa_qkv_split_func(
weight,
tensor_parallel_degree,
tensor_parallel_rank,
num_attention_heads,
num_key_value_heads,
head_dim,
):
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
shape = get_shape(tensor)
if len(shape) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
def split_tensor(tensor, degree):
shape = get_shape(tensor)
size = shape[-1]
block_size = size // degree
if hasattr(tensor, "get_shape"):
return [
slice_tensor(tensor, i * block_size,
(i + 1) * block_size)
for i in range(degree)
]
else:
return np.split(tensor, degree, axis=-1)
q_list = split_tensor(q, tensor_parallel_degree)
k_list = split_tensor(k, tensor_parallel_degree)
v_list = split_tensor(v, tensor_parallel_degree)
if tensor_parallel_rank is None:
return [
np.concatenate([q_i, k_i, v_i], axis=-1)
for q_i, k_i, v_i in zip(q_list, k_list, v_list)
]
else:
return np.concatenate(
[
q_list[tensor_parallel_rank],
k_list[tensor_parallel_rank],
v_list[tensor_parallel_rank],
],
axis=-1,
)
def gqa_qkv_merge_func(weight_list, num_attention_heads,
num_key_value_heads, head_dim):
tensor_parallel_degree = len(weight_list)
num_attention_heads = num_attention_heads // tensor_parallel_degree
num_key_value_heads = num_key_value_heads // tensor_parallel_degree
is_paddle_tensor = not isinstance(weight_list[0], np.ndarray)
def get_shape(tensor):
return (tensor.get_shape()
if hasattr(tensor, "get_shape") else tensor.shape)
def slice_tensor(tensor, start, end):
if len(get_shape(tensor)) == 1:
return tensor[start:end]
else:
return tensor[..., start:end]
q_list, k_list, v_list = [], [], []
for weight in weight_list:
q_end = num_attention_heads * head_dim
k_end = q_end + num_key_value_heads * head_dim
v_end = k_end + num_key_value_heads * head_dim
q = slice_tensor(weight, 0, q_end)
k = slice_tensor(weight, q_end, k_end)
v = slice_tensor(weight, k_end, v_end)
q_list.append(q)
k_list.append(k)
v_list.append(v)
merged = q_list + k_list + v_list
if is_paddle_tensor:
tensor = paddle.concat(merged, axis=-1)
if tensor.place.is_gpu_place():
tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
return tensor
else:
return np.concatenate(merged, axis=-1)
if (config.num_key_value_heads is not None
and config.num_key_value_heads != config.num_attention_heads):
if is_split:
qkv_fn = partial(
gqa_qkv_split_func,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.hidden_size // config.num_attention_heads,
)
else:
qkv_fn = partial(
gqa_qkv_merge_func,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
head_dim=config.hidden_size // config.num_attention_heads,
)
else:
qkv_fn = partial(fn, is_column=True)
def get_tensor_parallel_split_mappings(num_layers, moe_num_experts,
moe_layer_start_index):
"""
Get the tensor parallel split mappings for all layers.
"""
final_actions = {}
base_model_prefix = "ernie.mtp_block"
base_actions = {}
base_actions["ernie.mtp_linear_proj.0.weight"] = partial(
fn, is_column=True)
base_actions[
f"{base_model_prefix}.0.self_attn.qkv_proj.weight"] = qkv_fn
base_actions[
f"{base_model_prefix}.0.self_attn.o_proj.weight"] = partial(
fn, is_column=False)
base_actions[
f"{base_model_prefix}.0.mlp.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[f"{base_model_prefix}.0.mlp.down_proj.weight"] = (
partial(fn, is_column=False))
for expert_idx in range(moe_num_experts):
base_actions[
f"{base_model_prefix}.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.up_gate_proj.weight"] = partial(
fn, is_column=True, is_naive_2fuse=True)
base_actions[
f"{base_model_prefix}.{moe_layer_start_index}"
f".mlp.experts.{expert_idx}.down_proj.weight"] = partial(
fn, is_column=False)
for key, action in base_actions.items():
if (f"{base_model_prefix}.0.mlp.up_gate_proj.weight" in key or
f"{base_model_prefix}.0.mlp.down_proj.weight" in key):
for i in range(moe_layer_start_index):
final_actions[key.replace("0.", f"{i}.")] = action
elif f"{moe_layer_start_index}.mlp.experts." in key:
for i in range(moe_layer_start_index, num_layers):
final_actions[key.replace(f"{moe_layer_start_index}.",
f"{i}.")] = action
elif f"{base_model_prefix}.0." in key:
for i in range(num_layers):
final_actions[key.replace("0.", f"{i}.")] = action
final_actions[key] = action
return final_actions
moe_num_experts = 0
mappings = get_tensor_parallel_split_mappings(
config.num_layers,
moe_num_experts,
config.moe_layer_start_index,
)
return mappings
class Ernie4_5_MTPModel(nn.Layer):
"""
Ernie4_5_MTPModel
"""
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Ernie4_5_MTPModel class.
Args:
    fd_config (FDConfig): Configurations for the LLM model.
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
self.embeddings = fd_config.speculative_config.sharing_model.model.embeddings
self.hidden_layers = [
Ernie4_5_DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.{i}")
for i in range(self.num_layers)
]
self.enorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix="ernie.mtp_emb_norm.0",
)
self.hnorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix="ernie.mtp_hidden_norm.0",
)
self.eh_proj = ParallelLMHead(
fd_config=fd_config,
num_embeddings=fd_config.model_config.hidden_size,
embedding_dim=fd_config.model_config.hidden_size * 2,
prefix="ernie.mtp_linear_proj.0",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
# self.embeddings.load_state_dict(state_dict)
self.enorm.load_state_dict(state_dict)
self.hnorm.load_state_dict(state_dict)
self.eh_proj.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.hidden_layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
previous_hidden_states: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
forward
"""
inputs_embedding = self.embeddings(
ids_remove_padding=ids_remove_padding)
inputs_embedding = paddle.concat(
[self.enorm(inputs_embedding),
self.hnorm(previous_hidden_states)],
axis=-1)
hidden_states = self.eh_proj(inputs_embedding)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.hidden_layers[i](forward_meta,
hidden_states,
residual)
hidden_states = hidden_states + residual
return hidden_states
class Ernie4_5_MTPForCausalLM(ModelForCasualLM):
"""
Ernie4_5_MTPForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Ernie4_5_MTPForCausalLM, self).__init__(fd_config)
self.fd_config = fd_config
self.model = Ernie4_5_MTPModel(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = fd_config.speculative_config.sharing_model.lm_head
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
"""
Model Architecture Name
"""
return "Ernie4_5_MTPForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray,
paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
# if self.tie_word_embeddings:
# self.lm_head.out_linear.weight.set_value(
# self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
# else:
# self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
"""
compute logits
"""
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def empty_input_forward(self):
"""
empty_input_forward
"""
fake_hidden_states = paddle.empty(
shape=[0, self.fd_config.model_config.hidden_size],
dtype=paddle.get_default_dtype(),
)
for i in range(self.fd_config.moe_config.moe_layer_start_index,
self.fd_config.model_config.num_layers):
self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
def forward(
self,
ids_remove_padding: paddle.Tensor,
previous_hidden_states: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
forward
"""
hidden_states = self.model(ids_remove_padding, previous_hidden_states,
forward_meta)
return hidden_states

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,167 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import copy
from fastdeploy.config import ModelConfig
from .dfnrope.modeling import DFNRopeVisionTransformerConfig
__all__ = [
"Ernie4_5_VLMoeConfig",
]
class Ernie4_5_VLMoeConfig(ModelConfig):
r"""
This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Ernie-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
Example:
```python
>>> from paddleformers.transformers import ErnieModel, ErnieConfig
>>> # Initializing an Ernie ernie-7b style configuration
>>> configuration = ErnieConfig()
>>> # Initializing a model from the ernie-7b style configuration
>>> model = ErnieModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "erniemoevl"
attribute_map = {
"n_positions": "max_position_embeddings",
"n_embd": "hidden_size",
"n_layer": "num_hidden_layers",
"n_head": "num_attention_heads",
"n_inner": "intermediate_size",
"activation_function": "hidden_act",
}
def __init__(
self,
vision_config=None,
im_patch_id=None,
pixel_hidden_size=None, # None for fuyu
modality_detach=False,
temporal_conv_size=2,
spatial_conv_size=2,
mm_vocab_size=0,  # vocab size for multimodal special tokens
max_text_id=None,
use_temporal_conv=True,
moe_use_size_all2all=False,
moe_num_attn_experts=False,
moe_dense_experts_token_type_id: int = 3,
moe_use_hard_gate: bool = True,
moe_fuse_experts: bool = False,
moe_use_token_type_bias: bool = False,
disable_ffn_model_parallel=False,
fuse_attn_ffn=True,
rope_3d=True,
freq_allocation=20,
using_precision_check=False,
use_recompute_resampler=False,
resampler_fuse_rms_norm=False,
moe_layer_feed_fake_token=False,
moe_num_experts=0,
**kwargs,
):
super().__init__(**kwargs)
self.vision_config = DFNRopeVisionTransformerConfig(
**vision_config) if vision_config else None
self.im_patch_id = im_patch_id
self.pixel_hidden_size = pixel_hidden_size
self.modality_detach = modality_detach
self.temporal_conv_size = temporal_conv_size
self.spatial_conv_size = spatial_conv_size
self.mm_vocab_size = mm_vocab_size
self.max_text_id = max_text_id
self.use_temporal_conv = use_temporal_conv
self.moe_use_size_all2all = moe_use_size_all2all
self.moe_num_attn_experts = moe_num_attn_experts
self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
self.moe_use_hard_gate = moe_use_hard_gate
self.moe_fuse_experts = moe_fuse_experts
self.moe_use_token_type_bias = moe_use_token_type_bias
self.disable_ffn_model_parallel = disable_ffn_model_parallel
self.fuse_attn_ffn = fuse_attn_ffn
self.rope_3d = rope_3d
self.freq_allocation = freq_allocation
self.using_precision_check = using_precision_check
self.use_recompute_resampler = use_recompute_resampler
self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
self.moe_num_experts = moe_num_experts
@property
def multimodel_experts(self) -> bool:
"""是否有多种类型的experts."""
return isinstance(self.moe_num_experts,
(tuple, list)) and len(self.moe_num_experts) > 1
@property
def use_moe(self) -> bool:
"""
Check if model is using MoE architecture.
Returns:
bool: True if moe_num_experts > 0, False otherwise
"""
return sum(
self.moe_num_experts
) > 0 if self.multimodel_experts else self.moe_num_experts > 0
def to_dict(self, saving_file=False):
"""to_dict"""
output = copy.deepcopy(self.__dict__)
if self.vision_config:
output["vision_config"] = (
self.vision_config.to_diff_dict() if isinstance(
self.vision_config,
(DFNRopeVisionTransformerConfig)) else self.vision_config)
output["model_type"] = self.__class__.model_type
return output
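# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how `multimodel_experts`
# and `use_moe` evaluate for a scalar versus a per-modality list of expert
# counts. The two helpers below re-implement the properties standalone,
# purely for demonstration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    def multimodel_experts(moe_num_experts):
        return isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1

    def use_moe(moe_num_experts):
        return (sum(moe_num_experts) > 0
                if multimodel_experts(moe_num_experts) else moe_num_experts > 0)

    print(multimodel_experts(0), use_moe(0))                 # False False (dense model)
    print(multimodel_experts(64), use_moe(64))               # False True  (single expert pool)
    print(multimodel_experts([64, 64]), use_moe([64, 64]))   # True True   (text + image experts)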

View File

@@ -0,0 +1,22 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .configuration import DFNRopeVisionTransformerConfig
from .modeling import DFNRopeVisionTransformerPretrainedModel
__all__ = [
'DFNRopeVisionTransformerConfig', 'DFNRopeVisionTransformerPretrainedModel'
]

View File

@@ -0,0 +1,287 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import math
from collections import OrderedDict
import paddle
import paddle.nn.functional as F
from paddle import Tensor, nn
class NewGELUActivation(nn.Layer):
"""
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return (0.5 * input * (1.0 + paddle.tanh(
math.sqrt(2.0 / math.pi) *
(input + 0.044715 * paddle.pow(input, 3.0)))))
class GELUActivation(nn.Layer):
"""
Original Implementation of the GELU activation function in Google BERT repo when initially created. For
information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
def __init__(self, use_gelu_python: bool = False):
"""_summary_
Args:
use_gelu_python (bool, optional): _description_. Defaults to False.
"""
super().__init__()
if use_gelu_python:
self.act = self._gelu_python
else:
self.act = nn.functional.gelu
def _gelu_python(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return self.act(input)
class FastGELUActivation(nn.Layer):
"""
Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return 0.5 * input * (1.0 +
paddle.tanh(input * 0.7978845608 *
(1.0 + 0.044715 * input * input)))
class QuickGELUActivation(nn.Layer):
"""
Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return input * F.sigmoid(1.702 * input)
class ClippedGELUActivation(nn.Layer):
"""
Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes, as
it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to
https://arxiv.org/abs/2004.09602.
Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
"""
def __init__(self, min: float, max: float):
if min > max:
raise ValueError(
f"min should be < max (got min: {min}, max: {max})")
super().__init__()
self.min = min
self.max = max
def forward(self, x: Tensor) -> Tensor:
"""_summary_
Args:
x (Tensor): _description_
Returns:
Tensor: _description_
"""
return paddle.clip(gelu(x), self.min, self.max)
class SiLUActivation(nn.Layer):
"""
See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
later.
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return F.silu(input)
class MishActivation(nn.Layer):
"""
See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
visit the official repository for the paper: https://github.com/digantamisra98/Mish
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return F.mish(input)
class LinearActivation(nn.Layer):
"""
Applies the linear activation function, i.e. forwarding input directly to output.
"""
def forward(self, input: Tensor) -> Tensor:
"""_summary_
Args:
input (Tensor): _description_
Returns:
Tensor: _description_
"""
return input
class ClassInstantier(OrderedDict):
"""_summary_
Args:
OrderedDict (_type_): _description_
"""
def __getitem__(self, key):
"""_summary_
Args:
key (_type_): _description_
Returns:
_type_: _description_
"""
content = super().__getitem__(key)
cls, kwargs = content if isinstance(content, tuple) else (content, {})
return cls(**kwargs)
ACT2CLS = {
"gelu": GELUActivation,
"gelu_10": (ClippedGELUActivation, {
"min": -10,
"max": 10
}),
"gelu_fast": FastGELUActivation,
"gelu_new": NewGELUActivation,
"gelu_python": (GELUActivation, {
"use_gelu_python": True
}),
"linear": LinearActivation,
"mish": MishActivation,
"quick_gelu": QuickGELUActivation,
"relu": nn.ReLU,
"relu6": nn.ReLU6,
"sigmoid": nn.Sigmoid,
"silu": SiLUActivation,
"swish": SiLUActivation,
"tanh": nn.Tanh,
}
ACT2FN = ClassInstantier(ACT2CLS)
def get_activation(activation_string):
"""_summary_
Args:
activation_string (_type_): _description_
Raises:
KeyError: _description_
Returns:
_type_: _description_
"""
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError(
f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
)
# For backwards compatibility with: from activations import gelu_python
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")

View File

@@ -0,0 +1,70 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from paddleformers.transformers.configuration_utils import PretrainedConfig
__all__ = [
"DFNRopeVisionTransformerConfig",
]
class DFNRopeVisionTransformerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Ernie-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
model_type = "DFNRope_vision_transformer"
def __init__(
self,
depth=32,
embed_dim=1280,
hidden_size=3584,
hidden_act="quick_gelu",
mlp_ratio=4,
num_heads=16,
in_channels=3,
patch_size=14,
spatial_merge_size=2,
attn_implementation="eager", # new added
pp_data_balance=False,
recompute=False,
attn_sep=False,
vit_first_fwd_bsz=128,
vit_num_recompute_layers=10000,
**kwargs,
):
super().__init__(**kwargs)
self.depth = depth
self.embed_dim = embed_dim
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.mlp_ratio = mlp_ratio
self.num_heads = num_heads
self.in_channels = in_channels
self.patch_size = patch_size
self.spatial_merge_size = spatial_merge_size
self.attn_implementation = attn_implementation
self.pp_data_balance = pp_data_balance
self.recompute = recompute
self.attn_sep = attn_sep
self.vit_first_fwd_bsz = vit_first_fwd_bsz
self.vit_num_recompute_layers = vit_num_recompute_layers
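# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file): instantiate the vision
# config with its defaults and override a couple of fields. The overridden
# values here are arbitrary examples.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    cfg = DFNRopeVisionTransformerConfig(depth=4, num_heads=8)
    print(cfg.model_type, cfg.depth, cfg.num_heads, cfg.patch_size)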

View File

@@ -0,0 +1,732 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from functools import partial
import numpy as np
import paddle
import paddle.distributed as dist
import paddle.nn.functional as F
from paddle import nn
from paddle.distributed import fleet
from paddle.distributed.fleet.meta_parallel import (ColumnParallelLinear,
RowParallelLinear)
from paddle.distributed.fleet.utils import recompute
from paddle.nn.functional.flash_attention import \
flash_attn_unpadded as flash_attn_varlen_func
from paddleformers.transformers.model_utils import PretrainedModel
from .activation import ACT2FN
from .configuration import DFNRopeVisionTransformerConfig
def get_hcg():
"""
Get the hybrid communication group.
Args:
    None
Returns:
    The hybrid communication group.
"""
return fleet.get_hybrid_communicate_group()
class _AllToAll(paddle.autograd.PyLayer):
@staticmethod
def forward(
ctx,
input,
group,
output_split_sizes=None,
input_split_sizes=None,
):
"""
All-to-all communication in the group.
Args:
ctx (Any): Context object.
input (Tensor): Input tensor.
group (Group): The group object.
Returns:
Tensor: Output tensor.
"""
ctx.group = group
ctx.input_split_sizes = input_split_sizes
ctx.output_split_sizes = output_split_sizes
# return input
if dist.get_world_size(group) <= 1:
return input
if input_split_sizes is None and output_split_sizes is None:
output = paddle.empty_like(input)
task = dist.stream.alltoall_single(output, input, None, None,
group, True, True)
task.wait()
else:
out_sizes = [sum(output_split_sizes)]
out_sizes.extend(input.shape[1:])
output = paddle.empty(out_sizes, dtype=input.dtype)
task = dist.stream.alltoall_single(output,
input,
output_split_sizes,
input_split_sizes,
group,
sync_op=False)
task.wait()
return output
@staticmethod
def backward(ctx, *grad_output):
"""
all-to-all backward
"""
# return grad_output
if ctx.input_split_sizes is None and ctx.output_split_sizes is None:
return _AllToAll.apply(*grad_output, ctx.group)
else:
return _AllToAll.apply(*grad_output, ctx.group,
ctx.input_split_sizes,
ctx.output_split_sizes)
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return paddle.concat([-x2, x1], axis=-1) # shape is the same as x
def apply_rotary_pos_emb_vision(tensor: paddle.Tensor,
freqs: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
tensor (paddle.Tensor): _description_
freqs (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
orig_dtype = tensor.dtype
with paddle.amp.auto_cast(False):
tensor = tensor.astype(dtype="float32")
cos = freqs.cos()
sin = freqs.sin()
cos = cos.unsqueeze(1).tile(
repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32")
sin = sin.unsqueeze(1).tile(
repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32")
output = tensor * cos + rotate_half(tensor) * sin
output = paddle.cast(output, orig_dtype)
return output
def qkv_reshard_head(tensor, group):
"""
将qkv在seq维度拼接后一起做切分维度的转换
"""
parallelism = group.nranks
qkv_seqlen, head_num, head_dim = tensor.shape
tensor = tensor.transpose(perm=[1, 0, 2]).contiguous()
out = _AllToAll.apply(tensor, group)
out = paddle.split(out, parallelism, axis=0)
output_q = []
output_k = []
output_v = []
for output_i in out:
outout = output_i.transpose(perm=[1, 0, 2]).contiguous()
output = paddle.split(outout, 3, axis=0)
output_q.append(output[0])
output_k.append(output[1])
output_v.append(output[2])
q = paddle.concat(output_q, axis=0)
k = paddle.concat(output_k, axis=0)
v = paddle.concat(output_v, axis=0)
return q, k, v
class VisionFlashAttention2(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self,
dim: int,
num_heads: int = 16,
tensor_parallel_degree: int = 1) -> None:
super().__init__()
self.num_heads = num_heads
self.tensor_parallel_degree = tensor_parallel_degree
if tensor_parallel_degree > 1:
self.qkv = ColumnParallelLinear(
dim,
dim * 3,
mp_group=fleet.get_hybrid_communicate_group().
get_model_parallel_group(),
weight_attr=None,
has_bias=True,
fuse_matmul_bias=True,
gather_output=False,
)
self.proj = RowParallelLinear(
dim,
dim,
mp_group=fleet.get_hybrid_communicate_group(
).get_model_parallel_group(),
input_is_parallel=True,
has_bias=True)
else:
self.qkv = nn.Linear(dim, dim * 3, bias_attr=True)
self.proj = nn.Linear(dim, dim)
self.head_dim = dim // num_heads  # must be added manually (used for the softmax scale)
def forward(
self,
hidden_states: paddle.Tensor,
cu_seqlens: paddle.Tensor,
rotary_pos_emb: paddle.Tensor = None,
attn_sep=False,
) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
cu_seqlens (paddle.Tensor): _description_
rotary_pos_emb (paddle.Tensor, optional): _description_. Defaults to None.
Returns:
paddle.Tensor: _description_
"""
seq_length = hidden_states.shape[0]
qkv = self.qkv(hidden_states).reshape(
[seq_length, 3, self.num_heads // self.tensor_parallel_degree,
-1]).transpose(perm=[1, 0, 2, 3])
q, k, v = qkv.unbind(axis=0)
if attn_sep:
hcg = get_hcg()
mp_group = hcg.get_model_parallel_group()
qkv = paddle.concat([q, k, v], axis=0)
q, k, v = qkv_reshard_head(qkv, mp_group)
seq_length = q.shape[0]
q = apply_rotary_pos_emb_vision(q.unsqueeze(axis=0),
rotary_pos_emb).squeeze(axis=0)
k = apply_rotary_pos_emb_vision(k.unsqueeze(axis=0),
rotary_pos_emb).squeeze(axis=0)
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
softmax_scale = self.head_dim**-0.5  # TODO: needs to be set manually
attn_output = (
flash_attn_varlen_func( # flash_attn_unpadded
q,  # float32 is not supported
k,
v,
cu_seqlens,
cu_seqlens,
max_seqlen,
max_seqlen,
scale=softmax_scale,  # TODO: needs to be set manually
)[0].squeeze(0).reshape([seq_length, -1]))
if attn_sep:
out = _AllToAll.apply(attn_output, mp_group)
out = paddle.split(out, mp_group.nranks, axis=0)
attn_output = paddle.concat(out, axis=1)
attn_output = attn_output.astype(paddle.float32)
attn_output = self.proj(attn_output)
return attn_output
class PatchEmbed(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(
self,
patch_size: int = 14,
in_channels: int = 3,
embed_dim: int = 1152,
) -> None:
super().__init__()
self.patch_size = patch_size
self.in_channels = in_channels
self.embed_dim = embed_dim
self.proj = nn.Linear(in_channels * patch_size * patch_size,
embed_dim,
bias_attr=False)
def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
target_dtype = self.proj.weight.dtype
hidden_states = self.proj(
paddle.cast(hidden_states, dtype=target_dtype))
return hidden_states
class VisionMlp(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self,
dim: int,
hidden_dim: int,
hidden_act: str,
tensor_parallel_degree: int = 1) -> None:
super().__init__()
self.tensor_parallel_degree = tensor_parallel_degree
if self.tensor_parallel_degree > 1:
self.fc1 = ColumnParallelLinear(
dim,
hidden_dim,
mp_group=fleet.get_hybrid_communicate_group(
).get_model_parallel_group(),
gather_output=False,
has_bias=True)
self.fc2 = RowParallelLinear(
hidden_dim,
dim,
mp_group=fleet.get_hybrid_communicate_group(
).get_model_parallel_group(),
input_is_parallel=True,
has_bias=True)
else:
self.fc1 = nn.Linear(dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, dim)
self.act = ACT2FN[hidden_act]
def forward(self, x) -> paddle.Tensor:
"""_summary_
Args:
x (_type_): _description_
Returns:
paddle.Tensor: _description_
"""
return self.fc2(self.act(self.fc1(x)))
class VisionRotaryEmbedding(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self, dim: int, theta: float = 10000.0) -> None:
"""_summary_
Args:
dim (int): _description_
theta (float, optional): _description_. Defaults to 10000.0.
"""
super().__init__()
self.inv_freq = 1.0 / theta**(
paddle.arange(start=0, end=dim, step=2, dtype="float32") / dim)
def forward(self, seqlen: int) -> paddle.Tensor:
"""_summary_
Args:
seqlen (int): _description_
Returns:
paddle.Tensor: _description_
"""
seq = paddle.arange(seqlen).cast(self.inv_freq.dtype)
freqs = paddle.outer(x=seq, y=self.inv_freq)
return freqs
class DFNRopeVisionBlock(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self, config, attn_implementation: str = "sdpa") -> None:
"""_summary_
Args:
config (_type_): _description_
attn_implementation (str, optional): _description_. Defaults to "sdpa".
"""
super().__init__()
self.norm1 = nn.LayerNorm(config.embed_dim, epsilon=1e-6)
self.norm2 = nn.LayerNorm(config.embed_dim, epsilon=1e-6)
mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
self.attn = VisionFlashAttention2(
config.embed_dim,
num_heads=config.num_heads,
tensor_parallel_degree=config.tensor_parallel_degree)
self.mlp = VisionMlp(
dim=config.embed_dim,
hidden_dim=mlp_hidden_dim,
hidden_act=config.hidden_act,
tensor_parallel_degree=config.tensor_parallel_degree)
self.config = config
def forward(self,
hidden_states,
cu_seqlens,
rotary_pos_emb,
attn_sep=False) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (_type_): _description_
cu_seqlens (_type_): _description_
rotary_pos_emb (_type_): _description_
Returns:
paddle.Tensor: _description_
"""
hidden_states = hidden_states + self.attn(
self.norm1(hidden_states),
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
attn_sep=attn_sep,
)
hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
return hidden_states
class PatchMerger(nn.Layer):
"""_summary_
Args:
nn (_type_): _description_
"""
def __init__(self,
dim: int,
context_dim: int,
spatial_merge_size: int = 2) -> None:
"""_summary_
Args:
dim (int): _description_
context_dim (int): _description_
spatial_merge_size (int, optional): _description_. Defaults to 2.
"""
super().__init__()
self.hidden_size = context_dim * (spatial_merge_size**2)
self.ln_q = nn.LayerNorm(context_dim, epsilon=1e-6)
self.mlp = nn.Sequential(
nn.Linear(self.hidden_size, self.hidden_size),
nn.GELU(),
nn.Linear(self.hidden_size, dim),
)
def forward(self, x: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
x (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
x = self.mlp(self.ln_q(x).reshape([-1, self.hidden_size]))
return x
class DFNRopeVisionTransformerPretrainedModel(PretrainedModel):
"""_summary_
Args:
PretrainedModel (_type_): _description_
Returns:
_type_: _description_
"""
config_class = DFNRopeVisionTransformerConfig
def __init__(self, config) -> None:
super().__init__(config)
self.spatial_merge_size = config.spatial_merge_size
self.patch_embed = PatchEmbed(
patch_size=config.patch_size,
in_channels=config.in_channels,
embed_dim=config.embed_dim,
)
head_dim = config.embed_dim // config.num_heads
self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
self.blocks = nn.LayerList(
[DFNRopeVisionBlock(config) for _ in range(config.depth)])
assert (
config.hidden_size == config.embed_dim
), "in DFNRope, vit's config.hidden must be equal to config.embed_dim"
# self.merger = PatchMerger(dim=config.hidden_size, context_dim=config.embed_dim)
self.ln = nn.LayerNorm(config.hidden_size, epsilon=1e-6)
def get_dtype(self) -> paddle.dtype:
"""_summary_
Returns:
paddle.dtype: _description_
"""
return self.blocks[0].mlp.fc2.weight.dtype
def get_name_mappings_to_training(self, ):
""" get_name_mappings_to_training """
infer_to_train = {}
# vit train names
vit_names = [
"vision_model.patch_embed.proj.weight", "vision_model.ln.weight",
"vision_model.ln.bias"
]
vit_layer = 32
for layer_idx in range(vit_layer):
vit_names.append(f"vision_model.blocks.{layer_idx}.norm1.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.norm1.bias")
vit_names.append(f"vision_model.blocks.{layer_idx}.norm2.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.norm2.bias")
vit_names.append(
f"vision_model.blocks.{layer_idx}.attn.qkv.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.attn.qkv.bias")
vit_names.append(
f"vision_model.blocks.{layer_idx}.attn.proj.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.attn.proj.bias")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc1.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc1.bias")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc2.weight")
vit_names.append(f"vision_model.blocks.{layer_idx}.mlp.fc2.bias")
for train_name in vit_names:
infer_to_train[train_name] = train_name
return infer_to_train
def rot_pos_emb(self, grid_thw, num_pad=0):
"""_summary_
Args:
grid_thw (_type_): _description_
Returns:
_type_: _description_
"""
pos_ids = []
grid_hw_array = np.array(grid_thw, dtype=np.int64)
for t, h, w in grid_hw_array:
hpos_ids = np.arange(h).reshape(-1, 1)
hpos_ids = np.tile(hpos_ids, (1, w))
hpos_ids = hpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
hpos_ids = np.transpose(hpos_ids, (0, 2, 1, 3))
hpos_ids = hpos_ids.flatten()
wpos_ids = np.arange(w).reshape(1, -1)
wpos_ids = np.tile(wpos_ids, (h, 1))
wpos_ids = wpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
wpos_ids = np.transpose(wpos_ids, (0, 2, 1, 3))
wpos_ids = wpos_ids.flatten()
stacked_ids = np.stack([hpos_ids, wpos_ids], axis=-1)
tiled_ids = np.tile(stacked_ids, (t, 1))
pos_ids.append(tiled_ids)
pos_ids = np.concatenate(pos_ids, axis=0)
if num_pad > 0:
pos_ids = np.concatenate(
[pos_ids, np.zeros((num_pad, 2), dtype=pos_ids.dtype)])
max_grid_size = np.amax(grid_hw_array[:, 1:])
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(start_axis=1)
return rotary_pos_emb
def forward(self,
hidden_states: paddle.Tensor,
grid_thw: paddle.Tensor,
num_pad=0) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
grid_thw (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
hidden_states = self.patch_embed(hidden_states)
rotary_pos_emb = self.rot_pos_emb(grid_thw, num_pad=num_pad)
cu_seqlens = paddle.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
grid_thw[:, 0]).cumsum(
axis=0, dtype="int32")
if num_pad > 0:
cu_seqlens = F.pad(cu_seqlens, (1, 1), value=0)
cu_seqlens[-1] = cu_seqlens[-2] + num_pad
else:
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
attn_sep = getattr(self.config, "attn_sep", False)
vit_num_recompute_layers = getattr(self.config,
"vit_num_recompute_layers",
self.config.depth)
for idx, blk in enumerate(self.blocks):
if self.config.recompute and self.training and idx < vit_num_recompute_layers:
hidden_states = recompute(blk, hidden_states, cu_seqlens,
rotary_pos_emb, attn_sep)
else:
hidden_states = blk(
hidden_states,
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
attn_sep=attn_sep,
)
# ret = self.merger(hidden_states)
# ret = hidden_states
ret = self.ln(hidden_states)  # apply the final layer norm
return ret
def extract_feature(self, hidden_states: paddle.Tensor,
grid_thw: paddle.Tensor) -> paddle.Tensor:
"""_summary_
Args:
hidden_states (paddle.Tensor): _description_
grid_thw (paddle.Tensor): _description_
Returns:
paddle.Tensor: _description_
"""
return self.forward(hidden_states, grid_thw)
@classmethod
def _get_tensor_parallel_mappings(cls, config, is_split=True):
"""
Build the tensor parallel split/merge mappings for the vision transformer.
"""
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
)
vision_config = config.vision_config
def split_qkv_weight(x):
head_dim = vision_config.hidden_size // vision_config.num_heads
x = x.reshape([
vision_config.hidden_size, 3, vision_config.num_heads, head_dim
])
x = np.split(x, vision_config.tensor_parallel_degree,
axis=-2)[vision_config.tensor_parallel_rank]
x = x.reshape([vision_config.hidden_size, -1])
return x
def split_qkv_bias(x):
head_dim = vision_config.hidden_size // vision_config.num_heads
x = x.reshape([3, vision_config.num_heads, head_dim])
x = np.split(x, vision_config.tensor_parallel_degree,
axis=-2)[vision_config.tensor_parallel_rank]
x = x.reshape([-1])
return x
def get_tensor_parallel_split_mappings(depth):
final_actions = {}
base_actions = {
"vision_model.blocks.0.attn.proj.weight":
partial(fn, is_column=False),
"vision_model.blocks.0.fc1.weight":
partial(fn, is_column=True),
"vision_model.blocks.0.fc1.bias":
partial(fn, is_column=True),
"vision_model.blocks.0.fc2.weight":
partial(fn, is_column=False),
"vision_model.blocks.0.qkv.weight":
split_qkv_weight,
"vision_model.blocks.0.qkv.bias":
split_qkv_bias,
}
for key, action in base_actions.items():
if "blocks.0." in key:
for i in range(depth):
newkey = key.replace("blocks.0.", f"blocks.{i}.")
final_actions[newkey] = action
return final_actions
mappings = get_tensor_parallel_split_mappings(vision_config.depth)
return mappings
def set_state_dict(self, state_dict, *args, **kwargs):
"""_summary_
Args:
state_dict (_type_): _description_
"""
super().set_state_dict(state_dict, *args, **kwargs)
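# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the rotary table built
# by VisionRotaryEmbedding has shape [seqlen, dim // 2], and rotate_half
# negates and swaps the two halves of the last dimension.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rope = VisionRotaryEmbedding(dim=8)
    freqs = rope(seqlen=4)
    print(freqs.shape)                           # [4, 4]
    x = paddle.arange(8, dtype="float32").reshape([1, 8])
    print(rotate_half(x).numpy())                # [[-4. -5. -6. -7.  0.  1.  2.  3.]]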

View File

@@ -0,0 +1,130 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import distributed as dist
from paddle.distributed import fleet
from paddle.distributed.fleet.utils.sequence_parallel_utils import \
RowSequenceParallelLinear
__all__ = [
"scatter_axis", "all_gather_group", "reduce_scatter_group",
"RowSequenceParallelLinear"
]
def scatter_axis(input, group=None, axis=0):
"""
Evenly slice `input` along `axis` (dimension 0 by default) across the
model-parallel group. Note that this API is unrelated to `distributed.scatter`.
"""
if group is None:
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_model_parallel_group()
parallelism = group.nranks
if parallelism == 1:
return input.clone()
rank = group.rank
seq_len = input.shape[axis]
assert seq_len % parallelism == 0, (
f"Input sequence length {seq_len} can't be divided exactly"
f" by sequence parallelism {parallelism}")
interval = seq_len // parallelism
input = paddle.slice(input,
axes=[axis],
starts=[interval * rank],
ends=[interval * (rank + 1)])
# slice use stride, so we maintain the memory of whole input, use assign to free the whole input
# which can avoid OOM.
input = paddle.assign(input)
return input
def all_gather_group(input, group=None, axis=0):
"""Perform collective all-gather operation across a process group with axis control.
Functional Behavior:
- Aggregates input tensors from all processes in the specified group
- Supports concatenation along arbitrary dimensions (axis parameter)
- Optimizes for axis=0 via direct shape expansion to avoid concatenation overhead
Args:
input (Tensor): Local tensor to be gathered (shape: [..., D, ...])
group (ProcessGroup): Communication group (defaults to model parallel group)
axis (int): Concatenation dimension (default=0)
Returns:
Tensor: Concatenated tensor combining inputs from all processes:
- When axis=0: shape [D*N, ...] (N = group size)
- Otherwise: shape [..., D*N, ...] along specified axis
"""
if group is None:
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_model_parallel_group()
parallelism = group.nranks
if parallelism == 1:
return input.clone()
output_shape = input.shape
if axis == 0:
output_shape[axis] = output_shape[axis] * parallelism
output = paddle.empty(shape=output_shape, dtype=input.dtype)
dist.stream.all_gather(output,
input,
group=group,
use_calc_stream=True)
return output
outputs = [
paddle.empty(output_shape, dtype=input.dtype)
for _ in range(parallelism)
]
dist.stream.all_gather(outputs, input, group=group, use_calc_stream=True)
output = paddle.concat(outputs, axis=axis)
return output
def reduce_scatter_group(input, group=None):
"""Perform reduce-scatter collective operation across a process group.
Functional Behavior:
- Aggregates (sums) input tensors across all processes in the group
- Scatters the reduced result equally to all participants
- Operates along the first dimension (axis=0) of the input tensor
Args:
input (Tensor): Local tensor to reduce (shape: [N*K, ...] where N=group_size)
group (ProcessGroup): Communication group (defaults to model parallel group)
Returns:
Tensor: Scattered portion of reduced tensor with shape [K, ...]
"""
if group is None:
hcg = fleet.get_hybrid_communicate_group()
group = hcg.get_model_parallel_group()
parallelism = group.nranks
if parallelism == 1:
return input.clone()
output_shape = input.shape
assert (
input.shape[0] % parallelism == 0
), f"Input sequence length {input.shape[0]} can't be divided exactly by sequence parallelism {parallelism}"
output_shape[0] = output_shape[0] // parallelism
output = paddle.empty(shape=output_shape, dtype=input.dtype)
dist.stream.reduce_scatter(output,
input,
op=dist.ReduceOp.SUM,
group=group,
use_calc_stream=True)
return output
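# ---------------------------------------------------------------------------
# Single-process illustration (not part of the original file) of what
# `scatter_axis` computes for one rank: an even slice of the input along the
# chosen axis. A real run would derive `parallelism` and `rank` from the
# model-parallel group instead of hard-coding them.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    x = paddle.arange(12, dtype="float32").reshape([4, 3])
    parallelism, rank, axis = 2, 1, 0    # hypothetical group of 2, this is rank 1
    interval = x.shape[axis] // parallelism
    shard = paddle.slice(x, axes=[axis], starts=[interval * rank],
                         ends=[interval * (rank + 1)])
    print(shard.numpy())                 # rows 2 and 3 of x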

View File

@@ -0,0 +1,511 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Optional, Union
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.models.ernie4_5_moe import (Ernie4_5_Attention,
Ernie4_5_MLP)
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import (extract_text_token_output,
text_image_gather_scatter,
text_image_index_out)
from fastdeploy.worker.forward_meta import ForwardMeta
class Ernie4_5_VLMLP(Ernie4_5_MLP):
pass
class Ernie4_5_VLAttention(Ernie4_5_Attention):
pass
@dataclass
class VLMoEMeta:
image_input: Optional[paddle.Tensor] = None
text_input: Optional[paddle.Tensor] = None
text_index: Optional[paddle.Tensor] = None
image_index: Optional[paddle.Tensor] = None
token_type_ids: Optional[paddle.Tensor] = None
class Ernie4_5_VLMoE(nn.Layer):
def __init__(self, fd_config: FDConfig, layer_id: int,
prefix: str) -> None:
super().__init__()
moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
if isinstance(moe_layer_start_index, int):
text_moe_layer_start_index = moe_layer_start_index
image_moe_layer_start_index = moe_layer_start_index
else:
text_moe_layer_start_index = moe_layer_start_index[0]
image_moe_layer_start_index = moe_layer_start_index[1]
moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
if moe_layer_end_index is None:
text_moe_layer_end_index = fd_config.model_config.num_layers
image_moe_layer_end_index = fd_config.model_config.num_layers
elif isinstance(moe_layer_end_index, int):
text_moe_layer_end_index = moe_layer_end_index
image_moe_layer_end_index = moe_layer_end_index
else:
text_moe_layer_end_index = moe_layer_end_index[0]
image_moe_layer_end_index = moe_layer_end_index[1]
assert text_moe_layer_start_index <= text_moe_layer_end_index
if layer_id >= text_moe_layer_start_index and layer_id <= text_moe_layer_end_index:
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.weight",
}
self.mlp_text = FusedMoE(
fd_config=fd_config,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size[0],
num_experts=fd_config.moe_config.num_experts[0],
expert_id_offset=0,
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
moe_tag="Text",
weight_key_map=weight_key_map,
)
self.mlp_text.extract_gate_correction_bias = self.extract_gate_correction_bias_text
else:
self.mlp_text = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}",
)
assert image_moe_layer_start_index <= image_moe_layer_end_index
if layer_id >= image_moe_layer_start_index and layer_id <= image_moe_layer_end_index:
weight_key_map = {
"gate_weight_key":
f"{prefix}.gate.weight_1",
"gate_correction_bias_key":
f"{prefix}.moe_statics.e_score_correction_bias",
"ffn1_expert_weight_key":
f"{prefix}.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.experts.{{}}.down_proj.weight",
}
self.mlp_image = FusedMoE(
fd_config=fd_config,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size[1],
num_experts=fd_config.moe_config.num_experts[1],
expert_id_offset=fd_config.moe_config.num_experts[0],
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
moe_tag="Image",
weight_key_map=weight_key_map,
)
self.mlp_image.extract_gate_correction_bias = self.extract_gate_correction_bias_image
else:
self.mlp_image = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}",
)
self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
if self.num_shared_experts > 0:
self.share_experts = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=self.num_shared_experts *
fd_config.moe_config.moe_intermediate_size[0],
prefix=f"{prefix}.shared_experts",
)
def extract_gate_correction_bias_text(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict[gate_correction_bias_key]).astype("float32")
return gate_correction_bias_tensor[0].unsqueeze(0)
def extract_gate_correction_bias_image(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict[gate_correction_bias_key]).astype("float32")
return gate_correction_bias_tensor[1].unsqueeze(0)
def load_state_dict(self, state_dict):
self.mlp_text.load_state_dict(state_dict)
self.mlp_image.load_state_dict(state_dict)
if self.mlp_text.moe_use_gate_correction_bias:
state_dict.pop(self.mlp_text.gate_correction_bias_key)
if self.num_shared_experts > 0:
self.share_experts.load_state_dict(state_dict)
def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta):
if self.num_shared_experts > 0:
share_experts_out = self.share_experts(hidden_states)
if vl_moe_meta.image_input is not None:
text_image_gather_scatter(
hidden_states,
vl_moe_meta.text_input,
vl_moe_meta.image_input,
vl_moe_meta.token_type_ids,
vl_moe_meta.text_index,
vl_moe_meta.image_index,
True,
)
text_out = self.mlp_text(vl_moe_meta.text_input)
image_out = self.mlp_image(vl_moe_meta.image_input)
text_image_gather_scatter(
hidden_states,
text_out,
image_out,
vl_moe_meta.token_type_ids,
vl_moe_meta.text_index,
vl_moe_meta.image_index,
False,
)
else:
hidden_states = self.mlp_text(hidden_states)
if self.num_shared_experts > 0:
hidden_states += share_experts_out
return hidden_states
class Ernie4_5_VLDecoderLayer(nn.Layer):
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
if isinstance(moe_layer_start_index, list):
min_moe_layer_start_index = min(moe_layer_start_index)
else:
min_moe_layer_start_index = moe_layer_start_index
max_moe_layer_end_index = fd_config.model_config.num_layers
if fd_config.moe_config.moe_layer_end_index is not None:
moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
if isinstance(moe_layer_start_index, list):
max_moe_layer_end_index = max(moe_layer_end_index)
else:
max_moe_layer_end_index = moe_layer_end_index
self.self_attn = Ernie4_5_VLAttention(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
assert min_moe_layer_start_index <= max_moe_layer_end_index
if (fd_config.moe_config.num_experts is not None
and layer_id >= min_moe_layer_start_index
and layer_id <= max_moe_layer_end_index):
self.mlp = Ernie4_5_VLMoE(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.mlp",
)
else:
self.mlp = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{prefix}.post_attention_layernorm",
)
def load_state_dict(self, state_dict):
self.self_attn.load_state_dict(state_dict)
self.mlp.load_state_dict(state_dict)
self.input_layernorm.load_state_dict(state_dict)
self.post_attention_layernorm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
residual: paddle.Tensor = None,
vl_moe_meta: VLMoEMeta = None,
):
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
hidden_states=hidden_states,
forward_meta=forward_meta,
)
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
if isinstance(self.mlp, Ernie4_5_VLMoE):
hidden_states = self.mlp(hidden_states, vl_moe_meta)
else:
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
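# A small sketch of the pre-norm residual convention used by this decoder
# layer, assuming the fused RMSNorm takes (hidden_states, residual), adds them,
# and returns (normalized, new_residual); illustration only, with made-up names:
import numpy as np

def _toy_add_rms_norm(hidden_states, residual, weight, eps=1e-5):
    residual = hidden_states if residual is None else hidden_states + residual
    variance = np.mean(np.square(residual), axis=-1, keepdims=True)
    return residual / np.sqrt(variance + eps) * weight, residual

_h = np.random.randn(4, 8).astype("float32")
_w = np.ones(8, dtype="float32")
_normed, _residual = _toy_add_rms_norm(_h, None, _w)        # input_layernorm step
_normed2, _residual = _toy_add_rms_norm(_h, _residual, _w)  # post_attention_layernorm step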
class Ernie4_5_VLModel(nn.Layer):
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Ernie4_5_VLModel class.
Args:
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
self.im_patch_id = fd_config.moe_config.im_patch_id
self._dtype = fd_config.model_config.dtype
fd_config.model_config.prefix_name = "ernie"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype(),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.hidden_layers = [
Ernie4_5_VLDecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
]
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.hidden_layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
image_features: paddle.Tensor,
forward_meta: ForwardMeta,
):
text_input = None
image_input = None
text_index = None
image_index = None
image_token_num = 0
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
# -----------------------
image_mask = ids_remove_padding == self.im_patch_id
token_type_ids = image_mask.cast("int32")
token_num = hidden_states.shape[0]
image_token_num = paddle.count_nonzero(token_type_ids).cast("int32")
text_token_num = ((token_num - image_token_num) if
(token_num - image_token_num) > 0 else 1)
if image_mask.any():
hidden_states[image_mask] = image_features.cast(self._dtype)
text_input = paddle.full(
shape=[text_token_num, hidden_states.shape[1]],
fill_value=1,
dtype=self._dtype)
image_input = paddle.full(
shape=[image_token_num, hidden_states.shape[1]],
fill_value=1,
dtype=self._dtype)
text_index = paddle.zeros_like(token_type_ids)
image_index = paddle.zeros_like(token_type_ids)
text_image_index_out(token_type_ids, text_index, image_index)
vl_moe_meta = VLMoEMeta(
text_input=text_input,
image_input=image_input,
text_index=text_index,
image_index=image_index,
token_type_ids=token_type_ids,
)
# -----------------------
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.hidden_layers[i](
forward_meta,
hidden_states,
residual,
vl_moe_meta,
)
hidden_states = hidden_states + residual
# -----------------------
hidden_states = hidden_states.cast("float32")
score_text = hidden_states
if image_input is not None:
token_type_ids = token_type_ids.reshape([-1])
text_pos_shifted = token_type_ids[:token_num] == 0
score_text = hidden_states[text_pos_shifted.reshape([-1])]
max_seq_len, max_seq_len_index = paddle.topk(
forward_meta.seq_lens_this_time.squeeze(-1), k=1)
hidden_states = extract_text_token_output(
max_seq_len,
max_seq_len_index.cast("int32"),
image_token_num,
forward_meta.seq_lens_this_time,
forward_meta.cu_seqlens_q,
score_text,
)[0].cast(self._dtype)
# -----------------------
out = self.norm(hidden_states)
return out
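# A minimal NumPy sketch of the image-token substitution done at the top of
# forward(): placeholder embeddings at positions equal to `im_patch_id` are
# overwritten with the projected visual features (ids and shapes below are
# made up for illustration):
import numpy as np

def _toy_fill_image_positions(ids, embeddings, image_features, im_patch_id):
    image_mask = ids == im_patch_id               # True where the id is the image placeholder
    token_type_ids = image_mask.astype(np.int32)  # 0 = text token, 1 = image token
    out = embeddings.copy()
    out[image_mask] = image_features              # one visual feature per image token
    return out, token_type_ids

_ids = np.array([5, 7, 7, 9])                     # with im_patch_id == 7, rows 1 and 2 are image tokens
_emb = np.zeros((4, 3), dtype="float32")
_img = np.ones((2, 3), dtype="float32")
_filled, _tt = _toy_fill_image_positions(_ids, _emb, _img, im_patch_id=7)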
class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
"""
Ernie4_5_VLMoeForConditionalGeneration
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Ernie4_5_VLMoeForConditionalGeneration, self).__init__(fd_config)
self.model = Ernie4_5_VLModel(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
return "Ernie4_5_VLMoeForConditionalGeneration"
@paddle.no_grad()
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray,
paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
if self.tie_word_embeddings:
self.lm_head.out_linear.weight.set_value(
self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
else:
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def forward(
self,
ids_remove_padding: paddle.Tensor,
image_features: paddle.Tensor,
forward_meta: ForwardMeta,
):
hidden_states = self.model(ids_remove_padding, image_features,
forward_meta)
return hidden_states
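# A minimal sketch of the vocabulary masking in compute_logits above: ids in
# the padded tail of the vocabulary (>= ori_vocab_size) are forced to -inf so
# they can never be sampled (shapes below are made up for illustration):
import numpy as np

_logits = np.zeros((2, 10), dtype="float32")   # [batch, padded_vocab_size]
_ori_vocab_size = 8
_logits[:, _ori_vocab_size:] = -np.inf         # padding ids get zero probability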

View File

@@ -0,0 +1,399 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from copy import deepcopy
from functools import partial
import numpy as np
import paddle
from paddle import nn
from paddle.autograd import PyLayer
from paddle.distributed.fleet.utils import recompute
from fastdeploy.model_executor.layers.utils import _set_var_distributed
from fastdeploy.model_executor.models.ernie4_5_vl.dist_utils import (
RowSequenceParallelLinear, all_gather_group, reduce_scatter_group,
scatter_axis)
class ScatterOp(PyLayer):
"""
Each rank slices its own part (an even split) out of the **same** sequence.
The backward pass aggregates the gradients from all ranks, restoring the
synchronized model-parallel state. The inverse operation is `GatherOp`.
input: Tensor [S, *]
Note: this is unrelated to `distributed.scatter`.
"""
@staticmethod
def forward(ctx, input, axis=0, group=None):
"""fwd"""
ctx.axis = axis
ctx.group = group
return scatter_axis(input, axis=axis, group=ctx.group)
@staticmethod
def backward(ctx, grad):
return all_gather_group(grad, axis=ctx.axis, group=ctx.group)
class AllGatherOp(PyLayer):
"""
input shape: [s/n, b, h], n is mp parallelism
after forward shape: [s, b, h]
Behaves like `AllGather`: the backward pass aggregates gradients (via reduce-scatter), and the tensor is still in the model-parallel asynchronous state after the AllGather.
"""
@staticmethod
def forward(ctx, input, group=None):
"""fwd"""
ctx.group = group
return all_gather_group(input, group=group)
# grad shape: [s, b, h], n is mp parallelism
# after backward (reduce_scatter) shape: [s/n, b, h]
@staticmethod
def backward(ctx, grad):
return reduce_scatter_group(grad, group=ctx.group)
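# A single-process NumPy sketch of the ScatterOp / AllGatherOp round trip:
# scatter slices one shared sequence evenly across ranks, all-gather
# re-assembles it, and the backward passes mirror each other (no real
# communication group here; names are made up for illustration):
import numpy as np

def _toy_scatter(x, rank, world_size, axis=0):
    return np.split(x, world_size, axis=axis)[rank]

def _toy_all_gather(shards, axis=0):
    return np.concatenate(shards, axis=axis)

_x = np.arange(8).reshape(8, 1)                        # [S, H] with S divisible by world_size
_shards = [_toy_scatter(_x, r, 4) for r in range(4)]   # each rank keeps S/4 rows
assert np.array_equal(_toy_all_gather(_shards), _x)    # gather restores the full sequence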
def mark_as_sequence_parallel_parameter(parameter):
parameter.sequence_parallel = True
class RMSNorm(nn.Layer):
"""
Root Mean Square Layer Normalization (RMSNorm) implementation.
RMSNorm is a simplified version of LayerNorm that focuses on the root mean square of inputs,
omitting the mean-centering operation. This provides computational efficiency while maintaining
good performance.
"""
def __init__(self, config):
"""
Initialize RMSNorm layer.
Args:
config (ErnieConfig): Model configuration.
"""
super().__init__()
self.hidden_size = config.hidden_size
self.weight = paddle.create_parameter(
shape=[self.hidden_size],
dtype=paddle.get_default_dtype(),
default_initializer=nn.initializer.Constant(1.0),
)
self.variance_epsilon = config.rms_norm_eps
self.config = config
if config.sequence_parallel:
mark_as_sequence_parallel_parameter(self.weight)
def forward(self, hidden_states):
"""
Apply RMS normalization to input hidden states.
Args:
hidden_states (Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
Returns:
Tensor: Normalized output tensor of same shape as input
Note:
- Uses fused kernel if config.fuse_rms_norm is True for better performance
- Otherwise computes RMSNorm manually:
1. Compute variance of features
2. Apply reciprocal square root normalization
3. Scale by learned weight parameter
- Maintains original dtype for numerical stability during computation
"""
with paddle.amp.auto_cast(False):
variance = hidden_states.astype("float32").pow(2).mean(
-1, keepdim=True)
hidden_states = paddle.rsqrt(variance +
self.variance_epsilon) * hidden_states
return hidden_states.astype(self.weight.dtype) * self.weight
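# A NumPy reference for the manual branch above, which computes
# y = x / sqrt(mean(x**2) + eps) * weight in float32 before casting back
# (eps and shapes below are illustrative):
import numpy as np

def _toy_rms_norm(x, weight, eps=1e-6):
    variance = np.mean(np.square(x.astype("float32")), axis=-1, keepdims=True)
    return (x * (1.0 / np.sqrt(variance + eps))).astype(weight.dtype) * weight

_x = np.random.randn(4, 8).astype("float32")
_w = np.ones(8, dtype="float32")
_expected = _x / np.sqrt((_x * _x).mean(-1, keepdims=True) + 1e-6)
np.testing.assert_allclose(_toy_rms_norm(_x, _w), _expected, rtol=1e-5)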
class VariableResolutionResamplerModel(nn.Layer):
"""
VariableResolutionResamplerModel: supports variable resolution and folds the spatial and temporal dimensions of the visual features.
"""
def __init__(self, in_dim, out_dim, spatial_conv_size, temporal_conv_size,
config):
super().__init__()
self.in_dim = in_dim
self.out_dim = out_dim
self.config = config
self.spatial_conv_size = spatial_conv_size
self.temporal_conv_size = temporal_conv_size
self.use_recompute_resampler = config.use_recompute_resampler
self.use_temporal_conv = config.use_temporal_conv
self.tensor_parallel_degree = config.tensor_parallel_degree
# for spatial 4-in-1 merging (spatial_conv_size x spatial_conv_size patches folded into one token)
self.spatial_dim = self.in_dim * self.spatial_conv_size * self.spatial_conv_size
# for temporal 2-in-1 merging (temporal_conv_size frames folded into one token)
self.temporal_dim = self.in_dim * self.spatial_conv_size * self.spatial_conv_size * self.temporal_conv_size
with paddle.utils.unique_name.guard("mm_resampler_"):
self.spatial_linear = nn.Sequential(
(RowSequenceParallelLinear(
self.spatial_dim,
self.spatial_dim,
input_is_parallel=True,
has_bias=True,
fuse_matmul_bias=True,
) if config.tensor_parallel_degree > 1 else nn.Linear(
self.spatial_dim, self.spatial_dim)),
nn.GELU(),
nn.Linear(self.spatial_dim, self.spatial_dim),
nn.LayerNorm(self.spatial_dim, epsilon=1e-6),
)
if self.use_temporal_conv:
self.temporal_linear = nn.Sequential(
nn.Linear(self.temporal_dim, self.spatial_dim),
nn.GELU(),
nn.Linear(self.spatial_dim, self.spatial_dim),
nn.LayerNorm(self.spatial_dim, epsilon=1e-6),
)
self.mlp = nn.Linear(self.spatial_dim, self.out_dim)
out_config = deepcopy(config)
out_config.hidden_size = out_dim
# Note(GuoxiaWang): fuse can reduce gpu peak memory
out_config.fuse_rms_norm = out_config.resampler_fuse_rms_norm
self.after_norm = RMSNorm(out_config)
if config.tensor_parallel_degree > 1:
for idx in [2, 3]:
mark_as_sequence_parallel_parameter(
self.spatial_linear[idx].weight)
mark_as_sequence_parallel_parameter(
self.spatial_linear[idx].bias)
_set_var_distributed(self.spatial_linear[idx].weight,
split_axis=0)
_set_var_distributed(self.spatial_linear[idx].bias,
split_axis=0)
if self.use_temporal_conv:
for idx in [0, 2, 3]:
mark_as_sequence_parallel_parameter(
self.temporal_linear[idx].weight)
mark_as_sequence_parallel_parameter(
self.temporal_linear[idx].bias)
mark_as_sequence_parallel_parameter(self.mlp.weight)
mark_as_sequence_parallel_parameter(self.mlp.bias)
mark_as_sequence_parallel_parameter(self.after_norm.weight)
def get_name_mappings_to_training(self, ):
""" get_name_mappings_to_training """
infer_to_train = {}
resampler_names = [
"ernie.resampler_model.spatial_linear.0.weight",
"ernie.resampler_model.spatial_linear.0.bias",
"ernie.resampler_model.spatial_linear.2.weight",
"ernie.resampler_model.spatial_linear.2.bias",
"ernie.resampler_model.spatial_linear.3.weight",
"ernie.resampler_model.spatial_linear.3.bias",
"ernie.resampler_model.temporal_linear.0.weight",
"ernie.resampler_model.temporal_linear.0.bias",
"ernie.resampler_model.temporal_linear.2.weight",
"ernie.resampler_model.temporal_linear.2.bias",
"ernie.resampler_model.temporal_linear.3.weight",
"ernie.resampler_model.temporal_linear.3.bias",
"ernie.resampler_model.mlp.weight",
"ernie.resampler_model.mlp.bias",
"ernie.resampler_model.after_norm.weight",
]
for train_name in resampler_names:
infer_to_train[train_name[len("ernie."):]] = train_name
return infer_to_train
def spatial_conv_reshape(self, x, spatial_conv_size):
"""
Reshape before the Linear layer so that the Linear can mimic the receptive field of a conv.
"""
S, C = x.shape
x = x.reshape([-1, C * (spatial_conv_size**2)])
return x
def forward(self, x, image_mask, token_type_ids, image_type_ids, grid_thw):
"""
x: image_features
image_mask: [B]
token_types_ids: [B]
image_type_ids: [B_image]
grid_thw: [B_image, 3]
"""
assert image_type_ids is not None
def fwd_spatial(x):
"""
x in the shape of [S, H]
S is ordered in the following way: [ [patch_h*patch_w (row-major traversal)] * patch_time]
H is simply hidden
"""
x = self.spatial_conv_reshape(x, self.spatial_conv_size)
num_pad = 0
if self.tensor_parallel_degree > 1:
num_pad = (
x.shape[0] + self.tensor_parallel_degree - 1
) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[
0]
if num_pad > 0:
x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0])
x = self.spatial_linear(x)
if self.tensor_parallel_degree > 1:
x = AllGatherOp.apply(x)
if num_pad > 0:
x = x[:-num_pad]
return x
def fwd_placeholder(x, grid_thw, to_tensor=False):
"""
x: [S, H]
grid_thw: [S, 3]
where the second dimension is [t, h, w]
"""
grid_thw_cpu = grid_thw.numpy()
grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:]
grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size**
2)
tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // (
self.spatial_conv_size**2)
batch_offset = np.empty(tokens_per_img_or_vid.size,
dtype=tokens_per_img_or_vid.dtype)
batch_offset[0] = 0
batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1]
assert self.temporal_conv_size == 2, f"Hard Code: temporal_conv_size==2, got:{self.temporal_conv_size}"
# TODO: support any temporal conv size
slice_offsets = []
for temporal_size, spatial_size, b_offset in zip(
grid_t, grid_hw_after_conv, batch_offset):
for temp_offset in range(0, temporal_size, 2):
slice_offsets.append(
np.arange(b_offset + (temp_offset) * spatial_size,
b_offset + (temp_offset + 1) * spatial_size))
slice_offsets = paddle.to_tensor(
np.concatenate(slice_offsets, axis=-1))
slice_offsets2 = []
for temporal_size, spatial_size, b_offset in zip(
grid_t, grid_hw_after_conv, batch_offset):
for temp_offset in range(1 if temporal_size > 1 else 0,
temporal_size, 2):
slice_offsets2.append(
np.arange(b_offset + (temp_offset) * spatial_size,
b_offset + (temp_offset + 1) * spatial_size))
slice_offsets2 = paddle.to_tensor(
np.concatenate(slice_offsets2, axis=-1))
x_timestep_1 = paddle.gather(x, slice_offsets, axis=0)
x_timestep_2 = paddle.gather(x, slice_offsets2, axis=0)
x = paddle.concat([x_timestep_1, x_timestep_2], axis=-1)
return x
def fwd_temporal(x):
num_pad = 0
if self.tensor_parallel_degree > 1:
num_pad = (
x.shape[0] + self.tensor_parallel_degree - 1
) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[
0]
if num_pad > 0:
x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0])
if self.tensor_parallel_degree > 1:
x = ScatterOp.apply(x, axis=0)
x = self.temporal_linear(x)
if self.use_recompute_resampler:
num_pad = paddle.to_tensor(num_pad)
return x, num_pad
def fwd_mlp(x):
x = self.mlp(x)
x = self.after_norm(x)
if self.tensor_parallel_degree > 1:
x = AllGatherOp.apply(x)
return x
num_pad = 0
if self.use_recompute_resampler:
x = recompute(fwd_spatial, x)
if self.use_temporal_conv:
x = recompute(fwd_placeholder, x, grid_thw)
x, num_pad = recompute(fwd_temporal, x)
x = recompute(fwd_mlp, x)
else:
x = fwd_spatial(x)
if self.use_temporal_conv:
x = fwd_placeholder(x, grid_thw)
x, num_pad = fwd_temporal(x)
x = fwd_mlp(x)
if num_pad is not None and num_pad > 0:
x = x[:-num_pad]
return x
@classmethod
def _get_tensor_parallel_mappings(cls, config, is_split=True):
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
res = {"spatial_linear.0.weight": partial(fn, is_column=False)}
for k in (
"spatial_linear.0.bias", # row linear bias
"spatial_linear.2.weight",
"spatial_linear.2.bias", # linear
"spatial_linear.3.weight",
"spatial_linear.3.bias", # layernorm
"temporal_linear.0.weight",
"temporal_linear.0.weight", # linear
"temporal_linear.2.weight",
"temporal_linear.2.bias", # linear
"temporal_linear.3.weight",
"temporal_linear.3.bias", # bias
):
res.update({k: lambda x: x})
return res
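# A NumPy sketch of the two data-movement tricks used by the resampler above
# (sizes are made up for illustration): spatial_conv_reshape merges
# spatial_conv_size**2 consecutive patch tokens into one row so a plain Linear
# can imitate a conv receptive field, and fwd_placeholder pairs timestep t
# with timestep t+1 by gathering two interleaved row blocks and concatenating
# them along the feature axis.
import numpy as np

S, C, k = 16, 4, 2
_x = np.arange(S * C).reshape(S, C)
_spatial = _x.reshape(-1, C * k * k)                  # [S/(k*k), C*k*k] == [4, 16]

_t, _hw = 2, 2                                        # one clip: 2 timesteps, 2 merged tokens each
_rows_t0 = _spatial[0:_hw]                            # rows of timestep 0
_rows_t1 = _spatial[_hw:2 * _hw]                      # rows of timestep 1
_temporal = np.concatenate([_rows_t0, _rows_t1], axis=-1)   # [_hw, 2*C*k*k] == [2, 32]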

View File

@@ -1,652 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import contextlib
import json
import os
import sys
import threading
import paddle
import paddle.distributed as dist
from paddle.common_ops_import import convert_dtype
from fastdeploy.model_executor.models.utils import convert_ndarray_dtype
from paddlenlp.trainer import RuntimeTimer
from fastdeploy.inference_args import GenerationPhase
from .utils import (
_vocab_size_with_padding,
generate_rank_mapping,
get_infer_model_path,
model_convert_fp8,
)
from paddlenlp.transformers import AutoTokenizer
from paddle.distributed import fleet
from paddlenlp.utils.env import USE_FAST_TOKENIZER
from paddlenlp.utils.log import logger
from fastdeploy.model_executor.models.utils import load_checkpoint
from fastdeploy.config import (AdditionalConfig, DecodingConfig, DeviceConfig,
LLMConfig, LoadConfig, ModelConfig, MoEConfig,
ParallelConfig, SpeculativeConfig, TmpConfig)
from fastdeploy.inference_args import GenerationPhase
from ..layers.quantization import get_quantization_config
from .model_base import ModelRegistry
from .qwen2 import Qwen2PretrainedModel
from .utils import (_vocab_size_with_padding, convert_ndarray_dtype,
load_checkpoint, parser_quant_type)
from paddlenlp.transformers.configuration_utils import PretrainedConfig
from paddlenlp.trl import llm_utils
model_classes_mapping = {
"Qwen2ForCausalLM": Qwen2PretrainedModel,
}
current_dir = os.path.dirname(os.path.abspath(__file__))
grandparent_dir = os.path.abspath(
os.path.join(current_dir, os.pardir, os.pardir))
sys.path.append(grandparent_dir)
def offload_model(model):
"""
Offload the model to CUDAPinnedPlace.
"""
device = paddle.CUDAPinnedPlace()
for name, src in model.named_parameters():
if src._is_initialized() and not isinstance(src.place,
paddle.CUDAPinnedPlace):
dst = src._copy_to(device, True)
dst_tensor = dst.value().get_tensor()
src_tensor = src.value().get_tensor()
src_tensor._clear()
src_tensor._share_data_with(dst_tensor)
def reload_model(model):
"""
Reload the model from CUDAPinnedPlace to GPU.
"""
model.to(paddle.device.get_device())
def reconstruct_memory(model):
"""
reconstruct_memory to avoid memory chunks
"""
offload_model(model)
paddle.device.cuda.empty_cache()
reload_model(model)
def load_tensor_from_ipc_meta(state_dict):
"""
convert ipc_meta to tensor, but keep keys unchanged
{ 'key': ipc_meta } --> { 'key': tensor }
example:
state_dict = load_tensor_from_ipc_meta(state_dict)
"""
for k, v in state_dict.items():
# for pickling, we have to convert bytes object before save
v[0] = v[0].encode("latin-1")
state_dict[k] = paddle.to_tensor(
paddle.base.core.LoDTensor._new_shared_cuda(tuple(v)))
return state_dict
def build_stream_line_model(
config_path,
model_path,
dtype,
block_size,
max_len,
stage_flag,
min_dec_len=1,
max_dec_len=128,
temperature=1,
top_k=8,
top_p=0.8,
pre_caches_length=0,
export_model_type="default",
use_stop_seqs=False,
use_fake_parameter=False,
show_topk: int = 0,
msg_queue_id=None,
pad_vocab=True,
tokenizer=None,
cache_quant_dtype="default",
use_beam_search: bool = False,
enf_gen: bool = False,
speculate_method=None,
speculate_max_draft_token_num: int = 1,
speculate_max_candidate_len: int = 5,
speculate_verify_window: int = 2,
return_all_hidden_states: bool = False,
draft_type: str = "None",
start_layer_index: int = 0,
moe_quant_type: str = "default",
use_ep: bool = False,
ep_just_for_test: bool = False,
generation_phase: GenerationPhase = GenerationPhase.PREFILL,
use_micro_batch: bool = False,
fake_server_p: bool = False,
scale_dir: str = "None",
output_via_mq: bool = True,
use_safetensors: bool = False,
enable_redundant_experts: bool = False,
redundant_experts_num: int = 0,
max_batch_size: int = 128,
use_offline_quant: bool = False,
return_state_dicts: bool = False,
sharing_model=None,
sharing_state_dicts=None,
):
"""
Build a fused inference model
Args:
config_path (str): Path to the configuration file
model_path (str): Path to the model file
dtype (str): Data type of the model
block_size (int): Block size
max_len (int): Maximum sequence length
stage_flag (str): Qianfan requirement, stage flag, used to identify different stages in \
time-consuming statistics logs, such as prediction ("msgid-1 predict") or export ("convert").
min_dec_len (int, optional): Minimum decoding length. Default is 1.
max_dec_len (int, optional): Maximum decoding length. Default is 128.
temperature (float, optional): Temperature coefficient. Default is 1.
top_k (int, optional): k value in top-k sampling. Default is 0.
top_p (float, optional): p value in top-p sampling. Default is 0.8.
pre_caches_length (int, optional): Pre-cache length. Default is 0.
export_model_type (str, optional): Type of model to export. Default is "default".
use_stop_seqs (bool, optional): Whether to use stop sequences. Default is False.
use_fake_parameter (bool, optional): Whether to use fake parameters. Default is False.
show_topk (int, optional): Whether to show top-k results. Default is 0.
msg_queue_id (int, optional): Message queue ID. Default is None.
pad_vocab (bool, optional): Whether to pad the vocabulary. Default is True.
cache_quant_dtype (str, optional): Cache quantization data type. Default is "default".
use_beam_search (bool, optional): Whether to use beam search . Defaults is False.
enf_gen (bool, optional): Whether to use enforce generation. Defaults is False.
Returns:
tuple[dict, Tokenizer, CausalLM]:
A tuple containing the configuration, tokenizer, and model.
"""
runtime_timer = RuntimeTimer("build_model")
runtime_timer.start(f"{stage_flag} stage model loading time")
# config_path = os.path.join(model_path,"config.json")
with open(config_path, "r") as fin:
config = json.load(fin)
architectures = config.get("architectures")
if tokenizer is None:
tokenizer = AutoTokenizer.from_pretrained(
model_path,
padding_side="left",
use_fast=USE_FAST_TOKENIZER,
)
config, _ = PretrainedConfig.get_config_dict(model_path)
model_config = ModelConfig.from_dict(config)
parallel_config = ParallelConfig()
speculative_config = SpeculativeConfig()
device_config = DeviceConfig()
additional_config = AdditionalConfig()
load_config = LoadConfig()
tmp_config = TmpConfig()
moe_config = MoEConfig()
decoding_config = DecodingConfig()
tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env()
parallel_config.tensor_parallel_rank = tensor_parallel_rank
parallel_config.tensor_parallel_degree = tensor_parallel_degree
parallel_config.mp_size = tensor_parallel_degree
parallel_config.ep_size = 1
parallel_config.column_cut = False
speculative_config.is_mtp = draft_type in ["eagle", "mtp"]
speculative_config.draft_type = draft_type
# Note(tangbinhan): used for load_checkpoint
model_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank
model_config.tensor_parallel_degree = parallel_config.tensor_parallel_degree
model_config.use_ep = use_ep
model_config.is_mtp = speculative_config.is_mtp
additional_config.use_fake_parameter = use_fake_parameter
additional_config.ep_just_for_test = ep_just_for_test
tmp_config.use_offline_quant = use_offline_quant
if use_ep:
if isinstance(model_config.moe_num_experts, list):
model_config.has_multimodality = True
moe_config.num_experts = model_config.moe_num_experts[0]
else:
moe_config.num_experts = model_config.moe_num_experts
moe_config.num_experts_per_rank = (
moe_config.num_experts // parallel_config.tensor_parallel_degree
)
moe_config.num_experts_start_offset = (
moe_config.num_experts_per_rank * parallel_config.tensor_parallel_rank
)
# use the length of tokenizer as the origin vocab size
ori_vocab_size = len(tokenizer)
moe_intermediate_size = (config.get("moe_intermediate_size", None),)
if isinstance(moe_intermediate_size, list) or isinstance(
moe_intermediate_size, tuple
):
moe_intermediate_size = moe_intermediate_size[0]
if not use_ep and pad_vocab:
config["vocab_size"] = _vocab_size_with_padding(
config.get("vocab_size", tokenizer.vocab_size),
config.pop("vocab_size_divisible_unit", 128),
paddle.distributed.get_world_size(),
)
group_size = config.get("group_size", -1)
num_key_value_heads = config.get("num_key_value_heads", -1)
if num_key_value_heads is None:
num_key_value_heads = -1
if config.get("ffn_hidden_size", None) is not None:
ffn_hidden_size = config["ffn_hidden_size"]
elif config.get("intermediate_size", None) is not None:
ffn_hidden_size = config["intermediate_size"]
else:
ffn_hidden_size = 4 * config["hidden_size"]
if config["hidden_act"].lower() == "swiglu":
if paddle.distributed.get_world_size() > 1:
multiple_of = 8 * config["num_attention_heads"]
else:
multiple_of = 4 * config["num_attention_heads"]
ffn_hidden_size = multiple_of * (
(int(2 * ffn_hidden_size / 3) + multiple_of - 1) //
multiple_of)
if draft_type in ["mtp", "eagle"]:
num_layers = 1
else:
num_layers = config.get("num_layers", None) or config.get(
"num_hidden_layers", None
)
if num_layers is None:
raise ValueError(f"num_layers<{num_layers}> is invalid")
use_moe = config.get(
"moe_layer_start_index", num_layers
) < num_layers or draft_type in ["mtp", "eagle"]
if not sharing_state_dicts:
if use_fake_parameter:
context = contextlib.nullcontext()
elif use_safetensors:
context = paddle.LazyGuard()
model_class = model_classes_mapping[architectures[0]]
state_dict = load_checkpoint(model_path,
model_class,
model_config,
return_numpy=True)
elif use_moe:
tensor_parallel_degree = dist.get_world_size()
if tensor_parallel_degree > 1:
hcg = fleet.get_hybrid_communicate_group()
mp_id = hcg.get_model_parallel_rank()
# count the number of pipeline-parallel (pp*) sub-directories under model_path
subdir_count = 0
for entry in os.listdir(model_path):
if "pp" in entry:
full_path = os.path.join(model_path, entry)
if os.path.isdir(full_path):
subdir_count += 1
pp_num = subdir_count
rank_model_paths = [
os.path.join(model_path, f"pp{i}/model_state.tp0{mp_id}.pdparams")
for i in range(pp_num)
]
context = paddle.LazyGuard()
if not use_ep:
logger.info(f"start to loading weight: {rank_model_paths}")
state_dicts = [None for _ in rank_model_paths]
def load_ckpt(i):
state_dicts[i] = paddle.load(rank_model_paths[i], return_numpy=True)
threads = []
for i in range(len(rank_model_paths)):
thread = threading.Thread(target=load_ckpt, args=(i,))
threads.append(thread)
thread.start()
for t in threads:
t.join()
logger.info("Loading finished")
else:
# for EP loading state_dicts
import glob
state_dicts = []
files = glob.glob(model_path + "/merged_tp1_state_split/*")
for file_name in files:
try:
state_dicts += [
{file_name.split("/")[-1]: file_name}
] # save {layer_name: weight_file_name}
except Exception:
pass
need_reset_moe_intermediate_size = False
if not use_ep:
logger.info(f"moe_intermediate_size is: {moe_intermediate_size}")
need_reset_moe_intermediate_size = (
(not use_ep)
and (moe_quant_type == "fp8")
and (moe_intermediate_size // 8 % 128 != 0)
)
ori_up_size = moe_intermediate_size // 8 * 2
ori_down_size = ori_up_size // 2
if need_reset_moe_intermediate_size:
moe_intermediate_size = (
128 - moe_intermediate_size // 8 % 128
) * 8 + moe_intermediate_size
logger.info(
f"moe_intermediate_size reset to {moe_intermediate_size}!"
)
up_size = moe_intermediate_size // 8 * 2
down_size = up_size // 2
new_state_dict = {}
def padding(key, value):
import numpy as np
# logger.info(f"deal {key}")
if ("experts" in key) and ("up_gate_proj" in key):
# logger.info("up_gate_proj")
v_new = np.zeros(shape=[value.shape[0], up_size], dtype=value.dtype)
v_new[:, :ori_down_size] = value[:, :ori_down_size]
v_new[:, down_size : (down_size + ori_down_size)] = value[
:, ori_down_size:
]
elif ("experts" in key) and ("down_proj" in key):
# logger.info("down_proj")
v_new = np.zeros(
shape=[down_size, value.shape[1]], dtype=value.dtype
)
v_new[:ori_down_size, :] = value
else:
v_new = value
new_state_dict[key] = v_new
if ("experts" in key) and ("up_gate_proj" in key or "down_proj" in key):
pass
# logger.info(f"padding {key}: {value.shape}->{v_new.shape}")
threads = []
for state_dict in state_dicts:
for key, value in state_dict.items():
if need_reset_moe_intermediate_size:
thread = threading.Thread(target=padding, args=(key, value))
threads.append(thread)
thread.start()
else:
new_state_dict[key] = value
for t in threads:
t.join()
logger.info("Finish padding")
state_dict = new_state_dict
elif config.get("quant_type", None) is not None:
# TODO(@wangbojun) currently, we use paddle.load for ptq model.
tensor_parallel_degree = dist.get_world_size()
if tensor_parallel_degree > 1:
hcg = fleet.get_hybrid_communicate_group()
mp_id = hcg.get_model_parallel_rank()
rank_model_path = os.path.join(
model_path, f"model_state.tp0{mp_id}.pdparams"
)
if not os.path.exists(rank_model_path):
full_model_path = os.path.join(model_path, "model_state.pdparams")
if not os.path.exists(full_model_path):
raise ValueError(
f"can not find <model_state.tp0{mp_id}.pdparams> "
+ f"and model_state.pdparams under dir<{model_path}>"
)
raise ValueError(
"please run `split_weights.py` to gen weights for multi-gpu inference."
)
if not os.path.exists(rank_model_path):
full_model_path = os.path.join(model_path, "model_state.pdparams")
if not os.path.exists(full_model_path):
raise ValueError(
f"can not find <model_state.tp0{mp_id}.pdparams> "
+ f"and model_state.pdparams under dir<{model_path}>"
)
raise ValueError(
"please run `split_weights.py` to gen weights for multi-gpu inference."
)
model_state_path = rank_model_path
if num_key_value_heads > 0:
assert (
num_key_value_heads % tensor_parallel_degree == 0
), "num_key_value_heads must be an integer multiple of tensor_parallel_degree"
else:
model_state_path = os.path.join(model_path, "model_state.pdparams")
context = paddle.LazyGuard()
logger.info(f"start to loading weight: {model_state_path}")
if os.path.exists(model_state_path):
state_dict = paddle.load(model_state_path, return_numpy=True)
else:
state_dict = sharing_state_dicts
context = paddle.LazyGuard()
use_rmsnorm = config.get("use_rmsnorm", True)
if use_beam_search:
decode_strategy = "beam_search"
elif speculate_method is not None:
if draft_type in ["draft_model", "eagle", "mtp"]:
decode_strategy = "draft_model_sampling"
else:
decode_strategy = "speculate_decoding"
else:
decode_strategy = "sampling"
logger.info(f"{runtime_timer.log()}")
runtime_timer.start(f"{stage_flag} stage set parameters time")
if config["hidden_act"].lower() == "swiglu":
model_config.hidden_act = "swiglu"
model_config.ffn_hidden_size = ffn_hidden_size
model_config.max_seq_len = max_len
model_config.num_layers = num_layers
model_config.dtype = dtype
model_config.export_model_type = export_model_type
parallel_config.block_size = block_size
model_config.group_size = group_size
load_config.model_path = model_path
model_config.use_rmsnorm = use_rmsnorm
parallel_config.msg_queue_id = msg_queue_id
additional_config.use_fake_parameter = use_fake_parameter
model_config.num_key_value_heads = num_key_value_heads
model_config.use_stop_seqs = use_stop_seqs
tmp_config.cache_quant_dtype = cache_quant_dtype
tmp_config.has_zero_point = config.get("has_zero_point", False)
tmp_config.is_channel_wise = config.get("is_channel_wise", False)
speculative_config.speculate_method = speculate_method
speculative_config.speculate_max_draft_token_num = speculate_max_draft_token_num
model_config.return_all_hidden_states = return_all_hidden_states
speculative_config.draft_type = draft_type
model_config.start_layer_index = start_layer_index
model_config.use_moe = use_moe
if use_moe:
moe_config.use_moe = use_moe
moe_config.num_experts = config.get("moe_num_experts", None)
moe_config.moe_intermediate_size = config.get("moe_intermediate_size",
None)
moe_config.moe_use_gate_correction_bias = config.get(
"moe_use_gate_correction_bias", True)
moe_config.moe_every2 = config.get("moe_every2", False)
moe_config.moe_topk = config.get("moe_topk", 8)
moe_config.moe_num_shared_experts = config.get("moe_num_shared_experts", 0)
moe_config.moe_layer_start_index = config.get("moe_layer_start_index", 0)
moe_config.moe_use_ffn_shared_weight_and_bias = config.get(
"moe_use_ffn_shared_weight_and_bias", False)
moe_config.use_moe = use_moe
moe_config.moe_group = config.get("moe_group", False)
moe_config.moe_quant_type = moe_quant_type
if top_k > 0:
moe_config.top_k = top_k
parallel_config.use_ep = use_ep
additional_config.ep_just_for_test = ep_just_for_test
model_config.generation_phase = generation_phase
parallel_config.use_micro_batch = use_micro_batch
tmp_config.weight_block_size = config.get("weight_block_size", [-1, -1])
load_config.scale_dir = scale_dir
model_config.output_via_mq = output_via_mq
decoding_config.bos_token_id = tokenizer.bos_token_id
decoding_config.pad_token_id = tokenizer.pad_token_id
decoding_config.temperature = temperature
decoding_config.forced_eos_token_id = tokenizer.eos_token_id
model_config.ori_vocab_size = ori_vocab_size
decoding_config.max_dec_len = max_dec_len
decoding_config.min_dec_len = min_dec_len
additional_config.fake_server_p = fake_server_p
decoding_config.decode_strategy = decode_strategy
speculative_config.speculate_max_candidate_len = speculate_max_candidate_len
speculative_config.speculate_verify_window = speculate_verify_window
weight_dtype, act_dtype, cachekv_dtype = parser_quant_type(
export_model_type)
logger.info(
f"quant_type: weight[{weight_dtype}], act[{act_dtype}], cachekv[{cachekv_dtype}]"
)
model_config.weight_dtype = weight_dtype
model_config.act_dtype = act_dtype
if weight_dtype == "int8" and act_dtype in ["bfloat16", "float16"]:
quant_cls = get_quantization_config("weight_only")
quant_config = quant_cls.from_config({
"weight_only_linear_arch": None,
"algo": "weight_only_int8"
})
quant_config.quant_max_bound = 0
quant_config.quant_min_bound = 0
quant_config.quant_round_type = 0
model_config.use_smooth_quant = False
elif weight_dtype == "int4" and act_dtype in ["bfloat16", "float16"]:
quant_cls = get_quantization_config("weight_only")
quant_config = quant_cls.from_config({
"weight_only_linear_arch": None,
"algo": "weight_only_int4"
})
quant_config.quant_max_bound = 0
quant_config.quant_min_bound = 0
quant_config.quant_round_type = 0
model_config.use_smooth_quant = False
elif tmp_config.weight_block_size[0] != -1:
quant_cls = get_quantization_config("block_wise")
quant_config = quant_cls.from_config(
{"weight_block_size": tmp_config.weight_block_size})
quant_config.quant_max_bound = 448
quant_config.quant_min_bound = -448
quant_config.quant_round_type = 1
model_config.use_smooth_quant = False
elif weight_dtype == "int4" and act_dtype == "float8_e4m3fn":
quant_cls = get_quantization_config("w4afp8")
quant_config = quant_cls.from_config({
"weight_scale_dict": {},
"act_scale_dict": {}
})
quant_config.quant_max_bound = 448
quant_config.quant_min_bound = -448
quant_config.quant_round_type = 1
model_config.use_smooth_quant = False
elif weight_dtype == "int8" and act_dtype == weight_dtype:
quant_cls = get_quantization_config("w8a8")
quant_config = quant_cls.from_config({
"weight_scale_dict": {},
"act_scale_dict": {},
"use_gemm_dequant": False
})
quant_config.quant_max_bound = 127
quant_config.quant_min_bound = -127
quant_config.quant_round_type = 0
model_config.use_smooth_quant = True
elif weight_dtype == "float8_e4m3fn" and act_dtype == weight_dtype:
quant_cls = get_quantization_config("wfp8afp8")
quant_config = quant_cls.from_config({
"weight_scale_dict": {},
"act_scale_dict": {}
})
quant_config.quant_max_bound = 448
quant_config.quant_min_bound = -448
quant_config.quant_round_type = 1
model_config.use_smooth_quant = False
else:
quant_config = None
llm_config = LLMConfig(
model_config=model_config,
parallel_config=parallel_config,
speculative_config=speculative_config,
device_config=device_config,
additional_config=additional_config,
load_config=load_config,
tmp_config=tmp_config,
moe_config=moe_config,
decoding_config=decoding_config,
quant_config=quant_config,
)
with context:
model_cls = ModelRegistry.get_class(model_config.architectures[0])
model = model_cls(llm_config)
model.eval()
if use_fake_parameter:
return config, tokenizer, model
elif not use_moe:
for k, v in state_dict.items():
if convert_dtype(v.dtype) == dtype:
continue
elif convert_dtype(v.dtype) == "float32":
continue
state_dict[k] = convert_ndarray_dtype(v, dtype)
paddle.device.cuda.empty_cache()
assert state_dict is not None
model.set_state_dict(state_dict)
if use_ep and generation_phase == GenerationPhase.DECODER:
logger.info("Reloading model...")
reconstruct_memory(model)
logger.info(f"{runtime_timer.log()}")
if sharing_state_dicts is not None:
for k in list(sharing_state_dicts):
sharing_state_dicts.pop(k)
possible_state_dict = state_dict if return_state_dicts else None
return config, tokenizer, model, possible_state_dict
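# A NumPy sketch of the `padding` helper above, which zero-pads expert weights
# so the per-rank MoE intermediate size becomes a multiple of 128 (an alignment
# requirement of the fp8 kernels). All sizes below are made up for illustration:
import numpy as np

_hidden, _ori_inter, _target_inter = 8, 96, 128                 # per-rank sizes
_up_gate = np.ones((_hidden, 2 * _ori_inter), dtype="float32")  # [hidden, 2*inter]: gate half + up half
_down = np.ones((_ori_inter, _hidden), dtype="float32")         # [inter, hidden]

_up_gate_pad = np.zeros((_hidden, 2 * _target_inter), dtype=_up_gate.dtype)
_up_gate_pad[:, :_ori_inter] = _up_gate[:, :_ori_inter]                                # gate half
_up_gate_pad[:, _target_inter:_target_inter + _ori_inter] = _up_gate[:, _ori_inter:]   # up half

_down_pad = np.zeros((_target_inter, _hidden), dtype=_down.dtype)
_down_pad[:_ori_inter, :] = _down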

View File

@@ -20,9 +20,12 @@ from functools import partial
import paddle
from paddle import nn
from paddlenlp.transformers import PretrainedModel
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import LLMConfig, ModelConfig
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
@@ -31,7 +34,7 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.model_runner import ForwardMeta
from fastdeploy.worker.forward_meta import ForwardMeta
class Qwen2MLP(nn.Layer):
@@ -40,32 +43,33 @@ class Qwen2MLP(nn.Layer):
def __init__(
self,
llm_config: LLMConfig,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
self.nranks = llm_config.parallel_config.mp_size
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.gate_up_proj = MergedColumnParallelLinear(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.ffn_hidden_size * 2,
with_bias=False,
activation=llm_config.model_config.hidden_act,
activation=fd_config.model_config.hidden_act,
use_fast_ffn=True,
)
self.down_proj = RowParallelLinear(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.down_proj",
input_size=(llm_config.model_config.ffn_hidden_size //
self.nranks),
output_size=llm_config.model_config.hidden_size,
input_size=(fd_config.model_config.ffn_hidden_size // self.nranks),
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
self.act_fn = SiluAndMul(
llm_config=llm_config,
fd_config=fd_config,
bias=getattr(self.gate_up_proj, "linear_bias", None),
act_method=llm_config.model_config.hidden_act,
act_method=fd_config.model_config.hidden_act,
)
def load_state_dict(self, state_dict):
@@ -88,25 +92,25 @@ class Qwen2Attention(nn.Layer):
"""
def __init__(self,
llm_config: LLMConfig,
fd_config: FDConfig,
layer_id: int,
prefix: str = "") -> None:
super().__init__()
nranks = llm_config.parallel_config.mp_size
nranks = fd_config.parallel_config.tensor_parallel_degree
self.qkv_proj = QKVParallelLinear(llm_config=llm_config,
self.qkv_proj = QKVParallelLinear(fd_config=fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=True)
self.o_proj = RowParallelLinear(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.o_proj",
input_size=(llm_config.model_config.hidden_size // nranks),
output_size=llm_config.model_config.hidden_size,
input_size=(fd_config.model_config.hidden_size // nranks),
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(llm_config=llm_config,
self.attn = Attention(fd_config=fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=True)
@@ -140,33 +144,33 @@ class Qwen2DecoderLayer(nn.Layer):
def __init__(
self,
llm_config: LLMConfig,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Qwen2Attention(
llm_config=llm_config,
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
self.mlp = Qwen2MLP(
llm_config=llm_config,
fd_config=fd_config,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
llm_config,
hidden_size=llm_config.model_config.hidden_size,
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
llm_config,
hidden_size=llm_config.model_config.hidden_size,
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.post_attention_layernorm",
)
@@ -209,13 +213,14 @@ class Qwen2DecoderLayer(nn.Layer):
return hidden_states, residual
@support_graph_optimization
class Qwen2Model(nn.Layer):
"""
"""
def __init__(
self,
llm_config: LLMConfig = None,
fd_config: FDConfig = None,
):
"""
Initializer for the Qwen2Model class.
@@ -225,29 +230,29 @@ class Qwen2Model(nn.Layer):
"""
super().__init__()
self.num_layers = llm_config.model_config.num_layers
llm_config.model_config.prefix_name = "qwen2"
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "qwen2"
self.embeddings = VocabParallelEmbedding(
llm_config=llm_config,
num_embeddings=llm_config.model_config.vocab_size,
embedding_dim=llm_config.model_config.hidden_size,
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{llm_config.model_config.prefix_name}.embed_tokens"),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen2DecoderLayer(
llm_config=llm_config,
prefix=f"{llm_config.model_config.prefix_name}.layers.{i}")
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
self.norm = RMSNorm(
llm_config,
hidden_size=llm_config.model_config.hidden_size,
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-5,
prefix=f"{llm_config.model_config.prefix_name}.norm",
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -262,6 +267,7 @@ class Qwen2Model(nn.Layer):
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)
def forward(
@@ -292,21 +298,21 @@ class Qwen2ForCausalLM(ModelForCasualLM):
Qwen2ForCausalLM
"""
def __init__(self, llm_config: LLMConfig):
def __init__(self, fd_config: FDConfig):
"""
Args:
llm_config (LLMConfig): Configurations for the LLM model.
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Qwen2ForCausalLM, self).__init__(llm_config)
super(Qwen2ForCausalLM, self).__init__(fd_config)
self.model = Qwen2Model(llm_config=llm_config)
self.model = Qwen2Model(fd_config=fd_config)
self.ori_vocab_size = llm_config.model_config.ori_vocab_size
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
llm_config=llm_config,
embedding_dim=llm_config.model_config.hidden_size,
num_embeddings=llm_config.model_config.vocab_size,
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
@@ -345,7 +351,8 @@ class Qwen2ForCausalLM(ModelForCasualLM):
):
"""
"""
hidden_states = self.model(ids_remove_padding, forward_meta)
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
@@ -355,7 +362,7 @@ class Qwen2PretrainedModel(PretrainedModel):
Qwen2PretrainedModel
"""
config_class = LLMConfig
config_class = FDConfig
def _init_weight(self, layer):
"""
@@ -366,7 +373,8 @@ class Qwen2PretrainedModel(PretrainedModel):
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
from paddlenlp.transformers.conversion_utils import split_or_merge_func
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,

View File

@@ -0,0 +1,361 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
from fastdeploy.worker.forward_meta import ForwardMeta
class Qwen3MLP(Qwen2MLP):
"""
"""
pass
class Qwen3Attention(nn.Layer):
"""
"""
def __init__(self,
fd_config: FDConfig,
layer_id: int,
prefix: str = "") -> None:
super().__init__()
self.fd_config = fd_config
self.head_dim = fd_config.model_config.head_dim
nranks = fd_config.parallel_config.tensor_parallel_degree
self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks
self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim // nranks
self.qkv_proj = QKVParallelLinear(fd_config=fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=False)
self.o_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.o_proj",
input_size=fd_config.model_config.head_dim *
fd_config.model_config.num_attention_heads // nranks,
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(fd_config=fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=True)
self.q_norm = RMSNorm(fd_config=fd_config,
hidden_size=fd_config.model_config.head_dim,
eps=1e-6,
prefix=f"{prefix}.q_norm",
begin_norm_axis=2)
self.k_norm = RMSNorm(fd_config=fd_config,
hidden_size=fd_config.model_config.head_dim,
eps=1e-6,
prefix=f"{prefix}.k_norm",
begin_norm_axis=2)
def load_state_dict(self, state_dict):
"""
"""
self.qkv_proj.load_state_dict(state_dict)
self.o_proj.load_state_dict(state_dict)
self.q_norm.load_state_dict(state_dict)
self.k_norm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
):
"""
"""
qkv_out = self.qkv_proj(hidden_states)
# origin_qkv_out = qkv_out
q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size],
axis=-1)
q_by_head = q.reshape(
[*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim])
q_by_head = self.q_norm(q_by_head)
q = q_by_head.reshape(q.shape)
k_by_head = k.reshape(
[*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim])
k_by_head = self.k_norm(k_by_head)
k = k_by_head.reshape(k.shape)
qkv_out = paddle.concat([q, k, v], axis=-1)
atten_out = self.attn(
qkv=qkv_out,
forward_meta=forward_meta,
)
output = self.o_proj(atten_out)
return output
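# A NumPy sketch of the per-head q/k normalization above: the flat
# [*, num_heads * head_dim] projection is viewed as [*, num_heads, head_dim],
# RMS-normalized over the last axis, then flattened back (sizes below are
# made up for illustration; the learned norm weight is omitted):
import numpy as np

def _toy_per_head_rms_norm(x, head_dim, eps=1e-6):
    shaped = x.reshape(*x.shape[:-1], x.shape[-1] // head_dim, head_dim)
    rms = np.sqrt(np.mean(np.square(shaped), axis=-1, keepdims=True) + eps)
    return (shaped / rms).reshape(x.shape)

_q = np.random.randn(2, 4 * 16).astype("float32")   # 4 heads of head_dim 16
_q_normed = _toy_per_head_rms_norm(_q, head_dim=16)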
class Qwen3DecoderLayer(Qwen2DecoderLayer):
"""
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__(fd_config, prefix)
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Qwen3Attention(fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn")
@support_graph_optimization
class Qwen3Model(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Qwen3Model class.
Args:
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "model"
fd_config.model_config.tie_word_embeddings = True
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype(),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.layers[i](forward_meta,
hidden_states, residual)
hidden_states = hidden_states + residual
out = self.norm(hidden_states)
return out
class Qwen3ForCausalLM(ModelForCasualLM):
"""
Qwen3ForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Qwen3ForCausalLM, self).__init__(fd_config)
self.model = Qwen3Model(fd_config=fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config=fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings
@classmethod
def name(cls):
"""
"""
return "Qwen3ForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
if self.tie_word_embeddings:
self.lm_head.out_linear.weight.set_value(
self.model.embeddings.word_embeddings.weight.transpose([1, 0]))
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
"""
"""
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
class Qwen3PretrainedModel(PretrainedModel):
"""
Qwen3PretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def get_tensor_parallel_split_mappings(num_layers):
final_actions = {}
base_actions = {
# Row Linear
"embed_tokens.weight": partial(fn, is_column=False),
"layers.0.self_attn.o_proj.weight": partial(fn,
is_column=False),
"layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
}
# Column Linear
base_actions["layers.0.self_attn.q_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.q_proj.bias"] = partial(
fn, is_column=True)
# if we have enough num_key_value_heads to split, then split it.
if config.num_key_value_heads % config.tensor_parallel_degree == 0:
base_actions["layers.0.self_attn.k_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.v_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.mlp.gate_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.mlp.up_proj.weight"] = partial(
fn, is_column=True)
for key, action in base_actions.items():
if "layers.0." in key:
for i in range(num_layers):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
final_actions[key] = action
return final_actions
mappings = get_tensor_parallel_split_mappings(config.num_layers)
return mappings
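# A minimal sketch of how get_tensor_parallel_split_mappings expands the
# "layers.0.*" split rules to every layer index (the action values here are
# placeholder strings instead of real split functions):
_base_actions = {"layers.0.self_attn.q_proj.weight": "split_column",
                 "embed_tokens.weight": "split_row"}
_final_actions = {}
for _key, _action in _base_actions.items():
    if "layers.0." in _key:
        for _i in range(3):                                        # e.g. a 3-layer model
            _final_actions[_key.replace("layers.0.", f"layers.{_i}.")] = _action
    _final_actions[_key] = _action
# _final_actions now holds q_proj entries for layers 0..2 plus embed_tokens.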

View File

@@ -0,0 +1,509 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
from functools import partial
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.worker.forward_meta import ForwardMeta
class Qwen3MLP(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.gate_up_proj = MergedColumnParallelLinear(
fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.ffn_hidden_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
use_fast_ffn=True,
)
self.down_proj = RowParallelLinear(
fd_config,
prefix=f"{prefix}.down_proj",
input_size=(fd_config.model_config.ffn_hidden_size // self.nranks),
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
self.act_fn = SiluAndMul(
fd_config,
bias=getattr(self.gate_up_proj, "linear_bias", None),
act_method=fd_config.model_config.hidden_act,
)
def load_state_dict(self, state_dict):
"""
"""
self.gate_up_proj.load_state_dict(state_dict)
self.down_proj.load_state_dict(state_dict)
def forward(self, x):
"""
"""
gate_up_out = self.gate_up_proj(x)
act_out = self.act_fn(gate_up_out)
down_out = self.down_proj(act_out)
return down_out
class Qwen3Attention(nn.Layer):
"""
"""
def __init__(self,
fd_config: FDConfig,
layer_id: int,
prefix: str = "") -> None:
super().__init__()
self.fd_config = fd_config
self.head_dim = fd_config.model_config.head_dim
self.qkv_proj = QKVParallelLinear(fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=False)
nranks = fd_config.parallel_config.tensor_parallel_degree
self.o_proj = RowParallelLinear(
fd_config,
prefix=f"{prefix}.o_proj",
input_size=fd_config.model_config.head_dim *
fd_config.model_config.num_attention_heads // nranks,
output_size=fd_config.model_config.hidden_size,
)
self.attn = Attention(fd_config,
layer_id=layer_id,
prefix=prefix,
use_neox_rotary_style=True)
self.q_norm = RMSNorm(fd_config,
hidden_size=self.head_dim,
eps=1e-6,
prefix=f"{prefix}.q_norm",
begin_norm_axis=2)
self.k_norm = RMSNorm(fd_config,
hidden_size=self.head_dim,
eps=1e-6,
prefix=f"{prefix}.k_norm",
begin_norm_axis=2)
self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks
self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim // nranks
def load_state_dict(self, state_dict):
"""
"""
self.qkv_proj.load_state_dict(state_dict)
self.o_proj.load_state_dict(state_dict)
self.q_norm.load_state_dict(state_dict)
self.k_norm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
):
"""
"""
qkv_out = self.qkv_proj(hidden_states)
# origin_qkv_out = qkv_out
q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size],
axis=-1)
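# Qwen3 applies RMSNorm per attention head (QK-Norm): reshape q/k to
# [..., num_heads, head_dim], normalize the last dimension, then restore
# the flat layout before re-concatenating with v.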
q_by_head = q.reshape(
[*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim])
q_by_head = self.q_norm(q_by_head)
q = q_by_head.reshape(q.shape)
k_by_head = k.reshape(
[*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim])
k_by_head = self.k_norm(k_by_head)
k = k_by_head.reshape(k.shape)
qkv_out = paddle.concat([q, k, v], axis=-1)
atten_out = self.attn(
qkv=qkv_out,
forward_meta=forward_meta,
)
output = self.o_proj(atten_out)
return output
class Qwen3DecoderLayer(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig,
prefix: str = "",
) -> None:
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
self.self_attn = Qwen3Attention(
fd_config=fd_config,
layer_id=layer_id,
prefix=f"{prefix}.self_attn",
)
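# Checkpoint key templates for the fused MoE experts; the "{}" placeholder
# is filled with the expert index when the weights are loaded.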
weight_key_map = {
"gate_weight_key":
f"{prefix}.mlp.gate.weight",
"ffn1_expert_weight_key":
f"{prefix}.mlp.experts.{{}}.up_gate_proj.weight",
"ffn2_expert_weight_key":
f"{prefix}.mlp.experts.{{}}.down_proj.weight",
}
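# Layers at or beyond moe_layer_start_index use the fused MoE block; earlier
# layers (or models without experts configured) fall back to the dense Qwen3MLP.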
if (fd_config.moe_config.num_experts is not None
and layer_id >= fd_config.moe_config.moe_layer_start_index):
self.mlp = FusedMoE(fd_config,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size,
num_experts=fd_config.moe_config.num_experts,
top_k=fd_config.moe_config.top_k,
layer_idx=layer_id,
weight_key_map=weight_key_map)
else:
self.mlp = Qwen3MLP(
fd_config,
prefix=f"{prefix}.mlp",
)
self.input_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.input_layernorm",
)
self.post_attention_layernorm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{prefix}.post_attention_layernorm",
)
def load_state_dict(self, state_dict):
"""
"""
self.self_attn.load_state_dict(state_dict)
self.mlp.load_state_dict(state_dict)
self.input_layernorm.load_state_dict(state_dict)
self.post_attention_layernorm.load_state_dict(state_dict)
def forward(
self,
forward_meta: ForwardMeta,
hidden_states: paddle.Tensor,
residual: paddle.Tensor = None,
):
"""
"""
if residual is None:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
else:
hidden_states, residual = self.input_layernorm(
hidden_states, residual)
hidden_states = self.self_attn(
hidden_states=hidden_states,
forward_meta=forward_meta,
)
# Fully Connected
hidden_states, residual = self.post_attention_layernorm(
hidden_states, residual)
hidden_states = self.mlp(hidden_states)
return hidden_states, residual
@support_graph_optimization
class Qwen3MoeModel(nn.Layer):
"""
"""
def __init__(
self,
fd_config: FDConfig = None,
):
"""
Initializer for the Qwen3MoeModel class.
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "model"
self.embeddings = VocabParallelEmbedding(
fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
self.norm = RMSNorm(
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{fd_config.model_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.embeddings.load_state_dict(state_dict)
self.norm.load_state_dict(state_dict)
for i in range(self.num_layers):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.embeddings(ids_remove_padding=ids_remove_padding)
residual = None
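# Each decoder layer returns (hidden_states, residual) so the residual add
# can be fused with the next layer's RMSNorm; the final add happens here
# before the last norm.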
for i in range(self.num_layers):
hidden_states, residual = self.layers[i](forward_meta,
hidden_states, residual)
hidden_states = hidden_states + residual
out = self.norm(hidden_states)
return out
class Qwen3MoeForCausalLM(ModelForCasualLM):
"""
Qwen3MoeForCausalLM
"""
def __init__(self, fd_config: FDConfig):
"""
Args:
fd_config (FDConfig): Configurations for the LLM model.
"""
super(Qwen3MoeForCausalLM, self).__init__(fd_config)
self.model = Qwen3MoeModel(fd_config)
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
fd_config,
embedding_dim=fd_config.model_config.hidden_size,
num_embeddings=fd_config.model_config.vocab_size,
prefix="lm_head",
)
@classmethod
def name(cls):
"""
"""
return "Qwen3MoeForCausalLM"
@paddle.no_grad()
def set_state_dict(self, state_dict):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
and values are NumPy arrays or PaddlePaddle tensors.
"""
self.model.load_state_dict(state_dict)
self.lm_head.load_state_dict(state_dict)
def compute_logits(self, hidden_states: paddle.Tensor):
"""
"""
logits = self.lm_head(hidden_states)
logits = paddle.cast(logits, paddle.float32)
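# Mask logits for vocab positions beyond ori_vocab_size (padding slots) so
# they can never be sampled.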
logits[:, self.ori_vocab_size:] = -float("inf")
return logits
def forward(
self,
ids_remove_padding: paddle.Tensor,
forward_meta: ForwardMeta,
):
"""
"""
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
forward_meta=forward_meta)
return hidden_states
class Qwen3MoePretrainedModel(PretrainedModel):
"""
Qwen3MoePretrainedModel
"""
config_class = FDConfig
def _init_weight(self, layer):
"""
_init_weight
"""
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
# TODO not support TP split now, next PR will support TP.
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
fn = split_or_merge_func(
is_split=is_split,
tensor_parallel_degree=config.tensor_parallel_degree,
tensor_parallel_rank=config.tensor_parallel_rank,
num_attention_heads=config.num_attention_heads,
)
def get_tensor_parallel_split_mappings(num_layers, moe_num_experts):
final_actions = {}
base_actions = {
"lm_head.weight": partial(fn, is_column=True),
# Row Linear
"embed_tokens.weight": partial(fn, is_column=False),
"layers.0.self_attn.o_proj.weight": partial(fn,
is_column=False),
}
# Column Linear
config.fuse_attention_qkv = False
if config.fuse_attention_qkv:
base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(
fn, is_column=True)
else:
base_actions["layers.0.self_attn.q_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.q_proj.bias"] = partial(
fn, is_column=True)
# if we have enough num_key_value_heads to split, then split it.
if config.num_key_value_heads % config.tensor_parallel_degree == 0:
base_actions["layers.0.self_attn.k_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.v_proj.weight"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.k_proj.bias"] = partial(
fn, is_column=True)
base_actions["layers.0.self_attn.v_proj.bias"] = partial(
fn, is_column=True)
for key, action in base_actions.items():
if "layers.0." in key:
for i in range(num_layers):
final_actions[key.replace("layers.0.",
f"layers.{i}.")] = action
final_actions[key] = action
base_actions = {
"layers.0.mlp.experts.0.gate_proj.weight":
partial(fn, is_column=True),
"layers.0.mlp.experts.0.down_proj.weight":
partial(fn, is_column=False),
"layers.0.mlp.experts.0.up_proj.weight":
partial(fn, is_column=True),
}
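# Expand the per-expert split actions across every layer and every expert index.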
for key, action in base_actions.items():
for i in range(num_layers):
newkey = key.replace("layers.0.", f"layers.{i}.")
for j in range(moe_num_experts):
newkey2 = newkey.replace("experts.0.", f"experts.{j}.")
final_actions[newkey2] = action
return final_actions
moe_num_experts = 0
if isinstance(config.moe_num_experts, list):
moe_num_experts = sum(config.moe_num_experts)
elif isinstance(config.moe_num_experts, int):
moe_num_experts = config.moe_num_experts
else:
raise ValueError(
f"Not support type of moe_num_experts [{type(config.moe_num_experts)}]"
)
mappings = get_tensor_parallel_split_mappings(config.num_layers,
moe_num_experts)
return mappings

View File

@@ -1,382 +0,0 @@
"""
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import re
from shutil import copyfile
from typing import Dict, List, Optional, Tuple
import numpy as np
import paddle
import sentencepiece as spm
from paddlenlp.transformers import PretrainedTokenizer
from paddlenlp.transformers.tokenizer_utils_base import (PaddingStrategy,
TextInput)
from paddlenlp.utils.log import logger
__all__ = ["ErnieBotTokenizer"]
# copy from ernie_core.tokenizers
class ErnieBotTokenizer(PretrainedTokenizer):
"""
A more convenient `ErnieBotTokenizer` that can encode the special tokens used
in the current SFT/PPO stages and also supports multimodal inputs.
"""
resource_files_names = {
"vocab_file": "spm.model",
}
pretrained_resource_files_map = {"vocab_file": {"ernie-bot-10b": None}}
pretrained_init_configuration = {
"ernie-bot-10b": {},
}
model_input_names = [
"input_ids", "position_ids", "attention_mask", "labels"
]
padding_side = "right"
def __init__(
self,
vocab_file,
bos_token="<s>",
cls_token="<cls>",
eos_token="</s>",
mask_token="<mask:0>",
pad_token="<pad>",
sep_token="<sep>",
unk_token="<unk>",
additional_special_tokens=None,
**kwargs,
):
"""doc"""
if additional_special_tokens is None:
additional_special_tokens = ["<mask:1>", "<mask:7>"]
super().__init__(
bos_token=bos_token,
cls_token=cls_token,
eos_token=eos_token,
mask_token=mask_token,
pad_token=pad_token,
sep_token=sep_token,
unk_token=unk_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
@property
def space_token(self):
"""doc"""
return "<mask:1>"
@property
def space_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:1>")
@property
def gend_token(self):
"""doc"""
return "<mask:7>"
@property
def gend_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:7>")
@property
def im_start_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_start|>")
@property
def im_end_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_end|>")
@property
def vocab_size(self):
"""doc"""
return self.sp_model.vocab_size()
def get_vocab(self):
"""doc"""
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""doc"""
return self.sp_model.encode_as_pieces(text)
def _convert_token_to_id(self, token):
"""doc"""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, id):
"""doc"""
return self.sp_model.id_to_piece(id)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
# prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
# if not prev_is_special:
# out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
# prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
# prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string # .strip()
def prepare_for_model(self, *args, **kwargs):
"""doc"""
if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(
f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
self.resource_files_names["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file, )
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
"""
Converts a string in a sequence of tokens, using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
Args:
text (`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
Returns:
`List[str]`: The list of tokens.
"""
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
# all_special_tokens_extended = dict(
# (str(t), t)
# for t in self.all_special_tokens_extended
# if isinstance(t, AddedToken)
# )
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
# TODO: should this be in the base class?
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens +
self.all_special_tokens)
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern,
lambda m: m.groups()[0] or m.groups()[1].lower(),
text)
no_split_token = set(self.unique_no_split_tokens)
tokens = self.tokens_trie.split(text)
tokenized_text = []
for token in tokens:
# Need to skip eventual empty (fully stripped) tokens
if not token:
continue
if token in no_split_token:
tokenized_text.append(token)
else:
tokenized_text.extend(self._tokenize(token))
# ["This", " is", " something", "<special_token_1>", "else"]
return tokenized_text
def _decode(self, *args, **kwargs):
"""doc"""
kwargs.pop("clean_up_tokenization_spaces", None)
kwargs.pop("spaces_between_special_tokens", None)
return super()._decode(
*args,
**kwargs,
clean_up_tokenization_spaces=False,
spaces_between_special_tokens=False,
)
def _pad(
self,
encoded_inputs: Dict,
max_length: Optional[int] = None,
padding_strategy=PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""doc"""
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_attention_mask:
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if (max_length is not None and pad_to_multiple_of is not None
and (max_length % pad_to_multiple_of != 0)):
max_length = ((max_length // pad_to_multiple_of) +
1) * pad_to_multiple_of
needs_to_be_padded = (padding_strategy
!= PaddingStrategy.DO_NOT_PAD
and len(required_input) != max_length)
if ("attention_mask" in encoded_inputs
and encoded_inputs["attention_mask"] is not None):
attention_mask = encoded_inputs.pop("attention_mask")
if isinstance(attention_mask, paddle.Tensor):
attention_mask = attention_mask.numpy()
elif isinstance(attention_mask, list):
attention_mask = np.array(attention_mask)
elif not isinstance(attention_mask, np.ndarray):
raise ValueError(
f"Unexpected type {type(attention_mask)} of attention_mask, "
)
else:
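# No attention mask provided: build a causal (lower-triangular) mask of
# shape [1, seq_len, seq_len].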
attention_mask = np.tril(
np.ones((len(required_input), len(required_input)),
dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
if attention_mask.ndim == 1:
pad_width = [(0, difference)]
else:
pad_width = [(0, 0), (0, difference), (0, difference)]
elif self.padding_side == "left":
if attention_mask.ndim == 1:
pad_width = [(difference, 0)]
else:
pad_width = [(0, 0), (difference, 0), (difference, 0)]
else:
raise ValueError("Invalid padding strategy:" +
str(self.padding_side))
attention_mask = np.pad(
attention_mask,
pad_width=pad_width,
mode="constant",
constant_values=0,
)
encoded_inputs = super()._pad(
encoded_inputs,
max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=False,
)
if return_attention_mask:
encoded_inputs["attention_mask"] = attention_mask.tolist()
return encoded_inputs
def add_special_tokens(
tokenizer,
special_tokens_info,
use_ocr_specialtoken=False,
use_crop_specialtoken=False,
special_token_ids_start=254208,
special_token_ids_end=256256,
):
"""
Add special tokens.
Placeholders: [<|IMAGE_PLACEHOLDER|>, <|AUDIO_PLACEHOLDER|>, <|VIDEO_PLACEHOLDER|>] (3 tokens)
Modality begin/end special tokens: [<|BOI|> <|EOI|> <|BOA|> <|EOA|> <|BOV|> <|EOV|>]
OCR special tokens: [<|LOC_0|> <|LOC_1|> ... <|LOC_1000|>] (1001 tokens)
Crop special tokens: [<|CROP_COL_SEP|>, <|CROP_ROW_SEP|>, <|CROP_IMAGE_SEP|>] (3 tokens)
<|CROP_COL_SEP|>: column-wise split separator along the image width (replaces the plain-text comma)
<|CROP_ROW_SEP|>: row-wise split separator along the image height (replaces the plain-text newline)
<|CROP_IMAGE_SEP|>: separates the original image from its crops (replaces the plain-text double newline)
2048 unused tokens in total
Args:
tokenizer (ErnieTokenizer): tokenizer
special_token_ids_start (int, optional): starting id for special tokens. Defaults to 254208.
special_token_ids_end (int, optional): maximum supported vocabulary size. Defaults to 256256.
"""
special_tokens = [
special_tokens_info["image_placeholder"],
special_tokens_info["audio_placeholder"],
]
if use_ocr_specialtoken:
special_tokens.extend(special_tokens_info["ocr_coor"])
special_tokens.extend(special_tokens_info["ocr_begin_end"])
if use_crop_specialtoken:
special_tokens.extend(special_tokens_info["crop"])
# add special_tokens
additional_special_tokens = {"additional_special_tokens": special_tokens}
tokenizer.add_special_tokens(additional_special_tokens)
# check
first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
assert (first_special_tokens[0] == special_token_ids_start
), f"[ERROR] first_special_tokens={first_special_tokens}"
assert (
len(tokenizer.get_vocab()) < special_token_ids_end
), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!"

File diff suppressed because it is too large

View File

@@ -0,0 +1,22 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
try:
from .wint2_fused_moe import fused_moe_wint2_triton
__all__ = ["fused_moe_wint2_triton"]
except Exception:  # the optional Triton-based kernels may be unavailable
pass

View File

@@ -0,0 +1,804 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import inspect
import os
import re
import sys
import paddle
import triton
from paddle.base.framework import OpProtoHolder
from fastdeploy import envs
compile_file = triton.__path__[0] + "/tools/compile.py"
link_file = triton.__path__[0] + "/tools/link.py"
python_path = sys.executable
def SubstituteTemplate(template, values):
"""
Substitute all variables in the given template string using the provided values dictionary.
"""
text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
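# Illustrative example: SubstituteTemplate("${op}_func", {"op": "gemm"})
# returns "gemm_func". Substitution repeats until no "${...}" placeholder
# changes, so values may themselves contain placeholders.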
def find_so_path(generated_dir, python_package_name):
"""
Find the specified .so in generated_dir; return None if it is not found.
"""
so_path = []
for root, dirs, files in os.walk(generated_dir):
for file in files:
if file.endswith(python_package_name + ".so"):
so_path.append(os.path.join(root, file))
if len(so_path) == 0:
return None
else:
assert len(so_path) == 1
return so_path[0]
def multi_process_do(commands):
"""
Multi-threaded execution of commands.
"""
THREADS = 40
import multiprocessing
process = []
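# Worker thread_id executes commands[thread_id::THREADS]; a nonzero exit
# status from any command aborts via the assert below.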
def one_process_work(commands, thread_id):
i = thread_id
while i < len(commands):
re = os.system(commands[i])
assert re == 0
i += THREADS
for i in range(THREADS):
p = multiprocessing.Process(target=one_process_work,
args=(commands, i))
process.append(p)
for p in process:
p.start()
for p in process:
p.join()
def extract_triton_kernel(kernel, file_name):
"""
Extract the triton kernel and write it to the specified file_name.
Args:
kernel: the triton kernel name.
file_name: the file name you want to write.
"""
import inspect
import re
import textwrap
fn = kernel
if type(kernel) == triton.runtime.jit.JITFunction:
fn = kernel.fn
elif type(kernel) == triton.runtime.autotuner.Autotuner:
fn = kernel.fn.fn
else:
raise AssertionError(f"unsupported kernel type: {type(kernel)}")
py_script = textwrap.dedent(inspect.getsource(fn))
# @triton.jit must only appear once
# assert len(re.findall("@triton.jit", py_script)) == 1
assert len(re.findall("def ", py_script)) == 1
# assert len(re.findall("@haha()", py_script)) == 1
# py_script = py_script.replace("@haha()", "@triton.jit")
py_script = py_script[py_script.find("def "):]
py_script = "import triton\nimport triton.language as tl\n\n\n@triton.jit\n" + py_script
py_script = py_script.replace("if bias_ptr is not None", "if bias_ptr")
with open(file_name, "w") as f:
f.write(py_script)
f.close()
template_install = """
import os
generated_cu = []
for root, dirs, files in os.walk("./"):
for file in files:
if file.endswith(".c") or file.endswith(".cu"):
generated_cu.append(os.path.join(root, file))
import paddle
from paddle.utils.cpp_extension import CUDAExtension, setup
def get_gencode_flags():
prop = paddle.device.cuda.get_device_properties()
cc = prop.major * 10 + prop.minor
return ["-gencode", "arch=compute_{{0}},code=sm_{{0}}".format(cc)]
gencode_flags = get_gencode_flags()
setup(
name="{python_package_name}",
ext_modules=CUDAExtension(
sources = generated_cu,
extra_compile_args={{
"cc": ["-lcuda"],
"nvcc": [
"-O3",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
]
+ gencode_flags,
}},
extra_link_args = ["-lcuda"]
),
)
"""
def get_op_name_with_suffix(op_name, x_list):
"""
Get the operator name with suffix.
"""
suffix = []
for x in x_list:
if x % 16 == 0:
suffix.append(16)
elif x == 1:
suffix.append(1)
else:
suffix.append(0)
return op_name + "_".join([str(i) for i in suffix])
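# Illustrative example: get_op_name_with_suffix("moe_ffn", [64, 1, 5])
# returns "moe_ffn16_1_0" (16 for multiples of 16, 1 for unit dims, 0 otherwise).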
def get_value_hint(x):
"""
Get the value hint from input list.
"""
hint = ""
for ele in x:
if type(ele) == int:
if ele % 16 == 0 and ele > 0:
hint += "i64:16,"
elif ele == 1:
hint += "i64:1,"
else:
hint += "i64,"
if type(ele) == float:
hint += "fp32,"
return hint
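# Illustrative example: get_value_hint([32, 1, 3, 0.5]) returns
# "i64:16,i64:1,i64,fp32,".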
def get_dtype_str(dtype):
"""
Get the dtype str.
"""
if dtype == paddle.float16:
return "_fp16"
if dtype == paddle.float8_e4m3fn:
return "_float8_e4m3fn"
elif dtype == paddle.uint8:
return "_u8"
elif dtype == paddle.int8:
return "_i8"
elif dtype == paddle.int16:
return "_i16"
elif dtype == paddle.int32:
return "_i32"
elif dtype == paddle.int64:
return "_i64"
elif dtype == paddle.float32:
return "_fp32"
elif dtype == paddle.bfloat16:
return "_bf16"
else:
raise ValueError("Not support this dtype.")
def build_package(generated_dir, python_package_name):
"""
Build the package, not install it.
Args:
generated_dir: the source cu file dir.
python_package_name: the python package name.
"""
setup_file_path = generated_dir + "/setup_cuda.py"
python_path = sys.executable
with open(setup_file_path, "w") as f:
f.write(
template_install.format(python_package_name=python_package_name))
f.close()
install_command = f"cd {generated_dir} && {python_path} setup_cuda.py build"
re = os.system(install_command)
assert re == 0
def rename_c_to_cu(generated_dir):
"""
Rename the .c files in generated_dir to .cu files, because the Triton AOT tool generates .c files.
"""
# rename the .c file to .cu
for filename in os.listdir(generated_dir):
if filename.endswith(".c"):
old_path = os.path.join(generated_dir, filename)
new_path = os.path.join(generated_dir, filename + "u")
os.rename(old_path, new_path)
def get_pointer_hint(dtypes):
"""
Get the pointer hint from input list.
"""
hint = ""
for ele in dtypes:
if ele == paddle.float16:
hint += "*fp16:16,"
elif ele == paddle.uint8:
hint += "*u8:16,"
elif ele == paddle.int8:
hint += "*i8:16,"
elif ele == paddle.int16:
hint += "*i16:16,"
elif ele == paddle.float32:
hint += "*fp32:16,"
elif ele == paddle.bfloat16:
hint += "*bf16:16,"
elif ele == paddle.int32:
hint += "*i32:16,"
elif ele == paddle.int64:
hint += "*i64,"
elif ele == paddle.float8_e4m3fn:
hint += "*fp8e4nv:16,"
return hint
paddle_custom_op_head_part = """#include <vector>
#include <map>
#include "${op_name}_kernel.h"
#include "paddle/extension.h"
std::map<std::vector<int>, int> map_problem_${op_name};
CUdeviceptr get_tensor_ptr(const paddle::Tensor& input){
if (input.type() == paddle::DataType::FLOAT16) {
return (CUdeviceptr)(input.data<phi::dtype::float16>());
} else if (input.type() == paddle::DataType::BFLOAT16) {
return (CUdeviceptr)(input.data<phi::dtype::bfloat16>());
} else if (input.type() == paddle::DataType::INT32) {
return (CUdeviceptr)(input.data<int>());
} else if (input.type() == paddle::DataType::FLOAT32) {
return (CUdeviceptr)(input.data<float>());
} else if (input.type() == paddle::DataType::UINT8) {
return (CUdeviceptr)(input.data<uint8_t>());
} else if (input.type() == paddle::DataType::INT8) {
return (CUdeviceptr)(input.data<int8_t>());
} else if (input.type() == paddle::DataType::INT64) {
return (CUdeviceptr)(input.data<int64_t>());
} else if (input.type() == paddle::DataType::INT32) {
return (CUdeviceptr)(input.data<int32_t>());
} else if (input.type() == paddle::DataType::INT16) {
return (CUdeviceptr)(input.data<int16_t>());
} else if (input.type() == paddle::DataType::FLOAT8_E4M3FN) {
return (CUdeviceptr)(input.data<phi::dtype::float8_e4m3fn>());
} else {
assert(false);
return (CUdeviceptr)(nullptr);
}
}
int triton_cdiv(int x, int y) {
int result = (x + y - 1) / y;
return (int)(result);
}
"""
tune_and_invoke_part = """
std::vector<int> problem_size = {${key}};
auto run_triton_kernel = [&](int algo_id) -> CUresult{
return ${op_name}_kernel(run_stream,
${triton_kernel_args},
algo_id);
};
map_problem_${op_name}[problem_size] = 0;
if (!map_problem_${op_name}.count(problem_size)) {
std::cout << "we are tuning for ${op_name} which key is: {";
for (int i = 0; i < problem_size.size(); i++) {
std::cout << problem_size[i] << ", ";
}
std::cout << "}" << std::endl;
float min_time = 10000.f;
int select_id = -1;
constexpr int WARMUP = 5;
constexpr int REPEAT = 10;
for (int algo_id = 0; algo_id < ${op_name}_kernel_get_num_algos(); ++algo_id) {
cudaEvent_t beg[REPEAT];
cudaEvent_t end[REPEAT];
float elapsed_times[REPEAT];
auto status = CUDA_SUCCESS;
for (int ii = 0; ii < WARMUP + REPEAT; ii++) {
int repeat_id = ii - WARMUP;
if (repeat_id >= 0) {
(cudaEventCreate(beg + repeat_id));
(cudaEventCreate(end + repeat_id));
(cudaEventRecord(beg[repeat_id]));
}
auto flush_l2_cache = paddle::full(
{10 * 1024 * 1024}, 0, paddle::DataType::INT32, ${arbitary_output_name}.place());
// std::cout << &flush_l2_cache << std::endl;
// this is used when the output needs to be reset to zero, such as in split-k gemm.
${reset_zero_when_tune};
status = run_triton_kernel(algo_id);
// assert(status == CUDA_SUCCESS);
if (repeat_id >= 0) {
(cudaEventRecord(end[repeat_id]));
(cudaEventSynchronize(end[repeat_id]));
(cudaEventElapsedTime(
elapsed_times + repeat_id, beg[repeat_id], end[repeat_id]));
}
}
float avg_elapsed_time = 0.f;
for (int ii = 0; ii < REPEAT; ++ii) {
avg_elapsed_time += elapsed_times[ii];
}
std::cout << "algo id " << algo_id << " costs " << avg_elapsed_time << " ms" << std::endl;
if (avg_elapsed_time < min_time && status == CUDA_SUCCESS) {
min_time = avg_elapsed_time;
select_id = algo_id;
}
}
map_problem_${op_name}[problem_size] = select_id;
std::cout << "select algo id: " << select_id << std::endl;
${reset_zero_when_tune};
}
if (map_problem_${op_name}.count(problem_size)) {
int algo_id = map_problem_${op_name}[problem_size];
auto status = run_triton_kernel(algo_id);
assert(status == CUDA_SUCCESS);
}
"""
common_template = ("""
std::vector<paddle::Tensor> ${op_name}_func(${input_and_attr}) {
${prepare_attr_for_triton_kernel}
${prepare_ptr_for_triton_kernel}
auto run_stream = ${arbitary_output_name}.stream();
""" + tune_and_invoke_part + """
return {${return_tensor_names}};
}
${d2s_infer_code}
PD_BUILD_OP(${op_name})
.Inputs({${paddle_input_sig}})
.Outputs({${paddle_output_sig}})
.Attrs({${paddle_attr_sig}})
.SetKernelFn(PD_KERNEL(${op_name}_func))
.SetInferDtypeFn(PD_INFER_DTYPE(${op_name}_InferDtype))
.SetInferShapeFn(PD_INFER_SHAPE(${op_name}_InferShape));
""")
def rendering_common_template(
func,
prepare_attr_for_triton_kernel,
prepare_ptr_for_triton_kernel,
return_tensor_names=None,
d2s_infer_code="",
):
"""
Render a template with given function and its arguments.
Args:
func: The function to render.
prepare_attr_for_triton_kernel: The code snippet that prepares attributes for Triton kernel.
prepare_ptr_for_triton_kernel: The code snippet that prepares pointers for Triton kernel.
return_tensor_names: The names of the returned tensors. Default is None.
"""
signature = inspect.signature(func)
arg_names = [v.name for v in signature.parameters.values()]
arg_defaults = [v.default for v in signature.parameters.values()]
input_and_attr = ""
paddle_input_sig = ""
paddle_attr_sig = ""
if return_tensor_names is None:
return_tensor_names = "useless"
prepare_ptr_for_triton_kernel += (
"auto useless = paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());"
)
for i in range(len(arg_names)):
if arg_defaults[i] is None:
input_and_attr += f"paddle::optional<paddle::Tensor> & {arg_names[i]},"
paddle_input_sig += f"""paddle::Optional("{arg_names[i]}"),"""
elif type(arg_defaults[i]) == float:
input_and_attr += f"float {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: float","""
elif type(arg_defaults[i]) == bool:
input_and_attr += f"bool {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: bool","""
elif type(arg_defaults[i]) == int:
input_and_attr += f"int64_t {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: int64_t","""
elif type(arg_defaults[i]) == str:
input_and_attr += f"std::string {arg_names[i]},"
paddle_attr_sig += f""""{arg_names[i]}: std::string","""
elif arg_names[i] == "config":
continue
else:
input_and_attr += f"const paddle::Tensor & {arg_names[i]},"
paddle_input_sig += f""""{arg_names[i]}","""
input_and_attr = input_and_attr[:-1]
paddle_input_sig = paddle_input_sig[:-1]
if len(paddle_attr_sig) > 1:
paddle_attr_sig = paddle_attr_sig[:-1]
paddle_output_sig = ""
arbitary_output_name = ""
for name in return_tensor_names.split(","):
name = name.strip()
arbitary_output_name = name
paddle_output_sig += f""""{name}","""
paddle_output_sig = paddle_output_sig[:-1]
if "${op_name}_InferShape" not in d2s_infer_code:
d2s_infer_shape_part = (
"std::vector<std::vector<int64_t>> ${op_name}_InferShape("
"const std::vector<int64_t>& A_shape) {"
"return {${tmp}};"
"}\n ")
tmp = ",".join(["A_shape"] * len(return_tensor_names.split(",")))
tmp_dict = {"tmp": tmp}
d2s_infer_shape_part = SubstituteTemplate(d2s_infer_shape_part,
tmp_dict)
d2s_infer_code += d2s_infer_shape_part
if "${op_name}_InferDtype" not in d2s_infer_code:
d2s_infer_dtype_part = (
"std::vector<paddle::DataType> ${op_name}_InferDtype("
"const paddle::DataType& A_dtype) {"
"return {${tmp}};"
"}\n ")
tmp = ",".join(["A_dtype"] * len(return_tensor_names.split(",")))
tmp_dict = {"tmp": tmp}
d2s_infer_dtype_part = SubstituteTemplate(d2s_infer_dtype_part,
tmp_dict)
d2s_infer_code += d2s_infer_dtype_part
result_str = SubstituteTemplate(
common_template,
{
"input_and_attr": input_and_attr,
"prepare_attr_for_triton_kernel": prepare_attr_for_triton_kernel,
"prepare_ptr_for_triton_kernel": prepare_ptr_for_triton_kernel,
"return_tensor_names": return_tensor_names,
"arbitary_output_name": arbitary_output_name,
"d2s_infer_code": d2s_infer_code,
"paddle_input_sig": paddle_input_sig,
"paddle_output_sig": paddle_output_sig,
"paddle_attr_sig": paddle_attr_sig,
},
)
return paddle_custom_op_head_part + result_str
class KernelInterface:
"""
triton kernel interface.
"""
def __init__(
self,
func,
other_config,
key_args=["1"],
):
"""
triton kernel interface.
"""
self.func = func
self.key_args = key_args
signature = inspect.signature(func)
self.arg_names = [v.name for v in signature.parameters.values()]
for ele in self.arg_names:
assert self.arg_names.count(ele) == 1
# arg_defaults = [v.default for v in signature.parameters.values()]
# self.annotations = {
# name: ty for name, ty in func.__annotations__.items()
# }
self.annotations = dict(func.__annotations__)
self.constexprs = [
self.arg_names.index(name) for name in self.arg_names
if self.annotations.get(name) == triton.language.core.constexpr
]
self.arg_exclude_constexpr = [
self.arg_names[i] for i in range(len(self.arg_names))
if i not in self.constexprs
]
import textwrap
py_script = textwrap.dedent(inspect.getsource(func))
import re
pat = r"def\s" + func.__name__
func_begin = re.findall(pat, py_script)
assert len(func_begin) == 1
func_begin = func_begin[0]
py_script = py_script[py_script.find(func_begin):]
def decorator(*args, **kwargs):
"""
decorator for triton kernels.
Args:
*args: positional arguments
**kwargs: keyword arguments
"""
all_input = []
for i in range(len(args)):
all_input.append(args[i])
position_arguments_num = len(all_input)
for i in range(position_arguments_num, len(self.arg_names)):
if self.arg_names[i] in kwargs.keys():
all_input.append(kwargs[self.arg_names[i]])
else:
# means this input is not specified, so it must be a tl.constexpr.
assert i in self.constexprs
all_input.append(None)
dtypes = []
x_list = []
const_args = [self.arg_names[i] for i in self.constexprs]
# we don't allow two strings in const_args where one is a substring of the other.
for i in const_args:
for j in const_args:
if i != j and i.find(j) != -1:
raise ValueError(
f"We find {i}, {j} in tl.constexpr args, and {j} is a substring of {i}, "
"please modify your triton kernel arguments names to avoid this."
)
modified_arg_exclude_constexpr = self.arg_exclude_constexpr
const_hint_dict = {}
for i in range(len(all_input)):
ele = all_input[i]
if (type(ele) == paddle.Tensor
or type(ele) == paddle.base.framework.EagerParamBase
or type(ele) == paddle.base.framework.Parameter
or type(ele) == paddle.base.framework.Variable
or type(ele) == paddle.base.libpaddle.pir.Value):
dtypes.append(ele.dtype)
modified_arg_exclude_constexpr[i] = f"input_ptrs[{i}]"
elif i in self.constexprs:
const_hint_dict[self.arg_names[i]] = ele
else:
x_list.append(ele)
op_name = self.op_name
python_package_name = f"{op_name}_package"
tp_rank = paddle.distributed.get_rank()
generated_dir = envs.FD_TRITON_KERNEL_CACHE_DIR
if generated_dir is None:
generated_dir = f"/tmp/triton_cache/rank{tp_rank}"
print("the kernel cache dir is:", generated_dir)
assert (generated_dir is not None), (
"TRITON_KERNEL_CACHE_DIR is None, please set it such as "
"export TRITON_KERNEL_CACHE_DIR=/tmp/triton_cache ")
generated_dir = f"{generated_dir}/{op_name}"
os.makedirs(generated_dir, exist_ok=True)
py_script_file = f"{generated_dir}/triton_kernels.py"
extract_triton_kernel(func, py_script_file)
address_hint = get_pointer_hint(dtypes)
value_hint = get_value_hint(x_list)
const_args = [f"{{{ele}}}" for ele in const_args]
const_args = ",".join(const_args)
lanuch_grid = list(self.grid)
for i in range(len(lanuch_grid)):
ele = lanuch_grid[i]
if type(ele) == str:
for key in const_hint_dict.keys():
if key in ele:
ele = ele.replace(key, f"{{{key}}}")
else:
ele = str(ele)
lanuch_grid[i] = ele
if len(lanuch_grid) < 3:
lanuch_grid += ["1"] * (3 - len(lanuch_grid))
lanuch_grid = ",".join(lanuch_grid)
op_dict = {"op_name": op_name, "reset_zero_when_tune": ""}
op_dict["triton_kernel_args"] = ",".join(
modified_arg_exclude_constexpr)
op_dict["key"] = ",".join(self.key_args)
# when tuning, we need to reset the output to zero.
if "reset_zero_when_tune" in other_config.keys():
op_dict["reset_zero_when_tune"] = other_config[
"reset_zero_when_tune"]
paddle_custom_op_file_path = f"{generated_dir}/{op_name}.cu"
so_path = find_so_path(generated_dir, python_package_name)
if so_path is None:
print("== we do not find so_path, we need to compile it")
with open(paddle_custom_op_file_path, "w") as f:
f.write(
SubstituteTemplate(
self.custom_op_template,
op_dict,
))
f.close()
# ahead of time compile command.
aot_template = (
f"""{python_path} {compile_file} {py_script_file} """ +
f""" -n {func.__name__} -o {generated_dir}/{op_name}_kernel """
+ f"""--out-name {op_name}_kernel """ +
""" -w {num_warps} -ns {num_stages} """ +
f""" -s"{address_hint} {value_hint} {const_args}" """ +
f""" -g "{lanuch_grid}" """)
all_tune_config = list(self.tune_config)
if len(all_tune_config) == 0:
# when user do not specify config, we use const_hint_dict as config.
all_tune_config = [const_hint_dict]
# reset const_hint_dict as empty.
const_hint_dict = {}
codegen_commands = []
for config in all_tune_config:
for key in const_hint_dict.keys():
if const_hint_dict[key] is not None:
if key not in config.keys():
config[key] = const_hint_dict[key]
else:
if config[key] == const_hint_dict[key]:
pass
else:
message = (
f"you specify {key} both in arguments and config, "
"and they are not same, this is wrong."
)
raise ValueError(message)
else:
assert key in config.keys(
), f"you must specify {key} in your config."
if "num_warps" not in config.keys():
config["num_warps"] = 4
if "num_stages" not in config.keys():
config["num_stages"] = 4
for key in config:
assert config[
key] is not None, f"{key} must be specified."
codegen_command = aot_template.format(**config, )
print(codegen_command)
codegen_commands.append(codegen_command)
multi_process_do(codegen_commands)
link_command = (
f"{python_path} {link_file} "
f"{generated_dir}/*.h -o {generated_dir}/{op_name}_kernel")
re = os.system(link_command)
assert re == 0
# rename the .c file to .cu
rename_c_to_cu(generated_dir)
# build the package to so, not install
build_package(generated_dir, python_package_name)
if op_name not in OpProtoHolder.instance().op_proto_map.keys():
so_path = find_so_path(generated_dir, python_package_name)
print("== we find so_path: ", so_path)
assert so_path is not None
paddle.utils.cpp_extension.load_op_meta_info_and_register_op(
so_path)
self.decorator = decorator
def __getitem__(self, op_name_and_grid):
"""
override the operator [], which will call the decorator function.
Args:
op_name_and_grid: the name of the operator and the grid size.
Returns:
the decorator function.
"""
assert len(op_name_and_grid) >= 3, "len(op_name_and_grid) must >= 3."
self.op_name = op_name_and_grid[0]
self.custom_op_template = op_name_and_grid[1]
self.grid = op_name_and_grid[2]
if len(op_name_and_grid) == 3:
self.tune_config = {}
else:
self.tune_config = op_name_and_grid[3]
return self.decorator
def paddle_use_triton(other_config={}, key=[]):
"""
Decorator factory that registers a Triton kernel as a Paddle custom operator.
Args:
other_config: extra options (e.g. reset_zero_when_tune).
key: expressions used to build the tuning cache key.
Returns:
a decorator that wraps the kernel in a KernelInterface.
"""
def decorator(func):
"""
The decorator function that wraps the original function.
Args:
func: the original function.
Returns:
the wrapped function.
"""
return KernelInterface(func, other_config, key)
return decorator

View File

@@ -0,0 +1,549 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
import triton.language as tl
from paddle import _C_ops
from paddle.base.framework import OpProtoHolder
from paddle.framework import in_dynamic_or_pir_mode
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
get_dtype_str, paddle_use_triton, rendering_common_template)
BLOCK_SIZE_M = 16
def invoke_fused_moe_kernel(
A,
B,
C,
B_scale,
B_super_scale,
B_code_scale,
B_code_zp,
topk_weights,
topk_ids,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
mul_routed_weight=False,
top_k=-1,
group_size=-1,
):
"""
Invoke Fused Moe Kernel
"""
KK = A.shape[-1]
NN = B.shape[-1]
sstride_am, sstride_ak = A.shape[1], 1
sstride_be, sstride_bk, sstride_bn = B.shape[1] * B.shape[2], B.shape[2], 1
sstride_cm, sstride_cn = C.shape[-1], 1
sstride_bse, sstride_bsk, sstride_bsn = B_scale.shape[1] * B_scale.shape[
2], B_scale.shape[2], 1
sstride_bce, sstride_bck, sstride_bcn = B_code_scale.shape[1], 1, 1
ddouble_quant = B_super_scale is not None
prepare_attr_for_triton_kernel = """
auto N = B.shape()[2];
auto K = A.shape()[1];
auto EM = sorted_token_ids.shape()[0];
auto num_valid_tokens = (topk_ids.shape()[0]) * (topk_ids.shape()[1]);
auto stride_am = A.strides()[0];
auto stride_ak = A.strides()[1];
auto stride_be = B.strides()[0];
auto stride_bk = B.strides()[1];
auto stride_bn = B.strides()[2];
auto stride_cm = C.strides()[1];
auto stride_cn = C.strides()[2];
auto stride_bse = B_scale.strides()[0];
auto stride_bsk = B_scale.strides()[1];
auto stride_bsn = 1;
auto stride_bce = B_code_scale.strides()[0];
auto stride_bck = 1;
auto stride_bcn = 1;
auto double_quant = true;
"""
if mul_routed_weight:
config = {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"GROUP_SIZE_M": 2,
"num_warps": 4,
"num_stages": 8,
}
else:
config = {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 512,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 12,
}
configs = []
configs.append(dict(config))
op_name = "wint2_moe_ffn"
op_name += f"{get_dtype_str(A.dtype)}"
op_name += f"{B.shape[0]}"
op_name += f"{B.shape[1]}"
op_name += f"{B.shape[2]}"
if op_name not in OpProtoHolder.instance().op_proto_map.keys():
prepare_ptr_for_triton_kernel = """
CUdeviceptr input_ptrs[11] = {
get_tensor_ptr(A),
get_tensor_ptr(B),
get_tensor_ptr(C),
get_tensor_ptr(B_scale),
get_tensor_ptr(B_super_scale),
get_tensor_ptr(B_code_scale),
get_tensor_ptr(B_code_zp),
get_tensor_ptr(topk_weights),
get_tensor_ptr(sorted_token_ids),
get_tensor_ptr(expert_ids),
get_tensor_ptr(num_tokens_post_padded),
};
"""
template_used = rendering_common_template(
invoke_fused_moe_kernel,
prepare_attr_for_triton_kernel,
prepare_ptr_for_triton_kernel,
)
grid = (
"(EM+BLOCK_SIZE_M-1)/BLOCK_SIZE_M * ((N+BLOCK_SIZE_N-1)/BLOCK_SIZE_N)",
)
moe_wint2_ffn_kernel[(op_name, template_used, grid, configs)](
A,
B,
C,
B_scale,
B_super_scale,
B_code_scale,
B_code_zp,
topk_weights,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
NN,
KK,
-1, #EEM,
-1, #nnum_valid_tokens,
sstride_am,
sstride_ak,
sstride_be,
sstride_bk,
sstride_bn,
sstride_cm,
sstride_cn,
sstride_bse,
sstride_bsk,
sstride_bsn,
sstride_bce,
sstride_bck,
sstride_bcn,
MUL_ROUTED_WEIGHT=(int)(mul_routed_weight),
USE_DOUBLE_QUANT=(int)(ddouble_quant),
top_k=top_k,
BLOCK_SIZE_K=group_size,
)
if in_dynamic_or_pir_mode():
outs = _C_ops._run_custom_op(
op_name,
A,
B,
C,
B_scale,
B_super_scale,
B_code_scale,
B_code_zp,
topk_weights,
topk_ids,
sorted_token_ids,
expert_ids,
num_tokens_post_padded,
mul_routed_weight,
top_k,
group_size,
)
return outs[0]
@paddle_use_triton(key=["1"], )
def moe_wint2_ffn_kernel(
# Pointers to matrices
a_ptr,
b_ptr,
c_ptr,
bs_ptr,
superbs_ptr,
codebs_ptr,
codebzp_ptr,
topk_weights_ptr,
sorted_token_ids_ptr,
expert_ids_ptr,
num_tokens_post_padded_ptr,
# Matrix dimensions
N,
K,
EM,
num_valid_tokens,
# The stride variables represent how much to increase the ptr by when
# moving by 1 element in a particular dimension. E.g. `stride_am` is
# how much to increase `a_ptr` by to get the element one row down
# (A has M rows).
stride_am,
stride_ak,
stride_be,
stride_bk,
stride_bn,
stride_cm,
stride_cn,
stride_bse,
stride_bsk,
stride_bsn,
stride_bce,
stride_bck,
stride_bcn,
# Meta-parameters
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr,
MUL_ROUTED_WEIGHT: tl.constexpr,
USE_DOUBLE_QUANT: tl.constexpr,
top_k: tl.constexpr,
):
"""
Implements the fused computation for a Mixture of Experts (MOE) using
token and expert matrices.
Key Parameters:
- A: The input tensor representing tokens with shape (*, K), where '*' can
be any shape representing batches and K is the feature dimension of
each token.
- B: The stacked MOE weight tensor with shape (E, N, K), where E is
the number of experts, K is the input feature dimension, and N is
the output feature dimension.
- C: The output cache tensor with shape (M, topk, N), where M is the
total number of tokens post padding, topk is the number of times
each token is repeated, and N is the output feature dimension.
- sorted_token_ids: A tensor containing the sorted indices of tokens,
repeated topk times and arranged by the expert index they are
assigned to.
- expert_ids: A tensor containing the indices of the expert for each
block. It determines which expert matrix from B should be used for
each block in A.
This kernel performs the multiplication of a token by its corresponding
expert matrix as determined by `expert_ids`. The sorting of
`sorted_token_ids` by expert index and padding ensures divisibility by
BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
multiplication across different blocks processed by the same expert.
"""
if USE_DOUBLE_QUANT:
# INT4 scale
s_packnums: tl.constexpr = 2
bzp: tl.constexpr = 32
w_mask: tl.constexpr = 0x3F
pack_num: tl.constexpr = 4
real_k_size: tl.constexpr = (BLOCK_SIZE_K - 1) // pack_num + 1
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
compute_type = c_ptr.dtype.element_ty
num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
return
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
token_mask = offs_token < num_valid_tokens
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
# offs_k = tl.arange(0, BLOCK_SIZE_K)
offs_bk = tl.arange(0, real_k_size)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
offs_bk[None, :] * pack_num * stride_ak)
off_experts = tl.load(expert_ids_ptr + pid_m)
b_ptrs = b_ptr + off_experts * stride_be + (offs_bk[:, None] * stride_bk +
offs_bn[None, :] * stride_bn)
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
bs_ptrs = bs_ptr + off_experts * stride_bse + offs_bn[
None, :] * stride_bsn # group-wise, need advanced
off_set = off_experts * stride_bce + offs_bn[None, :] * stride_bcn
# load channel-wise scale & zero-point
if USE_DOUBLE_QUANT:
superbs_ptrs = superbs_ptr + off_set # channel-wise
super_bs = tl.load(superbs_ptrs) # super scale
codebs_ptrs = codebs_ptr + off_set # channel-wise
code_bs = tl.load(codebs_ptrs) # code scale
codebzp_ptrs = codebzp_ptr + off_set # channel-wise
code_bzp = tl.load(codebzp_ptrs) # code zp
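# Main reduction over K: each packed element of b is decoded into four
# sub-weights (bit shifts 9/6/3/0 with a 6-bit mask), each of which is
# dequantized and multiplied against one of four consecutive K positions of A.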
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
b = tl.load(b_ptrs)
bs = tl.load(bs_ptrs)
if USE_DOUBLE_QUANT:
s_shift_bits = (1 - k % s_packnums) * 4
bs = ((bs >> s_shift_bits) & 0xF) * super_bs
# reverse to int16
b = tl.floor((b.to(tl.float32) * code_bs + code_bzp) + 0.5).to(
tl.int16)
# dequant
b1 = (((b >> 9) & w_mask) - bzp) * bs
a = tl.load(
a_ptrs,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b1.to(a.dtype))
b1 = (((b >> 6) & w_mask) - bzp) * bs
a = tl.load(
a_ptrs + 1,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b1.to(a.dtype))
b1 = (((b >> 3) & w_mask) - bzp) * bs
a = tl.load(
a_ptrs + 2,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b1.to(a.dtype))
b = ((b & w_mask) - bzp) * bs
a = tl.load(
a_ptrs + 3,
mask=token_mask[:, None],
other=0.0,
)
accumulator += tl.dot(a, b.to(a.dtype))
b_ptrs += real_k_size * stride_bk
a_ptrs += BLOCK_SIZE_K * stride_ak
# advance scale ptr
if USE_DOUBLE_QUANT:
bs_ptrs += stride_bsk * (k % s_packnums)
else:
bs_ptrs += stride_bsk
if MUL_ROUTED_WEIGHT:
moe_weight = tl.load(topk_weights_ptr + offs_token,
mask=token_mask,
other=0)
accumulator = accumulator * moe_weight[:, None]
accumulator = accumulator.to(compute_type)
# -----------------------------------------------------------
# Write back the block of the output
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
None, :]
c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
tl.store(c_ptrs, accumulator, mask=c_mask)
def fused_moe_wint2_impl(
hidden_states,
ffn1_quant_weight,
ffn2_quant_weight,
topk_weights,
topk_ids,
# inplace: bool = False,
ffn1_weight_scale=None,
ffn2_weight_scale=None,
ffn1_super_scales=None,
ffn2_super_scales=None,
ffn1_code_scale=None,
ffn2_code_scale=None,
ffn1_code_zp=None,
ffn2_code_zp=None,
group_size=64,
bit="wint2",
):
"""
Implementation of Fused MoE kernels on GPU.
"""
# Check constraints.
# A: [M, K]
# B: [E, K, N]
# assert hidden_states.shape[1] == ffn1_weight_scale.shape[1],
# f"Hidden size mismatch, {hidden_states.shape[1]} != {ffn1_quant_weight.shape[1]}"
assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
assert ffn1_quant_weight.is_contiguous(
), "Expert weights1 must be contiguous"
assert ffn2_quant_weight.is_contiguous(
), "Expert weights2 must be contiguous"
assert group_size > 0, "Group size must be greater than 0"
num_tokens, K = hidden_states.shape
E, _, N = ffn1_quant_weight.shape
M = num_tokens
if group_size < 0:
group_size = K // ffn1_weight_scale.shape[1]
top_k = topk_ids.shape[1]
intermediate_cache1 = paddle.empty(
[M, top_k, N],
dtype=hidden_states.dtype,
)
intermediate_cache2 = paddle.empty(
(M * top_k, N // 2),
dtype=hidden_states.dtype,
)
intermediate_cache3 = paddle.empty(
(M, top_k, K),
dtype=hidden_states.dtype,
)
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, E, BLOCK_SIZE_M)
invoke_fused_moe_kernel(
A=hidden_states,
B=ffn1_quant_weight,
C=intermediate_cache1,
B_scale=ffn1_weight_scale,
B_super_scale=ffn1_super_scales,
B_code_scale=ffn1_code_scale,
B_code_zp=ffn1_code_zp,
topk_weights=topk_weights,
topk_ids=topk_ids,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
mul_routed_weight=False,
top_k=top_k,
group_size=group_size,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1.reshape([-1, N]))
invoke_fused_moe_kernel(
A=intermediate_cache2,
B=ffn2_quant_weight,
C=intermediate_cache3,
B_scale=ffn2_weight_scale,
B_super_scale=ffn2_super_scales,
B_code_scale=ffn2_code_scale,
B_code_zp=ffn2_code_zp,
topk_weights=topk_weights,
topk_ids=topk_ids,
sorted_token_ids=sorted_token_ids,
expert_ids=expert_ids,
num_tokens_post_padded=num_tokens_post_padded,
mul_routed_weight=True,
top_k=1,
group_size=group_size,
)
out_hidden_states = paddle.sum(intermediate_cache3, axis=1)
return out_hidden_states
def fused_moe_wint2_triton(
hidden_states,
ffn1_quant_weight,
ffn2_quant_weight,
scores,
gate_correction_bias,
topk,
ffn1_weight_scale,
ffn2_weight_scale,
ffn1_super_scales,
ffn2_super_scales,
ffn1_code_scale,
ffn2_code_scale,
ffn1_code_zp,
ffn2_code_zp,
):
"""
Fuse MoE with WINT2 quantization scheme and Triton backend.
Args:
hidden_states: input tensor.
ffn1_quant_weight: ffn1 weight matrix for experts.
ffn2_quant_weight: ffn2 weight matrix for experts.
scores: gate scores.
gate_correction_bias: bias correction for gates.
topk: number of experts to use.
ffn1_weight_scale: scaling factor for ffn1_quant_weight.
ffn2_weight_scale: scaling factor for ffn2_quant_weight.
ffn1_super_scales: super scaling factor for ffn1_weight_scale.
ffn2_super_scales: super scaling factor for ffn2_weight_scale.
ffn1_code_scale: code scaling factor for ffn1_quant_weight.
ffn2_code_scale: code scaling factor for ffn2_quant_weight.
ffn1_code_zp: code zero point for ffn1_quant_weight.
ffn2_code_zp: code zero point for ffn2_quant_weight.
Returns:
output tensor.
"""
score = gate_correction_bias + scores
_, topk_ids = paddle.topk(score, k=topk, axis=-1)
topk_weights, _ = paddle.topk(scores, k=topk, axis=-1)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
return fused_moe_wint2_impl(
hidden_states,
ffn1_quant_weight,
ffn2_quant_weight,
topk_weights,
topk_ids,
ffn1_weight_scale,
ffn2_weight_scale,
ffn1_super_scales,
ffn2_super_scales,
ffn1_code_scale,
ffn2_code_scale,
ffn1_code_zp,
ffn2_code_zp,
bit="wint2",
)

View File

@@ -17,26 +17,34 @@ from typing import Dict, Optional
import paddle
from fastdeploy.model_executor.ops.gpu import (get_padding_offset, save_output,
save_output_dynamic,
set_stop_value_multi_ends,
set_stop_value_multi_seqs,
speculate_get_padding_offset,
step_paddle, update_inputs)
from fastdeploy.engine.config import SpeculativeConfig
from fastdeploy.model_executor.ops.gpu import (
get_padding_offset, save_output, set_stop_value_multi_ends,
speculate_clear_accept_nums, speculate_get_output_padding_offset,
speculate_get_padding_offset, speculate_get_seq_lens_output,
speculate_save_output, speculate_set_value_by_flags_and_idx,
speculate_step_paddle, speculate_step_system_cache, speculate_update_v3,
step_paddle, step_system_cache, update_inputs)
from fastdeploy.platforms import current_platform
from fastdeploy.worker.output import ModelOutputData
def pre_process(max_len: int, input_ids: paddle.Tensor,
seq_lens_this_time: int, use_speculate_method: bool,
draft_tokens: Optional[paddle.Tensor],
seq_lens_encoder: Optional[paddle.Tensor]):
def pre_process(
max_len: int,
input_ids: paddle.Tensor,
seq_lens_this_time: int,
speculative_decoding: bool,
draft_tokens: Optional[paddle.Tensor] = None,
seq_lens_encoder: Optional[paddle.Tensor] = None,
seq_lens_decoder: Optional[paddle.Tensor] = None,
):
"""
Preprocessing before embedding.
Args:
max_len: maximum padded sequence length in the batch
input_ids: input token ids of the batch
seq_lens_this_time: number of tokens to process for each sequence in this step
use_speculate_method:
speculative_decoding: whether speculative decoding is enabled
draft_tokens: draft tokens proposed by the speculative decoder
seq_lens_encoder: per-sequence encoder (prefill) lengths
seq_lens_decoder: per-sequence decoder lengths
Return:
@@ -49,7 +57,9 @@ def pre_process(max_len: int, input_ids: paddle.Tensor,
# Remove padding
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
if use_speculate_method:
output_padding_offset = None
output_cum_offsets = None
if speculative_decoding:
(
ids_remove_padding,
cum_offsets,
@@ -64,6 +74,19 @@ def pre_process(max_len: int, input_ids: paddle.Tensor,
seq_lens_this_time,
seq_lens_encoder,
)
seq_lens_output = speculate_get_seq_lens_output(
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
)
output_token_num = paddle.sum(seq_lens_output)
output_cum_offsets_tmp = paddle.cumsum(max_len - seq_lens_output)
output_padding_offset, output_cum_offsets = speculate_get_output_padding_offset(
output_cum_offsets_tmp,
output_token_num,
seq_lens_output,
max_len,
)
else:
(
ids_remove_padding,
@@ -73,16 +96,14 @@ def pre_process(max_len: int, input_ids: paddle.Tensor,
cu_seqlens_k,
) = get_padding_offset(input_ids, cum_offsets_now, token_num,
seq_lens_this_time)
return (
ids_remove_padding,
cum_offsets,
padding_offset,
cu_seqlens_q,
cu_seqlens_k,
)
return (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q,
cu_seqlens_k, output_cum_offsets, output_padding_offset)
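# Editor's sketch, not part of the original file: the offset math used in pre_process, with
# toy values. For a batch padded to max_len, cumsum(max_len - seq_lens) counts the padding
# tokens to skip when packing the batch into a padding-free token stream.
def _demo_padding_offsets():
    import paddle
    max_len = 8
    seq_lens_this_time = paddle.to_tensor([3, 5, 2])
    cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)   # [5, 8, 14]
    token_num = paddle.sum(seq_lens_this_time)                      # 10 packed tokens
    return cum_offsets_now, token_num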
def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
def post_process_normal(sampled_token_ids: paddle.Tensor,
model_output: ModelOutputData,
save_each_rank: bool = False,
skip_save_output: bool = False) -> None:
""" Post-processing steps after completing a single token generation. """
# 1. Set stop value
paddle.assign(
@@ -99,27 +120,11 @@ def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
paddle.logical_or(model_output.stop_flags, length_cond),
model_output.stop_flags,
)
if model_output.use_stop_seqs:
set_stop_value_multi_seqs(
tokens,
model_output.pre_ids,
model_output.step_idx,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.stop_seqs,
model_output.stop_seqs_len,
model_output.eos_token_id,
)
else:
set_stop_value_multi_ends(
tokens,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.eos_token_id,
model_output.next_tokens,
False,
) # multi ends
# TODO(gongshaotian): Add use_stop_seqs
set_stop_value_multi_ends(sampled_token_ids, model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.eos_token_id,
model_output.next_tokens, False) # multi ends
# 2. Update the input buffer of the model
with paddle.framework._no_check_dy2st_diff():
@@ -131,57 +136,223 @@ def post_process(tokens: paddle.Tensor, model_output: ModelOutputData) -> None:
model_output.seq_lens_decoder,
model_output.input_ids,
model_output.stop_nums,
tokens,
sampled_token_ids,
model_output.is_block_step,
)
# 3. Transmit the model's output and stop generation signal via message queue.
# In the future, we will abandon this approach.
if model_output.output_via_mq:
if model_output.msg_queue_id is None:
save_output(
tokens,
model_output.not_need_stop,
model_output.mp_rank,
model_output.use_ep,
)
else:
save_output_dynamic(
tokens,
model_output.not_need_stop,
model_output.mp_rank,
model_output.msg_queue_id,
model_output.gpt.use_ep,
)
if not skip_save_output:
save_output(
sampled_token_ids,
model_output.not_need_stop,
model_output.mp_rank,
save_each_rank, # save_each_rank
)
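# Editor's sketch, not part of the original file: post_process_normal combines two stop
# conditions -- the length limit (length_cond above) and, roughly, hitting an EOS token
# (set_stop_value_multi_ends). A plain-paddle picture of that logic with toy values:
def _demo_stop_flags():
    import paddle
    sampled = paddle.to_tensor([2, 7, 2])            # next token per sequence
    eos_token_id = 2
    step_idx = paddle.to_tensor([5, 9, 10])
    max_dec_len = paddle.to_tensor([10, 10, 10])
    stop_flags = paddle.to_tensor([False, False, False])
    length_cond = paddle.greater_equal(step_idx, max_dec_len)
    stop_flags = paddle.logical_or(stop_flags, length_cond)
    stop_flags = paddle.logical_or(stop_flags, sampled == eos_token_id)
    return stop_flags                                 # [True, False, True]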
def post_process_speculate(model_output, skip_save_output: bool = False):
""" Post-processing for speculative decoding: update accept/draft state and save accepted tokens. """
speculate_update_v3(
model_output.seq_lens_encoder,
model_output.seq_lens_decoder,
model_output.not_need_stop,
model_output.draft_tokens,
model_output.actual_draft_token_num,
model_output.accept_tokens,
model_output.accept_num,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.is_block_step,
model_output.stop_nums,
)
if not skip_save_output:
speculate_save_output(
model_output.accept_tokens,
model_output.accept_num,
model_output.not_need_stop,
model_output.mp_rank,
False,
)
speculate_clear_accept_nums(model_output.accept_num,
model_output.seq_lens_decoder)
# Update pre_ids through accept tokens
speculate_set_value_by_flags_and_idx(
model_output.pre_ids,
model_output.accept_tokens,
model_output.accept_num,
model_output.stop_flags,
model_output.seq_lens_this_time,
model_output.seq_lens_encoder,
model_output.seq_lens_decoder,
model_output.step_idx,
)
def step_cuda(share_inputs: Dict[str, paddle.Tensor], block_size: int,
enc_dec_block_num: int) -> None:
def post_process(sampled_token_ids: paddle.Tensor,
model_output: ModelOutputData,
save_each_rank: bool = False,
speculative_decoding: bool = False,
skip_save_output: bool = False) -> None:
""" Post-processing steps after completing a single token generation. """
if speculative_decoding:
post_process_speculate(model_output, skip_save_output)
else:
post_process_normal(sampled_token_ids, model_output, save_each_rank,
skip_save_output)
def step_cuda(
share_inputs: Dict[str, paddle.Tensor],
block_size: int,
enc_dec_block_num: int,
speculative_config: SpeculativeConfig,
enable_prefix_caching: bool = False,
) -> None:
"""
TODO(gongshaotian): normalization name
"""
step_paddle(
share_inputs["stop_flags"],
share_inputs["seq_lens_this_time"],
share_inputs["step_seq_lens_encoder"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs["block_tables"],
share_inputs["encoder_block_lens"],
share_inputs["is_block_step"],
share_inputs["step_block_list"],
share_inputs["step_lens"],
share_inputs["recover_block_list"],
share_inputs["recover_lens"],
share_inputs["need_block_list"],
share_inputs["need_block_len"],
share_inputs["used_list_len"],
share_inputs["free_list"],
share_inputs["free_list_len"],
share_inputs["input_ids"],
share_inputs["pre_ids"],
share_inputs["step_idx"],
share_inputs["next_tokens"],
share_inputs["first_token_ids"],
block_size,
enc_dec_block_num,
)
if speculative_config.method is not None:
if enable_prefix_caching:
speculate_step_system_cache(
share_inputs['stop_flags'],
share_inputs["seq_lens_this_time"],
share_inputs['step_seq_lens_encoder'],
share_inputs['step_seq_lens_decoder'],
share_inputs['seq_lens_encoder'],
share_inputs['seq_lens_decoder'],
share_inputs["block_tables"],
share_inputs['encoder_block_lens'],
share_inputs["is_block_step"],
share_inputs['step_block_list'],
share_inputs['step_lens'],
share_inputs['recover_block_list'],
share_inputs['recover_lens'],
share_inputs['need_block_list'],
share_inputs['need_block_len'],
share_inputs['used_list_len'],
share_inputs['free_list'],
share_inputs['free_list_len'],
share_inputs['input_ids'],
share_inputs['pre_ids'],
share_inputs['step_idx'],
share_inputs['next_tokens'],
share_inputs['first_token_ids'],
share_inputs["accept_num"],
block_size,
enc_dec_block_num,
speculative_config.num_speculative_tokens,
)
else:
speculate_step_paddle(
share_inputs['stop_flags'],
share_inputs["seq_lens_this_time"],
share_inputs['step_seq_lens_encoder'],
share_inputs['seq_lens_encoder'],
share_inputs['seq_lens_decoder'],
share_inputs["block_tables"],
share_inputs['encoder_block_lens'],
share_inputs["is_block_step"],
share_inputs['step_block_list'],
share_inputs['step_lens'],
share_inputs['recover_block_list'],
share_inputs['recover_lens'],
share_inputs['need_block_list'],
share_inputs['need_block_len'],
share_inputs['used_list_len'],
share_inputs['free_list'],
share_inputs['free_list_len'],
share_inputs['input_ids'],
share_inputs['pre_ids'],
share_inputs['step_idx'],
share_inputs['next_tokens'],
share_inputs['first_token_ids'],
share_inputs["accept_num"],
block_size,
enc_dec_block_num,
speculative_config.num_speculative_tokens,
)
else:
if enable_prefix_caching:
step_system_cache(
share_inputs["stop_flags"], share_inputs["seq_lens_this_time"],
share_inputs["step_seq_lens_encoder"],
share_inputs["step_seq_lens_decoder"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"], share_inputs["block_tables"],
share_inputs["encoder_block_lens"],
share_inputs["is_block_step"], share_inputs["step_block_list"],
share_inputs["step_lens"], share_inputs["recover_block_list"],
share_inputs["recover_lens"], share_inputs["need_block_list"],
share_inputs["need_block_len"], share_inputs["used_list_len"],
share_inputs["free_list"], share_inputs["free_list_len"],
share_inputs["input_ids"], share_inputs["pre_ids"],
share_inputs["step_idx"], share_inputs["next_tokens"],
share_inputs["first_token_ids"], block_size, enc_dec_block_num)
else:
step_paddle(
share_inputs["stop_flags"],
share_inputs["seq_lens_this_time"],
share_inputs["step_seq_lens_encoder"],
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs["block_tables"],
share_inputs["encoder_block_lens"],
share_inputs["is_block_step"],
share_inputs["step_block_list"],
share_inputs["step_lens"],
share_inputs["recover_block_list"],
share_inputs["recover_lens"],
share_inputs["need_block_list"],
share_inputs["need_block_len"],
share_inputs["used_list_len"],
share_inputs["free_list"],
share_inputs["free_list_len"],
share_inputs["input_ids"],
share_inputs["pre_ids"],
share_inputs["step_idx"],
share_inputs["next_tokens"],
share_inputs["first_token_ids"],
block_size,
enc_dec_block_num,
)
def rebuild_padding(tmp_out: paddle.Tensor,
cum_offsets: paddle.Tensor,
seq_len_this_time: paddle.Tensor,
seq_lens_decoder: paddle.Tensor,
seq_lens_encoder: paddle.Tensor,
output_padding_offset: Optional[paddle.Tensor] = None,
max_input_length: Optional[int] = None):
"""
Args:
Returns:
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import rebuild_padding
hidden_states = rebuild_padding(
tmp_out,
cum_offsets,
seq_len_this_time,
seq_lens_decoder,
seq_lens_encoder,
output_padding_offset,
max_input_length,
)
elif current_platform.is_cpu():
from fastdeploy.model_executor.ops.cpu import rebuild_padding_cpu
hidden_states = rebuild_padding_cpu(
tmp_out,
cum_offsets,
seq_len_this_time,
seq_lens_decoder,
seq_lens_encoder,
output_padding_offset,
max_input_length,
)
else:
raise RuntimeError("Unsupported platform")
return hidden_states
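# Editor's sketch, not part of the original file: a guess at the intent of rebuild_padding
# based on its argument names -- recovering one hidden state per sequence from the packed
# [token_num, hidden] tensor. The real op is a custom GPU/CPU kernel; this is only a
# conceptual stand-in using plain paddle ops.
def _demo_rebuild_padding_concept():
    import paddle
    hidden = 4
    seq_lens = paddle.to_tensor([3, 5, 2], dtype='int64')
    packed = paddle.rand([int(seq_lens.sum()), hidden])           # packed forward output
    starts = paddle.concat([paddle.zeros([1], dtype='int64'),
                            paddle.cumsum(seq_lens)[:-1]])        # [0, 3, 8]
    last_idx = starts + seq_lens - 1                               # [2, 7, 9]
    return paddle.gather(packed, last_idx, axis=0)                 # [3, hidden]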