polish code with new pre-commit rule (#2923)

Zero Rains
2025-07-19 23:19:27 +08:00
committed by GitHub
parent b8676d71a8
commit 25698d56d1
424 changed files with 14307 additions and 13518 deletions


@@ -22,24 +22,29 @@ __all__ = []
if current_platform.is_xpu():
from . import xpu
from .xpu import *
if hasattr(xpu, '__all__'):
# fix: F403 `from .xpu import *` used; unable to detect undefined names
if hasattr(xpu, "__all__"):
globals().update({name: getattr(xpu, name) for name in xpu.__all__})
__all__.extend(xpu.__all__)
if current_platform.is_npu():
from . import npu
from .npu import *
if hasattr(npu, '__all__'):
if hasattr(npu, "__all__"):
globals().update({name: getattr(npu, name) for name in npu.__all__})
__all__.extend(npu.__all__)
if current_platform.is_gcu():
from . import gcu
from .gcu import *
if hasattr(gcu, '__all__'):
if hasattr(gcu, "__all__"):
globals().update({name: getattr(gcu, name) for name in gcu.__all__})
__all__.extend(gcu.__all__)
if current_platform.is_dcu():
from .dcu import *
from . import dcu
if hasattr(dcu, '__all__'):
__all__.extend(dcu.__all__)
if hasattr(dcu, "__all__"):
globals().update({name: getattr(dcu, name) for name in dcu.__all__})
__all__.extend(dcu.__all__)

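The hunk above settles on a guarded re-export idiom: star-import the backend for its side effects, then copy only the names the submodule declares in __all__ into the package namespace, which is explicit enough to silence Ruff's F403. The following is a standalone sketch of that pattern, not part of the diff; the module and package names are hypothetical.

import importlib

__all__ = []

def _reexport_backend(package: str, module_name: str) -> None:
    # Import a platform backend and re-export only its declared public names,
    # mirroring the hasattr(mod, "__all__") guard used in the diff above.
    mod = importlib.import_module(f".{module_name}", package=package)
    if hasattr(mod, "__all__"):
        globals().update({name: getattr(mod, name) for name in mod.__all__})
        __all__.extend(mod.__all__)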

@@ -19,4 +19,4 @@ dcu backend methods
from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod
from .weight_only import DCUWeightOnlyLinearMethod
__all__ = ['DCUTritonWeightOnlyMoEMethod', 'DCUWeightOnlyLinearMethod']
__all__ = ["DCUTritonWeightOnlyMoEMethod", "DCUWeightOnlyLinearMethod"]


@@ -17,10 +17,8 @@
import paddle
from paddle import nn
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
from fastdeploy.utils import ceil_div
@@ -36,7 +34,8 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
self.quant_method = quant_method
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
self.added_scale_attrs = [
"up_gate_proj_weight_scale", "down_proj_weight_scale"
"up_gate_proj_weight_scale",
"down_proj_weight_scale",
]
def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
@@ -52,10 +51,12 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
assert len(down_proj_weights) == layer.num_local_experts
assert self.quant_method.name() == "wint8"
assert up_gate_proj_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
layer.hidden_size,
layer.moe_intermediate_size * 2,
]
assert down_proj_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
layer.moe_intermediate_size,
layer.hidden_size,
]
up_gate_proj_tensor = paddle.stack(up_gate_proj_weights, axis=0)
@@ -71,26 +72,29 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
scale_name = self.added_scale_attrs[idx]
quanted_weight_scale = weight_tensor.abs().max(axis=1)
quanted_weight = weight_tensor / quanted_weight_scale[:,
None, :] * max_bound
quanted_weight = weight_tensor / quanted_weight_scale[:, None, :] * max_bound
quanted_weight = paddle.round(quanted_weight).astype("int8")
quanted_weight_scale = quanted_weight_scale / max_bound
setattr(
layer, weight_name,
layer,
weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
),
)
getattr(layer, weight_name).set_value(quanted_weight)
setattr(
layer, scale_name,
layer,
scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
),
)
getattr(layer, scale_name).set_value(quanted_weight_scale)
def apply(
@@ -112,10 +116,7 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
scores = paddle.nn.functional.softmax(gate_out, axis=-1)
scores += layer.gate_correction_bias
topk_weights, topk_ids = paddle.topk(scores,
k=top_k,
axis=-1,
sorted=False)
topk_weights, topk_ids = paddle.topk(scores, k=top_k, axis=-1, sorted=False)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
intermediate_cache1 = paddle.empty(
@@ -140,11 +141,15 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
from .triton_moe_kernels import fused_moe_kernel_paddle
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
topk_ids, num_local_experts, config["BLOCK_SIZE_M"]
)
max_num_tokens_padded = sorted_token_ids.shape[0]
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )
grid = (
ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"])
* ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]),
)
fused_moe_kernel_paddle[grid](
x,
@@ -188,11 +193,11 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1)
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
grid = (
ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),
)
fused_moe_kernel_paddle[grid](
intermediate_cache2,
layer.down_proj_weight,

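For reference, the wint8 path reformatted above quantizes each expert's weight with a per-output-channel abs-max scale (scale over the K axis, then round to int8 with max_bound = 127). A minimal NumPy sketch of the same scheme; shapes and names are illustrative, not the layer's real attributes.

import numpy as np

def absmax_int8_quant(w: np.ndarray, max_bound: float = 127.0):
    # w: [num_experts, K, N]; the scale is the abs-max over K, so it has
    # shape [num_experts, N], matching quanted_weight_scale above.
    scale = np.abs(w).max(axis=1)
    q = np.round(w / scale[:, None, :] * max_bound).astype(np.int8)
    return q, scale / max_bound

w = np.random.randn(2, 8, 4).astype(np.float32)
q, s = absmax_int8_quant(w)
reconstructed = q.astype(np.float32) * s[:, None, :]
print(np.abs(w - reconstructed).max())  # small quantization error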

@@ -29,7 +29,6 @@ def fused_moe_kernel_paddle(
sorted_token_ids_ptr,
expert_ids_ptr,
num_tokens_post_padded_ptr,
# Matrix dimensions
N,
K,
@@ -108,16 +107,13 @@ def fused_moe_kernel_paddle(
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
offs_k[None, :] * stride_ak)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak)
off_experts = tl.load(expert_ids_ptr + pid_m)
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
offs_bn[None, :] * stride_bn)
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
if use_int8_w8a16:
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
None, :] * stride_bsn
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
b_scale = tl.load(b_scale_ptrs)
if use_fp8_w8a8:
@@ -139,19 +135,14 @@ def fused_moe_kernel_paddle(
mask=token_mask[:, None],
other=0.0,
)
b = tl.load(b_ptrs,
cache_modifier=".cv",
eviction_policy='evict_first')
b = tl.load(b_ptrs, cache_modifier=".cv", eviction_policy="evict_first")
else:
a = tl.load(
a_ptrs,
mask=token_mask[:, None] &
(offs_k[None, :] < K - k * BLOCK_SIZE_K),
mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
other=0.0,
)
b = tl.load(b_ptrs,
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
other=0.0)
b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
# We accumulate along the K dimension.
if use_int8_w8a16:
@@ -160,13 +151,14 @@ def fused_moe_kernel_paddle(
if group_k > 0 and group_n > 0:
k_start = k * BLOCK_SIZE_K
offs_ks = k_start // group_k
a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
mask=token_mask,
other=0.0)
a_scale = tl.load(
a_scale_ptrs + offs_ks * stride_ask,
mask=token_mask,
other=0.0,
)
b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
accumulator += tl.dot(a, b) * a_scale[:,
None] * b_scale[None, :]
accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
else:
accumulator = tl.dot(a, b, acc=accumulator)
else:
@@ -176,9 +168,7 @@ def fused_moe_kernel_paddle(
b_ptrs += BLOCK_SIZE_K * stride_bk
if MUL_ROUTED_WEIGHT:
moe_weight = tl.load(topk_weights_ptr + offs_token,
mask=token_mask,
other=0)
moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
accumulator = accumulator * moe_weight[:, None]
if use_int8_w8a16:
accumulator = (accumulator * b_scale).to(compute_type)
@@ -191,8 +181,7 @@ def fused_moe_kernel_paddle(
accumulator = accumulator.to(compute_type)
# Write back the block of the output
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
None, :]
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
tl.store(c_ptrs, accumulator, mask=c_mask)

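The launch grid reshaped in the backend above is one-dimensional: it covers every (M-block, N-block) output tile of the padded token matrix, and each Triton program works on one BLOCK_SIZE_M x BLOCK_SIZE_N tile. A small sketch of that sizing, assuming the usual ceil_div helper ((a + b - 1) // b); the example numbers are made up.

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

def moe_grid_size(max_num_tokens_padded: int, n: int, block_m: int, block_n: int) -> int:
    # One program per (BLOCK_SIZE_M x BLOCK_SIZE_N) output tile.
    return ceil_div(max_num_tokens_padded, block_m) * ceil_div(n, block_n)

# Example: 192 padded tokens, N = moe_intermediate_size * 2 = 1024.
print(moe_grid_size(192, 1024, block_m=32, block_n=64))  # 6 * 16 = 96 programs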

@@ -13,11 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle.nn.quant import weight_dequantize
from fastdeploy.model_executor.layers.quantization.weight_only import (
GPUWeightOnlyLinearMethod, WeightOnlyConfig)
GPUWeightOnlyLinearMethod,
WeightOnlyConfig,
)
class DCUWeightOnlyLinearMethod(GPUWeightOnlyLinearMethod):
@@ -38,7 +41,7 @@ class DCUWeightOnlyLinearMethod(GPUWeightOnlyLinearMethod):
x=layer.weight,
scale=layer.weight_scale,
algo=self.quant_config.algo,
out_dtype=paddle.get_default_dtype()
out_dtype=paddle.get_default_dtype(),
)
linear_out = paddle.matmul(x, dequant_out)
if layer.bias is not None:

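The DCU apply path shown above dequantizes the int8 weight with its per-channel scale and then runs the matmul in the activation dtype. A NumPy approximation of that flow follows; the axis layout is illustrative, and the real paddle.nn.quant.weight_dequantize additionally handles the algo-specific packing.

import numpy as np

def weight_only_linear(x: np.ndarray, qweight: np.ndarray, scale: np.ndarray) -> np.ndarray:
    # x: [M, K] float32, qweight: [K, N] int8, scale: [N] per output channel.
    w = qweight.astype(x.dtype) * scale[None, :]
    return x @ w

x = np.random.randn(3, 16).astype(np.float32)
qw = np.random.randint(-127, 128, size=(16, 8), dtype=np.int8)
scale = np.random.rand(8).astype(np.float32) / 127.0
print(weight_only_linear(x, qw, scale).shape)  # (3, 8)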

@@ -18,14 +18,13 @@ gcu backend methods
from .attention.flash_attn_backend import GCUFlashAttnBackend
from .attention.mem_efficient_attn_backend import GCUMemEfficientAttnBackend
from .moe.fused_moe_method_gcu_backend import (GCUFusedMoeMethod,
GCUWeightOnlyMoEMethod)
from .moe.fused_moe_method_gcu_backend import GCUFusedMoeMethod, GCUWeightOnlyMoEMethod
from .quantization.weight_only import GCUWeightOnlyLinearMethod
__all__ = [
'GCUFlashAttnBackend',
'GCUMemEfficientAttnBackend',
'GCUFusedMoeMethod',
'GCUWeightOnlyMoEMethod',
'GCUWeightOnlyLinearMethod',
"GCUFlashAttnBackend",
"GCUMemEfficientAttnBackend",
"GCUFusedMoeMethod",
"GCUWeightOnlyMoEMethod",
"GCUWeightOnlyLinearMethod",
]


@@ -17,31 +17,33 @@
from __future__ import annotations
import os
from dataclasses import dataclass, field
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional
import paddle
import numpy as np
import paddle
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
AttentionBackend,
AttentionMetadata,
)
if TYPE_CHECKING:
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
mem_efficient_attention,
flash_attn_var_len)
from paddleformers.utils.log import logger
from fastdeploy.model_executor.ops.gcu import flash_attn_var_len, fused_rotary_embedding
@dataclass
class GCUFlashAttnMetadata(AttentionMetadata):
"""
GCUFlashAttnMetadata
"""
forward_mode: ForwardMode = ForwardMode.MIXED
_dtype: paddle.dtype = paddle.bfloat16
@@ -63,15 +65,18 @@ class GCUFlashAttnMetadata(AttentionMetadata):
pre_caches_length: int = 0
class GCUFlashAttnBackend(AttentionBackend):
"""
GCUFlashAttnBackend backend implementation.
"""
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int):
def __init__(
self,
fd_config: FDConfig,
kv_num_heads: int,
num_heads: int,
head_dim: int,
):
"""
GCUFlashAttnBackend __init__
"""
@@ -99,8 +104,6 @@ class GCUFlashAttnBackend(AttentionBackend):
self.rotary_embs = None
self.enable_monitor: bool = bool(os.getenv("FD_GCU_ATTN_MONITOR", False))
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
metadata = GCUFlashAttnMetadata()
@@ -131,15 +134,14 @@ class GCUFlashAttnBackend(AttentionBackend):
self.rotary_embs = metadata.rotary_embs.reshape((-1, self.head_dim))
# some info for attention
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_sum = np.sum(self.seq_lens_this_time_list)
self.max_seq_len_this_time = np.max(self.seq_lens_this_time_list)
num_seqs = forward_meta.seq_lens_this_time.shape[0]
self.is_decoder = all(x[0] == 0 for x in self.seq_lens_encoder_list)
self.is_all_prefill = all(x[0] == 0 for x in self.seq_lens_decoder_list)
@@ -147,8 +149,14 @@ class GCUFlashAttnBackend(AttentionBackend):
if self.all_slot_mapping is None:
max_num_blocks_per_seq = (self.max_seq_len + self.block_size - 1) // self.block_size
total_blocks = max_num_blocks_per_seq * self.max_num_seqs
self.all_block_tables = np.arange(0, total_blocks, dtype=np.int32).reshape((self.max_num_seqs, max_num_blocks_per_seq)).tolist()
self.all_slot_mapping = np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
self.all_block_tables = (
np.arange(0, total_blocks, dtype=np.int32)
.reshape((self.max_num_seqs, max_num_blocks_per_seq))
.tolist()
)
self.all_slot_mapping = (
np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
)
block_tables = []
slot_mapping = []
@@ -157,9 +165,9 @@ class GCUFlashAttnBackend(AttentionBackend):
position_ids = []
for seq_idx in range(num_seqs):
cache_len = None
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
# else: there is no request in this seq_idx
@@ -193,7 +201,6 @@ class GCUFlashAttnBackend(AttentionBackend):
self.max_seqlen_q = self.max_seq_len_this_time
self.max_seqlen_k = np.max(cache_lens)
def get_attntion_meta(self):
"""get_attntion_meta"""
return self.attention_metadata
@@ -206,9 +213,11 @@ class GCUFlashAttnBackend(AttentionBackend):
Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim)
return (
max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim,
)
@paddle.no_grad()
def forward_mixed(
@@ -232,7 +241,6 @@ class GCUFlashAttnBackend(AttentionBackend):
query = query.reshape_((1, -1, self.num_heads, self.head_dim))
key = key.reshape_((1, -1, self.kv_num_heads, self.head_dim))
# 1. Rope
if self.rotary_embs.dtype != query.dtype:
self.rotary_embs = paddle.cast(self.rotary_embs, query.dtype)
@@ -242,7 +250,7 @@ class GCUFlashAttnBackend(AttentionBackend):
key,
self.rotary_embs,
self.position_ids,
layer.use_neox_rotary_style
layer.use_neox_rotary_style,
)
# 2. Save kv cache
@@ -281,4 +289,3 @@ class GCUFlashAttnBackend(AttentionBackend):
)
res = res.reshape_((token_num, -1))
return res

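The block-table and slot-mapping precomputation reformatted above gives every sequence a fixed, contiguous range of KV-cache blocks, each covering block_size slots. The relationship is easier to see in a standalone NumPy sketch; the sizes below are made up.

import numpy as np

max_num_seqs, max_seq_len, block_size = 4, 64, 16
blocks_per_seq = (max_seq_len + block_size - 1) // block_size
total_blocks = max_num_seqs * blocks_per_seq

all_block_tables = np.arange(total_blocks, dtype=np.int32).reshape(max_num_seqs, blocks_per_seq)
all_slot_mapping = np.arange(total_blocks * block_size, dtype=np.int32).reshape(max_num_seqs, -1)

# Token t of sequence s lands in block all_block_tables[s, t // block_size]
# at offset t % block_size, which is exactly slot all_slot_mapping[s, t].
s, t = 2, 21
assert all_slot_mapping[s, t] == all_block_tables[s, t // block_size] * block_size + t % block_size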

@@ -16,33 +16,35 @@
from __future__ import annotations
import os
from dataclasses import dataclass, field
import math
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional
import paddle
import numpy as np
import math
import paddle
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
mem_efficient_attention,
flash_attn_var_len)
from paddleformers.utils.log import logger
AttentionBackend,
AttentionMetadata,
)
from fastdeploy.model_executor.ops.gcu import (
fused_rotary_embedding,
mem_efficient_attention,
)
if TYPE_CHECKING:
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
@dataclass
class GCUMemEfficientAttnMetadata(AttentionMetadata):
"""
GCUMemEfficientAttnMetadata
"""
forward_mode: ForwardMode = ForwardMode.MIXED
_dtype: paddle.dtype = paddle.bfloat16
@@ -63,15 +65,18 @@ class GCUMemEfficientAttnMetadata(AttentionMetadata):
pre_caches_length: int = 0
class GCUMemEfficientAttnBackend(AttentionBackend):
"""
GCUMemEfficientAttnBackend backend implementation.
"""
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int):
def __init__(
self,
fd_config: FDConfig,
kv_num_heads: int,
num_heads: int,
head_dim: int,
):
"""
GCUMemEfficientAttnBackend __init__
"""
@@ -99,8 +104,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
self.rotary_embs = None
self.use_paddle_native_sdpa = False
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
metadata = GCUMemEfficientAttnMetadata()
@@ -125,32 +128,35 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
metadata.pre_caches_length = forward_meta.pre_caches_length # not inited
self.attention_metadata = metadata
if self.rotary_embs is None:
self.rotary_embs = metadata.rotary_embs.reshape((-1, self.head_dim))
# some info for attention
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_sum = np.sum(self.seq_lens_this_time_list)
self.max_seq_len_this_time = np.max(self.seq_lens_this_time_list)
num_seqs = forward_meta.seq_lens_this_time.shape[0]
self.is_decoder = all(x[0] == 0 for x in self.seq_lens_encoder_list)
self.is_all_prefill = all(x[0] == 0 for x in self.seq_lens_decoder_list)
# block_tables and slot_mapping
if self.all_slot_mapping is None:
max_num_blocks_per_seq = (self.max_seq_len + self.block_size - 1) // self.block_size
total_blocks = max_num_blocks_per_seq * self.max_num_seqs
self.all_block_tables = np.arange(0, total_blocks, dtype=np.int32).reshape((self.max_num_seqs, max_num_blocks_per_seq)).tolist()
self.all_slot_mapping = np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
self.all_block_tables = (
np.arange(0, total_blocks, dtype=np.int32)
.reshape((self.max_num_seqs, max_num_blocks_per_seq))
.tolist()
)
self.all_slot_mapping = (
np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
)
block_tables = []
slot_mapping = []
@@ -162,9 +168,9 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
position_ids = []
for seq_idx in range(num_seqs):
cache_len = None
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
# else: there is no request in this seq_idx
@@ -179,9 +185,12 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
position_ids.extend(self.position_ids_base[start:end])
query_lens.append(lens_this_time)
cached_kv_lens.append(end)
cached_kv_slot_range.append([self.all_slot_mapping[seq_idx][0], self.all_slot_mapping[seq_idx][end]])
cached_kv_slot_range.append(
[
self.all_slot_mapping[seq_idx][0],
self.all_slot_mapping[seq_idx][end],
]
)
self.block_tables = paddle.to_tensor(block_tables, dtype="int32")
self.slot_mapping = paddle.to_tensor(slot_mapping, dtype="int32")
@@ -206,7 +215,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
self.cached_kv_lens = cached_kv_lens
self.cached_kv_slot_range = cached_kv_slot_range
def get_attntion_meta(self):
"""get_attntion_meta"""
return self.attention_metadata
@@ -219,9 +227,11 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim)
return (
max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim,
)
@paddle.no_grad()
def forward_mixed(
@@ -245,7 +255,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
query = query.reshape_((1, -1, self.num_heads, self.head_dim))
key = key.reshape_((1, -1, self.kv_num_heads, self.head_dim))
# 1. Rope
if self.rotary_embs.dtype != query.dtype:
self.rotary_embs = paddle.cast(self.rotary_embs, query.dtype)
@@ -255,7 +264,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
key,
self.rotary_embs,
self.position_ids,
layer.use_neox_rotary_style
layer.use_neox_rotary_style,
)
# 2. Save kv cache
@@ -282,9 +291,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
v_ = value_caches[kv_start:kv_end, :, :]
if self.use_paddle_native_sdpa:
res = self.native_sdpa_impl(
q_, k_, v_
)
res = self.native_sdpa_impl(q_, k_, v_)
else:
res = mem_efficient_attention(
query=q_.unsqueeze(0),
@@ -302,7 +309,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
result = result.reshape_((token_num, -1))
return result
def get_triangle_upper_mask(self, shape, dtype):
# [batch_size, 1, q_seq_len, kv_seq_len]
shape[1] = 1
@@ -313,7 +319,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
mask = paddle.triu(mask, diagonal=kv_seq_len - q_seq_len + 1)
return mask
def native_sdpa_impl(self, query, key, value):
# input shape: [num_tokens, num_heads, head_dim] -> [1, num_tokens, num_heads, head_dim]
q = query.unsqueeze(0)
@@ -342,13 +347,9 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
# matmul and devide by sqrt(head_dim)
attn_weights = paddle.matmul(q / math.sqrt(head_dim), k.transpose([0, 1, 3, 2]))
attention_mask = self.get_triangle_upper_mask(
[batch, 1, q_seq_len, kv_seq_len], q.dtype
)
attention_mask = self.get_triangle_upper_mask([batch, 1, q_seq_len, kv_seq_len], q.dtype)
attn_weights = attn_weights + attention_mask
attn_weights = paddle.nn.functional.softmax(
attn_weights, axis=-1, dtype="float32"
).astype(q.dtype)
attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1, dtype="float32").astype(q.dtype)
attn_output = paddle.matmul(attn_weights, v)
attn_output = attn_output.transpose([0, 2, 1, 3])

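The native SDPA fallback condensed above is textbook masked attention: scale Q by 1/sqrt(head_dim), add an upper-triangular mask offset by (kv_len - q_len), softmax, then weight V. A single-head NumPy approximation follows; the -1e4 fill value stands in for the dtype minimum used by get_triangle_upper_mask, and the shapes are illustrative.

import math
import numpy as np

def causal_upper_mask(q_len: int, kv_len: int, neg: float = -1e4) -> np.ndarray:
    # Blocks keys more than (kv_len - q_len) positions ahead of each query.
    return np.triu(np.full((q_len, kv_len), neg, dtype=np.float32), k=kv_len - q_len + 1)

def naive_sdpa(q: np.ndarray, k: np.ndarray, v: np.ndarray) -> np.ndarray:
    # q: [q_len, head_dim]; k, v: [kv_len, head_dim]; one head for brevity.
    scores = (q / math.sqrt(q.shape[-1])) @ k.T + causal_upper_mask(q.shape[0], k.shape[0])
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v

q = np.random.randn(4, 8).astype(np.float32)
kv = np.random.randn(6, 8).astype(np.float32)
print(naive_sdpa(q, kv, kv).shape)  # (4, 8)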

@@ -11,6 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""
""" "
gcu moe
"""


@@ -1,4 +1,3 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
@@ -15,7 +14,6 @@
# limitations under the License.
"""
import multiprocessing
import os
@@ -24,27 +22,30 @@ import paddle
from paddle import nn
from paddleformers.utils.log import logger
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import \
MoEMethodBase
from fastdeploy.model_executor.layers.utils import (CpuGuard,
create_and_set_parameter,
get_tensor)
from fastdeploy.model_executor.ops.gcu import (invoke_fused_moe_kernel,
moe_align_block_size,
topk_softmax,
weight_quantize_custom_rtn,
weight_quantize_rtn)
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
from fastdeploy.model_executor.layers.utils import (
CpuGuard,
create_and_set_parameter,
get_tensor,
)
from fastdeploy.model_executor.ops.gcu import (
invoke_fused_moe_kernel,
moe_align_block_size,
topk_softmax,
weight_quantize_custom_rtn,
weight_quantize_rtn,
)
class GCUFusedMoeMethod(MoEMethodBase):
"""
Use GCU to compute Fused MoE.
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.group_size = -1
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle gcu create weight process.
@@ -53,28 +54,28 @@ class GCUFusedMoeMethod(MoEMethodBase):
up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
for idx, weight_tensor in enumerate(
[stacked_up_gate_proj_weights, stacked_down_proj_weights]):
for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]):
# shape [E, K, N] -> [E, N, K]
weight_tensor = paddle.transpose(weight_tensor, [0, 2, 1])
weight_name = self.added_weight_attrs[idx]
setattr(
layer, weight_name,
layer,
weight_name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
),
)
getattr(layer, weight_name).set_value(weight_tensor)
@paddle.no_grad()
def compute_ffn(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
enable_quant = False
enable_quant=False,
) -> paddle.Tensor:
"""
Paddle gcu compute Fused MoE.
@@ -86,8 +87,17 @@ class GCUFusedMoeMethod(MoEMethodBase):
topk_weights = paddle.empty([token_num, top_k], dtype=gate_out.dtype)
topk_indices = paddle.empty([token_num, top_k], dtype="int32")
token_expert_indices = paddle.empty([token_num, top_k], dtype="int32",)
topk_softmax(topk_weights, topk_indices, token_expert_indices, gate_out, norm_topk_prob=True)
token_expert_indices = paddle.empty(
[token_num, top_k],
dtype="int32",
)
topk_softmax(
topk_weights,
topk_indices,
token_expert_indices,
gate_out,
norm_topk_prob=True,
)
config = {
"BLOCK_SIZE_M": 32,
@@ -136,7 +146,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
top_k,
config,
enable_quant, # use_int4_w4a16
[0, self.group_size], # block_shape
[0, self.group_size], # block_shape
)
intermediate_cache2 = paddle.empty(
@@ -144,8 +154,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
dtype=x.dtype,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1)
intermediate_cache2 = intermediate_cache2.reshape([-1, moe_intermediate_size])
@@ -181,13 +190,14 @@ class GCUFusedMoeMethod(MoEMethodBase):
fused_moe_out = fused_moe_out.reshape_([token_num, hidden_size])
if layer.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.distributed.communication_op import (
tensor_model_parallel_all_reduce,
)
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out
def apply(
self,
layer: nn.Layer,
@@ -199,7 +209,6 @@ class GCUFusedMoeMethod(MoEMethodBase):
"""
return self.compute_ffn(layer, x, gate_out, enable_quant=False)
def apply_ep_prefill(
self,
layer: nn.Layer,
@@ -211,7 +220,6 @@ class GCUFusedMoeMethod(MoEMethodBase):
"""
raise NotImplementedError
def apply_ep_decode(
self,
layer: nn.Layer,
@@ -223,7 +231,6 @@ class GCUFusedMoeMethod(MoEMethodBase):
"""
raise NotImplementedError
def apply_tp(
self,
layer: nn.Layer,
@@ -247,48 +254,44 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
self.moe_quant_type = self.quant_config.algo
self.pack_num = 1
assert self.quant_config.algo == "weight_only_int4", \
"GCUWeightOnlyMoEMethod only support weight_only_int4, but got:{self.quant_config.algo}"
assert (
self.quant_config.algo == "weight_only_int4"
), "GCUWeightOnlyMoEMethod only support weight_only_int4, but got:{self.quant_config.algo}"
self.added_qzeros_attrs = [
"up_gate_proj_weight_zeros", "down_proj_weight_zeros"
"up_gate_proj_weight_zeros",
"down_proj_weight_zeros",
]
self.group_size = 64
self.quant_multi_process_group_size = int(
os.getenv("FD_MOE_QUANT_MULTI_PROCESS_GROUP_SIZE", 8)
)
self.quant_multi_process_group_size = int(os.getenv("FD_MOE_QUANT_MULTI_PROCESS_GROUP_SIZE", 8))
logger.info(f"GCUWeightOnlyMoEMethod quant_multi_process_group_size: {self.quant_multi_process_group_size}")
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Paddle gcu process prequanted weights.
"""
up_gate_proj_expert_weight_key = layer.weight_key_map.get(
"up_gate_proj_expert_weight_key", None)
down_proj_expert_weight_key = layer.weight_key_map.get(
"down_proj_expert_weight_key", None)
up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get(
"up_gate_proj_expert_weight_scale_key", None)
down_proj_expert_weight_scale_key = layer.weight_key_map.get(
"down_proj_expert_weight_scale_key", None)
up_gate_proj_expert_weight_key = layer.weight_key_map.get("up_gate_proj_expert_weight_key", None)
down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None)
up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get("up_gate_proj_expert_weight_scale_key", None)
down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None)
up_gate_proj_weights, down_proj_weights = layer.load_experts_weight(
state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key)
state_dict,
up_gate_proj_expert_weight_key,
down_proj_expert_weight_key,
)
# self.check(layer, up_gate_proj_weights, down_proj_weights)
up_gate_proj_weight_scale = []
down_proj_weight_scale = []
for i in range(layer.num_experts):
expert_idx = layer.expert_id_offset + i
up_gate_proj_weight_scale.append(
get_tensor(
state_dict.pop(
up_gate_proj_expert_weight_scale_key.format(expert_idx))))
get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx)))
)
down_proj_weight_scale.append(
get_tensor(
state_dict.pop(
down_proj_expert_weight_scale_key.format(expert_idx))))
get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx)))
)
up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0)
down_proj_weight = paddle.stack(down_proj_weights, axis=0)
@@ -299,12 +302,11 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
"up_gate_proj_weight": up_gate_proj_weight,
"down_proj_weight": down_proj_weight,
"up_gate_proj_weight_scale": up_gate_proj_weight_scale,
"down_proj_weight_scale": down_proj_weight_scale
"down_proj_weight_scale": down_proj_weight_scale,
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
@paddle.no_grad()
def create_weights(self, layer: nn.Layer, state_dict):
"""
@@ -313,7 +315,6 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, up_gate_proj_weights, down_proj_weights)
def quant_worker(p_group_idx, shared_dict, weights, moe_quant_type, group_size):
with CpuGuard():
p_group_size = len(weights)
@@ -322,13 +323,13 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
quant_weight, scale = weight_quantize_custom_rtn(
weights[group_j],
moe_quant_type,
group_size # group_size
group_size, # group_size
)
shared_dict[p_group_size * p_group_idx + group_j] = (
quant_weight, scale
quant_weight,
scale,
)
for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
@@ -354,7 +355,13 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
p = multiprocessing.Process(
target=quant_worker,
args=(i, shared_dict, w, self.moe_quant_type, self.group_size)
args=(
i,
shared_dict,
w,
self.moe_quant_type,
self.group_size,
),
)
p.start()
processes.append(p)
@@ -376,7 +383,7 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
quant_weight, scale = weight_quantize_rtn(
weight_tensor[i],
self.moe_quant_type,
self.group_size # group_size
self.group_size, # group_size
)
weight_list.append(quant_weight)
weight_scale_list.append(scale)
@@ -389,7 +396,6 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
quanted_weight_zeros = quanted_weight_scale * 8
create_and_set_parameter(layer, zeros_name, quanted_weight_zeros)
def apply(
self,
layer: nn.Layer,

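The routing that compute_ffn delegates to topk_softmax(..., norm_topk_prob=True) can be approximated outside the custom op: softmax over experts, keep the top-k probabilities per token, then renormalize the kept weights to sum to one. A NumPy sketch follows; the real op's tie-breaking and output ordering may differ.

import numpy as np

def topk_softmax_route(gate_logits: np.ndarray, top_k: int):
    # Softmax over the expert axis, numerically stabilized.
    probs = np.exp(gate_logits - gate_logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    topk_ids = np.argsort(-probs, axis=-1)[:, :top_k].astype(np.int32)
    topk_weights = np.take_along_axis(probs, topk_ids, axis=-1)
    topk_weights /= topk_weights.sum(axis=-1, keepdims=True)  # norm_topk_prob=True
    return topk_weights, topk_ids

logits = np.random.randn(5, 8).astype(np.float32)  # 5 tokens, 8 experts
weights, ids = topk_softmax_route(logits, top_k=2)
print(weights.sum(axis=-1))  # each row sums to 1.0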

@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""
""" "
gcu quantization
"""
from .weight_only import GCUWeightOnlyLinearMethod


@@ -17,7 +17,9 @@
import paddle
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
WeightOnlyConfig,
WeightOnlyLinearMethod,
)
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.ops.gcu import linear_quant, weight_quantize_rtn
@@ -35,7 +37,6 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
self.quant_config = quant_config
self.group_size = -1
def create_weights(self, layer):
# The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
weight_scale_shape = [layer.weight_shape[1]]
@@ -50,7 +51,6 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
is_bias=False,
)
def process_prequanted_weights(self, layer, state_dict) -> None:
"""
Process pre-quantized weights before applying them to the model
@@ -62,9 +62,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
layer.weight.set_value(quant_weight)
layer.weight_scale.set_value(
weight_scale.astype(paddle.get_default_dtype()))
layer.weight_scale.set_value(weight_scale.astype(paddle.get_default_dtype()))
def process_loaded_weights(self, layer, weight) -> None:
quanted_weight_tensor, weight_scale_tensor = weight_quantize_rtn(
@@ -74,9 +72,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
)
layer.weight.set_value(quanted_weight_tensor)
layer.weight_scale.set_value(
weight_scale_tensor.astype(paddle.get_default_dtype()))
layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
@paddle.no_grad()
def apply(self, layer, x):


@@ -14,4 +14,4 @@
"""
npu backend methods
"""
"""


@@ -18,4 +18,4 @@ xpu backend methods
from .quantization.weight_only import XPUWeightOnlyLinearMethod
__all__ = ['XPUWeightOnlyLinearMethod']
__all__ = ["XPUWeightOnlyLinearMethod"]


@@ -13,11 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
WeightOnlyConfig,
WeightOnlyLinearMethod,
)
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
@@ -48,13 +51,10 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
is_bias=False,
)
def process_loaded_weights(self, layer: nn.Layer,
weight: paddle.Tensor) -> None:
def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
"""
loaded_weights using xpu special quantization
"""
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(
weight, self.quant_config.algo, -1, -1)
layer.weight.set_value(
paddle.transpose(quanted_weight_tensor, [1, 0]))
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight, self.quant_config.algo, -1, -1)
layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
layer.weight_scale.set_value(weight_scale_tensor)


@@ -36,7 +36,8 @@ def xpu_clip_and_round(x: np.ndarray) -> np.ndarray:
def xpu_quant_qkv_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
weight_np: np.ndarray,
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the query, key, and value weights for the Transformer model.
@@ -65,7 +66,8 @@ def xpu_quant_qkv_weight(
def xpu_quant_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
weight_np: np.ndarray,
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the weight tensor for XPU devices.