Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00)
[Features] support hugging face qwen3 dense and qwen2 model (#3574)
* support qwen2 and qwen3 hugging face
* fix moe
* default_v1 loader
* hugging_face_format deprecated
* modify hugging_face_format to model_format
* model_format auto
* fix environment
* fix bug
* fix qwen3-0.6 bug
* model_format is str
* fix
@@ -57,6 +57,7 @@ class UnquantizedLinearMethod(QuantMethodBase):
             {
                 **extra_weight_attrs,
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
+                "model_format": extra_weight_attrs.get("model_format", ""),
             },
         )

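The hunk above threads a new "model_format" entry through extra_weight_attrs so that a weight loader can later read it back off the parameter. A minimal sketch of that pattern, where the set_weight_attrs helper and _Param class are illustrative stand-ins rather than the exact FastDeploy code:

    # Illustrative sketch: attach extra attributes to a parameter object so a
    # later weight_loader can recover them with getattr(param, ..., default).
    def set_weight_attrs(param, attrs: dict):
        for key, value in attrs.items():
            setattr(param, key, value)

    class _Param:  # stand-in for a paddle Parameter
        pass

    param = _Param()
    set_weight_attrs(param, {"model_format": "torch"})
    assert getattr(param, "model_format", "") == "torch"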
@@ -343,7 +344,9 @@ class ColumnParallelLinear(LinearBase):
             weight_loader=(
                 self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
             ),
+            model_format=fd_config.model_config.model_format,
         )
+
         if self.nranks > 0:
             if self.with_bias:
                 # col parallel
@@ -402,6 +405,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         )

     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
+        model_format = getattr(param, "model_format", "")
+        if model_format == "torch":
+            loaded_weight = loaded_weight.transpose([1, 0])
         output_dim = getattr(param, "output_dim", None)
         assert output_dim is not None
         shard_dim = -1 if output_dim else 0
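The transpose([1, 0]) handles the layout difference between checkpoint formats: a PyTorch/Hugging Face linear weight is stored as [out_features, in_features], while Paddle's linear layers expect [in_features, out_features]. A small numpy sketch of the same idea, with illustrative shapes:

    import numpy as np

    out_features, in_features = 8, 4
    hf_weight = np.zeros((out_features, in_features), dtype=np.float32)  # torch layout

    model_format = "torch"
    if model_format == "torch":
        # paddle.Tensor.transpose takes a perm list, hence transpose([1, 0])
        # in the diff; numpy's ndarray.transpose takes the axes directly.
        hf_weight = hf_weight.transpose(1, 0)

    assert hf_weight.shape == (in_features, out_features)  # paddle layout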
@@ -424,7 +430,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         # Tensor parallelism splits the weight along the output_dim
         if self.nranks != 1:
             dim = -1 if output_dim else 0
-            if isinstance(loaded_weight, np.ndarray):
+            if isinstance(loaded_weight, (np.ndarray, paddle.Tensor)):
                 size = loaded_weight.shape[dim]
             else:
                 size = loaded_weight.get_shape()[dim]
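The isinstance check is widened because in-memory weights (numpy arrays, and now paddle Tensors on the torch-format path) expose .shape, while lazily loaded checkpoint slices expose get_shape() instead. A sketch of that dispatch, with LazySlice as a hypothetical stand-in for such a slice:

    import numpy as np

    class LazySlice:
        # hypothetical stand-in for a lazily loaded checkpoint slice
        def __init__(self, shape):
            self._shape = shape

        def get_shape(self):
            return self._shape

    def dim_size(loaded_weight, dim):
        # in-tree this branch also accepts paddle.Tensor, which has .shape too
        if isinstance(loaded_weight, np.ndarray):
            return loaded_weight.shape[dim]
        return loaded_weight.get_shape()[dim]

    assert dim_size(np.zeros((3, 5)), -1) == 5
    assert dim_size(LazySlice((3, 5)), -1) == 5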
@@ -523,6 +529,9 @@ class QKVParallelLinear(ColumnParallelLinear):
         assert output_dim is not None
         dim = -1 if output_dim else 0
         head_dim = param.shape[dim] // (self.num_heads_per_rank + 2 * self.kv_num_heads_per_rank)
+        model_format = getattr(param, "model_format", "")
+        if model_format == "torch":
+            loaded_weight = loaded_weight.transpose([1, 0])
         if loaded_shard_id is None:
             # Loaded weight is already fused on disk
             shard_offsets = [
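The head_dim division works because a fused QKV weight stacks q, k and v along the output dimension, giving (num_heads + 2 * kv_num_heads) * head_dim entries along that axis. A worked example with illustrative sizes:

    num_heads_per_rank = 8
    kv_num_heads_per_rank = 2  # grouped-query attention: fewer kv heads
    head_dim = 128

    # fused QKV output size: q heads, then k heads, then v heads
    qkv_size = (num_heads_per_rank + 2 * kv_num_heads_per_rank) * head_dim
    assert qkv_size == 1536

    # the weight_loader recovers head_dim from the fused parameter shape
    assert qkv_size // (num_heads_per_rank + 2 * kv_num_heads_per_rank) == head_dim

    # shard offsets inside the fused weight (q first, then k, then v)
    q_offset = 0
    k_offset = num_heads_per_rank * head_dim                # 1024
    v_offset = k_offset + kv_num_heads_per_rank * head_dim  # 1280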
@@ -541,7 +550,8 @@ class QKVParallelLinear(ColumnParallelLinear):
             assert loaded_shard_id in ["q", "k", "v"]
             # Tensor parallelism splits the weight along the output_dim
             if self.nranks != 1:
-                if isinstance(loaded_weight, np.ndarray):
+                dim = -1 if output_dim else 0
+                if isinstance(loaded_weight, (np.ndarray, paddle.Tensor)):
                     size = loaded_weight.shape[dim]
                 else:
                     size = loaded_weight.get_shape()[dim]
@@ -712,6 +722,7 @@ class RowParallelLinear(LinearBase):
             weight_loader=(
                 self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
             ),
+            model_format=fd_config.model_config.model_format,
         )
         if self.nranks > 0:
             if self.with_bias:
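Taken together, the hunks plumb model_format from the model config through ColumnParallelLinear, RowParallelLinear and the quant method into a parameter attribute. A simplified sketch of that flow, where the class and field names mirror the diff but are stand-ins, not the real FastDeploy classes:

    from dataclasses import dataclass, field

    @dataclass
    class ModelConfig:
        model_format: str = ""  # "" for paddle layout, "torch" for HF checkpoints

    @dataclass
    class FDConfig:
        model_config: ModelConfig = field(default_factory=ModelConfig)

    def create_weights(layer, **extra_weight_attrs):
        # the quant method stores the format on the parameter it creates
        layer.model_format = extra_weight_attrs.get("model_format", "")

    class Linear:
        def __init__(self, fd_config: FDConfig):
            create_weights(self, model_format=fd_config.model_config.model_format)

    layer = Linear(FDConfig(ModelConfig(model_format="torch")))
    assert layer.model_format == "torch"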