diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py
index 377ff19bb..0ac2d0d70 100644
--- a/fastdeploy/model_executor/layers/embeddings.py
+++ b/fastdeploy/model_executor/layers/embeddings.py
@@ -77,6 +77,11 @@ class VocabParallelEmbedding(nn.Layer):
             )
             if self.world_size > 1:
                 set_weight_attrs(self.embeddings.weight, {"output_dim": False})
+                set_weight_attrs(
+                    self.embeddings.weight,
+                    {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}},
+                )
+
         else:
             # column cut embedding
             self.embeddings = nn.Embedding(
diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 2c7f9aef3..0d079c90c 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -356,11 +356,21 @@ class ColumnParallelLinear(LinearBase):
         )
 
         if self.nranks > 0:
+            _set_var_distributed(self.weight, split_axis=-1)
             if self.with_bias:
                 # col parallel
-                _set_var_distributed(self.bias, split_axis=1)
+                _set_var_distributed(self.bias, split_axis=0)
                 set_weight_attrs(self.bias, {"output_dim": True})
 
+        # set_rl_tp_degree
+        set_weight_attrs(
+            self.weight, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+        )
+        if self.with_bias:
+            set_weight_attrs(
+                self.bias, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+            )
+
 
 class MergedColumnParallelLinear(ColumnParallelLinear):
     """
@@ -743,6 +753,7 @@ class RowParallelLinear(LinearBase):
             model_format=fd_config.model_config.model_format,
         )
         if self.nranks > 0:
+            _set_var_distributed(self.weight, split_axis=0)
             if self.with_bias:
                 # col parallel
                 _set_var_distributed(self.bias, split_axis=0)
@@ -755,6 +766,11 @@ class RowParallelLinear(LinearBase):
 
         self.reduce_results = reduce_results
 
+        # set_rl_tp_degree
+        set_weight_attrs(
+            self.weight, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+        )
+
     def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
         if self.fd_config.quant_config:
             out = self.quant_method.apply(self, x)
diff --git a/fastdeploy/model_executor/layers/lm_head.py b/fastdeploy/model_executor/layers/lm_head.py
index a62e46d61..b9dc06ab0 100644
--- a/fastdeploy/model_executor/layers/lm_head.py
+++ b/fastdeploy/model_executor/layers/lm_head.py
@@ -94,6 +94,12 @@ class ParallelLMHead(nn.Layer):
                     "model_format": self.fd_config.model_config.model_format,
                 },
             )
+            if self.bias_key is not None:
+                set_weight_attrs(
+                    self.linear.bias,
+                    {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}},
+                )
+
             if self.nranks > 1:
                 set_weight_attrs(self.linear.weight, {"output_dim": True})
         else:
@@ -116,6 +122,9 @@ class ParallelLMHead(nn.Layer):
 
             if self.nranks > 1:
                 set_weight_attrs(self.linear.weight, {"output_dim": False})
+            set_weight_attrs(
+                self.linear.weight, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+            )
 
     def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]):
         """
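
Note: the diff attaches an "rl_need_attr" dict (carrying "rl_tp_degree", the tensor-parallel size) to the sharded weights and biases via set_weight_attrs. Below is a minimal sketch of how a downstream consumer could read that metadata back, assuming set_weight_attrs simply setattr()s each key onto the parameter object; the helper name collect_rl_tp_degree is illustrative and not part of this diff.

    # Hypothetical consumer: walk a layer's parameters and collect the
    # rl_tp_degree values attached by set_weight_attrs in this change.
    import paddle.nn as nn


    def collect_rl_tp_degree(layer: nn.Layer) -> dict:
        degrees = {}
        for name, param in layer.named_parameters():
            # Assumes set_weight_attrs did setattr(param, "rl_need_attr", {...}).
            rl_attr = getattr(param, "rl_need_attr", None)
            if rl_attr is not None:
                degrees[name] = rl_attr.get("rl_tp_degree")
        return degrees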