From d40a1046def4c03f00d8f0860a835b6abb0c2742 Mon Sep 17 00:00:00 2001
From: lizhenyun01 <1500424927@qq.com>
Date: Mon, 8 Sep 2025 16:20:32 +0800
Subject: [PATCH] [Feature] support rl_tp_degree (#3934)

* [Feature] support rl_tp_degree

* add rl_tp_degree in lmhead

* add rl_tp_degree in bias

* fix split_axis=0 in bias

* fix split_axis in weight

* fix bias rl_tp_degree

* fix bias rl_tp_degree

* change attr to dict

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
---
 fastdeploy/model_executor/layers/embeddings.py |  5 +++++
 fastdeploy/model_executor/layers/linear.py     | 18 +++++++++++++++++-
 fastdeploy/model_executor/layers/lm_head.py    |  9 +++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py
index 377ff19bb..0ac2d0d70 100644
--- a/fastdeploy/model_executor/layers/embeddings.py
+++ b/fastdeploy/model_executor/layers/embeddings.py
@@ -77,6 +77,11 @@ class VocabParallelEmbedding(nn.Layer):
             )
             if self.world_size > 1:
                 set_weight_attrs(self.embeddings.weight, {"output_dim": False})
+                set_weight_attrs(
+                    self.embeddings.weight,
+                    {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}},
+                )
+
         else:
             # column cut embedding
             self.embeddings = nn.Embedding(
diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 2c7f9aef3..0d079c90c 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -356,11 +356,21 @@
         )
 
         if self.nranks > 0:
+            _set_var_distributed(self.weight, split_axis=-1)
             if self.with_bias:
                 # col parallel
-                _set_var_distributed(self.bias, split_axis=1)
+                _set_var_distributed(self.bias, split_axis=0)
                 set_weight_attrs(self.bias, {"output_dim": True})
 
+            # set_rl_tp_degree
+            set_weight_attrs(
+                self.weight, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+            )
+            if self.with_bias:
+                set_weight_attrs(
+                    self.bias, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+                )
+
 
 class MergedColumnParallelLinear(ColumnParallelLinear):
     """
@@ -743,6 +753,7 @@
                 model_format=fd_config.model_config.model_format,
             )
         if self.nranks > 0:
+            _set_var_distributed(self.weight, split_axis=0)
             if self.with_bias:
                 # col parallel
                 _set_var_distributed(self.bias, split_axis=0)
@@ -755,6 +766,11 @@
 
         self.reduce_results = reduce_results
 
+        # set_rl_tp_degree
+        set_weight_attrs(
+            self.weight, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+        )
+
     def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
         if self.fd_config.quant_config:
             out = self.quant_method.apply(self, x)
diff --git a/fastdeploy/model_executor/layers/lm_head.py b/fastdeploy/model_executor/layers/lm_head.py
index a62e46d61..b9dc06ab0 100644
--- a/fastdeploy/model_executor/layers/lm_head.py
+++ b/fastdeploy/model_executor/layers/lm_head.py
@@ -94,6 +94,12 @@ class ParallelLMHead(nn.Layer):
                     "model_format": self.fd_config.model_config.model_format,
                 },
             )
+            if self.bias_key is not None:
+                set_weight_attrs(
+                    self.linear.bias,
+                    {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}},
+                )
+
             if self.nranks > 1:
                 set_weight_attrs(self.linear.weight, {"output_dim": True})
         else:
@@ -116,6 +122,9 @@
 
             if self.nranks > 1:
                 set_weight_attrs(self.linear.weight, {"output_dim": False})
+                set_weight_attrs(
+                    self.linear.weight, {"rl_need_attr": {"rl_tp_degree": fd_config.parallel_config.tensor_parallel_size}}
+                )
 
     def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]):
         """
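Note (not part of the patch above): every hunk follows the same pattern, attaching a nested dict {"rl_need_attr": {"rl_tp_degree": ...}} to a sharded parameter via set_weight_attrs so that a later consumer, such as an RL weight-resync step, can recover the tensor-parallel degree the serving weights were split with. Below is a minimal, self-contained Python sketch of that tag-and-read-back pattern; DummyParam, the local set_weight_attrs stand-in, the hard-coded tensor_parallel_size, and the consumer-side lookup are illustrative assumptions, not FastDeploy's actual implementation.

# Minimal sketch of the attribute-tagging pattern used by this patch (simplified
# stand-ins, not FastDeploy's real helpers): the producer attaches
# {"rl_need_attr": {"rl_tp_degree": ...}} to a parallel layer's weight, and a
# consumer reads it back to learn the TP degree the weight was sharded with.

class DummyParam:
    """Stand-in for a paddle Parameter; metadata is attached directly to it."""
    pass


def set_weight_attrs(param, attrs):
    # Assumed behaviour of the helper called in the diff: copy each key/value
    # from the attrs dict onto the parameter object.
    for key, value in attrs.items():
        setattr(param, key, value)


# Producer side (what ColumnParallelLinear / RowParallelLinear / ParallelLMHead now do):
tensor_parallel_size = 4  # hypothetical value of fd_config.parallel_config.tensor_parallel_size
weight = DummyParam()
set_weight_attrs(weight, {"rl_need_attr": {"rl_tp_degree": tensor_parallel_size}})

# Consumer side (hypothetical RL weight-sync step): recover the TP degree,
# falling back to 1 when the attribute is absent.
rl_tp_degree = getattr(weight, "rl_need_attr", {}).get("rl_tp_degree", 1)
print(rl_tp_degree)  # -> 4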