custom all reduce support cuda graph (#2938)

* Support enabling CUDA graph and custom all-reduce at the same time, and fix the custom all-reduce flag being overwritten

* Rename communication_op to communication
Author: zhink
Date: 2025-07-21 22:52:03 +08:00 (committed by GitHub)
Parent: ff4569f135
Commit: 0262ef7eb3

21 changed files with 88 additions and 51 deletions
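
The rename is a pure module move: every call site switches from fastdeploy.distributed.communication_op to fastdeploy.distributed.communication, and the imported symbol is unchanged. A minimal before/after sketch (the call at the end is illustrative; the hunks below only show the import lines):

    # Before this commit:
    # from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce

    # After this commit: same symbol, new module path.
    from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce

    # Call sites need no change beyond the import line.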


@@ -17,7 +17,7 @@
 import paddle
 from paddle import nn
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
 from fastdeploy.utils import ceil_div


@@ -190,7 +190,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
         fused_moe_out = fused_moe_out.reshape_([token_num, hidden_size])
         if layer.tp_size > 1:
-            from fastdeploy.distributed.communication_op import (
+            from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )
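
This hunk also shows a pattern repeated across the MoE backends: tensor_model_parallel_all_reduce is imported lazily inside the layer.tp_size > 1 branch, so single-device runs never load the distributed module. A minimal sketch of the pattern; the function and the compute_experts call are placeholders, and whether the reduce mutates in place or returns a new tensor is backend-dependent (assignment shown for clarity):

    def fused_moe_forward(layer, x):
        # Backend-specific expert computation (placeholder for this sketch).
        fused_moe_out = layer.compute_experts(x)
        if layer.tp_size > 1:
            # Deferred import: single-device runs never touch the
            # distributed module, and import cycles are avoided.
            from fastdeploy.distributed.communication import (
                tensor_model_parallel_all_reduce,
            )
            fused_moe_out = tensor_model_parallel_all_reduce(fused_moe_out)
        return fused_moe_out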


@@ -18,7 +18,7 @@ import paddle
 from paddle import nn
 from fastdeploy.config import FDConfig
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.platforms import current_platform
 from .utils import _set_var_distributed, divide, get_tensor


@@ -20,7 +20,7 @@ from paddle.nn.quant import weight_quantize
 from paddleformers.utils.log import logger
 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.platforms import current_platform
 from ..utils import create_and_set_parameter, get_tensor


@@ -19,7 +19,7 @@ from paddle import nn
 from paddleformers.utils.log import logger
 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func, deep_gemm


@@ -18,7 +18,7 @@ import paddle
 from paddle import nn
 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.ops.gpu import (
     MoeWna16MarlinGemmApi,
     tritonmoe_preprocess_func,


@@ -18,7 +18,7 @@ import paddle
 from paddle import nn
 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.utils import create_and_set_parameter, get_tensor
 from fastdeploy.utils import ceil_div


@@ -18,7 +18,7 @@ import paddle
 from paddle import nn
 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.utils import ceil_div
 from ..quantization.quant_base import QuantMethodBase


@@ -82,7 +82,7 @@ class XPUMoEMethod(MoEMethodBase):
             False,  # moe group, used in deepseek
         )
         if layer.tp_size > 1:
-            from fastdeploy.distributed.communication_op import (
+            from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )
@@ -210,7 +210,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
             False,  # moe group, used in deepseek
         )
         if layer.tp_size > 1:
-            from fastdeploy.distributed.communication_op import (
+            from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )
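
The hunks shown here cover only the module rename; the fix for the overwritten custom all-reduce flag lives elsewhere in the 21 changed files and is not visible above. As a loudly hypothetical sketch of the class of bug the commit message describes (all names invented, not FastDeploy API):

    class HypotheticalParallelConfig:
        def __init__(self, use_custom_all_reduce: bool, use_cuda_graph: bool):
            self.use_custom_all_reduce = use_custom_all_reduce
            self.use_cuda_graph = use_cuda_graph

        def resolve(self) -> "HypotheticalParallelConfig":
            # Pre-fix behaviour (the bug): enabling CUDA graph capture
            # silently overwrote the user's custom all-reduce choice:
            #     if self.use_cuda_graph:
            #         self.use_custom_all_reduce = False
            # Post-fix behaviour per this commit: the two features may
            # coexist, so the user-supplied flag is left untouched.
            return self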