mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-11-03 11:02:01 +08:00)
custom all reduce support cuda graph (#2938)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* Support enabling CUDA Graph and custom all-reduce at the same time, and fix the overwritten custom all-reduce flag
* Rename communication_op to communication
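The rename itself is mechanical: every import of tensor_model_parallel_all_reduce moves from fastdeploy.distributed.communication_op to fastdeploy.distributed.communication. A minimal sketch of what a caller looks like after this commit is given here; the helper name finalize_moe_output and the assumption that the op reduces the tensor in place are illustrative only and are not taken from this diff.

# Sketch only: the import path after this commit; the call-site pattern mirrors
# the `if layer.tp_size > 1:` guards in the hunks below. In-place reduction is
# an assumption for illustration, not something this diff shows.
import paddle

from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce


def finalize_moe_output(fused_moe_out: paddle.Tensor, layer) -> paddle.Tensor:
    # Each tensor-parallel rank holds a partial result; reduce across ranks
    # only when the layer is actually sharded (tp_size > 1).
    if layer.tp_size > 1:
        tensor_model_parallel_all_reduce(fused_moe_out)
    return fused_moe_out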
@@ -17,7 +17,7 @@
 import paddle
 from paddle import nn

-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
 from fastdeploy.utils import ceil_div

@@ -190,7 +190,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
         fused_moe_out = fused_moe_out.reshape_([token_num, hidden_size])

         if layer.tp_size > 1:
-            from fastdeploy.distributed.communication_op import (
+            from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )

@@ -18,7 +18,7 @@ import paddle
 from paddle import nn

 from fastdeploy.config import FDConfig
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.platforms import current_platform

 from .utils import _set_var_distributed, divide, get_tensor
@@ -20,7 +20,7 @@ from paddle.nn.quant import weight_quantize
 from paddleformers.utils.log import logger

 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.platforms import current_platform

 from ..utils import create_and_set_parameter, get_tensor
@@ -19,7 +19,7 @@ from paddle import nn
 from paddleformers.utils.log import logger

 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func, deep_gemm

@@ -18,7 +18,7 @@ import paddle
 from paddle import nn

 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.ops.gpu import (
     MoeWna16MarlinGemmApi,
     tritonmoe_preprocess_func,
@@ -18,7 +18,7 @@ import paddle
 from paddle import nn

 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.layers.utils import create_and_set_parameter, get_tensor
 from fastdeploy.utils import ceil_div

@@ -18,7 +18,7 @@ import paddle
 from paddle import nn

 import fastdeploy
-from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.utils import ceil_div

 from ..quantization.quant_base import QuantMethodBase
@@ -82,7 +82,7 @@ class XPUMoEMethod(MoEMethodBase):
             False,  # moe group, used in deepseek
         )
         if layer.tp_size > 1:
-            from fastdeploy.distributed.communication_op import (
+            from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )

@@ -210,7 +210,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
             False,  # moe group, used in deepseek
         )
         if layer.tp_size > 1:
-            from fastdeploy.distributed.communication_op import (
+            from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )
