Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00
[BugFix]Set default OMP_NUM_THREADS=3 and fix extra GPU memory usage in DeepSeek (#5219)
* fix bug
* update
* update
* update
* fix copy
* update
@@ -722,7 +722,7 @@ class AsyncLLMEngine:
    "FLAGS_use_append_attn": 1,
    "NCCL_ALGO": "Ring",
    "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
    "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
    "OMP_NUM_THREADS": 3,
}
# environment variables needed by Dy2St
variables.update(
@@ -453,7 +453,7 @@ class LLMEngine:
    "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
    "NCCL_ALGO": "Ring",
    "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
    "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
    "OMP_NUM_THREADS": 3,
    "FD_ENABLE_PDL": envs.FD_ENABLE_PDL,
}
# environment variables needed by Dy2St
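For context, the `int(os.getenv(name, default))` idiom in both engine hunks reads an already-set environment variable and falls back to a built-in value; the dict is presumably exported into the worker processes' environment. A minimal sketch of that pattern follows; the `env_int` helper and the final `os.environ.update` line are illustrative, not FastDeploy code.

import os

def env_int(name: str, default: int) -> int:
    # Hypothetical helper mirroring int(os.getenv("OMP_NUM_THREADS", 3)) above:
    # use the caller's environment value when present, otherwise the default.
    return int(os.getenv(name, default))

variables = {
    "NCCL_ALGO": "Ring",
    "FLAGS_max_partition_size": env_int("FLAGS_max_partition_size", 1024),
    "OMP_NUM_THREADS": env_int("OMP_NUM_THREADS", 3),
}

# An engine like this would typically push the values into the environment
# seen by its worker processes before they start:
os.environ.update({k: str(v) for k, v in variables.items()})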
@@ -356,10 +356,6 @@ class MergedReplicatedLinear(ReplicatedLinear):
        self.output_sizes = output_sizes

    def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
        weight_need_transpose = getattr(param, "weight_need_transpose", False)
        if weight_need_transpose:
            loaded_weight = get_tensor(loaded_weight).transpose([1, 0])

        assert loaded_shard_id in ["q_a", "kv_a"]
        if not param._is_initialized():
            param.initialize()
@@ -385,7 +381,6 @@ class MergedReplicatedLinear(ReplicatedLinear):
        else:
            loaded_weight = loaded_weight.cast(param.dtype)
        # (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed.
        loaded_weight = get_tensor(loaded_weight)
        h2d_copy(param, loaded_weight)
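The two MergedReplicatedLinear hunks above relate to the "extra GPU memory usage" part of the fix: calling get_tensor (and transposing) before the final copy materializes a transient device-side tensor in addition to the destination parameter. Below is a minimal sketch of the memory-friendlier ordering, with simplified names and the assumption that the Paddle build in use allows `copy_` with a host-side source (the `is_paddle_support_new_h2d()` guard further down checks exactly that); it is not the FastDeploy implementation.

import numpy as np
import paddle

def load_weight_single_h2d(param: paddle.Tensor, loaded_weight: np.ndarray) -> None:
    # Keep the checkpoint weight on the CPU while transposing/casting it,
    # then perform a single host-to-device copy into the parameter.
    w = paddle.to_tensor(loaded_weight, place=paddle.CPUPlace())
    if getattr(param, "weight_need_transpose", False):
        w = w.transpose([1, 0])       # still a CPU tensor
    w = w.cast(param.dtype)           # still a CPU tensor
    if not param._is_initialized():
        param.initialize()
    param.copy_(w, True)              # one blocking H2D transfer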
@@ -452,7 +447,17 @@ class ColumnParallelLinear(LinearBase):
        if self.with_bias:
            # col parallel
            _set_var_distributed(self.bias, split_axis=1)
            set_weight_attrs(self.bias, {"output_dim": True})
            set_weight_attrs(
                self.bias,
                {
                    "output_dim": True,
                    "weight_loader": (
                        self.weight_loader
                        if hasattr(self, "weight_loader")
                        else default_weight_loader(self.fd_config)
                    ),
                },
            )


class MergedColumnParallelLinear(ColumnParallelLinear):
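This hunk attaches a weight_loader to the bias alongside output_dim, preferring a layer-specific loader when one exists. A small sketch of that pattern is below; the `set_weight_attrs` stand-in and `register_bias_attrs` helper are hypothetical simplifications of the fastdeploy.model_executor.utils API, not its actual code.

from typing import Any, Callable, Dict

def set_weight_attrs(weight: Any, attrs: Dict[str, Any]) -> None:
    # Hypothetical stand-in: attach loader metadata (output_dim, weight_loader, ...)
    # to a parameter-like object so the checkpoint loader knows how to copy it.
    for key, value in attrs.items():
        setattr(weight, key, value)

def register_bias_attrs(layer: Any, default_loader: Callable) -> None:
    # Mirrors the expanded call above: prefer the layer's own weight_loader
    # when it defines one, otherwise fall back to the default loader.
    set_weight_attrs(
        layer.bias,
        {
            "output_dim": True,
            "weight_loader": (
                layer.weight_loader if hasattr(layer, "weight_loader") else default_loader
            ),
        },
    )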
@@ -955,7 +960,10 @@ class KVBatchLinear(nn.Layer):
        self.num_heads_per_partition = divide(num_attention_heads, self.tp_size)
        self.local_rank = fd_config.parallel_config.tensor_parallel_rank
        self.fd_config = fd_config
        self.kv_b_proj = kv_b_proj
        if self.fd_config.load_config.load_choices == "default_v1":
            self.kv_b_proj = kv_b_proj
        else:
            self.kv_b_proj = None

        self.weight_dtype = self._helper.get_default_dtype()
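The KVBatchLinear hunk keeps a reference to kv_b_proj only for the "default_v1" load path. A stripped-down sketch of that pattern, with the class and arguments simplified for illustration:

class KVBatchLinearSketch:
    # Illustrative only: hold on to kv_b_proj solely for the load path that
    # still needs it later ("default_v1"); otherwise drop the reference so the
    # projection weights are not kept alive after loading.
    def __init__(self, kv_b_proj, load_choices: str):
        if load_choices == "default_v1":
            self.kv_b_proj = kv_b_proj
        else:
            self.kv_b_proj = None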
@@ -141,7 +141,10 @@ def get_tensor(input: Union[paddle.Tensor, np.ndarray, str], model_path=None) ->

    if isinstance(input, paddle.Tensor):
        if input.place.is_cpu_place():
            return input.to(paddle.device.get_device())
            if current_platform.is_cuda():
                return input.cuda()
            else:
                return input.to(paddle.device.get_device())
        return input
    elif isinstance(input, np.ndarray):
        return paddle.to_tensor(input)
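A simplified version of the placement logic in this get_tensor hunk is sketched below. FastDeploy's current_platform.is_cuda() is replaced here by Paddle's own compile-time check, which is an assumption made only for the sketch.

import numpy as np
import paddle

def to_current_device(x):
    # CPU tensors are moved to the active device: .cuda() on CUDA builds,
    # otherwise the generic device string from paddle.device.get_device()
    # (e.g. "xpu:0"); tensors already on a device pass through unchanged.
    if isinstance(x, paddle.Tensor):
        if x.place.is_cpu_place():
            if paddle.device.is_compiled_with_cuda():
                return x.cuda()
            return x.to(paddle.device.get_device())
        return x
    if isinstance(x, np.ndarray):
        return paddle.to_tensor(x)
    return x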
@@ -32,7 +32,7 @@ from paddle.nn.functional.flash_attention import (
from paddleformers.transformers.model_utils import PretrainedModel

from fastdeploy.model_executor.layers.utils import divide, get_tensor
from fastdeploy.model_executor.utils import fd_cast, h2d_copy, set_weight_attrs
from fastdeploy.model_executor.utils import fd_cast, set_weight_attrs

from .activation import ACT2FN
from .configuration import DFNRopeVisionTransformerConfig
@@ -151,7 +151,8 @@ class VisionFlashAttention2(nn.Layer):
        assert param.shape == shard_weight.shape, (
            f" Attempted to load weight ({shard_weight.shape}) " f"into parameter ({param.shape})"
        )
        h2d_copy(param, shard_weight)
        shard_weight = get_tensor(shard_weight)
        param.copy_(shard_weight, False)

    def forward(
        self,
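Together with the import hunk above, this switches the vision weight loader between h2d_copy and the plain get_tensor-then-copy_ route. A minimal sketch of the validate-and-copy pattern used here, with the shard assumed to already be a device tensor and a blocking copy for simplicity:

import paddle

def copy_shard_into_param(param: paddle.Tensor, shard_weight: paddle.Tensor) -> None:
    # Check that the shard matches the parameter's shape, then overwrite the
    # parameter in place (same assert-then-copy_ pattern as the hunk above).
    assert param.shape == shard_weight.shape, (
        f"Attempted to load weight ({shard_weight.shape}) into parameter ({param.shape})"
    )
    param.copy_(shard_weight, True)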
@@ -281,7 +281,6 @@ def default_weight_loader(fd_config: FDConfig = None) -> None:

    def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
        """fn"""

        output_dim = getattr(param, "output_dim", None)
        weight_need_transpose = getattr(param, "weight_need_transpose", False)
        if weight_need_transpose:
@@ -310,7 +309,8 @@ def default_weight_loader(fd_config: FDConfig = None) -> None:
        assert param.shape == loaded_weight.shape, (
            f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
        )
        h2d_copy(dst=param, src=loaded_weight)
        loaded_weight = get_tensor(loaded_weight)
        param.copy_(loaded_weight, False)

    return fn
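These two default_weight_loader hunks show the closure pattern: the outer function captures the config and returns an inner fn that is attached to parameters as their weight_loader. A hedged sketch of that structure is below; sharding along output_dim and the device-copy details are deliberately omitted, and the inner fn assumes loaded_weight is already a Paddle tensor.

from typing import Optional, Union

import paddle

def make_default_weight_loader():
    # Sketch only: the real default_weight_loader also handles output_dim-based
    # sharding and host-to-device placement before the final copy.
    def fn(param: paddle.Tensor, loaded_weight: paddle.Tensor, shard_id: Optional[Union[int, str]] = None):
        if getattr(param, "weight_need_transpose", False):
            loaded_weight = loaded_weight.transpose([1, 0])
        loaded_weight = loaded_weight.cast(param.dtype)
        assert param.shape == loaded_weight.shape, (
            f"Attempted to load weight ({loaded_weight.shape}) into parameter ({param.shape})"
        )
        param.copy_(loaded_weight, True)

    return fn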
@@ -369,8 +369,9 @@ def h2d_copy(dst, src, blocking=True):
    if not current_platform.is_cuda() or not is_paddle_support_new_h2d():
        # For non-GPU devices, data is transferred to device (H2D) in advance.
        src = get_tensor(src)
    if not dst._is_initialized():
        dst.initialize()
    if len(src.shape) == 1:
        # TODO (bukejiyu): A recently merged Paddle PR introduced a hang when copying 1-D non-contiguous tensors. This approach serves as a temporary workaround.
        src = get_tensor(src)
    dst.copy_(src, blocking)
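For orientation, here is a simplified sketch of the h2d_copy flow shown above. It is not the FastDeploy implementation: current_platform.is_cuda(), is_paddle_support_new_h2d(), and get_tensor are replaced with plain Paddle calls, and the 1-D branch mirrors the TODO's workaround of materializing the source on the device before the in-place copy.

import paddle

def h2d_copy_sketch(dst: paddle.Tensor, src: paddle.Tensor, blocking: bool = True) -> None:
    # Assumes `dst` is the (possibly uninitialized) device parameter and `src`
    # may still live on the host.
    if not paddle.device.is_compiled_with_cuda():
        # Non-CUDA builds: transfer to the target device up front.
        src = src.to(paddle.device.get_device())
    if not dst._is_initialized():
        dst.initialize()
    if len(src.shape) == 1:
        # Temporary workaround for 1-D tensors, as in the TODO above.
        src = src.to(paddle.device.get_device())
    dst.copy_(src, blocking)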