[Intel HPU] Support Intel HPU platform (#4161)

* [Intel HPU] Support Intel HPU platform

* fix some issues

* apply precommit and move AttentionBackend_HPU

* fix format issue

* correct ops import

* fix ci issue

* update code in layers

* fix code style issue

* remove dense tp moe ep mode

* fix enc_dec_block_num

* fix rebase issue

* rename hpu to gaudi in readme

* rename ForwardMeta_HPU to HPUForwardMeta
This commit is contained in:
fmiao2372
2025-09-24 12:27:50 +08:00
committed by GitHub
parent a1c5d930bb
commit f1b5392e20
35 changed files with 2814 additions and 19 deletions

View File

@@ -66,3 +66,26 @@ try:
except:
tensor_model_parallel_all_reduce = None
from paddle.distributed.communication import stream
from paddle.distributed.communication.reduce import ReduceOp
def all_reduce(
    tensor,
    op,
    group,
    sync_op: bool = True,
):
    """Run an all-reduce on ``tensor`` through ``stream.all_reduce``.

    The call is always issued with ``use_calc_stream=True``; every other
    argument is forwarded unchanged.

    Args:
        tensor: Tensor handed to ``stream.all_reduce``.
        op: Reduction operator (the caller in this file passes
            ``ReduceOp.SUM``).
        group: Communication group to reduce over.
        sync_op: Forwarded as ``sync_op``; defaults to ``True``.

    Returns:
        Whatever ``stream.all_reduce`` returns for this call.
    """
    result = stream.all_reduce(
        tensor,
        op=op,
        group=group,
        sync_op=sync_op,
        use_calc_stream=True,
    )
    return result
@paddle.jit.marker.unified
def tensor_model_parallel_all_reduce_custom(input_: paddle.Tensor) -> paddle.Tensor:
    """All-reduce the input tensor across model parallel group on calc stream.

    In dynamic mode the tensor is SUM-reduced over the model-parallel group
    taken from the fleet hybrid communicate group, using this module's
    ``all_reduce`` helper (which sets ``use_calc_stream=True``). Outside
    dynamic mode it falls back to ``dist.all_reduce`` with its default
    operator and group.

    Args:
        input_: Tensor to reduce. It is passed directly to the collective,
            which per Paddle's ``all_reduce`` contract also uses it as the
            output tensor.

    Returns:
        The same ``input_`` object that was passed in, after the collective
        has been issued.
    """
    if paddle.in_dynamic_mode():
        hcg = dist.fleet.get_hybrid_communicate_group()
        mp_group = hcg.get_model_parallel_group()
        all_reduce(input_, op=ReduceOp.SUM, group=mp_group)
    else:
        dist.all_reduce(input_)
    # Return the tensor so the result matches the declared
    # ``-> paddle.Tensor`` annotation instead of an implicit ``None``;
    # callers that ignore the return value are unaffected.
    return input_