Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-08 18:11:00 +08:00
[Intel HPU] Support intel hpu platform (#4161)
* [Intel HPU] Support intel hpu platform
* fix some issues
* apply precommit and move AttentionBackend_HPU
* fix format issue
* correct ops import
* fix ci issue
* update code in layers
* fix code style issue
* remove dense tp moe ep mode
* fix enc_dec_block_num
* fix rebase issue
* rename hpu to gaudi in readme
* rename ForwardMeta_HPU to HPUForwardMeta
```diff
@@ -209,6 +209,8 @@ class Sampler(nn.Layer):
             or current_platform.is_maca()
         ):
             self.forward = self.forward_cuda
+        elif current_platform.is_intel_hpu():
+            self.forward = self.forward_intel_hpu
+        else:
             raise NotImplementedError
```
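The hunk above resolves the platform-specific sampler entry point once, at construction time, so the hot path pays no per-call platform check. Below is a minimal sketch of that dispatch pattern; the `Platform` stub and `TinySampler` class are illustrative stand-ins, not FastDeploy code.

```python
import paddle
import paddle.nn as nn

class Platform:
    """Stand-in for fastdeploy's `current_platform` object (assumption)."""
    def __init__(self, name: str):
        self.name = name
    def is_intel_hpu(self) -> bool:
        return self.name == "intel_hpu"

class TinySampler(nn.Layer):
    def __init__(self, platform: Platform):
        super().__init__()
        # Bind the backend once; later calls go straight to the chosen method.
        if platform.is_intel_hpu():
            self.forward = self.forward_intel_hpu
        else:
            self.forward = self.forward_cuda

    def forward_cuda(self, logits: paddle.Tensor) -> paddle.Tensor:
        return paddle.argmax(logits, axis=-1)

    def forward_intel_hpu(self, logits: paddle.Tensor) -> paddle.Tensor:
        return paddle.argmax(logits, axis=-1)

sampler = TinySampler(Platform("intel_hpu"))
print(sampler(paddle.rand([2, 5])))  # dispatches to forward_intel_hpu
```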
```diff
@@ -377,6 +379,49 @@ class Sampler(nn.Layer):
 
         return sampler_output
 
+    def forward_intel_hpu(
+        self,
+        logits: paddle.Tensor,
+        sampling_metadata: SamplingMetadata,
+        batch_ids: paddle.Tensor,
+        max_batch: int,
+        rank: int,
+        local_rank: int,
+    ) -> paddle.Tensor:
+        if logits.dtype != paddle.float32:
+            logits = paddle.cast(logits, paddle.float32)
+
+        from fastdeploy.model_executor.ops.intel_hpu import fused_sampler
+
+        _, next_tokens = fused_sampler(
+            sampling_metadata.pre_token_ids,
+            sampling_metadata.prompt_ids,
+            sampling_metadata.seq_lens_encoder,
+            sampling_metadata.seq_lens_decoder,
+            sampling_metadata.step_idx,
+            sampling_metadata.stop_flags,
+            logits,
+            sampling_metadata.repetition_penalties,
+            sampling_metadata.frequency_penalties,
+            sampling_metadata.presence_penalties,
+            sampling_metadata.temperature,
+            sampling_metadata.bad_words_token_ids,
+            sampling_metadata.step_idx,
+            sampling_metadata.min_dec_lens,
+            sampling_metadata.eos_token_ids,
+            sampling_metadata.top_p,
+            rank,
+            local_rank,
+        )
+
+        if next_tokens.shape[0] != max_batch:
+            dim = next_tokens.shape[-1]
+            tmp_tokens = paddle.full((max_batch, dim), -1, dtype=next_tokens.dtype)
+            tmp_tokens = paddle.scatter(tmp_tokens, batch_ids, next_tokens[: batch_ids.shape[0], :])
+            return tmp_tokens
+
+        return next_tokens
+
 
 class SpeculativeSampler(nn.Layer):
     """
```
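When fewer sequences than `max_batch` are live, the new method scatters the sampled tokens back into a fixed-size, -1-filled buffer keyed by `batch_ids`, so downstream code always sees `max_batch` rows. The following self-contained sketch reproduces just that padding step; the shapes and token values are made up for illustration, and only the `paddle.full`/`paddle.scatter` logic mirrors the diff.

```python
import paddle

max_batch = 8
# Three live sequences occupying slots 0, 2, and 5 of the full batch (illustrative values).
next_tokens = paddle.to_tensor([[101], [102], [103]], dtype="int64")
batch_ids = paddle.to_tensor([0, 2, 5], dtype="int64")

if next_tokens.shape[0] != max_batch:
    dim = next_tokens.shape[-1]
    # -1 marks batch slots that have no live sequence this step.
    tmp_tokens = paddle.full((max_batch, dim), -1, dtype=next_tokens.dtype)
    # Write each sampled token into its sequence's slot in the full batch.
    tmp_tokens = paddle.scatter(tmp_tokens, batch_ids, next_tokens[: batch_ids.shape[0], :])
    next_tokens = tmp_tokens

print(next_tokens.numpy().ravel())  # [101  -1 102  -1  -1 103  -1  -1]
```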