Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-08 18:11:00 +08:00
[Intel HPU] Support intel hpu platform (#4161)
* [Intel HPU] Support intel hpu platform
* fix some issues
* apply precommit and move AttentionBackend_HPU
* fix format issue
* correct ops import
* fix ci issue
* update code in layers
* fix code style issue
* remove dense tp moe ep mode
* fix enc_dec_block_num
* fix rebase issue
* rename hpu to gaudi in readme
* rename ForwardMeta_HPU to HPUForwardMeta
```diff
@@ -209,6 +209,8 @@ class Sampler(nn.Layer):
             or current_platform.is_maca()
         ):
             self.forward = self.forward_cuda
+        elif current_platform.is_intel_hpu():
+            self.forward = self.forward_intel_hpu
+        else:
             raise NotImplementedError
```
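The hunk above resolves the platform-specific sampler entry point once, at construction time, so the hot path pays no per-call platform check. Below is a minimal sketch of that dispatch pattern; the `Platform` stub and `TinySampler` class are illustrative stand-ins, not FastDeploy code.

```python
import paddle
import paddle.nn as nn

class Platform:
    """Stand-in for fastdeploy's `current_platform` object (assumption)."""
    def __init__(self, name: str):
        self.name = name
    def is_intel_hpu(self) -> bool:
        return self.name == "intel_hpu"

class TinySampler(nn.Layer):
    def __init__(self, platform: Platform):
        super().__init__()
        # Bind the backend once; later calls go straight to the chosen method.
        if platform.is_intel_hpu():
            self.forward = self.forward_intel_hpu
        else:
            self.forward = self.forward_cuda

    def forward_cuda(self, logits: paddle.Tensor) -> paddle.Tensor:
        return paddle.argmax(logits, axis=-1)

    def forward_intel_hpu(self, logits: paddle.Tensor) -> paddle.Tensor:
        return paddle.argmax(logits, axis=-1)

sampler = TinySampler(Platform("intel_hpu"))
print(sampler(paddle.rand([2, 5])))  # dispatches to forward_intel_hpu
```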
```diff
@@ -377,6 +379,49 @@ class Sampler(nn.Layer):
 
         return sampler_output
 
+    def forward_intel_hpu(
+        self,
+        logits: paddle.Tensor,
+        sampling_metadata: SamplingMetadata,
+        batch_ids: paddle.Tensor,
+        max_batch: int,
+        rank: int,
+        local_rank: int,
+    ) -> paddle.Tensor:
+        if logits.dtype != paddle.float32:
+            logits = paddle.cast(logits, paddle.float32)
+
+        from fastdeploy.model_executor.ops.intel_hpu import fused_sampler
+
+        _, next_tokens = fused_sampler(
+            sampling_metadata.pre_token_ids,
+            sampling_metadata.prompt_ids,
+            sampling_metadata.seq_lens_encoder,
+            sampling_metadata.seq_lens_decoder,
+            sampling_metadata.step_idx,
+            sampling_metadata.stop_flags,
+            logits,
+            sampling_metadata.repetition_penalties,
+            sampling_metadata.frequency_penalties,
+            sampling_metadata.presence_penalties,
+            sampling_metadata.temperature,
+            sampling_metadata.bad_words_token_ids,
+            sampling_metadata.step_idx,
+            sampling_metadata.min_dec_lens,
+            sampling_metadata.eos_token_ids,
+            sampling_metadata.top_p,
+            rank,
+            local_rank,
+        )
+
+        if next_tokens.shape[0] != max_batch:
+            dim = next_tokens.shape[-1]
+            tmp_tokens = paddle.full((max_batch, dim), -1, dtype=next_tokens.dtype)
+            tmp_tokens = paddle.scatter(tmp_tokens, batch_ids, next_tokens[: batch_ids.shape[0], :])
+            return tmp_tokens
+
+        return next_tokens
+
 
 class SpeculativeSampler(nn.Layer):
     """
```
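When fewer sequences than `max_batch` are live, the new method scatters the sampled tokens back into a fixed-size, -1-filled buffer keyed by `batch_ids`, so downstream code always sees `max_batch` rows. The following self-contained sketch reproduces just that padding step; the shapes and token values are made up for illustration, and only the `paddle.full`/`paddle.scatter` logic mirrors the diff.

```python
import paddle

max_batch = 8
# Three live sequences occupying slots 0, 2, and 5 of the full batch (illustrative values).
next_tokens = paddle.to_tensor([[101], [102], [103]], dtype="int64")
batch_ids = paddle.to_tensor([0, 2, 5], dtype="int64")

if next_tokens.shape[0] != max_batch:
    dim = next_tokens.shape[-1]
    # -1 marks batch slots that have no live sequence this step.
    tmp_tokens = paddle.full((max_batch, dim), -1, dtype=next_tokens.dtype)
    # Write each sampled token into its sequence's slot in the full batch.
    tmp_tokens = paddle.scatter(tmp_tokens, batch_ids, next_tokens[: batch_ids.shape[0], :])
    next_tokens = tmp_tokens

print(next_tokens.numpy().ravel())  # [101  -1 102  -1  -1 103  -1  -1]
```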