diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index c97221723..2b4e583d7 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -416,7 +416,7 @@ class Sampler(nn.Layer):
 
         if next_tokens.shape[0] != max_batch:
             dim = next_tokens.shape[-1]
-            tmp_tokens = paddle.full((max_batch, dim), -1, dtype=next_tokens.dtype)
+            tmp_tokens = paddle.full((max_batch, dim), -1 if local_rank == 0 else 0, dtype=next_tokens.dtype)
             tmp_tokens = paddle.scatter(tmp_tokens, batch_ids, next_tokens[: batch_ids.shape[0], :])
             return tmp_tokens
 
diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py
index 56f84fd86..3f29646c4 100644
--- a/fastdeploy/worker/hpu_model_runner.py
+++ b/fastdeploy/worker/hpu_model_runner.py
@@ -24,6 +24,7 @@
 import paddle.nn as nn
 from paddleformers.utils.log import logger
 
 from fastdeploy.config import FDConfig
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce_custom
 from fastdeploy.engine.request import Request
 # from fastdeploy.spec_decode import MTPProposer, NgramProposer
@@ -944,7 +945,7 @@ class HPUModelRunner(ModelRunnerBase):
         if self.parallel_config.tensor_parallel_size > 1:
             dtype = sampled_token_ids.dtype
             sampled_token_ids = sampled_token_ids.to("float32")
-            paddle.distributed.broadcast(sampled_token_ids, 0)
+            tensor_model_parallel_all_reduce_custom(sampled_token_ids)
             sampled_token_ids = sampled_token_ids.to(dtype)
 
         # 6. post process
@@ -1272,7 +1273,7 @@ class HPUModelRunner(ModelRunnerBase):
         if self.parallel_config.tensor_parallel_size > 1:
             dtype = sampled_token_ids.dtype
             sampled_token_ids = sampled_token_ids.to("float32")
-            paddle.distributed.broadcast(sampled_token_ids, 0)
+            tensor_model_parallel_all_reduce_custom(sampled_token_ids)
             sampled_token_ids = sampled_token_ids.to(dtype)
         if self.is_hpu_perf_breakdown_sync_mode:
             sampled_token_ids.cpu()