mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 08:16:42 +08:00
[Feature] Support repetition early stop (#3024)
* Support repetition early stop and allow users to set the parameter
* Remove log
* Fix code style
* Add the early_stop_config to rollout_config
* Update config and EarlyStopper class
* Fix the bug for Triton
* Modify the stop method
* Update description
* Modify the usage of stop_flags

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
This commit is contained in:
@@ -82,6 +82,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.speculative_method = self.fd_config.speculative_config.method
|
||||
self.speculative_decoding = self.speculative_method is not None
|
||||
self.enable_logprob = fd_config.model_config.enable_logprob
|
||||
self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop
|
||||
|
||||
self.guided_backend = None
|
||||
if self.fd_config.parallel_config.guided_decoding_backend != "off":
|
||||
@@ -108,10 +109,9 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
"matmul_v2",
|
||||
"fused_gemm_epilogue",
|
||||
]
|
||||
|
||||
# Sampler
|
||||
if not self.speculative_decoding:
|
||||
self.sampler = Sampler()
|
||||
self.sampler = Sampler(fd_config)
|
||||
else:
|
||||
self.sampler = SpeculativeSampler(fd_config)
|
||||
|
||||
@@ -753,6 +753,8 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
bad_words_token_ids=self.share_inputs["bad_tokens"],
|
||||
eos_token_ids=self.share_inputs["eos_token_id"],
|
||||
max_num_logprobs=20 if self.enable_logprob else None,
|
||||
enable_early_stop=self.enable_early_stop,
|
||||
stop_flags=self.share_inputs["stop_flags"],
|
||||
)
|
||||
|
||||
def load_model(self) -> None:
|
||||
|
Reference in New Issue
Block a user