[Feature] Support repetition early stop (#3024)

* Support repetition early stop and allow users to set its parameters
* Remove log
* Fix code style
* Add the early_stop_config to rollout_config
* Update config and EarlyStopper class
* Fix the bug for Triton
* Modify the stop method
* Update description
* Modify the usage of stop_flags

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-10-04 08:16:42 +08:00 · 2025-07-29 22:42:54 +08:00
parent 3214fb5393
commit b2f9a42d87
13 changed files with 575 additions and 4 deletions
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -82,6 +82,7 @@ class GPUModelRunner(ModelRunnerBase):
        self.speculative_method = self.fd_config.speculative_config.method
        self.speculative_decoding = self.speculative_method is not None
        self.enable_logprob = fd_config.model_config.enable_logprob
+        self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop

        self.guided_backend = None
        if self.fd_config.parallel_config.guided_decoding_backend != "off":
@@ -108,10 +109,9 @@ class GPUModelRunner(ModelRunnerBase):
                "matmul_v2",
                "fused_gemm_epilogue",
            ]
-
        #  Sampler
        if not self.speculative_decoding:
-            self.sampler = Sampler()
+            self.sampler = Sampler(fd_config)
        else:
            self.sampler = SpeculativeSampler(fd_config)

@@ -753,6 +753,8 @@ class GPUModelRunner(ModelRunnerBase):
            bad_words_token_ids=self.share_inputs["bad_tokens"],
            eos_token_ids=self.share_inputs["eos_token_id"],
            max_num_logprobs=20 if self.enable_logprob else None,
+            enable_early_stop=self.enable_early_stop,
+            stop_flags=self.share_inputs["stop_flags"],
        )

    def load_model(self) -> None: