From 91912cc2e1f17c0b804eaba1cd87ad8d382b928a Mon Sep 17 00:00:00 2001 From: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:07:13 +0800 Subject: [PATCH] fix t2i (#4163) Co-authored-by: Yuanle Liu --- fastdeploy/config.py | 4 +++- fastdeploy/engine/args_utils.py | 2 +- fastdeploy/engine/request.py | 1 + fastdeploy/envs.py | 2 ++ fastdeploy/scheduler/local_scheduler.py | 1 + 5 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index e02389d8a..4b5cdb500 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -890,7 +890,9 @@ class CacheConfig: self.kv_cache_ratio = 1.0 else: self.kv_cache_ratio = 0.75 - self.enc_dec_block_num = 0 if current_platform.is_iluvatar() or current_platform.is_maca() else 2 + self.enc_dec_block_num = ( + 0 if current_platform.is_iluvatar() or current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM + ) self.prealloc_dec_block_slot_num_threshold = 12 self.cache_dtype = "bfloat16" self.model_cfg = None diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index e05e347b3..3dccb9cdc 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -704,7 +704,7 @@ class EngineArgs: cache_group.add_argument( "--prealloc-dec-block-slot-num-threshold", type=int, - default=12, + default=EngineArgs.prealloc_dec_block_slot_num_threshold, help="Number of token slot threadshold to allocate next blocks for decoding.", ) diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 01ee9571e..2a0def97a 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -304,6 +304,7 @@ class CompletionOutput: "index": self.index, "send_idx": self.send_idx, "token_ids": self.token_ids, + "decode_type": self.decode_type, "logprob": self.logprob, "top_logprobs": self.top_logprobs, "logprobs": self.logprobs, diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 06a919ab6..32071f682 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -82,6 +82,8 @@ environment_variables: dict[str, Callable[[], Any]] = { "EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"), # enable kv cache block scheduler v1 (no need for kv_cache_ratio) "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")), + # set prealloc block num for decoder + "FD_ENC_DEC_BLOCK_NUM": lambda: int(os.getenv("FD_ENC_DEC_BLOCK_NUM", "2")), # Whether to use PLUGINS. "FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","), # set trace attribute job_id. diff --git a/fastdeploy/scheduler/local_scheduler.py b/fastdeploy/scheduler/local_scheduler.py index 159dd447d..fd4ec3757 100644 --- a/fastdeploy/scheduler/local_scheduler.py +++ b/fastdeploy/scheduler/local_scheduler.py @@ -306,6 +306,7 @@ class LocalScheduler: if response.request_id not in self.responses: self.responses[response.request_id] = [response] continue + scheduler_logger.debug(f"append response {response.raw}") self.responses[response.request_id].append(response) self.responses_not_empty.notify_all()