Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
[Feature] Set scheduler v1 as default (#3812)
@@ -1290,7 +1290,7 @@ class FDConfig:
             ), "TP and EP cannot be enabled at the same time"

         if not self.cache_config.enable_chunked_prefill:
-            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
+            if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
                 assert self.max_num_batched_tokens >= self.max_model_len, (
                     f"max_num_batched_tokens: {self.max_num_batched_tokens} "
                     f"should be larger than or equal to max_model_len: {self.max_model_len}"
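
The branch rewritten above only matters when chunked prefill is off and the v1 KV-cache scheduler is disabled; in that case the per-step batching budget has to cover a full sequence. A minimal numeric illustration (plain Python values, not the FastDeploy API):

# Illustrative values only: this is the invariant asserted in the hunk above.
max_model_len = 8192
max_num_batched_tokens = 8192  # anything smaller would trip the assert in that branch
assert max_num_batched_tokens >= max_model_len
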
@@ -392,6 +392,12 @@ class EngineArgs:
                 raise NotImplementedError("Logprob does not support enable_expert_parallel.")
             if not current_platform.is_cuda():
                 raise NotImplementedError("Only CUDA platform supports logprob.")
+        if self.speculative_config is not None:
+            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+        if self.splitwise_role != "mixed":
+            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+        if not current_platform.is_cuda():
+            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0

     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -552,8 +552,6 @@ class EngineSevice:
                 get_request_pool.submit(_fetch_request)
             # 2. Schedule requests
             tasks = self.resource_manager.schedule()
-            main_process_metrics.num_requests_waiting.dec(len(tasks))
-            main_process_metrics.num_requests_running.inc(len(tasks))
             # 3. Send to engine
             if tasks:
                 self.resource_manager.get_real_bsz()
@@ -123,6 +123,8 @@ class ResourceManagerV1(ResourceManager):
                         self.to_be_rescheduled_request_id_set.add(preempted_req.request_id)
                     preempted_reqs.append(preempted_req)
                     scheduled_reqs.append(self._prepare_preempt_task(preempted_req))
+                    main_process_metrics.num_requests_waiting.inc(1)
+                    main_process_metrics.num_requests_running.dec(1)
                     if preempted_req == request:
                         # No more request to preempt.
                         can_schedule = False
@@ -368,6 +370,8 @@ class ResourceManagerV1(ResourceManager):
                     token_budget -= num_new_tokens
                     request.num_computed_tokens += num_new_tokens
                     request.status = RequestStatus.RUNNING
+                    main_process_metrics.num_requests_waiting.dec(1)
+                    main_process_metrics.num_requests_running.inc(1)
                     allocated_position = self.get_available_position()
                     request.idx = allocated_position
                     self.tasks_list[allocated_position] = request
@@ -398,6 +402,8 @@ class ResourceManagerV1(ResourceManager):
                     token_budget -= num_new_tokens
                     request.num_computed_tokens += num_new_tokens
                     request.status = RequestStatus.RUNNING
+                    main_process_metrics.num_requests_waiting.dec(1)
+                    main_process_metrics.num_requests_running.inc(1)
                 else:
                     if self.config.cache_config.enable_prefix_caching:
                         self._free_blocks(request)
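
The counters touched in the hunks above move the waiting/running accounting into ResourceManagerV1: scheduling a waiting request decrements num_requests_waiting and increments num_requests_running, and preemption does the reverse. A minimal sketch of that accounting with prometheus_client-style gauges (the gauge names and helper functions here are illustrative, not FastDeploy's metrics module):

from prometheus_client import Gauge

# Illustrative gauges; FastDeploy keeps its own in main_process_metrics.
num_requests_waiting = Gauge("num_requests_waiting", "Requests queued but not yet scheduled")
num_requests_running = Gauge("num_requests_running", "Requests currently being executed")


def on_schedule():
    # Mirrors the added lines in the scheduling hunks: waiting -> running.
    num_requests_waiting.dec(1)
    num_requests_running.inc(1)


def on_preempt():
    # Mirrors the preemption hunk: running -> waiting.
    num_requests_waiting.inc(1)
    num_requests_running.dec(1)
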
@@ -81,7 +81,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # set traec exporter_otlp_headers.
     "EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
     # enable kv cache block scheduler v1 (no need for kv_cache_ratio)
-    "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")),
+    "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
     # Whether to use PLUGINS.
     "FD_PLUGINS": lambda: None if "FD_PLUGINS" not in os.environ else os.environ["FD_PLUGINS"].split(","),
     # set trace attribute job_id.
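
With the default flipped from "0" to "1", the v1 KV-cache scheduler is now enabled unless the environment variable says otherwise. A minimal sketch of opting back out from Python; it assumes the flag is read lazily through the envs module, as the lambda above suggests:

import os

# Set the override before the flag is read; the envs entry is a lazy lambda,
# so the value is picked up on access.
os.environ["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"

from fastdeploy import envs  # noqa: E402

print(envs.ENABLE_V1_KVCACHE_SCHEDULER)  # expected: 0
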
@@ -105,5 +105,10 @@ def __getattr__(name: str):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


+def __setattr__(name: str, value: Any):
+    assert name in environment_variables
+    environment_variables[name] = lambda: value
+
+
 def __dir__():
     return list(environment_variables.keys())
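
For reference, this module relies on PEP 562 module-level __getattr__/__dir__: the flags live in a dict of zero-argument lambdas and are resolved on attribute access, and the new __setattr__ helper stores an override as a fresh lambda. A condensed standalone sketch of the pattern (trimmed to one flag; not the full envs module):

import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
}


def __getattr__(name: str):
    # PEP 562: called for module attributes not found by normal lookup.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __setattr__(name: str, value: Any):
    # Helper added in this hunk: keep the override inside the dict.
    assert name in environment_variables
    environment_variables[name] = lambda: value


def __dir__():
    return list(environment_variables.keys())
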
@@ -1337,7 +1337,11 @@ class GPUModelRunner(ModelRunnerBase):
             A list of indices corresponding to the requests that need to be skipped.
         """
         skip_idx_list = []
-        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
+        if (
+            not self.cache_config.enable_chunked_prefill
+            or self.guided_backend is None
+            or envs.ENABLE_V1_KVCACHE_SCHEDULER
+        ):
             return skip_idx_list

         for task in model_forward_batch:
@@ -740,6 +740,20 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
     logger.info(f"- Load strategy: {load_config.load_strategy}")

+    if (
+        args.speculative_config is not None
+        and ("method" in args.speculative_config)
+        and (args.speculative_config["method"] is not None)
+    ):
+        logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not support speculative decoding now.")
+        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+    if args.splitwise_role != "mixed":
+        logger.info(f"Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported {args.splitwise_role} now.")
+        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+    if not current_platform.is_cuda():
+        logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
+        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
+
     fd_config = FDConfig(
         model_config=model_config,
         parallel_config=parallel_config,
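
Read together, the three fallbacks above amount to one predicate: the v1 scheduler stays enabled only for mixed-role, CUDA deployments without speculative decoding. A hedged sketch of that predicate as a standalone function (the function name and arguments are assumptions for illustration, not FastDeploy API):

def v1_scheduler_supported(speculative_config, splitwise_role, is_cuda) -> bool:
    """Mirror of the fallback conditions in the hunk above (illustrative only)."""
    uses_speculative = (
        speculative_config is not None
        and "method" in speculative_config
        and speculative_config["method"] is not None
    )
    return (not uses_speculative) and splitwise_role == "mixed" and is_cuda


# Example: a configured speculative method forces the fallback to the legacy scheduler.
assert not v1_scheduler_supported({"method": "mtp"}, "mixed", True)
assert v1_scheduler_supported(None, "mixed", True)
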
@@ -1,5 +1,6 @@
 import unittest

+from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
     FDConfig,
@@ -48,7 +49,8 @@ class TestConfig(unittest.TestCase):
             ips="0.0.0.0",
             test_mode=True,
         )
-        assert fd_config.max_num_batched_tokens == 2048
+        if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            assert fd_config.max_num_batched_tokens == 2048

         cache_config.enable_chunked_prefill = False
         fd_config = FDConfig(
@@ -58,7 +60,8 @@ class TestConfig(unittest.TestCase):
             ips="0.0.0.0",
             test_mode=True,
         )
-        assert fd_config.max_num_batched_tokens == 8192
+        if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            assert fd_config.max_num_batched_tokens == 8192

     def test_fdconfig_init_cache(self):
         parallel_config = ParallelConfig({})