From af715db763abc65cb2cca1e9f93f9b6572acde19 Mon Sep 17 00:00:00 2001
From: yangjianfengo1 <125249383+yangjianfengo1@users.noreply.github.com>
Date: Thu, 20 Nov 2025 16:29:13 +0800
Subject: [PATCH] [Scheduler] Support chunk prefill for video input (#5107)

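* Add chunked prefill for video input: when a prefill chunk ends inside a video, extend the chunk end to the first index in `can_split_idx_list` that is not before it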

* Add a `vit_merge: bool` field to `VideoEncodeRequest` and pass `vit_merge=True` in test_tokenizer_client.py

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
---
 fastdeploy/engine/sched/resource_manager_v1.py | 10 +++++++++-
 fastdeploy/input/tokenzier_client.py           |  1 +
 tests/input/test_tokenizer_client.py           |  1 +
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index b74c772d3..2d6641ed9 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -40,6 +40,7 @@ from fastdeploy.engine.request import (
     RequestType,
 )
 from fastdeploy.engine.resource_manager import ResourceManager
+from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.inter_communicator import IPCSignal
 from fastdeploy.metrics.metrics import main_process_metrics
 from fastdeploy.multimodal.hasher import MultimodalHasher
@@ -391,8 +392,15 @@ class ResourceManagerV1(ResourceManager):
                     end_patch_idx -= 1
             end_patch_map = inputs["patch_map"][end_patch_idx]
             end_modal_id = end_patch_map["modal_id"]
-            if end_modal_id > 0:
+            if end_modal_id > 0 and end_modal_id != IDS_TYPE_FLAG["video"]:
                 new_end_idx = end_patch_map["end_idx"]  # 当前模态结束位置
+
+            if end_modal_id == IDS_TYPE_FLAG["video"] and "can_split_idx_list" in inputs:
+                can_split_idx_list = inputs["can_split_idx_list"]
+                for i in range(len(can_split_idx_list)):
+                    if can_split_idx_list[i] >= new_end_idx:
+                        new_end_idx = can_split_idx_list[i]
+                        break
             num_new_tokens = new_end_idx - pre_end_idx
 
             request.image_end = end_patch_map["image_num"]
diff --git a/fastdeploy/input/tokenzier_client.py b/fastdeploy/input/tokenzier_client.py
index ff013cf3c..409055c1d 100644
--- a/fastdeploy/input/tokenzier_client.py
+++ b/fastdeploy/input/tokenzier_client.py
@@ -40,6 +40,7 @@ class VideoEncodeRequest(BaseEncodeRequest):
     start_ts: int
     end_ts: int
     frames: int
+    vit_merge: bool
 
 
 class ImageDecodeRequest(BaseModel):
diff --git a/tests/input/test_tokenizer_client.py b/tests/input/test_tokenizer_client.py
index 64c50e929..ef180270f 100644
--- a/tests/input/test_tokenizer_client.py
+++ b/tests/input/test_tokenizer_client.py
@@ -58,6 +58,7 @@ async def test_encode_video_failure():
         start_ts=0.0,
         end_ts=10.0,
         frames=30,
+        vit_merge=True,
     )
 
     with pytest.raises(RuntimeError, match="Encode failed"):