diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 4c3530512..20de85344 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -542,8 +542,6 @@ class ParallelConfig:
         self.block_size: int = 64
         # Engine worker queue port
         self.engine_worker_queue_port: str = "9923"
-        # Max model len
-        self.max_model_len: int = 3072  # max_seq_len
         # cuda visible devices
         self.device_ids: str = "0"
         # Input dtype
@@ -1402,7 +1400,6 @@ class FDConfig:
         plas_attention_config: PlasAttentionConfig = None,
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
-        max_model_len: int = 8192,
         ips: str = None,
         use_warmup: bool = False,
         limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
@@ -1470,7 +1467,6 @@ class FDConfig:
                 if ip == self.host_ip:
                     self.node_rank = idx
 
-        self.max_model_len = max_model_len
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
@@ -1534,20 +1530,20 @@ class FDConfig:
         if self.scheduler_config.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
                 if paddle.is_compiled_with_xpu():
-                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len
                 else:
                     self.scheduler_config.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
                     self.scheduler_config.max_num_batched_tokens = 2048
                 else:
-                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len
 
         if self.long_prefill_token_threshold == 0:
-            self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
+            self.long_prefill_token_threshold = int(self.model_config.max_model_len * 0.04)
 
         self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
-        self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
+        self.cache_config.max_block_num_per_seq = int(self.model_config.max_model_len // self.cache_config.block_size)
         if self.model_config is not None and self.model_config.enable_mm:
             self.cache_config.enable_prefix_caching = False
 
@@ -1576,7 +1572,9 @@ class FDConfig:
             f"but now it's {self.scheduler_config.max_num_seqs}."
         )
         assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
-        assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
+        assert (
+            self.model_config.max_model_len >= 16
+        ), f"max_model_len: {self.model_config.max_model_len} should be larger than 16"
         assert (
             self.scheduler_config.max_num_seqs >= 1
         ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1"
@@ -1585,10 +1583,11 @@ class FDConfig:
             f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}"
         )
         assert (
-            self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs
+            self.scheduler_config.max_num_batched_tokens
+            <= self.model_config.max_model_len * self.scheduler_config.max_num_seqs
         ), (
             f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger"
-            f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}"
+            f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.model_config.max_model_len}"
         )
         assert (
             self.max_num_partial_prefills >= 1
@@ -1609,9 +1608,9 @@ class FDConfig:
 
         if not self.cache_config.enable_chunked_prefill:
             if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, (
+                assert self.scheduler_config.max_num_batched_tokens >= self.model_config.max_model_len, (
                     f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
-                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
+                    f"should be larger than or equal to max_model_len: {self.model_config.max_model_len}"
                 )
         else:
             assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, (
@@ -1623,9 +1622,9 @@ class FDConfig:
             assert (
                 self.cache_config.enable_chunked_prefill is True
             ), "Chunked prefill must be enabled to set max_num_partial_prefills > 1"
-            assert self.long_prefill_token_threshold < self.max_model_len, (
+            assert self.long_prefill_token_threshold < self.model_config.max_model_len, (
                 f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"
-                f" max_model_len: {self.max_model_len}"
+                f" max_model_len: {self.model_config.max_model_len}"
             )
 
         if self.guided_decoding_backend is not None:
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index fcb7088d6..27b8eecaf 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -1079,7 +1079,6 @@ class EngineArgs:
             cache_config=cache_cfg,
             load_config=load_cfg,
             parallel_config=parallel_cfg,
-            max_model_len=self.max_model_len,
             speculative_config=speculative_cfg,
             ips=self.ips,
             use_warmup=self.use_warmup,
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index e10bc93c4..93dc7258a 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -630,7 +630,7 @@ class EngineService:
                     available_blocks=available_blocks,
                     block_size=self.cfg.cache_config.block_size,
                     reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
-                    max_num_batched_tokens=self.cfg.max_model_len,
+                    max_num_batched_tokens=self.cfg.model_config.max_model_len,
                     batch=num_prefill_batch,
                 )
                 if self.cfg.scheduler_config.splitwise_role != "mixed":
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index f65bc20a2..ca7545667 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -187,7 +187,7 @@ class LLMEngine:
             num_gpu_blocks = self.cfg.cache_config.num_gpu_blocks_override or self.cfg.cache_config.total_block_num
             num_cpu_blocks = self.cfg.cache_config.num_cpu_blocks
             max_running_requests = min(
-                (num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.max_model_len,
+                (num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.model_config.max_model_len,
                 self.cfg.scheduler_config.max_num_seqs,
             )
             console_logger.info(
@@ -195,7 +195,7 @@ class LLMEngine:
             )
             console_logger.info(
                 f"FastDeploy will be serving {max_running_requests} running requests "
-                f"if each sequence reaches its maximum length: {self.cfg.max_model_len}"
+                f"if each sequence reaches its maximum length: {self.cfg.model_config.max_model_len}"
             )
 
         return True
@@ -248,19 +248,19 @@ class LLMEngine:
         chat_template_kwargs = kwargs.get("chat_template_kwargs") or {}
         chat_template_kwargs["chat_template"] = kwargs.get("chat_template")
         kwargs["chat_template_kwargs"] = chat_template_kwargs
-        request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
+        request = self.data_processor.process_request(request, self.cfg.model_config.max_model_len, **kwargs)
         request.prompt_token_ids_len = len(request.prompt_token_ids)
         request.need_prefill_tokens = request.prompt_token_ids_len
         input_ids_len = request.prompt_token_ids_len
         request.set(
             "max_tokens",
             min(
-                self.cfg.max_model_len - input_ids_len,
+                self.cfg.model_config.max_model_len - input_ids_len,
                 request.get("max_tokens"),
             ),
         )
         min_tokens = request.get("min_tokens")
-        if input_ids_len + min_tokens >= self.cfg.max_model_len:
+        if input_ids_len + min_tokens >= self.cfg.model_config.max_model_len:
             error_msg = (
                 f"Input text is too long, length of prompt token({input_ids_len}) "
                 f"+ min_dec_len ({min_tokens}) >= max_model_len "
@@ -268,10 +268,8 @@ class LLMEngine:
             llm_logger.error(error_msg)
             raise EngineError(error_msg, error_code=400)
 
-        if input_ids_len > self.cfg.max_model_len:
-            error_msg = (
-                f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.max_model_len})."
-            )
+        if input_ids_len > self.cfg.model_config.max_model_len:
+            error_msg = f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.model_config.max_model_len})."
             llm_logger.error(error_msg)
             raise EngineError(error_msg, error_code=400)
 
@@ -506,7 +504,7 @@ class LLMEngine:
             ips = ",".join(self.cfg.ips)
         arguments = (
             f" --devices {self.cfg.device_ids} {py_script}"
-            f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
+            f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.model_config.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
             f" --model {self.cfg.model_config.model!s}"
             f" --device_ids {self.cfg.device_ids}"
@@ -587,7 +585,7 @@ class LLMEngine:
                     prompts["prompt"] = query_list
 
         if "max_tokens" not in prompts:
-            prompts["max_tokens"] = self.cfg.max_model_len
+            prompts["max_tokens"] = self.cfg.model_config.max_model_len
 
         self.add_requests(prompts)
         return prompts["request_id"]
diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py
index 207203d67..302b65d25 100644
--- a/fastdeploy/entrypoints/llm.py
+++ b/fastdeploy/entrypoints/llm.py
@@ -93,7 +93,7 @@ class LLM:
         # Create the Engine
         self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args)
 
-        self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.max_model_len)
+        self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.model_config.max_model_len)
 
         self.llm_engine.start()
 
diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
index d42c4b80c..4e015e003 100644
--- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -85,7 +85,7 @@ class AppendAttentionBackend(AttentionBackend):
         super().__init__()
         self.attention_metadata: AppendAttentionMetadata = None
         self.block_size: int = fd_config.cache_config.block_size
-        self.max_seq_len: int = fd_config.parallel_config.max_model_len
+        self.max_seq_len: int = fd_config.model_config.max_model_len
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )
diff --git a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py
index 418876271..b16a66817 100644
--- a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py
@@ -81,7 +81,7 @@ class BlockAttentionBackend(AttentionBackend):
         super().__init__()
         self.attention_metadata: BlockAttentionMetadata = None
         self.block_size = fd_config.cache_config.block_size
-        self.max_seq_len = fd_config.parallel_config.max_model_len
+        self.max_seq_len = fd_config.model_config.max_model_len
         self.rope_theta = 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         self.rank = fd_config.parallel_config.tensor_parallel_rank
 
diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
index 15750d090..ee57c7754 100644
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -110,7 +110,7 @@ class FlashAttentionBackend(AttentionBackend):
         """
         super().__init__()
         self.attention_metadata: FlashAttentionMetadata = None
-        self.max_seq_len = fd_config.parallel_config.max_model_len
+        self.max_seq_len = fd_config.model_config.max_model_len
         self.causal = getattr(fd_config.model_config, "causal", True)
 
         self.kv_num_heads = kv_num_heads
diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
index 355400a31..db3a09ce8 100644
--- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
@@ -73,7 +73,7 @@ class IluvatarAttnBackend(AttentionBackend):
         self.attention_metadata = IluvatarAttentionMetadata()
         self.block_size = fd_config.parallel_config.block_size
         assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
-        self.max_context_len = fd_config.parallel_config.max_model_len
+        self.max_context_len = fd_config.model_config.max_model_len
         self.causal = getattr(fd_config.model_config, "causal", True)
         self.speculate_method = getattr(fd_config.parallel_config, "speculate_method", None)
         self.use_speculate = self.speculate_method is not None
diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
index 896742962..5c283c84d 100644
--- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
+++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
@@ -111,7 +111,7 @@ class MLAAttentionBackend(AttentionBackend):
 
         # 基础配置
         self.block_size: int = fd_config.cache_config.block_size
-        self.max_seq_len: int = fd_config.parallel_config.max_model_len
+        self.max_seq_len: int = fd_config.model_config.max_model_len
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )
diff --git a/fastdeploy/model_executor/layers/attention/moba_attention_backend.py b/fastdeploy/model_executor/layers/attention/moba_attention_backend.py
index 82ac4880b..04183922e 100644
--- a/fastdeploy/model_executor/layers/attention/moba_attention_backend.py
+++ b/fastdeploy/model_executor/layers/attention/moba_attention_backend.py
@@ -76,7 +76,7 @@ class PlasAttentionBackend(AttentionBackend):
         self.attention_metadata: PlasAttentionMetadata = None
         assert fd_config.plas_attention_config is not None, "plas_attention_config is None"
         self.block_size = fd_config.parallel_config.block_size
-        self.max_seq_len = fd_config.parallel_config.max_model_len
+        self.max_seq_len = fd_config.model_config.max_model_len
         self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
         self.kv_num_heads = kv_num_heads
         self.num_heads = num_heads
diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
index 62ad01d9f..5735abf6f 100644
--- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
@@ -79,7 +79,7 @@ class XPUAttentionBackend(AttentionBackend):
         super().__init__()
         self.attention_metadata: XPUAttentionMetadata = None
         self.block_size: int = fd_config.cache_config.block_size
-        self.max_seq_len: int = fd_config.parallel_config.max_model_len
+        self.max_seq_len: int = fd_config.model_config.max_model_len
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )
diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
index a7135a0e0..8487dda5f 100644
--- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
@@ -85,7 +85,7 @@ class GCUFlashAttnBackend(AttentionBackend):
         super().__init__()
         self.attention_metadata: GCUFlashAttnMetadata = None
         self.block_size = fd_config.cache_config.block_size
-        self.max_seq_len = fd_config.parallel_config.max_model_len
+        self.max_seq_len = fd_config.model_config.max_model_len
         self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
 
         self.causal = getattr(fd_config.model_config, "causal", True)
diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
index 4901ecce2..7a6eedcd3 100644
--- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
@@ -83,7 +83,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
         super().__init__()
         self.attention_metadata: GCUMemEfficientAttnMetadata = None
         self.block_size = fd_config.cache_config.block_size
-        self.max_seq_len = fd_config.parallel_config.max_model_len
+        self.max_seq_len = fd_config.model_config.max_model_len
         self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
 
         self.causal = getattr(fd_config.model_config, "causal", True)
diff --git a/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py b/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py
index 962b6e113..b580d7ad8 100644
--- a/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py
@@ -168,7 +168,7 @@ class HPUAttentionBackend(AttentionBackend_HPU):
         self.attention_metadata: HPUAttentionMetadata = None
         # TODO(gongshaotian): Use llm_config parameters in the correct location
         self.block_size = llm_config.parallel_config.block_size
-        self.max_seq_len = llm_config.parallel_config.max_model_len
+        self.max_seq_len = llm_config.model_config.max_model_len
         self.rope_theta = 10000.0 if llm_config.model_config.rope_theta is None else llm_config.model_config.rope_theta
         self.rope_3d = getattr(llm_config.model_config, "rope_3d", False)
         self.causal = getattr(llm_config.model_config, "causal", True)
diff --git a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py
index 8b673d23f..a19ed32cb 100644
--- a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py
@@ -90,7 +90,7 @@ class FlashAttentionBackend(AttentionBackend):
         self.attention_metadata: FlashAttentionMetadata = None
         self.record_block_table_metadata = {}
         self.block_size: int = fd_config.parallel_config.block_size
-        self.max_seq_len: int = fd_config.parallel_config.max_model_len
+        self.max_seq_len: int = fd_config.model_config.max_model_len
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index 7be6d2b5c..6a96adeab 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -403,27 +403,27 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
 @cuda_graph_buffers(
     {
         "text_input": {
-            "shape": ["parallel_config.max_model_len", "model_config.hidden_size"],
+            "shape": ["model_config.max_model_len", "model_config.hidden_size"],
             "dtype": "model_config.dtype",
             "value": 1,
         },
         "image_input": {
-            "shape": ["parallel_config.max_model_len", "model_config.hidden_size"],
+            "shape": ["model_config.max_model_len", "model_config.hidden_size"],
             "dtype": "model_config.dtype",
             "value": 1,
         },
         "text_index": {
-            "shape": ["parallel_config.max_model_len"],
+            "shape": ["model_config.max_model_len"],
             "dtype": "int32",
             "value": 0,
         },
         "image_index": {
-            "shape": ["parallel_config.max_model_len"],
+            "shape": ["model_config.max_model_len"],
             "dtype": "int32",
             "value": 0,
         },
         "token_type_ids": {
-            "shape": ["parallel_config.max_model_len"],
+            "shape": ["model_config.max_model_len"],
             "dtype": "int32",
             "value": -1,
         },
@@ -568,7 +568,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
 
         # Persistent buffers for CUDA graphs.
         self._input_embeddings = paddle.zeros(
-            [fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size],
+            [fd_config.model_config.max_model_len, fd_config.model_config.hidden_size],
             dtype=fd_config.model_config.dtype,
         )
 
diff --git a/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py b/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py
index ddd3cd483..3955deb9a 100644
--- a/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py
+++ b/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py
@@ -154,7 +154,7 @@ class Qwen2_5_VLForConditionalGeneration(ModelForCasualLM):
 
         # Persistent buffers for CUDA graphs.
         self._input_embeddings = paddle.zeros(
-            [fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size],
+            [fd_config.model_config.max_model_len, fd_config.model_config.hidden_size],
             dtype=fd_config.model_config.dtype,
         )
 
diff --git a/fastdeploy/spec_decode/base.py b/fastdeploy/spec_decode/base.py
index a7d8f2266..458f8e579 100644
--- a/fastdeploy/spec_decode/base.py
+++ b/fastdeploy/spec_decode/base.py
@@ -54,7 +54,7 @@ class Proposer(ABC):
         self.scheduler_config = self.fd_config.scheduler_config
 
         self.max_num_seqs = self.scheduler_config.max_num_seqs
-        self.max_model_len = self.parallel_config.max_model_len
+        self.max_model_len = self.model_config.max_model_len
         self.speculative_method = self.speculative_config.method
         self.max_draft_token_num = self.speculative_config.num_speculative_tokens
         self.num_model_steps = self.speculative_config.num_model_steps
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 945962d55..14a3e4fec 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -117,7 +117,7 @@ class MTPProposer(Proposer):
 
         input_length = min(
             num_tokens // batch_size,
-            self.parallel_config.max_model_len - max_dec_len,
+            self.model_config.max_model_len - max_dec_len,
         )
         block_num = (
             input_length + self.cache_config.block_size - 1
@@ -306,7 +306,7 @@ class MTPProposer(Proposer):
         self.model_inputs["block_tables"] = paddle.clone(self.target_model_inputs["block_tables"])
         self.model_inputs["input_ids"] = paddle.clone(self.target_model_inputs["input_ids"])
         self.model_inputs["input_ids_cpu"] = paddle.full(
-            shape=[self.max_num_seqs, self.parallel_config.max_model_len],
+            shape=[self.max_num_seqs, self.model_config.max_model_len],
             fill_value=-1,
             dtype="int64",
         ).cpu()
@@ -334,7 +334,7 @@ class MTPProposer(Proposer):
             [self.max_model_len * self.fd_config.max_prefill_batch, self.model_config.hidden_size], 0, dtype="bfloat16"
         )
 
-        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1))
         self.model_inputs["rope_emb"] = get_rope(
             rotary_dim=self.model_config.head_dim,
             position_ids=tmp_position_ids,
@@ -764,7 +764,7 @@ class MTPProposer(Proposer):
                     self.model_inputs["seq_lens_decoder"],
                     self.model_inputs["seq_lens_encoder"],
                     self.model_inputs["output_padding_offset"],
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                 )
 
                 # 4. Compute logits, Sample
diff --git a/fastdeploy/splitwise/internal_adapter_utils.py b/fastdeploy/splitwise/internal_adapter_utils.py
index 0e1ba4494..eabae716d 100644
--- a/fastdeploy/splitwise/internal_adapter_utils.py
+++ b/fastdeploy/splitwise/internal_adapter_utils.py
@@ -61,7 +61,7 @@ class InternalAdapter:
             "dec_token_num": int(self.cfg.cache_config.dec_token_num),
             "available_resource": float(1.0 * available_block_num / self.cfg.cache_config.total_block_num),
             "max_batch_size": int(available_batch_size),
-            "max_input_token_num": self.cfg.max_model_len,
+            "max_input_token_num": self.cfg.model_config.max_model_len,
             "unhandled_request_num": self.engine.scheduler.get_unhandled_request_num(),
             "available_batch": int(self.engine.resource_manager.available_batch()),
         }
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 332659118..5c0580ea8 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -302,7 +302,7 @@ class GCUModelRunner(ModelRunnerBase):
         max_dec_len = expected_decode_len + 1
         full_length = min(
             num_tokens // batch_size,
-            self.parallel_config.max_model_len - max_dec_len,
+            self.model_config.max_model_len - max_dec_len,
         )
         input_length = int(full_length * self.cache_config.kv_cache_ratio)
         block_num = (
@@ -344,17 +344,17 @@ class GCUModelRunner(ModelRunnerBase):
         self.share_inputs = {}
 
         self.share_inputs["pre_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             -1,
             dtype="int64",
         )
         self.share_inputs["input_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
         self.share_inputs["prompt_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
@@ -417,7 +417,7 @@ class GCUModelRunner(ModelRunnerBase):
         self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
 
         self.share_inputs["ids_remove_padding"] = paddle.full(
-            [max_num_seqs * self.parallel_config.max_model_len],
+            [max_num_seqs * self.model_config.max_model_len],
             0,
             dtype="int64",
         )
@@ -439,7 +439,7 @@ class GCUModelRunner(ModelRunnerBase):
         self.share_inputs["max_len_kv_cpu"] = None  # CPU
 
         # Initialize rotary position embedding
-        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1))
         self.share_inputs["rope_emb"] = get_rope(
             rotary_dim=self.model_config.head_dim,
             position_ids=tmp_position_ids,
@@ -449,7 +449,7 @@ class GCUModelRunner(ModelRunnerBase):
 
         # Set block tables
         pre_max_block_num = (
-            self.parallel_config.max_model_len + self.cache_config.block_size - 1
+            self.model_config.max_model_len + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
         self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
 
@@ -478,7 +478,7 @@ class GCUModelRunner(ModelRunnerBase):
         if self.speculative_decoding:
             max_draft_token_num = self.speculative_config.num_speculative_tokens
             self.share_inputs["input_ids_cpu"] = paddle.full(
-                shape=[max_num_seqs, self.parallel_config.max_model_len],
+                shape=[max_num_seqs, self.model_config.max_model_len],
                 fill_value=1,
                 dtype="int64",
             ).cpu()
@@ -779,7 +779,7 @@ class GCUModelRunner(ModelRunnerBase):
                 (
                     self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
                 ),  # speculative decoding requires
-                self.parallel_config.max_model_len,
+                self.model_config.max_model_len,
             )
 
             # 4. Execute spec decode
@@ -802,7 +802,7 @@ class GCUModelRunner(ModelRunnerBase):
                 self.sampler(
                     logits,
                     self.sampling_metadata,
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                     self.share_inputs,
                 )
                 sampler_output = None
@@ -1002,7 +1002,7 @@ class GCUModelRunner(ModelRunnerBase):
             self.share_inputs["seq_lens_decoder"],
             self.share_inputs["seq_lens_encoder"],
             (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
-            self.parallel_config.max_model_len,
+            self.model_config.max_model_len,
         )
 
         # 4. Compute logits, Sample
@@ -1030,7 +1030,7 @@ class GCUModelRunner(ModelRunnerBase):
             self.sampler(
                 logits,
                 self.sampling_metadata,
-                self.parallel_config.max_model_len,
+                self.model_config.max_model_len,
                 self.share_inputs,
             )
             sampler_output = None
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 7eda4f8e1..1c55b4679 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -708,7 +708,7 @@ class GPUModelRunner(ModelRunnerBase):
         max_dec_len = expected_decode_len + 1
         input_length = min(
             num_tokens // (1 if capture_prefill else batch_size),
-            self.parallel_config.max_model_len - max_dec_len,
+            self.model_config.max_model_len - max_dec_len,
         )
 
         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
@@ -773,17 +773,17 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs = {}
 
         self.share_inputs["pre_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             -1,
             dtype="int64",
         )
         self.share_inputs["input_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
         self.share_inputs["prompt_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
@@ -850,12 +850,12 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
 
         self.share_inputs["ids_remove_padding"] = paddle.full(
-            [max_num_seqs * self.parallel_config.max_model_len],
+            [max_num_seqs * self.model_config.max_model_len],
             0,
             dtype="int64",
         )
         self.share_inputs["batch_id_per_token"] = paddle.full(
-            [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32"
+            [max_num_seqs * self.model_config.max_model_len, 1], 0, dtype="int32"
         )
         self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32")
         self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32")
@@ -876,7 +876,7 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["max_len_kv_cpu"] = None  # CPU
 
         # Initialize rotary position embedding
-        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1))
 
         # Initialize thinking related buffers
         self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
@@ -895,7 +895,7 @@ class GPUModelRunner(ModelRunnerBase):
 
         # Set block tables
         pre_max_block_num = (
-            self.parallel_config.max_model_len + self.cache_config.block_size - 1
+            self.model_config.max_model_len + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
         self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
 
@@ -927,7 +927,7 @@ class GPUModelRunner(ModelRunnerBase):
         if self.speculative_decoding:
             max_draft_token_num = self.speculative_config.num_speculative_tokens
             self.share_inputs["input_ids_cpu"] = paddle.full(
-                shape=[max_num_seqs, self.parallel_config.max_model_len],
+                shape=[max_num_seqs, self.model_config.max_model_len],
                 fill_value=1,
                 dtype="int64",
             ).cpu()
@@ -974,7 +974,7 @@ class GPUModelRunner(ModelRunnerBase):
                     max_num_seqs,
                     2,
                     1,
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                     1,
                     rope_head_dim,
                 ],
@@ -1373,7 +1373,7 @@ class GPUModelRunner(ModelRunnerBase):
                 (
                     self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
                 ),  # speculative decoding requires
-                self.parallel_config.max_model_len,
+                self.model_config.max_model_len,
             )
 
             logits = None
@@ -1405,7 +1405,7 @@ class GPUModelRunner(ModelRunnerBase):
                 self.sampler(
                     logits,
                     self.sampling_metadata,
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                     self.share_inputs,
                     accept_all_drafts,
                 )
@@ -1723,7 +1723,7 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["seq_lens_decoder"],
             self.share_inputs["seq_lens_encoder"],
             (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
-            self.parallel_config.max_model_len,
+            self.model_config.max_model_len,
         )
 
         logits = None
@@ -1760,7 +1760,7 @@ class GPUModelRunner(ModelRunnerBase):
             self.sampler(
                 logits,
                 self.sampling_metadata,
-                self.parallel_config.max_model_len,
+                self.model_config.max_model_len,
                 self.share_inputs,
             )
             sampler_output = None
@@ -2198,7 +2198,7 @@ class GPUModelRunner(ModelRunnerBase):
             rotary_dim=self.model_config.head_dim,
             partial_rotary_factor=1.0,
             base=self.model_config.rope_theta,
-            max_position=self.parallel_config.max_model_len,
+            max_position=self.model_config.max_model_len,
             freq_allocation=getattr(self.model_config, "freq_allocation", 20),
             model_type=self.model_config.model_type,
         )
diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py
index 4323138cf..56f84fd86 100644
--- a/fastdeploy/worker/hpu_model_runner.py
+++ b/fastdeploy/worker/hpu_model_runner.py
@@ -535,7 +535,7 @@ class HPUModelRunner(ModelRunnerBase):
         """Set dummy prefill inputs to share_inputs"""
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - max_dec_len)
+        full_length = min(num_tokens // batch_size, self.model_config.max_model_len - max_dec_len)
         input_length = int(full_length * self.cache_config.kv_cache_ratio)
         block_num = (
             input_length + self.cache_config.block_size - 1
@@ -568,11 +568,9 @@ class HPUModelRunner(ModelRunnerBase):
         self.MAX_INFER_SEED = 9223372036854775806
         self.share_inputs = {}
 
-        self.share_inputs["pre_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len], -1, dtype="int64"
-        )
+        self.share_inputs["pre_ids"] = paddle.full([max_num_seqs, self.model_config.max_model_len], -1, dtype="int64")
         self.share_inputs["input_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len], self.model_config.pad_token_id, dtype="int64"
+            [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64"
         )
         self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
@@ -627,7 +625,7 @@ class HPUModelRunner(ModelRunnerBase):
         self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
 
         self.share_inputs["ids_remove_padding"] = paddle.full(
-            [max_num_seqs * self.parallel_config.max_model_len], 0, dtype="int64"
+            [max_num_seqs * self.model_config.max_model_len], 0, dtype="int64"
         )
         self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
         self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
@@ -638,7 +636,7 @@ class HPUModelRunner(ModelRunnerBase):
         self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
 
         # Initialize rotary position embedding
-        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1))
         # TODO(gongshaotian): move to models
         self.share_inputs["rope_emb"] = get_rope(
             rotary_dim=self.model_config.head_dim,
@@ -649,7 +647,7 @@ class HPUModelRunner(ModelRunnerBase):
 
         # Set block tables
         pre_max_block_num = (
-            self.parallel_config.max_model_len + self.cache_config.block_size - 1
+            self.model_config.max_model_len + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
         self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32").cpu()
 
@@ -673,7 +671,7 @@ class HPUModelRunner(ModelRunnerBase):
         if self.speculative_decoding:
             max_draft_token_num = self.speculative_config.num_speculative_tokens
             self.share_inputs["input_ids_cpu"] = paddle.full(
-                shape=[max_num_seqs, self.parallel_config.max_model_len], fill_value=1, dtype="int64"
+                shape=[max_num_seqs, self.model_config.max_model_len], fill_value=1, dtype="int64"
             ).cpu()
             self.share_inputs["accept_tokens"] = paddle.full(
                 shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, dtype="int64"
@@ -983,7 +981,7 @@ class HPUModelRunner(ModelRunnerBase):
             # 7. Updata 'infer_seed' and step_cuda()
             self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
             self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
-            step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.parallel_config.max_model_len)
+            step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.model_config.max_model_len)
 
             if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
                 break
@@ -1082,9 +1080,7 @@ class HPUModelRunner(ModelRunnerBase):
 
     def warm_up_bucket(self) -> None:
         max_prefill_batch = 3  # Hard-Code in FastDeploy/fastdeploy/engine/config.py
-        warmup_max_model_len = min(
-            int(os.environ.get("HPU_WARMUP_MODEL_LEN", 4096)), self.parallel_config.max_model_len
-        )
+        warmup_max_model_len = min(int(os.environ.get("HPU_WARMUP_MODEL_LEN", 4096)), self.model_config.max_model_len)
         prefill_batchs = []
         prefill_batch_step = int(os.environ.get("BATCH_STEP_PREFILL", 1))
         current_prefill_batch = prefill_batch_step
@@ -1176,7 +1172,7 @@ class HPUModelRunner(ModelRunnerBase):
         capture_sizes = self.cudagraph_capture_sizes.copy()
         for batch_size in sorted(capture_sizes, reverse=True):
             self._dummy_run(
-                num_tokens=self.parallel_config.max_model_len,
+                num_tokens=self.model_config.max_model_len,
                 batch_size=batch_size,
                 in_capturing=True,
                 expected_decode_len=expected_decode_len,
@@ -1334,7 +1330,7 @@ class HPUModelRunner(ModelRunnerBase):
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
         self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
         start_time = time.time()
-        step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.parallel_config.max_model_len)
+        step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.model_config.max_model_len)
         end_time = time.time()
         execution_time = (end_time - start_time) * 1000
         hpu_model_runner_profile_logger.info(f"StepPaddle execution time(ms): {execution_time}, BT={real_bs}")
diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py
index 93368b2a4..dcce154ea 100644
--- a/fastdeploy/worker/metax_model_runner.py
+++ b/fastdeploy/worker/metax_model_runner.py
@@ -551,7 +551,7 @@ class MetaxModelRunner(ModelRunnerBase):
         max_dec_len = expected_decode_len + 1
         full_length = min(
             num_tokens // batch_size,
-            self.parallel_config.max_model_len - max_dec_len,
+            self.model_config.max_model_len - max_dec_len,
         )
 
         # When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
@@ -599,17 +599,17 @@ class MetaxModelRunner(ModelRunnerBase):
         self.share_inputs = {}
 
         self.share_inputs["pre_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             -1,
             dtype="int64",
         )
         self.share_inputs["input_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
         self.share_inputs["prompt_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
@@ -674,12 +674,12 @@ class MetaxModelRunner(ModelRunnerBase):
         self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
 
         self.share_inputs["ids_remove_padding"] = paddle.full(
-            [max_num_seqs * self.parallel_config.max_model_len],
+            [max_num_seqs * self.model_config.max_model_len],
             0,
             dtype="int64",
         )
         self.share_inputs["batch_id_per_token"] = paddle.full(
-            [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32"
+            [max_num_seqs * self.model_config.max_model_len, 1], 0, dtype="int32"
         )
         self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32")
         self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32")
@@ -691,7 +691,7 @@ class MetaxModelRunner(ModelRunnerBase):
         self.share_inputs["max_len_tensor_cpu"] = None  # CPU
 
         # Initialize rotary position embedding
-        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1))
 
         # TODO(gongshaotian): move to models
         if not self.enable_mm:
@@ -704,7 +704,7 @@ class MetaxModelRunner(ModelRunnerBase):
 
         # Set block tables
         pre_max_block_num = (
-            self.parallel_config.max_model_len + self.cache_config.block_size - 1
+            self.model_config.max_model_len + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
         self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
 
@@ -736,7 +736,7 @@ class MetaxModelRunner(ModelRunnerBase):
         if self.speculative_decoding:
             max_draft_token_num = self.speculative_config.num_speculative_tokens
             self.share_inputs["input_ids_cpu"] = paddle.full(
-                shape=[max_num_seqs, self.parallel_config.max_model_len],
+                shape=[max_num_seqs, self.model_config.max_model_len],
                 fill_value=1,
                 dtype="int64",
             ).cpu()
@@ -771,7 +771,7 @@ class MetaxModelRunner(ModelRunnerBase):
                     max_num_seqs,
                     2,
                     1,
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                     1,
                     head_dim // 2,
                 ],
@@ -1075,7 +1075,7 @@ class MetaxModelRunner(ModelRunnerBase):
                     (
                         self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
                     ),  # speculative decoding requires
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                 )
 
             # 4. Execute spec decode
@@ -1098,7 +1098,7 @@ class MetaxModelRunner(ModelRunnerBase):
                 self.sampler(
                     logits,
                     self.sampling_metadata,
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                     self.share_inputs,
                 )
                 sampler_output = None
@@ -1338,7 +1338,7 @@ class MetaxModelRunner(ModelRunnerBase):
                 self.share_inputs["seq_lens_decoder"],
                 self.share_inputs["seq_lens_encoder"],
                 (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
-                self.parallel_config.max_model_len,
+                self.model_config.max_model_len,
             )
 
         # 4. Compute logits, Sample
@@ -1366,7 +1366,7 @@ class MetaxModelRunner(ModelRunnerBase):
             self.sampler(
                 logits,
                 self.sampling_metadata,
-                self.parallel_config.max_model_len,
+                self.model_config.max_model_len,
                 self.share_inputs,
             )
             sampler_output = None
@@ -1707,7 +1707,7 @@ class MetaxModelRunner(ModelRunnerBase):
             rotary_dim=self.model_config.head_dim,
             partial_rotary_factor=1.0,
             base=self.model_config.rope_theta,
-            max_position=self.parallel_config.max_model_len,
+            max_position=self.model_config.max_model_len,
             freq_allocation=getattr(self.model_config, "freq_allocation", 20),
         )
         return rope_emb
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index abb886290..b2dcc7920 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -117,7 +117,7 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None:
     if fd_config.model_config.enable_mm and ErnieArchitectures.contains_ernie_arch(architectures):
         tokenizer = Ernie4_5Tokenizer.from_pretrained(
             fd_config.model_config.model,
-            model_max_length=fd_config.parallel_config.max_model_len,
+            model_max_length=fd_config.model_config.max_model_len,
             padding_side="right",
             use_fast=False,
         )
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 6cf3d3de1..f04680498 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -680,17 +680,17 @@ class XPUModelRunner(ModelRunnerBase):
         self.share_inputs = {}
 
         self.share_inputs["pre_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             -1,
             dtype="int64",
         )
         self.share_inputs["input_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
         self.share_inputs["prompt_ids"] = paddle.full(
-            [max_num_seqs, self.parallel_config.max_model_len],
+            [max_num_seqs, self.model_config.max_model_len],
             self.model_config.pad_token_id,
             dtype="int64",
         )
@@ -755,7 +755,7 @@ class XPUModelRunner(ModelRunnerBase):
         self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
 
         # Initialize rotary position embedding
-        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1))
 
         # TODO(gongshaotian): move to models
         if not self.enable_mm:
@@ -768,7 +768,7 @@ class XPUModelRunner(ModelRunnerBase):
 
         # Set block tables
         pre_max_block_num = (
-            self.parallel_config.max_model_len + self.cache_config.block_size - 1
+            self.model_config.max_model_len + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
         self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
 
@@ -805,7 +805,7 @@ class XPUModelRunner(ModelRunnerBase):
                     max_num_seqs,
                     2,
                     1,
-                    self.parallel_config.max_model_len,
+                    self.model_config.max_model_len,
                     1,
                     head_dim // 2,
                 ],
@@ -960,7 +960,7 @@ class XPUModelRunner(ModelRunnerBase):
 
     def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int):
         """Set dummy prefill inputs to share_inputs"""
-        full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10)
+        full_length = min(num_tokens // batch_size, self.model_config.max_model_len - 10)
         input_length = int(full_length - 512)
         block_num = (
             input_length + self.cache_config.block_size - 1
@@ -1344,7 +1344,7 @@ class XPUModelRunner(ModelRunnerBase):
             rotary_dim=self.model_config.head_dim,
             partial_rotary_factor=1.0,
             base=self.model_config.rope_theta,
-            max_position=self.parallel_config.max_model_len,
+            max_position=self.model_config.max_model_len,
             freq_allocation=getattr(self.model_config, "freq_allocation", 20),
             model_type=self.model_config.model_type,
         )
diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
index 5ee42b1a7..98c7383f0 100644
--- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
+++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
@@ -159,6 +159,7 @@ class TestCUDAGrpahSubgraph(unittest.TestCase):
         cache_config = CacheConfig({})
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
+        model_config.max_model_len = 512
         # Initialize cuda graph capture list
         graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py
index 535accc38..7359fd6dd 100644
--- a/tests/graph_optimization/test_cuda_graph_recapture.py
+++ b/tests/graph_optimization/test_cuda_graph_recapture.py
@@ -97,6 +97,7 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
         scheduler_config.max_num_seqs = 1
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
+        model_config.max_model_len = 5120
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
             scheduler_config=scheduler_config,
diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py
index 394c4857f..40c32ca91 100644
--- a/tests/graph_optimization/test_cuda_graph_spec_decode.py
+++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py
@@ -106,6 +106,7 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase):
         cache_config = CacheConfig({})
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
+        model_config.max_model_len = 512
         # Initialize cuda graph capture list
         graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
diff --git a/tests/graph_optimization/test_graph_opt_backend.py b/tests/graph_optimization/test_graph_opt_backend.py
index 0123d9dba..27cb4b161 100644
--- a/tests/graph_optimization/test_graph_opt_backend.py
+++ b/tests/graph_optimization/test_graph_opt_backend.py
@@ -93,6 +93,7 @@ class TestGraphOptBackend(unittest.TestCase):
         baseline_cache_config = CacheConfig({})
         baseline_parallel_config = ParallelConfig(args={})
         model_config = Mock()
+        model_config.max_model_len = 512
         self.baseline_fd_config = FDConfig(
             graph_opt_config=baseline_graph_opt_config,
             scheduler_config=baseline_scheduler_config,
@@ -141,6 +142,7 @@ class TestGraphOptBackend(unittest.TestCase):
         cache_config = CacheConfig({})
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
+        model_config.max_model_len = 512
 
         # Create FD config
         return FDConfig(
diff --git a/tests/graph_optimization/test_static_graph_cuda_graph_split.py b/tests/graph_optimization/test_static_graph_cuda_graph_split.py
index 03a07513e..8dcbf2e8f 100644
--- a/tests/graph_optimization/test_static_graph_cuda_graph_split.py
+++ b/tests/graph_optimization/test_static_graph_cuda_graph_split.py
@@ -96,6 +96,7 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
         cache_config = CacheConfig({})
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
+        model_config.max_model_len = 512
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
             scheduler_config=scheduler_config,
diff --git a/tests/utils.py b/tests/utils.py
index 5e8310fa4..36dc0360b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -48,6 +48,7 @@ class FakeModelConfig:
         self.tie_word_embeddings = True
         self.model_format = "auto"
         self.enable_mm = False
+        self.max_model_len = 512
 
 
 def get_default_test_fd_config():
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index 366520bbd..c95ef1229 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -20,6 +20,7 @@ class TestConfig(unittest.TestCase):
         load_config = LoadConfig({})
         scheduler_config = SchedulerConfig({})
         model_config = Mock()
+        model_config.max_model_len = 512
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
@@ -40,6 +41,7 @@ class TestConfig(unittest.TestCase):
         load_config = LoadConfig({})
         scheduler_config = SchedulerConfig({})
         model_config = Mock()
+        model_config.max_model_len = 512
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
@@ -59,7 +61,8 @@ class TestConfig(unittest.TestCase):
         load_config = LoadConfig({})
         cache_config.enable_chunked_prefill = True
         scheduler_config = SchedulerConfig({})
-        model_config = model_config = Mock()
+        model_config: Mock = Mock()
+        model_config.max_model_len = 512
 
         fd_config = FDConfig(
             parallel_config=parallel_config,
@@ -97,7 +100,8 @@ class TestConfig(unittest.TestCase):
         load_config = LoadConfig({})
         scheduler_config = SchedulerConfig({})
         scheduler_config.splitwise_role = "prefill"
-        model_config = model_config = Mock()
+        model_config: Mock = Mock()
+        model_config.max_model_len = 512
 
         fd_config = FDConfig(
             parallel_config=parallel_config,
diff --git a/tests/v1/test_prefix_cache.py b/tests/v1/test_prefix_cache.py
index 2d56a9c9d..a6c7c2bf9 100644
--- a/tests/v1/test_prefix_cache.py
+++ b/tests/v1/test_prefix_cache.py
@@ -16,6 +16,7 @@ def test_normal_case():
     model_cfg = SimpleNamespace(enable_mm=False)
     speculative_cfg = SimpleNamespace(method=None)
     model_cfg.print = print
+    model_cfg.max_model_len = 5120
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
     scheduler_cfg = SchedulerConfig(args)
diff --git a/tests/v1/test_schedule_output.py b/tests/v1/test_schedule_output.py
index 929098563..88988022d 100644
--- a/tests/v1/test_schedule_output.py
+++ b/tests/v1/test_schedule_output.py
@@ -15,6 +15,7 @@ def test_normal_schedule():
     model_cfg = SimpleNamespace(enable_mm=False)
     speculative_cfg = SimpleNamespace(method=None)
     model_cfg.print = print
+    model_cfg.max_model_len = 5120
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
     scheduler_cfg = SchedulerConfig(args)
@@ -79,6 +80,7 @@ def test_preempted_request():
     model_cfg = SimpleNamespace(enable_mm=False)
     speculative_cfg = SimpleNamespace(method=None)
     model_cfg.print = print
+    model_cfg.max_model_len = 5120
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
     scheduler_cfg = SchedulerConfig(args)