diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 4c3530512..20de85344 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -542,8 +542,6 @@ class ParallelConfig: self.block_size: int = 64 # Engine worker queue port self.engine_worker_queue_port: str = "9923" - # Max model len - self.max_model_len: int = 3072 # max_seq_len # cuda visible devices self.device_ids: str = "0" # Input dtype @@ -1402,7 +1400,6 @@ class FDConfig: plas_attention_config: PlasAttentionConfig = None, speculative_config: SpeculativeConfig = None, tokenizer: str = None, - max_model_len: int = 8192, ips: str = None, use_warmup: bool = False, limit_mm_per_prompt: Optional[Dict[str, Any]] = None, @@ -1470,7 +1467,6 @@ class FDConfig: if ip == self.host_ip: self.node_rank = idx - self.max_model_len = max_model_len self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs self.use_warmup = use_warmup @@ -1534,20 +1530,20 @@ class FDConfig: if self.scheduler_config.max_num_batched_tokens is None: if int(envs.ENABLE_V1_KVCACHE_SCHEDULER): if paddle.is_compiled_with_xpu(): - self.scheduler_config.max_num_batched_tokens = self.max_model_len + self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len else: self.scheduler_config.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM else: if self.cache_config.enable_chunked_prefill: self.scheduler_config.max_num_batched_tokens = 2048 else: - self.scheduler_config.max_num_batched_tokens = self.max_model_len + self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len if self.long_prefill_token_threshold == 0: - self.long_prefill_token_threshold = int(self.max_model_len * 0.04) + self.long_prefill_token_threshold = int(self.model_config.max_model_len * 0.04) self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs) - self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) + self.cache_config.max_block_num_per_seq = int(self.model_config.max_model_len // self.cache_config.block_size) if self.model_config is not None and self.model_config.enable_mm: self.cache_config.enable_prefix_caching = False @@ -1576,7 +1572,9 @@ class FDConfig: f"but now it's {self.scheduler_config.max_num_seqs}." ) assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1" - assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16" + assert ( + self.model_config.max_model_len >= 16 + ), f"max_model_len: {self.model_config.max_model_len} should be larger than 16" assert ( self.scheduler_config.max_num_seqs >= 1 ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1" @@ -1585,10 +1583,11 @@ class FDConfig: f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}" ) assert ( - self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs + self.scheduler_config.max_num_batched_tokens + <= self.model_config.max_model_len * self.scheduler_config.max_num_seqs ), ( f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger" - f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}" + f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.model_config.max_model_len}" ) assert ( self.max_num_partial_prefills >= 1 @@ -1609,9 +1608,9 @@ class FDConfig: if not self.cache_config.enable_chunked_prefill: if not envs.ENABLE_V1_KVCACHE_SCHEDULER: - assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, ( + assert self.scheduler_config.max_num_batched_tokens >= self.model_config.max_model_len, ( f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} " - f"should be larger than or equal to max_model_len: {self.max_model_len}" + f"should be larger than or equal to max_model_len: {self.model_config.max_model_len}" ) else: assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, ( @@ -1623,9 +1622,9 @@ class FDConfig: assert ( self.cache_config.enable_chunked_prefill is True ), "Chunked prefill must be enabled to set max_num_partial_prefills > 1" - assert self.long_prefill_token_threshold < self.max_model_len, ( + assert self.long_prefill_token_threshold < self.model_config.max_model_len, ( f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than" - f" max_model_len: {self.max_model_len}" + f" max_model_len: {self.model_config.max_model_len}" ) if self.guided_decoding_backend is not None: diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index fcb7088d6..27b8eecaf 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -1079,7 +1079,6 @@ class EngineArgs: cache_config=cache_cfg, load_config=load_cfg, parallel_config=parallel_cfg, - max_model_len=self.max_model_len, speculative_config=speculative_cfg, ips=self.ips, use_warmup=self.use_warmup, diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index e10bc93c4..93dc7258a 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -630,7 +630,7 @@ class EngineService: available_blocks=available_blocks, block_size=self.cfg.cache_config.block_size, reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.cfg.max_model_len, + max_num_batched_tokens=self.cfg.model_config.max_model_len, batch=num_prefill_batch, ) if self.cfg.scheduler_config.splitwise_role != "mixed": diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index f65bc20a2..ca7545667 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -187,7 +187,7 @@ class LLMEngine: num_gpu_blocks = self.cfg.cache_config.num_gpu_blocks_override or self.cfg.cache_config.total_block_num num_cpu_blocks = self.cfg.cache_config.num_cpu_blocks max_running_requests = min( - (num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.max_model_len, + (num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.model_config.max_model_len, self.cfg.scheduler_config.max_num_seqs, ) console_logger.info( @@ -195,7 +195,7 @@ class LLMEngine: ) console_logger.info( f"FastDeploy will be serving {max_running_requests} running requests " - f"if each sequence reaches its maximum length: {self.cfg.max_model_len}" + f"if each sequence reaches its maximum length: {self.cfg.model_config.max_model_len}" ) return True @@ -248,19 +248,19 @@ class LLMEngine: chat_template_kwargs = kwargs.get("chat_template_kwargs") or {} chat_template_kwargs["chat_template"] = kwargs.get("chat_template") kwargs["chat_template_kwargs"] = chat_template_kwargs - request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs) + request = self.data_processor.process_request(request, self.cfg.model_config.max_model_len, **kwargs) request.prompt_token_ids_len = len(request.prompt_token_ids) request.need_prefill_tokens = request.prompt_token_ids_len input_ids_len = request.prompt_token_ids_len request.set( "max_tokens", min( - self.cfg.max_model_len - input_ids_len, + self.cfg.model_config.max_model_len - input_ids_len, request.get("max_tokens"), ), ) min_tokens = request.get("min_tokens") - if input_ids_len + min_tokens >= self.cfg.max_model_len: + if input_ids_len + min_tokens >= self.cfg.model_config.max_model_len: error_msg = ( f"Input text is too long, length of prompt token({input_ids_len}) " f"+ min_dec_len ({min_tokens}) >= max_model_len " @@ -268,10 +268,8 @@ class LLMEngine: llm_logger.error(error_msg) raise EngineError(error_msg, error_code=400) - if input_ids_len > self.cfg.max_model_len: - error_msg = ( - f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.max_model_len})." - ) + if input_ids_len > self.cfg.model_config.max_model_len: + error_msg = f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.model_config.max_model_len})." llm_logger.error(error_msg) raise EngineError(error_msg, error_code=400) @@ -506,7 +504,7 @@ class LLMEngine: ips = ",".join(self.cfg.ips) arguments = ( f" --devices {self.cfg.device_ids} {py_script}" - f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.max_model_len}" + f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.model_config.max_model_len}" f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}" f" --model {self.cfg.model_config.model!s}" f" --device_ids {self.cfg.device_ids}" @@ -587,7 +585,7 @@ class LLMEngine: prompts["prompt"] = query_list if "max_tokens" not in prompts: - prompts["max_tokens"] = self.cfg.max_model_len + prompts["max_tokens"] = self.cfg.model_config.max_model_len self.add_requests(prompts) return prompts["request_id"] diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 207203d67..302b65d25 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -93,7 +93,7 @@ class LLM: # Create the Engine self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args) - self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.max_model_len) + self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.model_config.max_model_len) self.llm_engine.start() diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index d42c4b80c..4e015e003 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -85,7 +85,7 @@ class AppendAttentionBackend(AttentionBackend): super().__init__() self.attention_metadata: AppendAttentionMetadata = None self.block_size: int = fd_config.cache_config.block_size - self.max_seq_len: int = fd_config.parallel_config.max_model_len + self.max_seq_len: int = fd_config.model_config.max_model_len self.rope_theta: float = ( 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta ) diff --git a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py index 418876271..b16a66817 100644 --- a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py @@ -81,7 +81,7 @@ class BlockAttentionBackend(AttentionBackend): super().__init__() self.attention_metadata: BlockAttentionMetadata = None self.block_size = fd_config.cache_config.block_size - self.max_seq_len = fd_config.parallel_config.max_model_len + self.max_seq_len = fd_config.model_config.max_model_len self.rope_theta = 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta self.rank = fd_config.parallel_config.tensor_parallel_rank diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index 15750d090..ee57c7754 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -110,7 +110,7 @@ class FlashAttentionBackend(AttentionBackend): """ super().__init__() self.attention_metadata: FlashAttentionMetadata = None - self.max_seq_len = fd_config.parallel_config.max_model_len + self.max_seq_len = fd_config.model_config.max_model_len self.causal = getattr(fd_config.model_config, "causal", True) self.kv_num_heads = kv_num_heads diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py index 355400a31..db3a09ce8 100644 --- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py @@ -73,7 +73,7 @@ class IluvatarAttnBackend(AttentionBackend): self.attention_metadata = IluvatarAttentionMetadata() self.block_size = fd_config.parallel_config.block_size assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16." - self.max_context_len = fd_config.parallel_config.max_model_len + self.max_context_len = fd_config.model_config.max_model_len self.causal = getattr(fd_config.model_config, "causal", True) self.speculate_method = getattr(fd_config.parallel_config, "speculate_method", None) self.use_speculate = self.speculate_method is not None diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py index 896742962..5c283c84d 100644 --- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py @@ -111,7 +111,7 @@ class MLAAttentionBackend(AttentionBackend): # 基础配置 self.block_size: int = fd_config.cache_config.block_size - self.max_seq_len: int = fd_config.parallel_config.max_model_len + self.max_seq_len: int = fd_config.model_config.max_model_len self.rope_theta: float = ( 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta ) diff --git a/fastdeploy/model_executor/layers/attention/moba_attention_backend.py b/fastdeploy/model_executor/layers/attention/moba_attention_backend.py index 82ac4880b..04183922e 100644 --- a/fastdeploy/model_executor/layers/attention/moba_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/moba_attention_backend.py @@ -76,7 +76,7 @@ class PlasAttentionBackend(AttentionBackend): self.attention_metadata: PlasAttentionMetadata = None assert fd_config.plas_attention_config is not None, "plas_attention_config is None" self.block_size = fd_config.parallel_config.block_size - self.max_seq_len = fd_config.parallel_config.max_model_len + self.max_seq_len = fd_config.model_config.max_model_len self.max_num_seqs = fd_config.scheduler_config.max_num_seqs self.kv_num_heads = kv_num_heads self.num_heads = num_heads diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py index 62ad01d9f..5735abf6f 100644 --- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py @@ -79,7 +79,7 @@ class XPUAttentionBackend(AttentionBackend): super().__init__() self.attention_metadata: XPUAttentionMetadata = None self.block_size: int = fd_config.cache_config.block_size - self.max_seq_len: int = fd_config.parallel_config.max_model_len + self.max_seq_len: int = fd_config.model_config.max_model_len self.rope_theta: float = ( 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta ) diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py index a7135a0e0..8487dda5f 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py @@ -85,7 +85,7 @@ class GCUFlashAttnBackend(AttentionBackend): super().__init__() self.attention_metadata: GCUFlashAttnMetadata = None self.block_size = fd_config.cache_config.block_size - self.max_seq_len = fd_config.parallel_config.max_model_len + self.max_seq_len = fd_config.model_config.max_model_len self.max_num_seqs = fd_config.scheduler_config.max_num_seqs self.causal = getattr(fd_config.model_config, "causal", True) diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py index 4901ecce2..7a6eedcd3 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py @@ -83,7 +83,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend): super().__init__() self.attention_metadata: GCUMemEfficientAttnMetadata = None self.block_size = fd_config.cache_config.block_size - self.max_seq_len = fd_config.parallel_config.max_model_len + self.max_seq_len = fd_config.model_config.max_model_len self.max_num_seqs = fd_config.scheduler_config.max_num_seqs self.causal = getattr(fd_config.model_config, "causal", True) diff --git a/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py b/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py index 962b6e113..b580d7ad8 100644 --- a/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/intel_hpu/attention/hpu_attn_backend.py @@ -168,7 +168,7 @@ class HPUAttentionBackend(AttentionBackend_HPU): self.attention_metadata: HPUAttentionMetadata = None # TODO(gongshaotian): Use llm_config parameters in the correct location self.block_size = llm_config.parallel_config.block_size - self.max_seq_len = llm_config.parallel_config.max_model_len + self.max_seq_len = llm_config.model_config.max_model_len self.rope_theta = 10000.0 if llm_config.model_config.rope_theta is None else llm_config.model_config.rope_theta self.rope_3d = getattr(llm_config.model_config, "rope_3d", False) self.causal = getattr(llm_config.model_config, "causal", True) diff --git a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py index 8b673d23f..a19ed32cb 100644 --- a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py @@ -90,7 +90,7 @@ class FlashAttentionBackend(AttentionBackend): self.attention_metadata: FlashAttentionMetadata = None self.record_block_table_metadata = {} self.block_size: int = fd_config.parallel_config.block_size - self.max_seq_len: int = fd_config.parallel_config.max_model_len + self.max_seq_len: int = fd_config.model_config.max_model_len self.rope_theta: float = ( 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta ) diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 7be6d2b5c..6a96adeab 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -403,27 +403,27 @@ class Ernie4_5_VLDecoderLayer(nn.Layer): @cuda_graph_buffers( { "text_input": { - "shape": ["parallel_config.max_model_len", "model_config.hidden_size"], + "shape": ["model_config.max_model_len", "model_config.hidden_size"], "dtype": "model_config.dtype", "value": 1, }, "image_input": { - "shape": ["parallel_config.max_model_len", "model_config.hidden_size"], + "shape": ["model_config.max_model_len", "model_config.hidden_size"], "dtype": "model_config.dtype", "value": 1, }, "text_index": { - "shape": ["parallel_config.max_model_len"], + "shape": ["model_config.max_model_len"], "dtype": "int32", "value": 0, }, "image_index": { - "shape": ["parallel_config.max_model_len"], + "shape": ["model_config.max_model_len"], "dtype": "int32", "value": 0, }, "token_type_ids": { - "shape": ["parallel_config.max_model_len"], + "shape": ["model_config.max_model_len"], "dtype": "int32", "value": -1, }, @@ -568,7 +568,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): # Persistent buffers for CUDA graphs. self._input_embeddings = paddle.zeros( - [fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size], + [fd_config.model_config.max_model_len, fd_config.model_config.hidden_size], dtype=fd_config.model_config.dtype, ) diff --git a/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py b/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py index ddd3cd483..3955deb9a 100644 --- a/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py +++ b/fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py @@ -154,7 +154,7 @@ class Qwen2_5_VLForConditionalGeneration(ModelForCasualLM): # Persistent buffers for CUDA graphs. self._input_embeddings = paddle.zeros( - [fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size], + [fd_config.model_config.max_model_len, fd_config.model_config.hidden_size], dtype=fd_config.model_config.dtype, ) diff --git a/fastdeploy/spec_decode/base.py b/fastdeploy/spec_decode/base.py index a7d8f2266..458f8e579 100644 --- a/fastdeploy/spec_decode/base.py +++ b/fastdeploy/spec_decode/base.py @@ -54,7 +54,7 @@ class Proposer(ABC): self.scheduler_config = self.fd_config.scheduler_config self.max_num_seqs = self.scheduler_config.max_num_seqs - self.max_model_len = self.parallel_config.max_model_len + self.max_model_len = self.model_config.max_model_len self.speculative_method = self.speculative_config.method self.max_draft_token_num = self.speculative_config.num_speculative_tokens self.num_model_steps = self.speculative_config.num_model_steps diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 945962d55..14a3e4fec 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -117,7 +117,7 @@ class MTPProposer(Proposer): input_length = min( num_tokens // batch_size, - self.parallel_config.max_model_len - max_dec_len, + self.model_config.max_model_len - max_dec_len, ) block_num = ( input_length + self.cache_config.block_size - 1 @@ -306,7 +306,7 @@ class MTPProposer(Proposer): self.model_inputs["block_tables"] = paddle.clone(self.target_model_inputs["block_tables"]) self.model_inputs["input_ids"] = paddle.clone(self.target_model_inputs["input_ids"]) self.model_inputs["input_ids_cpu"] = paddle.full( - shape=[self.max_num_seqs, self.parallel_config.max_model_len], + shape=[self.max_num_seqs, self.model_config.max_model_len], fill_value=-1, dtype="int64", ).cpu() @@ -334,7 +334,7 @@ class MTPProposer(Proposer): [self.max_model_len * self.fd_config.max_prefill_batch, self.model_config.hidden_size], 0, dtype="bfloat16" ) - tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) self.model_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, position_ids=tmp_position_ids, @@ -764,7 +764,7 @@ class MTPProposer(Proposer): self.model_inputs["seq_lens_decoder"], self.model_inputs["seq_lens_encoder"], self.model_inputs["output_padding_offset"], - self.parallel_config.max_model_len, + self.model_config.max_model_len, ) # 4. Compute logits, Sample diff --git a/fastdeploy/splitwise/internal_adapter_utils.py b/fastdeploy/splitwise/internal_adapter_utils.py index 0e1ba4494..eabae716d 100644 --- a/fastdeploy/splitwise/internal_adapter_utils.py +++ b/fastdeploy/splitwise/internal_adapter_utils.py @@ -61,7 +61,7 @@ class InternalAdapter: "dec_token_num": int(self.cfg.cache_config.dec_token_num), "available_resource": float(1.0 * available_block_num / self.cfg.cache_config.total_block_num), "max_batch_size": int(available_batch_size), - "max_input_token_num": self.cfg.max_model_len, + "max_input_token_num": self.cfg.model_config.max_model_len, "unhandled_request_num": self.engine.scheduler.get_unhandled_request_num(), "available_batch": int(self.engine.resource_manager.available_batch()), } diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 332659118..5c0580ea8 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -302,7 +302,7 @@ class GCUModelRunner(ModelRunnerBase): max_dec_len = expected_decode_len + 1 full_length = min( num_tokens // batch_size, - self.parallel_config.max_model_len - max_dec_len, + self.model_config.max_model_len - max_dec_len, ) input_length = int(full_length * self.cache_config.kv_cache_ratio) block_num = ( @@ -344,17 +344,17 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs = {} self.share_inputs["pre_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], -1, dtype="int64", ) self.share_inputs["input_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) self.share_inputs["prompt_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) @@ -417,7 +417,7 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") self.share_inputs["ids_remove_padding"] = paddle.full( - [max_num_seqs * self.parallel_config.max_model_len], + [max_num_seqs * self.model_config.max_model_len], 0, dtype="int64", ) @@ -439,7 +439,7 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["max_len_kv_cpu"] = None # CPU # Initialize rotary position embedding - tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) self.share_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, position_ids=tmp_position_ids, @@ -449,7 +449,7 @@ class GCUModelRunner(ModelRunnerBase): # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + self.cache_config.block_size - 1 + self.model_config.max_model_len + self.cache_config.block_size - 1 ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") @@ -478,7 +478,7 @@ class GCUModelRunner(ModelRunnerBase): if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( - shape=[max_num_seqs, self.parallel_config.max_model_len], + shape=[max_num_seqs, self.model_config.max_model_len], fill_value=1, dtype="int64", ).cpu() @@ -779,7 +779,7 @@ class GCUModelRunner(ModelRunnerBase): ( self.share_inputs["output_padding_offset"] if self.speculative_decoding else None ), # speculative decoding requires - self.parallel_config.max_model_len, + self.model_config.max_model_len, ) # 4. Execute spec decode @@ -802,7 +802,7 @@ class GCUModelRunner(ModelRunnerBase): self.sampler( logits, self.sampling_metadata, - self.parallel_config.max_model_len, + self.model_config.max_model_len, self.share_inputs, ) sampler_output = None @@ -1002,7 +1002,7 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), - self.parallel_config.max_model_len, + self.model_config.max_model_len, ) # 4. Compute logits, Sample @@ -1030,7 +1030,7 @@ class GCUModelRunner(ModelRunnerBase): self.sampler( logits, self.sampling_metadata, - self.parallel_config.max_model_len, + self.model_config.max_model_len, self.share_inputs, ) sampler_output = None diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 7eda4f8e1..1c55b4679 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -708,7 +708,7 @@ class GPUModelRunner(ModelRunnerBase): max_dec_len = expected_decode_len + 1 input_length = min( num_tokens // (1 if capture_prefill else batch_size), - self.parallel_config.max_model_len - max_dec_len, + self.model_config.max_model_len - max_dec_len, ) # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan. @@ -773,17 +773,17 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs = {} self.share_inputs["pre_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], -1, dtype="int64", ) self.share_inputs["input_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) self.share_inputs["prompt_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) @@ -850,12 +850,12 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") self.share_inputs["ids_remove_padding"] = paddle.full( - [max_num_seqs * self.parallel_config.max_model_len], + [max_num_seqs * self.model_config.max_model_len], 0, dtype="int64", ) self.share_inputs["batch_id_per_token"] = paddle.full( - [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32" + [max_num_seqs * self.model_config.max_model_len, 1], 0, dtype="int32" ) self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32") @@ -876,7 +876,7 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["max_len_kv_cpu"] = None # CPU # Initialize rotary position embedding - tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) # Initialize thinking related buffers self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") @@ -895,7 +895,7 @@ class GPUModelRunner(ModelRunnerBase): # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + self.cache_config.block_size - 1 + self.model_config.max_model_len + self.cache_config.block_size - 1 ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") @@ -927,7 +927,7 @@ class GPUModelRunner(ModelRunnerBase): if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( - shape=[max_num_seqs, self.parallel_config.max_model_len], + shape=[max_num_seqs, self.model_config.max_model_len], fill_value=1, dtype="int64", ).cpu() @@ -974,7 +974,7 @@ class GPUModelRunner(ModelRunnerBase): max_num_seqs, 2, 1, - self.parallel_config.max_model_len, + self.model_config.max_model_len, 1, rope_head_dim, ], @@ -1373,7 +1373,7 @@ class GPUModelRunner(ModelRunnerBase): ( self.share_inputs["output_padding_offset"] if self.speculative_decoding else None ), # speculative decoding requires - self.parallel_config.max_model_len, + self.model_config.max_model_len, ) logits = None @@ -1405,7 +1405,7 @@ class GPUModelRunner(ModelRunnerBase): self.sampler( logits, self.sampling_metadata, - self.parallel_config.max_model_len, + self.model_config.max_model_len, self.share_inputs, accept_all_drafts, ) @@ -1723,7 +1723,7 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), - self.parallel_config.max_model_len, + self.model_config.max_model_len, ) logits = None @@ -1760,7 +1760,7 @@ class GPUModelRunner(ModelRunnerBase): self.sampler( logits, self.sampling_metadata, - self.parallel_config.max_model_len, + self.model_config.max_model_len, self.share_inputs, ) sampler_output = None @@ -2198,7 +2198,7 @@ class GPUModelRunner(ModelRunnerBase): rotary_dim=self.model_config.head_dim, partial_rotary_factor=1.0, base=self.model_config.rope_theta, - max_position=self.parallel_config.max_model_len, + max_position=self.model_config.max_model_len, freq_allocation=getattr(self.model_config, "freq_allocation", 20), model_type=self.model_config.model_type, ) diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py index 4323138cf..56f84fd86 100644 --- a/fastdeploy/worker/hpu_model_runner.py +++ b/fastdeploy/worker/hpu_model_runner.py @@ -535,7 +535,7 @@ class HPUModelRunner(ModelRunnerBase): """Set dummy prefill inputs to share_inputs""" # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token max_dec_len = expected_decode_len + 1 - full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - max_dec_len) + full_length = min(num_tokens // batch_size, self.model_config.max_model_len - max_dec_len) input_length = int(full_length * self.cache_config.kv_cache_ratio) block_num = ( input_length + self.cache_config.block_size - 1 @@ -568,11 +568,9 @@ class HPUModelRunner(ModelRunnerBase): self.MAX_INFER_SEED = 9223372036854775806 self.share_inputs = {} - self.share_inputs["pre_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], -1, dtype="int64" - ) + self.share_inputs["pre_ids"] = paddle.full([max_num_seqs, self.model_config.max_model_len], -1, dtype="int64") self.share_inputs["input_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], self.model_config.pad_token_id, dtype="int64" + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64" ) self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64") self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") @@ -627,7 +625,7 @@ class HPUModelRunner(ModelRunnerBase): self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") self.share_inputs["ids_remove_padding"] = paddle.full( - [max_num_seqs * self.parallel_config.max_model_len], 0, dtype="int64" + [max_num_seqs * self.model_config.max_model_len], 0, dtype="int64" ) self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -638,7 +636,7 @@ class HPUModelRunner(ModelRunnerBase): self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") # Initialize rotary position embedding - tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) # TODO(gongshaotian): move to models self.share_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, @@ -649,7 +647,7 @@ class HPUModelRunner(ModelRunnerBase): # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + self.cache_config.block_size - 1 + self.model_config.max_model_len + self.cache_config.block_size - 1 ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32").cpu() @@ -673,7 +671,7 @@ class HPUModelRunner(ModelRunnerBase): if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( - shape=[max_num_seqs, self.parallel_config.max_model_len], fill_value=1, dtype="int64" + shape=[max_num_seqs, self.model_config.max_model_len], fill_value=1, dtype="int64" ).cpu() self.share_inputs["accept_tokens"] = paddle.full( shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, dtype="int64" @@ -983,7 +981,7 @@ class HPUModelRunner(ModelRunnerBase): # 7. Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED - step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.parallel_config.max_model_len) + step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.model_config.max_model_len) if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0: break @@ -1082,9 +1080,7 @@ class HPUModelRunner(ModelRunnerBase): def warm_up_bucket(self) -> None: max_prefill_batch = 3 # Hard-Code in FastDeploy/fastdeploy/engine/config.py - warmup_max_model_len = min( - int(os.environ.get("HPU_WARMUP_MODEL_LEN", 4096)), self.parallel_config.max_model_len - ) + warmup_max_model_len = min(int(os.environ.get("HPU_WARMUP_MODEL_LEN", 4096)), self.model_config.max_model_len) prefill_batchs = [] prefill_batch_step = int(os.environ.get("BATCH_STEP_PREFILL", 1)) current_prefill_batch = prefill_batch_step @@ -1176,7 +1172,7 @@ class HPUModelRunner(ModelRunnerBase): capture_sizes = self.cudagraph_capture_sizes.copy() for batch_size in sorted(capture_sizes, reverse=True): self._dummy_run( - num_tokens=self.parallel_config.max_model_len, + num_tokens=self.model_config.max_model_len, batch_size=batch_size, in_capturing=True, expected_decode_len=expected_decode_len, @@ -1334,7 +1330,7 @@ class HPUModelRunner(ModelRunnerBase): self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED start_time = time.time() - step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.parallel_config.max_model_len) + step_intel_hpu(self.share_inputs, self.cache_config.block_size, self.model_config.max_model_len) end_time = time.time() execution_time = (end_time - start_time) * 1000 hpu_model_runner_profile_logger.info(f"StepPaddle execution time(ms): {execution_time}, BT={real_bs}") diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index 93368b2a4..dcce154ea 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -551,7 +551,7 @@ class MetaxModelRunner(ModelRunnerBase): max_dec_len = expected_decode_len + 1 full_length = min( num_tokens // batch_size, - self.parallel_config.max_model_len - max_dec_len, + self.model_config.max_model_len - max_dec_len, ) # When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan. @@ -599,17 +599,17 @@ class MetaxModelRunner(ModelRunnerBase): self.share_inputs = {} self.share_inputs["pre_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], -1, dtype="int64", ) self.share_inputs["input_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) self.share_inputs["prompt_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) @@ -674,12 +674,12 @@ class MetaxModelRunner(ModelRunnerBase): self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") self.share_inputs["ids_remove_padding"] = paddle.full( - [max_num_seqs * self.parallel_config.max_model_len], + [max_num_seqs * self.model_config.max_model_len], 0, dtype="int64", ) self.share_inputs["batch_id_per_token"] = paddle.full( - [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32" + [max_num_seqs * self.model_config.max_model_len, 1], 0, dtype="int32" ) self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32") self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs + 1, 1], 0, dtype="int32") @@ -691,7 +691,7 @@ class MetaxModelRunner(ModelRunnerBase): self.share_inputs["max_len_tensor_cpu"] = None # CPU # Initialize rotary position embedding - tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) # TODO(gongshaotian): move to models if not self.enable_mm: @@ -704,7 +704,7 @@ class MetaxModelRunner(ModelRunnerBase): # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + self.cache_config.block_size - 1 + self.model_config.max_model_len + self.cache_config.block_size - 1 ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") @@ -736,7 +736,7 @@ class MetaxModelRunner(ModelRunnerBase): if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( - shape=[max_num_seqs, self.parallel_config.max_model_len], + shape=[max_num_seqs, self.model_config.max_model_len], fill_value=1, dtype="int64", ).cpu() @@ -771,7 +771,7 @@ class MetaxModelRunner(ModelRunnerBase): max_num_seqs, 2, 1, - self.parallel_config.max_model_len, + self.model_config.max_model_len, 1, head_dim // 2, ], @@ -1075,7 +1075,7 @@ class MetaxModelRunner(ModelRunnerBase): ( self.share_inputs["output_padding_offset"] if self.speculative_decoding else None ), # speculative decoding requires - self.parallel_config.max_model_len, + self.model_config.max_model_len, ) # 4. Execute spec decode @@ -1098,7 +1098,7 @@ class MetaxModelRunner(ModelRunnerBase): self.sampler( logits, self.sampling_metadata, - self.parallel_config.max_model_len, + self.model_config.max_model_len, self.share_inputs, ) sampler_output = None @@ -1338,7 +1338,7 @@ class MetaxModelRunner(ModelRunnerBase): self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), - self.parallel_config.max_model_len, + self.model_config.max_model_len, ) # 4. Compute logits, Sample @@ -1366,7 +1366,7 @@ class MetaxModelRunner(ModelRunnerBase): self.sampler( logits, self.sampling_metadata, - self.parallel_config.max_model_len, + self.model_config.max_model_len, self.share_inputs, ) sampler_output = None @@ -1707,7 +1707,7 @@ class MetaxModelRunner(ModelRunnerBase): rotary_dim=self.model_config.head_dim, partial_rotary_factor=1.0, base=self.model_config.rope_theta, - max_position=self.parallel_config.max_model_len, + max_position=self.model_config.max_model_len, freq_allocation=getattr(self.model_config, "freq_allocation", 20), ) return rope_emb diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index abb886290..b2dcc7920 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -117,7 +117,7 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None: if fd_config.model_config.enable_mm and ErnieArchitectures.contains_ernie_arch(architectures): tokenizer = Ernie4_5Tokenizer.from_pretrained( fd_config.model_config.model, - model_max_length=fd_config.parallel_config.max_model_len, + model_max_length=fd_config.model_config.max_model_len, padding_side="right", use_fast=False, ) diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 6cf3d3de1..f04680498 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -680,17 +680,17 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs = {} self.share_inputs["pre_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], -1, dtype="int64", ) self.share_inputs["input_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) self.share_inputs["prompt_ids"] = paddle.full( - [max_num_seqs, self.parallel_config.max_model_len], + [max_num_seqs, self.model_config.max_model_len], self.model_config.pad_token_id, dtype="int64", ) @@ -755,7 +755,7 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") # Initialize rotary position embedding - tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) # TODO(gongshaotian): move to models if not self.enable_mm: @@ -768,7 +768,7 @@ class XPUModelRunner(ModelRunnerBase): # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + self.cache_config.block_size - 1 + self.model_config.max_model_len + self.cache_config.block_size - 1 ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") @@ -805,7 +805,7 @@ class XPUModelRunner(ModelRunnerBase): max_num_seqs, 2, 1, - self.parallel_config.max_model_len, + self.model_config.max_model_len, 1, head_dim // 2, ], @@ -960,7 +960,7 @@ class XPUModelRunner(ModelRunnerBase): def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): """Set dummy prefill inputs to share_inputs""" - full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10) + full_length = min(num_tokens // batch_size, self.model_config.max_model_len - 10) input_length = int(full_length - 512) block_num = ( input_length + self.cache_config.block_size - 1 @@ -1344,7 +1344,7 @@ class XPUModelRunner(ModelRunnerBase): rotary_dim=self.model_config.head_dim, partial_rotary_factor=1.0, base=self.model_config.rope_theta, - max_position=self.parallel_config.max_model_len, + max_position=self.model_config.max_model_len, freq_allocation=getattr(self.model_config, "freq_allocation", 20), model_type=self.model_config.model_type, ) diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py index 5ee42b1a7..98c7383f0 100644 --- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py +++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py @@ -159,6 +159,7 @@ class TestCUDAGrpahSubgraph(unittest.TestCase): cache_config = CacheConfig({}) parallel_config = ParallelConfig(args={}) model_config = Mock() + model_config.max_model_len = 512 # Initialize cuda graph capture list graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs) graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py index 535accc38..7359fd6dd 100644 --- a/tests/graph_optimization/test_cuda_graph_recapture.py +++ b/tests/graph_optimization/test_cuda_graph_recapture.py @@ -97,6 +97,7 @@ class TestCUDAGrpahRecapture(unittest.TestCase): scheduler_config.max_num_seqs = 1 parallel_config = ParallelConfig(args={}) model_config = Mock() + model_config.max_model_len = 5120 fd_config = FDConfig( graph_opt_config=graph_opt_config, scheduler_config=scheduler_config, diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py index 394c4857f..40c32ca91 100644 --- a/tests/graph_optimization/test_cuda_graph_spec_decode.py +++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py @@ -106,6 +106,7 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase): cache_config = CacheConfig({}) parallel_config = ParallelConfig(args={}) model_config = Mock() + model_config.max_model_len = 512 # Initialize cuda graph capture list graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs) graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) diff --git a/tests/graph_optimization/test_graph_opt_backend.py b/tests/graph_optimization/test_graph_opt_backend.py index 0123d9dba..27cb4b161 100644 --- a/tests/graph_optimization/test_graph_opt_backend.py +++ b/tests/graph_optimization/test_graph_opt_backend.py @@ -93,6 +93,7 @@ class TestGraphOptBackend(unittest.TestCase): baseline_cache_config = CacheConfig({}) baseline_parallel_config = ParallelConfig(args={}) model_config = Mock() + model_config.max_model_len = 512 self.baseline_fd_config = FDConfig( graph_opt_config=baseline_graph_opt_config, scheduler_config=baseline_scheduler_config, @@ -141,6 +142,7 @@ class TestGraphOptBackend(unittest.TestCase): cache_config = CacheConfig({}) parallel_config = ParallelConfig(args={}) model_config = Mock() + model_config.max_model_len = 512 # Create FD config return FDConfig( diff --git a/tests/graph_optimization/test_static_graph_cuda_graph_split.py b/tests/graph_optimization/test_static_graph_cuda_graph_split.py index 03a07513e..8dcbf2e8f 100644 --- a/tests/graph_optimization/test_static_graph_cuda_graph_split.py +++ b/tests/graph_optimization/test_static_graph_cuda_graph_split.py @@ -96,6 +96,7 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase): cache_config = CacheConfig({}) parallel_config = ParallelConfig(args={}) model_config = Mock() + model_config.max_model_len = 512 fd_config = FDConfig( graph_opt_config=graph_opt_config, scheduler_config=scheduler_config, diff --git a/tests/utils.py b/tests/utils.py index 5e8310fa4..36dc0360b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -48,6 +48,7 @@ class FakeModelConfig: self.tie_word_embeddings = True self.model_format = "auto" self.enable_mm = False + self.max_model_len = 512 def get_default_test_fd_config(): diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 366520bbd..c95ef1229 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -20,6 +20,7 @@ class TestConfig(unittest.TestCase): load_config = LoadConfig({}) scheduler_config = SchedulerConfig({}) model_config = Mock() + model_config.max_model_len = 512 fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, @@ -40,6 +41,7 @@ class TestConfig(unittest.TestCase): load_config = LoadConfig({}) scheduler_config = SchedulerConfig({}) model_config = Mock() + model_config.max_model_len = 512 fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, @@ -59,7 +61,8 @@ class TestConfig(unittest.TestCase): load_config = LoadConfig({}) cache_config.enable_chunked_prefill = True scheduler_config = SchedulerConfig({}) - model_config = model_config = Mock() + model_config: Mock = Mock() + model_config.max_model_len = 512 fd_config = FDConfig( parallel_config=parallel_config, @@ -97,7 +100,8 @@ class TestConfig(unittest.TestCase): load_config = LoadConfig({}) scheduler_config = SchedulerConfig({}) scheduler_config.splitwise_role = "prefill" - model_config = model_config = Mock() + model_config: Mock = Mock() + model_config.max_model_len = 512 fd_config = FDConfig( parallel_config=parallel_config, diff --git a/tests/v1/test_prefix_cache.py b/tests/v1/test_prefix_cache.py index 2d56a9c9d..a6c7c2bf9 100644 --- a/tests/v1/test_prefix_cache.py +++ b/tests/v1/test_prefix_cache.py @@ -16,6 +16,7 @@ def test_normal_case(): model_cfg = SimpleNamespace(enable_mm=False) speculative_cfg = SimpleNamespace(method=None) model_cfg.print = print + model_cfg.max_model_len = 5120 cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) scheduler_cfg = SchedulerConfig(args) diff --git a/tests/v1/test_schedule_output.py b/tests/v1/test_schedule_output.py index 929098563..88988022d 100644 --- a/tests/v1/test_schedule_output.py +++ b/tests/v1/test_schedule_output.py @@ -15,6 +15,7 @@ def test_normal_schedule(): model_cfg = SimpleNamespace(enable_mm=False) speculative_cfg = SimpleNamespace(method=None) model_cfg.print = print + model_cfg.max_model_len = 5120 cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) scheduler_cfg = SchedulerConfig(args) @@ -79,6 +80,7 @@ def test_preempted_request(): model_cfg = SimpleNamespace(enable_mm=False) speculative_cfg = SimpleNamespace(method=None) model_cfg.print = print + model_cfg.max_model_len = 5120 cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) scheduler_cfg = SchedulerConfig(args)