mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-31 20:02:53 +08:00)
Commit 4325b737e7: 【FIX】Change the name of sparse attn from moba to plas (#4006)

* Update docs
* 【docs】Update readme (#4000)
* 【FIX】Change the name of sparse attn from moba to plas (#3845): rename moba to plas, code-style fixes, CI updates
* Fix max_num_seqs
* Fix test load attn

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

121 lines · 4.9 KiB · Python
| """
 | |
| # Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
 | |
| #
 | |
| # Licensed under the Apache License, Version 2.0 (the "License"
 | |
| # you may not use this file except in compliance with the License.
 | |
| # You may obtain a copy of the License at
 | |
| #
 | |
| #     http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing, software
 | |
| # distributed under the License is distributed on an "AS IS" BASIS,
 | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| # See the License for the specific language governing permissions and
 | |
| # limitations under the License.
 | |
| """
 | |
| 
 | |
| from typing import Any, Dict, Optional
 | |
| 
 | |
| from fastdeploy.worker.worker_process import initialize_fd_config
 | |
| 
 | |
| 
 | |
| class RolloutModelConfig:
 | |
|     def __init__(
 | |
|         self,
 | |
|         model_name_or_path: str,
 | |
|         max_model_len: int = 32768,
 | |
|         tensor_parallel_size: int = 4,
 | |
|         dynamic_load_weight: bool = True,
 | |
|         load_strategy: str = "meta",
 | |
|         enable_mm: bool = False,
 | |
|         # Default values for all other parameters
 | |
|         max_num_seqs: int = 34,
 | |
|         total_block_num: int = 2000,
 | |
|         block_size: int = 64,
 | |
|         engine_worker_queue_port: str = "8002",
 | |
|         device_ids: str = "0",
 | |
|         dtype: str = "bfloat16",
 | |
|         enc_dec_block_num: int = 1,
 | |
|         kv_cache_ratio: float = 0.7,
 | |
|         first_token_id: int = 1,
 | |
|         gpu_memory_utilization: float = 0.9,
 | |
|         engine_pid: int = None,
 | |
|         do_profile: bool = False,
 | |
|         pad_token_id: int = -1,
 | |
|         eos_tokens_lens: int = 2,
 | |
|         enable_chunked_prefill: bool = False,
 | |
|         speculative_method: str = None,
 | |
|         speculative_max_draft_token_num: int = 1,
 | |
|         speculative_model_name_or_path: str = "",
 | |
|         speculative_model_quantization: str = "WINT8",
 | |
|         max_num_batched_tokens: int = 2048,
 | |
|         enable_prefix_caching: bool = False,
 | |
|         splitwise_role: str = "mixed",
 | |
|         expert_parallel_size: int = 1,
 | |
|         enable_expert_parallel: bool = False,
 | |
|         ori_vocab_size: int = None,
 | |
|         quantization: Optional[Dict[str, Any]] = None,
 | |
|         guided_decoding_backend: str = "off",
 | |
|         disable_any_whitespace: bool = True,
 | |
|         enable_logprob: bool = False,
 | |
|         graph_optimization_config: str = None,
 | |
|         early_stop_config: str = None,
 | |
|         local_rank: int = 0,
 | |
|         plas_attention_config: str = None,
 | |
|         data_parallel_size: int = 1,
 | |
|         num_nextn_predict_layers: int = 0,
 | |
|     ):
 | |
|         # Required parameters
 | |
|         self.model = model_name_or_path
 | |
|         self.max_model_len = max_model_len
 | |
|         self.tensor_parallel_size = tensor_parallel_size
 | |
|         self.dynamic_load_weight = dynamic_load_weight
 | |
|         self.load_strategy = load_strategy
 | |
|         self.enable_mm = enable_mm
 | |
| 
 | |
|         # Optional parameters with defaults
 | |
|         self.max_num_seqs = max_num_seqs
 | |
|         self.total_block_num = total_block_num
 | |
|         self.block_size = block_size
 | |
|         self.engine_worker_queue_port = engine_worker_queue_port
 | |
|         self.device_ids = device_ids
 | |
|         self.dtype = dtype
 | |
|         self.enc_dec_block_num = enc_dec_block_num
 | |
|         self.kv_cache_ratio = kv_cache_ratio
 | |
|         self.first_token_id = first_token_id
 | |
|         self.gpu_memory_utilization = gpu_memory_utilization
 | |
|         self.engine_pid = engine_pid
 | |
|         self.do_profile = do_profile
 | |
|         self.pad_token_id = pad_token_id
 | |
|         self.eos_tokens_lens = eos_tokens_lens
 | |
|         self.enable_chunked_prefill = enable_chunked_prefill
 | |
|         self.speculative_config = {}
 | |
|         self.speculative_config["method"] = speculative_method
 | |
|         self.speculative_config["max_draft_token_num"] = speculative_max_draft_token_num
 | |
|         self.speculative_config["model"] = speculative_model_name_or_path
 | |
|         self.speculative_config["quantization"] = speculative_model_quantization
 | |
|         self.max_num_batched_tokens = max_num_batched_tokens
 | |
|         self.enable_prefix_caching = enable_prefix_caching
 | |
|         self.splitwise_role = splitwise_role
 | |
|         self.expert_parallel_size = expert_parallel_size
 | |
|         self.enable_expert_parallel = enable_expert_parallel
 | |
|         self.data_parallel_size = data_parallel_size
 | |
|         self.ori_vocab_size = ori_vocab_size
 | |
|         self.quantization = quantization
 | |
|         self.guided_decoding_backend = guided_decoding_backend
 | |
|         self.disable_any_whitespace = disable_any_whitespace
 | |
|         self.enable_logprob = enable_logprob
 | |
|         self.graph_optimization_config = graph_optimization_config
 | |
|         self.local_rank = local_rank
 | |
|         self.early_stop_config = early_stop_config
 | |
|         self.ips = None
 | |
|         self.plas_attention_config = plas_attention_config
 | |
|         self.num_nextn_predict_layers = num_nextn_predict_layers
 | |
| 
 | |
|     def __str__(self):
 | |
|         return "\n".join(f"{k}: {v}" for k, v in self.__dict__.items())
 | |
| 
 | |
|     def initialize(self):
 | |
|         """Initialize the final fd config"""
 | |
|         return initialize_fd_config(self, ranks=self.tensor_parallel_size, local_rank=self.local_rank)
 |
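

if __name__ == "__main__":
    # Minimal usage sketch, assuming a working FastDeploy installation;
    # the checkpoint path below is hypothetical and for illustration only.
    # Constructing the config just records the fields; initialize() hands
    # them to initialize_fd_config, which needs compatible devices, so the
    # call is left commented out here.
    config = RolloutModelConfig(
        model_name_or_path="/path/to/model",  # hypothetical checkpoint path
        max_model_len=8192,
        tensor_parallel_size=1,
    )
    print(config)  # __str__ dumps every field as "name: value" lines
    # fd_config = config.initialize()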