Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-01 06:42:23 +08:00)
@@ -168,7 +168,7 @@ class PrefixCacheManager:
                 + f" --device_id {int(device_ids[i])}"
                 + f" --rank {i}"
                 + f" --splitwise_role {self.splitwise_role}"
-                + f" --num_layers {cache_config.model_cfg.num_layers}"
+                + f" --num_layers {cache_config.model_cfg.num_hidden_layers}"
                 + f" --head_dim {cache_config.model_cfg.head_dim}"
                 + f" --kv_num_head {kv_num_head}"
                 + f" --mp_num {tensor_parallel_size}"
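Context note: the only change in this hunk is reading the layer count from model_cfg.num_hidden_layers instead of model_cfg.num_layers when building the cache-messager launch command. A small, self-contained sketch of reading the layer count defensively; the ModelCfg class and layer_count helper below are illustrative assumptions, not FastDeploy API:

from dataclasses import dataclass

@dataclass
class ModelCfg:                       # hypothetical stand-in for cache_config.model_cfg
    num_hidden_layers: int = 32
    head_dim: int = 128

def layer_count(cfg) -> int:
    # Prefer the new num_hidden_layers name, fall back to the old num_layers.
    return getattr(cfg, "num_hidden_layers", getattr(cfg, "num_layers", 0))

cfg = ModelCfg()
print(f" --num_layers {layer_count(cfg)} --head_dim {cfg.head_dim}")
# prints: " --num_layers 32 --head_dim 128"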
@@ -270,6 +270,8 @@ class ParallelConfig:
             if hasattr(self, key):
                 setattr(self, key, value)
 
+        # currently, the expert parallel size is equal data parallel size
+        self.expert_parallel_size = self.data_parallel_size
         self.use_ep = self.expert_parallel_size > 1
         if self.splitwise_role == "mixed":
             self.moe_phase = MoEPhase(phase="prefill")
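Review note: the two added lines tie expert parallelism to data parallelism, so use_ep now switches on whenever data_parallel_size > 1. A minimal, runnable sketch of that derivation; the class name, defaults, and __post_init__ placement below are assumptions for illustration only:

from dataclasses import dataclass

@dataclass
class ParallelConfigSketch:           # simplified stand-in for ParallelConfig
    data_parallel_size: int = 1
    splitwise_role: str = "mixed"

    def __post_init__(self):
        # currently, the expert parallel size is equal to the data parallel size
        self.expert_parallel_size = self.data_parallel_size
        self.use_ep = self.expert_parallel_size > 1

cfg = ParallelConfigSketch(data_parallel_size=4)
print(cfg.expert_parallel_size, cfg.use_ep)   # 4 True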
@@ -1082,6 +1082,7 @@ class LLMEngine:
             f" --splitwise_role {self.cfg.splitwise_role}"
             f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
+            f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
             f" --quantization {self.cfg.model_config.quantization}"
             f" --ori_vocab_size {ori_vocab_size}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
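Context note: the worker command is built by implicit f-string concatenation, and the added line forwards data_parallel_size alongside expert_parallel_size. A self-contained sketch of that pattern; the SimpleNamespace config below is a made-up stand-in, not the engine's real cfg object:

from types import SimpleNamespace

cfg = SimpleNamespace(                # stand-in for self.cfg
    splitwise_role="mixed",
    parallel_config=SimpleNamespace(expert_parallel_size=1, data_parallel_size=2),
)

arguments = (
    f" --splitwise_role {cfg.splitwise_role}"
    f" --expert_parallel_size {cfg.parallel_config.expert_parallel_size}"
    f" --data_parallel_size {cfg.parallel_config.data_parallel_size}"
)
print(arguments)
# prints: " --splitwise_role mixed --expert_parallel_size 1 --data_parallel_size 2"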
@@ -513,6 +513,12 @@ def parse_args():
         default=1,
         help="expert parallel size",
     )
+    parser.add_argument(
+        "--data_parallel_size",
+        type=int,
+        default=1,
+        help="data parallel size",
+    )
     parser.add_argument(
         "--enable_expert_parallel",
         action="store_true",
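Usage note: with this hunk the worker's parse_args() accepts --data_parallel_size (default 1) next to the existing expert-parallel flags. A runnable sketch of the new flag in isolation; only the --data_parallel_size definition mirrors the diff, the surrounding parser is a minimal stand-in:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--expert_parallel_size", type=int, default=1, help="expert parallel size")
parser.add_argument("--data_parallel_size", type=int, default=1, help="data parallel size")
parser.add_argument("--enable_expert_parallel", action="store_true")

args = parser.parse_args(["--data_parallel_size", "2", "--enable_expert_parallel"])
print(args.data_parallel_size, args.enable_expert_parallel)   # 2 True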