[Sync] Update to latest code (#2679)

* [Sync] Update to latest code

* Add new code files

* Add new code files

* update code

* Try to fix build.sh

* Try to fix build.sh

* Update code

* Update requirements.txt

* Update code

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
Author: Jiang-Jia-Jun
Date: 2025-07-03 15:43:53 +08:00
Committed by: GitHub
Parent: d222248d00
Commit: 05c670e593
95 changed files with 9916 additions and 1312 deletions


@@ -18,7 +18,7 @@ from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
from typing import Optional, Literal
from paddleformers.transformers.configuration_utils import PretrainedConfig
@@ -51,7 +51,6 @@ class ModelConfig(PretrainedConfig):
top_p = 0.0
temperature = 1.0
rope_theta = 10000.0
rope_scaling = None
penalty_score = 1.0
frequency_score = 0.0
presence_score = 0.0
@@ -142,6 +141,7 @@ class MoEConfig:
moe_num_shared_experts = (0, )
moe_layer_start_index = 0
moe_layer_end_index = None
moe_use_aux_free: bool = False
num_max_dispatch_tokens_per_rank = 256
im_patch_id = (
100295 # multimodality, TODO(liuyuanle): read from config.json
@@ -163,7 +163,6 @@ class ParallelConfig:
# The embedding weight distributed on your gpu cards is divided by row or column.
# Defaults to False means divide by row. When vocab_size can not be divided by world_size
# but hidden_size can, we can consider split embedding weight by column.
column_cut = False # (bool, optional)
"""
From old version worker args
TODO(gongshaotian): Reclassify
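The ParallelConfig comment in the hunk above describes a small decision rule for column_cut: split the embedding weight by row by default, and switch to a column split only when vocab_size does not divide evenly across the tensor-parallel ranks but hidden_size does. The following is a minimal sketch of that rule, not part of this commit; the helper name and the example sizes are illustrative assumptions.

# --- Illustrative sketch (not part of this commit) ---
# Decision rule described by the column_cut comment above: keep the default
# row split unless vocab_size fails to divide across the ranks while
# hidden_size still does, in which case split by column instead.
def choose_column_cut(vocab_size: int, hidden_size: int, world_size: int) -> bool:
    return vocab_size % world_size != 0 and hidden_size % world_size == 0

# Hypothetical sizes: an odd vocab does not split across 8 ranks, hidden_size 4096 does.
assert choose_column_cut(100255, 4096, 8) is True   # fall back to column split
assert choose_column_cut(100352, 4096, 8) is False  # default row split
# --- end sketch ---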
@@ -194,18 +193,13 @@ class ParallelConfig:
engine_pid: Optional[int] = None
# Do profile or not
do_profile: bool = False
# Dynamic load weight or not
dynamic_load_weight: bool = False
#
pad_token_id: int = -1
#
eos_tokens_lens: int = 2
# Enable chunked prefill
enable_chunked_prefill: str = "store_true"
"""
- APPEND_ATTN:
"""
attention_backend: str = "APPEND_ATTN"
#
max_num_batched_tokens: int = 2048
# enable prefix cache
enable_prefix_caching = None
@@ -354,9 +348,27 @@ class GraphOptimizationConfig:
@dataclass
class LoadConfig:
"""
Configuration for loading parameter
Configuration for dynamic weight loading strategies
Attributes:
dynamic_load_weight: Whether to enable dynamic weight loading
load_strategy: Specifies the weight loading method when enabled:
- 'ipc': Real-time IPC streaming with automatic resharding
- 'ipc_no_reshard': Real-time IPC streaming without resharding
- 'ipc_snapshot': Load from disk snapshot of IPC weights
- 'meta': for the RL training worker; no weights are loaded
- None: No dynamic loading
"""
pass
use_fastsafetensor: bool = False
dynamic_load_weight: bool = False
load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None
def __post_init__(self):
if self.load_strategy is not None and not self.dynamic_load_weight:
raise ValueError("Load strategy requires dynamic_load_weight=True")
if self.dynamic_load_weight and self.load_strategy is None:
raise ValueError("Must specify load_strategy when dynamic_load_weight is True")
@dataclass
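For context, here is a small standalone usage sketch of the LoadConfig added in the hunk above. The dataclass body mirrors the diff; the constructor calls at the bottom are illustrative and not part of the commit.

# --- Illustrative sketch (not part of this commit) ---
from dataclasses import dataclass
from typing import Literal, Optional

@dataclass
class LoadConfig:
    use_fastsafetensor: bool = False
    dynamic_load_weight: bool = False
    load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None

    def __post_init__(self):
        # The two fields must agree: a strategy implies dynamic loading and vice versa.
        if self.load_strategy is not None and not self.dynamic_load_weight:
            raise ValueError("Load strategy requires dynamic_load_weight=True")
        if self.dynamic_load_weight and self.load_strategy is None:
            raise ValueError("Must specify load_strategy when dynamic_load_weight is True")

LoadConfig()                                                        # static loading, no strategy
LoadConfig(dynamic_load_weight=True, load_strategy='ipc_snapshot')  # dynamic loading from a disk snapshot
try:
    LoadConfig(dynamic_load_weight=True)                            # inconsistent: strategy missing
except ValueError as err:
    print(err)
# --- end sketch ---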
@@ -392,7 +404,7 @@ class FDConfig:
init=True) # type: ignore
device_config: DeviceConfig = field(default=None,
init=True) # type: ignore
load_config: LoadConfig = field(default=None, init=True) # type: ignore
load_config: LoadConfig = field(default=None, init=True)
quant_config: Optional[QuantConfigBase] = None
graph_opt_config: Optional[GraphOptimizationConfig] = None
moe_config: MoEConfig = field(default=None, init=True) # type: ignore