qwen loader (#3057)

This commit is contained in:
bukejiyu
2025-07-30 19:09:38 +08:00
committed by GitHub
parent 28fff1b035
commit db698bda01
22 changed files with 494 additions and 92 deletions

View File

@@ -41,7 +41,7 @@ from fastdeploy.model_executor.layers.attention.base_attention_backend import (
from fastdeploy.model_executor.layers.rotary_embedding import get_rope
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
from fastdeploy.model_executor.model_loader import get_model_from_loader
from fastdeploy.model_executor.model_loader import get_model_loader
from fastdeploy.model_executor.ops.gcu import set_value_by_flags_and_idx
from fastdeploy.model_executor.pre_and_post_process import (
post_process,
@@ -547,10 +547,9 @@ class GCUModelRunner(ModelRunnerBase):
def load_model(self) -> None:
"""load or download model"""
logger.info(f"Starting to load model {self.model_config.architectures[0]}")
time_before_load = time.perf_counter()
# 1. Load original model
self.model = get_model_from_loader(fd_config=self.fd_config)
model_loader = get_model_loader(load_config=self.fd_config.load_config)
self.model = model_loader.load_model(fd_config=self.fd_config)
# 1.1 Load RL dynamic model
if self.fd_config.load_config.dynamic_load_weight:
from fastdeploy.rl.dynamic_weight_manager import DynamicWeightManager
@@ -561,9 +560,6 @@ class GCUModelRunner(ModelRunnerBase):
# 3. Load drafter model (for speculative decoding)
time_after_load = time.perf_counter()
logger.info(f"Model loading took {time_after_load - time_before_load} seconds")
# 4. Init proposer for speculative method
self._init_speculative_proposer()