mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
qwen loader (#3057)
This commit is contained in:
@@ -37,7 +37,7 @@ from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||
from fastdeploy.model_executor.layers.rotary_embedding import get_rope
|
||||
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
|
||||
from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
|
||||
from fastdeploy.model_executor.model_loader import get_model_from_loader
|
||||
from fastdeploy.model_executor.model_loader import get_model_loader
|
||||
from fastdeploy.model_executor.ops.iluvatar import set_value_by_flags_and_idx
|
||||
from fastdeploy.model_executor.pre_and_post_process import (
|
||||
post_process,
|
||||
@@ -519,17 +519,14 @@ class IluvatarModelRunner(ModelRunnerBase):
|
||||
def load_model(self) -> None:
    """Load the model described by ``self.fd_config`` and log the elapsed time.

    A loader object is obtained from ``get_model_loader`` using
    ``self.fd_config.load_config``; whatever its ``load_model`` call returns
    is stored on ``self.model``. Start and finish are reported through
    ``logger.info``.
    """
    logger.info(f"Starting to load model {self.model_config.architectures[0]}")
    time_before_load = time.perf_counter()

    # 1. Load original model
    # NOTE(review): the earlier
    # ``self.model = get_model_from_loader(fd_config=self.fd_config)`` call
    # is intentionally not kept: its result was overwritten by the
    # assignment below, so keeping both would load the model twice.
    # Confirm the old helper had no side effects that callers rely on.
    model_loader = get_model_loader(load_config=self.fd_config.load_config)
    self.model = model_loader.load_model(fd_config=self.fd_config)

    # 2. Load lora model (placeholder: nothing is loaded in this step yet)

    # 3. Load drafter model(for speculative decoding)
    #    (placeholder: nothing is loaded in this step yet)

    time_after_load = time.perf_counter()
    logger.info(f"Model loading took {time_after_load - time_before_load} seconds")
|
||||
|
||||
def get_model(self) -> nn.Layer:
    """Return the object currently stored on ``self.model``."""
    return self.model
|
||||
|
Reference in New Issue
Block a user