"""
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|

import argparse
import os
import random
from typing import Any

import numpy as np
import paddle
import paddle.distributed.fleet as fleet

from fastdeploy.config import ModelConfig
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.mm_processor import DataProcessor
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.attention import get_attention_backend
from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.sampler import Sampler
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \
    ScatterOp
from fastdeploy.platforms import current_platform
from fastdeploy.worker.output import SamplerOutput
from fastdeploy.worker.utils import check_safetensors_model
from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase

if current_platform.is_cuda() and current_platform.available():
    from fastdeploy.model_executor.layers.utils import (
        remove_padding, speculate_remove_padding)

    from fastdeploy.model_executor.ops.gpu import (save_output, save_output_topk,
                                                   set_stop_value_multi_ends,
                                                   set_value_by_flags_and_idx,
                                                   update_inputs)


class GPUVLModelRunner(VLModelRunnerBase):
    """
    The GPUVLModelRunner class for vision-language tasks on GPU.
    """

    def __init__(
        self,
        config: ModelConfig,
        args: argparse.Namespace,
        nranks: int,
        rank: int,
    ) -> None:
        """
        Initialize the GPUVLModelRunner.
        """
        self.nranks = nranks
        self.rank = rank

        hcg = fleet.get_hybrid_communicate_group()
        self.tensor_parallel_degree = max(hcg.get_model_parallel_world_size(),
                                          1)
        self.tensor_parallel_rank = hcg.get_model_parallel_rank()
        self.mp_src_rank = hcg.get_model_parallel_group_src_rank()
        self.mp_group = hcg.get_model_parallel_group()
        self.is_safetensors_model = check_safetensors_model(
            args.model_name_or_path)
        self.enable_logprob = args.enable_logprob

        model_path = os.path.dirname(args.model_name_or_path)
        args.llm_model_name_or_path = args.model_name_or_path
        if not self.is_safetensors_model:
            args.tokenizer = args.image_preprocessor = model_path
        else:
            args.tokenizer = args.image_preprocessor = args.model_name_or_path
        args.vision_model_name_or_path = os.path.join(
            model_path, "DFNRopeVisionTransformer")

        # Ops kept in float32 under AMP for numerical stability.
        self.amp_black = [
            "reduce_sum",
            "c_softmax_with_cross_entropy",
            "elementwise_div",
            "sin",
            "cos",
            "sort",
            "multinomial",
        ]
        # Ops that are safe to run in the low-precision dtype under AMP.
        self.amp_white = [
            "lookup_table",
            "lookup_table_v2",
            "flash_attn",
            "matmul",
            "matmul_v2",
            "fused_gemm_epilogue",
        ]

        super().__init__(config, args)
        self.init_extra_input(config, args)

        self._reset_paddle_env()

        self.sampler = Sampler()

    def _reset_paddle_env(self):
        """Reset Paddle environment flags; currently a no-op."""
        pass

    def update_chunked_prefill(self, tasks: list[Any]) -> None:
        """
        Advance the chunked-prefill state of the given tasks by one chunk.
        """
        if not self.args.enable_chunked_prefill:
            return

        for task in tasks:
            if task.chunk_idx > len(task.prefill_chunk_info):
                continue

            idx = task.idx
            if task.chunk_idx == len(task.prefill_chunk_info):
                # All prefill chunks are consumed; switch this slot to decode.
                self.share_inputs["seq_lens_this_time"][idx:idx + 1] = 1
                self.share_inputs["seq_lens_encoder"][idx:idx + 1] = 0
                self.share_inputs["seq_lens_decoder"][idx:idx + 1] = task.start_idx
                self.share_inputs["step_idx"][idx:idx + 1] = 1
            else:
                inputs = self._preprocess_task(
                    task.prefill_chunk_info[task.chunk_idx])
                if inputs.get("images") is not None:
                    self.share_inputs[
                        "image_features"] = self.extract_vision_features(
                            inputs)
                else:
                    # Compatible with requests that carry no images or videos.
                    self.share_inputs["image_features"] = None

                token_chunk_size = inputs["input_ids"].shape[1]
                self.share_inputs["input_ids"][
                    idx:idx + 1, :token_chunk_size] = inputs["input_ids"]
                self.share_inputs["seq_lens_this_time"][idx:idx + 1] = token_chunk_size
                self.share_inputs["seq_lens_encoder"][idx:idx + 1] = token_chunk_size
                self.share_inputs["seq_lens_decoder"][idx:idx + 1] = task.start_idx
                self.share_inputs["step_idx"][idx:idx + 1] = 0

                task.start_idx += token_chunk_size
            task.chunk_idx += 1
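
    # Illustrative walk-through of the chunk state machine above, using made-up
    # chunk sizes: assume a prompt split into three chunks of [512, 512, 256]
    # tokens. dy_input_preprocess consumes chunk 0 and leaves chunk_idx=1,
    # start_idx=512; each scheduler step then calls update_chunked_prefill:
    #   step 1: chunk_idx=1 < 3 -> prefill chunk 1, start_idx 512 -> 1024
    #   step 2: chunk_idx=2 < 3 -> prefill chunk 2, start_idx 1024 -> 1280
    #   step 3: chunk_idx=3 == 3 -> flip the slot to decode (seq_lens_this_time=1)
    #   step 4+: chunk_idx=4 > 3 -> the task is skipped via `continue`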

    def _init_image_preprocess(self, vision_config):
        """
        Build the image preprocessor and pre-compute normalization tensors.
        """
        processor = DataProcessor(
            tokenizer_name=self.args.tokenizer,
            image_preprocessor_name=str(self.args.image_preprocessor),
        )
        processor.eval()
        image_preprocess = processor.image_preprocessor
        image_preprocess.image_mean_tensor = paddle.to_tensor(
            image_preprocess.image_mean, dtype="float32"
        ).reshape([1, 3, 1, 1])
        image_preprocess.image_std_tensor = paddle.to_tensor(
            image_preprocess.image_std, dtype="float32"
        ).reshape([1, 3, 1, 1])
        image_preprocess.rescale_factor = paddle.to_tensor(
            image_preprocess.rescale_factor, dtype="float32"
        )
        # Expand the per-channel mean/std so they broadcast over flattened
        # patches of patch_size**2 pixels per channel.
        image_preprocess.image_mean_tensor = (
            image_preprocess.image_mean_tensor.squeeze(
                [-2, -1]
            ).repeat_interleave(vision_config.patch_size**2 * 1, -1)
        )
        image_preprocess.image_std_tensor = (
            image_preprocess.image_std_tensor.squeeze(
                [-2, -1]
            ).repeat_interleave(vision_config.patch_size**2 * 1, -1)
        )
        return image_preprocess
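
    # Shape sketch for the tensors prepared above, assuming patch_size=14 (a
    # typical ViT-style value, not read from any config here): image_mean
    # starts as [1, 3, 1, 1]; squeeze([-2, -1]) gives [1, 3]; and
    # repeat_interleave(14**2, -1) yields [1, 3 * 196] = [1, 588], matching a
    # flattened channels-first patch so extract_vision_features can normalize
    # patch tensors with plain broadcasting.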

    def _load_model(
        self,
        model_name: str,
        dynamic_load_weight: int = 0,
    ) -> None:
        """
        Load the model from the given model name.
        """
        # Pick the first vocab file that exists under the tokenizer directory.
        vocab_file_names = [
            "tokenizer.model", "spm.model", "ernie_token_100k.model"
        ]
        for i in range(len(vocab_file_names)):
            if os.path.exists(
                    os.path.join(self.args.tokenizer, vocab_file_names[i])):
                ErnieBotTokenizer.resource_files_names[
                    "vocab_file"] = vocab_file_names[i]
                break

        tokenizer = ErnieBotTokenizer.from_pretrained(
            self.args.tokenizer,
            model_max_length=self.args.max_model_len,
            padding_side="right",
            use_fast=False,
        )
        tokenizer.ignored_index = -100
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.unk_token

        self.dtype = self.args.dtype
        paddle.set_default_dtype(self.dtype)

        from fastdeploy.worker.worker_process import initialize_fd_config

        fd_config = initialize_fd_config(
            self.args, self.tensor_parallel_degree, self.tensor_parallel_rank
        )
        fd_config.model_config.tensor_parallel_degree = self.tensor_parallel_degree
        fd_config.model_config.tensor_parallel_rank = self.tensor_parallel_rank
        fd_config.model_config.moe_group = "dummy"
        fd_config.parallel_config.column_cut = False
        vision_config = fd_config.model_config.vision_config
        vision_config.attn_sep = False
        vision_config.dtype = "bfloat16"
        vision_config.tensor_parallel_degree = self.tensor_parallel_degree
        vision_config.tensor_parallel_rank = self.tensor_parallel_rank
        fd_config.model_config.pixel_hidden_size = vision_config.hidden_size
        fd_config.model_config.im_patch_id = tokenizer.get_vocab()[
            "<|IMAGE_PLACEHOLDER|>"
        ]
        fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"]
        fd_config.model_config.max_text_id = fd_config.model_config.im_patch_id
        fd_config.model_config.sequence_parallel = False
        self.fd_config = fd_config
        self.model_cfg = self.fd_config.model_config
        self.image_preprocess = self._init_image_preprocess(
            self.fd_config.model_config.vision_config
        )
        from fastdeploy.model_executor.model_loader import \
            get_model_from_loader

        self.model = get_model_from_loader(self.fd_config)
        attn_backend_cls = get_attention_backend()
        num_heads = self.fd_config.model_config.num_attention_heads // \
            self.fd_config.parallel_config.tensor_parallel_size
        self.fd_config.model_config.kv_num_heads = int(
            self.fd_config.model_config.num_key_value_heads
        ) // self.fd_config.parallel_config.tensor_parallel_size
        head_dim = self.fd_config.model_config.head_dim
        self.attn_backend = attn_backend_cls(
            self.fd_config,
            kv_num_heads=self.fd_config.model_config.kv_num_heads,
            num_heads=num_heads,
            head_dim=head_dim)
        self._init_kvcache()
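
    # Head-partitioning arithmetic used above, with assumed example numbers:
    # for num_attention_heads=64, num_key_value_heads=8 and
    # tensor_parallel_size=8, each rank runs 64 // 8 = 8 query heads and
    # 8 // 8 = 1 KV head; head_dim is unchanged by tensor parallelism.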

    def init_extra_input(self, config: ModelConfig, args: argparse.Namespace) -> None:
        """
        Initialize extra input tensors.
        """
        head_dim = self.model_cfg.head_dim
        self.share_inputs.update({
            "rope_emb":
            paddle.full(shape=[
                args.max_num_seqs, 2, 1, self.max_length, 1, head_dim // 2
            ],
                        fill_value=0,
                        dtype="float32")
        })
        self.share_inputs.update({"image_features": None})
        self.share_inputs.update({
            "need_think_end":
            paddle.full(shape=[args.max_num_seqs, 1],
                        fill_value=0,
                        dtype="int32")
        })
        self.share_inputs.update({
            "enable_thinking":
            paddle.full(shape=[1], fill_value=True, dtype="bool")
        })
        self.share_inputs.update({
            "reasoning_index":
            paddle.full(shape=[args.max_num_seqs, 1],
                        fill_value=0,
                        dtype="int32")
        })
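
    # The rope_emb buffer above is laid out as
    # [max_num_seqs, 2, 1, max_length, 1, head_dim // 2]: one slot per request,
    # a leading pair that presumably holds the cos/sin halves produced by
    # get_rope_3d, and head_dim // 2 rotary frequencies per position.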

    def init_rotary_position_embedding(self, max_model_len: int) -> None:
        """
        Initialize rotary position embeddings. A no-op here: this runner
        builds per-request 3D rotary embeddings in prepare_rope3d instead.
        """
        pass

    def _init_kvcache(self):
        """
        Initialize the KV cache: one key cache and one value cache per layer.
        """
        cache_kvs = {}
        total_block_num = self.num_gpu_blocks
        num_layers = self.model_cfg.num_hidden_layers

        kv_num_head = self.model_cfg.num_key_value_heads \
            if self.model_cfg.num_key_value_heads != -1 \
            else self.model_cfg.num_attention_heads
        kv_num_head = kv_num_head // self.tensor_parallel_degree
        self.model_cfg.kv_num_head = kv_num_head

        for i in range(num_layers):
            cache_type = self.args.dtype
            cache_kvs["key_caches_{}".format(i)] = paddle.full(
                shape=[
                    total_block_num,
                    kv_num_head,
                    self.args.block_size,
                    self.model_cfg.head_dim,
                ],
                fill_value=0,
                dtype=cache_type,
            )
            cache_kvs["value_caches_{}".format(i)] = paddle.full(
                shape=[
                    total_block_num,
                    kv_num_head,
                    self.args.block_size,
                    self.model_cfg.head_dim,
                ],
                fill_value=0,
                dtype=cache_type,
            )

        self.share_inputs["caches"] = list(cache_kvs.values())
        for value in cache_kvs.values():
            del value
        paddle.device.cuda.empty_cache()
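
    # Each per-layer cache above is a paged pool of shape
    # [total_block_num, kv_num_head, block_size, head_dim]; block_tables maps a
    # request's logical blocks onto rows of this pool, so sequences can grow
    # block by block without reallocating the cache.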

    def clear_parameters(self, pid: int) -> None:
        """Clear model parameters and release the KV cache."""
        if "caches" in self.share_inputs:
            self.model.clear_parameters(pid)
            del self.share_inputs["caches"]
            paddle.device.cuda.empty_cache()
            self.model.log_memory_usage("clear all memory")

    def update_parameters(self, pid: int) -> None:
        """Reload model parameters and rebuild the KV cache."""
        if "caches" not in self.share_inputs:
            self.model.update_parameters(pid)
            self._init_kvcache()
            self.model.log_memory_usage("update all memory")

    @paddle.no_grad()
    def extract_vision_features(self, inputs: dict[str, paddle.Tensor]) -> paddle.Tensor:
        """Extract vision features for the image patches in `inputs`."""
        assert inputs["images"] is not None
        grid_thw = inputs["grid_thw"]

        images = inputs["images"].cast("float32")
        # Normalize: (x * rescale_factor - mean) / std, per flattened patch.
        images = self.image_preprocess.rescale_factor * images - self.image_preprocess.image_mean_tensor
        images = images / self.image_preprocess.image_std_tensor
        images = images.cast("bfloat16")

        token_type_ids = inputs["token_type_ids"]
        token_type_ids_w_video = token_type_ids
        input_ids = inputs["input_ids"]
        # Mark positions that hold image patch placeholder tokens.
        image_mask = input_ids == self.model_cfg.im_patch_id
        image_type_ids = inputs["image_type_ids"]
        with paddle.amp.auto_cast(
                True,
                custom_black_list=self.amp_black,
                custom_white_list=self.amp_white,
                level="O2",
                dtype=self.dtype,
        ):
            image_features = self.model.vision_model.extract_feature(
                images, grid_thw)
            if self.tensor_parallel_degree > 1:
                S, C = image_features.shape
                image_features = image_features.reshape(
                    [-1, C * self.model_cfg.spatial_conv_size**2])
                # Scatter features along the hidden dim across model-parallel ranks.
                image_features = ScatterOp.apply(image_features, axis=-1)
                image_features = image_features.reshape([S, -1])
            image_features = self.model.resampler_model(
                image_features,
                image_mask,
                token_type_ids_w_video,
                image_type_ids,
                grid_thw,
            )
        return image_features
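
    # Shape sketch for the tensor-parallel scatter above, with assumed numbers:
    # S=1024 patch features of width C=1280, spatial_conv_size=2, tp=4. The
    # reshape gives [256, 5120] (grouping spatial_conv_size**2 patches per
    # row); ScatterOp splits the last axis across the 4 ranks to [256, 1280];
    # the final reshape restores [1024, 320], i.e. each rank keeps a 1/tp
    # slice of every patch's channels.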

    @paddle.no_grad()
    def prepare_rope3d(self, position_ids: paddle.Tensor, **kwargs) -> paddle.Tensor:
        """Build 3D rotary embeddings covering prefill plus decode positions."""
        # Decode positions continue from one past the largest prefill position,
        # tiled identically across the three (t, h, w) axes.
        prefix_max_position_ids = paddle.max(position_ids) + 1
        dec_pos_ids = paddle.tile(
            paddle.arange(kwargs["max_length"],
                          dtype="int64").unsqueeze(0).unsqueeze(-1), [1, 1, 3])
        dec_pos_ids = dec_pos_ids + prefix_max_position_ids
        position_ids_3d_real = paddle.concat([position_ids, dec_pos_ids],
                                             axis=1)

        rope_emb = get_rope_3d(
            position_ids=position_ids_3d_real,
            rotary_dim=self.model_cfg.head_dim,
            paritial_rotary_factor=1.0,  # (sic) spelling matches get_rope_3d's signature
            base=self.model_cfg.rope_theta,
            max_position=self.args.max_model_len,
            freq_allocation=self.model_cfg.freq_allocation,
        )
        return rope_emb
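
    # Example of the position layout above: if the multimodal prefill ends at
    # 3D position 511 on every axis, decode positions for max_length=4 are
    # [[512, 512, 512], [513, 513, 513], [514, 514, 514], [515, 515, 515]],
    # appended after the prefill positions before get_rope_3d is applied.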

    def prefill_finished(self):
        """
        Return True when no request in the batch is still prefilling.
        """
        # seq_lens_this_time semantics: 0 = idle slot, 1 = decode, >1 = prefill.
        prefill_status = (self.share_inputs["seq_lens_this_time"] != 0) & (
            self.share_inputs["seq_lens_this_time"] != 1)
        return not paddle.any(prefill_status).numpy()

    def dy_input_preprocess(self, tasks: list[Any]) -> None:
        """
        Insert dynamically scheduled tasks into the shared inputs.
        """

        def get_numeric_value(task, key, default_value):
            if task.get(key, None) is not None:
                return task.get(key)
            else:
                return default_value

        for i in range(len(tasks)):
            task = tasks[i]
            idx = task.idx

            kwargs = {
                "max_length": get_numeric_value(task, "max_tokens", 2048),
                "top_p": get_numeric_value(task, "top_p", 0.8),
                "temperature": get_numeric_value(task, "temperature", 0.2),
                "top_k": get_numeric_value(task, "top_k", 0),
                "penalty_score": get_numeric_value(task, "repetition_penalty", 1.0),
                "frequency_score": get_numeric_value(task, "frequency_penalty", 0.0),
                "presence_score": get_numeric_value(task, "presence_penalty", 0.0),
                "decode_strategy": "sampling",
                "pad_token_id": self.args.pad_token_id,
                "enable_thinking": get_numeric_value(task, "enable_thinking", True),
                "reasoning_max_tokens": get_numeric_value(task, "reasoning_max_tokens", 2048),
            }

            if self.args.enable_chunked_prefill:
                task.set("chunk_idx", 1)
                inputs = self._preprocess_task(task.prefill_chunk_info[0])
                if inputs.get("images") is not None:
                    self.share_inputs[
                        "image_features"] = self.extract_vision_features(
                            inputs)
                else:
                    # Compatible with requests that carry no images or videos.
                    self.share_inputs["image_features"] = None
                if task.multimodal_inputs["position_ids"] is not None:
                    position_ids = paddle.to_tensor(
                        task.multimodal_inputs["position_ids"],
                        dtype="int64").unsqueeze([0])
                else:
                    position_ids = None

                token_chunk_size = inputs["input_ids"].shape[1]
                task.set("start_idx", token_chunk_size)
                self.share_inputs["input_ids"][
                    idx:idx + 1, :token_chunk_size] = inputs["input_ids"]
                self.share_inputs["seq_lens_this_time"][idx:idx + 1] = token_chunk_size
                self.share_inputs["seq_lens_encoder"][idx:idx + 1] = token_chunk_size
                self.share_inputs["step_seq_lens_encoder"][
                    idx:idx + 1] = token_chunk_size
            else:
                inputs = self._preprocess_task(task.multimodal_inputs)
                if inputs.get("images") is not None:
                    self.share_inputs[
                        "image_features"] = self.extract_vision_features(
                            inputs)
                else:
                    # Compatible with requests that carry no images or videos.
                    self.share_inputs["image_features"] = None
                position_ids = inputs["position_ids"]

                length = inputs["input_ids"].shape[1]
                self.share_inputs["input_ids"][
                    idx:idx + 1, :length] = inputs["input_ids"]
                self.share_inputs["seq_lens_this_time"][idx:idx + 1] = length
                self.share_inputs["seq_lens_encoder"][idx:idx + 1] = length
                self.share_inputs["step_seq_lens_encoder"][idx:idx + 1] = length

            # Thinking state, used later by post_process to force </think>.
            self.share_inputs["enable_thinking"][:] = kwargs["enable_thinking"]
            self.share_inputs["need_think_end"][
                idx:idx + 1, :] = 1 if kwargs["enable_thinking"] else 0

            self.share_inputs["reasoning_index"][
                idx:idx + 1, :] = kwargs["reasoning_max_tokens"]

            self.share_inputs["rope_emb"][idx:idx + 1, :] = self.prepare_rope3d(
                position_ids, **kwargs)

            self.share_inputs["top_p"][idx:idx + 1] = kwargs["top_p"]
            self.share_inputs["temperature"][idx:idx + 1] = kwargs["temperature"]
            self.share_inputs["eos_token_id"][:] = np.array(
                task.eos_token_ids).astype("int64").reshape(-1, 1)
            self.share_inputs["penalty_score"][idx:idx + 1] = kwargs["penalty_score"]
            self.share_inputs["frequency_score"][idx:idx + 1] = kwargs["frequency_score"]
            self.share_inputs["presence_score"][idx:idx + 1] = kwargs["presence_score"]
            self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0
            self.share_inputs["step_idx"][idx:idx + 1] = 0
            self.share_inputs["min_dec_len"][idx:idx + 1] = 1
            self.share_inputs["max_dec_len"][idx:idx + 1] = kwargs["max_length"]
            self.share_inputs["stop_flags"][idx:idx + 1] = False
            self.share_inputs["pre_ids"][idx:idx + 1] = -1
            encoder_block_num = len(task.get("block_tables"))
            self.share_inputs["encoder_block_lens"][idx:idx + 1] = encoder_block_num
            self.share_inputs["block_tables"][idx:idx + 1, :] = -1
            self.share_inputs["block_tables"][
                idx:idx + 1, :encoder_block_num] = np.array(task.block_tables,
                                                            dtype="int32")
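
    # Example of the fallback behaviour above: a task that only sets
    # max_tokens=1024 is inserted with top_p=0.8, temperature=0.2, top_k=0,
    # repetition/frequency/presence penalties of 1.0/0.0/0.0, thinking
    # enabled, and a reasoning budget of 2048 tokens.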

    def pre_process(self) -> None:
        """
        Prepare flattened inputs, forward metadata, and sampling metadata.
        """
        if current_platform.is_cuda():
            if self.args.speculative_method is not None:
                (
                    ids_remove_padding,
                    padding_offset,
                    cum_offsets,
                    cu_seqlens_q,
                    cu_seqlens_k,
                ) = speculate_remove_padding(
                    max_len=self.args.max_model_len,
                    input_ids=self.share_inputs["input_ids"],
                    seq_lens_this_time=self.share_inputs["seq_lens_this_time"],
                    draft_tokens=self.share_inputs["draft_tokens"],
                    seq_lens_encoder=self.share_inputs["seq_lens_encoder"])
            else:
                (
                    ids_remove_padding,
                    padding_offset,
                    cum_offsets,
                    cu_seqlens_q,
                    cu_seqlens_k,
                ) = remove_padding(
                    max_len=self.args.max_model_len,
                    input_ids=self.share_inputs["input_ids"],
                    seq_lens_this_time=self.share_inputs["seq_lens_this_time"])
            self.share_inputs["ids_remove_padding"] = ids_remove_padding
            self.share_inputs["padding_offset"] = padding_offset
            self.share_inputs["cum_offsets"] = cum_offsets
            self.share_inputs["cu_seqlens_q"] = cu_seqlens_q
            self.share_inputs["cu_seqlens_k"] = cu_seqlens_k
        self.share_inputs["decoder_batch_ids"] = paddle.full(
            [self.fd_config.parallel_config.max_num_seqs, 1], 0, dtype="int32")
        self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full(
            [self.fd_config.parallel_config.max_num_seqs, 1], 0, dtype="int32")
        # Initialize the forward metadata consumed by the attention backend.
        self.forward_meta = ForwardMeta(
            input_ids=self.share_inputs["input_ids"],
            ids_remove_padding=self.share_inputs["ids_remove_padding"],
            rotary_embs=self.share_inputs["rope_emb"],
            attn_backend=self.attn_backend,
            decoder_batch_ids=self.share_inputs["decoder_batch_ids"],
            decoder_tile_ids_per_batch=self.share_inputs["decoder_tile_ids_per_batch"],
            seq_lens_encoder=self.share_inputs["seq_lens_encoder"],
            seq_lens_decoder=self.share_inputs["seq_lens_decoder"],
            seq_lens_this_time=self.share_inputs["seq_lens_this_time"],
            cum_offsets=self.share_inputs["cum_offsets"],
            padding_offset=self.share_inputs["padding_offset"],
            cu_seqlens_q=self.share_inputs["cu_seqlens_q"],
            cu_seqlens_k=self.share_inputs["cu_seqlens_k"],
            block_tables=self.share_inputs["block_tables"],
            caches=self.share_inputs["caches"]
        )
        self.attn_backend.init_attention_metadata(self.forward_meta)

        self.sampling_metadata = SamplingMetadata(
            temperature=self.share_inputs["temperature"],
            top_p=self.share_inputs["top_p"],
            step_idx=self.share_inputs["step_idx"],
            pre_token_ids=self.share_inputs["pre_ids"],
            frequency_penalties=self.share_inputs["frequency_score"],
            presence_penalties=self.share_inputs["presence_score"],
            repetition_penalties=self.share_inputs["penalty_score"],
            min_dec_lens=self.share_inputs["min_dec_len"],
            bad_words_token_ids=self.share_inputs["bad_tokens"],
            eos_token_ids=self.share_inputs["eos_token_id"],
            max_num_logprobs=20 if self.enable_logprob else None,
        )

    def generate(self) -> None:
        """
        Run one step: preprocess, forward, sample, and postprocess.
        """
        self.pre_process()
        hidden_states = self.model(self.share_inputs["ids_remove_padding"],
                                   self.share_inputs["image_features"],
                                   self.forward_meta)
        logits = self.model.compute_logits(hidden_states)
        # Record this step's tokens into pre_ids for the sampler's penalties.
        set_value_by_flags_and_idx(
            self.share_inputs["pre_ids"],
            self.share_inputs["input_ids"],
            self.share_inputs["seq_lens_this_time"],
            self.share_inputs["seq_lens_encoder"],
            self.share_inputs["seq_lens_decoder"],
            self.share_inputs["step_idx"],
            self.share_inputs["stop_flags"],
        )
        # Sample next tokens, then persist outputs.
        sampler_output = self.sampler(logits, self.sampling_metadata)
        if self.fd_config.parallel_config.tensor_parallel_size > 1:
            # Keep sampled tokens consistent across tensor-parallel ranks.
            paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
        self.post_process(sampler_output)

    def post_process(self, sampler_output: SamplerOutput) -> None:
        """
        Post-process sampled tokens: thinking bookkeeping, stop handling,
        input updates, and output saving.
        """
        if self.share_inputs["enable_thinking"]:
            # A sampled </think> closes the thinking span for that request.
            exists_think_end = sampler_output.sampled_token_ids == self.model_cfg.think_end_id
            paddle.assign(
                paddle.where(
                    exists_think_end,
                    self.share_inputs["need_think_end"] - 1,
                    self.share_inputs["need_think_end"],
                ), self.share_inputs["need_think_end"])

            # Requests still thinking consume one token of reasoning budget.
            paddle.assign(
                paddle.where(
                    self.share_inputs["need_think_end"].cast("bool"),
                    self.share_inputs["reasoning_index"] - 1,
                    self.share_inputs["reasoning_index"],
                ), self.share_inputs["reasoning_index"])

            # Force </think> when EOS arrives or the budget runs out mid-thought.
            stop_wo_think = (
                (sampler_output.sampled_token_ids == self.share_inputs["eos_token_id"]) |
                (self.share_inputs["reasoning_index"] == 0)) & (
                    self.share_inputs["need_think_end"] > 0)
            sampler_output.sampled_token_ids = paddle.where(
                stop_wo_think,
                self.model_cfg.think_end_id,
                sampler_output.sampled_token_ids)
            paddle.assign(
                paddle.where(
                    stop_wo_think,
                    self.share_inputs["need_think_end"] - 1,
                    self.share_inputs["need_think_end"],
                ), self.share_inputs["need_think_end"])
        paddle.assign(
            paddle.where(
                self.share_inputs["stop_flags"],
                self.share_inputs["step_idx"],
                self.share_inputs["step_idx"] + 1,
            ),
            self.share_inputs["step_idx"],
        )
        length_cond = paddle.greater_equal(self.share_inputs["step_idx"],
                                           self.share_inputs["max_dec_len"])
        paddle.assign(
            paddle.logical_or(self.share_inputs["stop_flags"], length_cond),
            self.share_inputs["stop_flags"],
        )

        set_stop_value_multi_ends(
            sampler_output.sampled_token_ids,
            self.share_inputs["stop_flags"],
            self.share_inputs["seq_lens_this_time"],
            self.share_inputs["eos_token_id"],
            self.share_inputs["next_tokens"],
            False,
        )  # multi ends
        # Update shared inputs for the next step.
        update_inputs(
            self.share_inputs["stop_flags"],
            self.share_inputs["not_need_stop"],
            self.share_inputs["seq_lens_this_time"],
            self.share_inputs["seq_lens_encoder"],
            self.share_inputs["seq_lens_decoder"],
            self.share_inputs["input_ids"],
            self.share_inputs["stop_nums"],
            sampler_output.sampled_token_ids,
            self.share_inputs["is_block_step"],
        )
        if sampler_output.logprobs_tensors is None:
            save_output(
                sampler_output.sampled_token_ids,
                self.share_inputs["not_need_stop"],
                self.rank,
                False,  # use_ep
            )
        else:
            save_output_topk(
                sampler_output.sampled_token_ids,
                sampler_output.logprobs_tensors.logprob_token_ids,
                sampler_output.logprobs_tensors.logprobs,
                sampler_output.logprobs_tensors.selected_token_ranks,
                self.share_inputs["not_need_stop"],
                self.rank,
            )
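
    # Bookkeeping trace for the thinking logic above: while need_think_end is
    # 1 the request is still inside a <think> span and reasoning_index counts
    # down by one per step; sampling </think> (or hitting EOS / exhausting the
    # budget, via stop_wo_think) clears the flag, and in the forced case the
    # sampled token is rewritten to think_end_id so decoding exits the span.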

    def _cal_theortical_kvcache(self):
        """
        Calculate the theoretical KV cache size, in bytes, of one cache block
        across all layers.
        """
        num_layers = self.model_cfg.num_hidden_layers
        byte_of_cache = 2  # bytes per element for 16-bit caches; c8/c4 quantized caches would change this

        hidden_dim = self.model_cfg.head_dim * self.model_cfg.kv_num_head
        # The leading 2 accounts for the separate key and value caches.
        theoretical_kv_cache_memory = (2 * byte_of_cache *
                                       self.args.block_size * num_layers *
                                       hidden_dim)
        return theoretical_kv_cache_memory
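
    # Worked example of the formula above, with assumed values: for
    # num_hidden_layers=32, kv_num_head=8, head_dim=128, block_size=64 and
    # 2-byte elements, one block costs 2 (K and V) * 2 B * 64 * 32 * (8 * 128)
    # = 8,388,608 B = 8 MiB of KV cache across all layers.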

    def _update_share_input_block_num(self):
        """
        Rebuild the KV cache and reset share_inputs["block_tables"] and
        share_inputs["free_list"] for the current num_gpu_blocks.
        """
        num_gpu_blocks = self.num_gpu_blocks

        del self.share_inputs["caches"]
        self._init_kvcache()

        del self.share_inputs["block_tables"]
        self.share_inputs["block_tables"] = paddle.full(
            [self.args.max_num_seqs, num_gpu_blocks], -1, dtype="int32")

        # Initialize the free list with the blocks above the kv_cache_ratio
        # watermark, in descending block-id order.
        free_list = list(
            range(num_gpu_blocks - 1,
                  int(num_gpu_blocks * self.args.kv_cache_ratio) - 1, -1))
        self.free_list_len = len(free_list)
        self.share_inputs.update({
            "free_list":
            paddle.to_tensor(free_list, dtype="int32"),
            "free_list_len":
            paddle.full([1], self.free_list_len, dtype="int32"),
        })
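
    # Free-list example for the update above, with assumed numbers: for
    # num_gpu_blocks=1000 and kv_cache_ratio=0.75, free_list holds block ids
    # 999 down to 750 (250 entries); the blocks below the watermark are left
    # to be allocated outside the free list.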

    def dummy_input(self, num_total_tokens: int, number_of_tasks: int) -> None:
        """
        Fill share_inputs with fake requests for memory profiling.
        """
        input_length = min(num_total_tokens // number_of_tasks,
                           self.args.max_model_len - 10)
        block_num = (input_length + self.args.block_size - 1) // self.args.block_size \
            + self.args.enc_dec_block_num
        self.share_inputs["free_list"] = paddle.to_tensor([], dtype="int32")
        self.share_inputs["free_list_len"][0] = 0

        for i in range(number_of_tasks):
            idx = i
            self.share_inputs["input_ids"][idx:idx + 1, :input_length] = np.array(
                [5] * input_length)
            self.share_inputs["eos_token_id"][:] = np.array(
                [2], dtype="int64").reshape(-1, 1)
            self.share_inputs["seq_lens_this_time"][idx:idx + 1] = input_length
            self.share_inputs["step_seq_lens_encoder"][idx:idx + 1] = input_length
            self.share_inputs["seq_lens_encoder"][idx:idx + 1] = input_length
            self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0
            self.share_inputs["step_idx"][idx:idx + 1] = 0
            self.share_inputs["max_dec_len"][idx:idx + 1] = 10
            self.share_inputs["stop_flags"][idx:idx + 1] = False

            self.share_inputs["first_token_ids"][
                idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1]
            self.share_inputs["ori_seq_lens_encoder"][idx:idx + 1] = input_length

            self.share_inputs["infer_seed"][idx:idx + 1] = random.randint(
                0, 922337203685477580)
            self.share_inputs["encoder_block_lens"][idx:idx + 1] = block_num
            self.share_inputs["block_tables"][idx:idx + 1, :block_num] = np.arange(
                idx * block_num, (idx + 1) * block_num, 1)
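
    # Block-count check for the profiling input above, with assumed numbers:
    # input_length=1000 and block_size=64 give (1000 + 63) // 64 = 16 blocks,
    # plus enc_dec_block_num extras; task i is handed the contiguous block ids
    # [i * block_num, (i + 1) * block_num).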

    def _preprocess_task(self, one: dict) -> dict:
        """Convert one task's numpy inputs into batched paddle tensors."""
        input_ids = one["input_ids"][np.newaxis, :]
        input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64)
        token_type_ids = one["token_type_ids"][np.newaxis, :]
        token_type_ids = paddle.to_tensor(token_type_ids, dtype=paddle.int64)

        if one["images"] is not None:
            image_type_ids = one["image_type_ids"][np.newaxis, :]
            images = one["images"]
            image_type_ids = paddle.to_tensor(image_type_ids,
                                              dtype=paddle.int64)
            images = paddle.to_tensor(images, dtype="uint8")
            grid_thw = paddle.to_tensor(one["grid_thw"], dtype="int64")
        else:
            image_type_ids = None
            images = None
            grid_thw = None

        if one["position_ids"] is not None:
            position_ids = paddle.to_tensor(one["position_ids"],
                                            dtype="int64").unsqueeze([0])
        else:
            position_ids = None

        result = dict(
            input_ids=input_ids,
            image_type_ids=image_type_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            grid_thw=grid_thw,
            images=images,
        )
        return result
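
    # Result sketch for _preprocess_task, under assumed inputs: a prompt of
    # 600 tokens with one image becomes input_ids / token_type_ids of shape
    # [1, 600] (int64), uint8 patch pixels in `images`, per-image (t, h, w)
    # patch counts in grid_thw, and position_ids unsqueezed to a leading
    # batch dim; text-only prompts leave the image fields as None.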