"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import argparse
import os
import random
from typing import Any
import numpy as np
import paddle
import paddle.distributed.fleet as fleet
from fastdeploy.config import ModelConfig
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.mm_processor import DataProcessor
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.attention import get_attention_backend
from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.sampler import Sampler
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \
ScatterOp
from fastdeploy.platforms import current_platform
from fastdeploy.worker.output import SamplerOutput
from fastdeploy.worker.utils import check_safetensors_model
from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
if current_platform.is_cuda() and current_platform.available():
from fastdeploy.model_executor.layers.utils import (
remove_padding, speculate_remove_padding)
from fastdeploy.model_executor.ops.gpu import (save_output, save_output_topk,
set_stop_value_multi_ends,
set_value_by_flags_and_idx,
update_inputs)
class GPUVLModelRunner(VLModelRunnerBase):
"""
The GPUVLModelRunner class for vision-language tasks on GPU.
"""
def __init__(
self,
config: ModelConfig,
args: argparse.Namespace,
nranks: int,
rank: int,
) -> None:
"""
GPUVLModelRunner init
"""
self.nranks = nranks
self.rank = rank
hcg = fleet.get_hybrid_communicate_group()
self.tensor_parallel_degree = max(hcg.get_model_parallel_world_size(),
1)
self.tensor_parallel_rank = hcg.get_model_parallel_rank()
self.mp_src_rank = hcg.get_model_parallel_group_src_rank()
self.mp_group = hcg.get_model_parallel_group()
self.is_safetensors_model = check_safetensors_model(
args.model_name_or_path)
self.enable_logprob = args.enable_logprob
model_path = os.path.dirname(args.model_name_or_path)
args.llm_model_name_or_path = args.model_name_or_path
if not self.is_safetensors_model:
args.tokenizer = args.image_preprocessor = model_path
else:
args.tokenizer = args.image_preprocessor = args.model_name_or_path
args.vision_model_name_or_path = os.path.join(
model_path, "DFNRopeVisionTransformer")
self.amp_black = [
"reduce_sum",
"c_softmax_with_cross_entropy",
"elementwise_div",
"sin",
"cos",
"sort",
"multinomial",
]
self.amp_white = [
"lookup_table",
"lookup_table_v2",
"flash_attn",
"matmul",
"matmul_v2",
"fused_gemm_epilogue",
]
super().__init__(config, args)
self.init_extra_input(config, args)
self._reset_paddle_env()
self.sampler = Sampler()

    def _reset_paddle_env(self):
        """Hook for adjusting paddle runtime flags; currently a no-op."""
        pass

    def update_chunked_prefill(self, tasks: list[Any]) -> None:
        """
        Advance chunked-prefill state for each task: feed the next prompt
        chunk, or switch the slot to decode once all chunks are consumed.
        """
if not self.args.enable_chunked_prefill:
return
for task in tasks:
if task.chunk_idx > len(task.prefill_chunk_info):
continue
idx = task.idx
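            # All prefill chunks consumed: switch this slot to decode mode,
            # emitting one token per step starting from task.start_idx.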
if task.chunk_idx == len(task.prefill_chunk_info):
self.share_inputs["seq_lens_this_time"][idx:idx + 1] = 1
self.share_inputs['seq_lens_encoder'][idx:idx + 1] = 0
self.share_inputs["seq_lens_decoder"][idx:idx +
1] = task.start_idx
self.share_inputs["step_idx"][idx:idx + 1] = 1
else:
inputs = self._preprocess_task(
task.prefill_chunk_info[task.chunk_idx])
if inputs.get("images") is not None:
self.share_inputs[
"image_features"] = self.extract_vision_features(
inputs)
                else:
                    # No images or videos in this chunk, so there are no
                    # vision features to extract.
                    self.share_inputs["image_features"] = None
token_chunk_size = inputs["input_ids"].shape[1]
self.share_inputs["input_ids"][
idx:idx + 1, :token_chunk_size] = inputs["input_ids"]
self.share_inputs["seq_lens_this_time"][idx:idx +
1] = token_chunk_size
self.share_inputs['seq_lens_encoder'][idx:idx +
1] = token_chunk_size
self.share_inputs["seq_lens_decoder"][idx:idx +
1] = task.start_idx
self.share_inputs["step_idx"][idx:idx + 1] = 0
task.start_idx += token_chunk_size
task.chunk_idx += 1

    def _init_image_preprocess(self, vision_config):
        """
        Build the multimodal DataProcessor and pre-flatten its normalization
        tensors (mean/std/rescale) for patch-level use.
        """
processor = DataProcessor(
tokenizer_name=self.args.tokenizer,
image_preprocessor_name=str(self.args.image_preprocessor),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess.image_mean_tensor = paddle.to_tensor(
image_preprocess.image_mean, dtype="float32"
).reshape([1, 3, 1, 1])
image_preprocess.image_std_tensor = paddle.to_tensor(
image_preprocess.image_std, dtype="float32"
).reshape([1, 3, 1, 1])
image_preprocess.rescale_factor = paddle.to_tensor(
image_preprocess.rescale_factor, dtype="float32"
)
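        # Flatten the [1, 3, 1, 1] mean/std tensors and repeat them per patch
        # pixel so normalization can be applied to flattened patch vectors.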
image_preprocess.image_mean_tensor = (
image_preprocess.image_mean_tensor.squeeze(
[-2, -1]
).repeat_interleave(vision_config.patch_size**2 * 1, -1)
)
image_preprocess.image_std_tensor = (
image_preprocess.image_std_tensor.squeeze(
[-2, -1]
).repeat_interleave(vision_config.patch_size**2 * 1, -1)
)
return image_preprocess
def _load_model(
self,
model_name: str,
dynamic_load_weight: int = 0,
) -> None:
"""
Load the model from the given model name.
"""
vocab_file_names = [
"tokenizer.model", "spm.model", "ernie_token_100k.model"
]
        for vocab_file_name in vocab_file_names:
            if os.path.exists(os.path.join(self.args.tokenizer,
                                           vocab_file_name)):
                ErnieBotTokenizer.resource_files_names[
                    "vocab_file"] = vocab_file_name
                break
tokenizer = ErnieBotTokenizer.from_pretrained(
self.args.tokenizer,
model_max_length=self.args.max_model_len,
padding_side="right",
use_fast=False,
)
tokenizer.ignored_index = -100
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.unk_token
self.dtype = self.args.dtype
paddle.set_default_dtype(self.dtype)
from fastdeploy.worker.worker_process import initialize_fd_config
fd_config = initialize_fd_config(
self.args, self.tensor_parallel_degree, self.tensor_parallel_rank
)
        fd_config.model_config.tensor_parallel_degree = self.tensor_parallel_degree
        fd_config.model_config.tensor_parallel_rank = self.tensor_parallel_rank
        fd_config.model_config.moe_group = "dummy"
fd_config.parallel_config.column_cut = False
vision_config = fd_config.model_config.vision_config
vision_config.attn_sep = False
vision_config.dtype = "bfloat16"
vision_config.tensor_parallel_degree = self.tensor_parallel_degree
vision_config.tensor_parallel_rank = self.tensor_parallel_rank
fd_config.model_config.pixel_hidden_size = vision_config.hidden_size
fd_config.model_config.im_patch_id = tokenizer.get_vocab()[
"<|IMAGE_PLACEHOLDER|>"
]
fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"]
fd_config.model_config.max_text_id = fd_config.model_config.im_patch_id
fd_config.model_config.sequence_parallel = False
self.fd_config = fd_config
self.model_cfg = self.fd_config.model_config
self.image_preprocess = self._init_image_preprocess(
self.fd_config.model_config.vision_config
)
from fastdeploy.model_executor.model_loader import \
get_model_from_loader
self.model = get_model_from_loader(self.fd_config)
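        # Size the attention backend for this TP shard: query heads and
        # KV heads are both split evenly across tensor-parallel ranks.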
attn_backend_cls = get_attention_backend()
num_heads = self.fd_config.model_config.num_attention_heads // \
self.fd_config.parallel_config.tensor_parallel_size
self.fd_config.model_config.kv_num_heads = int(
self.fd_config.model_config.num_key_value_heads
) // self.fd_config.parallel_config.tensor_parallel_size
head_dim = self.fd_config.model_config.head_dim
self.attn_backend = attn_backend_cls(
self.fd_config,
kv_num_heads=self.fd_config.model_config.kv_num_heads,
num_heads=num_heads,
head_dim=head_dim)
self._init_kvcache()
def init_extra_input(self, config: ModelConfig, args: argparse.Namespace) -> None:
"""
Initialize extra input tensors.
"""
head_dim = self.model_cfg.head_dim
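        # Per-request 3D RoPE cache, laid out as
        # [max_num_seqs, 2 (cos/sin), 1, max_length, 1, head_dim // 2].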
self.share_inputs.update({
"rope_emb":
paddle.full(shape=[
args.max_num_seqs, 2, 1, self.max_length, 1, head_dim // 2
],
fill_value=0,
dtype="float32")
})
self.share_inputs.update({"image_features": None})
self.share_inputs.update({
"need_think_end":
paddle.full(shape=[args.max_num_seqs, 1],
fill_value=0,
dtype="int32")
})
self.share_inputs.update({
"enable_thinking":
paddle.full(shape=[1], fill_value=True, dtype="bool")
})
self.share_inputs.update({
"reasoning_index":
paddle.full(shape=[args.max_num_seqs, 1],
fill_value=0,
dtype="int32")
})
def init_rotary_position_embedding(self, max_model_len: int) -> None:
"""
Init rotary position embedding
"""
pass
def _init_kvcache(self):
"""
Init kv cache
"""
cache_kvs = {}
total_block_num = self.num_gpu_blocks
num_layers = self.model_cfg.num_hidden_layers
        kv_num_head = (self.model_cfg.num_key_value_heads
                       if self.model_cfg.num_key_value_heads != -1 else
                       self.model_cfg.num_attention_heads)
kv_num_head = kv_num_head // self.tensor_parallel_degree
self.model_cfg.kv_num_head = kv_num_head
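        # One key cache and one value cache per layer, each laid out as
        # [total_block_num, kv_num_head, block_size, head_dim].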
for i in range(num_layers):
cache_type = self.args.dtype
cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=[
total_block_num,
kv_num_head,
self.args.block_size,
self.model_cfg.head_dim,
],
fill_value=0,
dtype=cache_type,
)
cache_kvs["value_caches_{}".format(i)] = paddle.full(
shape=[
total_block_num,
kv_num_head,
self.args.block_size,
self.model_cfg.head_dim,
],
fill_value=0,
dtype=cache_type,
)
self.share_inputs["caches"] = list(cache_kvs.values())
for value in cache_kvs.values():
del value
paddle.device.cuda.empty_cache()
def clear_parameters(self, pid: int) -> None:
""" clear_parameters """
if "caches" in self.share_inputs:
self.model.clear_parameters(pid)
del self.share_inputs["caches"]
paddle.device.cuda.empty_cache()
self.model.log_memory_usage("clear all memory")
def update_parameters(self, pid: int) -> None:
""" update_parameters """
if "caches" not in self.share_inputs:
self.model.update_parameters(pid)
self._init_kvcache()
self.model.log_memory_usage("update all memory")
@paddle.no_grad()
    def extract_vision_features(self, inputs: dict) -> paddle.Tensor:
        """Normalize raw image pixels and run the vision encoder and resampler."""
assert inputs["images"] is not None
grid_thw = inputs["grid_thw"]
images = inputs["images"].cast("float32")
images = self.image_preprocess.rescale_factor * images - self.image_preprocess.image_mean_tensor
images = images / self.image_preprocess.image_std_tensor
images = images.cast("bfloat16")
token_type_ids = inputs["token_type_ids"]
token_type_ids_w_video = token_type_ids
input_ids = inputs["input_ids"]
# convert to img patch id
image_mask = input_ids == self.model_cfg.im_patch_id
image_type_ids = inputs["image_type_ids"]
with paddle.amp.auto_cast(
True,
custom_black_list=self.amp_black,
custom_white_list=self.amp_white,
level="O2",
dtype=self.dtype,
):
image_features = self.model.vision_model.extract_feature(
images, grid_thw)
if self.tensor_parallel_degree > 1:
S, C = image_features.shape
image_features = image_features.reshape(
[-1, C * self.model_cfg.spatial_conv_size**2])
                image_features = ScatterOp.apply(
                    image_features,
                    axis=-1)  # split features along hidden dim across mp ranks
                image_features = image_features.reshape([S, -1])
image_features = self.model.resampler_model(
image_features,
image_mask,
token_type_ids_w_video,
image_type_ids,
grid_thw,
)
return image_features
@paddle.no_grad()
def prepare_rope3d(self, position_ids: paddle.Tensor, **kwargs) -> paddle.Tensor:
"""prepare_rope3d"""
prefix_max_position_ids = paddle.max(position_ids) + 1
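        # Decode-step positions continue after the largest prefill position,
        # tiled across the three rope axes (time/height/width).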
dec_pos_ids = paddle.tile(
paddle.arange(kwargs["max_length"],
dtype="int64").unsqueeze(0).unsqueeze(-1), [1, 1, 3])
dec_pos_ids = dec_pos_ids + prefix_max_position_ids
position_ids_3d_real = paddle.concat([position_ids, dec_pos_ids],
axis=1)
rope_emb = get_rope_3d(
position_ids=position_ids_3d_real,
rotary_dim=self.model_cfg.head_dim,
paritial_rotary_factor=1.0,
base=self.model_cfg.rope_theta,
max_position=self.args.max_model_len,
freq_allocation=self.model_cfg.freq_allocation,
)
return rope_emb
def prefill_finished(self):
"""
Verify prefill operation completion
"""
prefill_statue = (self.share_inputs["seq_lens_this_time"] != 0) & (
self.share_inputs["seq_lens_this_time"] != 1)
return not paddle.any(prefill_statue).numpy()

    def dy_input_preprocess(self, tasks: list[Any]) -> None:
        """
        Populate per-slot share_inputs for dynamically inserted tasks:
        sampling parameters, prompt tokens, vision features, rope and
        block tables.
        """
        def get_numeric_value(task, key, default_value):
            value = task.get(key, None)
            return value if value is not None else default_value
        for task in tasks:
            idx = task.idx
kwargs = {
"max_length":
get_numeric_value(task, "max_tokens", 2048),
"top_p":
get_numeric_value(task, "top_p", 0.8),
"temperature":
get_numeric_value(task, "temperature", 0.2),
"top_k":
get_numeric_value(task, "top_k", 0),
"penalty_score":
get_numeric_value(task, "repetition_penalty", 1.0),
"frequency_score":
get_numeric_value(task, "frequency_penalty", 0.0),
"presence_score":
get_numeric_value(task, "presence_penalty", 0.0),
"decode_strategy":
"sampling",
"pad_token_id":
self.args.pad_token_id,
"enable_thinking":
get_numeric_value(task, "enable_thinking", True),
"reasoning_max_tokens":
get_numeric_value(task, "reasoning_max_tokens", 2048),
}
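            # With chunked prefill enabled, only the first chunk is loaded
            # here; later chunks arrive via update_chunked_prefill().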
if self.args.enable_chunked_prefill:
task.set("chunk_idx", 1)
inputs = self._preprocess_task(task.prefill_chunk_info[0])
if inputs.get("images") is not None:
self.share_inputs[
"image_features"] = self.extract_vision_features(
inputs)
                else:
                    # No images or videos in this chunk, so there are no
                    # vision features to extract.
                    self.share_inputs["image_features"] = None
if task.multimodal_inputs["position_ids"] is not None:
position_ids = paddle.to_tensor(
task.multimodal_inputs["position_ids"],
dtype="int64").unsqueeze([0])
else:
position_ids = None
token_chunk_size = inputs["input_ids"].shape[1]
task.set("start_idx", token_chunk_size)
self.share_inputs["input_ids"][
idx:idx + 1, :token_chunk_size] = inputs["input_ids"]
self.share_inputs["seq_lens_this_time"][idx:idx +
1] = token_chunk_size
self.share_inputs["seq_lens_encoder"][idx:idx +
1] = token_chunk_size
self.share_inputs["step_seq_lens_encoder"][
idx:idx + 1] = token_chunk_size
else:
inputs = self._preprocess_task(task.multimodal_inputs)
if inputs.get("images") is not None:
self.share_inputs[
"image_features"] = self.extract_vision_features(
inputs)
                else:
                    # No images or videos in this request, so there are no
                    # vision features to extract.
                    self.share_inputs["image_features"] = None
position_ids = inputs["position_ids"]
length = inputs["input_ids"].shape[1]
self.share_inputs["input_ids"][
idx:idx + 1, :length] = inputs["input_ids"]
self.share_inputs["seq_lens_this_time"][idx:idx + 1] = length
self.share_inputs["seq_lens_encoder"][idx:idx + 1] = length
self.share_inputs["step_seq_lens_encoder"][idx:idx +
1] = length
            # Thinking mode: record whether a closing </think> must still be
            # forced and set the remaining reasoning-token budget.
self.share_inputs["enable_thinking"][:] = kwargs["enable_thinking"]
self.share_inputs["need_think_end"][
idx:idx + 1, :] = 1 if kwargs["enable_thinking"] else 0
self.share_inputs["reasoning_index"][
idx:idx + 1, :] = kwargs["reasoning_max_tokens"]
self.share_inputs["rope_emb"][idx:idx +
1, :] = self.prepare_rope3d(
position_ids, **kwargs)
self.share_inputs["top_p"][idx:idx + 1] = kwargs["top_p"]
self.share_inputs["temperature"][idx:idx +
1] = kwargs["temperature"]
self.share_inputs["eos_token_id"][:] = np.array(
task.eos_token_ids).astype("int64").reshape(-1, 1)
self.share_inputs["penalty_score"][idx:idx +
1] = kwargs["penalty_score"]
self.share_inputs["frequency_score"][idx:idx +
1] = kwargs["frequency_score"]
self.share_inputs["presence_score"][idx:idx +
1] = kwargs["presence_score"]
self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0
self.share_inputs["step_idx"][idx:idx + 1] = 0
self.share_inputs["min_dec_len"][idx:idx + 1] = 1
self.share_inputs["max_dec_len"][idx:idx +
1] = kwargs["max_length"]
self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["pre_ids"][idx:idx + 1] = -1
encoder_block_num = len(task.get("block_tables"))
self.share_inputs["encoder_block_lens"][idx:idx +
1] = encoder_block_num
self.share_inputs["block_tables"][idx:idx + 1, :] = -1
self.share_inputs["block_tables"][
idx:idx + 1, :encoder_block_num] = np.array(task.block_tables,
dtype="int32")

    def pre_process(self) -> None:
        """
        Strip padding from the batch, build the ForwardMeta consumed by the
        attention backend, and assemble the SamplingMetadata for this step.
        """
if current_platform.is_cuda():
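            # Speculative decoding carries draft tokens, so it uses a
            # dedicated padding-removal kernel.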
if self.args.speculative_method is not None:
(
ids_remove_padding,
padding_offset,
cum_offsets,
cu_seqlens_q,
cu_seqlens_k,
) = speculate_remove_padding(
max_len=self.args.max_model_len,
input_ids=self.share_inputs["input_ids"],
seq_lens_this_time=self.share_inputs["seq_lens_this_time"],
draft_tokens=self.share_inputs["draft_tokens"],
seq_lens_encoder=self.share_inputs["seq_lens_encoder"])
else:
(
ids_remove_padding,
padding_offset,
cum_offsets,
cu_seqlens_q,
cu_seqlens_k,
) = remove_padding(
max_len=self.args.max_model_len,
input_ids=self.share_inputs["input_ids"],
seq_lens_this_time=self.share_inputs["seq_lens_this_time"])
self.share_inputs["ids_remove_padding"] = ids_remove_padding
self.share_inputs["padding_offset"] = padding_offset
self.share_inputs["cum_offsets"] = cum_offsets
self.share_inputs["cu_seqlens_q"] = cu_seqlens_q
self.share_inputs["cu_seqlens_k"] = cu_seqlens_k
self.share_inputs["decoder_batch_ids"] = paddle.full(
[self.fd_config.parallel_config.max_num_seqs, 1], 0, dtype='int32')
self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full(
[self.fd_config.parallel_config.max_num_seqs, 1], 0, dtype='int32')
        # Build the ForwardMeta consumed by the attention backend.
self.forward_meta = ForwardMeta(
input_ids=self.share_inputs["input_ids"],
ids_remove_padding=self.share_inputs["ids_remove_padding"],
rotary_embs=self.share_inputs["rope_emb"],
attn_backend=self.attn_backend,
decoder_batch_ids=self.share_inputs["decoder_batch_ids"],
decoder_tile_ids_per_batch=self.share_inputs["decoder_tile_ids_per_batch"],
seq_lens_encoder=self.share_inputs["seq_lens_encoder"],
seq_lens_decoder=self.share_inputs["seq_lens_decoder"],
seq_lens_this_time=self.share_inputs["seq_lens_this_time"],
cum_offsets=self.share_inputs["cum_offsets"],
padding_offset=self.share_inputs["padding_offset"],
cu_seqlens_q=self.share_inputs["cu_seqlens_q"],
cu_seqlens_k=self.share_inputs["cu_seqlens_k"],
block_tables=self.share_inputs["block_tables"],
caches=self.share_inputs["caches"]
)
self.attn_backend.init_attention_metadata(self.forward_meta)
self.sampling_metadata = SamplingMetadata(
temperature=self.share_inputs["temperature"],
top_p=self.share_inputs["top_p"],
step_idx=self.share_inputs["step_idx"],
pre_token_ids=self.share_inputs["pre_ids"],
frequency_penalties=self.share_inputs["frequency_score"],
presence_penalties=self.share_inputs["presence_score"],
repetition_penalties=self.share_inputs["penalty_score"],
min_dec_lens=self.share_inputs["min_dec_len"],
bad_words_token_ids=self.share_inputs["bad_tokens"],
eos_token_ids=self.share_inputs["eos_token_id"],
max_num_logprobs=20 if self.enable_logprob else None,
)

    def generate(self) -> None:
        """
        Run one step: forward pass, logits, sampling, and post-processing.
        """
self.pre_process()
        hidden_states = self.model(self.share_inputs["ids_remove_padding"],
                                   self.share_inputs["image_features"],
                                   self.forward_meta)
        logits = self.model.compute_logits(hidden_states)
set_value_by_flags_and_idx(
self.share_inputs["pre_ids"],
self.share_inputs["input_ids"],
self.share_inputs["seq_lens_this_time"],
self.share_inputs["seq_lens_encoder"],
self.share_inputs["seq_lens_decoder"],
self.share_inputs["step_idx"],
self.share_inputs["stop_flags"],
)
# sampler & save_output
sampler_output = self.sampler(logits, self.sampling_metadata)
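        # Broadcast sampled tokens from rank 0 so all TP ranks stay in sync.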
if self.fd_config.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
self.post_process(sampler_output)

    def post_process(self, sampler_output: SamplerOutput) -> None:
        """
        Apply thinking-mode bookkeeping, advance step counters, flag finished
        requests, and persist sampled tokens (plus top-k logprobs if enabled).
        """
if self.share_inputs["enable_thinking"]:
exists_think_end = sampler_output.sampled_token_ids == self.model_cfg.think_end_id
paddle.assign(
paddle.where(
exists_think_end,
self.share_inputs["need_think_end"] - 1,
self.share_inputs["need_think_end"],
), self.share_inputs["need_think_end"])
paddle.assign(
paddle.where(
self.share_inputs["need_think_end"].cast("bool"),
self.share_inputs["reasoning_index"] - 1,
self.share_inputs["reasoning_index"],
), self.share_inputs["reasoning_index"])
stop_wo_think = (
(sampler_output.sampled_token_ids == self.share_inputs["eos_token_id"]) |
(self.share_inputs["reasoning_index"] == 0)) & (
self.share_inputs["need_think_end"] > 0)
sampler_output.sampled_token_ids = paddle.where(stop_wo_think,
self.model_cfg.think_end_id,
sampler_output.sampled_token_ids)
paddle.assign(
paddle.where(
stop_wo_think,
self.share_inputs["need_think_end"] - 1,
self.share_inputs["need_think_end"],
), self.share_inputs["need_think_end"])
paddle.assign(
paddle.where(
self.share_inputs["stop_flags"],
self.share_inputs["step_idx"],
self.share_inputs["step_idx"] + 1,
),
self.share_inputs["step_idx"],
)
length_cond = paddle.greater_equal(self.share_inputs["step_idx"],
self.share_inputs["max_dec_len"])
paddle.assign(
paddle.logical_or(self.share_inputs["stop_flags"], length_cond),
self.share_inputs["stop_flags"],
)
set_stop_value_multi_ends(
sampler_output.sampled_token_ids,
self.share_inputs["stop_flags"],
self.share_inputs["seq_lens_this_time"],
self.share_inputs["eos_token_id"],
self.share_inputs["next_tokens"],
False,
) # multi ends
# update inputs
update_inputs(
self.share_inputs["stop_flags"],
self.share_inputs["not_need_stop"],
self.share_inputs["seq_lens_this_time"],
self.share_inputs["seq_lens_encoder"],
self.share_inputs["seq_lens_decoder"],
self.share_inputs["input_ids"],
self.share_inputs["stop_nums"],
sampler_output.sampled_token_ids,
self.share_inputs["is_block_step"],
)
if sampler_output.logprobs_tensors is None:
save_output(
sampler_output.sampled_token_ids,
self.share_inputs["not_need_stop"],
self.rank,
False, # use_ep
)
else:
save_output_topk(
sampler_output.sampled_token_ids,
sampler_output.logprobs_tensors.logprob_token_ids,
sampler_output.logprobs_tensors.logprobs,
sampler_output.logprobs_tensors.selected_token_ranks,
self.share_inputs["not_need_stop"],
self.rank,
)
def _cal_theortical_kvcache(self):
"""
Calculate the size of kvcache for computational theory
"""
num_layers = self.model_cfg.num_hidden_layers
        byte_of_cache = 2  # 16-bit cache; TODO: support c8/c4 quantized caches
hidden_dim = self.model_cfg.head_dim * self.model_cfg.kv_num_head
theoretical_kv_cache_memory = (2 * byte_of_cache *
self.args.block_size * num_layers *
hidden_dim)
return theoretical_kv_cache_memory
def _update_share_input_block_num(self):
"""
Update share_inputs['block_tables'] and share_inputs['free_list']
"""
num_gpu_blocks = self.num_gpu_blocks
del self.share_inputs["caches"]
self._init_kvcache()
del self.share_inputs["block_tables"]
self.share_inputs["block_tables"] = paddle.full(
[self.args.max_num_seqs, num_gpu_blocks], -1, dtype="int32")
        # Build the free list: the top (1 - kv_cache_ratio) share of blocks,
        # listed from the highest block id downwards.
free_list = list(
range(num_gpu_blocks - 1,
int(num_gpu_blocks * self.args.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list)
self.share_inputs.update({
"free_list":
paddle.to_tensor(free_list, dtype="int32"),
"free_list_len":
paddle.full([1], self.free_list_len, dtype="int32"),
})
def dummy_input(self, num_total_tokens: int, number_of_tasks: int) -> None:
"""
fake input to profile
"""
input_length = min(num_total_tokens // number_of_tasks,
self.args.max_model_len - 10)
block_num = (input_length + self.args.block_size - 1 ) // self.args.block_size \
+ self.args.enc_dec_block_num
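        # Empty the free list so profiling observes peak block usage; each
        # fake request then claims its own contiguous range of blocks.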
self.share_inputs["free_list"] = paddle.to_tensor([], dtype="int32")
self.share_inputs["free_list_len"][0] = 0
        for idx in range(number_of_tasks):
self.share_inputs["input_ids"][idx:idx +
1, :input_length] = np.array(
[5] * input_length)
self.share_inputs["eos_token_id"][:] = np.array(
[2], dtype="int64").reshape(-1, 1)
self.share_inputs["seq_lens_this_time"][idx:idx + 1] = input_length
self.share_inputs["step_seq_lens_encoder"][idx:idx +
1] = input_length
self.share_inputs["seq_lens_encoder"][idx:idx + 1] = input_length
self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0
self.share_inputs["step_idx"][idx:idx + 1] = 0
self.share_inputs["max_dec_len"][idx:idx + 1] = 10
self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["first_token_ids"][
idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1]
self.share_inputs["ori_seq_lens_encoder"][idx:idx +
1] = input_length
self.share_inputs["infer_seed"][idx:idx + 1] = random.randint(
0, 922337203685477580)
self.share_inputs["encoder_block_lens"][idx:idx + 1] = block_num
self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(idx * block_num, \
(idx + 1) * block_num, 1)

    def _preprocess_task(self, one: dict) -> dict:
        """Convert one preprocessed sample dict into batched paddle tensors."""
input_ids = one["input_ids"][np.newaxis, :]
input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64)
token_type_ids = one["token_type_ids"][np.newaxis, :]
token_type_ids = paddle.to_tensor(token_type_ids, dtype=paddle.int64)
if one["images"] is not None:
image_type_ids = one["image_type_ids"][np.newaxis, :]
images = one["images"]
image_type_ids = paddle.to_tensor(image_type_ids,
dtype=paddle.int64)
images = paddle.to_tensor(images, dtype="uint8")
grid_thw = paddle.to_tensor(one["grid_thw"], dtype="int64")
else:
image_type_ids = None
images = None
grid_thw = None
if one["position_ids"] is not None:
position_ids = paddle.to_tensor(one["position_ids"],
dtype="int64").unsqueeze([0])
else:
position_ids = None
result = dict(
input_ids=input_ids,
image_type_ids=image_type_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
grid_thw=grid_thw,
images=images,
)
return result