polish code with new pre-commit rule (#2923)

Zero Rains
2025-07-19 23:19:27 +08:00
committed by GitHub
parent b8676d71a8
commit 25698d56d1
424 changed files with 14307 additions and 13518 deletions
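The hook configuration itself is not part of this page, but the hunks below are consistent with a black-style auto-formatter run via pre-commit. A minimal sketch of the three conventions that dominate the diff (all names and values here are illustrative assumptions, not commit content): double quotes, one argument per line behind a trailing comma, and PEP 8 spacing inside non-trivial slice bounds.

# Illustrative only; not taken from the commit.
max_num_seqs = 4
idx = 2

dtype = "int32"  # quote normalization: 'int32' becomes "int32"

# "magic trailing comma": a call ending in a comma stays one argument per line
config = dict(
    shape=[max_num_seqs, 64],
    fill_value=-1,
    dtype=dtype,
)

# complex slice bounds get spaces around the colon: buf[idx:idx + 1] -> buf[idx : idx + 1]
buf = list(range(10))
buf[idx : idx + 1] = [0]
print(config, buf)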


@@ -13,20 +13,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import random
import time
from typing import Dict, List, Optional
import numpy as np
import paddle
-import paddle.nn as nn
+from paddle import nn
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
from fastdeploy.model_executor.layers.attention import get_attention_backend
-from fastdeploy.model_executor.layers.attention.base_attention_backend import \
-    AttentionBackend
+from fastdeploy.model_executor.layers.attention.base_attention_backend import (
+    AttentionBackend,
+)
from fastdeploy.model_executor.layers.rotary_embedding import get_rope
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.sampler import Sampler
@@ -39,30 +41,31 @@ logger = get_logger("xpu_model_runner", "xpu_model_runner.log")
def xpu_pre_process(
-        max_len: int,
-        input_ids: paddle.Tensor,
-        seq_lens_this_time: int,
-        share_inputs: Dict,
-        use_speculate_method: bool,
-        draft_tokens: Optional[paddle.Tensor] = None,
-        seq_lens_encoder: Optional[paddle.Tensor] = None,
-        seq_lens_decoder: Optional[paddle.Tensor] = None) -> XPUForwardMeta:
-    """
-    """
+    max_len: int,
+    input_ids: paddle.Tensor,
+    seq_lens_this_time: int,
+    share_inputs: Dict,
+    use_speculate_method: bool,
+    draft_tokens: Optional[paddle.Tensor] = None,
+    seq_lens_encoder: Optional[paddle.Tensor] = None,
+    seq_lens_decoder: Optional[paddle.Tensor] = None,
+) -> XPUForwardMeta:
+    """ """
cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
token_num = paddle.sum(seq_lens_this_time)
-    from fastdeploy.model_executor.ops.xpu import (adjust_batch,
-                                                   get_infer_param,
-                                                   get_padding_offset)
+    from fastdeploy.model_executor.ops.xpu import (
+        adjust_batch,
+        get_infer_param,
+        get_padding_offset,
+    )
(
ids_remove_padding,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
cu_seqlens_k,
-    ) = get_padding_offset(input_ids, cum_offsets_now, token_num,
-                           seq_lens_this_time)
+    ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time)
share_inputs["ids_remove_padding"] = None # set this after adjust batch
share_inputs["cum_offsets"] = cum_offsets
@@ -83,7 +86,7 @@ def xpu_pre_process(
cu_seqlens_q=share_inputs["cu_seqlens_q"],
cu_seqlens_k=share_inputs["cu_seqlens_k"],
block_tables=share_inputs["block_tables"],
caches=share_inputs["caches"]
caches=share_inputs["caches"],
)
# Get xpu extra param
@@ -134,10 +137,9 @@ def xpu_process_output(
cum_offsets: paddle.Tensor,
xpu_forward_meta: XPUForwardMeta,
) -> paddle.Tensor:
"""
"""
""" """
from fastdeploy.model_executor.ops.xpu import gather_next_token
hiddden_states = gather_next_token(
forward_output,
cum_offsets,
@@ -155,15 +157,17 @@ def xpu_process_output(
return hiddden_states
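gather_next_token is likewise a custom XPU kernel; a hedged NumPy sketch of the gather it performs (shapes assumed): for each sequence, select the hidden state of its final token from the packed, padding-free hidden-state tensor.

import numpy as np

seq_lens = np.array([3, 1, 4])                           # tokens per sequence
cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)])  # [0, 3, 4, 8]
packed_hidden = np.arange(8 * 2).reshape(8, 2)           # 8 packed tokens, hidden size 2

last_token_idx = cu_seqlens[1:] - 1                      # [2, 3, 7]
next_token_hidden = packed_hidden[last_token_idx]        # one row per sequence
print(next_token_hidden)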
-def xpu_post_process(sampled_token_ids: paddle.Tensor,
-                     model_output: ModelOutputData,
-                     skip_save_output: bool) -> None:
-    """
-    """
-    from fastdeploy.model_executor.ops.xpu import (save_output,
-                                                   set_stop_value_multi_ends,
-                                                   update_inputs)
+def xpu_post_process(
+    sampled_token_ids: paddle.Tensor,
+    model_output: ModelOutputData,
+    skip_save_output: bool,
+) -> None:
+    """ """
+    from fastdeploy.model_executor.ops.xpu import (
+        save_output,
+        set_stop_value_multi_ends,
+        update_inputs,
+    )
# 1. Set stop value
paddle.assign(
@@ -174,16 +178,19 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor,
),
model_output.step_idx,
)
-    length_cond = paddle.greater_equal(model_output.step_idx,
-                                       model_output.max_dec_len)
+    length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len)
paddle.assign(
paddle.logical_or(model_output.stop_flags, length_cond),
model_output.stop_flags,
)
-    set_stop_value_multi_ends(sampled_token_ids, model_output.stop_flags,
-                              model_output.seq_lens_this_time,
-                              model_output.eos_token_id,
-                              model_output.next_tokens, False)  # multi ends
+    set_stop_value_multi_ends(
+        sampled_token_ids,
+        model_output.stop_flags,
+        model_output.seq_lens_this_time,
+        model_output.eos_token_id,
+        model_output.next_tokens,
+        False,
+    )  # multi ends
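A minimal NumPy sketch of the length-stop step above (the real code operates on paddle tensors; field names mirror ModelOutputData, values are made up): a sequence stops once its step index reaches its per-request max_dec_len, or if it was already stopped.

import numpy as np

step_idx = np.array([[5], [9], [2]])
max_dec_len = np.array([[10], [9], [10]])
stop_flags = np.array([[False], [False], [True]])

length_cond = step_idx >= max_dec_len                # paddle.greater_equal
stop_flags = np.logical_or(stop_flags, length_cond)  # paddle.logical_or
print(stop_flags.ravel())                            # [False  True  True]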
# 2. Update the input buffer of the model
with paddle.framework._no_check_dy2st_diff():
@@ -209,12 +216,16 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor,
)
-def step_paddle(share_inputs: Dict[str, paddle.Tensor], block_size: int,
-                enc_dec_block_num: int) -> None:
+def step_paddle(
+    share_inputs: Dict[str, paddle.Tensor],
+    block_size: int,
+    enc_dec_block_num: int,
+) -> None:
"""
TODO(gongshaotian): normalization name
"""
from fastdeploy.model_executor.ops.xpu import step_paddle
step_paddle(
share_inputs["stop_flags"],
share_inputs["seq_lens_this_time"],
@@ -246,8 +257,7 @@ def step_paddle(share_inputs: Dict[str, paddle.Tensor], block_size: int,
class XPUModelRunner(ModelRunnerBase):
""" """
-    def __init__(self, fd_config: FDConfig, device: str, rank: int,
-                 local_rank: int):
+    def __init__(self, fd_config: FDConfig, device: str, rank: int, local_rank: int):
super().__init__(fd_config=fd_config, device=device)
self.rank = rank
self.local_rank = local_rank
@@ -260,15 +270,15 @@ class XPUModelRunner(ModelRunnerBase):
# CUDA Graph
self.use_cudagraph = False
-        self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs,
-                                      dtype='int32')
+        self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
# Initialize share inputs
self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs)
self.infer_seed_increment = paddle.full(
shape=[self.parallel_config.max_num_seqs, 1],
fill_value=4,
dtype="int64")
dtype="int64",
)
# Initialize attention Backend
# Note(gongshaotian): Currently, all attention layers share one attention backend instance.
@@ -281,68 +291,55 @@ class XPUModelRunner(ModelRunnerBase):
self.forward_meta: ForwardMeta = None
def process_prefill_inputs(self, req_dicts: List[Request]):
""" Process inputs for prefill tasks and update share_inputs buffer """
"""Process inputs for prefill tasks and update share_inputs buffer"""
req_len = len(req_dicts)
for i in range(req_len):
request = req_dicts[i]
idx = request.idx
length = request.prompt_token_ids_len
self.share_inputs["input_ids"][idx:idx + 1, :length] = np.array(
request.prompt_token_ids)
if len(request.eos_token_ids
) < self.parallel_config.eos_tokens_lens:
self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids)
if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens:
request.eos_token_ids.append(request.eos_token_ids[0])
self.share_inputs["eos_token_id"][:] = np.array(
request.eos_token_ids, dtype="int64").reshape(-1, 1)
self.share_inputs["pre_ids"][idx:idx + 1] = -1
self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 0.7)
self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["temperature"][idx:idx + 1] = request.get(
"temperature", 0.95)
self.share_inputs["penalty_score"][idx:idx + 1] = request.get(
"repetition_penalty", 1.0)
self.share_inputs["frequency_score"][idx:idx + 1] = request.get(
"frequency_penalty", 0.0)
self.share_inputs["presence_score"][idx:idx + 1] = request.get(
"presence_penalty", 0.0)
self.share_inputs["seq_lens_this_time"][idx:idx + 1] = length
self.share_inputs["step_seq_lens_encoder"][idx:idx + 1] = length
self.share_inputs["seq_lens_encoder"][idx:idx + 1] = length
self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0
self.share_inputs["step_idx"][idx:idx + 1] = 0
self.share_inputs["min_dec_len"][idx:idx + 1] = request.get(
"min_tokens", 1)
self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
self.share_inputs["pre_ids"][idx : idx + 1] = -1
self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0)
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
self.share_inputs["step_idx"][idx : idx + 1] = 0
self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1)
self.share_inputs["max_dec_len"][idx:idx + 1] = request.get(
"max_tokens", self.model_config.max_model_len)
self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["max_dec_len"][idx : idx + 1] = request.get(
"max_tokens", self.model_config.max_model_len
)
self.share_inputs["stop_flags"][idx : idx + 1] = False
self.share_inputs["first_token_ids"][
idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1]
self.share_inputs["ori_seq_lens_encoder"][idx:idx + 1] = length
self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length
if request.get("seed") is not None:
self.share_inputs["infer_seed"][idx:idx +
1] = request.get("seed")
self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed")
encoder_block_num = len(request.get("block_tables"))
self.share_inputs["encoder_block_lens"][idx:idx +
1] = encoder_block_num
self.share_inputs["block_tables"][idx:idx + 1, :] = -1
self.share_inputs["block_tables"][
idx:idx + 1, :encoder_block_num] = np.array(
request.block_tables, dtype="int32")
self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
self.share_inputs["block_tables"][idx : idx + 1, :] = -1
self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
request.block_tables, dtype="int32"
)
if request.get("stop_token_ids") is not None and request.get(
"stop_seqs_len") is not None:
if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
stop_seqs_num = len(request.get("stop_seqs_len"))
-                for i in range(stop_seqs_num,
-                               self.model_config.max_stop_seqs_num):
+                for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num):
request.stop_seqs_len.append(0)
self.share_inputs["stop_seqs_len"][:] = np.array(
request.stop_seqs_len, dtype="int32")
self.share_inputs["stop_seqs"][:stop_seqs_num, :len(
request.get("stop_token_ids")[0])] = np.array(
request.get("stop_token_ids"), dtype="int64")
self.share_inputs["stop_seqs_len"][:] = np.array(request.stop_seqs_len, dtype="int32")
self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
request.get("stop_token_ids"), dtype="int64"
)
self.share_inputs["not_need_stop"][0] = True
@@ -356,151 +353,108 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["pre_ids"] = paddle.full(
[max_num_seqs, self.parallel_config.max_model_len],
-1,
-            dtype='int64')
+            dtype="int64",
+        )
self.share_inputs["input_ids"] = paddle.full(
[max_num_seqs, self.parallel_config.max_model_len],
self.parallel_config.pad_token_id,
-            dtype='int64')
-        self.share_inputs["eos_token_id"] = paddle.full(
-            [self.parallel_config.eos_tokens_lens, 1], 0, dtype='int64')
-        self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1],
-                                                 self.model_config.top_p,
-                                                 dtype='float32')
-        self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1],
-                                                 0,
-                                                 dtype='int64')
+            dtype="int64",
+        )
+        self.share_inputs["eos_token_id"] = paddle.full([self.parallel_config.eos_tokens_lens, 1], 0, dtype="int64")
+        self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
+        self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
self.share_inputs["temperature"] = paddle.full(
-            [max_num_seqs, 1], self.model_config.temperature, dtype='float32')
+            [max_num_seqs, 1], self.model_config.temperature, dtype="float32"
+        )
self.share_inputs["penalty_score"] = paddle.full(
-            [max_num_seqs, 1],
-            self.model_config.penalty_score,
-            dtype='float32')
+            [max_num_seqs, 1], self.model_config.penalty_score, dtype="float32"
+        )
self.share_inputs["frequency_score"] = paddle.full(
[max_num_seqs, 1],
self.model_config.frequency_score,
-            dtype='float32')
+            dtype="float32",
+        )
self.share_inputs["presence_score"] = paddle.full(
-            [max_num_seqs, 1],
-            self.model_config.presence_score,
-            dtype='float32')
+            [max_num_seqs, 1], self.model_config.presence_score, dtype="float32"
+        )
self.share_inputs["min_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["min_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64")
self.share_inputs["max_dec_len"] = paddle.full(
-            [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
-        self.share_inputs["min_length"] = paddle.full(
-            [max_num_seqs, 1], self.model_config.min_length, dtype='int64')
+            [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64"
+        )
+        self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64")
self.share_inputs["max_length"] = paddle.full(
-            [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
-        self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs,
-                                                              0,
-                                                              dtype='int32')
-        self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1],
-                                                            0,
-                                                            dtype='int32')
-        self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1],
-                                                            0,
-                                                            dtype='int32')
-        self.share_inputs["step_seq_lens_encoder"] = paddle.full(
-            [max_num_seqs, 1], 0, dtype='int32')
-        self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1],
-                                                    0,
-                                                    dtype='int64')
+            [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64"
+        )
+        self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32")
+        self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
+        self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
+        self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
+        self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
self.share_inputs["not_need_stop"] = paddle.full(
-            [1], False,
-            dtype='bool').cpu()  # TODO(gongshaotian): move to pinnd memory
-        self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1],
-                                                      True,
-                                                      dtype='bool')
-        self.share_inputs["stop_nums"] = paddle.full([1],
-                                                     max_num_seqs,
-                                                     dtype='int64')
+            [1], False, dtype="bool"
+        ).cpu()  # TODO(gongshaotian): move to pinned memory
+        self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], True, dtype="bool")
+        self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64")
self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype='int64')
self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1],
-1,
dtype='int64')
self.share_inputs["is_block_step"] = paddle.full([max_num_seqs],
False,
dtype='bool')
self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs],
0,
dtype='int32')
self.share_inputs["step_block_list"] = paddle.full([max_num_seqs],
-1,
dtype='int32')
self.share_inputs["step_lens"] = paddle.full([1], 0, dtype='int32')
self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs],
-1,
dtype='int32')
self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype='int32')
self.share_inputs["need_block_list"] = paddle.full([max_num_seqs],
-1,
dtype='int32')
self.share_inputs["need_block_len"] = paddle.full([1],
0,
dtype='int32')
self.share_inputs["used_list_len"] = paddle.full([max_num_seqs],
0,
dtype='int32')
self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1],
0,
dtype='int64')
self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1],
-1,
dtype='int64')
self.share_inputs["ori_seq_lens_encoder"] = paddle.full(
[max_num_seqs, 1], 0, dtype='int32')
self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1],
0,
dtype='int32')
self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1],
-1,
dtype='int32')
self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype="int64")
self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool")
self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32")
self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32")
self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32")
self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32")
self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype="int32")
self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32")
self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32")
self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32")
self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
# Initialize rotary position embedding
-        tmp_position_ids = paddle.arange(
-            self.parallel_config.max_model_len).reshape((1, -1))
+        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
# TODO(gongshaotian): move to models
self.share_inputs["rope_emb"] = get_rope(
rotary_dim=self.model_config.head_dim,
position_ids=tmp_position_ids,
base=self.model_config.rope_theta,
-            model_config=self.model_config)
+            model_config=self.model_config,
+        )
# Set block tables
pre_max_block_num = (
-            self.parallel_config.max_model_len +
-            self.parallel_config.block_size - 1
+            self.parallel_config.max_model_len + self.parallel_config.block_size - 1
) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num
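The expression above is integer ceiling division plus a reserve of encoder/decoder blocks. A worked example with assumed sizes (not from any real config):

max_model_len, block_size, enc_dec_block_num = 1000, 64, 2
pre_max_block_num = (max_model_len + block_size - 1) // block_size + enc_dec_block_num
assert pre_max_block_num == 16 + 2  # ceil(1000 / 64) == 16, plus the reserve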
self.share_inputs["block_tables"] = paddle.full(
[max_num_seqs, pre_max_block_num], -1, dtype='int32')
self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32")
# Initialize free list
free_list = list(
range(
self.parallel_config.total_block_num - 1,
-                int(self.parallel_config.total_block_num *
-                    self.parallel_config.kv_cache_ratio) - 1, -1))
+                int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1,
+                -1,
+            )
+        )
self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list,
dtype="int32")
self.share_inputs["free_list_len"] = paddle.full([1],
self.free_list_len,
dtype="int32")
self.share_inputs["free_list"] = paddle.to_tensor(free_list, dtype="int32")
self.share_inputs["free_list_len"] = paddle.full([1], self.free_list_len, dtype="int32")
# Initialize stop seqs
self.share_inputs["stop_seqs_len"] = paddle.full(
[self.model_config.max_stop_seqs_num], 0, dtype="int32")
self.share_inputs["stop_seqs"] = paddle.full([
self.model_config.max_stop_seqs_num,
self.model_config.stop_seqs_max_len
],
-1,
dtype="int32")
self.share_inputs["stop_seqs_len"] = paddle.full([self.model_config.max_stop_seqs_num], 0, dtype="int32")
self.share_inputs["stop_seqs"] = paddle.full(
[
self.model_config.max_stop_seqs_num,
self.model_config.stop_seqs_max_len,
],
-1,
dtype="int32",
)
def _prepare_inputs(self) -> None:
""" prepare the model inputs """
"""prepare the model inputs"""
self.forward_meta = xpu_pre_process(
self.parallel_config.max_model_len,
self.share_inputs["input_ids"],
@@ -530,9 +484,8 @@ class XPUModelRunner(ModelRunnerBase):
)
def load_model(self) -> None:
""" load or download model """
logger.info(
f"Starting to load model {self.model_config.architectures[0]}")
"""load or download model"""
logger.info(f"Starting to load model {self.model_config.architectures[0]}")
time_before_load = time.perf_counter()
# 1. Load original model
self.model = get_model_from_loader(fd_config=self.fd_config)
@@ -542,11 +495,10 @@ class XPUModelRunner(ModelRunnerBase):
# 3. Load drafter model(for speculative decoding)
time_after_load = time.perf_counter()
-        logger.info(
-            f"Model loading took {time_after_load - time_before_load} seconds")
+        logger.info(f"Model loading took {time_after_load - time_before_load} seconds")
def get_model(self) -> nn.Layer:
""" get current model """
"""get current model"""
return self.model
def initialize_attention_backend(self):
@@ -566,21 +518,22 @@ class XPUModelRunner(ModelRunnerBase):
cache_type = self.parallel_config.dtype
-        if (self.quant_config
-                and hasattr(self.quant_config, "kv_cache_quant_type")
-                and self.quant_config.kv_cache_quant_type is not None):
-            cache_type = 'uint8'
+        if (
+            self.quant_config
+            and hasattr(self.quant_config, "kv_cache_quant_type")
+            and self.quant_config.kv_cache_quant_type is not None
+        ):
+            cache_type = "uint8"
-        kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
-            max_num_blocks=max_block_num)
+        kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num)
for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full(
cache_kvs[f"key_caches_{i}"] = paddle.full(
shape=kv_cache_shape,
fill_value=0,
dtype=cache_type,
)
cache_kvs["value_caches_{}".format(i)] = paddle.full(
cache_kvs[f"value_caches_{i}"] = paddle.full(
shape=kv_cache_shape,
fill_value=0,
dtype=cache_type,
@@ -598,17 +551,19 @@ class XPUModelRunner(ModelRunnerBase):
# TODO(gongshaotian): Get rank from config
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
-        self.model_config.kv_num_heads = int(
-            self.model_config.num_key_value_heads
-        ) // self.parallel_config.tensor_parallel_size
+        self.model_config.kv_num_heads = (
+            int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size
+        )
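A worked instance of the tensor-parallel head split above (head counts are assumptions): each rank owns num_key_value_heads // tensor_parallel_size KV heads.

num_key_value_heads, tensor_parallel_size = 8, 4
kv_num_heads = int(num_key_value_heads) // tensor_parallel_size
assert kv_num_heads == 2  # 8 KV heads split evenly across 4 ranks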
head_dim = self.model_config.head_dim
# Get the attention backend
attn_cls = get_attention_backend()
-        attn_backend = attn_cls(self.fd_config,
-                                kv_num_heads=self.model_config.kv_num_heads,
-                                num_heads=num_heads,
-                                head_dim=head_dim)
+        attn_backend = attn_cls(
+            self.fd_config,
+            kv_num_heads=self.model_config.kv_num_heads,
+            num_heads=num_heads,
+            head_dim=head_dim,
+        )
if attn_backend is None:
raise NotImplementedError(
"Attention backend which you specified is not supported, please set FD_ATTENTION_BACKEND correctly."
@@ -626,15 +581,14 @@ class XPUModelRunner(ModelRunnerBase):
"""
check whether prefill stage finished
"""
-        if int(paddle.max(self.share_inputs['seq_lens_encoder'])) != 0:
+        if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0:
return 1
else:
return 0
def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int):
""" Set dummy prefill inputs to share_inputs """
full_length = min(num_tokens // batch_size,
self.parallel_config.max_model_len - 10)
"""Set dummy prefill inputs to share_inputs"""
full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10)
input_length = int(full_length - 512)
block_num = (
input_length + self.parallel_config.block_size - 1
@@ -642,35 +596,31 @@ class XPUModelRunner(ModelRunnerBase):
for i in range(batch_size):
idx = i
self.share_inputs["input_ids"][idx:idx +
1, :input_length] = np.array(
[5] * input_length)
self.share_inputs["eos_token_id"][:] = np.array(
[2], dtype="int64").reshape(-1, 1)
self.share_inputs["seq_lens_this_time"][idx:idx + 1] = input_length
self.share_inputs["step_seq_lens_encoder"][idx:idx +
1] = input_length
self.share_inputs["seq_lens_encoder"][idx:idx + 1] = input_length
self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0
self.share_inputs["step_idx"][idx:idx + 1] = 0
self.share_inputs["max_dec_len"][idx:idx + 1] = 10
self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length
self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
self.share_inputs["step_idx"][idx : idx + 1] = 0
self.share_inputs["max_dec_len"][idx : idx + 1] = 10
self.share_inputs["stop_flags"][idx : idx + 1] = False
self.share_inputs["first_token_ids"][
idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1]
self.share_inputs["ori_seq_lens_encoder"][idx:idx +
1] = input_length
self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length
self.share_inputs["infer_seed"][idx:idx + 1] = random.randint(
0, 922337203685477580)
self.share_inputs["encoder_block_lens"][idx:idx + 1] = block_num
self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(idx * block_num, \
(idx + 1) * block_num, 1)
self.share_inputs["infer_seed"][idx : idx + 1] = random.randint(0, 922337203685477580)
self.share_inputs["encoder_block_lens"][idx : idx + 1] = block_num
self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
idx * block_num, (idx + 1) * block_num, 1
)
-    def _dummy_run(self,
-                   num_tokens: paddle.Tensor,
-                   batch_size: paddle.Tensor,
-                   in_capturing: bool = False) -> paddle.Tensor:
+    def _dummy_run(
+        self,
+        num_tokens: paddle.Tensor,
+        batch_size: paddle.Tensor,
+        in_capturing: bool = False,
+    ) -> paddle.Tensor:
"""
Use dummy inputs to run before formal execution.
Args:
@@ -681,7 +631,7 @@ class XPUModelRunner(ModelRunnerBase):
while True:
self.execute_model(None, True)
-            if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0:
+            if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
break
def execute_model(
@@ -703,12 +653,9 @@ class XPUModelRunner(ModelRunnerBase):
# 2. Padding inputs for CUDA graph
# 3. Execute model
model_output = self.model(self.share_inputs["ids_remove_padding"],
self.forward_meta)
model_output = self.model(self.share_inputs["ids_remove_padding"], self.forward_meta)
-        hiddden_states = xpu_process_output(model_output,
-                                            self.share_inputs["cum_offsets"],
-                                            self.forward_meta)
+        hiddden_states = xpu_process_output(model_output, self.share_inputs["cum_offsets"], self.forward_meta)
# 4. Compute logits, Sample
logits = self.model.compute_logits(hiddden_states)
@@ -742,15 +689,20 @@ class XPUModelRunner(ModelRunnerBase):
accept_tokens=None,
accept_num=None,
)
-        xpu_post_process(sampled_token_ids=sampler_output.sampled_token_ids,
-                         model_output=model_output_data,
-                         skip_save_output=is_dummy_run)
+        xpu_post_process(
+            sampled_token_ids=sampler_output.sampled_token_ids,
+            model_output=model_output_data,
+            skip_save_output=is_dummy_run,
+        )
# 7. Update 'infer_seed' and step_paddle()
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
-        step_paddle(self.share_inputs, self.parallel_config.block_size,
-                    self.parallel_config.enc_dec_block_num)
+        step_paddle(
+            self.share_inputs,
+            self.parallel_config.block_size,
+            self.parallel_config.enc_dec_block_num,
+        )
return None
@@ -763,9 +715,10 @@ class XPUModelRunner(ModelRunnerBase):
def profile_run(self) -> None:
"""Execute a forward pass with dummy inputs to profile the memory usage of the model."""
-        self._dummy_run(num_tokens=int(
-            self.parallel_config.max_num_batched_tokens),
-                        batch_size=min(self.parallel_config.max_num_seqs, 1))
+        self._dummy_run(
+            num_tokens=int(self.parallel_config.max_num_batched_tokens),
+            batch_size=min(self.parallel_config.max_num_seqs, 1),
+        )
def clear_block_table(self) -> None:
"""
@@ -788,9 +741,11 @@ class XPUModelRunner(ModelRunnerBase):
- cache_int4:
"""
cache_quant_dtype = None
-        if (self.quant_config
-                and hasattr(self.quant_config, "kv_cache_quant_type")
-                and self.quant_config.kv_cache_quant_type is not None):
+        if (
+            self.quant_config
+            and hasattr(self.quant_config, "kv_cache_quant_type")
+            and self.quant_config.kv_cache_quant_type is not None
+        ):
cache_quant_dtype = self.quant_config.kv_cache_quant_type
if cache_quant_dtype is not None: # int8, int8_zp, fp8, fp8_zp
@@ -800,9 +755,11 @@ class XPUModelRunner(ModelRunnerBase):
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
required_memory = (
-            byte_of_dtype * 2 *  # k + v
-            (self.parallel_config.block_size * hidden_dim) *
-            self.model_config.num_hidden_layers)
+            byte_of_dtype
+            * 2  # k + v
+            * (self.parallel_config.block_size * hidden_dim)
+            * self.model_config.num_hidden_layers
+        )
return required_memory
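A worked instance of the per-block KV-cache formula above, with illustrative sizes (bfloat16, so byte_of_dtype == 2; all other numbers are assumptions):

byte_of_dtype = 2
block_size, head_dim, kv_num_heads, num_hidden_layers = 64, 128, 8, 32
hidden_dim = head_dim * kv_num_heads
required_memory = byte_of_dtype * 2 * (block_size * hidden_dim) * num_hidden_layers  # k + v
assert required_memory == 8 * 1024 * 1024  # 8 MiB per cache block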
def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
@@ -820,15 +777,17 @@ class XPUModelRunner(ModelRunnerBase):
free_list = list(
range(
self.num_gpu_blocks - 1,
-                int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio)
-                - 1, -1))
+                int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1,
+                -1,
+            )
+        )
self.free_list_len = len(free_list)
-        self.share_inputs.update({
-            "free_list":
-            paddle.to_tensor(free_list, dtype="int32"),
-            "free_list_len":
-            paddle.full([1], self.free_list_len, dtype="int32"),
-        })
+        self.share_inputs.update(
+            {
+                "free_list": paddle.to_tensor(free_list, dtype="int32"),
+                "free_list_len": paddle.full([1], self.free_list_len, dtype="int32"),
+            }
+        )
def not_need_stop(self) -> bool:
""" """