[LLM] First commit the llm deployment code

This commit is contained in:
jiangjiajun
2025-06-09 19:20:15 +08:00
parent 980c0a1d2c
commit 684703fd72
11814 changed files with 127294 additions and 1293102 deletions

View File

@@ -0,0 +1,353 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import time
import numpy
from dataclasses import dataclass, asdict, fields
from typing import TYPE_CHECKING, Optional, Union, Any
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.utils import data_processor_logger
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.utils import data_processor_logger
@dataclass
class Request:
"""A class representing an inference request to the LLM engine.
Attributes:
request_id: Unique identifier for the request
prompt: Input prompt text or list of prompts
prompt_token_ids: Token IDs of the input prompt
prompt_token_ids_len: Length of prompt token IDs
messages: List of message dictionaries (for chat models)
history: Conversation history (for chat models)
system: System message (for chat models)
sampling_params: Parameters controlling text generation
eos_token_ids: List of end-of-sequence token IDs
arrival_time: Timestamp when request was received
preprocess_start_time: Timestamp when preprocessing started
preprocess_end_time: Timestamp when preprocessing completed
multimodal_inputs: Dictionary of multimodal inputs (images, audio etc.)
raw_request: Flag indicating if this is a raw request
"""
def __init__(
self,
request_id: str,
prompt: Optional[Union[str, list[str]]],
prompt_token_ids: Optional[list[int]],
prompt_token_ids_len: Optional[int],
messages: Optional[list[list[dict[str, Any]]]],
history: Optional[list[list[str]]],
system: Optional[Union[str, list[str]]],
sampling_params: SamplingParams,
eos_token_ids: Optional[list[int]],
arrival_time: float,
preprocess_start_time: Optional[float] = None,
preprocess_end_time: Optional[float] = None,
multimodal_inputs: Optional[dict] = None,
raw_request: bool = True
) -> None:
self.request_id = request_id
self.prompt = prompt
self.prompt_token_ids = prompt_token_ids
self.prompt_token_ids_len = prompt_token_ids_len
self.messages = messages
self.system = system
self.sampling_params = sampling_params
self.history = history
self.eos_token_ids = eos_token_ids
self.arrival_time = arrival_time
self.preprocess_start_time = preprocess_start_time
self.preprocess_end_time = preprocess_end_time
self.raw_request = raw_request
# Multi-modal related
self.multimodal_inputs = multimodal_inputs
@classmethod
def from_dict(cls, d: dict):
"""Create a Request instance from a dictionary.
Args:
d: Dictionary containing request parameters
Returns:
Request: A new Request instance initialized with values from the dictionary
"""
data_processor_logger.debug(f"{d}")
sampling_params = SamplingParams.from_dict(d)
return cls(
request_id=d["request_id"],
prompt=d.get("prompt"),
prompt_token_ids=d.get("prompt_token_ids"),
prompt_token_ids_len=d.get("prompt_token_ids_len"),
messages=d.get("messages"),
system=d.get("system"),
history=d.get("history"),
sampling_params=sampling_params,
eos_token_ids=d.get("eos_token_ids"),
arrival_time=d.get("arrival_time", time.time()),
preprocess_start_time=d.get("preprocess_start_time"),
preprocess_end_time=d.get("preprocess_end_time"),
multimodal_inputs=d.get("multimodal_inputs"),
raw_request=d.get("raw_request", True)
)
def to_dict(self) -> dict:
"""Convert the Request object into a serializable dictionary.
Returns:
dict: A dictionary containing all request attributes and sampling parameters
"""
data = {
"request_id": self.request_id,
"prompt": self.prompt,
"prompt_token_ids": self.prompt_token_ids,
"prompt_token_ids_len": self.prompt_token_ids_len,
"messages": self.messages,
"system": self.system,
"history": self.history,
"eos_token_ids": self.eos_token_ids,
"arrival_time": self.arrival_time,
"preprocess_start_time": self.preprocess_start_time,
"preprocess_end_time": self.preprocess_end_time,
"multimodal_inputs": self.multimodal_inputs,
"raw_request": self.raw_request
}
data.update(asdict(self.sampling_params))
return data
def get(self, key: str, default_value=None):
"""Get an attribute value from either the Request or its sampling parameters.
Args:
key: Attribute name to retrieve
default_value: Default value to return if attribute not found
Returns:
The attribute value if found, otherwise default_value
"""
if hasattr(self, key):
return getattr(self, key)
elif hasattr(self.sampling_params, key):
return getattr(self.sampling_params, key)
else:
return default_value
def set(self, key, value):
"""Set an attribute value on either the Request or its sampling parameters.
Args:
key: Attribute name to set
value: Value to assign to the attribute
"""
if hasattr(self.sampling_params, key):
setattr(self.sampling_params, key, value)
else:
setattr(self, key, value)
def __repr__(self) -> str:
return (f"Request(request_id={self.request_id}, "
f"prompt={self.prompt!r}, "
f"prompt_token_ids={self.prompt_token_ids}, "
f"sampling_params={self.sampling_params})")
@dataclass
class CompletionOutput:
"""The output data of one completion output of a request.
Args:
index: The index of the output in the request.
text: The generated output text.
token_ids: The token IDs of the generated output text.
"""
index: int
token_ids: list[int]
text: Optional[str] = None
reasoning_content: Optional[str] = None
@classmethod
def from_dict(cls, req_dict: dict[str, Any]) -> 'CompletionOutput':
"""Create instance from dict arguments"""
return cls(**{
field.name: req_dict[field.name] if field.name in req_dict else field.default
for field in fields(cls)
})
def __repr__(self) -> str:
return (f"CompletionOutput(index={self.index}, "
f"text={self.text!r}, "
f"token_ids={self.token_ids}, "
f"reasoning_content={self.reasoning_content!r}")
@dataclass
class RequestMetrics:
"""Metrics associated with a request.
Attributes:
arrival_time: The time when the request arrived.
inference_start_time: The time when the inference started.
first_token_time: The time when the first token was generated.
time_in_queue: The time the request spent in the queue.
model_forward_time: The time spent in the model forward pass when this
request was in the batch.
model_execute_time: The time spent in the model execute function. This
will include model forward, block/sync across
workers, cpu-gpu sync time and sampling time.
request_start_time: Time to accept the request
"""
arrival_time: float
inference_start_time: Optional[float] = None
first_token_time: Optional[float] = None
time_in_queue: Optional[float] = None
preprocess_cost_time: Optional[float] = None
model_forward_time: Optional[float] = None
model_execute_time: Optional[float] = None
request_start_time: Optional[float] = None
@classmethod
def from_dict(cls, req_dict: dict[str, Any]) -> 'RequestMetrics':
"""Create instance from dict arguments"""
return cls(**{
field.name: req_dict[field.name] if field.name in req_dict else field.default
for field in fields(cls)
})
class RequestOutput:
"""The output data of a completion request to the LLM.
Args:
request_id: The unique ID of the request.
prompt: The prompt string of the request.
For encoder/decoder models, this is the
decoder input prompt.
prompt_token_ids: The token IDs of the prompt.
For encoder/decoder models, this is the
decoder input prompt token ids.
prompt_logprobs: The log probabilities to return per prompt token.
outputs: The output sequences of the request.
finished: Whether the whole request is finished.
metrics: Metrics associated with the request.
lora_request: The LoRA request that was used to generate the output.
encoder_prompt: The encoder prompt string of the request.
None if decoder-only.
encoder_prompt_token_ids: The token IDs of the encoder prompt.
None if decoder-only.
num_cached_tokens: The number of tokens with prefix cache hit.
"""
def __init__(
self,
request_id: str,
prompt: Optional[str] = None,
prompt_token_ids: Optional[list[int]] = None,
outputs: CompletionOutput = None,
finished: bool = False,
metrics: Optional[RequestMetrics] = None,
num_cached_tokens: Optional[int] = 0,
error_code: Optional[int] = 200,
error_msg: Optional[str] = None,
) -> None:
self.request_id = request_id
self.prompt = prompt
self.prompt_token_ids = prompt_token_ids
self.outputs = outputs
self.finished = finished
self.metrics = metrics
self.num_cached_tokens = num_cached_tokens
self.error_code = error_code
self.error_msg = error_msg
def add(self, next_output: "RequestOutput") -> None:
"""Merge another RequestOutput into this one.
Args:
next_output: The RequestOutput to merge into this one
Updates:
- Combines output sequences
- Updates finish status
- Calculates timing metrics
"""
self.prompt = next_output.prompt
self.prompt_token_ids = next_output.prompt_token_ids
self.finished |= next_output.finished
self.outputs.index = next_output.outputs.index
self.outputs.token_ids.extend(next_output.outputs.token_ids)
if next_output.metrics.arrival_time is not None and self.metrics.inference_start_time is not None:
self.metrics.model_forward_time = next_output.metrics.arrival_time - \
self.metrics.inference_start_time
if next_output.metrics.arrival_time is not None and self.metrics.arrival_time is not None:
self.metrics.model_execute_time = next_output.metrics.arrival_time - \
self.metrics.arrival_time
def __repr__(self) -> str:
return (f"RequestOutput(request_id={self.request_id}, "
f"prompt={self.prompt!r}, "
f"prompt_token_ids={self.prompt_token_ids}, "
f"outputs={self.outputs}, "
f"metrics={self.metrics}, "
f"num_cached_tokens={self.num_cached_tokens})")
@classmethod
def from_dict(cls, d: dict):
"""Create a RequestOutput instance from a dictionary.
Args:
d: Dictionary containing request output parameters
Returns:
RequestOutput: A new RequestOutput instance initialized with values from the dictionary
"""
completion_output = CompletionOutput.from_dict(d.pop("outputs"))
metrics = RequestMetrics.from_dict(d.pop("metrics"))
return RequestOutput(**d, outputs=completion_output, metrics=metrics)
def to_dict(self):
"""Convert the RequestOutput object into a serializable dictionary.
Returns:
dict: A dictionary containing all request output attributes,
with token IDs converted to lists if necessary
"""
if self.prompt_token_ids is None:
self.prompt_token_ids = []
if type(self.prompt_token_ids) is numpy.ndarray:
self.prompt_token_ids = self.prompt_token_ids.tolist()
return {
"request_id": self.request_id,
"prompt": self.prompt,
"prompt_token_ids": self.prompt_token_ids,
"outputs": None if self.outputs is None else asdict(self.outputs),
"finished": self.finished,
"metrics": None if self.metrics is None else asdict(self.metrics),
"num_cached_tokens": self.num_cached_tokens,
"error_code": self.error_code,
"error_msg": self.error_msg,
}