From 70ec1e17c1a0549874ef7668d6454f605a474fde Mon Sep 17 00:00:00 2001
From: ming1753 <61511741+ming1753@users.noreply.github.com>
Date: Mon, 1 Dec 2025 11:12:17 +0800
Subject: [PATCH] [Features] add audio request & fix embedding bug (#5201)

* [Features] add audio request & fix embedding bug

* fix bug
---
 fastdeploy/input/tokenzier_client.py          | 31 +++++++++++++++++--
 .../model_executor/layers/embeddings.py       | 29 ++++++++++-------
 2 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/fastdeploy/input/tokenzier_client.py b/fastdeploy/input/tokenzier_client.py
index 409055c1d..34136a080 100644
--- a/fastdeploy/input/tokenzier_client.py
+++ b/fastdeploy/input/tokenzier_client.py
@@ -27,27 +27,43 @@ from fastdeploy.utils import data_processor_logger
 class BaseEncodeRequest(BaseModel):
     version: str
     req_id: str
-    is_gen: bool
-    resolution: int
 
 
 class ImageEncodeRequest(BaseEncodeRequest):
     image_url: Union[str, HttpUrl]
+    is_gen: bool
+    resolution: int
 
 
 class VideoEncodeRequest(BaseEncodeRequest):
     video_url: Union[str, HttpUrl]
+    is_gen: bool
+    resolution: int
     start_ts: int
     end_ts: int
     frames: int
     vit_merge: bool
 
 
+class AudioEncodeRequest(BaseEncodeRequest):
+    audio_url: Union[str, HttpUrl]
+    is_add_spk_emb: bool
+    is_pad_aug: bool
+    is_aug: bool
+    audio_start: Optional[float]
+    audio_dur: Optional[float]
+
+
 class ImageDecodeRequest(BaseModel):
     req_id: str
     data: list[Any]
 
 
+class AudioDecodeRequest(BaseModel):
+    req_id: str
+    data: list[Any]
+
+
 class AsyncTokenizerClient:
     def __init__(
         self,
@@ -74,9 +90,15 @@ class AsyncTokenizerClient:
     async def encode_video(self, request: VideoEncodeRequest):
         return await self._async_encode_request("video", request.__dict__)
 
+    async def encode_audio(self, request: AudioEncodeRequest):
+        return await self._async_encode_request("audio", request.__dict__)
+
     async def decode_image(self, request: ImageDecodeRequest):
         return await self._async_decode_request("image", request.__dict__)
 
+    async def decode_audio(self, request: AudioDecodeRequest):
+        return await self._async_decode_request("audio", request.__dict__)
+
     async def log_request(self, request):
         data_processor_logger.debug(f">>> Request: {request.method} {request.url}")
         data_processor_logger.debug(f">>> Headers: {request.headers}")
@@ -101,6 +123,8 @@ class AsyncTokenizerClient:
             url = f"{self.base_url}/image/encode"
         elif type == "video":
             url = f"{self.base_url}/video/encode"
+        elif type == "audio":
+            url = f"{self.base_url}/audio/encode"
         else:
             raise ValueError("Invalid type")
 
@@ -110,6 +134,7 @@ class AsyncTokenizerClient:
             raise RuntimeError(f"Failed to create tokenize task: {e}") from e
 
         task_info = resp.json()
+
         if task_info.get("code") != 0:
             raise RuntimeError(f"Tokenize task creation failed, {task_info.get('message')}")
 
@@ -154,6 +179,8 @@ class AsyncTokenizerClient:
         url = None
         if type == "image":
             url = f"{self.base_url}/image/decode"
+        elif type == "audio":
+            url = f"{self.base_url}/audio/decode"
         else:
             raise ValueError("Invalid type")
 
diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py
index 76ea13f30..1fea7d06f 100644
--- a/fastdeploy/model_executor/layers/embeddings.py
+++ b/fastdeploy/model_executor/layers/embeddings.py
@@ -106,6 +106,8 @@ class VocabParallelEmbedding(nn.Layer):
         params_dtype: str = "bfloat16",
         prefix="",
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        org_num_embeddings: int | None = None,
+        general=False,
     ) -> None:
         """
         Initialize the VocabParallelEmbedding layer for the model.
@@ -132,17 +134,23 @@ class VocabParallelEmbedding(nn.Layer):
         self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings
         self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings
         self.params_dtype: str = params_dtype
-        self.padding_size = padding_size
-        self.org_vocab_size = num_embeddings
+        self.general = general  # used for general Embedding
         self.num_embeddings = num_embeddings
-        num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.padding_size = padding_size
+        if self.general:
+            self.org_vocab_size = num_embeddings
+            self.num_embeddings_padded = num_embeddings
+            self.org_vocab_size_padded = num_embeddings
+        else:
+            self.org_vocab_size = org_num_embeddings or num_embeddings
+            num_added_embeddings = num_embeddings - self.org_vocab_size
 
-        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size, self.padding_size)
-        self.num_embeddings_padded = pad_vocab_size(
-            self.org_vocab_size_padded + num_added_embeddings, self.padding_size
-        )
-        assert self.org_vocab_size_padded <= self.num_embeddings_padded
+            self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size, self.padding_size)
+            self.num_embeddings_padded = pad_vocab_size(
+                self.org_vocab_size_padded + num_added_embeddings, self.padding_size
+            )
+            assert self.org_vocab_size_padded <= self.num_embeddings_padded
 
         self.shard_indices = self._get_indices(
             self.num_embeddings_padded,
             self.org_vocab_size_padded,
@@ -152,9 +160,6 @@ class VocabParallelEmbedding(nn.Layer):
             self.world_size,
         )
 
-        if num_embeddings % self.world_size != 0:
-            self.num_embeddings_padded = pad_vocab_size(num_embeddings, self.padding_size)
-
         if not self.column_cut:
             self.embeddings = fleet.meta_parallel.VocabParallelEmbedding(
                 self.num_embeddings_padded,
@@ -188,7 +193,7 @@ class VocabParallelEmbedding(nn.Layer):
         Args:
             state_dict (dict): A dictionary containing the checkpoint weights and biases.
         """
-        if self.tie_word_embeddings:
+        if self.tie_word_embeddings and not self.general:
             weight_tensor = get_tensor(state_dict[self.prefix + ".weight"]).astype(paddle.get_default_dtype())
         else:
             weight_tensor = get_tensor(state_dict.pop(self.prefix + ".weight")).astype(paddle.get_default_dtype())
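
Usage sketch for the new audio endpoints: a minimal encode/decode round trip,
assuming a running tokenizer service and that AsyncTokenizerClient accepts a
base_url argument (its constructor body falls outside the hunks above). The
URL, req_id, and field values are placeholders, not part of the patch:

    import asyncio

    from fastdeploy.input.tokenzier_client import (
        AsyncTokenizerClient,
        AudioDecodeRequest,
        AudioEncodeRequest,
    )

    async def main():
        # base_url is an assumed constructor argument; the patched code builds
        # {base_url}/audio/encode and {base_url}/audio/decode from self.base_url.
        client = AsyncTokenizerClient(base_url="http://localhost:8080")

        enc = AudioEncodeRequest(
            version="v1",
            req_id="req-0001",
            audio_url="https://example.com/sample.wav",
            is_add_spk_emb=False,
            is_pad_aug=False,
            is_aug=False,
            audio_start=0.0,  # Optional[float]; None is also accepted
            audio_dur=None,   # presumably "encode to end of clip" (assumption)
        )
        tokens = await client.encode_audio(enc)

        # Feed the encoder output back through /audio/decode, assuming the
        # encode result is a list (matching the data: list[Any] field).
        dec = AudioDecodeRequest(req_id="req-0001", data=tokens)
        return await client.decode_audio(dec)

    asyncio.run(main())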
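
On the embeddings side, the fix routes the padding arithmetic through the new
general flag: with general=True all three sizes (org_vocab_size,
num_embeddings_padded, org_vocab_size_padded) are pinned to num_embeddings, so
a general (non-vocab) embedding table is never padded, and load_state_dict
pops its weight even when tie_word_embeddings is set. A worked example of the
default path's arithmetic, assuming pad_vocab_size rounds up to the nearest
multiple of padding_size (the conventional behavior; the helper itself is not
shown in the hunks):

    def pad_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
        # Assumed round-up-to-multiple semantics for the helper in the patch.
        return ((vocab_size + pad_to - 1) // pad_to) * pad_to

    # general=False with org_num_embeddings given: e.g. a 151_851-token base
    # vocab grown by 5 added tokens, padded so TP shards divide evenly.
    org_vocab_size_padded = pad_vocab_size(151_851)       # 151_872
    num_embeddings_padded = pad_vocab_size(151_872 + 5)   # 151_936
    assert org_vocab_size_padded <= num_embeddings_padded

    # general=True: no padding at all; every *_padded size == num_embeddings.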