Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-07 09:31:35 +08:00
[Model] Add stable diffusion model based on fastdeploy (#297)
* Add stable diffusion model based on fastdeploy
* Add sd infer
* pipelines -> multimodal
* Add create_ort_runtime
* Use fp16 input
* Fix PIL
* Add optimized unet model
* Add HF license
* Add workspace args
* Add profile func
* Add schedulers
* Replace torch.Tensor with np.ndarray
* Add readme
* Add TRT shape setting
* Add dynamic shape
* Add dynamic shape for stable diffusion
* Fix max shape setting
* Rename TensorRT file suffix
* Update dynamic shape setting
* Add scheduler output
* Add inference_steps and benchmark steps
* Add diffusers benchmark
* Add paddle infer script
* Rename 1
* Rename infer.py to torch_onnx_infer.py
* Add export of torch model to ONNX
* Remove export model
* Add paddle export model for diffusion
* Fix export model
* Move torch ONNX infer to infer
* Fix export model
* Fix infer
* Modify create_trt_runtime and create_ort_runtime
* Update export torch
* Update requirements
* Add paddle inference backend
* Fix unet pp run
* Remove print
* Add paddle model export and infer
* Add device id
* Move profile to utils
* Add -1 device id
* Add safety checker args
* Remove safety checker temporarily
* Add export model description
* Add predict description
* Fix readme
* Fix device_id description
* Add timestep shape
* Add use fp16 precision
* Move use gpu
* Add EulerAncestralDiscreteScheduler
* Use EulerAncestralDiscreteScheduler with v1-5 model
* Add export model readme
* Add link of exported model
* Update scheduler on README
* Add stable-diffusion-v1-5
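The list above mentions create_ort_runtime, TRT shape settings, and dynamic shapes. Below is a minimal sketch of how such a TensorRT-backed FastDeploy runtime might be built with the RuntimeOption API; the model path, input names, and shape ranges are illustrative assumptions, not the exact values used by this commit's export and infer scripts.

import fastdeploy as fd

def create_trt_runtime(model_path, device_id=0, use_fp16=True):
    # Illustrative helper; the commit's own scripts define their real
    # create_*_runtime functions.
    option = fd.RuntimeOption()
    option.set_model_path(model_path, model_format=fd.ModelFormat.ONNX)
    option.use_gpu(device_id)
    option.use_trt_backend()
    if use_fp16:
        option.enable_trt_fp16()
    # Dynamic shape ranges for the UNet inputs (names and ranges assumed).
    option.set_trt_input_shape("sample",
                               min_shape=[1, 4, 64, 64],
                               opt_shape=[2, 4, 64, 64],
                               max_shape=[2, 4, 64, 64])
    option.set_trt_input_shape("timestep",
                               min_shape=[1], opt_shape=[1], max_shape=[1])
    option.set_trt_input_shape("encoder_hidden_states",
                               min_shape=[1, 77, 768],
                               opt_shape=[2, 77, 768],
                               max_shape=[2, 77, 768])
    return fd.Runtime(option)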
@@ -0,0 +1,236 @@
# Copyright 2022 The HuggingFace Inc. team.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import logging
from typing import Callable, List, Optional, Union

import numpy as np

from paddlenlp.transformers import CLIPTokenizer

import fastdeploy as fd
from scheduling_utils import PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, EulerAncestralDiscreteScheduler
import PIL
from PIL import Image

logger = logging.getLogger(__name__)


class StableDiffusionFastDeployPipeline(object):
    vae_decoder_runtime: fd.Runtime
    text_encoder_runtime: fd.Runtime
    tokenizer: CLIPTokenizer
    unet_runtime: fd.Runtime
    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler,
                     EulerAncestralDiscreteScheduler]

    def __init__(self,
                 vae_decoder_runtime: fd.Runtime,
                 text_encoder_runtime: fd.Runtime,
                 tokenizer: CLIPTokenizer,
                 unet_runtime: fd.Runtime,
                 scheduler: Union[DDIMScheduler, PNDMScheduler,
                                  LMSDiscreteScheduler,
                                  EulerAncestralDiscreteScheduler]):
        self.vae_decoder_runtime = vae_decoder_runtime
        self.text_encoder_runtime = text_encoder_runtime
        self.unet_runtime = unet_runtime
        self.scheduler = scheduler
        self.tokenizer = tokenizer

    def __call__(
            self,
            prompt: Union[str, List[str]],
            height: Optional[int]=512,
            width: Optional[int]=512,
            num_inference_steps: Optional[int]=50,
            guidance_scale: Optional[float]=7.5,
            negative_prompt: Optional[Union[str, List[str]]]=None,
            num_images_per_prompt: Optional[int]=1,
            eta: Optional[float]=0.0,
            generator: Optional[np.random.RandomState]=None,
            latents: Optional[np.ndarray]=None,
            output_type: Optional[str]="pil",
            return_dict: bool=True,
            callback: Optional[Callable[[int, int, np.ndarray], None]]=None,
            callback_steps: Optional[int]=1,
            **kwargs, ):
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            raise ValueError(
                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
            )

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(
                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
            )

        if (callback_steps is None) or (callback_steps is not None and (
                not isinstance(callback_steps, int) or callback_steps <= 0)):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}.")

        if generator is None:
            generator = np.random

        # get prompt text embeddings
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="np", )
        text_input_ids = text_inputs.input_ids

        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
            removed_text = self.tokenizer.batch_decode(
                text_input_ids[:, self.tokenizer.model_max_length:])
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}")
            text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]

        input_name = self.text_encoder_runtime.get_input_info(0).name
        text_embeddings = self.text_encoder_runtime.infer({
            input_name: text_input_ids.astype(np.int64)
        })[0]
        text_embeddings = np.repeat(
            text_embeddings, num_images_per_prompt, axis=0)

        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}.")
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`.")
            else:
                uncond_tokens = negative_prompt

            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np")
            uncond_embeddings = self.text_encoder_runtime.infer({
                input_name: uncond_input.input_ids.astype(np.int64)
            })[0]
            uncond_embeddings = np.repeat(
                uncond_embeddings, num_images_per_prompt, axis=0)
            # For classifier-free guidance we need two forward passes.
            # Concatenate the unconditional and text embeddings into a single
            # batch so the UNet runs only once per step.
            text_embeddings = np.concatenate(
                [uncond_embeddings, text_embeddings])

        # get the initial random noise unless the user supplied it
        latents_dtype = text_embeddings.dtype
        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8,
                         width // 8)
        if latents is None:
            latents = generator.randn(*latents_shape).astype(latents_dtype)
        elif latents.shape != latents_shape:
            raise ValueError(
                f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
            )

        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)

        latents = latents * self.scheduler.init_noise_sigma

        # Some schedulers (e.g. DDIM) accept an `eta` argument; pass it only
        # when the scheduler's `step` signature supports it.
        accepts_eta = "eta" in set(
            inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        for i, t in enumerate(self.scheduler.timesteps):
            # expand the latents if we are doing classifier-free guidance
            latent_model_input = np.concatenate(
                [latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t)

            # predict the noise residual
            sample_name = self.unet_runtime.get_input_info(0).name
            timestep_name = self.unet_runtime.get_input_info(1).name
            encoder_hidden_states_name = self.unet_runtime.get_input_info(
                2).name
            # The exported UNet expects fp16 inputs by default; fall back to
            # fp32 (with an int64 timestep) when the runtime reports fp32 inputs.
            input_type = [np.float16, np.float16, np.float16]
            if self.unet_runtime.get_input_info(0).dtype == fd.FDDataType.FP32:
                input_type = [np.float32, np.int64, np.float32]
            noise_pred = self.unet_runtime.infer({
                sample_name: latent_model_input.astype(input_type[0]),
                timestep_name: np.array([t], dtype=input_type[1]),
                encoder_hidden_states_name:
                text_embeddings.astype(input_type[2]),
            })[0]

            # perform classifier-free guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents,
                                          **extra_step_kwargs).prev_sample
            latents = np.array(latents)

            # call the callback, if provided
            if callback is not None and i % callback_steps == 0:
                callback(i, t, latents)

        # scale and decode the image latents with the VAE decoder
        latents = 1 / 0.18215 * latents
        sample_name = self.vae_decoder_runtime.get_input_info(0).name
        input_dtype = np.float16
        if self.vae_decoder_runtime.get_input_info(
                0).dtype == fd.FDDataType.FP32:
            input_dtype = np.float32
        image = self.vae_decoder_runtime.infer({
            sample_name: latents.astype(input_dtype)
        })[0]

        image = np.clip(image / 2 + 0.5, 0, 1)
        image = image.transpose((0, 2, 3, 1))
        if output_type == "pil":
            image = self.numpy_to_pil(image)
        return image

    @staticmethod
    def numpy_to_pil(images):
        """
        Convert a numpy image or a batch of images to a PIL image.
        """
        if images.ndim == 3:
            images = images[None, ...]
        images = (images * 255).round().astype("uint8")
        pil_images = [Image.fromarray(image) for image in images]

        return pil_images
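
As a usage illustration, a minimal sketch of driving the pipeline end to end is shown below. The model file names, the tokenizer name, and the PNDMScheduler constructor arguments are assumptions (the scheduler values mirror the common Stable Diffusion v1-x settings); the commit's infer.py is the authoritative entry point.

import numpy as np
import fastdeploy as fd
from paddlenlp.transformers import CLIPTokenizer
from scheduling_utils import PNDMScheduler

def create_ort_runtime(model_path, device_id=0):
    # Hypothetical helper mirroring the create_ort_runtime mentioned in the
    # commit message; uses the ONNX Runtime backend.
    option = fd.RuntimeOption()
    option.set_model_path(model_path, model_format=fd.ModelFormat.ONNX)
    option.use_ort_backend()
    if device_id >= 0:
        option.use_gpu(device_id)
    else:
        option.use_cpu()
    return fd.Runtime(option)

# Tokenizer name and scheduler settings are assumed, not taken from this commit.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
scheduler = PNDMScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
    num_train_timesteps=1000, skip_prk_steps=True)

pipe = StableDiffusionFastDeployPipeline(
    vae_decoder_runtime=create_ort_runtime("vae_decoder.onnx"),
    text_encoder_runtime=create_ort_runtime("text_encoder.onnx"),
    tokenizer=tokenizer,
    unet_runtime=create_ort_runtime("unet.onnx"),
    scheduler=scheduler)

images = pipe("a photo of an astronaut riding a horse on mars",
              num_inference_steps=50,
              generator=np.random.RandomState(0))
images[0].save("output.png")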