Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-07 09:31:35 +08:00
[Model] Add stable diffusion model based on fastdeploy (#297)
* Add stable diffusion model based on fastdeploy
* Add sd infer
* pipelines -> multimodal
* Add create_ort_runtime
* Use fp16 input
* Fix PIL
* Add optimized unet model
* Add HF license
* Add workspace args
* Add profile func
* Add schedulers
* Replace torch.Tensor with np.ndarray
* Add readme
* Add TRT shape setting
* Add dynamic shape
* Add dynamic shape for stable diffusion
* Fix max shape setting
* Rename TensorRT file suffix
* Update dynamic shape setting
* Add scheduler output
* Add inference_steps and benchmark steps
* Add diffusers benchmark
* Add paddle infer script
* Rename 1
* Rename infer.py to torch_onnx_infer.py
* Add export of torch model to ONNX
* Remove export model
* Add paddle export model for diffusion
* Fix export model
* Move torch ONNX infer to infer
* Fix export model
* Fix infer
* Modify create_trt_runtime and create_ort_runtime
* Update export torch
* Update requirements
* Add paddle inference backend
* Fix unet pp run
* Remove print
* Add paddle model export and infer
* Add device id
* Move profile to utils
* Add -1 device id
* Add safety checker args
* Remove safety checker temporarily
* Add export model description
* Add predict description
* Fix readme
* Fix device_id description
* Add timestep shape
* Add use fp16 precision
* Move use gpu
* Add EulerAncestralDiscreteScheduler
* Use EulerAncestralDiscreteScheduler with v1-5 model
* Add export model readme
* Add link of exported model
* Update scheduler on README
* Add stable-diffusion-v1-5
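The list above mentions create_ort_runtime, TRT shape settings, and dynamic shapes. Below is a minimal sketch of how such a TensorRT-backed FastDeploy runtime might be built with the RuntimeOption API; the model path, input names, and shape ranges are illustrative assumptions, not the exact values used by this commit's export and infer scripts.

import fastdeploy as fd

def create_trt_runtime(model_path, device_id=0, use_fp16=True):
    # Illustrative helper; the commit's own scripts define their real
    # create_*_runtime functions.
    option = fd.RuntimeOption()
    option.set_model_path(model_path, model_format=fd.ModelFormat.ONNX)
    option.use_gpu(device_id)
    option.use_trt_backend()
    if use_fp16:
        option.enable_trt_fp16()
    # Dynamic shape ranges for the UNet inputs (names and ranges assumed).
    option.set_trt_input_shape("sample",
                               min_shape=[1, 4, 64, 64],
                               opt_shape=[2, 4, 64, 64],
                               max_shape=[2, 4, 64, 64])
    option.set_trt_input_shape("timestep",
                               min_shape=[1], opt_shape=[1], max_shape=[1])
    option.set_trt_input_shape("encoder_hidden_states",
                               min_shape=[1, 77, 768],
                               opt_shape=[2, 77, 768],
                               max_shape=[2, 77, 768])
    return fd.Runtime(option)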
@@ -0,0 +1,236 @@
# Copyright 2022 The HuggingFace Inc. team.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import logging
from typing import Callable, List, Optional, Union

import numpy as np

from paddlenlp.transformers import CLIPTokenizer

import fastdeploy as fd
from scheduling_utils import PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, EulerAncestralDiscreteScheduler
import PIL
from PIL import Image

logger = logging.getLogger(__name__)


class StableDiffusionFastDeployPipeline(object):
    vae_decoder_runtime: fd.Runtime
    text_encoder_runtime: fd.Runtime
    tokenizer: CLIPTokenizer
    unet_runtime: fd.Runtime
    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler,
                     EulerAncestralDiscreteScheduler]

    def __init__(self,
                 vae_decoder_runtime: fd.Runtime,
                 text_encoder_runtime: fd.Runtime,
                 tokenizer: CLIPTokenizer,
                 unet_runtime: fd.Runtime,
                 scheduler: Union[DDIMScheduler, PNDMScheduler,
                                  LMSDiscreteScheduler,
                                  EulerAncestralDiscreteScheduler]):
        self.vae_decoder_runtime = vae_decoder_runtime
        self.text_encoder_runtime = text_encoder_runtime
        self.unet_runtime = unet_runtime
        self.scheduler = scheduler
        self.tokenizer = tokenizer

    def __call__(
            self,
            prompt: Union[str, List[str]],
            height: Optional[int]=512,
            width: Optional[int]=512,
            num_inference_steps: Optional[int]=50,
            guidance_scale: Optional[float]=7.5,
            negative_prompt: Optional[Union[str, List[str]]]=None,
            num_images_per_prompt: Optional[int]=1,
            eta: Optional[float]=0.0,
            generator: Optional[np.random.RandomState]=None,
            latents: Optional[np.ndarray]=None,
            output_type: Optional[str]="pil",
            return_dict: bool=True,
            callback: Optional[Callable[[int, int, np.ndarray], None]]=None,
            callback_steps: Optional[int]=1,
            **kwargs, ):
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            raise ValueError(
                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
            )

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(
                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
            )

        if (callback_steps is None) or (callback_steps is not None and (
                not isinstance(callback_steps, int) or callback_steps <= 0)):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}.")

        if generator is None:
            generator = np.random

        # get prompt text embeddings
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="np", )
        text_input_ids = text_inputs.input_ids

        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
            removed_text = self.tokenizer.batch_decode(
                text_input_ids[:, self.tokenizer.model_max_length:])
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}")
            text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length]

        input_name = self.text_encoder_runtime.get_input_info(0).name
        text_embeddings = self.text_encoder_runtime.infer({
            input_name: text_input_ids.astype(np.int64)
        })[0]
        text_embeddings = np.repeat(
            text_embeddings, num_images_per_prompt, axis=0)

        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}.")
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`.")
            else:
                uncond_tokens = negative_prompt

            max_length = text_input_ids.shape[-1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np")
            uncond_embeddings = self.text_encoder_runtime.infer({
                input_name: uncond_input.input_ids.astype(np.int64)
            })[0]
            uncond_embeddings = np.repeat(
                uncond_embeddings, num_images_per_prompt, axis=0)
            # For classifier-free guidance we need two forward passes.
            # Concatenate the unconditional and text embeddings into a single
            # batch so the UNet runs only once per step.
            text_embeddings = np.concatenate(
                [uncond_embeddings, text_embeddings])

        # get the initial random noise unless the user supplied it
        latents_dtype = text_embeddings.dtype
        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8,
                         width // 8)
        if latents is None:
            latents = generator.randn(*latents_shape).astype(latents_dtype)
        elif latents.shape != latents_shape:
            raise ValueError(
                f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}"
            )

        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)

        latents = latents * self.scheduler.init_noise_sigma

        # Some schedulers (e.g. DDIM) accept an `eta` argument; pass it only
        # when the scheduler's `step` signature supports it.
        accepts_eta = "eta" in set(
            inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        for i, t in enumerate(self.scheduler.timesteps):
            # expand the latents if we are doing classifier-free guidance
            latent_model_input = np.concatenate(
                [latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(
                latent_model_input, t)

            # predict the noise residual
            sample_name = self.unet_runtime.get_input_info(0).name
            timestep_name = self.unet_runtime.get_input_info(1).name
            encoder_hidden_states_name = self.unet_runtime.get_input_info(
                2).name
            # The exported UNet expects fp16 inputs by default; fall back to
            # fp32 (with an int64 timestep) when the runtime reports fp32 inputs.
            input_type = [np.float16, np.float16, np.float16]
            if self.unet_runtime.get_input_info(0).dtype == fd.FDDataType.FP32:
                input_type = [np.float32, np.int64, np.float32]
            noise_pred = self.unet_runtime.infer({
                sample_name: latent_model_input.astype(input_type[0]),
                timestep_name: np.array([t], dtype=input_type[1]),
                encoder_hidden_states_name:
                text_embeddings.astype(input_type[2]),
            })[0]

            # perform classifier-free guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents,
                                          **extra_step_kwargs).prev_sample
            latents = np.array(latents)

            # call the callback, if provided
            if callback is not None and i % callback_steps == 0:
                callback(i, t, latents)

        # scale and decode the image latents with the VAE decoder
        latents = 1 / 0.18215 * latents
        sample_name = self.vae_decoder_runtime.get_input_info(0).name
        input_dtype = np.float16
        if self.vae_decoder_runtime.get_input_info(
                0).dtype == fd.FDDataType.FP32:
            input_dtype = np.float32
        image = self.vae_decoder_runtime.infer({
            sample_name: latents.astype(input_dtype)
        })[0]

        image = np.clip(image / 2 + 0.5, 0, 1)
        image = image.transpose((0, 2, 3, 1))
        if output_type == "pil":
            image = self.numpy_to_pil(image)
        return image

    @staticmethod
    def numpy_to_pil(images):
        """
        Convert a numpy image or a batch of images to a PIL image.
        """
        if images.ndim == 3:
            images = images[None, ...]
        images = (images * 255).round().astype("uint8")
        pil_images = [Image.fromarray(image) for image in images]

        return pil_images
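
As a usage illustration, a minimal sketch of driving the pipeline end to end is shown below. The model file names, the tokenizer name, and the PNDMScheduler constructor arguments are assumptions (the scheduler values mirror the common Stable Diffusion v1-x settings); the commit's infer.py is the authoritative entry point.

import numpy as np
import fastdeploy as fd
from paddlenlp.transformers import CLIPTokenizer
from scheduling_utils import PNDMScheduler

def create_ort_runtime(model_path, device_id=0):
    # Hypothetical helper mirroring the create_ort_runtime mentioned in the
    # commit message; uses the ONNX Runtime backend.
    option = fd.RuntimeOption()
    option.set_model_path(model_path, model_format=fd.ModelFormat.ONNX)
    option.use_ort_backend()
    if device_id >= 0:
        option.use_gpu(device_id)
    else:
        option.use_cpu()
    return fd.Runtime(option)

# Tokenizer name and scheduler settings are assumed, not taken from this commit.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
scheduler = PNDMScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
    num_train_timesteps=1000, skip_prk_steps=True)

pipe = StableDiffusionFastDeployPipeline(
    vae_decoder_runtime=create_ort_runtime("vae_decoder.onnx"),
    text_encoder_runtime=create_ort_runtime("text_encoder.onnx"),
    tokenizer=tokenizer,
    unet_runtime=create_ort_runtime("unet.onnx"),
    scheduler=scheduler)

images = pipe("a photo of an astronaut riding a horse on mars",
              num_inference_steps=50,
              generator=np.random.RandomState(0))
images[0].save("output.png")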