Spaces: Running on Zero
Upload 13 files
- .gitattributes +2 -0
- app.py +63 -0
- assets/1.png +0 -0
- assets/R-F.jpg +3 -0
- assets/teaser.png +3 -0
- inference.py +70 -0
- merge.py +13 -0
- requirements.txt +28 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/__pycache__/pipeline_pe_clone.cpython-310.pyc +0 -0
- src/jsonl_datasets.py +128 -0
- src/pipeline_pe_clone.py +734 -0
- src/prompt_helper.py +205 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/R-F.jpg filter=lfs diff=lfs merge=lfs -text
+assets/teaser.png filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,63 @@
import gradio as gr
import torch
from PIL import Image
from src.pipeline_pe_clone import FluxPipeline
import spaces

@spaces.GPU()
def generate_image(model_path, image, height, width, prompt, guidance_scale, num_steps, lora_name):
    # Load the model
    pipeline = FluxPipeline.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
    ).to('cuda')

    # Load and fuse base LoRA weights
    pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors")
    pipeline.fuse_lora()
    pipeline.unload_lora_weights()

    # Load selected LoRA effect if not using the pretrained base model
    if lora_name != 'pretrained':
        pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name=f"{lora_name}.safetensors")

    # Prepare the input image
    condition_image = image.resize((height, width)).convert("RGB")

    # Generate the output image
    result = pipeline(
        prompt=prompt,
        condition_image=condition_image,
        height=height,
        width=width,
        guidance_scale=guidance_scale,
        num_inference_steps=num_steps,
        max_sequence_length=512
    ).images[0]

    return result

# Create Gradio interface
iface = gr.Interface(
    fn=generate_image,
    inputs=[
        gr.Textbox(label="Model Path", value="black-forest-labs/FLUX.1-dev"),
        gr.Image(label="Input Image", type="pil"),
        gr.Number(label="Height", value=768),
        gr.Number(label="Width", value=512),
        gr.Textbox(label="Prompt", value="add a halo and wings for the cat by sksmagiceffects"),
        gr.Number(label="Guidance Scale", value=3.5),
        gr.Number(label="Number of Steps", value=20),
        gr.Dropdown(
            label="LoRA Name",
            choices=["pretrained", "sksmagiceffects", "sksmonstercalledlulu",
                     "skspaintingeffects", "sksedgeeffect", "skscatooneffect"],
            value="sksmagiceffects"
        )
    ],
    outputs=gr.Image(label="Output Image", type="pil"),
    title="FLUX Image Generation with LoRA"
)

if __name__ == "__main__":
    iface.launch()
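A minimal sketch of calling the same entry point outside the Gradio UI, assuming a CUDA GPU, access to the FLUX.1-dev checkpoint, and that the `spaces.GPU` decorator is a no-op outside a ZeroGPU Space; importing `app` builds the interface but does not launch it:

    from PIL import Image
    import app  # the module defined above

    # hypothetical local invocation using the sample image shipped in assets/
    img = Image.open("assets/1.png")
    out = app.generate_image(
        "black-forest-labs/FLUX.1-dev", img, 768, 512,
        "add a halo and wings for the cat by sksmagiceffects",
        3.5, 20, "sksmagiceffects",
    )
    out.save("output.png")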
assets/1.png
ADDED
Binary image file.

assets/R-F.jpg
ADDED
Binary image file, stored with Git LFS.

assets/teaser.png
ADDED
Binary image file, stored with Git LFS.
inference.py
ADDED
@@ -0,0 +1,70 @@
import argparse
from src.pipeline_pe_clone import FluxPipeline
import torch
from PIL import Image

def parse_args():
    parser = argparse.ArgumentParser(description='FLUX image generation with LoRA')
    parser.add_argument('--model_path', type=str,
                        default="black-forest-labs/FLUX.1-dev",
                        help='Path to pretrained model')
    parser.add_argument('--image_path', type=str,
                        default="assets/1.png",
                        help='Input image path')
    parser.add_argument('--output_path', type=str,
                        default="output.png",
                        help='Output image path')
    parser.add_argument('--height', type=int, default=768)
    parser.add_argument('--width', type=int, default=512)
    parser.add_argument('--prompt', type=str,
                        default="add a halo and wings for the cat by sksmagiceffects",
                        help="""Different LoRA effects and their example prompts:
                        - sksmagiceffects: "add a halo and wings for the cat by sksmagiceffects"
                        - sksmonstercalledlulu: "add a red sksmonstercalledlulu hugging the cat"
                        - skspaintingeffects: "add a yellow flower on the cat's head and psychedelic colors and dynamic flows by skspaintingeffects"
                        - sksedgeeffect: "add yellow flames to the cat by sksedgeeffect"
                        - skscatooneffect: "add two hands holding the cat in skscatooneffect"
                        """)
    parser.add_argument('--guidance_scale', type=float, default=3.5)
    parser.add_argument('--num_steps', type=int, default=20,
                        help='Number of inference steps')
    parser.add_argument('--lora_name', type=str,
                        choices=['pretrained', 'sksmagiceffects', 'sksmonstercalledlulu',
                                 'skspaintingeffects', 'sksedgeeffect', 'skscatooneffect'],
                        default="sksmagiceffects",
                        help='Name of LoRA weights to use. Use "pretrained" for base model only')
    return parser.parse_args()

def main():
    args = parse_args()

    pipeline = FluxPipeline.from_pretrained(
        args.model_path,
        torch_dtype=torch.bfloat16,
    ).to('cuda')

    # Load and fuse base LoRA weights
    pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors")
    pipeline.fuse_lora()
    pipeline.unload_lora_weights()

    # Load selected LoRA effect only if not using pretrained
    if args.lora_name != 'pretrained':
        pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name=f"{args.lora_name}.safetensors")

    condition_image = Image.open(args.image_path).resize((args.height, args.width)).convert("RGB")

    result = pipeline(
        prompt=args.prompt,
        condition_image=condition_image,
        height=args.height,
        width=args.width,
        guidance_scale=args.guidance_scale,
        num_inference_steps=args.num_steps,
        max_sequence_length=512
    ).images[0]

    result.save(args.output_path)

if __name__ == "__main__":
    main()
merge.py
ADDED
@@ -0,0 +1,13 @@
from src.pipeline_pe_clone import FluxPipeline
import torch
from PIL import Image
pretrained_model_name_or_path = "black-forest-labs/FLUX.1-dev"
pipeline = FluxPipeline.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=torch.bfloat16,
)
pipeline.load_lora_weights("outputs/doodle_pretrain_4508000/pytorch_lora_weights.safetensors")
pipeline.fuse_lora()
pipeline.unload_lora_weights()
pipeline.save_pretrained("edit_pretrain")
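A minimal follow-up sketch, assuming merge.py has already been run so that the fused pipeline exists in the local "edit_pretrain" directory; it reuses the merged checkpoint as the base model so the pretrain LoRA does not need to be re-fused on every run:

    from src.pipeline_pe_clone import FluxPipeline
    import torch

    # hypothetical: load the merged base pipeline, then stack an effect LoRA on top
    pipeline = FluxPipeline.from_pretrained("edit_pretrain", torch_dtype=torch.bfloat16).to("cuda")
    pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="sksmagiceffects.safetensors")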
requirements.txt
ADDED
@@ -0,0 +1,28 @@
accelerate==0.33.0
transformers==4.44.0
diffusers[torch]==0.25.0
ftfy==6.1.1
# albumentations==1.3.0
opencv-python==4.8.1.78
einops==0.7.0
pytorch-lightning==1.9.0
bitsandbytes==0.44.0
prodigyopt==1.0
lion-pytorch==0.0.6
came_pytorch==0.1.3
schedulefree==1.4
tensorboard
safetensors==0.4.4
# for gradio
gradio==3.6
altair==4.2.2
easygui==0.98.3
toml==0.10.2
voluptuous==0.13.1
huggingface-hub==0.24.5
# for Image utils
imagesize==1.4.1
numpy<=2.0
rich==13.7.0
# for T5XXL tokenizer (SD3/FLUX)
sentencepiece==0.2.0
src/__init__.py
ADDED
Empty file.

src/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (137 Bytes).

src/__pycache__/pipeline_pe_clone.cpython-310.pyc
ADDED
Binary file (21.8 kB).
src/jsonl_datasets.py
ADDED
@@ -0,0 +1,128 @@
import torch
from PIL import Image
from datasets import load_dataset
from torchvision import transforms
import random
import os
import numpy as np

Image.MAX_IMAGE_PIXELS = None

def make_train_dataset(args, tokenizer, accelerator=None):
    if args.train_data_dir is not None:
        print("load_data")
        dataset = load_dataset('json', data_files=args.train_data_dir)

    column_names = dataset["train"].column_names

    # 6. Get the column names for input/target.
    if args.caption_column is None:
        caption_column = column_names[0]
        print(f"caption column defaulting to {caption_column}")
    else:
        caption_column = args.caption_column
        if caption_column not in column_names:
            raise ValueError(
                f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
            )
    if args.source_column is None:
        source_column = column_names[1]
        print(f"source column defaulting to {source_column}")
    else:
        source_column = args.source_column
        if source_column not in column_names:
            raise ValueError(
                f"`--source_column` value '{args.source_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
            )
    if args.target_column is None:
        target_column = column_names[1]
        print(f"target column defaulting to {target_column}")
    else:
        target_column = args.target_column
        if target_column not in column_names:
            raise ValueError(
                f"`--target_column` value '{args.target_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}"
            )

    h = args.height
    w = args.width
    train_transforms = transforms.Compose(
        [
            transforms.Resize((h, w), interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    tokenizer_clip = tokenizer[0]
    tokenizer_t5 = tokenizer[1]

    def tokenize_prompt_clip_t5(examples):
        captions = []
        for caption in examples[caption_column]:
            if isinstance(caption, str):
                captions.append(caption)
            elif isinstance(caption, list):
                captions.append(random.choice(caption))
            else:
                raise ValueError(
                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
                )
        text_inputs = tokenizer_clip(
            captions,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        text_input_ids_1 = text_inputs.input_ids

        text_inputs = tokenizer_t5(
            captions,
            padding="max_length",
            max_length=512,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        text_input_ids_2 = text_inputs.input_ids
        return text_input_ids_1, text_input_ids_2

    def preprocess_train(examples):
        _examples = {}

        source_images = [Image.open(image).convert("RGB") for image in examples[source_column]]
        target_images = [Image.open(image).convert("RGB") for image in examples[target_column]]

        _examples["cond_pixel_values"] = [train_transforms(source) for source in source_images]
        _examples["pixel_values"] = [train_transforms(image) for image in target_images]
        _examples["token_ids_clip"], _examples["token_ids_t5"] = tokenize_prompt_clip_t5(examples)

        return _examples

    if accelerator is not None:
        with accelerator.main_process_first():
            train_dataset = dataset["train"].with_transform(preprocess_train)
    else:
        train_dataset = dataset["train"].with_transform(preprocess_train)

    return train_dataset


def collate_fn(examples):
    cond_pixel_values = torch.stack([example["cond_pixel_values"] for example in examples])
    cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
    target_pixel_values = torch.stack([example["pixel_values"] for example in examples])
    target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
    token_ids_clip = torch.stack([torch.tensor(example["token_ids_clip"]) for example in examples])
    token_ids_t5 = torch.stack([torch.tensor(example["token_ids_t5"]) for example in examples])

    return {
        "cond_pixel_values": cond_pixel_values,
        "pixel_values": target_pixel_values,
        "text_ids_1": token_ids_clip,
        "text_ids_2": token_ids_t5,
    }
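A minimal wiring sketch for this dataset module, following the usual diffusers with_transform + DataLoader pattern; the JSONL path, its column names, and access to the FLUX.1-dev tokenizers are assumptions:

    from types import SimpleNamespace
    from torch.utils.data import DataLoader
    from transformers import CLIPTokenizer, T5TokenizerFast
    from src.jsonl_datasets import make_train_dataset, collate_fn

    # hypothetical training arguments; "data/train.jsonl" is a placeholder path
    args = SimpleNamespace(
        train_data_dir="data/train.jsonl",
        caption_column="caption", source_column="source", target_column="target",
        height=768, width=512,
    )
    tokenizer_clip = CLIPTokenizer.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="tokenizer")
    tokenizer_t5 = T5TokenizerFast.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="tokenizer_2")

    train_dataset = make_train_dataset(args, (tokenizer_clip, tokenizer_t5))
    loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)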
src/pipeline_pe_clone.py
ADDED
@@ -0,0 +1,734 @@
import inspect
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast

from diffusers.image_processor import (VaeImageProcessor)
from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin
from diffusers.models.autoencoders import AutoencoderKL
from diffusers.models.transformers import FluxTransformer2DModel
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
    USE_PEFT_BACKEND,
    is_torch_xla_available,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.16,
):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu

def prepare_latent_image_ids_2(height, width, device, dtype):
    latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype)
    latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None]  # y coordinate
    latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :]  # x coordinate
    return latent_image_ids

def position_encoding_clone(batch_size, original_height, original_width, device, dtype):
    latent_image_ids = prepare_latent_image_ids_2(original_height, original_width, device, dtype)
    latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
    latent_image_ids = latent_image_ids.reshape(
        latent_image_id_height * latent_image_id_width, latent_image_id_channels
    )
    cond_latent_image_ids = latent_image_ids
    latent_image_ids = torch.concat([latent_image_ids, cond_latent_image_ids], dim=-2)
    return latent_image_ids

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
    r"""
    The Flux pipeline for text-to-image generation.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Args:
        transformer ([`FluxTransformer2DModel`]):
            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([`T5EncoderModel`]):
            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`T5TokenizerFast`):
            Second Tokenizer of class
            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
    """

    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
    _optional_components = []
    _callback_tensor_inputs = ["latents", "prompt_embeds"]

    def __init__(
        self,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        text_encoder_2: T5EncoderModel,
        tokenizer_2: T5TokenizerFast,
        transformer: FluxTransformer2DModel,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            tokenizer=tokenizer,
            tokenizer_2=tokenizer_2,
            transformer=transformer,
            scheduler=scheduler,
        )
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.tokenizer_max_length = (
            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
        )
        self.default_sample_size = 64

    def _get_t5_prompt_embeds(
        self,
        prompt: Union[str, List[str]] = None,
        num_images_per_prompt: int = 1,
        max_sequence_length: int = 512,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        device = device or self._execution_device
        dtype = dtype or self.text_encoder.dtype

        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        text_inputs = self.tokenizer_2(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
            logger.warning(
                "The following part of your input was truncated because `max_sequence_length` is set to "
                f" {max_sequence_length} tokens: {removed_text}"
            )

        prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]

        dtype = self.text_encoder_2.dtype
        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        _, seq_len, _ = prompt_embeds.shape

        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        return prompt_embeds

    def _get_clip_prompt_embeds(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int = 1,
        device: Optional[torch.device] = None,
    ):
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer_max_length,
            truncation=True,
            return_overflowing_tokens=False,
            return_length=False,
            return_tensors="pt",
        )

        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer_max_length} tokens: {removed_text}"
            )
        prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)

        # Use pooled output of CLIPTextModel
        prompt_embeds = prompt_embeds.pooler_output
        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds

    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        prompt_2: Union[str, List[str]],
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_sequence_length: int = 512,
        lora_scale: Optional[float] = None,
    ):
        r"""

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        """
        device = device or self._execution_device

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if self.text_encoder is not None and USE_PEFT_BACKEND:
                scale_lora_layers(self.text_encoder, lora_scale)
            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
                scale_lora_layers(self.text_encoder_2, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

            # We only use the pooled prompt output from the CLIPTextModel
            pooled_prompt_embeds = self._get_clip_prompt_embeds(
                prompt=prompt,
                device=device,
                num_images_per_prompt=num_images_per_prompt,
            )
            prompt_embeds = self._get_t5_prompt_embeds(
                prompt=prompt_2,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
            )

        if self.text_encoder is not None:
            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        if self.text_encoder_2 is not None:
            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder_2, lora_scale)

        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)

        return prompt_embeds, pooled_prompt_embeds, text_ids

    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
        if isinstance(generator, list):
            image_latents = [
                retrieve_latents(self.vae.encode(image[i: i + 1]), generator=generator[i])
                for i in range(image.shape[0])
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)

        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor

        return image_latents

    def check_inputs(
        self,
        prompt,
        prompt_2,
        height,
        width,
        prompt_embeds=None,
        pooled_prompt_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        max_sequence_length=None,
    ):
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt_2 is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

        if prompt_embeds is not None and pooled_prompt_embeds is None:
            raise ValueError(
                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
            )

        if max_sequence_length is not None and max_sequence_length > 512:
            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")

    @staticmethod
    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
        latent_image_ids = latent_image_ids.reshape(
            latent_image_id_height * latent_image_id_width, latent_image_id_channels
        )
        return latent_image_ids.to(device=device, dtype=dtype)

    @staticmethod
    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
        latents = latents.permute(0, 2, 4, 1, 3, 5)
        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
        return latents

    @staticmethod
    def _unpack_latents(latents, height, width, vae_scale_factor):
        batch_size, num_patches, channels = latents.shape

        height = height // vae_scale_factor
        width = width // vae_scale_factor

        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
        latents = latents.permute(0, 3, 1, 4, 2, 5)

        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)

        return latents

    def enable_vae_slicing(self):
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
        r"""
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
        r"""
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
        self.vae.disable_tiling()

    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
        condition_image=None,
    ):
        height = 2 * (int(height) // self.vae_scale_factor)
        width = 2 * (int(width) // self.vae_scale_factor)

        shape = (batch_size, num_channels_latents, height, width)  # 1 16 106 80

        if latents is not None:
            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
            return latents.to(device=device, dtype=dtype), latent_image_ids

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )
        if condition_image is not None:
            condition_image = condition_image.to(device=device, dtype=dtype)
            image_latents = self._encode_vae_image(image=condition_image, generator=generator)
            if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
                # expand init_latents for batch_size
                additional_image_per_prompt = batch_size // image_latents.shape[0]
                image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
            elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
                raise ValueError(
                    f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
                )
            else:
                image_latents = torch.cat([image_latents], dim=0)

        # import pdb; pdb.set_trace()
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
        cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width)
        latents = torch.concat([latents, cond_latents], dim=-2)

        latent_image_ids = position_encoding_clone(batch_size, height, width, device, dtype)  # add position

        mask1 = torch.ones(shape, device=device, dtype=dtype)
        mask2 = torch.zeros(shape, device=device, dtype=dtype)
        mask1 = self._pack_latents(mask1, batch_size, num_channels_latents, height, width)  # 1 4096 64
        mask2 = self._pack_latents(mask2, batch_size, num_channels_latents, height, width)  # 1 4096 64
        mask = torch.concat([mask1, mask2], dim=-2)
        return latents, latent_image_ids, mask, cond_latents

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def joint_attention_kwargs(self):
        return self._joint_attention_kwargs

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 28,
        timesteps: List[int] = None,
        guidance_scale: float = 3.5,
        num_images_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 512,
        condition_image=None,
    ):
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            prompt_2,
            height,
            width,
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )

        self._guidance_scale = guidance_scale
        self._joint_attention_kwargs = joint_attention_kwargs
        self._interrupt = False

        condition_image = self.image_processor.preprocess(condition_image, height=height, width=width)
        condition_image = condition_image.to(dtype=torch.float32)

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        lora_scale = (
            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
        )
        (
            prompt_embeds,
            pooled_prompt_embeds,
            text_ids,
        ) = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
            lora_scale=lora_scale,
        )

        # 4. Prepare latent variables
        num_channels_latents = self.transformer.config.in_channels // 4  # 16
        latents, latent_image_ids, mask, cond_latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
            condition_image
        )
        clean_latents = latents.clone()

        # 5. Prepare timesteps
        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
        image_seq_len = latents.shape[1]
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.base_image_seq_len,
            self.scheduler.config.max_image_seq_len,
            self.scheduler.config.base_shift,
            self.scheduler.config.max_shift,
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            timesteps,
            sigmas,
            mu=mu,
        )
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        self._num_timesteps = len(timesteps)

        # handle guidance
        if self.transformer.config.guidance_embeds:
            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
            guidance = guidance.expand(latents.shape[0])
        else:
            guidance = None

        # 6. Denoising loop
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latents.shape[0]).to(latents.dtype)
                noise_pred = self.transformer(
                    hidden_states=latents,  # 1 4096 64
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    joint_attention_kwargs=self.joint_attention_kwargs,
                    return_dict=False,
                )[0]

                # compute the previous noisy sample x_t -> x_t-1
                latents_dtype = latents.dtype
                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
                latents = latents * mask + clean_latents * (1 - mask)

                if latents.dtype != latents_dtype:
                    if torch.backends.mps.is_available():
                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                        latents = latents.to(latents_dtype)

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        if output_type == "latent":
            image = latents

        else:
            latents = self._unpack_latents(latents[:, :latents.shape[-2] - cond_latents.shape[-2], :], height, width, self.vae_scale_factor)
            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
            image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0]
            image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return FluxPipelineOutput(images=image)
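A minimal sketch illustrating the latent packing scheme used above: `_pack_latents` turns each 2x2 latent patch into one token and `_unpack_latents` inverts it. The shapes assume a 512x512 image and the default vae_scale_factor of 16; no model weights are needed since both helpers are static methods:

    import torch
    from src.pipeline_pe_clone import FluxPipeline

    batch, channels, latent_h, latent_w = 1, 16, 64, 64
    latents = torch.randn(batch, channels, latent_h, latent_w)

    # pack: (1, 16, 64, 64) -> (1, 1024, 64), one token per 2x2 latent patch
    packed = FluxPipeline._pack_latents(latents, batch, channels, latent_h, latent_w)
    print(packed.shape)  # torch.Size([1, 1024, 64])

    # unpack back to the original latent grid and check the round trip
    unpacked = FluxPipeline._unpack_latents(packed, height=512, width=512, vae_scale_factor=16)
    assert torch.allclose(latents, unpacked)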
src/prompt_helper.py
ADDED
@@ -0,0 +1,205 @@
import torch


def load_text_encoders(args, class_one, class_two):
    text_encoder_one = class_one.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
    )
    text_encoder_two = class_two.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
    )
    return text_encoder_one, text_encoder_two


def tokenize_prompt(tokenizer, prompt, max_sequence_length):
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=max_sequence_length,
        truncation=True,
        return_length=False,
        return_overflowing_tokens=False,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    return text_input_ids


def tokenize_prompt_clip(tokenizer, prompt):
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_length=False,
        return_overflowing_tokens=False,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    return text_input_ids


def tokenize_prompt_t5(tokenizer, prompt):
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_length=False,
        return_overflowing_tokens=False,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    return text_input_ids


def _encode_prompt_with_t5(
    text_encoder,
    tokenizer,
    max_sequence_length=512,
    prompt=None,
    num_images_per_prompt=1,
    device=None,
    text_input_ids=None,
):
    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    if tokenizer is not None:
        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
    else:
        if text_input_ids is None:
            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")

    prompt_embeds = text_encoder(text_input_ids.to(device))[0]

    dtype = text_encoder.dtype
    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

    _, seq_len, _ = prompt_embeds.shape

    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    return prompt_embeds


def _encode_prompt_with_clip(
    text_encoder,
    tokenizer,
    prompt: str,
    device=None,
    text_input_ids=None,
    num_images_per_prompt: int = 1,
):
    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    if tokenizer is not None:
        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_overflowing_tokens=False,
            return_length=False,
            return_tensors="pt",
        )

        text_input_ids = text_inputs.input_ids
    else:
        if text_input_ids is None:
            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")

    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)

    # Use pooled output of CLIPTextModel
    prompt_embeds = prompt_embeds.pooler_output
    prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)

    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)

    return prompt_embeds


def encode_prompt(
    text_encoders,
    tokenizers,
    prompt: str,
    max_sequence_length,
    device=None,
    num_images_per_prompt: int = 1,
    text_input_ids_list=None,
):
    prompt = [prompt] if isinstance(prompt, str) else prompt
    dtype = text_encoders[0].dtype

    pooled_prompt_embeds = _encode_prompt_with_clip(
        text_encoder=text_encoders[0],
        tokenizer=tokenizers[0],
        prompt=prompt,
        device=device if device is not None else text_encoders[0].device,
        num_images_per_prompt=num_images_per_prompt,
        text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
    )

    prompt_embeds = _encode_prompt_with_t5(
        text_encoder=text_encoders[1],
        tokenizer=tokenizers[1],
        max_sequence_length=max_sequence_length,
        prompt=prompt,
        num_images_per_prompt=num_images_per_prompt,
        device=device if device is not None else text_encoders[1].device,
        text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
    )

    text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)

    return prompt_embeds, pooled_prompt_embeds, text_ids


def encode_token_ids(text_encoders, tokens, accelerator, num_images_per_prompt=1, device=None):
    text_encoder_clip = text_encoders[0]
    text_encoder_t5 = text_encoders[1]
    tokens_clip, tokens_t5 = tokens[0], tokens[1]
    batch_size = tokens_clip.shape[0]

    if device == "cpu":
        device = "cpu"
    else:
        device = accelerator.device

    # clip
    prompt_embeds = text_encoder_clip(tokens_clip.to(device), output_hidden_states=False)
    # Use pooled output of CLIPTextModel
    prompt_embeds = prompt_embeds.pooler_output
    prompt_embeds = prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    pooled_prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
    pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)

    # t5
    prompt_embeds = text_encoder_t5(tokens_t5.to(device))[0]
    dtype = text_encoder_t5.dtype
    prompt_embeds = prompt_embeds.to(dtype=dtype, device=accelerator.device)
    _, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=accelerator.device, dtype=dtype)

    return prompt_embeds, pooled_prompt_embeds, text_ids
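A rough usage sketch for these helpers, assuming access to the gated FLUX.1-dev checkpoint and enough memory to hold both text encoders; the prompt and the pairing of tokenize_prompt_clip / tokenize_prompt_t5 with encode_token_ids are illustrative, not a prescribed training recipe:

    import torch
    from accelerate import Accelerator
    from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
    from src.prompt_helper import tokenize_prompt_clip, tokenize_prompt_t5, encode_token_ids

    repo = "black-forest-labs/FLUX.1-dev"
    tok_clip = CLIPTokenizer.from_pretrained(repo, subfolder="tokenizer")
    tok_t5 = T5TokenizerFast.from_pretrained(repo, subfolder="tokenizer_2")
    enc_clip = CLIPTextModel.from_pretrained(repo, subfolder="text_encoder")
    enc_t5 = T5EncoderModel.from_pretrained(repo, subfolder="text_encoder_2")

    accelerator = Accelerator()
    enc_clip.to(accelerator.device)
    enc_t5.to(accelerator.device)

    prompt = "add a halo and wings for the cat by sksmagiceffects"
    tokens = (tokenize_prompt_clip(tok_clip, prompt), tokenize_prompt_t5(tok_t5, prompt))
    with torch.no_grad():
        prompt_embeds, pooled_prompt_embeds, text_ids = encode_token_ids(
            (enc_clip, enc_t5), tokens, accelerator
        )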