Spaces: Running on Zero

Commit · 29c7873
Parent(s): bde9560

Added capability for uploading existing images to the UI.

Files changed:
- app.py +10 -2
- concept_attention/concept_attention_pipeline.py +283 -88
- concept_attention/flux/src/flux/sampling.py +12 -8
- concept_attention/image_generator.py +2 -2
- concept_attention/modified_double_stream_block.py +23 -14
- concept_attention/modified_flux_dit.py +14 -9
- concept_attention/segmentation.py +1 -0
- experiments/test_image_encoder/test_encode_image.py +20 -0
app.py
CHANGED
@@ -45,6 +45,12 @@ def encode_image(image, prompt, concepts, seed, layer_start_index, noise_timestep
     if len(concepts) > 9:
         raise gr.Error("Please enter at most 9 concepts", duration=10)

+    print(f"Num samples: {num_samples}")
+    print(f"Layer start index: {layer_start_index}")
+    print(f"Noise timestep: {noise_timestep}")
+    print(image)
+    image = image.convert("RGB")
+
     pipeline_output = pipeline.encode_image(
         image=image,
         prompt=prompt,
@@ -318,7 +324,7 @@ with gr.Blocks(
        with gr.Column(scale=1, min_width=250):
            generated_image = gr.Image(
                elem_classes="generated-image",
-               show_label=False
+               show_label=False,
            )

        with gr.Column(scale=4):
@@ -419,7 +425,9 @@ with gr.Blocks(
            input_image = gr.Image(
                elem_classes="generated-image",
                show_label=False,
-               interactive=True
+               interactive=True,
+               type="pil",
+               image_mode="RGB",
            )

        with gr.Column(scale=4):
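The key UI change is the input_image component: with type="pil" and image_mode="RGB", Gradio should hand the handler an RGB PIL image, and the new image.convert("RGB") call in the handler acts as an extra guard. A minimal wiring sketch, not the Space's actual layout; the handler body and component names here are illustrative only:

    import gradio as gr

    def encode_image(image, prompt, concepts):
        # `image` arrives as a PIL.Image.Image in RGB mode because of type="pil" / image_mode="RGB".
        # A real handler would forward it to pipeline.encode_image(...) as in the diff above.
        return []

    with gr.Blocks() as demo:
        input_image = gr.Image(show_label=False, interactive=True, type="pil", image_mode="RGB")
        prompt = gr.Textbox(label="Prompt")
        concepts = gr.Textbox(label="Concepts")
        heatmaps = gr.Gallery(label="Concept heatmaps")
        gr.Button("Encode").click(encode_image, inputs=[input_image, prompt, concepts], outputs=heatmaps)

    demo.launch()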
concept_attention/concept_attention_pipeline.py
CHANGED
@@ -5,8 +5,12 @@ from dataclasses import dataclass
 import PIL
 import numpy as np
 import matplotlib.pyplot as plt
+from concept_attention.flux.src.flux.sampling import prepare
+from concept_attention.segmentation import add_noise_to_image, encode_image
+from concept_attention.utils import embed_concepts, linear_normalization
 import torch
 import einops
+from tqdm import tqdm

 from concept_attention.binary_segmentation_baselines.raw_cross_attention import RawCrossAttentionBaseline, RawCrossAttentionSegmentationModel
 from concept_attention.binary_segmentation_baselines.raw_output_space import RawOutputSpaceBaseline, RawOutputSpaceSegmentationModel
@@ -18,6 +22,65 @@ class ConceptAttentionPipelineOutput():
     concept_heatmaps: list[PIL.Image.Image]
     cross_attention_maps: list[PIL.Image.Image]

+
+def compute_heatmaps_from_vectors(
+    image_vectors,
+    concept_vectors,
+    layer_indices: list[int],
+    timesteps: list[int] = list(range(4)),
+    softmax: bool = True,
+    normalize_concepts: bool = True
+):
+    """
+    Accepts image vectors and concept vectors. These can be from cross attentions or attention outputs.
+    """
+    print(f"Image vectors shape: {image_vectors.shape}")
+    print(f"Concept vectors shape: {concept_vectors.shape}")
+    # Check if there are heads in the input
+    if len(image_vectors.shape) == 6:
+        # Collapse the had dimension
+        image_vectors = einops.rearrange(
+            image_vectors,
+            "time layers batch head patches dim -> time layers batch patches (head dim)"
+        )
+        concept_vectors = einops.rearrange(
+            concept_vectors,
+            "time layers batch head concepts dim -> time layers batch concepts (head dim)"
+        )
+
+    # Apply linear normalization to concepts
+    if normalize_concepts:
+        concept_vectors = linear_normalization(concept_vectors, dim=-2)
+
+    # Compute heatmaps
+    heatmaps = einops.einsum(
+        image_vectors,
+        concept_vectors,
+        "time layers batch patches dim, time layers batch concepts dim -> time layers batch concepts patches",
+    )
+
+    # Apply softmax
+    if softmax:
+        heatmaps = torch.nn.functional.softmax(heatmaps, dim=-2)
+    # Pull out the timesteps and layers
+    heatmaps = heatmaps[timesteps]
+    heatmaps = heatmaps[:, layer_indices]
+    # Average over the heatmaps
+    heatmaps = einops.reduce(
+        heatmaps,
+        "time layers batch concepts patches -> batch concepts patches",
+        reduction="mean"
+    )
+    heatmaps = einops.rearrange(
+        heatmaps,
+        "batch concepts (h w) -> batch concepts h w",
+        h=64,
+        w=64
+    )
+
+    return heatmaps
+
 class ConceptAttentionFluxPipeline():
     """
     This is an object that allows you to generate images with flux, and
@@ -66,7 +129,7 @@ class ConceptAttentionFluxPipeline():
         if timesteps is None:
             timesteps = list(range(num_inference_steps))
         # Run the raw output space object
-        image, ...
+        image, concept_attention_dict = self.flux_generator.generate_image(
            width=width,
            height=height,
            prompt=prompt,
@@ -75,56 +138,43 @@ class ConceptAttentionFluxPipeline():
            seed=seed,
            guidance=guidance,
        )
-        ...
-            "time layers batch concepts patches -> batch concepts patches",
-            reduction="mean"
-        )
-        concept_heatmaps = einops.rearrange(
-            concept_heatmaps,
-            "batch concepts (h w) -> batch concepts h w",
-            h=64,
-            w=64
-        )
-        # Cross attention maps
-        if softmax:
-            cross_attention_maps = torch.nn.functional.softmax(cross_attention_maps, dim=-2)
-
-        cross_attention_maps = cross_attention_maps[:, layer_indices]
-        cross_attention_maps = einops.reduce(
-            cross_attention_maps,
-            "time layers batch concepts patches -> batch concepts patches",
-            reduction="mean"
-        )
-        ...
+
+        cross_attention_maps = compute_heatmaps_from_vectors(
+            concept_attention_dict["cross_attention_image_vectors"],
+            concept_attention_dict["cross_attention_concept_vectors"],
+            layer_indices=layer_indices,
+            timesteps=timesteps,
+            softmax=softmax
+        )
+        concept_heatmaps = compute_heatmaps_from_vectors(
+            concept_attention_dict["output_space_image_vectors"],
+            concept_attention_dict["output_space_concept_vectors"],
+            layer_indices=layer_indices,
+            timesteps=timesteps,
+            softmax=softmax
+        )
+
         concept_heatmaps = concept_heatmaps.to(torch.float32).detach().cpu().numpy()[0]
         cross_attention_maps = cross_attention_maps.to(torch.float32).detach().cpu().numpy()[0]
         # Convert the torch heatmaps to PIL images.
         if return_pil_heatmaps:
+            concept_heatmaps_min = concept_heatmaps.min()
+            concept_heatmaps_max = concept_heatmaps.max()
             # Convert to a matplotlib color scheme
             colored_heatmaps = []
             for concept_heatmap in concept_heatmaps:
-                concept_heatmap = (concept_heatmap - ...
+                concept_heatmap = (concept_heatmap - concept_heatmaps_min) / (concept_heatmaps_max - concept_heatmaps_min)
                 colored_heatmap = plt.get_cmap(cmap)(concept_heatmap)
                 rgb_image = (colored_heatmap[:, :, :3] * 255).astype(np.uint8)
                 colored_heatmaps.append(rgb_image)

             concept_heatmaps = [PIL.Image.fromarray(concept_heatmap) for concept_heatmap in colored_heatmaps]

+            cross_attention_min = cross_attention_maps.min()
+            cross_attention_max = cross_attention_maps.max()
             colored_cross_attention_maps = []
             for cross_attention_map in cross_attention_maps:
-                cross_attention_map = (cross_attention_map - ...
+                cross_attention_map = (cross_attention_map - cross_attention_min) / (cross_attention_max - cross_attention_min)
                 colored_cross_attention_map = plt.get_cmap(cmap)(cross_attention_map)
                 rgb_image = (colored_cross_attention_map[:, :, :3] * 255).astype(np.uint8)
                 colored_cross_attention_maps.append(rgb_image)
@@ -137,58 +187,203 @@ class ConceptAttentionFluxPipeline():
             cross_attention_maps=cross_attention_maps
         )

+    def encode_image(
+        self,
+        image: PIL.Image.Image,
+        concepts: list[str],
+        prompt: str = "", # Optional
+        width: int = 1024,
+        height: int = 1024,
+        layer_indices = list(range(15, 19)),
+        num_samples: int = 1,
+        num_steps: int = 4,
+        noise_timestep: int = 2,
+        device: str = "cuda:0",
+        return_pil_heatmaps: bool = True,
+        seed: int = 0,
+        cmap="plasma",
+        stop_after_multi_modal_attentions=True,
+        softmax=True
+    ) -> ConceptAttentionPipelineOutput:
+        """
+        Encode an image with flux, given a list of concepts.
+        """
+        assert all([layer_index >= 0 and layer_index < 19 for layer_index in layer_indices]), "Invalid layer index"
+        assert height == width, "Height and width must be the same for now"
+        print("Encoding image")
+
+        # Encode the image into the VAE latent space
+        encoded_image_without_noise = encode_image(
+            image,
+            self.flux_generator.ae,
+            offload=self.flux_generator.offload,
+            device=device,
+        )
+        # Do N trials
+        combined_concept_attention_dict = {
+            "cross_attention_image_vectors": [],
+            "cross_attention_concept_vectors": [],
+            "output_space_image_vectors": [],
+            "output_space_concept_vectors": []
+        }
+        print("Sampling")
+        for i in tqdm(range(num_samples)):
+            # Add noise to image
+            encoded_image, timesteps = add_noise_to_image(
+                encoded_image_without_noise,
+                num_steps=num_steps,
+                noise_timestep=noise_timestep,
+                seed=seed + i,
+                width=width,
+                height=height,
+                device=device,
+                is_schnell=self.flux_generator.is_schnell,
+            )
+            # Now run the diffusion model once on the noisy image
+            # Encode the concept vectors
+
+            if self.flux_generator.offload:
+                self.flux_generator.t5, self.flux_generator.clip = self.flux_generator.t5.to(device), self.flux_generator.clip.to(device)
+            inp = prepare(t5=self.flux_generator.t5, clip=self.flux_generator.clip, img=encoded_image, prompt=prompt)
+
+            concept_embeddings, concept_ids, concept_vec = embed_concepts(
+                self.flux_generator.clip,
+                self.flux_generator.t5,
+                concepts,
+            )
+
+            inp["concepts"] = concept_embeddings.to(encoded_image.device)
+            inp["concept_ids"] = concept_ids.to(encoded_image.device)
+            inp["concept_vec"] = concept_vec.to(encoded_image.device)
+            # offload TEs to CPU, load model to gpu
+            if self.flux_generator.offload:
+                self.flux_generator.t5, self.flux_generator.clip = self.flux_generator.t5.cpu(), self.flux_generator.clip.cpu()
+                torch.cuda.empty_cache()
+                self.flux_generator.model = self.flux_generator.model.to(device)
+            # Denoise the intermediate images
+            guidance_vec = torch.full((encoded_image.shape[0],), 0.0, device=encoded_image.device, dtype=encoded_image.dtype)
+            t_curr = timesteps[0]
+            t_prev = timesteps[1]
+            t_vec = torch.full((encoded_image.shape[0],), t_curr, dtype=encoded_image.dtype, device=encoded_image.device)
+            _, concept_attention_dict = self.flux_generator.model(
+                img=inp["img"],
+                img_ids=inp["img_ids"],
+                txt=inp["txt"],
+                txt_ids=inp["txt_ids"],
+                concepts=inp["concepts"],
+                concept_ids=inp["concept_ids"],
+                concept_vec=inp["concept_vec"],
+                y=inp["concept_vec"],
+                timesteps=t_vec,
+                guidance=guidance_vec,
+                stop_after_multimodal_attentions=stop_after_multi_modal_attentions, # Always true for the demo
+                joint_attention_kwargs=None,
+            )
+
+            for key in combined_concept_attention_dict.keys():
+                combined_concept_attention_dict[key].append(concept_attention_dict[key])
+
+            # all_concept_heatmaps.append(concept_heatmaps)
+            # all_cross_attention_maps.append(cross_attention_maps)
+
+        # Pull out the concept and image vectors from each block
+        for key in combined_concept_attention_dict.keys():
+            combined_concept_attention_dict[key] = torch.stack(combined_concept_attention_dict[key]).squeeze(1)
+
+        # Compute the heatmaps
+        concept_heatmaps = compute_heatmaps_from_vectors(
+            combined_concept_attention_dict["output_space_image_vectors"],
+            combined_concept_attention_dict["output_space_concept_vectors"],
+            layer_indices=layer_indices,
+            timesteps=timesteps,
+            softmax=softmax
+        )
+
+        cross_attention_maps = compute_heatmaps_from_vectors(
+            combined_concept_attention_dict["cross_attention_image_vectors"],
+            combined_concept_attention_dict["cross_attention_concept_vectors"],
+            layer_indices=layer_indices,
+            timesteps=timesteps,
+            softmax=softmax
+        )
+
+        # # Pull out the concept and image vectors from each block
+        # image_vectors = torch.stack(self.flux_generator.model.image_vectors).squeeze(1)
+        # concept_vectors = torch.stack(self.flux_generator.model.concept_vectors).squeeze(1)
+        # # Apply linear normalization ot the concept vectors
+        # if True:
+        #     concept_vectors = linear_normalization(concept_vectors, dim=-2)
+        # # Compute the heatmaps
+        # concept_heatmaps = einops.einsum(
+        #     image_vectors,
+        #     concept_vectors,
+        #     "time layers batch patches dim, time layers batch concepts dim -> time layers batch concepts patches",
+        # )
+        # concept_heatmaps = torch.stack(all_concept_heatmaps, dim=0)
+        # cross_attention_maps = torch.stack(all_cross_attention_maps, dim=0)
+        # Concept heamaps extraction
+        # if softmax:
+        #     concept_heatmaps = torch.nn.functional.softmax(concept_heatmaps, dim=-2)
+
+        # concept_heatmaps = concept_heatmaps[:, layer_indices]
+        # concept_heatmaps = einops.reduce(
+        #     concept_heatmaps,
+        #     "time layers batch concepts patches -> batch concepts patches",
+        #     reduction="mean"
+        # )
+        # concept_heatmaps = einops.rearrange(
+        #     concept_heatmaps,
+        #     "batch concepts (h w) -> batch concepts h w",
+        #     h=64,
+        #     w=64
+        # )
+        # Cross attention maps
+        # if softmax:
+        #     cross_attention_maps = torch.nn.functional.softmax(cross_attention_maps, dim=-2)
+
+        # cross_attention_maps = cross_attention_maps[:, layer_indices]
+        # cross_attention_maps = einops.reduce(
+        #     cross_attention_maps,
+        #     "time layers batch concepts patches -> batch concepts patches",
+        #     reduction="mean"
+        # )
+        # cross_attention_maps = einops.rearrange(
+        #     cross_attention_maps,
+        #     "batch concepts (h w) -> batch concepts h w",
+        #     h=64,
+        #     w=64
+        # )
+        concept_heatmaps = concept_heatmaps.to(torch.float32).detach().cpu().numpy()[0]
+        # cross_attention_maps = cross_attention_maps.to(torch.float32).detach().cpu().numpy()[0]
+        cross_attention_maps = concept_heatmaps
+        # Convert the torch heatmaps to PIL images.
+        if return_pil_heatmaps:
+            concept_heatmaps_min = concept_heatmaps.min()
+            concept_heatmaps_max = concept_heatmaps.max()
+            # Convert to a matplotlib color scheme
+            colored_heatmaps = []
+            for concept_heatmap in concept_heatmaps:
+                concept_heatmap = (concept_heatmap - concept_heatmaps_min) / (concept_heatmaps_max - concept_heatmaps_min)
+                colored_heatmap = plt.get_cmap(cmap)(concept_heatmap)
+                rgb_image = (colored_heatmap[:, :, :3] * 255).astype(np.uint8)
+                colored_heatmaps.append(rgb_image)
+
+            concept_heatmaps = [PIL.Image.fromarray(concept_heatmap) for concept_heatmap in colored_heatmaps]
+
+            cross_attention_min = cross_attention_maps.min()
+            cross_attention_max = cross_attention_maps.max()
+            colored_cross_attention_maps = []
+            for cross_attention_map in cross_attention_maps:
+                cross_attention_map = (cross_attention_map - cross_attention_min) / (cross_attention_max - cross_attention_min)
+                colored_cross_attention_map = plt.get_cmap(cmap)(cross_attention_map)
+                rgb_image = (colored_cross_attention_map[:, :, :3] * 255).astype(np.uint8)
+                colored_cross_attention_maps.append(rgb_image)
+
+            cross_attention_maps = [PIL.Image.fromarray(cross_attention_map) for cross_attention_map in colored_cross_attention_maps]
+
+        return ConceptAttentionPipelineOutput(
+            image=image,
+            concept_heatmaps=concept_heatmaps,
+            cross_attention_maps=cross_attention_maps
+        )
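To make the shape bookkeeping in the new compute_heatmaps_from_vectors helper concrete, here is a standalone sketch with dummy tensors. It is not repo code: the sizes are illustrative, and the optional linear_normalization step over the concept axis is skipped.

    import torch
    import einops

    # Dummy sizes: 4 denoising steps, 19 double-stream blocks, 1 image,
    # a 64x64 patch grid, 2 concepts, and an illustrative feature dim of 16.
    time, layers, batch, patches, concepts, dim = 4, 19, 1, 64 * 64, 2, 16
    image_vectors = torch.randn(time, layers, batch, patches, dim)
    concept_vectors = torch.randn(time, layers, batch, concepts, dim)

    # Dot product of every concept vector with every image-patch vector
    heatmaps = einops.einsum(
        image_vectors, concept_vectors,
        "time layers batch patches dim, time layers batch concepts dim -> time layers batch concepts patches",
    )
    # Softmax over the concept axis (dim=-2), as in the diff
    heatmaps = torch.nn.functional.softmax(heatmaps, dim=-2)
    # Keep the requested timesteps and layers, then average them away
    heatmaps = heatmaps[list(range(4))][:, list(range(15, 19))]
    heatmaps = einops.reduce(heatmaps, "time layers batch concepts patches -> batch concepts patches", "mean")
    # Unflatten the patch axis into a 64x64 spatial grid
    heatmaps = einops.rearrange(heatmaps, "batch concepts (h w) -> batch concepts h w", h=64, w=64)
    print(heatmaps.shape)  # torch.Size([1, 2, 64, 64])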
concept_attention/flux/src/flux/sampling.py
CHANGED
@@ -111,14 +111,18 @@ def denoise(
     joint_attention_kwargs=None,
 ):
     intermediate_images = [img]
-    ...
+    combined_concept_attention_dict = {
+        "output_space_concept_vectors": [],
+        "output_space_image_vectors": [],
+        "cross_attention_concept_vectors": [],
+        "cross_attention_image_vectors": [],
+    }
     # this is ignored for schnell
     guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
     iteration = 0
     for t_curr, t_prev in tqdm(zip(timesteps[:-1], timesteps[1:])):
         t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
-        pred, ...
+        pred, concept_attention_dict = model(
             img=img,
             img_ids=img_ids,
             txt=txt,
@@ -138,13 +142,13 @@ def denoise(
         # increment iteration
         iteration += 1

-        ...
+        for key in combined_concept_attention_dict.keys():
+            combined_concept_attention_dict[key].append(concept_attention_dict[key])

-        ...
+    for key in combined_concept_attention_dict.keys():
+        combined_concept_attention_dict[key] = torch.stack(combined_concept_attention_dict[key], dim=0)

-    return img, intermediate_images, ...
+    return img, intermediate_images, combined_concept_attention_dict

 def unpack(x: Tensor, height: int, width: int) -> Tensor:
     return rearrange(
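The denoise loop uses an accumulate-then-stack pattern: each step's concept_attention_dict (already stacked over the 19 double-stream blocks inside the model, see modified_flux_dit.py below) is appended, and torch.stack(..., dim=0) adds the leading time axis that compute_heatmaps_from_vectors expects. A small sketch with dummy tensors (shapes illustrative, not the model's true sizes):

    import torch

    num_steps, num_layers = 4, 19
    combined = {"output_space_image_vectors": []}           # one list per key in practice
    for _ in range(num_steps - 1):                           # one entry per (t_curr, t_prev) pair
        per_step = torch.randn(num_layers, 1, 16, 8)         # (layers, batch, patches, dim), illustrative
        combined["output_space_image_vectors"].append(per_step)
    for key in combined.keys():
        combined[key] = torch.stack(combined[key], dim=0)    # adds the leading "time" axis
    print(combined["output_space_image_vectors"].shape)      # torch.Size([3, 19, 1, 16, 8])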
concept_attention/image_generator.py
CHANGED
@@ -171,7 +171,7 @@ class FluxGenerator():
             torch.cuda.empty_cache()
             self.model = self.model.to(self.device)
         # denoise initial noise
-        x, ...
+        x, _, concept_attention_dict = denoise(
             self.model,
             **inp,
             timesteps=timesteps,
@@ -203,4 +203,4 @@ class FluxGenerator():

         img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())

-        return img, ...
+        return img, concept_attention_dict
concept_attention/modified_double_stream_block.py
CHANGED
@@ -1,3 +1,4 @@
+from concept_attention.utils import linear_normalization
 import torch
 from torch import nn, Tensor
 import einops
@@ -77,6 +78,7 @@ class ModifiedDoubleStreamBlock(nn.Module):
         concept_vec: Tensor,
         concept_pe: Tensor,
         joint_attention_kwargs=None,
+        normalize_concepts=True,
         **kwargs
     ) -> tuple[Tensor, Tensor]:
         assert concept_vec is not None, "Concept vectors must be provided for this implementation."
@@ -175,19 +177,26 @@ class ModifiedDoubleStreamBlock(nn.Module):
         concept_attn = einops.rearrange(concept_attn, "B H L D -> B L (H D)")
         img_attn = einops.rearrange(img_attn, "B H L D -> B L (H D)")

-        ...
+        concept_attention_dict = {
+            "output_space_concept_vectors": concept_attn,
+            "output_space_image_vectors": img_attn,
+            "cross_attention_concept_vectors": concept_q,
+            "cross_attention_image_vectors": img_q
+        }
+
+        # # Compute the cross attentions
+        # cross_attention_maps = einops.einsum(
+        #     concept_q,
+        #     img_q,
+        #     "batch head concepts dim, batch had patches dim -> batch head concepts patches"
+        # )
+        # cross_attention_maps = einops.reduce(cross_attention_maps, "batch head concepts patches -> batch concepts patches", reduction="mean")
+        # # Compute the concept attentions
+        # concept_attention_maps = einops.einsum(
+        #     concept_attn,
+        #     img_attn,
+        #     "batch concepts dim, batch patches dim -> batch concepts patches"
+        # )
         # Do the block updates
         # Calculate the img blocks
         img = img + img_mod1.gate * self.img_attn.proj(img_attn)
@@ -200,4 +209,4 @@ class ModifiedDoubleStreamBlock(nn.Module):
         concepts = concepts + concept_mod1.gate * self.txt_attn.proj(concept_attn)
         concepts = concepts + concept_mod2.gate * self.txt_mlp((1 + concept_mod2.scale) * self.txt_norm2(concepts) + concept_mod2.shift)

-        return img, txt, concepts, ...
+        return img, txt, concepts, concept_attention_dict
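The dictionary returned by the block stores two families of vectors: the "output space" entries are the per-token attention outputs with heads already flattened to (B, L, H*D), while the "cross attention" entries are the raw query projections that keep their head axis. The commented-out einsums in the diff indicate how either pair becomes a concepts x patches score map; a dummy-tensor sketch of both (not repo code, sizes illustrative):

    import torch
    import einops

    B, H, D = 1, 24, 128            # batch, attention heads, per-head dim (illustrative)
    patches, concepts = 256, 2

    # "Output space" vectors: attention outputs with heads collapsed via "B H L D -> B L (H D)"
    img_attn = torch.randn(B, patches, H * D)
    concept_attn = torch.randn(B, concepts, H * D)
    concept_maps = einops.einsum(
        concept_attn, img_attn,
        "batch concepts dim, batch patches dim -> batch concepts patches",
    )

    # "Cross attention" vectors: raw query projections that keep the head axis
    img_q = torch.randn(B, H, patches, D)
    concept_q = torch.randn(B, H, concepts, D)
    cross_maps = einops.einsum(
        concept_q, img_q,
        "batch head concepts dim, batch head patches dim -> batch head concepts patches",
    )
    cross_maps = einops.reduce(cross_maps, "batch head concepts patches -> batch concepts patches", "mean")

    print(concept_maps.shape, cross_maps.shape)  # torch.Size([1, 2, 256]) twice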
concept_attention/modified_flux_dit.py
CHANGED
@@ -119,10 +119,14 @@ class ModifiedFluxDiT(nn.Module):
         concept_vec = concept_vec + self.vector_in(original_concept_vec)
         concepts = self.txt_in(concepts)
         ############## Modify the double blocks to also return concept vectors ##############
-        ...
+        combined_concept_attention_dict = {
+            "output_space_concept_vectors": [],
+            "output_space_image_vectors": [],
+            "cross_attention_concept_vectors": [],
+            "cross_attention_image_vectors": []
+        }
         for block in self.double_blocks:
-            img, txt, concepts, ...
+            img, txt, concepts, concept_attention_dict = block(
                 img=img,
                 txt=txt,
                 vec=vec,
@@ -134,18 +138,18 @@ class ModifiedFluxDiT(nn.Module):
                 iteration=iteration,
                 joint_attention_kwargs=joint_attention_kwargs
             )
-            ...
+            for key in combined_concept_attention_dict.keys():
+                combined_concept_attention_dict[key].append(concept_attention_dict[key])

-        ...
+        for key in combined_concept_attention_dict.keys():
+            combined_concept_attention_dict[key] = torch.stack(combined_concept_attention_dict[key], dim=0)
         #####################################################################################

         img = torch.cat((txt, img), 1)

         # Speed up segmentation by not generating the full image
         if stop_after_multimodal_attentions:
-            return None, ...
+            return None, combined_concept_attention_dict

         # Do the single blocks now
         for block in self.single_blocks:
@@ -154,4 +158,5 @@ class ModifiedFluxDiT(nn.Module):
         img = img[:, txt.shape[1] :, ...]

         img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
-        ...
+
+        return img, combined_concept_attention_dict
concept_attention/segmentation.py
CHANGED
@@ -125,6 +125,7 @@ def encode_image(
     Encodes a PIL image to the VAE latent space and adds noise to it
     """
     if isinstance(image, PIL.Image.Image):
+        image = image.convert("RGB")
         transform = transforms.Compose([
             transforms.ToTensor(),
             transforms.Lambda(lambda x: 2.0 * x - 1.0),
experiments/test_image_encoder/test_encode_image.py
ADDED
@@ -0,0 +1,20 @@
+from concept_attention.concept_attention_pipeline import ConceptAttentionFluxPipeline
+from PIL import Image
+
+if __name__ == "__main__":
+    pipeline = ConceptAttentionFluxPipeline(
+        model_name="flux-schnell",
+        offload_model=True
+    ) # , device="cuda:0") # , offload_model=True)
+
+    image = Image.open("image.png").convert("RGB")
+
+    outputs = pipeline.encode_image(
+        image,
+        concepts=["animal", "background"]
+    )
+    concept_attention_maps = outputs.concept_heatmaps
+
+    concepts = ["animal", "background"]
+    for concept, attention_map in zip(concepts, concept_attention_maps):
+        attention_map.save(f"{concept}_attention_map.png")