helblazer811 committed
Commit 227c367 · 1 Parent(s): ed0bb32

Fixed UI for mobile and the logic/UI for the second page.

CrossAttentionCallout.svg ADDED
app.py CHANGED
```diff
@@ -69,14 +69,27 @@ def encode_image(image, prompt, concepts, seed, layer_start_index, noise_timeste
 
         cross_attention_heatmaps = pipeline_output.cross_attention_maps
         cross_attention_heatmaps = [heatmap.resize((IMG_SIZE, IMG_SIZE), resample=Image.NEAREST) for heatmap in cross_attention_heatmaps]
-        cross_attention_maps_and_labels = [(cross_attention_heatmaps[concept_index], concepts[concept_index]) for concept_index in range(len(concepts))]
+        cross_attention_maps_and_labels = []
+        prompt_tokens = prompt.split()
+        for concept_index in range(len(concepts)):
+            concept = concepts[concept_index]
+            if concept in prompt_tokens:
+                cross_attention_maps_and_labels.append(
+                    (cross_attention_heatmaps[concept_index], concept)
+                )
+            else:
+                # Exclude this concept because it is only generated due to ConceptAttention's causal attention mechanism
+                empty_image = Image.new("RGB", (IMG_SIZE, IMG_SIZE), (39, 39, 42))
+                cross_attention_maps_and_labels.append(
+                    (empty_image, concept)
+                )
 
         return output_image, \
             gr.update(value=output_space_maps_and_labels, columns=len(output_space_maps_and_labels)), \
             gr.update(value=cross_attention_maps_and_labels, columns=len(cross_attention_maps_and_labels))
 
     except gr.Error as e:
-        return None, gr.update(value=[], columns=1), gr.update(value=[], columns=1)
+        return None, gr.update(value=[], columns=1)  # , gr.update(value=[], columns=1)
 
 
 @spaces.GPU(duration=60)
@@ -116,7 +129,20 @@ def generate_image(prompt, concepts, seed, layer_start_index, timestep_start_ind
 
         cross_attention_heatmaps = pipeline_output.cross_attention_maps
         cross_attention_heatmaps = [heatmap.resize((IMG_SIZE, IMG_SIZE), resample=Image.NEAREST) for heatmap in cross_attention_heatmaps]
-        cross_attention_maps_and_labels = [(cross_attention_heatmaps[concept_index], concepts[concept_index]) for concept_index in range(len(concepts))]
+        cross_attention_maps_and_labels = []
+        prompt_tokens = prompt.split()
+        for concept_index in range(len(concepts)):
+            concept = concepts[concept_index]
+            if concept in prompt_tokens:
+                cross_attention_maps_and_labels.append(
+                    (cross_attention_heatmaps[concept_index], concept)
+                )
+            else:
+                # Exclude this concept because it is only generated due to ConceptAttention's causal attention mechanism
+                empty_image = Image.new("RGB", (IMG_SIZE, IMG_SIZE), (39, 39, 42))
+                cross_attention_maps_and_labels.append(
+                    (empty_image, concept)
+                )
 
         return output_image, \
             gr.update(value=output_space_maps_and_labels, columns=len(output_space_maps_and_labels)), \
@@ -145,11 +171,7 @@ with gr.Blocks(
         .input {
             height: 47px;
         }
-        .input-column {
-            flex-direction: column;
-            gap: 0px;
-            height: 100%;
-        }
+
         .input-column-label {}
         .gallery {
             height: 220px;
@@ -162,52 +184,49 @@ with gr.Blocks(
            scrollbar-width: thin;
            scrollbar-color: grey black;
        }
-
-        /* Show only on screens wider than 768px (adjust as needed
-        @media (min-width: 1024px) {
-            .svg-container {
-                min-width: 150px;
-                width: 200px;
-                padding-top: 540px;
-            }
-        }
 
        @media (min-width: 1280px) {
-            .svg-container {
-                min-width: 200px;
-                width: 300px;
-                padding-top: 420px;
-            }
-        }
-        @media (min-width: 1530px) {
-            .svg-container {
-                min-width: 200px;
-                width: 300px;
-                padding-top: 400px;
-            }
-        }
-
-        */
-
-        @media (min-width: 1024px) {
            .svg-container {
                min-width: 250px;
+                display: flex;
+                flex-direction: column;
+                padding-top: 340px;
            }
-            #concept-attention-callout-svg {
+            .callout {
                width: 250px;
            }
+            .input-row {
+                height: 100px;
+            }
+            .input-column {
+                flex-direction: column;
+                gap: 0px;
+                height: 100%;
+            }
        }
 
-
-        @media (max-width: 1024px) {
+        @media (max-width: 1280px) {
            .svg-container {
                display: none !important;
            }
-            #concept-attention-callout-svg {
+            .callout {
                display: none;
            }
        }
 
+        /*
+        @media (max-width: 1024px) {
+            .svg-container {
+                display: none !important;
+                display: flex;
+                flex-direction: column;
+            }
+            .callout {
+                display: none;
+            }
+        }
+        */
+
        .header {
            display: flex;
            flex-direction: column;
@@ -241,11 +260,6 @@ with gr.Blocks(
            text-decoration: none;
        }
 
-        .svg-container {
-            display: flex;
-            justify-content: center;
-            align-items: center;
-        }
 
        .caption-label {
            font-size: 1.15em;
@@ -415,8 +429,7 @@ with gr.Blocks(
                    elem_classes="input"
                )
 
-            with gr.Row(elem_classes="gallery-container", scale=8):
-
+            with gr.Row(elem_classes="gallery-container", scale=8, equal_height=True):
                with gr.Column(scale=1, min_width=250):
                    input_image = gr.Image(
                        elem_classes="generated-image",
@@ -424,9 +437,10 @@ with gr.Blocks(
                        interactive=True,
                        type="pil",
                        image_mode="RGB",
+                        scale=1
                    )
 
-                with gr.Column(scale=4):
+                with gr.Column(scale=2):
                    concept_attention_gallery = gr.Gallery(
                        label="Concept Attention (Ours)",
                        show_label=True,
@@ -438,7 +452,6 @@ with gr.Blocks(
                        elem_id="concept-attention-gallery",
                        # scale=4
                    )
-
                    cross_attention_gallery = gr.Gallery(
                        label="Cross Attention",
                        show_label=True,
@@ -476,7 +489,11 @@ with gr.Blocks(
 
            with gr.Row(scale=4, elem_classes="svg-container"):
                concept_attention_callout_svg = gr.HTML(
-                    "<img src='/gradio_api/file=ConceptAttentionCallout.svg' id='concept-attention-callout-svg'/>",
+                    "<img src='/gradio_api/file=ConceptAttentionCallout.svg' class='callout'/>",
+                    # container=False,
+                )
+                cross_attention_callout_svg = gr.HTML(
+                    "<img src='/gradio_api/file=CrossAttentionCallout.svg' class='callout'/>",
                    # container=False,
                )
 
```
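The substantive logic change in app.py is the filtering shown above: a cross-attention heatmap is kept only for a concept that appears among the whitespace-split prompt tokens; every other concept gets a dark placeholder tile, since (per the in-code comment) its map only exists because of ConceptAttention's attention mechanism. A minimal standalone sketch of that behavior, where the helper name and the standalone `IMG_SIZE` constant are illustrative and not part of the repo:

```python
from PIL import Image

IMG_SIZE = 1024  # stand-in; app.py defines its own IMG_SIZE

def pair_heatmaps_with_concepts(heatmaps, concepts, prompt, img_size=IMG_SIZE):
    """Hypothetical helper mirroring the loop above: keep a concept's cross-attention
    heatmap only when the concept is a whitespace-delimited token of the prompt,
    otherwise substitute a dark placeholder tile."""
    prompt_tokens = prompt.split()
    placeholder = Image.new("RGB", (img_size, img_size), (39, 39, 42))  # same dark fill as the commit
    return [
        (heatmap if concept in prompt_tokens else placeholder, concept)
        for heatmap, concept in zip(heatmaps, concepts)
    ]
```

Two things to note about the committed code, not the sketch: the token match is exact and case-sensitive, so "dog" will not match "dog." or "Dog" in the prompt; and the success path still returns three values while the `except` branch now returns only two, which only works if the event handler is wired accordingly.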
 
concept_attention/concept_attention_pipeline.py CHANGED
```diff
@@ -29,13 +29,11 @@ def compute_heatmaps_from_vectors(
     layer_indices: list[int],
     timesteps: list[int] = list(range(4)),
     softmax: bool = True,
-    normalize_concepts: bool = True
+    normalize_concepts: bool = False
 ):
     """
     Accepts image vectors and concept vectors. These can be from cross attentions or attention outputs.
     """
-    print(f"Image vectors shape: {image_vectors.shape}")
-    print(f"Concept vectors shape: {concept_vectors.shape}")
     # Check if there are heads in the input
     if len(image_vectors.shape) == 6:
         # Collapse the had dimension
@@ -139,6 +137,25 @@ class ConceptAttentionFluxPipeline():
             guidance=guidance,
         )
 
+        # # cross_attention_maps = concept_attention_dict["cross_attention_maps"]
+        # # Apply softmax
+        # if softmax:
+        #     cross_attention_maps = torch.nn.functional.softmax(cross_attention_maps, dim=-2)
+        # # Pull out the timesteps and layers
+        # cross_attention_maps = cross_attention_maps[timesteps]
+        # cross_attention_maps = cross_attention_maps[:, layer_indices]
+        # # Average over time, had, and layers
+        # cross_attention_maps = einops.reduce(
+        #     cross_attention_maps,
+        #     "time layers batch head concepts patches -> batch concepts patches",
+        #     reduction="mean"
+        # )
+        # cross_attention_maps = einops.rearrange(
+        #     cross_attention_maps,
+        #     "batch concepts (h w) -> batch concepts h w",
+        #     h=64,
+        #     w=64
+        # )
         cross_attention_maps = compute_heatmaps_from_vectors(
             concept_attention_dict["cross_attention_image_vectors"],
             concept_attention_dict["cross_attention_concept_vectors"],
@@ -146,6 +163,7 @@ class ConceptAttentionFluxPipeline():
             timesteps=timesteps,
             softmax=softmax
         )
+        # Compute concept the heatmaps
         concept_heatmaps = compute_heatmaps_from_vectors(
             concept_attention_dict["output_space_image_vectors"],
             concept_attention_dict["output_space_concept_vectors"],
@@ -223,8 +241,9 @@ class ConceptAttentionFluxPipeline():
         combined_concept_attention_dict = {
             "cross_attention_image_vectors": [],
             "cross_attention_concept_vectors": [],
+            # "cross_attention_maps": [],
             "output_space_image_vectors": [],
-            "output_space_concept_vectors": []
+            "output_space_concept_vectors": [],
         }
         print("Sampling")
         for i in tqdm(range(num_samples)):
@@ -307,6 +326,26 @@ class ConceptAttentionFluxPipeline():
             softmax=softmax
         )
 
+        # cross_attention_maps = concept_attention_dict["cross_attention_maps"]
+        # # Apply softmax
+        # if softmax:
+        #     cross_attention_maps = torch.nn.functional.softmax(cross_attention_maps, dim=-2)
+        # # Pull out the timesteps and layers
+        # cross_attention_maps = cross_attention_maps[timesteps]
+        # cross_attention_maps = cross_attention_maps[:, layer_indices]
+        # # Average over time, had, and layers
+        # cross_attention_maps = einops.reduce(
+        #     cross_attention_maps,
+        #     "time layers batch head concepts patches -> batch concepts patches",
+        #     reduction="mean"
+        # )
+        # cross_attention_maps = einops.rearrange(
+        #     cross_attention_maps,
+        #     "batch concepts (h w) -> batch concepts h w",
+        #     h=64,
+        #     w=64
+        # )
+
         # # Pull out the concept and image vectors from each block
         # image_vectors = torch.stack(self.flux_generator.model.image_vectors).squeeze(1)
         # concept_vectors = torch.stack(self.flux_generator.model.concept_vectors).squeeze(1)
```
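Both pipeline paths now derive cross-attention maps from the stored query vectors via `compute_heatmaps_from_vectors` rather than from a precollected `cross_attention_maps` tensor (the retained comment blocks document the older route), and the `normalize_concepts` default flips from `True` to `False`. For orientation only, here is a rough sketch of the kind of reduction the commented-out block describes; it is not the repository's implementation, and the `(time, layers, batch, tokens, dim)` layout is an assumption read off the einops patterns above:

```python
import torch
import einops

def sketch_heatmaps_from_vectors(image_vectors, concept_vectors, layer_indices,
                                 timesteps=list(range(4)), softmax=True):
    # Dot product of every concept vector with every image-patch vector
    maps = einops.einsum(
        concept_vectors,
        image_vectors,
        "time layers batch concepts dim, time layers batch patches dim -> time layers batch concepts patches",
    )
    if softmax:
        # Normalize over the concept axis (dim=-2), as in the commented-out block
        maps = torch.nn.functional.softmax(maps, dim=-2)
    # Keep only the requested timesteps and layers, then average them away
    maps = maps[timesteps][:, layer_indices]
    maps = einops.reduce(maps, "time layers batch concepts patches -> batch concepts patches", "mean")
    # Fold the flattened patch axis back into a spatial grid (h=64, w=64, matching the comments above)
    return einops.rearrange(maps, "batch concepts (h w) -> batch concepts h w", h=64, w=64)
```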
concept_attention/flux/src/flux/sampling.py CHANGED
```diff
@@ -114,6 +114,7 @@ def denoise(
     combined_concept_attention_dict = {
         "output_space_concept_vectors": [],
         "output_space_image_vectors": [],
+        # "cross_attention_maps": [],
         "cross_attention_concept_vectors": [],
         "cross_attention_image_vectors": [],
     }
```
concept_attention/modified_double_stream_block.py CHANGED
```diff
@@ -4,7 +4,6 @@ from torch import nn, Tensor
 import einops
 import math
 import torch.nn.functional as F
-import matplotlib.pyplot as plt
 
 from concept_attention.flux.src.flux.modules.layers import Modulation, SelfAttention
 from concept_attention.flux.src.flux.math import apply_rope
@@ -167,7 +166,6 @@ class ModifiedDoubleStreamBlock(nn.Module):
         )
         # Separate the concept and image attentions
         concept_attn = concept_image_attn[:, :, :concepts.shape[1]]
-
         # Rearrange the attention tensors
         txt_attn = einops.rearrange(txt_attn, "B H L D -> B L (H D)")
         if joint_attention_kwargs is not None and joint_attention_kwargs.get("keep_head_dim", False):
@@ -177,26 +175,20 @@ class ModifiedDoubleStreamBlock(nn.Module):
         concept_attn = einops.rearrange(concept_attn, "B H L D -> B L (H D)")
         img_attn = einops.rearrange(img_attn, "B H L D -> B L (H D)")
 
-        concept_attention_dict = {
-            "output_space_concept_vectors": concept_attn,
-            "output_space_image_vectors": img_attn,
-            "cross_attention_concept_vectors": concept_q,
-            "cross_attention_image_vectors": img_q
-        }
-
         # # Compute the cross attentions
         # cross_attention_maps = einops.einsum(
         #     concept_q,
         #     img_q,
         #     "batch head concepts dim, batch had patches dim -> batch head concepts patches"
         # )
-        # cross_attention_maps = einops.reduce(cross_attention_maps, "batch head concepts patches -> batch concepts patches", reduction="mean")
-        # # Compute the concept attentions
-        # concept_attention_maps = einops.einsum(
-        #     concept_attn,
-        #     img_attn,
-        #     "batch concepts dim, batch patches dim -> batch concepts patches"
-        # )
+        # Collect all of the concept attention information
+        concept_attention_dict = {
+            "output_space_concept_vectors": concept_attn.detach(),
+            "output_space_image_vectors": img_attn.detach(),
+            # "cross_attention_maps": cross_attention_maps.detach(),
+            "cross_attention_concept_vectors": concept_q.detach(),
+            "cross_attention_image_vectors": img_q.detach()
+        }
         # Do the block updates
         # Calculate the img blocks
         img = img + img_mod1.gate * self.img_attn.proj(img_attn)
```
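The behavioral change in this block, beyond dropping the unused matplotlib import and moving the dict construction, is that the stashed tensors are now `.detach()`ed before being collected into `concept_attention_dict`. Detaching ensures the stored copies carry no autograd graph, so accumulating them across every double-stream block and timestep cannot pin intermediate activations in memory. A minimal, generic illustration of the pattern (not code from the repo):

```python
import torch

x = torch.randn(2, 4, requires_grad=True)
y = (x * 2).sum(dim=-1)            # y is attached to the autograd graph

stash = {"vectors": y.detach()}    # stored copy shares data but drops the graph reference
assert stash["vectors"].requires_grad is False
```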
concept_attention/modified_flux_dit.py CHANGED
```diff
@@ -122,8 +122,9 @@ class ModifiedFluxDiT(nn.Module):
         combined_concept_attention_dict = {
             "output_space_concept_vectors": [],
             "output_space_image_vectors": [],
+            # "cross_attention_maps": [],
             "cross_attention_concept_vectors": [],
-            "cross_attention_image_vectors": []
+            "cross_attention_image_vectors": [],
         }
         for block in self.double_blocks:
             img, txt, concepts, concept_attention_dict = block(
```