Spaces:

prithivMLmods
/

Imagineo-Chat

Running on Zero

App Files Community

prithivMLmods committed 6 days ago

Commit

de1dce3

verified ·

1 Parent(s): 554bd83

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -1

app.py CHANGED Viewed

@@ -20,6 +20,7 @@ from transformers import (
     TextIteratorStreamer,
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
 )
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
@@ -51,6 +52,16 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
@@ -188,6 +199,38 @@ def generate(
     files = input_dict.get("files", [])
     lower_text = text.lower().strip()
     # Check if the prompt is an image generation command using model flags.
     if (lower_text.startswith("@lightningv5") or
         lower_text.startswith("@lightningv4") or
@@ -345,13 +388,14 @@ demo = gr.ChatInterface(
         ['@lightningv4 A serene landscape with mountains'],
         ['@turbov3 Abstract art, colorful and vibrant'],
         ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="use the tags @lightningv5 @lightningv4 @turbov3 for image gen !"),
     stop_btn="Stop Generation",
     multimodal=True,

     TextIteratorStreamer,
     Qwen2VLForConditionalGeneration,
     AutoProcessor,
+    AutoModelForImageTextToText,  # <-- New import for aya-vision
 )
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
     torch_dtype=torch.float16
 ).to("cuda").eval()
+# --- New feature: aya-vision ---
+AYA_MODEL_ID = "CohereForAI/aya-vision-8b"
+aya_processor = AutoProcessor.from_pretrained(AYA_MODEL_ID, trust_remote_code=True)
+aya_model = AutoModelForImageTextToText.from_pretrained(
+    AYA_MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to("cuda").eval()
+# --------------------------------
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
     files = input_dict.get("files", [])
     lower_text = text.lower().strip()
+    # --- New branch for @aya-vision feature ---
+    if lower_text.startswith("@aya-vision"):
+        prompt_clean = re.sub(r"@aya-vision", "", text, flags=re.IGNORECASE).strip().strip('"')
+        if not files:
+            yield "Please provide an image for @aya-vision command."
+            return
+        image = load_image(files[0])
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt_clean},
+            ]
+        }]
+        prompt_aya = aya_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = aya_processor(text=[prompt_aya], images=[image], return_tensors="pt", padding=True).to("cuda")
+        streamer = TextIteratorStreamer(aya_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+        thread = Thread(target=aya_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield "💭 Processing @aya-vision..."
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+        return
+    # ------------------------------------------------
     # Check if the prompt is an image generation command using model flags.
     if (lower_text.startswith("@lightningv5") or
         lower_text.startswith("@lightningv4") or
         ['@lightningv4 A serene landscape with mountains'],
         ['@turbov3 Abstract art, colorful and vibrant'],
         ["@tts2 What causes rainbows to form?"],
+        ["@aya-vision Describe the content of this image"],
     ],
     cache_examples=False,
     type="messages",
     description=DESCRIPTION,
     css=css,
     fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@aya-vision for img-txt-txt / use the tags @lightningv5 @lightningv4 @turbov3 or @aya-vision for image-based commands!"),
     stop_btn="Stop Generation",
     multimodal=True,