|
import gradio as gr |
|
import base64 |
|
from groq import Groq |
|
from io import BytesIO |
|
from dotenv import load_dotenv |
|
import os |
|
|
|
# Load variables from a local .env file into the process environment so
# the Groq API key does not have to be hard-coded in the source.
load_dotenv()


# Groq API key read from the environment; None if API_KEY is unset
# (the Groq client below will then fail at request time).
api_key = os.getenv("API_KEY")
|
|
|
|
|
def encode_image(image):
    """
    Convert a PIL Image to a base64-encoded JPEG string.

    Args:
        image: A PIL Image in any mode. Modes JPEG cannot store
            (e.g. RGBA from a transparent PNG, or palette mode P)
            are converted to RGB before encoding.

    Returns:
        The JPEG bytes of the image, base64-encoded as a UTF-8 string.
    """
    # JPEG has no alpha/palette support: saving an RGBA or P image raises
    # "OSError: cannot write mode ... as JPEG", so normalize to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
|
|
|
|
client = Groq(api_key=api_key) |
|
|
|
def vqa_function(image, question):
    """
    Answer a user question about an uploaded image via a Groq vision model.

    Args:
        image: Uploaded image (PIL format), or None if the user submitted
            without uploading one.
        question: User-provided question about the image.

    Returns:
        The model's answer as a string; a short validation message when an
        input is missing; or an "Error: ..." string if the API call fails.
    """
    # Validate inputs up front so the user sees a clear message instead of
    # a raw exception string (e.g. an AttributeError from encode_image
    # when image is None).
    if image is None:
        return "Please upload an image first."
    if not question or not question.strip():
        return "Please enter a question about the image."

    try:
        base64_image = encode_image(image)

        # Send the question and the image (inlined as a data URL) in a
        # single multimodal user message.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            # NOTE(review): preview vision models are periodically retired
            # on Groq — confirm this ID against the current model list.
            model="llama-3.2-11b-vision-preview",
        )

        return chat_completion.choices[0].message.content
    except Exception as e:
        # Broad catch at the UI boundary: surface the failure to the user
        # in the output textbox rather than crashing the Gradio app.
        return f"Error: {str(e)}"
|
|
|
|
|
# Gradio UI: an image uploader and a question box feeding vqa_function,
# with the model's answer shown in a read-only textbox.
image_input = gr.Image(type="pil", label="Upload Image")
text_input = gr.Textbox(label="Ask a question about the image")
output_text = gr.Textbox(label="Answer")

interface = gr.Interface(
    fn=vqa_function,
    inputs=[image_input, text_input],
    outputs=output_text,
    title="Visual Question Answering with llama model",
    description=(
        "Upload an image and ask a question. The app uses a LLAMA VISION "
        "model to analyze the image and answer your question."
    ),
)
|
|
|
|
|
# Start the Gradio server only when run as a script (not on import).
if __name__ == "__main__":

    interface.launch()