# Source: Hugging Face Space "mohsinmubaraksk/Llama-3.2-11-b-vision"
# Space status: Running
# File size: 2,248 bytes

import gradio as gr
import base64
from groq import Groq
from io import BytesIO
from dotenv import load_dotenv
import os

# Load variables from a .env file (when one is present) into the process
# environment before the key is looked up.
load_dotenv()

# Stays None when API_KEY is not defined in the environment.
api_key = os.environ.get("API_KEY")

# Function to encode the image to base64
def encode_image(image):
    """
    Convert a PIL Image to a base64 encoded JPEG string.

    Args:
        image: PIL image (anything exposing ``mode``, ``convert()`` and
            ``save()`` works).
    Returns:
        The image serialized as JPEG, base64 encoded and returned as text.
    """
    # JPEG cannot store an alpha channel or a palette, so saving an RGBA, LA
    # or P image (typical for PNG/GIF uploads) raises OSError. Normalise
    # everything that is not already plain RGB or greyscale to RGB first.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Initialize the GROQ client once at import time. vqa_function below reuses
# this module-level instance for every request; `api_key` is the value read
# from the API_KEY environment variable above.
client = Groq(api_key=api_key)

def vqa_function(image, question):
    """
    Function to process the image and question and return the VQA answer.

    Errors are never raised to the caller: every failure is reported as a
    string starting with "Error: " so it can be shown in the output textbox.

    Args:
        image: Uploaded image (PIL format), or None when nothing was uploaded
        question: User-provided question about the image
    Returns:
        The model's response to the question, or an "Error: ..." message
    """
    # Gradio hands over None when the user submits without an image; catch
    # that here instead of surfacing an AttributeError from encode_image.
    if image is None:
        return "Error: Please upload an image."

    # A blank question would only cost an API call with nothing to answer.
    if not question or not question.strip():
        return "Error: Please enter a question about the image."

    try:
        # Encode the image as a base64 string
        base64_image = encode_image(image)

        # Create the input for the GROQ model: one user message carrying the
        # question text plus the image embedded as a data URL.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            # NOTE(review): "-preview" model ids may be retired by the
            # provider - confirm this one is still being served.
            model="llama-3.2-11b-vision-preview",
        )

        # Extract and return the response
        return chat_completion.choices[0].message.content
    except Exception as e:
        # UI boundary: report any encoding/API failure as text rather than
        # letting the exception propagate into the Gradio callback.
        return f"Error: {e}"

# Gradio Interface
# Input and output components. type="pil" requests the upload as a PIL image,
# which is the format vqa_function's docstring expects.
image_input = gr.Image(label="Upload Image", type="pil")
text_input = gr.Textbox(label="Ask a question about the image")
output_text = gr.Textbox(label="Answer")

# Wire the components to vqa_function: the two inputs are passed positionally
# as (image, question) and the returned string is shown in the answer box.
interface = gr.Interface(
    fn=vqa_function,
    inputs=[image_input, text_input],
    outputs=output_text,
    title="Visual Question Answering with llama model",
    description="Upload an image and ask a question. The app uses a LLAMA VISION model to analyze the image and answer your question."
)

# Launch the app
# Only start the server when run as a script, not when imported as a module.
if __name__ == "__main__":
    interface.launch()