import base64
import os
from io import BytesIO

import gradio as gr
from dotenv import load_dotenv
from groq import Groq

# Load the Groq API key from a local .env file so it never lives in source.
load_dotenv()
api_key = os.getenv("API_KEY")


def encode_image(image):
    """Convert a PIL Image to a base64-encoded JPEG string.

    Args:
        image: A PIL.Image instance (any mode).

    Returns:
        The image's JPEG bytes, base64-encoded and decoded to a UTF-8 str,
        suitable for embedding in a ``data:image/jpeg;base64,...`` URL.
    """
    # JPEG has no alpha channel: RGBA/P-mode uploads (e.g. transparent PNGs)
    # make Image.save(..., format="JPEG") raise, so normalize to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


# Initialize the GROQ client once at import time and reuse it per request.
client = Groq(api_key=api_key)


def vqa_function(image, question):
    """Answer a user question about an uploaded image via a Groq vision model.

    Args:
        image: Uploaded image as a PIL.Image, or None if nothing was uploaded.
        question: User-provided question about the image.

    Returns:
        The model's text answer, or a human-readable "Error: ..." string on
        failure (this function is the Gradio callback, so errors are surfaced
        in the output box rather than raised).
    """
    # Guard clauses: Gradio passes None when no image was uploaded, and an
    # empty question would waste an API call for a meaningless answer.
    if image is None:
        return "Error: please upload an image."
    if not question or not question.strip():
        return "Error: please enter a question about the image."

    try:
        # Embed the image inline as a base64 data URL next to the question.
        base64_image = encode_image(image)

        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            model="llama-3.2-11b-vision-preview",
        )

        # First choice holds the assistant's reply.
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Top-level UI boundary: report the failure (network, auth, model)
        # in the answer box instead of crashing the app.
        return f"Error: {str(e)}"


# Gradio Interface wiring: image + question in, model answer out.
image_input = gr.Image(label="Upload Image", type="pil")
text_input = gr.Textbox(label="Ask a question about the image")
output_text = gr.Textbox(label="Answer")

interface = gr.Interface(
    fn=vqa_function,
    inputs=[image_input, text_input],
    outputs=output_text,
    title="Visual Question Answering with llama model",
    description="Upload an image and ask a question. The app uses a LLAMA VISION model to analyze the image and answer your question."
)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()