Spaces:

ZennyKenny
/

note-to-text

Running on Zero

File size: 2,961 Bytes

0fd8a0b
44342ba
fd11c5a
0fd8a0b
fb1de0f
44342ba
0fd8a0b
 
 
 
 
 
 
44342ba
fb1de0f
0fd8a0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471fe5d
44342ba
0fd8a0b
471fe5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44342ba
 
0fd8a0b

from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import gradio as gr
import spaces

# Hub identifier of the vision-language checkpoint; the model weights and the
# processor are both loaded from this same ID.
ocr = "unsloth/Llama-3.2-11B-Vision-Instruct"

# Load the weights in bfloat16, then move the model to the GPU as a separate step.
model = MllamaForConditionalGeneration.from_pretrained(ocr, torch_dtype=torch.bfloat16)
model = model.to("cuda")

# The processor builds the chat prompt and prepares text + image tensors.
processor = AutoProcessor.from_pretrained(ocr)

@spaces.GPU
def extract_text(image):
    """Transcribe the handwritten text found in an uploaded image.

    Args:
        image: Path to the image file (the interface passes a filepath), or
            ``None`` when nothing was uploaded.

    Returns:
        The text generated by the model, with surrounding whitespace stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    # Open inside a context manager so the file handle is released as soon as
    # the RGB copy exists, instead of waiting for garbage collection.
    with Image.open(image) as source:
        rgb_image = source.convert("RGB")

    # Create message structure: one user turn holding the instruction and an
    # image placeholder.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Extract handwritten text from the image and output only the extracted text without any additional description or commentary in output"},
                {"type": "image"}
            ]
        }
    ]

    # Process input
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=texts, images=[rgb_image], return_tensors="pt").to("cuda")

    # Generate output
    outputs = model.generate(**inputs, max_new_tokens=250)

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them rather than scrubbing "user"/"assistant"/the prompt out
    # of the decoded string, which would also delete those words when they
    # legitimately appear in the handwritten text.
    prompt_length = inputs["input_ids"].shape[-1]
    result = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    print(result)

    return result

# Gradio front end: a single image upload in, a single text box out.
_upload_input = gr.Image(type="filepath", label="📷 Upload Image")
_text_output = gr.Textbox(label="📝 Extracted Text")

# HTML welcome/instructions panel passed as the interface description.
_description_html = """<div style="background-color: #f0f8ff; padding: 20px; border-radius: 10px;">
                    <h2 style="color: #333;">✨ Welcome to the Handwritten Text Extractor! ✨</h2>
                    <p style="color: #555;">Upload an image containing handwritten text, and let the magic happen! 🎩✨</p>
                    <p style="color: #555;">📌 <strong>Instructions:</strong></p>
                    <ul style="color: #555;">
                        <li>Click on the "Upload Image" button to select your image.</li>
                        <li>Wait a few seconds while the model processes the image.</li>
                        <li>Voilà! Your extracted text will appear below. 🎉</li>
                    </ul>
                    <p style="color: #555;">🖼️ <strong>Note:</strong> For best results, use clear and well-lit images.</p>
                </div>"""

# Wire extract_text to the components above; flagging is turned off.
note = gr.Interface(
    fn=extract_text,
    inputs=_upload_input,
    outputs=_text_output,
    title="🖋️ Handwritten Text Extractor 🖋️",
    description=_description_html,
    theme="soft",
    allow_flagging="never",
)

# Start the app with debug mode enabled.
note.launch(debug=True)