Spaces:

herokeyboard369
/

olmOCR

Build error

App Files Files Community

herokeyboard369 committed 10 days ago

Commit

b8577b9

verified ·

1 Parent(s): 7205e30

Create app.py

Browse files

Files changed (1) hide show

app.py +73 -0

app.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import torch
+import base64
+import urllib.request
+import gradio as gr
+from io import BytesIO
+from PIL import Image
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_finetuning_prompt
+from olmocr.prompts.anchor import get_anchor_text
+# Initialize the model
+model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+# Function to process PDF and generate text
+def process_pdf(pdf_file):
+    pdf_filename = pdf_file.name
+    image_base64 = render_pdf_to_base64png(pdf_filename, 1, target_longest_image_dim=1024)
+    anchor_text = get_anchor_text(pdf_filename, 1, pdf_engine="pdfreport", target_length=4000)
+    prompt = build_finetuning_prompt(anchor_text)
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+    inputs = processor(
+        text=[text],
+        images=[main_image],
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = {key: value.to(device) for (key, value) in inputs.items()}
+    output = model.generate(
+        **inputs,
+        temperature=0.8,
+        max_new_tokens=1500,
+        num_return_sequences=1,
+        do_sample=True,
+    )
+    prompt_length = inputs["input_ids"].shape[1]
+    new_tokens = output[:, prompt_length:]
+    text_output = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+    return text_output[0]
+# Create Gradio Interface
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=gr.File(label="Upload PDF"),
+    outputs=gr.Textbox(label="Extracted Text"),
+    title="PDF Text Extractor",
+    description="Upload a PDF file and extract text using Qwen2-VL-7B-Instruct."
+)
+# Launch the Gradio app
+if __name__ == "__main__":
+    iface.launch()