herokeyboard369 committed on
Commit
b8577b9
·
verified ·
1 Parent(s): 7205e30

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import base64
3
+ import urllib.request
4
+ import gradio as gr
5
+
6
+ from io import BytesIO
7
+ from PIL import Image
8
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
+
10
+ from olmocr.data.renderpdf import render_pdf_to_base64png
11
+ from olmocr.prompts import build_finetuning_prompt
12
+ from olmocr.prompts.anchor import get_anchor_text
13
+
14
# Load the olmOCR checkpoint in bfloat16 and put it in evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16,
)
model = model.eval()

# The processor (chat template, tokenizer, image inputs) is taken from the
# Qwen2-VL-7B-Instruct repository rather than from the olmOCR checkpoint.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Use the GPU when torch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
19
+
20
+ # Function to process PDF and generate text
21
def process_pdf(pdf_file, page_num=1):
    """Transcribe one page of an uploaded PDF with the loaded model.

    The page is rendered to a PNG, anchor text is pulled from the PDF, both
    are packed into a single chat message, and the model's sampled
    continuation is decoded and returned.

    Args:
        pdf_file: The uploaded file. Either a filesystem path string, or an
            object that exposes the path as ``.name``.
        page_num: Page number forwarded to the olmocr rendering and
            anchor-text helpers. Defaults to 1, the value that used to be
            hard-coded.

    Returns:
        The decoded text of the newly generated tokens (prompt excluded).

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    # Submitting the form without a file passes None; fail with a clear
    # message instead of an AttributeError on ``.name``.
    if pdf_file is None:
        raise ValueError("No PDF file was provided; please upload a PDF.")

    # Accept both plain path strings and file-like objects carrying ``.name``.
    pdf_filename = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    image_base64 = render_pdf_to_base64png(
        pdf_filename, page_num, target_longest_image_dim=1024
    )
    anchor_text = get_anchor_text(
        pdf_filename, page_num, pdf_engine="pdfreport", target_length=4000
    )
    prompt = build_finetuning_prompt(anchor_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The same rendered page is decoded back into a PIL image for the processor.
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    # Move every input tensor to the device the model lives on.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=1500,
        num_return_sequences=1,
        do_sample=True,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(
        new_tokens, skip_special_tokens=True
    )

    return text_output[0]
61
+
62
# Build the Gradio UI: one PDF upload in, one text box out.
iface = gr.Interface(
    fn=process_pdf,
    # Only offer PDFs in the file picker; process_pdf renders the upload as a PDF.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PDF Text Extractor",
    # Name the checkpoint that is actually loaded (olmOCR) and make clear
    # that only the first page is processed.
    description=(
        "Upload a PDF file and extract the text of its first page using "
        "olmOCR-7B (built on Qwen2-VL-7B-Instruct)."
    ),
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()