Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -9,10 +9,6 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
|
|
9 |
from ultralytics import YOLO
|
10 |
from prompts import front, back # prompts.py should define front and back as multiline strings
|
11 |
|
12 |
-
from olmocr.prompts import build_finetuning_prompt
|
13 |
-
from olmocr.prompts.anchor import get_anchor_text
|
14 |
-
|
15 |
-
|
16 |
# Load the OCR model and processor once
|
17 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
18 |
ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
|
@@ -66,52 +62,25 @@ def process_image(input_image):
|
|
66 |
max_size = (640, 640) # Further reduced from 800x800
|
67 |
cropped_image.thumbnail(max_size, Image.LANCZOS)
|
68 |
|
69 |
-
#
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
|
77 |
# Step 5: Convert cropped image to base64 for the message
|
78 |
buffered = BytesIO()
|
79 |
cropped_image.save(buffered, format="PNG")
|
80 |
cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
81 |
|
82 |
-
# # Build the message in the expected format for the OCR processor
|
83 |
-
# messages = [
|
84 |
-
# {
|
85 |
-
# "role": "user",
|
86 |
-
# "content": [
|
87 |
-
# {"type": "text", "text": doc_prompt},
|
88 |
-
# {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
|
89 |
-
# ],
|
90 |
-
# }
|
91 |
-
# ]
|
92 |
-
# text_prompt = ocr_processor.apply_chat_template(
|
93 |
-
# messages, tokenize=False, add_generation_prompt=True
|
94 |
-
# )
|
95 |
-
|
96 |
-
# # Step 6: Prepare inputs and run the OCR model
|
97 |
-
# inputs = ocr_processor(
|
98 |
-
# text=[text_prompt],
|
99 |
-
# images=[cropped_image],
|
100 |
-
# padding=True,
|
101 |
-
# return_tensors="pt",
|
102 |
-
# )
|
103 |
-
# inputs = {k: v.to(device) for k, v in inputs.items()}
|
104 |
-
|
105 |
-
|
106 |
-
anchor_text = extract_anchor_text_from_image(cropped_image) # You'll need to implement this
|
107 |
-
prompt = build_finetuning_prompt(anchor_text)
|
108 |
-
|
109 |
# Build the message in the expected format for the OCR processor
|
110 |
messages = [
|
111 |
{
|
112 |
"role": "user",
|
113 |
"content": [
|
114 |
-
{"type": "text", "text":
|
115 |
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
|
116 |
],
|
117 |
}
|
@@ -120,7 +89,7 @@ def process_image(input_image):
|
|
120 |
messages, tokenize=False, add_generation_prompt=True
|
121 |
)
|
122 |
|
123 |
-
#
|
124 |
inputs = ocr_processor(
|
125 |
text=[text_prompt],
|
126 |
images=[cropped_image],
|
@@ -143,7 +112,7 @@ def process_image(input_image):
|
|
143 |
output = model.generate(
|
144 |
**inputs,
|
145 |
temperature=0.2,
|
146 |
-
max_new_tokens=
|
147 |
num_return_sequences=1,
|
148 |
do_sample=True,
|
149 |
)
|
|
|
9 |
from ultralytics import YOLO
|
10 |
from prompts import front, back # prompts.py should define front and back as multiline strings
|
11 |
|
|
|
|
|
|
|
|
|
12 |
# Load the OCR model and processor once
|
13 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
14 |
ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
|
62 |
max_size = (640, 640) # Further reduced from 800x800
|
63 |
cropped_image.thumbnail(max_size, Image.LANCZOS)
|
64 |
|
65 |
+
# Select the corresponding OCR prompt based on the YOLO label
|
66 |
+
if label.lower() == "front":
|
67 |
+
doc_prompt = front
|
68 |
+
elif label.lower() == "back":
|
69 |
+
doc_prompt = back
|
70 |
+
else:
|
71 |
+
doc_prompt = front # Default to front if unexpected label
|
72 |
|
73 |
# Step 5: Convert cropped image to base64 for the message
|
74 |
buffered = BytesIO()
|
75 |
cropped_image.save(buffered, format="PNG")
|
76 |
cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
# Build the message in the expected format for the OCR processor
|
79 |
messages = [
|
80 |
{
|
81 |
"role": "user",
|
82 |
"content": [
|
83 |
+
{"type": "text", "text": doc_prompt},
|
84 |
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
|
85 |
],
|
86 |
}
|
|
|
89 |
messages, tokenize=False, add_generation_prompt=True
|
90 |
)
|
91 |
|
92 |
+
# Step 6: Prepare inputs and run the OCR model
|
93 |
inputs = ocr_processor(
|
94 |
text=[text_prompt],
|
95 |
images=[cropped_image],
|
|
|
112 |
output = model.generate(
|
113 |
**inputs,
|
114 |
temperature=0.2,
|
115 |
+
max_new_tokens=1024,
|
116 |
num_return_sequences=1,
|
117 |
do_sample=True,
|
118 |
)
|