ammariii08 committed on
Commit
45ec7b5
·
verified ·
1 Parent(s): 1345fd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -41
app.py CHANGED
@@ -9,10 +9,6 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
9
  from ultralytics import YOLO
10
  from prompts import front, back # prompts.py should define front and back as multiline strings
11
 
12
- from olmocr.prompts import build_finetuning_prompt
13
- from olmocr.prompts.anchor import get_anchor_text
14
-
15
-
16
  # Load the OCR model and processor once
17
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
  ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -66,52 +62,25 @@ def process_image(input_image):
66
  max_size = (640, 640) # Further reduced from 800x800
67
  cropped_image.thumbnail(max_size, Image.LANCZOS)
68
 
69
- # # Select the corresponding OCR prompt based on the YOLO label
70
- # if label.lower() == "front":
71
- # doc_prompt = front
72
- # elif label.lower() == "back":
73
- # doc_prompt = back
74
- # else:
75
- # doc_prompt = front # Default to front if unexpected label
76
 
77
  # Step 5: Convert cropped image to base64 for the message
78
  buffered = BytesIO()
79
  cropped_image.save(buffered, format="PNG")
80
  cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
81
 
82
- # # Build the message in the expected format for the OCR processor
83
- # messages = [
84
- # {
85
- # "role": "user",
86
- # "content": [
87
- # {"type": "text", "text": doc_prompt},
88
- # {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
89
- # ],
90
- # }
91
- # ]
92
- # text_prompt = ocr_processor.apply_chat_template(
93
- # messages, tokenize=False, add_generation_prompt=True
94
- # )
95
-
96
- # # Step 6: Prepare inputs and run the OCR model
97
- # inputs = ocr_processor(
98
- # text=[text_prompt],
99
- # images=[cropped_image],
100
- # padding=True,
101
- # return_tensors="pt",
102
- # )
103
- # inputs = {k: v.to(device) for k, v in inputs.items()}
104
-
105
-
106
- anchor_text = extract_anchor_text_from_image(cropped_image) # You'll need to implement this
107
- prompt = build_finetuning_prompt(anchor_text)
108
-
109
  # Build the message in the expected format for the OCR processor
110
  messages = [
111
  {
112
  "role": "user",
113
  "content": [
114
- {"type": "text", "text": prompt},
115
  {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
116
  ],
117
  }
@@ -120,7 +89,7 @@ def process_image(input_image):
120
  messages, tokenize=False, add_generation_prompt=True
121
  )
122
 
123
- # Rest of your code for processing with OCR
124
  inputs = ocr_processor(
125
  text=[text_prompt],
126
  images=[cropped_image],
@@ -143,7 +112,7 @@ def process_image(input_image):
143
  output = model.generate(
144
  **inputs,
145
  temperature=0.2,
146
- max_new_tokens=50,
147
  num_return_sequences=1,
148
  do_sample=True,
149
  )
 
9
  from ultralytics import YOLO
10
  from prompts import front, back # prompts.py should define front and back as multiline strings
11
 
 
 
 
 
12
  # Load the OCR model and processor once
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
  ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
 
62
  max_size = (640, 640) # Further reduced from 800x800
63
  cropped_image.thumbnail(max_size, Image.LANCZOS)
64
 
65
+ # Select the corresponding OCR prompt based on the YOLO label
66
+ if label.lower() == "front":
67
+ doc_prompt = front
68
+ elif label.lower() == "back":
69
+ doc_prompt = back
70
+ else:
71
+ doc_prompt = front # Default to front if unexpected label
72
 
73
  # Step 5: Convert cropped image to base64 for the message
74
  buffered = BytesIO()
75
  cropped_image.save(buffered, format="PNG")
76
  cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  # Build the message in the expected format for the OCR processor
79
  messages = [
80
  {
81
  "role": "user",
82
  "content": [
83
+ {"type": "text", "text": doc_prompt},
84
  {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
85
  ],
86
  }
 
89
  messages, tokenize=False, add_generation_prompt=True
90
  )
91
 
92
+ # Step 6: Prepare inputs and run the OCR model
93
  inputs = ocr_processor(
94
  text=[text_prompt],
95
  images=[cropped_image],
 
112
  output = model.generate(
113
  **inputs,
114
  temperature=0.2,
115
+ max_new_tokens=1024,
116
  num_return_sequences=1,
117
  do_sample=True,
118
  )