Spaces:

ammariii08
/

dxf_test

Running

App Files Community

ammariii08 committed 10 days ago

Commit

45ec7b5

verified ·

1 Parent(s): 1345fd5

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -41

app.py CHANGED Viewed

@@ -9,10 +9,6 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from ultralytics import YOLO
 from prompts import front, back  # prompts.py should define front and back as multiline strings
-from olmocr.prompts import build_finetuning_prompt
-from olmocr.prompts.anchor import get_anchor_text
 # Load the OCR model and processor once
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -66,52 +62,25 @@ def process_image(input_image):
     max_size = (640, 640)  # Further reduced from 800x800
     cropped_image.thumbnail(max_size, Image.LANCZOS)
-    # # Select the corresponding OCR prompt based on the YOLO label
-    # if label.lower() == "front":
-    #     doc_prompt = front
-    # elif label.lower() == "back":
-    #     doc_prompt = back
-    # else:
-    #     doc_prompt = front  # Default to front if unexpected label
     # Step 5: Convert cropped image to base64 for the message
     buffered = BytesIO()
     cropped_image.save(buffered, format="PNG")
     cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
-    # # Build the message in the expected format for the OCR processor
-    # messages = [
-    #     {
-    #         "role": "user",
-    #         "content": [
-    #             {"type": "text", "text": doc_prompt},
-    #             {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
-    #         ],
-    #     }
-    # ]
-    # text_prompt = ocr_processor.apply_chat_template(
-    #     messages, tokenize=False, add_generation_prompt=True
-    # )
-    # # Step 6: Prepare inputs and run the OCR model
-    # inputs = ocr_processor(
-    #     text=[text_prompt],
-    #     images=[cropped_image],
-    #     padding=True,
-    #     return_tensors="pt",
-    # )
-    # inputs = {k: v.to(device) for k, v in inputs.items()}
-    anchor_text = extract_anchor_text_from_image(cropped_image)  # You'll need to implement this
-    prompt = build_finetuning_prompt(anchor_text)
     # Build the message in the expected format for the OCR processor
     messages = [
         {
             "role": "user",
             "content": [
-                {"type": "text", "text": prompt},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
             ],
         }
@@ -120,7 +89,7 @@ def process_image(input_image):
         messages, tokenize=False, add_generation_prompt=True
     )
-    # Rest of your code for processing with OCR
     inputs = ocr_processor(
         text=[text_prompt],
         images=[cropped_image],
@@ -143,7 +112,7 @@ def process_image(input_image):
     output = model.generate(
             **inputs,
             temperature=0.2,
-            max_new_tokens=50,
             num_return_sequences=1,
             do_sample=True,
         )

 from ultralytics import YOLO
 from prompts import front, back  # prompts.py should define front and back as multiline strings
 # Load the OCR model and processor once
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
     max_size = (640, 640)  # Further reduced from 800x800
     cropped_image.thumbnail(max_size, Image.LANCZOS)
+    # Select the corresponding OCR prompt based on the YOLO label
+    if label.lower() == "front":
+        doc_prompt = front
+    elif label.lower() == "back":
+        doc_prompt = back
+    else:
+        doc_prompt = front  # Default to front if unexpected label
     # Step 5: Convert cropped image to base64 for the message
     buffered = BytesIO()
     cropped_image.save(buffered, format="PNG")
     cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
     # Build the message in the expected format for the OCR processor
     messages = [
         {
             "role": "user",
             "content": [
+                {"type": "text", "text": doc_prompt},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
             ],
         }
         messages, tokenize=False, add_generation_prompt=True
     )
+    # Step 6: Prepare inputs and run the OCR model
     inputs = ocr_processor(
         text=[text_prompt],
         images=[cropped_image],
     output = model.generate(
             **inputs,
             temperature=0.2,
+            max_new_tokens=1024,
             num_return_sequences=1,
             do_sample=True,
         )