Spaces:

ammariii08
/

dxf_test

Running

App Files Community

ammariii08 committed 10 days ago

Commit

757bd2d

verified ·

1 Parent(s): 92b6d26

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -65

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import torch
 import base64
 import numpy as np
 from io import BytesIO
 from PIL import Image, ImageEnhance
@@ -7,7 +8,9 @@ import gradio as gr
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from ultralytics import YOLO
-from prompts import front, back  # prompts.py should define front and back as multiline strings
 # Load the OCR model and processor once
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -21,66 +24,60 @@ yolo_model = YOLO("best.pt")
 def process_image(input_image):
     """
-    1. Preprocess the input image.
-    2. Run YOLO detection with a confidence threshold of 0.85.
-    3. Crop the image according to the detected bounding box.
-    4. Choose the corresponding prompt from prompts.py based on the label.
-    5. Convert the cropped image to base64 and build the OCR prompt.
-    6. Run the OCR model to extract text.
-    7. Return the cropped (preprocessed) image and the extracted text.
     """
-    # Step 1: Enhance the image (sharpness, contrast, brightness)
     enhanced_image = ImageEnhance.Sharpness(input_image).enhance(2.0)
     enhanced_image = ImageEnhance.Contrast(enhanced_image).enhance(1.5)
     enhanced_image = ImageEnhance.Brightness(enhanced_image).enhance(0.8)
-    # Step 2: Run YOLO detection using ultralytics with confidence threshold = 0.85
     image_np = np.array(enhanced_image)
     results = yolo_model.predict(source=image_np, conf=0.85)
     result = results[0]
-    # If no boxes detected, return the enhanced image with an error message.
     if len(result.boxes) == 0:
         return enhanced_image, "No document detected by YOLO."
-    # Step 3: Select the detection with the highest confidence
     boxes = result.boxes
-    confidences = boxes.conf.cpu().numpy()  # convert tensor to numpy array
     best_index = int(confidences.argmax())
     best_box = boxes.xyxy[best_index].cpu().numpy().tolist()  # [xmin, ymin, xmax, ymax]
     xmin, ymin, xmax, ymax = map(int, best_box)
-    # Retrieve the detected label using the model's names mapping
-    class_idx = int(boxes.cls[best_index].item())
-    label = yolo_model.names[class_idx]
-    # Step 4: Crop the image using the bounding box
     cropped_image = enhanced_image.crop((xmin, ymin, xmax, ymax))
-    # OPTIMIZATION: Resize the image to reduce processing time
-    # Calculate aspect ratio to maintain proportions
-    max_size = (640, 640)  # Further reduced from 800x800
     cropped_image.thumbnail(max_size, Image.LANCZOS)
-    # Select the corresponding OCR prompt based on the YOLO label
-    if label.lower() == "front":
-        doc_prompt = front
-    elif label.lower() == "back":
-        doc_prompt = back
-    else:
-        doc_prompt = front  # Default to front if unexpected label
-    # Step 5: Convert cropped image to base64 for the message
     buffered = BytesIO()
     cropped_image.save(buffered, format="PNG")
     cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
-    # Build the message in the expected format for the OCR processor
     messages = [
         {
             "role": "user",
             "content": [
-                {"type": "text", "text": doc_prompt},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
             ],
         }
@@ -98,43 +95,20 @@ def process_image(input_image):
     )
     inputs = {k: v.to(device) for k, v in inputs.items()}
-    # FIXED: Generation parameters with proper combinations to avoid warnings
-    # Choose one of these two approaches:
-    # Approach 1: Greedy decoding (fastest)
-    # output = ocr_model.generate(
-    #     **inputs,
-    #     max_new_tokens=40,
-    #     num_beams=1,
-    #     do_sample=False  # Greedy decoding
-    # )
     output = ocr_model.generate(
-            **inputs,
-            temperature=0.2,
-            max_new_tokens=50,
-            num_return_sequences=1,
-            do_sample=True,
-        )
-    # Uncomment this block and comment the above if you want sampling instead
-    # # Approach 2: Sampling (more natural but slower)
-    # output = ocr_model.generate(
-    #     **inputs,
-    #     max_new_tokens=40,
-    #     do_sample=True,
-    #     temperature=0.2,
-    #     top_p=0.95,
-    #     top_k=50,
-    #     num_return_sequences=1
-    # )
     prompt_length = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_length:]
     text_output = ocr_processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
     extracted_text = text_output[0]
-    # Step 7: Return the cropped (preprocessed) image and extracted text
     return cropped_image, extracted_text
 # Define the Gradio Interface
@@ -147,10 +121,10 @@ iface = gr.Interface(
     ],
     title="Document OCR with YOLO and OLMOCR",
     description=(
-        "Upload an image of a document. The app enhances the image, then extracts text using an OCR model."
     ),
-    allow_flagging="never"  # Disable flagging to simplify UI
 )
-# Enable queue and sharing for Hugging Face Space
 iface.launch(share=True)

 import torch
 import base64
+import urllib.request
 import numpy as np
 from io import BytesIO
 from PIL import Image, ImageEnhance
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from ultralytics import YOLO
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_finetuning_prompt
+from olmocr.prompts.anchor import get_anchor_text
 # Load the OCR model and processor once
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def process_image(input_image):
     """
+    Process the input image:
+      1. Enhance the image.
+      2. Detect and crop the document using YOLO (conf ≥ 0.85).
+      3. Generate an OCR prompt from a sample PDF.
+      4. Run the OCR model using the prompt and the cropped image.
+      5. Return the cropped image and extracted text.
     """
+    # Step 1: Enhance the input image (sharpness, contrast, brightness)
     enhanced_image = ImageEnhance.Sharpness(input_image).enhance(2.0)
     enhanced_image = ImageEnhance.Contrast(enhanced_image).enhance(1.5)
     enhanced_image = ImageEnhance.Brightness(enhanced_image).enhance(0.8)
+    # Step 2: Run YOLO detection with confidence threshold = 0.85
     image_np = np.array(enhanced_image)
     results = yolo_model.predict(source=image_np, conf=0.85)
     result = results[0]
     if len(result.boxes) == 0:
         return enhanced_image, "No document detected by YOLO."
+    # Select the detection with the highest confidence
     boxes = result.boxes
+    confidences = boxes.conf.cpu().numpy()
     best_index = int(confidences.argmax())
     best_box = boxes.xyxy[best_index].cpu().numpy().tolist()  # [xmin, ymin, xmax, ymax]
     xmin, ymin, xmax, ymax = map(int, best_box)
+    # Step 3: Crop the image using the bounding box and optionally resize it
     cropped_image = enhanced_image.crop((xmin, ymin, xmax, ymax))
+    max_size = (800, 800)  # Resize to reduce processing time
     cropped_image.thumbnail(max_size, Image.LANCZOS)
+    # Step 4: Build the OCR prompt using a sample PDF
+    sample_pdf_url = "https://molmo.allenai.org/paper.pdf"
+    sample_pdf_path = "./paper.pdf"
+    urllib.request.urlretrieve(sample_pdf_url, sample_pdf_path)
+    # Render page 1 to an image (used only for prompt building)
+    sample_image_base64 = render_pdf_to_base64png(sample_pdf_path, 1, target_longest_image_dim=1024)
+    # Extract document metadata and build the prompt
+    anchor_text = get_anchor_text(sample_pdf_path, 1, pdf_engine="pdfreport", target_length=4000)
+    prompt = build_finetuning_prompt(anchor_text)
+    # Step 5: Build the OCR message using the generated prompt and the cropped image.
     buffered = BytesIO()
     cropped_image.save(buffered, format="PNG")
     cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
     messages = [
         {
             "role": "user",
             "content": [
+                {"type": "text", "text": prompt},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
             ],
         }
     )
     inputs = {k: v.to(device) for k, v in inputs.items()}
     output = ocr_model.generate(
+        **inputs,
+        temperature=0.8,
+        max_new_tokens=50,
+        num_return_sequences=1,
+        do_sample=True,
+    )
     prompt_length = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_length:]
     text_output = ocr_processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
     extracted_text = text_output[0]
+    # Step 7: Return the cropped image and the extracted text
     return cropped_image, extracted_text
 # Define the Gradio Interface
     ],
     title="Document OCR with YOLO and OLMOCR",
     description=(
+        "Upload an image of a document. The app enhances the image, detects and crops it using YOLO, "
+        "then builds an OCR prompt from a sample PDF and extracts text."
     ),
+    allow_flagging="never"
 )
 iface.launch(share=True)