ammariii08 committed (verified)
Commit 1345fd5
Parent(s): 757bd2d

Update app.py

Files changed (1)
  1. app.py +95 -38
app.py CHANGED
@@ -1,6 +1,5 @@
 import torch
 import base64
-import urllib.request
 import numpy as np
 from io import BytesIO
 from PIL import Image, ImageEnhance
@@ -8,10 +7,12 @@ import gradio as gr
 
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from ultralytics import YOLO
-from olmocr.data.renderpdf import render_pdf_to_base64png
+from prompts import front, back  # prompts.py should define front and back as multiline strings
+
 from olmocr.prompts import build_finetuning_prompt
 from olmocr.prompts.anchor import get_anchor_text
 
+
 # Load the OCR model and processor once
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -24,55 +25,88 @@ yolo_model = YOLO("best.pt")
 
 def process_image(input_image):
     """
-    Process the input image:
-    1. Enhance the image.
-    2. Detect and crop the document using YOLO (conf ≥ 0.85).
-    3. Generate an OCR prompt from a sample PDF.
-    4. Run the OCR model using the prompt and the cropped image.
-    5. Return the cropped image and extracted text.
+    1. Preprocess the input image.
+    2. Run YOLO detection with a confidence threshold of 0.85.
+    3. Crop the image according to the detected bounding box.
+    4. Choose the corresponding prompt from prompts.py based on the label.
+    5. Convert the cropped image to base64 and build the OCR prompt.
+    6. Run the OCR model to extract text.
+    7. Return the cropped (preprocessed) image and the extracted text.
     """
-    # Step 1: Enhance the input image (sharpness, contrast, brightness)
+    # Step 1: Enhance the image (sharpness, contrast, brightness)
     enhanced_image = ImageEnhance.Sharpness(input_image).enhance(2.0)
     enhanced_image = ImageEnhance.Contrast(enhanced_image).enhance(1.5)
     enhanced_image = ImageEnhance.Brightness(enhanced_image).enhance(0.8)
 
-    # Step 2: Run YOLO detection with confidence threshold = 0.85
+    # Step 2: Run YOLO detection using ultralytics with confidence threshold = 0.85
     image_np = np.array(enhanced_image)
     results = yolo_model.predict(source=image_np, conf=0.85)
    result = results[0]
 
+    # If no boxes detected, return the enhanced image with an error message.
     if len(result.boxes) == 0:
         return enhanced_image, "No document detected by YOLO."
 
-    # Select the detection with the highest confidence
+    # Step 3: Select the detection with the highest confidence
     boxes = result.boxes
-    confidences = boxes.conf.cpu().numpy()
+    confidences = boxes.conf.cpu().numpy()  # convert tensor to numpy array
     best_index = int(confidences.argmax())
     best_box = boxes.xyxy[best_index].cpu().numpy().tolist()  # [xmin, ymin, xmax, ymax]
     xmin, ymin, xmax, ymax = map(int, best_box)
 
-    # Step 3: Crop the image using the bounding box and optionally resize it
-    cropped_image = enhanced_image.crop((xmin, ymin, xmax, ymax))
-    max_size = (800, 800)  # Resize to reduce processing time
-    cropped_image.thumbnail(max_size, Image.LANCZOS)
+    # Retrieve the detected label using the model's names mapping
+    class_idx = int(boxes.cls[best_index].item())
+    label = yolo_model.names[class_idx]
 
-    # Step 4: Build the OCR prompt using a sample PDF
-    sample_pdf_url = "https://molmo.allenai.org/paper.pdf"
-    sample_pdf_path = "./paper.pdf"
-    urllib.request.urlretrieve(sample_pdf_url, sample_pdf_path)
+    # Step 4: Crop the image using the bounding box
+    cropped_image = enhanced_image.crop((xmin, ymin, xmax, ymax))
 
-    # Render page 1 to an image (used only for prompt building)
-    sample_image_base64 = render_pdf_to_base64png(sample_pdf_path, 1, target_longest_image_dim=1024)
+    # OPTIMIZATION: Resize the image to reduce processing time
+    # Calculate aspect ratio to maintain proportions
+    max_size = (640, 640)  # Further reduced from 800x800
+    cropped_image.thumbnail(max_size, Image.LANCZOS)
 
-    # Extract document metadata and build the prompt
-    anchor_text = get_anchor_text(sample_pdf_path, 1, pdf_engine="pdfreport", target_length=4000)
-    prompt = build_finetuning_prompt(anchor_text)
+    # # Select the corresponding OCR prompt based on the YOLO label
+    # if label.lower() == "front":
+    #     doc_prompt = front
+    # elif label.lower() == "back":
+    #     doc_prompt = back
+    # else:
+    #     doc_prompt = front  # Default to front if unexpected label
 
-    # Step 5: Build the OCR message using the generated prompt and the cropped image.
+    # Step 5: Convert cropped image to base64 for the message
     buffered = BytesIO()
     cropped_image.save(buffered, format="PNG")
     cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
 
+    # # Build the message in the expected format for the OCR processor
+    # messages = [
+    #     {
+    #         "role": "user",
+    #         "content": [
+    #             {"type": "text", "text": doc_prompt},
+    #             {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
+    #         ],
+    #     }
+    # ]
+    # text_prompt = ocr_processor.apply_chat_template(
+    #     messages, tokenize=False, add_generation_prompt=True
+    # )
+
+    # # Step 6: Prepare inputs and run the OCR model
+    # inputs = ocr_processor(
+    #     text=[text_prompt],
+    #     images=[cropped_image],
+    #     padding=True,
+    #     return_tensors="pt",
+    # )
+    # inputs = {k: v.to(device) for k, v in inputs.items()}
+
+
+    anchor_text = extract_anchor_text_from_image(cropped_image)  # You'll need to implement this
+    prompt = build_finetuning_prompt(anchor_text)
+
+    # Build the message in the expected format for the OCR processor
     messages = [
         {
             "role": "user",
@@ -86,7 +120,7 @@ def process_image(input_image):
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    # Step 6: Prepare inputs and run the OCR model
+    # Rest of your code for processing with OCR
    inputs = ocr_processor(
         text=[text_prompt],
         images=[cropped_image],
@@ -95,20 +129,43 @@
     )
     inputs = {k: v.to(device) for k, v in inputs.items()}
 
-    output = ocr_model.generate(
-        **inputs,
-        temperature=0.8,
-        max_new_tokens=50,
-        num_return_sequences=1,
-        do_sample=True,
-    )
+    # FIXED: Generation parameters with proper combinations to avoid warnings
+    # Choose one of these two approaches:
+
+    # Approach 1: Greedy decoding (fastest)
+    # output = ocr_model.generate(
+    #     **inputs,
+    #     max_new_tokens=40,
+    #     num_beams=1,
+    #     do_sample=False  # Greedy decoding
+    # )
+
+    output = model.generate(
+        **inputs,
+        temperature=0.2,
+        max_new_tokens=50,
+        num_return_sequences=1,
+        do_sample=True,
+    )
+
+    # Uncomment this block and comment the above if you want sampling instead
+    # # Approach 2: Sampling (more natural but slower)
+    # output = ocr_model.generate(
+    #     **inputs,
+    #     max_new_tokens=40,
+    #     do_sample=True,
+    #     temperature=0.2,
+    #     top_p=0.95,
+    #     top_k=50,
+    #     num_return_sequences=1
+    # )
 
     prompt_length = inputs["input_ids"].shape[1]
     new_tokens = output[:, prompt_length:]
     text_output = ocr_processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
     extracted_text = text_output[0]
 
-    # Step 7: Return the cropped image and the extracted text
+    # Step 7: Return the cropped (preprocessed) image and extracted text
     return cropped_image, extracted_text
 
 # Define the Gradio Interface
@@ -121,10 +178,10 @@ iface = gr.Interface(
     ],
     title="Document OCR with YOLO and OLMOCR",
     description=(
-        "Upload an image of a document. The app enhances the image, detects and crops it using YOLO, "
-        "then builds an OCR prompt from a sample PDF and extracts text."
+        "Upload an image of a document. The app enhances the image, then extracts text using an OCR model."
     ),
-    allow_flagging="never"
+    allow_flagging="never"  # Disable flagging to simplify UI
 )
 
+# Enable queue and sharing for Hugging Face Space
 iface.launch(share=True)
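The new import expects a sibling prompts.py defining front and back as multiline strings (per the inline comment), but that file is not part of this commit. A purely illustrative stub, assuming the two strings carry the OCR instructions for the front and back sides of the document:

# prompts.py - illustrative stub only; the real prompt text is not included in this commit.
front = """Extract all readable fields from the FRONT side of the document and return them as plain text."""
back = """Extract all readable fields from the BACK side of the document and return them as plain text."""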
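The active prompt-building path calls extract_anchor_text_from_image, which the inline comment marks as still to be implemented and which is defined nowhere in this diff. A minimal placeholder sketch, assuming a plain pytesseract OCR pass is an acceptable stand-in for the PDF-based get_anchor_text:

import pytesseract
from PIL import Image

def extract_anchor_text_from_image(image: Image.Image, target_length: int = 4000) -> str:
    # Hypothetical helper, not part of this commit: rough anchor text from the cropped image.
    # Requires pytesseract and the Tesseract binary; any other OCR fallback could be substituted.
    text = pytesseract.image_to_string(image)
    return text[:target_length]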
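The uncommented generation call references model, while the checkpoint loaded at the top of app.py is bound to ocr_model. Assuming that is the object intended, the sampling call would read:

# Sketch assuming `model` was meant to be the `ocr_model` loaded above (not confirmed by this commit).
output = ocr_model.generate(
    **inputs,
    temperature=0.2,
    max_new_tokens=50,
    num_return_sequences=1,
    do_sample=True,
)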