ammariii08 committed
Commit d1385e8 · verified · 1 Parent(s): e12d8b4

Update app.py

Files changed (1):
  1. app.py +31 -25
app.py CHANGED
@@ -6,6 +6,7 @@ from PIL import Image, ImageEnhance
 import gradio as gr
 
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+from ultralytics import YOLO
 from prompts import front, back  # prompts.py should define front and back as multiline strings
 
 # Load the OCR model and processor once
@@ -15,49 +16,54 @@ ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(
 ).eval().to(device)
 ocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 
-# Load the YOLO model (using torch.hub and your custom checkpoint "best.pt")
-yolo_model = torch.hub.load('ultralytics/yolov5', 'custom', path='best.pt', force_reload=False)
+# Load the YOLO model using ultralytics (ensure best.pt is in your working directory)
+yolo_model = YOLO("best.pt")
 
 def process_image(input_image):
     """
     1. Preprocess the input image.
-    2. Run YOLO detection to get the document type and bounding box.
-    3. Crop the image according to the bounding box.
-    4. Based on the detection label ("front" or "back"), select the corresponding prompt.
-    5. Convert the cropped image to base64 and build the chat message.
-    6. Run the OCR model using the constructed prompt and cropped image.
-    7. Return the cropped image and extracted text.
+    2. Run YOLO detection with a confidence threshold of 0.85.
+    3. Crop the image according to the detected bounding box.
+    4. Choose the corresponding prompt from prompts.py based on the label.
+    5. Convert the cropped image to base64 and build the OCR prompt.
+    6. Run the OCR model to extract text.
+    7. Return the cropped (preprocessed) image and the extracted text.
     """
     # Step 1: Enhance the image (sharpness, contrast, brightness)
     enhanced_image = ImageEnhance.Sharpness(input_image).enhance(2.0)
     enhanced_image = ImageEnhance.Contrast(enhanced_image).enhance(1.5)
     enhanced_image = ImageEnhance.Brightness(enhanced_image).enhance(0.8)
 
-    # Step 2: Run YOLO detection
-    # Convert PIL image to numpy array (RGB)
+    # Step 2: Run YOLO detection using ultralytics with confidence threshold = 0.85
     image_np = np.array(enhanced_image)
-    results = yolo_model(image_np)
-    df = results.pandas().xyxy[0]
+    results = yolo_model.predict(source=image_np, conf=0.85)
+    result = results[0]
 
-    if df.empty:
+    # If no boxes detected, return the enhanced image with an error message.
+    if len(result.boxes) == 0:
         return enhanced_image, "No document detected by YOLO."
 
-    # Use the detection with the highest confidence
-    best_row = df.sort_values(by="confidence", ascending=False).iloc[0]
-    label = best_row['name']
-    bbox = (int(best_row['xmin']), int(best_row['ymin']),
-            int(best_row['xmax']), int(best_row['ymax']))
+    # Step 3: Select the detection with the highest confidence
+    boxes = result.boxes
+    confidences = boxes.conf.cpu().numpy()  # convert tensor to numpy array
+    best_index = int(confidences.argmax())
+    best_box = boxes.xyxy[best_index].cpu().numpy().tolist()  # [xmin, ymin, xmax, ymax]
+    xmin, ymin, xmax, ymax = map(int, best_box)
 
-    # Step 3: Crop the image using the bounding box
-    cropped_image = enhanced_image.crop(bbox)
+    # Retrieve the detected label using the model's names mapping
+    class_idx = int(boxes.cls[best_index].item())
+    label = yolo_model.names[class_idx]
 
-    # Step 4: Select the prompt based on YOLO label
+    # Step 4: Crop the image using the bounding box
+    cropped_image = enhanced_image.crop((xmin, ymin, xmax, ymax))
+
+    # Select the corresponding OCR prompt based on the YOLO label
     if label.lower() == "front":
         doc_prompt = front
     elif label.lower() == "back":
         doc_prompt = back
     else:
-        doc_prompt = front  # default to front if label is unexpected
+        doc_prompt = front  # Default to front if unexpected label
 
     # Step 5: Convert cropped image to base64 for the message
     buffered = BytesIO()
@@ -110,11 +116,11 @@ iface = gr.Interface(
         gr.Image(type="pil", label="Cropped & Preprocessed Image"),
         gr.Textbox(label="Extracted Text")
     ],
-    title="Document OCR with YOLO and OLMOCR",
+    title="Document OCR with YOLO (Ultralytics) and OLMOCR",
     description=(
        "Upload an image of a document. The app enhances the image, uses a YOLO model "
-        "to detect and crop the document (front/back), and then extracts text using the OCR model "
-        "with a corresponding prompt."
+        "to detect and crop the document (front/back) with a confidence threshold of 0.85, and "
+        "then extracts text using an OCR model with a corresponding prompt."
     ),
 )
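The heart of this commit is the detection API swap: the torch.hub YOLOv5 loader returned detections as a pandas DataFrame, while the ultralytics package returns a list of Results objects with tensor attributes. A minimal sketch of the mapping, assuming ultralytics is installed and best.pt is the repo's custom checkpoint; the input filename here is a hypothetical placeholder:

from ultralytics import YOLO

# Old API (removed): one pandas DataFrame per image
#   df = yolo_model(image_np).pandas().xyxy[0]
#   columns: xmin, ymin, xmax, ymax, confidence, class, name

# New API (added): a list of Results objects; boxes live in tensors
model = YOLO("best.pt")
result = model.predict(source="card.jpg", conf=0.85)[0]  # "card.jpg" is a placeholder input
for box, conf, cls in zip(result.boxes.xyxy, result.boxes.conf, result.boxes.cls):
    xmin, ymin, xmax, ymax = map(int, box.tolist())
    print(model.names[int(cls)], float(conf), (xmin, ymin, xmax, ymax))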
 
 
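The second hunk ends just as Step 5 begins (buffered = BytesIO()). For context, a sketch of how the base64 encoding and the Qwen2-VL call typically proceed from there, using the standard transformers chat-template API and the names already defined in app.py; this is illustrative, not the repo's exact code, and the max_new_tokens cap is an assumption:

import base64

# Encode the crop (the app embeds this string in its chat message payload).
cropped_image.save(buffered, format="PNG")
img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

# Standard Qwen2-VL usage: pair the image with the selected prompt via the chat template.
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": doc_prompt},
]}]
chat_text = ocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = ocr_processor(text=[chat_text], images=[cropped_image], return_tensors="pt").to(device)
output_ids = ocr_model.generate(**inputs, max_new_tokens=512)  # assumed generation cap
extracted_text = ocr_processor.batch_decode(
    output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
)[0]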