Spaces:

ammariii08
/

dxf_test

Running

App Files Files Community

ammariii08 committed 10 days ago

Commit

e409fcf

verified ·

1 Parent(s): 7a18006

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -463

app.py CHANGED Viewed

@@ -1,472 +1,121 @@
-import os
-from pathlib import Path
-from typing import List, Union
-from PIL import Image
-import ezdxf.units
-import numpy as np
 import torch
-from torchvision import transforms
-from ultralytics import YOLOWorld, YOLO
-from ultralytics.engine.results import Results
-from ultralytics.utils.plotting import save_one_box
-from transformers import AutoModelForImageSegmentation
-import cv2
-import ezdxf
 import gradio as gr
-import zipfile
-import datetime
-from scalingtestupdated import calculate_scaling_factor
-from shapely.geometry import Polygon, Point
-from scipy.interpolate import splprep, splev
-from scipy.ndimage import gaussian_filter1d
-###############################################################################
-# 1) Single-Image Pipeline & Utilities (Simplified)
-###############################################################################
-# Load Segmentation Model (BiRefNet)
-birefnet = AutoModelForImageSegmentation.from_pretrained(  # module-level load: runs once when the file is executed
-    "zhengpeng7/BiRefNet", trust_remote_code=True
-)
-device = "cpu"  # the segmentation model is pinned to CPU
-torch.set_float32_matmul_precision(["high", "highest"][0])  # index 0 selects "high"
-birefnet.to(device)
-birefnet.eval()  # evaluation mode; the model is only used for inference in remove_bg
-transform_image = transforms.Compose([  # preprocessing applied in remove_bg before the model call
-    transforms.Resize((1024, 1024)),  # fixed 1024x1024 input regardless of the source aspect ratio
-    transforms.ToTensor(),
-    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # per-channel mean, then per-channel std
-])
-def yolo_detect(image: Union[str, Path, int, Image.Image, list, tuple, np.ndarray, torch.Tensor],
-                classes: List[str]) -> np.ndarray:
-    """Detects the drawer (box) in the image using YOLOWorld."""
-    drawer_detector = YOLOWorld("yolov8x-worldv2.pt")  # the detector is re-created on every call
-    drawer_detector.set_classes(classes)  # restrict detection to the given class names
-    results: List[Results] = drawer_detector.predict(image)
-    boxes = []
-    for result in results:
-        boxes.append(save_one_box(result.cpu().boxes.xyxy, im=result.orig_img, save=False))  # save=False: keep the crop in memory instead of writing it to disk
-    del drawer_detector  # drop the local reference to the model before returning
-    return boxes[0]  # only the crop for the first result is returned; IndexError if results is empty
-def resize_img(img: np.ndarray, resize_dim):  # resize through PIL; resize_dim is handed unchanged to Image.resize
-    return np.array(Image.fromarray(img).resize(resize_dim))  # round-trips array -> PIL image -> array
-def remove_bg(image: np.ndarray) -> np.ndarray:
-    """Removes background using BiRefNet, returning a binary mask."""
-    image_pil = Image.fromarray(image)
-    input_images = transform_image(image_pil).unsqueeze(0).to(device)  # add a leading batch axis and move to the model's device
-    with torch.no_grad():  # inference only, no gradients needed
-        preds = birefnet(input_images)[-1].sigmoid().cpu()  # last model output, squashed to the 0..1 range
-    pred = preds[0].squeeze()  # first (only) batch item with size-1 axes removed
-    pred_pil: Image = transforms.ToPILImage()(pred)
-    scale_ratio = 1024 / max(image_pil.size)  # factor that brings the longer side to 1024
-    scaled_size = (int(image_pil.size[0] * scale_ratio), int(image_pil.size[1] * scale_ratio))  # (width, height), PIL order
-    return np.array(pred_pil.resize(scaled_size))  # NOTE: mask is at scaled_size, not the input size; callers resize it back
-def make_square(img: np.ndarray):
-    """Pads an image to be square (max dimension)."""
-    height, width = img.shape[:2]
-    max_dim = max(height, width)  # side length of the padded square
-    pad_height = (max_dim - height) // 2  # padding before the first row
-    pad_width = (max_dim - width) // 2  # padding before the first column
-    pad_height_extra = max_dim - height - 2 * pad_height  # 0 or 1: leftover row when the difference is odd
-    pad_width_extra = max_dim - width - 2 * pad_width  # 0 or 1: leftover column when the difference is odd
-    if len(img.shape) == 3:  # image with a channel axis: leave that axis unpadded
-        padded = np.pad(img, ((pad_height, pad_height + pad_height_extra),
-                              (pad_width, pad_width + pad_width_extra), (0, 0)), mode="edge")  # "edge" repeats the border pixels
-    else:  # 2-D image: pad rows and columns only
-        padded = np.pad(img, ((pad_height, pad_height + pad_height_extra),
-                              (pad_width, pad_width + pad_width_extra)), mode="edge")
-    return padded
-def exclude_scaling_box(image: np.ndarray, bbox: np.ndarray, orig_size: tuple, processed_size: tuple,
-                        expansion_factor: float = 1.2) -> np.ndarray:
-    """Zeros out the area of the reference square from the binary mask."""
-    x_min, y_min, x_max, y_max = map(int, bbox)  # bbox is read as (x_min, y_min, x_max, y_max)
-    scale_x = processed_size[1] / orig_size[1]  # index 1 is used for x, so both sizes are (height, width)
-    scale_y = processed_size[0] / orig_size[0]
-    x_min = int(x_min * scale_x)  # map the box from original-image coordinates to mask coordinates
-    x_max = int(x_max * scale_x)
-    y_min = int(y_min * scale_y)
-    y_max = int(y_max * scale_y)
-    box_width = x_max - x_min
-    box_height = y_max - y_min
-    expanded_x_min = max(0, int(x_min - (expansion_factor - 1) * box_width / 2))  # grow the box symmetrically, clamped to the mask
-    expanded_x_max = min(image.shape[1], int(x_max + (expansion_factor - 1) * box_width / 2))
-    expanded_y_min = max(0, int(y_min - (expansion_factor - 1) * box_height / 2))
-    expanded_y_max = min(image.shape[0], int(y_max + (expansion_factor - 1) * box_height / 2))
-    image[expanded_y_min:expanded_y_max, expanded_x_min:expanded_x_max] = 0  # modifies the caller's array in place
-    return image  # same array object that was passed in
-def resample_contour(contour):
-    """Resamples a contour to ~1000 points using spline interpolation and smoothing."""
-    num_points = 1000  # number of samples taken along the fitted spline
-    smoothing_factor = 5  # passed to splprep as s
-    spline_degree = 3  # only used for the minimum-length check below
-    if len(contour) < spline_degree + 1:
-        raise ValueError("Contour must have at least 4 points.")  # save_dxf_spline catches this and skips the contour
-    contour = contour[:, 0, :]  # drop the middle axis; presumably OpenCV's (N, 1, 2) contour layout — TODO confirm
-    tck, _ = splprep([contour[:, 0], contour[:, 1]], s=smoothing_factor)  # parametric spline through the x and y columns
-    u = np.linspace(0, 1, num_points)  # evenly spaced parameter values over the whole curve
-    resampled_points = splev(u, tck)
-    smoothed_x = gaussian_filter1d(resampled_points[0], sigma=1)  # extra smoothing applied to each coordinate separately
-    smoothed_y = gaussian_filter1d(resampled_points[1], sigma=1)
-    return np.array([smoothed_x, smoothed_y]).T  # one (x, y) row per resampled point
-def extract_outlines(binary_image: np.ndarray):
-    """Finds external contours in a binary mask, returns the outline image and the list of contours."""
-    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)  # outermost contours only, every boundary point kept
-    outline_image = np.zeros_like(binary_image)  # blank canvas with the mask's shape and dtype
-    cv2.drawContours(outline_image, contours, -1, (255), thickness=1)  # -1 draws all contours, value 255, 1 px wide
-    return cv2.bitwise_not(outline_image), contours  # image is inverted before being returned
-def shrink_bbox(image: np.ndarray, shrink_factor: float):
-    """Shrinks the bounding box around the image by a certain factor."""
-    height, width = image.shape[:2]
-    center_x, center_y = width // 2, height // 2  # the crop is centred on the image centre
-    new_width = int(width * shrink_factor)
-    new_height = int(height * shrink_factor)
-    x1 = max(center_x - new_width // 2, 0)  # clamp the crop window to the image bounds
-    y1 = max(center_y - new_height // 2, 0)
-    x2 = min(center_x + new_width // 2, width)
-    y2 = min(center_y + new_height // 2, height)
-    return image[y1:y2, x1:x2]  # a slice of the input array, not a copy
-def detect_reference_square(img) -> np.ndarray:  # NOTE(review): annotation says np.ndarray but a (crop, coords) tuple is returned
-    """Detects the reference square in the image using a YOLO model saved in './last.pt'."""
-    box_detector = YOLO("./last.pt")  # weights are reloaded from disk on every call
-    res = box_detector.predict(img, conf=0.05)  # low confidence threshold of 0.05
-    del box_detector
-    if len(res) == 0 or len(res[0].boxes) == 0:
-        raise ValueError("No reference square found.")  # predict() wraps this in a gr.Error
-    cropped_img = save_one_box(res[0].cpu().boxes.xyxy, res[0].orig_img, save=False)  # save=False: crop is kept in memory
-    coords = res[0].cpu().boxes.xyxy[0]  # xyxy coordinates of the first box
-    return cropped_img, coords
-def build_tool_polygon(points_inch):  # thin wrapper: builds a shapely Polygon from the given point sequence
-    return Polygon(points_inch)
-def polygon_to_exterior_coords(poly: Polygon):
-    """Gets the exterior coordinates of a polygon (or the largest piece if MultiPolygon)."""
-    if poly.geom_type == "MultiPolygon":
-        poly = max(poly.geoms, key=lambda g: g.area)  # keep only the component with the largest area
-    if not poly.exterior:  # no exterior ring: report an empty coordinate list
-        return []
-    return list(poly.exterior.coords)
-def save_dxf_spline(inflated_contours, scaling_factor, height):
-    """Creates a DXF with splines from the inflated contours."""
-    doc = ezdxf.new(units=0)  # created with units=0, then switched to inches on the next two lines
-    doc.units = ezdxf.units.IN
-    doc.header["$INSUNITS"] = ezdxf.units.IN
-    msp = doc.modelspace()
-    final_polygons_inch = []  # polygons that were actually written, returned for drawing
-    for contour in inflated_contours:
-        try:
-            resampled = resample_contour(contour)  # raises ValueError for contours with fewer than 4 points
-            points_inch = [(x * scaling_factor, (height - y) * scaling_factor) for (x, y) in resampled]  # scale pixels by scaling_factor and flip y about the image height
-            if len(points_inch) < 3:
-                continue
-            if np.linalg.norm(np.array(points_inch[0]) - np.array(points_inch[-1])) > 1e-6:  # first and last point differ: close the ring
-                points_inch.append(points_inch[0])
-            tool_polygon = build_tool_polygon(points_inch)
-            exterior_coords = polygon_to_exterior_coords(tool_polygon)
-            if len(exterior_coords) < 3:
-                continue
-            msp.add_spline(exterior_coords, degree=3, dxfattribs={"layer": "TOOLS"})  # degree-3 spline on layer "TOOLS"
-            final_polygons_inch.append(tool_polygon)
-        except ValueError as e:  # a bad contour is logged and skipped; the rest are still processed
-            print(f"Skipping contour: {e}")
-    return doc, final_polygons_inch  # the document is not saved here; the caller writes it
-def draw_polygons_inch(polygons_inch, image_rgb, scaling_factor, image_height,
-                       color=(0, 255, 0), thickness=1):
-    """Draws polygons on an image for visualization."""
-    for poly in polygons_inch:
-        if poly.geom_type == "MultiPolygon":  # draw each component of a multi-part geometry separately
-            for subpoly in poly.geoms:
-                draw_single_polygon(subpoly, image_rgb, scaling_factor, image_height, color, thickness)
-        else:
-            draw_single_polygon(poly, image_rgb, scaling_factor, image_height, color, thickness)  # image_rgb is drawn on in place; nothing is returned
-def draw_single_polygon(poly, image_rgb, scaling_factor, image_height,
-                        color=(0, 255, 0), thickness=1):
-    """Helper to draw a single polygon."""
-    ext = list(poly.exterior.coords)
-    if len(ext) < 3:  # too few points to draw an outline
-        return
-    pts_px = []
-    for (x_in, y_in) in ext:
-        px = int(x_in / scaling_factor)  # inverse of the scaling applied in save_dxf_spline
-        py = int(image_height - (y_in / scaling_factor))  # undo the y flip applied in save_dxf_spline
-        pts_px.append([px, py])
-    pts_px = np.array(pts_px, dtype=np.int32)  # int32 point array for cv2.polylines
-    cv2.polylines(image_rgb, [pts_px], isClosed=True, color=color, thickness=thickness, lineType=cv2.LINE_AA)  # draws onto image_rgb in place
-###############################################################################
-# 2) Single-Image Predict (Only Image & Offset)
-###############################################################################
-def predict(image, offset, offset_unit):  # single-image pipeline: detect drawer, find scale, mask tools, write DXF
-    # Convert offset to inches if necessary
-    if offset_unit == "mm":
-        offset_inches = offset / 25.4  # 25.4 mm per inch
-    else:
-        offset_inches = offset
-    try:
-        drawer_img = yolo_detect(image, ["box"])  # crop of the detected "box" class
-        shrunked_img = make_square(shrink_bbox(drawer_img, 0.90))  # central 90% crop, then padded to a square
-    except Exception as e:
-        raise gr.Error("Unable to DETECT DRAWER. Please try a different image or angle!") from e
-    try:
-        reference_obj_img, scaling_box_coords = detect_reference_square(shrunked_img)
-    except Exception as e:
-        raise gr.Error("Unable to DETECT REFERENCE BOX. Please try a different image!") from e
-    reference_obj_img = make_square(reference_obj_img)
-    reference_square_mask = remove_bg(reference_obj_img)
-    reference_square_mask = resize_img(reference_square_mask, (reference_obj_img.shape[1], reference_obj_img.shape[0]))  # back to the crop's (width, height)
-    try:
-        scaling_factor = calculate_scaling_factor(  # imported from scalingtestupdated; its internals are not visible here
-            reference_image_path="./Reference_ScalingBox.jpg",
-            target_image=reference_square_mask,
-            feature_detector="ORB",
-        )
-    except ZeroDivisionError:
-        scaling_factor = None
-        print("Error calculating scaling factor: Division by zero")
-    except Exception as e:  # any failure falls through to the default factor below instead of aborting
-        scaling_factor = None
-        print(f"Error calculating scaling factor: {e}")
-    if scaling_factor is None or scaling_factor == 0:
-        scaling_factor = 1.0  # fallback: treated as 1 inch per pixel (see the UI label "inches/pixel")
-        print("Using default scaling factor of 1.0 due to calculation error")
-    orig_size = shrunked_img.shape[:2]  # (height, width)
-    objects_mask = remove_bg(shrunked_img)
-    processed_size = objects_mask.shape[:2]  # (height, width) of the mask returned by remove_bg
-    # Exclude the reference square from the mask
-    objects_mask = exclude_scaling_box(objects_mask, scaling_box_coords, orig_size, processed_size, expansion_factor=1.2)
-    objects_mask = resize_img(objects_mask, (shrunked_img.shape[1], shrunked_img.shape[0]))  # back to the drawer crop's (width, height)
-    if scaling_factor != 0:  # always true here: a zero factor was replaced by 1.0 above
-        offset_pixels = (offset_inches / scaling_factor) * 2 + 1  # kernel side: offset in pixels on both sides plus the centre pixel
-    else:
-        offset_pixels = 1  # unreachable given the fallback above
-    dilated_mask = cv2.dilate(objects_mask, np.ones((int(offset_pixels), int(offset_pixels)), np.uint8))  # grow the mask with a square kernel
-    outlines, contours = extract_outlines(dilated_mask)
-    color_output = cv2.cvtColor(shrunked_img, cv2.COLOR_BGR2RGB)  # NOTE(review): swaps channels; confirm the input really is BGR
-    outlines_bgr = cv2.cvtColor(outlines, cv2.COLOR_GRAY2BGR)  # 3-channel copy so coloured outlines can be drawn
-    image_height, image_width = shrunked_img.shape[:2]
-    doc, final_polygons_inch = save_dxf_spline(inflated_contours=contours, scaling_factor=scaling_factor, height=image_height)
-    # Draw tool outlines on images
-    draw_polygons_inch(final_polygons_inch, color_output, scaling_factor, image_height, color=(0, 255, 0), thickness=1)
-    draw_polygons_inch(final_polygons_inch, outlines_bgr, scaling_factor, image_height, color=(0, 255, 0), thickness=1)
-    outlines_color = cv2.cvtColor(outlines_bgr, cv2.COLOR_BGR2RGB)
-    # Save DXF file
-    dxf_filepath = os.path.join("./outputs", "out.dxf")  # fixed file name: every call overwrites the previous output
-    doc.saveas(dxf_filepath)
-    return color_output, outlines_color, dxf_filepath, dilated_mask, str(scaling_factor)  # order matches the single-mode click outputs
-###############################################################################
-# 3) Batch Processing (Up to 4 Images; Retry Faulty Ones Separately)
-###############################################################################
-def batch_predict(images, offsets_str, offset_unit):  # runs predict() on each image and zips the resulting DXF files
-    offsets = [float(x.strip()) for x in offsets_str.split(",")]  # raises ValueError on a non-numeric or empty entry
-    if len(images) != len(offsets):
-        raise gr.Error("The number of images and offsets must match!")
-    final_images = []
-    outline_images = []
-    mask_images = []
-    scale_factors_dict = {}  # str(index) -> scale factor string, or an "Error: ..." message
-    dxf_files = {}  # index -> DXF path; filled in but not returned
-    error_indices = []  # indices of images whose processing raised
-    now_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")  # timestamp makes the zip name unique per run
-    zip_path = f"./outputs/batch_{now_str}.zip"
-    zipf = zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED)  # opened without a with-block; closed explicitly after the loop
-    for i, img_path in enumerate(images):
-        try:
-            img_pil = Image.open(img_path).convert("RGB")
-            img_np = np.array(img_pil)
-            offset = offsets[i]
-            color_output, outlines_img, dxf_path, mask_img, sfactor = predict(img_np, offset, offset_unit)
-            final_images.append(Image.fromarray(color_output))
-            outline_images.append(Image.fromarray(outlines_img))
-            mask_images.append(Image.fromarray(mask_img))
-            scale_factors_dict[str(i)] = sfactor
-            base_name = os.path.splitext(os.path.basename(img_path))[0]
-            unique_dxf = f"./outputs/{base_name}_{i}.dxf"
-            os.rename(dxf_path, unique_dxf)  # predict() always writes out.dxf, so move it to a per-image name
-            dxf_files[i] = unique_dxf
-            zipf.write(unique_dxf, arcname=os.path.basename(unique_dxf))  # stored in the archive without the directory part
-        except Exception as e:  # one failing image does not abort the batch
-            error_indices.append(i)
-            final_images.append(None)  # placeholder keeps list positions aligned with image indices
-            outline_images.append(None)
-            mask_images.append(None)
-            scale_factors_dict[str(i)] = f"Error: {str(e)}"
-    zipf.close()
-    return final_images, outline_images, zip_path, mask_images, scale_factors_dict, error_indices  # order matches the batch click outputs
-def retry_predict(index, image_path, offset, offset_unit, current_zip_path, current_scale_factors):  # re-runs predict() for one image and appends its DXF to an existing zip
     """
-    Retry processing a single faulty image. Returns updated outputs for that image and updated zip & scale factors.
     """
-    try:
-        img_pil = Image.open(image_path).convert("RGB")  # image_path is opened with PIL, so a path or file object is expected
-        img_np = np.array(img_pil)
-        color_output, outlines_img, dxf_path, mask_img, sfactor = predict(img_np, offset, offset_unit)
-        processed_img = Image.fromarray(color_output)
-        outline_img = Image.fromarray(outlines_img)
-        mask_image = Image.fromarray(mask_img)
-        base_name = os.path.splitext(os.path.basename(image_path))[0]
-        unique_dxf = f"./outputs/{base_name}_{index}.dxf"
-        os.rename(dxf_path, unique_dxf)  # move the shared out.dxf to a per-image name
-        # Append the new DXF to the existing zip archive.
-        with zipfile.ZipFile(current_zip_path, "a", zipfile.ZIP_DEFLATED) as zipf:  # mode "a" appends to the existing archive
-            zipf.write(unique_dxf, arcname=os.path.basename(unique_dxf))
-        current_scale_factors[str(index)] = sfactor  # mutates the dict passed in by the caller
-        return processed_img, outline_img, mask_image, unique_dxf, current_zip_path, current_scale_factors, ""  # empty string means no error
-    except Exception as e:  # every failure is reported through the last tuple element instead of raising
-        return None, None, None, None, current_zip_path, current_scale_factors, str(e)
-###############################################################################
-# 4) Gradio UI
-###############################################################################
-if __name__ == "__main__":  # the whole Gradio UI is built and launched only when run as a script
-    os.makedirs("./outputs", exist_ok=True)  # predict/batch_predict write DXF and zip files here
-    with gr.Blocks() as demo:
-        gr.Markdown("## Choose Processing Mode")
-        # Radio to pick Single or Batch
-        mode_select = gr.Radio(choices=["Single", "Batch"], value="Single", label="Select Mode")
-        single_section = gr.Group(visible=True)  # groups are created here and filled in by the with-blocks below
-        batch_section = gr.Group(visible=False)
-        retry_section = gr.Group(visible=False)
-        # Toggle mode visibility
-        def toggle_mode(mode):  # returns visibility updates for (single, batch, retry), in that order
-            if mode == "Single":
-                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
-            else:
-                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)  # retry is shown together with batch
-        mode_select.change(fn=toggle_mode, inputs=mode_select, outputs=[single_section, batch_section, retry_section])
-        #######################################################################
-        # Single-Image Section
-        #######################################################################
-        with single_section:
-            gr.Markdown("### Single-Image Processing")
-            with gr.Row():
-                with gr.Column():
-                    image_input = gr.Image(label="Input Image")
-                    offset_input = gr.Number(label="Offset", value=0.075)
-                    offset_unit_input = gr.Dropdown(label="Offset Unit", choices=["inches", "mm"], value="inches")
-                    submit_btn = gr.Button("Submit Single")
-                    clear_btn = gr.Button("Clear Single")
-                with gr.Column():
-                    output_image = gr.Image(label="Output Image")
-                    outlines_image = gr.Image(label="Outlined Image")
-                    dxf_file = gr.File(label="DXF File")
-                    mask_image = gr.Image(label="Mask")
-                    scaling_factor_txt = gr.Textbox(label="Scaling Factor (inches/pixel)", placeholder="Computed value")
-            submit_btn.click(fn=predict,
-                             inputs=[image_input, offset_input, offset_unit_input],
-                             outputs=[output_image, outlines_image, dxf_file, mask_image, scaling_factor_txt])  # same order as predict()'s return tuple
-            clear_btn.click(fn=lambda: (None, None, None, None, ""),
-                            inputs=[], outputs=[output_image, outlines_image, dxf_file, mask_image, scaling_factor_txt])  # resets outputs only; inputs are left untouched
-        #######################################################################
-        # Batch Section
-        #######################################################################
-        # Helper function to limit files to a maximum of 4
-        def limit_files(file_list):
-            """If more than 4 files are uploaded, return only the first 4."""
-            if file_list is None:
-                return None
-            if len(file_list) > 4:
-                return file_list[:4]  # extra files are dropped without a warning
-            return file_list
-        with batch_section:
-            gr.Markdown("### Batch Processing (Up to 4 Images)")
-            with gr.Row():
-                with gr.Column():
-                    images_input = gr.File(label="Upload 4 Images (up to 4)", file_count="multiple", type="filepath")
-                    images_input.change(fn=limit_files, inputs=images_input, outputs=images_input)  # the component rewrites its own value to enforce the limit
-                    offsets_input = gr.Textbox(label="Offsets (comma-separated, one per image)", placeholder="e.g. 0.1, 0.1")
-                    offset_unit_batch = gr.Dropdown(label="Offset Unit", choices=["inches", "mm"], value="inches")
-                    batch_submit_btn = gr.Button("Submit Batch")
-                    batch_clear_btn = gr.Button("Clear Batch")
-                with gr.Column():
-                    final_images_gallery = gr.Gallery(label="Final Annotated Images", columns=2)
-                    outlines_gallery = gr.Gallery(label="Outlined Images", columns=2)
-                    masks_gallery = gr.Gallery(label="Mask Images", columns=2)
-                    dxf_zip_file = gr.File(label="DXF Files (zip)")
-                    scale_factors_text = gr.JSON(label="Scale Factors (Key=Image Index)")
-                    error_indices_txt = gr.Textbox(label="Error Indices (if any)", interactive=False)
-            batch_submit_btn.click(fn=batch_predict,
-                                inputs=[images_input, offsets_input, offset_unit_batch],
-                                outputs=[final_images_gallery, outlines_gallery, dxf_zip_file, masks_gallery, scale_factors_text, error_indices_txt])  # same order as batch_predict()'s return tuple
-            batch_clear_btn.click(fn=lambda: ([], [], None, [], {}, ""),
-                                inputs=[], outputs=[final_images_gallery, outlines_gallery, dxf_zip_file, masks_gallery, scale_factors_text, error_indices_txt])
-        #######################################################################
-        # Retry Faulty Image Section
-        #######################################################################
-        with retry_section:
-            gr.Markdown("### Retry Faulty Image")
-            with gr.Row():
-                with gr.Column():
-                    retry_index = gr.Textbox(label="Index of Faulty Image (0-indexed)", placeholder="Enter index of failed image")
-                    retry_image_input = gr.Image(label="Replacement Image")  # NOTE(review): no type= given, yet retry_predict passes this value to Image.open — confirm it receives a path
-                    retry_offset = gr.Number(label="Offset", value=0.075)
-                    retry_offset_unit = gr.Dropdown(label="Offset Unit", choices=["inches", "mm"], value="inches")
-                    current_zip = gr.Textbox(label="Current ZIP File Path", interactive=False)  # NOTE(review): not in the batch click outputs above, so nothing visible here fills it before a retry
-                    current_scale = gr.JSON(label="Current Scale Factors", value={})
-                    retry_btn = gr.Button("Retry Faulty Image")
-                with gr.Column():
-                    retry_final_img = gr.Image(label="Updated Final Image")
-                    retry_outline_img = gr.Image(label="Updated Outline Image")
-                    retry_mask_img = gr.Image(label="Updated Mask Image")
-                    updated_zip = gr.File(label="Updated ZIP File")
-                    updated_scale = gr.JSON(label="Updated Scale Factors")
-                    retry_error = gr.Textbox(label="Retry Error Message", interactive=False)
-            retry_btn.click(fn=retry_predict,
-                            inputs=[retry_index, retry_image_input, retry_offset, retry_offset_unit, current_zip, current_scale],
-                            outputs=[retry_final_img, retry_outline_img, retry_mask_img, current_zip, updated_zip, updated_scale, retry_error])  # NOTE(review): the 4th return value is the DXF path, which lands in current_zip
-        demo.launch(share=True)  # share=True asks Gradio for a public share link

 import torch
+import base64
+import numpy as np
+from io import BytesIO
+from PIL import Image, ImageEnhance
 import gradio as gr
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+from prompts import front, back  # prompts.py should define front and back as multiline strings
+# Load the OCR model and processor once
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available, else CPU
+ocr_model = Qwen2VLForConditionalGeneration.from_pretrained(  # module-level load: runs once when the file is executed
+    "allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16
+).eval().to(device)  # evaluation mode, then moved to the selected device
+ocr_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")  # processor comes from a different repo id than the model weights
+# Load the YOLO model (using torch.hub and your custom checkpoint "best.pt")
+yolo_model = torch.hub.load('ultralytics/yolov5', 'custom', path='best.pt', force_reload=False)  # force_reload=False reuses a cached copy of the hub repo
+def process_image(input_image):
     """
+    1. Preprocess the input image.
+    2. Run YOLO detection to get the document type and bounding box.
+    3. Crop the image according to the bounding box.
+    4. Based on the detection label ("front" or "back"), select the corresponding prompt.
+    5. Convert the cropped image to base64 and build the chat message.
+    6. Run the OCR model using the constructed prompt and cropped image.
+    7. Return the cropped image and extracted text.
     """
+    # Step 1: Enhance the image (sharpness, contrast, brightness)
+    enhanced_image = ImageEnhance.Sharpness(input_image).enhance(2.0)
+    enhanced_image = ImageEnhance.Contrast(enhanced_image).enhance(1.5)
+    enhanced_image = ImageEnhance.Brightness(enhanced_image).enhance(0.8)
+    # Step 2: Run YOLO detection
+    # Convert PIL image to numpy array (RGB)
+    image_np = np.array(enhanced_image)
+    results = yolo_model(image_np)
+    df = results.pandas().xyxy[0]
+    if df.empty:
+        return enhanced_image, "No document detected by YOLO."
+    # Use the detection with the highest confidence
+    best_row = df.sort_values(by="confidence", ascending=False).iloc[0]
+    label = best_row['name']
+    bbox = (int(best_row['xmin']), int(best_row['ymin']),
+            int(best_row['xmax']), int(best_row['ymax']))
+    # Step 3: Crop the image using the bounding box
+    cropped_image = enhanced_image.crop(bbox)
+    # Step 4: Select the prompt based on YOLO label
+    if label.lower() == "front":
+        doc_prompt = front
+    elif label.lower() == "back":
+        doc_prompt = back
+    else:
+        doc_prompt = front  # default to front if label is unexpected
+    # Step 5: Convert cropped image to base64 for the message
+    buffered = BytesIO()
+    cropped_image.save(buffered, format="PNG")
+    cropped_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    # Build the message in the expected format for the OCR processor
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": doc_prompt},
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{cropped_base64}"}},
+            ],
+        }
+    ]
+    text_prompt = ocr_processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    # Step 6: Prepare inputs and run the OCR model
+    inputs = ocr_processor(
+        text=[text_prompt],
+        images=[cropped_image],
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    output = ocr_model.generate(
+        **inputs,
+        temperature=0.8,
+        max_new_tokens=50,
+        num_return_sequences=1,
+        do_sample=True,
+    )
+    prompt_length = inputs["input_ids"].shape[1]
+    new_tokens = output[:, prompt_length:]
+    text_output = ocr_processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+    extracted_text = text_output[0]
+    # Step 7: Return the cropped (preprocessed) image and extracted text
+    return cropped_image, extracted_text
+# Define the Gradio Interface
+iface = gr.Interface(  # one image in, (image, text) out — matches process_image's return tuple
+    fn=process_image,
+    inputs=gr.Image(type="pil", label="Input Document Image"),  # type="pil": process_image receives a PIL image
+    outputs=[
+        gr.Image(type="pil", label="Cropped & Preprocessed Image"),
+        gr.Textbox(label="Extracted Text")
+    ],
+    title="Document OCR with YOLO and OLMOCR",
+    description=(
+        "Upload an image of a document. The app enhances the image, uses a YOLO model "
+        "to detect and crop the document (front/back), and then extracts text using the OCR model "
+        "with a corresponding prompt."
+    ),
+)
+iface.launch()  # runs at module level, with no __main__ guard