Create app.py
app.py
ADDED
import os

# Set up caching for Hugging Face models.
# Note: TRANSFORMERS_CACHE is deprecated in recent transformers releases in
# favor of HF_HOME, so both are set here to cover either version.
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
os.environ["HF_HOME"] = "./.cache"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU usage

import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image, ImageEnhance
from ultralytics import YOLO
from torchvision.transforms.functional import InterpolationMode
import torchvision.transforms as T
from transformers import AutoModel, AutoTokenizer
import gc

# Import prompts from prompts.py
from prompts import front as front_prompt, back as back_prompt

# ---------------------------
# HUGGING FACE MODEL SETUP (CPU)
# ---------------------------
path = "OpenGVLab/InternVL2_5-2B"
cache_folder = "./.cache"

# Load the Vision AI model and tokenizer globally.
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.float32,
    trust_remote_code=True
).eval().to("cpu")

tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False
)

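# A 2B-parameter model is a tight fit on a CPU-only Space. A lower-memory load
# is sketched below; low_cpu_mem_usage is a standard transformers option, not
# something this repo uses, so treat the variant as an assumption:
#
#   model = AutoModel.from_pretrained(
#       path,
#       cache_dir=cache_folder,
#       torch_dtype=torch.float32,
#       low_cpu_mem_usage=True,  # stream weights instead of materializing twice
#       trust_remote_code=True,
#   ).eval()
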
# ---------------------------
# YOLO MODEL INITIALIZATION
# ---------------------------
model_path = "best.pt"
modelY = YOLO(model_path)
modelY.to('cpu')  # Explicitly move model to CPU

def preprocessing(image):
    """Apply enhancement filters and resize to a fixed 448 px width."""
    image = Image.fromarray(np.array(image))  # Accept PIL images or numpy arrays
    image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
    image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
    image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness

    # Resize to width 448, preserving the aspect ratio.
    width = 448
    aspect_ratio = image.height / image.width
    height = int(width * aspect_ratio)
    image = image.resize((width, height))
    return image

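# Worked example of the resize (hypothetical input): a 1280x960 photo has
# aspect ratio 960/1280 = 0.75, so it comes out at 448x336 (448 * 0.75 = 336).
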
def imageRotation(image):
    """Rotate portrait images to landscape if height exceeds width."""
    if image.height > image.width:
        # PIL rotates counter-clockwise; expand=True grows the canvas so
        # no pixels are clipped.
        return image.rotate(90, expand=True)
    return image

def detect_document(image):
    """Detect the front/back of the document using YOLO."""
    image_np = np.array(image)
    results = modelY(image_np, conf=0.85, device='cpu')

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])  # Plain float rather than a 0-dim tensor
            cls = int(box.cls[0])
            class_name = modelY.names[cls]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            # Draw the box and label on the annotated copy of the image.
            cv2.rectangle(image_np, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image_np, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Report any side of the card that was not detected.
    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image_np), labels, bounding_boxes

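# Illustrative return values for a photo showing only the front of the card
# (all numbers hypothetical):
#   labels         -> ["front 0.91", "Missing: back"]
#   bounding_boxes -> [(34, 50, 410, 280, "front", 0.91)]
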
def crop_image(image, bounding_boxes):
    """Crop detected bounding boxes from the image, keyed by class name."""
    cropped_images = {}
    image_np = np.array(image)
    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped = image_np[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)
    return cropped_images

+
# ---------------------------
|
110 |
+
# VISION AI API FUNCTIONS
|
111 |
+
# ---------------------------
|
112 |
+
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
113 |
+
IMAGENET_STD = (0.229, 0.224, 0.225)
|
114 |
+
|
115 |
+
def build_transform(input_size):
|
116 |
+
transform = T.Compose([
|
117 |
+
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
|
118 |
+
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
|
119 |
+
T.ToTensor(),
|
120 |
+
T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
|
121 |
+
])
|
122 |
+
return transform
|
123 |
+
|
def load_image(image_file):
    """Convert a PIL image into a normalized [1, 3, 448, 448] float tensor."""
    transform = build_transform(input_size=448)
    pixel_values = transform(image_file).unsqueeze(0)  # Add batch dimension
    return pixel_values

def vision_ai_api(image, doc_type):
    """Run the model with a prompt chosen by the detected document side."""
    pixel_values = load_image(image).to(torch.float32).to("cpu")
    generation_config = dict(max_new_tokens=512, do_sample=True)

    if doc_type == "front":
        question = front_prompt
    elif doc_type == "back":
        question = back_prompt
    else:
        question = "Please provide document details."

    print("Before requesting model...")
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    print("After requesting model...", response)

    # Clear memory between requests.
    del pixel_values
    gc.collect()  # Force garbage collection
    if torch.cuda.is_available():  # No-op here since the GPU is disabled
        torch.cuda.empty_cache()

    return f'Assistant: {response}'

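# The chat() call above follows the usage shown on the InternVL2_5 model card;
# the remote code also accepts optional history arguments for multi-turn use,
# e.g. (a sketch, not exercised in this app):
#
#   response, history = model.chat(tokenizer, pixel_values, question,
#                                  generation_config, history=None,
#                                  return_history=True)
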
# ---------------------------
# PREDICTION PIPELINE
# ---------------------------
def predict(image):
    """Pipeline: Preprocess → Rotate → Detect → Crop → Vision AI call."""
    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {"front": front_result, "back": back_result}
    # Prefer the front crop for display, then the back crop, then the
    # annotated detection image.
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, labels, api_results

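# Local smoke test bypassing the Gradio UI (hypothetical file name):
#
#   from PIL import Image
#   img, labels, results = predict(Image.open("license_sample.jpg"))
#   print(labels)
#   print(results)
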
# ---------------------------
# GRADIO INTERFACE LAUNCH
# ---------------------------
iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)"
)

iface.launch()