import os

# Set up caching for Hugging Face models
os.environ["TRANSFORMERS_CACHE"] = "./.cache"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU usage

import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image, ImageEnhance
from ultralytics import YOLO
from torchvision.transforms.functional import InterpolationMode
import torchvision.transforms as T
from transformers import AutoModel, AutoTokenizer
import gc

# Import prompts from prompts.py
from prompts import front as front_prompt, back as back_prompt

# ---------------------------
# HUGGING FACE MODEL SETUP (CPU)
# ---------------------------
path = "OpenGVLab/InternVL2_5-1B"
cache_folder = "./.cache"

# Load the Vision AI model and tokenizer globally.
model = AutoModel.from_pretrained(
    path,
    cache_dir=cache_folder,
    torch_dtype=torch.float32,
    trust_remote_code=True
).eval().to("cpu")

tokenizer = AutoTokenizer.from_pretrained(
    path,
    cache_dir=cache_folder,
    trust_remote_code=True,
    use_fast=False
)
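
# Loading a ~1B-parameter model in float32 on CPU needs roughly 4 GB of RAM;
# passing low_cpu_mem_usage=True to from_pretrained (a standard transformers
# flag, not used in the original Space) can reduce the peak during loading.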

# ---------------------------
# YOLO MODEL INITIALIZATION
# ---------------------------
model_path = "best.pt"
modelY = YOLO(model_path)
modelY.to('cpu')  # Explicitly move model to CPU
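
# The pipeline below assumes the custom weights in best.pt were trained with
# "front" and "back" classes; a quick sanity check at startup:
#
#     print(modelY.names)  # expected to include {"front", "back"}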

def preprocessing(image):
    """Apply enhancement filters and resize."""
    image = Image.fromarray(np.array(image))  # Accept NumPy arrays or PIL images
    image = ImageEnhance.Sharpness(image).enhance(2.0)   # Increase sharpness
    image = ImageEnhance.Contrast(image).enhance(1.5)    # Increase contrast
    image = ImageEnhance.Brightness(image).enhance(0.8)  # Reduce brightness
    # Resize to a fixed width while preserving the aspect ratio
    width = 448
    aspect_ratio = image.height / image.width
    height = int(width * aspect_ratio)
    image = image.resize((width, height))
    return image

def imageRotation(image):
    """Rotate a portrait image to landscape if height exceeds width."""
    if image.height > image.width:
        return image.rotate(90, expand=True)  # PIL rotates counter-clockwise
    return image

def detect_document(image):
    """Detect front/back of the document using YOLO."""
    image_np = np.array(image)
    results = modelY(image_np, conf=0.70, device='cpu')

    detected_classes = set()
    labels = []
    bounding_boxes = []
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            conf = float(box.conf[0])
            cls = int(box.cls[0])
            class_name = modelY.names[cls]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            # Draw the box and label onto the image for display
            cv2.rectangle(image_np, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image_np, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        labels.append(f"Missing: {', '.join(missing_classes)}")

    return Image.fromarray(image_np), labels, bounding_boxes

def crop_image(image, bounding_boxes):
    """Crop detected bounding boxes from the image."""
    cropped_images = {}
    image_np = np.array(image)
    for (x1, y1, x2, y2, class_name, conf) in bounding_boxes:
        cropped = image_np[y1:y2, x1:x2]
        cropped_images[class_name] = Image.fromarray(cropped)
    return cropped_images
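
# Note: cropped_images is keyed by class name, so if YOLO returns several
# boxes for the same side, only the last one is kept. A defensive clamp of
# each box to the frame (a minimal sketch; ultralytics normally clips boxes
# to the image already) would look like:
#
#     h, w = image_np.shape[:2]
#     x1, y1 = max(0, x1), max(0, y1)
#     x2, y2 = min(w, x2), min(h, y2)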

# ---------------------------
# VISION AI API FUNCTIONS
# ---------------------------
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    ])
    return transform
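
# Normalize maps each channel to roughly zero mean / unit variance using the
# ImageNet statistics above: out = (pixel - mean) / std. For example, a fully
# red pixel's R channel becomes (1.0 - 0.485) / 0.229 ≈ 2.25.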

def load_image(image_file):
    transform = build_transform(input_size=448)
    pixel_values = transform(image_file).unsqueeze(0)  # Add batch dimension
    return pixel_values
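
# This is a simplified single-tile loader: InternVL's reference pipeline tiles
# large images dynamically, but one 448x448 tile is adequate for a cropped card.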

def vision_ai_api(image, doc_type):
    """Run the model using a dynamic prompt based on detected doc type."""
    pixel_values = load_image(image).to(torch.float32).to("cpu")
    generation_config = dict(max_new_tokens=256, do_sample=True)

    if doc_type == "front":
        question = front_prompt
    elif doc_type == "back":
        question = back_prompt
    else:
        question = "Please provide document details."

    print("Before requesting model...")
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    print("After requesting model...", response)

    # Clear memory
    del pixel_values
    gc.collect()  # Force garbage collection
    if torch.cuda.is_available():  # No-op here, since the GPU is disabled
        torch.cuda.empty_cache()

    return response
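
# Field extraction is usually wanted deterministic; greedy decoding (a
# suggested tweak, not the original setting) trades diversity for stability:
#
#     generation_config = dict(max_new_tokens=256, do_sample=False)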

# ---------------------------
# PREDICTION PIPELINE
# ---------------------------
def predict(image):
    """Pipeline: Preprocess → Detect → Crop → Vision AI API call."""
    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)
    cropped_images = crop_image(rotated_image, bounding_boxes)

    front_result, back_result = None, None
    if "front" in cropped_images:
        front_result = vision_ai_api(cropped_images["front"], "front")
    if "back" in cropped_images:
        back_result = vision_ai_api(cropped_images["back"], "back")

    api_results = {"front": front_result, "back": back_result}
    single_image = cropped_images.get("front") or cropped_images.get("back") or detected_image
    return single_image, "\n".join(labels), api_results
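
# A quick local smoke test (hypothetical file name, for illustration only):
#
#     if __name__ == "__main__":
#         img, labels, results = predict(np.array(Image.open("sample_license.jpg")))
#         print(labels, results)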

# ---------------------------
# GRADIO INTERFACE LAUNCH
# ---------------------------
iface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
    title="License Field Detection (Front & Back Card)"
)
iface.launch()
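
# On CPU, each request can take tens of seconds; enabling Gradio's request
# queue (a suggested hardening, not in the original Space) keeps the UI
# responsive under concurrent use:
#
#     iface.queue().launch()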