gtata committed on
Commit 287cf36 · Parent: 4b8a73f

FEAT: Multipage app with brief description


- Sidebar showing About and Denoise pages
- Disclaimer for VGG sample images
- Improved OCR inference for VGG images

.gitattributes CHANGED
@@ -2,3 +2,6 @@
 models/prep_4.pt filter=lfs diff=lfs merge=lfs -text
 models/prep_50.pt filter=lfs diff=lfs merge=lfs -text
 models/prep_8.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_4.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_50.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_8.pt filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -156,6 +156,8 @@ ocr_app/
 
 .DS_Store
 
+*.drawio
+
 *.pyc
 
 # PyCharm
About.py ADDED
@@ -0,0 +1,68 @@
+import streamlit as st
+
+st.set_page_config(
+    page_title="About",
+    page_icon="👋",
+    layout="wide"
+)
+
+st.image("cleaning_overview.png")
+st.title("Document Image Cleaning for Black-Box OCR Engines")
+st.sidebar.success("Select a page.")
+
+
+st.markdown(
+    """
+    Black-box open-source OCR engines ([Tesseract](https://github.com/tesseract-ocr/tesseract)) and commercial OCR APIs ([Google Vision API](https://cloud.google.com/vision/docs/ocr))
+    are difficult to retrain with new data. We can [train a document image preprocessor](https://arxiv.org/abs/2105.07983)
+    for black-box OCR engines by approximating the gradient of the black-box using a proxy model. However, the OCR engine
+    needs to be queried for all samples, which is computationally/financially [expensive](https://cloud.google.com/vision/pricing). Here, we show that the documents
+    can be preprocessed using just 4% of the total OCR queries.
+
+    👈 Select **Denoise** in the sidebar to see document preprocessing with a 100\%, 8\% or 4\% OCR query budget.
+    """
+)
+
+# Want to learn more?
+# - Check out [streamlit.io](https://streamlit.io)
+# - Jump into our [documentation](https://docs.streamlit.io)
+# - Ask a question in our [community
+#   forums](https://discuss.streamlit.io)
+# ### See more complex demos
+# - Use a neural net to [analyze the Udacity Self-driving Car Image
+#   Dataset](https://github.com/streamlit/demo-self-driving)
+# - Explore a [New York City rideshare dataset](https://github.com/streamlit/demo-uber-nyc-pickups)
+
+# st.write("")
+# st.write("")
+# st.write("")
+
+# st.markdown("##### This app allows you to compare, from a given picture, the results of different solutions:")
+# st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
+# st.write("")
+# st.write("")
+
+# st.markdown(''' The 1st step is to choose the language for the text recognition (not all solutions \
+# support the same languages), and then choose the picture to consider. It is possible to upload a file, \
+# to take a picture, or to use a demo file. \
+# It is then possible to change the default values for the text area detection process, \
+# before launching the detection task for each solution.''')
+# st.write("")
+
+# st.markdown(''' The different results are then presented. The 2nd step is to choose one of these \
+# detection results, in order to carry out the text recognition process there. It is also possible to change \
+# the default settings for each solution.''')
+# st.write("")
+
+# st.markdown("###### The recognition results appear in 2 formats:")
+# st.markdown(''' - a visual format reproduces the initial image, replacing the detected areas with \
+# the recognized text. The background is more or less strongly colored in green according to the \
+# confidence level of the recognition.
+# A slider allows you to change the font size, another \
+# allows you to modify the confidence threshold above which the text color changes: if it is at \
+# 70% for example, then all the texts with a confidence level higher than or equal to 70 will appear \
+# in white, in black otherwise.''')
+
+# st.markdown(" - a detailed format presents the results in a table, for each text box detected. \
+# It is possible to download these results to a local csv file.")
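
Editor's note: to make the proxy-gradient idea described in About.py concrete, here is a rough, hypothetical sketch; it is not code from this commit. The preprocessor stands in for the repository's UNet, and the proxy architecture, shapes, and training loop are all assumptions. The proxy would be fit on (cleaned image, measured OCR error) pairs collected from a limited budget of real OCR queries, which is what the 100% / 8% / 4% budgets above refer to.

import torch
import torch.nn as nn

# Hypothetical sketch: train a preprocessor through a frozen, differentiable
# proxy that approximates the black-box OCR engine's error on cleaned images.
preprocessor = nn.Sequential(  # stand-in for the repository's UNet
    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
    nn.Conv2d(16, 1, 3, padding=1), nn.Sigmoid(),
)
proxy = nn.Sequential(  # hypothetical OCR-error regressor
    nn.Conv2d(1, 8, 3, stride=2), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 1),
)
for p in proxy.parameters():
    p.requires_grad_(False)  # freeze the proxy; only the preprocessor learns

opt = torch.optim.Adam(preprocessor.parameters(), lr=1e-4)

def train_step(noisy_batch):
    # Gradients flow through the frozen proxy back into the preprocessor,
    # approximating the gradient of the non-differentiable OCR engine.
    cleaned = preprocessor(noisy_batch)
    loss = proxy(cleaned).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()
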
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
 colorTo: yellow
 sdk: streamlit
 sdk_version: 1.19.0
-app_file: app.py
+app_file: About.py
 pinned: false
 ---
 
cleaning_overview.png ADDED
ocr_libs.py CHANGED
@@ -13,7 +13,7 @@ class tess_ocr:
         boxes = boxes.dropna().to_dict(orient='list')
         text_labels = boxes['text']
         text_boxes = list()
-        for i in range(len(boxes)):
+        for i in range(len(text_labels)):
             x1, y1 = boxes["left"][i], boxes["top"][i]
             x2, y2 = x1 + boxes["width"][i], y1 + boxes["height"][i]
             text_boxes.append({"x1": x1, "y1": y1, "x2": x2, "y2": y2})
@@ -22,17 +22,17 @@ class tess_ocr:
             crops.append(image.crop((box['x1'], box['y1'], box['x2'], box['y2'],)))
         return list(zip(crops, text_boxes))
 
-    # def detect_text(self, image):
-    #     boxes = self.reader.readtext(np.asarray(image))
-    #     print(boxes)
-    #     return []
-
-    def extract_text(self, image, boxes):
-        OFFSET = 6
+    def extract_text(self, image, boxes, dataset="POS"):
         texts = list()
-        for i, (im, box) in enumerate(boxes):
-            cropped = image.crop((box["x1"] - OFFSET, box["y1"] - OFFSET, box["x2"] + OFFSET, box["y2"] + OFFSET))
-            ocrResult = pytesseract.image_to_string(cropped, config='--oem 1 --psm 7')
+        if dataset == "POS":
+            OFFSET = 6
+            for i, (im, box) in enumerate(boxes):
+                cropped = image.crop((box["x1"] - OFFSET, box["y1"] - OFFSET, box["x2"] + OFFSET, box["y2"] + OFFSET))
+                ocrResult = pytesseract.image_to_string(cropped, config='--oem 1 --psm 7').strip()
+                texts.append(ocrResult)
+        else:
+            ocrResult = pytesseract.image_to_string(image, config='--oem 1 --psm 7').strip()
+            print(ocrResult)
             texts.append(ocrResult)
         return texts
 
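
Editor's note: a quick usage sketch of the changed method, under assumptions (the sample paths are hypothetical, and get_text_boxes is the page-level helper seen in pages/1_Denoise.py, assumed in scope here). POS receipts are OCR'd box by box with a 6-pixel margin; VGG word images skip the box loop and are read whole with --psm 7 (single text line).

from PIL import Image
from ocr_libs import tess_ocr

ocr = tess_ocr()

# POS receipts: detect word boxes first, then OCR each padded crop.
pos_img = Image.open("images/POS/0.png").convert("L")  # hypothetical path
boxes = get_text_boxes(ocr, pos_img)                   # helper from the Denoise page
pos_texts = ocr.extract_text(pos_img, boxes, dataset="POS")

# VGG word images are already single words: the box list is ignored and the
# whole image goes through Tesseract in one call.
vgg_img = Image.open("images/VGG/0.png").convert("L")
vgg_texts = ocr.extract_text(vgg_img, [], dataset="VGG")  # returns one string
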
app.py → pages/1_Denoise.py RENAMED
@@ -1,66 +1,18 @@
 import streamlit as st
 st.set_page_config(layout="wide")
 from models.model_unet import UNet
+import sys
+sys.path.append("../")
 
-from PIL import Image, ImageOps
+from PIL import Image
 from streamlit_image_select import image_select
 import torchvision.transforms as transforms
-import os
+from preprocess_utils import process_image_pos, process_image_vgg
 import torch
+from PIL import ImageOps
 from ocr_libs import tess_ocr
-
-class PadWhite(object):
-    def __init__(self, size):
-        assert isinstance(size, (int, tuple))
-        if isinstance(size, tuple):
-            self.height, self.width = size
-        elif isinstance(size, int):
-            self.height = self.width = size
-
-    def __call__(self, img):
-        if img.size[0] > self.width or img.size[1] > self.height:
-            img.thumbnail((self.width, self.height))
-        delta_width = self.width - img.size[0]
-        delta_height = self.height - img.size[1]
-        pad_width = delta_width // 2
-        pad_height = delta_height // 2
-        padding = (pad_width, pad_height, delta_width -
-                   pad_width, delta_height - pad_height)
-        return ImageOps.expand(img, padding, fill=255)
-
-def process_image_pos(image):
-    target_size = (400, 512)
-    # image = Image.open(img_name).convert("L")
-    w, h = image.size
-    top_padding, left_padding = 0, 0
-
-    if h <= target_size[0] or w <= target_size[1]:
-        delta_height = target_size[0] - h
-        delta_width = target_size[1] - w
-        pad_height = delta_height // 2
-        pad_width = delta_width // 2
-        # (left, top, right, bottom)
-        padding = (pad_width, pad_height, delta_width -
-                   pad_width, delta_height - pad_height)
-        image = ImageOps.expand(image, padding, fill=255)
-    elif h > 400 or w > 500:
-        print("Height screwed")
-
-    transform = transforms.ToTensor()
-    image = transform(image)
-    # image = torch.tensor(image)
-    return image
-
-def process_image_vgg(image):
-    input_size = (32, 128)
-    transform = transforms.Compose([
-        PadWhite(input_size),
-        transforms.ToTensor(),
-    ])
-    image = transform(image)
-    return image
-
 
+st.sidebar.success("Select a page.")
 
 @st.cache_resource
 def load_models(model_paths):
@@ -101,6 +53,8 @@ if dataset:
 
     image_paths = [f"{image_folder}/{i}.png" for i in range(NUM_IMAGES)]
    img = None
+    if dataset == "VGG":
+        st.markdown("**Due to display issues, you can view the full image by _right-click_ -> _open image in new tab_**")
 
     img_index = image_select(
         label="Select Image",
@@ -120,14 +74,13 @@ if dataset:
     if img is None and img_index >= 0:
         img = Image.open(image_paths[img_index]).convert("L")
 
-
     cols = st.columns(4)
 
     # Set Text
-    cols[0].text("Input Image")
-    cols[1].text("Full Training")
-    cols[2].text("8%")
-    cols[3].text("4%")
+    cols[0].markdown("### Input Image")
+    cols[1].markdown("### Full Training")
+    cols[2].markdown("### 8%")
+    cols[3].markdown("### 4%")
     models = load_models(model_paths)
 
     if img is not None:
@@ -142,29 +95,27 @@ if dataset:
 
     with st.spinner('Text Detection and Recognition in progress ...'):
         text_boxes = get_text_boxes(ocr, pil_image)
-        if not len(text_boxes):
+        if not len(text_boxes) and dataset == "POS":
            st.text("No text boxes extracted")
         else:
+            st.markdown("***")
             all_texts = list()
-            all_texts.append(ocr.extract_text(pil_image, text_boxes))
+            all_texts.append(ocr.extract_text(pil_image, text_boxes, dataset))
             for i in range(3):
-                all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes))
-            # text_boxes_more = get_text_boxes(ocr, clned_imgs[3])
+                all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes, dataset))
             title_cols = st.columns(5)
             headings = ["Word Image", "Original", "Cleaned (100%)", "Cleaned (8%)", "Cleaned (4%)"]
             for i, heading in enumerate(headings):
-                title_cols[i].markdown(f"## {heading}")
-
-
-            for i, box in enumerate(text_boxes):
+                title_cols[i].markdown(f"### {heading}")
+
+            if dataset == "POS":
+                for i, box in enumerate(text_boxes):
+                    txt_box_cols = st.columns(5)
+                    txt_box_cols[0].image(box[0], use_column_width="always")
+                    for j in range(4):
+                        txt_box_cols[j + 1].text(all_texts[j][i])
+            else:
                 txt_box_cols = st.columns(5)
-                txt_box_cols[0].image(box[0], use_column_width="always")
+                txt_box_cols[0].image(pil_image, use_column_width="always")
                 for j in range(4):
-                    txt_box_cols[j + 1].text(all_texts[j][i])
-
-
-
-
-
-
-
+                    txt_box_cols[j + 1].text(all_texts[j][0])
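
Editor's note: the hunks above call a cached load_models helper that this commit leaves unchanged, so its body is not shown. For orientation, a plausible sketch follows; only the st.cache_resource decorator and the LFS-tracked checkpoint names are confirmed by the diff, while the UNet constructor, checkpoint ordering, and loading details are assumptions.

import streamlit as st
import torch
from models.model_unet import UNet

@st.cache_resource  # load each model once per process, reuse across reruns
def load_models(model_paths):
    models = []
    for path in model_paths:
        model = UNet()  # assumed constructor signature
        model.load_state_dict(torch.load(path, map_location="cpu"))
        model.eval()
        models.append(model)
    return models

# Checkpoints tracked via Git LFS in .gitattributes (order assumed):
models = load_models(["models/vgg_50.pt", "models/vgg_8.pt", "models/vgg_4.pt"])
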
preprocess_utils.py ADDED
@@ -0,0 +1,52 @@
+import torchvision.transforms as transforms
+from PIL import ImageOps
+
+class PadWhite(object):
+    def __init__(self, size):
+        assert isinstance(size, (int, tuple))
+        if isinstance(size, tuple):
+            self.height, self.width = size
+        elif isinstance(size, int):
+            self.height = self.width = size
+
+    def __call__(self, img):
+        if img.size[0] > self.width or img.size[1] > self.height:
+            img.thumbnail((self.width, self.height))
+        delta_width = self.width - img.size[0]
+        delta_height = self.height - img.size[1]
+        pad_width = delta_width // 2
+        pad_height = delta_height // 2
+        padding = (pad_width, pad_height, delta_width -
+                   pad_width, delta_height - pad_height)
+        return ImageOps.expand(img, padding, fill=255)
+
+def process_image_pos(image):
+    target_size = (400, 512)
+    # image = Image.open(img_name).convert("L")
+    w, h = image.size
+    top_padding, left_padding = 0, 0
+
+    if h <= target_size[0] or w <= target_size[1]:
+        delta_height = target_size[0] - h
+        delta_width = target_size[1] - w
+        pad_height = delta_height // 2
+        pad_width = delta_width // 2
+        # (left, top, right, bottom)
+        padding = (pad_width, pad_height, delta_width -
+                   pad_width, delta_height - pad_height)
+        image = ImageOps.expand(image, padding, fill=255)
+    elif h > 400 or w > 500:
+        print("Height screwed")
+
+    transform = transforms.ToTensor()
+    image = transform(image)
+    return image
+
+def process_image_vgg(image):
+    input_size = (32, 128)
+    transform = transforms.Compose([
+        PadWhite(input_size),
+        transforms.ToTensor(),
+    ])
+    image = transform(image)
+    return image
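
Editor's note: a short usage sketch of the new helpers (the sample path is hypothetical). process_image_vgg white-pads a word crop to 32x128 before tensor conversion, matching the VGG models' input size, while process_image_pos center-pads smaller receipt images toward 400x512.

from PIL import Image
from preprocess_utils import process_image_pos, process_image_vgg

img = Image.open("images/VGG/0.png").convert("L")  # hypothetical sample path

vgg_tensor = process_image_vgg(img)  # torch.Size([1, 32, 128])
pos_tensor = process_image_pos(img)  # torch.Size([1, 400, 512]) for small inputs
print(vgg_tensor.shape, pos_tensor.shape)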