FEAT: Multipage app with brief description
- Sidebar showing About and Denoise pages
- Disclaimer for VGG sample images
- Improve OCR inference for VGG images
- .gitattributes +3 -0
- .gitignore +2 -0
- About.py +68 -0
- README.md +1 -1
- cleaning_overview.png +0 -0
- ocr_libs.py +11 -11
- app.py → pages/1_Denoise.py +27 -76
- preprocess_utils.py +52 -0
.gitattributes
CHANGED
@@ -2,3 +2,6 @@
 models/prep_4.pt filter=lfs diff=lfs merge=lfs -text
 models/prep_50.pt filter=lfs diff=lfs merge=lfs -text
 models/prep_8.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_4.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_50.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_8.pt filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -156,6 +156,8 @@ ocr_app/
 
 .DS_Store
 
+*.drawio
+
 *.pyc
 
 # PyCharm
About.py
ADDED
@@ -0,0 +1,68 @@
+import streamlit as st
+
+st.set_page_config(
+    page_title="About",
+    page_icon="👋",
+    layout="wide"
+)
+
+st.image("cleaning_overview.png")
+st.title("Document Image Cleaning for Black-Box OCR Engines")
+st.sidebar.success("Select a page.")
+
+
+st.markdown(
+    """
+    Black-box open-source OCR engines ([Tesseract](https://github.com/tesseract-ocr/tesseract)) and commercial OCR APIs ([Google Vision API](https://cloud.google.com/vision/docs/ocr))
+    are difficult to retrain with new data. We can [train a document image preprocessor](https://arxiv.org/abs/2105.07983)
+    for black-box OCR engines by approximating the gradient of the black box with a proxy model. However, the OCR engine
+    needs to be queried for all samples, which is computationally/financially [expensive](https://cloud.google.com/vision/pricing). Here, we show that the documents
+    can be preprocessed using just 4% of the total OCR queries.
+
+    👈 Select **Denoise** in the sidebar to see document preprocessing with 100%, 8% and 4% OCR query budgets.
+    """
+)
+
+# Want to learn more?
+# - Check out [streamlit.io](https://streamlit.io)
+# - Jump into our [documentation](https://docs.streamlit.io)
+# - Ask a question in our [community
+# forums](https://discuss.streamlit.io)
+# ### See more complex demos
+# - Use a neural net to [analyze the Udacity Self-driving Car Image
+# Dataset](https://github.com/streamlit/demo-self-driving)
+# - Explore a [New York City rideshare dataset](https://github.com/streamlit/demo-uber-nyc-pickups)
+
+# st.write("")
+# st.write("")
+# st.write("")
+
+# st.markdown("##### This app allows you to compare, from a given picture, the results of different solutions:")
+# st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
+# st.write("")
+# st.write("")
+
+# st.markdown(''' The 1st step is to choose the language for the text recognition (not all solutions \
+# support the same languages), and then choose the picture to consider. It is possible to upload a file, \
+# to take a picture, or to use a demo file. \
+# It is then possible to change the default values for the text area detection process, \
+# before launching the detection task for each solution.''')
+# st.write("")
+
+# st.markdown(''' The different results are then presented. The 2nd step is to choose one of these \
+# detection results, in order to carry out the text recognition process there. It is also possible to change \
+# the default settings for each solution.''')
+# st.write("")
+
+# st.markdown("###### The recognition results appear in 2 formats:")
+# st.markdown(''' - a visual format resumes the initial image, replacing the detected areas with \
+# the recognized text. The background is + or - strongly colored in green according to the \
+# confidence level of the recognition.
+# A slider allows you to change the font size, another \
+# allows you to modify the confidence threshold above which the text color changes: if it is at \
+# 70% for example, then all the texts with a confidence threshold higher or equal to 70 will appear \
+# in white, in black otherwise.''')
+
+# st.markdown(" - a detailed format presents the results in a table, for each text box detected. \
+# It is possible to download this results in a local csv file.")
+
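The About page's pitch is that a document-image preprocessor can be trained against a black-box OCR engine by approximating the engine's gradient with a differentiable proxy model, and that this still works with a small OCR query budget. A minimal, hypothetical sketch of that idea follows; it is not this Space's or the paper's actual training code, and `TinyPreprocessor`, `TinyProxy` and `black_box_ocr_error` are illustrative stand-ins for the UNet in `models/`, the proxy recognizer, and real OCR error feedback.

```python
# Sketch only: a differentiable proxy stands in for the black-box OCR engine
# so gradients can reach the preprocessor. All names here are illustrative.
import torch
import torch.nn as nn

class TinyPreprocessor(nn.Module):
    """Stand-in for the UNet-style preprocessor (models/model_unet.py)."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(),
            nn.Conv2d(8, 1, 3, padding=1), nn.Sigmoid(),
        )
    def forward(self, x):
        return self.net(x)

class TinyProxy(nn.Module):
    """Differentiable proxy that mimics the black-box OCR's error score."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 8, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 1),
        )
    def forward(self, x):
        return self.net(x).squeeze(1)

def black_box_ocr_error(images):
    # Placeholder for a real, non-differentiable and costly OCR query
    # (e.g. Tesseract or a cloud OCR API returning a per-image error rate).
    with torch.no_grad():
        return images.mean(dim=(1, 2, 3))  # dummy "error" signal

preprocessor, proxy = TinyPreprocessor(), TinyProxy()
opt_pre = torch.optim.Adam(preprocessor.parameters(), lr=1e-3)
opt_proxy = torch.optim.Adam(proxy.parameters(), lr=1e-3)

batch = torch.rand(4, 1, 32, 128)          # noisy word images
cleaned = preprocessor(batch)

# 1) Fit the proxy to the black-box scores. This step is what costs OCR
#    queries, so a small query budget means scoring only a subset of images.
proxy_loss = nn.functional.mse_loss(proxy(cleaned.detach()),
                                    black_box_ocr_error(cleaned))
opt_proxy.zero_grad(); proxy_loss.backward(); opt_proxy.step()

# 2) Update the preprocessor through the differentiable proxy.
pre_loss = proxy(preprocessor(batch)).mean()
opt_pre.zero_grad(); pre_loss.backward(); opt_pre.step()
```

Roughly speaking, the 4% and 8% budgets advertised on the About page correspond to how often step 1 queries the real OCR engine.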
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
 colorTo: yellow
 sdk: streamlit
 sdk_version: 1.19.0
-app_file:
+app_file: About.py
 pinned: false
 ---
 
cleaning_overview.png
ADDED
ocr_libs.py
CHANGED
@@ -13,7 +13,7 @@ class tess_ocr:
         boxes = boxes.dropna().to_dict(orient='list')
         text_labels = boxes['text']
         text_boxes = list()
-        for i in range(len(
+        for i in range(len(text_labels)):
             x1, y1 = boxes["left"][i], boxes["top"][i]
             x2, y2 = x1 + boxes["width"][i], y1 + boxes["height"][i]
             text_boxes.append({"x1": x1, "y1": y1, "x2": x2 , "y2": y2})
@@ -22,17 +22,17 @@ class tess_ocr:
             crops.append(image.crop((box['x1'], box['y1'], box['x2'], box['y2'],)))
         return list(zip(crops, text_boxes))
 
-
-        # boxes = self.reader.readtext(np.asarray(image))
-        # print(boxes)
-        # return []
-
-    def extract_text(self, image, boxes):
-        OFFSET = 6
+    def extract_text(self, image, boxes, dataset="POS"):
         texts = list()
-        for i, (im, box) in enumerate(boxes):
-            cropped = image.crop((box["x1"] - OFFSET, box["y1"] - OFFSET , box["x2"] + OFFSET, box["y2"] + OFFSET))
-            ocrResult = pytesseract.image_to_string(cropped, config='--oem 1 --psm 7').strip()
+        if dataset == "POS":
+            OFFSET = 6
+            for i, (im, box) in enumerate(boxes):
+                cropped = image.crop((box["x1"] - OFFSET, box["y1"] - OFFSET , box["x2"] + OFFSET, box["y2"] + OFFSET))
+                ocrResult = pytesseract.image_to_string(cropped, config='--oem 1 --psm 7').strip()
+                texts.append(ocrResult)
+        else:
+            ocrResult = pytesseract.image_to_string(image, config='--oem 1 --psm 7').strip()
+            print(ocrResult)
             texts.append(ocrResult)
         return texts
 
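The new `extract_text` branches on the dataset: for POS receipts it crops each detected box with a small margin and runs Tesseract in single-line mode, while for VGG word images it recognizes the whole image at once. A short usage sketch of the per-crop path, with a made-up file name and box coordinates:

```python
# Hedged usage sketch of the per-crop recognition path shown in the diff above.
from PIL import Image
import pytesseract

image = Image.open("sample_receipt.png").convert("L")    # hypothetical file
box = {"x1": 40, "y1": 60, "x2": 180, "y2": 90}           # one detected word box
OFFSET = 6                                                # margin used for POS crops

crop = image.crop((box["x1"] - OFFSET, box["y1"] - OFFSET,
                   box["x2"] + OFFSET, box["y2"] + OFFSET))
# --oem 1: LSTM engine, --psm 7: treat the crop as a single text line
text = pytesseract.image_to_string(crop, config="--oem 1 --psm 7").strip()
print(text)
```

`--oem 1` selects Tesseract's LSTM engine and `--psm 7` tells it to treat the input as a single line of text, which suits word-level crops.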
app.py → pages/1_Denoise.py
RENAMED
@@ -1,66 +1,18 @@
 import streamlit as st
 st.set_page_config(layout="wide")
 from models.model_unet import UNet
+import sys
+sys.path.append("../")
 
-from PIL import Image
+from PIL import Image
 from streamlit_image_select import image_select
 import torchvision.transforms as transforms
-import
+from preprocess_utils import process_image_pos, process_image_vgg
 import torch
+from PIL import ImageOps
 from ocr_libs import tess_ocr
-
-class PadWhite(object):
-    def __init__(self, size):
-        assert isinstance(size, (int, tuple))
-        if isinstance(size, tuple):
-            self.height, self.width = size
-        elif isinstance(size, int):
-            self.height = self.width = size
-
-    def __call__(self, img):
-        if img.size[0] > self.width or img.size[1] > self.height:
-            img.thumbnail((self.width, self.height))
-        delta_width = self.width - img.size[0]
-        delta_height = self.height - img.size[1]
-        pad_width = delta_width // 2
-        pad_height = delta_height // 2
-        padding = (pad_width, pad_height, delta_width -
-                   pad_width, delta_height-pad_height)
-        return ImageOps.expand(img, padding, fill=255)
-
-def process_image_pos(image):
-    target_size = (400, 512)
-    # image = Image.open(img_name).convert("L")
-    w, h = image.size
-    top_padding, left_padding = 0, 0
-
-    if h <= target_size[0] or w <= target_size[1]:
-        delta_height = target_size[0] - h
-        delta_width = target_size[1] - w
-        pad_height = delta_height // 2
-        pad_width = delta_width // 2
-        # (left, top, right, bottom)
-        padding = (pad_width, pad_height, delta_width -
-                   pad_width, delta_height-pad_height)
-        image = ImageOps.expand(image, padding, fill=255)
-    elif h > 400 or w > 500:
-        print("Height screwed")
-
-    transform = transforms.ToTensor()
-    image = transform(image)
-    # image = torch.tensor(image)
-    return image
-
-def process_image_vgg(image):
-    input_size = (32, 128)
-    transform = transforms.Compose([
-        PadWhite(input_size),
-        transforms.ToTensor(),
-    ])
-    image = transform(image)
-    return image
-
 
+st.sidebar.success("Select a page.")
 
 @st.cache_resource
 def load_models(model_paths):
@@ -101,6 +53,8 @@ if dataset:
 
     image_paths = [f"{image_folder}/{i}.png" for i in range(NUM_IMAGES)]
     img = None
+    if dataset == "VGG":
+        st.markdown("**Due to display issues, you can view the full image by _right-click_ -> _open image in new tab_**")
 
     img_index = image_select(
         label="Select Image",
@@ -120,14 +74,13 @@ if dataset:
     if img is None and img_index >= 0:
         img = Image.open(image_paths[img_index]).convert("L")
 
-
     cols = st.columns(4)
 
     # Set Text
-    cols[0].
-    cols[1].
-    cols[2].
-    cols[3].
+    cols[0].markdown("### Input Image")
+    cols[1].markdown("### Full Training")
+    cols[2].markdown("### 8%")
+    cols[3].markdown("### 4%")
     models = load_models(model_paths)
 
    if img is not None:
@@ -142,29 +95,27 @@ if dataset:
 
     with st.spinner('Text Detection and Recognition in progress ...'):
         text_boxes = get_text_boxes(ocr, pil_image)
-        if not len(text_boxes):
+        if not len(text_boxes) and dataset == "POS":
            st.text("No text boxes extracted")
         else:
+            st.markdown("***")
             all_texts = list()
-            all_texts.append(ocr.extract_text(pil_image, text_boxes))
+            all_texts.append(ocr.extract_text(pil_image, text_boxes, dataset))
             for i in range(3):
-                all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes))
-                # text_boxes_more = get_text_boxes(ocr, clned_imgs[3])
+                all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes, dataset))
             title_cols = st.columns(5)
             headings = ["Word Image", "Original", "Cleaned (100%)", "Cleaned (8%)", "Cleaned (4%)"]
             for i, heading in enumerate(headings):
-                title_cols[i].markdown(f"
-
-
-
+                title_cols[i].markdown(f"### {heading}")
+
+            if dataset == "POS":
+                for i, box in enumerate(text_boxes):
+                    txt_box_cols = st.columns(5)
+                    txt_box_cols[0].image(box[0], use_column_width="always")
+                    for j in range(4):
+                        txt_box_cols[j + 1].text(all_texts[j][i])
+            else:
                 txt_box_cols = st.columns(5)
-                txt_box_cols[0].image(
+                txt_box_cols[0].image(pil_image, use_column_width="always")
                 for j in range(4):
-                    txt_box_cols[j + 1].text(all_texts[j][
-
-
-
-
-
-
-
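Renaming `app.py` to `pages/1_Denoise.py` is what turns this into a Streamlit multipage app: `About.py` becomes the entry point (see the `app_file` change in `README.md` above) and every script under `pages/` is listed in the sidebar. A stripped-down sketch of the resulting layout; the `st.radio` dataset selector is hypothetical, since the real selector is defined outside the hunks shown here:

```python
# Assumed repository layout after this commit (for illustration):
#
#   About.py            <- Space entry point ("app_file: About.py" in README.md)
#   pages/1_Denoise.py  <- listed as "Denoise" in the sidebar
#   preprocess_utils.py
#   ocr_libs.py
#   models/             <- UNet code and LFS-tracked checkpoints
#
# Skeleton of a page script in this style:
import streamlit as st

st.set_page_config(layout="wide")        # first Streamlit call on the page

@st.cache_resource                        # cache loaded checkpoints across reruns
def load_models(model_paths):
    ...                                   # e.g. torch.load(...) per checkpoint path

st.sidebar.success("Select a page.")
dataset = st.radio("Dataset", ["POS", "VGG"])   # hypothetical selector
if dataset:
    st.write(f"Showing the {dataset} demo")
```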
preprocess_utils.py
ADDED
@@ -0,0 +1,52 @@
+import torchvision.transforms as transforms
+from PIL import ImageOps
+
+class PadWhite(object):
+    def __init__(self, size):
+        assert isinstance(size, (int, tuple))
+        if isinstance(size, tuple):
+            self.height, self.width = size
+        elif isinstance(size, int):
+            self.height = self.width = size
+
+    def __call__(self, img):
+        if img.size[0] > self.width or img.size[1] > self.height:
+            img.thumbnail((self.width, self.height))
+        delta_width = self.width - img.size[0]
+        delta_height = self.height - img.size[1]
+        pad_width = delta_width // 2
+        pad_height = delta_height // 2
+        padding = (pad_width, pad_height, delta_width -
+                   pad_width, delta_height-pad_height)
+        return ImageOps.expand(img, padding, fill=255)
+
+def process_image_pos(image):
+    target_size = (400, 512)
+    # image = Image.open(img_name).convert("L")
+    w, h = image.size
+    top_padding, left_padding = 0, 0
+
+    if h <= target_size[0] or w <= target_size[1]:
+        delta_height = target_size[0] - h
+        delta_width = target_size[1] - w
+        pad_height = delta_height // 2
+        pad_width = delta_width // 2
+        # (left, top, right, bottom)
+        padding = (pad_width, pad_height, delta_width -
+                   pad_width, delta_height-pad_height)
+        image = ImageOps.expand(image, padding, fill=255)
+    elif h > 400 or w > 500:
+        print("Height screwed")
+
+    transform = transforms.ToTensor()
+    image = transform(image)
+    return image
+
+def process_image_vgg(image):
+    input_size = (32, 128)
+    transform = transforms.Compose([
+        PadWhite(input_size),
+        transforms.ToTensor(),
+    ])
+    image = transform(image)
+    return image
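For reference, a short, hypothetical usage example of these helpers, roughly as the Denoise page applies them (the image file names are made up): `process_image_vgg` white-pads a word crop to 32×128 and returns a tensor, while `process_image_pos` pads a receipt up to 400×512.

```python
# Hedged usage example for the helpers above; the file names are illustrative.
from PIL import Image
from preprocess_utils import process_image_pos, process_image_vgg

word_img = Image.open("word.png").convert("L")      # hypothetical VGG word image
vgg_tensor = process_image_vgg(word_img)            # shape (1, 32, 128), white-padded
receipt = Image.open("receipt.png").convert("L")    # hypothetical POS receipt
pos_tensor = process_image_pos(receipt)             # padded to 400 x 512 if smaller
print(vgg_tensor.shape, pos_tensor.shape)
```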