gtata committed on
Commit 287cf36 · Parent: 4b8a73f

FEAT: Multipage app with brief description


- Sidebar showing About and Denoise pages
- Disclaimer for VGG sample images
- Improved OCR inference for VGG images

.gitattributes CHANGED
@@ -2,3 +2,6 @@
 models/prep_4.pt filter=lfs diff=lfs merge=lfs -text
 models/prep_50.pt filter=lfs diff=lfs merge=lfs -text
 models/prep_8.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_4.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_50.pt filter=lfs diff=lfs merge=lfs -text
+models/vgg_8.pt filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -156,6 +156,8 @@ ocr_app/
 
 .DS_Store
 
+*.drawio
+
 *.pyc
 
 # PyCharm
About.py ADDED
@@ -0,0 +1,68 @@
+import streamlit as st
+
+st.set_page_config(
+    page_title="About",
+    page_icon="👋",
+    layout="wide"
+)
+
+st.image("cleaning_overview.png")
+st.title("Document Image Cleaning for Black-Box OCR Engines")
+st.sidebar.success("Select a page.")
+
+
+st.markdown(
+    """
+    Black-box open-source OCR engines ([Tesseract](https://github.com/tesseract-ocr/tesseract)) and commercial OCR APIs ([Google Vision API](https://cloud.google.com/vision/docs/ocr))
+    are difficult to retrain with new data. We can [train a document image preprocessor](https://arxiv.org/abs/2105.07983)
+    for black-box OCR engines by approximating the gradient of the black-box using a proxy model. However, the OCR engine
+    needs to be queried for all samples, which is computationally/financially [expensive](https://cloud.google.com/vision/pricing). Here, we show that the documents
+    can be preprocessed using just 4% of the total OCR queries.
+
+    👈 Select **Denoise** in the sidebar to see document preprocessing with a 100\%, 8\% or 4\% OCR query budget.
+    """
+)
+
+# Want to learn more?
+# - Check out [streamlit.io](https://streamlit.io)
+# - Jump into our [documentation](https://docs.streamlit.io)
+# - Ask a question in our [community
+#   forums](https://discuss.streamlit.io)
+# ### See more complex demos
+# - Use a neural net to [analyze the Udacity Self-driving Car Image
+#   Dataset](https://github.com/streamlit/demo-self-driving)
+# - Explore a [New York City rideshare dataset](https://github.com/streamlit/demo-uber-nyc-pickups)
+
+# st.write("")
+# st.write("")
+# st.write("")
+
+# st.markdown("##### This app allows you to compare, from a given picture, the results of different solutions:")
+# st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
+# st.write("")
+# st.write("")
+
+# st.markdown(''' The 1st step is to choose the language for the text recognition (not all solutions \
+# support the same languages), and then choose the picture to consider. It is possible to upload a file, \
+# to take a picture, or to use a demo file. \
+# It is then possible to change the default values for the text area detection process, \
+# before launching the detection task for each solution.''')
+# st.write("")
+
+# st.markdown(''' The different results are then presented. The 2nd step is to choose one of these \
+# detection results, in order to carry out the text recognition process there. It is also possible to change \
+# the default settings for each solution.''')
+# st.write("")
+
+# st.markdown("###### The recognition results appear in 2 formats:")
+# st.markdown(''' - a visual format reproduces the initial image, replacing the detected areas with \
+# the recognized text. The background is more or less strongly colored in green according to the \
+# confidence level of the recognition.
+# A slider allows you to change the font size, another \
+# allows you to modify the confidence threshold above which the text color changes: if it is at \
+# 70% for example, then all the texts with a confidence level higher than or equal to 70 will appear \
+# in white, in black otherwise.''')
+
+# st.markdown(" - a detailed format presents the results in a table, for each text box detected. \
+# It is possible to download these results to a local csv file.")
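
Editor's note: to make the proxy-gradient idea described in About.py concrete, here is a rough, hypothetical sketch; it is not code from this commit. The preprocessor stands in for the repository's UNet, and the proxy architecture, shapes, and training loop are all assumptions. The proxy would be fit on (cleaned image, measured OCR error) pairs collected from a limited budget of real OCR queries, which is what the 100% / 8% / 4% budgets above refer to.

import torch
import torch.nn as nn

# Hypothetical sketch: train a preprocessor through a frozen, differentiable
# proxy that approximates the black-box OCR engine's error on cleaned images.
preprocessor = nn.Sequential(  # stand-in for the repository's UNet
    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
    nn.Conv2d(16, 1, 3, padding=1), nn.Sigmoid(),
)
proxy = nn.Sequential(  # hypothetical OCR-error regressor
    nn.Conv2d(1, 8, 3, stride=2), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 1),
)
for p in proxy.parameters():
    p.requires_grad_(False)  # freeze the proxy; only the preprocessor learns

opt = torch.optim.Adam(preprocessor.parameters(), lr=1e-4)

def train_step(noisy_batch):
    # Gradients flow through the frozen proxy back into the preprocessor,
    # approximating the gradient of the non-differentiable OCR engine.
    cleaned = preprocessor(noisy_batch)
    loss = proxy(cleaned).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()
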
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
 colorTo: yellow
 sdk: streamlit
 sdk_version: 1.19.0
-app_file: app.py
+app_file: About.py
 pinned: false
 ---
 
cleaning_overview.png ADDED
ocr_libs.py CHANGED
@@ -13,7 +13,7 @@ class tess_ocr:
         boxes = boxes.dropna().to_dict(orient='list')
         text_labels = boxes['text']
         text_boxes = list()
-        for i in range(len(boxes)):
+        for i in range(len(text_labels)):
             x1, y1 = boxes["left"][i], boxes["top"][i]
             x2, y2 = x1 + boxes["width"][i], y1 + boxes["height"][i]
             text_boxes.append({"x1": x1, "y1": y1, "x2": x2, "y2": y2})
@@ -22,17 +22,17 @@ class tess_ocr:
             crops.append(image.crop((box['x1'], box['y1'], box['x2'], box['y2'],)))
         return list(zip(crops, text_boxes))
 
-    # def detect_text(self, image):
-    #     boxes = self.reader.readtext(np.asarray(image))
-    #     print(boxes)
-    #     return []
-
-    def extract_text(self, image, boxes):
-        OFFSET = 6
+    def extract_text(self, image, boxes, dataset="POS"):
         texts = list()
-        for i, (im, box) in enumerate(boxes):
-            cropped = image.crop((box["x1"] - OFFSET, box["y1"] - OFFSET, box["x2"] + OFFSET, box["y2"] + OFFSET))
-            ocrResult = pytesseract.image_to_string(cropped, config='--oem 1 --psm 7')
+        if dataset == "POS":
+            OFFSET = 6
+            for i, (im, box) in enumerate(boxes):
+                cropped = image.crop((box["x1"] - OFFSET, box["y1"] - OFFSET, box["x2"] + OFFSET, box["y2"] + OFFSET))
+                ocrResult = pytesseract.image_to_string(cropped, config='--oem 1 --psm 7').strip()
+                texts.append(ocrResult)
+        else:
+            ocrResult = pytesseract.image_to_string(image, config='--oem 1 --psm 7').strip()
+            print(ocrResult)
             texts.append(ocrResult)
         return texts
 
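
Editor's note: a quick usage sketch of the changed method, under assumptions (the sample paths are hypothetical, and get_text_boxes is the page-level helper seen in pages/1_Denoise.py, assumed in scope here). POS receipts are OCR'd box by box with a 6-pixel margin; VGG word images skip the box loop and are read whole with --psm 7 (single text line).

from PIL import Image
from ocr_libs import tess_ocr

ocr = tess_ocr()

# POS receipts: detect word boxes first, then OCR each padded crop.
pos_img = Image.open("images/POS/0.png").convert("L")  # hypothetical path
boxes = get_text_boxes(ocr, pos_img)                   # helper from the Denoise page
pos_texts = ocr.extract_text(pos_img, boxes, dataset="POS")

# VGG word images are already single words: the box list is ignored and the
# whole image goes through Tesseract in one call.
vgg_img = Image.open("images/VGG/0.png").convert("L")
vgg_texts = ocr.extract_text(vgg_img, [], dataset="VGG")  # returns one string
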
app.py → pages/1_Denoise.py RENAMED
@@ -1,66 +1,18 @@
 import streamlit as st
 st.set_page_config(layout="wide")
 from models.model_unet import UNet
+import sys
+sys.path.append("../")
 
-from PIL import Image, ImageOps
+from PIL import Image
 from streamlit_image_select import image_select
 import torchvision.transforms as transforms
-import os
+from preprocess_utils import process_image_pos, process_image_vgg
 import torch
+from PIL import ImageOps
 from ocr_libs import tess_ocr
-
-class PadWhite(object):
-    def __init__(self, size):
-        assert isinstance(size, (int, tuple))
-        if isinstance(size, tuple):
-            self.height, self.width = size
-        elif isinstance(size, int):
-            self.height = self.width = size
-
-    def __call__(self, img):
-        if img.size[0] > self.width or img.size[1] > self.height:
-            img.thumbnail((self.width, self.height))
-        delta_width = self.width - img.size[0]
-        delta_height = self.height - img.size[1]
-        pad_width = delta_width // 2
-        pad_height = delta_height // 2
-        padding = (pad_width, pad_height, delta_width -
-                   pad_width, delta_height - pad_height)
-        return ImageOps.expand(img, padding, fill=255)
-
-def process_image_pos(image):
-    target_size = (400, 512)
-    # image = Image.open(img_name).convert("L")
-    w, h = image.size
-    top_padding, left_padding = 0, 0
-
-    if h <= target_size[0] or w <= target_size[1]:
-        delta_height = target_size[0] - h
-        delta_width = target_size[1] - w
-        pad_height = delta_height // 2
-        pad_width = delta_width // 2
-        # (left, top, right, bottom)
-        padding = (pad_width, pad_height, delta_width -
-                   pad_width, delta_height - pad_height)
-        image = ImageOps.expand(image, padding, fill=255)
-    elif h > 400 or w > 500:
-        print("Height screwed")
-
-    transform = transforms.ToTensor()
-    image = transform(image)
-    # image = torch.tensor(image)
-    return image
-
-def process_image_vgg(image):
-    input_size = (32, 128)
-    transform = transforms.Compose([
-        PadWhite(input_size),
-        transforms.ToTensor(),
-    ])
-    image = transform(image)
-    return image
-
 
+st.sidebar.success("Select a page.")
 
 @st.cache_resource
 def load_models(model_paths):
@@ -101,6 +53,8 @@ if dataset:
 
     image_paths = [f"{image_folder}/{i}.png" for i in range(NUM_IMAGES)]
    img = None
+    if dataset == "VGG":
+        st.markdown("**Due to display issues, you can view the full image by _right-click_ -> _open image in new tab_**")
 
     img_index = image_select(
         label="Select Image",
@@ -120,14 +74,13 @@ if dataset:
     if img is None and img_index >= 0:
         img = Image.open(image_paths[img_index]).convert("L")
 
-
     cols = st.columns(4)
 
     # Set Text
-    cols[0].text("Input Image")
-    cols[1].text("Full Training")
-    cols[2].text("8%")
-    cols[3].text("4%")
+    cols[0].markdown("### Input Image")
+    cols[1].markdown("### Full Training")
+    cols[2].markdown("### 8%")
+    cols[3].markdown("### 4%")
     models = load_models(model_paths)
 
     if img is not None:
@@ -142,29 +95,27 @@ if dataset:
 
     with st.spinner('Text Detection and Recognition in progress ...'):
         text_boxes = get_text_boxes(ocr, pil_image)
-        if not len(text_boxes):
+        if not len(text_boxes) and dataset == "POS":
            st.text("No text boxes extracted")
         else:
+            st.markdown("***")
             all_texts = list()
-            all_texts.append(ocr.extract_text(pil_image, text_boxes))
+            all_texts.append(ocr.extract_text(pil_image, text_boxes, dataset))
             for i in range(3):
-                all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes))
-            # text_boxes_more = get_text_boxes(ocr, clned_imgs[3])
+                all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes, dataset))
             title_cols = st.columns(5)
             headings = ["Word Image", "Original", "Cleaned (100%)", "Cleaned (8%)", "Cleaned (4%)"]
             for i, heading in enumerate(headings):
-                title_cols[i].markdown(f"## {heading}")
-
-
-            for i, box in enumerate(text_boxes):
+                title_cols[i].markdown(f"### {heading}")
+
+            if dataset == "POS":
+                for i, box in enumerate(text_boxes):
+                    txt_box_cols = st.columns(5)
+                    txt_box_cols[0].image(box[0], use_column_width="always")
+                    for j in range(4):
+                        txt_box_cols[j + 1].text(all_texts[j][i])
+            else:
                 txt_box_cols = st.columns(5)
-                txt_box_cols[0].image(box[0], use_column_width="always")
+                txt_box_cols[0].image(pil_image, use_column_width="always")
                 for j in range(4):
-                    txt_box_cols[j + 1].text(all_texts[j][i])
-
-
-
-
-
-
-
+                    txt_box_cols[j + 1].text(all_texts[j][0])
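
Editor's note: the hunks above call a cached load_models helper that this commit leaves unchanged, so its body is not shown. For orientation, a plausible sketch follows; only the st.cache_resource decorator and the LFS-tracked checkpoint names are confirmed by the diff, while the UNet constructor, checkpoint ordering, and loading details are assumptions.

import streamlit as st
import torch
from models.model_unet import UNet

@st.cache_resource  # load each model once per process, reuse across reruns
def load_models(model_paths):
    models = []
    for path in model_paths:
        model = UNet()  # assumed constructor signature
        model.load_state_dict(torch.load(path, map_location="cpu"))
        model.eval()
        models.append(model)
    return models

# Checkpoints tracked via Git LFS in .gitattributes (order assumed):
models = load_models(["models/vgg_50.pt", "models/vgg_8.pt", "models/vgg_4.pt"])
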
preprocess_utils.py ADDED
@@ -0,0 +1,52 @@
+import torchvision.transforms as transforms
+from PIL import ImageOps
+
+class PadWhite(object):
+    def __init__(self, size):
+        assert isinstance(size, (int, tuple))
+        if isinstance(size, tuple):
+            self.height, self.width = size
+        elif isinstance(size, int):
+            self.height = self.width = size
+
+    def __call__(self, img):
+        if img.size[0] > self.width or img.size[1] > self.height:
+            img.thumbnail((self.width, self.height))
+        delta_width = self.width - img.size[0]
+        delta_height = self.height - img.size[1]
+        pad_width = delta_width // 2
+        pad_height = delta_height // 2
+        padding = (pad_width, pad_height, delta_width -
+                   pad_width, delta_height - pad_height)
+        return ImageOps.expand(img, padding, fill=255)
+
+def process_image_pos(image):
+    target_size = (400, 512)
+    # image = Image.open(img_name).convert("L")
+    w, h = image.size
+    top_padding, left_padding = 0, 0
+
+    if h <= target_size[0] or w <= target_size[1]:
+        delta_height = target_size[0] - h
+        delta_width = target_size[1] - w
+        pad_height = delta_height // 2
+        pad_width = delta_width // 2
+        # (left, top, right, bottom)
+        padding = (pad_width, pad_height, delta_width -
+                   pad_width, delta_height - pad_height)
+        image = ImageOps.expand(image, padding, fill=255)
+    elif h > 400 or w > 500:
+        print("Height screwed")
+
+    transform = transforms.ToTensor()
+    image = transform(image)
+    return image
+
+def process_image_vgg(image):
+    input_size = (32, 128)
+    transform = transforms.Compose([
+        PadWhite(input_size),
+        transforms.ToTensor(),
+    ])
+    image = transform(image)
+    return image
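
Editor's note: a short usage sketch of the new helpers (the sample path is hypothetical). process_image_vgg white-pads a word crop to 32x128 before tensor conversion, matching the VGG models' input size, while process_image_pos center-pads smaller receipt images toward 400x512.

from PIL import Image
from preprocess_utils import process_image_pos, process_image_vgg

img = Image.open("images/VGG/0.png").convert("L")  # hypothetical sample path

vgg_tensor = process_image_vgg(img)  # torch.Size([1, 32, 128])
pos_tensor = process_image_pos(img)  # torch.Size([1, 400, 512]) for small inputs
print(vgg_tensor.shape, pos_tensor.shape)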