gtata committed on
Commit d9a169f · 1 Parent(s): 2aa102f

FEAT: Some improvements to text detection

Files changed (4)
  1. Dockerfile +0 -32
  2. app.py +9 -1
  3. ocr_libs.py +8 -5
  4. requirements.txt +42 -2
Dockerfile DELETED
@@ -1,32 +0,0 @@
-FROM python:3.10-slim
-WORKDIR /code
-
-# Install gcc and cc1plus
-RUN apt-get update && apt-get install -y gcc g++ make
-
-# Install tesseract
-RUN apt-get update && apt-get install -y tesseract-ocr libtesseract-dev libleptonica-dev pkg-config
-
-# Install python dependencies
-COPY requirements.txt .
-RUN pip install -r requirements.txt
-
-# Set up a new user named "user" with user ID 1000
-RUN useradd -m -u 1000 user
-# Switch to the "user" user
-USER user
-# Set home to the user's home directory
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH
-
-# Set the working directory to the user's home directory
-WORKDIR $HOME/app
-
-# Copy the current directory contents into the container at $HOME/app setting the owner to the user
-COPY --chown=user . $HOME/app
-
-RUN mkdir -p flagged
-RUN chmod 777 flagged
-
-# Run the app
-CMD ["python", "app.py"]
app.py CHANGED
@@ -92,13 +92,21 @@ if img is not None:
     cols[0].image(pil_image)
     for i in range(3):
         cols[i + 1].image(clned_imgs[i])
-
+
+
+    with st.spinner('Text Detection and Recognition in progress ...'):
         text_boxes = get_text_boxes(ocr, pil_image)
         all_texts = list()
         all_texts.append(ocr.extract_text(pil_image, text_boxes))
         for i in range(3):
             all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes))
         # text_boxes_more = get_text_boxes(ocr, clned_imgs[3])
+    title_cols = st.columns(5)
+    headings = ["Word Image", "Original", "Cleaned (100%)", "Cleaned (8%)", "Cleaned (4%)"]
+    for i, heading in enumerate(headings):
+        title_cols[i].markdown(f"## {heading}")
+
+
     for i, box in enumerate(text_boxes):
         txt_box_cols = st.columns(5)
         txt_box_cols[0].image(box[0], use_column_width="always")
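
For reference, a minimal standalone Streamlit sketch of the pattern this hunk introduces: slow work wrapped in st.spinner, followed by a heading row built with st.columns. The fake_ocr helper below is a hypothetical stand-in for the app's get_text_boxes/extract_text, not the real code.

import time
import streamlit as st

def fake_ocr(label):
    # hypothetical stand-in for the app's OCR helpers
    time.sleep(1)
    return f"text from {label}"

# spinner is shown while the block below runs
with st.spinner('Text Detection and Recognition in progress ...'):
    results = [fake_ocr(name) for name in ("original", "cleaned")]

# one column per heading, mirroring the title row added in this commit
title_cols = st.columns(5)
headings = ["Word Image", "Original", "Cleaned (100%)", "Cleaned (8%)", "Cleaned (4%)"]
for i, heading in enumerate(headings):
    title_cols[i].markdown(f"## {heading}")
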
ocr_libs.py CHANGED
@@ -1,20 +1,18 @@
 # import tesserocr
 import pytesseract
 from pprint import pprint
-
-
+import numpy as np
 
 class tess_ocr:
 
     def __init__(self):
         pass
-
+
     def detect_text(self, image):
-        boxes = pytesseract.image_to_data(image, output_type='data.frame')
+        boxes = pytesseract.image_to_data(image, config='--oem 1 --psm 3', output_type='data.frame')
         boxes = boxes.dropna().to_dict(orient='list')
         text_labels = boxes['text']
         text_boxes = list()
-        pprint(boxes)
         for i in range(len(boxes)):
             x1, y1 = boxes["left"][i], boxes["top"][i]
             x2, y2 = x1 + boxes["width"][i], y1 + boxes["height"][i]
@@ -24,6 +22,11 @@ class tess_ocr:
         crops.append(image.crop((box['x1'], box['y1'], box['x2'], box['y2'],)))
         return list(zip(crops, text_boxes))
 
+    # def detect_text(self, image):
+    #     boxes = self.reader.readtext(np.asarray(image))
+    #     print(boxes)
+    #     return []
+
     def extract_text(self, image, boxes):
         OFFSET = 6
         texts = list()
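
As a standalone illustration of the updated detect_text call, a short sketch (sample.png is a hypothetical test image; it assumes the tesseract binary plus pytesseract and pandas are installed): --oem 1 selects the LSTM engine, --psm 3 requests fully automatic page segmentation, and image_to_data returns per-word boxes that can be turned into (x1, y1, x2, y2) crops much as the class does.

import pytesseract
from PIL import Image

image = Image.open("sample.png")  # hypothetical test image
# same flags as the new call: LSTM engine (--oem 1), automatic page segmentation (--psm 3)
boxes = pytesseract.image_to_data(image, config='--oem 1 --psm 3', output_type='data.frame')
boxes = boxes.dropna().to_dict(orient='list')
for i in range(len(boxes['text'])):
    x1, y1 = boxes["left"][i], boxes["top"][i]
    x2, y2 = x1 + boxes["width"][i], y1 + boxes["height"][i]
    print(boxes["text"][i], (x1, y1, x2, y2))
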
requirements.txt CHANGED
@@ -1,54 +1,94 @@
+addict==2.4.0
 altair==4.0.0
 attrs==23.1.0
+beautifulsoup4==4.12.2
 blinker==1.6.2
 cachetools==5.3.0
 certifi==2023.5.7
 charset-normalizer==3.1.0
 click==8.1.3
+colorama==0.4.6
+contourpy==1.0.7
+cycler==0.11.0
 decorator==5.1.1
 entrypoints==0.4
 filelock==3.12.0
+fonttools==4.39.4
+gdown==4.7.1
 gitdb==4.0.10
 GitPython==3.1.31
 idna==3.4
+imageio==2.28.1
+imgaug==0.4.0
 importlib-metadata==6.6.0
+importlib-resources==5.12.0
 Jinja2==3.1.2
 jsonschema==4.17.3
+kiwisolver==1.4.4
+lazy_loader==0.2
+lmdb==1.4.1
+Markdown==3.4.3
 markdown-it-py==2.2.0
 MarkupSafe==2.1.2
+matplotlib==3.7.1
 mdurl==0.1.2
+model-index==0.1.11
 mpmath==1.3.0
 networkx==3.1
+ninja==1.11.1
 numpy==1.24.3
+opencv-python==4.5.4.60
+opencv-python-headless==4.5.4.60
+openmim==0.3.7
+ordered-set==4.1.0
 packaging==23.1
 pandas==2.0.1
 Pillow==9.5.0
 protobuf==3.20.3
 pyarrow==12.0.0
+pyclipper==1.3.0.post4
+pycocotools==2.0.6
 pydeck==0.8.1b0
 Pygments==2.15.1
 Pympler==1.0.1
+pyparsing==3.0.9
 pyrsistent==0.19.3
+PySocks==1.7.1
 pytesseract==0.3.10
+python-bidi==0.4.2
 python-dateutil==2.8.2
 pytz==2023.3
 pytz-deprecation-shim==0.1.0.post0
+PyWavelets==1.4.1
+PyYAML==6.0
+rapidfuzz==3.0.0
 requests==2.30.0
 rich==13.3.5
+scikit-image==0.20.0
+scipy==1.9.1
+shapely==2.0.1
 six==1.16.0
 smmap==5.0.0
+soupsieve==2.4.1
 streamlit==1.22.0
 streamlit-image-select==0.6.0
 sympy==1.12
+tabulate==0.9.0
 tenacity==8.2.2
+termcolor==2.3.0
+terminaltables==3.1.10
+tifffile==2023.4.12
 toml==0.10.2
+tomli==2.0.1
 toolz==0.12.0
-torch==2.0.1
-torchvision==0.15.2
+torch==1.9.0
+torchvision==0.10.0
 tornado==6.3.1
+tqdm==4.65.0
 typing_extensions==4.5.0
 tzdata==2023.3
 tzlocal==4.3
 urllib3==2.0.2
 validators==0.20.0
+yapf==0.33.0
 zipp==3.15.0