gtata committed on
Commit d9a169f · 1 Parent(s): 2aa102f

FEAT: Some improvements to text detection

Files changed (4)
  1. Dockerfile +0 -32
  2. app.py +9 -1
  3. ocr_libs.py +8 -5
  4. requirements.txt +42 -2
Dockerfile DELETED
@@ -1,32 +0,0 @@
-FROM python:3.10-slim
-WORKDIR /code
-
-# Install gcc and cc1plus
-RUN apt-get update && apt-get install -y gcc g++ make
-
-# Install tesseract
-RUN apt-get update && apt-get install -y tesseract-ocr libtesseract-dev libleptonica-dev pkg-config
-
-# Install python dependencies
-COPY requirements.txt .
-RUN pip install -r requirements.txt
-
-# Set up a new user named "user" with user ID 1000
-RUN useradd -m -u 1000 user
-# Switch to the "user" user
-USER user
-# Set home to the user's home directory
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH
-
-# Set the working directory to the user's home directory
-WORKDIR $HOME/app
-
-# Copy the current directory contents into the container at $HOME/app setting the owner to the user
-COPY --chown=user . $HOME/app
-
-RUN mkdir -p flagged
-RUN chmod 777 flagged
-
-# Run the app
-CMD ["python", "app.py"]
app.py CHANGED
@@ -92,13 +92,21 @@ if img is not None:
     cols[0].image(pil_image)
     for i in range(3):
         cols[i + 1].image(clned_imgs[i])
-
+
+
+    with st.spinner('Text Detection and Recognition in progress ...'):
         text_boxes = get_text_boxes(ocr, pil_image)
         all_texts = list()
         all_texts.append(ocr.extract_text(pil_image, text_boxes))
         for i in range(3):
             all_texts.append(ocr.extract_text(clned_imgs[i], text_boxes))
         # text_boxes_more = get_text_boxes(ocr, clned_imgs[3])
+    title_cols = st.columns(5)
+    headings = ["Word Image", "Original", "Cleaned (100%)", "Cleaned (8%)", "Cleaned (4%)"]
+    for i, heading in enumerate(headings):
+        title_cols[i].markdown(f"## {heading}")
+
+
     for i, box in enumerate(text_boxes):
         txt_box_cols = st.columns(5)
         txt_box_cols[0].image(box[0], use_column_width="always")
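
For reference, a minimal standalone Streamlit sketch of the pattern this hunk introduces: slow work wrapped in st.spinner, followed by a heading row built with st.columns. The fake_ocr helper below is a hypothetical stand-in for the app's get_text_boxes/extract_text, not the real code.

import time
import streamlit as st

def fake_ocr(label):
    # hypothetical stand-in for the app's OCR helpers
    time.sleep(1)
    return f"text from {label}"

# spinner is shown while the block below runs
with st.spinner('Text Detection and Recognition in progress ...'):
    results = [fake_ocr(name) for name in ("original", "cleaned")]

# one column per heading, mirroring the title row added in this commit
title_cols = st.columns(5)
headings = ["Word Image", "Original", "Cleaned (100%)", "Cleaned (8%)", "Cleaned (4%)"]
for i, heading in enumerate(headings):
    title_cols[i].markdown(f"## {heading}")
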
ocr_libs.py CHANGED
@@ -1,20 +1,18 @@
 # import tesserocr
 import pytesseract
 from pprint import pprint
-
-
+import numpy as np
 
 class tess_ocr:
 
     def __init__(self):
         pass
-
+
     def detect_text(self, image):
-        boxes = pytesseract.image_to_data(image, output_type='data.frame')
+        boxes = pytesseract.image_to_data(image, config='--oem 1 --psm 3', output_type='data.frame')
         boxes = boxes.dropna().to_dict(orient='list')
         text_labels = boxes['text']
         text_boxes = list()
-        pprint(boxes)
         for i in range(len(boxes)):
             x1, y1 = boxes["left"][i], boxes["top"][i]
             x2, y2 = x1 + boxes["width"][i], y1 + boxes["height"][i]
@@ -24,6 +22,11 @@ class tess_ocr:
         crops.append(image.crop((box['x1'], box['y1'], box['x2'], box['y2'],)))
         return list(zip(crops, text_boxes))
 
+    # def detect_text(self, image):
+    #     boxes = self.reader.readtext(np.asarray(image))
+    #     print(boxes)
+    #     return []
+
     def extract_text(self, image, boxes):
         OFFSET = 6
         texts = list()
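
As a standalone illustration of the updated detect_text call, a short sketch (sample.png is a hypothetical test image; it assumes the tesseract binary plus pytesseract and pandas are installed): --oem 1 selects the LSTM engine, --psm 3 requests fully automatic page segmentation, and image_to_data returns per-word boxes that can be turned into (x1, y1, x2, y2) crops much as the class does.

import pytesseract
from PIL import Image

image = Image.open("sample.png")  # hypothetical test image
# same flags as the new call: LSTM engine (--oem 1), automatic page segmentation (--psm 3)
boxes = pytesseract.image_to_data(image, config='--oem 1 --psm 3', output_type='data.frame')
boxes = boxes.dropna().to_dict(orient='list')
for i in range(len(boxes['text'])):
    x1, y1 = boxes["left"][i], boxes["top"][i]
    x2, y2 = x1 + boxes["width"][i], y1 + boxes["height"][i]
    print(boxes["text"][i], (x1, y1, x2, y2))
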
requirements.txt CHANGED
@@ -1,54 +1,94 @@
+addict==2.4.0
 altair==4.0.0
 attrs==23.1.0
+beautifulsoup4==4.12.2
 blinker==1.6.2
 cachetools==5.3.0
 certifi==2023.5.7
 charset-normalizer==3.1.0
 click==8.1.3
+colorama==0.4.6
+contourpy==1.0.7
+cycler==0.11.0
 decorator==5.1.1
 entrypoints==0.4
 filelock==3.12.0
+fonttools==4.39.4
+gdown==4.7.1
 gitdb==4.0.10
 GitPython==3.1.31
 idna==3.4
+imageio==2.28.1
+imgaug==0.4.0
 importlib-metadata==6.6.0
+importlib-resources==5.12.0
 Jinja2==3.1.2
 jsonschema==4.17.3
+kiwisolver==1.4.4
+lazy_loader==0.2
+lmdb==1.4.1
+Markdown==3.4.3
 markdown-it-py==2.2.0
 MarkupSafe==2.1.2
+matplotlib==3.7.1
 mdurl==0.1.2
+model-index==0.1.11
 mpmath==1.3.0
 networkx==3.1
+ninja==1.11.1
 numpy==1.24.3
+opencv-python==4.5.4.60
+opencv-python-headless==4.5.4.60
+openmim==0.3.7
+ordered-set==4.1.0
 packaging==23.1
 pandas==2.0.1
 Pillow==9.5.0
 protobuf==3.20.3
 pyarrow==12.0.0
+pyclipper==1.3.0.post4
+pycocotools==2.0.6
 pydeck==0.8.1b0
 Pygments==2.15.1
 Pympler==1.0.1
+pyparsing==3.0.9
 pyrsistent==0.19.3
+PySocks==1.7.1
 pytesseract==0.3.10
+python-bidi==0.4.2
 python-dateutil==2.8.2
 pytz==2023.3
 pytz-deprecation-shim==0.1.0.post0
+PyWavelets==1.4.1
+PyYAML==6.0
+rapidfuzz==3.0.0
 requests==2.30.0
 rich==13.3.5
+scikit-image==0.20.0
+scipy==1.9.1
+shapely==2.0.1
 six==1.16.0
 smmap==5.0.0
+soupsieve==2.4.1
 streamlit==1.22.0
 streamlit-image-select==0.6.0
 sympy==1.12
+tabulate==0.9.0
 tenacity==8.2.2
+termcolor==2.3.0
+terminaltables==3.1.10
+tifffile==2023.4.12
 toml==0.10.2
+tomli==2.0.1
 toolz==0.12.0
-torch==2.0.1
-torchvision==0.15.2
+torch==1.9.0
+torchvision==0.10.0
 tornado==6.3.1
+tqdm==4.65.0
 typing_extensions==4.5.0
 tzdata==2023.3
 tzlocal==4.3
 urllib3==2.0.2
 validators==0.20.0
+yapf==0.33.0
 zipp==3.15.0