Spaces:

amiguel
/

finetune

Build error

App Files Files Community

amiguel committed 4 days ago

Commit

95c6d6f

verified ·

1 Parent(s): 0864d4b

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -130

app.py CHANGED Viewed

@@ -1,139 +1,99 @@
 import streamlit as st
-import torch
 import base64
-from io import BytesIO
 from PIL import Image
-from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-from olmocr.data.renderpdf import render_pdf_to_base64png
-from olmocr.prompts import build_finetuning_prompt
-from olmocr.prompts.anchor import get_anchor_text
-# Initialize the model
-model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-# Set the font
-st.markdown(
-    """
-    <style>
-    @import url('https://fonts.googleapis.com/css2?family=Tw+Cen+MT&display=swap');
-    body {
-        font-family: 'Tw Cen MT', sans-serif;
     }
-    </style>
-    """,
-    unsafe_allow_html=True,
-)
-# Title and description
-st.title("Document Processing App")
-st.write("Upload a PDF, Excel, Word, PNG, JPG, or JPEG file to process it.")
-# File uploader
-uploaded_file = st.sidebar.file_uploader("Choose a file", type=["pdf", "xls", "xlsx", "doc", "docx", "png", "jpg", "jpeg"])
-if uploaded_file is not None:
-    # Process the uploaded file
-    if uploaded_file.type == "application/pdf":
-        # Render page 1 to an image
-        image_base64 = render_pdf_to_base64png(uploaded_file, 1, target_longest_image_dim=1024)
-        # Build the prompt, using document metadata
-        anchor_text = get_anchor_text(uploaded_file, 1, pdf_engine="pdfreport", target_length=4000)
-        prompt = build_finetuning_prompt(anchor_text)
-        # Build the full prompt
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                ],
-            }
-        ]
-        # Apply the chat template and processor
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
-        inputs = processor(
-            text=[text],
-            images=[main_image],
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = {key: value.to(device) for (key, value) in inputs.items()}
-        # Generate the output
-        output = model.generate(
-            **inputs,
-            temperature=0.8,
-            max_new_tokens=50,
-            num_return_sequences=1,
-            do_sample=True,
-        )
-        # Decode the output
-        prompt_length = inputs["input_ids"].shape[1]
-        new_tokens = output[:, prompt_length:]
-        text_output = processor.tokenizer.batch_decode(
-            new_tokens, skip_special_tokens=True
-        )
-        # Display the result
-        st.write("Processed Text:")
-        st.write(text_output)
-    elif uploaded_file.type in ["image/png", "image/jpeg"]:
-        # Load the image
-        image = Image.open(uploaded_file)
-        image_base64 = base64.b64encode(image.tobytes()).decode('utf-8')
-        # Build the prompt
-        prompt = "Please describe the content of the image."
-        # Build the full prompt
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
-                ],
-            }
-        ]
-        # Apply the chat template and processor
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(
-            text=[text],
-            images=[image],
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = {key: value.to(device) for (key, value) in inputs.items()}
-        # Generate the output
-        output = model.generate(
-            **inputs,
-            temperature=0.8,
-            max_new_tokens=50,
-            num_return_sequences=1,
-            do_sample=True,
-        )
-        # Decode the output
-        prompt_length = inputs["input_ids"].shape[1]
-        new_tokens = output[:, prompt_length:]
-        text_output = processor.tokenizer.batch_decode(
-            new_tokens, skip_special_tokens=True
-        )
-        # Display the result
-        st.write("Processed Text:")
-        st.write(text_output)
-    else:
-        st.write("Unsupported file type.")

 import streamlit as st
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from pdf2image import convert_from_path
 import base64
+import io
 from PIL import Image
+# Load the OCR model and processor from Hugging Face.
+# Both names are left as None when loading fails so that process_pdf can
+# report the problem in the page instead of the script dying on startup.
+processor = None
+model = None
+try:
+    processor = AutoProcessor.from_pretrained("allenai/olmOCR-7B-0225-preview")
+    model = AutoModelForVision2Seq.from_pretrained("allenai/olmOCR-7B-0225-preview")
+except ImportError as e:
+    processor = None
+    model = None
+    print(f"Error loading model: {str(e)}. Please ensure PyTorch is installed.")
+except ValueError as e:
+    processor = None
+    model = None
+    print(f"Error with model configuration: {str(e)}")
+except OSError as e:
+    # NOTE(review): from_pretrained appears to signal missing or
+    # undownloadable model files with OSError -- confirm against the
+    # transformers documentation. Reset both names: the processor may have
+    # loaded before the model failed.
+    processor = None
+    model = None
+    print(f"Error loading model files: {str(e)}")
+def process_pdf(pdf_file):
+    """Render every page of an uploaded PDF and return a single HTML string.
+
+    For each page the markup contains a thumbnail next to a textarea holding
+    the text produced by the module-level ``processor``/``model`` pair, a
+    per-page "Copy" button, and one "Copy All" button for the whole document.
+
+    Args:
+        pdf_file: The uploaded PDF. Objects exposing ``getvalue()`` (in-memory
+            uploads; presumably what ``st.file_uploader`` returns -- confirm)
+            are rasterized from their bytes. Any other object must expose a
+            filesystem path in ``.name``.
+
+    Returns:
+        The HTML as one string. If the model is not loaded, no file is given,
+        or the PDF cannot be converted, a single ``<p>`` element describing
+        the problem is returned instead; nothing is raised.
+    """
+    # Function-scope imports keep the new names local to this function.
+    from html import escape
+    from pdf2image import convert_from_bytes
+    if processor is None or model is None:
+        return "<p>Error: Model could not be loaded. Check environment setup (PyTorch may be missing) or model compatibility.</p>"
+    # Check if a PDF file was uploaded
+    if pdf_file is None:
+        return "<p>Please upload a PDF file.</p>"
+    # Convert PDF to images
+    try:
+        if hasattr(pdf_file, "getvalue"):
+            # For an in-memory upload, ``.name`` is just a filename, not a
+            # path that exists on this machine, so read the bytes directly.
+            pages = convert_from_bytes(pdf_file.getvalue())
+        else:
+            pages = convert_from_path(pdf_file.name)
+    except Exception as e:
+        return f"<p>Error converting PDF to images: {escape(str(e))}</p>"
+    # Collect fragments and join once at the end.
+    # First fragment: "Copy All" button and the container for the pages.
+    sections = ['<div><button onclick="copyAll()" style="margin-bottom: 10px;">Copy All</button></div><div id="pages">']
+    for page_number, page in enumerate(pages, start=1):
+        # Convert the page image to base64 for embedding in HTML
+        buffered = io.BytesIO()
+        page.save(buffered, format="PNG")
+        img_str = base64.b64encode(buffered.getvalue()).decode()
+        img_data = f"data:image/png;base64,{img_str}"
+        # Extract text from the page using the OCR model.
+        # NOTE(review): no chat template is applied, generate() runs with its
+        # default length limits, and the whole output sequence is decoded, so
+        # the text may be truncated or may include the prompt -- confirm
+        # against the model's documented usage.
+        try:
+            inputs = processor(text="Extract the text from this image.", images=page, return_tensors="pt")
+            outputs = model.generate(**inputs)
+            text = processor.decode(outputs[0], skip_special_tokens=True)
+        except Exception as e:
+            # A failure on one page is shown in that page's textarea; the
+            # remaining pages are still processed.
+            text = f"Error extracting text: {str(e)}"
+        # Generate HTML for this page's section. The text is escaped so that
+        # "<", "&" or a literal "</textarea>" in it cannot break the markup.
+        textarea_id = f"text{page_number}"
+        sections.append(f'''
+        <div class="page" style="margin-bottom: 20px; border-bottom: 1px solid #ccc; padding-bottom: 20px;">
+            <h3>Page {page_number}</h3>
+            <div style="display: flex; align-items: flex-start;">
+                <img src="{img_data}" alt="Page {page_number}" style="max-width: 300px; margin-right: 20px;">
+                <div style="flex-grow: 1;">
+                    <textarea id="{textarea_id}" rows="10" style="width: 100%;">{escape(text)}</textarea>
+                    <button onclick="copyText('{textarea_id}')" style="margin-top: 5px;">Copy</button>
+                </div>
+            </div>
+        </div>
+        ''')
+    # After all pages are processed, close the div and add JavaScript
+    sections.append('</div>')
+    sections.append('''
+    <script>
+    function copyText(id) {
+        var text = document.getElementById(id);
+        text.select();
+        document.execCommand("copy");
     }
+    function copyAll() {
+        var texts = document.querySelectorAll("#pages textarea");
+        var allText = Array.from(texts).map(t => t.value).join("\\n\\n");
+        navigator.clipboard.writeText(allText);
+    }
+    </script>
+    ''')
+    return "".join(sections)
+# Define the Streamlit interface
+st.title("PDF Text Extractor")
+st.markdown("Upload a PDF file and click 'Extract Text' to see each page's image and extracted text incrementally.")
+pdf_input = st.file_uploader("Upload PDF", type=["pdf"])
+submit_btn = st.button("Extract Text")
+# Do not gate on pdf_input here: process_pdf returns a "Please upload a PDF
+# file." message when it is None, so a click without an upload still gives
+# the user feedback instead of silently doing nothing.
+if submit_btn:
+    output_html = process_pdf(pdf_input)
+    # The component has a fixed height; scrolling keeps long, multi-page
+    # output reachable instead of clipping it at 800px.
+    st.components.v1.html(output_html, height=800, scrolling=True)