amiguel committed
Commit 8dd4b21 · verified · 1 Parent(s): 0f4fb76

Update app.py

Files changed (1)
  1. app.py +128 -16
app.py CHANGED
@@ -1,27 +1,139 @@
  import streamlit as st
- import os
+ import torch
+ import base64
+ from io import BytesIO
+ from PIL import Image
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+ from olmocr.data.renderpdf import render_pdf_to_base64png
+ from olmocr.prompts import build_finetuning_prompt
+ from olmocr.prompts.anchor import get_anchor_text
+
+ # Initialize the model
+ model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Set the font
+ st.markdown(
+     """
+     <style>
+     @import url('https://fonts.googleapis.com/css2?family=Tw+Cen+MT&display=swap');
+     body {
+         font-family: 'Tw Cen MT', sans-serif;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )

  # Title and description
  st.title("Document Processing App")
  st.write("Upload a PDF, Excel, Word, PNG, JPG, or JPEG file to process it.")

- # Define the path for the new folder
- folder_path = "/workspace/olmocr/new_folder"
-
- # Create the folder if it doesn't exist
- if not os.path.exists(folder_path):
-     os.makedirs(folder_path)
-     st.write(f"Folder created: {folder_path}")
- else:
-     st.write(f"Folder already exists: {folder_path}")
-
  # File uploader
  uploaded_file = st.sidebar.file_uploader("Choose a file", type=["pdf", "xls", "xlsx", "doc", "docx", "png", "jpg", "jpeg"])

  if uploaded_file is not None:
-     # Save the uploaded file to the new folder
-     file_path = os.path.join(folder_path, uploaded_file.name)
-     with open(file_path, "wb") as f:
-         f.write(uploaded_file.getbuffer())
+     # Process the uploaded file
+     if uploaded_file.type == "application/pdf":
+         # Render page 1 to an image
+         image_base64 = render_pdf_to_base64png(uploaded_file, 1, target_longest_image_dim=1024)
+
+         # Build the prompt, using document metadata
+         anchor_text = get_anchor_text(uploaded_file, 1, pdf_engine="pdfreport", target_length=4000)
+         prompt = build_finetuning_prompt(anchor_text)
+
+         # Build the full prompt
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": prompt},
+                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                 ],
+             }
+         ]
+
+         # Apply the chat template and processor
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+         inputs = processor(
+             text=[text],
+             images=[main_image],
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+         # Generate the output
+         output = model.generate(
+             **inputs,
+             temperature=0.8,
+             max_new_tokens=50,
+             num_return_sequences=1,
+             do_sample=True,
+         )
+
+         # Decode the output
+         prompt_length = inputs["input_ids"].shape[1]
+         new_tokens = output[:, prompt_length:]
+         text_output = processor.tokenizer.batch_decode(
+             new_tokens, skip_special_tokens=True
+         )
+
+         # Display the result
+         st.write("Processed Text:")
+         st.write(text_output)
+
+     elif uploaded_file.type in ["image/png", "image/jpeg"]:
+         # Load the image
+         image = Image.open(uploaded_file)
+         image_base64 = base64.b64encode(image.tobytes()).decode('utf-8')
+
+         # Build the prompt
+         prompt = "Please describe the content of the image."
+
+         # Build the full prompt
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": prompt},
+                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                 ],
+             }
+         ]
+
+         # Apply the chat template and processor
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(
+             text=[text],
+             images=[image],
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+         # Generate the output
+         output = model.generate(
+             **inputs,
+             temperature=0.8,
+             max_new_tokens=50,
+             num_return_sequences=1,
+             do_sample=True,
+         )
+
+         # Decode the output
+         prompt_length = inputs["input_ids"].shape[1]
+         new_tokens = output[:, prompt_length:]
+         text_output = processor.tokenizer.batch_decode(
+             new_tokens, skip_special_tokens=True
+         )
+
+         # Display the result
+         st.write("Processed Text:")
+         st.write(text_output)

-     st.write(f"File saved to: {file_path}")
+     else:
+         st.write("Unsupported file type.")