amiguel committed
Commit 8dd4b21 · verified · 1 Parent(s): 0f4fb76

Update app.py

Files changed (1)
  1. app.py +128 -16
app.py CHANGED
@@ -1,27 +1,139 @@
  import streamlit as st
- import os
+ import torch
+ import base64
+ from io import BytesIO
+ from PIL import Image
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+ from olmocr.data.renderpdf import render_pdf_to_base64png
+ from olmocr.prompts import build_finetuning_prompt
+ from olmocr.prompts.anchor import get_anchor_text
+
+ # Initialize the model
+ model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Set the font
+ st.markdown(
+     """
+     <style>
+     @import url('https://fonts.googleapis.com/css2?family=Tw+Cen+MT&display=swap');
+     body {
+         font-family: 'Tw Cen MT', sans-serif;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True,
+ )

  # Title and description
  st.title("Document Processing App")
  st.write("Upload a PDF, Excel, Word, PNG, JPG, or JPEG file to process it.")

- # Define the path for the new folder
- folder_path = "/workspace/olmocr/new_folder"
-
- # Create the folder if it doesn't exist
- if not os.path.exists(folder_path):
-     os.makedirs(folder_path)
-     st.write(f"Folder created: {folder_path}")
- else:
-     st.write(f"Folder already exists: {folder_path}")
-
  # File uploader
  uploaded_file = st.sidebar.file_uploader("Choose a file", type=["pdf", "xls", "xlsx", "doc", "docx", "png", "jpg", "jpeg"])

  if uploaded_file is not None:
-     # Save the uploaded file to the new folder
-     file_path = os.path.join(folder_path, uploaded_file.name)
-     with open(file_path, "wb") as f:
-         f.write(uploaded_file.getbuffer())
+     # Process the uploaded file
+     if uploaded_file.type == "application/pdf":
+         # Render page 1 to an image
+         image_base64 = render_pdf_to_base64png(uploaded_file, 1, target_longest_image_dim=1024)
+
+         # Build the prompt, using document metadata
+         anchor_text = get_anchor_text(uploaded_file, 1, pdf_engine="pdfreport", target_length=4000)
+         prompt = build_finetuning_prompt(anchor_text)
+
+         # Build the full prompt
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": prompt},
+                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                 ],
+             }
+         ]
+
+         # Apply the chat template and processor
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+         inputs = processor(
+             text=[text],
+             images=[main_image],
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+         # Generate the output
+         output = model.generate(
+             **inputs,
+             temperature=0.8,
+             max_new_tokens=50,
+             num_return_sequences=1,
+             do_sample=True,
+         )
+
+         # Decode the output
+         prompt_length = inputs["input_ids"].shape[1]
+         new_tokens = output[:, prompt_length:]
+         text_output = processor.tokenizer.batch_decode(
+             new_tokens, skip_special_tokens=True
+         )
+
+         # Display the result
+         st.write("Processed Text:")
+         st.write(text_output)
+
+     elif uploaded_file.type in ["image/png", "image/jpeg"]:
+         # Load the image
+         image = Image.open(uploaded_file)
+         image_base64 = base64.b64encode(image.tobytes()).decode('utf-8')
+
+         # Build the prompt
+         prompt = "Please describe the content of the image."
+
+         # Build the full prompt
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": prompt},
+                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                 ],
+             }
+         ]
+
+         # Apply the chat template and processor
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(
+             text=[text],
+             images=[image],
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+         # Generate the output
+         output = model.generate(
+             **inputs,
+             temperature=0.8,
+             max_new_tokens=50,
+             num_return_sequences=1,
+             do_sample=True,
+         )
+
+         # Decode the output
+         prompt_length = inputs["input_ids"].shape[1]
+         new_tokens = output[:, prompt_length:]
+         text_output = processor.tokenizer.batch_decode(
+             new_tokens, skip_special_tokens=True
+         )
+
+         # Display the result
+         st.write("Processed Text:")
+         st.write(text_output)

-     st.write(f"File saved to: {file_path}")
+     else:
+         st.write("Unsupported file type.")