allenai
/

olmOCR-7B-0225-preview

@@ -33,24 +33,89 @@ The prompt must then contain the additional metadata from the document, and the
 ## Manual Prompting
 ```python
-image_base64 = [base64 image of PDF rendered down to 1024 px on longest edge]
 messages = [
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally.
-Do not hallucinate.
-RAW_TEXT_START
-Page dimensions: 1836.8x2267.2
-[Image 0x0 to 1837x2267]
-RAW_TEXT_END"},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                 ],
             }
-        ],
 ```
 ## License and use

 ## Manual Prompting
+If you want to prompt this model manually, please see the code below.
+In normal usage, the olmOCR toolkit builds the prompt by rendering the PDF page and
+extracting relevant text blocks and image metadata. To reproduce that yourself, you will need to install the toolkit
+```bash
+pip install olmocr
+```
+and then run the following sample code.
 ```python
+import torch
+import base64
+import json
+import urllib.request
+from io import BytesIO
+from PIL import Image
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+from olmocr.data.renderpdf import render_pdf_to_base64png
+from olmocr.prompts import build_finetuning_prompt
+from olmocr.prompts.anchor import get_anchor_text
+# Initialize the model
+model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+# Grab a sample PDF
+urllib.request.urlretrieve("https://molmo.allenai.org/paper.pdf", "./paper.pdf")
+# Render page 1 to an image
+image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1024)
+# Build the prompt, using document metadata
+anchor_text = get_anchor_text("./paper.pdf", 1, pdf_engine="pdfreport", target_length=4000)
+prompt = build_finetuning_prompt(anchor_text)
+# Build the full prompt
 messages = [
             {
                 "role": "user",
                 "content": [
+                    {"type": "text", "text": prompt},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                 ],
             }
+        ]
+# Apply the chat template and processor
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+inputs = processor(
+    text=[text],
+    images=[main_image],
+    padding=True,
+    return_tensors="pt",
+)
+inputs = {key: value.to(device) for (key, value) in inputs.items()}
+# Generate the output
+output = model.generate(
+            **inputs,
+            temperature=0.8,
+            max_new_tokens=50,
+            num_return_sequences=1,
+            do_sample=True,
+        )
+# Decode the output
+prompt_length = inputs["input_ids"].shape[1]
+new_tokens = output[:, prompt_length:]
+text_output = processor.tokenizer.batch_decode(
+    new_tokens, skip_special_tokens=True
+)
+print(text_output)
 ```
 ## License and use