jakep-allenai committed on
Commit 51f1b47 · verified · 1 Parent(s): 4e3f1d7

Update README.md

Files changed (1)
  1. README.md +74 -9
README.md CHANGED
@@ -33,24 +33,89 @@ The prompt must then contain the additional metadata from the document, and the
 
 ## Manual Prompting
 
+ If you want to prompt this model manually, please see the code below.
+
+ In normal usage, the olmOCR toolkit builds the prompt by rendering the PDF page, and
+ extracting relevant text blocks and image metadata. To duplicate that you will need to
+
+ ```bash
+ pip install olmocr
+ ```
+
+ and then run the following sample code.
+
+
 ```python
- image_base64 = [base64 image of PDF rendered down to 1024 px on longest edge]
+ import torch
+ import base64
+ import json
+ import urllib.request
+
+ from io import BytesIO
+ from PIL import Image
+ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+
+ from olmocr.data.renderpdf import render_pdf_to_base64png
+ from olmocr.prompts import build_finetuning_prompt
+ from olmocr.prompts.anchor import get_anchor_text
+
+ # Initialize the model
+ model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Grab a sample PDF
+ urllib.request.urlretrieve("https://molmo.allenai.org/paper.pdf", "./paper.pdf")
 
+ # Render page 1 to an image
+ image_base64 = render_pdf_to_base64png("./paper.pdf", 1, target_longest_image_dim=1024)
+
+ # Build the prompt, using document metadata
+ anchor_text = get_anchor_text("./paper.pdf", 1, pdf_engine="pdfreport", target_length=4000)
+ prompt = build_finetuning_prompt(anchor_text)
+
+ # Build the full prompt
 messages = [
         {
             "role": "user",
             "content": [
-                 {"type": "text", "text": "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally.
-                 Do not hallucinate.
-                 RAW_TEXT_START
-                 Page dimensions: 1836.8x2267.2
-                 [Image 0x0 to 1837x2267]
-
-                 RAW_TEXT_END"},
+                 {"type": "text", "text": prompt},
                 {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
             ],
         }
-     ],
+ ]
+
+ # Apply the chat template and processor
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
+
+ inputs = processor(
+     text=[text],
+     images=[main_image],
+     padding=True,
+     return_tensors="pt",
+ )
+ inputs = {key: value.to(device) for (key, value) in inputs.items()}
+
+
+ # Generate the output
+ output = model.generate(
+     **inputs,
+     temperature=0.8,
+     max_new_tokens=50,
+     num_return_sequences=1,
+     do_sample=True,
+ )
+
+ # Decode the output
+ prompt_length = inputs["input_ids"].shape[1]
+ new_tokens = output[:, prompt_length:]
+ text_output = processor.tokenizer.batch_decode(
+     new_tokens, skip_special_tokens=True
+ )
+
+ print(text_output)
 ```
 
 ## License and use
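
For readers who want to see what the finished prompt looks like without installing the toolkit: the removed lines above show the template that `build_finetuning_prompt` fills in (the instruction text, followed by the page dimensions and anchor text between `RAW_TEXT_START` and `RAW_TEXT_END`). Below is a minimal sketch that assembles such a message by hand. The dimension and anchor values are the example values from the previous README revision, `image_base64` is a placeholder you would fill with your own rendered page, and the exact output of `build_finetuning_prompt` may differ in wording or whitespace.

```python
# Sketch only: reproduces the prompt template shown in the removed README lines.
# The real toolkit builds this string via build_finetuning_prompt(anchor_text).

image_base64 = "<base64 PNG of the page, rendered to 1024 px on the longest edge>"
page_width, page_height = 1836.8, 2267.2   # example dimensions from the old README
anchor_text = "[Image 0x0 to 1837x2267]"   # example anchor line from the old README

prompt = (
    "Below is the image of one page of a document, as well as some raw textual "
    "content that was previously extracted for it. Just return the plain text "
    "representation of this document as if you were reading it naturally.\n"
    "Do not hallucinate.\n"
    "RAW_TEXT_START\n"
    f"Page dimensions: {page_width}x{page_height}\n"
    f"{anchor_text}\n"
    "\n"
    "RAW_TEXT_END"
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
        ],
    }
]
```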