>>31571
>five million tokens per USD you're looking at a dollar every ten pages or so.
Was able to simply extract the JPEGs from the PDF and upload those instead of using base64.
This is because we were accidentally up-scaling the images, and base64 adds about 33% overhead.
This resulted in a savings of around 75%.
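For reference, the 33% is just base64's four-bytes-out-per-three-bytes-in expansion; quick sanity check:

import base64

raw = bytes(300_000)                          # stand-in for a ~300 kB JPEG
print(len(base64.b64encode(raw)) / len(raw))  # 1.333...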
Tried the cheaper Scout model, but it wasn't able to OCR all the files.
Also, my original cost computation was off: empirically, running this scan cost 0.018803 USD.
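That figure could also be totalled per request from the usage the API reports back, assuming the Groq SDK exposes the usual OpenAI-style usage object on the response (the per-million-token prices below are placeholders, check Groq's pricing page):

PROMPT_USD_PER_MTOK = 0.20      # placeholder input price
COMPLETION_USD_PER_MTOK = 0.60  # placeholder output price

def completion_cost(chat_completion) -> float:
    u = chat_completion.usage
    return (u.prompt_tokens * PROMPT_USD_PER_MTOK
            + u.completion_tokens * COMPLETION_USD_PER_MTOK) / 1_000_000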
The interface really should be modified to allow for inspection and optional re-encoding of the images.
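A sketch of what that could look like with Pillow (my assumption, not wired into the script below):

import io
from PIL import Image

def maybe_reencode(image: bytes, max_side: int = 1600, quality: int = 70) -> bytes:
    img = Image.open(io.BytesIO(image))
    print(img.format, img.size, img.mode)        # inspection
    if img.format == "JPEG" and max(img.size) <= max_side:
        return image                             # already fine, pass through untouched
    img.thumbnail((max_side, max_side))          # downscale, keeps aspect ratio
    buf = io.BytesIO()
    img.convert("L").save(buf, format="JPEG", quality=quality)
    return buf.getvalue()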
Anyway here's the full scan:
https://0x0.st/KjlU.txt

import os
import pymupdf
import pathlib
import requests
import argparse
from groq import Groq
from typing import List

def render_pages(
    src_pdf: pathlib.Path,
) -> List[bytes]:
    doc = pymupdf.open(src_pdf)
    images = []
    for page in doc:
        for img in page.get_images():
            xref = img[0]
            # extract_image returns the image exactly as embedded in the PDF,
            # so nothing gets re-rendered or up-scaled here
            pix = doc.extract_image(xref)
            images.append(pix['image'])
    return images

# Old version that re-rendered the pages via Wand/ImageMagick (this was the
# accidental up-scaling):
# def render_pages(
#     src_pdf: pathlib.Path,
#     resolution: int = 200,
# ) -> List[bytes]:
#     pages = []
#     with Image(
#         filename=str(src_pdf) + "[0]",
#         resolution=resolution,
#         depth=8,
#         colorspace="gray"
#     ) as images:
#         for image in images.sequence:
#             compressed = Image(image=image)
#             compressed.colorspace = "gray"
#             compressed.depth = 8
#             # compressed.format = "jpeg"
#             # compressed.compression = "jpeg"
#             # compressed.strip()
#             # buffer = io.BytesIO()
#             compressed.save(filename="temp.jpg")
#             # pages.append(buffer.getvalue())
#     return pages

def upload_image(
    image: bytes,
) -> str:
    # upload the raw JPEG to 0x0.st and return the URL it replies with
    return requests.post('https://0x0.st',
                         files={'file': ('image.jpeg', image, 'image/jpeg')},
                         headers={'User-Agent': 'curl/7'}).text.strip()

def OCR_image(image_url: str) -> str:
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Return the plain text content of this JPEG image verbatim."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        },
                    },
                ],
            }
        ],
        model="meta-llama/llama-4-maverick-17b-128e-instruct",
    )
    completion = chat_completion.choices[0].message.content
    return completion if completion else ""

def main():
    ap = argparse.ArgumentParser(description="Split PDF and render pages to OCR-friendly images")
    ap.add_argument("pdf", help="input PDF file")
    ap.add_argument("--out-dir", "-o", type=pathlib.Path, help="output folder (default: ./pdfname_imgs)")
    args = ap.parse_args()
    src_pdf = pathlib.Path(args.pdf).expanduser().resolve()
    # fall back to ./<pdfname>_imgs when --out-dir isn't given
    out_dir = args.out_dir or pathlib.Path(f"{src_pdf.stem}_imgs")
    out_dir.mkdir(parents=True, exist_ok=True)
    images = render_pages(src_pdf)
    with (out_dir / (src_pdf.stem + ".txt")).open("a") as file:
        for image in images:
            url = upload_image(image)
            file.write(OCR_image(url))

if __name__ == "__main__":
    main()
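Usage is just e.g. python scan.py input.pdf -o out (whatever filename you save it under); the OCR text gets appended to out/input.txt.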