How to Parse Invoices in Python Using an API (2026 Guide)
Every developer building a finance app eventually hits the same afternoon: you need to extract structured data from PDF invoices, and what looks like a two-hour task turns into two weeks of fighting PDF parsers, OCR libraries, and regex patterns that break the moment a vendor changes their template.
This guide shows you a faster path. You'll have working invoice extraction in Python in under 10 minutes, returning clean JSON with every financial field already named and normalized.
What You'll Need
- Python 3.10+ (the examples below use `list[dict]` and `dict | None` type hints, which fail on older versions)
- The `requests` library (`pip install requests`)
- A DocuParseAPI key — get one free (20 documents/month, no credit card)
- A PDF invoice to test with
The One-Call Pattern
DocuParseAPI works as a single POST request. You send a file; you receive structured JSON. There's no pipeline to configure, no template to define per vendor, no model to train.
import os
import requests
def parse_invoice(file_path: str, timeout: float = 30) -> dict:
    """
    Parse an invoice file and return the API's structured JSON response.

    Args:
        file_path: Path to the PDF, JPG, or PNG invoice file.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        dict with fields: merchant, total, subtotal, tax, date,
        due_date, invoice_id, currency, line_items

    Raises:
        KeyError: If DOCUPARSE_API_KEY is not set in the environment.
        OSError: If the file cannot be opened.
        requests.exceptions.RequestException: On network failure, timeout,
            or an HTTP error status (via raise_for_status).
        RuntimeError: If the API answers but reports the extraction failed.
    """
    api_key = os.environ["DOCUPARSE_API_KEY"]
    with open(file_path, "rb") as f:
        response = requests.post(
            "https://docuparseapi.com/api/v1/extract",
            headers={"Authorization": f"Bearer {api_key}"},
            files={"file": (os.path.basename(file_path), f)},
            timeout=timeout,
        )
    response.raise_for_status()
    data = response.json()
    if not data.get("success"):
        # Read the error payload defensively: a failure response without an
        # "error" object must still surface as RuntimeError, not KeyError.
        error = data.get("error") or {}
        code = error.get("code", "UNKNOWN")
        message = error.get("message", "no error message returned")
        raise RuntimeError(f"Extraction failed: [{code}] {message}")
    return data
# Usage
result = parse_invoice("invoice.pdf")

# Header fields are indexed directly, so a missing key raises KeyError.
print(f"Vendor: {result['merchant']}")
print(f"Total: {result['total']} {result['currency']}")
print(f"Tax: {result['tax']}")
print(f"Date: {result['date']}")
print(f"Due: {result['due_date']}")

# Line items are treated as optional: fall back to an empty list.
line_items = result.get("line_items", [])
for item in line_items:
    print(f" - {item['description']}: {item['quantity']} × {item['unit_price']}")
Set your API key as an environment variable before running:
export DOCUPARSE_API_KEY="dex_your_key_here"
python parse_invoice.py
What the Response Looks Like
{
"success": true,
"document_id": "doc_clx7abc123",
"document_type": "invoice",
"merchant": "Acme Supplies Ltd",
"date": "2026-05-10",
"due_date": "2026-06-10",
"currency": "USD",
"subtotal": "1200.00",
"tax": "120.00",
"tax_rate": "10%",
"total": "1320.00",
"invoice_id": "INV-2026-0042",
"payment_method": null,
"line_items": [
{
"description": "Cloud Server - Monthly",
"quantity": 3,
"unit_price": "400.00",
"total": "1200.00"
}
],
"processing_time_ms": 1140
}
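Every field is already named, typed, and normalized. No bounding boxes. No confidence scores to interpret. No layout post-processing required — just note that monetary amounts arrive as strings (e.g. "1320.00") and missing fields as null, so convert or check them before doing arithmetic.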
Processing Multiple Invoices in Batch
For processing a folder of invoices — a common accounts payable use case — here's a clean batch pattern:
import os
import json
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
# Read the API key once, when this module is loaded; raises KeyError if the
# DOCUPARSE_API_KEY environment variable is not set.
API_KEY = os.environ["DOCUPARSE_API_KEY"]
# Extraction endpoint that extract_single() posts each file to.
BASE_URL = "https://docuparseapi.com/api/v1/extract"
def extract_single(file_path: Path) -> dict:
    """
    Extract one invoice without raising, so one bad file cannot abort a batch.

    Args:
        file_path: Path of the invoice file to upload.

    Returns:
        On success: {"file": <name>, "status": "ok", "data": <API response>}.
        On failure: {"file": <name>, "status": "error", "error": <message>}.
        When the API answered but reported the extraction as unsuccessful,
        its payload is additionally kept under "data".
    """
    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                BASE_URL,
                headers={"Authorization": f"Bearer {API_KEY}"},
                files={"file": (file_path.name, f)},
                timeout=30,
            )
        data = response.json()
        if not data.get("success"):
            # An API-level failure must not be reported as "ok"; otherwise
            # callers that only look at "status" count it as extracted.
            error = data.get("error") or {}
            code = error.get("code", "UNKNOWN")
            detail = error.get("message", f"HTTP {response.status_code}")
            return {
                "file": file_path.name,
                "status": "error",
                "error": f"[{code}] {detail}",
                "data": data,
            }
        return {"file": file_path.name, "status": "ok", "data": data}
    except Exception as e:
        # Deliberate catch-all: I/O, network and JSON-decoding problems are
        # all reported as an error entry instead of propagating.
        return {"file": file_path.name, "status": "error", "error": str(e)}
def batch_extract(folder_path: str, max_workers: int = 5) -> list[dict]:
    """
    Extract all invoices in a folder concurrently.

    Only regular files directly inside the folder are considered. Extensions
    are matched case-insensitively, so "INVOICE.PDF" is picked up on
    case-sensitive filesystems too.

    Args:
        folder_path: Directory containing PDF/JPG/PNG invoices
        max_workers: Concurrent requests (keep ≤ 10 to stay within rate limits)

    Returns:
        One result dict per file, as produced by extract_single, in the
        order the requests completed. Empty if the folder does not exist or
        holds no supported files.
    """
    supported_suffixes = {".pdf", ".jpg", ".png"}
    invoice_dir = Path(folder_path)
    # A missing folder yields no work, matching what globbing it would return.
    candidates = invoice_dir.iterdir() if invoice_dir.is_dir() else ()
    # Sort so the submission order does not depend on directory listing order.
    files = sorted(
        path
        for path in candidates
        if path.is_file() and path.suffix.lower() in supported_suffixes
    )

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(extract_single, path) for path in files]
        for future in as_completed(futures):
            result = future.result()
            results.append(result)
            status = "✓" if result["status"] == "ok" else "✗"
            print(f"{status} {result['file']}")
    return results
# Run batch extraction
results = batch_extract("./invoices/")

# Summarize: a result counts as successful only when the request went through
# AND the API reported success; everything else is a failure.
successful = [
    entry
    for entry in results
    if entry["status"] == "ok" and entry["data"].get("success")
]
failed = [
    entry
    for entry in results
    # Short-circuits before touching "data" for entries with status "error".
    if not (entry["status"] != "error" and entry["data"].get("success"))
]
summary = f"\nExtracted: {len(successful)}/{len(results)}"
print(summary)
print(f"Failed: {len(failed)}")

# Save results
with open("invoice_results.json", "w") as out_file:
    out_file.write(json.dumps(results, indent=2))
Error Handling: What Can Go Wrong
DocuParseAPI returns typed error codes so your error handling is explicit:
def parse_invoice_safe(file_path: str) -> dict | None:
    """
    Parse an invoice, reporting failures on stdout instead of raising.

    Args:
        file_path: Path to the PDF, JPG, or PNG invoice file.

    Returns:
        The API response dict on success, or None when the file is missing,
        the request fails or times out, the response is not valid JSON, or
        the API reports an error.

    Raises:
        EnvironmentError: If DOCUPARSE_API_KEY is not set. This is a
            configuration problem, so it is raised rather than swallowed.
    """
    api_key = os.environ.get("DOCUPARSE_API_KEY")
    if not api_key:
        raise EnvironmentError("DOCUPARSE_API_KEY not set")

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                "https://docuparseapi.com/api/v1/extract",
                headers={"Authorization": f"Bearer {api_key}"},
                files={"file": (os.path.basename(file_path), f)},
                timeout=30,
            )
    except requests.exceptions.Timeout:
        print(f"Request timed out for {file_path}")
        return None
    except requests.exceptions.RequestException as exc:
        # Connection errors, DNS failures, etc. are just as unrecoverable
        # for the caller as a timeout.
        print(f"Request failed for {file_path}: {exc}")
        return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

    # Decode separately so a non-JSON body (e.g. a gateway error page) is
    # reported as such and not confused with a request failure.
    try:
        data = response.json()
    except ValueError:
        print(f"Invalid JSON response for {file_path} (HTTP {response.status_code})")
        return None

    if data.get("success"):
        return data

    error = data.get("error") or {}
    code = error.get("code", "UNKNOWN")
    known_errors = {
        "LIMIT_EXCEEDED": "Monthly document limit reached. Upgrade at docuparseapi.com/pricing",
        "UNSUPPORTED_FILE_TYPE": f"File type not supported: {file_path}. Use PDF, JPG, or PNG.",
        "FILE_TOO_LARGE": f"File exceeds 10MB: {file_path}",
        "EXTRACTION_FAILED": f"Extraction failed for {file_path}. Try a cleaner scan.",
    }
    if code in known_errors:
        print(known_errors[code])
    else:
        # The error object may be absent, so never index into it directly.
        message = error.get("message", "no error message returned")
        print(f"API error [{code}]: {message}")
    return None
Storing Extracted Invoice Data
Once you have the structured JSON, storing it is straightforward. Here's a pattern for SQLite — easily adapted to PostgreSQL or any other database:
import sqlite3
import json
from datetime import datetime
def store_invoice(conn: sqlite3.Connection, result: dict) -> int:
    """
    Insert extracted invoice data into the database.

    Creates the invoices table on first use. Rows are deduplicated on
    document_id: storing the same document again leaves the table unchanged.

    Args:
        conn: Open SQLite connection; the insert is committed on it.
        result: Extraction response dict. Missing fields are stored as NULL,
            except subtotal/tax/total, which fall back to 0.

    Returns:
        The id of the invoice row: the newly inserted row, or the existing
        row when this document_id was already stored.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS invoices (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            document_id TEXT UNIQUE,
            merchant TEXT,
            invoice_id TEXT,
            date TEXT,
            due_date TEXT,
            currency TEXT,
            subtotal REAL,
            tax REAL,
            total REAL,
            line_items TEXT, -- JSON string
            created_at TEXT
        )
    """)

    document_id = result.get("document_id")
    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the stored string keeps the same naive ISO format as before.
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    cursor.execute("""
        INSERT OR IGNORE INTO invoices
        (document_id, merchant, invoice_id, date, due_date,
         currency, subtotal, tax, total, line_items, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        document_id,
        result.get("merchant"),
        result.get("invoice_id"),
        result.get("date"),
        result.get("due_date"),
        result.get("currency"),
        float(result.get("subtotal") or 0),
        float(result.get("tax") or 0),
        float(result.get("total") or 0),
        json.dumps(result.get("line_items", [])),
        created_at,
    ))
    inserted = cursor.rowcount > 0
    new_row_id = cursor.lastrowid
    conn.commit()

    if not inserted:
        # The insert was ignored as a duplicate, so lastrowid does not
        # identify this invoice; look up the row that is already there.
        existing = cursor.execute(
            "SELECT id FROM invoices WHERE document_id = ?",
            (document_id,),
        ).fetchone()
        if existing is not None:
            return existing[0]
    return new_row_id
# Usage
conn = sqlite3.connect("invoices.db")
result = parse_invoice("invoice.pdf")

# Only store when parsing produced a (truthy) result dict.
if result:
    row_id = store_invoice(conn, result)
    stored_message = f"Stored invoice #{row_id}: {result['merchant']} — {result['total']} {result['currency']}"
    print(stored_message)
Using an Async Client (httpx)
For async Python applications (FastAPI, aiohttp, etc.):
import os
import httpx
async def parse_invoice_async(file_path: str) -> dict:
    """
    Async version of invoice parsing, using httpx.

    Args:
        file_path: Path to the PDF, JPG, or PNG invoice file.

    Returns:
        The decoded JSON response from the extraction endpoint.

    Raises:
        KeyError: If DOCUPARSE_API_KEY is not set in the environment.
        OSError: If the file cannot be read.
        httpx.HTTPError: On network failure, timeout, or an HTTP error
            status (via raise_for_status).
        RuntimeError: If the API answers but reports the extraction failed.
    """
    # Local import: this snippet's file-level imports are only os and httpx.
    import asyncio

    def _read_file() -> bytes:
        with open(file_path, "rb") as f:
            return f.read()

    api_key = os.environ["DOCUPARSE_API_KEY"]
    # Read the file in a worker thread so disk I/O does not block the event
    # loop, and so no file handle stays open during the network round trip.
    payload = await asyncio.to_thread(_read_file)

    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.post(
            "https://docuparseapi.com/api/v1/extract",
            headers={"Authorization": f"Bearer {api_key}"},
            files={"file": (os.path.basename(file_path), payload)},
        )
    response.raise_for_status()
    data = response.json()
    if not data.get("success"):
        # Same message shape as the synchronous parse_invoice; the error
        # object is read defensively so a malformed payload cannot KeyError.
        error = data.get("error") or {}
        code = error.get("code", "UNKNOWN")
        message = error.get("message", "no error message returned")
        raise RuntimeError(f"Extraction failed: [{code}] {message}")
    return data
# In a FastAPI route
from fastapi import FastAPI, UploadFile
app = FastAPI()


@app.post("/invoices/parse")
async def parse_uploaded_invoice(file: UploadFile):
    """
    Accept an uploaded invoice, parse it, and return the extracted data.

    The upload is spooled to a temporary file, which is always removed
    afterwards, including when parsing fails.

    Args:
        file: The uploaded invoice (PDF, JPG, or PNG).

    Returns:
        {"invoice": <extraction result dict>}
    """
    import shutil
    import tempfile

    # Keep the upload's own extension when it is a supported one, so an image
    # is not sent on under a ".pdf" name. The suffix comes from a fixed
    # whitelist, never verbatim from the client-supplied filename.
    suffix = os.path.splitext(file.filename or "")[1].lower()
    if suffix not in {".pdf", ".jpg", ".png"}:
        suffix = ".pdf"

    # Save upload to temp file
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with tmp:
            shutil.copyfileobj(file.file, tmp)
        result = await parse_invoice_async(tmp.name)
    finally:
        # delete=False means cleanup is this function's job; doing it in
        # finally prevents leaking a temp file on every failed request.
        os.unlink(tmp.name)
    return {"invoice": result}
Common Mistakes to Avoid
Never put your API key in source code. Always use environment variables or a secrets manager. The key prefix dex_ makes it easy for secret scanners to catch accidental commits.
Don't call the API from browser JavaScript. The API is designed for server-side use. If you need browser-side invoice upload, build a backend route that proxies the request (like the FastAPI example above).
Handle None fields gracefully. Not every invoice has a due date. Not every receipt has an invoice ID. Check for None before parsing or storing.
Use document_id for deduplication. Every successful extraction returns a unique document_id. Store it and check before re-processing to avoid counting the same document twice.
Next Steps
- View the full API documentation — authentication, response format, error codes
- See the invoice parser API overview — supported fields and document types
- Compare pricing plans — free tier covers 20 documents/month
- Node.js integration guide — same workflow in JavaScript
- Batch processing with n8n — no-code automation alternative