-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path02_structured_extraction.py
More file actions
81 lines (69 loc) · 2.28 KB
/
02_structured_extraction.py
File metadata and controls
81 lines (69 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""Extract structured JSON fields from a document using a schema."""
import os
import sys
import json
import time
import requests
# Service endpoint and credentials. The key comes from the environment only.
BASE = "https://api.deepread.tech"
API_KEY = os.environ.get("DEEPREAD_API_KEY")
if not API_KEY:
    print("Set DEEPREAD_API_KEY in your environment or .env file")
    sys.exit(1)

# The document to process is the first positional command-line argument.
cli_args = sys.argv[1:]
file_path = cli_args[0] if cli_args else None
if not file_path:
    print("Usage: python 02_structured_extraction.py <invoice.pdf>")
    sys.exit(1)

# Every request authenticates with the key in the X-API-Key header.
headers = {"X-API-Key": API_KEY}
# Define what fields to extract.
# The schema is built from named pieces and serialized once to a JSON string;
# key order is kept stable so the serialized text does not change.
_line_item_schema = {
    "type": "object",
    "properties": {
        "description": dict(type="string"),
        "amount": dict(type="number"),
    },
}
_invoice_fields = {
    "vendor": dict(type="string", description="Company or vendor name"),
    "invoice_number": dict(type="string", description="Invoice or receipt number"),
    "date": dict(type="string", description="Invoice date"),
    "total": dict(type="number", description="Total amount due"),
    "line_items": {
        "type": "array",
        "items": _line_item_schema,
        "description": "List of line items",
    },
}
schema = json.dumps({"type": "object", "properties": _invoice_fields})
# Submit the document together with the extraction schema.
# requests applies no timeout unless one is passed, so without this a stalled
# connection would hang the script forever. Values are (connect, read) seconds.
SUBMIT_TIMEOUT = (10, 120)

# Open the file separately from the HTTP call: requests' own exceptions also
# derive from OSError, so one combined try/except would misreport network
# failures as file errors.
try:
    upload = open(file_path, "rb")
except OSError as exc:
    print(f"Cannot open {file_path}: {exc}")
    sys.exit(1)

with upload:
    resp = requests.post(
        f"{BASE}/v1/process",
        headers=headers,
        files={"file": upload},
        data={"schema": schema},
        timeout=SUBMIT_TIMEOUT,
    )
# Fail loudly on a 4xx/5xx response instead of on a missing "id" key below.
resp.raise_for_status()
job = resp.json()
job_id = job["id"]
print(f"Submitted: {job_id}")
# Poll the job until it reports "completed" or "failed".
# The wait between checks starts at 5 s and grows by 1.5x up to a 15 s cap.
POLL_TIMEOUT = 30  # seconds; requests never times out unless told to
delay = 5
while True:
    time.sleep(delay)
    poll = requests.get(
        f"{BASE}/v1/jobs/{job_id}",
        headers=headers,
        timeout=POLL_TIMEOUT,
    )
    # Surface HTTP errors here rather than as a KeyError on "status" below.
    poll.raise_for_status()
    result = poll.json()
    status = result["status"]
    print(f" Status: {status}")
    if status == "completed":
        data = result["result"]["data"]
        print("\n--- Extracted Fields ---")
        print(json.dumps(data, indent=2))
        # Show human-in-the-loop flags. Only a dict payload can carry
        # per-field flags, so anything else is printed above and left alone.
        if isinstance(data, dict):
            for field, value in data.items():
                if isinstance(value, dict) and value.get("hil_flag"):
                    print(f"\n Warning: '{field}' flagged for review: {value.get('reason', 'low confidence')}")
        break
    elif status == "failed":
        print(f"Failed: {result.get('error', 'Unknown error')}")
        # Exit non-zero so shells and CI can detect the failed extraction.
        sys.exit(1)
    delay = min(delay * 1.5, 15)