All PDF files must be fed to the woodchipper.
description = 'A Python library, CLI tool, and HTTP API for analyzing PDF files. Extracts metadata, computes file hashes, finds embedded URLs, detects forms, AcroForms, XFA, JavaScript, embedded files, rich media, and identifies suspicious actions and anomalies. All output is automatically defanged for safe handling.'
author = 'Ryan C. Moon'
version = '1.1.2'
date = '2026-02-17'
To report a bug, please open an issue or submit a PR.
pip install woodchipper
With HTTP server support:
pip install woodchipper[server]
Or from source:
pip install -r requirements.txt
pip install -e .
Note: Requires libmagic for file type detection:
- Debian/Ubuntu: apt install libmagic1
- macOS: brew install libmagic
- Fedora: dnf install file-libs
woodchipper <path-to-pdf>
Output is JSON with all string fields defanged:
{
"filename": "suspicious.pdf",
"filesize": 142857,
"md5": "d41d8cd98f00b204e9800998ecf8427e",
"sha1": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"urls": [
"hXXps://example[.]com",
"hXXps://test[.]org/page"
],
"metadata": {
"author": "John Doe",
"creator": "Microsoft Word",
"producer": "Microsoft: Print To PDF",
"subject": null,
"title": "Important Document",
"creation_date": "2024-01-15 12:00:00+00:00",
"modification_date": "2024-01-16 09:00:00+00:00",
"spoofing_indicators": []
},
"anomalies": {
"anomalies_present": true,
"anomalies": [
"PDF header not at byte 0 (found at byte 18)",
"Rich media detected: /3D (3D content). Rich media includes 3D or Flash streams which are no longer supported by modern OS or in common usage. This is suspicious.",
"Stream length mismatches detected (1 stream(s)). Mismatched stream lengths may indicate PDF tampering, corruption, or malicious manipulation."
],
"additional_actions_detected": [
{
"description": "Document Open: JavaScript execution (code: app.alert('Hello');...)",
"offset": 361,
"raw_bytes": "35 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 4a 53 ...",
"decoded_bytes": null
}
],
"external_actions": [
{
"description": "/Launch at Document OpenAction: Launches external application (file: cmd.exe)",
"offset": 274,
"raw_bytes": "34 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 46 ...",
"decoded_bytes": null
},
{
"description": "/URI at Page 1 Annotation 1: Opens URL (hXXps://malicious[.]com)",
"offset": 37039,
"raw_bytes": "31 30 20 30 20 6f 62 6a 0d 0a 3c 3c ...",
"decoded_bytes": null
}
],
"javascript_detected": [
{
"description": "Document OpenAction: displays alert (code: app.alert('Hello');)",
"offset": 361,
"raw_bytes": "35 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 4a 53 ...",
"decoded_bytes": null
}
],
"embedded_files": [
{
"name": "malware.exe",
"file_type": "PE32 executable (GUI) Intel 80386",
"mime_type": "application/x-dosexec",
"size": 45056,
"description": "Click to open",
"offset": 892,
"raw_bytes": "37 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 54 79 70 65 ...",
"decoded_bytes": "4d 5a 90 00 03 00 00 00 04 00 00 00 ff ff 00 00 ..."
}
],
"acroform_details": [
{
"description": "AcroForm detected in document catalog",
"offset": 512,
"raw_bytes": "34 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 46 69 65 6c ...",
"decoded_bytes": null
},
{
"description": "Form contains 3 top-level field(s)",
"offset": 512,
"raw_bytes": "34 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 46 69 65 6c ...",
"decoded_bytes": null
},
{
"description": "Field 'username': type=Text, flags=[Required]",
"offset": 1024,
"raw_bytes": "38 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 46 54 2f 54 ...",
"decoded_bytes": null
}
],
"xfa_details": [
{
"description": "XFA (XML Forms Architecture) detected",
"offset": 512,
"raw_bytes": "34 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 58 46 41 ...",
"decoded_bytes": null
},
{
"description": "XFA JavaScript script in template - behaviors: contains URL: var url = 'http://...'...",
"offset": 1280,
"raw_bytes": "39 20 30 20 6f 62 6a 0d 0a 3c 3c 2f 4c 65 6e ...",
"decoded_bytes": "3c 3f 78 6d 6c 20 76 65 72 73 69 6f 6e 3d 22 31 ..."
}
]
},
"forms": {
"forms_present": true,
"form_submission_targets": [
"hXXps://collect[.]malicious[.]com/submit"
]
}
}
Exit codes:
- 0 - Success
- 1 - Validation error (file not found, not readable, or not a PDF)
Woodchipper includes a FastAPI-based HTTP server for remote PDF analysis.
# Install with server dependencies
pip install woodchipper[server]
# Run with the built-in command
woodchipper-server
# Or run directly with uvicorn
uvicorn woodchipper.server:app --host 0.0.0.0 --port 8080
# Or with auto-reload for development
uvicorn woodchipper.server:app --host 0.0.0.0 --port 8080 --reload
| Method | Path | Description |
|---|---|---|
| GET | /health | Health check |
| POST | /analyze | Upload PDF as multipart form |
| POST | /analyze/raw | Send raw PDF bytes in body |
| GET | /docs | Interactive Swagger UI documentation |
# Multipart upload
curl -X POST -F "file=@suspicious.pdf" http://localhost:8080/analyze
# Raw bytes
curl -X POST -H "Content-Type: application/pdf" \
--data-binary @suspicious.pdf \
http://localhost:8080/analyze/raw
# Health check
curl http://localhost:8080/health
Both /analyze and /analyze/raw return the same JSON report as the CLI:
{
"filename": "suspicious.pdf",
"filesize": 142857,
"md5": "...",
"sha256": "...",
"urls": ["hXXps://example[.]com"],
"metadata": { ... },
"anomalies": { ... },
"forms": { ... }
}
- 400 Bad Request - Invalid file, empty body, or not a PDF
- 200 OK - Analysis successful, returns JSON report
Process a PDF file and return a full report with all analysis.
from woodchipper import process
report = process("document.pdf")
print(report["sha256"])
print(report["urls"]) # Defanged URLs
print(report["metadata"]["spoofing_indicators"])
# Action fields return ActionDetail dicts with description, offset, raw_bytes, and decoded_bytes
for action in report["anomalies"]["additional_actions_detected"]:
print(f"{action['description']} @ offset {action['offset']}")
for action in report["anomalies"]["external_actions"]:
print(f"{action['description']} @ offset {action['offset']}")
for js in report["anomalies"]["javascript_detected"]:
print(f"{js['description']} @ offset {js['offset']}")
print(report["anomalies"]["embedded_files"])
print(report["anomalies"]["acroform_details"])
print(report["anomalies"]["xfa_details"])
Note: All string fields in the returned report are defanged for safe handling.
Extract URLs from a PDF file (returns raw URLs, not defanged).
from woodchipper import get_urls
urls = get_urls("document.pdf")
for url in urls:
print(url)
Extracts URLs from /Link annotations and /A (Action) dictionaries with /URI entries.
Extract document metadata with spoofing detection.
from woodchipper import get_pdf_metadata
metadata = get_pdf_metadata("document.pdf")
print(f"Author: {metadata['author']}")
print(f"Creator: {metadata['creator']}")
if metadata["spoofing_indicators"]:
print("Potential spoofing detected:")
for indicator in metadata["spoofing_indicators"]:
print(f" - {indicator}")
Spoofing detection includes:
- Creation date after modification date
- Timestamps in the future
- Creator/producer mismatches (e.g., claims Microsoft Word but produced by LibreOffice)
- Creation date before PDF format existed (pre-1993)
Check for PDF structural anomalies and suspicious content.
from woodchipper import check_anomalies
anomalies = check_anomalies("document.pdf")
if anomalies["anomalies_present"]:
for anomaly in anomalies["anomalies"]:
print(f"Anomaly: {anomaly}")
for action in anomalies["additional_actions_detected"]:
print(f"Action: {action['description']} (offset: {action['offset']})")
for external in anomalies["external_actions"]:
print(f"External: {external['description']} (offset: {external['offset']})")
for js in anomalies["javascript_detected"]:
print(f"JavaScript: {js['description']} (offset: {js['offset']})")
for ef in anomalies["embedded_files"]:
print(f"Embedded: {ef['name']} ({ef['mime_type']}) (offset: {ef['offset']})")
for form in anomalies["acroform_details"]:
print(f"AcroForm: {form['description']} (offset: {form['offset']})")
for xfa in anomalies["xfa_details"]:
print(f"XFA: {xfa['description']} (offset: {xfa['offset']})")
Detects:
- PDF header not at byte 0 (embedded content)
- Invalid PDF version
- Missing or malformed binary marker
- Missing %%EOF marker
- Data after %%EOF (appended content)
- Rich media (3D, Flash) - no longer supported, suspicious
- Stream length mismatches (tampering indicator)
- /OpenAction and /AA (Additional Actions) triggers
- External actions (/Launch, /URI, /GoToR, /GoToE)
- Embedded JavaScript with behavior analysis
- Embedded files with file type detection
- AcroForm details (field types, flags, actions)
- XFA (XML Forms Architecture) with script extraction
Detect automatic actions triggered by PDF events. Each result is an ActionDetail dict with description, offset (byte offset of the PDF object, or null for inline objects), raw_bytes (space-separated hex at that offset, or null), and decoded_bytes (decompressed stream content as hex, or null for non-stream objects).
from woodchipper import detect_additionalactions
actions = detect_additionalactions("document.pdf")
for action in actions:
print(f"{action['description']} @ offset {action['offset']}")
Detects actions triggered by:
- Document Open, Close, Save, Print
- Page Open, Close
Action types identified:
- JavaScript execution
- Launch external application
- Open URL
- Submit form data
- And more
Detect external action tags that access resources outside the PDF. Returns ActionDetail dicts with forensic offset, raw bytes, and decoded stream bytes.
from woodchipper import detect_external_actions
actions = detect_external_actions("document.pdf")
for action in actions:
print(f"{action['description']} @ offset {action['offset']}")
Detects:
- /Launch - Launches external applications (e.g., cmd.exe, executables)
- /URI - Opens URLs in browser
- /GoToR - Opens remote PDF documents
- /GoToE - Opens embedded documents
Detect JavaScript code embedded in a PDF with behavior analysis. Returns ActionDetail dicts with forensic offset, raw bytes, and decoded stream bytes.
from woodchipper import detect_javascript
scripts = detect_javascript("document.pdf")
for script in scripts:
print(f"{script['description']} @ offset {script['offset']}")
Detects /JavaScript and /JS tags in:
- Document OpenAction
- Document and page Additional Actions (/AA)
- Named JavaScript in the Names tree
- Annotation actions
Behavior analysis identifies:
- displays alert - app.alert() calls
- launches URL - app.launchURL() calls
- submits form data - this.submitForm() calls
- exports data - exportDataAsObject() calls
- evaluates dynamic code - eval() usage
- decodes obfuscated content - unescape(), fromCharCode()
- makes network request - XMLHttp, SOAP calls
- accesses form fields - this.getField() calls
- sets timer/delayed execution - setInterval/setTimeout
- potential heap spray - Collab.getIcon exploit pattern
Detect files embedded or attached to a PDF.
from woodchipper import detect_embedded_file
files = detect_embedded_file("document.pdf")
for f in files:
print(f"Name: {f['name']}")
print(f"Type: {f['file_type']}")
print(f"MIME: {f['mime_type']}")
print(f"Size: {f['size']} bytes")
print(f"Offset: {f['offset']}")
Detects embedded files in:
- /Names tree /EmbeddedFiles entries
- /FileSpec dictionaries with /EF streams
- /FileAttachment annotations
Returns EmbeddedFile dict with:
- name - Filename from the PDF
- file_type - File type description (from magic bytes)
- mime_type - MIME type (from magic bytes)
- size - File size in bytes
- description - Description from PDF metadata
- offset - Byte offset of the FileSpec object in the PDF (or null)
- raw_bytes - Space-separated hex bytes at the offset (or null)
- decoded_bytes - Decompressed embedded file content as space-separated hex (up to 1024 bytes, or null)
Detect and analyze AcroForm structures in a PDF. Returns ActionDetail dicts with forensic offset, raw bytes, and decoded stream bytes.
from woodchipper import detect_acroform
details = detect_acroform("document.pdf")
for detail in details:
print(f"{detail['description']} (offset: {detail['offset']})")
Detects:
- Presence of AcroForm in document catalog
- XFA forms (XML Forms Architecture)
- NeedAppearances flag (dynamic appearance generation)
- Signature flags (SignaturesExist, AppendOnly)
- Calculation Order (/CO) - automatic calculation scripts
- Form field details:
- Field types (Text, Button, Choice, Signature)
- Field flags (ReadOnly, Required, Password, Multiline, etc.)
- Actions attached to fields
- JavaScript in field actions
Detect and analyze XFA (XML Forms Architecture) in a PDF. Returns ActionDetail dicts with forensic offset, raw bytes, and decoded stream bytes.
from woodchipper import detect_xmlforms
details = detect_xmlforms("document.pdf")
for detail in details:
print(f"{detail['description']} (offset: {detail['offset']})")
Detects:
- XFA presence and structure (array or single stream)
- XFA components (template, config, datasets, localeSet, etc.)
- Embedded scripts (JavaScript and FormCalc)
- Event handlers (onClick, onEnter, onChange, etc.)
- Submit actions and URL targets
- Dangerous operations:
- xfa.host.messageBox - message display
- xfa.host.exportData/importData - data operations
- xfa.host.gotoURL - URL navigation
- app.launchURL, app.execMenuItem - application actions
- ADBC.* - database connectivity
- Net.HTTP, SOAP.* - network operations
Detect rich media content (3D, Flash, multimedia) in a PDF.
from woodchipper import detect_richmedia
findings = detect_richmedia("document.pdf")
for finding in findings:
print(finding)
Detects:
- /RichMedia annotations and related tags
- /3D, /3DD, /3DA, /3DV, /3DI - 3D content
- /U3D (Universal 3D), /PRC (Product Representation Compact)
- /Flash, /Movie, /Sound, /Screen - multimedia
- /Rendition, /GoTo3DView - multimedia actions
Note: Rich media like 3D and Flash are no longer supported by modern OS and PDF readers, making their presence suspicious.
Detect mismatches between declared and actual PDF stream lengths.
from woodchipper import detect_stream_mismatches
mismatches = detect_stream_mismatches("document.pdf")
for mismatch in mismatches:
print(mismatch)
Detects:
- Missing endstream markers (malformed structure)
- Declared length exceeds actual (truncated/tampered data)
- Actual length exceeds declared (injected data/buffer overflow attempt)
- Indirect length reference mismatches
Note: Mismatched stream lengths may indicate PDF tampering, corruption, or malicious manipulation to hide content or exploit PDF parsers.
Detect PDF forms and extract submission targets.
from woodchipper import extract_forms
forms = extract_forms("document.pdf")
if forms["forms_present"]:
print("Form submission targets:")
for target in forms["form_submission_targets"]:
print(f" {target}")
Validate that a file exists, is readable, and is a PDF.
from woodchipper import validate_pdf, ValidationError
try:
path = validate_pdf("document.pdf")
except ValidationError as e:
print(f"Invalid: {e}")
ValidationError: exception raised when file validation fails:
- File not found
- Path is not a file
- File is not readable
- File is not a PDF (based on magic bytes)
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"filename": { "type": "string" },
"filesize": { "type": "integer" },
"md5": { "type": "string" },
"sha1": { "type": "string" },
"sha256": { "type": "string" },
"urls": {
"type": "array",
"items": { "type": "string" }
},
"metadata": {
"type": "object",
"properties": {
"author": { "type": ["string", "null"] },
"creator": { "type": ["string", "null"] },
"producer": { "type": ["string", "null"] },
"subject": { "type": ["string", "null"] },
"title": { "type": ["string", "null"] },
"creation_date": { "type": ["string", "null"] },
"modification_date": { "type": ["string", "null"] },
"spoofing_indicators": {
"type": "array",
"items": { "type": "string" }
}
}
},
"anomalies": {
"type": "object",
"properties": {
"anomalies_present": { "type": "boolean" },
"anomalies": {
"type": "array",
"items": { "type": "string" },
"description": "Structural anomalies, rich media findings, and stream length mismatches"
},
"additional_actions_detected": {
"type": "array",
"items": { "$ref": "#/$defs/ActionDetail" }
},
"external_actions": {
"type": "array",
"items": { "$ref": "#/$defs/ActionDetail" }
},
"javascript_detected": {
"type": "array",
"items": { "$ref": "#/$defs/ActionDetail" }
},
"embedded_files": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": ["string", "null"] },
"file_type": { "type": ["string", "null"] },
"mime_type": { "type": ["string", "null"] },
"size": { "type": ["integer", "null"] },
"description": { "type": ["string", "null"] },
"offset": { "type": ["integer", "null"] },
"raw_bytes": { "type": ["string", "null"] },
"decoded_bytes": { "type": ["string", "null"] }
}
}
},
"acroform_details": {
"type": "array",
"items": { "$ref": "#/$defs/ActionDetail" },
"description": "AcroForm field details, types, flags, and actions"
},
"xfa_details": {
"type": "array",
"items": { "$ref": "#/$defs/ActionDetail" },
"description": "XFA (XML Forms Architecture) components and scripts"
}
}
},
"forms": {
"type": "object",
"properties": {
"forms_present": { "type": "boolean" },
"form_submission_targets": {
"type": "array",
"items": { "type": "string" }
}
}
}
},
"required": ["filename", "filesize", "md5", "sha1", "sha256", "urls", "metadata", "anomalies", "forms"],
"$defs": {
"ActionDetail": {
"type": "object",
"description": "Forensic detail for a detected PDF action",
"properties": {
"description": {
"type": "string",
"description": "Human-readable description of the action and its trigger"
},
"offset": {
"type": ["integer", "null"],
"description": "Byte offset of the PDF object in the file, or null for inline objects"
},
"raw_bytes": {
"type": ["string", "null"],
"description": "Space-separated hex-encoded raw bytes at the offset (up to 256 bytes), or null"
},
"decoded_bytes": {
"type": ["string", "null"],
"description": "Space-separated hex-encoded decompressed stream content (up to 1024 bytes), or null for non-stream objects"
}
},
"required": ["description", "offset", "raw_bytes", "decoded_bytes"]
}
}
}
Install in editable mode with all dependencies:
pip install -e ".[all]"
Or just dev dependencies:
pip install -e ".[dev]"
Run tests:
pytest
License: MIT