Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,28 @@ show-retention-status: ## Show data retention status
@python3 -c "from src.services.data_retention import DataRetentionService; from src.app import AppConfig; config = AppConfig.from_env(); service = DataRetentionService(config.data_retention_days, config.output_dir); files = service.find_expired_files(); print(f'Retention period: {config.data_retention_days} days'); print(f'Expired files: {len(files)}')"

# Docker build modes
.PHONY: docker-local docker-remote docker-build docker-pull
.PHONY: docker-local docker-remote docker-build docker-pull docker-integration

docker-integration: ## Run Docker integration test: process ./input and assert non-zero transaction output
# NOTE: `docker images <name> -q` exits 0 even when the image does not exist
# (it just prints nothing), so we must test its *output* for non-emptiness
# rather than its exit status — the old `|| { ...; exit 1; }` guard could
# never fire. `$$(...)` escapes the shell substitution from make.
	@[ -n "$$(docker images -q bankstatementsprocessor:latest 2>/dev/null)" ] || \
	{ echo "❌ Image not found. Run 'make docker-build' first."; exit 1; }
	@ls input/*.pdf > /dev/null 2>&1 || \
	{ echo "❌ No PDFs found in ./input — add PDFs before running."; exit 1; }
	@mkdir -p output
	docker run --rm \
	-v "$(PWD)/input:/app/input:ro" \
	-v "$(PWD)/output:/app/output" \
	-e LOG_LEVEL=WARNING \
	-e EXIT_AFTER_PROCESSING=true \
	-e GENERATE_MONTHLY_SUMMARY=false \
	-e GENERATE_EXPENSE_ANALYSIS=false \
	bankstatementsprocessor:latest
# Assert at least one non-duplicate CSV with a non-zero transaction count.
	@python3 -c "\
	import glob, csv, sys; \
	files = [f for f in glob.glob('output/*.csv') if 'duplicate' not in f]; \
	total = sum(sum(1 for _ in csv.DictReader(open(f))) for f in files); \
	print(f'✅ Docker integration: {total} transactions extracted'); \
	sys.exit(0 if total > 0 else 1)"

docker-local: ## Build and run from local code
@echo "🔨 Building from local code..."
Expand Down
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,55 @@ flake8 src tests

---

## Integration Testing

The integration test runs the full pipeline against real PDFs in `./input/` and compares key output metrics against a committed snapshot baseline. It is excluded from the default test run and must be triggered explicitly.

**Prerequisites:** place PDF bank statements in `./input/` before running.

### Run the integration test

```bash
cd packages/parser-core
pytest -m integration -v
```

The test asserts:
- Transaction counts, page counts, and PDF counts match the baseline
- Output filenames are unchanged
- `excluded_files.json` is consistent with the processing summary
- Each masked IBAN is correctly formatted and maps to the right output filename
- CSV row counts match JSON record counts per IBAN
- CSV column headers contain required fields (`Date`, `Details`, `Debit`/`Credit`)
- Duplicate JSON files are valid arrays

### Update the snapshot baseline

Run this after any intentional change that affects output counts or filenames (e.g. adding a new template, changing filter behaviour, adding a new output file):

```bash
cd packages/parser-core
pytest -m integration --snapshot-update --no-cov
```

The updated `snapshots/output_snapshot.json` should be committed — changes are visible in code review and serve as a record of intentional output changes.

### Docker integration test

Validate that the built Docker image correctly processes PDFs end-to-end:

```bash
# Build the image first
make docker-build

# Run against ./input — asserts non-zero CSV output
make docker-integration
```

This mounts `./input` read-only and `./output` writable, runs the container, then asserts that at least one CSV was produced with a non-zero transaction count.

---

## Contributing

Contributions are welcome!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"size_bytes": 29904
},
"bank_statements_3656.xlsx": {
"size_bytes": 7335
"size_bytes": 7334
},
"bank_statements_9015.csv": {
"row_count": 400,
Expand All @@ -28,7 +28,7 @@
"size_bytes": 156629
},
"bank_statements_9015.xlsx": {
"size_bytes": 18959
"size_bytes": 18958
},
"duplicates.json": {
"record_count": 0,
Expand Down
139 changes: 139 additions & 0 deletions packages/parser-core/tests/integration/test_output_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

from __future__ import annotations

import csv
import json
import re
from pathlib import Path

import pytest
Expand Down Expand Up @@ -165,3 +167,140 @@ def test_output_snapshot(request: pytest.FixtureRequest) -> None:
f"{diff_text}\n\n"
"If this change is intentional, re-run with --snapshot-update to accept it."
)

_assert_output_invariants(OUTPUT_DIR, summary)


# ---------------------------------------------------------------------------
# Structural invariants — always asserted regardless of snapshot update mode
# ---------------------------------------------------------------------------

_IBAN_MASKED_RE = re.compile(r"^[A-Z]{2}[0-9]{2}\*+[0-9]{4}$")
_IBAN_DIGEST_RE = re.compile(r"^[0-9a-f]{64}$")


def _count_csv_rows(csv_path: Path) -> int:
    """Return the number of data rows in *csv_path*, excluding the header.

    Uses the ``csv`` module instead of counting newlines so that quoted
    fields containing embedded newlines are counted as a single record.
    """
    with csv_path.open(encoding="utf-8", newline="") as f:
        record_count = sum(1 for _ in csv.reader(f))
    # An empty file has no header either; never return a negative count.
    return max(record_count - 1, 0)


def _assert_output_invariants(output_dir: Path, summary: dict) -> None:
    """Assert structural correctness of the output directory.

    These checks are independent of the snapshot baseline — they validate
    relationships between output files that must always hold true.

    Args:
        output_dir: Directory containing the pipeline's output artifacts.
        summary: Processing summary; ``pdf_count``, ``pdfs_extracted``,
            ``transactions`` and ``duplicates`` keys are read (all default
            to 0 when absent).

    Raises:
        AssertionError: If any structural invariant is violated.
    """
    pdf_count = summary.get("pdf_count", 0)
    pdfs_extracted = summary.get("pdfs_extracted", 0)
    transactions = summary.get("transactions", 0)
    duplicates = summary.get("duplicates", 0)

    # 1a. Pipeline produced meaningful output
    assert pdfs_extracted > 0, "pdfs_extracted == 0 — pipeline silently processed nothing"
    assert transactions > 0, "transactions == 0 — no transactions extracted"

    # 1b. Excluded files log is consistent with processing summary:
    # every PDF not extracted must be accounted for, with a reason.
    excluded_path = output_dir / "excluded_files.json"
    if excluded_path.exists():
        excluded_data = json.loads(excluded_path.read_text(encoding="utf-8"))
        total_excluded = excluded_data.get("summary", {}).get("total_excluded", 0)
        assert total_excluded == pdf_count - pdfs_extracted, (
            f"excluded_files.json total_excluded ({total_excluded}) != "
            f"pdf_count - pdfs_extracted ({pdf_count} - {pdfs_extracted})"
        )
        for entry in excluded_data.get("excluded_files", []):
            assert entry.get("reason"), f"Excluded file {entry.get('filename')!r} has no reason"
            assert (entry.get("pages") or 0) > 0, (
                f"Excluded file {entry.get('filename')!r} has pages <= 0"
            )

    # 1c. IBAN file: record count, format, and filename suffix mapping
    ibans_path = output_dir / "ibans.json"
    if ibans_path.exists():
        ibans = json.loads(ibans_path.read_text(encoding="utf-8"))
        assert isinstance(ibans, list), "ibans.json should be a JSON array"
        assert len(ibans) == pdfs_extracted, (
            f"ibans.json has {len(ibans)} entries but pdfs_extracted == {pdfs_extracted}"
        )
        iban_suffixes = set()
        for entry in ibans:
            masked = entry.get("iban_masked", "")
            digest = entry.get("iban_digest", "")
            assert _IBAN_MASKED_RE.match(masked), (
                f"Masked IBAN {masked!r} does not match expected format [A-Z]{{2}}[0-9]{{2}}*...[0-9]{{4}}"
            )
            assert _IBAN_DIGEST_RE.match(digest), (
                f"IBAN digest {digest!r} is not a valid 64-char hex SHA-256"
            )
            # The last four visible digits link the IBAN to its output files.
            iban_suffixes.add(masked[-4:])

        # Every IBAN-specific output file suffix must correspond to a known IBAN
        for path in output_dir.iterdir():
            match = re.match(r"bank_statements_(\d{4})\.", path.name)
            if match:
                suffix = match.group(1)
                assert suffix in iban_suffixes, (
                    f"Output file {path.name!r} has suffix {suffix!r} "
                    f"but no IBAN ending in those digits found in ibans.json"
                )

    # 1d. Transaction count cross-check: CSV rows == JSON records per IBAN pair
    iban_csv_files = [
        p for p in output_dir.iterdir()
        if re.match(r"bank_statements_\d{4}\.csv$", p.name)
    ]
    csv_total = 0
    for csv_path in iban_csv_files:
        stem = csv_path.stem  # e.g. bank_statements_3656
        json_path = output_dir / f"{stem}.json"
        # Parse with the csv module (not line counting) so that quoted
        # multi-line fields are counted as one record.
        csv_rows = _count_csv_rows(csv_path)
        csv_total += csv_rows
        if json_path.exists():
            json_records = json.loads(json_path.read_text(encoding="utf-8"))
            assert isinstance(json_records, list)
            assert len(json_records) == csv_rows, (
                f"{csv_path.name} has {csv_rows} rows but "
                f"{json_path.name} has {len(json_records)} records"
            )

    assert csv_total == transactions, (
        f"Sum of IBAN-specific CSV rows ({csv_total}) != transactions ({transactions})"
    )

    # 1e. CSV column integrity on each IBAN-specific CSV
    required_columns = {"Date", "Details"}
    debit_credit_columns = {"Debit", "Credit", "Debit €", "Credit €"}
    for csv_path in iban_csv_files:
        with csv_path.open(encoding="utf-8") as f:
            reader = csv.DictReader(f)
            columns = reader.fieldnames or []
        assert len(columns) == len(set(columns)), (
            f"{csv_path.name} has duplicate column names: {columns}"
        )
        assert all(c for c in columns), (
            f"{csv_path.name} has empty column name(s): {columns}"
        )
        missing = required_columns - set(columns)
        assert not missing, (
            f"{csv_path.name} is missing required columns: {missing}"
        )
        assert debit_credit_columns & set(columns), (
            f"{csv_path.name} has no Debit or Credit column: {columns}"
        )

    # 1f. Duplicate files are valid JSON arrays.
    # The global duplicates.json + per-IBAN files may double-count, so the
    # per-file lengths are not summed; only the global file's length is
    # checked against the summary count below.
    duplicate_files = [
        p for p in output_dir.iterdir()
        if re.match(r"duplicates.*\.json$", p.name)
    ]
    for dup_path in duplicate_files:
        data = json.loads(dup_path.read_text(encoding="utf-8"))
        assert isinstance(data, list), f"{dup_path.name} is not a JSON array"

    global_dup = output_dir / "duplicates.json"
    if global_dup.exists():
        global_data = json.loads(global_dup.read_text(encoding="utf-8"))
        assert isinstance(global_data, list)
        assert len(global_data) == duplicates, (
            f"duplicates.json has {len(global_data)} records but summary says {duplicates} duplicates"
        )
Loading