Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,28 @@ show-retention-status: ## Show data retention status
@python3 -c "from src.services.data_retention import DataRetentionService; from src.app import AppConfig; config = AppConfig.from_env(); service = DataRetentionService(config.data_retention_days, config.output_dir); files = service.find_expired_files(); print(f'Retention period: {config.data_retention_days} days'); print(f'Expired files: {len(files)}')"

# Docker build modes
.PHONY: docker-local docker-remote docker-build docker-pull
.PHONY: docker-local docker-remote docker-build docker-pull docker-integration

docker-integration: ## Run Docker integration test: process ./input and assert non-zero transaction output
# NOTE: `docker images <name> -q` exits 0 even when the image does not exist
# (it just prints nothing), so we must test its *output* for non-emptiness
# rather than its exit status — the old `|| { ...; exit 1; }` guard could
# never fire. `$$(...)` escapes the shell substitution from make.
	@[ -n "$$(docker images -q bankstatementsprocessor:latest 2>/dev/null)" ] || \
	{ echo "❌ Image not found. Run 'make docker-build' first."; exit 1; }
	@ls input/*.pdf > /dev/null 2>&1 || \
	{ echo "❌ No PDFs found in ./input — add PDFs before running."; exit 1; }
	@mkdir -p output
	docker run --rm \
	-v "$(PWD)/input:/app/input:ro" \
	-v "$(PWD)/output:/app/output" \
	-e LOG_LEVEL=WARNING \
	-e EXIT_AFTER_PROCESSING=true \
	-e GENERATE_MONTHLY_SUMMARY=false \
	-e GENERATE_EXPENSE_ANALYSIS=false \
	bankstatementsprocessor:latest
# Assert at least one non-duplicate CSV with a non-zero transaction count.
	@python3 -c "\
	import glob, csv, sys; \
	files = [f for f in glob.glob('output/*.csv') if 'duplicate' not in f]; \
	total = sum(sum(1 for _ in csv.DictReader(open(f))) for f in files); \
	print(f'✅ Docker integration: {total} transactions extracted'); \
	sys.exit(0 if total > 0 else 1)"

docker-local: ## Build and run from local code
@echo "🔨 Building from local code..."
Expand Down
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,55 @@ flake8 src tests

---

## Integration Testing

The integration test runs the full pipeline against real PDFs in `./input/` and compares key output metrics against a committed snapshot baseline. It is excluded from the default test run and must be triggered explicitly.

**Prerequisites:** place PDF bank statements in `./input/` before running.

### Run the integration test

```bash
cd packages/parser-core
pytest -m integration -v
```

The test asserts:
- Transaction counts, page counts, and PDF counts match the baseline
- Output filenames are unchanged
- `excluded_files.json` is consistent with the processing summary
- Each masked IBAN is correctly formatted and maps to the right output filename
- CSV row counts match JSON record counts per IBAN
- CSV column headers contain required fields (`Date`, `Details`, `Debit`/`Credit`)
- Duplicate JSON files are valid arrays

### Update the snapshot baseline

Run this after any intentional change that affects output counts or filenames (e.g. adding a new template, changing filter behaviour, adding a new output file):

```bash
cd packages/parser-core
pytest -m integration --snapshot-update --no-cov
```

The updated `snapshots/output_snapshot.json` should be committed — changes are visible in code review and serve as a record of intentional output changes.

### Docker integration test

Validate that the built Docker image correctly processes PDFs end-to-end:

```bash
# Build the image first
make docker-build

# Run against ./input — asserts non-zero CSV output
make docker-integration
```

This mounts `./input` read-only and `./output` writable, runs the container, then asserts that at least one CSV was produced with a non-zero transaction count.

---

## Contributing

Contributions are welcome!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"size_bytes": 29904
},
"bank_statements_3656.xlsx": {
"size_bytes": 7335
"size_bytes": 7334
},
"bank_statements_9015.csv": {
"row_count": 400,
Expand All @@ -28,7 +28,7 @@
"size_bytes": 156629
},
"bank_statements_9015.xlsx": {
"size_bytes": 18959
"size_bytes": 18958
},
"duplicates.json": {
"record_count": 0,
Expand Down
139 changes: 139 additions & 0 deletions packages/parser-core/tests/integration/test_output_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@

from __future__ import annotations

import csv
import json
import re
from pathlib import Path

import pytest
Expand Down Expand Up @@ -165,3 +167,140 @@ def test_output_snapshot(request: pytest.FixtureRequest) -> None:
f"{diff_text}\n\n"
"If this change is intentional, re-run with --snapshot-update to accept it."
)

_assert_output_invariants(OUTPUT_DIR, summary)


# ---------------------------------------------------------------------------
# Structural invariants — always asserted regardless of snapshot update mode
# ---------------------------------------------------------------------------

_IBAN_MASKED_RE = re.compile(r"^[A-Z]{2}[0-9]{2}\*+[0-9]{4}$")
_IBAN_DIGEST_RE = re.compile(r"^[0-9a-f]{64}$")


def _count_csv_rows(csv_path: Path) -> int:
    """Return the number of data rows in *csv_path*, excluding the header.

    Uses the ``csv`` module instead of counting newlines so that quoted
    fields containing embedded newlines are counted as a single record.
    """
    with csv_path.open(encoding="utf-8", newline="") as f:
        record_count = sum(1 for _ in csv.reader(f))
    # An empty file has no header either; never return a negative count.
    return max(record_count - 1, 0)


def _assert_output_invariants(output_dir: Path, summary: dict) -> None:
    """Assert structural correctness of the output directory.

    These checks are independent of the snapshot baseline — they validate
    relationships between output files that must always hold true.

    Args:
        output_dir: Directory containing the pipeline's output artifacts.
        summary: Processing summary; ``pdf_count``, ``pdfs_extracted``,
            ``transactions`` and ``duplicates`` keys are read (all default
            to 0 when absent).

    Raises:
        AssertionError: If any structural invariant is violated.
    """
    pdf_count = summary.get("pdf_count", 0)
    pdfs_extracted = summary.get("pdfs_extracted", 0)
    transactions = summary.get("transactions", 0)
    duplicates = summary.get("duplicates", 0)

    # 1a. Pipeline produced meaningful output
    assert pdfs_extracted > 0, "pdfs_extracted == 0 — pipeline silently processed nothing"
    assert transactions > 0, "transactions == 0 — no transactions extracted"

    # 1b. Excluded files log is consistent with processing summary:
    # every PDF not extracted must be accounted for, with a reason.
    excluded_path = output_dir / "excluded_files.json"
    if excluded_path.exists():
        excluded_data = json.loads(excluded_path.read_text(encoding="utf-8"))
        total_excluded = excluded_data.get("summary", {}).get("total_excluded", 0)
        assert total_excluded == pdf_count - pdfs_extracted, (
            f"excluded_files.json total_excluded ({total_excluded}) != "
            f"pdf_count - pdfs_extracted ({pdf_count} - {pdfs_extracted})"
        )
        for entry in excluded_data.get("excluded_files", []):
            assert entry.get("reason"), f"Excluded file {entry.get('filename')!r} has no reason"
            assert (entry.get("pages") or 0) > 0, (
                f"Excluded file {entry.get('filename')!r} has pages <= 0"
            )

    # 1c. IBAN file: record count, format, and filename suffix mapping
    ibans_path = output_dir / "ibans.json"
    if ibans_path.exists():
        ibans = json.loads(ibans_path.read_text(encoding="utf-8"))
        assert isinstance(ibans, list), "ibans.json should be a JSON array"
        assert len(ibans) == pdfs_extracted, (
            f"ibans.json has {len(ibans)} entries but pdfs_extracted == {pdfs_extracted}"
        )
        iban_suffixes = set()
        for entry in ibans:
            masked = entry.get("iban_masked", "")
            digest = entry.get("iban_digest", "")
            assert _IBAN_MASKED_RE.match(masked), (
                f"Masked IBAN {masked!r} does not match expected format [A-Z]{{2}}[0-9]{{2}}*...[0-9]{{4}}"
            )
            assert _IBAN_DIGEST_RE.match(digest), (
                f"IBAN digest {digest!r} is not a valid 64-char hex SHA-256"
            )
            # The last four visible digits link the IBAN to its output files.
            iban_suffixes.add(masked[-4:])

        # Every IBAN-specific output file suffix must correspond to a known IBAN
        for path in output_dir.iterdir():
            match = re.match(r"bank_statements_(\d{4})\.", path.name)
            if match:
                suffix = match.group(1)
                assert suffix in iban_suffixes, (
                    f"Output file {path.name!r} has suffix {suffix!r} "
                    f"but no IBAN ending in those digits found in ibans.json"
                )

    # 1d. Transaction count cross-check: CSV rows == JSON records per IBAN pair
    iban_csv_files = [
        p for p in output_dir.iterdir()
        if re.match(r"bank_statements_\d{4}\.csv$", p.name)
    ]
    csv_total = 0
    for csv_path in iban_csv_files:
        stem = csv_path.stem  # e.g. bank_statements_3656
        json_path = output_dir / f"{stem}.json"
        # Parse with the csv module (not line counting) so that quoted
        # multi-line fields are counted as one record.
        csv_rows = _count_csv_rows(csv_path)
        csv_total += csv_rows
        if json_path.exists():
            json_records = json.loads(json_path.read_text(encoding="utf-8"))
            assert isinstance(json_records, list)
            assert len(json_records) == csv_rows, (
                f"{csv_path.name} has {csv_rows} rows but "
                f"{json_path.name} has {len(json_records)} records"
            )

    assert csv_total == transactions, (
        f"Sum of IBAN-specific CSV rows ({csv_total}) != transactions ({transactions})"
    )

    # 1e. CSV column integrity on each IBAN-specific CSV
    required_columns = {"Date", "Details"}
    debit_credit_columns = {"Debit", "Credit", "Debit €", "Credit €"}
    for csv_path in iban_csv_files:
        with csv_path.open(encoding="utf-8") as f:
            reader = csv.DictReader(f)
            columns = reader.fieldnames or []
        assert len(columns) == len(set(columns)), (
            f"{csv_path.name} has duplicate column names: {columns}"
        )
        assert all(c for c in columns), (
            f"{csv_path.name} has empty column name(s): {columns}"
        )
        missing = required_columns - set(columns)
        assert not missing, (
            f"{csv_path.name} is missing required columns: {missing}"
        )
        assert debit_credit_columns & set(columns), (
            f"{csv_path.name} has no Debit or Credit column: {columns}"
        )

    # 1f. Duplicate files are valid JSON arrays.
    # The global duplicates.json + per-IBAN files may double-count, so the
    # per-file lengths are not summed; only the global file's length is
    # checked against the summary count below.
    duplicate_files = [
        p for p in output_dir.iterdir()
        if re.match(r"duplicates.*\.json$", p.name)
    ]
    for dup_path in duplicate_files:
        data = json.loads(dup_path.read_text(encoding="utf-8"))
        assert isinstance(data, list), f"{dup_path.name} is not a JSON array"

    global_dup = output_dir / "duplicates.json"
    if global_dup.exists():
        global_data = json.loads(global_dup.read_text(encoding="utf-8"))
        assert isinstance(global_data, list)
        assert len(global_data) == duplicates, (
            f"duplicates.json has {len(global_data)} records but summary says {duplicates} duplicates"
        )
Loading