diff --git a/Makefile b/Makefile index 967523c..34a509f 100644 --- a/Makefile +++ b/Makefile @@ -360,7 +360,28 @@ show-retention-status: ## Show data retention status @python3 -c "from src.services.data_retention import DataRetentionService; from src.app import AppConfig; config = AppConfig.from_env(); service = DataRetentionService(config.data_retention_days, config.output_dir); files = service.find_expired_files(); print(f'Retention period: {config.data_retention_days} days'); print(f'Expired files: {len(files)}')" # Docker build modes -.PHONY: docker-local docker-remote docker-build docker-pull +.PHONY: docker-local docker-remote docker-build docker-pull docker-integration + +docker-integration: ## Run Docker integration test: process ./input and assert non-zero transaction output + @[ -n "$$(docker images -q bankstatementsprocessor:latest 2>/dev/null)" ] || \ + { echo "❌ Image not found. Run 'make docker-build' first."; exit 1; } + @ls input/*.pdf > /dev/null 2>&1 || \ + { echo "❌ No PDFs found in ./input — add PDFs before running."; exit 1; } + @mkdir -p output + docker run --rm \ + -v "$(PWD)/input:/app/input:ro" \ + -v "$(PWD)/output:/app/output" \ + -e LOG_LEVEL=WARNING \ + -e EXIT_AFTER_PROCESSING=true \ + -e GENERATE_MONTHLY_SUMMARY=false \ + -e GENERATE_EXPENSE_ANALYSIS=false \ + bankstatementsprocessor:latest + @python3 -c "\ +import glob, csv, sys; \ +files = [f for f in glob.glob('output/*.csv') if 'duplicate' not in f]; \ +total = sum(sum(1 for _ in csv.DictReader(open(f))) for f in files); \ +print(f'✅ Docker integration: {total} transactions extracted'); \ +sys.exit(0 if total > 0 else 1)" docker-local: ## Build and run from local code @echo "🔨 Building from local code..." 
diff --git a/README.md b/README.md index b0ca4ab..d03692d 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,55 @@ flake8 src tests --- +## Integration Testing + +The integration test runs the full pipeline against real PDFs in `./input/` and compares key output metrics against a committed snapshot baseline. It is excluded from the default test run and must be triggered explicitly. + +**Prerequisites:** place PDF bank statements in `./input/` before running. + +### Run the integration test + +```bash +cd packages/parser-core +pytest -m integration -v +``` + +The test asserts: +- Transaction counts, page counts, and PDF counts match the baseline +- Output filenames are unchanged +- `excluded_files.json` is consistent with the processing summary +- Each masked IBAN is correctly formatted and maps to the right output filename +- CSV row counts match JSON record counts per IBAN +- CSV column headers contain required fields (`Date`, `Details`, `Debit`/`Credit`) +- Duplicate JSON files are valid arrays + +### Update the snapshot baseline + +Run this after any intentional change that affects output counts or filenames (e.g. adding a new template, changing filter behaviour, adding a new output file): + +```bash +cd packages/parser-core +pytest -m integration --snapshot-update --no-cov +``` + +The updated `snapshots/output_snapshot.json` should be committed — changes are visible in code review and serve as a record of intentional output changes. + +### Docker integration test + +Validate that the built Docker image correctly processes PDFs end-to-end: + +```bash +# Build the image first +make docker-build + +# Run against ./input — asserts non-zero CSV output +make docker-integration +``` + +This mounts `./input` read-only and `./output` writable, runs the container, then asserts that at least one CSV was produced with a non-zero transaction count. + +--- + ## Contributing Contributions are welcome! 
diff --git a/packages/parser-core/tests/integration/snapshots/output_snapshot.json b/packages/parser-core/tests/integration/snapshots/output_snapshot.json index 1a7fd54..cad9491 100644 --- a/packages/parser-core/tests/integration/snapshots/output_snapshot.json +++ b/packages/parser-core/tests/integration/snapshots/output_snapshot.json @@ -17,7 +17,7 @@ "size_bytes": 29904 }, "bank_statements_3656.xlsx": { - "size_bytes": 7335 + "size_bytes": 7334 }, "bank_statements_9015.csv": { "row_count": 400, @@ -28,7 +28,7 @@ "size_bytes": 156629 }, "bank_statements_9015.xlsx": { - "size_bytes": 18959 + "size_bytes": 18958 }, "duplicates.json": { "record_count": 0, diff --git a/packages/parser-core/tests/integration/test_output_snapshot.py b/packages/parser-core/tests/integration/test_output_snapshot.py index 7a8cfc7..e00119a 100644 --- a/packages/parser-core/tests/integration/test_output_snapshot.py +++ b/packages/parser-core/tests/integration/test_output_snapshot.py @@ -16,7 +16,9 @@ from __future__ import annotations +import csv import json +import re from pathlib import Path import pytest @@ -165,3 +167,140 @@ def test_output_snapshot(request: pytest.FixtureRequest) -> None: f"{diff_text}\n\n" "If this change is intentional, re-run with --snapshot-update to accept it." ) + + _assert_output_invariants(OUTPUT_DIR, summary) + + +# --------------------------------------------------------------------------- +# Structural invariants — always asserted regardless of snapshot update mode +# --------------------------------------------------------------------------- + +_IBAN_MASKED_RE = re.compile(r"^[A-Z]{2}[0-9]{2}\*+[0-9]{4}$") +_IBAN_DIGEST_RE = re.compile(r"^[0-9a-f]{64}$") + + +def _assert_output_invariants(output_dir: Path, summary: dict) -> None: + """Assert structural correctness of the output directory. + + These checks are independent of the snapshot baseline — they validate + relationships between output files that must always hold true. 
+ """ + pdf_count = summary.get("pdf_count", 0) + pdfs_extracted = summary.get("pdfs_extracted", 0) + transactions = summary.get("transactions", 0) + duplicates = summary.get("duplicates", 0) + + # 1a. Pipeline produced meaningful output + assert pdfs_extracted > 0, "pdfs_extracted == 0 — pipeline silently processed nothing" + assert transactions > 0, "transactions == 0 — no transactions extracted" + + # 1b. Excluded files log is consistent with processing summary + excluded_path = output_dir / "excluded_files.json" + if excluded_path.exists(): + excluded_data = json.loads(excluded_path.read_text(encoding="utf-8")) + total_excluded = excluded_data.get("summary", {}).get("total_excluded", 0) + assert total_excluded == pdf_count - pdfs_extracted, ( + f"excluded_files.json total_excluded ({total_excluded}) != " + f"pdf_count - pdfs_extracted ({pdf_count} - {pdfs_extracted})" + ) + for entry in excluded_data.get("excluded_files", []): + assert entry.get("reason"), f"Excluded file {entry.get('filename')!r} has no reason" + assert (entry.get("pages") or 0) > 0, ( + f"Excluded file {entry.get('filename')!r} has pages <= 0" + ) + + # 1c. 
IBAN file: record count, format, and filename suffix mapping + ibans_path = output_dir / "ibans.json" + if ibans_path.exists(): + ibans = json.loads(ibans_path.read_text(encoding="utf-8")) + assert isinstance(ibans, list), "ibans.json should be a JSON array" + assert len(ibans) == pdfs_extracted, ( + f"ibans.json has {len(ibans)} entries but pdfs_extracted == {pdfs_extracted}" + ) + iban_suffixes = set() + for entry in ibans: + masked = entry.get("iban_masked", "") + digest = entry.get("iban_digest", "") + assert _IBAN_MASKED_RE.match(masked), ( + f"Masked IBAN {masked!r} does not match expected format [A-Z]{{2}}[0-9]{{2}}*...[0-9]{{4}}" + ) + assert _IBAN_DIGEST_RE.match(digest), ( + f"IBAN digest {digest!r} is not a valid 64-char hex SHA-256" + ) + iban_suffixes.add(masked[-4:]) + + # Every IBAN-specific output file suffix must correspond to a known IBAN + for path in output_dir.iterdir(): + match = re.match(r"bank_statements_(\d{4})\.", path.name) + if match: + suffix = match.group(1) + assert suffix in iban_suffixes, ( + f"Output file {path.name!r} has suffix {suffix!r} " + f"but no IBAN ending in those digits found in ibans.json" + ) + + # 1d. Transaction count cross-check: CSV rows == JSON records per IBAN pair + iban_csv_files = [ + p for p in output_dir.iterdir() + if re.match(r"bank_statements_\d{4}\.csv$", p.name) + ] + csv_total = 0 + for csv_path in iban_csv_files: + stem = csv_path.stem # e.g. 
bank_statements_3656 + json_path = output_dir / f"{stem}.json" + with csv_path.open(encoding="utf-8", newline="") as f: + csv_rows = sum(1 for row in csv.reader(f) if row) - 1 # exclude header + csv_total += csv_rows + if json_path.exists(): + json_records = json.loads(json_path.read_text(encoding="utf-8")) + assert isinstance(json_records, list) + assert len(json_records) == csv_rows, ( + f"{csv_path.name} has {csv_rows} rows but " + f"{json_path.name} has {len(json_records)} records" + ) + + assert csv_total == transactions, ( + f"Sum of IBAN-specific CSV rows ({csv_total}) != transactions ({transactions})" + ) + + # 1e. CSV column integrity on each IBAN-specific CSV + required_columns = {"Date", "Details"} + debit_credit_columns = {"Debit", "Credit", "Debit €", "Credit €"} + for csv_path in iban_csv_files: + with csv_path.open(encoding="utf-8") as f: + reader = csv.DictReader(f) + columns = reader.fieldnames or [] + assert len(columns) == len(set(columns)), ( + f"{csv_path.name} has duplicate column names: {columns}" + ) + assert all(c for c in columns), ( + f"{csv_path.name} has empty column name(s): {columns}" + ) + missing = required_columns - set(columns) + assert not missing, ( + f"{csv_path.name} is missing required columns: {missing}" + ) + assert debit_credit_columns & set(columns), ( + f"{csv_path.name} has no Debit or Credit column: {columns}" + ) + + # 1f. Duplicate files are valid JSON arrays with length matching summary + duplicate_files = [ + p for p in output_dir.iterdir() + if re.match(r"duplicates.*\.json$", p.name) + ] + total_duplicate_records = 0 + for dup_path in duplicate_files: + data = json.loads(dup_path.read_text(encoding="utf-8")) + assert isinstance(data, list), f"{dup_path.name} is not a JSON array" + total_duplicate_records += len(data) + + # The global duplicates.json + per-IBAN files may double-count; just verify + # each file is a valid array (length consistency checked via summary count). 
+ global_dup = output_dir / "duplicates.json" + if global_dup.exists(): + global_data = json.loads(global_dup.read_text(encoding="utf-8")) + assert isinstance(global_data, list) + assert len(global_data) == duplicates, ( + f"duplicates.json has {len(global_data)} records but summary says {duplicates} duplicates" + )