From 5bcfdc184d28262a4f13147c397bea0d95942461 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 24 Jan 2025 12:24:49 +0000 Subject: [PATCH 01/63] Bump utils to 94.0.1 ## 94.0.1 * Add `ruff.toml` to `MANIFEST.in` ## 94.0.0 * `version_tools.copy_config` will now copy `ruff.toml` instead of `pyproject.toml`. Apps should maintain their own `pyproject.toml`, if required * Replaces `black` formatter with `ruff format` * Upgrades `ruff` to version 0.9.2 ## 93.2.1 * Replaces symlink at `./pyproject.toml` with a duplicate file ## 93.2.0 * logging: add verbose eventlet context to app.request logs if request_time is over a threshold ## 93.1.0 * Introduce `NOTIFY_LOG_LEVEL_HANDLERS` config variable for separate control of handler log level ## 93.0.0 * BREAKING CHANGE: logging: all contents of `logging/__init__.py` have been moved to `logging/flask.py` because they all assume a flask(-like) environment and this way we don't implicitly import all of flask etc. every time anything under `logging` is imported. * Adds `request_cpu_time` to `app.request` logs when available. * Adds ability to track eventlet's switching of greenlets and annotate `app.request` logs with useful metrics when the flask config parameter `NOTIFY_EVENTLET_STATS` is set and the newly provided function `account_greenlet_times` is installed as a greenlet trace hook. 
*** Complete changes: https://github.com/alphagov/notifications-utils/compare/92.1.1...94.0.1 --- .pre-commit-config.yaml | 10 +++------- app/__init__.py | 5 +++-- requirements.in | 2 +- requirements.txt | 2 +- requirements_for_test.txt | 14 ++----------- requirements_for_test_common.in | 5 ++--- ruff.toml | 35 +++++++++++++++++++++++++++++++++ 7 files changed, 47 insertions(+), 26 deletions(-) create mode 100644 ruff.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e58e2b61..37a8eee2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@92.1.1 +# This file was automatically copied from notifications-utils@94.0.1 repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -9,12 +9,8 @@ repos: - id: check-yaml - id: debug-statements - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.8.2' + rev: 'v0.9.2' hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] -- repo: https://github.com/psf/black - rev: 24.10.0 - hooks: - - id: black - name: black (python) + - id: ruff-format diff --git a/app/__init__.py b/app/__init__.py index 7cd359ee..06ea0683 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -5,11 +5,12 @@ from flask import Flask, current_app, jsonify from gds_metrics import GDSMetrics -from notifications_utils import logging, request_helper +from notifications_utils import request_helper from notifications_utils.clients.antivirus.antivirus_client import AntivirusClient from notifications_utils.clients.redis.redis_client import RedisClient from notifications_utils.eventlet import EventletTimeout from notifications_utils.local_vars import LazyLocalGetter +from notifications_utils.logging import flask as utils_logging from werkzeug.local import LocalProxy from app.config import Config, configs @@ -63,7 +64,7 @@ def create_app(): application.config.from_object(Config) request_helper.init_app(application) - 
logging.init_app(application) + utils_logging.init_app(application) metrics.init_app(application) redis_client.init_app(application) diff --git a/requirements.in b/requirements.in index 072fb2ce..e5928a2f 100644 --- a/requirements.in +++ b/requirements.in @@ -9,6 +9,6 @@ gds-metrics==0.2.4 argon2-cffi==21.3.0 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@92.1.1 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@94.0.1 sentry_sdk[flask]>=1.0.0,<2.0.0 diff --git a/requirements.txt b/requirements.txt index 114acce4..2c46875b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -70,7 +70,7 @@ markupsafe==2.1.3 # werkzeug mistune==0.8.4 # via notifications-utils -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@b9ef1b45b655324d6341995afffc3ebc25a3ed72 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d6311c3ab83b8225265277f781157c48f9318ae9 # via -r requirements.in ordered-set==4.1.0 # via notifications-utils diff --git a/requirements_for_test.txt b/requirements_for_test.txt index dc1035b0..79bc5172 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -14,8 +14,6 @@ awscrt==0.20.11 # botocore beautifulsoup4==4.12.3 # via -r requirements_for_test_common.in -black==24.10.0 - # via -r requirements_for_test_common.in blinker==1.9.0 # via # -r requirements.txt @@ -50,7 +48,6 @@ charset-normalizer==3.3.2 click==8.1.7 # via # -r requirements.txt - # black # flask coverage==7.6.4 # via pytest-testmon @@ -122,9 +119,7 @@ mistune==0.8.4 # via # -r requirements.txt # notifications-utils -mypy-extensions==1.0.0 - # via black -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@b9ef1b45b655324d6341995afffc3ebc25a3ed72 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d6311c3ab83b8225265277f781157c48f9318ae9 # via -r requirements.txt 
ordered-set==4.1.0 # via @@ -133,17 +128,12 @@ ordered-set==4.1.0 packaging==23.2 # via # -r requirements.txt - # black # gunicorn # pytest -pathspec==0.12.1 - # via black phonenumbers==8.13.52 # via # -r requirements.txt # notifications-utils -platformdirs==4.3.6 - # via black pluggy==1.5.0 # via pytest prometheus-client==0.19.0 @@ -210,7 +200,7 @@ requests-mock==1.12.1 # via -r requirements_for_test_common.in rsa==4.7.2 # via -r requirements.txt -ruff==0.8.2 +ruff==0.9.2 # via -r requirements_for_test_common.in s3transfer==0.10.1 # via diff --git a/requirements_for_test_common.in b/requirements_for_test_common.in index d424c600..e660e7e4 100644 --- a/requirements_for_test_common.in +++ b/requirements_for_test_common.in @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@92.1.1 +# This file was automatically copied from notifications-utils@94.0.1 beautifulsoup4==4.12.3 pytest==8.3.4 @@ -9,5 +9,4 @@ pytest-testmon==2.1.1 requests-mock==1.12.1 freezegun==1.5.1 -black==24.10.0 # Also update `.pre-commit-config.yaml` if this changes -ruff==0.8.2 # Also update `.pre-commit-config.yaml` if this changes +ruff==0.9.2 # Also update `.pre-commit-config.yaml` if this changes diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 00000000..77c6085c --- /dev/null +++ b/ruff.toml @@ -0,0 +1,35 @@ +# This file was automatically copied from notifications-utils@94.0.1 + +exclude = [ + "migrations/versions/", + "venv*", + "__pycache__", + "node_modules", + "cache", + "migrations", + "build", + "sample_cap_xml_documents.py", +] + +line-length = 120 + +target-version = "py311" + +[lint] +select = [ + "E", # pycodestyle + "W", # pycodestyle + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C90", # mccabe cyclomatic complexity + "G", # flake8-logging-format + "T20", # flake8-print + "UP", # pyupgrade + "C4", # flake8-comprehensions + "ISC", # flake8-implicit-str-concat + "RSE", # flake8-raise + "PIE", # flake8-pie + "N804", # First argument 
of a class method should be named `cls` +] +ignore = [] From 427df817353279e1b69108fef7b55f2c930bcfb7 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 24 Jan 2025 12:25:20 +0000 Subject: [PATCH 02/63] Remove pyproject.toml --- pyproject.toml | 37 ------------------------------------- 1 file changed, 37 deletions(-) delete mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index a98c29c9..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,37 +0,0 @@ -# This file was automatically copied from notifications-utils@92.1.1 - -[tool.black] -line-length = 120 - -[tool.ruff] -line-length = 120 - -target-version = "py311" - -lint.select = [ - "E", # pycodestyle - "W", # pycodestyle - "F", # pyflakes - "I", # isort - "B", # flake8-bugbear - "C90", # mccabe cyclomatic complexity - "G", # flake8-logging-format - "T20", # flake8-print - "UP", # pyupgrade - "C4", # flake8-comprehensions - "ISC", # flake8-implicit-str-concat - "RSE", # flake8-raise - "PIE", # flake8-pie - "N804", # First argument of a class method should be named `cls` -] -lint.ignore = [] -exclude = [ - "migrations/versions/", - "venv*", - "__pycache__", - "node_modules", - "cache", - "migrations", - "build", - "sample_cap_xml_documents.py", -] From 3d838154e75ab516ce1fcb423271b0685f2f134c Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 24 Jan 2025 12:25:41 +0000 Subject: [PATCH 03/63] Run ruff format --- tests/upload/test_views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/upload/test_views.py b/tests/upload/test_views.py index 9c388950..e5614fa7 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -324,7 +324,7 @@ def test_document_upload_csv_handling( "http://download.document-download-frontend-test", "/services/00000000-0000-0000-0000-000000000000", "/documents/ffffffff-ffff-ffff-ffff-ffffffffffff", - f'.{app.config["MIME_TYPES_TO_FILE_EXTENSIONS"][expected_mimetype]}', + 
f".{app.config['MIME_TYPES_TO_FILE_EXTENSIONS'][expected_mimetype]}", "?key=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", ] ), From 9c1da63ace5b14ca2c65891d074fac1fc6d4fa0a Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 24 Jan 2025 12:26:08 +0000 Subject: [PATCH 04/63] =?UTF-8?q?Don=E2=80=99t=20run=20black=20from=20Make?= =?UTF-8?q?file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c6892eb9..6b275ce2 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ run-flask-with-docker: ## Run flask with docker .PHONY: test test: ## Run all tests ruff check . - black --check . + ruff format --check . py.test tests/ .PHONY: test-with-docker From 936dac2cf3aeeb9af1d92c4321ee28d91935bb46 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 11 Feb 2025 11:07:12 +0000 Subject: [PATCH 05/63] Remove pull request template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All our apps are continously deployed now, so we don’t need to specifically call it out when making PRs against this app. 
--- .github/pull_request_template.md | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 .github/pull_request_template.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index 09e9bc7c..00000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,7 +0,0 @@ - - ---- - -🚨⚠️ This will be deployed automatically all the way to production when you click merge ⚠️🚨 - -For more information, including how to check this deployment on preview or staging first before it goes to production, see our [team wiki section on deployment](https://github.com/alphagov/notifications-manuals/wiki/Merging-and-deploying#deployment) From 8c90408b1e76d9d93fc282f50207f7a01fe11a13 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 11 Feb 2025 11:21:55 +0000 Subject: [PATCH 06/63] Install uv from Docker registry Copies https://github.com/alphagov/notifications-api/pull/4376/files Also consolidates environment variables into a single block --- docker/Dockerfile | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 713f280e..6e9418c3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,8 +1,12 @@ FROM python:3.11-slim-bookworm AS base +COPY --from=ghcr.io/astral-sh/uv:0.5.30 /uv /uvx /bin/ + ENV DEBIAN_FRONTEND=noninteractive \ PYTHONUNBUFFERED=1 \ - UV_COMPILE_BYTECODE=1 + UV_COMPILE_BYTECODE=1 \ + UV_CACHE_DIR='/tmp/uv-cache/' \ + VIRTUAL_ENV="/opt/venv" RUN apt-get update \ && apt-get install -y --no-install-recommends \ @@ -29,9 +33,6 @@ RUN echo "Install OS dependencies for python app requirements" && \ COPY requirements.txt ./ -RUN pip install uv - -ENV UV_CACHE_DIR='/tmp/uv-cache/' RUN echo "Installing python dependencies" && \ python3 -m venv /opt/venv && \ uv pip sync --python /opt/venv/bin/python requirements.txt @@ -88,9 +89,7 @@ RUN mkdir -p app COPY --from=python_build --chown=notify:notify /opt/venv /opt/venv # Install dev/test 
requirements -RUN pip install uv COPY --chown=notify:notify Makefile requirements_for_test.txt ./ -ENV UV_CACHE_DIR='/tmp/uv-cache/' RUN make bootstrap # Copy from the real world, one dir up (project root) into the environment's current working directory From f7519b3b17cd0115250ad7aa5d82307a63875a16 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Thu, 13 Feb 2025 12:03:09 +0000 Subject: [PATCH 07/63] Bump utils to 95.1.1 ## 95.1.1 * Add `RUF100` rule to linter config (checks for inapplicable uses of `# noqa`) ## 95.1.0 * Adds log message and statsd metric for retried celery tasks ## 95.0.0 * Reverts 92.0.0 to restore new validation code *** Complete changes: https://github.com/alphagov/notifications-utils/compare/94.0.1...95.1.1 --- .pre-commit-config.yaml | 2 +- requirements.in | 2 +- requirements.txt | 2 +- requirements_for_test.txt | 2 +- requirements_for_test_common.in | 2 +- ruff.toml | 3 ++- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37a8eee2..e5d72f56 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@94.0.1 +# This file was automatically copied from notifications-utils@95.1.1 repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/requirements.in b/requirements.in index e5928a2f..1b3f8188 100644 --- a/requirements.in +++ b/requirements.in @@ -9,6 +9,6 @@ gds-metrics==0.2.4 argon2-cffi==21.3.0 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@94.0.1 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@95.1.1 sentry_sdk[flask]>=1.0.0,<2.0.0 diff --git a/requirements.txt b/requirements.txt index 2c46875b..ef53e506 100644 --- a/requirements.txt +++ b/requirements.txt @@ -70,7 +70,7 @@ markupsafe==2.1.3 # werkzeug mistune==0.8.4 # via notifications-utils 
-notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d6311c3ab83b8225265277f781157c48f9318ae9 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@361486ff4960b5d02c162118b1fd64b097f9cb96 # via -r requirements.in ordered-set==4.1.0 # via notifications-utils diff --git a/requirements_for_test.txt b/requirements_for_test.txt index 79bc5172..692c53a2 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -119,7 +119,7 @@ mistune==0.8.4 # via # -r requirements.txt # notifications-utils -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d6311c3ab83b8225265277f781157c48f9318ae9 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@361486ff4960b5d02c162118b1fd64b097f9cb96 # via -r requirements.txt ordered-set==4.1.0 # via diff --git a/requirements_for_test_common.in b/requirements_for_test_common.in index e660e7e4..3f17b097 100644 --- a/requirements_for_test_common.in +++ b/requirements_for_test_common.in @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@94.0.1 +# This file was automatically copied from notifications-utils@95.1.1 beautifulsoup4==4.12.3 pytest==8.3.4 diff --git a/ruff.toml b/ruff.toml index 77c6085c..e65bd9de 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@94.0.1 +# This file was automatically copied from notifications-utils@95.1.1 exclude = [ "migrations/versions/", @@ -31,5 +31,6 @@ select = [ "RSE", # flake8-raise "PIE", # flake8-pie "N804", # First argument of a class method should be named `cls` + "RUF100", # Checks for noqa directives that are no longer applicable ] ignore = [] From 6441b985efc220ea3fb16d57a1e216ab269877a4 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Wed, 5 Mar 2025 14:48:51 +0000 Subject: [PATCH 08/63] Bump Sentry SDK to 1.45.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Fixes GHSA-g92j-qhmh-64v2 Not something that likely affects us, but clears down another Dependabot warning. This fix was released in sentry-sdk==2.8.0, then also backported to sentry-sdk==1.45.1. I think we could probably upgrade to Sentry >= 2 without making any changes, but more testing would be needed to validate this. So just going with the backport for now. Specifying an exact version because that’s our convention. --- requirements.in | 2 +- requirements.txt | 2 +- requirements_for_test.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.in b/requirements.in index 1b3f8188..f1d8ee0f 100644 --- a/requirements.in +++ b/requirements.in @@ -11,4 +11,4 @@ argon2-cffi==21.3.0 # Run `make bump-utils` to update to the latest version notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@95.1.1 -sentry_sdk[flask]>=1.0.0,<2.0.0 +sentry-sdk[flask]==1.45.1 diff --git a/requirements.txt b/requirements.txt index ef53e506..a0159b38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -108,7 +108,7 @@ s3transfer==0.10.1 # via boto3 segno==1.6.1 # via notifications-utils -sentry-sdk==1.45.0 +sentry-sdk==1.45.1 # via -r requirements.in six==1.16.0 # via python-dateutil diff --git a/requirements_for_test.txt b/requirements_for_test.txt index 692c53a2..4a1533cb 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -210,7 +210,7 @@ segno==1.6.1 # via # -r requirements.txt # notifications-utils -sentry-sdk==1.45.0 +sentry-sdk==1.45.1 # via -r requirements.txt six==1.16.0 # via From f3575211a32b07caa3b27160adf93f039e245d71 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Thu, 6 Mar 2025 11:41:55 +0000 Subject: [PATCH 09/63] Add test for gunicorn config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes sure that: - all the imports in `gunicorn_config.py` exist - it’s exporting the variables we expect it to --- 
tests/test_gunicorn_config.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 tests/test_gunicorn_config.py diff --git a/tests/test_gunicorn_config.py b/tests/test_gunicorn_config.py new file mode 100644 index 00000000..7ff18331 --- /dev/null +++ b/tests/test_gunicorn_config.py @@ -0,0 +1,9 @@ +from gunicorn_config import keepalive, timeout, worker_class, worker_connections, workers + + +def test_gunicorn_config(): + assert workers == 4 + assert worker_class == "eventlet" + assert worker_connections == 1_000 + assert keepalive == 90 + assert timeout == 30 From 47f32448ece34394e3fd2ee6d8ef8c8720d3d8b2 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 14 Mar 2025 16:02:55 +0000 Subject: [PATCH 10/63] Bump utils to 97.0.3 ## 97.0.3 * Bump minimum jinja2 version to 3.1.6 ## 97.0.2 * Fix external link redirect ## 97.0.1 * Fix QR code URL with '&' encoded as '&' ## 97.0.0 * for bilingual letters preview, only show barcodes on the first page (patch). * rename `include_notify_tag` argument to `includes_first_page` (major). ## 96.0.0 * BREAKING CHANGE: the `gunicorn_defaults` module has been moved to `gunicorn.defaults` to make space for gunicorn-related utils that don't have the restricted-import constraints of the gunicorn defaults. Imports of `notifications_utils.gunicorn_defaults` should be changed to `notifications_utils.gunicorn.defaults`. * Added `ContextRecyclingEventletWorker` custom gunicorn worker class. ## 95.2.0 * Implement `InsensitiveSet.__contains__` ## 95.1.2 * Fix to the number of arguments the `on_retry` of the `NotifyTask` class takes. 
*** Complete changes: https://github.com/alphagov/notifications-utils/compare/95.1.1...97.0.3 --- .pre-commit-config.yaml | 2 +- gunicorn_config.py | 2 +- requirements.in | 2 +- requirements.txt | 4 ++-- requirements_for_test.txt | 4 ++-- requirements_for_test_common.in | 2 +- ruff.toml | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e5d72f56..d6510cac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@95.1.1 +# This file was automatically copied from notifications-utils@97.0.3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/gunicorn_config.py b/gunicorn_config.py index 4735811a..45439f9b 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -1,7 +1,7 @@ import os from gds_metrics.gunicorn import child_exit # noqa -from notifications_utils.gunicorn_defaults import set_gunicorn_defaults +from notifications_utils.gunicorn.defaults import set_gunicorn_defaults set_gunicorn_defaults(globals()) diff --git a/requirements.in b/requirements.in index f1d8ee0f..15d83b50 100644 --- a/requirements.in +++ b/requirements.in @@ -9,6 +9,6 @@ gds-metrics==0.2.4 argon2-cffi==21.3.0 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@95.1.1 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@97.0.3 sentry-sdk[flask]==1.45.1 diff --git a/requirements.txt b/requirements.txt index a0159b38..750e6e78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,7 +55,7 @@ itsdangerous==2.2.0 # via # flask # notifications-utils -jinja2==3.1.5 +jinja2==3.1.6 # via # flask # notifications-utils @@ -70,7 +70,7 @@ markupsafe==2.1.3 # werkzeug mistune==0.8.4 # via notifications-utils -notifications-utils @ 
git+https://github.com/alphagov/notifications-utils.git@361486ff4960b5d02c162118b1fd64b097f9cb96 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@3fb3caf0efc9ccbebfe79b3269f84bfd66fc90d5 # via -r requirements.in ordered-set==4.1.0 # via notifications-utils diff --git a/requirements_for_test.txt b/requirements_for_test.txt index 4a1533cb..1721259b 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -100,7 +100,7 @@ itsdangerous==2.2.0 # -r requirements.txt # flask # notifications-utils -jinja2==3.1.5 +jinja2==3.1.6 # via # -r requirements.txt # flask @@ -119,7 +119,7 @@ mistune==0.8.4 # via # -r requirements.txt # notifications-utils -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@361486ff4960b5d02c162118b1fd64b097f9cb96 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@3fb3caf0efc9ccbebfe79b3269f84bfd66fc90d5 # via -r requirements.txt ordered-set==4.1.0 # via diff --git a/requirements_for_test_common.in b/requirements_for_test_common.in index 3f17b097..2fc04082 100644 --- a/requirements_for_test_common.in +++ b/requirements_for_test_common.in @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@95.1.1 +# This file was automatically copied from notifications-utils@97.0.3 beautifulsoup4==4.12.3 pytest==8.3.4 diff --git a/ruff.toml b/ruff.toml index e65bd9de..dd665701 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@95.1.1 +# This file was automatically copied from notifications-utils@97.0.3 exclude = [ "migrations/versions/", From c205b27b722074c2b20fcb2ee4e6e28502007b76 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Thu, 24 Apr 2025 17:21:47 +0100 Subject: [PATCH 11/63] download views: rework error handling and status codes don't squash all store errors to 400 status code - if the intention was to prevent information leak, it failed to do this because it 
included the error string in the response. do this by making DocumentStoreError a superclass of other store exception types so exception handlers can continue to listen for the general DocumentStoreError. return 410 for keys that have a delete marker, 404 for documents with no such key, 403 for documents that are blocked. notably also catch NoSuchKey from check_for_blocked_document's get_object_tagging calls and turn this into a DocumentNotFound error, because in practise this will be the first problem a request will encounter when trying to access a missing document. --- app/download/views.py | 35 +++++++++--------- app/utils/store.py | 28 +++++++++------ tests/download/test_views.py | 9 +++-- tests/utils/test_store.py | 70 ++++++++++++++++++++++-------------- 4 files changed, 81 insertions(+), 61 deletions(-) diff --git a/app/download/views.py b/app/download/views.py index 5e15635d..1775a90a 100644 --- a/app/download/views.py +++ b/app/download/views.py @@ -77,7 +77,7 @@ def download_document(service_id, document_id, extension=None): "document_id": document_id, }, ) - return jsonify(error=str(e)), 400 + return jsonify(error=str(e)), e.suggested_status_code if redirect := get_redirect_url_if_user_not_authenticated(request, document): return redirect @@ -130,24 +130,21 @@ def get_document_metadata(service_id, document_id): "document_id": document_id, }, ) - return jsonify(error=str(e)), 400 - - if metadata: - document = { - "direct_file_url": get_direct_file_url( - service_id=service_id, - document_id=document_id, - key=key, - mimetype=metadata["mimetype"], - ), - "confirm_email": metadata["confirm_email"], - "size_in_bytes": metadata["size"], - "file_extension": current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"][metadata["mimetype"]], - "filename": metadata["filename"], - "available_until": metadata["available_until"], - } - else: - document = None + return jsonify(error=str(e)), e.suggested_status_code + + document = { + "direct_file_url": get_direct_file_url( + 
service_id=service_id, + document_id=document_id, + key=key, + mimetype=metadata["mimetype"], + ), + "confirm_email": metadata["confirm_email"], + "size_in_bytes": metadata["size"], + "file_extension": current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"][metadata["mimetype"]], + "filename": metadata["filename"], + "available_until": metadata["available_until"], + } response = make_response({"document": document}) response.headers["X-Robots-Tag"] = "noindex, nofollow" diff --git a/app/utils/store.py b/app/utils/store.py index 04aabf55..863f0302 100644 --- a/app/utils/store.py +++ b/app/utils/store.py @@ -13,15 +13,19 @@ class DocumentStoreError(Exception): - pass + suggested_status_code = 400 -class DocumentBlocked(Exception): - pass +class DocumentBlocked(DocumentStoreError): + suggested_status_code = 403 -class DocumentExpired(Exception): - pass +class DocumentExpired(DocumentStoreError): + suggested_status_code = 410 + + +class DocumentNotFound(DocumentStoreError): + suggested_status_code = 404 class DocumentStore: @@ -56,9 +60,11 @@ def check_for_blocked_document(self, service_id, document_id): except BotoClientError as e: if e.response["Error"].get("ResourceType") == "DeleteMarker": # The S3 object has been marked as expired (eg by our retention period lifecycle policy) - # We should treat is as not existing raise DocumentExpired("The document is no longer available") from e + if e.response["Error"].get("Code") == "NoSuchKey": + raise DocumentNotFound("The requested document could not be found") from e + raise e if tags.get("blocked", "false").lower() in {"true", "yes"}: @@ -135,9 +141,10 @@ def get(self, service_id, document_id, decryption_key): SSECustomerAlgorithm="AES256", ) - except (DocumentBlocked, DocumentExpired) as e: - raise DocumentStoreError(str(e)) from e except BotoClientError as e: + if e.response["Error"]["Code"] == "404": + raise DocumentNotFound("The requested document could not be found") from e + raise DocumentStoreError(e.response["Error"]) 
from e return { @@ -170,11 +177,10 @@ def get_document_metadata(self, service_id, document_id, decryption_key): "available_until": str(expiry_date), "filename": self._normalise_metadata(metadata["Metadata"]).get("filename"), } - except (DocumentBlocked, DocumentExpired): - return None except BotoClientError as e: if e.response["Error"]["Code"] == "404": - return None + raise DocumentNotFound("The requested document could not be found") from e + raise DocumentStoreError(e.response["Error"]) from e @staticmethod diff --git a/tests/download/test_views.py b/tests/download/test_views.py index f42a3816..7b2715ff 100644 --- a/tests/download/test_views.py +++ b/tests/download/test_views.py @@ -8,7 +8,7 @@ from app.download.views import get_redirect_url_if_user_not_authenticated from app.utils.signed_data import sign_service_and_document_id -from app.utils.store import DocumentStoreError +from app.utils.store import DocumentNotFound, DocumentStoreError @pytest.fixture @@ -332,7 +332,7 @@ def test_get_document_metadata_when_document_is_in_s3(client, store, mimetype, e def test_get_document_metadata_when_document_is_not_in_s3(client, store): - store.get_document_metadata.return_value = None + store.get_document_metadata.side_effect = DocumentNotFound("no such document") response = client.get( url_for( "download.get_document_metadata", @@ -342,9 +342,8 @@ def test_get_document_metadata_when_document_is_not_in_s3(client, store): ) ) - assert response.status_code == 200 - assert response.json == {"document": None} - assert response.headers["X-Robots-Tag"] == "noindex, nofollow" + assert response.status_code == 404 + assert response.json == {"error": "no such document"} class TestAuthenticateDocument: diff --git a/tests/utils/test_store.py b/tests/utils/test_store.py index fa1559d5..16a852bb 100644 --- a/tests/utils/test_store.py +++ b/tests/utils/test_store.py @@ -9,6 +9,7 @@ from app.utils.store import ( DocumentBlocked, DocumentExpired, + DocumentNotFound, DocumentStore, 
DocumentStoreError, ) @@ -66,20 +67,6 @@ def expired_document(mock_boto): ) -@pytest.fixture -def missing_document(mock_boto): - mock_boto.client.return_value.get_object_tagging.side_effect = botocore.exceptions.ClientError( - { - "Error": { - "Code": "NoSuchKey", - "Message": "The specified key does not exist.", - "Key": "object-key", - } - }, - "GetObjectTagging", - ) - - @pytest.fixture def store_with_email(mock_boto): mock_boto.client.return_value.get_object.return_value = { @@ -174,7 +161,32 @@ def test_check_for_blocked_document_raises_expired_error(store, expired_document store.check_for_blocked_document("service-id", "doc-id") -def test_check_for_blocked_document_reraises_other_boto_error(store, missing_document): +def test_check_for_blocked_document_missing_raises_document_not_found_error(store): + store.s3.get_object_tagging.side_effect = botocore.exceptions.ClientError( + { + "Error": { + "Code": "NoSuchKey", + "Message": "The specified key does not exist.", + "Key": "object-key", + } + }, + "GetObjectTagging", + ) + + with pytest.raises(DocumentNotFound): + store.check_for_blocked_document("service-id", "doc-id") + + +def test_check_for_blocked_document_random_error_propagated(store): + store.s3.get_object_tagging.side_effect = botocore.exceptions.ClientError( + { + "Error": { + "Code": "NotEnoughBananas", + } + }, + "GetObjectTagging", + ) + with pytest.raises(BotoClientError): store.check_for_blocked_document("service-id", "doc-id") @@ -295,18 +307,14 @@ def test_get_document_with_boto_error(store): def test_get_blocked_document(store, blocked_document): - with pytest.raises(DocumentStoreError) as e: + with pytest.raises(DocumentBlocked): store.get("service-id", "document-id", bytes(32)) - assert str(e.value) == "Access to the document has been blocked" - def test_get_expired_document(store, expired_document): - with pytest.raises(DocumentStoreError) as e: + with pytest.raises(DocumentExpired): store.get("service-id", "document-id", bytes(32)) - 
assert str(e.value) == "The document is no longer available" - def test_get_document_metadata_when_document_is_in_s3(store): metadata = store.get_document_metadata("service-id", "document-id", "0f0f0f") @@ -346,7 +354,8 @@ def test_get_document_metadata_when_document_is_not_in_s3(store): side_effect=BotoClientError({"Error": {"Code": "404", "Message": "Not Found"}}, "HeadObject") ) - assert store.get_document_metadata("service-id", "document-id", "0f0f0f") is None + with pytest.raises(DocumentNotFound): + store.get_document_metadata("service-id", "document-id", "0f0f0f") def test_get_document_metadata_with_unexpected_boto_error(store): @@ -359,19 +368,28 @@ def test_get_document_metadata_with_unexpected_boto_error(store): def test_get_document_metadata_with_blocked_document(store_with_email, blocked_document): - assert store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") is None + with pytest.raises(DocumentBlocked): + store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") def test_get_document_metadata_with_expired_document(store_with_email, expired_document): - assert store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") is None + with pytest.raises(DocumentExpired): + store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") -def test_authenticate_document_when_missing(store): +def test_get_document_metadata_when_missing(store): store.s3.head_object = mock.Mock( side_effect=BotoClientError({"Error": {"Code": "404", "Message": "Not Found"}}, "HeadObject") ) - assert store.get_document_metadata("service-id", "document-id", "0f0f0f") is None + with pytest.raises(DocumentNotFound): + store.get_document_metadata("service-id", "document-id", "0f0f0f") + + +def test_authenticate_document_when_missing(store): + store.s3.head_object = mock.Mock( + side_effect=BotoClientError({"Error": {"Code": "404", "Message": "Not Found"}}, "HeadObject") + ) assert 
store.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False From 087d88ae2cc7f4fc469f55af43242d96ec52cadd Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Tue, 29 Apr 2025 11:41:17 +0100 Subject: [PATCH 12/63] DocumentStore.get_document_metadata: handle missing Expiration information gracefully instead of causing a crash this should simply return the metadata with the expiration missing, allowing the client to make up its own mind what to do. in most normal cases this is only used for informational display, and in case of a maintenance issue or misconfiguration it's better to have this information missing rather than being unable to serve any documents to anyone (a certain P1) --- app/utils/store.py | 6 +++--- tests/utils/test_store.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/app/utils/store.py b/app/utils/store.py index 863f0302..6c7c7ffc 100644 --- a/app/utils/store.py +++ b/app/utils/store.py @@ -168,13 +168,13 @@ def get_document_metadata(self, service_id, document_id, decryption_key): SSECustomerAlgorithm="AES256", ) - expiry_date = self._convert_expiry_date_to_date_object(metadata["Expiration"]) - return { "mimetype": metadata["ContentType"], "confirm_email": self.get_email_hash(metadata) is not None, "size": metadata["ContentLength"], - "available_until": str(expiry_date), + "available_until": str(self._convert_expiry_date_to_date_object(metadata["Expiration"])) + if metadata.get("Expiration") + else None, "filename": self._normalise_metadata(metadata["Metadata"]).get("filename"), } except BotoClientError as e: diff --git a/tests/utils/test_store.py b/tests/utils/test_store.py index 16a852bb..7c78e052 100644 --- a/tests/utils/test_store.py +++ b/tests/utils/test_store.py @@ -327,6 +327,18 @@ def test_get_document_metadata_when_document_is_in_s3(store): } +def test_get_document_metadata_when_document_is_in_s3_but_missing_expiration(store): + del 
store.s3.head_object.return_value["Expiration"] + metadata = store.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { + "mimetype": "text/plain", + "confirm_email": False, + "size": 100, + "available_until": None, + "filename": None, + } + + def test_get_document_metadata_when_document_is_in_s3_with_hashed_email(store_with_email): metadata = store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") assert metadata == { From 1d917fd1f1d59e03596e3a64f0b19ed1c75eff1b Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Tue, 29 Apr 2025 12:08:52 +0100 Subject: [PATCH 13/63] DocumentStore: partially enforce expired document inaccessibility via app logic and Expiration metadata this check was previously done in document-download-frontend, but it really belongs in the document-download-api if anywhere, if only because the api's download endpoints are publicly accessible and a document denied access by just the frontend could always just be downloaded via the api. note this check still relies on the s3 bucket's lifecycle policies being in place and implicitly the s3 object's last modified timestamp. this will only "rescue" us in cases where a delete marker has not yet been added by the lifecycle rule for some reason. 
--- app/utils/store.py | 44 +++++++++++++------- tests/utils/test_store.py | 87 ++++++++++++++++++++++++++++++++------- 2 files changed, 102 insertions(+), 29 deletions(-) diff --git a/app/utils/store.py b/app/utils/store.py index 6c7c7ffc..abb5f369 100644 --- a/app/utils/store.py +++ b/app/utils/store.py @@ -70,6 +70,19 @@ def check_for_blocked_document(self, service_id, document_id): if tags.get("blocked", "false").lower() in {"true", "yes"}: raise DocumentBlocked("Access to the document has been blocked") + def check_for_expired_document(self, s3_response): + if not s3_response.get("Expiration"): + current_app.logger.warning("Expiration information not available for document") + # in case we're having a maintenance issue/outage with our s3 bucket + # we'd prefer to serve some files that should have been deleted (hopefully <14 + # days ago) instead of being unable to serve any documents to any users + return + + expiry_date = self._convert_expiry_date_to_date_object(s3_response["Expiration"]) + + if expiry_date < date.today(): + raise DocumentExpired("The document is no longer available") + def put( self, service_id, document_stream, *, mimetype, confirmation_email=None, retention_period=None, filename=None ): @@ -134,12 +147,13 @@ def get(self, service_id, document_id, decryption_key): """ try: self.check_for_blocked_document(service_id, document_id) - document = self.s3.get_object( + s3_response = self.s3.get_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) + self.check_for_expired_document(s3_response) except BotoClientError as e: if e.response["Error"]["Code"] == "404": @@ -148,10 +162,10 @@ def get(self, service_id, document_id, decryption_key): raise DocumentStoreError(e.response["Error"]) from e return { - "body": document["Body"], - "mimetype": document["ContentType"], - "size": document["ContentLength"], - "metadata": 
self._normalise_metadata(document["Metadata"]), + "body": s3_response["Body"], + "mimetype": s3_response["ContentType"], + "size": s3_response["ContentLength"], + "metadata": self._normalise_metadata(s3_response["Metadata"]), } def get_document_metadata(self, service_id, document_id, decryption_key): @@ -161,21 +175,22 @@ def get_document_metadata(self, service_id, document_id, decryption_key): try: self.check_for_blocked_document(service_id, document_id) - metadata = self.s3.head_object( + s3_response = self.s3.head_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) + self.check_for_expired_document(s3_response) return { - "mimetype": metadata["ContentType"], - "confirm_email": self.get_email_hash(metadata) is not None, - "size": metadata["ContentLength"], - "available_until": str(self._convert_expiry_date_to_date_object(metadata["Expiration"])) - if metadata.get("Expiration") + "mimetype": s3_response["ContentType"], + "confirm_email": self.get_email_hash(s3_response) is not None, + "size": s3_response["ContentLength"], + "available_until": str(self._convert_expiry_date_to_date_object(s3_response["Expiration"])) + if s3_response.get("Expiration") else None, - "filename": self._normalise_metadata(metadata["Metadata"]).get("filename"), + "filename": self._normalise_metadata(s3_response["Metadata"]).get("filename"), } except BotoClientError as e: if e.response["Error"]["Code"] == "404": @@ -211,12 +226,13 @@ def authenticate(self, service_id: str, document_id: str, decryption_key: bytes, """ try: self.check_for_blocked_document(service_id, document_id) - response = self.s3.head_object( + s3_response = self.s3.head_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) + self.check_for_expired_document(s3_response) except (DocumentBlocked, DocumentExpired): return False except 
BotoClientError as e: @@ -224,7 +240,7 @@ def authenticate(self, service_id: str, document_id: str, decryption_key: bytes, return False raise DocumentStoreError(e.response["Error"]) from e - hashed_email = self.get_email_hash(response) + hashed_email = self.get_email_hash(s3_response) if not hashed_email: return False diff --git a/tests/utils/test_store.py b/tests/utils/test_store.py index 7c78e052..0b9fb03a 100644 --- a/tests/utils/test_store.py +++ b/tests/utils/test_store.py @@ -5,6 +5,7 @@ import botocore import pytest from botocore.exceptions import ClientError as BotoClientError +from freezegun import freeze_time from app.utils.store import ( DocumentBlocked, @@ -26,6 +27,7 @@ def mock_boto(mocker): def store(mock_boto): mock_boto.client.return_value.get_object.return_value = { "Body": mock.Mock(), + "Expiration": 'expiry-date="Fri, 01 May 2020 00:00:00 GMT", expiry-rule="custom-retention-1-weeks"', "ContentType": "application/pdf", "ContentLength": 100, "Metadata": {}, @@ -53,7 +55,7 @@ def blocked_document(mock_boto): @pytest.fixture -def expired_document(mock_boto): +def delete_markered_document(mock_boto): mock_boto.client.return_value.get_object_tagging.side_effect = botocore.exceptions.ClientError( { "Error": { @@ -156,7 +158,7 @@ def test_check_for_blocked_document_raises_error(store, mock_boto, blocked_value store.check_for_blocked_document("service-id", "doc-id") -def test_check_for_blocked_document_raises_expired_error(store, expired_document): +def test_check_for_blocked_document_delete_marker_document_expired_error(store, delete_markered_document): with pytest.raises(DocumentExpired): store.check_for_blocked_document("service-id", "doc-id") @@ -191,6 +193,42 @@ def test_check_for_blocked_document_random_error_propagated(store): store.check_for_blocked_document("service-id", "doc-id") +@pytest.mark.parametrize( + "expiration", + ( + 'expiry-date="Fri, 01 May 2020 00:00:00 GMT", expiry-rule="custom-retention-1-weeks"', + 'expiry-date="Sat, 02 May 
2020 00:00:00 GMT", expiry-rule="custom-retention-3-weeks"', + ), +) +def test_check_for_expired_document_expired(store, expiration): + with freeze_time("2020-05-02 10:00:00"): + with pytest.raises(DocumentExpired): + store.check_for_expired_document( + { + "Expiration": expiration, + } + ) + + +@pytest.mark.parametrize( + "expiration", + ( + 'expiry-date="Sun, 03 May 2020 00:00:00 GMT", expiry-rule="custom-retention-1-weeks"', + None, + ), +) +def test_check_for_expired_document_not_expired(store, expiration): + with freeze_time("2020-05-02 10:00:00"): + assert ( + store.check_for_expired_document( + { + "Expiration": expiration, + } + ) + is None + ) + + def test_put_document(store): ret = store.put( "service-id", mock.Mock(), mimetype="application/pdf", confirmation_email=None, retention_period=None @@ -281,12 +319,13 @@ def test_put_document_records_filename_if_set(store, filename, expected_filename def test_get_document(store): - assert store.get("service-id", "document-id", bytes(32)) == { - "body": mock.ANY, - "mimetype": "application/pdf", - "size": 100, - "metadata": {}, - } + with freeze_time("2020-04-28 10:00:00"): + assert store.get("service-id", "document-id", bytes(32)) == { + "body": mock.ANY, + "mimetype": "application/pdf", + "size": 100, + "metadata": {}, + } store.s3.get_object.assert_called_once_with( Bucket="test-bucket", @@ -311,13 +350,15 @@ def test_get_blocked_document(store, blocked_document): store.get("service-id", "document-id", bytes(32)) -def test_get_expired_document(store, expired_document): +def test_get_delete_markered_document(store, delete_markered_document): with pytest.raises(DocumentExpired): store.get("service-id", "document-id", bytes(32)) def test_get_document_metadata_when_document_is_in_s3(store): - metadata = store.get_document_metadata("service-id", "document-id", "0f0f0f") + with freeze_time("2020-04-28 10:00:00"): + metadata = store.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { 
"mimetype": "text/plain", "confirm_email": False, @@ -340,7 +381,9 @@ def test_get_document_metadata_when_document_is_in_s3_but_missing_expiration(sto def test_get_document_metadata_when_document_is_in_s3_with_hashed_email(store_with_email): - metadata = store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") + with freeze_time("2020-04-28 10:00:00"): + metadata = store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { "mimetype": "text/plain", "confirm_email": True, @@ -351,7 +394,9 @@ def test_get_document_metadata_when_document_is_in_s3_with_hashed_email(store_wi def test_get_document_metadata_when_document_is_in_s3_with_filename(store_with_filename): - metadata = store_with_filename.get_document_metadata("service-id", "document-id", "0f0f0f") + with freeze_time("2020-04-28 10:00:00"): + metadata = store_with_filename.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { "mimetype": "text/plain", "confirm_email": False, @@ -361,6 +406,12 @@ def test_get_document_metadata_when_document_is_in_s3_with_filename(store_with_f } +def test_get_document_metadata_when_document_is_in_s3_but_expired(store): + with pytest.raises(DocumentExpired): + with freeze_time("2020-05-12 10:00:00"): + store.get_document_metadata("service-id", "document-id", "0f0f0f") + + def test_get_document_metadata_when_document_is_not_in_s3(store): store.s3.head_object = mock.Mock( side_effect=BotoClientError({"Error": {"Code": "404", "Message": "Not Found"}}, "HeadObject") @@ -384,7 +435,7 @@ def test_get_document_metadata_with_blocked_document(store_with_email, blocked_d store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") -def test_get_document_metadata_with_expired_document(store_with_email, expired_document): +def test_get_document_metadata_with_delete_markered_document(store_with_email, delete_markered_document): with pytest.raises(DocumentExpired): 
store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") @@ -414,7 +465,13 @@ def test_authenticate_document_when_missing(store): ), ) def test_authenticate_document_email_address_check(store_with_email, email_address, expected_result): - assert store_with_email.authenticate("service-id", "document-id", b"0f0f0f", email_address) is expected_result + with freeze_time("2020-04-28 10:00:00"): + assert store_with_email.authenticate("service-id", "document-id", b"0f0f0f", email_address) is expected_result + + +def test_authenticate_document_expired(store_with_email): + with freeze_time("2020-05-28 10:00:00"): + assert store_with_email.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False def test_authenticate_fails_if_document_does_not_have_hash(store): @@ -439,7 +496,7 @@ def test_authenticate_with_blocked_document(store, blocked_document): assert store.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False -def test_authenticate_with_expired_document(store, expired_document): +def test_authenticate_with_delete_markered_document(store, delete_markered_document): assert store.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False From 11ef29aa8535c3bffd6e11e9bc29873ad0fca876 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 14 Jan 2025 15:49:56 +0000 Subject: [PATCH 14/63] Bump gunicorn to the latest version The version of Gunicorn we are using is more than 18 months out of date[1] and has a high severity security vulnerability[2]. We have not updated the version on the API (and therefore the minimum version in utils[3]) because last time we tried (while still on PaaS) it had some performance issues, documented here[4]: > We originally pinned this due to eventlet v0.33 compatibility issues. > That was supposedly fixed in version v21.0.0 and we merged v21.2.0 for > a while. 
Until we ran a load test again, and identified that the > bumped version of gunicorn led to a 33%+ drop-off in > performance/requests per second that the API was able to handle. If a > version greater than 21.2.0 is released, and it either gives us > something we need or we think it addresses said performance issues, > make sure to run a load test in staging before releasing to > production. But document download does not serve anywhere near the same number of requests per second as the API, so we have already upgraded to version 21.2.0. This pull request just updates from 21.2.0 to 23.0.0 (the latest version), which resolves the security vulnerability. *** 1. https://github.com/benoitc/gunicorn/tree/21.2.0 2. https://github.com/advisories/GHSA-w3h3-4rj7-4ph4 3. https://github.com/alphagov/notifications-utils/blob/main/setup.py#L31 4. https://github.com/alphagov/notifications-api/blob/aff08653d951d6f60dec8d701ae7cf1681b78a27/requirements.in#L10 --- requirements.txt | 2 +- requirements_for_test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 750e6e78..8f544cc7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,7 +47,7 @@ govuk-bank-holidays==0.15 # via notifications-utils greenlet==3.0.3 # via eventlet -gunicorn==21.2.0 +gunicorn==23.0.0 # via notifications-utils idna==3.7 # via requests diff --git a/requirements_for_test.txt b/requirements_for_test.txt index 1721259b..c92e3c08 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -83,7 +83,7 @@ greenlet==3.0.3 # via # -r requirements.txt # eventlet -gunicorn==21.2.0 +gunicorn==23.0.0 # via # -r requirements.txt # notifications-utils From 1b36437741b389b68631ba7a859a928d69ed3260 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 2 May 2025 11:16:53 +0100 Subject: [PATCH 15/63] Bump eventlet to 0.39.1 --- requirements.txt | 2 +- requirements_for_test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff 
--git a/requirements.txt b/requirements.txt index 8f544cc7..584a6a69 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,7 +31,7 @@ click==8.1.7 # via flask dnspython==2.6.1 # via eventlet -eventlet==0.35.2 +eventlet==0.39.1 # via gunicorn flask==3.1.0 # via diff --git a/requirements_for_test.txt b/requirements_for_test.txt index c92e3c08..ac13fe31 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -55,7 +55,7 @@ dnspython==2.6.1 # via # -r requirements.txt # eventlet -eventlet==0.35.2 +eventlet==0.39.1 # via # -r requirements.txt # gunicorn From 838a046fb5c059f6e6995577352de665fc20cefe Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Wed, 21 May 2025 08:41:49 +0100 Subject: [PATCH 16/63] Bump utils to 99.5.1 ## 99.5.1 * Bump minimum Flask version to 3.1.1 ## 99.5.0 * Update economy letter transit date to 5 days ## 99.4.1 * Fix celery beat logging so that it will log in json ## 99.4.0 * Add celery logging configuration for json logging ## 99.3.4 * `celery.NotifyTask`: don't emit early log if called synchronously ## 99.3.3 * Fix bug with non-SMS templates considering international SMS limits ## 99.3.2 * Stops counting Crown Dependency phone numbers in CSV file as international ## 99.3.1 * Bump `python-json-logger` to version 3 ## 99.3.0 * Adds `RecipientCSV.international_sms_count` * Adds `RecipientCSV.more_international_sms_than_can_send` (initialise per `RecipientCSV(remaining_international_sms_messages=123)` to enable this check) ## 99.2.0 * Add s3 multipart upload lifecycle (create, upload, complete, abort) ## 99.1.2 * Normalise unusual dashes and spacing characters in phone numbers ## 99.1.1 * Bump ruff to latest version ## 99.1.0 * Adds support for economy mail in get_min_and_max_days_in_transit, with delivery estimated between 2 and 6 days ## 99.0.0 * NOTABLE CHANGE: Celery tasks emit an "early" log message when starting, with a customisable log level to allow this to be disabled for high-volume tasks. 
When upgrading past this version you may want to pass the extra argument `early_log_level=logging.DEBUG` to the task decorator of your most heavily-called tasks to reduce the effect on log volume * Annotates automatic celery task log messages with `celery_task_id` ## 98.0.1 * Moves from `setup.py` to `pyproject.toml` for package configuration * Changes ruff config from `exclude` to `extend-exclude` ## 98.0.0 * Update rate multipliers for international SMS to have values for 1 April 2025 onwards. When used in notifications-api and notifications-admin this change will affect the billing of text messages and the pricing shown. ## 97.0.5 * Fix mailto: URLs in Markdown links ## 97.0.4 * Fix inset text block styling *** Complete changes: https://github.com/alphagov/notifications-utils/compare/97.0.3...99.5.1 --- .pre-commit-config.yaml | 4 ++-- requirements.in | 2 +- requirements.txt | 7 ++++--- requirements_for_test.txt | 9 +++++---- requirements_for_test_common.in | 4 ++-- ruff.toml | 6 ++---- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d6510cac..081632b2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@97.0.3 +# This file was automatically copied from notifications-utils@99.5.1 repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -9,7 +9,7 @@ repos: - id: check-yaml - id: debug-statements - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.9.2' + rev: 'v0.11.4' hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] diff --git a/requirements.in b/requirements.in index 15d83b50..c1f51091 100644 --- a/requirements.in +++ b/requirements.in @@ -9,6 +9,6 @@ gds-metrics==0.2.4 argon2-cffi==21.3.0 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@97.0.3 +notifications-utils @ 
git+https://github.com/alphagov/notifications-utils.git@99.5.1 sentry-sdk[flask]==1.45.1 diff --git a/requirements.txt b/requirements.txt index 584a6a69..88f63af6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,7 +33,7 @@ dnspython==2.6.1 # via eventlet eventlet==0.39.1 # via gunicorn -flask==3.1.0 +flask==3.1.1 # via # flask-redis # gds-metrics @@ -65,12 +65,13 @@ jmespath==1.0.1 # botocore markupsafe==2.1.3 # via + # flask # jinja2 # sentry-sdk # werkzeug mistune==0.8.4 # via notifications-utils -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@3fb3caf0efc9ccbebfe79b3269f84bfd66fc90d5 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d669a9544cc87372a10af8786d3bb172593261fa # via -r requirements.in ordered-set==4.1.0 # via notifications-utils @@ -88,7 +89,7 @@ pypdf==3.17.4 # via notifications-utils python-dateutil==2.8.2 # via botocore -python-json-logger==2.0.7 +python-json-logger==3.3.0 # via notifications-utils python-magic==0.4.25 # via -r requirements.in diff --git a/requirements_for_test.txt b/requirements_for_test.txt index ac13fe31..a95dc5a2 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -61,7 +61,7 @@ eventlet==0.39.1 # gunicorn execnet==2.1.1 # via pytest-xdist -flask==3.1.0 +flask==3.1.1 # via # -r requirements.txt # flask-redis @@ -113,13 +113,14 @@ jmespath==1.0.1 markupsafe==2.1.3 # via # -r requirements.txt + # flask # jinja2 # werkzeug mistune==0.8.4 # via # -r requirements.txt # notifications-utils -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@3fb3caf0efc9ccbebfe79b3269f84bfd66fc90d5 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d669a9544cc87372a10af8786d3bb172593261fa # via -r requirements.txt ordered-set==4.1.0 # via @@ -172,7 +173,7 @@ python-dateutil==2.8.2 # -r requirements.txt # botocore # freezegun -python-json-logger==2.0.7 +python-json-logger==3.3.0 # via # -r requirements.txt 
# notifications-utils @@ -200,7 +201,7 @@ requests-mock==1.12.1 # via -r requirements_for_test_common.in rsa==4.7.2 # via -r requirements.txt -ruff==0.9.2 +ruff==0.11.4 # via -r requirements_for_test_common.in s3transfer==0.10.1 # via diff --git a/requirements_for_test_common.in b/requirements_for_test_common.in index 2fc04082..cf7c3b2f 100644 --- a/requirements_for_test_common.in +++ b/requirements_for_test_common.in @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@97.0.3 +# This file was automatically copied from notifications-utils@99.5.1 beautifulsoup4==4.12.3 pytest==8.3.4 @@ -9,4 +9,4 @@ pytest-testmon==2.1.1 requests-mock==1.12.1 freezegun==1.5.1 -ruff==0.9.2 # Also update `.pre-commit-config.yaml` if this changes +ruff==0.11.4 # Also update `.pre-commit-config.yaml` if this changes diff --git a/ruff.toml b/ruff.toml index dd665701..44dbbe66 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,10 +1,8 @@ -# This file was automatically copied from notifications-utils@97.0.3 +# This file was automatically copied from notifications-utils@99.5.1 -exclude = [ +extend-exclude = [ "migrations/versions/", - "venv*", "__pycache__", - "node_modules", "cache", "migrations", "build", From 6737bbaa325af3ef8e97160412bfa951bf002854 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Wed, 28 May 2025 13:04:25 +0100 Subject: [PATCH 17/63] DocumentStore: add created-at tag when uploading documents to an extent this is a "backup" of the creation-time information we usually refer to s3's native timestamp for in case some maintenance-related reason causes us to lose it, but we will probably add logic to the app further down the line to use the data from this tag for additional access enforcement. timestamp will probably not match s3's native timestamp all the time. 
--- app/utils/store.py | 12 +++++++++--- tests/utils/test_store.py | 9 ++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/app/utils/store.py b/app/utils/store.py index abb5f369..f0c951b0 100644 --- a/app/utils/store.py +++ b/app/utils/store.py @@ -1,7 +1,7 @@ import os import re import uuid -from datetime import date, timedelta +from datetime import UTC, date, datetime, timedelta from urllib.parse import urlencode import boto3 @@ -105,14 +105,20 @@ def put( {"service_id": service_id, "document_id": document_id}, ) + tags = { + # in case we lose S3's native timestamp for some reason + "created-at": datetime.now(UTC).isoformat(timespec="seconds"), + } + if retention_period: - tags = {"retention-period": retention_period} - extra_kwargs["Tagging"] = urlencode(tags) + tags["retention-period"] = retention_period current_app.logger.info( "Setting custom retention period for %(service_id)s/%(document_id)s: %(retention_period)s", {"service_id": service_id, "document_id": document_id, "retention_period": retention_period}, ) + extra_kwargs["Tagging"] = urlencode(tags) + if filename: # Convert utf-8 filenames to ASCII suitable for storing in AWS S3 Metadata. 
extra_kwargs["Metadata"]["filename"] = filename.encode("unicode_escape").decode("ascii") diff --git a/tests/utils/test_store.py b/tests/utils/test_store.py index 0b9fb03a..ead35b06 100644 --- a/tests/utils/test_store.py +++ b/tests/utils/test_store.py @@ -229,6 +229,7 @@ def test_check_for_expired_document_not_expired(store, expiration): ) +@freeze_time("2021-02-03T04:05:06") def test_put_document(store): ret = store.put( "service-id", mock.Mock(), mimetype="application/pdf", confirmation_email=None, retention_period=None @@ -247,9 +248,11 @@ def test_put_document(store): SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", Metadata={}, + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00", ) +@freeze_time("2021-02-03T04:05:06") def test_put_document_sends_hashed_recipient_email_to_s3_as_metadata_if_confirmation_email_present(store): ret = store.put("service-id", mock.Mock(), mimetype="application/pdf", confirmation_email="email@example.com") @@ -266,9 +269,11 @@ def test_put_document_sends_hashed_recipient_email_to_s3_as_metadata_if_confirma SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", Metadata={"hashed-recipient-email": mock.ANY}, + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00", ) +@freeze_time("2021-02-03T04:05:06") def test_put_document_tags_document_if_retention_period_set(store): ret = store.put("service-id", mock.Mock(), mimetype="application/pdf", retention_period="4 weeks") @@ -284,7 +289,7 @@ def test_put_document_tags_document_if_retention_period_set(store): Key=Matcher("document key", lambda x: x.startswith("service-id/") and len(x) == 11 + 36), SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", - Tagging="retention-period=4+weeks", + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00&retention-period=4+weeks", Metadata={}, ) @@ -299,6 +304,7 @@ def test_put_document_tags_document_if_retention_period_set(store): (r"\u2705.pdf", r"\\u2705.pdf"), ), ) 
+@freeze_time("2021-02-03T04:05:06") def test_put_document_records_filename_if_set(store, filename, expected_filename_for_s3): ret = store.put("service-id", mock.Mock(), mimetype="application/pdf", filename=filename) @@ -315,6 +321,7 @@ def test_put_document_records_filename_if_set(store, filename, expected_filename SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", Metadata={"filename": expected_filename_for_s3}, + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00", ) From 5e3471562e470a42d9916f8c79a445b80ee86e59 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Mon, 2 Jun 2025 14:38:15 +0100 Subject: [PATCH 18/63] Bump greenlet and cffi to the latest versions The versions we are using do not install on newer versions of Python because of the way they are packaged --- requirements.txt | 4 ++-- requirements_for_test.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 88f63af6..bc61c446 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ certifi==2024.7.4 # via # requests # sentry-sdk -cffi==1.16.0 +cffi==1.17.1 # via argon2-cffi-bindings charset-normalizer==3.3.2 # via requests @@ -45,7 +45,7 @@ gds-metrics==0.2.4 # via -r requirements.in govuk-bank-holidays==0.15 # via notifications-utils -greenlet==3.0.3 +greenlet==3.2.2 # via eventlet gunicorn==23.0.0 # via notifications-utils diff --git a/requirements_for_test.txt b/requirements_for_test.txt index a95dc5a2..1b195114 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -37,7 +37,7 @@ certifi==2024.7.4 # -r requirements.txt # requests # sentry-sdk -cffi==1.16.0 +cffi==1.17.1 # via # -r requirements.txt # argon2-cffi-bindings @@ -79,7 +79,7 @@ govuk-bank-holidays==0.15 # via # -r requirements.txt # notifications-utils -greenlet==3.0.3 +greenlet==3.2.2 # via # -r requirements.txt # eventlet From 0eb18378189e9918366c7a18fe8ec86fcc0125f4 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: 
Mon, 9 Jun 2025 12:22:50 +0100 Subject: [PATCH 19/63] DocumentStore: fallback to using s3 tags to determine expiration including minimal test changes intending to "prove" that everything still works more-or-less the same without tagging present --- app/utils/store.py | 123 ++++++++++++++++++++++++++++---------- tests/utils/test_store.py | 33 +++++----- 2 files changed, 106 insertions(+), 50 deletions(-) diff --git a/app/utils/store.py b/app/utils/store.py index f0c951b0..98ae8bf5 100644 --- a/app/utils/store.py +++ b/app/utils/store.py @@ -28,6 +28,10 @@ class DocumentNotFound(DocumentStoreError): suggested_status_code = 404 +class CannotDetermineExpiration(Exception): + pass + + class DocumentStore: """ This class is not thread-safe. @@ -42,16 +46,9 @@ def __init__(self, bucket=None): def init_app(self, app): self.bucket = app.config["DOCUMENTS_BUCKET"] - def check_for_blocked_document(self, service_id, document_id): - """Raises an exception if access to the document has been blocked after creation - - This should be checked before any document access. This might be used to quickly prevent anyone from accessing - a file that a service has sent out accidentally. - - Note that the `blocked` tag key MUST be in lowercase. - """ + def _get_document_tags(self, service_id, document_id): try: - tags = { + return { item["Key"]: item["Value"] for item in self.s3.get_object_tagging( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id) @@ -67,20 +64,29 @@ def check_for_blocked_document(self, service_id, document_id): raise e + def check_for_blocked_document(self, tags): + """Raises an exception if access to the document has been blocked after creation + + This should be checked before any document access. This might be used to quickly prevent anyone from accessing + a file that a service has sent out accidentally. + + Note that the `blocked` tag key MUST be in lowercase. 
+ """ + if tags.get("blocked", "false").lower() in {"true", "yes"}: raise DocumentBlocked("Access to the document has been blocked") - def check_for_expired_document(self, s3_response): - if not s3_response.get("Expiration"): - current_app.logger.warning("Expiration information not available for document") - # in case we're having a maintenance issue/outage with our s3 bucket - # we'd prefer to serve some files that should have been deleted (hopefully <14 - # days ago) instead of being unable to serve any documents to any users - return + @classmethod + def check_for_expired_document(cls, s3_response, tags): + effective_expiry_date = cls._get_effective_expiry_date(s3_response, tags) - expiry_date = self._convert_expiry_date_to_date_object(s3_response["Expiration"]) + if not effective_expiry_date: + current_app.logger.error( + "Expiration information not available for document, attempting to serve it anyway.." + ) + return - if expiry_date < date.today(): + if effective_expiry_date < date.today(): raise DocumentExpired("The document is no longer available") def put( @@ -152,14 +158,15 @@ def get(self, service_id, document_id, decryption_key): decryption_key should be raw bytes """ try: - self.check_for_blocked_document(service_id, document_id) + tags = self._get_document_tags(service_id, document_id) + self.check_for_blocked_document(tags) s3_response = self.s3.get_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) - self.check_for_expired_document(s3_response) + self.check_for_expired_document(s3_response, tags) except BotoClientError as e: if e.response["Error"]["Code"] == "404": @@ -180,22 +187,22 @@ def get_document_metadata(self, service_id, document_id, decryption_key): """ try: - self.check_for_blocked_document(service_id, document_id) + tags = self._get_document_tags(service_id, document_id) + self.check_for_blocked_document(tags) s3_response = self.s3.head_object( 
Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) - self.check_for_expired_document(s3_response) + self.check_for_expired_document(s3_response, tags) + available_until = self._get_effective_expiry_date(s3_response, tags) return { "mimetype": s3_response["ContentType"], "confirm_email": self.get_email_hash(s3_response) is not None, "size": s3_response["ContentLength"], - "available_until": str(self._convert_expiry_date_to_date_object(s3_response["Expiration"])) - if s3_response.get("Expiration") - else None, + "available_until": str(available_until) if available_until else None, "filename": self._normalise_metadata(s3_response["Metadata"]).get("filename"), } except BotoClientError as e: @@ -204,21 +211,70 @@ def get_document_metadata(self, service_id, document_id, decryption_key): raise DocumentStoreError(e.response["Error"]) from e + @classmethod + def _get_effective_expiry_date(cls, s3_response, tags): + potential_expiry_dates = [] + + try: + potential_expiry_dates.append(cls._get_expiry_date_from_expiration_header(s3_response)) + except CannotDetermineExpiration as err: + current_app.logger.warning("Cannot determine document expiration through Expiration header: %s", str(err)) + + try: + potential_expiry_dates.append(cls._get_expiry_date_from_tags(s3_response, tags)) + except CannotDetermineExpiration as err: + current_app.logger.warning("Cannot determine document expiration through tags: %s", str(err)) + + if potential_expiry_dates: + return min(potential_expiry_dates) + + # in case we're having a maintenance issue/outage with our s3 bucket + # we'd prefer to serve some files that should have been deleted (hopefully <14 + # days ago) instead of being unable to serve any documents to any users + return None + @staticmethod - def _convert_expiry_date_to_date_object(raw_expiry_date: str) -> date: + def _get_expiry_date_from_expiration_header(s3_response) -> date: pattern = 
re.compile(r'([^=\s]+?)="(.+?)"') - expiry_date_as_dict = dict(pattern.findall(raw_expiry_date)) + expiry_date_as_dict = dict(pattern.findall(s3_response.get("Expiration") or "") or {}) + + if "expiry-date" not in expiry_date_as_dict: + raise CannotDetermineExpiration("No expiry-date found") expiry_date_string = expiry_date_as_dict["expiry-date"] timezone = expiry_date_string.split()[-1] if timezone != "GMT": - current_app.logger.warning("AWS S3 object expiration has unhandled timezone: %s", timezone) + raise CannotDetermineExpiration(f"AWS S3 object expiration has unhandled timezone: {timezone}") - expiry_date = parser.parse(expiry_date_string) - expiry_date = expiry_date.date() - timedelta(days=1) + try: + expiry_date = parser.parse(expiry_date_string) + except parser.ParserError as err: + raise CannotDetermineExpiration from err + + return expiry_date.date() - timedelta(days=1) + + @staticmethod + def _get_expiry_date_from_tags(s3_response, tags): + try: + retention_period_match = re.fullmatch(r"(\d+) weeks", tags["retention-period"]) + except KeyError as err: + raise CannotDetermineExpiration from err + + if retention_period_match is None: + raise CannotDetermineExpiration("Cannot parse retention-period header") + + try: + retention_period_days = int(retention_period_match.group(1), base=10) * 7 + except ValueError as err: + raise CannotDetermineExpiration from err + + try: + created_at = datetime.fromisoformat(tags["created-at"]) + except (KeyError, ValueError): + created_at = s3_response["LastModified"] - return expiry_date + return created_at.date() + timedelta(days=retention_period_days) def generate_encryption_key(self): return os.urandom(32) @@ -231,14 +287,15 @@ def authenticate(self, service_id: str, document_id: str, decryption_key: bytes, email_address needs to be in a validated and known-good format before being passed to this method """ try: - self.check_for_blocked_document(service_id, document_id) + tags = self._get_document_tags(service_id, 
document_id) + self.check_for_blocked_document(tags) s3_response = self.s3.head_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) - self.check_for_expired_document(s3_response) + self.check_for_expired_document(s3_response, tags) except (DocumentBlocked, DocumentExpired): return False except BotoClientError as e: diff --git a/tests/utils/test_store.py b/tests/utils/test_store.py index ead35b06..79ad131d 100644 --- a/tests/utils/test_store.py +++ b/tests/utils/test_store.py @@ -155,12 +155,12 @@ def test_check_for_blocked_document_raises_error(store, mock_boto, blocked_value } with pytest.raises(DocumentBlocked): - store.check_for_blocked_document("service-id", "doc-id") + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) def test_check_for_blocked_document_delete_marker_document_expired_error(store, delete_markered_document): with pytest.raises(DocumentExpired): - store.check_for_blocked_document("service-id", "doc-id") + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) def test_check_for_blocked_document_missing_raises_document_not_found_error(store): @@ -176,7 +176,7 @@ def test_check_for_blocked_document_missing_raises_document_not_found_error(stor ) with pytest.raises(DocumentNotFound): - store.check_for_blocked_document("service-id", "doc-id") + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) def test_check_for_blocked_document_random_error_propagated(store): @@ -190,7 +190,7 @@ def test_check_for_blocked_document_random_error_propagated(store): ) with pytest.raises(BotoClientError): - store.check_for_blocked_document("service-id", "doc-id") + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) @pytest.mark.parametrize( @@ -206,7 +206,8 @@ def test_check_for_expired_document_expired(store, expiration): store.check_for_expired_document( { 
"Expiration": expiration, - } + }, + {}, ) @@ -223,7 +224,8 @@ def test_check_for_expired_document_not_expired(store, expiration): store.check_for_expired_document( { "Expiration": expiration, - } + }, + {}, ) is None ) @@ -508,7 +510,7 @@ def test_authenticate_with_delete_markered_document(store, delete_markered_docum @pytest.mark.parametrize( - "raw_expiry_rule,expected_date", + "expiry_str,expected_date", [ # An expiry date in winter time (GMT) - date in GMT ISO 8601 format ('expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', date(2022, 10, 30)), @@ -520,15 +522,12 @@ def test_authenticate_with_delete_markered_document(store, delete_markered_docum ('rule-id="remove-old-documents", expiry-date="Tue, 01 Nov 2022 00:00:00 GMT"', date(2022, 10, 31)), ], ) -def test___convert_expiry_date_to_date_object(raw_expiry_rule, expected_date): - result = DocumentStore._convert_expiry_date_to_date_object(raw_expiry_rule) - assert result == expected_date +def test__get_expiry_date_from_expiration_header(expiry_str, expected_date): + assert DocumentStore._get_expiry_date_from_expiration_header({"Expiration": expiry_str}) == expected_date -def test_convert_expiry_date_to_date_object_logs_on_non_gmt_expiration(app, caplog): - DocumentStore._convert_expiry_date_to_date_object( - 'expiry-date="Mon, 31 Oct 2022 00:00:00 EST", rule-id="remove-old-documents"' - ) - - assert len(caplog.records) == 1 - assert caplog.records[0].message == "AWS S3 object expiration has unhandled timezone: EST" +def test__get_expiry_date_from_expiration_header(app, caplog): + with pytest.raises(CannotDetermineExpiration): + DocumentStore._get_expiry_date_from_expiration_header( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 EST", rule-id="remove-old-documents"'} + ) From a56743c62b83038fa008d2aa6dee61706581683d Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Mon, 9 Jun 2025 14:56:13 +0100 Subject: [PATCH 20/63] DocumentStore tests: expand to cover various cases of 
tag/Expiration header validity --- tests/utils/test_store.py | 129 +++++++++++++++++++++++++++++++++----- 1 file changed, 114 insertions(+), 15 deletions(-) diff --git a/tests/utils/test_store.py b/tests/utils/test_store.py index 79ad131d..647b1871 100644 --- a/tests/utils/test_store.py +++ b/tests/utils/test_store.py @@ -1,5 +1,5 @@ import uuid -from datetime import date +from datetime import date, datetime from unittest import mock import botocore @@ -510,24 +510,123 @@ def test_authenticate_with_delete_markered_document(store, delete_markered_docum @pytest.mark.parametrize( - "expiry_str,expected_date", + "s3_response,tags,expected_date", [ # An expiry date in winter time (GMT) - date in GMT ISO 8601 format - ('expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', date(2022, 10, 30)), + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + {}, + date(2022, 10, 30), + ), # An expiry date in summer time (BST) - still sent by AWS in GMT ISO 8601 format. 
- ('expiry-date="Wed, 26 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', date(2022, 10, 25)), + ( + {"Expiration": 'expiry-date="Wed, 26 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + {}, + date(2022, 10, 25), + ), # Swap the order of the key-value pairs - ('rule-id="remove-old-documents", expiry-date="Mon, 31 Oct 2022 00:00:00 GMT"', date(2022, 10, 30)), + ( + {"Expiration": 'rule-id="remove-old-documents", expiry-date="Mon, 31 Oct 2022 00:00:00 GMT"'}, + {}, + date(2022, 10, 30), + ), # Expiry date should handle month borders just fine - ('rule-id="remove-old-documents", expiry-date="Tue, 01 Nov 2022 00:00:00 GMT"', date(2022, 10, 31)), + ( + {"Expiration": 'rule-id="remove-old-documents", expiry-date="Tue, 01 Nov 2022 00:00:00 GMT"'}, + {}, + date(2022, 10, 31), + ), + # tag-based expiry is earlier so should be preferred + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 10, 26), + ), + # retention-period tag corrupt, preventing tag use & causing fallback to Expiration header + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2.0 apples", + }, + date(2022, 10, 30), + ), + # created-at tag corrupt, preventing tag use & causing fallback to LastModified header used with + # retention-period tag + ( + { + "Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', + "LastModified": datetime(2022, 9, 12, 12, 34, 56), + }, + { + "created-at": "20222-10-122T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 9, 26), + ), + # created-at tag missing, preventing tag use & causing fallback to LastModified header used with + # retention-period tag + ( + { + "Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", 
rule-id="remove-old-documents"', + "LastModified": datetime(2022, 9, 12, 12, 34, 56), + }, + { + "retention-period": "2 weeks", + }, + date(2022, 9, 26), + ), + # Expiration header is non-GMT so ignored in favour of (later) tag-based expiry date + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 EST", rule-id="remove-old-documents"'}, + { + "created-at": "2022-11-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 11, 26), + ), + # Expiration header is missing expiry-date so ignored in favour of tag-based expiry date, + # though not disregarding LastModified + ( + { + "Expiration": 'rule-id="remove-old-documents"', + "LastModified": datetime(2022, 9, 12, 12, 34, 56), + }, + { + "retention-period": "30 weeks", + }, + date(2023, 4, 10), + ), + # Expiration header missing, tags used + ( + {}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 10, 26), + ), + # Expiration header unparseable, tags used + ( + {"Expiration": "blah"}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 10, 26), + ), + # Both methods unusable + ( + {"Expiration": "blah"}, + { + "created-at": "2022-10-12T12:34:56+0000", + }, + None, + ), ], ) -def test__get_expiry_date_from_expiration_header(expiry_str, expected_date): - assert DocumentStore._get_expiry_date_from_expiration_header({"Expiration": expiry_str}) == expected_date - - -def test__get_expiry_date_from_expiration_header(app, caplog): - with pytest.raises(CannotDetermineExpiration): - DocumentStore._get_expiry_date_from_expiration_header( - {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 EST", rule-id="remove-old-documents"'} - ) +def test__get_effective_expiry_date(s3_response, tags, expected_date): + assert DocumentStore._get_effective_expiry_date(s3_response, tags) == expected_date From c5aef2d4d81b0c278e83179db4874cd95e71867e Mon Sep 17 00:00:00 2001 From: Wesley Hindle Date: Thu, 24 Jul 2025 17:17:01 
+0100 Subject: [PATCH 21/63] chore: bump utils to 100.2.0 Bump utils to 100.2.0 ## 100.2.0 * add fields to pre/post request flask logs ## 100.1.0 * Updated to version 9.0.9 to keep phonenumber metadata uptodate ## 100.0.0 * is now a required argument of (apps have already been updated) ## 99.8.0 * Add new version of GOV.UK brand to email template, behind a flag ## 99.7.0 * Update economy letter transit dates to max 8 days ## 99.6.0 * Improve celery json logging. Include beat with separate log level options and testing ## 99.5.2 * Make inheritence of annotations on SerialisedModel work on both the class and its instances *** Complete changes: https://github.com/alphagov/notifications-utils/compare/99.5.1...100.2.0 --- .pre-commit-config.yaml | 2 +- requirements.in | 2 +- requirements.txt | 10 ++++++---- requirements_for_test.txt | 7 ++++--- requirements_for_test_common.in | 2 +- ruff.toml | 2 +- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 081632b2..cf0b5a54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@99.5.1 +# This file was automatically copied from notifications-utils@100.2.0 repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/requirements.in b/requirements.in index c1f51091..bd011d8f 100644 --- a/requirements.in +++ b/requirements.in @@ -9,6 +9,6 @@ gds-metrics==0.2.4 argon2-cffi==21.3.0 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@99.5.1 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@100.2.0 sentry-sdk[flask]==1.45.1 diff --git a/requirements.txt b/requirements.txt index 88f63af6..8364300a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -71,13 +71,13 @@ markupsafe==2.1.3 # werkzeug mistune==0.8.4 # via notifications-utils 
-notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d669a9544cc87372a10af8786d3bb172593261fa +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@bb31f41dd194802ced5ea814f84cc25855467234 # via -r requirements.in ordered-set==4.1.0 # via notifications-utils packaging==23.2 # via gunicorn -phonenumbers==8.13.52 +phonenumbers==9.0.10 # via notifications-utils prometheus-client==0.19.0 # via gds-metrics @@ -87,8 +87,10 @@ pycparser==2.21 # via cffi pypdf==3.17.4 # via notifications-utils -python-dateutil==2.8.2 - # via botocore +python-dateutil==2.9.0.post0 + # via + # botocore + # notifications-utils python-json-logger==3.3.0 # via notifications-utils python-magic==0.4.25 diff --git a/requirements_for_test.txt b/requirements_for_test.txt index a95dc5a2..a7ecf97a 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -120,7 +120,7 @@ mistune==0.8.4 # via # -r requirements.txt # notifications-utils -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@d669a9544cc87372a10af8786d3bb172593261fa +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@bb31f41dd194802ced5ea814f84cc25855467234 # via -r requirements.txt ordered-set==4.1.0 # via @@ -131,7 +131,7 @@ packaging==23.2 # -r requirements.txt # gunicorn # pytest -phonenumbers==8.13.52 +phonenumbers==9.0.10 # via # -r requirements.txt # notifications-utils @@ -168,11 +168,12 @@ pytest-testmon==2.1.1 # via -r requirements_for_test_common.in pytest-xdist==3.6.1 # via -r requirements_for_test_common.in -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # -r requirements.txt # botocore # freezegun + # notifications-utils python-json-logger==3.3.0 # via # -r requirements.txt diff --git a/requirements_for_test_common.in b/requirements_for_test_common.in index cf7c3b2f..6f622a88 100644 --- a/requirements_for_test_common.in +++ b/requirements_for_test_common.in @@ -1,4 +1,4 @@ -# This file was 
automatically copied from notifications-utils@99.5.1 +# This file was automatically copied from notifications-utils@100.2.0 beautifulsoup4==4.12.3 pytest==8.3.4 diff --git a/ruff.toml b/ruff.toml index 44dbbe66..348e96e8 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@99.5.1 +# This file was automatically copied from notifications-utils@100.2.0 extend-exclude = [ "migrations/versions/", From f3471b8bf561bb29ca7efc9baf06f60b1925baf5 Mon Sep 17 00:00:00 2001 From: Mervi Tyczynska Date: Wed, 3 Sep 2025 17:52:09 +0100 Subject: [PATCH 22/63] Update how to check for the right Python version in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9dd414ee..43d61286 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ### Python version -Check the version is [runtime.txt](runtime.txt) +Check the version is the same as at the top of [Dockerfile](docker/Dockerfile) ### uv From e69ec1d4f76ce011b3aa2efabe68572937715247 Mon Sep 17 00:00:00 2001 From: Richard Parke Date: Tue, 16 Sep 2025 14:26:57 +0100 Subject: [PATCH 23/63] Put mimetype and antivirus checks into their own method --- app/utils/file_checks.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 app/utils/file_checks.py diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py new file mode 100644 index 00000000..b72f92ca --- /dev/null +++ b/app/utils/file_checks.py @@ -0,0 +1,39 @@ +from flask import current_app +from notifications_utils.clients.antivirus.antivirus_client import AntivirusError + +from app import antivirus_client +from app.utils import get_mime_type + + +class FiletypeError(Exception): + def __init__(self, error_message=None, status_code=None): + self.error_message = error_message + self.status_code = status_code + + +def run_antivirus_checks(file_data): + try: + virus_free = antivirus_client.scan(file_data) + 
except AntivirusError as e: + raise AntivirusError(message="Antivirus API error", status_code=503) from e + + if not virus_free: + raise AntivirusError(message="File did not pass the virus scan", status_code=400) + return virus_free + + +def run_mimetype_checks(file_data, is_csv): + mimetype = get_mime_type(file_data) + # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use + # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv + if is_csv and mimetype == "text/plain": + mimetype = "text/csv" + if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: + allowed_file_types = ", ".join( + sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) + ) + raise FiletypeError( + error_message=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}", + status_code=400, + ) + return mimetype From 31d84e5e7e98cd606314f307caf5a6968c844317 Mon Sep 17 00:00:00 2001 From: Mervi Tyczynska Date: Wed, 3 Sep 2025 18:03:40 +0100 Subject: [PATCH 24/63] Upgrade Python version from 3.11 to 3.13 As a part of general python upgrade work. 
--- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6e9418c3..55e5fcd0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11-slim-bookworm AS base +FROM python:3.13-slim-bookworm AS base COPY --from=ghcr.io/astral-sh/uv:0.5.30 /uv /uvx /bin/ From c79febc34beee4c12e7f64a0fc6b9c18296a17e6 Mon Sep 17 00:00:00 2001 From: Richard Parke Date: Tue, 16 Sep 2025 14:29:14 +0100 Subject: [PATCH 25/63] new endpoint and tests --- app/__init__.py | 2 + app/file_checks/__init__.py | 1 + app/file_checks/errors.py | 8 ++++ app/file_checks/views.py | 52 ++++++++++++++++++++++++ app/upload/views.py | 23 +++++++++++ tests/file_checks/__init__.py | 0 tests/file_checks/test_views.py | 70 +++++++++++++++++++++++++++++++++ 7 files changed, 156 insertions(+) create mode 100644 app/file_checks/__init__.py create mode 100644 app/file_checks/errors.py create mode 100644 app/file_checks/views.py create mode 100644 tests/file_checks/__init__.py create mode 100644 tests/file_checks/test_views.py diff --git a/app/__init__.py b/app/__init__.py index 06ea0683..a20e5b13 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -49,6 +49,7 @@ from app.download.views import download_blueprint +from app.file_checks.views import file_checks_blueprint from app.upload.views import upload_blueprint mimetypes.init() @@ -74,6 +75,7 @@ def create_app(): application.register_blueprint(download_blueprint) application.register_blueprint(upload_blueprint) + application.register_blueprint(file_checks_blueprint) @application.errorhandler(EventletTimeout) def eventlet_timeout(error): diff --git a/app/file_checks/__init__.py b/app/file_checks/__init__.py new file mode 100644 index 00000000..0218afa6 --- /dev/null +++ b/app/file_checks/__init__.py @@ -0,0 +1 @@ +from .errors import * # noqa import errors module to register error handlers diff --git a/app/file_checks/errors.py b/app/file_checks/errors.py new 
file mode 100644 index 00000000..4a0ee146 --- /dev/null +++ b/app/file_checks/errors.py @@ -0,0 +1,8 @@ +from flask import jsonify + +from .views import file_checks_blueprint + + +@file_checks_blueprint.errorhandler(413) +def request_entity_too_large(error): + return jsonify(error="Uploaded file exceeds file size limit"), 413 diff --git a/app/file_checks/views.py b/app/file_checks/views.py new file mode 100644 index 00000000..4b8d325d --- /dev/null +++ b/app/file_checks/views.py @@ -0,0 +1,52 @@ +from base64 import b64decode, binascii +from io import BytesIO + +from flask import Blueprint, abort, current_app, jsonify, request +from notifications_utils.clients.antivirus.antivirus_client import AntivirusError +from werkzeug.exceptions import BadRequest + +from app.utils.authentication import check_auth +from app.utils.file_checks import FiletypeError, run_antivirus_checks, run_mimetype_checks + +file_checks_blueprint = Blueprint("file_checks", __name__, url_prefix="") +file_checks_blueprint.before_request(check_auth) + + +def _get_upload_document_request_data(data): + if "document" not in data: + raise BadRequest("No document upload") + + try: + raw_content = b64decode(data["document"]) + except (binascii.Error, ValueError) as e: + raise BadRequest("Document is not base64 encoded") from e + + if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: + abort(413) + file_data = BytesIO(raw_content) + is_csv = data.get("is_csv", False) + + if not isinstance(is_csv, bool): + raise BadRequest("Value for is_csv must be a boolean") + + return file_data, is_csv + + +@file_checks_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) +def get_mime_type_and_run_antivirus_scan(): + try: + ( + file_data, + is_csv, + ) = _get_upload_document_request_data(request.json) + except BadRequest as e: + return jsonify(error=e.description), 400 + try: + virus_free = run_antivirus_checks(file_data) + except AntivirusError as e: + return jsonify(error=e.message), 
e.status_code + try: + mimetype = run_mimetype_checks(file_data, is_csv) + except FiletypeError as e: + return jsonify(error=e.error_message), e.status_code + return jsonify(virus_free=virus_free, mimetype=mimetype), 200 diff --git a/app/upload/views.py b/app/upload/views.py index 2b4102f1..0934aeb3 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -63,6 +63,29 @@ def _get_upload_document_request_data(data): # noqa: C901 return file_data, is_csv, confirmation_email, retention_period, filename +# @upload_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) +# def get_mime_type_and_run_antivirus_scan(): +# try: +# file_data, is_csv, confirmation_email, retention_period, filename = _get_upload_document_request_data( +# request.json +# ) +# except BadRequest as e: +# return jsonify(error=e.description), 400 +# try: +# virus_free = antivirus_client.scan(file_data) +# except AntivirusError: +# return jsonify(error="Antivirus API error"), 503 + +# if not virus_free: +# return jsonify(error="File did not pass the virus scan"), 400 +# mimetype = get_mime_type(file_data) +# # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use +# # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv +# if is_csv and mimetype == "text/plain": +# mimetype = "text/csv" +# return jsonify(virus_free=virus_free, mimetype=mimetype), 200 + + @upload_blueprint.route("/services//documents", methods=["POST"]) def upload_document(service_id): try: diff --git a/tests/file_checks/__init__.py b/tests/file_checks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py new file mode 100644 index 00000000..6cf5b5a3 --- /dev/null +++ b/tests/file_checks/test_views.py @@ -0,0 +1,70 @@ +import base64 + +import pytest +from notifications_utils.clients.antivirus.antivirus_client import AntivirusError + + +def _file_checks(client, file_content): + 
url = "/antivirus_and_mimetype_check" + data = { + "document": base64.b64encode(file_content).decode("utf-8"), + } + response = client.post(url, json=data) + return response + + +@pytest.fixture +def antivirus(mocker): + return mocker.patch( + "app.utils.file_checks.antivirus_client", + # prevent LocalProxy being detected as an async function + new_callable=mocker.MagicMock, + ) + + +def test_file_checks_virus_found(client, antivirus): + antivirus.scan.return_value = False + + file_content = b"%PDF-1.4 file contents" + response = _file_checks(client, file_content) + + assert response.status_code == 400 + assert response.json == {"error": "File did not pass the virus scan"} + + +def test_file_checks_returns_json_object_with_expected_results(client, antivirus): + antivirus.scan.return_value = True + file_content = b"%PDF-1.4 file contents" + response = _file_checks(client, file_content) + assert response.status_code == 200 + assert response.json == {"mimetype": "application/pdf", "virus_free": True} + + +def test_file_checks_virus_scan_error(client, antivirus): + antivirus.scan.side_effect = AntivirusError(503, "connection error") + + file_content = b"%PDF-1.4 file contents" + response = _file_checks(client, file_content) + + assert response.status_code == 503 + assert response.json == {"error": "Antivirus API error"} + + +def test_file_checks_invalid_encoding(client): + response = client.post("/antivirus_and_mimetype_check", json={"document": "foo"}) + + assert response.status_code == 400 + assert response.json == {"error": "Document is not base64 encoded"} + + +def test_file_checks_unknown_type(client, antivirus): + file_content = b"\x00pdf file contents\n" + + response = _file_checks(client, file_content) + + assert response.status_code == 400 + assert response.json["error"] == ( + "Unsupported file type 'application/octet-stream'. 
" + "Supported types are: '.csv', '.doc', '.docx', '.jpeg', '.jpg', '.json', '.odt', '.pdf', '.png', '.rtf', " + "'.txt', '.xlsx'" + ) From fae29435cd47a1eadd66b1e5003b30586fc41ba4 Mon Sep 17 00:00:00 2001 From: Richard Parke Date: Tue, 16 Sep 2025 15:29:22 +0100 Subject: [PATCH 26/63] working example of cachable function Refactor Antivirus on doc upload Clean up WIP WIP WIP WIP WIP tests passing WIP tests passing clean up WIP WIP passing tests WIP passing tests refactored mimetype and av checks into single shared cachable function refactor into cachable method rename working example of cachable function --- app/file_checks/views.py | 92 ++++++++++++++++++++++++++++----- app/upload/views.py | 60 ++++----------------- app/utils/file_checks.py | 46 +++++++++++------ tests/file_checks/test_views.py | 2 +- tests/upload/test_views.py | 2 +- 5 files changed, 122 insertions(+), 80 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 4b8d325d..e9456c56 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -1,18 +1,43 @@ +import mimetypes from base64 import b64decode, binascii +from dataclasses import dataclass from io import BytesIO from flask import Blueprint, abort, current_app, jsonify, request from notifications_utils.clients.antivirus.antivirus_client import AntivirusError from werkzeug.exceptions import BadRequest +from app import antivirus_client +from app.utils import get_mime_type from app.utils.authentication import check_auth -from app.utils.file_checks import FiletypeError, run_antivirus_checks, run_mimetype_checks +from app.utils.files import split_filename +from app.utils.validation import ( + validate_filename, +) file_checks_blueprint = Blueprint("file_checks", __name__, url_prefix="") file_checks_blueprint.before_request(check_auth) -def _get_upload_document_request_data(data): +@dataclass +class ErrorResponse: + error: str + status_code: int + + +@dataclass +class SuccessResponse: + virus_free: bool + 
mimetype: str + + +class FiletypeError(Exception): + def __init__(self, message=None, status_code=None): + self.message = message + self.status_code = status_code + + +def _get_file_content_and_is_csv_from_document_request_data(data): if "document" not in data: raise BadRequest("No document upload") @@ -29,7 +54,41 @@ def _get_upload_document_request_data(data): if not isinstance(is_csv, bool): raise BadRequest("Value for is_csv must be a boolean") - return file_data, is_csv + filename = data.get("filename", None) + if filename: + try: + filename = validate_filename(filename) + except ValueError as e: + raise BadRequest(str(e)) from e + + return file_data, is_csv, filename + + +def _run_mime_type_check_and_antivirus_scan(file_data, is_csv, filename=None): + virus_free = False + if current_app.config["ANTIVIRUS_ENABLED"]: + try: + virus_free = antivirus_client.scan(file_data) + except AntivirusError as e: + raise AntivirusError(message="Antivirus API error", status_code=503) from e + if not virus_free: + raise AntivirusError(message="File did not pass the virus scan", status_code=400) + if filename: + mimetype = mimetypes.types_map[split_filename(filename, dotted=True)[1]] + else: + mimetype = get_mime_type(file_data) + # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use + # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv + if is_csv and mimetype == "text/plain": + mimetype = "text/csv" + if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: + allowed_file_types = ", ".join( + sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) + ) + raise FiletypeError( + message=f"Unsupported file type '{mimetype}'. 
Supported types are: {allowed_file_types}", status_code=400 + ) + return virus_free, mimetype @file_checks_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) @@ -38,15 +97,24 @@ def get_mime_type_and_run_antivirus_scan(): ( file_data, is_csv, - ) = _get_upload_document_request_data(request.json) + filename, + ) = _get_file_content_and_is_csv_from_document_request_data(request.json) except BadRequest as e: return jsonify(error=e.description), 400 + result = get_mime_type_and_run_antivirus_scan_json(file_data, is_csv, filename) + if "success" in result.keys(): + virus_free = result.get("success").get("virus_free") + mimetype = result.get("success").get("mimetype") + return jsonify(virus_free=virus_free, mimetype=mimetype), 200 + if "failure" in result.keys(): + error = result.get("failure").get("error") + status_code = result.get("failure").get("status_code") + return jsonify(error=error), status_code + + +def get_mime_type_and_run_antivirus_scan_json(file_data, is_csv, filename=None): try: - virus_free = run_antivirus_checks(file_data) - except AntivirusError as e: - return jsonify(error=e.message), e.status_code - try: - mimetype = run_mimetype_checks(file_data, is_csv) - except FiletypeError as e: - return jsonify(error=e.error_message), e.status_code - return jsonify(virus_free=virus_free, mimetype=mimetype), 200 + virus_free, mimetype = _run_mime_type_check_and_antivirus_scan(file_data, is_csv, filename) + return {"success": {"virus_free": virus_free, "mimetype": mimetype}} + except Exception as e: + return {"failure": {"error": e.message, "status_code": e.status_code}} diff --git a/app/upload/views.py b/app/upload/views.py index 0934aeb3..a17853d4 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -1,16 +1,13 @@ -import mimetypes from base64 import b64decode, binascii from io import BytesIO from flask import Blueprint, abort, current_app, jsonify, request -from notifications_utils.clients.antivirus.antivirus_client import 
AntivirusError from notifications_utils.recipient_validation.errors import InvalidEmailError from werkzeug.exceptions import BadRequest -from app import antivirus_client, document_store -from app.utils import get_mime_type +from app import document_store +from app.file_checks.views import get_mime_type_and_run_antivirus_scan_json from app.utils.authentication import check_auth -from app.utils.files import split_filename from app.utils.urls import get_direct_file_url, get_frontend_download_url from app.utils.validation import ( clean_and_validate_email_address, @@ -63,29 +60,6 @@ def _get_upload_document_request_data(data): # noqa: C901 return file_data, is_csv, confirmation_email, retention_period, filename -# @upload_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) -# def get_mime_type_and_run_antivirus_scan(): -# try: -# file_data, is_csv, confirmation_email, retention_period, filename = _get_upload_document_request_data( -# request.json -# ) -# except BadRequest as e: -# return jsonify(error=e.description), 400 -# try: -# virus_free = antivirus_client.scan(file_data) -# except AntivirusError: -# return jsonify(error="Antivirus API error"), 503 - -# if not virus_free: -# return jsonify(error="File did not pass the virus scan"), 400 -# mimetype = get_mime_type(file_data) -# # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use -# # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv -# if is_csv and mimetype == "text/plain": -# mimetype = "text/csv" -# return jsonify(virus_free=virus_free, mimetype=mimetype), 200 - - @upload_blueprint.route("/services//documents", methods=["POST"]) def upload_document(service_id): try: @@ -95,30 +69,16 @@ def upload_document(service_id): except BadRequest as e: return jsonify(error=e.description), 400 - if current_app.config["ANTIVIRUS_ENABLED"]: - try: - virus_free = antivirus_client.scan(file_data) - except AntivirusError: - return 
jsonify(error="Antivirus API error"), 503 - + result = get_mime_type_and_run_antivirus_scan_json(file_data, is_csv, filename) + if "success" in result.keys(): + virus_free = result.get("success").get("virus_free") + mimetype = result.get("success").get("mimetype") if not virus_free: return jsonify(error="File did not pass the virus scan"), 400 - - if filename: - mimetype = mimetypes.types_map[split_filename(filename, dotted=True)[1]] - else: - mimetype = get_mime_type(file_data) - - # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use - # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv - if is_csv and mimetype == "text/plain": - mimetype = "text/csv" - - if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: - allowed_file_types = ", ".join( - sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) - ) - return jsonify(error=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}"), 400 + if "failure" in result.keys(): + error = result.get("failure").get("error") + status_code = result.get("failure").get("status_code") + return jsonify(error=error), status_code document = document_store.put( service_id, diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index b72f92ca..aad660d7 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -1,8 +1,11 @@ +import mimetypes + from flask import current_app from notifications_utils.clients.antivirus.antivirus_client import AntivirusError from app import antivirus_client from app.utils import get_mime_type +from app.utils.files import split_filename class FiletypeError(Exception): @@ -11,23 +14,34 @@ def __init__(self, error_message=None, status_code=None): self.status_code = status_code +def run_antivirus_and_mimetype_checks(file_data): + pass + + def run_antivirus_checks(file_data): - try: - virus_free = antivirus_client.scan(file_data) - except 
AntivirusError as e: - raise AntivirusError(message="Antivirus API error", status_code=503) from e - - if not virus_free: - raise AntivirusError(message="File did not pass the virus scan", status_code=400) - return virus_free - - -def run_mimetype_checks(file_data, is_csv): - mimetype = get_mime_type(file_data) - # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use - # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv - if is_csv and mimetype == "text/plain": - mimetype = "text/csv" + if current_app.config["ANTIVIRUS_ENABLED"]: + results = {} + try: + virus_free = antivirus_client.scan(file_data) + except AntivirusError: + return {"message": "Antivirus API error", "status_code": 503} + if virus_free: + results["virus_free"] = True + else: + results["virus_free"] = False + return results + return + + +def run_mimetype_checks(file_data, is_csv, filename=None): + if filename: + mimetype = mimetypes.types_map[split_filename(filename, dotted=True)[1]] + else: + mimetype = get_mime_type(file_data) + # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use + # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv + if is_csv and mimetype == "text/plain": + mimetype = "text/csv" if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: allowed_file_types = ", ".join( sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 6cf5b5a3..53713672 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -16,7 +16,7 @@ def _file_checks(client, file_content): @pytest.fixture def antivirus(mocker): return mocker.patch( - "app.utils.file_checks.antivirus_client", + "app.file_checks.views.antivirus_client", # prevent LocalProxy being detected as an async function new_callable=mocker.MagicMock, ) diff --git 
a/tests/upload/test_views.py b/tests/upload/test_views.py index e5614fa7..7ec34ebd 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -22,7 +22,7 @@ def store(mocker): @pytest.fixture def antivirus(mocker): return mocker.patch( - "app.upload.views.antivirus_client", + "app.file_checks.views.antivirus_client", # prevent LocalProxy being detected as an async function new_callable=mocker.MagicMock, ) From 903acf56163c5635b1408a2295a3796e7a08db11 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 11:22:45 +0100 Subject: [PATCH 27/63] Refactor into a class This avoid the pattern where we return a tuple of variables from one function just to pass it into another one. --- app/file_checks/views.py | 97 ++++++++++++++++++++++++-------------- app/upload/views.py | 67 ++++---------------------- tests/upload/test_views.py | 18 ++----- 3 files changed, 75 insertions(+), 107 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index e9456c56..f6db7377 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -5,6 +5,7 @@ from flask import Blueprint, abort, current_app, jsonify, request from notifications_utils.clients.antivirus.antivirus_client import AntivirusError +from notifications_utils.recipient_validation.errors import InvalidEmailError from werkzeug.exceptions import BadRequest from app import antivirus_client @@ -12,6 +13,8 @@ from app.utils.authentication import check_auth from app.utils.files import split_filename from app.utils.validation import ( + clean_and_validate_email_address, + clean_and_validate_retention_period, validate_filename, ) @@ -37,31 +40,67 @@ def __init__(self, message=None, status_code=None): self.status_code = status_code -def _get_file_content_and_is_csv_from_document_request_data(data): - if "document" not in data: - raise BadRequest("No document upload") +class UploadedFile: + def __init__(self, file_data, is_csv, confirmation_email, retention_period, 
filename): + self.file_data = file_data + self.is_csv = is_csv + self.filename = filename + self.confirmation_email = confirmation_email + self.retention_period = retention_period - try: - raw_content = b64decode(data["document"]) - except (binascii.Error, ValueError) as e: - raise BadRequest("Document is not base64 encoded") from e - - if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: - abort(413) - file_data = BytesIO(raw_content) - is_csv = data.get("is_csv", False) - - if not isinstance(is_csv, bool): - raise BadRequest("Value for is_csv must be a boolean") + @classmethod + def from_request_json(cls, data): # noqa: C901 + if "document" not in data: + raise BadRequest("No document upload") - filename = data.get("filename", None) - if filename: try: - filename = validate_filename(filename) - except ValueError as e: - raise BadRequest(str(e)) from e + raw_content = b64decode(data["document"]) + except (binascii.Error, ValueError) as e: + raise BadRequest("Document is not base64 encoded") from e + + if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: + abort(413) + file_data = BytesIO(raw_content) + is_csv = data.get("is_csv", False) + + if not isinstance(is_csv, bool): + raise BadRequest("Value for is_csv must be a boolean") + + confirmation_email = data.get("confirmation_email", None) + if confirmation_email is not None: + try: + confirmation_email = clean_and_validate_email_address(confirmation_email) + except InvalidEmailError as e: + raise BadRequest(str(e)) from e + + retention_period = data.get("retention_period", None) + if retention_period is not None: + try: + retention_period = clean_and_validate_retention_period(retention_period) + except ValueError as e: + raise BadRequest(str(e)) from e + + filename = data.get("filename", None) + if filename: + try: + filename = validate_filename(filename) + except ValueError as e: + raise BadRequest(str(e)) from e + + return cls( + file_data=file_data, + is_csv=is_csv, + 
confirmation_email=confirmation_email, + retention_period=retention_period, + filename=filename, + ) - return file_data, is_csv, filename + def get_mime_type_and_run_antivirus_scan_json(self): + try: + virus_free, mimetype = _run_mime_type_check_and_antivirus_scan(self.file_data, self.is_csv, self.filename) + return {"success": {"virus_free": virus_free, "mimetype": mimetype}} + except Exception as e: + return {"failure": {"error": e.message, "status_code": e.status_code}} def _run_mime_type_check_and_antivirus_scan(file_data, is_csv, filename=None): @@ -94,14 +133,10 @@ def _run_mime_type_check_and_antivirus_scan(file_data, is_csv, filename=None): @file_checks_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) def get_mime_type_and_run_antivirus_scan(): try: - ( - file_data, - is_csv, - filename, - ) = _get_file_content_and_is_csv_from_document_request_data(request.json) + uploaded_file = UploadedFile.from_request_json(request.json) except BadRequest as e: return jsonify(error=e.description), 400 - result = get_mime_type_and_run_antivirus_scan_json(file_data, is_csv, filename) + result = uploaded_file.get_mime_type_and_run_antivirus_scan_json() if "success" in result.keys(): virus_free = result.get("success").get("virus_free") mimetype = result.get("success").get("mimetype") @@ -110,11 +145,3 @@ def get_mime_type_and_run_antivirus_scan(): error = result.get("failure").get("error") status_code = result.get("failure").get("status_code") return jsonify(error=error), status_code - - -def get_mime_type_and_run_antivirus_scan_json(file_data, is_csv, filename=None): - try: - virus_free, mimetype = _run_mime_type_check_and_antivirus_scan(file_data, is_csv, filename) - return {"success": {"virus_free": virus_free, "mimetype": mimetype}} - except Exception as e: - return {"failure": {"error": e.message, "status_code": e.status_code}} diff --git a/app/upload/views.py b/app/upload/views.py index a17853d4..9eaf3867 100644 --- a/app/upload/views.py +++ 
b/app/upload/views.py @@ -1,75 +1,24 @@ -from base64 import b64decode, binascii -from io import BytesIO -from flask import Blueprint, abort, current_app, jsonify, request -from notifications_utils.recipient_validation.errors import InvalidEmailError +from flask import Blueprint, jsonify, request from werkzeug.exceptions import BadRequest from app import document_store -from app.file_checks.views import get_mime_type_and_run_antivirus_scan_json +from app.file_checks.views import UploadedFile from app.utils.authentication import check_auth from app.utils.urls import get_direct_file_url, get_frontend_download_url -from app.utils.validation import ( - clean_and_validate_email_address, - clean_and_validate_retention_period, - validate_filename, -) upload_blueprint = Blueprint("upload", __name__, url_prefix="") upload_blueprint.before_request(check_auth) -def _get_upload_document_request_data(data): # noqa: C901 - if "document" not in data: - raise BadRequest("No document upload") - - try: - raw_content = b64decode(data["document"]) - except (binascii.Error, ValueError) as e: - raise BadRequest("Document is not base64 encoded") from e - - if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: - abort(413) - file_data = BytesIO(raw_content) - is_csv = data.get("is_csv", False) - - if not isinstance(is_csv, bool): - raise BadRequest("Value for is_csv must be a boolean") - - confirmation_email = data.get("confirmation_email", None) - if confirmation_email is not None: - try: - confirmation_email = clean_and_validate_email_address(confirmation_email) - except InvalidEmailError as e: - raise BadRequest(str(e)) from e - - retention_period = data.get("retention_period", None) - if retention_period is not None: - try: - retention_period = clean_and_validate_retention_period(retention_period) - except ValueError as e: - raise BadRequest(str(e)) from e - - filename = data.get("filename", None) - if filename: - try: - filename = validate_filename(filename) - except 
ValueError as e: - raise BadRequest(str(e)) from e - - return file_data, is_csv, confirmation_email, retention_period, filename - - @upload_blueprint.route("/services//documents", methods=["POST"]) def upload_document(service_id): try: - file_data, is_csv, confirmation_email, retention_period, filename = _get_upload_document_request_data( - request.json - ) + uploaded_file = UploadedFile.from_request_json(request.json) except BadRequest as e: return jsonify(error=e.description), 400 - result = get_mime_type_and_run_antivirus_scan_json(file_data, is_csv, filename) + result = uploaded_file.get_mime_type_and_run_antivirus_scan_json() if "success" in result.keys(): virus_free = result.get("success").get("virus_free") mimetype = result.get("success").get("mimetype") @@ -82,11 +31,11 @@ def upload_document(service_id): document = document_store.put( service_id, - file_data, + uploaded_file.file_data, mimetype=mimetype, - confirmation_email=confirmation_email, - retention_period=retention_period, - filename=filename, + confirmation_email=uploaded_file.confirmation_email, + retention_period=uploaded_file.retention_period, + filename=uploaded_file.filename, ) return ( diff --git a/tests/upload/test_views.py b/tests/upload/test_views.py index 7ec34ebd..4cff629d 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -1,13 +1,12 @@ import base64 from pathlib import Path -from unittest import mock import pytest from notifications_utils.clients.antivirus.antivirus_client import AntivirusError from werkzeug.exceptions import BadRequest import app -from app.upload.views import _get_upload_document_request_data +from app.upload.views import UploadedFile @pytest.fixture @@ -166,14 +165,11 @@ def test_document_file_size_too_large_werkzeug_content_length(client): # Gets hit by Werkzeug's 3MiB content length limit automatically (before our app logic). 
file_content = b"a" * (3 * 1024 * 1024 + 1) - with mock.patch( - "app.upload.views._get_upload_document_request_data", wraps=_get_upload_document_request_data - ) as mock_get_data: - response = _document_upload(client, url, file_content) + response = _document_upload(client, url, file_content) assert response.status_code == 413 assert response.json == {"error": "Uploaded file exceeds file size limit"} - assert mock_get_data.call_count == 0 + # Mock assertion missing here def test_document_file_size_too_large_b64_decoded_content(client): @@ -181,14 +177,10 @@ def test_document_file_size_too_large_b64_decoded_content(client): # Gets through Werkzeug's 3MiB content length limit, but too big for our python ~2MiB check. file_content = b"a" * (2 * 1024 * 1024 + 1025) - with mock.patch( - "app.upload.views._get_upload_document_request_data", wraps=_get_upload_document_request_data - ) as mock_get_data: - response = _document_upload(client, url, file_content) + response = _document_upload(client, url, file_content) assert response.status_code == 413 assert response.json == {"error": "Uploaded file exceeds file size limit"} - assert mock_get_data.call_count == 1 def test_document_upload_no_document(client): @@ -439,6 +431,6 @@ def test_document_upload_bad_is_csv_value(client): ) def test_get_upload_document_request_data_errors(app, data, expected_error): with pytest.raises(BadRequest) as e: - _get_upload_document_request_data(data) + UploadedFile.from_request_json(data) assert e.value.description == expected_error From fd506269fcb38da4e770fd4b6fa068d8ea662f9b Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 11:26:23 +0100 Subject: [PATCH 28/63] Refactor antivirus checks into class This is another function which was entirely coupled to the same 3 arguments which are now properties of the class --- app/file_checks/views.py | 54 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/app/file_checks/views.py 
b/app/file_checks/views.py index f6db7377..a8ea32fe 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -97,37 +97,37 @@ def from_request_json(cls, data): # noqa: C901 def get_mime_type_and_run_antivirus_scan_json(self): try: - virus_free, mimetype = _run_mime_type_check_and_antivirus_scan(self.file_data, self.is_csv, self.filename) + virus_free, mimetype = self._run_mime_type_check_and_antivirus_scan() return {"success": {"virus_free": virus_free, "mimetype": mimetype}} except Exception as e: return {"failure": {"error": e.message, "status_code": e.status_code}} - -def _run_mime_type_check_and_antivirus_scan(file_data, is_csv, filename=None): - virus_free = False - if current_app.config["ANTIVIRUS_ENABLED"]: - try: - virus_free = antivirus_client.scan(file_data) - except AntivirusError as e: - raise AntivirusError(message="Antivirus API error", status_code=503) from e - if not virus_free: - raise AntivirusError(message="File did not pass the virus scan", status_code=400) - if filename: - mimetype = mimetypes.types_map[split_filename(filename, dotted=True)[1]] - else: - mimetype = get_mime_type(file_data) - # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use - # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv - if is_csv and mimetype == "text/plain": - mimetype = "text/csv" - if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: - allowed_file_types = ", ".join( - sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) - ) - raise FiletypeError( - message=f"Unsupported file type '{mimetype}'. 
Supported types are: {allowed_file_types}", status_code=400 - ) - return virus_free, mimetype + def _run_mime_type_check_and_antivirus_scan(self): + virus_free = False + if current_app.config["ANTIVIRUS_ENABLED"]: + try: + virus_free = antivirus_client.scan(self.file_data) + except AntivirusError as e: + raise AntivirusError(message="Antivirus API error", status_code=503) from e + if not virus_free: + raise AntivirusError(message="File did not pass the virus scan", status_code=400) + if self.filename: + mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] + else: + mimetype = get_mime_type(self.file_data) + # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use + # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv + if self.is_csv and mimetype == "text/plain": + mimetype = "text/csv" + if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: + allowed_file_types = ", ".join( + sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) + ) + raise FiletypeError( + message=f"Unsupported file type '{mimetype}'. 
Supported types are: {allowed_file_types}", + status_code=400, + ) + return virus_free, mimetype @file_checks_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) From 7cc5849f1a640afe0c6e17e7728ce81b4e55ef6f Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 11:31:53 +0100 Subject: [PATCH 29/63] Refactor virus_free and mimetype into properties Having a function which returns a heterogenous tuple smells like a smell --- app/file_checks/views.py | 12 ++++++++---- app/upload/views.py | 1 - 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index a8ea32fe..2ac75823 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -97,12 +97,12 @@ def from_request_json(cls, data): # noqa: C901 def get_mime_type_and_run_antivirus_scan_json(self): try: - virus_free, mimetype = self._run_mime_type_check_and_antivirus_scan() - return {"success": {"virus_free": virus_free, "mimetype": mimetype}} + return {"success": {"virus_free": self.virus_free, "mimetype": self.mimetype}} except Exception as e: return {"failure": {"error": e.message, "status_code": e.status_code}} - def _run_mime_type_check_and_antivirus_scan(self): + @property + def virus_free(self): virus_free = False if current_app.config["ANTIVIRUS_ENABLED"]: try: @@ -111,6 +111,10 @@ def _run_mime_type_check_and_antivirus_scan(self): raise AntivirusError(message="Antivirus API error", status_code=503) from e if not virus_free: raise AntivirusError(message="File did not pass the virus scan", status_code=400) + return virus_free + + @property + def mimetype(self): if self.filename: mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] else: @@ -127,7 +131,7 @@ def _run_mime_type_check_and_antivirus_scan(self): message=f"Unsupported file type '{mimetype}'. 
Supported types are: {allowed_file_types}", status_code=400, ) - return virus_free, mimetype + return mimetype @file_checks_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) diff --git a/app/upload/views.py b/app/upload/views.py index 9eaf3867..c902ce2d 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -1,4 +1,3 @@ - from flask import Blueprint, jsonify, request from werkzeug.exceptions import BadRequest From eef708c1762307c263e9935e59220b43b5e3546c Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 11:33:16 +0100 Subject: [PATCH 30/63] Refactor virus_free with early return This reduces the level of nesting, and makes it clearer what happens if `ANTIVIRUS_ENABLED` is `False` --- app/file_checks/views.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 2ac75823..7f32deba 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -103,15 +103,14 @@ def get_mime_type_and_run_antivirus_scan_json(self): @property def virus_free(self): - virus_free = False - if current_app.config["ANTIVIRUS_ENABLED"]: - try: - virus_free = antivirus_client.scan(self.file_data) - except AntivirusError as e: - raise AntivirusError(message="Antivirus API error", status_code=503) from e - if not virus_free: - raise AntivirusError(message="File did not pass the virus scan", status_code=400) - return virus_free + if not current_app.config["ANTIVIRUS_ENABLED"]: + return False + try: + virus_free = antivirus_client.scan(self.file_data) + except AntivirusError as e: + raise AntivirusError(message="Antivirus API error", status_code=503) from e + if not virus_free: + raise AntivirusError(message="File did not pass the virus scan", status_code=400) @property def mimetype(self): From 1c9890a4628d08b2cd8c491bb114b4d90c912466 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 12:09:10 +0100 Subject: [PATCH 31/63] Encapsulate dict 
introspection in the class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The view layer shouldn’t need to know about the data structures the class is using --- app/file_checks/views.py | 41 ++++++++++++++++++++++++++++------------ app/upload/views.py | 21 ++++++++------------ app/utils/file_checks.py | 4 ++-- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 7f32deba..3567d22d 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -11,6 +11,7 @@ from app import antivirus_client from app.utils import get_mime_type from app.utils.authentication import check_auth +from app.utils.file_checks import FiletypeError from app.utils.files import split_filename from app.utils.validation import ( clean_and_validate_email_address, @@ -34,7 +35,7 @@ class SuccessResponse: mimetype: str -class FiletypeError(Exception): +class AntivirusAndMimeTypeCheckError(Exception): def __init__(self, message=None, status_code=None): self.message = message self.status_code = status_code @@ -97,12 +98,32 @@ def from_request_json(cls, data): # noqa: C901 def get_mime_type_and_run_antivirus_scan_json(self): try: - return {"success": {"virus_free": self.virus_free, "mimetype": self.mimetype}} + return {"success": {"virus_free": self._virus_free, "mimetype": self._mimetype}} except Exception as e: return {"failure": {"error": e.message, "status_code": e.status_code}} @property def virus_free(self): + result = self.get_mime_type_and_run_antivirus_scan_json() + if "failure" in result: + raise AntivirusAndMimeTypeCheckError( + message=result["failure"]["error"], + status_code=result["failure"]["status_code"], + ) + return result["success"]["virus_free"] + + @property + def mimetype(self): + result = self.get_mime_type_and_run_antivirus_scan_json() + if "failure" in result: + raise AntivirusAndMimeTypeCheckError( + message=result.message, + status_code=result.status_code, 
+ ) + return result["success"]["mimetype"] + + @property + def _virus_free(self): if not current_app.config["ANTIVIRUS_ENABLED"]: return False try: @@ -111,9 +132,10 @@ def virus_free(self): raise AntivirusError(message="Antivirus API error", status_code=503) from e if not virus_free: raise AntivirusError(message="File did not pass the virus scan", status_code=400) + return virus_free @property - def mimetype(self): + def _mimetype(self): if self.filename: mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] else: @@ -139,12 +161,7 @@ def get_mime_type_and_run_antivirus_scan(): uploaded_file = UploadedFile.from_request_json(request.json) except BadRequest as e: return jsonify(error=e.description), 400 - result = uploaded_file.get_mime_type_and_run_antivirus_scan_json() - if "success" in result.keys(): - virus_free = result.get("success").get("virus_free") - mimetype = result.get("success").get("mimetype") - return jsonify(virus_free=virus_free, mimetype=mimetype), 200 - if "failure" in result.keys(): - error = result.get("failure").get("error") - status_code = result.get("failure").get("status_code") - return jsonify(error=error), status_code + try: + return jsonify(virus_free=uploaded_file.virus_free, mimetype=uploaded_file.mimetype), 200 + except AntivirusAndMimeTypeCheckError as e: + return jsonify(error=e.message), e.status_code diff --git a/app/upload/views.py b/app/upload/views.py index c902ce2d..14953bb9 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -2,7 +2,7 @@ from werkzeug.exceptions import BadRequest from app import document_store -from app.file_checks.views import UploadedFile +from app.file_checks.views import AntivirusAndMimeTypeCheckError, UploadedFile from app.utils.authentication import check_auth from app.utils.urls import get_direct_file_url, get_frontend_download_url @@ -17,21 +17,16 @@ def upload_document(service_id): except BadRequest as e: return jsonify(error=e.description), 400 - result = 
uploaded_file.get_mime_type_and_run_antivirus_scan_json() - if "success" in result.keys(): - virus_free = result.get("success").get("virus_free") - mimetype = result.get("success").get("mimetype") - if not virus_free: + try: + if not uploaded_file.virus_free: return jsonify(error="File did not pass the virus scan"), 400 - if "failure" in result.keys(): - error = result.get("failure").get("error") - status_code = result.get("failure").get("status_code") - return jsonify(error=error), status_code + except AntivirusAndMimeTypeCheckError as e: + return jsonify(error=e.message), e.status_code document = document_store.put( service_id, uploaded_file.file_data, - mimetype=mimetype, + mimetype=uploaded_file.mimetype, confirmation_email=uploaded_file.confirmation_email, retention_period=uploaded_file.retention_period, filename=uploaded_file.filename, @@ -46,14 +41,14 @@ def upload_document(service_id): service_id=service_id, document_id=document["id"], key=document["encryption_key"], - mimetype=mimetype, + mimetype=uploaded_file.mimetype, ), "url": get_frontend_download_url( service_id=service_id, document_id=document["id"], key=document["encryption_key"], ), - "mimetype": mimetype, + "mimetype": uploaded_file.mimetype, }, ), 201, diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index aad660d7..d3a08472 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -9,8 +9,8 @@ class FiletypeError(Exception): - def __init__(self, error_message=None, status_code=None): - self.error_message = error_message + def __init__(self, message=None, status_code=None): + self.message = message self.status_code = status_code From e4d5e4fcf96c9837457e4553a8dcc7e774a645d7 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 12:21:42 +0100 Subject: [PATCH 32/63] Make getters and setter to validate properties This feels like a natural way to break up and organise this code --- app/file_checks/views.py | 89 +++++++++++++++++++++++++--------------- 1 
file changed, 57 insertions(+), 32 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 3567d22d..f2a1852a 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -49,8 +49,59 @@ def __init__(self, file_data, is_csv, confirmation_email, retention_period, file self.confirmation_email = confirmation_email self.retention_period = retention_period + @property + def is_csv(self): + return self._is_csv + + @is_csv.setter + def is_csv(self, value): + if value is None: + value = False + if not isinstance(value, bool): + raise BadRequest("Value for is_csv must be a boolean") + self._is_csv = value + + @property + def confirmation_email(self): + return getattr(self, "_confirmation_email", None) + + @confirmation_email.setter + def confirmation_email(self, value): + if value is None: + return + try: + self._confirmation_email = clean_and_validate_email_address(value) + except InvalidEmailError as e: + raise BadRequest(str(e)) from e + + @property + def retention_period(self): + return getattr(self, "_retention_period", None) + + @retention_period.setter + def retention_period(self, value): + if value is None: + return + try: + self._retention_period = clean_and_validate_retention_period(value) + except ValueError as e: + raise BadRequest(str(e)) from e + + @property + def filename(self): + return getattr(self, "_filename", None) + + @filename.setter + def filename(self, value): + if value is None: + return + try: + self._filename = validate_filename(value) + except ValueError as e: + raise BadRequest(str(e)) from e + @classmethod - def from_request_json(cls, data): # noqa: C901 + def from_request_json(cls, data): if "document" not in data: raise BadRequest("No document upload") @@ -61,39 +112,13 @@ def from_request_json(cls, data): # noqa: C901 if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: abort(413) - file_data = BytesIO(raw_content) - is_csv = data.get("is_csv", False) - - if not isinstance(is_csv, bool): - 
raise BadRequest("Value for is_csv must be a boolean") - - confirmation_email = data.get("confirmation_email", None) - if confirmation_email is not None: - try: - confirmation_email = clean_and_validate_email_address(confirmation_email) - except InvalidEmailError as e: - raise BadRequest(str(e)) from e - - retention_period = data.get("retention_period", None) - if retention_period is not None: - try: - retention_period = clean_and_validate_retention_period(retention_period) - except ValueError as e: - raise BadRequest(str(e)) from e - - filename = data.get("filename", None) - if filename: - try: - filename = validate_filename(filename) - except ValueError as e: - raise BadRequest(str(e)) from e return cls( - file_data=file_data, - is_csv=is_csv, - confirmation_email=confirmation_email, - retention_period=retention_period, - filename=filename, + file_data=BytesIO(raw_content), + is_csv=data.get("is_csv"), + confirmation_email=data.get("confirmation_email"), + retention_period=data.get("retention_period"), + filename=data.get("filename", None), ) def get_mime_type_and_run_antivirus_scan_json(self): From 32409edd8f662dfa74320141e3b3a0b12879f41b Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 12:26:10 +0100 Subject: [PATCH 33/63] Refactor to remove duplication --- app/file_checks/views.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index f2a1852a..99c5c371 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -127,25 +127,22 @@ def get_mime_type_and_run_antivirus_scan_json(self): except Exception as e: return {"failure": {"error": e.message, "status_code": e.status_code}} - @property - def virus_free(self): + def get_check(self, check): result = self.get_mime_type_and_run_antivirus_scan_json() if "failure" in result: raise AntivirusAndMimeTypeCheckError( message=result["failure"]["error"], status_code=result["failure"]["status_code"], ) 
- return result["success"]["virus_free"] + return result["success"][check] + + @property + def virus_free(self): + return self.get_check("virus_free") @property def mimetype(self): - result = self.get_mime_type_and_run_antivirus_scan_json() - if "failure" in result: - raise AntivirusAndMimeTypeCheckError( - message=result.message, - status_code=result.status_code, - ) - return result["success"]["mimetype"] + return self.get_check("mimetype") @property def _virus_free(self): From 0581b7e9d4b0d45fab1c176ba572763ebe8dbeaa Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 12:33:11 +0100 Subject: [PATCH 34/63] Do checks on init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So exceptions are always raised from the same place, and the calling code doesn’t have to do multiple catches, or wrap multiple statements in a `try` --- app/file_checks/views.py | 14 ++++---------- app/upload/views.py | 7 +++---- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 99c5c371..07890618 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -48,6 +48,8 @@ def __init__(self, file_data, is_csv, confirmation_email, retention_period, file self.filename = filename self.confirmation_email = confirmation_email self.retention_period = retention_period + self.virus_free = self.get_check("virus_free") + self.mimetype = self.get_check("mimetype") @property def is_csv(self): @@ -136,14 +138,6 @@ def get_check(self, check): ) return result["success"][check] - @property - def virus_free(self): - return self.get_check("virus_free") - - @property - def mimetype(self): - return self.get_check("mimetype") - @property def _virus_free(self): if not current_app.config["ANTIVIRUS_ENABLED"]: @@ -183,7 +177,7 @@ def get_mime_type_and_run_antivirus_scan(): uploaded_file = UploadedFile.from_request_json(request.json) except BadRequest as e: return 
jsonify(error=e.description), 400 - try: - return jsonify(virus_free=uploaded_file.virus_free, mimetype=uploaded_file.mimetype), 200 except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code + + return jsonify(virus_free=uploaded_file.virus_free, mimetype=uploaded_file.mimetype), 200 diff --git a/app/upload/views.py b/app/upload/views.py index 14953bb9..740d848e 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -16,13 +16,12 @@ def upload_document(service_id): uploaded_file = UploadedFile.from_request_json(request.json) except BadRequest as e: return jsonify(error=e.description), 400 - - try: - if not uploaded_file.virus_free: - return jsonify(error="File did not pass the virus scan"), 400 except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code + if not uploaded_file.virus_free: + return jsonify(error="File did not pass the virus scan"), 400 + document = document_store.put( service_id, uploaded_file.file_data, From 885ef3baa15676aff125bc6b0c42e8256fcca224 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 12:36:17 +0100 Subject: [PATCH 35/63] Remove unused classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These were just a sketch of something. 
Don’t think it’s necessary because we can handle the serialisation/deserialisation inside the `UploadedFiles` class instead of using a model which wraps the `dict` directly --- app/file_checks/views.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 07890618..111d6d53 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -1,6 +1,5 @@ import mimetypes from base64 import b64decode, binascii -from dataclasses import dataclass from io import BytesIO from flask import Blueprint, abort, current_app, jsonify, request @@ -23,18 +22,6 @@ file_checks_blueprint.before_request(check_auth) -@dataclass -class ErrorResponse: - error: str - status_code: int - - -@dataclass -class SuccessResponse: - virus_free: bool - mimetype: str - - class AntivirusAndMimeTypeCheckError(Exception): def __init__(self, message=None, status_code=None): self.message = message From 4924a7e0f5083c2a319b743e955a86b2408cac4c Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 12:53:51 +0100 Subject: [PATCH 36/63] Move class into utils MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It’s shared between multiple views so shouldn’t sit in one or the other --- app/file_checks/views.py | 153 +------------------------------- app/upload/views.py | 2 +- app/utils/file_checks.py | 147 +++++++++++++++++++++++++++++- tests/file_checks/test_views.py | 2 +- tests/upload/test_views.py | 2 +- 5 files changed, 149 insertions(+), 157 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 111d6d53..9ceafbfe 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -1,163 +1,14 @@ -import mimetypes -from base64 import b64decode, binascii -from io import BytesIO -from flask import Blueprint, abort, current_app, jsonify, request -from notifications_utils.clients.antivirus.antivirus_client import AntivirusError -from 
notifications_utils.recipient_validation.errors import InvalidEmailError +from flask import Blueprint, jsonify, request from werkzeug.exceptions import BadRequest -from app import antivirus_client -from app.utils import get_mime_type from app.utils.authentication import check_auth -from app.utils.file_checks import FiletypeError -from app.utils.files import split_filename -from app.utils.validation import ( - clean_and_validate_email_address, - clean_and_validate_retention_period, - validate_filename, -) +from app.utils.file_checks import AntivirusAndMimeTypeCheckError, UploadedFile file_checks_blueprint = Blueprint("file_checks", __name__, url_prefix="") file_checks_blueprint.before_request(check_auth) -class AntivirusAndMimeTypeCheckError(Exception): - def __init__(self, message=None, status_code=None): - self.message = message - self.status_code = status_code - - -class UploadedFile: - def __init__(self, file_data, is_csv, confirmation_email, retention_period, filename): - self.file_data = file_data - self.is_csv = is_csv - self.filename = filename - self.confirmation_email = confirmation_email - self.retention_period = retention_period - self.virus_free = self.get_check("virus_free") - self.mimetype = self.get_check("mimetype") - - @property - def is_csv(self): - return self._is_csv - - @is_csv.setter - def is_csv(self, value): - if value is None: - value = False - if not isinstance(value, bool): - raise BadRequest("Value for is_csv must be a boolean") - self._is_csv = value - - @property - def confirmation_email(self): - return getattr(self, "_confirmation_email", None) - - @confirmation_email.setter - def confirmation_email(self, value): - if value is None: - return - try: - self._confirmation_email = clean_and_validate_email_address(value) - except InvalidEmailError as e: - raise BadRequest(str(e)) from e - - @property - def retention_period(self): - return getattr(self, "_retention_period", None) - - @retention_period.setter - def retention_period(self, 
value): - if value is None: - return - try: - self._retention_period = clean_and_validate_retention_period(value) - except ValueError as e: - raise BadRequest(str(e)) from e - - @property - def filename(self): - return getattr(self, "_filename", None) - - @filename.setter - def filename(self, value): - if value is None: - return - try: - self._filename = validate_filename(value) - except ValueError as e: - raise BadRequest(str(e)) from e - - @classmethod - def from_request_json(cls, data): - if "document" not in data: - raise BadRequest("No document upload") - - try: - raw_content = b64decode(data["document"]) - except (binascii.Error, ValueError) as e: - raise BadRequest("Document is not base64 encoded") from e - - if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: - abort(413) - - return cls( - file_data=BytesIO(raw_content), - is_csv=data.get("is_csv"), - confirmation_email=data.get("confirmation_email"), - retention_period=data.get("retention_period"), - filename=data.get("filename", None), - ) - - def get_mime_type_and_run_antivirus_scan_json(self): - try: - return {"success": {"virus_free": self._virus_free, "mimetype": self._mimetype}} - except Exception as e: - return {"failure": {"error": e.message, "status_code": e.status_code}} - - def get_check(self, check): - result = self.get_mime_type_and_run_antivirus_scan_json() - if "failure" in result: - raise AntivirusAndMimeTypeCheckError( - message=result["failure"]["error"], - status_code=result["failure"]["status_code"], - ) - return result["success"][check] - - @property - def _virus_free(self): - if not current_app.config["ANTIVIRUS_ENABLED"]: - return False - try: - virus_free = antivirus_client.scan(self.file_data) - except AntivirusError as e: - raise AntivirusError(message="Antivirus API error", status_code=503) from e - if not virus_free: - raise AntivirusError(message="File did not pass the virus scan", status_code=400) - return virus_free - - @property - def _mimetype(self): - if 
self.filename: - mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] - else: - mimetype = get_mime_type(self.file_data) - # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use - # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv - if self.is_csv and mimetype == "text/plain": - mimetype = "text/csv" - if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: - allowed_file_types = ", ".join( - sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) - ) - raise FiletypeError( - message=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}", - status_code=400, - ) - return mimetype - - @file_checks_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) def get_mime_type_and_run_antivirus_scan(): try: diff --git a/app/upload/views.py b/app/upload/views.py index 740d848e..8d145a09 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -2,8 +2,8 @@ from werkzeug.exceptions import BadRequest from app import document_store -from app.file_checks.views import AntivirusAndMimeTypeCheckError, UploadedFile from app.utils.authentication import check_auth +from app.utils.file_checks import AntivirusAndMimeTypeCheckError, UploadedFile from app.utils.urls import get_direct_file_url, get_frontend_download_url upload_blueprint = Blueprint("upload", __name__, url_prefix="") diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index d3a08472..7ef61a3e 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -1,11 +1,20 @@ import mimetypes +from base64 import b64decode, binascii +from io import BytesIO -from flask import current_app +from flask import abort, current_app from notifications_utils.clients.antivirus.antivirus_client import AntivirusError +from notifications_utils.recipient_validation.errors import InvalidEmailError +from werkzeug.exceptions import BadRequest from app 
import antivirus_client from app.utils import get_mime_type from app.utils.files import split_filename +from app.utils.validation import ( + clean_and_validate_email_address, + clean_and_validate_retention_period, + validate_filename, +) class FiletypeError(Exception): @@ -14,8 +23,140 @@ def __init__(self, message=None, status_code=None): self.status_code = status_code -def run_antivirus_and_mimetype_checks(file_data): - pass +class AntivirusAndMimeTypeCheckError(Exception): + def __init__(self, message=None, status_code=None): + self.message = message + self.status_code = status_code + + +class UploadedFile: + def __init__(self, file_data, is_csv, confirmation_email, retention_period, filename): + self.file_data = file_data + self.is_csv = is_csv + self.filename = filename + self.confirmation_email = confirmation_email + self.retention_period = retention_period + self.virus_free = self.get_check("virus_free") + self.mimetype = self.get_check("mimetype") + + @property + def is_csv(self): + return self._is_csv + + @is_csv.setter + def is_csv(self, value): + if value is None: + value = False + if not isinstance(value, bool): + raise BadRequest("Value for is_csv must be a boolean") + self._is_csv = value + + @property + def confirmation_email(self): + return getattr(self, "_confirmation_email", None) + + @confirmation_email.setter + def confirmation_email(self, value): + if value is None: + return + try: + self._confirmation_email = clean_and_validate_email_address(value) + except InvalidEmailError as e: + raise BadRequest(str(e)) from e + + @property + def retention_period(self): + return getattr(self, "_retention_period", None) + + @retention_period.setter + def retention_period(self, value): + if value is None: + return + try: + self._retention_period = clean_and_validate_retention_period(value) + except ValueError as e: + raise BadRequest(str(e)) from e + + @property + def filename(self): + return getattr(self, "_filename", None) + + @filename.setter + def 
filename(self, value): + if value is None: + return + try: + self._filename = validate_filename(value) + except ValueError as e: + raise BadRequest(str(e)) from e + + @classmethod + def from_request_json(cls, data): + if "document" not in data: + raise BadRequest("No document upload") + + try: + raw_content = b64decode(data["document"]) + except (binascii.Error, ValueError) as e: + raise BadRequest("Document is not base64 encoded") from e + + if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: + abort(413) + + return cls( + file_data=BytesIO(raw_content), + is_csv=data.get("is_csv"), + confirmation_email=data.get("confirmation_email"), + retention_period=data.get("retention_period"), + filename=data.get("filename", None), + ) + + def get_mime_type_and_run_antivirus_scan_json(self): + try: + return {"success": {"virus_free": self._virus_free, "mimetype": self._mimetype}} + except Exception as e: + return {"failure": {"error": e.message, "status_code": e.status_code}} + + def get_check(self, check): + result = self.get_mime_type_and_run_antivirus_scan_json() + if "failure" in result: + raise AntivirusAndMimeTypeCheckError( + message=result["failure"]["error"], + status_code=result["failure"]["status_code"], + ) + return result["success"][check] + + @property + def _virus_free(self): + if not current_app.config["ANTIVIRUS_ENABLED"]: + return False + try: + virus_free = antivirus_client.scan(self.file_data) + except AntivirusError as e: + raise AntivirusError(message="Antivirus API error", status_code=503) from e + if not virus_free: + raise AntivirusError(message="File did not pass the virus scan", status_code=400) + return virus_free + + @property + def _mimetype(self): + if self.filename: + mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] + else: + mimetype = get_mime_type(self.file_data) + # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use + # an explicit POST body parameter `is_csv` 
from the caller to resolve it as text/csv + if self.is_csv and mimetype == "text/plain": + mimetype = "text/csv" + if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: + allowed_file_types = ", ".join( + sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) + ) + raise FiletypeError( + message=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}", + status_code=400, + ) + return mimetype def run_antivirus_checks(file_data): diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 53713672..6cf5b5a3 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -16,7 +16,7 @@ def _file_checks(client, file_content): @pytest.fixture def antivirus(mocker): return mocker.patch( - "app.file_checks.views.antivirus_client", + "app.utils.file_checks.antivirus_client", # prevent LocalProxy being detected as an async function new_callable=mocker.MagicMock, ) diff --git a/tests/upload/test_views.py b/tests/upload/test_views.py index 4cff629d..d889cadf 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -21,7 +21,7 @@ def store(mocker): @pytest.fixture def antivirus(mocker): return mocker.patch( - "app.file_checks.views.antivirus_client", + "app.utils.file_checks.antivirus_client", # prevent LocalProxy being detected as an async function new_callable=mocker.MagicMock, ) From 1d76ae2a89b5e010b713fd0658ac5f89bc1c2667 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 12:54:42 +0100 Subject: [PATCH 37/63] Remove unused code --- app/utils/file_checks.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 7ef61a3e..e773a3d1 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -158,37 +158,3 @@ def _mimetype(self): ) return mimetype - -def run_antivirus_checks(file_data): - if 
current_app.config["ANTIVIRUS_ENABLED"]: - results = {} - try: - virus_free = antivirus_client.scan(file_data) - except AntivirusError: - return {"message": "Antivirus API error", "status_code": 503} - if virus_free: - results["virus_free"] = True - else: - results["virus_free"] = False - return results - return - - -def run_mimetype_checks(file_data, is_csv, filename=None): - if filename: - mimetype = mimetypes.types_map[split_filename(filename, dotted=True)[1]] - else: - mimetype = get_mime_type(file_data) - # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use - # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv - if is_csv and mimetype == "text/plain": - mimetype = "text/csv" - if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: - allowed_file_types = ", ".join( - sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) - ) - raise FiletypeError( - error_message=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}", - status_code=400, - ) - return mimetype From 1bcc6356c5f2989481aa073164519cb9cdc25cec Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 13:20:52 +0100 Subject: [PATCH 38/63] Ensure file checks are always called with a hash of the contents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow us to cache this function further down the line. Adds a second hash for integrity checking – makes sure the function is never called with the wrong hash. Hashing should be ridiculously fast for 2MB files so we don’t need to worry about the performance implications of doing it once or twice. 
--- app/file_checks/views.py | 1 - app/utils/file_checks.py | 14 +++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 9ceafbfe..fb4e9ef4 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -1,4 +1,3 @@ - from flask import Blueprint, jsonify, request from werkzeug.exceptions import BadRequest diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index e773a3d1..16859131 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -1,5 +1,6 @@ import mimetypes from base64 import b64decode, binascii +from hashlib import sha1 from io import BytesIO from flask import abort, current_app @@ -111,14 +112,22 @@ def from_request_json(cls, data): filename=data.get("filename", None), ) - def get_mime_type_and_run_antivirus_scan_json(self): + @property + def file_data_hash(self): + file_data_hash = sha1(self.file_data.read()).hexdigest() + self.file_data.seek(0) + return file_data_hash + + def get_mime_type_and_run_antivirus_scan_json(self, file_data_hash): + if file_data_hash != self.file_data_hash: + raise RuntimeError("Wrong hash value passed to cache") try: return {"success": {"virus_free": self._virus_free, "mimetype": self._mimetype}} except Exception as e: return {"failure": {"error": e.message, "status_code": e.status_code}} def get_check(self, check): - result = self.get_mime_type_and_run_antivirus_scan_json() + result = self.get_mime_type_and_run_antivirus_scan_json(self.file_data_hash) if "failure" in result: raise AntivirusAndMimeTypeCheckError( message=result["failure"]["error"], @@ -157,4 +166,3 @@ def _mimetype(self): status_code=400, ) return mimetype - From 37ff0b6f501df111cc4fbaa5cdbbc8ed92be6376 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 13:24:48 +0100 Subject: [PATCH 39/63] Add caching of mime type and virus checks These can be quite slow, so where the same file is uploaded multiple times we can cache the 
result and avoid repeating them. --- app/utils/file_checks.py | 14 ++++++---- tests/file_checks/test_views.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 16859131..681a83f9 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -5,10 +5,11 @@ from flask import abort, current_app from notifications_utils.clients.antivirus.antivirus_client import AntivirusError +from notifications_utils.clients.redis import RequestCache from notifications_utils.recipient_validation.errors import InvalidEmailError from werkzeug.exceptions import BadRequest -from app import antivirus_client +from app import antivirus_client, redis_client from app.utils import get_mime_type from app.utils.files import split_filename from app.utils.validation import ( @@ -17,6 +18,8 @@ validate_filename, ) +cache = RequestCache(redis_client) + class FiletypeError(Exception): def __init__(self, message=None, status_code=None): @@ -37,8 +40,7 @@ def __init__(self, file_data, is_csv, confirmation_email, retention_period, file self.filename = filename self.confirmation_email = confirmation_email self.retention_period = retention_period - self.virus_free = self.get_check("virus_free") - self.mimetype = self.get_check("mimetype") + self.virus_free, self.mimetype = self.virus_free_and_mimetype @property def is_csv(self): @@ -118,6 +120,7 @@ def file_data_hash(self): self.file_data.seek(0) return file_data_hash + @cache.set("file-checks-{file_data_hash}") def get_mime_type_and_run_antivirus_scan_json(self, file_data_hash): if file_data_hash != self.file_data_hash: raise RuntimeError("Wrong hash value passed to cache") @@ -126,14 +129,15 @@ def get_mime_type_and_run_antivirus_scan_json(self, file_data_hash): except Exception as e: return {"failure": {"error": e.message, "status_code": e.status_code}} - def get_check(self, check): + @property + def virus_free_and_mimetype(self): 
result = self.get_mime_type_and_run_antivirus_scan_json(self.file_data_hash) if "failure" in result: raise AntivirusAndMimeTypeCheckError( message=result["failure"]["error"], status_code=result["failure"]["status_code"], ) - return result["success"][check] + return result["success"]["virus_free"], result["success"]["mimetype"] @property def _virus_free(self): diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 6cf5b5a3..bf206dcf 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -1,4 +1,5 @@ import base64 +from unittest.mock import call import pytest from notifications_utils.clients.antivirus.antivirus_client import AntivirusError @@ -68,3 +69,50 @@ def test_file_checks_unknown_type(client, antivirus): "Supported types are: '.csv', '.doc', '.docx', '.jpeg', '.jpg', '.json', '.odt', '.pdf', '.png', '.rtf', " "'.txt', '.xlsx'" ) + + +def test_virus_check_puts_value_in_cache(client, mocker, antivirus): + antivirus.scan.return_value = True + mock_redis_set = mocker.patch("app.redis_client.set") + + _file_checks(client, b"%PDF-1.4 first file contents") + _file_checks(client, b"second file contents") + + assert mock_redis_set.call_args_list == [ + call( + "file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a", + '{"success": {"virus_free": true, "mimetype": "application/pdf"}}', + ex=2419200, + ), + call( + "file-checks-85336573f4f627cefb440bc2140c9a6b4925355b", + '{"success": {"virus_free": true, "mimetype": "text/plain"}}', + ex=2419200, + ), + ] + + +def test_virus_check_returns_value_from_cache(client, mocker): + mock_redis_get = mocker.patch( + "app.redis_client.get", + return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), + ) + + file_1_content = b"%PDF-1.4 first file contents" + file_2_content = b"%PDF-1.4 second file contents" + + for _ in range(3): + response_1 = _file_checks(client, file_1_content) + response_2 = _file_checks(client, file_2_content) + + assert 
response_1.status_code == response_2.status_code == 418 + assert response_1.json == response_2.json == {"error": "I’m a teapot"} + + assert mock_redis_get.call_args_list == [ + call("file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a"), + call("file-checks-93fb06037e8211f2fe7fbffe31b69ec0df48789e"), + call("file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a"), + call("file-checks-93fb06037e8211f2fe7fbffe31b69ec0df48789e"), + call("file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a"), + call("file-checks-93fb06037e8211f2fe7fbffe31b69ec0df48789e"), + ] From 924bf4a9b448d8fb1a8d852bae7e546ad2db1467 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 13:25:45 +0100 Subject: [PATCH 40/63] Catch specific exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It’s bad practice to blanket catch the base exception class, because you could be hiding unexpected errors --- app/utils/file_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 681a83f9..58e6dc6a 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -126,7 +126,7 @@ def get_mime_type_and_run_antivirus_scan_json(self, file_data_hash): raise RuntimeError("Wrong hash value passed to cache") try: return {"success": {"virus_free": self._virus_free, "mimetype": self._mimetype}} - except Exception as e: + except (AntivirusError, FiletypeError) as e: return {"failure": {"error": e.message, "status_code": e.status_code}} @property From 342b6bbfb2597c87ec6480aecc1bbeaf00a5154c Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 14:07:35 +0100 Subject: [PATCH 41/63] Move classmethod up I like classmethods to be near the init method, since the former calls the latter --- app/utils/file_checks.py | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/app/utils/file_checks.py 
b/app/utils/file_checks.py index 58e6dc6a..737b504b 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -42,6 +42,27 @@ def __init__(self, file_data, is_csv, confirmation_email, retention_period, file self.retention_period = retention_period self.virus_free, self.mimetype = self.virus_free_and_mimetype + @classmethod + def from_request_json(cls, data): + if "document" not in data: + raise BadRequest("No document upload") + + try: + raw_content = b64decode(data["document"]) + except (binascii.Error, ValueError) as e: + raise BadRequest("Document is not base64 encoded") from e + + if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: + abort(413) + + return cls( + file_data=BytesIO(raw_content), + is_csv=data.get("is_csv"), + confirmation_email=data.get("confirmation_email"), + retention_period=data.get("retention_period"), + filename=data.get("filename", None), + ) + @property def is_csv(self): return self._is_csv @@ -93,27 +114,6 @@ def filename(self, value): except ValueError as e: raise BadRequest(str(e)) from e - @classmethod - def from_request_json(cls, data): - if "document" not in data: - raise BadRequest("No document upload") - - try: - raw_content = b64decode(data["document"]) - except (binascii.Error, ValueError) as e: - raise BadRequest("Document is not base64 encoded") from e - - if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: - abort(413) - - return cls( - file_data=BytesIO(raw_content), - is_csv=data.get("is_csv"), - confirmation_email=data.get("confirmation_email"), - retention_period=data.get("retention_period"), - filename=data.get("filename", None), - ) - @property def file_data_hash(self): file_data_hash = sha1(self.file_data.read()).hexdigest() From 1c8fc1d620f032255ce640ae570026182b3da5df Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 14:16:21 +0100 Subject: [PATCH 42/63] Only raise one type of exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit We don’t have different code paths for the different types of exception, so they don’t need to be different --- app/file_checks/views.py | 3 --- app/upload/views.py | 3 --- app/utils/file_checks.py | 20 +++++++++----------- tests/upload/test_views.py | 6 +++--- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index fb4e9ef4..7b0d5b61 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -1,5 +1,4 @@ from flask import Blueprint, jsonify, request -from werkzeug.exceptions import BadRequest from app.utils.authentication import check_auth from app.utils.file_checks import AntivirusAndMimeTypeCheckError, UploadedFile @@ -12,8 +11,6 @@ def get_mime_type_and_run_antivirus_scan(): try: uploaded_file = UploadedFile.from_request_json(request.json) - except BadRequest as e: - return jsonify(error=e.description), 400 except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code diff --git a/app/upload/views.py b/app/upload/views.py index 8d145a09..4efba99d 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -1,5 +1,4 @@ from flask import Blueprint, jsonify, request -from werkzeug.exceptions import BadRequest from app import document_store from app.utils.authentication import check_auth @@ -14,8 +13,6 @@ def upload_document(service_id): try: uploaded_file = UploadedFile.from_request_json(request.json) - except BadRequest as e: - return jsonify(error=e.description), 400 except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 737b504b..e2db3904 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -7,7 +7,6 @@ from notifications_utils.clients.antivirus.antivirus_client import AntivirusError from notifications_utils.clients.redis import RequestCache from notifications_utils.recipient_validation.errors 
import InvalidEmailError -from werkzeug.exceptions import BadRequest from app import antivirus_client, redis_client from app.utils import get_mime_type @@ -22,13 +21,13 @@ class FiletypeError(Exception): - def __init__(self, message=None, status_code=None): + def __init__(self, message=None, status_code=400): self.message = message self.status_code = status_code class AntivirusAndMimeTypeCheckError(Exception): - def __init__(self, message=None, status_code=None): + def __init__(self, message=None, status_code=400): self.message = message self.status_code = status_code @@ -45,12 +44,12 @@ def __init__(self, file_data, is_csv, confirmation_email, retention_period, file @classmethod def from_request_json(cls, data): if "document" not in data: - raise BadRequest("No document upload") + raise AntivirusAndMimeTypeCheckError("No document upload") try: raw_content = b64decode(data["document"]) except (binascii.Error, ValueError) as e: - raise BadRequest("Document is not base64 encoded") from e + raise AntivirusAndMimeTypeCheckError("Document is not base64 encoded") from e if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: abort(413) @@ -72,7 +71,7 @@ def is_csv(self, value): if value is None: value = False if not isinstance(value, bool): - raise BadRequest("Value for is_csv must be a boolean") + raise AntivirusAndMimeTypeCheckError("Value for is_csv must be a boolean") self._is_csv = value @property @@ -86,7 +85,7 @@ def confirmation_email(self, value): try: self._confirmation_email = clean_and_validate_email_address(value) except InvalidEmailError as e: - raise BadRequest(str(e)) from e + raise AntivirusAndMimeTypeCheckError(str(e)) from e @property def retention_period(self): @@ -99,7 +98,7 @@ def retention_period(self, value): try: self._retention_period = clean_and_validate_retention_period(value) except ValueError as e: - raise BadRequest(str(e)) from e + raise AntivirusAndMimeTypeCheckError(str(e)) from e @property def filename(self): @@ -112,7 +111,7 
@@ def filename(self, value): try: self._filename = validate_filename(value) except ValueError as e: - raise BadRequest(str(e)) from e + raise AntivirusAndMimeTypeCheckError(str(e)) from e @property def file_data_hash(self): @@ -166,7 +165,6 @@ def _mimetype(self): sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) ) raise FiletypeError( - message=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}", - status_code=400, + message=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}" ) return mimetype diff --git a/tests/upload/test_views.py b/tests/upload/test_views.py index d889cadf..f04d355c 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -3,10 +3,10 @@ import pytest from notifications_utils.clients.antivirus.antivirus_client import AntivirusError -from werkzeug.exceptions import BadRequest import app from app.upload.views import UploadedFile +from app.utils.file_checks import AntivirusAndMimeTypeCheckError @pytest.fixture @@ -430,7 +430,7 @@ def test_document_upload_bad_is_csv_value(client): ), ) def test_get_upload_document_request_data_errors(app, data, expected_error): - with pytest.raises(BadRequest) as e: + with pytest.raises(AntivirusAndMimeTypeCheckError) as e: UploadedFile.from_request_json(data) - assert e.value.description == expected_error + assert e.value.message == expected_error From adc9f810c10ec90052557db2d1785ea8cf195205 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 15:11:50 +0100 Subject: [PATCH 43/63] Vary the hash key for different filenames and values of `is_csv` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These can control the mime type so we shouldn’t cache under the same key if they change. 
--- app/utils/file_checks.py | 8 +++++-- tests/file_checks/test_views.py | 41 +++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index e2db3904..a4831971 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -115,9 +115,13 @@ def filename(self, value): @property def file_data_hash(self): - file_data_hash = sha1(self.file_data.read()).hexdigest() + contents = bytearray(self.file_data.read()) self.file_data.seek(0) - return file_data_hash + + contents += bytes(self.is_csv) + contents += str(self.filename).encode() + + return sha1(contents).hexdigest() @cache.set("file-checks-{file_data_hash}") def get_mime_type_and_run_antivirus_scan_json(self, file_data_hash): diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index bf206dcf..818413e2 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -5,11 +5,15 @@ from notifications_utils.clients.antivirus.antivirus_client import AntivirusError -def _file_checks(client, file_content): +def _file_checks(client, file_content, is_csv=None, filename=None): url = "/antivirus_and_mimetype_check" data = { "document": base64.b64encode(file_content).decode("utf-8"), } + if is_csv is not None: + data |= {"is_csv": is_csv} + if filename is not None: + data |= {"filename": filename} response = client.post(url, json=data) return response @@ -80,12 +84,12 @@ def test_virus_check_puts_value_in_cache(client, mocker, antivirus): assert mock_redis_set.call_args_list == [ call( - "file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a", + "file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5", '{"success": {"virus_free": true, "mimetype": "application/pdf"}}', ex=2419200, ), call( - "file-checks-85336573f4f627cefb440bc2140c9a6b4925355b", + "file-checks-9c8b0f33cd678ce620fced273bbc9950bd3350e7", '{"success": {"virus_free": true, "mimetype": "text/plain"}}', ex=2419200, ), @@ 
-109,10 +113,29 @@ def test_virus_check_returns_value_from_cache(client, mocker): assert response_1.json == response_2.json == {"error": "I’m a teapot"} assert mock_redis_get.call_args_list == [ - call("file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a"), - call("file-checks-93fb06037e8211f2fe7fbffe31b69ec0df48789e"), - call("file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a"), - call("file-checks-93fb06037e8211f2fe7fbffe31b69ec0df48789e"), - call("file-checks-d90dbad3ec3d280ac1190458b692d56661f7410a"), - call("file-checks-93fb06037e8211f2fe7fbffe31b69ec0df48789e"), + call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), + call("file-checks-da30062e97f8d78fa771dcb8f7b9bae884f0e61e"), + call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), + call("file-checks-da30062e97f8d78fa771dcb8f7b9bae884f0e61e"), + call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), + call("file-checks-da30062e97f8d78fa771dcb8f7b9bae884f0e61e"), + ] + + +def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): + mock_redis_get = mocker.patch( + "app.redis_client.get", + return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), + ) + + file_content = b"%PDF-1.4 first file contents" + + _file_checks(client, file_content) + _file_checks(client, file_content, filename="foo.pdf") + _file_checks(client, file_content, filename="foo.pdf", is_csv=True) + + assert mock_redis_get.call_args_list == [ + call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), + call("file-checks-3c7955f4f94c940efa494dd4cc9a5c171b8f863a"), + call("file-checks-e5734036f6ab95fade9d49a4ff8496489cf03293"), ] From 1041c246dafbffa2eb2a739849da23332d28327a Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 15:37:37 +0100 Subject: [PATCH 44/63] =?UTF-8?q?Use=20kebab=20case=20for=20URL=20paths=20?= =?UTF-8?q?=F0=9F=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It’s what we do 
elsewhere --- app/file_checks/views.py | 2 +- tests/file_checks/test_views.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 7b0d5b61..01cf9264 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -7,7 +7,7 @@ file_checks_blueprint.before_request(check_auth) -@file_checks_blueprint.route("/antivirus_and_mimetype_check", methods=["POST"]) +@file_checks_blueprint.route("/antivirus-and-mimetype-check", methods=["POST"]) def get_mime_type_and_run_antivirus_scan(): try: uploaded_file = UploadedFile.from_request_json(request.json) diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 818413e2..44c780f1 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -6,7 +6,7 @@ def _file_checks(client, file_content, is_csv=None, filename=None): - url = "/antivirus_and_mimetype_check" + url = "/antivirus-and-mimetype-check" data = { "document": base64.b64encode(file_content).decode("utf-8"), } @@ -56,7 +56,7 @@ def test_file_checks_virus_scan_error(client, antivirus): def test_file_checks_invalid_encoding(client): - response = client.post("/antivirus_and_mimetype_check", json={"document": "foo"}) + response = client.post("/antivirus-and-mimetype-check", json={"document": "foo"}) assert response.status_code == 400 assert response.json == {"error": "Document is not base64 encoded"} From db8b58766d825d253d7cda4c613423d904657eab Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 15:50:48 +0100 Subject: [PATCH 45/63] Reduce cache lifetime to 24 hours We want to re-check files for viruses after 24 hours, in case the definitions have changed. 
--- app/utils/file_checks.py | 2 +- tests/file_checks/test_views.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index a4831971..131a618b 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -123,7 +123,7 @@ def file_data_hash(self): return sha1(contents).hexdigest() - @cache.set("file-checks-{file_data_hash}") + @cache.set("file-checks-{file_data_hash}", ttl_in_seconds=86_400) def get_mime_type_and_run_antivirus_scan_json(self, file_data_hash): if file_data_hash != self.file_data_hash: raise RuntimeError("Wrong hash value passed to cache") diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 44c780f1..1d67bacf 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -86,12 +86,12 @@ def test_virus_check_puts_value_in_cache(client, mocker, antivirus): call( "file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5", '{"success": {"virus_free": true, "mimetype": "application/pdf"}}', - ex=2419200, + ex=86_400, ), call( "file-checks-9c8b0f33cd678ce620fced273bbc9950bd3350e7", '{"success": {"virus_free": true, "mimetype": "text/plain"}}', - ex=2419200, + ex=86_400, ), ] From 3416747e2c5cb0430f28cab70482e953db7c3a7f Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 16:32:55 +0100 Subject: [PATCH 46/63] Add more tests for serialisation/deserialisation We were a bit light on test coverage here --- app/utils/file_checks.py | 5 +++ tests/file_checks/test_views.py | 54 ++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 131a618b..115f7448 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -140,6 +140,11 @@ def virus_free_and_mimetype(self): message=result["failure"]["error"], status_code=result["failure"]["status_code"], ) + if not result["success"]["virus_free"]: + raise 
AntivirusAndMimeTypeCheckError( + message="File did not pass the virus scan", + status_code=400, + ) return result["success"]["virus_free"], result["success"]["mimetype"] @property diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 1d67bacf..5d211735 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -75,8 +75,30 @@ def test_file_checks_unknown_type(client, antivirus): ) -def test_virus_check_puts_value_in_cache(client, mocker, antivirus): - antivirus.scan.return_value = True +@pytest.mark.parametrize( + "antivirus_scan_result, expected_first_cache_value, expected_second_cache_value", + ( + ( + True, + '{"success": {"virus_free": true, "mimetype": "application/pdf"}}', + '{"success": {"virus_free": true, "mimetype": "text/plain"}}', + ), + ( + False, + '{"failure": {"error": "File did not pass the virus scan", "status_code": 400}}', + '{"failure": {"error": "File did not pass the virus scan", "status_code": 400}}', + ), + ), +) +def test_virus_check_puts_value_in_cache( + client, + mocker, + antivirus, + antivirus_scan_result, + expected_first_cache_value, + expected_second_cache_value, +): + antivirus.scan.return_value = antivirus_scan_result mock_redis_set = mocker.patch("app.redis_client.set") _file_checks(client, b"%PDF-1.4 first file contents") @@ -85,12 +107,12 @@ def test_virus_check_puts_value_in_cache(client, mocker, antivirus): assert mock_redis_set.call_args_list == [ call( "file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5", - '{"success": {"virus_free": true, "mimetype": "application/pdf"}}', + expected_first_cache_value, ex=86_400, ), call( "file-checks-9c8b0f33cd678ce620fced273bbc9950bd3350e7", - '{"success": {"virus_free": true, "mimetype": "text/plain"}}', + expected_second_cache_value, ex=86_400, ), ] @@ -139,3 +161,27 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): call("file-checks-3c7955f4f94c940efa494dd4cc9a5c171b8f863a"), 
call("file-checks-e5734036f6ab95fade9d49a4ff8496489cf03293"), ] + + +@pytest.mark.parametrize( + "json_string, expected_status_code, expected_response_json", + ( + ( + '{"success": {"virus_free": true, "mimetype": "application/pdf"}}', + 200, + {"mimetype": "application/pdf", "virus_free": True}, + ), + ( + '{"success": {"virus_free": false, "mimetype": "application/pdf"}}', + 400, + {"error": "File did not pass the virus scan"}, + ), + ), +) +def test_success_response_from_cache(client, mocker, json_string, expected_status_code, expected_response_json): + mocker.patch("app.redis_client.get", return_value=json_string.encode()) + + response = _file_checks(client, b"Anything") + + assert response.status_code == expected_status_code + assert response.json == expected_response_json From 2c6f65d2b356500ab23a93f66c34b0f16409b9a0 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 16:47:01 +0100 Subject: [PATCH 47/63] Remove `virus_free` from the serialised format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Either the format is `{"success": …}` or {`error`: …}. `{"success": {"virus_free": False}}` is ambiguous so it shouldn’t be a structure we allow to be possible. 
--- app/file_checks/views.py | 2 +- app/upload/views.py | 3 --- app/utils/file_checks.py | 21 +++++++++------------ tests/file_checks/test_views.py | 32 ++++++++++---------------------- 4 files changed, 20 insertions(+), 38 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index 01cf9264..cd49b905 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -14,4 +14,4 @@ def get_mime_type_and_run_antivirus_scan(): except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code - return jsonify(virus_free=uploaded_file.virus_free, mimetype=uploaded_file.mimetype), 200 + return jsonify(mimetype=uploaded_file.mimetype), 200 diff --git a/app/upload/views.py b/app/upload/views.py index 4efba99d..eca65792 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -16,9 +16,6 @@ def upload_document(service_id): except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code - if not uploaded_file.virus_free: - return jsonify(error="File did not pass the virus scan"), 400 - document = document_store.put( service_id, uploaded_file.file_data, diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 115f7448..4c6205dc 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -39,7 +39,7 @@ def __init__(self, file_data, is_csv, confirmation_email, retention_period, file self.filename = filename self.confirmation_email = confirmation_email self.retention_period = retention_period - self.virus_free, self.mimetype = self.virus_free_and_mimetype + self.mimetype = self.mimetype_deserialised @classmethod def from_request_json(cls, data): @@ -124,31 +124,26 @@ def file_data_hash(self): return sha1(contents).hexdigest() @cache.set("file-checks-{file_data_hash}", ttl_in_seconds=86_400) - def get_mime_type_and_run_antivirus_scan_json(self, file_data_hash): + def mimetype_serialised(self, file_data_hash): if file_data_hash != self.file_data_hash: raise 
RuntimeError("Wrong hash value passed to cache") try: - return {"success": {"virus_free": self._virus_free, "mimetype": self._mimetype}} + return {"success": {"mimetype": self._mimetype}} except (AntivirusError, FiletypeError) as e: return {"failure": {"error": e.message, "status_code": e.status_code}} @property - def virus_free_and_mimetype(self): - result = self.get_mime_type_and_run_antivirus_scan_json(self.file_data_hash) + def mimetype_deserialised(self): + result = self.mimetype_serialised(self.file_data_hash) if "failure" in result: raise AntivirusAndMimeTypeCheckError( message=result["failure"]["error"], status_code=result["failure"]["status_code"], ) - if not result["success"]["virus_free"]: - raise AntivirusAndMimeTypeCheckError( - message="File did not pass the virus scan", - status_code=400, - ) - return result["success"]["virus_free"], result["success"]["mimetype"] + return result["success"]["mimetype"] @property - def _virus_free(self): + def virus_free(self): if not current_app.config["ANTIVIRUS_ENABLED"]: return False try: @@ -161,6 +156,8 @@ def _virus_free(self): @property def _mimetype(self): + if not self.virus_free: + return if self.filename: mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] else: diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 5d211735..fb7185df 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -42,7 +42,7 @@ def test_file_checks_returns_json_object_with_expected_results(client, antivirus file_content = b"%PDF-1.4 file contents" response = _file_checks(client, file_content) assert response.status_code == 200 - assert response.json == {"mimetype": "application/pdf", "virus_free": True} + assert response.json == {"mimetype": "application/pdf"} def test_file_checks_virus_scan_error(client, antivirus): @@ -80,8 +80,8 @@ def test_file_checks_unknown_type(client, antivirus): ( ( True, - '{"success": {"virus_free": true, "mimetype": 
"application/pdf"}}', - '{"success": {"virus_free": true, "mimetype": "text/plain"}}', + '{"success": {"mimetype": "application/pdf"}}', + '{"success": {"mimetype": "text/plain"}}', ), ( False, @@ -163,25 +163,13 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): ] -@pytest.mark.parametrize( - "json_string, expected_status_code, expected_response_json", - ( - ( - '{"success": {"virus_free": true, "mimetype": "application/pdf"}}', - 200, - {"mimetype": "application/pdf", "virus_free": True}, - ), - ( - '{"success": {"virus_free": false, "mimetype": "application/pdf"}}', - 400, - {"error": "File did not pass the virus scan"}, - ), - ), -) -def test_success_response_from_cache(client, mocker, json_string, expected_status_code, expected_response_json): - mocker.patch("app.redis_client.get", return_value=json_string.encode()) +def test_success_response_from_cache(client, mocker): + mocker.patch( + "app.redis_client.get", + return_value=b'{"success": {"mimetype": "application/pdf"}}', + ) response = _file_checks(client, b"Anything") - assert response.status_code == expected_status_code - assert response.json == expected_response_json + assert response.status_code == 200 + assert response.json == {"mimetype": "application/pdf"} From d70466aa3038a40fa6b703aa3b2c4117eaa27e5c Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 16:50:10 +0100 Subject: [PATCH 48/63] Re-order methods for easier reading --- app/utils/file_checks.py | 43 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 4c6205dc..9d29d2ef 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -27,7 +27,7 @@ def __init__(self, message=None, status_code=400): class AntivirusAndMimeTypeCheckError(Exception): - def __init__(self, message=None, status_code=400): + def __init__(self, message, status_code=400): self.message = message 
self.status_code = status_code @@ -39,7 +39,7 @@ def __init__(self, file_data, is_csv, confirmation_email, retention_period, file self.filename = filename self.confirmation_email = confirmation_email self.retention_period = retention_period - self.mimetype = self.mimetype_deserialised + self.mimetype = self.mimetype_deserialised() @classmethod def from_request_json(cls, data): @@ -123,16 +123,6 @@ def file_data_hash(self): return sha1(contents).hexdigest() - @cache.set("file-checks-{file_data_hash}", ttl_in_seconds=86_400) - def mimetype_serialised(self, file_data_hash): - if file_data_hash != self.file_data_hash: - raise RuntimeError("Wrong hash value passed to cache") - try: - return {"success": {"mimetype": self._mimetype}} - except (AntivirusError, FiletypeError) as e: - return {"failure": {"error": e.message, "status_code": e.status_code}} - - @property def mimetype_deserialised(self): result = self.mimetype_serialised(self.file_data_hash) if "failure" in result: @@ -142,17 +132,14 @@ def mimetype_deserialised(self): ) return result["success"]["mimetype"] - @property - def virus_free(self): - if not current_app.config["ANTIVIRUS_ENABLED"]: - return False + @cache.set("file-checks-{file_data_hash}", ttl_in_seconds=86_400) + def mimetype_serialised(self, file_data_hash): + if file_data_hash != self.file_data_hash: + raise RuntimeError("Wrong hash value passed to cache") try: - virus_free = antivirus_client.scan(self.file_data) - except AntivirusError as e: - raise AntivirusError(message="Antivirus API error", status_code=503) from e - if not virus_free: - raise AntivirusError(message="File did not pass the virus scan", status_code=400) - return virus_free + return {"success": {"mimetype": self._mimetype}} + except (AntivirusError, FiletypeError) as e: + return {"failure": {"error": e.message, "status_code": e.status_code}} @property def _mimetype(self): @@ -174,3 +161,15 @@ def _mimetype(self): message=f"Unsupported file type '{mimetype}'. 
Supported types are: {allowed_file_types}" ) return mimetype + + @property + def virus_free(self): + if not current_app.config["ANTIVIRUS_ENABLED"]: + return False + try: + virus_free = antivirus_client.scan(self.file_data) + except AntivirusError as e: + raise AntivirusError(message="Antivirus API error", status_code=503) from e + if not virus_free: + raise AntivirusError(message="File did not pass the virus scan", status_code=400) + return virus_free From 7e9c849ad8f6a55522b42dd62e9bf17616c84a5e Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Fri, 26 Sep 2025 17:29:54 +0100 Subject: [PATCH 49/63] Always check for viruses when file data is changed This is a bit more robust than just relying on execution order. --- app/utils/file_checks.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 9d29d2ef..070123d2 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -34,12 +34,11 @@ def __init__(self, message, status_code=400): class UploadedFile: def __init__(self, file_data, is_csv, confirmation_email, retention_period, filename): - self.file_data = file_data self.is_csv = is_csv self.filename = filename self.confirmation_email = confirmation_email self.retention_period = retention_period - self.mimetype = self.mimetype_deserialised() + self.file_data = file_data @classmethod def from_request_json(cls, data): @@ -113,6 +112,15 @@ def filename(self, value): except ValueError as e: raise AntivirusAndMimeTypeCheckError(str(e)) from e + @property + def file_data(self): + return self._file_data + + @file_data.setter + def file_data(self, value): + self._file_data = value + self.mimetype = self.mimetype_deserialised() + @property def file_data_hash(self): contents = bytearray(self.file_data.read()) From ea51ebe114e48999a3b736f6b19121f84bf655f5 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Mon, 29 Sep 2025 10:39:42 +0100 Subject: [PATCH 50/63] Hash file 
contents with service ID This prevents timing attacks, where one service could see if any other service has uploaded a document by seeing if the checks for that document run more quickly (because they have been cached). --- app/file_checks/views.py | 6 ++-- app/upload/views.py | 2 +- app/utils/file_checks.py | 7 +++-- tests/file_checks/test_views.py | 50 ++++++++++++++++++++++++--------- tests/upload/test_views.py | 2 +- 5 files changed, 46 insertions(+), 21 deletions(-) diff --git a/app/file_checks/views.py b/app/file_checks/views.py index cd49b905..1e0e1cfb 100644 --- a/app/file_checks/views.py +++ b/app/file_checks/views.py @@ -7,10 +7,10 @@ file_checks_blueprint.before_request(check_auth) -@file_checks_blueprint.route("/antivirus-and-mimetype-check", methods=["POST"]) -def get_mime_type_and_run_antivirus_scan(): +@file_checks_blueprint.route("/services//antivirus-and-mimetype-check", methods=["POST"]) +def get_mime_type_and_run_antivirus_scan(service_id): try: - uploaded_file = UploadedFile.from_request_json(request.json) + uploaded_file = UploadedFile.from_request_json(request.json, service_id=service_id) except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code diff --git a/app/upload/views.py b/app/upload/views.py index eca65792..47dc2db4 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -12,7 +12,7 @@ @upload_blueprint.route("/services//documents", methods=["POST"]) def upload_document(service_id): try: - uploaded_file = UploadedFile.from_request_json(request.json) + uploaded_file = UploadedFile.from_request_json(request.json, service_id=service_id) except AntivirusAndMimeTypeCheckError as e: return jsonify(error=e.message), e.status_code diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 070123d2..b6d13bd4 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -33,15 +33,16 @@ def __init__(self, message, status_code=400): class UploadedFile: - def __init__(self, file_data, 
is_csv, confirmation_email, retention_period, filename): + def __init__(self, *, file_data, is_csv, confirmation_email, retention_period, filename, service_id): self.is_csv = is_csv self.filename = filename self.confirmation_email = confirmation_email self.retention_period = retention_period + self.service_id = service_id self.file_data = file_data @classmethod - def from_request_json(cls, data): + def from_request_json(cls, data, *, service_id): if "document" not in data: raise AntivirusAndMimeTypeCheckError("No document upload") @@ -59,6 +60,7 @@ def from_request_json(cls, data): confirmation_email=data.get("confirmation_email"), retention_period=data.get("retention_period"), filename=data.get("filename", None), + service_id=service_id, ) @property @@ -128,6 +130,7 @@ def file_data_hash(self): contents += bytes(self.is_csv) contents += str(self.filename).encode() + contents += str(self.service_id).encode() return sha1(contents).hexdigest() diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index fb7185df..70943ded 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -1,12 +1,14 @@ import base64 from unittest.mock import call +from uuid import UUID import pytest from notifications_utils.clients.antivirus.antivirus_client import AntivirusError -def _file_checks(client, file_content, is_csv=None, filename=None): - url = "/antivirus-and-mimetype-check" +def _file_checks(client, file_content, is_csv=None, filename=None, service_id=None): + service_id = service_id or UUID(int=0, version=4) + url = f"/services/{service_id}/antivirus-and-mimetype-check" data = { "document": base64.b64encode(file_content).decode("utf-8"), } @@ -56,7 +58,8 @@ def test_file_checks_virus_scan_error(client, antivirus): def test_file_checks_invalid_encoding(client): - response = client.post("/antivirus-and-mimetype-check", json={"document": "foo"}) + service_id = UUID(int=0, version=4) + response = 
client.post(f"/services/{service_id}/antivirus-and-mimetype-check", json={"document": "foo"}) assert response.status_code == 400 assert response.json == {"error": "Document is not base64 encoded"} @@ -106,12 +109,12 @@ def test_virus_check_puts_value_in_cache( assert mock_redis_set.call_args_list == [ call( - "file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5", + "file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c", expected_first_cache_value, ex=86_400, ), call( - "file-checks-9c8b0f33cd678ce620fced273bbc9950bd3350e7", + "file-checks-9e1c0c215d2eaefe915dd2d431a1cf21e777bbae", expected_second_cache_value, ex=86_400, ), @@ -135,12 +138,12 @@ def test_virus_check_returns_value_from_cache(client, mocker): assert response_1.json == response_2.json == {"error": "I’m a teapot"} assert mock_redis_get.call_args_list == [ - call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), - call("file-checks-da30062e97f8d78fa771dcb8f7b9bae884f0e61e"), - call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), - call("file-checks-da30062e97f8d78fa771dcb8f7b9bae884f0e61e"), - call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), - call("file-checks-da30062e97f8d78fa771dcb8f7b9bae884f0e61e"), + call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), + call("file-checks-fcdd443163779531f4fc93f72b34504ad6d14ac8"), + call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), + call("file-checks-fcdd443163779531f4fc93f72b34504ad6d14ac8"), + call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), + call("file-checks-fcdd443163779531f4fc93f72b34504ad6d14ac8"), ] @@ -150,6 +153,25 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), ) + file_content = b"%PDF-1.4 file contents" + + _file_checks(client, file_content, service_id=UUID(int=0, version=4)) + _file_checks(client, file_content, service_id=UUID(int=1, version=4)) + _file_checks(client, 
file_content, service_id=UUID(int=1, version=4)) + + assert mock_redis_get.call_args_list == [ + call("file-checks-01ca5a1f80d254b3d6f4dcff59c1d93b4f2ec939"), + call("file-checks-dc50228dab1b172ca18465eef02df493d208896b"), + call("file-checks-dc50228dab1b172ca18465eef02df493d208896b"), + ] + + +def test_different_cache_keys_for_different_service_ids(client, mocker): + mock_redis_get = mocker.patch( + "app.redis_client.get", + return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), + ) + file_content = b"%PDF-1.4 first file contents" _file_checks(client, file_content) @@ -157,9 +179,9 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): _file_checks(client, file_content, filename="foo.pdf", is_csv=True) assert mock_redis_get.call_args_list == [ - call("file-checks-74cc0669d6b61ff7efa2416a51eb1a6ed17b23d5"), - call("file-checks-3c7955f4f94c940efa494dd4cc9a5c171b8f863a"), - call("file-checks-e5734036f6ab95fade9d49a4ff8496489cf03293"), + call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), + call("file-checks-173d5d18c19e3d0e9113dd672a15a31cf41bfb64"), + call("file-checks-fc27ee1ffde6ae9c24988896e3d1d1f1d5c7d1f4"), ] diff --git a/tests/upload/test_views.py b/tests/upload/test_views.py index f04d355c..b102419e 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -431,6 +431,6 @@ def test_document_upload_bad_is_csv_value(client): ) def test_get_upload_document_request_data_errors(app, data, expected_error): with pytest.raises(AntivirusAndMimeTypeCheckError) as e: - UploadedFile.from_request_json(data) + UploadedFile.from_request_json(data, service_id="foo") assert e.value.message == expected_error From bf0d05c451eb9723016578a7f43f4d05ba6b8200 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Mon, 29 Sep 2025 10:53:55 +0100 Subject: [PATCH 51/63] Restore deleted test assertion --- tests/upload/test_views.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/tests/upload/test_views.py b/tests/upload/test_views.py index b102419e..98f28a49 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -160,7 +160,8 @@ def test_document_file_size_just_right_after_b64decode(client, store, antivirus) assert response.status_code == 201 -def test_document_file_size_too_large_werkzeug_content_length(client): +def test_document_file_size_too_large_werkzeug_content_length(client, mocker): + mock_uploaded_file = mocker.patch("app.upload.views.UploadedFile.from_request_json") url = "/services/12345678-1111-1111-1111-123456789012/documents" # Gets hit by Werkzeug's 3MiB content length limit automatically (before our app logic). @@ -169,7 +170,7 @@ def test_document_file_size_too_large_werkzeug_content_length(client): assert response.status_code == 413 assert response.json == {"error": "Uploaded file exceeds file size limit"} - # Mock assertion missing here + assert mock_uploaded_file.called is False def test_document_file_size_too_large_b64_decoded_content(client): From 9851b2c49fd28336ef128f538807431dec2e7009 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Mon, 29 Sep 2025 11:00:36 +0100 Subject: [PATCH 52/63] Refactor with parameterize This is more robust because it shows that the difference in. 
behaviour is entirely dependent on file size --- tests/upload/test_views.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/tests/upload/test_views.py b/tests/upload/test_views.py index 98f28a49..c0ca3188 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -160,28 +160,27 @@ def test_document_file_size_just_right_after_b64decode(client, store, antivirus) assert response.status_code == 201 -def test_document_file_size_too_large_werkzeug_content_length(client, mocker): - mock_uploaded_file = mocker.patch("app.upload.views.UploadedFile.from_request_json") - url = "/services/12345678-1111-1111-1111-123456789012/documents" - - # Gets hit by Werkzeug's 3MiB content length limit automatically (before our app logic). - file_content = b"a" * (3 * 1024 * 1024 + 1) - response = _document_upload(client, url, file_content) - - assert response.status_code == 413 - assert response.json == {"error": "Uploaded file exceeds file size limit"} - assert mock_uploaded_file.called is False - - -def test_document_file_size_too_large_b64_decoded_content(client): +@pytest.mark.parametrize( + "file_size_in_bytes, application_code_call_expected", + ( + # Gets hit by Werkzeug's 3MiB content length limit automatically (before our app logic). + ((3 * 1024 * 1024 + 1), False), + # Gets through Werkzeug's 3MiB content length limit, but too big for our python ~2MiB check. + ((2 * 1024 * 1024 + 1025), True), + ), +) +def test_document_file_size_too_large(client, mocker, file_size_in_bytes, application_code_call_expected): + mock_uploaded_file = mocker.patch( + "app.upload.views.UploadedFile.from_request_json", wraps=UploadedFile.from_request_json + ) url = "/services/12345678-1111-1111-1111-123456789012/documents" - # Gets through Werkzeug's 3MiB content length limit, but too big for our python ~2MiB check. 
- file_content = b"a" * (2 * 1024 * 1024 + 1025) + file_content = b"a" * file_size_in_bytes response = _document_upload(client, url, file_content) assert response.status_code == 413 assert response.json == {"error": "Uploaded file exceeds file size limit"} + assert mock_uploaded_file.called is application_code_call_expected def test_document_upload_no_document(client): From b15942a2c36a146950e5666bbf95e061dd7f135c Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Mon, 29 Sep 2025 11:36:43 +0100 Subject: [PATCH 53/63] Restore original behaviour when antivirus is disabled The file is always marked as virus-free when antivirus is disabled. This makes local development easier. This behaviour was accidentally switched in the process of refactoring, because - the original code was hard to read - there were no tests --- app/utils/file_checks.py | 2 +- tests/file_checks/test_views.py | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index b6d13bd4..57bf32d9 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -176,7 +176,7 @@ def _mimetype(self): @property def virus_free(self): if not current_app.config["ANTIVIRUS_ENABLED"]: - return False + return True try: virus_free = antivirus_client.scan(self.file_data) except AntivirusError as e: diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 70943ded..bc1fccd0 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -3,8 +3,11 @@ from uuid import UUID import pytest +from flask import current_app from notifications_utils.clients.antivirus.antivirus_client import AntivirusError +from app.config import Test + def _file_checks(client, file_content, is_csv=None, filename=None, service_id=None): service_id = service_id or UUID(int=0, version=4) @@ -195,3 +198,36 @@ def test_success_response_from_cache(client, mocker): assert response.status_code == 
200 assert response.json == {"mimetype": "application/pdf"} + + +@pytest.mark.xdist_group(name="modifies_app_context") +@pytest.mark.parametrize( + "scan_results", + ( + # Scan results are ignored + True, + False, + ), +) +@pytest.mark.parametrize( + "antivirus_enabled", + ( + pytest.param( + True, + marks=pytest.mark.xfail(reason="Virus scan will be called"), + ), + False, + ), +) +def test_file_is_always_virus_free_when_antivirus_disabled(antivirus, client, mocker, scan_results, antivirus_enabled): + current_app.config["ANTIVIRUS_ENABLED"] = antivirus_enabled + antivirus.scan.return_value = scan_results + + response = _file_checks(client, b"Anything") + + assert response.status_code == 200 + assert response.json == {"mimetype": "text/plain"} + assert antivirus.scan.called is False + + # Reset the app config to avoid affecting other tests + current_app.config["ANTIVIRUS_ENABLED"] = Test.ANTIVIRUS_ENABLED From 7345ec8ed12f8b1f575b49f20a451f1cece2cdd3 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Mon, 29 Sep 2025 11:41:04 +0100 Subject: [PATCH 54/63] Make `virus_free` a method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before, virus free could theoretically: - return `True` - return `False` - raise In practice the way it was structured meant it could never return False. This means calling code had to account for the `raise`/`False` possibilities, even though the latter would never happen. Changing it to a method makes it clearer that it’s doing work. 
It also means it less unexpected when we change it to either: - return `None` (success) - raise (failed) --- app/utils/file_checks.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 57bf32d9..fd68fc5b 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -154,8 +154,8 @@ def mimetype_serialised(self, file_data_hash): @property def _mimetype(self): - if not self.virus_free: - return + self.do_virus_scan() + if self.filename: mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] else: @@ -173,14 +173,12 @@ def _mimetype(self): ) return mimetype - @property - def virus_free(self): + def do_virus_scan(self): if not current_app.config["ANTIVIRUS_ENABLED"]: - return True + return try: virus_free = antivirus_client.scan(self.file_data) except AntivirusError as e: raise AntivirusError(message="Antivirus API error", status_code=503) from e if not virus_free: raise AntivirusError(message="File did not pass the virus scan", status_code=400) - return virus_free From f069adcfc439f1e4951d244a2382c5b7b04fed3c Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 30 Sep 2025 11:50:41 +0100 Subject: [PATCH 55/63] Move `do_virus_scan` call into `mimetype_serialised` This is a better separation of the control flow because it groups the exception handler with the calls which can raise the exception. 
--- app/utils/file_checks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index fd68fc5b..9a802663 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -148,14 +148,13 @@ def mimetype_serialised(self, file_data_hash): if file_data_hash != self.file_data_hash: raise RuntimeError("Wrong hash value passed to cache") try: + self.do_virus_scan() return {"success": {"mimetype": self._mimetype}} except (AntivirusError, FiletypeError) as e: return {"failure": {"error": e.message, "status_code": e.status_code}} @property def _mimetype(self): - self.do_virus_scan() - if self.filename: mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] else: From f9bae07f2ed2185751a1b1fb9d3b3cc15a1b66fc Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 30 Sep 2025 12:04:12 +0100 Subject: [PATCH 56/63] Fixed wrongly-swapped test names --- tests/file_checks/test_views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index bc1fccd0..4814f1fb 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -150,7 +150,7 @@ def test_virus_check_returns_value_from_cache(client, mocker): ] -def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): +def test_different_cache_keys_for_different_service_ids(client, mocker): mock_redis_get = mocker.patch( "app.redis_client.get", return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), @@ -169,7 +169,7 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): ] -def test_different_cache_keys_for_different_service_ids(client, mocker): +def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): mock_redis_get = mocker.patch( "app.redis_client.get", return_value='{"failure": {"error": "I’m a teapot", "status_code": 
418}}'.encode(), From 4f841d844010a71ec842cf095ab870f2df6c4593 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 30 Sep 2025 14:29:28 +0100 Subject: [PATCH 57/63] Use file extension to determine cache key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the file extension can change what MIME type is detected for the file. The rest of the filename doesn’t matter. This probably won’t result in a speedup (the dict lookup from extention to MIME type is cheap) but it could result in slightly fewer cache keys being used. --- app/utils/file_checks.py | 10 ++++++++-- tests/file_checks/test_views.py | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 9a802663..8c6abfe8 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -129,11 +129,17 @@ def file_data_hash(self): self.file_data.seek(0) contents += bytes(self.is_csv) - contents += str(self.filename).encode() + contents += str(self.file_extension).encode() contents += str(self.service_id).encode() return sha1(contents).hexdigest() + @property + def file_extension(self): + if not self.filename: + return + return split_filename(self.filename, dotted=True)[1] + def mimetype_deserialised(self): result = self.mimetype_serialised(self.file_data_hash) if "failure" in result: @@ -156,7 +162,7 @@ def mimetype_serialised(self, file_data_hash): @property def _mimetype(self): if self.filename: - mimetype = mimetypes.types_map[split_filename(self.filename, dotted=True)[1]] + mimetype = mimetypes.types_map[self.file_extension] else: mimetype = get_mime_type(self.file_data) # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 4814f1fb..78087d51 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -183,8 +183,8 @@ def 
test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): assert mock_redis_get.call_args_list == [ call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), - call("file-checks-173d5d18c19e3d0e9113dd672a15a31cf41bfb64"), - call("file-checks-fc27ee1ffde6ae9c24988896e3d1d1f1d5c7d1f4"), + call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), + call("file-checks-138df91b49568633875b51f9ffafd67003b42b2c"), ] From 2309b8283ca23eb856a2a9f04214981c6079c264 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Tue, 30 Sep 2025 14:50:21 +0100 Subject: [PATCH 58/63] Only cache `is_csv` if filename not provided The `is_csv` argument is ignored if a filename is provided. So we can reduce the space of possible hash keys by not including it --- app/utils/file_checks.py | 7 ++++-- tests/file_checks/test_views.py | 38 +++++++++++++++++---------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py index 8c6abfe8..e75dbb6b 100644 --- a/app/utils/file_checks.py +++ b/app/utils/file_checks.py @@ -128,8 +128,11 @@ def file_data_hash(self): contents = bytearray(self.file_data.read()) self.file_data.seek(0) - contents += bytes(self.is_csv) - contents += str(self.file_extension).encode() + if self.file_extension: + contents += str(self.file_extension).encode() + else: + contents += bytes(self.is_csv) + contents += str(self.service_id).encode() return sha1(contents).hexdigest() diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 78087d51..b9e90bc8 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -112,12 +112,12 @@ def test_virus_check_puts_value_in_cache( assert mock_redis_set.call_args_list == [ call( - "file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c", + "file-checks-78b2a017d57195bd248ea2ac7ca7c676ff082ae9", expected_first_cache_value, ex=86_400, ), call( - "file-checks-9e1c0c215d2eaefe915dd2d431a1cf21e777bbae", + 
"file-checks-b9e8a7de077339594399102403d2834b21324613", expected_second_cache_value, ex=86_400, ), @@ -140,14 +140,8 @@ def test_virus_check_returns_value_from_cache(client, mocker): assert response_1.status_code == response_2.status_code == 418 assert response_1.json == response_2.json == {"error": "I’m a teapot"} - assert mock_redis_get.call_args_list == [ - call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), - call("file-checks-fcdd443163779531f4fc93f72b34504ad6d14ac8"), - call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), - call("file-checks-fcdd443163779531f4fc93f72b34504ad6d14ac8"), - call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), - call("file-checks-fcdd443163779531f4fc93f72b34504ad6d14ac8"), - ] + assert len(mock_redis_get.call_args_list) == 6 + assert len({c[0] for c in mock_redis_get.call_args_list}) == 2 def test_different_cache_keys_for_different_service_ids(client, mocker): @@ -162,11 +156,8 @@ def test_different_cache_keys_for_different_service_ids(client, mocker): _file_checks(client, file_content, service_id=UUID(int=1, version=4)) _file_checks(client, file_content, service_id=UUID(int=1, version=4)) - assert mock_redis_get.call_args_list == [ - call("file-checks-01ca5a1f80d254b3d6f4dcff59c1d93b4f2ec939"), - call("file-checks-dc50228dab1b172ca18465eef02df493d208896b"), - call("file-checks-dc50228dab1b172ca18465eef02df493d208896b"), - ] + assert len(mock_redis_get.call_args_list) == 3 + assert len({c[0] for c in mock_redis_get.call_args_list}) == 2 def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): @@ -179,12 +170,23 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): _file_checks(client, file_content) _file_checks(client, file_content, filename="foo.pdf") - _file_checks(client, file_content, filename="foo.pdf", is_csv=True) + _file_checks(client, file_content, filename="bar.pdf") + _file_checks(client, file_content, filename="bar.jpg") + 
_file_checks(client, file_content, filename="bar.jpg", is_csv=True) + _file_checks(client, file_content, is_csv=True) assert mock_redis_get.call_args_list == [ - call("file-checks-b923c205dab97514f00194b3ee5cde0546f1aa7c"), + # No filename + call("file-checks-78b2a017d57195bd248ea2ac7ca7c676ff082ae9"), + # Different filenames but same extention + call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), - call("file-checks-138df91b49568633875b51f9ffafd67003b42b2c"), + # Different extention + call("file-checks-1b6950c7d6718aad287c9718771229d7c6321a99"), + # Same filename but is_csv=True (which is ignored) + call("file-checks-1b6950c7d6718aad287c9718771229d7c6321a99"), + # No filename but is_csv=True + call("file-checks-cf0768397cf40807321810280dc65d236bc53c70"), ] From 8d7064cb2434406d390e479fdfcf63565cbd7544 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Thu, 2 Oct 2025 11:00:48 +0100 Subject: [PATCH 59/63] Spelling mistake Co-authored-by: Mervi Tyczynska --- tests/file_checks/test_views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index b9e90bc8..110237bb 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -178,7 +178,7 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): assert mock_redis_get.call_args_list == [ # No filename call("file-checks-78b2a017d57195bd248ea2ac7ca7c676ff082ae9"), - # Different filenames but same extention + # Different filenames but same extension call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), # Different extention From 63ae6bc090389fb3b8bf1ffdb5f014e49313b0f3 Mon Sep 17 00:00:00 2001 From: Chris Hill-Scott Date: Thu, 2 Oct 2025 11:00:57 +0100 Subject: [PATCH 60/63] Spelling mistake Co-authored-by: Mervi Tyczynska --- 
tests/file_checks/test_views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py index 110237bb..2580e34e 100644 --- a/tests/file_checks/test_views.py +++ b/tests/file_checks/test_views.py @@ -181,7 +181,7 @@ def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): # Different filenames but same extension call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), - # Different extention + # Different extension call("file-checks-1b6950c7d6718aad287c9718771229d7c6321a99"), # Same filename but is_csv=True (which is ignored) call("file-checks-1b6950c7d6718aad287c9718771229d7c6321a99"), From 679d2bbbabbd85417b950a6f0c1997d5683a35eb Mon Sep 17 00:00:00 2001 From: pgaetani Date: Thu, 16 Oct 2025 09:25:20 +0000 Subject: [PATCH 61/63] nl dockerfile --- docker/Dockerfile_nl | 97 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 docker/Dockerfile_nl diff --git a/docker/Dockerfile_nl b/docker/Dockerfile_nl new file mode 100644 index 00000000..1c1020f0 --- /dev/null +++ b/docker/Dockerfile_nl @@ -0,0 +1,97 @@ +FROM python:3.13-slim-bookworm AS base + +COPY --from=ghcr.io/astral-sh/uv:0.5.30 /uv /uvx /bin/ + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONUNBUFFERED=1 \ + UV_COMPILE_BYTECODE=1 \ + UV_CACHE_DIR='/tmp/uv-cache/' \ + VIRTUAL_ENV="/opt/venv" + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libmagic-dev \ + curl \ + media-types \ + && apt-get -y clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* + +WORKDIR /home/vcap/app + +##### Python Build Image ##################################################### +FROM base AS python_build + +RUN echo "Install OS dependencies for python app requirements" && \ + apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + git 
\ + && apt-get -y clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* + +COPY requirements_nl.txt ./ + +RUN echo "Installing python dependencies" && \ + python3 -m venv /opt/venv && \ + uv pip sync --python /opt/venv/bin/python requirements_nl.txt + +COPY . . +RUN make generate-version-file # This file gets copied across + +##### Production Image ####################################################### +FROM base AS production + +RUN groupadd -r notify && useradd -r -g notify notify && chown -R notify:notify /home/vcap +USER notify + +RUN mkdir /home/vcap/logs + +COPY --from=python_build --chown=root:root /opt/venv /opt/venv +ENV PATH="/opt/venv/bin:${PATH}" + +COPY --chown=notify:notify app app +COPY --chown=notify:notify application.py entrypoint.sh gunicorn_config.py ./ +COPY --from=python_build --chown=notify:notify /home/vcap/app/app/version.py app/version.py + +RUN chown -R notify:notify /home/vcap/app + +RUN python -m compileall . && \ + chown -R notify:notify /home/vcap/app && \ + chmod +x /home/vcap/app/entrypoint.sh + +ENTRYPOINT [ "/home/vcap/app/entrypoint.sh" ] + +##### Test Image ############################################################## +FROM production as test + +USER root +RUN echo "Install OS dependencies for test build" \ + && apt-get update && \ + apt-get install -y --no-install-recommends \ + sudo \ + curl \ + git \ + make \ + && apt-get -y clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* +RUN usermod -aG sudo notify +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +USER notify + +ENV HOME=/home/vcap + +# Make sure the app/ directory is there so that "make bootstrap" can create app/version.py +RUN mkdir -p app + +# Copying to overwrite is faster than RUN chown notify:notify ... 
+COPY --from=python_build --chown=notify:notify /opt/venv /opt/venv + +# Install dev/test requirements +COPY --chown=notify:notify Makefile requirements_nl_test.txt ./ +RUN make bootstrap + +# Copy from the real world, one dir up (project root) into the environment's current working directory +# Docker will rebuild from here down every time. +COPY --chown=notify:notify . . From 83b7e74ebe905a5e3fc39f71ef48c2fbf604519c Mon Sep 17 00:00:00 2001 From: pgaetani Date: Thu, 16 Oct 2025 09:26:20 +0000 Subject: [PATCH 62/63] workflow changes --- .github/workflows/merge.yml | 8 ++++---- .github/workflows/pr.yml | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/merge.yml b/.github/workflows/merge.yml index 53cdae61..af6c9b6f 100644 --- a/.github/workflows/merge.yml +++ b/.github/workflows/merge.yml @@ -34,11 +34,11 @@ jobs: activate-environment: true enable-cache: true cache-dependency-glob: | - requirements_for_test.in - requirements_for_test.txt + requirements_nl_test.in + requirements_nl_test.txt - name: Install application requirements (pip) - run: uv pip install -r requirements_for_test.txt + run: uv pip install -r requirements_nl_test.txt - name: Set tag id: set-tag @@ -88,7 +88,7 @@ jobs: - uses: docker/build-push-action@v6 with: - file: docker/Dockerfile + file: docker/Dockerfile_nl push: true tags: ${{ env.IMAGE }}:latest,${{ env.IMAGE }}:${{ needs.build-tag-test.outputs.tag }} platforms: linux/amd64,linux/arm64 diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 6ac48dd4..e4690cae 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -31,11 +31,11 @@ jobs: activate-environment: true enable-cache: true cache-dependency-glob: | - requirements_for_test.in - requirements_for_test.txt + requirements_nl_test.in + requirements_nl_test.txt - name: Install application requirements (pip) - run: uv pip install -r requirements_for_test.txt + run: uv pip install -r requirements_nl_test.txt - uses: 
astral-sh/ruff-action@v3 From 02eb6d0864995a9812971ce31ab08d74f61e0f74 Mon Sep 17 00:00:00 2001 From: pgaetani Date: Thu, 16 Oct 2025 09:29:39 +0000 Subject: [PATCH 63/63] ruff formatting --- app/config.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/app/config.py b/app/config.py index 2ead91ff..bda1bdf5 100644 --- a/app/config.py +++ b/app/config.py @@ -97,7 +97,7 @@ class Development(Config): DOCUMENT_DOWNLOAD_API_HOSTNAME = os.environ.get("DOCUMENT_DOWNLOAD_API_HOSTNAME", "localhost:7000") REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/1") - + ################ ### NotifyNL ### @@ -111,37 +111,37 @@ class ConfigNL(Config): class DevNL(ConfigNL): NOTIFY_ENVIRONMENT = "development" - + DEBUG = True NOTIFY_REQUEST_LOG_LEVEL = "DEBUG" - + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download" class TestNL(ConfigNL): NOTIFY_ENVIRONMENT = "test" - + DEBUG = True NOTIFY_REQUEST_LOG_LEVEL = "DEBUG" - + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download" class AccNL(ConfigNL): NOTIFY_ENVIRONMENT = "acceptance" - + DEBUG = False NOTIFY_REQUEST_LOG_LEVEL = "INFO" - + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download" class ProdNL(ConfigNL): NOTIFY_ENVIRONMENT = "production" - + DEBUG = False NOTIFY_REQUEST_LOG_LEVEL = "ERROR" - + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download"