diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index 09e9bc7c..00000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,7 +0,0 @@ - - ---- - -🚨⚠️ This will be deployed automatically all the way to production when you click merge ⚠️🚨 - -For more information, including how to check this deployment on preview or staging first before it goes to production, see our [team wiki section on deployment](https://github.com/alphagov/notifications-manuals/wiki/Merging-and-deploying#deployment) diff --git a/.github/workflows/merge.yml b/.github/workflows/merge.yml index 53cdae61..af6c9b6f 100644 --- a/.github/workflows/merge.yml +++ b/.github/workflows/merge.yml @@ -34,11 +34,11 @@ jobs: activate-environment: true enable-cache: true cache-dependency-glob: | - requirements_for_test.in - requirements_for_test.txt + requirements_nl_test.in + requirements_nl_test.txt - name: Install application requirements (pip) - run: uv pip install -r requirements_for_test.txt + run: uv pip install -r requirements_nl_test.txt - name: Set tag id: set-tag @@ -88,7 +88,7 @@ jobs: - uses: docker/build-push-action@v6 with: - file: docker/Dockerfile + file: docker/Dockerfile_nl push: true tags: ${{ env.IMAGE }}:latest,${{ env.IMAGE }}:${{ needs.build-tag-test.outputs.tag }} platforms: linux/amd64,linux/arm64 diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 6ac48dd4..e4690cae 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -31,11 +31,11 @@ jobs: activate-environment: true enable-cache: true cache-dependency-glob: | - requirements_for_test.in - requirements_for_test.txt + requirements_nl_test.in + requirements_nl_test.txt - name: Install application requirements (pip) - run: uv pip install -r requirements_for_test.txt + run: uv pip install -r requirements_nl_test.txt - uses: astral-sh/ruff-action@v3 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e58e2b61..cf0b5a54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@92.1.1 +# This file was automatically copied from notifications-utils@100.2.0 repos: - repo: https://github.com/pre-commit/pre-commit-hooks @@ -9,12 +9,8 @@ repos: - id: check-yaml - id: debug-statements - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.8.2' + rev: 'v0.11.4' hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] -- repo: https://github.com/psf/black - rev: 24.10.0 - hooks: - - id: black - name: black (python) + - id: ruff-format diff --git a/Makefile b/Makefile index c6892eb9..6b275ce2 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ run-flask-with-docker: ## Run flask with docker .PHONY: test test: ## Run all tests ruff check . - black --check . + ruff format --check . py.test tests/ .PHONY: test-with-docker diff --git a/README.md b/README.md index 9dd414ee..43d61286 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ### Python version -Check the version is [runtime.txt](runtime.txt) +Check the version is the same as at the top of [Dockerfile](docker/Dockerfile) ### uv diff --git a/app/__init__.py b/app/__init__.py index 7cd359ee..a20e5b13 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -5,11 +5,12 @@ from flask import Flask, current_app, jsonify from gds_metrics import GDSMetrics -from notifications_utils import logging, request_helper +from notifications_utils import request_helper from notifications_utils.clients.antivirus.antivirus_client import AntivirusClient from notifications_utils.clients.redis.redis_client import RedisClient from notifications_utils.eventlet import EventletTimeout from notifications_utils.local_vars import LazyLocalGetter +from notifications_utils.logging import flask as utils_logging from werkzeug.local import LocalProxy from app.config import Config, configs @@ -48,6 +49,7 @@ from app.download.views import download_blueprint +from app.file_checks.views import file_checks_blueprint from app.upload.views import upload_blueprint mimetypes.init() @@ -63,7 +65,7 @@ def create_app(): application.config.from_object(Config) request_helper.init_app(application) - logging.init_app(application) + utils_logging.init_app(application) metrics.init_app(application) redis_client.init_app(application) @@ -73,6 +75,7 @@ def create_app(): application.register_blueprint(download_blueprint) application.register_blueprint(upload_blueprint) + application.register_blueprint(file_checks_blueprint) @application.errorhandler(EventletTimeout) def eventlet_timeout(error): diff --git a/app/config.py b/app/config.py index 07f96673..bda1bdf5 100644 --- a/app/config.py +++ b/app/config.py @@ -99,7 +99,57 @@ class Development(Config): REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/1") +################ +### NotifyNL ### +################ +NL_PREFIX = "notifynl" + + +class ConfigNL(Config): + NOTIFY_APP_NAME = "document-download-api" + + +class DevNL(ConfigNL): + NOTIFY_ENVIRONMENT = "development" + + DEBUG = True + NOTIFY_REQUEST_LOG_LEVEL = "DEBUG" + + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download" + + +class TestNL(ConfigNL): + NOTIFY_ENVIRONMENT = "test" + + DEBUG = True + NOTIFY_REQUEST_LOG_LEVEL = "DEBUG" + + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download" + + +class AccNL(ConfigNL): + NOTIFY_ENVIRONMENT = "acceptance" + + DEBUG = False + NOTIFY_REQUEST_LOG_LEVEL = "INFO" + + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download" + + +class ProdNL(ConfigNL): + NOTIFY_ENVIRONMENT = "production" + + DEBUG = False + NOTIFY_REQUEST_LOG_LEVEL = "ERROR" + + DOCUMENTS_BUCKET = f"{NL_PREFIX}-{NOTIFY_ENVIRONMENT}-document-download" + + configs = { - "test": Test, "development": Development, + "devnl": DevNL, + "test": Test, + "testnl": TestNL, + "acceptance": AccNL, + "production": ProdNL } diff --git a/app/download/views.py b/app/download/views.py index 5e15635d..1775a90a 100644 --- a/app/download/views.py +++ b/app/download/views.py @@ -77,7 +77,7 @@ def download_document(service_id, document_id, extension=None): "document_id": document_id, }, ) - return jsonify(error=str(e)), 400 + return jsonify(error=str(e)), e.suggested_status_code if redirect := get_redirect_url_if_user_not_authenticated(request, document): return redirect @@ -130,24 +130,21 @@ def get_document_metadata(service_id, document_id): "document_id": document_id, }, ) - return jsonify(error=str(e)), 400 - - if metadata: - document = { - "direct_file_url": get_direct_file_url( - service_id=service_id, - document_id=document_id, - key=key, - mimetype=metadata["mimetype"], - ), - "confirm_email": metadata["confirm_email"], - "size_in_bytes": metadata["size"], - "file_extension": current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"][metadata["mimetype"]], - "filename": metadata["filename"], - "available_until": metadata["available_until"], - } - else: - document = None + return jsonify(error=str(e)), e.suggested_status_code + + document = { + "direct_file_url": get_direct_file_url( + service_id=service_id, + document_id=document_id, + key=key, + mimetype=metadata["mimetype"], + ), + "confirm_email": metadata["confirm_email"], + "size_in_bytes": metadata["size"], + "file_extension": current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"][metadata["mimetype"]], + "filename": metadata["filename"], + "available_until": metadata["available_until"], + } response = make_response({"document": document}) response.headers["X-Robots-Tag"] = "noindex, nofollow" diff --git a/app/file_checks/__init__.py b/app/file_checks/__init__.py new file mode 100644 index 00000000..0218afa6 --- /dev/null +++ b/app/file_checks/__init__.py @@ -0,0 +1 @@ +from .errors import * # noqa import errors module to register error handlers diff --git a/app/file_checks/errors.py b/app/file_checks/errors.py new file mode 100644 index 00000000..4a0ee146 --- /dev/null +++ b/app/file_checks/errors.py @@ -0,0 +1,8 @@ +from flask import jsonify + +from .views import file_checks_blueprint + + +@file_checks_blueprint.errorhandler(413) +def request_entity_too_large(error): + return jsonify(error="Uploaded file exceeds file size limit"), 413 diff --git a/app/file_checks/views.py b/app/file_checks/views.py new file mode 100644 index 00000000..1e0e1cfb --- /dev/null +++ b/app/file_checks/views.py @@ -0,0 +1,17 @@ +from flask import Blueprint, jsonify, request + +from app.utils.authentication import check_auth +from app.utils.file_checks import AntivirusAndMimeTypeCheckError, UploadedFile + +file_checks_blueprint = Blueprint("file_checks", __name__, url_prefix="") +file_checks_blueprint.before_request(check_auth) + + +@file_checks_blueprint.route("/services//antivirus-and-mimetype-check", methods=["POST"]) +def get_mime_type_and_run_antivirus_scan(service_id): + try: + uploaded_file = UploadedFile.from_request_json(request.json, service_id=service_id) + except AntivirusAndMimeTypeCheckError as e: + return jsonify(error=e.message), e.status_code + + return jsonify(mimetype=uploaded_file.mimetype), 200 diff --git a/app/upload/views.py b/app/upload/views.py index 2b4102f1..47dc2db4 100644 --- a/app/upload/views.py +++ b/app/upload/views.py @@ -1,109 +1,28 @@ -import mimetypes -from base64 import b64decode, binascii -from io import BytesIO +from flask import Blueprint, jsonify, request -from flask import Blueprint, abort, current_app, jsonify, request -from notifications_utils.clients.antivirus.antivirus_client import AntivirusError -from notifications_utils.recipient_validation.errors import InvalidEmailError -from werkzeug.exceptions import BadRequest - -from app import antivirus_client, document_store -from app.utils import get_mime_type +from app import document_store from app.utils.authentication import check_auth -from app.utils.files import split_filename +from app.utils.file_checks import AntivirusAndMimeTypeCheckError, UploadedFile from app.utils.urls import get_direct_file_url, get_frontend_download_url -from app.utils.validation import ( - clean_and_validate_email_address, - clean_and_validate_retention_period, - validate_filename, -) upload_blueprint = Blueprint("upload", __name__, url_prefix="") upload_blueprint.before_request(check_auth) -def _get_upload_document_request_data(data): # noqa: C901 - if "document" not in data: - raise BadRequest("No document upload") - - try: - raw_content = b64decode(data["document"]) - except (binascii.Error, ValueError) as e: - raise BadRequest("Document is not base64 encoded") from e - - if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: - abort(413) - file_data = BytesIO(raw_content) - is_csv = data.get("is_csv", False) - - if not isinstance(is_csv, bool): - raise BadRequest("Value for is_csv must be a boolean") - - confirmation_email = data.get("confirmation_email", None) - if confirmation_email is not None: - try: - confirmation_email = clean_and_validate_email_address(confirmation_email) - except InvalidEmailError as e: - raise BadRequest(str(e)) from e - - retention_period = data.get("retention_period", None) - if retention_period is not None: - try: - retention_period = clean_and_validate_retention_period(retention_period) - except ValueError as e: - raise BadRequest(str(e)) from e - - filename = data.get("filename", None) - if filename: - try: - filename = validate_filename(filename) - except ValueError as e: - raise BadRequest(str(e)) from e - - return file_data, is_csv, confirmation_email, retention_period, filename - - @upload_blueprint.route("/services//documents", methods=["POST"]) def upload_document(service_id): try: - file_data, is_csv, confirmation_email, retention_period, filename = _get_upload_document_request_data( - request.json - ) - except BadRequest as e: - return jsonify(error=e.description), 400 - - if current_app.config["ANTIVIRUS_ENABLED"]: - try: - virus_free = antivirus_client.scan(file_data) - except AntivirusError: - return jsonify(error="Antivirus API error"), 503 - - if not virus_free: - return jsonify(error="File did not pass the virus scan"), 400 - - if filename: - mimetype = mimetypes.types_map[split_filename(filename, dotted=True)[1]] - else: - mimetype = get_mime_type(file_data) - - # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use - # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv - if is_csv and mimetype == "text/plain": - mimetype = "text/csv" - - if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: - allowed_file_types = ", ".join( - sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) - ) - return jsonify(error=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}"), 400 + uploaded_file = UploadedFile.from_request_json(request.json, service_id=service_id) + except AntivirusAndMimeTypeCheckError as e: + return jsonify(error=e.message), e.status_code document = document_store.put( service_id, - file_data, - mimetype=mimetype, - confirmation_email=confirmation_email, - retention_period=retention_period, - filename=filename, + uploaded_file.file_data, + mimetype=uploaded_file.mimetype, + confirmation_email=uploaded_file.confirmation_email, + retention_period=uploaded_file.retention_period, + filename=uploaded_file.filename, ) return ( @@ -115,14 +34,14 @@ def upload_document(service_id): service_id=service_id, document_id=document["id"], key=document["encryption_key"], - mimetype=mimetype, + mimetype=uploaded_file.mimetype, ), "url": get_frontend_download_url( service_id=service_id, document_id=document["id"], key=document["encryption_key"], ), - "mimetype": mimetype, + "mimetype": uploaded_file.mimetype, }, ), 201, diff --git a/app/utils/file_checks.py b/app/utils/file_checks.py new file mode 100644 index 00000000..e75dbb6b --- /dev/null +++ b/app/utils/file_checks.py @@ -0,0 +1,192 @@ +import mimetypes +from base64 import b64decode, binascii +from hashlib import sha1 +from io import BytesIO + +from flask import abort, current_app +from notifications_utils.clients.antivirus.antivirus_client import AntivirusError +from notifications_utils.clients.redis import RequestCache +from notifications_utils.recipient_validation.errors import InvalidEmailError + +from app import antivirus_client, redis_client +from app.utils import get_mime_type +from app.utils.files import split_filename +from app.utils.validation import ( + clean_and_validate_email_address, + clean_and_validate_retention_period, + validate_filename, +) + +cache = RequestCache(redis_client) + + +class FiletypeError(Exception): + def __init__(self, message=None, status_code=400): + self.message = message + self.status_code = status_code + + +class AntivirusAndMimeTypeCheckError(Exception): + def __init__(self, message, status_code=400): + self.message = message + self.status_code = status_code + + +class UploadedFile: + def __init__(self, *, file_data, is_csv, confirmation_email, retention_period, filename, service_id): + self.is_csv = is_csv + self.filename = filename + self.confirmation_email = confirmation_email + self.retention_period = retention_period + self.service_id = service_id + self.file_data = file_data + + @classmethod + def from_request_json(cls, data, *, service_id): + if "document" not in data: + raise AntivirusAndMimeTypeCheckError("No document upload") + + try: + raw_content = b64decode(data["document"]) + except (binascii.Error, ValueError) as e: + raise AntivirusAndMimeTypeCheckError("Document is not base64 encoded") from e + + if len(raw_content) > current_app.config["MAX_DECODED_FILE_SIZE"]: + abort(413) + + return cls( + file_data=BytesIO(raw_content), + is_csv=data.get("is_csv"), + confirmation_email=data.get("confirmation_email"), + retention_period=data.get("retention_period"), + filename=data.get("filename", None), + service_id=service_id, + ) + + @property + def is_csv(self): + return self._is_csv + + @is_csv.setter + def is_csv(self, value): + if value is None: + value = False + if not isinstance(value, bool): + raise AntivirusAndMimeTypeCheckError("Value for is_csv must be a boolean") + self._is_csv = value + + @property + def confirmation_email(self): + return getattr(self, "_confirmation_email", None) + + @confirmation_email.setter + def confirmation_email(self, value): + if value is None: + return + try: + self._confirmation_email = clean_and_validate_email_address(value) + except InvalidEmailError as e: + raise AntivirusAndMimeTypeCheckError(str(e)) from e + + @property + def retention_period(self): + return getattr(self, "_retention_period", None) + + @retention_period.setter + def retention_period(self, value): + if value is None: + return + try: + self._retention_period = clean_and_validate_retention_period(value) + except ValueError as e: + raise AntivirusAndMimeTypeCheckError(str(e)) from e + + @property + def filename(self): + return getattr(self, "_filename", None) + + @filename.setter + def filename(self, value): + if value is None: + return + try: + self._filename = validate_filename(value) + except ValueError as e: + raise AntivirusAndMimeTypeCheckError(str(e)) from e + + @property + def file_data(self): + return self._file_data + + @file_data.setter + def file_data(self, value): + self._file_data = value + self.mimetype = self.mimetype_deserialised() + + @property + def file_data_hash(self): + contents = bytearray(self.file_data.read()) + self.file_data.seek(0) + + if self.file_extension: + contents += str(self.file_extension).encode() + else: + contents += bytes(self.is_csv) + + contents += str(self.service_id).encode() + + return sha1(contents).hexdigest() + + @property + def file_extension(self): + if not self.filename: + return + return split_filename(self.filename, dotted=True)[1] + + def mimetype_deserialised(self): + result = self.mimetype_serialised(self.file_data_hash) + if "failure" in result: + raise AntivirusAndMimeTypeCheckError( + message=result["failure"]["error"], + status_code=result["failure"]["status_code"], + ) + return result["success"]["mimetype"] + + @cache.set("file-checks-{file_data_hash}", ttl_in_seconds=86_400) + def mimetype_serialised(self, file_data_hash): + if file_data_hash != self.file_data_hash: + raise RuntimeError("Wrong hash value passed to cache") + try: + self.do_virus_scan() + return {"success": {"mimetype": self._mimetype}} + except (AntivirusError, FiletypeError) as e: + return {"failure": {"error": e.message, "status_code": e.status_code}} + + @property + def _mimetype(self): + if self.filename: + mimetype = mimetypes.types_map[self.file_extension] + else: + mimetype = get_mime_type(self.file_data) + # Our mimetype auto-detection sometimes resolves CSV content as text/plain, so we use + # an explicit POST body parameter `is_csv` from the caller to resolve it as text/csv + if self.is_csv and mimetype == "text/plain": + mimetype = "text/csv" + if mimetype not in current_app.config["MIME_TYPES_TO_FILE_EXTENSIONS"]: + allowed_file_types = ", ".join( + sorted({f"'.{x}'" for x in current_app.config["FILE_EXTENSIONS_TO_MIMETYPES"].keys()}) + ) + raise FiletypeError( + message=f"Unsupported file type '{mimetype}'. Supported types are: {allowed_file_types}" + ) + return mimetype + + def do_virus_scan(self): + if not current_app.config["ANTIVIRUS_ENABLED"]: + return + try: + virus_free = antivirus_client.scan(self.file_data) + except AntivirusError as e: + raise AntivirusError(message="Antivirus API error", status_code=503) from e + if not virus_free: + raise AntivirusError(message="File did not pass the virus scan", status_code=400) diff --git a/app/utils/store.py b/app/utils/store.py index 04aabf55..98ae8bf5 100644 --- a/app/utils/store.py +++ b/app/utils/store.py @@ -1,7 +1,7 @@ import os import re import uuid -from datetime import date, timedelta +from datetime import UTC, date, datetime, timedelta from urllib.parse import urlencode import boto3 @@ -13,14 +13,22 @@ class DocumentStoreError(Exception): - pass + suggested_status_code = 400 -class DocumentBlocked(Exception): - pass +class DocumentBlocked(DocumentStoreError): + suggested_status_code = 403 + + +class DocumentExpired(DocumentStoreError): + suggested_status_code = 410 -class DocumentExpired(Exception): +class DocumentNotFound(DocumentStoreError): + suggested_status_code = 404 + + +class CannotDetermineExpiration(Exception): pass @@ -38,16 +46,9 @@ def __init__(self, bucket=None): def init_app(self, app): self.bucket = app.config["DOCUMENTS_BUCKET"] - def check_for_blocked_document(self, service_id, document_id): - """Raises an exception if access to the document has been blocked after creation - - This should be checked before any document access. This might be used to quickly prevent anyone from accessing - a file that a service has sent out accidentally. - - Note that the `blocked` tag key MUST be in lowercase. - """ + def _get_document_tags(self, service_id, document_id): try: - tags = { + return { item["Key"]: item["Value"] for item in self.s3.get_object_tagging( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id) @@ -56,14 +57,38 @@ def check_for_blocked_document(self, service_id, document_id): except BotoClientError as e: if e.response["Error"].get("ResourceType") == "DeleteMarker": # The S3 object has been marked as expired (eg by our retention period lifecycle policy) - # We should treat is as not existing raise DocumentExpired("The document is no longer available") from e + if e.response["Error"].get("Code") == "NoSuchKey": + raise DocumentNotFound("The requested document could not be found") from e + raise e + def check_for_blocked_document(self, tags): + """Raises an exception if access to the document has been blocked after creation + + This should be checked before any document access. This might be used to quickly prevent anyone from accessing + a file that a service has sent out accidentally. + + Note that the `blocked` tag key MUST be in lowercase. + """ + if tags.get("blocked", "false").lower() in {"true", "yes"}: raise DocumentBlocked("Access to the document has been blocked") + @classmethod + def check_for_expired_document(cls, s3_response, tags): + effective_expiry_date = cls._get_effective_expiry_date(s3_response, tags) + + if not effective_expiry_date: + current_app.logger.error( + "Expiration information not available for document, attempting to serve it anyway.." + ) + return + + if effective_expiry_date < date.today(): + raise DocumentExpired("The document is no longer available") + def put( self, service_id, document_stream, *, mimetype, confirmation_email=None, retention_period=None, filename=None ): @@ -86,14 +111,20 @@ def put( {"service_id": service_id, "document_id": document_id}, ) + tags = { + # in case we lose S3's native timestamp for some reason + "created-at": datetime.now(UTC).isoformat(timespec="seconds"), + } + if retention_period: - tags = {"retention-period": retention_period} - extra_kwargs["Tagging"] = urlencode(tags) + tags["retention-period"] = retention_period current_app.logger.info( "Setting custom retention period for %(service_id)s/%(document_id)s: %(retention_period)s", {"service_id": service_id, "document_id": document_id, "retention_period": retention_period}, ) + extra_kwargs["Tagging"] = urlencode(tags) + if filename: # Convert utf-8 filenames to ASCII suitable for storing in AWS S3 Metadata. extra_kwargs["Metadata"]["filename"] = filename.encode("unicode_escape").decode("ascii") @@ -127,24 +158,27 @@ def get(self, service_id, document_id, decryption_key): decryption_key should be raw bytes """ try: - self.check_for_blocked_document(service_id, document_id) - document = self.s3.get_object( + tags = self._get_document_tags(service_id, document_id) + self.check_for_blocked_document(tags) + s3_response = self.s3.get_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) + self.check_for_expired_document(s3_response, tags) - except (DocumentBlocked, DocumentExpired) as e: - raise DocumentStoreError(str(e)) from e except BotoClientError as e: + if e.response["Error"]["Code"] == "404": + raise DocumentNotFound("The requested document could not be found") from e + raise DocumentStoreError(e.response["Error"]) from e return { - "body": document["Body"], - "mimetype": document["ContentType"], - "size": document["ContentLength"], - "metadata": self._normalise_metadata(document["Metadata"]), + "body": s3_response["Body"], + "mimetype": s3_response["ContentType"], + "size": s3_response["ContentLength"], + "metadata": self._normalise_metadata(s3_response["Metadata"]), } def get_document_metadata(self, service_id, document_id, decryption_key): @@ -153,45 +187,94 @@ def get_document_metadata(self, service_id, document_id, decryption_key): """ try: - self.check_for_blocked_document(service_id, document_id) - metadata = self.s3.head_object( + tags = self._get_document_tags(service_id, document_id) + self.check_for_blocked_document(tags) + s3_response = self.s3.head_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) + self.check_for_expired_document(s3_response, tags) - expiry_date = self._convert_expiry_date_to_date_object(metadata["Expiration"]) - + available_until = self._get_effective_expiry_date(s3_response, tags) return { - "mimetype": metadata["ContentType"], - "confirm_email": self.get_email_hash(metadata) is not None, - "size": metadata["ContentLength"], - "available_until": str(expiry_date), - "filename": self._normalise_metadata(metadata["Metadata"]).get("filename"), + "mimetype": s3_response["ContentType"], + "confirm_email": self.get_email_hash(s3_response) is not None, + "size": s3_response["ContentLength"], + "available_until": str(available_until) if available_until else None, + "filename": self._normalise_metadata(s3_response["Metadata"]).get("filename"), } - except (DocumentBlocked, DocumentExpired): - return None except BotoClientError as e: if e.response["Error"]["Code"] == "404": - return None + raise DocumentNotFound("The requested document could not be found") from e + raise DocumentStoreError(e.response["Error"]) from e + @classmethod + def _get_effective_expiry_date(cls, s3_response, tags): + potential_expiry_dates = [] + + try: + potential_expiry_dates.append(cls._get_expiry_date_from_expiration_header(s3_response)) + except CannotDetermineExpiration as err: + current_app.logger.warning("Cannot determine document expiration through Expiration header: %s", str(err)) + + try: + potential_expiry_dates.append(cls._get_expiry_date_from_tags(s3_response, tags)) + except CannotDetermineExpiration as err: + current_app.logger.warning("Cannot determine document expiration through tags: %s", str(err)) + + if potential_expiry_dates: + return min(potential_expiry_dates) + + # in case we're having a maintenance issue/outage with our s3 bucket + # we'd prefer to serve some files that should have been deleted (hopefully <14 + # days ago) instead of being unable to serve any documents to any users + return None + @staticmethod - def _convert_expiry_date_to_date_object(raw_expiry_date: str) -> date: + def _get_expiry_date_from_expiration_header(s3_response) -> date: pattern = re.compile(r'([^=\s]+?)="(.+?)"') - expiry_date_as_dict = dict(pattern.findall(raw_expiry_date)) + expiry_date_as_dict = dict(pattern.findall(s3_response.get("Expiration") or "") or {}) + + if "expiry-date" not in expiry_date_as_dict: + raise CannotDetermineExpiration("No expiry-date found") expiry_date_string = expiry_date_as_dict["expiry-date"] timezone = expiry_date_string.split()[-1] if timezone != "GMT": - current_app.logger.warning("AWS S3 object expiration has unhandled timezone: %s", timezone) + raise CannotDetermineExpiration(f"AWS S3 object expiration has unhandled timezone: {timezone}") - expiry_date = parser.parse(expiry_date_string) - expiry_date = expiry_date.date() - timedelta(days=1) + try: + expiry_date = parser.parse(expiry_date_string) + except parser.ParserError as err: + raise CannotDetermineExpiration from err + + return expiry_date.date() - timedelta(days=1) + + @staticmethod + def _get_expiry_date_from_tags(s3_response, tags): + try: + retention_period_match = re.fullmatch(r"(\d+) weeks", tags["retention-period"]) + except KeyError as err: + raise CannotDetermineExpiration from err + + if retention_period_match is None: + raise CannotDetermineExpiration("Cannot parse retention-period header") + + try: + retention_period_days = int(retention_period_match.group(1), base=10) * 7 + except ValueError as err: + raise CannotDetermineExpiration from err + + try: + created_at = datetime.fromisoformat(tags["created-at"]) + except (KeyError, ValueError): + created_at = s3_response["LastModified"] - return expiry_date + return created_at.date() + timedelta(days=retention_period_days) def generate_encryption_key(self): return os.urandom(32) @@ -204,13 +287,15 @@ def authenticate(self, service_id: str, document_id: str, decryption_key: bytes, email_address needs to be in a validated and known-good format before being passed to this method """ try: - self.check_for_blocked_document(service_id, document_id) - response = self.s3.head_object( + tags = self._get_document_tags(service_id, document_id) + self.check_for_blocked_document(tags) + s3_response = self.s3.head_object( Bucket=self.bucket, Key=self.get_document_key(service_id, document_id), SSECustomerKey=decryption_key, SSECustomerAlgorithm="AES256", ) + self.check_for_expired_document(s3_response, tags) except (DocumentBlocked, DocumentExpired): return False except BotoClientError as e: @@ -218,7 +303,7 @@ def authenticate(self, service_id: str, document_id: str, decryption_key: bytes, return False raise DocumentStoreError(e.response["Error"]) from e - hashed_email = self.get_email_hash(response) + hashed_email = self.get_email_hash(s3_response) if not hashed_email: return False diff --git a/docker/Dockerfile b/docker/Dockerfile index 713f280e..55e5fcd0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,8 +1,12 @@ -FROM python:3.11-slim-bookworm AS base +FROM python:3.13-slim-bookworm AS base + +COPY --from=ghcr.io/astral-sh/uv:0.5.30 /uv /uvx /bin/ ENV DEBIAN_FRONTEND=noninteractive \ PYTHONUNBUFFERED=1 \ - UV_COMPILE_BYTECODE=1 + UV_COMPILE_BYTECODE=1 \ + UV_CACHE_DIR='/tmp/uv-cache/' \ + VIRTUAL_ENV="/opt/venv" RUN apt-get update \ && apt-get install -y --no-install-recommends \ @@ -29,9 +33,6 @@ RUN echo "Install OS dependencies for python app requirements" && \ COPY requirements.txt ./ -RUN pip install uv - -ENV UV_CACHE_DIR='/tmp/uv-cache/' RUN echo "Installing python dependencies" && \ python3 -m venv /opt/venv && \ uv pip sync --python /opt/venv/bin/python requirements.txt @@ -88,9 +89,7 @@ RUN mkdir -p app COPY --from=python_build --chown=notify:notify /opt/venv /opt/venv # Install dev/test requirements -RUN pip install uv COPY --chown=notify:notify Makefile requirements_for_test.txt ./ -ENV UV_CACHE_DIR='/tmp/uv-cache/' RUN make bootstrap # Copy from the real world, one dir up (project root) into the environment's current working directory diff --git a/docker/Dockerfile_nl b/docker/Dockerfile_nl new file mode 100644 index 00000000..1c1020f0 --- /dev/null +++ b/docker/Dockerfile_nl @@ -0,0 +1,97 @@ +FROM python:3.13-slim-bookworm AS base + +COPY --from=ghcr.io/astral-sh/uv:0.5.30 /uv /uvx /bin/ + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONUNBUFFERED=1 \ + UV_COMPILE_BYTECODE=1 \ + UV_CACHE_DIR='/tmp/uv-cache/' \ + VIRTUAL_ENV="/opt/venv" + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libmagic-dev \ + curl \ + media-types \ + && apt-get -y clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* + +WORKDIR /home/vcap/app + +##### Python Build Image ##################################################### +FROM base AS python_build + +RUN echo "Install OS dependencies for python app requirements" && \ + apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + git \ + && apt-get -y clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* + +COPY requirements_nl.txt ./ + +RUN echo "Installing python dependencies" && \ + python3 -m venv /opt/venv && \ + uv pip sync --python /opt/venv/bin/python requirements_nl.txt + +COPY . . +RUN make generate-version-file # This file gets copied across + +##### Production Image ####################################################### +FROM base AS production + +RUN groupadd -r notify && useradd -r -g notify notify && chown -R notify:notify /home/vcap +USER notify + +RUN mkdir /home/vcap/logs + +COPY --from=python_build --chown=root:root /opt/venv /opt/venv +ENV PATH="/opt/venv/bin:${PATH}" + +COPY --chown=notify:notify app app +COPY --chown=notify:notify application.py entrypoint.sh gunicorn_config.py ./ +COPY --from=python_build --chown=notify:notify /home/vcap/app/app/version.py app/version.py + +RUN chown -R notify:notify /home/vcap/app + +RUN python -m compileall . && \ + chown -R notify:notify /home/vcap/app && \ + chmod +x /home/vcap/app/entrypoint.sh + +ENTRYPOINT [ "/home/vcap/app/entrypoint.sh" ] + +##### Test Image ############################################################## +FROM production as test + +USER root +RUN echo "Install OS dependencies for test build" \ + && apt-get update && \ + apt-get install -y --no-install-recommends \ + sudo \ + curl \ + git \ + make \ + && apt-get -y clean \ + && rm -rf /var/lib/apt/lists/* /tmp/* +RUN usermod -aG sudo notify +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +USER notify + +ENV HOME=/home/vcap + +# Make sure the app/ directory is there so that "make bootstrap" can create app/version.py +RUN mkdir -p app + +# Copying to overwrite is faster than RUN chown notify:notify ... +COPY --from=python_build --chown=notify:notify /opt/venv /opt/venv + +# Install dev/test requirements +COPY --chown=notify:notify Makefile requirements_nl_test.txt ./ +RUN make bootstrap + +# Copy from the real world, one dir up (project root) into the environment's current working directory +# Docker will rebuild from here down every time. +COPY --chown=notify:notify . . diff --git a/gunicorn_config.py b/gunicorn_config.py index 4735811a..45439f9b 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -1,7 +1,7 @@ import os from gds_metrics.gunicorn import child_exit # noqa -from notifications_utils.gunicorn_defaults import set_gunicorn_defaults +from notifications_utils.gunicorn.defaults import set_gunicorn_defaults set_gunicorn_defaults(globals()) diff --git a/requirements.in b/requirements.in index 072fb2ce..bd011d8f 100644 --- a/requirements.in +++ b/requirements.in @@ -9,6 +9,6 @@ gds-metrics==0.2.4 argon2-cffi==21.3.0 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@92.1.1 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@100.2.0 -sentry_sdk[flask]>=1.0.0,<2.0.0 +sentry-sdk[flask]==1.45.1 diff --git a/requirements.txt b/requirements.txt index 114acce4..0daad14c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ certifi==2024.7.4 # via # requests # sentry-sdk -cffi==1.16.0 +cffi==1.17.1 # via argon2-cffi-bindings charset-normalizer==3.3.2 # via requests @@ -31,9 +31,9 @@ click==8.1.7 # via flask dnspython==2.6.1 # via eventlet -eventlet==0.35.2 +eventlet==0.39.1 # via gunicorn -flask==3.1.0 +flask==3.1.1 # via # flask-redis # gds-metrics @@ -45,9 +45,9 @@ gds-metrics==0.2.4 # via -r requirements.in govuk-bank-holidays==0.15 # via notifications-utils -greenlet==3.0.3 +greenlet==3.2.2 # via eventlet -gunicorn==21.2.0 +gunicorn==23.0.0 # via notifications-utils idna==3.7 # via requests @@ -55,7 +55,7 @@ itsdangerous==2.2.0 # via # flask # notifications-utils -jinja2==3.1.5 +jinja2==3.1.6 # via # flask # notifications-utils @@ -65,18 +65,19 @@ jmespath==1.0.1 # botocore markupsafe==2.1.3 # via + # flask # jinja2 # sentry-sdk # werkzeug mistune==0.8.4 # via notifications-utils -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@b9ef1b45b655324d6341995afffc3ebc25a3ed72 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@bb31f41dd194802ced5ea814f84cc25855467234 # via -r requirements.in ordered-set==4.1.0 # via notifications-utils packaging==23.2 # via gunicorn -phonenumbers==8.13.52 +phonenumbers==9.0.10 # via notifications-utils prometheus-client==0.19.0 # via gds-metrics @@ -86,9 +87,11 @@ pycparser==2.21 # via cffi pypdf==3.17.4 # via notifications-utils -python-dateutil==2.8.2 - # via botocore -python-json-logger==2.0.7 +python-dateutil==2.9.0.post0 + # via + # botocore + # notifications-utils +python-json-logger==3.3.0 # via notifications-utils python-magic==0.4.25 # via -r requirements.in @@ -108,7 +111,7 @@ s3transfer==0.10.1 # via boto3 segno==1.6.1 # via notifications-utils -sentry-sdk==1.45.0 +sentry-sdk==1.45.1 # via -r requirements.in six==1.16.0 # via python-dateutil diff --git a/requirements_for_test.in b/requirements_for_test.in index 97a257d6..c8c31c31 100644 --- a/requirements_for_test.in +++ b/requirements_for_test.in @@ -1,10 +1,4 @@ -r requirements.txt -r requirements_for_test_common.in -hypothesis==6.54.4 - -pytest-cov -pytest-dotenv -pytest-md -pytest-emoji -python-dotenv \ No newline at end of file +hypothesis==6.54.4 \ No newline at end of file diff --git a/requirements_for_test.txt b/requirements_for_test.txt index a23a6dc9..04683300 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -14,8 +14,6 @@ awscrt==0.20.11 # botocore beautifulsoup4==4.12.3 # via -r requirements_for_test_common.in -black==24.10.0 - # via -r requirements_for_test_common.in blinker==1.9.0 # via # -r requirements.txt @@ -39,7 +37,7 @@ certifi==2024.7.4 # -r requirements.txt # requests # sentry-sdk -cffi==1.16.0 +cffi==1.17.1 # via # -r requirements.txt # argon2-cffi-bindings @@ -50,23 +48,20 @@ charset-normalizer==3.3.2 click==8.1.7 # via # -r requirements.txt - # black # flask coverage==7.6.4 - # via - # pytest-cov - # pytest-testmon + # via pytest-testmon dnspython==2.6.1 # via # -r requirements.txt # eventlet -eventlet==0.35.2 +eventlet==0.39.1 # via # -r requirements.txt # gunicorn execnet==2.1.1 # via pytest-xdist -flask==3.1.0 +flask==3.1.1 # via # -r requirements.txt # flask-redis @@ -84,11 +79,11 @@ govuk-bank-holidays==0.15 # via # -r requirements.txt # notifications-utils -greenlet==3.0.3 +greenlet==3.2.2 # via # -r requirements.txt # eventlet -gunicorn==21.2.0 +gunicorn==23.0.0 # via # -r requirements.txt # notifications-utils @@ -105,7 +100,7 @@ itsdangerous==2.2.0 # -r requirements.txt # flask # notifications-utils -jinja2==3.1.5 +jinja2==3.1.6 # via # -r requirements.txt # flask @@ -118,15 +113,14 @@ jmespath==1.0.1 markupsafe==2.1.3 # via # -r requirements.txt + # flask # jinja2 # werkzeug mistune==0.8.4 # via # -r requirements.txt # notifications-utils -mypy-extensions==1.0.0 - # via black -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@b9ef1b45b655324d6341995afffc3ebc25a3ed72 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@bb31f41dd194802ced5ea814f84cc25855467234 # via -r requirements.txt ordered-set==4.1.0 # via @@ -135,21 +129,14 @@ ordered-set==4.1.0 packaging==23.2 # via # -r requirements.txt - # black # gunicorn # pytest -pathspec==0.12.1 - # via black -phonenumbers==8.13.52 +phonenumbers==9.0.10 # via # -r requirements.txt # notifications-utils -platformdirs==4.3.6 - # via black pluggy==1.5.0 - # via - # pytest - # pytest-cov + # via pytest prometheus-client==0.19.0 # via # -r requirements.txt @@ -169,40 +156,25 @@ pypdf==3.17.4 pytest==8.3.4 # via # -r requirements_for_test_common.in - # pytest-cov - # pytest-dotenv - # pytest-emoji # pytest-env - # pytest-md # pytest-mock # pytest-testmon # pytest-xdist -pytest-cov==6.2.1 - # via -r requirements_for_test.in -pytest-dotenv==0.5.2 - # via -r requirements_for_test.in -pytest-emoji==0.2.0 - # via -r requirements_for_test.in pytest-env==1.1.5 # via -r requirements_for_test_common.in -pytest-md==0.2.0 - # via -r requirements_for_test.in pytest-mock==3.14.0 # via -r requirements_for_test_common.in pytest-testmon==2.1.1 # via -r requirements_for_test_common.in pytest-xdist==3.6.1 # via -r requirements_for_test_common.in -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # -r requirements.txt # botocore # freezegun -python-dotenv==1.1.1 - # via - # -r requirements_for_test.in - # pytest-dotenv -python-json-logger==2.0.7 + # notifications-utils +python-json-logger==3.3.0 # via # -r requirements.txt # notifications-utils @@ -230,7 +202,7 @@ requests-mock==1.12.1 # via -r requirements_for_test_common.in rsa==4.7.2 # via -r requirements.txt -ruff==0.8.2 +ruff==0.11.4 # via -r requirements_for_test_common.in s3transfer==0.10.1 # via @@ -240,7 +212,7 @@ segno==1.6.1 # via # -r requirements.txt # notifications-utils -sentry-sdk==1.45.0 +sentry-sdk==1.45.1 # via -r requirements.txt six==1.16.0 # via diff --git a/requirements_for_test_common.in b/requirements_for_test_common.in index d424c600..6f622a88 100644 --- a/requirements_for_test_common.in +++ b/requirements_for_test_common.in @@ -1,4 +1,4 @@ -# This file was automatically copied from notifications-utils@92.1.1 +# This file was automatically copied from notifications-utils@100.2.0 beautifulsoup4==4.12.3 pytest==8.3.4 @@ -9,5 +9,4 @@ pytest-testmon==2.1.1 requests-mock==1.12.1 freezegun==1.5.1 -black==24.10.0 # Also update `.pre-commit-config.yaml` if this changes -ruff==0.8.2 # Also update `.pre-commit-config.yaml` if this changes +ruff==0.11.4 # Also update `.pre-commit-config.yaml` if this changes diff --git a/requirements_nl.in b/requirements_nl.in new file mode 100644 index 00000000..671a0790 --- /dev/null +++ b/requirements_nl.in @@ -0,0 +1,14 @@ +# Run `make freeze-requirements` to update requirements.txt +# with package version changes made in requirements.in + +python-magic==0.4.25 +rsa>=4.3 + +gds-metrics==0.2.4 + +argon2-cffi==21.3.0 + +# Run `make bump-utils` to update to the latest version +notifications-utils @ git+https://github.com/Worth-NL/notifynl-utils.git@101.1.2 + +sentry-sdk[flask]==1.45.1 diff --git a/requirements_nl.txt b/requirements_nl.txt new file mode 100644 index 00000000..eee4f240 --- /dev/null +++ b/requirements_nl.txt @@ -0,0 +1,128 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements_nl.in -o requirements_nl.txt +argon2-cffi==21.3.0 + # via -r requirements_nl.in +argon2-cffi-bindings==25.1.0 + # via argon2-cffi +awscrt==0.27.6 + # via botocore +blinker==1.9.0 + # via + # flask + # gds-metrics + # sentry-sdk +boto3==1.40.53 + # via notifications-utils +botocore==1.40.53 + # via + # boto3 + # s3transfer +cachetools==6.2.1 + # via notifications-utils +certifi==2025.10.5 + # via + # requests + # sentry-sdk +cffi==2.0.0 + # via argon2-cffi-bindings +charset-normalizer==3.4.4 + # via requests +click==8.3.0 + # via flask +dnspython==2.8.0 + # via eventlet +eventlet==0.40.3 + # via gunicorn +flask==3.1.2 + # via + # flask-redis + # gds-metrics + # notifications-utils + # sentry-sdk +flask-redis==0.4.0 + # via notifications-utils +gds-metrics==0.2.4 + # via -r requirements_nl.in +govuk-bank-holidays==0.17 + # via notifications-utils +greenlet==3.2.4 + # via eventlet +gunicorn==23.0.0 + # via notifications-utils +idna==3.11 + # via requests +itsdangerous==2.2.0 + # via + # flask + # notifications-utils +jinja2==3.1.6 + # via + # flask + # notifications-utils +jmespath==1.0.1 + # via + # boto3 + # botocore +markupsafe==3.0.3 + # via + # flask + # jinja2 + # sentry-sdk + # werkzeug +mistune==0.8.4 + # via notifications-utils +notifications-utils @ git+https://github.com/Worth-NL/notifynl-utils.git@c8f5fd91401608b9d8cec3f59dcca1f89880aa87 + # via -r requirements_nl.in +ordered-set==4.1.0 + # via notifications-utils +packaging==25.0 + # via gunicorn +phonenumbers==9.0.16 + # via notifications-utils +prometheus-client==0.23.1 + # via gds-metrics +pyasn1==0.6.1 + # via rsa +pycparser==2.23 + # via cffi +pypdf==6.1.1 + # via notifications-utils +python-dateutil==2.9.0.post0 + # via + # botocore + # notifications-utils +python-json-logger==4.0.0 + # via notifications-utils +python-magic==0.4.25 + # via -r requirements_nl.in +pytz==2025.2 + # via notifications-utils +pyyaml==6.0.3 + # via notifications-utils +redis==6.4.0 + # via flask-redis +requests==2.32.5 + # via + # govuk-bank-holidays + # notifications-utils +rsa==4.9.1 + # via -r requirements_nl.in +s3transfer==0.14.0 + # via boto3 +segno==1.6.6 + # via notifications-utils +sentry-sdk==1.45.1 + # via -r requirements_nl.in +six==1.17.0 + # via python-dateutil +smartypants==2.0.2 + # via notifications-utils +statsd==4.0.1 + # via notifications-utils +urllib3==2.5.0 + # via + # botocore + # requests + # sentry-sdk +werkzeug==3.1.3 + # via flask diff --git a/requirements_nl_test.in b/requirements_nl_test.in new file mode 100644 index 00000000..29512e5d --- /dev/null +++ b/requirements_nl_test.in @@ -0,0 +1,10 @@ +-r requirements_nl.txt +-r requirements_for_test_common.in + +hypothesis==6.54.4 + +pytest-cov +pytest-dotenv +pytest-md +pytest-emoji +python-dotenv \ No newline at end of file diff --git a/requirements_nl_test.txt b/requirements_nl_test.txt new file mode 100644 index 00000000..11948070 --- /dev/null +++ b/requirements_nl_test.txt @@ -0,0 +1,262 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements_nl_test.in -o requirements_nl_test.txt +argon2-cffi==21.3.0 + # via -r requirements_nl.txt +argon2-cffi-bindings==25.1.0 + # via + # -r requirements_nl.txt + # argon2-cffi +attrs==25.4.0 + # via hypothesis +awscrt==0.27.6 + # via + # -r requirements_nl.txt + # botocore +beautifulsoup4==4.12.3 + # via -r requirements_for_test_common.in +blinker==1.9.0 + # via + # -r requirements_nl.txt + # flask + # gds-metrics +boto3==1.40.53 + # via + # -r requirements_nl.txt + # notifications-utils +botocore==1.40.53 + # via + # -r requirements_nl.txt + # boto3 + # s3transfer +cachetools==6.2.1 + # via + # -r requirements_nl.txt + # notifications-utils +certifi==2025.10.5 + # via + # -r requirements_nl.txt + # requests + # sentry-sdk +cffi==2.0.0 + # via + # -r requirements_nl.txt + # argon2-cffi-bindings +charset-normalizer==3.4.4 + # via + # -r requirements_nl.txt + # requests +click==8.3.0 + # via + # -r requirements_nl.txt + # flask +coverage==7.11.0 + # via + # pytest-cov + # pytest-testmon +dnspython==2.8.0 + # via + # -r requirements_nl.txt + # eventlet +eventlet==0.40.3 + # via + # -r requirements_nl.txt + # gunicorn +execnet==2.1.1 + # via pytest-xdist +flask==3.1.2 + # via + # -r requirements_nl.txt + # flask-redis + # gds-metrics + # notifications-utils +flask-redis==0.4.0 + # via + # -r requirements_nl.txt + # notifications-utils +freezegun==1.5.1 + # via -r requirements_for_test_common.in +gds-metrics==0.2.4 + # via -r requirements_nl.txt +govuk-bank-holidays==0.17 + # via + # -r requirements_nl.txt + # notifications-utils +greenlet==3.2.4 + # via + # -r requirements_nl.txt + # eventlet +gunicorn==23.0.0 + # via + # -r requirements_nl.txt + # notifications-utils +hypothesis==6.54.4 + # via -r requirements_nl_test.in +idna==3.11 + # via + # -r requirements_nl.txt + # requests +iniconfig==2.1.0 + # via pytest +itsdangerous==2.2.0 + # via + # -r requirements_nl.txt + # flask + # notifications-utils +jinja2==3.1.6 + # via + # -r requirements_nl.txt + # flask + # notifications-utils +jmespath==1.0.1 + # via + # -r requirements_nl.txt + # boto3 + # botocore +markupsafe==3.0.3 + # via + # -r requirements_nl.txt + # flask + # jinja2 + # werkzeug +mistune==0.8.4 + # via + # -r requirements_nl.txt + # notifications-utils +notifications-utils @ git+https://github.com/Worth-NL/notifynl-utils.git@c8f5fd91401608b9d8cec3f59dcca1f89880aa87 + # via -r requirements_nl.txt +ordered-set==4.1.0 + # via + # -r requirements_nl.txt + # notifications-utils +packaging==25.0 + # via + # -r requirements_nl.txt + # gunicorn + # pytest +phonenumbers==9.0.16 + # via + # -r requirements_nl.txt + # notifications-utils +pluggy==1.6.0 + # via + # pytest + # pytest-cov +prometheus-client==0.23.1 + # via + # -r requirements_nl.txt + # gds-metrics +pyasn1==0.6.1 + # via + # -r requirements_nl.txt + # rsa +pycparser==2.23 + # via + # -r requirements_nl.txt + # cffi +pypdf==6.1.1 + # via + # -r requirements_nl.txt + # notifications-utils +pytest==8.3.4 + # via + # -r requirements_for_test_common.in + # pytest-cov + # pytest-dotenv + # pytest-emoji + # pytest-env + # pytest-md + # pytest-mock + # pytest-testmon + # pytest-xdist +pytest-cov==7.0.0 + # via -r requirements_nl_test.in +pytest-dotenv==0.5.2 + # via -r requirements_nl_test.in +pytest-emoji==0.2.0 + # via -r requirements_nl_test.in +pytest-env==1.1.5 + # via -r requirements_for_test_common.in +pytest-md==0.2.0 + # via -r requirements_nl_test.in +pytest-mock==3.14.0 + # via -r requirements_for_test_common.in +pytest-testmon==2.1.1 + # via -r requirements_for_test_common.in +pytest-xdist==3.6.1 + # via -r requirements_for_test_common.in +python-dateutil==2.9.0.post0 + # via + # -r requirements_nl.txt + # botocore + # freezegun + # notifications-utils +python-dotenv==1.1.1 + # via + # -r requirements_nl_test.in + # pytest-dotenv +python-json-logger==4.0.0 + # via + # -r requirements_nl.txt + # notifications-utils +python-magic==0.4.25 + # via -r requirements_nl.txt +pytz==2025.2 + # via + # -r requirements_nl.txt + # notifications-utils +pyyaml==6.0.3 + # via + # -r requirements_nl.txt + # notifications-utils +redis==6.4.0 + # via + # -r requirements_nl.txt + # flask-redis +requests==2.32.5 + # via + # -r requirements_nl.txt + # govuk-bank-holidays + # notifications-utils + # requests-mock +requests-mock==1.12.1 + # via -r requirements_for_test_common.in +rsa==4.9.1 + # via -r requirements_nl.txt +ruff==0.11.4 + # via -r requirements_for_test_common.in +s3transfer==0.14.0 + # via + # -r requirements_nl.txt + # boto3 +segno==1.6.6 + # via + # -r requirements_nl.txt + # notifications-utils +sentry-sdk==1.45.1 + # via -r requirements_nl.txt +six==1.17.0 + # via + # -r requirements_nl.txt + # python-dateutil +smartypants==2.0.2 + # via + # -r requirements_nl.txt + # notifications-utils +sortedcontainers==2.4.0 + # via hypothesis +soupsieve==2.8 + # via beautifulsoup4 +statsd==4.0.1 + # via + # -r requirements_nl.txt + # notifications-utils +urllib3==2.5.0 + # via + # -r requirements_nl.txt + # botocore + # requests + # sentry-sdk +werkzeug==3.1.3 + # via + # -r requirements_nl.txt + # flask diff --git a/pyproject.toml b/ruff.toml similarity index 76% rename from pyproject.toml rename to ruff.toml index a98c29c9..348e96e8 100644 --- a/pyproject.toml +++ b/ruff.toml @@ -1,14 +1,20 @@ -# This file was automatically copied from notifications-utils@92.1.1 +# This file was automatically copied from notifications-utils@100.2.0 -[tool.black] -line-length = 120 +extend-exclude = [ + "migrations/versions/", + "__pycache__", + "cache", + "migrations", + "build", + "sample_cap_xml_documents.py", +] -[tool.ruff] line-length = 120 target-version = "py311" -lint.select = [ +[lint] +select = [ "E", # pycodestyle "W", # pycodestyle "F", # pyflakes @@ -23,15 +29,6 @@ lint.select = [ "RSE", # flake8-raise "PIE", # flake8-pie "N804", # First argument of a class method should be named `cls` + "RUF100", # Checks for noqa directives that are no longer applicable ] -lint.ignore = [] -exclude = [ - "migrations/versions/", - "venv*", - "__pycache__", - "node_modules", - "cache", - "migrations", - "build", - "sample_cap_xml_documents.py", -] +ignore = [] diff --git a/tests/download/test_views.py b/tests/download/test_views.py index f42a3816..7b2715ff 100644 --- a/tests/download/test_views.py +++ b/tests/download/test_views.py @@ -8,7 +8,7 @@ from app.download.views import get_redirect_url_if_user_not_authenticated from app.utils.signed_data import sign_service_and_document_id -from app.utils.store import DocumentStoreError +from app.utils.store import DocumentNotFound, DocumentStoreError @pytest.fixture @@ -332,7 +332,7 @@ def test_get_document_metadata_when_document_is_in_s3(client, store, mimetype, e def test_get_document_metadata_when_document_is_not_in_s3(client, store): - store.get_document_metadata.return_value = None + store.get_document_metadata.side_effect = DocumentNotFound("no such document") response = client.get( url_for( "download.get_document_metadata", @@ -342,9 +342,8 @@ def test_get_document_metadata_when_document_is_not_in_s3(client, store): ) ) - assert response.status_code == 200 - assert response.json == {"document": None} - assert response.headers["X-Robots-Tag"] == "noindex, nofollow" + assert response.status_code == 404 + assert response.json == {"error": "no such document"} class TestAuthenticateDocument: diff --git a/tests/file_checks/__init__.py b/tests/file_checks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/file_checks/test_views.py b/tests/file_checks/test_views.py new file mode 100644 index 00000000..2580e34e --- /dev/null +++ b/tests/file_checks/test_views.py @@ -0,0 +1,235 @@ +import base64 +from unittest.mock import call +from uuid import UUID + +import pytest +from flask import current_app +from notifications_utils.clients.antivirus.antivirus_client import AntivirusError + +from app.config import Test + + +def _file_checks(client, file_content, is_csv=None, filename=None, service_id=None): + service_id = service_id or UUID(int=0, version=4) + url = f"/services/{service_id}/antivirus-and-mimetype-check" + data = { + "document": base64.b64encode(file_content).decode("utf-8"), + } + if is_csv is not None: + data |= {"is_csv": is_csv} + if filename is not None: + data |= {"filename": filename} + response = client.post(url, json=data) + return response + + +@pytest.fixture +def antivirus(mocker): + return mocker.patch( + "app.utils.file_checks.antivirus_client", + # prevent LocalProxy being detected as an async function + new_callable=mocker.MagicMock, + ) + + +def test_file_checks_virus_found(client, antivirus): + antivirus.scan.return_value = False + + file_content = b"%PDF-1.4 file contents" + response = _file_checks(client, file_content) + + assert response.status_code == 400 + assert response.json == {"error": "File did not pass the virus scan"} + + +def test_file_checks_returns_json_object_with_expected_results(client, antivirus): + antivirus.scan.return_value = True + file_content = b"%PDF-1.4 file contents" + response = _file_checks(client, file_content) + assert response.status_code == 200 + assert response.json == {"mimetype": "application/pdf"} + + +def test_file_checks_virus_scan_error(client, antivirus): + antivirus.scan.side_effect = AntivirusError(503, "connection error") + + file_content = b"%PDF-1.4 file contents" + response = _file_checks(client, file_content) + + assert response.status_code == 503 + assert response.json == {"error": "Antivirus API error"} + + +def test_file_checks_invalid_encoding(client): + service_id = UUID(int=0, version=4) + response = client.post(f"/services/{service_id}/antivirus-and-mimetype-check", json={"document": "foo"}) + + assert response.status_code == 400 + assert response.json == {"error": "Document is not base64 encoded"} + + +def test_file_checks_unknown_type(client, antivirus): + file_content = b"\x00pdf file contents\n" + + response = _file_checks(client, file_content) + + assert response.status_code == 400 + assert response.json["error"] == ( + "Unsupported file type 'application/octet-stream'. " + "Supported types are: '.csv', '.doc', '.docx', '.jpeg', '.jpg', '.json', '.odt', '.pdf', '.png', '.rtf', " + "'.txt', '.xlsx'" + ) + + +@pytest.mark.parametrize( + "antivirus_scan_result, expected_first_cache_value, expected_second_cache_value", + ( + ( + True, + '{"success": {"mimetype": "application/pdf"}}', + '{"success": {"mimetype": "text/plain"}}', + ), + ( + False, + '{"failure": {"error": "File did not pass the virus scan", "status_code": 400}}', + '{"failure": {"error": "File did not pass the virus scan", "status_code": 400}}', + ), + ), +) +def test_virus_check_puts_value_in_cache( + client, + mocker, + antivirus, + antivirus_scan_result, + expected_first_cache_value, + expected_second_cache_value, +): + antivirus.scan.return_value = antivirus_scan_result + mock_redis_set = mocker.patch("app.redis_client.set") + + _file_checks(client, b"%PDF-1.4 first file contents") + _file_checks(client, b"second file contents") + + assert mock_redis_set.call_args_list == [ + call( + "file-checks-78b2a017d57195bd248ea2ac7ca7c676ff082ae9", + expected_first_cache_value, + ex=86_400, + ), + call( + "file-checks-b9e8a7de077339594399102403d2834b21324613", + expected_second_cache_value, + ex=86_400, + ), + ] + + +def test_virus_check_returns_value_from_cache(client, mocker): + mock_redis_get = mocker.patch( + "app.redis_client.get", + return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), + ) + + file_1_content = b"%PDF-1.4 first file contents" + file_2_content = b"%PDF-1.4 second file contents" + + for _ in range(3): + response_1 = _file_checks(client, file_1_content) + response_2 = _file_checks(client, file_2_content) + + assert response_1.status_code == response_2.status_code == 418 + assert response_1.json == response_2.json == {"error": "I’m a teapot"} + + assert len(mock_redis_get.call_args_list) == 6 + assert len({c[0] for c in mock_redis_get.call_args_list}) == 2 + + +def test_different_cache_keys_for_different_service_ids(client, mocker): + mock_redis_get = mocker.patch( + "app.redis_client.get", + return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), + ) + + file_content = b"%PDF-1.4 file contents" + + _file_checks(client, file_content, service_id=UUID(int=0, version=4)) + _file_checks(client, file_content, service_id=UUID(int=1, version=4)) + _file_checks(client, file_content, service_id=UUID(int=1, version=4)) + + assert len(mock_redis_get.call_args_list) == 3 + assert len({c[0] for c in mock_redis_get.call_args_list}) == 2 + + +def test_different_cache_keys_for_different_filename_and_is_csv(client, mocker): + mock_redis_get = mocker.patch( + "app.redis_client.get", + return_value='{"failure": {"error": "I’m a teapot", "status_code": 418}}'.encode(), + ) + + file_content = b"%PDF-1.4 first file contents" + + _file_checks(client, file_content) + _file_checks(client, file_content, filename="foo.pdf") + _file_checks(client, file_content, filename="bar.pdf") + _file_checks(client, file_content, filename="bar.jpg") + _file_checks(client, file_content, filename="bar.jpg", is_csv=True) + _file_checks(client, file_content, is_csv=True) + + assert mock_redis_get.call_args_list == [ + # No filename + call("file-checks-78b2a017d57195bd248ea2ac7ca7c676ff082ae9"), + # Different filenames but same extension + call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), + call("file-checks-01de2a8237b9fbdb364a257098787408ae53ab97"), + # Different extension + call("file-checks-1b6950c7d6718aad287c9718771229d7c6321a99"), + # Same filename but is_csv=True (which is ignored) + call("file-checks-1b6950c7d6718aad287c9718771229d7c6321a99"), + # No filename but is_csv=True + call("file-checks-cf0768397cf40807321810280dc65d236bc53c70"), + ] + + +def test_success_response_from_cache(client, mocker): + mocker.patch( + "app.redis_client.get", + return_value=b'{"success": {"mimetype": "application/pdf"}}', + ) + + response = _file_checks(client, b"Anything") + + assert response.status_code == 200 + assert response.json == {"mimetype": "application/pdf"} + + +@pytest.mark.xdist_group(name="modifies_app_context") +@pytest.mark.parametrize( + "scan_results", + ( + # Scan results are ignored + True, + False, + ), +) +@pytest.mark.parametrize( + "antivirus_enabled", + ( + pytest.param( + True, + marks=pytest.mark.xfail(reason="Virus scan will be called"), + ), + False, + ), +) +def test_file_is_always_virus_free_when_antivirus_disabled(antivirus, client, mocker, scan_results, antivirus_enabled): + current_app.config["ANTIVIRUS_ENABLED"] = antivirus_enabled + antivirus.scan.return_value = scan_results + + response = _file_checks(client, b"Anything") + + assert response.status_code == 200 + assert response.json == {"mimetype": "text/plain"} + assert antivirus.scan.called is False + + # Reset the app config to avoid affecting other tests + current_app.config["ANTIVIRUS_ENABLED"] = Test.ANTIVIRUS_ENABLED diff --git a/tests/test_gunicorn_config.py b/tests/test_gunicorn_config.py new file mode 100644 index 00000000..7ff18331 --- /dev/null +++ b/tests/test_gunicorn_config.py @@ -0,0 +1,9 @@ +from gunicorn_config import keepalive, timeout, worker_class, worker_connections, workers + + +def test_gunicorn_config(): + assert workers == 4 + assert worker_class == "eventlet" + assert worker_connections == 1_000 + assert keepalive == 90 + assert timeout == 30 diff --git a/tests/upload/test_views.py b/tests/upload/test_views.py index 9c388950..c0ca3188 100644 --- a/tests/upload/test_views.py +++ b/tests/upload/test_views.py @@ -1,13 +1,12 @@ import base64 from pathlib import Path -from unittest import mock import pytest from notifications_utils.clients.antivirus.antivirus_client import AntivirusError -from werkzeug.exceptions import BadRequest import app -from app.upload.views import _get_upload_document_request_data +from app.upload.views import UploadedFile +from app.utils.file_checks import AntivirusAndMimeTypeCheckError @pytest.fixture @@ -22,7 +21,7 @@ def store(mocker): @pytest.fixture def antivirus(mocker): return mocker.patch( - "app.upload.views.antivirus_client", + "app.utils.file_checks.antivirus_client", # prevent LocalProxy being detected as an async function new_callable=mocker.MagicMock, ) @@ -161,34 +160,27 @@ def test_document_file_size_just_right_after_b64decode(client, store, antivirus) assert response.status_code == 201 -def test_document_file_size_too_large_werkzeug_content_length(client): - url = "/services/12345678-1111-1111-1111-123456789012/documents" - - # Gets hit by Werkzeug's 3MiB content length limit automatically (before our app logic). - file_content = b"a" * (3 * 1024 * 1024 + 1) - with mock.patch( - "app.upload.views._get_upload_document_request_data", wraps=_get_upload_document_request_data - ) as mock_get_data: - response = _document_upload(client, url, file_content) - - assert response.status_code == 413 - assert response.json == {"error": "Uploaded file exceeds file size limit"} - assert mock_get_data.call_count == 0 - - -def test_document_file_size_too_large_b64_decoded_content(client): +@pytest.mark.parametrize( + "file_size_in_bytes, application_code_call_expected", + ( + # Gets hit by Werkzeug's 3MiB content length limit automatically (before our app logic). + ((3 * 1024 * 1024 + 1), False), + # Gets through Werkzeug's 3MiB content length limit, but too big for our python ~2MiB check. + ((2 * 1024 * 1024 + 1025), True), + ), +) +def test_document_file_size_too_large(client, mocker, file_size_in_bytes, application_code_call_expected): + mock_uploaded_file = mocker.patch( + "app.upload.views.UploadedFile.from_request_json", wraps=UploadedFile.from_request_json + ) url = "/services/12345678-1111-1111-1111-123456789012/documents" - # Gets through Werkzeug's 3MiB content length limit, but too big for our python ~2MiB check. - file_content = b"a" * (2 * 1024 * 1024 + 1025) - with mock.patch( - "app.upload.views._get_upload_document_request_data", wraps=_get_upload_document_request_data - ) as mock_get_data: - response = _document_upload(client, url, file_content) + file_content = b"a" * file_size_in_bytes + response = _document_upload(client, url, file_content) assert response.status_code == 413 assert response.json == {"error": "Uploaded file exceeds file size limit"} - assert mock_get_data.call_count == 1 + assert mock_uploaded_file.called is application_code_call_expected def test_document_upload_no_document(client): @@ -324,7 +316,7 @@ def test_document_upload_csv_handling( "http://download.document-download-frontend-test", "/services/00000000-0000-0000-0000-000000000000", "/documents/ffffffff-ffff-ffff-ffff-ffffffffffff", - f'.{app.config["MIME_TYPES_TO_FILE_EXTENSIONS"][expected_mimetype]}', + f".{app.config['MIME_TYPES_TO_FILE_EXTENSIONS'][expected_mimetype]}", "?key=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", ] ), @@ -438,7 +430,7 @@ def test_document_upload_bad_is_csv_value(client): ), ) def test_get_upload_document_request_data_errors(app, data, expected_error): - with pytest.raises(BadRequest) as e: - _get_upload_document_request_data(data) + with pytest.raises(AntivirusAndMimeTypeCheckError) as e: + UploadedFile.from_request_json(data, service_id="foo") - assert e.value.description == expected_error + assert e.value.message == expected_error diff --git a/tests/utils/test_store.py b/tests/utils/test_store.py index fa1559d5..647b1871 100644 --- a/tests/utils/test_store.py +++ b/tests/utils/test_store.py @@ -1,14 +1,16 @@ import uuid -from datetime import date +from datetime import date, datetime from unittest import mock import botocore import pytest from botocore.exceptions import ClientError as BotoClientError +from freezegun import freeze_time from app.utils.store import ( DocumentBlocked, DocumentExpired, + DocumentNotFound, DocumentStore, DocumentStoreError, ) @@ -25,6 +27,7 @@ def mock_boto(mocker): def store(mock_boto): mock_boto.client.return_value.get_object.return_value = { "Body": mock.Mock(), + "Expiration": 'expiry-date="Fri, 01 May 2020 00:00:00 GMT", expiry-rule="custom-retention-1-weeks"', "ContentType": "application/pdf", "ContentLength": 100, "Metadata": {}, @@ -52,7 +55,7 @@ def blocked_document(mock_boto): @pytest.fixture -def expired_document(mock_boto): +def delete_markered_document(mock_boto): mock_boto.client.return_value.get_object_tagging.side_effect = botocore.exceptions.ClientError( { "Error": { @@ -66,20 +69,6 @@ def expired_document(mock_boto): ) -@pytest.fixture -def missing_document(mock_boto): - mock_boto.client.return_value.get_object_tagging.side_effect = botocore.exceptions.ClientError( - { - "Error": { - "Code": "NoSuchKey", - "Message": "The specified key does not exist.", - "Key": "object-key", - } - }, - "GetObjectTagging", - ) - - @pytest.fixture def store_with_email(mock_boto): mock_boto.client.return_value.get_object.return_value = { @@ -166,19 +155,83 @@ def test_check_for_blocked_document_raises_error(store, mock_boto, blocked_value } with pytest.raises(DocumentBlocked): - store.check_for_blocked_document("service-id", "doc-id") + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) -def test_check_for_blocked_document_raises_expired_error(store, expired_document): +def test_check_for_blocked_document_delete_marker_document_expired_error(store, delete_markered_document): with pytest.raises(DocumentExpired): - store.check_for_blocked_document("service-id", "doc-id") + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) -def test_check_for_blocked_document_reraises_other_boto_error(store, missing_document): +def test_check_for_blocked_document_missing_raises_document_not_found_error(store): + store.s3.get_object_tagging.side_effect = botocore.exceptions.ClientError( + { + "Error": { + "Code": "NoSuchKey", + "Message": "The specified key does not exist.", + "Key": "object-key", + } + }, + "GetObjectTagging", + ) + + with pytest.raises(DocumentNotFound): + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) + + +def test_check_for_blocked_document_random_error_propagated(store): + store.s3.get_object_tagging.side_effect = botocore.exceptions.ClientError( + { + "Error": { + "Code": "NotEnoughBananas", + } + }, + "GetObjectTagging", + ) + with pytest.raises(BotoClientError): - store.check_for_blocked_document("service-id", "doc-id") + store.check_for_blocked_document(store._get_document_tags("service-id", "doc-id")) + + +@pytest.mark.parametrize( + "expiration", + ( + 'expiry-date="Fri, 01 May 2020 00:00:00 GMT", expiry-rule="custom-retention-1-weeks"', + 'expiry-date="Sat, 02 May 2020 00:00:00 GMT", expiry-rule="custom-retention-3-weeks"', + ), +) +def test_check_for_expired_document_expired(store, expiration): + with freeze_time("2020-05-02 10:00:00"): + with pytest.raises(DocumentExpired): + store.check_for_expired_document( + { + "Expiration": expiration, + }, + {}, + ) + + +@pytest.mark.parametrize( + "expiration", + ( + 'expiry-date="Sun, 03 May 2020 00:00:00 GMT", expiry-rule="custom-retention-1-weeks"', + None, + ), +) +def test_check_for_expired_document_not_expired(store, expiration): + with freeze_time("2020-05-02 10:00:00"): + assert ( + store.check_for_expired_document( + { + "Expiration": expiration, + }, + {}, + ) + is None + ) +@freeze_time("2021-02-03T04:05:06") def test_put_document(store): ret = store.put( "service-id", mock.Mock(), mimetype="application/pdf", confirmation_email=None, retention_period=None @@ -197,9 +250,11 @@ def test_put_document(store): SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", Metadata={}, + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00", ) +@freeze_time("2021-02-03T04:05:06") def test_put_document_sends_hashed_recipient_email_to_s3_as_metadata_if_confirmation_email_present(store): ret = store.put("service-id", mock.Mock(), mimetype="application/pdf", confirmation_email="email@example.com") @@ -216,9 +271,11 @@ def test_put_document_sends_hashed_recipient_email_to_s3_as_metadata_if_confirma SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", Metadata={"hashed-recipient-email": mock.ANY}, + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00", ) +@freeze_time("2021-02-03T04:05:06") def test_put_document_tags_document_if_retention_period_set(store): ret = store.put("service-id", mock.Mock(), mimetype="application/pdf", retention_period="4 weeks") @@ -234,7 +291,7 @@ def test_put_document_tags_document_if_retention_period_set(store): Key=Matcher("document key", lambda x: x.startswith("service-id/") and len(x) == 11 + 36), SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", - Tagging="retention-period=4+weeks", + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00&retention-period=4+weeks", Metadata={}, ) @@ -249,6 +306,7 @@ def test_put_document_tags_document_if_retention_period_set(store): (r"\u2705.pdf", r"\\u2705.pdf"), ), ) +@freeze_time("2021-02-03T04:05:06") def test_put_document_records_filename_if_set(store, filename, expected_filename_for_s3): ret = store.put("service-id", mock.Mock(), mimetype="application/pdf", filename=filename) @@ -265,16 +323,18 @@ def test_put_document_records_filename_if_set(store, filename, expected_filename SSECustomerKey=ret["encryption_key"], SSECustomerAlgorithm="AES256", Metadata={"filename": expected_filename_for_s3}, + Tagging="created-at=2021-02-03T04%3A05%3A06%2B00%3A00", ) def test_get_document(store): - assert store.get("service-id", "document-id", bytes(32)) == { - "body": mock.ANY, - "mimetype": "application/pdf", - "size": 100, - "metadata": {}, - } + with freeze_time("2020-04-28 10:00:00"): + assert store.get("service-id", "document-id", bytes(32)) == { + "body": mock.ANY, + "mimetype": "application/pdf", + "size": 100, + "metadata": {}, + } store.s3.get_object.assert_called_once_with( Bucket="test-bucket", @@ -295,21 +355,19 @@ def test_get_document_with_boto_error(store): def test_get_blocked_document(store, blocked_document): - with pytest.raises(DocumentStoreError) as e: + with pytest.raises(DocumentBlocked): store.get("service-id", "document-id", bytes(32)) - assert str(e.value) == "Access to the document has been blocked" - -def test_get_expired_document(store, expired_document): - with pytest.raises(DocumentStoreError) as e: +def test_get_delete_markered_document(store, delete_markered_document): + with pytest.raises(DocumentExpired): store.get("service-id", "document-id", bytes(32)) - assert str(e.value) == "The document is no longer available" - def test_get_document_metadata_when_document_is_in_s3(store): - metadata = store.get_document_metadata("service-id", "document-id", "0f0f0f") + with freeze_time("2020-04-28 10:00:00"): + metadata = store.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { "mimetype": "text/plain", "confirm_email": False, @@ -319,8 +377,22 @@ def test_get_document_metadata_when_document_is_in_s3(store): } +def test_get_document_metadata_when_document_is_in_s3_but_missing_expiration(store): + del store.s3.head_object.return_value["Expiration"] + metadata = store.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { + "mimetype": "text/plain", + "confirm_email": False, + "size": 100, + "available_until": None, + "filename": None, + } + + def test_get_document_metadata_when_document_is_in_s3_with_hashed_email(store_with_email): - metadata = store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") + with freeze_time("2020-04-28 10:00:00"): + metadata = store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { "mimetype": "text/plain", "confirm_email": True, @@ -331,7 +403,9 @@ def test_get_document_metadata_when_document_is_in_s3_with_hashed_email(store_wi def test_get_document_metadata_when_document_is_in_s3_with_filename(store_with_filename): - metadata = store_with_filename.get_document_metadata("service-id", "document-id", "0f0f0f") + with freeze_time("2020-04-28 10:00:00"): + metadata = store_with_filename.get_document_metadata("service-id", "document-id", "0f0f0f") + assert metadata == { "mimetype": "text/plain", "confirm_email": False, @@ -341,12 +415,19 @@ def test_get_document_metadata_when_document_is_in_s3_with_filename(store_with_f } +def test_get_document_metadata_when_document_is_in_s3_but_expired(store): + with pytest.raises(DocumentExpired): + with freeze_time("2020-05-12 10:00:00"): + store.get_document_metadata("service-id", "document-id", "0f0f0f") + + def test_get_document_metadata_when_document_is_not_in_s3(store): store.s3.head_object = mock.Mock( side_effect=BotoClientError({"Error": {"Code": "404", "Message": "Not Found"}}, "HeadObject") ) - assert store.get_document_metadata("service-id", "document-id", "0f0f0f") is None + with pytest.raises(DocumentNotFound): + store.get_document_metadata("service-id", "document-id", "0f0f0f") def test_get_document_metadata_with_unexpected_boto_error(store): @@ -359,19 +440,28 @@ def test_get_document_metadata_with_unexpected_boto_error(store): def test_get_document_metadata_with_blocked_document(store_with_email, blocked_document): - assert store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") is None + with pytest.raises(DocumentBlocked): + store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") -def test_get_document_metadata_with_expired_document(store_with_email, expired_document): - assert store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") is None +def test_get_document_metadata_with_delete_markered_document(store_with_email, delete_markered_document): + with pytest.raises(DocumentExpired): + store_with_email.get_document_metadata("service-id", "document-id", "0f0f0f") -def test_authenticate_document_when_missing(store): +def test_get_document_metadata_when_missing(store): store.s3.head_object = mock.Mock( side_effect=BotoClientError({"Error": {"Code": "404", "Message": "Not Found"}}, "HeadObject") ) - assert store.get_document_metadata("service-id", "document-id", "0f0f0f") is None + with pytest.raises(DocumentNotFound): + store.get_document_metadata("service-id", "document-id", "0f0f0f") + + +def test_authenticate_document_when_missing(store): + store.s3.head_object = mock.Mock( + side_effect=BotoClientError({"Error": {"Code": "404", "Message": "Not Found"}}, "HeadObject") + ) assert store.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False @@ -384,7 +474,13 @@ def test_authenticate_document_when_missing(store): ), ) def test_authenticate_document_email_address_check(store_with_email, email_address, expected_result): - assert store_with_email.authenticate("service-id", "document-id", b"0f0f0f", email_address) is expected_result + with freeze_time("2020-04-28 10:00:00"): + assert store_with_email.authenticate("service-id", "document-id", b"0f0f0f", email_address) is expected_result + + +def test_authenticate_document_expired(store_with_email): + with freeze_time("2020-05-28 10:00:00"): + assert store_with_email.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False def test_authenticate_fails_if_document_does_not_have_hash(store): @@ -409,32 +505,128 @@ def test_authenticate_with_blocked_document(store, blocked_document): assert store.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False -def test_authenticate_with_expired_document(store, expired_document): +def test_authenticate_with_delete_markered_document(store, delete_markered_document): assert store.authenticate("service-id", "document-id", b"0f0f0f", "test@notify.example") is False @pytest.mark.parametrize( - "raw_expiry_rule,expected_date", + "s3_response,tags,expected_date", [ # An expiry date in winter time (GMT) - date in GMT ISO 8601 format - ('expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', date(2022, 10, 30)), + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + {}, + date(2022, 10, 30), + ), # An expiry date in summer time (BST) - still sent by AWS in GMT ISO 8601 format. - ('expiry-date="Wed, 26 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', date(2022, 10, 25)), + ( + {"Expiration": 'expiry-date="Wed, 26 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + {}, + date(2022, 10, 25), + ), # Swap the order of the key-value pairs - ('rule-id="remove-old-documents", expiry-date="Mon, 31 Oct 2022 00:00:00 GMT"', date(2022, 10, 30)), + ( + {"Expiration": 'rule-id="remove-old-documents", expiry-date="Mon, 31 Oct 2022 00:00:00 GMT"'}, + {}, + date(2022, 10, 30), + ), # Expiry date should handle month borders just fine - ('rule-id="remove-old-documents", expiry-date="Tue, 01 Nov 2022 00:00:00 GMT"', date(2022, 10, 31)), + ( + {"Expiration": 'rule-id="remove-old-documents", expiry-date="Tue, 01 Nov 2022 00:00:00 GMT"'}, + {}, + date(2022, 10, 31), + ), + # tag-based expiry is earlier so should be preferred + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 10, 26), + ), + # retention-period tag corrupt, preventing tag use & causing fallback to Expiration header + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"'}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2.0 apples", + }, + date(2022, 10, 30), + ), + # created-at tag corrupt, preventing tag use & causing fallback to LastModified header used with + # retention-period tag + ( + { + "Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', + "LastModified": datetime(2022, 9, 12, 12, 34, 56), + }, + { + "created-at": "20222-10-122T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 9, 26), + ), + # created-at tag missing, preventing tag use & causing fallback to LastModified header used with + # retention-period tag + ( + { + "Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 GMT", rule-id="remove-old-documents"', + "LastModified": datetime(2022, 9, 12, 12, 34, 56), + }, + { + "retention-period": "2 weeks", + }, + date(2022, 9, 26), + ), + # Expiration header is non-GMT so ignored in favour of (later) tag-based expiry date + ( + {"Expiration": 'expiry-date="Mon, 31 Oct 2022 00:00:00 EST", rule-id="remove-old-documents"'}, + { + "created-at": "2022-11-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 11, 26), + ), + # Expiration header is missing expiry-date so ignored in favour of tag-based expiry date, + # though not disregarding LastModified + ( + { + "Expiration": 'rule-id="remove-old-documents"', + "LastModified": datetime(2022, 9, 12, 12, 34, 56), + }, + { + "retention-period": "30 weeks", + }, + date(2023, 4, 10), + ), + # Expiration header missing, tags used + ( + {}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 10, 26), + ), + # Expiration header unparseable, tags used + ( + {"Expiration": "blah"}, + { + "created-at": "2022-10-12T12:34:56+0000", + "retention-period": "2 weeks", + }, + date(2022, 10, 26), + ), + # Both methods unusable + ( + {"Expiration": "blah"}, + { + "created-at": "2022-10-12T12:34:56+0000", + }, + None, + ), ], ) -def test___convert_expiry_date_to_date_object(raw_expiry_rule, expected_date): - result = DocumentStore._convert_expiry_date_to_date_object(raw_expiry_rule) - assert result == expected_date - - -def test_convert_expiry_date_to_date_object_logs_on_non_gmt_expiration(app, caplog): - DocumentStore._convert_expiry_date_to_date_object( - 'expiry-date="Mon, 31 Oct 2022 00:00:00 EST", rule-id="remove-old-documents"' - ) - - assert len(caplog.records) == 1 - assert caplog.records[0].message == "AWS S3 object expiration has unhandled timezone: EST" +def test__get_effective_expiry_date(s3_response, tags, expected_date): + assert DocumentStore._get_effective_expiry_date(s3_response, tags) == expected_date diff --git a/uv.lock b/uv.lock new file mode 100644 index 00000000..9431a635 --- /dev/null +++ b/uv.lock @@ -0,0 +1,3 @@ +version = 1 +revision = 3 +requires-python = ">=3.11"