From e42b3b654f0daa1184124f99af31fe21822c11f6 Mon Sep 17 00:00:00 2001 From: Thomas Howe Date: Thu, 11 Dec 2025 13:25:22 -0500 Subject: [PATCH 1/2] Add S3 storage option to diet link with presigned URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable dialog bodies to be stored in S3 instead of being removed or posted to an HTTP endpoint. The S3 option generates presigned URLs for secure access with configurable expiration (optional, defaults to 1 hour). New configuration options: - s3_bucket: S3 bucket name for storing dialog bodies - s3_path: Optional path prefix within the bucket - aws_access_key_id/aws_secret_access_key: AWS credentials - aws_region: AWS region (default: us-east-1) - presigned_url_expiration: URL expiration in seconds (optional) S3 takes precedence over post_media_to_url when both are configured. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- server/links/diet/README.md | 61 +++++++++- server/links/diet/__init__.py | 107 ++++++++++++++++- server/links/diet/test_diet.py | 214 ++++++++++++++++++++++++++++++++- 3 files changed, 373 insertions(+), 9 deletions(-) diff --git a/server/links/diet/README.md b/server/links/diet/README.md index dc73c39..b37e4ad 100644 --- a/server/links/diet/README.md +++ b/server/links/diet/README.md @@ -5,7 +5,8 @@ The Diet link is a specialized plugin that helps reduce the size and content of ## Features - Selective removal of dialog body content -- Optional media redirection to external storage +- Optional media redirection to external storage (HTTP endpoint or S3) +- S3 storage with presigned URL generation for secure access - Removal of analysis data - Filtering of attachments by MIME type - Removal of system prompts to prevent LLM instruction injection @@ -20,6 +21,13 @@ default_options = { "remove_analysis": False, # Remove all analysis data "remove_attachment_types": [], # List of attachment types to remove (e.g., ["image/jpeg", "audio/mp3"]) "remove_system_prompts": False, # Remove system_prompt keys to prevent LLM instruction insertion + # S3 storage options for dialog bodies + "s3_bucket": "", # S3 bucket name for storing dialog bodies + "s3_path": "", # Optional path prefix within the bucket + "aws_access_key_id": "", # AWS access key ID + "aws_secret_access_key": "", # AWS secret access key + "aws_region": "us-east-1", # AWS region (default: us-east-1) + "presigned_url_expiration": None, # Presigned URL expiration in seconds (None = default 1 hour) } ``` @@ -31,6 +39,15 @@ default_options = { - `remove_attachment_types`: List of MIME types to remove from attachments - `remove_system_prompts`: Whether to remove system_prompt keys to prevent LLM instruction injection +### S3 Storage Options + +- `s3_bucket`: The S3 bucket name where dialog bodies will be stored +- `s3_path`: Optional path prefix within the bucket (e.g., "dialogs/processed") +- `aws_access_key_id`: AWS access key ID for authentication +- `aws_secret_access_key`: AWS secret access key for authentication +- `aws_region`: AWS region where the bucket is located (default: "us-east-1") +- `presigned_url_expiration`: Expiration time in seconds for presigned URLs (optional, defaults to 3600 seconds / 1 hour) + ## Usage The link processes vCons by: @@ -42,9 +59,39 @@ The link processes vCons by: - Removing system prompts if specified 3. Storing the modified vCon back in Redis -## Media Redirection +## Media Storage Options + +The diet link supports two methods for storing dialog bodies externally: + +### S3 Storage (Recommended) + +When `s3_bucket` is configured, the link will: +1. Upload dialog body content to the specified S3 bucket +2. Generate a presigned URL for secure access +3. Replace the body content with the presigned URL +4. Set the body_type to "url" +5. If the upload fails, the body content will be removed + +**S3 takes precedence over HTTP endpoint** - if both `s3_bucket` and `post_media_to_url` are configured, S3 will be used. + +Example S3 configuration: +```python +{ + "remove_dialog_body": True, + "s3_bucket": "my-vcon-storage", + "s3_path": "dialogs/archived", + "aws_access_key_id": "AKIAXXXXXXXX", + "aws_secret_access_key": "xxxxxxxxxxxxx", + "aws_region": "us-west-2", + "presigned_url_expiration": 86400, # 24 hours +} +``` + +The S3 key structure is: `{s3_path}/{vcon_uuid}/{dialog_id}_{unique_id}.txt` + +### HTTP Endpoint Storage -When `post_media_to_url` is configured, the link will: +When `post_media_to_url` is configured (and `s3_bucket` is not), the link will: 1. Post the media content to the specified URL 2. Replace the body content with the URL to the stored content 3. Set the body_type to "url" @@ -59,7 +106,8 @@ When `post_media_to_url` is configured, the link will: ## Dependencies - Redis for vCon storage -- Requests library for media redirection +- Requests library for HTTP media redirection +- boto3 library for S3 storage - Custom utilities: - logging_utils @@ -67,4 +115,7 @@ When `post_media_to_url` is configured, the link will: - Redis connection must be configured - Appropriate permissions for vCon access and storage -- If using media redirection, a valid endpoint URL must be provided \ No newline at end of file +- If using HTTP media redirection, a valid endpoint URL must be provided +- If using S3 storage: + - Valid AWS credentials with write access to the specified bucket + - The bucket must exist and be accessible \ No newline at end of file diff --git a/server/links/diet/__init__.py b/server/links/diet/__init__.py index a064ca8..099bd10 100644 --- a/server/links/diet/__init__.py +++ b/server/links/diet/__init__.py @@ -2,6 +2,9 @@ from lib.logging_utils import init_logger import json import requests +import uuid +import boto3 +from botocore.exceptions import ClientError from typing import Dict, List, Any, Optional logger = init_logger(__name__) @@ -15,8 +18,93 @@ "remove_analysis": False, # Remove all analysis data "remove_attachment_types": [], # List of attachment types to remove (e.g., ["image/jpeg", "audio/mp3"]) "remove_system_prompts": False, # Remove system_prompt keys to prevent LLM instruction insertion + # S3 storage options for dialog bodies + "s3_bucket": "", # S3 bucket name for storing dialog bodies + "s3_path": "", # Optional path prefix within the bucket + "aws_access_key_id": "", # AWS access key ID + "aws_secret_access_key": "", # AWS secret access key + "aws_region": "us-east-1", # AWS region (default: us-east-1) + "presigned_url_expiration": None, # Presigned URL expiration in seconds (None = no expiration/default 1 hour) } + +def _get_s3_client(options: Dict[str, Any]): + """Create and return an S3 client with the provided credentials.""" + return boto3.client( + "s3", + aws_access_key_id=options["aws_access_key_id"], + aws_secret_access_key=options["aws_secret_access_key"], + region_name=options.get("aws_region", "us-east-1"), + ) + + +def _upload_to_s3_and_get_presigned_url( + content: str, + vcon_uuid: str, + dialog_id: str, + options: Dict[str, Any] +) -> Optional[str]: + """ + Upload dialog body content to S3 and return a presigned URL. + + Args: + content: The dialog body content to upload + vcon_uuid: The vCon UUID + dialog_id: The dialog ID + options: Configuration options including S3 credentials and bucket info + + Returns: + Presigned URL to access the uploaded content, or None if upload fails + """ + try: + s3 = _get_s3_client(options) + + # Generate a unique key for this dialog body + unique_id = str(uuid.uuid4()) + key = f"{dialog_id}_{unique_id}.txt" if dialog_id else f"{unique_id}.txt" + + # Add vcon_uuid as a directory level + key = f"{vcon_uuid}/{key}" + + # Add optional path prefix + if options.get("s3_path"): + key = f"{options['s3_path']}/{key}" + + bucket = options["s3_bucket"] + + # Upload the content + s3.put_object( + Bucket=bucket, + Key=key, + Body=content.encode("utf-8") if isinstance(content, str) else content, + ContentType="text/plain", + ) + + logger.info(f"Successfully uploaded dialog body to s3://{bucket}/{key}") + + # Generate presigned URL + expiration = options.get("presigned_url_expiration") + if expiration is None: + # Default to 1 hour (3600 seconds) if not specified + expiration = 3600 + + presigned_url = s3.generate_presigned_url( + "get_object", + Params={"Bucket": bucket, "Key": key}, + ExpiresIn=expiration, + ) + + logger.info(f"Generated presigned URL with expiration {expiration}s") + return presigned_url + + except ClientError as e: + logger.error(f"S3 client error uploading dialog body: {e}") + return None + except Exception as e: + logger.error(f"Exception uploading dialog body to S3: {e}") + return None + + def run(vcon_uuid, link_name, opts=default_options): logger.info("Starting diet::run") @@ -41,12 +129,27 @@ def run(vcon_uuid, link_name, opts=default_options): logger.info("diet::got dialog") if options["remove_dialog_body"] and "body" in dialog: logger.info("diet::remove_dialog_body AND body") - if options["post_media_to_url"] and dialog.get("body"): + dialog_body = dialog.get("body") + dialog_id = dialog.get("id", "") + + # Check if S3 storage is configured + if options.get("s3_bucket") and dialog_body: + logger.info("diet::uploading to S3") + presigned_url = _upload_to_s3_and_get_presigned_url( + dialog_body, vcon_uuid, dialog_id, options + ) + if presigned_url: + dialog["body"] = presigned_url + dialog["body_type"] = "url" + else: + logger.error("Failed to upload to S3, removing body") + dialog["body"] = "" + elif options["post_media_to_url"] and dialog_body: try: # Post the body content to the specified URL response = requests.post( options["post_media_to_url"], - json={"content": dialog["body"], "vcon_uuid": vcon_uuid, "dialog_id": dialog.get("id", "")} + json={"content": dialog_body, "vcon_uuid": vcon_uuid, "dialog_id": dialog_id} ) if response.status_code == 200: # Replace body with the URL to the stored content diff --git a/server/links/diet/test_diet.py b/server/links/diet/test_diet.py index 61eeeaf..6327b9e 100644 --- a/server/links/diet/test_diet.py +++ b/server/links/diet/test_diet.py @@ -1,8 +1,9 @@ import json import pytest from unittest.mock import patch, MagicMock +from botocore.exceptions import ClientError -from server.links.diet import run, default_options, remove_system_prompts_recursive +from server.links.diet import run, default_options, remove_system_prompts_recursive, _upload_to_s3_and_get_presigned_url @pytest.fixture def sample_vcon(): @@ -269,4 +270,213 @@ def test_combined_options(mock_redis, sample_vcon): assert "analysis" not in saved_vcon assert len(saved_vcon["attachments"]) == 2 assert saved_vcon["attachments"][0]["id"] == "att2" - assert "system_prompt" not in saved_vcon["attachments"][1]["metadata"] \ No newline at end of file + assert "system_prompt" not in saved_vcon["attachments"][1]["metadata"] + + +@pytest.fixture +def s3_options(): + return { + "remove_dialog_body": True, + "s3_bucket": "test-bucket", + "s3_path": "dialogs", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "aws_region": "us-west-2", + "presigned_url_expiration": 7200, + } + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_and_get_presigned_url(mock_boto3): + # Test the S3 upload helper function + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://test-bucket.s3.amazonaws.com/presigned-url" + + options = { + "s3_bucket": "test-bucket", + "s3_path": "dialogs", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "aws_region": "us-west-2", + "presigned_url_expiration": 7200, + } + + result = _upload_to_s3_and_get_presigned_url( + "test content", + "vcon-uuid-123", + "dialog1", + options + ) + + # Verify S3 client was created with correct credentials + mock_boto3.client.assert_called_once_with( + "s3", + aws_access_key_id="test-key-id", + aws_secret_access_key="test-secret-key", + region_name="us-west-2", + ) + + # Verify put_object was called + mock_s3.put_object.assert_called_once() + call_kwargs = mock_s3.put_object.call_args[1] + assert call_kwargs["Bucket"] == "test-bucket" + assert "dialogs/vcon-uuid-123/dialog1_" in call_kwargs["Key"] + assert call_kwargs["ContentType"] == "text/plain" + + # Verify presigned URL was generated with correct expiration + mock_s3.generate_presigned_url.assert_called_once() + presign_call = mock_s3.generate_presigned_url.call_args + assert presign_call[0][0] == "get_object" + assert presign_call[1]["ExpiresIn"] == 7200 + + assert result == "https://test-bucket.s3.amazonaws.com/presigned-url" + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_default_expiration(mock_boto3): + # Test that default expiration (3600) is used when not specified + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://presigned-url" + + options = { + "s3_bucket": "test-bucket", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "presigned_url_expiration": None, # Not specified + } + + _upload_to_s3_and_get_presigned_url("content", "vcon-123", "dialog1", options) + + # Verify default expiration of 3600 seconds was used + presign_call = mock_s3.generate_presigned_url.call_args + assert presign_call[1]["ExpiresIn"] == 3600 + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_no_path_prefix(mock_boto3): + # Test S3 upload without path prefix + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://presigned-url" + + options = { + "s3_bucket": "test-bucket", + "s3_path": "", # No path prefix + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + } + + _upload_to_s3_and_get_presigned_url("content", "vcon-123", "dialog1", options) + + # Verify key doesn't have prefix + call_kwargs = mock_s3.put_object.call_args[1] + assert call_kwargs["Key"].startswith("vcon-123/dialog1_") + assert not call_kwargs["Key"].startswith("/") + + +@patch('server.links.diet.boto3') +def test_upload_to_s3_failure(mock_boto3): + # Test handling of S3 upload failure + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.put_object.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "Access Denied"}}, + "PutObject" + ) + + options = { + "s3_bucket": "test-bucket", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + } + + result = _upload_to_s3_and_get_presigned_url("content", "vcon-123", "dialog1", options) + + assert result is None + + +@patch('server.links.diet.redis') +@patch('server.links.diet.boto3') +def test_run_with_s3_storage(mock_boto3, mock_redis, sample_vcon, s3_options): + # Test the full run function with S3 storage + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://test-bucket.s3.amazonaws.com/presigned-url" + + run("test-vcon-123", "diet", s3_options) + + # Verify JSON.SET was called + mock_json.set.assert_called_once() + args, kwargs = mock_json.set.call_args + saved_vcon = args[2] + + # Check that dialog bodies were replaced with presigned URLs + assert saved_vcon["dialog"][0]["body"] == "https://test-bucket.s3.amazonaws.com/presigned-url" + assert saved_vcon["dialog"][0]["body_type"] == "url" + assert saved_vcon["dialog"][1]["body"] == "https://test-bucket.s3.amazonaws.com/presigned-url" + assert saved_vcon["dialog"][1]["body_type"] == "url" + + # Verify S3 was called for each dialog + assert mock_s3.put_object.call_count == 2 + + +@patch('server.links.diet.redis') +@patch('server.links.diet.boto3') +def test_run_with_s3_storage_failure_removes_body(mock_boto3, mock_redis, sample_vcon, s3_options): + # Test that body is removed when S3 upload fails + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.put_object.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "Access Denied"}}, + "PutObject" + ) + + run("test-vcon-123", "diet", s3_options) + + # Verify JSON.SET was called + mock_json.set.assert_called_once() + args, kwargs = mock_json.set.call_args + saved_vcon = args[2] + + # Check that dialog bodies were removed due to failure + assert saved_vcon["dialog"][0]["body"] == "" + assert saved_vcon["dialog"][1]["body"] == "" + + +@patch('server.links.diet.redis') +@patch('server.links.diet.boto3') +def test_s3_takes_precedence_over_post_url(mock_boto3, mock_redis, sample_vcon): + # Test that S3 storage takes precedence over post_media_to_url + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + mock_s3 = MagicMock() + mock_boto3.client.return_value = mock_s3 + mock_s3.generate_presigned_url.return_value = "https://s3-presigned-url" + + options = { + "remove_dialog_body": True, + "s3_bucket": "test-bucket", + "aws_access_key_id": "test-key-id", + "aws_secret_access_key": "test-secret-key", + "post_media_to_url": "https://should-not-be-called.com", # This should be ignored + } + + with patch('server.links.diet.requests.post') as mock_post: + run("test-vcon-123", "diet", options) + # post_media_to_url should not be called when S3 is configured + mock_post.assert_not_called() + + # Verify S3 was used instead + assert mock_s3.put_object.call_count == 2 \ No newline at end of file From 87809477db393c4992e9135fc05e8275c1f6fcd6 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Fri, 12 Dec 2025 18:00:44 +0000 Subject: [PATCH 2/2] feat: Redact sensitive options in diet link logs Co-authored-by: thomas.howe --- server/links/diet/__init__.py | 22 +++++++++++++++++++++- server/links/diet/test_diet.py | 22 +++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/server/links/diet/__init__.py b/server/links/diet/__init__.py index 099bd10..4b1ea8f 100644 --- a/server/links/diet/__init__.py +++ b/server/links/diet/__init__.py @@ -10,6 +10,26 @@ logger = init_logger(__name__) logger.info("MDO THIS SHOULD PRINT") +_REDACTED = "[REDACTED]" + + +def _redact_option_value(key: str, value: Any) -> Any: + """ + Redact sensitive option values before logging. + + This prevents leaking secrets (for example AWS credentials) into logs. + """ + key_l = (key or "").lower() + if ( + key_l == "aws_secret_access_key" + or "secret" in key_l + or "password" in key_l + or "token" in key_l + or key_l.endswith("_secret") + ): + return _REDACTED + return value + # Default options that control which elements to remove default_options = { @@ -112,7 +132,7 @@ def run(vcon_uuid, link_name, opts=default_options): options = {**default_options, **opts} for key, value in options.items(): - logger.info(f"diet::{key}: {value}") + logger.info("diet::%s: %s", key, _redact_option_value(key, value)) # Load vCon from Redis using JSON.GET vcon = redis.json().get(f"vcon:{vcon_uuid}") diff --git a/server/links/diet/test_diet.py b/server/links/diet/test_diet.py index 6327b9e..1a18e90 100644 --- a/server/links/diet/test_diet.py +++ b/server/links/diet/test_diet.py @@ -1,5 +1,6 @@ import json import pytest +import logging from unittest.mock import patch, MagicMock from botocore.exceptions import ClientError @@ -479,4 +480,23 @@ def test_s3_takes_precedence_over_post_url(mock_boto3, mock_redis, sample_vcon): mock_post.assert_not_called() # Verify S3 was used instead - assert mock_s3.put_object.call_count == 2 \ No newline at end of file + assert mock_s3.put_object.call_count == 2 + + +@patch('server.links.diet.redis') +def test_options_logging_redacts_aws_secret_access_key(mock_redis, sample_vcon, caplog): + # Ensure secrets are not written to logs + mock_json = MagicMock() + mock_redis.json.return_value = mock_json + mock_json.get.return_value = sample_vcon + + secret = "test-secret-key" + caplog.set_level(logging.INFO) + + run("test-vcon-123", "diet", { + "remove_dialog_body": False, + "aws_secret_access_key": secret, + }) + + assert secret not in caplog.text + assert "diet::aws_secret_access_key: [REDACTED]" in caplog.text \ No newline at end of file