85 changes: 65 additions & 20 deletions .github/scripts/publish_script.py
@@ -12,6 +12,7 @@
SECRET_ACCESS_KEY = os.environ["R2_SECRET_ACCESS_KEY"]
PROD_BUCKET = os.environ["R2_PRODUCTION_BUCKET"]
STAGING_BUCKET = os.environ["R2_STAGING_BUCKET"]
INTERNAL_BUCKET = os.environ["R2_INTERNAL_BUCKET"]
ENDPOINT_URL = f"https://{ACCOUNT_ID}.r2.cloudflarestorage.com"
MANIFEST_FILE = "manifest.json"
DATASETS_DOC_PATH = "docs/source/datasets.md"
@@ -134,23 +135,32 @@ def handle_deletions(manifest_data: list[dict[str, Any]]) -> bool:
processed_deletion = False

for dataset in manifest_data:
# Get the bucket type for this dataset
bucket_type = dataset.get("bucket", "production")
target_bucket = INTERNAL_BUCKET if bucket_type == "internal" else PROD_BUCKET
bucket_name = "internal" if bucket_type == "internal" else "production"

if dataset.get("status") == "pending-deletion":
processed_deletion = True
print(f"Found dataset marked for full deletion: {dataset['fileName']}")
print(
f"Found dataset marked for full deletion: {dataset['fileName']} from {bucket_name} bucket"
)
for entry in dataset.get("history", []):
if "r2_object_key" in entry:
objects_to_delete_from_r2.append({"Key": entry["r2_object_key"]})
objects_to_delete_from_r2.append(
{"Key": entry["r2_object_key"], "Bucket": target_bucket}
)
else:
versions_to_keep = []
for entry in dataset.get("history", []):
if entry.get("status") == "pending-deletion":
processed_deletion = True
print(
f"Found version marked for deletion: {dataset['fileName']} v{entry['version']}"
f"Found version marked for deletion: {dataset['fileName']} v{entry['version']} from {bucket_name} bucket"
)
if "r2_object_key" in entry:
objects_to_delete_from_r2.append(
{"Key": entry["r2_object_key"]}
{"Key": entry["r2_object_key"], "Bucket": target_bucket}
)
else:
versions_to_keep.append(entry)
@@ -162,18 +172,44 @@ def handle_deletions(manifest_data: list[dict[str, Any]]) -> bool:
return False

if objects_to_delete_from_r2:
print(
f"\nDeleting {len(objects_to_delete_from_r2)} objects from production R2 bucket..."
)
for i in range(0, len(objects_to_delete_from_r2), 1000):
chunk: Any = objects_to_delete_from_r2[i : i + 1000]
response = client.delete_objects(
Bucket=PROD_BUCKET, Delete={"Objects": chunk, "Quiet": True}
# Group objects by bucket for deletion
prod_objects = [
obj for obj in objects_to_delete_from_r2 if obj["Bucket"] == PROD_BUCKET
]
internal_objects = [
obj for obj in objects_to_delete_from_r2 if obj["Bucket"] == INTERNAL_BUCKET
]

if prod_objects:
print(
f"\nDeleting {len(prod_objects)} objects from production R2 bucket..."
)
for i in range(0, len(prod_objects), 1000):
chunk: Any = prod_objects[i : i + 1000]
objects_only = [{"Key": obj["Key"]} for obj in chunk]
response = client.delete_objects(
Bucket=PROD_BUCKET, Delete={"Objects": objects_only, "Quiet": True}
)
if response.get("Errors"):
print(" ❌ ERROR during batch deletion:", response["Errors"])
exit(1)
print("✅ Successfully deleted objects from production R2 bucket.")

if internal_objects:
print(
f"\nDeleting {len(internal_objects)} objects from internal R2 bucket..."
)
if response.get("Errors"):
print(" ❌ ERROR during batch deletion:", response["Errors"])
exit(1)
print("✅ Successfully deleted objects from R2.")
for i in range(0, len(internal_objects), 1000):
chunk: Any = internal_objects[i : i + 1000]
objects_only = [{"Key": obj["Key"]} for obj in chunk]
response = client.delete_objects(
Bucket=INTERNAL_BUCKET,
Delete={"Objects": objects_only, "Quiet": True},
)
if response.get("Errors"):
print(" ❌ ERROR during batch deletion:", response["Errors"])
exit(1)
print("✅ Successfully deleted objects from internal R2 bucket.")

finalize_manifest(datasets_to_keep, "ci: Finalize manifest after data deletion")
return True
@@ -186,6 +222,11 @@ def handle_publications(manifest_data: list[dict[str, Any]]) -> bool:
"""
print("\n--- Phase 2: Checking for pending publications ---")
for dataset in manifest_data:
# Get the bucket type for this dataset
bucket_type = dataset.get("bucket", "production")
target_bucket = INTERNAL_BUCKET if bucket_type == "internal" else PROD_BUCKET
bucket_name = "internal" if bucket_type == "internal" else "production"

for i, entry in enumerate(dataset["history"]):
if entry.get("commit") == "pending-merge":
commit_details = get_commit_details()
@@ -197,32 +238,36 @@ def handle_publications(manifest_data: list[dict[str, Any]]) -> bool:
if "staging_key" in entry and entry["staging_key"]:
staging_key = entry.pop("staging_key")
final_key = entry["r2_object_key"]
print(f"Publishing: {dataset['fileName']} v{entry['version']}")
print(
f"Publishing: {dataset['fileName']} v{entry['version']} to {bucket_name} bucket"
)
print(f" Description: {entry['description']}")
try:
copy_source: Any = {
"Bucket": STAGING_BUCKET,
"Key": staging_key,
}
client.copy_object(
CopySource=copy_source, Bucket=PROD_BUCKET, Key=final_key
CopySource=copy_source, Bucket=target_bucket, Key=final_key
)
print(
f" ✅ Server-side copy to {bucket_name} bucket successful."
)
print(" ✅ Server-side copy successful.")
client.delete_object(Bucket=STAGING_BUCKET, Key=staging_key)
print(" ✅ Staging object deleted.")
except ClientError as e:
print(f" ❌ ERROR: Could not process object. Reason: {e}")
exit(1)
else:
print(
f"Finalizing rollback: {dataset['fileName']} v{entry['version']}"
f"Finalizing rollback: {dataset['fileName']} v{entry['version']} in {bucket_name} bucket"
)
print(f" Description: {entry['description']}")

dataset["history"][i] = entry
finalize_manifest(
manifest_data,
f"ci: Publish {dataset['fileName']} {entry['version']}",
f"ci: Publish {dataset['fileName']} {entry['version']} to {bucket_name}",
)
return True # Process only one publication per run

1 change: 1 addition & 0 deletions .github/workflows/publish.yml
@@ -41,3 +41,4 @@ jobs:
R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
R2_PRODUCTION_BUCKET: ${{ vars.R2_PRODUCTION_BUCKET }} # Use repo variable
R2_STAGING_BUCKET: ${{ vars.R2_STAGING_BUCKET }} # Use repo variable
R2_INTERNAL_BUCKET: ${{ vars.R2_INTERNAL_BUCKET }} # Use repo variable
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.8.19
rev: 0.8.22
hooks:
# Dependency management
- id: uv-lock
@@ -21,6 +21,11 @@ repos:
args: ["--maxkb=1024"]
- id: debug-statements

- repo: https://github.com/asottile/pyupgrade
rev: v3.20.0
hooks:
- id: pyupgrade

# Python Linting & Formatting with Ruff
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.1
46 changes: 39 additions & 7 deletions README.md
@@ -59,19 +59,41 @@ flowchart TD

## Features

- **Multi-Bucket Support:** Choose between production (public) and internal (team-only) buckets for different data access levels.
- **CI/CD-Driven Publishing:** Data publication is transactional and automated via GitHub Actions after a pull request is merged, preventing inconsistent states.
- **Enhanced Security:** Production credentials are never stored on developer machines; they are only used by the trusted GitHub Actions runner.
- **Interactive TUI:** Run `datamanager` with no arguments for a user-friendly, menu-driven interface.
- **Data Lifecycle Management:** A full suite of commands for rollback, deletion, and pruning, all gated by the same secure PR workflow.
- **Integrity Verification:** All downloaded files are automatically checked against their SHA256 hash from the manifest.
- **Credential Verification:** A detailed verify command reports read/write/delete permissions for both production and staging buckets.
- **Credential Verification:** A detailed verify command reports read/write/delete permissions for production, staging, and internal buckets.

## Bucket Types

The system supports two types of data storage buckets:

### Production Bucket (Public)

- **Access Level:** Publicly accessible
- **Use Case:** Data intended for public consumption
- **Default:** Used by default for all operations
- **Permissions:** Requires appropriate public access settings in Cloudflare R2

### Internal Bucket (Team-Only)

- **Access Level:** Team members only
- **Use Case:** Sensitive or internal data not for public consumption
- **Security:** Private access, team credentials required
- **Usage:** Specify `--bucket internal` when using commands

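In the manifest, each dataset carries an optional `"bucket"` field; the publish script falls back to production when it is absent. A minimal sketch of that lookup, mirroring the logic in `publish_script.py` and assuming the environment variables described in the setup section:

```python
import os

# Bucket names come from the same environment variables the CI script reads.
PROD_BUCKET = os.environ["R2_PRODUCTION_BUCKET"]
INTERNAL_BUCKET = os.environ["R2_INTERNAL_BUCKET"]


def resolve_target_bucket(dataset: dict) -> str:
    """Return the R2 bucket for a manifest entry; production is the default."""
    bucket_type = dataset.get("bucket", "production")
    return INTERNAL_BUCKET if bucket_type == "internal" else PROD_BUCKET
```
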
## Prerequisites

- Python 3.12+
- Git
- `sqlite3` command-line tool
- An active Cloudflare account with **two** R2 buckets (one for production, one for staging).
- An active Cloudflare account with **three** R2 buckets:
- Production bucket (publicly accessible)
- Staging bucket (for temporary uploads)
- Internal bucket (team-only access)
- For the data in this repo, contact the OEO team for access to the R2 buckets.

## ⚙️ Setup and Installation
@@ -113,6 +135,7 @@
R2_SECRET_ACCESS_KEY="your_r2_secret_key"
R2_PRODUCTION_BUCKET="your-production-bucket-name"
R2_STAGING_BUCKET="your-staging-bucket-name"
R2_INTERNAL_BUCKET="your-internal-bucket-name"
```

4. **Verify Configuration:**
@@ -143,8 +166,11 @@
Use the `datamanager` tool to stage your changes. The `prepare` command handles both creating new datasets and updating existing ones.

```bash
# This uploads the file to the staging bucket and updates manifest.json locally
# Prepare for production bucket (default)
uv run datamanager prepare energy-data.sqlite ./local-files/new-energy.sqlite

# Prepare for internal bucket
uv run datamanager prepare energy-data.sqlite ./local-files/new-energy.sqlite --bucket internal
```

The tool will guide you through the process. For other maintenance tasks like `rollback` or `delete`, use the corresponding command.
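
Under the hood, `prepare` stages the file rather than publishing it directly. A rough sketch of that step — hashing the file, uploading it to the staging bucket, and recording a `pending-merge` entry — assuming a boto3-style S3 client and the `staging-uploads/<sha256>.sqlite` key format visible in `manifest.json` (the real implementation lives in the `datamanager` package):

```python
import hashlib


def stage_file(client, staging_bucket: str, local_path: str) -> dict[str, str]:
    # Hash the file in chunks so large SQLite databases never need to fit in memory.
    digest = hashlib.sha256()
    with open(local_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    sha256 = digest.hexdigest()

    # Upload to the staging bucket; CI promotes it to production/internal after merge.
    staging_key = f"staging-uploads/{sha256}.sqlite"
    client.upload_file(local_path, staging_bucket, staging_key)

    # Shape of the history entry that stays "pending-merge" until CI runs.
    return {"sha256": sha256, "staging_key": staging_key, "commit": "pending-merge"}
```
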
@@ -191,7 +217,7 @@ This will launch a menu where you can choose your desired action, including the

### Command-Line Interface (CLI)

You can also use the command-line interface directly for specific tasks or for scripting purposes.
You can also use the command-line interface directly for specific tasks or for scripting purposes. Use the `--bucket` option to specify whether to work with production or internal data.

![CLI](assets/cli.png)

@@ -231,11 +257,17 @@ uv run datamanager list-datasets
Downloads a dataset from the **production** or **internal** R2 bucket and verifies its integrity.

```bash
# Pull the latest version
# Pull the latest version from production (default)
uv run datamanager pull user-profiles.sqlite

# Pull a specific version
# Pull from internal bucket
uv run datamanager pull user-profiles.sqlite --bucket internal

# Pull a specific version from production
uv run datamanager pull user-profiles.sqlite --version v2

# Pull a specific version from internal bucket
uv run datamanager pull user-profiles.sqlite --version v2 --bucket internal
```

![pull](assets/pull.png)
@@ -268,7 +300,7 @@ uv run datamanager prune-versions <dataset-name.sqlite> --keep 5

#### `verify`

Checks R2 credentials and reports granular read/write/delete permissions for both production and staging buckets.
Checks R2 credentials and reports granular read/write/delete permissions for production, staging, and internal buckets.

```bash
uv run datamanager verify
7 changes: 7 additions & 0 deletions docs/source/setup.md
@@ -1,5 +1,11 @@
# Setup and Installation

This tool requires three Cloudflare R2 buckets:

- **Production bucket**: For publicly accessible datasets
- **Staging bucket**: For temporary uploads during the review process
- **Internal bucket**: For team-only datasets with restricted access

1. **Clone the Repository:**

```bash
@@ -37,6 +43,7 @@
R2_SECRET_ACCESS_KEY="your_r2_secret_key"
R2_PRODUCTION_BUCKET="your-production-bucket-name"
R2_STAGING_BUCKET="your-staging-bucket-name"
R2_INTERNAL_BUCKET="your-internal-bucket-name"
```

4. **Verify Configuration:**
4 changes: 2 additions & 2 deletions docs/source/usage.md
@@ -54,7 +54,7 @@ uv run datamanager list-datasets

### `pull`

Downloads a dataset from the **production** R2 bucket and verifies its integrity.
Downloads a dataset from the appropriate R2 bucket (production or internal) and verifies its integrity. The bucket is determined by the dataset's configuration in the manifest.

```bash
# Pull the latest version
@@ -94,7 +94,7 @@ uv run datamanager prune-versions <dataset-name.sqlite> --keep 5

### `verify`

Checks R2 credentials and reports granular read/write/delete permissions for both production and staging buckets.
Checks R2 credentials and reports granular read/write/delete permissions for all three buckets (production, staging, and internal).

```bash
uv run datamanager verify
4 changes: 2 additions & 2 deletions docs/source/workflow.md
@@ -41,8 +41,8 @@ Go to GitHub and open a pull request from your feature branch to `main`. The dif

Once the PR is reviewed, approved, and all status checks pass, merge it. The CI/CD pipeline takes over automatically:

- It copies the data from the staging bucket to the production bucket.
- It copies the data from the staging bucket to the appropriate target bucket (production or internal).
- It finalizes the `manifest.json` with the new commit hash and description.
- It pushes a final commit back to `main`.

The new data version is now live and available to all users via `datamanager pull`.
The new data version is now live and available via `datamanager pull`. **Note:** Internal datasets are only accessible to team members with appropriate R2 bucket permissions.
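
For reference, the promotion step in `publish_script.py` reduces to a server-side copy followed by removal of the staging object. A condensed sketch, assuming a boto3 S3 client configured for the R2 endpoint (names here are illustrative):

```python
from botocore.exceptions import ClientError


def promote(client, staging_bucket: str, target_bucket: str, staging_key: str, final_key: str) -> None:
    """Copy a staged object into its target bucket, then delete it from staging."""
    try:
        client.copy_object(
            CopySource={"Bucket": staging_bucket, "Key": staging_key},
            Bucket=target_bucket,  # production or internal, per the manifest entry
            Key=final_key,
        )
        client.delete_object(Bucket=staging_bucket, Key=staging_key)
    except ClientError as exc:
        raise SystemExit(f"Could not publish {final_key}: {exc}")
```
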
1 change: 1 addition & 0 deletions env.example
@@ -3,3 +3,4 @@ R2_ACCESS_KEY_ID="your_r2_access_key"
R2_SECRET_ACCESS_KEY="your_r2_secret_key"
R2_PRODUCTION_BUCKET="your-production-bucket-name"
R2_STAGING_BUCKET="your-staging-bucket-name"
R2_INTERNAL_BUCKET="your-internal-bucket-name"
18 changes: 18 additions & 0 deletions manifest.json
@@ -1,6 +1,7 @@
[
{
"fileName": "test_database.sqlite",
"bucket": "production",
"latestVersion": "v1",
"history": [
{
@@ -31,5 +32,22 @@
"description": "updating test_database to get multiple versions"
}
]
},
{
"fileName": "internal_test_database.sqlite",
"bucket": "internal",
"latestVersion": "v1",
"history": [
{
"version": "v1",
"timestamp": "2025-09-23T21:36:33.540270Z",
"sha256": "6d60f0035a80de92c3f3df433212699e0584a09a7d4943693ae0889d98640641",
"r2_object_key": "internal/v1-6d60f0035a80de92c3f3df433212699e0584a09a7d4943693ae0889d98640641.sqlite",
"staging_key": "staging-uploads/6d60f0035a80de92c3f3df433212699e0584a09a7d4943693ae0889d98640641.sqlite",
"diffFromPrevious": null,
"commit": "pending-merge",
"description": "pending-merge"
}
]
}
]
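
As an illustration of the schema above, a small hypothetical snippet that reads the manifest and reports each dataset's target bucket (entries without a `"bucket"` field default to production):

```python
import json

with open("manifest.json") as f:
    manifest = json.load(f)

for dataset in manifest:
    bucket = dataset.get("bucket", "production")  # field may be absent on older entries
    print(f"{dataset['fileName']}: bucket={bucket}, latest version {dataset['latestVersion']}")
```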