85 changes: 65 additions & 20 deletions .github/scripts/publish_script.py
@@ -12,6 +12,7 @@
SECRET_ACCESS_KEY = os.environ["R2_SECRET_ACCESS_KEY"]
PROD_BUCKET = os.environ["R2_PRODUCTION_BUCKET"]
STAGING_BUCKET = os.environ["R2_STAGING_BUCKET"]
INTERNAL_BUCKET = os.environ["R2_INTERNAL_BUCKET"]
ENDPOINT_URL = f"https://{ACCOUNT_ID}.r2.cloudflarestorage.com"
MANIFEST_FILE = "manifest.json"
DATASETS_DOC_PATH = "docs/source/datasets.md"
@@ -134,23 +135,32 @@ def handle_deletions(manifest_data: list[dict[str, Any]]) -> bool:
processed_deletion = False

for dataset in manifest_data:
# Get the bucket type for this dataset
bucket_type = dataset.get("bucket", "production")
target_bucket = INTERNAL_BUCKET if bucket_type == "internal" else PROD_BUCKET
bucket_name = "internal" if bucket_type == "internal" else "production"

if dataset.get("status") == "pending-deletion":
processed_deletion = True
print(f"Found dataset marked for full deletion: {dataset['fileName']}")
print(
f"Found dataset marked for full deletion: {dataset['fileName']} from {bucket_name} bucket"
)
for entry in dataset.get("history", []):
if "r2_object_key" in entry:
objects_to_delete_from_r2.append({"Key": entry["r2_object_key"]})
objects_to_delete_from_r2.append(
{"Key": entry["r2_object_key"], "Bucket": target_bucket}
)
else:
versions_to_keep = []
for entry in dataset.get("history", []):
if entry.get("status") == "pending-deletion":
processed_deletion = True
print(
f"Found version marked for deletion: {dataset['fileName']} v{entry['version']}"
f"Found version marked for deletion: {dataset['fileName']} v{entry['version']} from {bucket_name} bucket"
)
if "r2_object_key" in entry:
objects_to_delete_from_r2.append(
{"Key": entry["r2_object_key"]}
{"Key": entry["r2_object_key"], "Bucket": target_bucket}
)
else:
versions_to_keep.append(entry)
@@ -162,18 +172,44 @@ def handle_deletions(manifest_data: list[dict[str, Any]]) -> bool:
return False

if objects_to_delete_from_r2:
print(
f"\nDeleting {len(objects_to_delete_from_r2)} objects from production R2 bucket..."
)
for i in range(0, len(objects_to_delete_from_r2), 1000):
chunk: Any = objects_to_delete_from_r2[i : i + 1000]
response = client.delete_objects(
Bucket=PROD_BUCKET, Delete={"Objects": chunk, "Quiet": True}
# Group objects by bucket for deletion
prod_objects = [
obj for obj in objects_to_delete_from_r2 if obj["Bucket"] == PROD_BUCKET
]
internal_objects = [
obj for obj in objects_to_delete_from_r2 if obj["Bucket"] == INTERNAL_BUCKET
]

if prod_objects:
print(
f"\nDeleting {len(prod_objects)} objects from production R2 bucket..."
)
for i in range(0, len(prod_objects), 1000):
chunk: Any = prod_objects[i : i + 1000]
objects_only = [{"Key": obj["Key"]} for obj in chunk]
response = client.delete_objects(
Bucket=PROD_BUCKET, Delete={"Objects": objects_only, "Quiet": True}
)
if response.get("Errors"):
print(" ❌ ERROR during batch deletion:", response["Errors"])
exit(1)
print("✅ Successfully deleted objects from production R2 bucket.")

if internal_objects:
print(
f"\nDeleting {len(internal_objects)} objects from internal R2 bucket..."
)
if response.get("Errors"):
print(" ❌ ERROR during batch deletion:", response["Errors"])
exit(1)
print("✅ Successfully deleted objects from R2.")
for i in range(0, len(internal_objects), 1000):
chunk: Any = internal_objects[i : i + 1000]
objects_only = [{"Key": obj["Key"]} for obj in chunk]
response = client.delete_objects(
Bucket=INTERNAL_BUCKET,
Delete={"Objects": objects_only, "Quiet": True},
)
if response.get("Errors"):
print(" ❌ ERROR during batch deletion:", response["Errors"])
exit(1)
print("✅ Successfully deleted objects from internal R2 bucket.")

finalize_manifest(datasets_to_keep, "ci: Finalize manifest after data deletion")
return True
@@ -186,6 +222,11 @@ def handle_publications(manifest_data: list[dict[str, Any]]) -> bool:
"""
print("\n--- Phase 2: Checking for pending publications ---")
for dataset in manifest_data:
# Get the bucket type for this dataset
bucket_type = dataset.get("bucket", "production")
target_bucket = INTERNAL_BUCKET if bucket_type == "internal" else PROD_BUCKET
bucket_name = "internal" if bucket_type == "internal" else "production"

for i, entry in enumerate(dataset["history"]):
if entry.get("commit") == "pending-merge":
commit_details = get_commit_details()
@@ -197,32 +238,36 @@ def handle_publications(manifest_data: list[dict[str, Any]]) -> bool:
if "staging_key" in entry and entry["staging_key"]:
staging_key = entry.pop("staging_key")
final_key = entry["r2_object_key"]
print(f"Publishing: {dataset['fileName']} v{entry['version']}")
print(
f"Publishing: {dataset['fileName']} v{entry['version']} to {bucket_name} bucket"
)
print(f" Description: {entry['description']}")
try:
copy_source: Any = {
"Bucket": STAGING_BUCKET,
"Key": staging_key,
}
client.copy_object(
CopySource=copy_source, Bucket=PROD_BUCKET, Key=final_key
CopySource=copy_source, Bucket=target_bucket, Key=final_key
)
print(
f" ✅ Server-side copy to {bucket_name} bucket successful."
)
print(" ✅ Server-side copy successful.")
client.delete_object(Bucket=STAGING_BUCKET, Key=staging_key)
print(" ✅ Staging object deleted.")
except ClientError as e:
print(f" ❌ ERROR: Could not process object. Reason: {e}")
exit(1)
else:
print(
f"Finalizing rollback: {dataset['fileName']} v{entry['version']}"
f"Finalizing rollback: {dataset['fileName']} v{entry['version']} in {bucket_name} bucket"
)
print(f" Description: {entry['description']}")

dataset["history"][i] = entry
finalize_manifest(
manifest_data,
f"ci: Publish {dataset['fileName']} {entry['version']}",
f"ci: Publish {dataset['fileName']} {entry['version']} to {bucket_name}",
)
return True # Process only one publication per run

1 change: 1 addition & 0 deletions .github/workflows/publish.yml
@@ -41,3 +41,4 @@ jobs:
R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
R2_PRODUCTION_BUCKET: ${{ vars.R2_PRODUCTION_BUCKET }} # Use repo variable
R2_STAGING_BUCKET: ${{ vars.R2_STAGING_BUCKET }} # Use repo variable
R2_INTERNAL_BUCKET: ${{ vars.R2_INTERNAL_BUCKET }} # Use repo variable
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.8.19
rev: 0.8.22
hooks:
# Dependency management
- id: uv-lock
@@ -21,6 +21,11 @@ repos:
args: ["--maxkb=1024"]
- id: debug-statements

- repo: https://github.com/asottile/pyupgrade
rev: v3.20.0
hooks:
- id: pyupgrade

# Python Linting & Formatting with Ruff
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.1
46 changes: 39 additions & 7 deletions README.md
@@ -59,19 +59,41 @@ flowchart TD

## Features

- **Multi-Bucket Support:** Choose between production (public) and internal (team-only) buckets for different data access levels.
- **CI/CD-Driven Publishing:** Data publication is transactional and automated via GitHub Actions after a pull request is merged, preventing inconsistent states.
- **Enhanced Security:** Production credentials are never stored on developer machines; they are only used by the trusted GitHub Actions runner.
- **Interactive TUI:** Run `datamanager` with no arguments for a user-friendly, menu-driven interface.
- **Data Lifecycle Management:** A full suite of commands for rollback, deletion, and pruning, all gated by the same secure PR workflow.
- **Integrity Verification:** All downloaded files are automatically checked against their SHA256 hash from the manifest.
- **Credential Verification:** A detailed verify command reports read/write/delete permissions for both production and staging buckets.
- **Credential Verification:** A detailed verify command reports read/write/delete permissions for production, staging, and internal buckets.

## Bucket Types

The system supports two types of data storage buckets:

### Production Bucket (Public)

- **Access Level:** Publicly accessible
- **Use Case:** Data intended for public consumption
- **Default:** Used by default for all operations
- **Permissions:** Requires appropriate public access settings in Cloudflare R2

### Internal Bucket (Team-Only)

- **Access Level:** Team members only
- **Use Case:** Sensitive or internal data not for public consumption
- **Security:** Private access, team credentials required
- **Usage:** Specify `--bucket internal` when using commands

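In the manifest, each dataset carries an optional `"bucket"` field; the publish script falls back to production when it is absent. A minimal sketch of that lookup, mirroring the logic in `publish_script.py` and assuming the environment variables described in the setup section:

```python
import os

# Bucket names come from the same environment variables the CI script reads.
PROD_BUCKET = os.environ["R2_PRODUCTION_BUCKET"]
INTERNAL_BUCKET = os.environ["R2_INTERNAL_BUCKET"]


def resolve_target_bucket(dataset: dict) -> str:
    """Return the R2 bucket for a manifest entry; production is the default."""
    bucket_type = dataset.get("bucket", "production")
    return INTERNAL_BUCKET if bucket_type == "internal" else PROD_BUCKET
```
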
## Prerequisites

- Python 3.12+
- Git
- `sqlite3` command-line tool
- An active Cloudflare account with **two** R2 buckets (one for production, one for staging).
- An active Cloudflare account with **three** R2 buckets:
- Production bucket (publicly accessible)
- Staging bucket (for temporary uploads)
- Internal bucket (team-only access)
- For the data in this repo, contact the OEO team for access to the R2 buckets.

## ⚙️ Setup and Installation
@@ -113,6 +135,7 @@
R2_SECRET_ACCESS_KEY="your_r2_secret_key"
R2_PRODUCTION_BUCKET="your-production-bucket-name"
R2_STAGING_BUCKET="your-staging-bucket-name"
R2_INTERNAL_BUCKET="your-internal-bucket-name"
```

4. **Verify Configuration:**
@@ -143,8 +166,11 @@
Use the `datamanager` tool to stage your changes. The `prepare` command handles both creating new datasets and updating existing ones.

```bash
# This uploads the file to the staging bucket and updates manifest.json locally
# Prepare for production bucket (default)
uv run datamanager prepare energy-data.sqlite ./local-files/new-energy.sqlite

# Prepare for internal bucket
uv run datamanager prepare energy-data.sqlite ./local-files/new-energy.sqlite --bucket internal
```

The tool will guide you through the process. For other maintenance tasks like `rollback` or `delete`, use the corresponding command.
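
Under the hood, `prepare` stages the file rather than publishing it directly. A rough sketch of that step — hashing the file, uploading it to the staging bucket, and recording a `pending-merge` entry — assuming a boto3-style S3 client and the `staging-uploads/<sha256>.sqlite` key format visible in `manifest.json` (the real implementation lives in the `datamanager` package):

```python
import hashlib


def stage_file(client, staging_bucket: str, local_path: str) -> dict[str, str]:
    # Hash the file in chunks so large SQLite databases never need to fit in memory.
    digest = hashlib.sha256()
    with open(local_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    sha256 = digest.hexdigest()

    # Upload to the staging bucket; CI promotes it to production/internal after merge.
    staging_key = f"staging-uploads/{sha256}.sqlite"
    client.upload_file(local_path, staging_bucket, staging_key)

    # Shape of the history entry that stays "pending-merge" until CI runs.
    return {"sha256": sha256, "staging_key": staging_key, "commit": "pending-merge"}
```
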
@@ -191,7 +217,7 @@ This will launch a menu where you can choose your desired action, including the

### Command-Line Interface (CLI)

You can also use the command-line interface directly for specific tasks or for scripting purposes.
You can also use the command-line interface directly for specific tasks or for scripting purposes. Use the `--bucket` option to specify whether to work with production or internal data.

![CLI](assets/cli.png)

@@ -231,11 +257,17 @@ uv run datamanager list-datasets
Downloads a dataset from the **production** or **internal** R2 bucket and verifies its integrity.

```bash
# Pull the latest version
# Pull the latest version from production (default)
uv run datamanager pull user-profiles.sqlite

# Pull a specific version
# Pull from internal bucket
uv run datamanager pull user-profiles.sqlite --bucket internal

# Pull a specific version from production
uv run datamanager pull user-profiles.sqlite --version v2

# Pull a specific version from internal bucket
uv run datamanager pull user-profiles.sqlite --version v2 --bucket internal
```

![pull](assets/pull.png)
@@ -268,7 +300,7 @@ uv run datamanager prune-versions <dataset-name.sqlite> --keep 5

#### `verify`

Checks R2 credentials and reports granular read/write/delete permissions for both production and staging buckets.
Checks R2 credentials and reports granular read/write/delete permissions for production, staging, and internal buckets.

```bash
uv run datamanager verify
7 changes: 7 additions & 0 deletions docs/source/setup.md
@@ -1,5 +1,11 @@
# Setup and Installation

This tool requires three Cloudflare R2 buckets:

- **Production bucket**: For publicly accessible datasets
- **Staging bucket**: For temporary uploads during the review process
- **Internal bucket**: For team-only datasets with restricted access

1. **Clone the Repository:**

```bash
@@ -37,6 +43,7 @@
R2_SECRET_ACCESS_KEY="your_r2_secret_key"
R2_PRODUCTION_BUCKET="your-production-bucket-name"
R2_STAGING_BUCKET="your-staging-bucket-name"
R2_INTERNAL_BUCKET="your-internal-bucket-name"
```

4. **Verify Configuration:**
4 changes: 2 additions & 2 deletions docs/source/usage.md
@@ -54,7 +54,7 @@ uv run datamanager list-datasets

### `pull`

Downloads a dataset from the **production** R2 bucket and verifies its integrity.
Downloads a dataset from the appropriate R2 bucket (production or internal) and verifies its integrity. The bucket is determined by the dataset's configuration in the manifest.

```bash
# Pull the latest version
@@ -94,7 +94,7 @@ uv run datamanager prune-versions <dataset-name.sqlite> --keep 5

### `verify`

Checks R2 credentials and reports granular read/write/delete permissions for both production and staging buckets.
Checks R2 credentials and reports granular read/write/delete permissions for all three buckets (production, staging, and internal).

```bash
uv run datamanager verify
4 changes: 2 additions & 2 deletions docs/source/workflow.md
@@ -41,8 +41,8 @@ Go to GitHub and open a pull request from your feature branch to `main`. The dif

Once the PR is reviewed, approved, and all status checks pass, merge it. The CI/CD pipeline takes over automatically:

- It copies the data from the staging bucket to the production bucket.
- It copies the data from the staging bucket to the appropriate target bucket (production or internal).
- It finalizes the `manifest.json` with the new commit hash and description.
- It pushes a final commit back to `main`.

The new data version is now live and available to all users via `datamanager pull`.
The new data version is now live and available via `datamanager pull`. **Note:** Internal datasets are only accessible to team members with appropriate R2 bucket permissions.
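
For reference, the promotion step in `publish_script.py` reduces to a server-side copy followed by removal of the staging object. A condensed sketch, assuming a boto3 S3 client configured for the R2 endpoint (names here are illustrative):

```python
from botocore.exceptions import ClientError


def promote(client, staging_bucket: str, target_bucket: str, staging_key: str, final_key: str) -> None:
    """Copy a staged object into its target bucket, then delete it from staging."""
    try:
        client.copy_object(
            CopySource={"Bucket": staging_bucket, "Key": staging_key},
            Bucket=target_bucket,  # production or internal, per the manifest entry
            Key=final_key,
        )
        client.delete_object(Bucket=staging_bucket, Key=staging_key)
    except ClientError as exc:
        raise SystemExit(f"Could not publish {final_key}: {exc}")
```
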
1 change: 1 addition & 0 deletions env.example
@@ -3,3 +3,4 @@ R2_ACCESS_KEY_ID="your_r2_access_key"
R2_SECRET_ACCESS_KEY="your_r2_secret_key"
R2_PRODUCTION_BUCKET="your-production-bucket-name"
R2_STAGING_BUCKET="your-staging-bucket-name"
R2_INTERNAL_BUCKET="your-internal-bucket-name"
18 changes: 18 additions & 0 deletions manifest.json
@@ -1,6 +1,7 @@
[
{
"fileName": "test_database.sqlite",
"bucket": "production",
"latestVersion": "v1",
"history": [
{
@@ -31,5 +32,22 @@
"description": "updating test_database to get multiple versions"
}
]
},
{
"fileName": "internal_test_database.sqlite",
"bucket": "internal",
"latestVersion": "v1",
"history": [
{
"version": "v1",
"timestamp": "2025-09-23T21:36:33.540270Z",
"sha256": "6d60f0035a80de92c3f3df433212699e0584a09a7d4943693ae0889d98640641",
"r2_object_key": "internal/v1-6d60f0035a80de92c3f3df433212699e0584a09a7d4943693ae0889d98640641.sqlite",
"staging_key": "staging-uploads/6d60f0035a80de92c3f3df433212699e0584a09a7d4943693ae0889d98640641.sqlite",
"diffFromPrevious": null,
"commit": "pending-merge",
"description": "pending-merge"
}
]
}
]
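
As an illustration of the schema above, a small hypothetical snippet that reads the manifest and reports each dataset's target bucket (entries without a `"bucket"` field default to production):

```python
import json

with open("manifest.json") as f:
    manifest = json.load(f)

for dataset in manifest:
    bucket = dataset.get("bucket", "production")  # field may be absent on older entries
    print(f"{dataset['fileName']}: bucket={bucket}, latest version {dataset['latestVersion']}")
```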