From 3e979fe9c5a7a71e96fd7cb3359f0ea01cfbfc68 Mon Sep 17 00:00:00 2001 From: cbullinger Date: Thu, 10 Jul 2025 13:34:19 -0400 Subject: [PATCH 1/2] Add disaster_recovery package --- .../go/atlas-sdk-go/disaster_recovery.md | 0 .../examples/disaster_recovery/main.go | 246 ++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 usage-examples/go/atlas-sdk-go/disaster_recovery.md create mode 100644 usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go diff --git a/usage-examples/go/atlas-sdk-go/disaster_recovery.md b/usage-examples/go/atlas-sdk-go/disaster_recovery.md new file mode 100644 index 0000000..e69de29 diff --git a/usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go b/usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go new file mode 100644 index 0000000..cf33c13 --- /dev/null +++ b/usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go @@ -0,0 +1,246 @@ +package main + +import ( + "atlas-sdk-go/internal/auth" + "atlas-sdk-go/internal/config" + "atlas-sdk-go/internal/errors" + "context" + "flag" + "fmt" + "github.com/joho/godotenv" + "log" + "os" + "time" + + "go.mongodb.org/atlas-sdk/v20250219001/admin" +) + +// const ( +// logsDir = "logs" +// ) +// +// type Config struct { +// PublicKey string +// PrivateKey string +// ProjectID string +// ClusterName string +// BackupID string +// ScenarioType string +// } + +func main() { + if err := godotenv.Load(); err != nil { + log.Printf("Warning: .env file not loaded: %v", err) + } + + secrets, cfg, err := config.LoadAll("configs/config.json") + if err != nil { + errors.ExitWithError("Failed to load configuration", err) + } + + client, err := auth.NewClient(cfg, secrets) + if err != nil { + errors.ExitWithError("Failed to initialize authentication client", err) + } + + // Parse command line flags for DR configuration + drCfg := parseFlags() + + ctx := context.Background() + setupLogging() + + // Execute the requested disaster recovery scenario + switch drCfg.ScenarioType { + case "regional-outage": + handleRegionalOutage(ctx, client, drCfg) + case "cloud-provider-outage": + handleCloudProviderOutage(ctx, client, drCfg) + case "restore-data": + handleDataRestoration(ctx, client, drCfg) + default: + log.Fatalf("Unknown scenario type: %s", drCfg.ScenarioType) + } +} + +func handleRegionalOutage(ctx context.Context, sdk *admin.APIClient, cfg Config) { + log.Println("Handling regional outage by adding nodes to unaffected regions...") + + // 1. Get current cluster configuration + cluster, _, err := sdk.ClustersApi.GetCluster(ctx, cfg.ProjectID, cfg.ClusterName).Execute() + if err != nil { + log.Fatalf("Failed to get cluster details: %v", err) + } + + // 2. 
Identify regions that are currently not in use and add a node + var newRegions []admin.ReplicationSpec20240805 + foundRegion := false + + if cluster.ReplicationSpecs != nil { + for _, region := range *cluster.ReplicationSpecs { + newRegions = append(newRegions, region) + if *region.ZoneName == "EU_WEST_1" { + foundRegion = true + } + } + } + + if !foundRegion { + // Add a new region that's unaffected by the outage + priority := int64(5) + electableNodes := int64(1) + readOnlyNodes := int64(0) + analyticsNodes := int64(0) + regionName := "EU_WEST_1" + + newRegion := admin.ReplicationSpec20240805{ + RegionName: ®ionName, + Priority: &priority, + RegionConfigs: &admin.CloudRegionConfig20240805{} & electableNodes, + ReadOnlyNodes: &readOnlyNodes, + AnalyticsNodes: &analyticsNodes, + } + newRegions = append(newRegions, newRegion) + } + + // 3. Update cluster with new regions + updateRequest := admin.AdvancedClusterDescriptionV2{ + ReplicationSpecs: &newRegions, + } + + _, _, err = sdk.ClustersApi.UpdateCluster(ctx, cfg.ProjectID, cfg.ClusterName, &updateRequest).Execute() + if err != nil { + log.Fatalf("Failed to update cluster: %v", err) + } + + log.Println("Successfully added nodes to unaffected regions") +} + +func handleCloudProviderOutage(ctx context.Context, sdk *admin.APIClient, cfg Config) { + log.Println("Handling cloud provider outage...") + + // 1. Get current cluster configuration + sourceCluster, _, err := sdk.ClustersApi.GetCluster(ctx, cfg.ProjectID, cfg.ClusterName).Execute() + if err != nil { + log.Fatalf("Failed to get source cluster details: %v", err) + } + + // 2. Create new cluster on alternative cloud provider + newClusterName := cfg.ClusterName + "-recovery" + clusterType := "REPLICASET" + providerName := "GCP" // Switch from AWS to GCP or vice versa + + newCluster := admin.AdvancedClusterDescriptionV2{ + Name: &newClusterName, + ClusterType: &clusterType, + ProviderName: &providerName, + DiskSizeGB: sourceCluster.DiskSizeGB, + MongoDBMajorVersion: sourceCluster.MongoDBMajorVersion, + } + + // Configure replica set based on original configuration + // (simplified for example - would need more configuration in practice) + + _, _, err = sdk.ClustersApi.CreateCluster(ctx, cfg.ProjectID, &newCluster).Execute() + if err != nil { + log.Fatalf("Failed to create recovery cluster: %v", err) + } + + log.Printf("Created recovery cluster: %s", newClusterName) + + // 3. Wait for cluster to be ready + log.Println("Waiting for cluster to become available...") + waitForClusterReady(ctx, sdk, cfg.ProjectID, newClusterName) + + // 4. 
Restore the most recent snapshot to the new cluster + log.Println("Restoring backup to recovery cluster...") + restoreRequest := admin.DiskBackupSnapshotRestoreJob{ + TargetClusterName: &newClusterName, + SnapshotId: &cfg.BackupID, + } + + _, _, err = sdk.CloudBackupsApi.CreateBackupRestoreJob(ctx, cfg.ProjectID, cfg.ClusterName, &restoreRequest).Execute() + if err != nil { + log.Fatalf("Failed to restore backup: %v", err) + } + + log.Printf("Successfully initiated restore to cluster %s from backup %s", newClusterName, cfg.BackupID) + log.Println("Once restore is complete, update your application connection strings to point to the new cluster") +} + +func handleDataRestoration(ctx context.Context, sdk *admin.APIClient, cfg Config) { + log.Println("Handling data restoration after accidental deletion...") + + // Restore from point-in-time backup + restoreRequest := admin.DiskBackupSnapshotRestoreJob{ + TargetClusterName: &cfg.ClusterName, + SnapshotId: &cfg.BackupID, + } + + _, _, err := sdk.CloudBackupsApi.CreateBackupRestoreJob(ctx, cfg.ProjectID, cfg.ClusterName, &restoreRequest).Execute() + if err != nil { + log.Fatalf("Failed to restore backup: %v", err) + } + + log.Printf("Successfully initiated restore to cluster %s from backup %s", cfg.ClusterName, cfg.BackupID) + log.Println("After restoration, verify data integrity and reimport any data collected since the backup") +} + +// Helper function to wait for cluster to be ready +func waitForClusterReady(ctx context.Context, sdk *admin.APIClient, projectID, clusterName string) { + for { + cluster, _, err := sdk.ClustersApi.GetCluster(ctx, projectID, clusterName).Execute() + if err != nil { + log.Printf("Error checking cluster status: %v", err) + } else if cluster.StateName != nil && *cluster.StateName == "IDLE" { + log.Println("Cluster is ready") + return + } + + log.Printf("Cluster status: %s. 
Waiting 30 seconds...", *cluster.StateName) + time.Sleep(30 * time.Second) + } +} + +func parseFlags() Config { + cfg := Config{} + + flag.StringVar(&cfg.PublicKey, "public-key", os.Getenv("ATLAS_PUBLIC_KEY"), "MongoDB Atlas public API key") + flag.StringVar(&cfg.PrivateKey, "private-key", os.Getenv("ATLAS_PRIVATE_KEY"), "MongoDB Atlas private API key") + flag.StringVar(&cfg.ProjectID, "project-id", "", "MongoDB Atlas project ID") + flag.StringVar(&cfg.ClusterName, "cluster-name", "", "MongoDB Atlas cluster name") + flag.StringVar(&cfg.BackupID, "backup-id", "", "MongoDB Atlas backup snapshot ID (for restore operations)") + flag.StringVar(&cfg.ScenarioType, "scenario", "", "Disaster recovery scenario type: regional-outage, cloud-provider-outage, restore-data") + + flag.Parse() + + // Validate required parameters + if cfg.PublicKey == "" || cfg.PrivateKey == "" || cfg.ProjectID == "" || cfg.ClusterName == "" || cfg.ScenarioType == "" { + flag.Usage() + os.Exit(1) + } + + return cfg +} + +func setupLogging() { + // Ensure logs directory exists + defaultDir := os.Getenv("ATLAS_DOWNLOADS_DIR") + logDir := logsDir + if defaultDir != "" { + logDir = fmt.Sprintf("%s/%s", defaultDir, logsDir) + } + + if err := os.MkdirAll(logDir, 0755); err != nil { + log.Fatalf("Failed to create logs directory: %v", err) + } + + // Set up logging to file + logFile := fmt.Sprintf("%s/disaster_recovery_%s.log", logDir, time.Now().Format("20060102_150405")) + f, err := os.Create(logFile) + if err != nil { + log.Fatalf("Failed to create log file: %v", err) + } + + log.SetOutput(f) + log.Printf("Starting disaster recovery script at %s", time.Now().Format(time.RFC3339)) +} From 5e8c3d98b3ba6372c6f6ed714b226b3770b43aca Mon Sep 17 00:00:00 2001 From: cbullinger Date: Mon, 15 Sep 2025 13:23:08 -0400 Subject: [PATCH 2/2] Add disaster recovery example scripts --- .../main.snippet.disaster-recovery.go | 116 +++++++++ .../go/atlas-sdk-go/project-copy/README.md | 22 +- .../project-copy/examples/recovery/main.go | 115 ++++++++ .../internal/data/recovery/options.go | 119 +++++++++ .../internal/data/recovery/restore.go | 57 ++++ .../project-copy/internal/typeutils/load.go | 38 +++ usage-examples/go/atlas-sdk-go/README.md | 22 +- .../examples/disaster_recovery/main.go | 246 ------------------ .../go/atlas-sdk-go/examples/recovery/main.go | 120 +++++++++ .../examples/recovery/main_test.go | 171 ++++++++++++ .../internal/data/recovery/options.go | 119 +++++++++ .../internal/data/recovery/restore.go | 57 ++++ .../internal/data/recovery/restore_test.go | 6 + .../atlas-sdk-go/internal/typeutils/load.go | 38 +++ 14 files changed, 988 insertions(+), 258 deletions(-) create mode 100644 generated-usage-examples/go/atlas-sdk-go/main.snippet.disaster-recovery.go create mode 100644 generated-usage-examples/go/atlas-sdk-go/project-copy/examples/recovery/main.go create mode 100644 generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/options.go create mode 100644 generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/restore.go create mode 100644 generated-usage-examples/go/atlas-sdk-go/project-copy/internal/typeutils/load.go delete mode 100644 usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go create mode 100644 usage-examples/go/atlas-sdk-go/examples/recovery/main.go create mode 100644 usage-examples/go/atlas-sdk-go/examples/recovery/main_test.go create mode 100644 usage-examples/go/atlas-sdk-go/internal/data/recovery/options.go create mode 100644 
usage-examples/go/atlas-sdk-go/internal/data/recovery/restore.go create mode 100644 usage-examples/go/atlas-sdk-go/internal/data/recovery/restore_test.go create mode 100644 usage-examples/go/atlas-sdk-go/internal/typeutils/load.go diff --git a/generated-usage-examples/go/atlas-sdk-go/main.snippet.disaster-recovery.go b/generated-usage-examples/go/atlas-sdk-go/main.snippet.disaster-recovery.go new file mode 100644 index 0000000..07e2a2f --- /dev/null +++ b/generated-usage-examples/go/atlas-sdk-go/main.snippet.disaster-recovery.go @@ -0,0 +1,116 @@ +// See entire project at https://github.com/mongodb/atlas-architecture-go-sdk +package main + +import ( + "context" + "fmt" + "log" + "time" + + "atlas-sdk-go/internal/auth" + "atlas-sdk-go/internal/config" + "atlas-sdk-go/internal/data/recovery" + "atlas-sdk-go/internal/typeutils" + + "github.com/joho/godotenv" + "go.mongodb.org/atlas-sdk/v20250219001/admin" +) + +const ( + scenarioRegionalOutage = "regional-outage" + scenarioDataDeletion = "data-deletion" +) + +func main() { + envFile := ".env.production" + if err := godotenv.Load(envFile); err != nil { + log.Printf("Warning: could not load %s file: %v", envFile, err) + } + + secrets, cfg, err := config.LoadAllFromEnv() + if err != nil { + log.Fatalf("Failed to load configuration %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) + defer cancel() + client, err := auth.NewClient(ctx, cfg, secrets) + if err != nil { + log.Fatalf("Failed to initialize authentication client: %v", err) + } + + opts, err := recovery.LoadDROptionsFromEnv(cfg.ProjectID) + if err != nil { + log.Fatalf("Configuration error: %v", err) + } + + fmt.Printf("Starting disaster recovery scenario: %s\nProject: %s\nCluster: %s\n", opts.Scenario, opts.ProjectID, opts.ClusterName) + + if opts.DryRun { + fmt.Println("DRY RUN: no write operations will be performed") + } + + var summary string + var opErr error + + switch opts.Scenario { + case scenarioRegionalOutage: + summary, opErr = simulateRegionalOutage(ctx, client, opts) + case scenarioDataDeletion: + summary, opErr = executeDataDeletionRestore(ctx, client, opts) + default: + opErr = fmt.Errorf("unsupported DR_SCENARIO '%s'", opts.Scenario) + } + + if opErr != nil { + log.Fatalf("Scenario failed: %v", opErr) + } + + fmt.Println("\n=== Summary ===") + fmt.Println(summary) + fmt.Println("Disaster recovery procedure completed.") +} + +// executeDataDeletionRestore initiates a restore job for a specified snapshot in a MongoDB Atlas cluster. +func executeDataDeletionRestore(ctx context.Context, client *admin.APIClient, o recovery.DrOptions) (string, error) { + job := admin.DiskBackupSnapshotRestoreJob{SnapshotId: &o.SnapshotID, TargetClusterName: &o.ClusterName} + if o.DryRun { + return fmt.Sprintf("(dry-run) Would submit restore job for snapshot %s", o.SnapshotID), nil + } + _, _, err := client.CloudBackupsApi.CreateBackupRestoreJob(ctx, o.ProjectID, o.ClusterName, &job).Execute() + if err != nil { + return "", fmt.Errorf("create restore job: %w", err) + } + return fmt.Sprintf("Restore job submitted for snapshot %s", o.SnapshotID), nil +} + +// simulateRegionalOutage modifies the electable node count in a target region for a MongoDB Atlas cluster. 
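+// The flow: fetch the current cluster description, add electable nodes to the
+// healthy target region, optionally zero the electable nodes in the impaired
+// outage region, then push the updated replication specs back with UpdateCluster.
+// When DryRun is set, the update call is skipped and the intended change is
+// reported instead.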
+func simulateRegionalOutage(ctx context.Context, client *admin.APIClient, o recovery.DrOptions) (string, error) { + cluster, _, err := client.ClustersApi.GetCluster(ctx, o.ProjectID, o.ClusterName).Execute() + if err != nil { + return "", fmt.Errorf("get cluster: %w", err) + } + if !cluster.HasReplicationSpecs() { + return "", fmt.Errorf("cluster has no replication specs") + } + repl := cluster.GetReplicationSpecs() + addedNodes, foundTarget := recovery.AddElectableNodesToRegion(repl, o.TargetRegion, o.AddNodes) + if !foundTarget { + return "", fmt.Errorf("target region '%s' not found in replication specs", o.TargetRegion) + } + zeroedRegions := 0 + if o.OutageRegion != "" { + zeroedRegions = recovery.ZeroElectableNodesInRegion(repl, o.OutageRegion) + } + payload := admin.NewClusterDescription20240805() + payload.SetReplicationSpecs(repl) + if o.DryRun { + return fmt.Sprintf("(dry-run) Would add %d electable nodes to %s%s", addedNodes, o.TargetRegion, typeutils.SuffixZeroed(zeroedRegions, o.OutageRegion)), nil + } + _, _, err = client.ClustersApi.UpdateCluster(ctx, o.ProjectID, o.ClusterName, payload).Execute() + if err != nil { + return "", fmt.Errorf("update cluster: %w", err) + } + return fmt.Sprintf("Added %d electable nodes to %s%s", addedNodes, o.TargetRegion, typeutils.SuffixZeroed(zeroedRegions, o.OutageRegion)), nil +} + diff --git a/generated-usage-examples/go/atlas-sdk-go/project-copy/README.md b/generated-usage-examples/go/atlas-sdk-go/project-copy/README.md index 96b9817..3420349 100644 --- a/generated-usage-examples/go/atlas-sdk-go/project-copy/README.md +++ b/generated-usage-examples/go/atlas-sdk-go/project-copy/README.md @@ -18,6 +18,7 @@ Currently, the repository includes examples that demonstrate the following: - Return all linked organizations from a specific billing organization - Get historical invoices for an organization - Programmatically archive Atlas cluster data +- Perform disaster recovery operations (e.g. restore from snapshot) As the Architecture Center documentation evolves, this repository will be updated with new examples and improvements to existing code. @@ -29,7 +30,8 @@ and improvements to existing code. ├── examples # Runnable examples by category │ ├── billing/ │ ├── monitoring/ -│ └── performance/ +│ ├── performance/ +│ └── recovery/ ├── configs # Atlas configuration template │ └── config.example.json ├── internal # Shared utilities and helpers @@ -42,7 +44,8 @@ and improvements to existing code. │ ├── errors/ │ ├── fileutils/ │ ├── logs/ -│ └── metrics/ +│ ├── metrics/ +│ └── typeutils/ ├── go.mod ├── go.sum ├── CHANGELOG.md # List of major changes to the project @@ -61,10 +64,10 @@ and improvements to existing code. 1. Create a `.env.` file in the root directory with your MongoDB Atlas service account credentials. For example, create a `.env.development` file for your dev environment: ```dotenv - MONGODB_ATLAS_SERVICE_ACCOUNT_ID= - MONGODB_ATLAS_SERVICE_ACCOUNT_SECRET= - ATLAS_DOWNLOADS_DIR="tmp/atlas_downloads" # optional download directory - CONFIG_PATH="configs/config.development.json" # optional path to Atlas config file + MONGODB_ATLAS_SERVICE_ACCOUNT_ID= + MONGODB_ATLAS_SERVICE_ACCOUNT_SECRET= + ATLAS_DOWNLOADS_DIR="tmp/atlas_downloads" # optional download directory + CONFIG_PATH="configs/config.development.json" # optional path to Atlas config file ``` > **NOTE:** For production, use a secrets manager (e.g. HashiCorp Vault, AWS Secrets Manager) > instead of environment variables. 
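The recovery example added by this patch is configured entirely through `DR_*` environment variables (see `internal/data/recovery/options.go`). Below is a minimal sketch of how those variables feed `recovery.LoadDROptionsFromEnv`, using assumed placeholder values (for the `data-deletion` scenario, set `DR_SNAPSHOT_ID` instead of `DR_TARGET_REGION`):

```go
package main

import (
	"fmt"
	"log"
	"os"

	"atlas-sdk-go/internal/data/recovery"
)

func main() {
	// Assumed example values; in practice these come from your shell or .env file.
	os.Setenv("DR_SCENARIO", "regional-outage")
	os.Setenv("ATLAS_CLUSTER_NAME", "Cluster0")
	os.Setenv("DR_TARGET_REGION", "US_EAST_1")
	os.Setenv("DR_OUTAGE_REGION", "US_WEST_2")
	os.Setenv("DR_ADD_NODES", "2")
	os.Setenv("DR_DRY_RUN", "true")

	// The argument is a fallback project ID used when ATLAS_PROJECT_ID is unset.
	opts, err := recovery.LoadDROptionsFromEnv("<project-id>")
	if err != nil {
		log.Fatalf("configuration error: %v", err)
	}
	fmt.Printf("scenario=%s cluster=%s target=%s add=%d dry-run=%v\n",
		opts.Scenario, opts.ClusterName, opts.TargetRegion, opts.AddNodes, opts.DryRun)
}
```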
@@ -133,6 +136,13 @@ go run examples/monitoring/metrics_process/main.go go run examples/performance/archiving/main.go ``` +### Recovery + +#### Perform Disaster Recovery Operations +```bash +go run examples/performance/recovery/main.go +``` + ## Changelog For list of major changes to this project, see [CHANGELOG](CHANGELOG.md). diff --git a/generated-usage-examples/go/atlas-sdk-go/project-copy/examples/recovery/main.go b/generated-usage-examples/go/atlas-sdk-go/project-copy/examples/recovery/main.go new file mode 100644 index 0000000..7e3ee9f --- /dev/null +++ b/generated-usage-examples/go/atlas-sdk-go/project-copy/examples/recovery/main.go @@ -0,0 +1,115 @@ +package main + +import ( + "context" + "fmt" + "log" + "time" + + "atlas-sdk-go/internal/auth" + "atlas-sdk-go/internal/config" + "atlas-sdk-go/internal/data/recovery" + "atlas-sdk-go/internal/typeutils" + + "github.com/joho/godotenv" + "go.mongodb.org/atlas-sdk/v20250219001/admin" +) + +const ( + scenarioRegionalOutage = "regional-outage" + scenarioDataDeletion = "data-deletion" +) + +func main() { + envFile := ".env.production" + if err := godotenv.Load(envFile); err != nil { + log.Printf("Warning: could not load %s file: %v", envFile, err) + } + + secrets, cfg, err := config.LoadAllFromEnv() + if err != nil { + log.Fatalf("Failed to load configuration %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) + defer cancel() + client, err := auth.NewClient(ctx, cfg, secrets) + if err != nil { + log.Fatalf("Failed to initialize authentication client: %v", err) + } + + opts, err := recovery.LoadDROptionsFromEnv(cfg.ProjectID) + if err != nil { + log.Fatalf("Configuration error: %v", err) + } + + fmt.Printf("Starting disaster recovery scenario: %s\nProject: %s\nCluster: %s\n", opts.Scenario, opts.ProjectID, opts.ClusterName) + + if opts.DryRun { + fmt.Println("DRY RUN: no write operations will be performed") + } + + var summary string + var opErr error + + switch opts.Scenario { + case scenarioRegionalOutage: + summary, opErr = simulateRegionalOutage(ctx, client, opts) + case scenarioDataDeletion: + summary, opErr = executeDataDeletionRestore(ctx, client, opts) + default: + opErr = fmt.Errorf("unsupported DR_SCENARIO '%s'", opts.Scenario) + } + + if opErr != nil { + log.Fatalf("Scenario failed: %v", opErr) + } + + fmt.Println("\n=== Summary ===") + fmt.Println(summary) + fmt.Println("Disaster recovery procedure completed.") +} + +// executeDataDeletionRestore initiates a restore job for a specified snapshot in a MongoDB Atlas cluster. +func executeDataDeletionRestore(ctx context.Context, client *admin.APIClient, o recovery.DrOptions) (string, error) { + job := admin.DiskBackupSnapshotRestoreJob{SnapshotId: &o.SnapshotID, TargetClusterName: &o.ClusterName} + if o.DryRun { + return fmt.Sprintf("(dry-run) Would submit restore job for snapshot %s", o.SnapshotID), nil + } + _, _, err := client.CloudBackupsApi.CreateBackupRestoreJob(ctx, o.ProjectID, o.ClusterName, &job).Execute() + if err != nil { + return "", fmt.Errorf("create restore job: %w", err) + } + return fmt.Sprintf("Restore job submitted for snapshot %s", o.SnapshotID), nil +} + +// simulateRegionalOutage modifies the electable node count in a target region for a MongoDB Atlas cluster. 
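+// The update payload contains only the modified replication specs; the function
+// returns a short human-readable summary of the applied (or, with DryRun, the
+// intended) change.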
+func simulateRegionalOutage(ctx context.Context, client *admin.APIClient, o recovery.DrOptions) (string, error) { + cluster, _, err := client.ClustersApi.GetCluster(ctx, o.ProjectID, o.ClusterName).Execute() + if err != nil { + return "", fmt.Errorf("get cluster: %w", err) + } + if !cluster.HasReplicationSpecs() { + return "", fmt.Errorf("cluster has no replication specs") + } + repl := cluster.GetReplicationSpecs() + addedNodes, foundTarget := recovery.AddElectableNodesToRegion(repl, o.TargetRegion, o.AddNodes) + if !foundTarget { + return "", fmt.Errorf("target region '%s' not found in replication specs", o.TargetRegion) + } + zeroedRegions := 0 + if o.OutageRegion != "" { + zeroedRegions = recovery.ZeroElectableNodesInRegion(repl, o.OutageRegion) + } + payload := admin.NewClusterDescription20240805() + payload.SetReplicationSpecs(repl) + if o.DryRun { + return fmt.Sprintf("(dry-run) Would add %d electable nodes to %s%s", addedNodes, o.TargetRegion, typeutils.SuffixZeroed(zeroedRegions, o.OutageRegion)), nil + } + _, _, err = client.ClustersApi.UpdateCluster(ctx, o.ProjectID, o.ClusterName, payload).Execute() + if err != nil { + return "", fmt.Errorf("update cluster: %w", err) + } + return fmt.Sprintf("Added %d electable nodes to %s%s", addedNodes, o.TargetRegion, typeutils.SuffixZeroed(zeroedRegions, o.OutageRegion)), nil +} + diff --git a/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/options.go b/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/options.go new file mode 100644 index 0000000..47ed284 --- /dev/null +++ b/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/options.go @@ -0,0 +1,119 @@ +package recovery + +import ( + "fmt" + "os" + "strconv" + "strings" + + "atlas-sdk-go/internal/typeutils" +) + +const ( + defaultAddNodes = 1 + scenarioRegionalOutage = "regional-outage" + scenarioDataDeletion = "data-deletion" +) + +// DrOptions holds the scenario and configuration parameters used by the +// disaster recovery example. Values are typically loaded from environment +// variables. Only the fields relevant to the chosen Scenario are required. +// +// Scenario values: +// - "regional-outage" : simulate adding capacity to a healthy region +// - "data-deletion" : submit a snapshot restore job +// Required per scenario: +// regional-outage: ProjectID, ClusterName, TargetRegion +// data-deletion: ProjectID, ClusterName, SnapshotID +// Optional: +// OutageRegion (regional-outage) region to zero electable nodes +// AddNodes (regional-outage) number of electable nodes to add (default 1) +// DryRun when true prints intended actions only. +type DrOptions struct { + Scenario string + ProjectID string + ClusterName string + TargetRegion string + OutageRegion string + AddNodes int + SnapshotID string + DryRun bool +} + +// LoadDROptionsFromEnv reads environment variables and validates scenario-specific requirements. 
+// Defaults are applied first, then overridden if env vars are present: +// +// DR_SCENARIO (req) regional-outage | data-deletion +// ATLAS_PROJECT_ID (red unless provided via config loader) +// ATLAS_CLUSTER_NAME (req) target cluster name +// DR_TARGET_REGION (regional-outage req) region receiving added capacity +// DR_OUTAGE_REGION (regional-outage opt) region considered impaired (its electable nodes set to 0) +// DR_ADD_NODES (regional-outage opt) number of electable nodes to add (default: 1) +// DR_SNAPSHOT_ID (data-deletion req) snapshot ID to restore +// DR_DRY_RUN (opt bool) if true, only log intended actions (default: false) +func LoadDROptionsFromEnv(fallbackProjectID string) (DrOptions, error) { + o := DrOptions{ + AddNodes: defaultAddNodes, + } + + o.Scenario = strings.ToLower(strings.TrimSpace(os.Getenv("DR_SCENARIO"))) + o.ProjectID = typeutils.FirstNonEmpty(os.Getenv("ATLAS_PROJECT_ID"), fallbackProjectID) + o.ClusterName = strings.TrimSpace(os.Getenv("ATLAS_CLUSTER_NAME")) + o.TargetRegion = strings.TrimSpace(os.Getenv("DR_TARGET_REGION")) + o.OutageRegion = strings.TrimSpace(os.Getenv("DR_OUTAGE_REGION")) + o.SnapshotID = strings.TrimSpace(os.Getenv("DR_SNAPSHOT_ID")) + + if v, ok := os.LookupEnv("DR_ADD_NODES"); ok { + n, err := strconv.Atoi(strings.TrimSpace(v)) + if err != nil { + return o, fmt.Errorf("invalid DR_ADD_NODES value '%s': must be a positive integer", v) + } + if n <= 0 { + return o, fmt.Errorf("DR_ADD_NODES must be a positive integer, got %d", n) + } + o.AddNodes = n + } + + if v, ok := os.LookupEnv("DR_DRY_RUN"); ok { + o.DryRun = typeutils.ParseBool(v) + } + if err := validateRequiredFields(o); err != nil { + return o, err + } + if err := validateScenarioRequirements(o); err != nil { + return o, err + } + + return o, nil +} + +func validateRequiredFields(o DrOptions) error { + if o.Scenario == "" { + return fmt.Errorf("DR_SCENARIO is required") + } + if o.ProjectID == "" { + return fmt.Errorf("ATLAS_PROJECT_ID is required") + } + if o.ClusterName == "" { + return fmt.Errorf("ATLAS_CLUSTER_NAME is required") + } + return nil +} + +// validateScenarioRequirements checks that scenario-specific required fields are set. +func validateScenarioRequirements(o DrOptions) error { + switch o.Scenario { + case scenarioRegionalOutage: + if o.TargetRegion == "" { + return fmt.Errorf("DR_TARGET_REGION is required for %s scenario", scenarioRegionalOutage) + } + case scenarioDataDeletion: + if o.SnapshotID == "" { + return fmt.Errorf("DR_SNAPSHOT_ID is required for %s scenario", scenarioDataDeletion) + } + default: + return fmt.Errorf("unsupported DR_SCENARIO '%s': valid options are %s, %s", + o.Scenario, scenarioRegionalOutage, scenarioDataDeletion) + } + return nil +} diff --git a/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/restore.go b/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/restore.go new file mode 100644 index 0000000..a9f46ba --- /dev/null +++ b/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/data/recovery/restore.go @@ -0,0 +1,57 @@ +package recovery + +import ( + "go.mongodb.org/atlas-sdk/v20250219001/admin" +) + +// AddElectableNodesToRegion increases electable node count in the specified target region. 
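+// It returns the total number of electable nodes added across matching region
+// configs and a bool indicating whether the target region was found at all.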
+func AddElectableNodesToRegion(repl []admin.ReplicationSpec20240805, targetRegion string, addNodes int) (int, bool) { + added := 0 + found := false + for i := range repl { + rcs := repl[i].GetRegionConfigs() + for j := range rcs { + regionName := "" + if rcs[j].HasRegionName() { + regionName = rcs[j].GetRegionName() + } + if regionName == targetRegion && rcs[j].HasElectableSpecs() { + es := rcs[j].GetElectableSpecs() + before := 0 + if es.HasNodeCount() { + before = es.GetNodeCount() + } + es.SetNodeCount(before + addNodes) + rcs[j].SetElectableSpecs(es) + added += addNodes + found = true + } + } + repl[i].SetRegionConfigs(rcs) + } + return added, found +} + +// ZeroElectableNodesInRegion sets electable node count to zero in the outage region, returning count of regions modified. +func ZeroElectableNodesInRegion(repl []admin.ReplicationSpec20240805, outageRegion string) int { + zeroed := 0 + for i := range repl { + rcs := repl[i].GetRegionConfigs() + for j := range rcs { + regionName := "" + if rcs[j].HasRegionName() { + regionName = rcs[j].GetRegionName() + } + if regionName == outageRegion && rcs[j].HasElectableSpecs() { + es := rcs[j].GetElectableSpecs() + if es.HasNodeCount() && es.GetNodeCount() > 0 { + es.SetNodeCount(0) + rcs[j].SetElectableSpecs(es) + zeroed++ + } + } + } + repl[i].SetRegionConfigs(rcs) + } + return zeroed +} diff --git a/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/typeutils/load.go b/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/typeutils/load.go new file mode 100644 index 0000000..e5747f1 --- /dev/null +++ b/generated-usage-examples/go/atlas-sdk-go/project-copy/internal/typeutils/load.go @@ -0,0 +1,38 @@ +package typeutils + +import ( + "fmt" + "strings" +) + +// FirstNonEmpty returns the first non-empty, non-whitespace string from values, or an empty string if none found. +func FirstNonEmpty(values ...string) string { + for _, v := range values { + if strings.TrimSpace(v) != "" { + return v + } + } + return "" +} + +// ParseBool interprets a string as a boolean. It returns true for "true", "1", "yes", "y" (case insensitive, trimmed), and false otherwise. +func ParseBool(v string) bool { + v = strings.ToLower(strings.TrimSpace(v)) + return v == "true" || v == "1" || v == "yes" || v == "y" +} + +// SuffixZeroed returns a formatted string if zeroed > 0, otherwise an empty string. +func SuffixZeroed(zeroed int, region string) string { + if zeroed == 0 { + return "" + } + return fmt.Sprintf(", zeroed electable nodes in region %s", region) +} + +// DefaultIfBlank returns d if v is an empty string, otherwise returns v. +func DefaultIfBlank(v, d string) string { + if v == "" { + return d + } + return v +} diff --git a/usage-examples/go/atlas-sdk-go/README.md b/usage-examples/go/atlas-sdk-go/README.md index 96b9817..3420349 100644 --- a/usage-examples/go/atlas-sdk-go/README.md +++ b/usage-examples/go/atlas-sdk-go/README.md @@ -18,6 +18,7 @@ Currently, the repository includes examples that demonstrate the following: - Return all linked organizations from a specific billing organization - Get historical invoices for an organization - Programmatically archive Atlas cluster data +- Perform disaster recovery operations (e.g. restore from snapshot) As the Architecture Center documentation evolves, this repository will be updated with new examples and improvements to existing code. @@ -29,7 +30,8 @@ and improvements to existing code. 
├── examples # Runnable examples by category │ ├── billing/ │ ├── monitoring/ -│ └── performance/ +│ ├── performance/ +│ └── recovery/ ├── configs # Atlas configuration template │ └── config.example.json ├── internal # Shared utilities and helpers @@ -42,7 +44,8 @@ and improvements to existing code. │ ├── errors/ │ ├── fileutils/ │ ├── logs/ -│ └── metrics/ +│ ├── metrics/ +│ └── typeutils/ ├── go.mod ├── go.sum ├── CHANGELOG.md # List of major changes to the project @@ -61,10 +64,10 @@ and improvements to existing code. 1. Create a `.env.` file in the root directory with your MongoDB Atlas service account credentials. For example, create a `.env.development` file for your dev environment: ```dotenv - MONGODB_ATLAS_SERVICE_ACCOUNT_ID= - MONGODB_ATLAS_SERVICE_ACCOUNT_SECRET= - ATLAS_DOWNLOADS_DIR="tmp/atlas_downloads" # optional download directory - CONFIG_PATH="configs/config.development.json" # optional path to Atlas config file + MONGODB_ATLAS_SERVICE_ACCOUNT_ID= + MONGODB_ATLAS_SERVICE_ACCOUNT_SECRET= + ATLAS_DOWNLOADS_DIR="tmp/atlas_downloads" # optional download directory + CONFIG_PATH="configs/config.development.json" # optional path to Atlas config file ``` > **NOTE:** For production, use a secrets manager (e.g. HashiCorp Vault, AWS Secrets Manager) > instead of environment variables. @@ -133,6 +136,13 @@ go run examples/monitoring/metrics_process/main.go go run examples/performance/archiving/main.go ``` +### Recovery + +#### Perform Disaster Recovery Operations +```bash +go run examples/performance/recovery/main.go +``` + ## Changelog For list of major changes to this project, see [CHANGELOG](CHANGELOG.md). diff --git a/usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go b/usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go deleted file mode 100644 index cf33c13..0000000 --- a/usage-examples/go/atlas-sdk-go/examples/disaster_recovery/main.go +++ /dev/null @@ -1,246 +0,0 @@ -package main - -import ( - "atlas-sdk-go/internal/auth" - "atlas-sdk-go/internal/config" - "atlas-sdk-go/internal/errors" - "context" - "flag" - "fmt" - "github.com/joho/godotenv" - "log" - "os" - "time" - - "go.mongodb.org/atlas-sdk/v20250219001/admin" -) - -// const ( -// logsDir = "logs" -// ) -// -// type Config struct { -// PublicKey string -// PrivateKey string -// ProjectID string -// ClusterName string -// BackupID string -// ScenarioType string -// } - -func main() { - if err := godotenv.Load(); err != nil { - log.Printf("Warning: .env file not loaded: %v", err) - } - - secrets, cfg, err := config.LoadAll("configs/config.json") - if err != nil { - errors.ExitWithError("Failed to load configuration", err) - } - - client, err := auth.NewClient(cfg, secrets) - if err != nil { - errors.ExitWithError("Failed to initialize authentication client", err) - } - - // Parse command line flags for DR configuration - drCfg := parseFlags() - - ctx := context.Background() - setupLogging() - - // Execute the requested disaster recovery scenario - switch drCfg.ScenarioType { - case "regional-outage": - handleRegionalOutage(ctx, client, drCfg) - case "cloud-provider-outage": - handleCloudProviderOutage(ctx, client, drCfg) - case "restore-data": - handleDataRestoration(ctx, client, drCfg) - default: - log.Fatalf("Unknown scenario type: %s", drCfg.ScenarioType) - } -} - -func handleRegionalOutage(ctx context.Context, sdk *admin.APIClient, cfg Config) { - log.Println("Handling regional outage by adding nodes to unaffected regions...") - - // 1. 
Get current cluster configuration - cluster, _, err := sdk.ClustersApi.GetCluster(ctx, cfg.ProjectID, cfg.ClusterName).Execute() - if err != nil { - log.Fatalf("Failed to get cluster details: %v", err) - } - - // 2. Identify regions that are currently not in use and add a node - var newRegions []admin.ReplicationSpec20240805 - foundRegion := false - - if cluster.ReplicationSpecs != nil { - for _, region := range *cluster.ReplicationSpecs { - newRegions = append(newRegions, region) - if *region.ZoneName == "EU_WEST_1" { - foundRegion = true - } - } - } - - if !foundRegion { - // Add a new region that's unaffected by the outage - priority := int64(5) - electableNodes := int64(1) - readOnlyNodes := int64(0) - analyticsNodes := int64(0) - regionName := "EU_WEST_1" - - newRegion := admin.ReplicationSpec20240805{ - RegionName: ®ionName, - Priority: &priority, - RegionConfigs: &admin.CloudRegionConfig20240805{} & electableNodes, - ReadOnlyNodes: &readOnlyNodes, - AnalyticsNodes: &analyticsNodes, - } - newRegions = append(newRegions, newRegion) - } - - // 3. Update cluster with new regions - updateRequest := admin.AdvancedClusterDescriptionV2{ - ReplicationSpecs: &newRegions, - } - - _, _, err = sdk.ClustersApi.UpdateCluster(ctx, cfg.ProjectID, cfg.ClusterName, &updateRequest).Execute() - if err != nil { - log.Fatalf("Failed to update cluster: %v", err) - } - - log.Println("Successfully added nodes to unaffected regions") -} - -func handleCloudProviderOutage(ctx context.Context, sdk *admin.APIClient, cfg Config) { - log.Println("Handling cloud provider outage...") - - // 1. Get current cluster configuration - sourceCluster, _, err := sdk.ClustersApi.GetCluster(ctx, cfg.ProjectID, cfg.ClusterName).Execute() - if err != nil { - log.Fatalf("Failed to get source cluster details: %v", err) - } - - // 2. Create new cluster on alternative cloud provider - newClusterName := cfg.ClusterName + "-recovery" - clusterType := "REPLICASET" - providerName := "GCP" // Switch from AWS to GCP or vice versa - - newCluster := admin.AdvancedClusterDescriptionV2{ - Name: &newClusterName, - ClusterType: &clusterType, - ProviderName: &providerName, - DiskSizeGB: sourceCluster.DiskSizeGB, - MongoDBMajorVersion: sourceCluster.MongoDBMajorVersion, - } - - // Configure replica set based on original configuration - // (simplified for example - would need more configuration in practice) - - _, _, err = sdk.ClustersApi.CreateCluster(ctx, cfg.ProjectID, &newCluster).Execute() - if err != nil { - log.Fatalf("Failed to create recovery cluster: %v", err) - } - - log.Printf("Created recovery cluster: %s", newClusterName) - - // 3. Wait for cluster to be ready - log.Println("Waiting for cluster to become available...") - waitForClusterReady(ctx, sdk, cfg.ProjectID, newClusterName) - - // 4. 
Restore the most recent snapshot to the new cluster - log.Println("Restoring backup to recovery cluster...") - restoreRequest := admin.DiskBackupSnapshotRestoreJob{ - TargetClusterName: &newClusterName, - SnapshotId: &cfg.BackupID, - } - - _, _, err = sdk.CloudBackupsApi.CreateBackupRestoreJob(ctx, cfg.ProjectID, cfg.ClusterName, &restoreRequest).Execute() - if err != nil { - log.Fatalf("Failed to restore backup: %v", err) - } - - log.Printf("Successfully initiated restore to cluster %s from backup %s", newClusterName, cfg.BackupID) - log.Println("Once restore is complete, update your application connection strings to point to the new cluster") -} - -func handleDataRestoration(ctx context.Context, sdk *admin.APIClient, cfg Config) { - log.Println("Handling data restoration after accidental deletion...") - - // Restore from point-in-time backup - restoreRequest := admin.DiskBackupSnapshotRestoreJob{ - TargetClusterName: &cfg.ClusterName, - SnapshotId: &cfg.BackupID, - } - - _, _, err := sdk.CloudBackupsApi.CreateBackupRestoreJob(ctx, cfg.ProjectID, cfg.ClusterName, &restoreRequest).Execute() - if err != nil { - log.Fatalf("Failed to restore backup: %v", err) - } - - log.Printf("Successfully initiated restore to cluster %s from backup %s", cfg.ClusterName, cfg.BackupID) - log.Println("After restoration, verify data integrity and reimport any data collected since the backup") -} - -// Helper function to wait for cluster to be ready -func waitForClusterReady(ctx context.Context, sdk *admin.APIClient, projectID, clusterName string) { - for { - cluster, _, err := sdk.ClustersApi.GetCluster(ctx, projectID, clusterName).Execute() - if err != nil { - log.Printf("Error checking cluster status: %v", err) - } else if cluster.StateName != nil && *cluster.StateName == "IDLE" { - log.Println("Cluster is ready") - return - } - - log.Printf("Cluster status: %s. 
Waiting 30 seconds...", *cluster.StateName) - time.Sleep(30 * time.Second) - } -} - -func parseFlags() Config { - cfg := Config{} - - flag.StringVar(&cfg.PublicKey, "public-key", os.Getenv("ATLAS_PUBLIC_KEY"), "MongoDB Atlas public API key") - flag.StringVar(&cfg.PrivateKey, "private-key", os.Getenv("ATLAS_PRIVATE_KEY"), "MongoDB Atlas private API key") - flag.StringVar(&cfg.ProjectID, "project-id", "", "MongoDB Atlas project ID") - flag.StringVar(&cfg.ClusterName, "cluster-name", "", "MongoDB Atlas cluster name") - flag.StringVar(&cfg.BackupID, "backup-id", "", "MongoDB Atlas backup snapshot ID (for restore operations)") - flag.StringVar(&cfg.ScenarioType, "scenario", "", "Disaster recovery scenario type: regional-outage, cloud-provider-outage, restore-data") - - flag.Parse() - - // Validate required parameters - if cfg.PublicKey == "" || cfg.PrivateKey == "" || cfg.ProjectID == "" || cfg.ClusterName == "" || cfg.ScenarioType == "" { - flag.Usage() - os.Exit(1) - } - - return cfg -} - -func setupLogging() { - // Ensure logs directory exists - defaultDir := os.Getenv("ATLAS_DOWNLOADS_DIR") - logDir := logsDir - if defaultDir != "" { - logDir = fmt.Sprintf("%s/%s", defaultDir, logsDir) - } - - if err := os.MkdirAll(logDir, 0755); err != nil { - log.Fatalf("Failed to create logs directory: %v", err) - } - - // Set up logging to file - logFile := fmt.Sprintf("%s/disaster_recovery_%s.log", logDir, time.Now().Format("20060102_150405")) - f, err := os.Create(logFile) - if err != nil { - log.Fatalf("Failed to create log file: %v", err) - } - - log.SetOutput(f) - log.Printf("Starting disaster recovery script at %s", time.Now().Format(time.RFC3339)) -} diff --git a/usage-examples/go/atlas-sdk-go/examples/recovery/main.go b/usage-examples/go/atlas-sdk-go/examples/recovery/main.go new file mode 100644 index 0000000..5f549b5 --- /dev/null +++ b/usage-examples/go/atlas-sdk-go/examples/recovery/main.go @@ -0,0 +1,120 @@ +// :snippet-start: disaster-recovery +// :state-remove-start: copy +// See entire project at https://github.com/mongodb/atlas-architecture-go-sdk +// :state-remove-end: [copy] +package main + +import ( + "context" + "fmt" + "log" + "time" + + "atlas-sdk-go/internal/auth" + "atlas-sdk-go/internal/config" + "atlas-sdk-go/internal/data/recovery" + "atlas-sdk-go/internal/typeutils" + + "github.com/joho/godotenv" + "go.mongodb.org/atlas-sdk/v20250219001/admin" +) + +const ( + scenarioRegionalOutage = "regional-outage" + scenarioDataDeletion = "data-deletion" +) + +func main() { + envFile := ".env.production" + if err := godotenv.Load(envFile); err != nil { + log.Printf("Warning: could not load %s file: %v", envFile, err) + } + + secrets, cfg, err := config.LoadAllFromEnv() + if err != nil { + log.Fatalf("Failed to load configuration %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) + defer cancel() + client, err := auth.NewClient(ctx, cfg, secrets) + if err != nil { + log.Fatalf("Failed to initialize authentication client: %v", err) + } + + opts, err := recovery.LoadDROptionsFromEnv(cfg.ProjectID) + if err != nil { + log.Fatalf("Configuration error: %v", err) + } + + fmt.Printf("Starting disaster recovery scenario: %s\nProject: %s\nCluster: %s\n", opts.Scenario, opts.ProjectID, opts.ClusterName) + + if opts.DryRun { + fmt.Println("DRY RUN: no write operations will be performed") + } + + var summary string + var opErr error + + switch opts.Scenario { + case scenarioRegionalOutage: + summary, opErr = simulateRegionalOutage(ctx, client, opts) + case 
scenarioDataDeletion: + summary, opErr = executeDataDeletionRestore(ctx, client, opts) + default: + opErr = fmt.Errorf("unsupported DR_SCENARIO '%s'", opts.Scenario) + } + + if opErr != nil { + log.Fatalf("Scenario failed: %v", opErr) + } + + fmt.Println("\n=== Summary ===") + fmt.Println(summary) + fmt.Println("Disaster recovery procedure completed.") +} + +// executeDataDeletionRestore initiates a restore job for a specified snapshot in a MongoDB Atlas cluster. +func executeDataDeletionRestore(ctx context.Context, client *admin.APIClient, o recovery.DrOptions) (string, error) { + job := admin.DiskBackupSnapshotRestoreJob{SnapshotId: &o.SnapshotID, TargetClusterName: &o.ClusterName} + if o.DryRun { + return fmt.Sprintf("(dry-run) Would submit restore job for snapshot %s", o.SnapshotID), nil + } + _, _, err := client.CloudBackupsApi.CreateBackupRestoreJob(ctx, o.ProjectID, o.ClusterName, &job).Execute() + if err != nil { + return "", fmt.Errorf("create restore job: %w", err) + } + return fmt.Sprintf("Restore job submitted for snapshot %s", o.SnapshotID), nil +} + +// simulateRegionalOutage modifies the electable node count in a target region for a MongoDB Atlas cluster. +func simulateRegionalOutage(ctx context.Context, client *admin.APIClient, o recovery.DrOptions) (string, error) { + cluster, _, err := client.ClustersApi.GetCluster(ctx, o.ProjectID, o.ClusterName).Execute() + if err != nil { + return "", fmt.Errorf("get cluster: %w", err) + } + if !cluster.HasReplicationSpecs() { + return "", fmt.Errorf("cluster has no replication specs") + } + repl := cluster.GetReplicationSpecs() + addedNodes, foundTarget := recovery.AddElectableNodesToRegion(repl, o.TargetRegion, o.AddNodes) + if !foundTarget { + return "", fmt.Errorf("target region '%s' not found in replication specs", o.TargetRegion) + } + zeroedRegions := 0 + if o.OutageRegion != "" { + zeroedRegions = recovery.ZeroElectableNodesInRegion(repl, o.OutageRegion) + } + payload := admin.NewClusterDescription20240805() + payload.SetReplicationSpecs(repl) + if o.DryRun { + return fmt.Sprintf("(dry-run) Would add %d electable nodes to %s%s", addedNodes, o.TargetRegion, typeutils.SuffixZeroed(zeroedRegions, o.OutageRegion)), nil + } + _, _, err = client.ClustersApi.UpdateCluster(ctx, o.ProjectID, o.ClusterName, payload).Execute() + if err != nil { + return "", fmt.Errorf("update cluster: %w", err) + } + return fmt.Sprintf("Added %d electable nodes to %s%s", addedNodes, o.TargetRegion, typeutils.SuffixZeroed(zeroedRegions, o.OutageRegion)), nil +} + +// :snippet-end: [disaster-recovery] diff --git a/usage-examples/go/atlas-sdk-go/examples/recovery/main_test.go b/usage-examples/go/atlas-sdk-go/examples/recovery/main_test.go new file mode 100644 index 0000000..ddd274f --- /dev/null +++ b/usage-examples/go/atlas-sdk-go/examples/recovery/main_test.go @@ -0,0 +1,171 @@ +package main + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.mongodb.org/atlas-sdk/v20250219001/admin" + + "atlas-sdk-go/internal/data/recovery" +) + +// testClient helper replicates pattern from internal tests. 
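+// It starts an httptest server backed by the given handler and returns an
+// admin.APIClient pointed at that server, so the scenario functions can be
+// exercised without real Atlas credentials or network access.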
+func testClient(t *testing.T, handler http.HandlerFunc) *admin.APIClient { + server := httptest.NewServer(handler) + t.Cleanup(server.Close) + client, err := admin.NewClient(admin.UseBaseURL(server.URL)) + require.NoError(t, err) + return client +} + +func TestExecuteDataDeletionRestore_Seam(t *testing.T) { + ctx := context.Background() + opts := recovery.DrOptions{ProjectID: "proj", ClusterName: "ClusterA", SnapshotID: "snap1"} + + // Dry-run path (should not call API) + { + var called atomic.Bool + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + called.Store(true) + w.WriteHeader(http.StatusInternalServerError) + }) + msg, err := executeDataDeletionRestore(ctx, client, recovery.DrOptions{ProjectID: opts.ProjectID, ClusterName: opts.ClusterName, SnapshotID: opts.SnapshotID, DryRun: true}) + require.NoError(t, err) + assert.Contains(t, msg, "(dry-run)") + assert.False(t, called.Load(), "API must not be invoked for dry-run") + } + + // Success path + { + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost && strings.Contains(r.URL.Path, "/backup/restoreJobs") { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusCreated) + // Provide minimal fields typical for restore job object + _, _ = w.Write([]byte(`{"id":"job1","snapshotId":"snap1","deliveryType":"automated"}`)) + return + } + w.WriteHeader(http.StatusNotFound) + }) + msg, err := executeDataDeletionRestore(ctx, client, opts) + require.NoError(t, err) + assert.Contains(t, msg, "Restore job submitted") + } + + // Error path + { + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + }) + msg, err := executeDataDeletionRestore(ctx, client, opts) + require.Error(t, err) + assert.Empty(t, msg) + assert.Contains(t, err.Error(), "create restore job") + } +} + +func TestSimulateRegionalOutage_Seam(t *testing.T) { + ctx := context.Background() + projectID := "proj" + clusterName := "ClusterA" + baseClusterJSON := `{"replicationSpecs":[{"regionConfigs":[{"regionName":"us-east-1","electableSpecs":{"nodeCount":3}},{"regionName":"us-west-2","electableSpecs":{"nodeCount":3}}]}]}` + noReplJSON := `{}` + // Dry-run add nodes only + { + var updateCalled atomic.Bool + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/clusters/"+clusterName) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(baseClusterJSON)) + return + } + if r.Method != http.MethodGet { + updateCalled.Store(true) + } + w.WriteHeader(http.StatusNotFound) + }) + msg, err := simulateRegionalOutage(ctx, client, recovery.DrOptions{ProjectID: projectID, ClusterName: clusterName, TargetRegion: "us-east-1", AddNodes: 2, DryRun: true}) + require.NoError(t, err) + assert.Contains(t, msg, "(dry-run)") + assert.Contains(t, msg, "add 2 electable nodes") + assert.False(t, updateCalled.Load()) + } + // Dry run with outage region zeroing + { + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/clusters/"+clusterName) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(baseClusterJSON)) + return + } + w.WriteHeader(http.StatusNotFound) + }) + msg, err := simulateRegionalOutage(ctx, client, recovery.DrOptions{ProjectID: projectID, ClusterName: 
clusterName, TargetRegion: "us-east-1", OutageRegion: "us-west-2", AddNodes: 1, DryRun: true}) + require.NoError(t, err) + assert.Contains(t, msg, "zeroed electable nodes") + } + // Target region not found + { + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/clusters/"+clusterName) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(baseClusterJSON)) + return + } + w.WriteHeader(http.StatusNotFound) + }) + msg, err := simulateRegionalOutage(ctx, client, recovery.DrOptions{ProjectID: projectID, ClusterName: clusterName, TargetRegion: "eu-central-1", AddNodes: 1, DryRun: true}) + require.Error(t, err) + assert.Empty(t, msg) + assert.Contains(t, err.Error(), "target region") + } + // No replication specs + { + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/clusters/"+clusterName) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(noReplJSON)) + return + } + w.WriteHeader(http.StatusNotFound) + }) + msg, err := simulateRegionalOutage(ctx, client, recovery.DrOptions{ProjectID: projectID, ClusterName: clusterName, TargetRegion: "us-east-1", AddNodes: 1, DryRun: true}) + require.Error(t, err) + assert.Empty(t, msg) + assert.Contains(t, err.Error(), "no replication specs") + } + // Update cluster error (non dry-run) + { + var getCount, updateCount int32 + client := testClient(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/clusters/"+clusterName) { + atomic.AddInt32(&getCount, 1) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(baseClusterJSON)) + return + } + if strings.HasSuffix(r.URL.Path, "/clusters/"+clusterName) { + atomic.AddInt32(&updateCount, 1) + w.WriteHeader(http.StatusInternalServerError) + return + } + w.WriteHeader(http.StatusNotFound) + }) + msg, err := simulateRegionalOutage(ctx, client, recovery.DrOptions{ProjectID: projectID, ClusterName: clusterName, TargetRegion: "us-east-1", AddNodes: 1}) + require.Error(t, err) + assert.Empty(t, msg) + assert.Equal(t, int32(1), getCount) + assert.Equal(t, int32(1), updateCount) + } +} diff --git a/usage-examples/go/atlas-sdk-go/internal/data/recovery/options.go b/usage-examples/go/atlas-sdk-go/internal/data/recovery/options.go new file mode 100644 index 0000000..47ed284 --- /dev/null +++ b/usage-examples/go/atlas-sdk-go/internal/data/recovery/options.go @@ -0,0 +1,119 @@ +package recovery + +import ( + "fmt" + "os" + "strconv" + "strings" + + "atlas-sdk-go/internal/typeutils" +) + +const ( + defaultAddNodes = 1 + scenarioRegionalOutage = "regional-outage" + scenarioDataDeletion = "data-deletion" +) + +// DrOptions holds the scenario and configuration parameters used by the +// disaster recovery example. Values are typically loaded from environment +// variables. Only the fields relevant to the chosen Scenario are required. 
+// +// Scenario values: +// - "regional-outage" : simulate adding capacity to a healthy region +// - "data-deletion" : submit a snapshot restore job +// Required per scenario: +// regional-outage: ProjectID, ClusterName, TargetRegion +// data-deletion: ProjectID, ClusterName, SnapshotID +// Optional: +// OutageRegion (regional-outage) region to zero electable nodes +// AddNodes (regional-outage) number of electable nodes to add (default 1) +// DryRun when true prints intended actions only. +type DrOptions struct { + Scenario string + ProjectID string + ClusterName string + TargetRegion string + OutageRegion string + AddNodes int + SnapshotID string + DryRun bool +} + +// LoadDROptionsFromEnv reads environment variables and validates scenario-specific requirements. +// Defaults are applied first, then overridden if env vars are present: +// +// DR_SCENARIO (req) regional-outage | data-deletion +// ATLAS_PROJECT_ID (red unless provided via config loader) +// ATLAS_CLUSTER_NAME (req) target cluster name +// DR_TARGET_REGION (regional-outage req) region receiving added capacity +// DR_OUTAGE_REGION (regional-outage opt) region considered impaired (its electable nodes set to 0) +// DR_ADD_NODES (regional-outage opt) number of electable nodes to add (default: 1) +// DR_SNAPSHOT_ID (data-deletion req) snapshot ID to restore +// DR_DRY_RUN (opt bool) if true, only log intended actions (default: false) +func LoadDROptionsFromEnv(fallbackProjectID string) (DrOptions, error) { + o := DrOptions{ + AddNodes: defaultAddNodes, + } + + o.Scenario = strings.ToLower(strings.TrimSpace(os.Getenv("DR_SCENARIO"))) + o.ProjectID = typeutils.FirstNonEmpty(os.Getenv("ATLAS_PROJECT_ID"), fallbackProjectID) + o.ClusterName = strings.TrimSpace(os.Getenv("ATLAS_CLUSTER_NAME")) + o.TargetRegion = strings.TrimSpace(os.Getenv("DR_TARGET_REGION")) + o.OutageRegion = strings.TrimSpace(os.Getenv("DR_OUTAGE_REGION")) + o.SnapshotID = strings.TrimSpace(os.Getenv("DR_SNAPSHOT_ID")) + + if v, ok := os.LookupEnv("DR_ADD_NODES"); ok { + n, err := strconv.Atoi(strings.TrimSpace(v)) + if err != nil { + return o, fmt.Errorf("invalid DR_ADD_NODES value '%s': must be a positive integer", v) + } + if n <= 0 { + return o, fmt.Errorf("DR_ADD_NODES must be a positive integer, got %d", n) + } + o.AddNodes = n + } + + if v, ok := os.LookupEnv("DR_DRY_RUN"); ok { + o.DryRun = typeutils.ParseBool(v) + } + if err := validateRequiredFields(o); err != nil { + return o, err + } + if err := validateScenarioRequirements(o); err != nil { + return o, err + } + + return o, nil +} + +func validateRequiredFields(o DrOptions) error { + if o.Scenario == "" { + return fmt.Errorf("DR_SCENARIO is required") + } + if o.ProjectID == "" { + return fmt.Errorf("ATLAS_PROJECT_ID is required") + } + if o.ClusterName == "" { + return fmt.Errorf("ATLAS_CLUSTER_NAME is required") + } + return nil +} + +// validateScenarioRequirements checks that scenario-specific required fields are set. 
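+// It runs after the generic required-field checks and reports the first missing
+// scenario-specific variable, or an unsupported scenario name.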
+func validateScenarioRequirements(o DrOptions) error { + switch o.Scenario { + case scenarioRegionalOutage: + if o.TargetRegion == "" { + return fmt.Errorf("DR_TARGET_REGION is required for %s scenario", scenarioRegionalOutage) + } + case scenarioDataDeletion: + if o.SnapshotID == "" { + return fmt.Errorf("DR_SNAPSHOT_ID is required for %s scenario", scenarioDataDeletion) + } + default: + return fmt.Errorf("unsupported DR_SCENARIO '%s': valid options are %s, %s", + o.Scenario, scenarioRegionalOutage, scenarioDataDeletion) + } + return nil +} diff --git a/usage-examples/go/atlas-sdk-go/internal/data/recovery/restore.go b/usage-examples/go/atlas-sdk-go/internal/data/recovery/restore.go new file mode 100644 index 0000000..a9f46ba --- /dev/null +++ b/usage-examples/go/atlas-sdk-go/internal/data/recovery/restore.go @@ -0,0 +1,57 @@ +package recovery + +import ( + "go.mongodb.org/atlas-sdk/v20250219001/admin" +) + +// AddElectableNodesToRegion increases electable node count in the specified target region. +func AddElectableNodesToRegion(repl []admin.ReplicationSpec20240805, targetRegion string, addNodes int) (int, bool) { + added := 0 + found := false + for i := range repl { + rcs := repl[i].GetRegionConfigs() + for j := range rcs { + regionName := "" + if rcs[j].HasRegionName() { + regionName = rcs[j].GetRegionName() + } + if regionName == targetRegion && rcs[j].HasElectableSpecs() { + es := rcs[j].GetElectableSpecs() + before := 0 + if es.HasNodeCount() { + before = es.GetNodeCount() + } + es.SetNodeCount(before + addNodes) + rcs[j].SetElectableSpecs(es) + added += addNodes + found = true + } + } + repl[i].SetRegionConfigs(rcs) + } + return added, found +} + +// ZeroElectableNodesInRegion sets electable node count to zero in the outage region, returning count of regions modified. +func ZeroElectableNodesInRegion(repl []admin.ReplicationSpec20240805, outageRegion string) int { + zeroed := 0 + for i := range repl { + rcs := repl[i].GetRegionConfigs() + for j := range rcs { + regionName := "" + if rcs[j].HasRegionName() { + regionName = rcs[j].GetRegionName() + } + if regionName == outageRegion && rcs[j].HasElectableSpecs() { + es := rcs[j].GetElectableSpecs() + if es.HasNodeCount() && es.GetNodeCount() > 0 { + es.SetNodeCount(0) + rcs[j].SetElectableSpecs(es) + zeroed++ + } + } + } + repl[i].SetRegionConfigs(rcs) + } + return zeroed +} diff --git a/usage-examples/go/atlas-sdk-go/internal/data/recovery/restore_test.go b/usage-examples/go/atlas-sdk-go/internal/data/recovery/restore_test.go new file mode 100644 index 0000000..b2e36e6 --- /dev/null +++ b/usage-examples/go/atlas-sdk-go/internal/data/recovery/restore_test.go @@ -0,0 +1,6 @@ +package recovery + +// Tests for regional outage and data deletion recovery flows have moved to +// examples/recovery/main_test.go where the executable seams live. This package +// now only exposes pure helper functions which are indirectly covered via the +// example seam tests. diff --git a/usage-examples/go/atlas-sdk-go/internal/typeutils/load.go b/usage-examples/go/atlas-sdk-go/internal/typeutils/load.go new file mode 100644 index 0000000..e5747f1 --- /dev/null +++ b/usage-examples/go/atlas-sdk-go/internal/typeutils/load.go @@ -0,0 +1,38 @@ +package typeutils + +import ( + "fmt" + "strings" +) + +// FirstNonEmpty returns the first non-empty, non-whitespace string from values, or an empty string if none found. 
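+// Note: the winning value is returned as-is; trimming is only applied for the
+// emptiness check.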
+func FirstNonEmpty(values ...string) string { + for _, v := range values { + if strings.TrimSpace(v) != "" { + return v + } + } + return "" +} + +// ParseBool interprets a string as a boolean. It returns true for "true", "1", "yes", "y" (case insensitive, trimmed), and false otherwise. +func ParseBool(v string) bool { + v = strings.ToLower(strings.TrimSpace(v)) + return v == "true" || v == "1" || v == "yes" || v == "y" +} + +// SuffixZeroed returns a formatted string if zeroed > 0, otherwise an empty string. +func SuffixZeroed(zeroed int, region string) string { + if zeroed == 0 { + return "" + } + return fmt.Sprintf(", zeroed electable nodes in region %s", region) +} + +// DefaultIfBlank returns d if v is an empty string, otherwise returns v. +func DefaultIfBlank(v, d string) string { + if v == "" { + return d + } + return v +}
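To make the replication-spec helpers easier to picture end to end, here is a minimal, self-contained sketch of `AddElectableNodesToRegion` and `ZeroElectableNodesInRegion` operating on an in-memory cluster description. The JSON mirrors the fixture shape used in `examples/recovery/main_test.go`; region names and node counts are assumed placeholder values.

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"

	"go.mongodb.org/atlas-sdk/v20250219001/admin"

	"atlas-sdk-go/internal/data/recovery"
)

func main() {
	// One replication spec with two regions, same shape as the test fixture.
	raw := `{"replicationSpecs":[{"regionConfigs":[
		{"regionName":"us-east-1","electableSpecs":{"nodeCount":3}},
		{"regionName":"us-west-2","electableSpecs":{"nodeCount":3}}]}]}`

	var cluster admin.ClusterDescription20240805
	if err := json.Unmarshal([]byte(raw), &cluster); err != nil {
		log.Fatal(err)
	}

	repl := cluster.GetReplicationSpecs()

	// Add two electable nodes to the healthy region...
	added, found := recovery.AddElectableNodesToRegion(repl, "us-east-1", 2)
	// ...and take the impaired region's electable nodes to zero.
	zeroed := recovery.ZeroElectableNodesInRegion(repl, "us-west-2")

	fmt.Printf("added=%d foundTarget=%v zeroedRegions=%d\n", added, found, zeroed)
	// Expected: added=2 foundTarget=true zeroedRegions=1
}
```

The helpers update the elements of the `repl` slice in place, so the same slice can then be set on an update payload with `SetReplicationSpecs`, which is exactly what `simulateRegionalOutage` does before calling `UpdateCluster`.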