Skip to content

Commit 8d3a846

Browse files
Add a readonly function to walk a dynamic tree (#3084)
## Changes

This PR adds a read-only version of `dyn.Walk`, which can be used to walk the bundle configuration tree without rebuilding it. Benchmarks show it is 4x faster than the normal walk method.

## Why

Many walk use cases are read-only, so such a method is useful. It will be used to perform required-field and enum validation in DABs.

## Tests

New unit tests.
1 parent ed02bc2 commit 8d3a846

File tree

5 files changed

+573
-0
lines changed

5 files changed

+573
-0
lines changed
Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,240 @@
1+
package bundletest
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"strconv"
7+
"testing"
8+
9+
"github.com/databricks/cli/bundle"
10+
"github.com/databricks/cli/bundle/config"
11+
"github.com/databricks/cli/bundle/config/resources"
12+
"github.com/databricks/cli/libs/diag"
13+
"github.com/databricks/cli/libs/dyn"
14+
"github.com/databricks/databricks-sdk-go/service/jobs"
15+
"github.com/stretchr/testify/require"
16+
)
17+
18+
// jobExample is a JSON payload describing a single multi-task job.
// It is unmarshaled into jobs.JobSettings by BundleV (below) to build
// large synthetic bundle configurations for walk benchmarks and tests.
const jobExample = `
{
  "budget_policy_id": "550e8400-e29b-41d4-a716-446655440000",
  "continuous": {
    "pause_status": "UNPAUSED"
  },
  "deployment": {
    "kind": "BUNDLE",
    "metadata_file_path": "string"
  },
  "description": "This job contain multiple tasks that are required to produce the weekly shark sightings report.",
  "edit_mode": "UI_LOCKED",
  "email_notifications": {
    "no_alert_for_skipped_runs": false,
    "on_duration_warning_threshold_exceeded": [
      "user.name@databricks.com"
    ],
    "on_failure": [
      "user.name@databricks.com"
    ],
    "on_start": [
      "user.name@databricks.com"
    ],
    "on_streaming_backlog_exceeded": [
      "user.name@databricks.com"
    ],
    "on_success": [
      "user.name@databricks.com"
    ]
  },
  "environments": [
    {
      "environment_key": "string",
      "spec": {
        "client": "1",
        "dependencies": [
          "string"
        ]
      }
    }
  ],
  "format": "SINGLE_TASK",
  "git_source": {
    "git_branch": "main",
    "git_provider": "gitHub",
    "git_url": "https://github.com/databricks/databricks-cli"
  },
  "health": {
    "rules": [
      {
        "metric": "RUN_DURATION_SECONDS",
        "op": "GREATER_THAN",
        "value": 10
      }
    ]
  },
  "job_clusters": [
    {
      "job_cluster_key": "auto_scaling_cluster",
      "new_cluster": {
        "autoscale": {
          "max_workers": 16,
          "min_workers": 2
        },
        "node_type_id": null,
        "spark_conf": {
          "spark.speculation": "true"
        },
        "spark_version": "7.3.x-scala2.12"
      }
    }
  ],
  "max_concurrent_runs": 10,
  "name": "A multitask job",
  "notification_settings": {
    "no_alert_for_canceled_runs": false,
    "no_alert_for_skipped_runs": false
  },
  "parameters": [
    {
      "default": "users",
      "name": "table"
    }
  ],
  "performance_target": "PERFORMANCE_OPTIMIZED",
  "queue": {
    "enabled": true
  },
  "run_as": {
    "service_principal_name": "692bc6d0-ffa3-11ed-be56-0242ac120002",
    "user_name": "user@databricks.com"
  },
  "schedule": {
    "pause_status": "UNPAUSED",
    "quartz_cron_expression": "20 30 * * * ?",
    "timezone_id": "Europe/London"
  },
  "tags": {
    "cost-center": "engineering",
    "team": "jobs"
  },
  "tasks": [
    {
      "depends_on": [],
      "description": "Extracts session data from events",
      "existing_cluster_id": "0923-164208-meows279",
      "libraries": [
        {
          "jar": "dbfs:/mnt/databricks/Sessionize.jar"
        }
      ],
      "max_retries": 3,
      "min_retry_interval_millis": 2000,
      "retry_on_timeout": false,
      "spark_jar_task": {
        "main_class_name": "com.databricks.Sessionize",
        "parameters": [
          "--data",
          "dbfs:/path/to/data.json"
        ]
      },
      "task_key": "Sessionize",
      "timeout_seconds": 86400
    },
    {
      "depends_on": [],
      "description": "Ingests order data",
      "job_cluster_key": "auto_scaling_cluster",
      "libraries": [
        {
          "jar": "dbfs:/mnt/databricks/OrderIngest.jar"
        }
      ],
      "max_retries": 3,
      "min_retry_interval_millis": 2000,
      "retry_on_timeout": false,
      "spark_jar_task": {
        "main_class_name": "com.databricks.OrdersIngest",
        "parameters": [
          "--data",
          "dbfs:/path/to/order-data.json"
        ]
      },
      "task_key": "Orders_Ingest",
      "timeout_seconds": 86400
    },
    {
      "depends_on": [
        {
          "task_key": "Orders_Ingest"
        },
        {
          "task_key": "Sessionize"
        }
      ],
      "description": "Matches orders with user sessions",
      "max_retries": 3,
      "min_retry_interval_millis": 2000,
      "new_cluster": {
        "autoscale": {
          "max_workers": 16,
          "min_workers": 2
        },
        "node_type_id": null,
        "spark_conf": {
          "spark.speculation": "true"
        },
        "spark_version": "7.3.x-scala2.12"
      },
      "notebook_task": {
        "base_parameters": {
          "age": "35",
          "name": "John Doe"
        },
        "notebook_path": "/Users/user.name@databricks.com/Match"
      },
      "retry_on_timeout": false,
      "run_if": "ALL_SUCCESS",
      "timeout_seconds": 86400
    }
  ],
  "timeout_seconds": 86400,
  "trigger": {
    "file_arrival": {
      "min_time_between_triggers_seconds": 0,
      "url": "string",
      "wait_after_last_change_seconds": 0
    },
    "pause_status": "UNPAUSED",
    "periodic": {
      "interval": 0,
      "unit": "HOURS"
    }
  }
}`
213+
214+
func BundleV(b *testing.B, numJobs int) dyn.Value {
215+
allJobs := map[string]*resources.Job{}
216+
for i := range numJobs {
217+
job := jobs.JobSettings{}
218+
err := json.Unmarshal([]byte(jobExample), &job)
219+
require.NoError(b, err)
220+
221+
allJobs[strconv.Itoa(i)] = &resources.Job{
222+
JobSettings: job,
223+
}
224+
}
225+
226+
myBundle := bundle.Bundle{
227+
Config: config.Root{
228+
Resources: config.Resources{
229+
Jobs: allJobs,
230+
},
231+
},
232+
}
233+
234+
// Apply noop mutator to initialize the bundle value.
235+
bundle.ApplyFunc(context.Background(), &myBundle, func(ctx context.Context, b *bundle.Bundle) diag.Diagnostics {
236+
return nil
237+
})
238+
239+
return myBundle.Config.Value()
240+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package bundletest
2+
3+
import (
4+
"testing"
5+
6+
"github.com/databricks/cli/libs/dyn"
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
// This took 40ms to run on 18th June 2025.
11+
func BenchmarkWalkReadOnly(b *testing.B) {
12+
input := BundleV(b, 10000)
13+
14+
for b.Loop() {
15+
err := dyn.WalkReadOnly(input, func(p dyn.Path, v dyn.Value) error {
16+
return nil
17+
})
18+
assert.NoError(b, err)
19+
}
20+
}
21+
22+
// This took 160ms to run on 18th June 2025.
23+
func BenchmarkWalk(b *testing.B) {
24+
input := BundleV(b, 10000)
25+
26+
for b.Loop() {
27+
_, err := dyn.Walk(input, func(p dyn.Path, v dyn.Value) (dyn.Value, error) {
28+
return v, nil
29+
})
30+
assert.NoError(b, err)
31+
}
32+
}

libs/dyn/mapping.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,21 @@ func NewMapping() Mapping {
2828
}
2929
}
3030

31+
// NewMappingFromPairs computes a [Mapping] from a list of [Pair]s. The index
32+
// map does not need to be provided since that will be computed from the
33+
// key-value pairs provided.
34+
func NewMappingFromPairs(pairs []Pair) Mapping {
35+
index := make(map[string]int)
36+
for i, p := range pairs {
37+
index[p.Key.MustString()] = i
38+
}
39+
40+
return Mapping{
41+
pairs: pairs,
42+
index: index,
43+
}
44+
}
45+
3146
// newMappingWithSize creates a new Mapping preallocated to the specified size.
3247
func newMappingWithSize(size int) Mapping {
3348
return Mapping{

libs/dyn/walk_read_only.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package dyn
2+
3+
// WalkReadOnly walks the configuration tree in readonly mode and calls the given function on each node.
// The callback may return ErrSkip to skip traversal of a subtree.
// If the callback returns another error, the walk is aborted, and the error is returned.
//
// Unlike [Walk], no replacement tree is built (only an error is returned),
// which makes this variant cheaper when the caller does not modify values.
// NOTE(review): the Path passed to the callback shares backing storage across
// calls — callers that retain it should copy it first.
func WalkReadOnly(v Value, fn func(p Path, v Value) error) error {
	return walkReadOnly(v, EmptyPath, fn)
}
9+
10+
// Unexported counterpart to WalkReadOnly.
11+
// It carries the path leading up to the current node,
12+
// such that it can be passed to the callback function.
13+
func walkReadOnly(v Value, p Path, fn func(p Path, v Value) error) error {
14+
if err := fn(p, v); err != nil {
15+
if err == ErrSkip {
16+
return nil
17+
}
18+
return err
19+
}
20+
21+
switch v.Kind() {
22+
case KindMap:
23+
m := v.MustMap()
24+
for _, pair := range m.Pairs() {
25+
pk := pair.Key
26+
pv := pair.Value
27+
if err := walkReadOnly(pv, append(p, Key(pk.MustString())), fn); err != nil {
28+
return err
29+
}
30+
}
31+
case KindSequence:
32+
s := v.MustSequence()
33+
for i := range s {
34+
if err := walkReadOnly(s[i], append(p, Index(i)), fn); err != nil {
35+
return err
36+
}
37+
}
38+
}
39+
40+
return nil
41+
}

0 commit comments

Comments
 (0)