From 1a8ff403711606750a847255557df4d101fea2fd Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 19 Mar 2026 13:01:37 +0000 Subject: [PATCH 01/15] Add error_message field to bundle deploy telemetry Track the first error diagnostic summary encountered during bundle deploy in telemetry. Move telemetry logging into a defer so it's always captured, even when deploy fails. Co-authored-by: Isaac --- bundle/phases/deploy.go | 1 - bundle/phases/telemetry.go | 4 +++- cmd/bundle/utils/process.go | 23 +++++++++++++++++++---- libs/logdiag/logdiag.go | 16 ++++++++++++++++ libs/telemetry/protos/bundle_deploy.go | 3 +++ 5 files changed, 41 insertions(+), 6 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 4613a7a211..df39cb79d0 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -208,7 +208,6 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - logDeployTelemetry(ctx, b) bundle.ApplyContext(ctx, b, scripts.Execute(config.ScriptPostDeploy)) } diff --git a/bundle/phases/telemetry.go b/bundle/phases/telemetry.go index 4584e9fc5e..ba9af95ef3 100644 --- a/bundle/phases/telemetry.go +++ b/bundle/phases/telemetry.go @@ -33,7 +33,8 @@ func getExecutionTimes(b *bundle.Bundle) []protos.IntMapEntry { return executionTimes } -func logDeployTelemetry(ctx context.Context, b *bundle.Bundle) { +// LogDeployTelemetry logs a telemetry event for a bundle deploy command. 
+func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) { resourcesCount := int64(0) _, err := dyn.MapByPattern(b.Config.Value(), dyn.NewPattern(dyn.Key("resources"), dyn.AnyKey(), dyn.AnyKey()), func(p dyn.Path, v dyn.Value) (dyn.Value, error) { resourcesCount++ @@ -149,6 +150,7 @@ func logDeployTelemetry(ctx context.Context, b *bundle.Bundle) { BundleDeployEvent: &protos.BundleDeployEvent{ BundleUuid: bundleUuid, DeploymentId: b.Metrics.DeploymentId.String(), + ErrorMessage: errMsg, ResourceCount: resourcesCount, ResourceJobCount: int64(len(b.Config.Resources.Jobs)), diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 8390548912..30e0beadde 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -80,7 +80,7 @@ func ProcessBundle(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, err return b, err } -func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, error) { +func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { var err error ctx := cmd.Context() if opts.SkipInitContext { @@ -93,7 +93,24 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, } // Load bundle config and apply target - b := root.MustConfigureBundle(cmd) + b = root.MustConfigureBundle(cmd) + + // Log deploy telemetry on all exit paths. This is a defer to ensure + // telemetry is logged even when the deploy command fails, for both + // diagnostic errors and regular Go errors. 
+ if opts.Deploy { + defer func() { + if b == nil { + return + } + errMsg := logdiag.GetFirstErrorSummary(ctx) + if errMsg == "" && retErr != nil && !errors.Is(retErr, root.ErrAlreadyPrinted) { + errMsg = retErr.Error() + } + phases.LogDeployTelemetry(ctx, b, errMsg) + }() + } + if logdiag.HasError(ctx) { return b, nil, root.ErrAlreadyPrinted } @@ -147,8 +164,6 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, } } - var stateDesc *statemgmt.StateDesc - shouldReadState := opts.ReadState || opts.AlwaysPull || opts.InitIDs || opts.ErrorOnEmptyState || opts.PreDeployChecks || opts.Deploy || opts.ReadPlanPath != "" if shouldReadState { diff --git a/libs/logdiag/logdiag.go b/libs/logdiag/logdiag.go index e466576c87..896d105751 100644 --- a/libs/logdiag/logdiag.go +++ b/libs/logdiag/logdiag.go @@ -30,6 +30,9 @@ type LogDiagData struct { // If Collect is true, diagnostics are appended to Collected. Use SetCollected() to set. Collect bool Collected []diag.Diagnostic + + // Summary of the first error diagnostic logged, if any. + FirstErrorSummary string } // IsSetup returns whether InitContext() was already called. @@ -117,6 +120,16 @@ func FlushCollected(ctx context.Context) diag.Diagnostics { return result } +// GetFirstErrorSummary returns the summary of the first error diagnostic +// logged, or an empty string if no errors have been logged. 
+func GetFirstErrorSummary(ctx context.Context) string { + val := read(ctx) + val.mu.Lock() + defer val.mu.Unlock() + + return val.FirstErrorSummary +} + func LogDiag(ctx context.Context, d diag.Diagnostic) { val := read(ctx) val.mu.Lock() @@ -125,6 +138,9 @@ func LogDiag(ctx context.Context, d diag.Diagnostic) { switch d.Severity { case diag.Error: val.Errors += 1 + if val.FirstErrorSummary == "" { + val.FirstErrorSummary = d.Summary + } case diag.Warning: val.Warnings += 1 case diag.Recommendation: diff --git a/libs/telemetry/protos/bundle_deploy.go b/libs/telemetry/protos/bundle_deploy.go index ab1b3a46de..d9439437d9 100644 --- a/libs/telemetry/protos/bundle_deploy.go +++ b/libs/telemetry/protos/bundle_deploy.go @@ -7,6 +7,9 @@ type BundleDeployEvent struct { // UUID associated with the deployment. DeploymentId string `json:"deployment_id,omitempty"` + // Error message encountered during the bundle deploy command, if any. + ErrorMessage string `json:"error_message,omitempty"` + ResourceCount int64 `json:"resource_count"` ResourceJobCount int64 `json:"resource_job_count"` ResourcePipelineCount int64 `json:"resource_pipeline_count"` From 5b39bf37fa81294615829adb1ce670d286774c59 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 16:47:06 +0100 Subject: [PATCH 02/15] Fix CI failures in deploy telemetry Two fixes: 1. Remove redundant empty BundleDeployEvent logging in root.go Execute. The defer in process.go already logs detailed deploy telemetry via LogDeployTelemetry on all exit paths. The root.go code caused duplicate events and Upload retries (mock returns numProtoSuccess=1). 2. Add HasConfigUsed guard in telemetry.Upload before calling ConfigUsed. When bundle config fails early (e.g., restricted script execution blocking preinit), ConfigUsed is never set on the context. The LogDeployTelemetry defer still logs an event, so Upload would panic trying to create an API client without auth config. 
Co-authored-by: Isaac --- cmd/root/root.go | 9 --------- libs/telemetry/logger.go | 5 +++++ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cmd/root/root.go b/cmd/root/root.go index e29d6df83c..6b6de2a9ba 100644 --- a/cmd/root/root.go +++ b/cmd/root/root.go @@ -179,15 +179,6 @@ Stack Trace: commandStr := commandString(cmd) ctx = cmd.Context() - // Log bundle deploy failures. Only log if we have successfully configured - // an authenticated Databricks client. We cannot log unauthenticated telemetry - // from the CLI yet. - if cmdctx.HasConfigUsed(ctx) && commandStr == "bundle_deploy" && exitCode != 0 { - telemetry.Log(ctx, protos.DatabricksCliLog{ - BundleDeployEvent: &protos.BundleDeployEvent{}, - }) - } - telemetryErr := telemetry.Upload(cmd.Context(), protos.ExecutionContext{ CmdExecID: cmdctx.ExecId(ctx), Version: build.GetInfo().Version, diff --git a/libs/telemetry/logger.go b/libs/telemetry/logger.go index c294fee17b..70132f69f0 100644 --- a/libs/telemetry/logger.go +++ b/libs/telemetry/logger.go @@ -76,6 +76,11 @@ func Upload(ctx context.Context, ec protos.ExecutionContext) error { protoLogs[i] = string(b) } + if !cmdctx.HasConfigUsed(ctx) { + log.Debugf(ctx, "no auth config available; skipping telemetry upload") + return nil + } + apiClient, err := client.New(cmdctx.ConfigUsed(ctx)) if err != nil { return fmt.Errorf("failed to create API client: %w", err) From c70dbb130b8d8137d33818a9a83a20122aa14da7 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 16:50:51 +0100 Subject: [PATCH 03/15] Cap error message length in deploy telemetry to 500 chars Co-authored-by: Isaac --- bundle/phases/telemetry.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bundle/phases/telemetry.go b/bundle/phases/telemetry.go index ba9af95ef3..ca938a72fd 100644 --- a/bundle/phases/telemetry.go +++ b/bundle/phases/telemetry.go @@ -33,8 +33,15 @@ func getExecutionTimes(b *bundle.Bundle) []protos.IntMapEntry { return executionTimes } +// Maximum 
length of the error message included in telemetry. +const maxErrorMessageLength = 500 + // LogDeployTelemetry logs a telemetry event for a bundle deploy command. func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) { + if len(errMsg) > maxErrorMessageLength { + errMsg = errMsg[:maxErrorMessageLength] + } + resourcesCount := int64(0) _, err := dyn.MapByPattern(b.Config.Value(), dyn.NewPattern(dyn.Key("resources"), dyn.AnyKey(), dyn.AnyKey()), func(p dyn.Path, v dyn.Value) (dyn.Value, error) { resourcesCount++ From a2fb5ddd46cdbd265ad5659ee1d8f28709e1a339 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 17:03:46 +0100 Subject: [PATCH 04/15] Add acceptance test for deploy telemetry error message Co-authored-by: Isaac --- .../bundle/telemetry/deploy-error-message/databricks.yml | 6 ++++++ .../bundle/telemetry/deploy-error-message/out.test.toml | 5 +++++ .../bundle/telemetry/deploy-error-message/output.txt | 9 +++++++++ acceptance/bundle/telemetry/deploy-error-message/script | 5 +++++ 4 files changed, 25 insertions(+) create mode 100644 acceptance/bundle/telemetry/deploy-error-message/databricks.yml create mode 100644 acceptance/bundle/telemetry/deploy-error-message/out.test.toml create mode 100644 acceptance/bundle/telemetry/deploy-error-message/output.txt create mode 100644 acceptance/bundle/telemetry/deploy-error-message/script diff --git a/acceptance/bundle/telemetry/deploy-error-message/databricks.yml b/acceptance/bundle/telemetry/deploy-error-message/databricks.yml new file mode 100644 index 0000000000..a613c03529 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-error-message/databricks.yml @@ -0,0 +1,6 @@ +bundle: + name: test-bundle + +variables: + myvar: + description: a required variable diff --git a/acceptance/bundle/telemetry/deploy-error-message/out.test.toml b/acceptance/bundle/telemetry/deploy-error-message/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ 
b/acceptance/bundle/telemetry/deploy-error-message/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/telemetry/deploy-error-message/output.txt b/acceptance/bundle/telemetry/deploy-error-message/output.txt new file mode 100644 index 0000000000..675c1d67db --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-error-message/output.txt @@ -0,0 +1,9 @@ + +>>> [CLI] bundle deploy +Error: no value assigned to required variable myvar. Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or .databricks/bundle/<target>/variable-overrides.json + + +Exit code: 1 + +>>> cat out.requests.txt +no value assigned to required variable myvar. Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or .databricks/bundle/<target>/variable-overrides.json diff --git a/acceptance/bundle/telemetry/deploy-error-message/script b/acceptance/bundle/telemetry/deploy-error-message/script new file mode 100644 index 0000000000..8c5e853386 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-error-message/script @@ -0,0 +1,5 @@ +errcode trace $CLI bundle deploy + +trace cat out.requests.txt | jq -r 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.error_message' + +rm out.requests.txt From ca229c470c532efeedd699b1ecab068346662ea0 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 17:40:39 +0100 Subject: [PATCH 05/15] Scrub sensitive paths and emails from deploy telemetry error messages Adds a scrubber that runs before error messages are sent to telemetry: 1. Replaces the bundle root path with "." to avoid leaking local paths 2. Redacts remaining home directory paths (/Users/..., /home/..., C:\Users\...) 3.
Redacts email addresses (e.g., in workspace paths) Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath rule. Co-authored-by: Isaac --- .../telemetry/deploy-error-message/output.txt | 2 +- bundle/phases/telemetry.go | 4 + bundle/phases/telemetry_scrub.go | 135 +++++++++ bundle/phases/telemetry_scrub_test.go | 272 ++++++++++++++++++ 4 files changed, 412 insertions(+), 1 deletion(-) create mode 100644 bundle/phases/telemetry_scrub.go create mode 100644 bundle/phases/telemetry_scrub_test.go diff --git a/acceptance/bundle/telemetry/deploy-error-message/output.txt b/acceptance/bundle/telemetry/deploy-error-message/output.txt index 675c1d67db..e6d3c98057 100644 --- a/acceptance/bundle/telemetry/deploy-error-message/output.txt +++ b/acceptance/bundle/telemetry/deploy-error-message/output.txt @@ -6,4 +6,4 @@ Error: no value assigned to required variable myvar. Variables are usually assig Exit code: 1 >>> cat out.requests.txt -no value assigned to required variable myvar. Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or .databricks/bundle/<target>/variable-overrides.json +no value assigned to required variable myvar. Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or [REDACTED_REL_PATH] diff --git a/bundle/phases/telemetry.go b/bundle/phases/telemetry.go index ca938a72fd..2d41a48b65 100644 --- a/bundle/phases/telemetry.go +++ b/bundle/phases/telemetry.go @@ -9,6 +9,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/env" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/telemetry" "github.com/databricks/cli/libs/telemetry/protos" @@ -38,6 +39,9 @@ const maxErrorMessageLength = 500 // LogDeployTelemetry logs a telemetry event for a bundle deploy command.
func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) { + homeDir, _ := env.UserHomeDir(ctx) + errMsg = scrubForTelemetry(errMsg, b.BundleRootPath, homeDir) + if len(errMsg) > maxErrorMessageLength { errMsg = errMsg[:maxErrorMessageLength] } diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go new file mode 100644 index 0000000000..35aba7d0b2 --- /dev/null +++ b/bundle/phases/telemetry_scrub.go @@ -0,0 +1,135 @@ +package phases + +import ( + "path/filepath" + "regexp" + "strings" +) + +// Scrub sensitive information from error messages before sending to telemetry. +// Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath pattern. +// +// References: +// - VS Code: https://github.com/microsoft/vscode/blob/main/src/vs/platform/telemetry/common/telemetryUtils.ts +// - Sentry: https://github.com/getsentry/relay (PII rule: @userpath) +var ( + // Matches home directory paths on macOS and Linux. + // The leading delimiter check avoids matching workspace paths like + // /Workspace/Users/... where /Users is not a top-level component. + unixHomeDirRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/(?:Users|home)/[^\s:,"']+)`) + + // Matches home directory paths on Windows with either backslashes or + // forward slashes (C:\Users\xxx\... or C:/Users/xxx/...). + windowsHomeDirRegexp = regexp.MustCompile(`[A-Z]:[/\\]Users[/\\][^\s:,"']+`) + + // Matches absolute Unix paths with at least two components + // (e.g., /tmp/foo, /Workspace/Users/..., /Volumes/catalog/schema/...). + absPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/[^\s:,"'/]+/[^\s:,"']+)`) + + // Matches relative paths: + // - Explicit: ./foo, ../foo + // - Dot-prefixed directories: .databricks/bundle/..., .cache/foo + // - Home shorthand: ~/.databricks/... 
+ explicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])((?:~|\.\.?|\.[a-zA-Z][^\s:,"'/]*)/[^\s:,"']+)`) + + // Matches implicit relative paths: at least two path components where + // the last component has a file extension (e.g., "resources/job.yml", + // "bundle/dev/state.json"). + implicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])([a-zA-Z0-9_][^\s:,"']*/[^\s:,"']*\.[a-zA-Z][^\s:,"']*)`) + + // Matches email addresses. Workspace paths in Databricks often contain + // emails (e.g., /Workspace/Users/user@example.com/.bundle/dev). + emailRegexp = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`) +) + +// scrubForTelemetry is a best-effort scrubber that removes sensitive path and +// PII information from error messages before they are sent to telemetry. +// The error message is treated as PII by the logging infrastructure but we +// scrub to avoid collecting more information than necessary. +func scrubForTelemetry(msg, bundleRoot, homeDir string) string { + // Replace the bundle root path first since it's the most specific match. + // This turns "/Users/shreyas/project/databricks.yml" into "./databricks.yml". + if bundleRoot != "" { + msg = replacePath(msg, bundleRoot, ".") + } + + // Replace the user's home directory. This catches paths outside the + // bundle root like "/Users/shreyas/.databricks/..." → "~/.databricks/...". + if homeDir != "" { + msg = replacePath(msg, homeDir, "~") + } + + // Regex fallback: redact remaining home directory paths not covered by the + // direct home dir replacement above (e.g., paths from other users or + // non-standard home directory locations). + // Run Windows first to avoid partial matches from the Unix regex on + // paths like C:/Users/... + msg = windowsHomeDirRegexp.ReplaceAllString(msg, "[REDACTED_PATH]") + msg = replaceDelimitedMatch(msg, unixHomeDirRegexp, "[REDACTED_PATH]") + + // Redact all remaining absolute paths. 
+ msg = replaceDelimitedMatch(msg, absPathRegexp, "[REDACTED_PATH]") + + // Redact relative paths. + msg = replaceDelimitedMatch(msg, explicitRelPathRegexp, "[REDACTED_REL_PATH]") + msg = replaceDelimitedMatch(msg, implicitRelPathRegexp, "[REDACTED_REL_PATH]") + + // Redact email addresses. + msg = emailRegexp.ReplaceAllString(msg, "[REDACTED_EMAIL]") + + return msg +} + +// replacePath replaces all occurrences of a directory path with the given +// replacement. It only replaces when the path appears as a complete prefix, +// i.e., followed by `/`, a delimiter, or end of string. This prevents partial +// matches like "/Users/shreyas" matching inside "/Workspace/Users/shreyas@...". +func replacePath(msg, path, replacement string) string { + normalized := filepath.ToSlash(path) + for _, p := range []string{normalized, path} { + msg = strings.ReplaceAll(msg, p+"/", replacement+"/") + + // Replace occurrences not followed by '/' only when the path is at + // a word boundary (followed by delimiter or end of string). + result := strings.Builder{} + for { + idx := strings.Index(msg, p) + if idx == -1 { + result.WriteString(msg) + break + } + after := idx + len(p) + // Check the character after the match. Only replace if it's + // a delimiter or end of string. + if after == len(msg) || strings.ContainsRune(" \t\n:,\"'", rune(msg[after])) { + result.WriteString(msg[:idx]) + result.WriteString(replacement) + msg = msg[after:] + } else { + result.WriteString(msg[:after]) + msg = msg[after:] + } + } + msg = result.String() + } + return msg +} + +const delimiters = " \t\n:,\"'" + +// replaceDelimitedMatch replaces paths matched by a regex that uses a leading +// delimiter group `(?:^|[\s:,"'])`. The optional delimiter character is +// preserved and only the path itself is replaced. 
+func replaceDelimitedMatch(msg string, re *regexp.Regexp, replacement string) string { + return re.ReplaceAllStringFunc(msg, func(match string) string { + if len(match) == 0 { + return match + } + // If the first character is a delimiter, preserve it. + if strings.ContainsRune(delimiters, rune(match[0])) { + return match[:1] + replacement + } + // Otherwise the match starts at ^ and the whole match is the path. + return replacement + }) +} diff --git a/bundle/phases/telemetry_scrub_test.go b/bundle/phases/telemetry_scrub_test.go new file mode 100644 index 0000000000..db64a4eebc --- /dev/null +++ b/bundle/phases/telemetry_scrub_test.go @@ -0,0 +1,272 @@ +package phases + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestScrubForTelemetry_BundleRootPath(t *testing.T) { + tests := []struct { + name string + msg string + bundleRoot string + expected string + }{ + { + // Bundle root is replaced with "." and then the resulting + // ./databricks.yml is caught by the relative path scrubber. 
+ name: "replaces bundle root in file path", + msg: "failed to load /home/user/project/databricks.yml: invalid config", + bundleRoot: "/home/user/project", + expected: "failed to load [REDACTED_REL_PATH]: invalid config", + }, + { + name: "replaces bundle root without trailing content", + msg: "error at /home/user/project", + bundleRoot: "/home/user/project", + expected: "error at .", + }, + { + name: "replaces multiple occurrences", + msg: "path /home/user/project/a.yml and /home/user/project/b.yml", + bundleRoot: "/home/user/project", + expected: "path [REDACTED_REL_PATH] and [REDACTED_REL_PATH]", + }, + { + name: "empty bundle root is no-op", + msg: "some error", + bundleRoot: "", + expected: "some error", + }, + { + name: "empty message", + msg: "", + bundleRoot: "/home/user/project", + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, tt.bundleRoot, "")) + }) + } +} + +func TestScrubForTelemetry_HomeDir(t *testing.T) { + tests := []struct { + name string + msg string + bundleRoot string + homeDir string + expected string + }{ + { + // Home dir is replaced with ~ and then ~/.databricks/config.json + // is caught by the relative path scrubber. 
+ name: "replaces home dir and scrubs resulting path", + msg: "failed to read /Users/shreyas/.databricks/config.json", + homeDir: "/Users/shreyas", + expected: "failed to read [REDACTED_REL_PATH]", + }, + { + name: "home dir replacement for multiple paths", + msg: "error: /Users/shreyas/project/file.yml and /Users/shreyas/.cache/other", + homeDir: "/Users/shreyas", + expected: "error: [REDACTED_REL_PATH] and [REDACTED_REL_PATH]", + }, + { + name: "bundle root takes priority over home dir", + msg: "error at /Users/shreyas/project/databricks.yml", + bundleRoot: "/Users/shreyas/project", + homeDir: "/Users/shreyas", + expected: "error at [REDACTED_REL_PATH]", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, tt.bundleRoot, tt.homeDir)) + }) + } +} + +func TestScrubForTelemetry_HomeDirRegexFallback(t *testing.T) { + tests := []struct { + name string + msg string + expected string + }{ + { + name: "macOS home dir", + msg: "failed to read /Users/otheruser/some-project/file.yml", + expected: "failed to read [REDACTED_PATH]", + }, + { + name: "Linux home dir", + msg: "failed to read /home/runner/work/project/file.yml", + expected: "failed to read [REDACTED_PATH]", + }, + { + name: "home dir in middle of message", + msg: "error: /Users/jane/project/a.yml: not found, try again", + expected: "error: [REDACTED_PATH]: not found, try again", + }, + { + name: "Windows home dir with backslashes", + msg: `error at C:\Users\shreyas\project\file.yml`, + expected: "error at [REDACTED_PATH]", + }, + { + name: "Windows home dir with forward slashes", + msg: "error at C:/Users/shreyas/project/file.yml", + expected: "error at [REDACTED_PATH]", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + }) + } +} + +func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { + tests := []struct { + name string + msg string + 
expected string + }{ + { + name: "tmp path", + msg: "failed to write /tmp/bundle-xyz/state.json", + expected: "failed to write [REDACTED_PATH]", + }, + { + name: "var folders path", + msg: "error reading /var/folders/7t/n_tz6x9d4xj91h48pf8md5zh0000gp/T/test123/file", + expected: "error reading [REDACTED_PATH]", + }, + { + name: "etc path", + msg: "config at /etc/databricks/config.json not found", + expected: "config at [REDACTED_PATH] not found", + }, + { + name: "workspace path is redacted", + msg: "uploading to /Workspace/Users/dev/.bundle/files", + expected: "uploading to [REDACTED_PATH]", + }, + { + name: "volume path is redacted", + msg: "artifact at /Volumes/catalog/schema/volume/artifact.whl", + expected: "artifact at [REDACTED_PATH]", + }, + { + name: "dbfs path is redacted", + msg: "state at /dbfs/mnt/data/state.json", + expected: "state at [REDACTED_PATH]", + }, + { + name: "single component path is not matched", + msg: "POST /telemetry-ext failed", + expected: "POST /telemetry-ext failed", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + }) + } +} + +func TestScrubForTelemetry_RelativePaths(t *testing.T) { + tests := []struct { + name string + msg string + expected string + }{ + { + name: "explicit relative path with ./", + msg: "failed to load ./resources/job.yml", + expected: "failed to load [REDACTED_REL_PATH]", + }, + { + name: "explicit relative path with ../", + msg: "path ../parent/file.yml not allowed", + expected: "path [REDACTED_REL_PATH] not allowed", + }, + { + name: "implicit relative path with extension", + msg: "failed to read resources/pipeline.yml: not found", + expected: "failed to read [REDACTED_REL_PATH]: not found", + }, + { + name: "dot-prefixed directory path", + msg: "error reading .databricks/bundle/dev/variable-overrides.json", + expected: "error reading [REDACTED_REL_PATH]", + }, + { + name: "tilde home path", + msg: "reading 
~/.databricks/config.json failed", + expected: "reading [REDACTED_REL_PATH] failed", + }, + { + name: "single filename without path separator is preserved", + msg: "failed to load databricks.yml", + expected: "failed to load databricks.yml", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + }) + } +} + +func TestScrubForTelemetry_Emails(t *testing.T) { + tests := []struct { + name string + msg string + expected string + }{ + { + name: "email in message", + msg: "access denied for user@example.com in workspace", + expected: "access denied for [REDACTED_EMAIL] in workspace", + }, + { + name: "no email present", + msg: "some error without emails", + expected: "some error without emails", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + }) + } +} + +func TestScrubForTelemetry_Combined(t *testing.T) { + msg := "failed to load /Users/shreyas/myproject/databricks.yml: " + + "workspace /Workspace/Users/shreyas@databricks.com/.bundle is invalid, " + + "also tried /home/other/fallback/config.yml, " + + "temp at /tmp/bundle-cache/state.json, " + + "see .databricks/bundle/dev/variable-overrides.json" + + got := scrubForTelemetry(msg, "/Users/shreyas/myproject", "/Users/shreyas") + + assert.Equal(t, + "failed to load [REDACTED_REL_PATH]: "+ + "workspace [REDACTED_PATH] is invalid, "+ + "also tried [REDACTED_PATH], "+ + "temp at [REDACTED_PATH], "+ + "see [REDACTED_REL_PATH]", + got, + ) +} From c5b09031e45bf19b9beaf3ee7cbe7f43654ef8ec Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 18:53:34 +0100 Subject: [PATCH 06/15] Simplify telemetry scrubber: remove home-specific code, add workspace path redaction - Replace windowsHomeDirRegexp with general windowsAbsPathRegexp - Add workspacePathRegexp for [REDACTED_WORKSPACE_PATH] label - Use capturing groups instead of 
replaceDelimitedMatch helper - Merge HomeDirRegexFallback tests into AbsolutePaths test Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 112 +++++++------------------- bundle/phases/telemetry_scrub_test.go | 86 +++++++++++--------- 2 files changed, 79 insertions(+), 119 deletions(-) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 35aba7d0b2..56e98f89d3 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -13,29 +13,27 @@ import ( // - VS Code: https://github.com/microsoft/vscode/blob/main/src/vs/platform/telemetry/common/telemetryUtils.ts // - Sentry: https://github.com/getsentry/relay (PII rule: @userpath) var ( - // Matches home directory paths on macOS and Linux. - // The leading delimiter check avoids matching workspace paths like - // /Workspace/Users/... where /Users is not a top-level component. - unixHomeDirRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/(?:Users|home)/[^\s:,"']+)`) + // Matches Windows absolute paths with at least two components + // (e.g., C:\foo\bar, D:/projects/myapp). + windowsAbsPathRegexp = regexp.MustCompile(`[A-Z]:[/\\][^\s:,"'/\\]+[/\\][^\s:,"']+`) - // Matches home directory paths on Windows with either backslashes or - // forward slashes (C:\Users\xxx\... or C:/Users/xxx/...). - windowsHomeDirRegexp = regexp.MustCompile(`[A-Z]:[/\\]Users[/\\][^\s:,"']+`) + // Matches Databricks workspace paths (/Workspace/...). + workspacePathRegexp = regexp.MustCompile(`(^|[\s:,"'])(/Workspace/[^\s:,"']+)`) // Matches absolute Unix paths with at least two components - // (e.g., /tmp/foo, /Workspace/Users/..., /Volumes/catalog/schema/...). - absPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/[^\s:,"'/]+/[^\s:,"']+)`) + // (e.g., /home/user/..., /tmp/foo). 
+ absPathRegexp = regexp.MustCompile(`(^|[\s:,"'])(/[^\s:,"'/]+/[^\s:,"']+)`) // Matches relative paths: // - Explicit: ./foo, ../foo // - Dot-prefixed directories: .databricks/bundle/..., .cache/foo // - Home shorthand: ~/.databricks/... - explicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])((?:~|\.\.?|\.[a-zA-Z][^\s:,"'/]*)/[^\s:,"']+)`) + explicitRelPathRegexp = regexp.MustCompile(`(^|[\s:,"'])((?:~|\.\.?|\.[a-zA-Z][^\s:,"'/]*)/[^\s:,"']+)`) // Matches implicit relative paths: at least two path components where // the last component has a file extension (e.g., "resources/job.yml", // "bundle/dev/state.json"). - implicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])([a-zA-Z0-9_][^\s:,"']*/[^\s:,"']*\.[a-zA-Z][^\s:,"']*)`) + implicitRelPathRegexp = regexp.MustCompile(`(^|[\s:,"'])([a-zA-Z0-9_][^\s:,"']*/[^\s:,"']*\.[a-zA-Z][^\s:,"']*)`) // Matches email addresses. Workspace paths in Databricks often contain // emails (e.g., /Workspace/Users/user@example.com/.bundle/dev). @@ -47,32 +45,20 @@ var ( // The error message is treated as PII by the logging infrastructure but we // scrub to avoid collecting more information than necessary. func scrubForTelemetry(msg, bundleRoot, homeDir string) string { - // Replace the bundle root path first since it's the most specific match. - // This turns "/Users/shreyas/project/databricks.yml" into "./databricks.yml". - if bundleRoot != "" { - msg = replacePath(msg, bundleRoot, ".") - } - - // Replace the user's home directory. This catches paths outside the - // bundle root like "/Users/shreyas/.databricks/..." → "~/.databricks/...". - if homeDir != "" { - msg = replacePath(msg, homeDir, "~") - } + // Direct string replacement for the two most important known paths. + // This handles the common case (path followed by separator) and the + // regex catch-all below handles any remaining occurrences. 
+ msg = replacePath(msg, bundleRoot, ".") + msg = replacePath(msg, homeDir, "~") - // Regex fallback: redact remaining home directory paths not covered by the - // direct home dir replacement above (e.g., paths from other users or - // non-standard home directory locations). - // Run Windows first to avoid partial matches from the Unix regex on - // paths like C:/Users/... - msg = windowsHomeDirRegexp.ReplaceAllString(msg, "[REDACTED_PATH]") - msg = replaceDelimitedMatch(msg, unixHomeDirRegexp, "[REDACTED_PATH]") - - // Redact all remaining absolute paths. - msg = replaceDelimitedMatch(msg, absPathRegexp, "[REDACTED_PATH]") + // Redact absolute paths. + msg = windowsAbsPathRegexp.ReplaceAllString(msg, "[REDACTED_PATH]") + msg = workspacePathRegexp.ReplaceAllString(msg, "${1}[REDACTED_WORKSPACE_PATH]") + msg = absPathRegexp.ReplaceAllString(msg, "${1}[REDACTED_PATH]") // Redact relative paths. - msg = replaceDelimitedMatch(msg, explicitRelPathRegexp, "[REDACTED_REL_PATH]") - msg = replaceDelimitedMatch(msg, implicitRelPathRegexp, "[REDACTED_REL_PATH]") + msg = explicitRelPathRegexp.ReplaceAllString(msg, "${1}[REDACTED_REL_PATH]") + msg = implicitRelPathRegexp.ReplaceAllString(msg, "${1}[REDACTED_REL_PATH]") // Redact email addresses. msg = emailRegexp.ReplaceAllString(msg, "[REDACTED_EMAIL]") @@ -80,56 +66,18 @@ func scrubForTelemetry(msg, bundleRoot, homeDir string) string { return msg } -// replacePath replaces all occurrences of a directory path with the given -// replacement. It only replaces when the path appears as a complete prefix, -// i.e., followed by `/`, a delimiter, or end of string. This prevents partial -// matches like "/Users/shreyas" matching inside "/Workspace/Users/shreyas@...". +// replacePath replaces occurrences of path/ with replacement/ in msg. +// Only replaces when the path is followed by a separator to avoid partial +// matches (e.g., "/Users/shreyas" inside "/Workspace/Users/shreyas@..."). 
+// Any remaining occurrences are handled by the regex catch-all patterns. func replacePath(msg, path, replacement string) string { + if path == "" { + return msg + } normalized := filepath.ToSlash(path) - for _, p := range []string{normalized, path} { - msg = strings.ReplaceAll(msg, p+"/", replacement+"/") - - // Replace occurrences not followed by '/' only when the path is at - // a word boundary (followed by delimiter or end of string). - result := strings.Builder{} - for { - idx := strings.Index(msg, p) - if idx == -1 { - result.WriteString(msg) - break - } - after := idx + len(p) - // Check the character after the match. Only replace if it's - // a delimiter or end of string. - if after == len(msg) || strings.ContainsRune(" \t\n:,\"'", rune(msg[after])) { - result.WriteString(msg[:idx]) - result.WriteString(replacement) - msg = msg[after:] - } else { - result.WriteString(msg[:after]) - msg = msg[after:] - } - } - msg = result.String() + msg = strings.ReplaceAll(msg, normalized+"/", replacement+"/") + if path != normalized { + msg = strings.ReplaceAll(msg, path+string(filepath.Separator), replacement+string(filepath.Separator)) } return msg } - -const delimiters = " \t\n:,\"'" - -// replaceDelimitedMatch replaces paths matched by a regex that uses a leading -// delimiter group `(?:^|[\s:,"'])`. The optional delimiter character is -// preserved and only the path itself is replaced. -func replaceDelimitedMatch(msg string, re *regexp.Regexp, replacement string) string { - return re.ReplaceAllStringFunc(msg, func(match string) string { - if len(match) == 0 { - return match - } - // If the first character is a delimiter, preserve it. - if strings.ContainsRune(delimiters, rune(match[0])) { - return match[:1] + replacement - } - // Otherwise the match starts at ^ and the whole match is the path. 
- return replacement - }) -} diff --git a/bundle/phases/telemetry_scrub_test.go b/bundle/phases/telemetry_scrub_test.go index db64a4eebc..5aa3475555 100644 --- a/bundle/phases/telemetry_scrub_test.go +++ b/bundle/phases/telemetry_scrub_test.go @@ -22,10 +22,12 @@ func TestScrubForTelemetry_BundleRootPath(t *testing.T) { expected: "failed to load [REDACTED_REL_PATH]: invalid config", }, { + // Bundle root without trailing separator is caught by the + // absolute path regex. name: "replaces bundle root without trailing content", msg: "error at /home/user/project", bundleRoot: "/home/user/project", - expected: "error at .", + expected: "error at [REDACTED_PATH]", }, { name: "replaces multiple occurrences", @@ -92,37 +94,67 @@ func TestScrubForTelemetry_HomeDir(t *testing.T) { } } -func TestScrubForTelemetry_HomeDirRegexFallback(t *testing.T) { +func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { tests := []struct { name string msg string expected string }{ { - name: "macOS home dir", + name: "tmp path", + msg: "failed to write /tmp/bundle-xyz/state.json", + expected: "failed to write [REDACTED_PATH]", + }, + { + name: "var folders path", + msg: "error reading /var/folders/7t/n_tz6x9d4xj91h48pf8md5zh0000gp/T/test123/file", + expected: "error reading [REDACTED_PATH]", + }, + { + name: "etc path", + msg: "config at /etc/databricks/config.json not found", + expected: "config at [REDACTED_PATH] not found", + }, + { + name: "macOS home path", msg: "failed to read /Users/otheruser/some-project/file.yml", expected: "failed to read [REDACTED_PATH]", }, { - name: "Linux home dir", + name: "Linux home path", msg: "failed to read /home/runner/work/project/file.yml", expected: "failed to read [REDACTED_PATH]", }, { - name: "home dir in middle of message", + name: "absolute path in middle of message", msg: "error: /Users/jane/project/a.yml: not found, try again", expected: "error: [REDACTED_PATH]: not found, try again", }, { - name: "Windows home dir with backslashes", + name: 
"Windows path with backslashes", msg: `error at C:\Users\shreyas\project\file.yml`, expected: "error at [REDACTED_PATH]", }, { - name: "Windows home dir with forward slashes", + name: "Windows path with forward slashes", msg: "error at C:/Users/shreyas/project/file.yml", expected: "error at [REDACTED_PATH]", }, + { + name: "volume path is redacted", + msg: "artifact at /Volumes/catalog/schema/volume/artifact.whl", + expected: "artifact at [REDACTED_PATH]", + }, + { + name: "dbfs path is redacted", + msg: "state at /dbfs/mnt/data/state.json", + expected: "state at [REDACTED_PATH]", + }, + { + name: "single component path is not matched", + msg: "POST /telemetry-ext failed", + expected: "POST /telemetry-ext failed", + }, } for _, tt := range tests { @@ -132,46 +164,26 @@ func TestScrubForTelemetry_HomeDirRegexFallback(t *testing.T) { } } -func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { +func TestScrubForTelemetry_WorkspacePaths(t *testing.T) { tests := []struct { name string msg string expected string }{ { - name: "tmp path", - msg: "failed to write /tmp/bundle-xyz/state.json", - expected: "failed to write [REDACTED_PATH]", - }, - { - name: "var folders path", - msg: "error reading /var/folders/7t/n_tz6x9d4xj91h48pf8md5zh0000gp/T/test123/file", - expected: "error reading [REDACTED_PATH]", - }, - { - name: "etc path", - msg: "config at /etc/databricks/config.json not found", - expected: "config at [REDACTED_PATH] not found", - }, - { - name: "workspace path is redacted", + name: "workspace user path", msg: "uploading to /Workspace/Users/dev/.bundle/files", - expected: "uploading to [REDACTED_PATH]", - }, - { - name: "volume path is redacted", - msg: "artifact at /Volumes/catalog/schema/volume/artifact.whl", - expected: "artifact at [REDACTED_PATH]", + expected: "uploading to [REDACTED_WORKSPACE_PATH]", }, { - name: "dbfs path is redacted", - msg: "state at /dbfs/mnt/data/state.json", - expected: "state at [REDACTED_PATH]", + name: "workspace path with email", 
+ msg: "error at /Workspace/Users/user@example.com/.bundle/dev", + expected: "error at [REDACTED_WORKSPACE_PATH]", }, { - name: "single component path is not matched", - msg: "POST /telemetry-ext failed", - expected: "POST /telemetry-ext failed", + name: "workspace shared path", + msg: "cannot access /Workspace/Shared/project/notebook", + expected: "cannot access [REDACTED_WORKSPACE_PATH]", }, } @@ -263,7 +275,7 @@ func TestScrubForTelemetry_Combined(t *testing.T) { assert.Equal(t, "failed to load [REDACTED_REL_PATH]: "+ - "workspace [REDACTED_PATH] is invalid, "+ + "workspace [REDACTED_WORKSPACE_PATH] is invalid, "+ "also tried [REDACTED_PATH], "+ "temp at [REDACTED_PATH], "+ "see [REDACTED_REL_PATH]", From 8cc26b39d27be32fdb1fd2d78dbb974996bbe87f Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 19:04:49 +0100 Subject: [PATCH 07/15] Fix Windows drive letter case sensitivity and add Windows bundle root test Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 2 +- bundle/phases/telemetry_scrub_test.go | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 56e98f89d3..b3f081dd38 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -15,7 +15,7 @@ import ( var ( // Matches Windows absolute paths with at least two components // (e.g., C:\foo\bar, D:/projects/myapp). - windowsAbsPathRegexp = regexp.MustCompile(`[A-Z]:[/\\][^\s:,"'/\\]+[/\\][^\s:,"']+`) + windowsAbsPathRegexp = regexp.MustCompile(`[A-Za-z]:[/\\][^\s:,"'/\\]+[/\\][^\s:,"']+`) // Matches Databricks workspace paths (/Workspace/...). 
workspacePathRegexp = regexp.MustCompile(`(^|[\s:,"'])(/Workspace/[^\s:,"']+)`) diff --git a/bundle/phases/telemetry_scrub_test.go b/bundle/phases/telemetry_scrub_test.go index 5aa3475555..6d72fe2e91 100644 --- a/bundle/phases/telemetry_scrub_test.go +++ b/bundle/phases/telemetry_scrub_test.go @@ -35,6 +35,15 @@ func TestScrubForTelemetry_BundleRootPath(t *testing.T) { bundleRoot: "/home/user/project", expected: "path [REDACTED_REL_PATH] and [REDACTED_REL_PATH]", }, + { + // On Windows, replacePath strips the bundle root with native + // separators. On other platforms, the Windows regex catches + // the entire path. Both produce a redacted result. + name: "Windows bundle root with backslashes", + msg: `error at C:\Users\shreyas\project\databricks.yml`, + bundleRoot: `C:\Users\shreyas\project`, + expected: "error at [REDACTED_PATH]", + }, { name: "empty bundle root is no-op", msg: "some error", @@ -140,6 +149,11 @@ func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { msg: "error at C:/Users/shreyas/project/file.yml", expected: "error at [REDACTED_PATH]", }, + { + name: "Windows path with lowercase drive letter", + msg: `error at c:\Users\shreyas\project\file.yml`, + expected: "error at [REDACTED_PATH]", + }, { name: "volume path is redacted", msg: "artifact at /Volumes/catalog/schema/volume/artifact.whl", From 34b5228683993fcc549b0a8b67528a123e69cb5a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 19:12:58 +0100 Subject: [PATCH 08/15] Remove replacePath; let regex patterns handle all path scrubbing The replacePath function converted bundle root to "." and home dir to "~" before regex scrubbing. Since absolute and relative paths are handled by separate regex patterns anyway, this added complexity for no benefit. Absolute paths now consistently get [REDACTED_PATH]. 
Co-authored-by: Isaac --- bundle/phases/telemetry.go | 4 +- bundle/phases/telemetry_scrub.go | 30 +----- bundle/phases/telemetry_scrub_test.go | 132 ++++---------------------- 3 files changed, 24 insertions(+), 142 deletions(-) diff --git a/bundle/phases/telemetry.go b/bundle/phases/telemetry.go index 2d41a48b65..5478ddb2a1 100644 --- a/bundle/phases/telemetry.go +++ b/bundle/phases/telemetry.go @@ -9,7 +9,6 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/libs/dyn" - "github.com/databricks/cli/libs/env" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/telemetry" "github.com/databricks/cli/libs/telemetry/protos" @@ -39,8 +38,7 @@ const maxErrorMessageLength = 500 // LogDeployTelemetry logs a telemetry event for a bundle deploy command. func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) { - homeDir, _ := env.UserHomeDir(ctx) - errMsg = scrubForTelemetry(errMsg, b.BundleRootPath, homeDir) + errMsg = scrubForTelemetry(errMsg) if len(errMsg) > maxErrorMessageLength { errMsg = errMsg[:maxErrorMessageLength] diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index b3f081dd38..06a2ce4cc0 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -1,10 +1,6 @@ package phases -import ( - "path/filepath" - "regexp" - "strings" -) +import "regexp" // Scrub sensitive information from error messages before sending to telemetry. // Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath pattern. @@ -44,13 +40,7 @@ var ( // PII information from error messages before they are sent to telemetry. // The error message is treated as PII by the logging infrastructure but we // scrub to avoid collecting more information than necessary. -func scrubForTelemetry(msg, bundleRoot, homeDir string) string { - // Direct string replacement for the two most important known paths. 
- // This handles the common case (path followed by separator) and the - // regex catch-all below handles any remaining occurrences. - msg = replacePath(msg, bundleRoot, ".") - msg = replacePath(msg, homeDir, "~") - +func scrubForTelemetry(msg string) string { // Redact absolute paths. msg = windowsAbsPathRegexp.ReplaceAllString(msg, "[REDACTED_PATH]") msg = workspacePathRegexp.ReplaceAllString(msg, "${1}[REDACTED_WORKSPACE_PATH]") @@ -65,19 +55,3 @@ func scrubForTelemetry(msg, bundleRoot, homeDir string) string { return msg } - -// replacePath replaces occurrences of path/ with replacement/ in msg. -// Only replaces when the path is followed by a separator to avoid partial -// matches (e.g., "/Users/shreyas" inside "/Workspace/Users/shreyas@..."). -// Any remaining occurrences are handled by the regex catch-all patterns. -func replacePath(msg, path, replacement string) string { - if path == "" { - return msg - } - normalized := filepath.ToSlash(path) - msg = strings.ReplaceAll(msg, normalized+"/", replacement+"/") - if path != normalized { - msg = strings.ReplaceAll(msg, path+string(filepath.Separator), replacement+string(filepath.Separator)) - } - return msg -} diff --git a/bundle/phases/telemetry_scrub_test.go b/bundle/phases/telemetry_scrub_test.go index 6d72fe2e91..7f4178f2a8 100644 --- a/bundle/phases/telemetry_scrub_test.go +++ b/bundle/phases/telemetry_scrub_test.go @@ -6,103 +6,6 @@ import ( "github.com/stretchr/testify/assert" ) -func TestScrubForTelemetry_BundleRootPath(t *testing.T) { - tests := []struct { - name string - msg string - bundleRoot string - expected string - }{ - { - // Bundle root is replaced with "." and then the resulting - // ./databricks.yml is caught by the relative path scrubber. 
- name: "replaces bundle root in file path", - msg: "failed to load /home/user/project/databricks.yml: invalid config", - bundleRoot: "/home/user/project", - expected: "failed to load [REDACTED_REL_PATH]: invalid config", - }, - { - // Bundle root without trailing separator is caught by the - // absolute path regex. - name: "replaces bundle root without trailing content", - msg: "error at /home/user/project", - bundleRoot: "/home/user/project", - expected: "error at [REDACTED_PATH]", - }, - { - name: "replaces multiple occurrences", - msg: "path /home/user/project/a.yml and /home/user/project/b.yml", - bundleRoot: "/home/user/project", - expected: "path [REDACTED_REL_PATH] and [REDACTED_REL_PATH]", - }, - { - // On Windows, replacePath strips the bundle root with native - // separators. On other platforms, the Windows regex catches - // the entire path. Both produce a redacted result. - name: "Windows bundle root with backslashes", - msg: `error at C:\Users\shreyas\project\databricks.yml`, - bundleRoot: `C:\Users\shreyas\project`, - expected: "error at [REDACTED_PATH]", - }, - { - name: "empty bundle root is no-op", - msg: "some error", - bundleRoot: "", - expected: "some error", - }, - { - name: "empty message", - msg: "", - bundleRoot: "/home/user/project", - expected: "", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, tt.bundleRoot, "")) - }) - } -} - -func TestScrubForTelemetry_HomeDir(t *testing.T) { - tests := []struct { - name string - msg string - bundleRoot string - homeDir string - expected string - }{ - { - // Home dir is replaced with ~ and then ~/.databricks/config.json - // is caught by the relative path scrubber. 
- name: "replaces home dir and scrubs resulting path", - msg: "failed to read /Users/shreyas/.databricks/config.json", - homeDir: "/Users/shreyas", - expected: "failed to read [REDACTED_REL_PATH]", - }, - { - name: "home dir replacement for multiple paths", - msg: "error: /Users/shreyas/project/file.yml and /Users/shreyas/.cache/other", - homeDir: "/Users/shreyas", - expected: "error: [REDACTED_REL_PATH] and [REDACTED_REL_PATH]", - }, - { - name: "bundle root takes priority over home dir", - msg: "error at /Users/shreyas/project/databricks.yml", - bundleRoot: "/Users/shreyas/project", - homeDir: "/Users/shreyas", - expected: "error at [REDACTED_REL_PATH]", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, tt.bundleRoot, tt.homeDir)) - }) - } -} - func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { tests := []struct { name string @@ -135,10 +38,15 @@ func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { expected: "failed to read [REDACTED_PATH]", }, { - name: "absolute path in middle of message", + name: "absolute path after colon delimiter", msg: "error: /Users/jane/project/a.yml: not found, try again", expected: "error: [REDACTED_PATH]: not found, try again", }, + { + name: "multiple absolute paths", + msg: "path /home/user/project/a.yml and /home/user/project/b.yml", + expected: "path [REDACTED_PATH] and [REDACTED_PATH]", + }, { name: "Windows path with backslashes", msg: `error at C:\Users\shreyas\project\file.yml`, @@ -169,11 +77,16 @@ func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { msg: "POST /telemetry-ext failed", expected: "POST /telemetry-ext failed", }, + { + name: "empty message", + msg: "", + expected: "", + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg)) }) } } @@ -203,7 +116,7 @@ func 
TestScrubForTelemetry_WorkspacePaths(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg)) }) } } @@ -248,7 +161,7 @@ func TestScrubForTelemetry_RelativePaths(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg)) }) } } @@ -273,7 +186,7 @@ func TestScrubForTelemetry_Emails(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", "")) + assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg)) }) } } @@ -285,14 +198,11 @@ func TestScrubForTelemetry_Combined(t *testing.T) { "temp at /tmp/bundle-cache/state.json, " + "see .databricks/bundle/dev/variable-overrides.json" - got := scrubForTelemetry(msg, "/Users/shreyas/myproject", "/Users/shreyas") + expected := "failed to load [REDACTED_PATH]: " + + "workspace [REDACTED_WORKSPACE_PATH] is invalid, " + + "also tried [REDACTED_PATH], " + + "temp at [REDACTED_PATH], " + + "see [REDACTED_REL_PATH]" - assert.Equal(t, - "failed to load [REDACTED_REL_PATH]: "+ - "workspace [REDACTED_WORKSPACE_PATH] is invalid, "+ - "also tried [REDACTED_PATH], "+ - "temp at [REDACTED_PATH], "+ - "see [REDACTED_REL_PATH]", - got, - ) + assert.Equal(t, expected, scrubForTelemetry(msg)) } From 0242d309cb43ee8406ecf154a6e3f9f43646ac30 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 19:17:25 +0100 Subject: [PATCH 09/15] Move tilde home path (~/) to absolute path regex ~/... expands to an absolute path, so it should match [REDACTED_PATH] rather than [REDACTED_REL_PATH]. 
Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 7 +++---- bundle/phases/telemetry_scrub_test.go | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 06a2ce4cc0..1ba4ae67b8 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -17,14 +17,13 @@ var ( workspacePathRegexp = regexp.MustCompile(`(^|[\s:,"'])(/Workspace/[^\s:,"']+)`) // Matches absolute Unix paths with at least two components - // (e.g., /home/user/..., /tmp/foo). - absPathRegexp = regexp.MustCompile(`(^|[\s:,"'])(/[^\s:,"'/]+/[^\s:,"']+)`) + // (e.g., /home/user/..., /tmp/foo, ~/.config/databricks). + absPathRegexp = regexp.MustCompile(`(^|[\s:,"'])(~?/[^\s:,"'/]+/[^\s:,"']+)`) // Matches relative paths: // - Explicit: ./foo, ../foo // - Dot-prefixed directories: .databricks/bundle/..., .cache/foo - // - Home shorthand: ~/.databricks/... - explicitRelPathRegexp = regexp.MustCompile(`(^|[\s:,"'])((?:~|\.\.?|\.[a-zA-Z][^\s:,"'/]*)/[^\s:,"']+)`) + explicitRelPathRegexp = regexp.MustCompile(`(^|[\s:,"'])((?:\.\.?|\.[a-zA-Z][^\s:,"'/]*)/[^\s:,"']+)`) // Matches implicit relative paths: at least two path components where // the last component has a file extension (e.g., "resources/job.yml", diff --git a/bundle/phases/telemetry_scrub_test.go b/bundle/phases/telemetry_scrub_test.go index 7f4178f2a8..01ce182938 100644 --- a/bundle/phases/telemetry_scrub_test.go +++ b/bundle/phases/telemetry_scrub_test.go @@ -72,6 +72,11 @@ func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { msg: "state at /dbfs/mnt/data/state.json", expected: "state at [REDACTED_PATH]", }, + { + name: "tilde home path", + msg: "reading ~/.databricks/config.json failed", + expected: "reading [REDACTED_PATH] failed", + }, { name: "single component path is not matched", msg: "POST /telemetry-ext failed", @@ -147,11 +152,6 @@ func TestScrubForTelemetry_RelativePaths(t *testing.T) { msg: "error reading 
.databricks/bundle/dev/variable-overrides.json", expected: "error reading [REDACTED_REL_PATH]", }, - { - name: "tilde home path", - msg: "reading ~/.databricks/config.json failed", - expected: "reading [REDACTED_REL_PATH] failed", - }, { name: "single filename without path separator is preserved", msg: "failed to load databricks.yml", From 947e545480deda63e03f5b6c898cc44b6bd2af1b Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 19:19:07 +0100 Subject: [PATCH 10/15] Retain known file extensions in redacted paths Redacted paths now include the file extension for a known set of types, e.g. [REDACTED_PATH](yml), [REDACTED_REL_PATH](json). This helps with debugging without leaking sensitive information. Co-authored-by: Isaac --- .../telemetry/deploy-error-message/output.txt | 2 +- bundle/phases/telemetry_scrub.go | 56 ++++++++++++++-- bundle/phases/telemetry_scrub_test.go | 67 ++++++++++--------- 3 files changed, 87 insertions(+), 38 deletions(-) diff --git a/acceptance/bundle/telemetry/deploy-error-message/output.txt b/acceptance/bundle/telemetry/deploy-error-message/output.txt index e6d3c98057..bb7f609143 100644 --- a/acceptance/bundle/telemetry/deploy-error-message/output.txt +++ b/acceptance/bundle/telemetry/deploy-error-message/output.txt @@ -6,4 +6,4 @@ Error: no value assigned to required variable myvar. Variables are usually assig Exit code: 1 >>> cat out.requests.txt -no value assigned to required variable myvar. Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or [REDACTED_REL_PATH] +no value assigned to required variable myvar. 
Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or [REDACTED_REL_PATH](json) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 1ba4ae67b8..22a9ee72c2 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -1,6 +1,10 @@ package phases -import "regexp" +import ( + "path" + "regexp" + "strings" +) // Scrub sensitive information from error messages before sending to telemetry. // Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath pattern. @@ -35,22 +39,62 @@ var ( emailRegexp = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`) ) +// Known file extensions that are safe to retain in redacted paths. +// These help with debugging without leaking sensitive information. +var knownExtensions = map[string]bool{ + ".py": true, + ".yml": true, + ".yaml": true, + ".json": true, + ".toml": true, + ".tf": true, + ".sql": true, + ".txt": true, + ".whl": true, + ".jar": true, + ".cfg": true, + ".ipynb": true, +} + // scrubForTelemetry is a best-effort scrubber that removes sensitive path and // PII information from error messages before they are sent to telemetry. // The error message is treated as PII by the logging infrastructure but we // scrub to avoid collecting more information than necessary. func scrubForTelemetry(msg string) string { // Redact absolute paths. - msg = windowsAbsPathRegexp.ReplaceAllString(msg, "[REDACTED_PATH]") - msg = workspacePathRegexp.ReplaceAllString(msg, "${1}[REDACTED_WORKSPACE_PATH]") - msg = absPathRegexp.ReplaceAllString(msg, "${1}[REDACTED_PATH]") + msg = replacePathRegexp(msg, windowsAbsPathRegexp, "[REDACTED_PATH]", false) + msg = replacePathRegexp(msg, workspacePathRegexp, "[REDACTED_WORKSPACE_PATH]", true) + msg = replacePathRegexp(msg, absPathRegexp, "[REDACTED_PATH]", true) // Redact relative paths. 
- msg = explicitRelPathRegexp.ReplaceAllString(msg, "${1}[REDACTED_REL_PATH]") - msg = implicitRelPathRegexp.ReplaceAllString(msg, "${1}[REDACTED_REL_PATH]") + msg = replacePathRegexp(msg, explicitRelPathRegexp, "[REDACTED_REL_PATH]", true) + msg = replacePathRegexp(msg, implicitRelPathRegexp, "[REDACTED_REL_PATH]", true) // Redact email addresses. msg = emailRegexp.ReplaceAllString(msg, "[REDACTED_EMAIL]") return msg } + +// replacePathRegexp replaces path matches with the given label, retaining +// known file extensions. When hasDelimiterGroup is true, the first character +// of the match is preserved as a delimiter prefix. +func replacePathRegexp(msg string, re *regexp.Regexp, label string, hasDelimiterGroup bool) string { + return re.ReplaceAllStringFunc(msg, func(match string) string { + prefix := "" + p := match + if hasDelimiterGroup && len(match) > 0 { + first := match[0] + if strings.ContainsRune(" \t\n:,\"'", rune(first)) { + prefix = match[:1] + p = match[1:] + } + } + + ext := path.Ext(p) + if knownExtensions[ext] { + return prefix + label + "(" + ext[1:] + ")" + } + return prefix + label + }) +} diff --git a/bundle/phases/telemetry_scrub_test.go b/bundle/phases/telemetry_scrub_test.go index 01ce182938..b8bd2b73cc 100644 --- a/bundle/phases/telemetry_scrub_test.go +++ b/bundle/phases/telemetry_scrub_test.go @@ -13,69 +13,74 @@ func TestScrubForTelemetry_AbsolutePaths(t *testing.T) { expected string }{ { - name: "tmp path", + name: "tmp path with known extension", msg: "failed to write /tmp/bundle-xyz/state.json", - expected: "failed to write [REDACTED_PATH]", + expected: "failed to write [REDACTED_PATH](json)", }, { - name: "var folders path", + name: "var folders path without extension", msg: "error reading /var/folders/7t/n_tz6x9d4xj91h48pf8md5zh0000gp/T/test123/file", expected: "error reading [REDACTED_PATH]", }, { - name: "etc path", + name: "etc path with known extension", msg: "config at /etc/databricks/config.json not found", - expected: "config 
at [REDACTED_PATH] not found", + expected: "config at [REDACTED_PATH](json) not found", }, { - name: "macOS home path", + name: "macOS home path with known extension", msg: "failed to read /Users/otheruser/some-project/file.yml", - expected: "failed to read [REDACTED_PATH]", + expected: "failed to read [REDACTED_PATH](yml)", }, { - name: "Linux home path", + name: "Linux home path with known extension", msg: "failed to read /home/runner/work/project/file.yml", - expected: "failed to read [REDACTED_PATH]", + expected: "failed to read [REDACTED_PATH](yml)", }, { name: "absolute path after colon delimiter", msg: "error: /Users/jane/project/a.yml: not found, try again", - expected: "error: [REDACTED_PATH]: not found, try again", + expected: "error: [REDACTED_PATH](yml): not found, try again", }, { name: "multiple absolute paths", msg: "path /home/user/project/a.yml and /home/user/project/b.yml", - expected: "path [REDACTED_PATH] and [REDACTED_PATH]", + expected: "path [REDACTED_PATH](yml) and [REDACTED_PATH](yml)", }, { name: "Windows path with backslashes", msg: `error at C:\Users\shreyas\project\file.yml`, - expected: "error at [REDACTED_PATH]", + expected: "error at [REDACTED_PATH](yml)", }, { name: "Windows path with forward slashes", msg: "error at C:/Users/shreyas/project/file.yml", - expected: "error at [REDACTED_PATH]", + expected: "error at [REDACTED_PATH](yml)", }, { name: "Windows path with lowercase drive letter", msg: `error at c:\Users\shreyas\project\file.yml`, - expected: "error at [REDACTED_PATH]", + expected: "error at [REDACTED_PATH](yml)", }, { - name: "volume path is redacted", + name: "volume path with known extension", msg: "artifact at /Volumes/catalog/schema/volume/artifact.whl", - expected: "artifact at [REDACTED_PATH]", + expected: "artifact at [REDACTED_PATH](whl)", }, { - name: "dbfs path is redacted", + name: "dbfs path with known extension", msg: "state at /dbfs/mnt/data/state.json", - expected: "state at [REDACTED_PATH]", + expected: 
"state at [REDACTED_PATH](json)", + }, + { + name: "path with unknown extension", + msg: "error at /home/user/project/file.xyz", + expected: "error at [REDACTED_PATH]", }, { - name: "tilde home path", + name: "tilde home path with known extension", msg: "reading ~/.databricks/config.json failed", - expected: "reading [REDACTED_PATH] failed", + expected: "reading [REDACTED_PATH](json) failed", }, { name: "single component path is not matched", @@ -103,7 +108,7 @@ func TestScrubForTelemetry_WorkspacePaths(t *testing.T) { expected string }{ { - name: "workspace user path", + name: "workspace user path without extension", msg: "uploading to /Workspace/Users/dev/.bundle/files", expected: "uploading to [REDACTED_WORKSPACE_PATH]", }, @@ -113,7 +118,7 @@ func TestScrubForTelemetry_WorkspacePaths(t *testing.T) { expected: "error at [REDACTED_WORKSPACE_PATH]", }, { - name: "workspace shared path", + name: "workspace shared path without extension", msg: "cannot access /Workspace/Shared/project/notebook", expected: "cannot access [REDACTED_WORKSPACE_PATH]", }, @@ -135,22 +140,22 @@ func TestScrubForTelemetry_RelativePaths(t *testing.T) { { name: "explicit relative path with ./", msg: "failed to load ./resources/job.yml", - expected: "failed to load [REDACTED_REL_PATH]", + expected: "failed to load [REDACTED_REL_PATH](yml)", }, { name: "explicit relative path with ../", msg: "path ../parent/file.yml not allowed", - expected: "path [REDACTED_REL_PATH] not allowed", + expected: "path [REDACTED_REL_PATH](yml) not allowed", }, { name: "implicit relative path with extension", msg: "failed to read resources/pipeline.yml: not found", - expected: "failed to read [REDACTED_REL_PATH]: not found", + expected: "failed to read [REDACTED_REL_PATH](yml): not found", }, { - name: "dot-prefixed directory path", + name: "dot-prefixed directory path with extension", msg: "error reading .databricks/bundle/dev/variable-overrides.json", - expected: "error reading [REDACTED_REL_PATH]", + expected: 
"error reading [REDACTED_REL_PATH](json)", }, { name: "single filename without path separator is preserved", @@ -198,11 +203,11 @@ func TestScrubForTelemetry_Combined(t *testing.T) { "temp at /tmp/bundle-cache/state.json, " + "see .databricks/bundle/dev/variable-overrides.json" - expected := "failed to load [REDACTED_PATH]: " + + expected := "failed to load [REDACTED_PATH](yml): " + "workspace [REDACTED_WORKSPACE_PATH] is invalid, " + - "also tried [REDACTED_PATH], " + - "temp at [REDACTED_PATH], " + - "see [REDACTED_REL_PATH]" + "also tried [REDACTED_PATH](yml), " + + "temp at [REDACTED_PATH](json), " + + "see [REDACTED_REL_PATH](json)" assert.Equal(t, expected, scrubForTelemetry(msg)) } From e4197a81513ab2040e65a77b833fbd575ecfd93a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 19:21:39 +0100 Subject: [PATCH 11/15] Add more known file extensions to telemetry scrubber Add .r, .scala, .sh, .hcl, .ini, .zip, .tar, .csv for better coverage of file types common in the Databricks CLI ecosystem. Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 35 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 22a9ee72c2..60295ea3eb 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -42,18 +42,35 @@ var ( // Known file extensions that are safe to retain in redacted paths. // These help with debugging without leaking sensitive information. 
var knownExtensions = map[string]bool{ + // Configuration and data formats + ".yml": true, + ".yaml": true, + ".json": true, + ".toml": true, + ".cfg": true, + ".ini": true, + + // Notebook and script languages ".py": true, - ".yml": true, - ".yaml": true, - ".json": true, - ".toml": true, - ".tf": true, + ".r": true, + ".scala": true, ".sql": true, - ".txt": true, - ".whl": true, - ".jar": true, - ".cfg": true, ".ipynb": true, + ".sh": true, + + // Terraform + ".tf": true, + ".hcl": true, + + // Build artifacts and archives + ".whl": true, + ".jar": true, + ".zip": true, + ".tar": true, + + // Other + ".txt": true, + ".csv": true, } // scrubForTelemetry is a best-effort scrubber that removes sensitive path and From 8d680a8f6133c80de7219d8ccfb10887d2228eed Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 19:23:20 +0100 Subject: [PATCH 12/15] Add web/app and doc file extensions to telemetry scrubber Add .js, .ts, .jsx, .tsx, .html, .css (Databricks Apps / Vite), .env, and .md. 
Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 60295ea3eb..3cdd580909 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -49,6 +49,7 @@ var knownExtensions = map[string]bool{ ".toml": true, ".cfg": true, ".ini": true, + ".env": true, // Notebook and script languages ".py": true, @@ -58,9 +59,17 @@ var knownExtensions = map[string]bool{ ".ipynb": true, ".sh": true, + // Web / Apps + ".js": true, + ".ts": true, + ".jsx": true, + ".tsx": true, + ".html": true, + ".css": true, + // Terraform - ".tf": true, - ".hcl": true, + ".tf": true, + ".hcl": true, // Build artifacts and archives ".whl": true, @@ -71,6 +80,7 @@ var knownExtensions = map[string]bool{ // Other ".txt": true, ".csv": true, + ".md": true, } // scrubForTelemetry is a best-effort scrubber that removes sensitive path and From 968e9fbd9db686e1bd8f82b6810f5ec748649a16 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Thu, 26 Mar 2026 19:25:27 +0100 Subject: [PATCH 13/15] Expand known extensions to cover all plausible error message paths Add .xml, .properties, .conf, .tfstate, .tfvars, .egg, .gz, .tgz, .dbc, .parquet, .avro, .log, .lock, .pem, .crt. Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 47 ++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 3cdd580909..57f2067ab2 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -43,13 +43,16 @@ var ( // These help with debugging without leaking sensitive information. 
var knownExtensions = map[string]bool{ // Configuration and data formats - ".yml": true, - ".yaml": true, - ".json": true, - ".toml": true, - ".cfg": true, - ".ini": true, - ".env": true, + ".yml": true, + ".yaml": true, + ".json": true, + ".toml": true, + ".cfg": true, + ".ini": true, + ".env": true, + ".xml": true, + ".properties": true, + ".conf": true, // Notebook and script languages ".py": true, @@ -68,19 +71,35 @@ var knownExtensions = map[string]bool{ ".css": true, // Terraform - ".tf": true, - ".hcl": true, + ".tf": true, + ".hcl": true, + ".tfstate": true, + ".tfvars": true, // Build artifacts and archives ".whl": true, ".jar": true, + ".egg": true, ".zip": true, ".tar": true, - - // Other - ".txt": true, - ".csv": true, - ".md": true, + ".gz": true, + ".tgz": true, + ".dbc": true, + + // Data formats + ".txt": true, + ".csv": true, + ".md": true, + ".parquet": true, + ".avro": true, + + // Logs and locks + ".log": true, + ".lock": true, + + // Certificates and keys + ".pem": true, + ".crt": true, } // scrubForTelemetry is a best-effort scrubber that removes sensitive path and From 4a0b8d9336f5001b315d148135011ff87551b171 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 27 Mar 2026 16:05:27 +0100 Subject: [PATCH 14/15] Document boundary character choice in path regexes Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 57f2067ab2..3153d75855 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -9,6 +9,13 @@ import ( // Scrub sensitive information from error messages before sending to telemetry. // Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath pattern. // +// Path regexes use [\s:,"'] as boundary characters to delimit where a path +// ends. While these characters are technically valid in file paths, in error +// messages they act as delimiters (e.g. 
"error: /path/to/file: not found", +// or "failed to read '/some/path', skipping"). This is a practical tradeoff: +// paths containing colons, commas, or quotes are extremely rare, and without +// these boundaries the regexes would over-match into surrounding message text. +// // References: // - VS Code: https://github.com/microsoft/vscode/blob/main/src/vs/platform/telemetry/common/telemetryUtils.ts // - Sentry: https://github.com/getsentry/relay (PII rule: @userpath) From 4919773fc12e91d5a2380216a108c2862afbd547 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 27 Mar 2026 16:06:07 +0100 Subject: [PATCH 15/15] Update comment wording for known extensions Co-authored-by: Isaac --- bundle/phases/telemetry_scrub.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bundle/phases/telemetry_scrub.go b/bundle/phases/telemetry_scrub.go index 3153d75855..a25bfbc131 100644 --- a/bundle/phases/telemetry_scrub.go +++ b/bundle/phases/telemetry_scrub.go @@ -47,7 +47,7 @@ var ( ) // Known file extensions that are safe to retain in redacted paths. -// These help with debugging without leaking sensitive information. +// These help understand usage patterns without capturing sensitive information. var knownExtensions = map[string]bool{ // Configuration and data formats ".yml": true,