Skip to content

Commit ca229c4

Browse files
Scrub sensitive paths and emails from deploy telemetry error messages
Adds a scrubber that runs before error messages are sent to telemetry. It: (1) replaces the bundle root path with "." to avoid leaking local paths; (2) redacts remaining home directory paths (/Users/..., /home/..., C:\Users\...); and (3) redacts email addresses (e.g., in workspace paths). Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath rule. Co-authored-by: Isaac
1 parent a2fb5dd commit ca229c4

File tree

4 files changed

+412
-1
lines changed

4 files changed

+412
-1
lines changed

acceptance/bundle/telemetry/deploy-error-message/output.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ Error: no value assigned to required variable myvar. Variables are usually assig
66
Exit code: 1
77

88
>>> cat out.requests.txt
9-
no value assigned to required variable myvar. Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or .databricks/bundle/<target>/variable-overrides.json
9+
no value assigned to required variable myvar. Variables are usually assigned in databricks.yml, and they can be overridden using "--var", the BUNDLE_VAR_myvar environment variable, or [REDACTED_REL_PATH]

bundle/phases/telemetry.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/databricks/cli/bundle/config"
1010
"github.com/databricks/cli/bundle/libraries"
1111
"github.com/databricks/cli/libs/dyn"
12+
"github.com/databricks/cli/libs/env"
1213
"github.com/databricks/cli/libs/log"
1314
"github.com/databricks/cli/libs/telemetry"
1415
"github.com/databricks/cli/libs/telemetry/protos"
@@ -38,6 +39,9 @@ const maxErrorMessageLength = 500
3839

3940
// LogDeployTelemetry logs a telemetry event for a bundle deploy command.
4041
func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) {
42+
homeDir, _ := env.UserHomeDir(ctx)
43+
errMsg = scrubForTelemetry(errMsg, b.BundleRootPath, homeDir)
44+
4145
if len(errMsg) > maxErrorMessageLength {
4246
errMsg = errMsg[:maxErrorMessageLength]
4347
}

bundle/phases/telemetry_scrub.go

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
package phases
2+
3+
import (
4+
"path/filepath"
5+
"regexp"
6+
"strings"
7+
)
8+
9+
// Patterns used to scrub sensitive information from error messages before
// they are sent to telemetry. Inspired by VS Code's telemetry path scrubbing
// and Sentry's @userpath pattern.
//
// References:
//   - VS Code: https://github.com/microsoft/vscode/blob/main/src/vs/platform/telemetry/common/telemetryUtils.ts
//   - Sentry: https://github.com/getsentry/relay (PII rule: @userpath)
//
// Patterns that begin with `(?:^|[\s:,"'])` consume the delimiter in front of
// the path as part of the match; the path itself is capture group 1.
var (
	// unixHomeDirRegexp matches home directory paths on macOS and Linux.
	// Requiring start-of-string or a delimiter in front of the path avoids
	// matching workspace paths like /Workspace/Users/... where /Users is not
	// a top-level component.
	unixHomeDirRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/(?:Users|home)/[^\s:,"']+)`)

	// windowsHomeDirRegexp matches home directory paths on Windows with
	// either backslashes or forward slashes (C:\Users\xxx\... or
	// C:/Users/xxx/...). The drive letter is accepted in either case because
	// Windows treats it case-insensitively and some tools print it in
	// lowercase. The leading \b keeps the last letter of a longer word
	// (e.g. the "e" of "file:/Users/...") from being read as a drive letter.
	windowsHomeDirRegexp = regexp.MustCompile(`\b[A-Za-z]:[/\\]Users[/\\][^\s:,"']+`)

	// absPathRegexp matches absolute Unix paths with at least two components
	// (e.g., /tmp/foo, /Workspace/Users/..., /Volumes/catalog/schema/...).
	absPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/[^\s:,"'/]+/[^\s:,"']+)`)

	// explicitRelPathRegexp matches relative paths:
	//   - Explicit: ./foo, ../foo
	//   - Dot-prefixed directories: .databricks/bundle/..., .cache/foo
	//   - Home shorthand: ~/.databricks/...
	explicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])((?:~|\.\.?|\.[a-zA-Z][^\s:,"'/]*)/[^\s:,"']+)`)

	// implicitRelPathRegexp matches implicit relative paths: at least two
	// path components where the last component has a file extension
	// (e.g., "resources/job.yml", "bundle/dev/state.json").
	implicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])([a-zA-Z0-9_][^\s:,"']*/[^\s:,"']*\.[a-zA-Z][^\s:,"']*)`)

	// emailRegexp matches email addresses. Workspace paths in Databricks
	// often contain emails (e.g., /Workspace/Users/user@example.com/.bundle/dev).
	emailRegexp = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)
)
44+
45+
// scrubForTelemetry is a best-effort scrubber that removes sensitive path and
46+
// PII information from error messages before they are sent to telemetry.
47+
// The error message is treated as PII by the logging infrastructure but we
48+
// scrub to avoid collecting more information than necessary.
49+
func scrubForTelemetry(msg, bundleRoot, homeDir string) string {
50+
// Replace the bundle root path first since it's the most specific match.
51+
// This turns "/Users/shreyas/project/databricks.yml" into "./databricks.yml".
52+
if bundleRoot != "" {
53+
msg = replacePath(msg, bundleRoot, ".")
54+
}
55+
56+
// Replace the user's home directory. This catches paths outside the
57+
// bundle root like "/Users/shreyas/.databricks/..." → "~/.databricks/...".
58+
if homeDir != "" {
59+
msg = replacePath(msg, homeDir, "~")
60+
}
61+
62+
// Regex fallback: redact remaining home directory paths not covered by the
63+
// direct home dir replacement above (e.g., paths from other users or
64+
// non-standard home directory locations).
65+
// Run Windows first to avoid partial matches from the Unix regex on
66+
// paths like C:/Users/...
67+
msg = windowsHomeDirRegexp.ReplaceAllString(msg, "[REDACTED_PATH]")
68+
msg = replaceDelimitedMatch(msg, unixHomeDirRegexp, "[REDACTED_PATH]")
69+
70+
// Redact all remaining absolute paths.
71+
msg = replaceDelimitedMatch(msg, absPathRegexp, "[REDACTED_PATH]")
72+
73+
// Redact relative paths.
74+
msg = replaceDelimitedMatch(msg, explicitRelPathRegexp, "[REDACTED_REL_PATH]")
75+
msg = replaceDelimitedMatch(msg, implicitRelPathRegexp, "[REDACTED_REL_PATH]")
76+
77+
// Redact email addresses.
78+
msg = emailRegexp.ReplaceAllString(msg, "[REDACTED_EMAIL]")
79+
80+
return msg
81+
}
82+
83+
// replacePath replaces all occurrences of a directory path with the given
84+
// replacement. It only replaces when the path appears as a complete prefix,
85+
// i.e., followed by `/`, a delimiter, or end of string. This prevents partial
86+
// matches like "/Users/shreyas" matching inside "/Workspace/Users/shreyas@...".
87+
func replacePath(msg, path, replacement string) string {
88+
normalized := filepath.ToSlash(path)
89+
for _, p := range []string{normalized, path} {
90+
msg = strings.ReplaceAll(msg, p+"/", replacement+"/")
91+
92+
// Replace occurrences not followed by '/' only when the path is at
93+
// a word boundary (followed by delimiter or end of string).
94+
result := strings.Builder{}
95+
for {
96+
idx := strings.Index(msg, p)
97+
if idx == -1 {
98+
result.WriteString(msg)
99+
break
100+
}
101+
after := idx + len(p)
102+
// Check the character after the match. Only replace if it's
103+
// a delimiter or end of string.
104+
if after == len(msg) || strings.ContainsRune(" \t\n:,\"'", rune(msg[after])) {
105+
result.WriteString(msg[:idx])
106+
result.WriteString(replacement)
107+
msg = msg[after:]
108+
} else {
109+
result.WriteString(msg[:after])
110+
msg = msg[after:]
111+
}
112+
}
113+
msg = result.String()
114+
}
115+
return msg
116+
}
117+
118+
// delimiters lists every single-byte character that the leading group
// `(?:^|[\s:,"'])` can consume in front of a path. Go's \s is [\t\n\f\r ], so
// form feed and carriage return are listed alongside space, tab and newline.
const delimiters = " \t\n\f\r:,\"'"

// replaceDelimitedMatch replaces paths matched by a regex that uses a leading
// delimiter group `(?:^|[\s:,"'])`. Each match consists of an optional
// delimiter character followed by the path; the delimiter, when present, is
// kept and only the path is swapped for replacement.
func replaceDelimitedMatch(msg string, re *regexp.Regexp, replacement string) string {
	return re.ReplaceAllStringFunc(msg, func(match string) string {
		if match == "" {
			return match
		}
		// A match that begins with a delimiter came from the `[\s:,"']` arm:
		// keep that one byte and replace the rest.
		if strings.IndexByte(delimiters, match[0]) >= 0 {
			return match[:1] + replacement
		}
		// Otherwise the match started at ^ and is the path in its entirety.
		return replacement
	})
}

0 commit comments

Comments
 (0)