|
| 1 | +package phases |
| 2 | + |
| 3 | +import ( |
| 4 | + "path/filepath" |
| 5 | + "regexp" |
| 6 | + "strings" |
| 7 | +) |
| 8 | + |
// Patterns used to scrub sensitive information from error messages before
// they are sent to telemetry. Inspired by VS Code's telemetry path scrubbing
// and Sentry's @userpath pattern.
//
// Most patterns start with the non-capturing prefix `(?:^|[\s:,"'])` so that a
// path is only recognised at the start of the message or right after a
// delimiter; the path itself is capture group 1.
//
// References:
//   - VS Code: https://github.com/microsoft/vscode/blob/main/src/vs/platform/telemetry/common/telemetryUtils.ts
//   - Sentry: https://github.com/getsentry/relay (PII rule: @userpath)
var (
	// Matches home directory paths on macOS and Linux.
	// The leading delimiter check avoids matching workspace paths like
	// /Workspace/Users/... where /Users is not a top-level component.
	unixHomeDirRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/(?:Users|home)/[^\s:,"']+)`)

	// Matches home directory paths on Windows with either backslashes or
	// forward slashes (C:\Users\xxx\... or C:/Users/xxx/...).
	// Windows paths are case-insensitive, so the drive letter and the
	// "Users" component are accepted in any case (c:\users\xxx\...).
	windowsHomeDirRegexp = regexp.MustCompile(`[A-Za-z]:[/\\](?i:Users)[/\\][^\s:,"']+`)

	// Matches absolute Unix paths with at least two components
	// (e.g., /tmp/foo, /Workspace/Users/..., /Volumes/catalog/schema/...).
	absPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/[^\s:,"'/]+/[^\s:,"']+)`)

	// Matches relative paths:
	//   - Explicit: ./foo, ../foo
	//   - Dot-prefixed directories: .databricks/bundle/..., .cache/foo
	//   - Home shorthand: ~/.databricks/...
	explicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])((?:~|\.\.?|\.[a-zA-Z][^\s:,"'/]*)/[^\s:,"']+)`)

	// Matches implicit relative paths: at least two path components where
	// the last component has a file extension (e.g., "resources/job.yml",
	// "bundle/dev/state.json").
	implicitRelPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])([a-zA-Z0-9_][^\s:,"']*/[^\s:,"']*\.[a-zA-Z][^\s:,"']*)`)

	// Matches email addresses. Workspace paths in Databricks often contain
	// emails (e.g., /Workspace/Users/user@example.com/.bundle/dev).
	emailRegexp = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)
)
| 44 | + |
| 45 | +// scrubForTelemetry is a best-effort scrubber that removes sensitive path and |
| 46 | +// PII information from error messages before they are sent to telemetry. |
| 47 | +// The error message is treated as PII by the logging infrastructure but we |
| 48 | +// scrub to avoid collecting more information than necessary. |
| 49 | +func scrubForTelemetry(msg, bundleRoot, homeDir string) string { |
| 50 | + // Replace the bundle root path first since it's the most specific match. |
| 51 | + // This turns "/Users/shreyas/project/databricks.yml" into "./databricks.yml". |
| 52 | + if bundleRoot != "" { |
| 53 | + msg = replacePath(msg, bundleRoot, ".") |
| 54 | + } |
| 55 | + |
| 56 | + // Replace the user's home directory. This catches paths outside the |
| 57 | + // bundle root like "/Users/shreyas/.databricks/..." → "~/.databricks/...". |
| 58 | + if homeDir != "" { |
| 59 | + msg = replacePath(msg, homeDir, "~") |
| 60 | + } |
| 61 | + |
| 62 | + // Regex fallback: redact remaining home directory paths not covered by the |
| 63 | + // direct home dir replacement above (e.g., paths from other users or |
| 64 | + // non-standard home directory locations). |
| 65 | + // Run Windows first to avoid partial matches from the Unix regex on |
| 66 | + // paths like C:/Users/... |
| 67 | + msg = windowsHomeDirRegexp.ReplaceAllString(msg, "[REDACTED_PATH]") |
| 68 | + msg = replaceDelimitedMatch(msg, unixHomeDirRegexp, "[REDACTED_PATH]") |
| 69 | + |
| 70 | + // Redact all remaining absolute paths. |
| 71 | + msg = replaceDelimitedMatch(msg, absPathRegexp, "[REDACTED_PATH]") |
| 72 | + |
| 73 | + // Redact relative paths. |
| 74 | + msg = replaceDelimitedMatch(msg, explicitRelPathRegexp, "[REDACTED_REL_PATH]") |
| 75 | + msg = replaceDelimitedMatch(msg, implicitRelPathRegexp, "[REDACTED_REL_PATH]") |
| 76 | + |
| 77 | + // Redact email addresses. |
| 78 | + msg = emailRegexp.ReplaceAllString(msg, "[REDACTED_EMAIL]") |
| 79 | + |
| 80 | + return msg |
| 81 | +} |
| 82 | + |
// replacePath replaces every occurrence of the directory path in msg with
// replacement, but only where the occurrence is a complete path prefix:
//
//   - it must not be glued to a preceding path-name character, so
//     "/Users/shreyas" is left alone inside "/Workspace/Users/shreyas/...";
//   - it must be followed by a path separator ('/' or '\'), a delimiter
//     (whitespace, ':', ',', '"', '\''), or the end of the string, so
//     "/Users/shreyas" is left alone inside "/Users/shreyas@example.com".
//
// Both the slash-normalized form of path and, when different, the form as
// given are tried. When the occurrence is followed by a separator, that
// separator is emitted as '/' regardless of its original form, because the
// relative-path patterns in this file only recognise '/'.
//
// An empty path leaves msg unchanged.
func replacePath(msg, path, replacement string) string {
	// An empty needle matches at every offset and would never advance.
	if path == "" {
		return msg
	}

	normalized := filepath.ToSlash(path)
	msg = replacePathPrefix(msg, normalized, replacement)
	// ToSlash is a no-op when the OS separator is already '/'; skip the
	// second pass so that already rewritten output is not scanned again.
	if normalized != path {
		msg = replacePathPrefix(msg, path, replacement)
	}
	return msg
}

// replacePathPrefix performs a single left-to-right pass over msg, replacing
// the occurrences of p that satisfy the boundary rules documented on
// replacePath. p must be non-empty.
func replacePathPrefix(msg, p, replacement string) string {
	if !strings.Contains(msg, p) {
		return msg
	}

	var out strings.Builder
	out.Grow(len(msg))
	pos := 0
	for {
		rel := strings.Index(msg[pos:], p)
		if rel == -1 {
			out.WriteString(msg[pos:])
			return out.String()
		}
		start := pos + rel
		end := start + len(p)

		switch {
		case start > 0 && isPathNameByte(msg[start-1]):
			// Occurrence sits in the middle of a longer path component.
			out.WriteString(msg[pos:end])
			pos = end
		case end == len(msg) || strings.IndexByte(" \t\n\r\f:,\"'", msg[end]) >= 0:
			// Followed by a delimiter or end of string: the delimiter is kept.
			out.WriteString(msg[pos:start])
			out.WriteString(replacement)
			pos = end
		case msg[end] == '/' || msg[end] == '\\':
			// Followed by a separator: consume it and emit '/' instead.
			out.WriteString(msg[pos:start])
			out.WriteString(replacement)
			out.WriteByte('/')
			pos = end + 1
		default:
			// Followed by more name characters (e.g. "/Users/shreyas@...").
			out.WriteString(msg[pos:end])
			pos = end
		}
	}
}

// isPathNameByte reports whether c is an ASCII character that commonly
// appears inside a path component (letter, digit, '_', '-' or '.').
func isPathNameByte(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
		return true
	case c == '_', c == '-', c == '.':
		return true
	}
	return false
}
| 117 | + |
// delimiters lists every byte that the leading `(?:^|[\s:,"'])` group of the
// path patterns can consume. Go's `\s` class is [\t\n\f\r ], so '\r' and '\f'
// are included; leaving them out would make replaceDelimitedMatch swallow
// that character together with the path.
const delimiters = " \t\n\r\f:,\"'"

// replaceDelimitedMatch replaces paths matched by a regex that uses a leading
// delimiter group `(?:^|[\s:,"'])`. The optional delimiter character is
// preserved and only the path itself is replaced.
//
// replacement is inserted literally; `$` sequences are not expanded.
func replaceDelimitedMatch(msg string, re *regexp.Regexp, replacement string) string {
	return re.ReplaceAllStringFunc(msg, func(match string) string {
		if match == "" {
			return match
		}
		// The delimiter group consumes at most one byte. If the match starts
		// with one, keep it and replace the rest.
		if strings.IndexByte(delimiters, match[0]) >= 0 {
			return match[:1] + replacement
		}
		// Otherwise the match starts at ^ and the whole match is the path.
		return replacement
	})
}
0 commit comments