diff --git a/api.go b/api.go
index 2c8eba6..0021c0e 100644
--- a/api.go
+++ b/api.go
@@ -85,6 +85,32 @@ type SafeFloat = i.SafeFloat
 // SafeRune aliases rune. See the explanation for SafeString.
 type SafeRune = i.SafeRune
 
+// HashValue is a marker interface to be implemented by types whose
+// values should be hashed when redacted, instead of being replaced
+// with the redaction marker. It aliases the interface in package interfaces.
+type HashValue = i.HashValue
+
+// HashString represents a string that should be hashed when redacted.
+type HashString = i.HashString
+
+// HashInt represents a signed integer that should be hashed when redacted.
+type HashInt = i.HashInt
+
+// HashUint represents an unsigned integer that should be hashed when redacted.
+type HashUint = i.HashUint
+
+// HashFloat represents a floating-point value that should be hashed when redacted.
+type HashFloat = i.HashFloat
+
+// HashRune represents a rune that should be hashed when redacted.
+type HashRune = i.HashRune
+
+// HashByte represents a byte that should be hashed when redacted.
+type HashByte = i.HashByte
+
+// HashBytes represents a byte slice that should be hashed when redacted.
+type HashBytes = i.HashBytes
+
 // RedactableString is a string that contains a mix of safe and unsafe
 // bits of data, but where it is known that unsafe bits are enclosed
 // by redaction markers ‹ and ›, and occurrences of the markers
@@ -173,3 +199,11 @@ type StringBuilder = builder.StringBuilder
 func RegisterSafeType(t reflect.Type) {
 	ifmt.RegisterSafeType(t)
 }
+
+// EnableHashing enables hash-based redaction with an optional salt.
+// Hash markers (‹†value›) will be replaced with hashes instead of being fully redacted.
+// When salt is nil or empty, hash markers use plain SHA1.
+// When a non-empty salt is provided, hash markers use HMAC-SHA1 for better security.
+func EnableHashing(salt []byte) {
+	m.EnableHashing(salt)
+}
diff --git a/interfaces/interfaces.go b/interfaces/interfaces.go
index bd13a94..a7ac8a9 100644
--- a/interfaces/interfaces.go
+++ b/interfaces/interfaces.go
@@ -158,6 +158,61 @@ type SafeValue interface {
 	SafeValue()
 }
 
+// HashValue is a marker interface to be implemented by types whose
+// values should be hashed when redacted, instead of being replaced
+// with the redaction marker. The hash path is only taken while hashing
+// is enabled; otherwise such values go through the regular print path.
+//
+// Hashing allows correlation between log entries while maintaining privacy.
+// The hash is computed during the Redact() call, not at log creation time.
+// Types implementing this interface should alias base Go types.
+// The hash is applied to the string representation of the value.
+type HashValue interface {
+	HashValue()
+}
+
+// HashString represents a string that should be hashed when redacted.
+type HashString string
+
+// HashValue makes HashString a HashValue.
+func (HashString) HashValue() {}
+
+// HashInt represents a signed integer (int64) that should be hashed when redacted.
+type HashInt int64
+
+// HashValue makes HashInt a HashValue.
+func (HashInt) HashValue() {}
+
+// HashUint represents an unsigned integer (uint64) that should be hashed when redacted.
+type HashUint uint64
+
+// HashValue makes HashUint a HashValue.
+func (HashUint) HashValue() {}
+
+// HashFloat represents a floating-point value (float64) that should be hashed when redacted.
+type HashFloat float64
+
+// HashValue makes HashFloat a HashValue.
+func (HashFloat) HashValue() {}
+
+// HashRune represents a rune that should be hashed when redacted.
+type HashRune rune
+
+// HashValue makes HashRune a HashValue.
+func (HashRune) HashValue() {}
+
+// HashByte represents a byte that should be hashed when redacted.
+type HashByte byte
+
+// HashValue makes HashByte a HashValue.
+func (HashByte) HashValue() {}
+
+// HashBytes represents a byte slice that should be hashed when redacted.
+type HashBytes []byte
+
+// HashValue makes HashBytes a HashValue.
+func (HashBytes) HashValue() {}
+
 // SafeMessager is an alternative to SafeFormatter used in previous
 // versions of CockroachDB.
 // NB: this interface is obsolete. Use SafeFormatter instead.
diff --git a/internal/markers/constants.go b/internal/markers/constants.go
index f062f7e..e734fea 100644
--- a/internal/markers/constants.go
+++ b/internal/markers/constants.go
@@ -27,6 +27,8 @@ const (
 	EscapeMark  = '?'
 	EscapeMarkS = string(EscapeMark)
 	RedactedS   = StartS + "×" + EndS
+	HashPrefix  = '†' // follows the start marker in a hash envelope: ‹†value›
+	HashPrefixS = string(HashPrefix)
 )
 
 // Internal variables.
@@ -35,6 +37,6 @@ var (
 	EndBytes         = []byte(EndS)
 	EscapeMarkBytes  = []byte(EscapeMarkS)
 	RedactedBytes    = []byte(RedactedS)
-	ReStripSensitive = regexp.MustCompile(StartS + "[^" + StartS + EndS + "]*" + EndS)
-	ReStripMarkers   = regexp.MustCompile("[" + StartS + EndS + "]")
+	ReStripSensitive = regexp.MustCompile(StartS + HashPrefixS + "?" + "[^" + StartS + EndS + "]*" + EndS) // one whole envelope, ‹...› or ‹†...›
+	ReStripMarkers   = regexp.MustCompile(StartS + HashPrefixS + "?|" + EndS)                               // † is a marker only directly after ‹; a † elsewhere is ordinary text and is kept
 )
diff --git a/internal/markers/hash.go b/internal/markers/hash.go
new file mode 100644
index 0000000..272f9f6
--- /dev/null
+++ b/internal/markers/hash.go
@@ -0,0 +1,124 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package markers
+
+import (
+	"crypto/hmac"
+	"crypto/sha1"
+	"encoding/hex"
+	"sync"
+)
+
+/*
+	Hash function implementation notes:
+
+	We use SHA-1 because it provides a good balance of performance and output size
+	for the log correlation use case. SHA-1 is fast and produces compact hashes,
+	making it well-suited for high-throughput logging scenarios.
+
+	When a salt is provided via EnableHashing(), we use HMAC-SHA1 which provides
+	additional security properties and domain separation.
+
+	The hash output is truncated to 8 hex characters (32 bits) to keep log output
+	concise while still providing sufficient collision resistance for typical logging
+	workloads.
+*/
+
+// defaultHashLength is the number of hex characters to use from the SHA-1 hash.
+// 8 hex chars = 32 bits = ~4.3 billion unique values (birthday bound: ~50% odds of a collision near 77,000 distinct inputs).
+// This provides a good balance between collision resistance and output brevity.
+// For typical logging scenarios with fewer unique sensitive values per analysis window,
+// this collision risk should be acceptable. If not, we can make this configurable in the future.
+const defaultHashLength = 8
+
+var hashConfig = struct {
+	sync.RWMutex
+	enabled bool
+	salt    []byte
+}{
+	enabled: false,
+	salt:    nil,
+}
+
+// EnableHashing enables hash-based redaction with an optional salt, which is
+// copied so that later writes to the caller's slice cannot change the hashes.
+// A nil or empty salt selects plain SHA1; otherwise HMAC-SHA1 is used.
+func EnableHashing(salt []byte) {
+	hashConfig.Lock()
+	defer hashConfig.Unlock()
+	hashConfig.enabled = true
+	hashConfig.salt = append([]byte(nil), salt...) // private copy; stays nil for a nil or empty salt
+}
+
+// IsHashingEnabled returns true if hash-based redaction is enabled.
+func IsHashingEnabled() bool {
+	hashConfig.RLock()
+	defer hashConfig.RUnlock()
+	return hashConfig.enabled
+}
+
+// hashString computes a truncated hash of the input string.
+// Uses HMAC-SHA1 if a non-empty salt is set, otherwise plain SHA1.
+// Must only be called when hashing is enabled (IsHashingEnabled() == true).
+func hashString(value string) string {
+	hashConfig.RLock()
+	salt := hashConfig.salt
+	hashConfig.RUnlock()
+
+	var h []byte
+	if len(salt) > 0 {
+		mac := hmac.New(sha1.New, salt)
+		mac.Write([]byte(value))
+		h = mac.Sum(nil)
+	} else {
+		hasher := sha1.New()
+		hasher.Write([]byte(value))
+		h = hasher.Sum(nil)
+	}
+
+	fullHash := hex.EncodeToString(h)
+	if len(fullHash) > defaultHashLength {
+		return fullHash[:defaultHashLength]
+	}
+	return fullHash
+}
+
+// hashBytes computes a truncated hash of the input byte slice.
+// Uses HMAC-SHA1 if a non-empty salt is set, otherwise plain SHA1.
+// Must only be called when hashing is enabled (IsHashingEnabled() == true).
+func hashBytes(value []byte) []byte {
+	hashConfig.RLock()
+	salt := hashConfig.salt
+	hashConfig.RUnlock()
+
+	var h []byte
+	if len(salt) > 0 {
+		mac := hmac.New(sha1.New, salt)
+		mac.Write(value)
+		h = mac.Sum(nil)
+	} else {
+		hasher := sha1.New()
+		hasher.Write(value)
+		h = hasher.Sum(nil)
+	}
+
+	fullHash := make([]byte, hex.EncodedLen(len(h))) // two hex digits per digest byte
+	_ = hex.Encode(fullHash, h)
+
+	if len(fullHash) > defaultHashLength {
+		return fullHash[:defaultHashLength]
+	}
+	return fullHash
+}
diff --git a/internal/markers/hash_test.go b/internal/markers/hash_test.go
new file mode 100644
index 0000000..f192081
--- /dev/null
+++ b/internal/markers/hash_test.go
@@ -0,0 +1,162 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing
+// permissions and limitations under the License.
+
+package markers
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestHash(t *testing.T) {
+	// Snapshot the package-level hashConfig and put it back when the test ends.
+	hashConfig.RLock()
+	originalEnabled := hashConfig.enabled
+	originalSalt := hashConfig.salt
+	hashConfig.RUnlock()
+	defer func() {
+		hashConfig.Lock()
+		hashConfig.enabled = originalEnabled
+		hashConfig.salt = originalSalt
+		hashConfig.Unlock()
+	}()
+
+	testCases := []struct { // shared by the "string" and "bytes" subtests below
+		name     string
+		input    string
+		salt     []byte
+		expected string
+	}{
+		{
+			name:     "simple string",
+			input:    "test",
+			salt:     nil,
+			expected: "a94a8fe5",
+		},
+		{
+			name:     "empty string",
+			input:    "",
+			salt:     nil,
+			expected: "da39a3ee",
+		},
+		{
+			name:     "input exceeding hash length",
+			input:    strings.Repeat("long-input-", 100),
+			salt:     nil,
+			expected: "c375461f",
+		},
+		{
+			name:     "numeric string",
+			input:    "12345",
+			salt:     nil,
+			expected: "8cb2237d",
+		},
+		{
+			name:     "simple string with salt",
+			input:    "test",
+			salt:     []byte("my-salt"),
+			expected: "c48ce5fd",
+		},
+		{
+			name:     "empty string with salt",
+			input:    "",
+			salt:     []byte("my-salt"),
+			expected: "7b1829af",
+		},
+	}
+
+	t.Run("string", func(t *testing.T) {
+		for _, tc := range testCases {
+			t.Run(tc.name, func(t *testing.T) {
+				EnableHashing(tc.salt) // each case installs its own salt in the global config
+
+				resultString := hashString(tc.input)
+				if resultString != tc.expected {
+					t.Errorf("hashString(%q) = %q, expected %q", tc.input, resultString, tc.expected)
+				}
+				if len(resultString) != 8 {
+					t.Errorf("hashString(%q) returned %d characters, expected 8", tc.input, len(resultString))
+				}
+			})
+		}
+	})
+
+	t.Run("bytes", func(t *testing.T) {
+		for _, tc := range testCases {
+			t.Run(tc.name, func(t *testing.T) {
+				EnableHashing(tc.salt) // each case installs its own salt in the global config
+				resultBytes := hashBytes([]byte(tc.input))
+				if string(resultBytes) != tc.expected {
+					t.Errorf("hashBytes(%q) = %q, expected %q", tc.input, resultBytes, tc.expected)
+				}
+				if len(resultBytes) != 8 {
+					t.Errorf("hashBytes(%q) returned %d bytes, expected 8", tc.input, len(resultBytes))
+				}
+			})
+		}
+	})
+}
+
+func TestHashDeterminism(t *testing.T) {
+	// Snapshot the package-level hashConfig and put it back when the test ends.
+	hashConfig.RLock()
+	originalEnabled := hashConfig.enabled
+	originalSalt := hashConfig.salt
+	hashConfig.RUnlock()
+	defer func() {
+		hashConfig.Lock()
+		hashConfig.enabled = originalEnabled
+		hashConfig.salt = originalSalt
+		hashConfig.Unlock()
+	}()
+
+	EnableHashing(nil) // unsalted: plain SHA1
+
+	input := "test-value"
+	input2 := "different-value"
+
+	t.Run("hashString", func(t *testing.T) {
+		// Hashing the same input twice must give the same output.
+		hash1 := hashString(input)
+		hash2 := hashString(input)
+
+		if hash1 != hash2 {
+			t.Errorf("hashString is not deterministic: hashString(%q) returned %q and %q", input, hash1, hash2)
+		}
+
+		// These two specific inputs are expected to hash differently.
+		hash3 := hashString(input2)
+
+		if hash1 == hash3 {
+			t.Errorf("Different inputs produced same hash: hashString(%q) = hashString(%q) = %q", input, input2, hash1)
+		}
+	})
+
+	t.Run("hashBytes", func(t *testing.T) {
+		// Hashing the same input twice must give the same output.
+		hash1 := hashBytes([]byte(input))
+		hash2 := hashBytes([]byte(input))
+
+		if string(hash1) != string(hash2) {
+			t.Errorf("hashBytes is not deterministic: hashBytes(%q) returned %q and %q", input, hash1, hash2)
+		}
+
+		// These two specific inputs are expected to hash differently.
+		hash3 := hashBytes([]byte(input2))
+
+		if string(hash1) == string(hash3) {
+			t.Errorf("Different inputs produced same hash: hashBytes(%q) = hashBytes(%q) = %q", input, input2, hash1)
+		}
+	})
+}
diff --git a/internal/markers/markers.go b/internal/markers/markers.go
index f6bfa10..6053b9c 100644
--- a/internal/markers/markers.go
+++ b/internal/markers/markers.go
@@ -14,7 +14,11 @@
 
 package markers
 
-import i "github.com/cockroachdb/redact/interfaces"
+import (
+	"bytes"
+
+	i "github.com/cockroachdb/redact/interfaces"
+)
 
 // RedactableString is a string that contains a mix of safe and unsafe
 // bits of data, but where it is known that unsafe bits are enclosed
@@ -34,9 +38,26 @@ func (s
RedactableString) StripMarkers() string {
 }
 
 // Redact replaces all occurrences of unsafe substrings by the
-// “Redacted” marker, ‹×›. The result string is still safe.
+// "Redacted" marker, ‹×›. Hash markers (‹†value›) are replaced
+// with hashed values (‹hash›) if hashing is enabled, otherwise
+// they are redacted like regular markers. The result string is still safe.
 func (s RedactableString) Redact() RedactableString {
-	return RedactableString(ReStripSensitive.ReplaceAllString(string(s), RedactedS))
+	result := ReStripSensitive.ReplaceAllStringFunc(string(s), func(match string) string {
+		// A hash marker has † right after ‹; the length check keeps the slice below in bounds.
+		if len(match) > len(StartS)+len(EndS) &&
+			match[len(StartS):len(StartS)+len(HashPrefixS)] == HashPrefixS {
+			// Only hash if hashing is enabled, otherwise redact
+			if IsHashingEnabled() {
+				// Extract value without ‹† and › and return its hash
+				value := match[len(StartS)+len(HashPrefixS) : len(match)-len(EndS)]
+				return StartS + hashString(value) + EndS
+			}
+		}
+		// Regular marker or disabled hashing - replace with ×. NOTE(review): an already-hashed ‹hash› has no † prefix and also lands here, so redacting twice turns hashes into ×.
+		return RedactedS
+	})
+
+	return RedactableString(result)
 }
 
 // ToBytes converts the string to a byte slice.
@@ -66,9 +87,32 @@ func (s RedactableBytes) StripMarkers() []byte {
 }
 
 // Redact replaces all occurrences of unsafe substrings by the
-// “Redacted” marker, ‹×›.
+// "Redacted" marker, ‹×›. Hash markers (‹†value›) are replaced
+// with hashed values (‹hash›) if hashing is enabled, otherwise
+// they are redacted like regular markers (‹×›).
func (s RedactableBytes) Redact() RedactableBytes {
-	return RedactableBytes(ReStripSensitive.ReplaceAll(s, RedactedBytes))
+	result := ReStripSensitive.ReplaceAllFunc([]byte(s), func(match []byte) []byte {
+		// A hash marker has † right after ‹; the length check keeps the slice below in bounds.
+		if len(match) > len(StartBytes)+len(EndBytes) &&
+			bytes.Equal(match[len(StartBytes):len(StartBytes)+len(HashPrefixS)], []byte(HashPrefixS)) {
+			// Only hash if hashing is enabled, otherwise redact
+			if IsHashingEnabled() {
+				// Extract value without ‹† and › and return its hash
+				value := match[len(StartBytes)+len(HashPrefixS) : len(match)-len(EndBytes)]
+				hashed := hashBytes(value)
+				res := make([]byte, len(StartBytes)+len(hashed)+len(EndBytes))
+				n := copy(res, StartBytes)
+				n += copy(res[n:], hashed)
+				copy(res[n:], EndBytes)
+				return res
+			}
+		}
+
+		// Regular marker or disabled hashing - replace with ×. NOTE(review): an already-hashed ‹hash› has no † prefix and also lands here, so redacting twice turns hashes into ×.
+		return RedactedBytes
+	})
+
+	return RedactableBytes(result)
 }
 
 // ToString converts the byte slice to a string.
diff --git a/internal/rfmt/print.go b/internal/rfmt/print.go
index 16ba3ce..4c50eaf 100644
--- a/internal/rfmt/print.go
+++ b/internal/rfmt/print.go
@@ -67,7 +67,7 @@ type Formatter interface {
 }
 
 // Stringer is implemented by any value that has a String method,
-// which defines the ``native'' format for that value.
+// which defines the “native” format for that value.
 // The String method is used to print values passed as an operand
 // to any format that accepts a string or to an unformatted printer
 // such as Print.
@@ -695,6 +695,53 @@ func (p *pp) handleMethods(verb rune) (handled bool) {
 	return false
 }
 
+// tryWriteHashMarker checks if arg implements HashValue and hashing is enabled.
+// If so, it writes the value wrapped in hash markers (‹†value›) and returns true.
+// Otherwise it writes nothing and returns false.
+//
+// Redaction markers occurring inside the value are replaced by the escape
+// mark. The envelope is written in PreRedactable mode, which is used here to
+// prevent marker escaping, so a raw › in the value would otherwise end the
+// envelope early and expose the rest of the value as safe text.
+func (p *pp) tryWriteHashMarker(arg interface{}) bool {
+	if _, ok := arg.(i.HashValue); !ok || !m.IsHashingEnabled() {
+		return false
+	}
+	// Mark the value for hashing during Redact(), but preserve the original value.
+	// Use PreRedactable mode so the envelope markers themselves are not escaped.
+	defer p.startPreRedactable().restore()
+	valueStr := origFmt.Sprint(arg)
+	p.buf.writeString(m.StartS)
+	p.buf.writeString(m.HashPrefixS)
+	p.writeHashMarkerValue(valueStr)
+	p.buf.writeString(m.EndS)
+	return true
+}
+
+// writeHashMarkerValue writes s to the buffer, substituting the escape mark
+// for every start or end marker found in s. All other bytes are copied as-is.
+func (p *pp) writeHashMarkerValue(s string) {
+	start := 0
+	for pos := 0; pos < len(s); {
+		n := 0
+		switch {
+		case len(s)-pos >= len(m.StartS) && s[pos:pos+len(m.StartS)] == m.StartS:
+			n = len(m.StartS)
+		case len(s)-pos >= len(m.EndS) && s[pos:pos+len(m.EndS)] == m.EndS:
+			n = len(m.EndS)
+		}
+		if n == 0 {
+			pos++
+			continue
+		}
+		p.buf.writeString(s[start:pos])
+		p.buf.writeString(m.EscapeMarkS)
+		pos += n
+		start = pos
+	}
+	p.buf.writeString(s[start:])
+}
+
 func (p *pp) printArg(arg interface{}, verb rune) {
 	t := reflect.TypeOf(arg)
 	if safeTypeRegistry[t] {
@@ -707,8 +754,10 @@ func (p *pp) printArg(arg interface{}, verb rune) {
 		arg = arg.(w.UnsafeWrap).GetValue()
 	}
 
-	if _, ok := arg.(i.SafeValue); ok {
+	if _, isSafe := arg.(i.SafeValue); isSafe {
 		defer p.startSafeOverride().restore()
+	} else if p.tryWriteHashMarker(arg) {
+		return
 	}
 
 	p.arg = arg
@@ -788,8 +837,10 @@ func (p *pp) printArg(arg interface{}, verb rune) {
 
 		if f.CanInterface() {
 			p.arg = f.Interface()
-			if _, ok := p.arg.(i.SafeValue); ok {
+			if _, isSafe := p.arg.(i.SafeValue); isSafe {
 				defer p.startSafeOverride().restore()
+			} else if p.tryWriteHashMarker(p.arg) {
+				return
 			}
 			if p.handleMethods(verb) {
 				return
@@ -831,8 +882,10 @@ func (p *pp) printValue(value reflect.Value, verb rune, depth int) {
 
 		if value.CanInterface() {
 			p.arg = value.Interface()
-			if _, ok := p.arg.(i.SafeValue); ok {
+			if _, isSafe := p.arg.(i.SafeValue); isSafe {
 				defer p.startSafeOverride().restore()
+			} else if p.tryWriteHashMarker(p.arg) {
+				return
 			}
 			if p.handleMethods(verb) {
 				return