Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 45 additions & 19 deletions digest.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,20 @@ import (
// DigestEncoding represents the encoding used for WARC digest values.
type DigestEncoding uint8

var (
base32NoPaddingEncoding = base32.StdEncoding.WithPadding(base32.NoPadding)
base64NoPaddingEncoding = base64.StdEncoding.WithPadding(base64.NoPadding)
)

func (d DigestEncoding) encode(digest *digest) string {
dig := digest.Sum(nil)
switch d {
case Base16:
return strings.ToLower(hex.EncodeToString(dig))
case Base32:
return base32.StdEncoding.EncodeToString(dig)
return base32NoPaddingEncoding.EncodeToString(dig)
case Base64:
return base64.StdEncoding.EncodeToString(dig)
return base64NoPaddingEncoding.EncodeToString(dig)
default:
return string(dig)
}
Expand All @@ -53,9 +58,15 @@ func (d DigestEncoding) decode(s string) ([]byte, error) {
case Base16:
return hex.DecodeString(s)
case Base32:
return base32.StdEncoding.DecodeString(s)
if strings.HasSuffix(s, "=") {
return base32.StdEncoding.DecodeString(s)
}
return base32NoPaddingEncoding.DecodeString(s)
case Base64:
return base64.StdEncoding.DecodeString(s)
if strings.HasSuffix(s, "=") {
return base64.StdEncoding.DecodeString(s)
}
return base64NoPaddingEncoding.DecodeString(s)
default:
return []byte(s), nil
}
Expand All @@ -68,13 +79,25 @@ const (
Base64 DigestEncoding = 3
)

// recommendedEncoding returns the WARC spec community-recommended encoding for the
// given algorithm. SHA-1 uses Base32 (no padding needed). All others use Base16 to
// avoid the need for padding characters which are forbidden in digest-value tokens.
func recommendedEncoding(algorithm string) DigestEncoding {
switch algorithm {
case "sha1":
return Base32
default:
return Base16
}
Comment thread
maeb marked this conversation as resolved.
}

func detectEncoding(algorithm, digest string, defaultEncoding DigestEncoding) DigestEncoding {
var algorithmLength int
switch algorithm {
case "md5":
if len(digest) == 32 {
// Special handling for md5 where encoded length are the same for base16 and base32.
// Distinction can be done on base32 padding
// Special handling for md5 where padded base32 encoded length (32) is the same as base16.
// Distinction can be done on base32 padding suffix.
if strings.HasSuffix(digest, "=") {
return Base32
} else {
Expand All @@ -88,13 +111,15 @@ func detectEncoding(algorithm, digest string, defaultEncoding DigestEncoding) Di
algorithmLength = sha256.Size
case "sha512":
algorithmLength = sha512.Size
default:
return defaultEncoding
}
switch len(digest) {
case algorithmLength * 2:
switch l := len(digest); {
case l == algorithmLength*2:
return Base16
case base32.StdEncoding.EncodedLen(algorithmLength):
case l == base32.StdEncoding.EncodedLen(algorithmLength) || l == base32NoPaddingEncoding.EncodedLen(algorithmLength):
return Base32
case base64.StdEncoding.EncodedLen(algorithmLength):
case l == base64.StdEncoding.EncodedLen(algorithmLength) || l == base64NoPaddingEncoding.EncodedLen(algorithmLength):
return Base64
}
return defaultEncoding
Expand Down Expand Up @@ -181,12 +206,10 @@ func (d *digest) updateDigest() {
// The encoding is deduced from the length of the digestValue. If only the algorithm is submitted, or the
// digestValue's length matches none of the supported encodings, the value of defaultEncoding is used.
func newDigest(digestString string, defaultEncoding DigestEncoding) (*digest, error) {
t := strings.SplitN(digestString, ":", 2)
algorithm := t[0]
algorithm, hash, _ := strings.Cut(digestString, ":")
algorithm = normalizeAlgorithmName(algorithm)
var hash string
if len(t) > 1 {
hash = t[1]
if defaultEncoding == unknown {
defaultEncoding = recommendedEncoding(algorithm)
}
encoding := detectEncoding(algorithm, hash, defaultEncoding)
switch encoding {
Expand All @@ -206,7 +229,7 @@ func newDigest(digestString string, defaultEncoding DigestEncoding) (*digest, er
case "sha512":
return &digest{sha512.New(), algorithm, hash, 0, encoding}, nil
case "":
return &digest{sha1.New(), "sha1", hash, 0, encoding}, nil
return &digest{sha256.New(), "sha256", hash, 0, encoding}, nil
default:
return nil, fmt.Errorf("%w: %s", ErrUnsupportedDigestAlgorithm, algorithm)
Comment thread
maeb marked this conversation as resolved.
}
Expand All @@ -215,13 +238,16 @@ func newDigest(digestString string, defaultEncoding DigestEncoding) (*digest, er
// newDigestFromField takes a warcRecord and a digest-field name and creates a new digest from it.
//
// If the digest-field is missing from the warcRecord a digest is created with the default algorithm and encoding set
// in the warcRecord's options
// in the warcRecord's options. If no encoding is configured (unknown), the spec-recommended encoding for the
// algorithm is used.
func newDigestFromField(wr *warcRecord, warcDigestField string) (d *digest, err error) {
var digestString string
if wr.WarcHeader().Has(warcDigestField) {
d, err = newDigest(wr.WarcHeader().Get(warcDigestField), wr.opts.defaultDigestEncoding)
digestString = wr.WarcHeader().Get(warcDigestField)
} else {
d, err = newDigest(wr.opts.defaultDigestAlgorithm, wr.opts.defaultDigestEncoding)
digestString = wr.opts.defaultDigestAlgorithm
}
d, err = newDigest(digestString, wr.opts.defaultDigestEncoding)
Comment thread
maeb marked this conversation as resolved.
return
}

Expand Down
45 changes: 32 additions & 13 deletions digest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,26 +38,26 @@ func Test_newDigest(t *testing.T) {
}{
{"md5", "md5", "Some content", Base16, "md5", "md5:b53227da4280f0e18270f21dd77c91d0", false},
{"md5 with base16 digest", "md5:12345", "Some content", Base16, "md5", "md5:b53227da4280f0e18270f21dd77c91d0", false},
{"md5 with base32 digest", "md5:12345", "Some content", Base32, "md5", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A======", false},
{"md5 with base64 digest", "md5:12345", "Some content", Base64, "md5", "md5:tTIn2kKA8OGCcPId13yR0A==", false},
{"md5 with base32 digest", "md5:12345", "Some content", Base32, "md5", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A", false},
{"md5 with base64 digest", "md5:12345", "Some content", Base64, "md5", "md5:tTIn2kKA8OGCcPId13yR0A", false},
{"sha1", "sha1", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha1 with base16 digest", "sha1:12345", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha-1 with base16 digest", "sha-1:12345", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha1 with base32 digest", "sha1:12345", "Some content", Base32, "sha1", "sha1:T4NG5T3U5H43DLSS5DVVQHKCBZR6QRJ2", false},
{"sha1 with base64 digest", "sha1:12345", "Some content", Base64, "sha1", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo=", false},
{"sha1 with base64 digest", "sha1:12345", "Some content", Base64, "sha1", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo", false},
{"sha256", "sha256", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha-256", "sha256", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha256 with base16 digest", "sha256:12345", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha256 with base32 digest", "sha256:12345", "Some content", Base32, "sha256", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ====", false},
{"sha256 with base64 digest", "sha256:12345", "Some content", Base64, "sha256", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs=", false},
{"sha256 with base32 digest", "sha256:12345", "Some content", Base32, "sha256", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ", false},
{"sha256 with base64 digest", "sha256:12345", "Some content", Base64, "sha256", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs", false},
{"sha512", "sha512", "Some content", Base16, "sha512", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", false},
{"sha512 with base16 digest", "sha512:12345", "Some content", Base16, "sha512", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", false},
{"sha512 with base32 digest", "sha512:12345", "Some content", Base32, "sha512", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI=", false},
{"sha512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", false},
{"sha-512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", false},
{"sha512 with base32 digest", "sha512:12345", "Some content", Base32, "sha512", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI", false},
{"sha512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ", false},
{"sha-512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ", false},
{"unknown algorithm", "mysecret:12345", "Some content", Base16, "mysecret", "mysecret:123", true},
{"unknown algorithm with digest", "mysecret:12345", "Some content", Base16, "mysecret", "mysecret:123", true},
{"empty algorithm defaults to sha1", "", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"empty algorithm defaults to sha256", "", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"unsupported algorithm sha3", "sha3", "Some content", Base16, "sha3", "", true},
{"unsupported algorithm blake2", "blake2", "Some content", Base16, "blake2", "", true},
}
Expand Down Expand Up @@ -94,28 +94,35 @@ func Test_digest_validate(t *testing.T) {
{"md5", "Some content", "md5", false},
{"md5 with base16 digest", "Some content", "md5:b53227da4280f0e18270f21dd77c91d0", true},
{"md5 with base32 digest", "Some content", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A======", true},
{"md5 with unpadded base32 digest", "Some content", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A", true},
{"md5 with base64 digest", "Some content", "md5:tTIn2kKA8OGCcPId13yR0A==", true},
{"md5 with unpadded base64 digest", "Some content", "md5:tTIn2kKA8OGCcPId13yR0A", true},
{"md5 with wrong digest", "Some content", "md5:123", false},
{"sha1", "Some content", "sha1", false},
{"sha1 with base16 digest", "Some content", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"SHA-1 with base16 digest", "Some content", "SHA-1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"sha1 with base32 digest", "Some content", "sha1:T4NG5T3U5H43DLSS5DVVQHKCBZR6QRJ2", true},
{"sha1 with base64 digest", "Some content", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo=", true},
{"sha1 with unpadded base64 digest", "Some content", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo", true},
{"sha1 with wrong digest", "Some content", "sha1:123", false},
{"sha256", "Some content", "sha256", false},
{"sha256 with base16 digest", "Some content", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", true},
{"SHA-256 with base16 digest", "Some content", "SHA-256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", true},
{"sha256 with base32 digest", "Some content", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ====", true},
{"sha256 with unpadded base32 digest", "Some content", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ", true},
{"sha256 with base64 digest", "Some content", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs=", true},
{"sha256 with unpadded base64 digest", "Some content", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs", true},
{"sha256 with wrong digest", "Some content", "sha256:123", false},
{"sha512", "Some content", "sha512", false},
{"sha512 with base16 digest", "Some content", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", true},
{"sha512 with base32 digest", "Some content", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI=", true},
{"sha512 with unpadded base32 digest", "Some content", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI", true},
{"sha512 with base64 digest", "Some content", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", true},
{"sha512 with unpadded base64 digest", "Some content", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ", true},
{"sha512 with wrong digest", "Some content", "sha512:123", false},
{"uppercase base16 encoding", "Some content", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", true},
{"lovercase base32 encoding", "Some content", "sha1:t4ng5t3u5h43dlss5dvvqhkcbzr6qrj2", true},
{"lovercase base64 encoding", "Some content", "sha1:nxpuz3tp+bguuujrwb1cdmporto=", false},
{"lowercase base32 encoding", "Some content", "sha1:t4ng5t3u5h43dlss5dvvqhkcbzr6qrj2", true},
{"lowercase base64 encoding", "Some content", "sha1:nxpuz3tp+bguuujrwb1cdmporto=", false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down Expand Up @@ -158,12 +165,24 @@ func TestDigestEncoding_Decode(t *testing.T) {
input: "BT7V3XGGA4OVPYPVNTNJUMTXGY======",
wantError: false,
},
{
name: "valid Base32 unpadded",
encoding: Base32,
input: "BT7V3XGGA4OVPYPVNTNJUMTXGY",
wantError: false,
},
{
name: "valid Base64",
encoding: Base64,
input: "CY9rzUYh03PK3k6DJie09g==",
wantError: false,
},
{
name: "valid Base64 unpadded",
encoding: Base64,
input: "CY9rzUYh03PK3k6DJie09g",
wantError: false,
},
{
name: "invalid Base16",
encoding: Base16,
Expand Down Expand Up @@ -215,14 +234,14 @@ func TestDetectEncoding(t *testing.T) {
name: "md5 base16 (32 chars, no padding)",
algorithm: "md5",
digest: "098f6bcd4621d373cade4e832627b4f6",
defaultEncoding: Base64,
defaultEncoding: Base32,
want: Base16,
},
{
name: "md5 base32 (32 chars with padding)",
algorithm: "md5",
digest: "BT7V3XGGA4OVPYPVNTNJUMTXGY======",
defaultEncoding: Base64,
defaultEncoding: Base16,
want: Base32,
},
{
Expand Down
21 changes: 16 additions & 5 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type warcRecordOptions struct {
fixWarcFieldsBlockErrors bool
defaultDigestAlgorithm string
defaultDigestEncoding DigestEncoding
digestEncodingSet bool
bufferOptions []diskbuffer.Option
urlParserOptions []url.ParserOption
}
Expand All @@ -64,6 +65,7 @@ func (f WarcRecordOption) apply(o *warcRecordOptions) { f(o) }

func defaultWarcRecordOptions() warcRecordOptions {
uuid.EnableRandPool()
defaultDigestAlgorithm := normalizeAlgorithmName("sha256")
return warcRecordOptions{
warcVersion: V1_1,
errSyntax: ErrWarn,
Expand All @@ -75,8 +77,8 @@ func defaultWarcRecordOptions() warcRecordOptions {
recordIdFunc: defaultIdGenerator,
addMissingContentLength: false,
addMissingDigest: false,
defaultDigestAlgorithm: "sha1",
defaultDigestEncoding: Base32,
defaultDigestAlgorithm: defaultDigestAlgorithm,
defaultDigestEncoding: recommendedEncoding(defaultDigestAlgorithm),
fixContentLength: false,
fixDigest: false,
fixSyntaxErrors: false,
Comment thread
maeb marked this conversation as resolved.
Expand All @@ -90,6 +92,9 @@ func newOptions(opts ...WarcRecordOption) *warcRecordOptions {
for _, opt := range opts {
opt.apply(&o)
}
if !o.digestEncodingSet {
o.defaultDigestEncoding = recommendedEncoding(o.defaultDigestAlgorithm)
}
return &o
}

Expand Down Expand Up @@ -192,21 +197,27 @@ func WithAddMissingDigest(addMissingDigest bool) WarcRecordOption {
//
// Valid values: 'md5', 'sha1', 'sha256' and 'sha512'.
//
// defaults to sha1
// defaults to sha256
func WithDefaultDigestAlgorithm(defaultDigestAlgorithm string) WarcRecordOption {
return func(o *warcRecordOptions) {
o.defaultDigestAlgorithm = defaultDigestAlgorithm
o.defaultDigestAlgorithm = normalizeAlgorithmName(defaultDigestAlgorithm)
}
}

// WithDefaultDigestEncoding sets which encoding to use for digest generation.
//
// Valid values: Base16, Base32 and Base64.
//
// defaults to Base32
// Note: Base64 may violate strict WARC digest-value token grammar because
// Base64 output can contain '/' characters. Generated Base64 digest values
// are encoded without padding so no '=' characters will be present.
//
// By default, the spec-recommended encoding per algorithm is used:
// SHA-1 uses uppercase Base32, all others use lowercase Base16.
func WithDefaultDigestEncoding(defaultDigestEncoding DigestEncoding) WarcRecordOption {
	setEncoding := func(opts *warcRecordOptions) {
		// Mark the encoding as explicitly chosen; newOptions only substitutes
		// the recommended encoding when this flag is unset.
		opts.digestEncodingSet = true
		opts.defaultDigestEncoding = defaultDigestEncoding
	}
	return setEncoding
}

Expand Down
Loading