From f43ef7e736c51724baa88ce2831bed0218498bc1 Mon Sep 17 00:00:00 2001 From: Mike Jensen Date: Sun, 28 Dec 2025 17:51:04 -0700 Subject: [PATCH 1/6] feat: Initial implementation of base85 This implementation provides a Base85 implementation with an API similar to encoding/base64 so that custom alphabets and paddings can be provided. Using this we provide RFC1924 as a standard alphabet. This is being provided because the standard `ascii85` implementation does not allow custom character sets that would enable use of RFC1924 or other denser encodings (compared to Base64 which is the most dense built-in encoding) provided in standard go. --- Makefile | 17 + base85/decode_test.go | 233 +++++++++ base85/encode_test.go | 142 ++++++ base85/encoding.go | 548 ++++++++++++++++++++ base85/encoding_test.go | 1066 +++++++++++++++++++++++++++++++++++++++ go.mod | 11 + go.sum | 10 + 7 files changed, 2027 insertions(+) create mode 100644 Makefile create mode 100644 base85/decode_test.go create mode 100644 base85/encode_test.go create mode 100644 base85/encoding.go create mode 100644 base85/encoding_test.go create mode 100644 go.mod create mode 100644 go.sum diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6b56a26 --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +export GO111MODULE=on + +.PHONY: default test test-cover bench lint + + +test: + go test -race -cover ./... + +test-cover: + go test -race -coverprofile=test.out ./... && go tool cover --html=test.out + +bench: + go test --benchmem -benchtime=10s -bench='Benchmark.*' -run='^$$' + +lint: + golangci-lint run --timeout=600s && go vet ./... 
+ diff --git a/base85/decode_test.go b/base85/decode_test.go new file mode 100644 index 0000000..887528c --- /dev/null +++ b/base85/decode_test.go @@ -0,0 +1,233 @@ +package base85 + +import ( + "bytes" + "encoding/ascii85" + "io" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDecode(t *testing.T) { + t.Parallel() + + // test cases avoid 4 consecutive zero bytes (which ascii85.Encode outputs as 'z') + tests := []struct { + name string + input []byte + }{ + {"empty", []byte{}}, + {"single_byte", []byte{0x01}}, + {"two_bytes", []byte{0x01, 0x02}}, + {"three_bytes", []byte{0x01, 0x02, 0x03}}, + {"four_bytes", []byte{0x01, 0x02, 0x03, 0x04}}, + {"five_bytes", []byte{0x01, 0x02, 0x03, 0x04, 0x05}}, + {"eight_bytes", []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}}, + {"hello_world", []byte("Hello, World!")}, + {"binary_data", []byte{0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA}}, + {"high_values", []byte{0xFF, 0xFF, 0xFF, 0xFF}}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // encode with standard ascii85 + encoded := make([]byte, ascii85.MaxEncodedLen(len(tc.input))) + n := ascii85.Encode(encoded, tc.input) + encoded = encoded[:n] + + // decode with our implementation + dst := make([]byte, ascii85Encoding.DecodedLen(len(encoded))) + ndst, err := ascii85Encoding.Decode(dst, encoded) + + require.NoError(t, err) + assert.Equal(t, tc.input, dst[:ndst]) + }) + } +} + +func TestDecodeString(t *testing.T) { + t.Parallel() + + // bytes for "日本語" without triggering gosmopolitan + unicodeBytes := []byte{0xe6, 0x97, 0xa5, 0xe6, 0x9c, 0xac, 0xe8, 0xaa, 0x9e} + + tests := []struct { + name string + input []byte + }{ + {"empty", []byte{}}, + {"simple_text", []byte("test")}, + {"unicode", unicodeBytes}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // encode with standard ascii85 + encoded := make([]byte, ascii85.MaxEncodedLen(len(tc.input))) + n := 
ascii85.Encode(encoded, tc.input) + encodedStr := string(encoded[:n]) + + // decode with our implementation + decoded, err := ascii85Encoding.DecodeString(encodedStr) + + require.NoError(t, err) + assert.Equal(t, tc.input, decoded) + }) + } +} + +func TestAppendDecode(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + prefix []byte + input []byte + }{ + {"empty_prefix", nil, []byte("test")}, + {"with_prefix", []byte("decoded:"), []byte("data")}, + {"empty_input", []byte("prefix:"), []byte{}}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // encode with standard ascii85 + encoded := make([]byte, ascii85.MaxEncodedLen(len(tc.input))) + n := ascii85.Encode(encoded, tc.input) + encoded = encoded[:n] + + // decode with our implementation + result, err := ascii85Encoding.AppendDecode(tc.prefix, encoded) + + require.NoError(t, err) + + // verify prefix is preserved + if tc.prefix != nil { + assert.Equal(t, string(tc.prefix), string(result[:len(tc.prefix)])) + } + + // verify decoded content + decodedPortion := result[len(tc.prefix):] + assert.Equal(t, tc.input, decodedPortion) + }) + } +} + +func TestDecodeCorruptInput(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input string + }{ + {"invalid_char", "hello\x00world"}, + {"out_of_alphabet", "~~~~~"}, + {"single_trailing_char", "AAAAA0"}, // 5 valid chars + 1 trailing (invalid) + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := ascii85Encoding.DecodeString(tc.input) + require.Error(t, err) + + var corruptErr CorruptInputError + assert.ErrorAs(t, err, &corruptErr) + }) + } +} + +func TestDecodeWhitespace(t *testing.T) { + t.Parallel() + + input := []byte("test") + + // encode with standard ascii85 + encoded := make([]byte, ascii85.MaxEncodedLen(len(input))) + n := ascii85.Encode(encoded, input) + encodedStr := string(encoded[:n]) + + // insert whitespace + withWhitespace := encodedStr[:2] + " \t\n\r" + encodedStr[2:] 
+ + // decode should ignore whitespace + decoded, err := ascii85Encoding.DecodeString(withWhitespace) + + require.NoError(t, err) + assert.Equal(t, input, decoded) +} + +func TestNewDecoder(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input []byte + readSize int // buffer size for Read calls, 0 means read all at once + }{ + {"empty", []byte{}, 0}, + {"single_byte", []byte{0x01}, 0}, + {"four_bytes", []byte{0x01, 0x02, 0x03, 0x04}, 0}, + {"five_bytes", []byte{0x01, 0x02, 0x03, 0x04, 0x05}, 0}, + {"hello_world", []byte("Hello, World!"), 0}, + {"small_reads", []byte("Hello, World!"), 3}, + {"byte_by_byte", []byte("test"), 1}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // encode with standard ascii85 + encoded := make([]byte, ascii85.MaxEncodedLen(len(tc.input))) + n := ascii85.Encode(encoded, tc.input) + encoded = encoded[:n] + + // decode with our stream decoder + dec := NewDecoder(ascii85Encoding, bytes.NewReader(encoded)) + + var decoded []byte + if tc.readSize == 0 { + // read all at once + var err error + decoded, err = io.ReadAll(dec) + require.NoError(t, err) + } else { + // read in chunks + buf := make([]byte, tc.readSize) + for { + nr, err := dec.Read(buf) + if nr > 0 { + decoded = append(decoded, buf[:nr]...) 
+ } + if err == io.EOF { + break + } + require.NoError(t, err) + } + } + + assert.Equal(t, tc.input, decoded) + }) + } +} + +func TestNewDecoderWithWhitespace(t *testing.T) { + t.Parallel() + + input := []byte("Hello, World!") + + // encode with standard ascii85 + encoded := make([]byte, ascii85.MaxEncodedLen(len(input))) + n := ascii85.Encode(encoded, input) + encodedStr := string(encoded[:n]) + + // insert whitespace + withWhitespace := encodedStr[:5] + "\n\t " + encodedStr[5:] + + // decode with stream decoder + dec := NewDecoder(ascii85Encoding, bytes.NewReader([]byte(withWhitespace))) + decoded, err := io.ReadAll(dec) + + require.NoError(t, err) + assert.Equal(t, input, decoded) +} diff --git a/base85/encode_test.go b/base85/encode_test.go new file mode 100644 index 0000000..5cc211c --- /dev/null +++ b/base85/encode_test.go @@ -0,0 +1,142 @@ +package base85 + +import ( + "bytes" + "encoding/ascii85" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// ASCII85 alphabet for testing against standard library +// Characters '!' 
(33) through 'u' (117) +const ascii85Alphabet = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu" + +var ascii85Encoding = NewEncoding(ascii85Alphabet) + +func TestEncodeToString(t *testing.T) { + t.Parallel() + + // bytes for "日本語" without triggering gosmopolitan + unicodeBytes := []byte{0xe6, 0x97, 0xa5, 0xe6, 0x9c, 0xac, 0xe8, 0xaa, 0x9e} + + tests := []struct { + name string + input []byte + }{ + {"empty", []byte{}}, + {"single_byte", []byte{0x01}}, + {"two_bytes", []byte{0x01, 0x02}}, + {"three_bytes", []byte{0x01, 0x02, 0x03}}, + {"four_bytes", []byte{0x01, 0x02, 0x03, 0x04}}, + {"five_bytes", []byte{0x01, 0x02, 0x03, 0x04, 0x05}}, + {"eight_bytes", []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}}, + {"hello_world", []byte("Hello, World!")}, + {"binary_data", []byte{0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA}}, + {"high_values", []byte{0xFF, 0xFF, 0xFF, 0xFF}}, + {"mixed_content", []byte("Test\x00\x01\x02data")}, + {"unicode", unicodeBytes}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + encoded := ascii85Encoding.EncodeToString(tc.input) + + // verify via standard library decode + dst := make([]byte, len(tc.input)+4) + ndst, _, err := ascii85.Decode(dst, []byte(encoded), true) + + require.NoError(t, err) + assert.Equal(t, tc.input, dst[:ndst]) + }) + } +} + +func TestAppendEncode(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + prefix []byte + input []byte + }{ + {"empty_prefix", nil, []byte("test")}, + {"with_prefix", []byte("prefix:"), []byte("data")}, + {"empty_input", []byte("prefix:"), []byte{}}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := ascii85Encoding.AppendEncode(tc.prefix, tc.input) + + // verify prefix is preserved + if tc.prefix != nil { + assert.Equal(t, string(tc.prefix), string(result[:len(tc.prefix)])) + } + + // extract encoded portion and decode + encodedPortion := result[len(tc.prefix):] + dst := make([]byte, 
len(tc.input)+4) + ndst, _, err := ascii85.Decode(dst, encodedPortion, true) + + require.NoError(t, err) + assert.Equal(t, tc.input, dst[:ndst]) + }) + } +} + +func TestNewEncoder(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input []byte + writes []int // chunk sizes for multiple writes + }{ + {"empty", []byte{}, nil}, + {"single_byte", []byte{0x01}, nil}, + {"four_bytes", []byte{0x01, 0x02, 0x03, 0x04}, nil}, + {"five_bytes", []byte{0x01, 0x02, 0x03, 0x04, 0x05}, nil}, + {"hello_world", []byte("Hello, World!"), nil}, + {"chunked_write", []byte("Hello, World!"), []int{3, 5, 5}}, + {"byte_by_byte", []byte("test"), []int{1, 1, 1, 1}}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + var buf bytes.Buffer + enc := NewEncoder(ascii85Encoding, &buf) + + if tc.writes == nil { + // single write + n, err := enc.Write(tc.input) + require.NoError(t, err) + assert.Equal(t, len(tc.input), n) + } else { + // multiple writes + offset := 0 + for _, size := range tc.writes { + end := offset + size + if end > len(tc.input) { + end = len(tc.input) + } + n, err := enc.Write(tc.input[offset:end]) + require.NoError(t, err) + assert.Equal(t, end-offset, n) + offset = end + } + } + + require.NoError(t, enc.Close()) + + // verify via standard library decode + encoded := buf.Bytes() + dst := make([]byte, len(tc.input)+4) + ndst, _, err := ascii85.Decode(dst, encoded, true) + + require.NoError(t, err) + assert.Equal(t, tc.input, dst[:ndst]) + }) + } +} diff --git a/base85/encoding.go b/base85/encoding.go new file mode 100644 index 0000000..7b1bd31 --- /dev/null +++ b/base85/encoding.go @@ -0,0 +1,548 @@ +package base85 + +import ( + "io" + "runtime" + "strconv" +) + +const ( + // NoPadding is used with WithPadding to disable padding. This is the default for base85 encodings. 
const (
	// NoPadding is used with WithPadding to disable padding. This is the
	// default for base85 encodings.
	NoPadding rune = -1

	alphabetSize = 85

	encodeRFC1924 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"

	// powers of 85 for decoding
	pow85_1 = 85
	pow85_2 = 85 * 85
	pow85_3 = 85 * 85 * 85
	pow85_4 = 85 * 85 * 85 * 85

	// maxGroupValue is the largest value a decoded 5-digit group may hold.
	// A group represents a big-endian uint32, but 85^5-1 exceeds 2^32-1, so
	// some digit combinations are unrepresentable and must be rejected as
	// corrupt input (encoding/ascii85 rejects these the same way).
	maxGroupValue = 1<<32 - 1
)

// RFC1924 is the encoding defined in RFC 1924 for compact representation of
// IPv6 addresses. It uses characters 0-9, A-Z, a-z, and 23 punctuation symbols,
// with no padding by default.
var RFC1924 = NewEncoding(encodeRFC1924)

// Encoding represents a base85 encoding/decoding scheme defined by an 85-character alphabet.
type Encoding struct {
	encode    [alphabetSize]byte
	decodeMap [256]uint8 // digit value per input byte; 0xFF marks bytes outside the alphabet
	padChar   rune
}

// NewEncoding returns a new Encoding defined by the given 85-character
// alphabet, which must contain only unique ASCII characters and must not
// contain newline characters ('\r', '\n'). The resulting Encoding uses no
// padding by default.
func NewEncoding(encoder string) *Encoding {
	if len(encoder) != alphabetSize {
		panic("base85: encoding alphabet must be 85 bytes long")
	}

	e := &Encoding{padChar: NoPadding}
	copy(e.encode[:], encoder)

	for i := range e.decodeMap {
		e.decodeMap[i] = 0xFF
	}

	for i, c := range encoder {
		if c > 127 {
			panic("base85: encoding alphabet must contain only ASCII characters")
		} else if c == '\n' || c == '\r' {
			panic("base85: encoding alphabet contains newline character")
		} else if e.decodeMap[c] != 0xFF {
			panic("base85: encoding alphabet contains duplicate character")
		}
		e.decodeMap[c] = uint8(i)
	}

	return e
}

// WithPadding creates a new Encoding identical to enc except with a specified
// padding character, or NoPadding to disable padding. The padding character
// must be an ASCII character, must not be '\r' or '\n', and must not be
// contained in the encoding alphabet.
func (enc Encoding) WithPadding(padding rune) *Encoding {
	if padding == '\r' || padding == '\n' {
		panic("base85: invalid padding character")
	}

	if padding != NoPadding {
		if padding > 127 {
			panic("base85: padding character must be ASCII")
		}
		for _, c := range enc.encode {
			if rune(c) == padding {
				panic("base85: padding character is in alphabet")
			}
		}
	}

	// enc is a value receiver, so this mutation does not affect the original.
	enc.padChar = padding
	return &enc
}

// EncodedLen returns the length in bytes of the base85 encoding of an input buffer of length n.
func (enc *Encoding) EncodedLen(n int) int {
	if enc.padChar == NoPadding {
		// 4 bytes -> 5 chars, partial blocks: 1->2, 2->3, 3->4
		fullBlocks := n / 4
		remainder := n % 4
		length := fullBlocks * 5
		if remainder > 0 {
			length += remainder + 1
		}
		return length
	}
	// with padding: always multiple of 5
	return (n + 3) / 4 * 5
}

// DecodedLen returns the maximum length in bytes of the decoded data corresponding to n bytes of base85-encoded data.
func (enc *Encoding) DecodedLen(n int) int {
	// 5 chars -> 4 bytes, partial: 2->1, 3->2, 4->3
	fullBlocks := n / 5
	remainder := n % 5
	length := fullBlocks * 4
	if remainder > 1 {
		length += remainder - 1
	}
	return length
}

// Encode encodes src using the encoding enc, writing EncodedLen(len(src)) bytes to dst.
func (enc *Encoding) Encode(dst, src []byte) {
	if len(src) == 0 {
		return
	}

	di := 0
	for len(src) >= 4 {
		// big-endian uint32
		val := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])

		dst[di+4] = enc.encode[val%85]
		val /= 85
		dst[di+3] = enc.encode[val%85]
		val /= 85
		dst[di+2] = enc.encode[val%85]
		val /= 85
		dst[di+1] = enc.encode[val%85]
		val /= 85
		dst[di] = enc.encode[val%85]

		src = src[4:]
		di += 5
	}

	// handle remaining bytes
	if len(src) > 0 {
		var val uint32
		switch len(src) {
		case 3:
			val = uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8
		case 2:
			val = uint32(src[0])<<24 | uint32(src[1])<<16
		case 1:
			val = uint32(src[0]) << 24
		}

		// encode and output only needed characters
		var buf [5]byte
		buf[4] = enc.encode[val%85]
		val /= 85
		buf[3] = enc.encode[val%85]
		val /= 85
		buf[2] = enc.encode[val%85]
		val /= 85
		buf[1] = enc.encode[val%85]
		val /= 85
		buf[0] = enc.encode[val%85]

		// output chars: 1 byte -> 2 chars, 2 bytes -> 3 chars, 3 bytes -> 4 chars
		outLen := len(src) + 1
		copy(dst[di:], buf[:outLen])
		di += outLen

		// add padding if needed
		if enc.padChar != NoPadding {
			for i := outLen; i < 5; i++ {
				dst[di] = byte(enc.padChar)
				di++
			}
		}
	}
}

// EncodeToString returns the base85 encoding of src.
func (enc *Encoding) EncodeToString(src []byte) string {
	buf := make([]byte, enc.EncodedLen(len(src)))
	enc.Encode(buf, src)
	return string(buf)
}

// AppendEncode appends the base85 encoding of src to dst and returns the extended buffer.
func (enc *Encoding) AppendEncode(dst, src []byte) []byte {
	n := enc.EncodedLen(len(src))
	dst = append(dst, make([]byte, n)...)
	enc.Encode(dst[len(dst)-n:], src)
	return dst
}

// Decode decodes src using the encoding enc. It writes at most DecodedLen(len(src)) bytes
// to dst and returns the number of bytes written. Whitespace (space, tab, CR, LF) is ignored
// unless included in the encoding alphabet. If src contains invalid base85 data, it will
// return the number of bytes successfully written and CorruptInputError.
func (enc *Encoding) Decode(dst, src []byte) (n int, err error) {
	// filter whitespace (if not in alphabet) and validate
	// use lazy allocation: only create filtered slice when needed
	var filtered []byte
	for i, c := range src {
		if enc.padChar != NoPadding && rune(c) == enc.padChar {
			if filtered != nil {
				filtered = append(filtered, c)
			}
			continue
		} else if (c == ' ' || c == '\t' || c == '\n' || c == '\r') && enc.decodeMap[c] == 0xFF {
			if filtered == nil {
				filtered = make([]byte, i, len(src))
				copy(filtered, src[:i])
			}
			continue
		} else if enc.decodeMap[c] == 0xFF {
			return 0, CorruptInputError(i)
		} else if filtered != nil {
			filtered = append(filtered, c)
		}
	}

	if filtered == nil {
		return enc.decodeFiltered(dst, src)
	}
	return enc.decodeFiltered(dst, filtered)
}

// decodeBlock decodes 2-5 base85 alphabet bytes into 1-4 output bytes.
// It returns the number of bytes written to dst and whether the group was
// valid. A group is invalid when its digits encode a value larger than
// 2^32-1 (possible because 85^5-1 > 2^32-1); such input is corrupt and the
// caller must report an error. Caller must ensure all bytes in src are valid
// alphabet characters (not padding, not invalid).
func (enc *Encoding) decodeBlock(dst, src []byte) (int, bool) {
	// initialize with highest alphabet index (84) for implicit padding;
	// this rounds partial groups up, mirroring how Encode truncates the
	// low characters of a partial group.
	d0, d1, d2, d3, d4 := uint64(84), uint64(84), uint64(84), uint64(84), uint64(84)

	// map input bytes to digit values
	switch len(src) {
	case 5:
		d4 = uint64(enc.decodeMap[src[4]])
		fallthrough
	case 4:
		d3 = uint64(enc.decodeMap[src[3]])
		fallthrough
	case 3:
		d2 = uint64(enc.decodeMap[src[2]])
		fallthrough
	case 2:
		d1 = uint64(enc.decodeMap[src[1]])
		d0 = uint64(enc.decodeMap[src[0]])
	}

	// accumulate in uint64 so out-of-range groups are detected instead of
	// silently wrapping (valid encoder output, including implicitly padded
	// partial groups, never exceeds maxGroupValue).
	val64 := d0*pow85_4 + d1*pow85_3 + d2*pow85_2 + d3*pow85_1 + d4
	if val64 > maxGroupValue {
		return 0, false
	}
	val := uint32(val64)

	// output length: 5 chars -> 4 bytes, otherwise len-1
	// only write the bytes we actually produce
	switch len(src) {
	case 5:
		dst[3] = byte(val)
		fallthrough
	case 4:
		dst[2] = byte(val >> 8)
		fallthrough
	case 3:
		dst[1] = byte(val >> 16)
		fallthrough
	case 2:
		dst[0] = byte(val >> 24)
	}

	if len(src) == 5 {
		return 4, true
	}
	return len(src) - 1, true
}

// decodeFiltered decodes pre-validated and filtered input (whitespace removed,
// but padding chars still present if padding is enabled).
func (enc *Encoding) decodeFiltered(dst, src []byte) (n int, err error) {
	if len(src) == 0 {
		return 0, nil
	}

	consumed := 0
	for len(src) >= 5 {
		// find data length in this block (may end early due to padding)
		dataLen := 5
		if enc.padChar != NoPadding {
			for i := 0; i < 5; i++ {
				if rune(src[i]) == enc.padChar {
					dataLen = i
					break
				}
			}
			// validate: all chars after first padding must also be padding
			for i := dataLen; i < 5; i++ {
				if rune(src[i]) != enc.padChar {
					return n, CorruptInputError(consumed + i)
				}
			}
			// need at least 2 data chars to produce 1 byte
			if dataLen < 2 {
				return n, CorruptInputError(consumed + dataLen)
			}
		}

		// validate all data chars are in alphabet
		for i := 0; i < dataLen; i++ {
			if enc.decodeMap[src[i]] == 0xFF {
				return n, CorruptInputError(consumed + i)
			}
		}

		nw, ok := enc.decodeBlock(dst[n:], src[:dataLen])
		if !ok {
			// group value exceeds 32 bits: corrupt input at this block
			return n, CorruptInputError(consumed)
		}
		n += nw
		src = src[5:]
		consumed += 5
	}

	// handle remaining 1-4 chars (only valid when padding is disabled or no padding present)
	if len(src) > 0 {
		if len(src) == 1 {
			return n, CorruptInputError(consumed)
		}
		// remaining chars cannot contain padding
		for i, c := range src {
			if enc.padChar != NoPadding && rune(c) == enc.padChar {
				return n, CorruptInputError(consumed + i)
			}
			if enc.decodeMap[c] == 0xFF {
				return n, CorruptInputError(consumed + i)
			}
		}
		nw, ok := enc.decodeBlock(dst[n:], src)
		if !ok {
			return n, CorruptInputError(consumed)
		}
		n += nw
	}

	return n, nil
}

// DecodeString returns the bytes represented by the base85 string s.
func (enc *Encoding) DecodeString(s string) ([]byte, error) {
	dst := make([]byte, enc.DecodedLen(len(s)))
	n, err := enc.Decode(dst, []byte(s))
	return dst[:n], err
}

// AppendDecode appends the base85 decoding of src to dst and returns the extended buffer.
// If the input is malformed, it returns the partially decoded src and an error.
func (enc *Encoding) AppendDecode(dst, src []byte) ([]byte, error) {
	n := enc.DecodedLen(len(src))
	dst = append(dst, make([]byte, n)...)
	written, err := enc.Decode(dst[len(dst)-n:], src)
	return dst[:len(dst)-n+written], err
}

// NewEncoder returns a new base85 stream encoder. Data written to the
// returned writer will be encoded using enc and then written to w.
// Base85 encodings operate in 4-byte blocks; when finished writing,
// the caller must Close the returned encoder to flush any partially written blocks.
func NewEncoder(enc *Encoding, w io.Writer) io.WriteCloser {
	return &encoder{enc: enc, w: w}
}

type encoder struct {
	enc    *Encoding
	w      io.Writer
	buf    [4]byte // partial input block carried across Write calls
	nbuf   int     // number of valid bytes in buf
	outBuf [5]byte // scratch space for one encoded block
	err    error   // sticky: first write error stops all further output
}

// Write encodes p, buffering any trailing partial 4-byte block until the
// next Write or Close. It reports the number of bytes of p consumed.
func (e *encoder) Write(p []byte) (n int, err error) {
	if e.err != nil {
		return 0, e.err
	}

	// use buffered data first
	if e.nbuf > 0 {
		for len(p) > 0 && e.nbuf < 4 {
			e.buf[e.nbuf] = p[0]
			e.nbuf++
			p = p[1:]
			n++
		}
		if e.nbuf == 4 {
			e.enc.Encode(e.outBuf[:], e.buf[:])
			if _, e.err = e.w.Write(e.outBuf[:]); e.err != nil {
				return n, e.err
			}
			e.nbuf = 0
		}
	}

	// encode full blocks
	for len(p) >= 4 {
		e.enc.Encode(e.outBuf[:], p[:4])
		if _, e.err = e.w.Write(e.outBuf[:]); e.err != nil {
			return n, e.err
		}
		p = p[4:]
		n += 4
	}

	// buffer remaining
	for len(p) > 0 {
		e.buf[e.nbuf] = p[0]
		e.nbuf++
		p = p[1:]
		n++
	}

	return n, nil
}

// Close flushes any buffered partial block. It does not close the
// underlying writer.
func (e *encoder) Close() error {
	if e.err != nil {
		return e.err
	}

	if e.nbuf > 0 {
		encoded := make([]byte, e.enc.EncodedLen(e.nbuf))
		e.enc.Encode(encoded, e.buf[:e.nbuf])
		if _, e.err = e.w.Write(encoded); e.err != nil {
			return e.err
		}
	}

	return nil
}

// NewDecoder constructs a new base85 stream decoder. Data read from the returned reader will be decoded using enc.
func NewDecoder(enc *Encoding, r io.Reader) io.Reader {
	return &decoder{enc: enc, r: r}
}

type decoder struct {
	enc     *Encoding
	r       io.Reader
	readBuf [1024]byte
	encBuf  [4]byte // buffer for incomplete encoded blocks (max 4 chars waiting for 5th)
	nenc    int     // number of valid bytes in encBuf
	outBuf  []byte  // buffered decoded output
	off     int64   // total raw bytes consumed from r, for CorruptInputError offsets
	err     error
	eof     bool
}

func (d *decoder) Read(p []byte) (n int, err error) {
	// return buffered decoded data first
	if len(d.outBuf) > 0 {
		n = copy(p, d.outBuf)
		d.outBuf = d.outBuf[n:]
		return n, nil
	} else if d.err != nil {
		return 0, d.err
	} else if d.eof {
		return 0, io.EOF
	}

	// loop until we have data to return or hit EOF/error
	for {
		// read more encoded data
		nr, readErr := d.r.Read(d.readBuf[:])
		if readErr != nil && readErr != io.EOF {
			// store error but process any data that was read
			d.err = readErr
			d.eof = true // treat terminal error as end of stream for decoding
			if nr == 0 {
				return 0, readErr
			}
		} else if readErr == io.EOF {
			d.eof = true
		}
		if nr == 0 && !d.eof && d.err == nil {
			// reader returned (0, nil), yield and retry
			runtime.Gosched()
			continue
		}

		// filter whitespace and padding, combine with buffered encoded data
		filtered := make([]byte, 0, d.nenc+nr)
		filtered = append(filtered, d.encBuf[:d.nenc]...)
		d.nenc = 0
		for i, c := range d.readBuf[:nr] {
			if d.enc.padChar != NoPadding && rune(c) == d.enc.padChar {
				filtered = append(filtered, c)
				continue
			} else if (c == ' ' || c == '\t' || c == '\n' || c == '\r') && d.enc.decodeMap[c] == 0xFF {
				continue
			} else if d.enc.decodeMap[c] == 0xFF {
				// report the offset within the whole stream, not this read
				d.err = CorruptInputError(d.off + int64(i))
				return 0, d.err
			}
			filtered = append(filtered, c)
		}
		d.off += int64(nr)

		// if not at EOF, buffer incomplete block for next read
		if !d.eof {
			remainder := len(filtered) % 5
			if remainder > 0 {
				d.nenc = copy(d.encBuf[:], filtered[len(filtered)-remainder:])
				filtered = filtered[:len(filtered)-remainder]
			}
		}

		if len(filtered) == 0 {
			if d.eof {
				if d.err == nil {
					d.err = io.EOF
				}
				return 0, d.err
			} else if d.err != nil {
				return 0, d.err
			}
			continue // need more data
		}

		// decode the filtered data
		decoded := make([]byte, d.enc.DecodedLen(len(filtered)))
		nd, decErr := d.enc.decodeFiltered(decoded, filtered)
		if decErr != nil {
			d.err = decErr
			// still return what we decoded
			n = copy(p, decoded[:nd])
			if n < nd {
				d.outBuf = decoded[n:nd]
			}
			if n > 0 {
				return n, nil // defer error until buffer drained
			}
			return 0, d.err
		}

		// copy to output
		n = copy(p, decoded[:nd])
		if n < nd {
			d.outBuf = decoded[n:nd]
		}
		return n, nil
	}
}

// CorruptInputError is returned by Decode when the input contains invalid base85 data.
// The integer value represents the byte offset where the error was detected.
type CorruptInputError int64

func (e CorruptInputError) Error() string {
	return "base85: illegal character at offset " + strconv.FormatInt(int64(e), 10)
}
since '=' is in the RFC1924 alphabet + paddedRFC1924 := RFC1924.WithPadding('.') + + tests := []struct { + name string + input []byte + }{ + {"empty", []byte{}}, + {"single_byte", []byte{0x42}}, + {"two_bytes", []byte{0x01, 0x02}}, + {"three_bytes", []byte{0x01, 0x02, 0x03}}, + {"four_bytes", []byte{0x01, 0x02, 0x03, 0x04}}, + {"five_bytes", []byte{0x01, 0x02, 0x03, 0x04, 0x05}}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + encoded := paddedRFC1924.EncodeToString(tc.input) + + // padded output length is always a multiple of 5 + if len(tc.input) > 0 { + assert.Equal(t, 0, len(encoded)%5) + } + + decoded, err := paddedRFC1924.DecodeString(encoded) + + require.NoError(t, err) + assert.Equal(t, tc.input, decoded) + }) + } +} + +func TestNewEncoding(t *testing.T) { + t.Parallel() + + t.Run("valid_alphabet", func(t *testing.T) { + enc := NewEncoding(encodeRFC1924) + require.NotNil(t, enc) + assert.Equal(t, NoPadding, enc.padChar) + }) + + t.Run("wrong_length", func(t *testing.T) { + assert.Panics(t, func() { NewEncoding("short") }) + }) + + t.Run("duplicate_char", func(t *testing.T) { + // 85 chars with duplicate + alphabet := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}0" + assert.Panics(t, func() { NewEncoding(alphabet) }) + }) + + t.Run("contains_newline", func(t *testing.T) { + alphabet := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}\n" + assert.Panics(t, func() { NewEncoding(alphabet) }) + }) + + t.Run("contains_cr", func(t *testing.T) { + alphabet := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}\r" + assert.Panics(t, func() { NewEncoding(alphabet) }) + }) + + t.Run("contains_non_ascii", func(t *testing.T) { + // 85 chars with a non-ASCII character (é = 0xC3 0xA9 in UTF-8) + alphabet := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}é" + assert.Panics(t, func() { 
NewEncoding(alphabet) }) + }) + + t.Run("contains_high_byte", func(t *testing.T) { + // 85 bytes with a high byte (0x80+) that's not valid ASCII + alphabet := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}\x80" + assert.Panics(t, func() { NewEncoding(alphabet) }) + }) +} + +func TestWithPadding(t *testing.T) { + t.Parallel() + + t.Run("valid_padding", func(t *testing.T) { + // use '.' since '=' is in the RFC1924 alphabet + enc := RFC1924.WithPadding('.') + require.NotNil(t, enc) + assert.Equal(t, '.', enc.padChar) + }) + + t.Run("no_padding", func(t *testing.T) { + enc := RFC1924.WithPadding(NoPadding) + require.NotNil(t, enc) + assert.Equal(t, NoPadding, enc.padChar) + }) + + t.Run("newline_padding", func(t *testing.T) { + assert.Panics(t, func() { RFC1924.WithPadding('\n') }) + }) + + t.Run("cr_padding", func(t *testing.T) { + assert.Panics(t, func() { RFC1924.WithPadding('\r') }) + }) + + t.Run("padding_in_alphabet", func(t *testing.T) { + assert.Panics(t, func() { + RFC1924.WithPadding('0') // '0' is in RFC1924 alphabet + }) + }) + + t.Run("non_ascii_padding", func(t *testing.T) { + assert.Panics(t, func() { + RFC1924.WithPadding('é') // non-ASCII character + }) + }) +} + +func TestEncodedLen(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + enc *Encoding + inputLen int + expected int + }{ + // no padding: 4 bytes -> 5 chars, partial: 1->2, 2->3, 3->4 + {"no_pad_zero", RFC1924, 0, 0}, + {"no_pad_one", RFC1924, 1, 2}, + {"no_pad_two", RFC1924, 2, 3}, + {"no_pad_three", RFC1924, 3, 4}, + {"no_pad_four", RFC1924, 4, 5}, + {"no_pad_five", RFC1924, 5, 7}, // 5 + 2 + {"no_pad_eight", RFC1924, 8, 10}, // 10 + + // with padding: always multiple of 5 + {"pad_zero", RFC1924.WithPadding('.'), 0, 0}, + {"pad_one", RFC1924.WithPadding('.'), 1, 5}, + {"pad_two", RFC1924.WithPadding('.'), 2, 5}, + {"pad_three", RFC1924.WithPadding('.'), 3, 5}, + {"pad_four", RFC1924.WithPadding('.'), 4, 5}, + {"pad_five", 
RFC1924.WithPadding('.'), 5, 10}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, tc.enc.EncodedLen(tc.inputLen)) + }) + } +} + +func TestDecodedLen(t *testing.T) { + t.Parallel() + + // DecodedLen returns max possible decoded bytes + // 5 chars -> 4 bytes, partial: 2->1, 3->2, 4->3, 1->0 + tests := []struct { + name string + encodedLen int + expected int + }{ + {"zero", 0, 0}, + {"one", 1, 0}, // 1 char can't decode to anything + {"two", 2, 1}, // 2 chars -> 1 byte + {"three", 3, 2}, // 3 chars -> 2 bytes + {"four", 4, 3}, // 4 chars -> 3 bytes + {"five", 5, 4}, // 5 chars -> 4 bytes + {"six", 6, 4}, // 5 + 1 -> 4 + 0 + {"seven", 7, 5}, // 5 + 2 -> 4 + 1 + {"ten", 10, 8}, // 2 full blocks + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.expected, RFC1924.DecodedLen(tc.encodedLen)) + }) + } +} + +func TestCorruptInputError(t *testing.T) { + t.Parallel() + + err := CorruptInputError(42) + errStr := err.Error() + + assert.Contains(t, errStr, "42") +} + +func TestStreamEncoder(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + chunks [][]byte + }{ + {"single_chunk", [][]byte{[]byte("Hello, World!")}}, + {"multiple_chunks", [][]byte{[]byte("Hello"), []byte(", "), []byte("World!")}}, + {"byte_by_byte", [][]byte{{0x01}, {0x02}, {0x03}, {0x04}, {0x05}}}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // combine chunks for expected result + var combined []byte + for _, chunk := range tc.chunks { + combined = append(combined, chunk...) 
+ } + + // encode via stream + var buf bytes.Buffer + encoder := NewEncoder(RFC1924, &buf) + for _, chunk := range tc.chunks { + n, err := encoder.Write(chunk) + require.NoError(t, err) + assert.Len(t, chunk, n) + } + err := encoder.Close() + require.NoError(t, err) + + // decode and verify + decoded, err := RFC1924.DecodeString(buf.String()) + require.NoError(t, err) + assert.Equal(t, combined, decoded) + }) + } +} + +func TestStreamDecoder(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input []byte + }{ + {"empty", []byte{}}, + {"simple", []byte("Hello, World!")}, + {"binary", []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}}, + {"longer_data", bytes.Repeat([]byte("test"), 100)}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // encode first + encoded := RFC1924.EncodeToString(tc.input) + + // decode via stream + decoder := NewDecoder(RFC1924, bytes.NewReader([]byte(encoded))) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, tc.input, decoded) + }) + } +} + +func TestEncodeDirectBuffer(t *testing.T) { + t.Parallel() + + input := []byte("test data") + dst := make([]byte, RFC1924.EncodedLen(len(input))) + + RFC1924.Encode(dst, input) + + decoded, err := RFC1924.DecodeString(string(dst)) + require.NoError(t, err) + assert.Equal(t, input, decoded) +} + +func TestDecodeDirectBuffer(t *testing.T) { + t.Parallel() + + input := []byte("test data") + encoded := RFC1924.EncodeToString(input) + + dst := make([]byte, RFC1924.DecodedLen(len(encoded))) + n, err := RFC1924.Decode(dst, []byte(encoded)) + + require.NoError(t, err) + assert.Len(t, input, n) + assert.Equal(t, input, dst[:n]) +} + +func TestRFC1924AppendEncode(t *testing.T) { + t.Parallel() + + t.Run("append_to_empty", func(t *testing.T) { + input := []byte("test") + result := RFC1924.AppendEncode(nil, input) + + expected := RFC1924.EncodeToString(input) + assert.Equal(t, expected, string(result)) + }) + + 
t.Run("append_to_existing", func(t *testing.T) { + prefix := []byte("prefix:") + input := []byte("test") + result := RFC1924.AppendEncode(prefix, input) + + expected := "prefix:" + RFC1924.EncodeToString(input) + assert.Equal(t, expected, string(result)) + }) + + t.Run("append_empty_input", func(t *testing.T) { + prefix := []byte("prefix:") + result := RFC1924.AppendEncode(prefix, nil) + + assert.Equal(t, "prefix:", string(result)) + }) + + t.Run("preserves_original_capacity", func(t *testing.T) { + prefix := make([]byte, 0, 100) + prefix = append(prefix, "start:"...) + input := []byte("data") + + result := RFC1924.AppendEncode(prefix, input) + + // verify original slice not modified + assert.Len(t, prefix, 6) + assert.Greater(t, len(result), len(prefix)) + }) +} + +func TestRFC1924AppendDecode(t *testing.T) { + t.Parallel() + + t.Run("append_to_empty", func(t *testing.T) { + input := []byte("test") + encoded := RFC1924.EncodeToString(input) + + result, err := RFC1924.AppendDecode(nil, []byte(encoded)) + + require.NoError(t, err) + assert.Equal(t, input, result) + }) + + t.Run("append_to_existing", func(t *testing.T) { + prefix := []byte("prefix:") + input := []byte("test") + encoded := RFC1924.EncodeToString(input) + + result, err := RFC1924.AppendDecode(prefix, []byte(encoded)) + + require.NoError(t, err) + expected := append([]byte("prefix:"), input...) 
+ assert.Equal(t, expected, result) + }) + + t.Run("append_empty_input", func(t *testing.T) { + prefix := []byte("prefix:") + result, err := RFC1924.AppendDecode(prefix, nil) + + require.NoError(t, err) + assert.Equal(t, "prefix:", string(result)) + }) + + t.Run("corrupt_input_returns_partial", func(t *testing.T) { + prefix := []byte("prefix:") + // valid block + invalid char + input := []byte("test") + encoded := RFC1924.EncodeToString(input) + "[[[" + + result, err := RFC1924.AppendDecode(prefix, []byte(encoded)) + + require.Error(t, err) + // should return prefix + successfully decoded data + assert.GreaterOrEqual(t, len(result), len(prefix)) + }) +} + +type errorWriter struct { + n int + err error +} + +func (w *errorWriter) Write(p []byte) (int, error) { + return w.n, w.err +} + +func TestStreamEncoderWriteError(t *testing.T) { + t.Parallel() + + t.Run("error_on_full_block", func(t *testing.T) { + w := &errorWriter{err: io.ErrShortWrite} + encoder := NewEncoder(RFC1924, w) + + _, err := encoder.Write([]byte("test")) + require.Error(t, err) + + // subsequent writes should fail + _, err = encoder.Write([]byte("more")) + require.Error(t, err) + }) + + t.Run("error_on_buffered_flush", func(t *testing.T) { + w := &errorWriter{err: io.ErrShortWrite} + encoder := NewEncoder(RFC1924, w) + + // write less than 4 bytes (buffers) + _, err := encoder.Write([]byte("ab")) + require.NoError(t, err) + + // write more to trigger flush + _, err = encoder.Write([]byte("cdef")) + require.Error(t, err) + }) + + t.Run("error_on_close", func(t *testing.T) { + w := &errorWriter{err: io.ErrShortWrite} + encoder := NewEncoder(RFC1924, w) + + // write less than 4 bytes + _, err := encoder.Write([]byte("ab")) + require.NoError(t, err) + + // close should fail when flushing remaining + err = encoder.Close() + require.Error(t, err) + + // close again should return same error + err = encoder.Close() + require.Error(t, err) + }) + + t.Run("close_without_write", func(t *testing.T) { + var buf 
bytes.Buffer + encoder := NewEncoder(RFC1924, &buf) + + err := encoder.Close() + require.NoError(t, err) + assert.Empty(t, buf.String()) + }) +} + +type errorReader struct { + data []byte + err error +} + +func (r *errorReader) Read(p []byte) (int, error) { + if len(r.data) == 0 { + return 0, r.err + } + n := copy(p, r.data) + r.data = r.data[n:] + if len(r.data) == 0 && r.err != nil { + return n, r.err + } + return n, nil +} + +// zeroReader returns (0, nil) a specified number of times before returning actual data. +type zeroReader struct { + data []byte + zeroReads int + chunkSize int + zerosDone int + betweenAll bool // if true, return zero read between every chunk +} + +func (r *zeroReader) Read(p []byte) (int, error) { + if len(r.data) == 0 { + return 0, io.EOF + } + // return (0, nil) the specified number of times at the start + if r.zerosDone < r.zeroReads { + r.zerosDone++ + return 0, nil + } + // optionally reset for next chunk + if r.betweenAll { + r.zerosDone = 0 + } + n := r.chunkSize + if n > len(r.data) { + n = len(r.data) + } + if n > len(p) { + n = len(p) + } + copy(p, r.data[:n]) + r.data = r.data[n:] + return n, nil +} + +// chunkReader splits data into chunks of specified size to test buffering behavior. +type chunkReader struct { + data []byte + chunkSize int +} + +func (r *chunkReader) Read(p []byte) (int, error) { + if len(r.data) == 0 { + return 0, io.EOF + } + n := r.chunkSize + if n > len(r.data) { + n = len(r.data) + } + if n > len(p) { + n = len(p) + } + copy(p, r.data[:n]) + r.data = r.data[n:] + return n, nil +} + +func TestStreamDecoderPartialBlockBuffering(t *testing.T) { + t.Parallel() + + // This test verifies that the decoder correctly handles reads that split + // in the middle of a 5-character encoded block. + input := []byte("Hello, World! 
This is a longer test string.") + encoded := RFC1924.EncodeToString(input) + + tests := []struct { + name string + chunkSize int + }{ + {"chunk_size_1", 1}, + {"chunk_size_3", 3}, + {"chunk_size_7", 7}, // splits mid-block (7 = 5 + 2) + {"chunk_size_11", 11}, // splits mid-block (11 = 10 + 1) + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + reader := &chunkReader{data: []byte(encoded), chunkSize: tc.chunkSize} + decoder := NewDecoder(RFC1924, reader) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, input, decoded) + }) + } +} + +// dataWithErrorReader returns all data with an error in a single Read call. +type dataWithErrorReader struct { + data []byte + err error + read bool +} + +func (r *dataWithErrorReader) Read(p []byte) (int, error) { + if r.read { + return 0, r.err + } + r.read = true + n := copy(p, r.data) + return n, r.err +} + +// whitespaceOnlyReader returns only whitespace, then an error. +type whitespaceOnlyReader struct { + returned bool + err error +} + +func (r *whitespaceOnlyReader) Read(p []byte) (int, error) { + if r.returned { + return 0, r.err + } + r.returned = true + // return whitespace that will be filtered out + data := []byte(" \t\n\r ") + n := copy(p, data) + return n, nil +} + +func TestStreamDecoderReadError(t *testing.T) { + t.Parallel() + + t.Run("read_error", func(t *testing.T) { + r := &errorReader{err: io.ErrUnexpectedEOF} + decoder := NewDecoder(RFC1924, r) + + buf := make([]byte, 100) + _, err := decoder.Read(buf) + require.Error(t, err) + + // subsequent reads should return same error + _, err = decoder.Read(buf) + require.Error(t, err) + }) + + t.Run("error_after_whitespace_only", func(t *testing.T) { + // reader returns only whitespace (filtered to empty) then error + r := &whitespaceOnlyReader{err: io.ErrUnexpectedEOF} + decoder := NewDecoder(RFC1924, r) + + buf := make([]byte, 100) + _, err := decoder.Read(buf) + require.ErrorIs(t, err, io.ErrUnexpectedEOF) + }) + 
+ t.Run("decode_error_with_partial_data_small_buffer", func(t *testing.T) { + // This tests the error path where some data is decoded before + // the error, and the output buffer is too small. Uses padding mode + // to trigger an error mid-stream after valid blocks are decoded. + paddedEnc := RFC1924.WithPadding('.') + + // Create input: valid padded block + block with non-contiguous padding + // Valid: 4 bytes = 5 chars (full block) + valid := paddedEnc.EncodeToString([]byte{0x01, 0x02, 0x03, 0x04}) + // Invalid: "AB.C." has non-contiguous padding (all chars valid, but padding broken) + // The 'C' is a valid alphabet char, so it passes stream filter but fails + // decodeFiltered's padding validation at line 297-299 + invalid := "AB.C." + + allData := valid + invalid + + // Use a reader that returns all data at once with EOF + decoder := NewDecoder(paddedEnc, bytes.NewReader([]byte(allData))) + + // Use small buffer to trigger buffering on decode error + buf := make([]byte, 2) + + n, err := decoder.Read(buf) + // Should decode 4 bytes from valid block before hitting error + // But buffer only holds 2, so should buffer the rest and defer error + require.Equal(t, 2, n) + require.NoError(t, err, "error should be deferred") + + // Continue reading to get buffered data + var result []byte + result = append(result, buf[:n]...) + for { + n, err = decoder.Read(buf) + if n > 0 { + result = append(result, buf[:n]...) 
+ } + if err != nil { + break + } + } + + // Should have gotten the 4 valid bytes + assert.Equal(t, []byte{0x01, 0x02, 0x03, 0x04}, result) + // And eventually hit the error + assert.Error(t, err) + }) + + t.Run("data_with_error", func(t *testing.T) { + // encode some data + input := []byte("Hello, World!") + encoded := RFC1924.EncodeToString(input) + + // reader returns all data with an error in single read + r := &dataWithErrorReader{data: []byte(encoded), err: io.ErrUnexpectedEOF} + decoder := NewDecoder(RFC1924, r) + + // should get data first despite error + decoded, err := io.ReadAll(decoder) + + // data should be decoded successfully + assert.Equal(t, input, decoded) + // error should be returned after data exhausted + assert.ErrorIs(t, err, io.ErrUnexpectedEOF) + }) + + t.Run("small_output_buffer", func(t *testing.T) { + encoded := RFC1924.EncodeToString([]byte("Hello, World!")) + decoder := NewDecoder(RFC1924, bytes.NewReader([]byte(encoded))) + + // read with very small buffer to trigger buffering + buf := make([]byte, 2) + var result []byte + for { + n, err := decoder.Read(buf) + result = append(result, buf[:n]...) 
+ if err == io.EOF { + break + } + require.NoError(t, err) + } + + assert.Equal(t, []byte("Hello, World!"), result) + }) + + t.Run("corrupt_input", func(t *testing.T) { + // use invalid characters for RFC1924 + decoder := NewDecoder(RFC1924, bytes.NewReader([]byte("[[["))) + + buf := make([]byte, 100) + _, err := decoder.Read(buf) + assert.Error(t, err) + }) + + t.Run("buffered_data_with_error", func(t *testing.T) { + // use input that encodes to complete 5-char blocks (8 bytes = 10 chars = 2 blocks) + input := []byte("12345678") + encoded := RFC1924.EncodeToString(input) + require.Len(t, encoded, 10) // verify our assumption + + // add a single trailing valid character - this creates an invalid 1-char partial block + // (minimum 2 chars needed to decode 1 byte) + encodedWithError := encoded + "0" + + // use a small read buffer to force outBuf buffering + decoder := NewDecoder(RFC1924, bytes.NewReader([]byte(encodedWithError))) + buf := make([]byte, 2) + + var result []byte + var lastErr error + for { + n, err := decoder.Read(buf) + if n > 0 { + result = append(result, buf[:n]...) + } + if err != nil { + lastErr = err + break + } + } + + // should have decoded the valid portion (8 bytes from 10 chars) + assert.Equal(t, input, result) + // should have received a corrupt input error for the trailing char + var corruptErr CorruptInputError + require.ErrorAs(t, lastErr, &corruptErr) + }) + + t.Run("error_with_small_buffer_returns_bytes", func(t *testing.T) { + // This test verifies that when a decode error occurs and the output + // buffer is smaller than decoded data, Read correctly returns the + // number of bytes written (not 0) and defers the error. 
+ input := []byte("12345678") // 8 bytes -> 10 encoded chars + encoded := RFC1924.EncodeToString(input) + + // append invalid trailing char to cause error after decoding valid data + encodedWithError := encoded + "0" + + decoder := NewDecoder(RFC1924, bytes.NewReader([]byte(encodedWithError))) + + // use buffer smaller than decoded output (8 bytes) to trigger buffering + buf := make([]byte, 3) + + // first read should return bytes, not error (error deferred) + n, err := decoder.Read(buf) + require.Equal(t, 3, n, "should return actual bytes written, not 0") + require.NoError(t, err, "error should be deferred until buffer drained") + require.Equal(t, input[:3], buf[:n]) + + // continue reading to drain buffer + var result []byte + result = append(result, buf[:n]...) + for { + n, err = decoder.Read(buf) + if n > 0 { + result = append(result, buf[:n]...) + } + if err != nil { + break + } + } + + // all valid data should be returned before error + assert.Equal(t, input, result) + var corruptErr CorruptInputError + assert.ErrorAs(t, err, &corruptErr) + }) +} + +func TestStreamDecoderZeroReads(t *testing.T) { + t.Parallel() + + input := []byte("Hello, World!") + encoded := RFC1924.EncodeToString(input) + + tests := []struct { + name string + zeroReads int + chunkSize int + betweenAll bool + }{ + {"zeros_at_start", 3, 10, false}, + {"zeros_between_chunks", 1, 5, true}, + {"many_zeros_at_start", 10, 20, false}, + {"single_byte_with_zeros", 2, 1, true}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + reader := &zeroReader{ + data: []byte(encoded), + zeroReads: tc.zeroReads, + chunkSize: tc.chunkSize, + betweenAll: tc.betweenAll, + } + decoder := NewDecoder(RFC1924, reader) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, input, decoded) + }) + } +} + +func TestPaddedDecoding(t *testing.T) { + t.Parallel() + + paddedEnc := RFC1924.WithPadding('.') + + t.Run("concatenated_padded_blocks", func(t *testing.T) { + // 
encode two 1-byte values separately with padding + encoded1 := paddedEnc.EncodeToString([]byte{0x42}) // 2 chars + 3 padding + encoded2 := paddedEnc.EncodeToString([]byte{0x43}) // 2 chars + 3 padding + + concatenated := encoded1 + encoded2 + + // should decode both blocks + decoded, err := paddedEnc.DecodeString(concatenated) + require.NoError(t, err) + assert.Equal(t, []byte{0x42, 0x43}, decoded) + }) + + t.Run("padded_blocks_with_whitespace", func(t *testing.T) { + encoded1 := paddedEnc.EncodeToString([]byte{0x42}) + encoded2 := paddedEnc.EncodeToString([]byte{0x43}) + + withWhitespace := encoded1 + " \t\n" + encoded2 + + decoded, err := paddedEnc.DecodeString(withWhitespace) + require.NoError(t, err) + assert.Equal(t, []byte{0x42, 0x43}, decoded) + }) + + t.Run("invalid_padding_positions", func(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input string + }{ + {"padding_at_start", "....."}, + {"padding_in_middle", "AB.CD"}, // padding not at end of block + {"non_contiguous", "AB.C."}, // padding chars separated by non-padding + {"single_padding_remainder", "."}, // 1-char remainder with padding + {"two_padding_remainder", "AB...X"}, // valid block + padding in remainder + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := paddedEnc.DecodeString(tc.input) + assert.Error(t, err) + }) + } + }) + + t.Run("invalid_char_in_padded_block", func(t *testing.T) { + // '[' is not in RFC1924 alphabet - test invalid char in data portion of block + _, err := paddedEnc.DecodeString("AB[..") + assert.Error(t, err) + }) + + t.Run("padding_in_trailing_bytes", func(t *testing.T) { + // valid 5-char block followed by padding in trailing portion + encoded := paddedEnc.EncodeToString([]byte{0x01, 0x02, 0x03, 0x04}) // 5 chars, no padding + _, err := paddedEnc.DecodeString(encoded + "A.") // trailing 2 chars with padding + assert.Error(t, err) + }) + + t.Run("invalid_char_in_trailing_bytes", func(t *testing.T) { + // valid 5-char 
block followed by invalid char in trailing portion + encoded := RFC1924.EncodeToString([]byte{0x01, 0x02, 0x03, 0x04}) // 5 chars + _, err := RFC1924.DecodeString(encoded + "A[") // '[' is invalid + assert.Error(t, err) + }) + + t.Run("stream_concatenated_blocks", func(t *testing.T) { + encoded1 := paddedEnc.EncodeToString([]byte{0x42}) + encoded2 := paddedEnc.EncodeToString([]byte{0x43}) + concatenated := encoded1 + encoded2 + + decoder := NewDecoder(paddedEnc, bytes.NewReader([]byte(concatenated))) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, []byte{0x42, 0x43}, decoded) + }) + + t.Run("stream_padding_split_across_reads", func(t *testing.T) { + encoded := paddedEnc.EncodeToString([]byte{0x42, 0x43}) // 2 bytes -> 3 chars + 2 padding + + // split in middle of padding region + reader := &chunkReader{data: []byte(encoded), chunkSize: 4} + decoder := NewDecoder(paddedEnc, reader) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, []byte{0x42, 0x43}, decoded) + }) + + t.Run("stream_with_whitespace_between_padded", func(t *testing.T) { + encoded1 := paddedEnc.EncodeToString([]byte{0x42}) + encoded2 := paddedEnc.EncodeToString([]byte{0x43}) + withWhitespace := encoded1 + " \n " + encoded2 + + decoder := NewDecoder(paddedEnc, bytes.NewReader([]byte(withWhitespace))) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, []byte{0x42, 0x43}, decoded) + }) +} + +func TestDecodeWhitespaceAlphabetAware(t *testing.T) { + t.Parallel() + + // create encoding with space in the alphabet (replaces '~' which is the last char in RFC1924) + alphabetWithSpace := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|} " + encWithSpace := NewEncoding(alphabetWithSpace) + + // create encoding with tab in the alphabet (replaces '~') + alphabetWithTab := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}\t" + encWithTab := 
NewEncoding(alphabetWithTab) + + t.Run("space_in_alphabet_not_skipped", func(t *testing.T) { + // space is at alphabet position 84 (value 84) + // create a 5-char encoded block where the last char is space (value 84) + // value % 85 = 84 means value = 84, 84+85=169, 84+85*2=254, etc. + // for simplicity, encode a full block where value 84 appears + // we'll construct a known encoded string with space and decode it + + // "00000" decodes to: 0*85^4 + 0*85^3 + 0*85^2 + 0*85 + 0 = 0 -> [0,0,0,0] + // "0000 " (with space = 84) decodes to: 0 + 0 + 0 + 0 + 84 = 84 -> different bytes + + // first verify that space in encoded data produces different result than without space + decoded1, err := encWithSpace.DecodeString("00000") + require.NoError(t, err) + + decoded2, err := encWithSpace.DecodeString("0000 ") // space = value 84 + require.NoError(t, err) + + assert.NotEqual(t, decoded1, decoded2) // space in alphabet should decode differently than '0' + + // round-trip with data that encodes to include spaces + input := []byte{0x00, 0x00, 0x00, 0x54} // 0x54 = 84, should encode with space + encoded := encWithSpace.EncodeToString(input) + decoded, err := encWithSpace.DecodeString(encoded) + require.NoError(t, err) + assert.Equal(t, input, decoded) + }) + + t.Run("tab_in_alphabet_not_skipped", func(t *testing.T) { + // similar test with tab + decoded1, err := encWithTab.DecodeString("00000") + require.NoError(t, err) + + decoded2, err := encWithTab.DecodeString("0000\t") // tab = value 84 + require.NoError(t, err) + + assert.NotEqual(t, decoded1, decoded2) // tab in alphabet should decode differently than '0' + + // round-trip + input := []byte{0x00, 0x00, 0x00, 0x54} + encoded := encWithTab.EncodeToString(input) + decoded, err := encWithTab.DecodeString(encoded) + require.NoError(t, err) + assert.Equal(t, input, decoded) + }) + + t.Run("space_not_in_alphabet_skipped", func(t *testing.T) { + // RFC1924 does not include space, so spaces should be ignored + input := 
[]byte{0x01, 0x02, 0x03, 0x04} + encoded := RFC1924.EncodeToString(input) + + // insert spaces between each character + var sb strings.Builder + for i, c := range encoded { + if i > 0 { + sb.WriteByte(' ') + } + sb.WriteRune(c) + } + + decoded, err := RFC1924.DecodeString(sb.String()) + require.NoError(t, err) + assert.Equal(t, input, decoded) // space not in alphabet should be skipped + }) + + t.Run("tab_not_in_alphabet_skipped", func(t *testing.T) { + input := []byte{0x01, 0x02, 0x03, 0x04} + encoded := RFC1924.EncodeToString(input) + + // insert tabs + withTabs := encoded[:2] + "\t\t" + encoded[2:] + decoded, err := RFC1924.DecodeString(withTabs) + require.NoError(t, err) + assert.Equal(t, input, decoded) // tab not in alphabet should be skipped + }) + + t.Run("stream_decoder_space_in_alphabet", func(t *testing.T) { + input := []byte("test data for stream") + encoded := encWithSpace.EncodeToString(input) + + decoder := NewDecoder(encWithSpace, bytes.NewReader([]byte(encoded))) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, input, decoded) + }) + + t.Run("stream_decoder_whitespace_skipped", func(t *testing.T) { + input := []byte("test data") + encoded := RFC1924.EncodeToString(input) + + // insert whitespace between blocks (every 5 chars) + var sb strings.Builder + for i, c := range encoded { + sb.WriteRune(c) + if (i+1)%5 == 0 { + sb.WriteString(" \t\n\r") + } + } + + decoder := NewDecoder(RFC1924, bytes.NewReader([]byte(sb.String()))) + decoded, err := io.ReadAll(decoder) + + require.NoError(t, err) + assert.Equal(t, input, decoded) + }) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..cb4d688 --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module github.com/go-analyze/encoding + +go 1.18 + +require github.com/stretchr/testify v1.11.1 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum 
b/go.sum new file mode 100644 index 0000000..c4c1710 --- /dev/null +++ b/go.sum @@ -0,0 +1,10 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= From ae72589d0de2888a0b6ec4a11e0961c28bd8c4e0 Mon Sep 17 00:00:00 2001 From: Mike Jensen Date: Sun, 28 Dec 2025 17:56:45 -0700 Subject: [PATCH 2/6] ci: Add PR and Merge Test + Lint validation --- .github/workflows/codeql.yml | 48 +++++++++++++++++++++++ .github/workflows/tests-main.yml | 13 +++++++ .github/workflows/tests-pr.yml | 12 ++++++ .github/workflows/tests-workflow.yml | 58 ++++++++++++++++++++++++++++ .golangci.yml | 50 ++++++++++++++++++++++++ 5 files changed, 181 insertions(+) create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/tests-main.yml create mode 100644 .github/workflows/tests-pr.yml create mode 100644 .github/workflows/tests-workflow.yml create mode 100644 .golangci.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..c32e446 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,48 @@ +name: "CodeQL" + +on: + push: + branches: [ "main" ] + pull_request: + +jobs: + analyze: + 
name: Analyze (${{ matrix.language }}) + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + permissions: + security-events: write + packages: read + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + language: [ 'go' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + cache: false + go-version-file: go.mod + if: ${{ matrix.language == 'go' }} + + - name: Initialize CodeQL + uses: github/codeql-action/init@v4 + with: + languages: ${{ matrix.language }} + timeout-minutes: 10 + + - name: Autobuild (${{ matrix.language }}) + uses: github/codeql-action/autobuild@v4 + timeout-minutes: 10 + + - name: Perform CodeQL Analysis (${{ matrix.language }}) + uses: github/codeql-action/analyze@v4 + with: + category: "/language:${{matrix.language}}" + timeout-minutes: 10 diff --git a/.github/workflows/tests-main.yml b/.github/workflows/tests-main.yml new file mode 100644 index 0000000..5e7d32a --- /dev/null +++ b/.github/workflows/tests-main.yml @@ -0,0 +1,13 @@ +name: Tests - Main Push + +on: + push: + branches: [ main ] + +permissions: + contents: read + +jobs: + call-reusable: + uses: ./.github/workflows/tests-workflow.yml + diff --git a/.github/workflows/tests-pr.yml b/.github/workflows/tests-pr.yml new file mode 100644 index 0000000..46d2f0f --- /dev/null +++ b/.github/workflows/tests-pr.yml @@ -0,0 +1,12 @@ +name: Tests - Pull Request + +on: + pull_request: + +permissions: + contents: read + +jobs: + call-reusable: + uses: ./.github/workflows/tests-workflow.yml + diff --git a/.github/workflows/tests-workflow.yml b/.github/workflows/tests-workflow.yml new file mode 100644 index 0000000..04a8a5e --- /dev/null +++ b/.github/workflows/tests-workflow.yml @@ -0,0 +1,58 @@ +name: Tests + +on: + workflow_call: + +permissions: + contents: read + +jobs: + lint-and-tidy: + name: Verify Linting + runs-on: ubuntu-latest + steps: + - uses: 
actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version: "1.25" + + - name: Set up golangci-lint + run: | + curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/v2.7.1/install.sh | sh -s -- -b $(go env GOPATH)/bin v2.7.1 + + - name: Run lint check + run: make lint + + - name: Run tidy check + run: | + go mod tidy + # Fail if go.mod or go.sum changed + git diff --exit-code go.mod go.sum + + test: + name: Verify Unit Tests + runs-on: ubuntu-latest + strategy: + matrix: + go: + - '1.25' + - '1.24' + - '1.23' + - '1.22' + - '1.21' + - '1.20' + - '1.19' + - '1.18' + steps: + - uses: actions/checkout@v6 + + - name: Set up Go ${{ matrix.go }} + uses: actions/setup-go@v6 + with: + go-version: ${{ matrix.go }} + + - name: Run unit tests + run: make test + diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..23b0d5c --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,50 @@ +version: "2" + +linters: + enable: + - asasalint + - asciicheck + - bidichk + - containedctx + - contextcheck + - decorder + - durationcheck + - errcheck + - errorlint + - exptostd + - fatcontext + - forbidigo + - gocheckcompilerdirectives + - gochecksumtype + - goconst + - godoclint + - gosmopolitan + - grouper + - iface + - importas + - mirror + - misspell + - musttag + - nilerr + - nilnil + - perfsprint + - prealloc + - reassign + - sloglint + - testifylint + - thelper + - unconvert + - wastedassign + - whitespace + settings: + goconst: + min-len: 4 + min-occurrences: 4 + +formatters: + enable: + - gofmt + - goimports + +run: + timeout: 600s From aa6477720add144e02d9f96212261d253253f8e5 Mon Sep 17 00:00:00 2001 From: Mike Jensen Date: Sun, 28 Dec 2025 17:57:05 -0700 Subject: [PATCH 3/6] chore: Configure Dependabot --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..442087c
--- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: "gomod" + directory: "/" + schedule: + interval: "weekly" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" From d65510e5a6043fd3bc0788f0f8a661278c08b101 Mon Sep 17 00:00:00 2001 From: Mike Jensen Date: Sun, 28 Dec 2025 17:57:17 -0700 Subject: [PATCH 4/6] docs: Update README.md to describe module and base85 functionality --- README.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e0b7327..35c99ee 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,53 @@ # encoding -Encoding utilities for data processing and storage + +Lightweight encoding packages for niche use cases not covered by Go's standard library. These packages are useful when you need compact encodings with specific character set requirements. + +The API closely mirrors Go's `encoding/base64` and related packages, making it easy to transition to standard implementations if they become available. Packages will be deprecated and removed if equivalent functionality is added to the standard library. + +## Installation + +```bash +go get github.com/go-analyze/encoding@latest +``` + +## base85 + +The standard library's `encoding/ascii85` package only supports the Adobe/btoa variant with a fixed alphabet. This package provides base85 encoding with support for custom alphabets via `NewEncoding()`, following the same pattern as `encoding/base64`. + +### RFC1924 + +RFC1924 defines a base85 encoding designed for compact representation of IPv6 addresses. It uses an 85-character alphabet consisting of `0-9`, `A-Z`, `a-z`, and 23 punctuation symbols, deliberately excluding characters that could cause parsing issues in various contexts (quotes, comma, period, slash, colon, brackets, and backslash). 
+ +```go +package main + +import ( + "fmt" + + "github.com/go-analyze/encoding/base85" +) + +func main() { + data := []byte("Hello, World!") + + // Encode + encoded := base85.RFC1924.EncodeToString(data) + fmt.Println(encoded) // NM&qnZ!92JZ*pv8Ap + + // Decode + decoded, err := base85.RFC1924.DecodeString(encoded) + if err != nil { + panic(err) + } + fmt.Println(string(decoded)) // Hello, World! +} +``` + +### Custom Alphabets + +Create encodings with custom 85-character alphabets: + +```go +enc := base85.NewEncoding("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~") +encoded := enc.EncodeToString(data) +``` From 921862098ce4fd535c06b840b3fac8893775a34d Mon Sep 17 00:00:00 2001 From: Mike Jensen Date: Sun, 28 Dec 2025 18:28:58 -0700 Subject: [PATCH 5/6] test: Add fuzz testing --- Makefile | 10 ++- base85/fuzz_test.go | 196 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 base85/fuzz_test.go diff --git a/Makefile b/Makefile index 6b56a26..68b76f4 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ export GO111MODULE=on -.PHONY: default test test-cover bench lint +.PHONY: default test test-cover test-fuzz bench lint test: @@ -9,6 +9,14 @@ test: test-cover: go test -race -coverprofile=test.out ./... && go tool cover --html=test.out +test-fuzz: + go test -fuzz='^FuzzEncode$$' -fuzztime=2m ./base85/... + go test -fuzz='^FuzzDecode$$' -fuzztime=2m ./base85/... + go test -fuzz='^FuzzEncodeWithPadding$$' -fuzztime=2m ./base85/... + go test -fuzz='^FuzzDecodeWithPadding$$' -fuzztime=2m ./base85/... + go test -fuzz='^FuzzStreamRoundTrip$$' -fuzztime=2m ./base85/... + go test -fuzz='^FuzzStreamRoundTripWithPadding$$' -fuzztime=2m ./base85/... 
+ bench: go test --benchmem -benchtime=10s -bench='Benchmark.*' -run='^$$' diff --git a/base85/fuzz_test.go b/base85/fuzz_test.go new file mode 100644 index 0000000..3bd56a0 --- /dev/null +++ b/base85/fuzz_test.go @@ -0,0 +1,196 @@ +package base85 + +import ( + "bytes" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func FuzzEncode(f *testing.F) { + f.Add([]byte{}) + f.Add([]byte{0}) + f.Add([]byte{255}) + f.Add([]byte("Hello, World!")) + f.Add([]byte("The quick brown fox jumps over the lazy dog")) + + // block boundary cases (base85 works in 4-byte blocks) + f.Add([]byte{1, 2, 3, 4}) // exact block + f.Add([]byte{1, 2, 3, 4, 5}) // one block + 1 + f.Add([]byte{1, 2, 3, 4, 5, 6}) // one block + 2 + f.Add([]byte{1, 2, 3, 4, 5, 6, 7}) // one block + 3 + + // partial blocks + f.Add([]byte{1}) + f.Add([]byte{1, 2}) + f.Add([]byte{1, 2, 3}) + + // all zeros and all ones + f.Add([]byte{0, 0, 0, 0}) + f.Add([]byte{255, 255, 255, 255}) + + // binary data patterns + f.Add([]byte{0x00, 0x7F, 0x80, 0xFF}) + f.Add([]byte{0xDE, 0xAD, 0xBE, 0xEF}) + + f.Fuzz(func(t *testing.T, data []byte) { + encoded := RFC1924.EncodeToString(data) // should never panic + + // round-trip should recover original data + decoded, err := RFC1924.DecodeString(encoded) + require.NoError(t, err) + assert.Equal(t, data, decoded) + }) +} + +func FuzzDecode(f *testing.F) { + f.Add([]byte{}) + + // valid RFC1924 encoded strings + f.Add([]byte("0")) // partial block (invalid - too short) + f.Add([]byte("00")) // minimal valid partial block + f.Add([]byte("0000000000")) // two full blocks worth + + // encoded "Hello" + f.Add([]byte("BOu!rDZ")) + + // whitespace handling + f.Add([]byte("BOu!r DZ")) + f.Add([]byte("BOu!r\nDZ")) + f.Add([]byte("BOu!r\tDZ")) + f.Add([]byte(" BOu!rDZ ")) + + // invalid characters (should return error, not panic) + f.Add([]byte("invalid\"chars")) + f.Add([]byte{0x00, 0x01, 0x02}) + f.Add([]byte{0xFF, 0xFE, 0xFD}) + + // boundary 
alphabet characters + f.Add([]byte("0")) // first alphabet char + f.Add([]byte("~")) // last alphabet char + f.Add([]byte("09AZaz!#$%&()")) // mixed alphabet chars + + f.Fuzz(func(t *testing.T, data []byte) { + _, _ = RFC1924.DecodeString(string(data)) // should never panic (errors are acceptable) + }) +} + +func FuzzEncodeWithPadding(f *testing.F) { + f.Add([]byte{}) + f.Add([]byte("test")) + f.Add([]byte("hello world")) + + // partial blocks that need padding + f.Add([]byte{1}) + f.Add([]byte{1, 2}) + f.Add([]byte{1, 2, 3}) + + // exact blocks (no padding needed) + f.Add([]byte{1, 2, 3, 4}) + f.Add([]byte{1, 2, 3, 4, 5, 6, 7, 8}) + + f.Fuzz(func(t *testing.T, data []byte) { + // use '.' as padding since '=' is in RFC1924 alphabet + enc := RFC1924.WithPadding('.') + + // encoding should never panic + encoded := enc.EncodeToString(data) + + // with padding, length should always be multiple of 5 + if len(data) > 0 && len(encoded)%5 != 0 { + t.Errorf("padded encoding length %d not multiple of 5", len(encoded)) + } + + // round-trip should recover original data + decoded, err := enc.DecodeString(encoded) + require.NoError(t, err) + assert.Equal(t, data, decoded) + }) +} + +func FuzzStreamRoundTrip(f *testing.F) { + f.Add([]byte{}) + f.Add([]byte{1}) + f.Add([]byte{1, 2, 3, 4}) + f.Add([]byte{1, 2, 3, 4, 5}) + f.Add(make([]byte, 100)) + f.Add([]byte("The quick brown fox jumps over the lazy dog")) + + f.Fuzz(func(t *testing.T, data []byte) { + // encode using stream encoder + var encoded bytes.Buffer + encoder := NewEncoder(RFC1924, &encoded) + _, err := encoder.Write(data) + require.NoError(t, err) + require.NoError(t, encoder.Close()) + + // decode using stream decoder + decoder := NewDecoder(RFC1924, &encoded) + var decoded bytes.Buffer + _, err = decoded.ReadFrom(decoder) + require.NoError(t, err) + assert.Equal(t, data, decoded.Bytes()) + }) +} + +func FuzzDecodeWithPadding(f *testing.F) { + f.Add([]byte{}) + + // use '.' 
as padding since '=' is in RFC1924 alphabet + enc := RFC1924.WithPadding('.') + + // valid padded encodings + f.Add([]byte("00...")) // 1 byte padded + f.Add([]byte("000..")) // 2 bytes padded + f.Add([]byte("0000.")) // 3 bytes padded + f.Add([]byte("00000")) // full block, no padding + f.Add([]byte("0000000000")) // two full blocks + f.Add([]byte("00000000..")) // one full + 2 bytes padded + f.Add([]byte("BOu!rDZ..")) // encoded "Hello" with padding + + // invalid padding patterns (should error, not panic) + f.Add([]byte(".....")) // all padding + f.Add([]byte("0....")) // only 1 data char + f.Add([]byte("00.0.")) // padding in middle + f.Add([]byte("...00")) // padding at start + + // mixed valid/invalid + f.Add([]byte("00000.....")) // full block + all padding block + f.Add([]byte{0x00, 0x01, '.', '.', '.'}) + + f.Fuzz(func(t *testing.T, data []byte) { + _, _ = enc.DecodeString(string(data)) // should never panic (errors are acceptable) + }) +} + +func FuzzStreamRoundTripWithPadding(f *testing.F) { + // various sizes to test buffering with padding + f.Add([]byte{}) + f.Add([]byte{1}) + f.Add([]byte{1, 2}) + f.Add([]byte{1, 2, 3}) + f.Add([]byte{1, 2, 3, 4}) + f.Add([]byte{1, 2, 3, 4, 5}) + f.Add(make([]byte, 100)) + f.Add([]byte("The quick brown fox jumps over the lazy dog")) + + f.Fuzz(func(t *testing.T, data []byte) { + // use '.' 
as padding since '=' is in RFC1924 alphabet + enc := RFC1924.WithPadding('.') + + // encode using stream encoder + var encoded bytes.Buffer + encoder := NewEncoder(enc, &encoded) + _, err := encoder.Write(data) + require.NoError(t, err) + require.NoError(t, encoder.Close()) + + // decode using stream decoder + decoder := NewDecoder(enc, &encoded) + var decoded bytes.Buffer + _, err = decoded.ReadFrom(decoder) + require.NoError(t, err) + assert.Equal(t, data, decoded.Bytes()) + }) +} From e7e5c8d21e3092814e842736d2c7df2092c28dc9 Mon Sep 17 00:00:00 2001 From: Mike Jensen Date: Sun, 28 Dec 2025 19:14:29 -0700 Subject: [PATCH 6/6] refactor: Added benchmarks with single pass refactor This improves performance, but overall it still lags behind ascii85, in part because of the need for the alphabet character map lookup vs being able to use the continuous range in ascii85. --- Makefile | 2 +- base85/bench_test.go | 49 ++++++++++++++++++++++ base85/encoding.go | 91 ++++++++++++++++++++++++++++--------- base85/encoding_test.go | 67 ++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 20 deletions(-) create mode 100644 base85/bench_test.go diff --git a/Makefile b/Makefile index 68b76f4..4bd4a9e 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ test-fuzz: go test -fuzz='^FuzzStreamRoundTripWithPadding$$' -fuzztime=2m ./base85/... bench: - go test --benchmem -benchtime=10s -bench='Benchmark.*' -run='^$$' + go test --benchmem -benchtime=10s -bench='Benchmark.*' -run='^$$' ./... lint: golangci-lint run --timeout=600s && go vet ./... diff --git a/base85/bench_test.go b/base85/bench_test.go new file mode 100644 index 0000000..58c6c57 --- /dev/null +++ b/base85/bench_test.go @@ -0,0 +1,49 @@ +package base85 + +import ( + "encoding/ascii85" + "testing" +) + +var benchData = []byte("The quick brown fox jumps over the lazy dog. 
0123456789!@#$%^&*()") + +func BenchmarkEncodeBase85(b *testing.B) { + dst := make([]byte, RFC1924.EncodedLen(len(benchData))) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + RFC1924.Encode(dst, benchData) + } +} + +func BenchmarkDecodeBase85(b *testing.B) { + encoded := RFC1924.EncodeToString(benchData) + src := []byte(encoded) + dst := make([]byte, RFC1924.DecodedLen(len(src))) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = RFC1924.Decode(dst, src) + } +} + +func BenchmarkEncodeAscii85(b *testing.B) { + dst := make([]byte, ascii85.MaxEncodedLen(len(benchData))) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + ascii85.Encode(dst, benchData) + } +} + +func BenchmarkDecodeAscii85(b *testing.B) { + src := make([]byte, ascii85.MaxEncodedLen(len(benchData))) + n := ascii85.Encode(src, benchData) + src = src[:n] + dst := make([]byte, len(benchData)) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _, _ = ascii85.Decode(dst, src, true) + } +} diff --git a/base85/encoding.go b/base85/encoding.go index 7b1bd31..08cdc26 100644 --- a/base85/encoding.go +++ b/base85/encoding.go @@ -199,32 +199,85 @@ func (enc *Encoding) AppendEncode(dst, src []byte) []byte { // unless included in the encoding alphabet. If src contains invalid base85 data, it will // return the number of bytes successfully written and CorruptInputError. 
func (enc *Encoding) Decode(dst, src []byte) (n int, err error) { - // filter whitespace (if not in alphabet) and validate - // use lazy allocation: only create filtered slice when needed - var filtered []byte - for i, c := range src { - if enc.padChar != NoPadding && rune(c) == enc.padChar { - if filtered != nil { - filtered = append(filtered, c) - } + if len(src) == 0 { + return 0, nil + } + + var nb int + var digits [5]uint32 + hasPadding := enc.padChar != NoPadding + padCount := 0 // tracks padding chars seen in current block + + for i := 0; i < len(src); i++ { + c := src[i] + + // skip whitespace if not in alphabet + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') && enc.decodeMap[c] == 0xFF { continue - } else if (c == ' ' || c == '\t' || c == '\n' || c == '\r') && enc.decodeMap[c] == 0xFF { - if filtered == nil { - filtered = make([]byte, i, len(src)) - copy(filtered, src[:i]) + } + + // padding handling + if hasPadding && rune(c) == enc.padChar { + if nb < 2 { + return n, CorruptInputError(i) + } + padCount++ + if nb+padCount == 5 { + // block complete - decode and reset + n += decodePartial(dst[n:], digits[:], nb) + nb = 0 + padCount = 0 } continue - } else if enc.decodeMap[c] == 0xFF { - return 0, CorruptInputError(i) - } else if filtered != nil { - filtered = append(filtered, c) } + + // data char after padding started is an error + if padCount > 0 { + return n, CorruptInputError(i) + } + + d := enc.decodeMap[c] + if d == 0xFF { + return n, CorruptInputError(i) + } + + digits[nb] = uint32(d) + nb++ + + if nb == 5 { + val := digits[0]*pow85_4 + digits[1]*pow85_3 + digits[2]*pow85_2 + digits[3]*pow85_1 + digits[4] + dst[n] = byte(val >> 24) + dst[n+1] = byte(val >> 16) + dst[n+2] = byte(val >> 8) + dst[n+3] = byte(val) + n += 4 + nb = 0 + } + } + + // handle remaining digits (unpadded case) + if nb > 0 { + if nb == 1 || padCount > 0 { + return n, CorruptInputError(len(src)) + } + n += decodePartial(dst[n:], digits[:], nb) } - if filtered == nil { - 
return enc.decodeFiltered(dst, src) + return n, nil +} + +// decodePartial decodes 2-4 accumulated digit values into output bytes. +func decodePartial(dst []byte, digits []uint32, nb int) int { + // fill remaining with 84 (highest digit) for implicit padding + for i := nb; i < 5; i++ { + digits[i] = 84 + } + val := digits[0]*pow85_4 + digits[1]*pow85_3 + digits[2]*pow85_2 + digits[3]*pow85_1 + digits[4] + for i := 0; i < nb-1; i++ { + dst[i] = byte(val >> 24) + val <<= 8 } - return enc.decodeFiltered(dst, filtered) + return nb - 1 } // decodeBlock decodes 2-5 base85 alphabet bytes into 1-4 output bytes. diff --git a/base85/encoding_test.go b/base85/encoding_test.go index d29d09c..af774c2 100644 --- a/base85/encoding_test.go +++ b/base85/encoding_test.go @@ -610,6 +610,23 @@ func (r *whitespaceOnlyReader) Read(p []byte) (int, error) { return n, nil } +// whitespaceWithErrorReader returns whitespace with a non-EOF error on first read. +type whitespaceWithErrorReader struct { + called bool + err error +} + +func (r *whitespaceWithErrorReader) Read(p []byte) (int, error) { + if r.called { + return 0, r.err + } + r.called = true + // return whitespace that filters to empty, with error + data := []byte(" \t\n\r ") + n := copy(p, data) + return n, r.err +} + func TestStreamDecoderReadError(t *testing.T) { t.Parallel() @@ -636,6 +653,16 @@ func TestStreamDecoderReadError(t *testing.T) { require.ErrorIs(t, err, io.ErrUnexpectedEOF) }) + t.Run("whitespace_with_error_not_eof", func(t *testing.T) { + // reader returns whitespace with error in same read (filters to empty, not EOF) + r := &whitespaceWithErrorReader{err: io.ErrUnexpectedEOF} + decoder := NewDecoder(RFC1924, r) + + buf := make([]byte, 100) + _, err := decoder.Read(buf) + require.ErrorIs(t, err, io.ErrUnexpectedEOF) + }) + t.Run("decode_error_with_partial_data_small_buffer", func(t *testing.T) { // This tests the error path where some data is decoded before // the error, and the output buffer is too small. 
Uses padding mode @@ -945,6 +972,46 @@ func TestPaddedDecoding(t *testing.T) { require.NoError(t, err) assert.Equal(t, []byte{0x42, 0x43}, decoded) }) + + // stream decoder tests for decodeFiltered error paths + t.Run("stream_dataLen_zero", func(t *testing.T) { + // "....." has 0 data chars - triggers dataLen < 2 check + decoder := NewDecoder(paddedEnc, bytes.NewReader([]byte("....."))) + _, err := io.ReadAll(decoder) + assert.Error(t, err) + }) + + t.Run("stream_dataLen_one", func(t *testing.T) { + // "A...." has 1 data char - triggers dataLen < 2 check + decoder := NewDecoder(paddedEnc, bytes.NewReader([]byte("A...."))) + _, err := io.ReadAll(decoder) + assert.Error(t, err) + }) + + t.Run("stream_invalid_char_in_padded_block", func(t *testing.T) { + // "[B..." has invalid char '[' in data portion of padded block + decoder := NewDecoder(paddedEnc, bytes.NewReader([]byte("[B..."))) + _, err := io.ReadAll(decoder) + assert.Error(t, err) + }) + + t.Run("stream_padding_in_remainder", func(t *testing.T) { + // valid 5-char block followed by remainder with padding + valid := paddedEnc.EncodeToString([]byte{0x01, 0x02, 0x03, 0x04}) // 5 chars + withPaddingRemainder := valid + "A." // remainder has padding + decoder := NewDecoder(paddedEnc, bytes.NewReader([]byte(withPaddingRemainder))) + _, err := io.ReadAll(decoder) + assert.Error(t, err) + }) + + t.Run("stream_invalid_char_in_remainder", func(t *testing.T) { + // valid 5-char block followed by remainder with invalid char + valid := RFC1924.EncodeToString([]byte{0x01, 0x02, 0x03, 0x04}) // 5 chars + withInvalidRemainder := valid + "A[" // '[' is invalid + decoder := NewDecoder(RFC1924, bytes.NewReader([]byte(withInvalidRemainder))) + _, err := io.ReadAll(decoder) + assert.Error(t, err) + }) } func TestDecodeWhitespaceAlphabetAware(t *testing.T) {