diff --git a/CHANGELOG.md b/CHANGELOG.md index bf5532454..d094be609 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,15 +23,32 @@ The following emojis are used to highlight certain changes: ### Changed +- `verifcid`: 🛠 Enhanced Allowlist interface with per-hash size limits ([#1018](https://github.com/ipfs/boxo/pull/1018)) + - Expanded `Allowlist` interface with `MinDigestSize(code uint64)` and `MaxDigestSize(code uint64)` methods for per-hash function size validation + - Added public constants: `DefaultMinDigestSize` (20 bytes), `DefaultMaxDigestSize` (128 bytes for cryptographic hashes), and `DefaultMaxIdentityDigestSize` (128 bytes for identity CIDs) + - `DefaultAllowlist` implementation now uses these constants and supports different size limits per hash type + - Renamed errors for clarity: Added `ErrDigestTooSmall` and `ErrDigestTooLarge` as the new primary errors + - `ErrBelowMinimumHashLength` and `ErrAboveMaximumHashLength` remain as deprecated aliases pointing to the new errors +- `bitswap`: Updated to use `verifcid.DefaultMaxDigestSize` for `MaximumHashLength` constant + - The default `MaximumAllowedCid` limit for incoming CIDs can be adjusted using `bitswap.MaxCidSize` or `server.MaxCidSize` options - 🛠 `bitswap/client`: The `RebroadcastDelay` option now takes a `time.Duration` value. This is a potentially BREAKING CHANGE. The time-varying functionality of `delay.Delay` was never used, so it was replaced with a fixed duration value. This also removes the `github.com/ipfs/go-ipfs-delay` dependency. 
- ### Removed ### Fixed +- `ipld/unixfs/mod`: + - `DagModifier` now correctly preserves raw node codec when modifying data under the chunker threshold, instead of incorrectly forcing everything to dag-pb + - `DagModifier` prevents creation of identity CIDs exceeding `verifcid.DefaultMaxIdentityDigestSize` limit when modifying data, automatically switching to proper cryptographic hash while preserving small identity CIDs + - `DagModifier` now supports appending data to a `RawNode` by automatically converting it into a UnixFS file structure where the original `RawNode` becomes the first leaf block, fixing previously impossible append operations that would fail with "expected protobuf dag node" errors +- `mfs`: Files with identity CIDs now properly inherit full CID prefix from parent directories (version, codec, hash type, length), not just hash type + ### Security +- `verifcid`: Now enforces maximum size limit of 128 bytes for identity CIDs to prevent abuse. + - 🛠 Attempts to read CIDs with identity multihash digests longer than `DefaultMaxIdentityDigestSize` will now produce `ErrDigestTooLarge` error. + - Identity CIDs can inline data directly, and without a size limit, they could embed arbitrary amounts of data. Limiting the size also protects gateways from poorly written clients that might send absurdly big data to the gateway encoded as identity CIDs only to retrieve it back. Note that identity CIDs do not provide integrity verification, making them vulnerable to bit flips. They should only be used in controlled contexts like raw leaves of a larger DAG. The limit is explicitly defined as `DefaultMaxIdentityDigestSize` (128 bytes). 
+ ## [v0.34.0] diff --git a/bitswap/internal/defaults/defaults.go b/bitswap/internal/defaults/defaults.go index 2c5cc3668..f7184eb4c 100644 --- a/bitswap/internal/defaults/defaults.go +++ b/bitswap/internal/defaults/defaults.go @@ -3,6 +3,8 @@ package defaults import ( "encoding/binary" "time" + + "github.com/ipfs/boxo/verifcid" ) const ( @@ -27,9 +29,33 @@ const ( // Maximum size of the wantlist we are willing to keep in memory. MaxQueuedWantlistEntiresPerPeer = 1024 - // Copied from github.com/ipfs/go-verifcid#maximumHashLength - // FIXME: expose this in go-verifcid. - MaximumHashLength = 128 + // MaximumHashLength is the maximum size for hash digests we accept. + // This references the default from verifcid for consistency. + MaximumHashLength = verifcid.DefaultMaxDigestSize + + // MaximumAllowedCid is the maximum total CID size we accept in bitswap messages. + // Bitswap sends full CIDs (not just multihashes) on the wire, so we must + // limit the total size to prevent DoS attacks from maliciously large CIDs. + // + // The calculation is based on the CID binary format: + // - CIDv0: Just a multihash (hash type + hash length + hash digest) + // - CIDv1: + // - version: varint (usually 1 byte for version 1) + // - multicodec: varint (usually 1-2 bytes for common codecs) + // - multihash: + // - hash-type: varint (usually 1-2 bytes) + // - hash-length: varint (usually 1-2 bytes) + // - hash-digest: up to MaximumHashLength bytes + // + // We use binary.MaxVarintLen64*4 (40 bytes) to accommodate worst-case varint encoding: + // - 1 varint for CID version (max 10 bytes) + // - 1 varint for multicodec (max 10 bytes) + // - 1 varint for multihash type (max 10 bytes) + // - 1 varint for multihash length (max 10 bytes) + // Total: 40 bytes overhead + MaximumHashLength (128) = 168 bytes max + // + // This prevents peers from sending CIDs with pathologically large varint encodings + // that could exhaust memory or cause other issues. 
MaximumAllowedCid = binary.MaxVarintLen64*4 + MaximumHashLength // RebroadcastDelay is the default delay to trigger broadcast of diff --git a/bitswap/options.go b/bitswap/options.go index a705a6caf..313952f3b 100644 --- a/bitswap/options.go +++ b/bitswap/options.go @@ -32,7 +32,7 @@ func MaxQueuedWantlistEntriesPerPeer(count uint) Option { return Option{server.MaxQueuedWantlistEntriesPerPeer(count)} } -// MaxCidSize only affects the server. +// MaxCidSize limits the size of incoming CIDs in requests (server only). // If it is 0 no limit is applied. func MaxCidSize(n uint) Option { return Option{server.MaxCidSize(n)} diff --git a/bitswap/server/internal/decision/engine.go b/bitswap/server/internal/decision/engine.go index 28c60db26..5ac00b895 100644 --- a/bitswap/server/internal/decision/engine.go +++ b/bitswap/server/internal/decision/engine.go @@ -282,8 +282,11 @@ func WithMaxQueuedWantlistEntriesPerPeer(count uint) Option { } } -// WithMaxQueuedWantlistEntriesPerPeer limits how much individual entries each peer is allowed to send. -// If a peer send us more than this we will truncate newest entries. +// WithMaxCidSize limits the size of incoming CIDs that we are willing to accept. +// We will ignore requests for CIDs whose total encoded size exceeds this limit. +// This protects against malicious peers sending CIDs with pathologically large +// varint encodings that could exhaust memory. 
+// It defaults to [defaults.MaximumAllowedCid] func WithMaxCidSize(n uint) Option { return func(e *Engine) { e.maxCidSize = n diff --git a/bitswap/server/internal/decision/engine_test.go b/bitswap/server/internal/decision/engine_test.go index 60b4938a5..1a88e9260 100644 --- a/bitswap/server/internal/decision/engine_test.go +++ b/bitswap/server/internal/decision/engine_test.go @@ -1660,7 +1660,7 @@ func TestIgnoresCidsAboveLimit(t *testing.T) { wl := warsaw.Engine.WantlistForPeer(riga.Peer) if len(wl) != 1 { - t.Fatal("wantlist add a CID too big") + t.Fatalf("expected 1 entry in wantlist, got %d", len(wl)) } } diff --git a/bitswap/server/server.go b/bitswap/server/server.go index a0b7dd4c9..1e8dd3a31 100644 --- a/bitswap/server/server.go +++ b/bitswap/server/server.go @@ -190,9 +190,11 @@ func MaxQueuedWantlistEntriesPerPeer(count uint) Option { } } -// MaxCidSize limits how big CIDs we are willing to serve. -// We will ignore CIDs over this limit. -// It defaults to [defaults.MaxCidSize]. +// MaxCidSize limits the size of incoming CIDs in requests that we are willing to accept. +// We will ignore requests for CIDs whose total encoded size exceeds this limit. +// This protects against malicious peers sending CIDs with pathologically large +// varint encodings that could exhaust memory. +// It defaults to [defaults.MaximumAllowedCid]. // If it is 0 no limit is applied. 
func MaxCidSize(n uint) Option { o := decision.WithMaxCidSize(n) diff --git a/blockservice/blockservice_test.go b/blockservice/blockservice_test.go index 39c4b6e4b..9af9488eb 100644 --- a/blockservice/blockservice_test.go +++ b/blockservice/blockservice_test.go @@ -1,6 +1,7 @@ package blockservice import ( + "bytes" "context" "testing" @@ -288,6 +289,35 @@ func TestAllowlist(t *testing.T) { check(NewSession(ctx, blockservice).GetBlock) } +func TestIdentityHashSizeLimit(t *testing.T) { + a := assert.New(t) + ctx := context.Background() + bs := blockstore.NewBlockstore(dssync.MutexWrap(ds.NewMapDatastore())) + blockservice := New(bs, nil) + + // Create identity CID at the DefaultMaxIdentityDigestSize limit (should be valid) + validData := bytes.Repeat([]byte("a"), verifcid.DefaultMaxIdentityDigestSize) + validHash, err := multihash.Sum(validData, multihash.IDENTITY, -1) + a.NoError(err) + validCID := cid.NewCidV1(cid.Raw, validHash) + + // Create identity CID over the DefaultMaxIdentityDigestSize limit (should be rejected) + invalidData := bytes.Repeat([]byte("b"), verifcid.DefaultMaxIdentityDigestSize+1) + invalidHash, err := multihash.Sum(invalidData, multihash.IDENTITY, -1) + a.NoError(err) + invalidCID := cid.NewCidV1(cid.Raw, invalidHash) + + // Valid identity CID should work (though block won't be found) + _, err = blockservice.GetBlock(ctx, validCID) + a.Error(err) + a.True(ipld.IsNotFound(err), "expected not found error for valid identity CID") + + // Invalid identity CID should fail validation + _, err = blockservice.GetBlock(ctx, invalidCID) + a.Error(err) + a.ErrorIs(err, verifcid.ErrDigestTooLarge) +} + type fakeIsNewSessionCreateExchange struct { ses exchange.Fetcher newSessionWasCalled bool diff --git a/gateway/gateway_test.go b/gateway/gateway_test.go index 41833ceca..d2798c36c 100644 --- a/gateway/gateway_test.go +++ b/gateway/gateway_test.go @@ -1,6 +1,7 @@ package gateway import ( + "bytes" "context" "errors" "fmt" @@ -13,8 +14,10 @@ import ( 
"github.com/ipfs/boxo/namesys" "github.com/ipfs/boxo/path" "github.com/ipfs/boxo/path/resolver" + "github.com/ipfs/boxo/verifcid" "github.com/ipfs/go-cid" ipld "github.com/ipfs/go-ipld-format" + mh "github.com/multiformats/go-multihash" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -51,6 +54,25 @@ func TestGatewayGet(t *testing.T) { // detection is platform dependent. backend.namesys["/ipns/example.man"] = newMockNamesysItem(k, 0) + // Create identity CIDs for testing + // verifcid.DefaultMaxIdentityDigestSize bytes (at the identity limit, should be valid) + validIdentityData := bytes.Repeat([]byte("a"), verifcid.DefaultMaxIdentityDigestSize) + validIdentityHash, err := mh.Sum(validIdentityData, mh.IDENTITY, -1) + require.NoError(t, err) + validIdentityCID := cid.NewCidV1(cid.Raw, validIdentityHash) + + // verifcid.DefaultMaxIdentityDigestSize+1 bytes (over the identity limit, should be rejected) + invalidIdentityData := bytes.Repeat([]byte("b"), verifcid.DefaultMaxIdentityDigestSize+1) + invalidIdentityHash, err := mh.Sum(invalidIdentityData, mh.IDENTITY, -1) + require.NoError(t, err) + invalidIdentityCID := cid.NewCidV1(cid.Raw, invalidIdentityHash) + + // Short identity CID (below MinDigestSize, should still be valid) + shortIdentityData := []byte("hello") + shortIdentityHash, err := mh.Sum(shortIdentityData, mh.IDENTITY, -1) + require.NoError(t, err) + shortIdentityCID := cid.NewCidV1(cid.Raw, shortIdentityHash) + for _, test := range []struct { host string path string @@ -62,6 +84,9 @@ func TestGatewayGet(t *testing.T) { {"127.0.0.1:8080", "/ipns", http.StatusBadRequest, "invalid path \"/ipns/\": path does not have enough components\n"}, {"127.0.0.1:8080", "/" + k.RootCid().String(), http.StatusNotFound, "404 page not found\n"}, {"127.0.0.1:8080", "/ipfs/this-is-not-a-cid", http.StatusBadRequest, "invalid path \"/ipfs/this-is-not-a-cid\": invalid cid: illegal base32 data at input byte 3\n"}, + {"127.0.0.1:8080", "/ipfs/" + 
validIdentityCID.String(), http.StatusOK, string(validIdentityData)}, // Valid identity CID returns the inlined data + {"127.0.0.1:8080", "/ipfs/" + invalidIdentityCID.String(), http.StatusInternalServerError, "failed to resolve /ipfs/" + invalidIdentityCID.String() + ": digest too large: identity digest got 129 bytes, maximum 128\n"}, // Invalid identity CID, over size limit + {"127.0.0.1:8080", "/ipfs/" + shortIdentityCID.String(), http.StatusOK, "hello"}, // Short identity CID (below MinDigestSize) should work {"127.0.0.1:8080", k.String(), http.StatusOK, "fnord"}, {"127.0.0.1:8080", "/ipns/nxdomain.example.com", http.StatusInternalServerError, "failed to resolve /ipns/nxdomain.example.com: " + namesys.ErrResolveFailed.Error() + "\n"}, {"127.0.0.1:8080", "/ipns/%0D%0A%0D%0Ahello", http.StatusInternalServerError, "failed to resolve /ipns/\\r\\n\\r\\nhello: " + namesys.ErrResolveFailed.Error() + "\n"}, @@ -87,8 +112,12 @@ func TestGatewayGet(t *testing.T) { require.Equal(t, "text/plain; charset=utf-8", resp.Header.Get("Content-Type")) body, err := io.ReadAll(resp.Body) require.NoError(t, err) - require.Equal(t, test.status, resp.StatusCode, "body", body) - require.Equal(t, test.text, string(body)) + require.Equal(t, test.status, resp.StatusCode, "body", string(body)) + + // Check body content if expected text is provided + if test.text != "" { + require.Equal(t, test.text, string(body)) + } }) } } diff --git a/ipld/unixfs/mod/dagmodifier.go b/ipld/unixfs/mod/dagmodifier.go index e069587c5..3db1fb23a 100644 --- a/ipld/unixfs/mod/dagmodifier.go +++ b/ipld/unixfs/mod/dagmodifier.go @@ -1,5 +1,24 @@ // Package mod provides DAG modification utilities to, for example, // insert additional nodes in a unixfs DAG or truncate them. +// +// Identity CID Handling: +// +// This package automatically handles identity CIDs (multihash code 0x00) which +// inline data directly in the CID. 
When modifying nodes with identity CIDs, +// the package ensures the [verifcid.DefaultMaxIdentityDigestSize] limit is respected by +// automatically switching to a cryptographic hash function when the encoded +// data would exceed this limit. The replacement hash function is chosen from +// (in order): the configured Prefix if non-identity, or [util.DefaultIpfsHash] +// as a fallback. This prevents creation of unacceptably big identity CIDs +// while preserving them for small data that fits within the limit. +// +// RawNode Growth: +// +// When appending data to a RawNode that would require multiple blocks, the +// node is automatically converted to a UnixFS file structure. This is necessary +// because RawNodes cannot have child nodes. The original raw data remains +// accessible via its original CID, while the new structure provides full +// UnixFS capabilities. package mod import ( @@ -17,8 +36,11 @@ import ( help "github.com/ipfs/boxo/ipld/unixfs/importer/helpers" trickle "github.com/ipfs/boxo/ipld/unixfs/importer/trickle" uio "github.com/ipfs/boxo/ipld/unixfs/io" + "github.com/ipfs/boxo/util" + "github.com/ipfs/boxo/verifcid" cid "github.com/ipfs/go-cid" ipld "github.com/ipfs/go-ipld-format" + mh "github.com/multiformats/go-multihash" ) // Common errors @@ -54,8 +76,8 @@ type DagModifier struct { } // NewDagModifier returns a new DagModifier, the Cid prefix for newly -// created nodes will be inhered from the passed in node. If the Cid -// version if not 0 raw leaves will also be enabled. The Prefix and +// created nodes will be inherited from the passed in node. If the Cid +// version is not 0 raw leaves will also be enabled. The Prefix and // RawLeaves options can be overridden by changing them after the call. 
func NewDagModifier(ctx context.Context, from ipld.Node, serv ipld.DAGService, spl chunker.SplitterGen) (*DagModifier, error) { switch from.(type) { @@ -66,7 +88,11 @@ func NewDagModifier(ctx context.Context, from ipld.Node, serv ipld.DAGService, s } prefix := from.Cid().Prefix() - prefix.Codec = cid.DagProtobuf + // Preserve the original codec - don't force everything to DagProtobuf + // Only ProtoNodes with UnixFS data should use DagProtobuf + if _, ok := from.(*mdag.ProtoNode); ok { + prefix.Codec = cid.DagProtobuf + } rawLeaves := false if prefix.Version > 0 { rawLeaves = true @@ -244,6 +270,48 @@ func (dm *DagModifier) Sync() error { return nil } +// safePrefixForSize checks if using identity hash would exceed the size limit +// and returns an appropriate CID prefix. If the data fits within identity hash +// limits, returns the original prefix and false. Otherwise returns a prefix with +// a proper cryptographic hash function and true to indicate the prefix changed. +func (dm *DagModifier) safePrefixForSize(originalPrefix cid.Prefix, dataSize int) (cid.Prefix, bool) { + // If not identity hash, no size check needed - return as is + if originalPrefix.MhType != mh.IDENTITY { + return originalPrefix, false + } + + // For identity hash, check if data fits within the limit + if dataSize <= verifcid.DefaultMaxIdentityDigestSize { + return originalPrefix, false + } + + // Identity would overflow, need to switch to a different hash + // Use configured prefix if it's not identity + if dm.Prefix.MhType != mh.IDENTITY { + return dm.Prefix, true + } + + // Configured prefix is also identity, build a safe fallback + newPrefix := dm.Prefix // Copy all fields (Version, Codec, MhType, MhLength) + newPrefix.MhType = util.DefaultIpfsHash + newPrefix.MhLength = -1 // Only reset length when using fallback hash + return newPrefix, true +} + +// ensureSafeProtoNodeHash checks if a ProtoNode's identity hash would exceed +// the size limit and updates its CID builder if necessary. 
+func (dm *DagModifier) ensureSafeProtoNodeHash(node *mdag.ProtoNode) { + // CidBuilder() returns the cid.Builder which for V1CidPrefix is the Prefix itself + if prefix, ok := node.CidBuilder().(cid.Prefix); ok && prefix.MhType == mh.IDENTITY { + encodedData, _ := node.EncodeProtobuf(false) + if newPrefix, changed := dm.safePrefixForSize(prefix, len(encodedData)); changed { + // Only update the CID builder if the prefix changed + // to avoid unnecessary cache invalidation in ProtoNode + node.SetCidBuilder(newPrefix) + } + } +} + // modifyDag writes the data in 'dm.wrBuf' over the data in 'node' starting at 'offset' // returns the new key of the passed in node. func (dm *DagModifier) modifyDag(n ipld.Node, offset uint64) (cid.Cid, error) { @@ -256,6 +324,8 @@ func (dm *DagModifier) modifyDag(n ipld.Node, offset uint64) (cid.Cid, error) { return cid.Cid{}, err } + // For leaf ProtoNodes, we can only modify data within the existing size + // Expanding the data requires rebuilding the tree structure _, err = dm.wrBuf.Read(fsn.Data()[offset:]) if err != nil && err != io.EOF { return cid.Cid{}, err @@ -273,6 +343,10 @@ func (dm *DagModifier) modifyDag(n ipld.Node, offset uint64) (cid.Cid, error) { nd := new(mdag.ProtoNode) nd.SetData(b) nd.SetCidBuilder(nd0.CidBuilder()) + + // Check if using identity hash would exceed the size limit + dm.ensureSafeProtoNodeHash(nd) + err = dm.dagserv.Add(dm.ctx, nd) if err != nil { return cid.Cid{}, err @@ -312,7 +386,10 @@ func (dm *DagModifier) modifyDag(n ipld.Node, offset uint64) (cid.Cid, error) { copy(bytes[offsetPlusN:], origData[offsetPlusN:]) } - nd, err := mdag.NewRawNodeWPrefix(bytes, nd0.Cid().Prefix()) + // Check if using identity hash would exceed the size limit + prefix, _ := dm.safePrefixForSize(nd0.Cid().Prefix(), len(bytes)) + + nd, err := mdag.NewRawNodeWPrefix(bytes, prefix) if err != nil { return cid.Cid{}, err } @@ -366,14 +443,33 @@ func (dm *DagModifier) modifyDag(n ipld.Node, offset uint64) (cid.Cid, error) { 
cur += bs } + // Check if using identity hash would exceed the size limit for branch nodes + dm.ensureSafeProtoNodeHash(node) + err = dm.dagserv.Add(dm.ctx, node) return node.Cid(), err } -// appendData appends the blocks from the given chan to the end of this dag +// appendData appends blocks from the splitter to the end of this dag. +// +// For ProtoNodes: Uses trickle.Append directly to add new blocks to the +// existing UnixFS DAG structure. +// +// For RawNodes: Automatically converts to a UnixFS file structure when growth +// is needed. This conversion is necessary because RawNodes are pure data and +// cannot have child nodes. The conversion process: +// - Creates a ProtoNode with UnixFS file metadata +// - Adds the original RawNode as the first leaf block +// - Continues appending new data as raw leaves (if RawLeaves is set) +// +// This conversion is transparent to users but changes the node type from +// RawNode to ProtoNode. The original data remains accessible through the +// direct CID of the original raw block, and the new dag-pb CID makes +// post-append data accessible through the standard UnixFS APIs. func (dm *DagModifier) appendData(nd ipld.Node, spl chunker.Splitter) (ipld.Node, error) { switch nd := nd.(type) { - case *mdag.ProtoNode, *mdag.RawNode: + case *mdag.ProtoNode: + // ProtoNode can be directly passed to trickle.Append dbp := &help.DagBuilderParams{ Dagserv: dm.dagserv, Maxlinks: dm.MaxLinks, @@ -385,6 +481,45 @@ func (dm *DagModifier) appendData(nd ipld.Node, spl chunker.Splitter) (ipld.Node return nil, err } return trickle.Append(dm.ctx, nd, db) + + case *mdag.RawNode: + // RawNode needs to be converted to UnixFS structure for growth + // because RawNodes are just raw data and cannot have child nodes. + // We create a UnixFS file structure with the original RawNode as the first leaf. 
+ fileNode := ft.NewFSNode(ft.TFile) + + // Add the original raw data size as the first block size + fileNode.AddBlockSize(uint64(len(nd.RawData()))) + + // Serialize the UnixFS node + fileNodeBytes, err := fileNode.GetBytes() + if err != nil { + return nil, fmt.Errorf("failed to serialize UnixFS metadata for RawNode conversion: %w", err) + } + + // Create a ProtoNode with the UnixFS metadata + protoNode := mdag.NodeWithData(fileNodeBytes) + protoNode.SetCidBuilder(dm.Prefix) + + // Add the RawNode as the first leaf + err = protoNode.AddNodeLink("", nd) + if err != nil { + return nil, fmt.Errorf("failed to add RawNode as leaf during conversion: %w", err) + } + + // Now we can append new data using trickle + dbp := &help.DagBuilderParams{ + Dagserv: dm.dagserv, + Maxlinks: dm.MaxLinks, + CidBuilder: dm.Prefix, + RawLeaves: true, // Ensure future leaves are raw for consistency + } + db, err := dbp.New(spl) + if err != nil { + return nil, fmt.Errorf("failed to create DAG builder for RawNode growth: %w", err) + } + return trickle.Append(dm.ctx, protoNode, db) + default: return nil, ErrNotUnixfs } @@ -560,7 +695,9 @@ func (dm *DagModifier) dagTruncate(ctx context.Context, n ipld.Node, size uint64 return mdag.NodeWithData(data), nil case *mdag.RawNode: - return mdag.NewRawNodeWPrefix(nd.RawData()[:size], nd.Cid().Prefix()) + data := nd.RawData()[:size] + prefix, _ := dm.safePrefixForSize(nd.Cid().Prefix(), len(data)) + return mdag.NewRawNodeWPrefix(data, prefix) } } diff --git a/ipld/unixfs/mod/dagmodifier_test.go b/ipld/unixfs/mod/dagmodifier_test.go index ab69374ef..eee243750 100644 --- a/ipld/unixfs/mod/dagmodifier_test.go +++ b/ipld/unixfs/mod/dagmodifier_test.go @@ -1,6 +1,7 @@ package mod import ( + "bytes" "context" "fmt" "io" @@ -11,7 +12,11 @@ import ( trickle "github.com/ipfs/boxo/ipld/unixfs/importer/trickle" uio "github.com/ipfs/boxo/ipld/unixfs/io" testu "github.com/ipfs/boxo/ipld/unixfs/test" + "github.com/ipfs/boxo/util" + "github.com/ipfs/boxo/verifcid" + 
"github.com/ipfs/go-cid" "github.com/ipfs/go-test/random" + mh "github.com/multiformats/go-multihash" ) func testModWrite(t *testing.T, beg, size uint64, orig []byte, dm *DagModifier, opts testu.NodeOpts) []byte { @@ -867,3 +872,633 @@ func getOffset(reader uio.DagReader) int64 { } return offset } + +// makeTestData generates test data with repeating lowercase ascii alphabet. +func makeTestData(size int) []byte { + data := make([]byte, size) + for i := range data { + data[i] = byte('a' + i%26) + } + return data +} + +// makeIdentityNode creates a ProtoNode with identity CID for testing +func makeIdentityNode(data []byte) *dag.ProtoNode { + node := dag.NodeWithData(unixfs.FilePBData(data, uint64(len(data)))) + node.SetCidBuilder(cid.Prefix{ + Version: 1, + Codec: cid.DagProtobuf, + MhType: mh.IDENTITY, + MhLength: -1, + }) + return node +} + +func TestIdentityCIDHandling(t *testing.T) { + ctx := context.Background() + dserv := testu.GetDAGServ() + + // Test that DagModifier prevents creating overlarge identity CIDs + // This addresses https://github.com/ipfs/kubo/issues/6011 + + t.Run("prevents identity CID overflow on modification", func(t *testing.T) { + // Create a UnixFS file node with identity hash near the size limit + // UnixFS overhead is approximately 8-10 bytes, so we use data that will + // encode to just under DefaultMaxIdentityDigestSize when combined with metadata + initialData := makeTestData(verifcid.DefaultMaxIdentityDigestSize - 10) + + // Create a UnixFS file node with identity CID + node := makeIdentityNode(initialData) + + // Verify the encoded size is under limit + encoded, _ := node.EncodeProtobuf(false) + if len(encoded) > verifcid.DefaultMaxIdentityDigestSize { + t.Fatalf("Test setup error: initial node too large for identity: %d bytes (max %d)", len(encoded), verifcid.DefaultMaxIdentityDigestSize) + } + + // Store the node + err := dserv.Add(ctx, node) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier + dmod, err := 
NewDagModifier(ctx, node, dserv, testu.SizeSplitterGen(512)) + if err != nil { + t.Fatal(err) + } + + // Configure fallback hash (simulating what MFS does) + dmod.Prefix.MhType = util.DefaultIpfsHash + dmod.Prefix.MhLength = -1 + + // Modify first few bytes - this won't change the size but validates + // that our check works for any modification + modData := []byte("MODIFIED") + n, err := dmod.WriteAt(modData, 0) + if err != nil { + t.Fatal(err) + } + if n != len(modData) { + t.Fatalf("Expected to write %d bytes, wrote %d", len(modData), n) + } + + // Sync the changes + err = dmod.Sync() + if err != nil { + t.Fatal(err) + } + + // Get result + resultNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // The key verification: if it's a leaf node and would exceed verifcid.DefaultMaxIdentityDigestSize, + // it must not use identity hash + if len(resultNode.Links()) == 0 { + if protoNode, ok := resultNode.(*dag.ProtoNode); ok { + encodedData, _ := protoNode.EncodeProtobuf(false) + if len(encodedData) > verifcid.DefaultMaxIdentityDigestSize && resultNode.Cid().Prefix().MhType == mh.IDENTITY { + t.Errorf("Leaf node with %d bytes must not use identity hash", len(encodedData)) + } + } + } + }) + + t.Run("small identity CID remains identity", func(t *testing.T) { + // Create a small UnixFS file node with identity hash + // Keep it well under verifcid.DefaultMaxIdentityDigestSize (the identity hash limit) + smallData := []byte("hello world") + + // Create a UnixFS file node with identity CID + smallNode := makeIdentityNode(smallData) + + // Store the node + err := dserv.Add(ctx, smallNode) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier with default splitter + dmod, err := NewDagModifier(ctx, smallNode, dserv, testu.SizeSplitterGen(512)) + if err != nil { + t.Fatal(err) + } + + // Write small amount of data (total still under verifcid.DefaultMaxIdentityDigestSize) + additionalData := []byte(" and more") + n, err := dmod.WriteAt(additionalData, 
int64(len(smallData))) + if err != nil { + t.Fatalf("WriteAt failed: %v", err) + } + if n != len(additionalData) { + t.Fatalf("expected to write %d bytes, wrote %d", len(additionalData), n) + } + + // Get the resulting node + resultNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // Verify it still uses identity hash + resultCID := resultNode.Cid() + if resultCID.Prefix().MhType != mh.IDENTITY { + t.Fatalf("expected identity hash for small data, got %v", resultCID.Prefix().MhType) + } + }) + + t.Run("preserves all prefix fields when switching from identity", func(t *testing.T) { + // Create initial data that fits well within identity CID limit + // This needs to be large enough to trigger modifyDag but small enough + // to fit in identity with UnixFS overhead + initialData := makeTestData(100) + + // Create identity CID ProtoNode with UnixFS data + initialNode := dag.NodeWithData(unixfs.FilePBData(initialData, uint64(len(initialData)))) + initialNode.SetCidBuilder(cid.Prefix{ + Version: 1, + Codec: cid.DagProtobuf, + MhType: mh.IDENTITY, + MhLength: -1, + }) + + // Verify initial data fits in identity + encodedInitial, _ := initialNode.EncodeProtobuf(false) + if len(encodedInitial) > verifcid.DefaultMaxIdentityDigestSize { + t.Skipf("Initial data too large for identity: %d bytes", len(encodedInitial)) + } + + err := dserv.Add(ctx, initialNode) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier + dmod, err := NewDagModifier(ctx, initialNode, dserv, testu.SizeSplitterGen(512)) + if err != nil { + t.Fatal(err) + } + + // Configure a custom prefix with specific values + dmod.Prefix = cid.Prefix{ + Version: 1, + Codec: cid.DagProtobuf, + MhType: mh.SHA2_512, + MhLength: 32, // Truncated SHA-512 + } + + // Write within the existing data bounds to trigger modifyDag + // Use enough data that would cause identity overflow + modData := bytes.Repeat([]byte{'X'}, 50) + + // Write at offset 0, replacing first 50 bytes + n, err := 
dmod.WriteAt(modData, 0) + if err != nil { + t.Fatal(err) + } + if n != len(modData) { + t.Fatalf("expected to write %d bytes, wrote %d", len(modData), n) + } + + err = dmod.Sync() + if err != nil { + t.Fatal(err) + } + + // Get the modified node + modifiedNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // The node should still be using identity since the data is small + // But let's test that prefix fields would be preserved if it switched + + // For this test, let's verify that our safePrefixForSize function + // correctly preserves fields when it would switch + testPrefix, changed := dmod.safePrefixForSize(initialNode.Cid().Prefix(), verifcid.DefaultMaxIdentityDigestSize+10) + + if !changed { + t.Errorf("safePrefixForSize: expected prefix to change for oversized identity") + } + if testPrefix.Version != 1 { + t.Errorf("safePrefixForSize: expected Version 1, got %d", testPrefix.Version) + } + if testPrefix.Codec != cid.DagProtobuf { + t.Errorf("safePrefixForSize: expected Codec DagProtobuf, got %d", testPrefix.Codec) + } + if testPrefix.MhType != mh.SHA2_512 { + t.Errorf("safePrefixForSize: expected MhType SHA2_512 (%d), got %d", mh.SHA2_512, testPrefix.MhType) + } + if testPrefix.MhLength != 32 { + t.Errorf("safePrefixForSize: expected MhLength 32, got %d", testPrefix.MhLength) + } + + // Also verify the actual node behavior (it should still be identity for this small data) + if modifiedNode.Cid().Prefix().MhType != mh.IDENTITY { + t.Logf("Note: Node switched from identity even though data is small") + } + }) + + t.Run("raw node preserves codec", func(t *testing.T) { + // Create a RawNode with identity CID that's near the limit + // This allows us to test overflow with small modifications + initialData := makeTestData(verifcid.DefaultMaxIdentityDigestSize - 10) + + rawNode, err := dag.NewRawNodeWPrefix(initialData, cid.Prefix{ + Version: 1, + Codec: cid.Raw, + MhType: mh.IDENTITY, + MhLength: -1, + }) + if err != nil { + t.Fatal(err) + } + + 
// Verify it's using Raw codec and identity hash + if rawNode.Cid().Prefix().Codec != cid.Raw { + t.Fatalf("expected Raw codec, got %d", rawNode.Cid().Prefix().Codec) + } + if rawNode.Cid().Prefix().MhType != mh.IDENTITY { + t.Fatalf("expected identity hash, got %d", rawNode.Cid().Prefix().MhType) + } + + err = dserv.Add(ctx, rawNode) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier + dmod, err := NewDagModifier(ctx, rawNode, dserv, testu.SizeSplitterGen(512)) + if err != nil { + t.Fatal(err) + } + + // Verify DagModifier preserves the Raw codec (not forcing to DagProtobuf) + if dmod.Prefix.Codec != cid.Raw { + t.Errorf("BUG: DagModifier changed codec from Raw (%d) to %d", + cid.Raw, dmod.Prefix.Codec) + } + + // Configure fallback hash for when identity overflows + dmod.Prefix.MhType = util.DefaultIpfsHash + dmod.Prefix.MhLength = -1 + + // Modify a few bytes - the node should stay the same size + // but switch from identity to the configured hash + modData := []byte("MODIFIED") + _, err = dmod.WriteAt(modData, 0) + if err != nil { + t.Fatal(err) + } + + err = dmod.Sync() + if err != nil { + t.Fatal(err) + } + + modifiedNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // The node should switch from identity to SHA256 but keep Raw codec + if modifiedNode.Cid().Prefix().MhType == mh.IDENTITY { + // It might still be identity if total size is under limit + t.Logf("Note: Node still uses identity hash") + } + + // CRITICAL: Must preserve Raw codec + if modifiedNode.Cid().Prefix().Codec != cid.Raw { + t.Errorf("BUG: Raw node changed codec! 
Expected Raw (%d), got %d", + cid.Raw, modifiedNode.Cid().Prefix().Codec) + } + }) + + t.Run("raw node identity overflow via truncate", func(t *testing.T) { + // Test that Truncate also handles identity overflow correctly for RawNodes + // Start with a RawNode using identity that's at max size + maxData := makeTestData(verifcid.DefaultMaxIdentityDigestSize) + + rawNode, err := dag.NewRawNodeWPrefix(maxData, cid.Prefix{ + Version: 1, + Codec: cid.Raw, + MhType: mh.IDENTITY, + MhLength: -1, + }) + if err != nil { + t.Fatal(err) + } + + // Verify initial state + if rawNode.Cid().Prefix().Codec != cid.Raw { + t.Fatalf("expected Raw codec, got %d", rawNode.Cid().Prefix().Codec) + } + if rawNode.Cid().Prefix().MhType != mh.IDENTITY { + t.Fatalf("expected identity hash, got %d", rawNode.Cid().Prefix().MhType) + } + + err = dserv.Add(ctx, rawNode) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier with non-identity fallback + dmod, err := NewDagModifier(ctx, rawNode, dserv, testu.SizeSplitterGen(512)) + if err != nil { + t.Fatal(err) + } + + // Even though we're truncating to smaller, the original uses identity + // and we should preserve that if it still fits + err = dmod.Truncate(100) + if err != nil { + t.Fatal(err) + } + + truncatedNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // Should still use identity (100 bytes < verifcid.DefaultMaxIdentityDigestSize) + if truncatedNode.Cid().Prefix().MhType != mh.IDENTITY { + t.Errorf("Expected identity hash for 100 bytes, got %d", + truncatedNode.Cid().Prefix().MhType) + } + + // Must preserve Raw codec + if truncatedNode.Cid().Prefix().Codec != cid.Raw { + t.Errorf("Truncate changed codec! 
Expected Raw (%d), got %d", + cid.Raw, truncatedNode.Cid().Prefix().Codec) + } + + // Verify data was actually truncated + if rawTrunc, ok := truncatedNode.(*dag.RawNode); ok { + if len(rawTrunc.RawData()) != 100 { + t.Errorf("Expected 100 bytes after truncate, got %d", len(rawTrunc.RawData())) + } + if !bytes.Equal(rawTrunc.RawData(), maxData[:100]) { + t.Error("Truncated data doesn't match expected") + } + } else { + t.Error("Truncated node is not a RawNode") + } + }) + + t.Run("raw node switches from identity when modified to exceed limit", func(t *testing.T) { + // Create a RawNode with identity CID near the limit + // Use size that allows modification without triggering append + initialData := bytes.Repeat([]byte{'a'}, verifcid.DefaultMaxIdentityDigestSize-1) // 127 bytes + + rawNode, err := dag.NewRawNodeWPrefix(initialData, cid.Prefix{ + Version: 1, + Codec: cid.Raw, + MhType: mh.IDENTITY, + MhLength: -1, + }) + if err != nil { + t.Fatal(err) + } + + // Verify it starts with identity (127 bytes < verifcid.DefaultMaxIdentityDigestSize) + if rawNode.Cid().Prefix().MhType != mh.IDENTITY { + t.Fatalf("Expected identity hash for 127 bytes, got %d", rawNode.Cid().Prefix().MhType) + } + + err = dserv.Add(ctx, rawNode) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier + dmod, err := NewDagModifier(ctx, rawNode, dserv, testu.SizeSplitterGen(512)) + if err != nil { + t.Fatal(err) + } + + // Configure fallback for identity overflow + dmod.Prefix.MhType = util.DefaultIpfsHash + dmod.Prefix.MhLength = -1 + + // Replace last byte with 2 bytes, pushing size to verifcid.DefaultMaxIdentityDigestSize + // This modifies within bounds but increases total size just enough + twoBytes := []byte("XX") + _, err = dmod.WriteAt(twoBytes, int64(len(initialData)-1)) + if err != nil { + t.Fatal(err) + } + + err = dmod.Sync() + if err != nil { + t.Fatal(err) + } + + modifiedNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // When extending a RawNode past 
its end, it gets converted to UnixFS structure + // because RawNodes can't have child nodes - they're just raw data + if protoNode, ok := modifiedNode.(*dag.ProtoNode); ok { + // This is expected - RawNode was converted to UnixFS for growth + fsn, err := unixfs.FSNodeFromBytes(protoNode.Data()) + if err != nil { + t.Fatalf("Failed to parse UnixFS metadata: %v", err) + } + + expectedSize := uint64(verifcid.DefaultMaxIdentityDigestSize) // 127 original + 1 byte extension + if fsn.FileSize() != expectedSize { + t.Errorf("Expected file size %d, got %d", expectedSize, fsn.FileSize()) + } + + // Should have switched from identity hash + if modifiedNode.Cid().Prefix().MhType == mh.IDENTITY { + t.Error("Large node should not use identity hash") + } + } else if rawMod, ok := modifiedNode.(*dag.RawNode); ok { + // If it's still a RawNode, it means the write didn't extend past the end + actualSize := len(rawMod.RawData()) + t.Logf("Modified node size: %d bytes", actualSize) + + // If size > verifcid.DefaultMaxIdentityDigestSize, must not use identity + if actualSize > verifcid.DefaultMaxIdentityDigestSize { + if modifiedNode.Cid().Prefix().MhType == mh.IDENTITY { + t.Errorf("Node with %d bytes still uses identity (max %d)", + actualSize, verifcid.DefaultMaxIdentityDigestSize) + } + } + + // Must preserve Raw codec + if modifiedNode.Cid().Prefix().Codec != cid.Raw { + t.Errorf("BUG: Raw node changed codec! 
Expected Raw (%d), got %d", + cid.Raw, modifiedNode.Cid().Prefix().Codec) + } + } else { + t.Error("Node is neither RawNode nor ProtoNode") + } + }) +} + +func TestRawNodeGrowthConversion(t *testing.T) { + ctx := context.Background() + dserv := testu.GetDAGServ() + + t.Run("raw node converts to UnixFS when growing beyond single block", func(t *testing.T) { + // Create a small RawNode + initialData := []byte("small raw data") + rawNode, err := dag.NewRawNodeWPrefix(initialData, cid.Prefix{ + Version: 1, + Codec: cid.Raw, + MhType: mh.SHA2_256, + MhLength: -1, + }) + if err != nil { + t.Fatal(err) + } + + err = dserv.Add(ctx, rawNode) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier with small chunk size to force multiple blocks + dmod, err := NewDagModifier(ctx, rawNode, dserv, testu.SizeSplitterGen(32)) + if err != nil { + t.Fatal(err) + } + + // Seek to end to append (not overwrite) + _, err = dmod.Seek(0, io.SeekEnd) + if err != nil { + t.Fatal(err) + } + + // Append data that will create multiple blocks + largeData := makeTestData(100) + + _, err = dmod.Write(largeData) + if err != nil { + t.Fatal(err) + } + + // Get the resulting node + resultNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // Should now be a ProtoNode with UnixFS metadata + protoNode, ok := resultNode.(*dag.ProtoNode) + if !ok { + t.Fatal("Expected ProtoNode after growth, got RawNode") + } + + // Verify it has UnixFS metadata + fsn, err := unixfs.FSNodeFromBytes(protoNode.Data()) + if err != nil { + t.Fatalf("Failed to parse UnixFS metadata: %v", err) + } + + if fsn.Type() != unixfs.TFile { + t.Errorf("Expected file type, got %v", fsn.Type()) + } + + // Should have multiple links (blocks) + if len(protoNode.Links()) < 2 { + t.Errorf("Expected multiple blocks, got %d", len(protoNode.Links())) + } + + // First link should be the original RawNode + if len(protoNode.Links()) > 0 { + firstChild, err := protoNode.Links()[0].GetNode(ctx, dserv) + if err != nil { + 
t.Fatal(err) + } + + // First block should contain our original data + if rawChild, ok := firstChild.(*dag.RawNode); ok { + if !bytes.HasPrefix(rawChild.RawData(), initialData) { + t.Error("First block doesn't contain original data") + } + } + } + + // Verify the total size is correct + totalSize := uint64(len(initialData) + len(largeData)) + if fsn.FileSize() != totalSize { + t.Errorf("Expected file size %d, got %d", totalSize, fsn.FileSize()) + } + }) + + t.Run("identity raw node converts and switches hash when growing", func(t *testing.T) { + // Create a RawNode with identity hash + smallData := []byte("tiny") + rawNode, err := dag.NewRawNodeWPrefix(smallData, cid.Prefix{ + Version: 1, + Codec: cid.Raw, + MhType: mh.IDENTITY, + MhLength: -1, + }) + if err != nil { + t.Fatal(err) + } + + err = dserv.Add(ctx, rawNode) + if err != nil { + t.Fatal(err) + } + + // Create DagModifier with small chunk size + dmod, err := NewDagModifier(ctx, rawNode, dserv, testu.SizeSplitterGen(32)) + if err != nil { + t.Fatal(err) + } + + // Configure fallback for identity overflow + dmod.Prefix = cid.Prefix{ + Version: 1, + Codec: cid.DagProtobuf, + MhType: mh.SHA2_256, + MhLength: -1, + } + + // Seek to end to append (not overwrite) + _, err = dmod.Seek(0, io.SeekEnd) + if err != nil { + t.Fatal(err) + } + + // Append enough data to force multi-block structure + appendData := bytes.Repeat([]byte{'X'}, 200) + + _, err = dmod.Write(appendData) + if err != nil { + t.Fatal(err) + } + + resultNode, err := dmod.GetNode() + if err != nil { + t.Fatal(err) + } + + // Should be ProtoNode with non-identity hash + protoNode, ok := resultNode.(*dag.ProtoNode) + if !ok { + t.Fatal("Expected ProtoNode after growth") + } + + if protoNode.Cid().Prefix().MhType == mh.IDENTITY { + t.Error("Large multi-block node should not use identity hash") + } + + // Verify structure + if len(protoNode.Links()) < 2 { + t.Errorf("Expected multiple blocks, got %d", len(protoNode.Links())) + } + }) +} diff --git 
a/mfs/file.go b/mfs/file.go index 3789e0daf..f31ec47ee 100644 --- a/mfs/file.go +++ b/mfs/file.go @@ -11,8 +11,10 @@ import ( dag "github.com/ipfs/boxo/ipld/merkledag" ft "github.com/ipfs/boxo/ipld/unixfs" mod "github.com/ipfs/boxo/ipld/unixfs/mod" + "github.com/ipfs/boxo/util" ipld "github.com/ipfs/go-ipld-format" "github.com/libp2p/go-libp2p/core/routing" + mh "github.com/multiformats/go-multihash" ) // File represents a file in the MFS, its logic its mainly targeted @@ -110,6 +112,29 @@ func (fi *File) Open(flags Flags) (_ FileDescriptor, _retErr error) { } dmod.RawLeaves = fi.RawLeaves + // If the node uses identity hash, configure DagModifier with a fallback + // to avoid issues when the data grows beyond the identity hash size limit + if dmod.Prefix.MhType == mh.IDENTITY { + // Try to inherit full prefix from parent directory + if fi.parent != nil { + if dir, ok := fi.parent.(*Directory); ok { + if parentNode, err := dir.GetNode(); err == nil { + parentPrefix := parentNode.Cid().Prefix() + if parentPrefix.MhType != mh.IDENTITY { + // Use parent's full prefix (all fields) + dmod.Prefix = parentPrefix + } + } + } + } + + // If still identity (no suitable parent), set up fallback + if dmod.Prefix.MhType == mh.IDENTITY { + dmod.Prefix.MhType = util.DefaultIpfsHash + dmod.Prefix.MhLength = -1 // Use default length for the hash function + } + } + return &fileDescriptor{ inode: fi, flags: flags, diff --git a/verifcid/allowlist.go b/verifcid/allowlist.go index b572de3a6..5068908d8 100644 --- a/verifcid/allowlist.go +++ b/verifcid/allowlist.go @@ -11,6 +11,12 @@ var DefaultAllowlist defaultAllowlist type Allowlist interface { // IsAllowed checks for multihash allowance by the code. IsAllowed(code uint64) bool + + // MinDigestSize returns the minimum digest size for a given multihash code. + MinDigestSize(code uint64) int + + // MaxDigestSize returns the maximum digest size for a given multihash code. 
+ MaxDigestSize(code uint64) int } // NewAllowlist constructs new [Allowlist] from the given map set. @@ -41,6 +47,24 @@ func (al allowlist) IsAllowed(code uint64) bool { return false } +func (al allowlist) MinDigestSize(code uint64) int { + // If we have an override, delegate to it + if al.override != nil { + return al.override.MinDigestSize(code) + } + // Otherwise use default behavior + return DefaultAllowlist.MinDigestSize(code) +} + +func (al allowlist) MaxDigestSize(code uint64) int { + // If we have an override, delegate to it + if al.override != nil { + return al.override.MaxDigestSize(code) + } + // Otherwise use default behavior + return DefaultAllowlist.MaxDigestSize(code) +} + type defaultAllowlist struct{} func (defaultAllowlist) IsAllowed(code uint64) bool { @@ -67,3 +91,25 @@ func (defaultAllowlist) IsAllowed(code uint64) bool { return false } } + +func (defaultAllowlist) MinDigestSize(code uint64) int { + switch code { + case mh.IDENTITY: + // Identity hashes are exempt from minimum size requirements + // as they embed data directly + return 0 + default: + return DefaultMinDigestSize + } +} + +func (defaultAllowlist) MaxDigestSize(code uint64) int { + switch code { + case mh.IDENTITY: + // Identity CIDs embed data directly, limit to prevent abuse + return DefaultMaxIdentityDigestSize + default: + // Maximum size for cryptographic hash digests + return DefaultMaxDigestSize + } +} diff --git a/verifcid/allowlist_test.go b/verifcid/allowlist_test.go index b6057502f..1532f2f35 100644 --- a/verifcid/allowlist_test.go +++ b/verifcid/allowlist_test.go @@ -1,6 +1,7 @@ package verifcid import ( + "errors" "testing" "github.com/ipfs/go-cid" @@ -20,13 +21,6 @@ func TestDefaultAllowList(t *testing.T) { t.Fatal("expected failure") } } - mhcid := func(code uint64, length int) cid.Cid { - mhash, err := mh.Sum([]byte{}, code, length) - if err != nil { - t.Fatalf("%v: code: %x length: %d", err, code, length) - } - return cid.NewCidV1(cid.DagCBOR, mhash) - } 
allowlist := DefaultAllowlist assertTrue(allowlist.IsAllowed(mh.SHA2_256)) @@ -35,24 +29,35 @@ func TestDefaultAllowList(t *testing.T) { assertTrue(allowlist.IsAllowed(mh.KECCAK_256)) assertTrue(allowlist.IsAllowed(mh.SHA3)) assertTrue(allowlist.IsAllowed(mh.SHA1)) + assertTrue(allowlist.IsAllowed(mh.IDENTITY)) assertFalse(allowlist.IsAllowed(mh.BLAKE2B_MIN + 5)) cases := []struct { cid cid.Cid err error }{ - {mhcid(mh.SHA2_256, 32), nil}, - {mhcid(mh.SHA2_256, 16), ErrBelowMinimumHashLength}, - {mhcid(mh.MURMUR3X64_64, 4), ErrPossiblyInsecureHashFunction}, - {mhcid(mh.BLAKE3, 32), nil}, - {mhcid(mh.BLAKE3, 69), nil}, - {mhcid(mh.BLAKE3, 128), nil}, + {mhcid(t, mh.SHA2_256, 32), nil}, + {mhcid(t, mh.SHA2_256, 16), ErrDigestTooSmall}, + {mhcid(t, mh.MURMUR3X64_64, 4), ErrPossiblyInsecureHashFunction}, + {mhcid(t, mh.BLAKE3, 32), nil}, + {mhcid(t, mh.BLAKE3, 69), nil}, + {mhcid(t, mh.BLAKE3, 128), nil}, + {identityCid(t, DefaultMinDigestSize-1), nil}, // identity below minimum (exempt from minimum) + {identityCid(t, 64), nil}, // identity under DefaultMaxIdentityDigestSize limit + {identityCid(t, DefaultMaxIdentityDigestSize), nil}, // identity at DefaultMaxIdentityDigestSize limit + {identityCid(t, DefaultMaxIdentityDigestSize+1), ErrDigestTooLarge}, // identity above DefaultMaxIdentityDigestSize limit } for i, cas := range cases { - if ValidateCid(allowlist, cas.cid) != cas.err { + err := ValidateCid(allowlist, cas.cid) + if cas.err == nil { + if err != nil { + t.Errorf("wrong result in case of %s (index %d). Expected: nil, got %s", + cas.cid, i, err) + } + } else if !errors.Is(err, cas.err) { t.Errorf("wrong result in case of %s (index %d). 
Expected: %s, got %s", - cas.cid, i, cas.err, ValidateCid(DefaultAllowlist, cas.cid)) + cas.cid, i, cas.err, err) } } @@ -61,7 +66,31 @@ func TestDefaultAllowList(t *testing.T) { if err != nil { t.Fatalf("failed to produce a multihash from the long blake3 hash: %v", err) } - if ValidateCid(allowlist, cid.NewCidV1(cid.DagCBOR, longBlake3Mh)) != ErrAboveMaximumHashLength { - t.Errorf("a CID that was longer than the maximum hash length did not error with ErrAboveMaximumHashLength") + if err := ValidateCid(allowlist, cid.NewCidV1(cid.DagCBOR, longBlake3Mh)); !errors.Is(err, ErrDigestTooLarge) { + t.Errorf("a CID that was longer than the maximum hash length did not error with ErrDigestTooLarge, got: %v", err) + } +} + +// mhcid creates a CID with the specified multihash type and length +func mhcid(t *testing.T, code uint64, length int) cid.Cid { + t.Helper() + mhash, err := mh.Sum([]byte{}, code, length) + if err != nil { + t.Fatalf("%v: code: %x length: %d", err, code, length) + } + return cid.NewCidV1(cid.DagCBOR, mhash) +} + +// identityCid creates an identity CID with specific data size +func identityCid(t *testing.T, size int) cid.Cid { + t.Helper() + data := make([]byte, size) + for i := range data { + data[i] = byte(i % 256) + } + hash, err := mh.Sum(data, mh.IDENTITY, -1) + if err != nil { + t.Fatalf("failed to create identity hash: %v", err) } + return cid.NewCidV1(cid.Raw, hash) } diff --git a/verifcid/cid.go b/verifcid/cid.go index f3baf3343..c541a0876 100644 --- a/verifcid/cid.go +++ b/verifcid/cid.go @@ -8,15 +8,28 @@ import ( mh "github.com/multiformats/go-multihash" ) +const ( + // DefaultMinDigestSize is the default minimum size for hash digests (except for identity hashes) + DefaultMinDigestSize = 20 + // DefaultMaxDigestSize is the default maximum size for cryptographic hash digests. + // This does not apply to identity hashes which are not cryptographic and use DefaultMaxIdentityDigestSize instead. 
+ DefaultMaxDigestSize = 128 + // DefaultMaxIdentityDigestSize is the default maximum size for identity CID digests. + // Identity CIDs (with multihash code 0x00) are not cryptographic hashes - they embed + // data directly in the CID. This separate limit prevents abuse while allowing + // different size constraints than cryptographic digests. + DefaultMaxIdentityDigestSize = 128 +) + var ( ErrPossiblyInsecureHashFunction = errors.New("potentially insecure hash functions not allowed") - ErrBelowMinimumHashLength = fmt.Errorf("hashes must be at least %d bytes long", minimumHashLength) - ErrAboveMaximumHashLength = fmt.Errorf("hashes must be at most %d bytes long", maximumHashLength) -) + ErrDigestTooSmall = errors.New("digest too small") + ErrDigestTooLarge = errors.New("digest too large") -const ( - minimumHashLength = 20 - maximumHashLength = 128 + // Deprecated: Use ErrDigestTooSmall instead + ErrBelowMinimumHashLength = ErrDigestTooSmall + // Deprecated: Use ErrDigestTooLarge instead + ErrAboveMaximumHashLength = ErrDigestTooLarge ) // ValidateCid validates multihash allowance behind given CID. 
@@ -26,13 +39,31 @@ func ValidateCid(allowlist Allowlist, c cid.Cid) error { return ErrPossiblyInsecureHashFunction } - if pref.MhType != mh.IDENTITY && pref.MhLength < minimumHashLength { - return ErrBelowMinimumHashLength - } + minSize := allowlist.MinDigestSize(pref.MhType) + maxSize := allowlist.MaxDigestSize(pref.MhType) - if pref.MhType != mh.IDENTITY && pref.MhLength > maximumHashLength { - return ErrAboveMaximumHashLength + if pref.MhLength < minSize { + return newErrDigestTooSmall(pref.MhType, pref.MhLength, minSize) + } + if pref.MhLength > maxSize { + return newErrDigestTooLarge(pref.MhType, pref.MhLength, maxSize) } return nil } + +func getHashName(code uint64) string { + name, ok := mh.Codes[code] + if !ok { + return fmt.Sprintf("unknown(%d)", code) + } + return name +} + +func newErrDigestTooSmall(code uint64, got int, min int) error { + return fmt.Errorf("%w: %s digest got %d bytes, minimum %d", ErrDigestTooSmall, getHashName(code), got, min) +} + +func newErrDigestTooLarge(code uint64, got int, max int) error { + return fmt.Errorf("%w: %s digest got %d bytes, maximum %d", ErrDigestTooLarge, getHashName(code), got, max) +} diff --git a/verifcid/cid_test.go b/verifcid/cid_test.go new file mode 100644 index 000000000..738e0ace9 --- /dev/null +++ b/verifcid/cid_test.go @@ -0,0 +1,162 @@ +package verifcid + +import ( + "bytes" + "errors" + "testing" + + "github.com/ipfs/go-cid" + mh "github.com/multiformats/go-multihash" +) + +func TestValidateCid(t *testing.T) { + allowlist := DefaultAllowlist + + tests := []struct { + name string + data []byte + mhType uint64 + wantErr error + }{ + { + name: "identity at max size", + data: bytes.Repeat([]byte("a"), DefaultMaxIdentityDigestSize), // max identity size + mhType: mh.IDENTITY, + wantErr: nil, + }, + { + name: "identity over max size", + data: bytes.Repeat([]byte("b"), DefaultMaxIdentityDigestSize+1), // over max identity size + mhType: mh.IDENTITY, + wantErr: ErrDigestTooLarge, + }, + { + name: "identity at 
64 bytes", + data: bytes.Repeat([]byte("c"), 64), + mhType: mh.IDENTITY, + wantErr: nil, + }, + { + name: "identity with 1KB data", + data: bytes.Repeat([]byte("d"), 1024), + mhType: mh.IDENTITY, + wantErr: ErrDigestTooLarge, + }, + { + name: "small identity", + data: []byte("hello"), + mhType: mh.IDENTITY, + wantErr: nil, + }, + { + name: "empty identity", + data: []byte{}, + mhType: mh.IDENTITY, + wantErr: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + hash, err := mh.Sum(tt.data, tt.mhType, -1) + if err != nil { + t.Fatal(err) + } + c := cid.NewCidV1(cid.Raw, hash) + + err = ValidateCid(allowlist, c) + if tt.wantErr == nil { + if err != nil { + t.Errorf("ValidateCid() error = %v, wantErr nil", err) + } + } else if !errors.Is(err, tt.wantErr) { + t.Errorf("ValidateCid() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } + + t.Run("identity hash extracts correct data", func(t *testing.T) { + // Verify we can extract the original data from identity CID + originalData := []byte("inline data in CID") + hash, err := mh.Sum(originalData, mh.IDENTITY, -1) + if err != nil { + t.Fatal(err) + } + c := cid.NewCidV1(cid.Raw, hash) + + // Validate the CID + err = ValidateCid(allowlist, c) + if err != nil { + t.Errorf("expected valid identity CID, got error: %v", err) + } + + // Extract and verify the data + decoded, err := mh.Decode(c.Hash()) + if err != nil { + t.Fatal(err) + } + if decoded.Code != mh.IDENTITY { + t.Errorf("expected identity hash code, got: %v", decoded.Code) + } + if !bytes.Equal(decoded.Digest, originalData) { + t.Errorf("extracted data doesn't match original") + } + }) + + t.Run("regular hash still respects minimum", func(t *testing.T) { + // Create a SHA256 hash with less than MinDigestSize bytes (truncated) + data := []byte("test data") + fullHash, err := mh.Sum(data, mh.SHA2_256, -1) + if err != nil { + t.Fatal(err) + } + + // Manually create a truncated hash (DefaultMinDigestSize-1 bytes) + decoded, err := 
mh.Decode(fullHash) + if err != nil { + t.Fatal(err) + } + truncatedDigest := decoded.Digest[:DefaultMinDigestSize-1] // less than minimum + truncatedHash, err := mh.Encode(truncatedDigest, mh.SHA2_256) + if err != nil { + t.Fatal(err) + } + + c := cid.NewCidV1(cid.Raw, truncatedHash) + + err = ValidateCid(allowlist, c) + if !errors.Is(err, ErrDigestTooSmall) { + t.Errorf("expected ErrDigestTooSmall for truncated SHA256, got: %v", err) + } + }) + + t.Run("identity hash exempt from minimum size", func(t *testing.T) { + // Test that identity hashes below MinDigestSize are still valid + // This confirms identity hashes are exempt from the minimum size requirement + smallData := []byte("tiny") // Only 4 bytes, well below DefaultMinDigestSize + hash, err := mh.Sum(smallData, mh.IDENTITY, -1) + if err != nil { + t.Fatal(err) + } + c := cid.NewCidV1(cid.Raw, hash) + + // Should pass validation despite being below MinDigestSize + err = ValidateCid(allowlist, c) + if err != nil { + t.Errorf("identity CID with %d bytes should be valid (exempt from minimum), got error: %v", len(smallData), err) + } + + // Also test with exactly DefaultMinDigestSize-1 bytes (less than normal minimum) + dataAt19Bytes := bytes.Repeat([]byte("x"), DefaultMinDigestSize-1) + hash19, err := mh.Sum(dataAt19Bytes, mh.IDENTITY, -1) + if err != nil { + t.Fatal(err) + } + c19 := cid.NewCidV1(cid.Raw, hash19) + + err = ValidateCid(allowlist, c19) + if err != nil { + t.Errorf("identity CID with %d bytes should be valid (exempt from minimum), got error: %v", DefaultMinDigestSize-1, err) + } + }) +}