Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/car/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ COMMANDS:
create, c Create a car file
debug debug a car file
detach-index Detach an index to a detached file
extract, x Extract the contents of a car when the car encodes UnixFS data
extract, x Extract the contents of a car when the car encodes UnixFS or Git data
filter, f Filter the CIDs in a car
get-block, gb Get a block out of a car
get-dag, gd Get a dag out of a car
Expand Down
2 changes: 1 addition & 1 deletion cmd/car/car.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func main1() int {
{
Name: "extract",
Aliases: []string{"x"},
Usage: "Extract the contents of a car when the car encodes UnixFS data",
Usage: "Extract the contents of a car when the car encodes UnixFS or Git data",
Action: ExtractCar,
ArgsUsage: "[output directory|-]",
Flags: []cli.Flag{
Expand Down
1 change: 1 addition & 0 deletions cmd/car/extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"sync"

"github.com/ipfs/go-cid"
_ "github.com/ipfs/go-ipld-git"
"github.com/ipld/go-car/cmd/car/lib"
"github.com/ipld/go-car/v2"
carstorage "github.com/ipld/go-car/v2/storage"
Expand Down
1 change: 1 addition & 0 deletions cmd/car/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"io"
"os"

_ "github.com/ipfs/go-ipld-git"
dagpb "github.com/ipld/go-codec-dagpb"
"github.com/ipld/go-ipld-prime"
_ "github.com/ipld/go-ipld-prime/codec/cbor"
Expand Down
198 changes: 198 additions & 0 deletions cmd/car/lib/extract.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package lib

import (
"bytes"
"context"
"errors"
"fmt"
Expand Down Expand Up @@ -54,6 +55,22 @@ func ExtractToDir(c context.Context, ls *ipld.LinkSystem, root cid.Cid, outputDi
return 0, nil
}

if root.Prefix().Codec == cid.GitRaw {
var outputResolvedDir string
var err error
if outputDir != "-" {
outputResolvedDir, err = filepath.EvalSymlinks(outputDir)
if err != nil {
return 0, err
}
}
var blobName string
if outputDir != "-" {
blobName = filepath.Join(outputResolvedDir, "unknown")
}
return extractGitAnyNode(c, ls, cidlink.Link{Cid: root}, outputResolvedDir, "/", blobName, path, verbose, logger)
}

pbn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: root}, dagpb.Type.PBNode)
if err != nil {
return 0, err
Expand Down Expand Up @@ -299,3 +316,184 @@ func extractFile(c context.Context, ls *ipld.LinkSystem, n ipld.Node, outputName
_, err = io.Copy(f, nlr)
return err
}

// extractGitDir extracts a git tree node n (a map keyed by entry name whose
// values carry a "hash" link) below outputRoot at the relative outputPath.
//
// When matchPath is non-empty, only the entry named matchPath[0] is followed
// and the remaining path elements are passed on to the recursive call;
// otherwise every entry of the tree is extracted.
//
// An empty outputRoot means output goes to stdout (ExtractToDir leaves it
// empty when the output directory is "-"). No directories are created in
// that mode, and extracting a whole tree without a narrowing matchPath is
// rejected rather than concatenating every file onto stdout.
//
// Entries whose load error reports NotFound() are logged to logger and
// skipped. The returned count is the sum of the counts reported by
// extractGitAnyNode. ErrNotDir is returned when n is not a map.
func extractGitDir(c context.Context, ls *ipld.LinkSystem, n ipld.Node, outputRoot, outputPath string, matchPath []string, verbose bool, logger io.Writer) (int, error) {
	if n.Kind() != ipld.Kind_Map {
		return 0, ErrNotDir
	}

	// Reject "directory to stdout" before touching the filesystem. The
	// original guard only looked at outputPath == "-", which the visible
	// caller never passes (it passes "/" and signals stdout through an
	// empty outputRoot), so both forms are checked here.
	if (outputRoot == "" || outputPath == "-") && len(matchPath) == 0 {
		return 0, fmt.Errorf("cannot extract a directory to stdout, use a path to extract a specific file")
	}

	if outputRoot != "" {
		dirPath, err := resolvePath(outputRoot, outputPath)
		if err != nil {
			return 0, err
		}
		if err := os.MkdirAll(dirPath, 0755); err != nil {
			return 0, err
		}
	}

	// Path elements left over once this level's element has been consumed.
	subPath := matchPath
	if len(matchPath) > 0 {
		subPath = matchPath[1:]
	}

	// extractEntry follows a single tree entry's "hash" link and extracts
	// whatever git object it points at.
	extractEntry := func(name string, entry ipld.Node) (int, error) {
		nextPath := path.Join(outputPath, name)
		var nextRes string
		if outputRoot != "" {
			var err error
			nextRes, err = resolvePath(outputRoot, nextPath)
			if err != nil {
				return 0, err
			}
			if verbose {
				fmt.Fprintf(logger, "%s\n", nextRes)
			}
		}

		hashNode, err := entry.LookupByString("hash")
		if err != nil {
			return 0, err
		}
		link, err := hashNode.AsLink()
		if err != nil {
			return 0, err
		}
		cnt, err := extractGitAnyNode(c, ls, link, outputRoot, nextPath, nextRes, subPath, verbose, logger)
		if err != nil {
			// errors.As also matches a not-found error that has been
			// wrapped somewhere along the way, which a plain type
			// assertion would miss.
			var nf interface{ NotFound() bool }
			if errors.As(err, &nf) && nf.NotFound() {
				fmt.Fprintf(logger, "data for entry not found: %s (skipping...)\n", nextPath)
				return 0, nil
			}
			return 0, err
		}
		return cnt, nil
	}

	if len(matchPath) > 0 {
		val, err := n.LookupByString(matchPath[0])
		if err != nil {
			return 0, err
		}
		return extractEntry(matchPath[0], val)
	}

	var count int
	mi := n.MapIterator()
	for !mi.Done() {
		key, val, err := mi.Next()
		if err != nil {
			return 0, err
		}
		ks, err := key.AsString()
		if err != nil {
			return 0, err
		}
		ecount, err := extractEntry(ks, val)
		if err != nil {
			return 0, err
		}
		count += ecount
	}
	return count, nil
}

// gitRawType reads the leading bytes of the raw block behind lnk and returns
// the git object type named in its header: "blob", "tree", "commit" or "tag".
//
// The block is opened through ls.StorageReadOpener and at most 7 bytes are
// consumed (the length of "commit ", the longest prefix checked). An error is
// returned when the block cannot be opened or read, when it is empty, or when
// it does not start with one of the known "<type> " prefixes.
func gitRawType(ls *ipld.LinkSystem, lnk ipld.Link) (string, error) {
	r, err := ls.StorageReadOpener(ipld.LinkContext{}, lnk)
	if err != nil {
		return "", err
	}
	// Release the underlying resource if the opener handed back a closer.
	if rc, ok := r.(io.Closer); ok {
		defer rc.Close()
	}

	buf := make([]byte, 7) // long enough for "commit " (7 bytes)
	// A single Read may legally return fewer bytes than requested even when
	// more are available, so use ReadFull. A block shorter than the buffer
	// surfaces as ErrUnexpectedEOF and is still matched against the short
	// prefixes below; an empty block (io.EOF) or a real read failure is an
	// error.
	n, err := io.ReadFull(r, buf)
	if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) {
		return "", fmt.Errorf("reading git object header: %w", err)
	}

	raw := buf[:n]
	for _, t := range []string{"blob", "tree", "commit", "tag"} {
		if bytes.HasPrefix(raw, []byte(t+" ")) {
			return t, nil
		}
	}
	return "", fmt.Errorf("unrecognized git object header: %q", raw)
}

// extractGitAnyNode inspects the raw header of the git object behind lnk and
// hands it to the matching extractor:
//
//   - "blob":   written via extractGitBlob to blobPath (an empty blobPath
//     selects stdout); counts as one file.
//   - "tree":   extracted via extractGitDir at dirPath relative to outputRoot.
//   - "commit": its "tree" link is followed and dispatched again.
//   - "tag":    its "object" link is followed and dispatched again.
//
// matchPath, verbose and logger are forwarded unchanged. The returned int is
// the number of files reported by the handler that ran.
func extractGitAnyNode(c context.Context, ls *ipld.LinkSystem, lnk ipld.Link, outputRoot, dirPath, blobPath string, matchPath []string, verbose bool, logger io.Writer) (int, error) {
	kind, err := gitRawType(ls, lnk)
	if err != nil {
		return 0, err
	}

	node, err := ls.Load(ipld.LinkContext{}, lnk, basicnode.Prototype.Any)
	if err != nil {
		return 0, err
	}

	// follow resolves the link stored under field on the loaded node and
	// re-dispatches on its target with the same output parameters.
	follow := func(field string) (int, error) {
		target, lookupErr := node.LookupByString(field)
		if lookupErr != nil {
			return 0, lookupErr
		}
		next, linkErr := target.AsLink()
		if linkErr != nil {
			return 0, linkErr
		}
		return extractGitAnyNode(c, ls, next, outputRoot, dirPath, blobPath, matchPath, verbose, logger)
	}

	switch kind {
	case "tree":
		return extractGitDir(c, ls, node, outputRoot, dirPath, matchPath, verbose, logger)
	case "commit":
		return follow("tree")
	case "tag":
		return follow("object")
	case "blob":
		return 1, extractGitBlob(node, blobPath)
	}
	return 0, fmt.Errorf("unrecognized git object type: %s", kind)
}

// extractGitBlob writes the content of the git blob node n to the file
// outputName, or to stdout when outputName is "".
//
// The node's bytes are expected to begin with the git object header
// "blob <size>\0"; everything up to and including the first NUL byte is
// dropped before writing. An error is returned if n is not a bytes node, if
// no NUL byte is present, or if writing (including the final close of the
// output file) fails.
func extractGitBlob(n ipld.Node, outputName string) error {
	b, err := n.AsBytes()
	if err != nil {
		return err
	}
	// Strip git object header: "blob <size>\0"
	nullIdx := bytes.IndexByte(b, 0)
	if nullIdx < 0 {
		return fmt.Errorf("invalid git blob: missing null byte in header")
	}
	content := b[nullIdx+1:]

	if outputName == "" {
		_, err = os.Stdout.Write(content)
		return err
	}
	// os.WriteFile creates or truncates the file like os.Create (mode 0666
	// before umask) and, unlike a deferred Close, reports a failure that
	// only shows up when the file is closed.
	return os.WriteFile(outputName, content, 0o666)
}
Binary file added cmd/car/testdata/inputs/sample-git-commit.car
Binary file not shown.
Binary file added cmd/car/testdata/inputs/sample-git-raw.car
Binary file not shown.
Binary file added cmd/car/testdata/inputs/sample-git-tag-blob.car
Binary file not shown.
Binary file added cmd/car/testdata/inputs/sample-git-tag.car
Binary file not shown.
46 changes: 46 additions & 0 deletions cmd/car/testdata/script/extract.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,52 @@ stderr -count=1 '^data for entry not found: /zimdump_version \(skipping\.\.\.\)$
stderr -count=1 '^data for entry not found: /favicon.ico \(skipping\.\.\.\)$'
stderr -count=1 '^data for entry not found: /index.html \(skipping\.\.\.\)$'

# git-raw CAR extract
mkdir actual-git-raw
car extract -f ${INPUTS}/sample-git-raw.car actual-git-raw
stderr '^extracted 1 file\(s\)$'
cmp actual-git-raw/baz expected-git-raw/baz

# git-raw CAR rooted at a commit: extracts the commit's tree
mkdir actual-git-commit
car extract -f ${INPUTS}/sample-git-commit.car actual-git-commit
stderr '^extracted 5 file\(s\)$'
cmp actual-git-commit/file expected-git-commit/file
cmp actual-git-commit/dir/f1 expected-git-commit/dir/f1
cmp actual-git-commit/dir/f4 expected-git-commit/dir/f4
cmp actual-git-commit/dir/subdir/f2 expected-git-commit/dir/subdir/f2
cmp actual-git-commit/dir2/f3 expected-git-commit/dir2/f3

# git-raw CAR rooted at a tag pointing to a commit: follows tag then extracts tree
mkdir actual-git-tag
car extract -f ${INPUTS}/sample-git-tag.car actual-git-tag
stderr '^extracted 5 file\(s\)$'
cmp actual-git-tag/file expected-git-commit/file
cmp actual-git-tag/dir/f1 expected-git-commit/dir/f1
cmp actual-git-tag/dir/f4 expected-git-commit/dir/f4
cmp actual-git-tag/dir/subdir/f2 expected-git-commit/dir/subdir/f2
cmp actual-git-tag/dir2/f3 expected-git-commit/dir2/f3

# git-raw CAR rooted at a tag pointing to a blob: extracts the blob as 'unknown'
mkdir actual-git-tag-blob
car extract -f ${INPUTS}/sample-git-tag-blob.car actual-git-tag-blob
stderr '^extracted 1 file\(s\)$'
cmp actual-git-tag-blob/unknown expected-git-tag-blob/unknown

-- expected-git-raw/baz --
hello
-- expected-git-commit/file --
Hello world
-- expected-git-commit/dir/f1 --
qwerty
-- expected-git-commit/dir/f4 --
;qjkxb
-- expected-git-commit/dir/subdir/f2 --
123456
-- expected-git-commit/dir2/f3 --
',.pyf
-- expected-git-tag-blob/unknown --
fgcrl
-- expected/a/1/A.txt --
a1A
-- expected/a/2/B.txt --
Expand Down
8 changes: 8 additions & 0 deletions cmd/car/testdata/script/get-dag-git-raw.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
env TREE_CID='baf4bcfe6bkemw5kpijnvfjzm575jntlvrqnfhky'
env BLOB_CID='baf4bcfgoae3ckaylvdn2sbxxk2lh7hu4uokemsq'
car get-dag ${INPUTS}/sample-git-raw.car ${TREE_CID} out.car
! stderr .
car list out.car
! stderr .
stdout ${TREE_CID}
stdout ${BLOB_CID}
1 change: 1 addition & 0 deletions cmd/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/ipfs/go-block-format v0.2.3
github.com/ipfs/go-cid v0.6.0
github.com/ipfs/go-ipld-format v0.6.3
github.com/ipfs/go-ipld-git v0.1.1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the main reason this PR is going to be a problem. Nobody cares for that package, it's got accumulated dependabot PRs that don't get merged, some of which have CVEs associated with them (not that the package is actually impacted but it becomes a transitive problem). It gets updated by web3-bot just because it's signed up for getting GitHub actions and Go version bumps, but nothing beyond that.

https://github.com/ipfs/go-ipld-git/commits/master/

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From ipld/ipld#15 (comment):

go-cid moved to multiformats. Otherwise, the focus has been on switching to go-ipld-prime, the successor to many of these libraries.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to be fair, this is only on the cmd binary and not on the library itself so i am slightly less scared about the inclusion?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand the maintenance issue.
Re: "switching to go-ipld-prime", was that not done by ipfs/go-ipld-git#46 ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your call @willscott , if you're comfortable with this, I'm not a big fan because it expands the maintenance burden for people doing this out of casual interest. I'd be keen on this if there was an active team that cared about this.

github.com/ipfs/go-unixfsnode v1.10.3
github.com/ipld/go-car v0.6.3
github.com/ipld/go-car/v2 v2.16.0
Expand Down
Loading
Loading