diff --git a/README.md b/README.md index b6854e0d2..d682460c2 100644 --- a/README.md +++ b/README.md @@ -341,6 +341,15 @@ If you wish, you can use multiple flags, like below. It will download objects th Using a combination of `--include` and `--exclude` also possible. The command below will only sync objects that end with `.log` or `.txt` but exclude those that start with `access_`. For example, `request.log`, and `license.txt` will be included, while `access_log.txt`, and `readme.md` are excluded. s5cmd sync --include "*.log" --exclude "access_*" --include "*.txt" 's3://bucket/logs/*' . + +#### Listing objects with pagination + +`s5cmd` supports the `--start-after` flag for the `ls` command to list objects starting after a specific key. This is useful for pagination and resuming listings from a specific point: + + s5cmd ls --start-after "logs/2024/03/file100.txt" 's3://bucket/logs/*' + +This will list all objects under `logs/` whose keys sort after `logs/2024/03/file100.txt` in S3's lexicographic (UTF-8 byte-order) key ordering; the given key itself is not included. + #### Select JSON object content using SQL `s5cmd` supports the `SelectObjectContent` S3 operation, and will run your diff --git a/command/ls.go b/command/ls.go index 085b8ae18..21001e834 100644 --- a/command/ls.go +++ b/command/ls.go @@ -58,6 +58,9 @@ Examples: 11. List all files with their fullpaths > s5cmd {{.HelpName}} --show-fullpath "s3://bucket/*" + 12. 
List objects starting after a specific key + > s5cmd {{.HelpName}} --start-after "prefix/object5.txt" "s3://bucket/prefix/*" + ` func NewListCommand() *cli.Command { @@ -94,6 +97,10 @@ func NewListCommand() *cli.Command { Name: "show-fullpath", Usage: "shows only the fullpath names of the object(s)", }, + &cli.StringFlag{ + Name: "start-after", + Usage: "start listing after this specified key", + }, }, Before: func(c *cli.Context) error { err := validateLSCommand(c) @@ -115,7 +122,8 @@ func NewListCommand() *cli.Command { fullCommand := commandFromContext(c) srcurl, err := url.New(c.Args().First(), - url.WithAllVersions(c.Bool("all-versions"))) + url.WithAllVersions(c.Bool("all-versions")), + url.WithStartAfter(c.String("start-after"))) if err != nil { printError(fullCommand, c.Command.Name, err) return err diff --git a/e2e/ls_test.go b/e2e/ls_test.go index 9f3f09dfe..028afb725 100644 --- a/e2e/ls_test.go +++ b/e2e/ls_test.go @@ -802,3 +802,137 @@ func TestEmptyBucket(t *testing.T) { assertLines(t, result.Stdout(), nil) } + +// ls --start-after +func TestListObjectsWithStartAfter(t *testing.T) { + // Skip if using gofakes3 backend (doesn't support StartAfter) + if !isEndpointFromEnv() { + t.Skip("gofakes3 backend doesn't support StartAfter parameter - run with S5CMD_TEST_ENDPOINT_URL to test against real S3") + } + t.Parallel() + + s3client, s5cmd := setup(t) + + bucket := s3BucketFromTestName(t) + createBucket(t, s3client, bucket) + + // create multiple files + putFile(t, s3client, bucket, "a.txt", "content of a") + putFile(t, s3client, bucket, "b.txt", "content of b") + putFile(t, s3client, bucket, "c.txt", "content of c") + putFile(t, s3client, bucket, "d.txt", "content of d") + putFile(t, s3client, bucket, "e.txt", "content of e") + + // list objects starting after "b.txt" + cmd := s5cmd("ls", "--start-after", "b.txt", "s3://"+bucket+"/") + result := icmd.RunCmd(cmd) + + result.Assert(t, icmd.Success) + + // should only see c.txt, d.txt, and e.txt (b.txt and 
earlier should not appear) + assertLines(t, result.Stdout(), map[int]compareFunc{ + 0: suffix("c.txt"), + 1: suffix("d.txt"), + 2: suffix("e.txt"), + }, strictLineCheck(true), trimMatch(dateRe)) +} + +// ls --start-after with prefix +func TestListObjectsWithStartAfterAndPrefix(t *testing.T) { + // Skip if using gofakes3 backend (doesn't support StartAfter) + if !isEndpointFromEnv() { + t.Skip("gofakes3 backend doesn't support StartAfter parameter - run with S5CMD_TEST_ENDPOINT_URL to test against real S3") + } + t.Parallel() + + s3client, s5cmd := setup(t) + + bucket := s3BucketFromTestName(t) + createBucket(t, s3client, bucket) + + // create multiple files with prefix + putFile(t, s3client, bucket, "logs/2024/file1.txt", "content 1") + putFile(t, s3client, bucket, "logs/2024/file2.txt", "content 2") + putFile(t, s3client, bucket, "logs/2024/file3.txt", "content 3") + putFile(t, s3client, bucket, "logs/2024/file4.txt", "content 4") + + // list objects starting after "logs/2024/file2.txt" + cmd := s5cmd("ls", "--start-after", "logs/2024/file2.txt", "s3://"+bucket+"/logs/2024/") + result := icmd.RunCmd(cmd) + + result.Assert(t, icmd.Success) + + // should only see file3.txt and file4.txt + assertLines(t, result.Stdout(), map[int]compareFunc{ + 0: suffix("file3.txt"), + 1: suffix("file4.txt"), + }, strictLineCheck(true), trimMatch(dateRe)) +} + +// ls --start-after with wildcard filter +func TestListObjectsWithStartAfterAndWildcard(t *testing.T) { + // Skip if using gofakes3 backend (doesn't support StartAfter) + if !isEndpointFromEnv() { + t.Skip("gofakes3 backend doesn't support StartAfter parameter - run with S5CMD_TEST_ENDPOINT_URL to test against real S3") + } + t.Parallel() + + s3client, s5cmd := setup(t) + + bucket := s3BucketFromTestName(t) + createBucket(t, s3client, bucket) + + // create multiple files with prefix + putFile(t, s3client, bucket, "logs/2024/file1.txt", "content 1") + putFile(t, s3client, bucket, "logs/2024/file2.txt", "content 2") + putFile(t, 
s3client, bucket, "logs/2024/file3.txt", "content 3") + putFile(t, s3client, bucket, "logs/2024/file4.txt", "content 4") + putFile(t, s3client, bucket, "logs/2024/file5.log", "content 5") + + // list objects with wildcard starting after "logs/2024/file2.txt" + cmd := s5cmd("ls", "--start-after", "logs/2024/file2.txt", "s3://"+bucket+"/logs/2024/*") + result := icmd.RunCmd(cmd) + + result.Assert(t, icmd.Success) + + // should only see file3.txt, file4.txt, and file5.log (after file2.txt) + assertLines(t, result.Stdout(), map[int]compareFunc{ + 0: suffix("file3.txt"), + 1: suffix("file4.txt"), + 2: suffix("file5.log"), + }, strictLineCheck(true), trimMatch(dateRe)) +} + +// ls --start-after with wildcard filter matching specific extension +func TestListObjectsWithStartAfterAndWildcardExtension(t *testing.T) { + // Skip if using gofakes3 backend (doesn't support StartAfter) + if !isEndpointFromEnv() { + t.Skip("gofakes3 backend doesn't support StartAfter parameter - run with S5CMD_TEST_ENDPOINT_URL to test against real S3") + } + t.Parallel() + + s3client, s5cmd := setup(t) + + bucket := s3BucketFromTestName(t) + createBucket(t, s3client, bucket) + + // create multiple files with prefix + putFile(t, s3client, bucket, "logs/2024/file1.txt", "content 1") + putFile(t, s3client, bucket, "logs/2024/file2.txt", "content 2") + putFile(t, s3client, bucket, "logs/2024/file3.txt", "content 3") + putFile(t, s3client, bucket, "logs/2024/file4.txt", "content 4") + putFile(t, s3client, bucket, "logs/2024/file5.log", "content 5") + + // list only .txt files with wildcard starting after "logs/2024/file2.txt" + cmd := s5cmd("ls", "--start-after", "logs/2024/file2.txt", "s3://"+bucket+"/logs/2024/*.txt") + result := icmd.RunCmd(cmd) + + result.Assert(t, icmd.Success) + + // should only see file3.txt and file4.txt (file5.log should be filtered out by wildcard) + assertLines(t, result.Stdout(), map[int]compareFunc{ + 0: suffix("file3.txt"), + 1: suffix("file4.txt"), + }, 
strictLineCheck(true), trimMatch(dateRe)) +} + diff --git a/storage/s3.go b/storage/s3.go index 3313be66e..2f1130195 100644 --- a/storage/s3.go +++ b/storage/s3.go @@ -306,6 +306,10 @@ func (s *S3) listObjectsV2(ctx context.Context, url *url.URL) <-chan *Object { listInput.SetDelimiter(url.Delimiter) } + if url.StartAfter != "" { + listInput.SetStartAfter(url.StartAfter) + } + objCh := make(chan *Object) go func() { diff --git a/storage/url/url.go b/storage/url/url.go index a9489b7e2..fb52de852 100644 --- a/storage/url/url.go +++ b/storage/url/url.go @@ -48,6 +48,7 @@ type URL struct { Prefix string VersionID string AllVersions bool + StartAfter string relativePath string filter string @@ -75,6 +76,12 @@ func WithAllVersions(isAllVersions bool) Option { } } +func WithStartAfter(startAfter string) Option { + return func(u *URL) { + u.StartAfter = startAfter + } +} + // New creates a new URL from given path string. func New(s string, opts ...Option) (*URL, error) { scheme, rest, isFound := strings.Cut(s, "://")