From 41d97837ab1e5a35fdcfd7f6af9b5d56af62e92a Mon Sep 17 00:00:00 2001 From: Phillip Wood Date: Mon, 28 Jul 2025 22:05:19 +0300 Subject: [PATCH 01/10] xdiff: refactor xdl_hash_record() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inline the check for whitespace flags so that the compiler can hoist it out of the loop in xdl_prepare_ctx(). This improves the performance by 8%. $ hyperfine --warmup=1 -L rev HEAD,HEAD^ --setup='git checkout {rev} -- :/ && make git' ': {rev}; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0' Benchmark 1: : HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0 Time (mean ± σ): 1.670 s ± 0.044 s [User: 1.473 s, System: 0.196 s] Range (min … max): 1.619 s … 1.754 s 10 runs Benchmark 2: : HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0 Time (mean ± σ): 1.801 s ± 0.021 s [User: 1.605 s, System: 0.192 s] Range (min … max): 1.766 s … 1.831 s 10 runs Summary ': HEAD; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0' ran 1.08 ± 0.03 times faster than ': HEAD^; GIT_CONFIG_GLOBAL=/dev/null ./git log --oneline --shortstat v2.0.0..v2.5.0' Signed-off-by: Phillip Wood Signed-off-by: Junio C Hamano --- xdiff/xutils.c | 7 ++----- xdiff/xutils.h | 10 +++++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/xdiff/xutils.c b/xdiff/xutils.c index 444a108f87c0b6..e070ed649ffcbc 100644 --- a/xdiff/xutils.c +++ b/xdiff/xutils.c @@ -249,7 +249,7 @@ int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags) return 1; } -static unsigned long xdl_hash_record_with_whitespace(char const **data, +unsigned long xdl_hash_record_with_whitespace(char const **data, char const *top, long flags) { unsigned long ha = 5381; char const *ptr = *data; @@ -294,13 +294,10 @@ static unsigned long xdl_hash_record_with_whitespace(char const **data, return ha; } -unsigned long 
xdl_hash_record(char const **data, char const *top, long flags) { +unsigned long xdl_hash_record_verbatim(char const **data, char const *top) { unsigned long ha = 5381; char const *ptr = *data; - if (flags & XDF_WHITESPACE_FLAGS) - return xdl_hash_record_with_whitespace(data, top, flags); - for (; ptr < top && *ptr != '\n'; ptr++) { ha += (ha << 5); ha ^= (unsigned long) *ptr; diff --git a/xdiff/xutils.h b/xdiff/xutils.h index fd0bba94e8b4d2..13f68310472a69 100644 --- a/xdiff/xutils.h +++ b/xdiff/xutils.h @@ -34,7 +34,15 @@ void *xdl_cha_alloc(chastore_t *cha); long xdl_guess_lines(mmfile_t *mf, long sample); int xdl_blankline(const char *line, long size, long flags); int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags); -unsigned long xdl_hash_record(char const **data, char const *top, long flags); +unsigned long xdl_hash_record_verbatim(char const **data, char const *top); +unsigned long xdl_hash_record_with_whitespace(char const **data, char const *top, long flags); +static inline unsigned long xdl_hash_record(char const **data, char const *top, long flags) +{ + if (flags & XDF_WHITESPACE_FLAGS) + return xdl_hash_record_with_whitespace(data, top, flags); + else + return xdl_hash_record_verbatim(data, top); +} unsigned int xdl_hashbits(unsigned int size); int xdl_num_out(char *out, long val); int xdl_emit_hunk_hdr(long s1, long c1, long s2, long c2, From a4bbe8af0b48f9c80ccc2c4619309c4a81c1460a Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Mon, 28 Jul 2025 22:05:20 +0300 Subject: [PATCH 02/10] xdiff: optimize xdl_hash_record_verbatim xdl_hash_record_verbatim uses modified djb2 hash with XOR instead of ADD for combining. The ADD-based variant is used as the basis of the modern ("GNU") symbol lookup scheme in ELF. Glibc dynamic loader received an optimized version of this hash function thanks to Noah Goldstein [1]. 
Switch xdl_hash_record_verbatim to additive hashing and implement an optimized loop following the scheme suggested by Noah. Timing 'git log --oneline --shortstat v2.0.0..v2.5.0' under perf, I got version | cycles, bn | instructions, bn --------------------------------------- A 6.38 11.3 B 6.21 10.89 C 5.80 9.95 D 5.83 8.74 --------------------------------------- A: baseline (git master at e4ef0485fd78) B: plus 'xdiff: refactor xdl_hash_record()' C: and plus this patch D: with 'xdiff: use xxhash' by Phillip Wood The resulting speedup for xdl_hash_record_verbatim itself is about 1.5x. [1] https://inbox.sourceware.org/libc-alpha/20220519221803.57957-6-goldstein.w.n@gmail.com/ Signed-off-by: Alexander Monakov Signed-off-by: Junio C Hamano --- xdiff/xutils.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/xdiff/xutils.c b/xdiff/xutils.c index e070ed649ffcbc..78d1cf74b1cc48 100644 --- a/xdiff/xutils.c +++ b/xdiff/xutils.c @@ -294,16 +294,67 @@ unsigned long xdl_hash_record_with_whitespace(char const **data, return ha; } +/* + * Compiler reassociation barrier: pretend to modify X and Y to disallow + * changing evaluation order with respect to following uses of X and Y. + */ +#ifdef __GNUC__ +#define REASSOC_FENCE(x, y) __asm__("" : "+r"(x), "+r"(y)) +#else +#define REASSOC_FENCE(x, y) +#endif + unsigned long xdl_hash_record_verbatim(char const **data, char const *top) { - unsigned long ha = 5381; + unsigned long ha = 5381, c0, c1; char const *ptr = *data; - +#if 0 + /* + * The baseline form of the optimized loop below. This is the djb2 + * hash (the above function uses a variant with XOR instead of ADD). + */ for (; ptr < top && *ptr != '\n'; ptr++) { ha += (ha << 5); - ha ^= (unsigned long) *ptr; + ha += (unsigned long) *ptr; } *data = ptr < top ? ptr + 1: ptr; - +#else + /* Process two characters per iteration. 
*/ + if (top - ptr >= 2) do { + if ((c0 = ptr[0]) == '\n') { + *data = ptr + 1; + return ha; + } + if ((c1 = ptr[1]) == '\n') { + *data = ptr + 2; + c0 += ha; + REASSOC_FENCE(c0, ha); + ha = ha * 32 + c0; + return ha; + } + /* + * Combine characters C0 and C1 into the hash HA. We have + * HA = (HA * 33 + C0) * 33 + C1, and we want to ensure + * that dependency chain over HA is just one multiplication + * and one addition, i.e. we want to evaluate this as + * HA = HA * 33 * 33 + (C0 * 33 + C1), and likewise prefer + * (C0 * 32 + (C0 + C1)) for the expression in parenthesis. + */ + ha *= 33 * 33; + c1 += c0; + REASSOC_FENCE(c1, c0); + c1 += c0 * 32; + REASSOC_FENCE(c1, ha); + ha += c1; + + ptr += 2; + } while (ptr < top - 1); + *data = top; + if (ptr < top && (c0 = ptr[0]) != '\n') { + c0 += ha; + REASSOC_FENCE(c0, ha); + ha = ha * 32 + c0; + } +#endif return ha; } From febb9d87dfea2f9e3e4c438d025506a5f18f6198 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 25 Aug 2025 12:49:56 +0000 Subject: [PATCH 03/10] path-walk: fix setup of pending objects Users reported an issue where objects were missing from their local repositories after a full repack using 'git repack -adf --path-walk'. This was alarming and took a while to create a reproducer. Here, we fix the bug and include a test case that would fail without this fix. The root cause is that certain objects existed in the index and had no second versions. These objects are usually blobs, though trees can be included if a cache-tree exists. The issue is that the revision walk adds these objects to the "pending" list and the path-walk API forgets to mark the lists it creates at this point as "maybe_interesting". If these paths only ever have a single version in the history of the repo (including the current staged version) then the parent directory never tries to add a new object to the list and mark the list as "maybe_interesting". 
Thus, when walking the list later, the group is skipped as it is expected that no objects are interesting. This happens even when there are actually no UNINTERESTING objects at all! This is based on the optimization enabled by the pack.useSparse=true config option, which is the default. Thus, we create a test case that demonstrates the many cases of this issue for reproducibility: 1. File a/b/c has only one committed version. 2. Files a/i and x/y only exist as staged changes. 3. Tree x/ only exists in the cache-tree. After performing a non-path-walk repack to force all loose objects into packfiles, run a --path-walk repack followed by 'git fsck'. This fsck is what fails with the following errors: error: invalid object 100644 f2e41136... for 'a/b/c' This is the dropped instance of the single-versioned a/b/c file. broken link from tree cfda31d8... to tree 3f725fcd... This is the missing tree for the single-versioned a/b/ directory. missing blob 0ddf2bae... (a/i) missing blob 975fbec8... (x/y) missing blob a60d869d... (file) missing blob f2e41136... (a/b/c) missing tree 3f725fcd... (a/b/) dangling tree 5896d7e... (staged root tree) Note that since the staged root tree is missing, the fsck output cannot even report that the staged x/ tree is missing as well. The core problem here is that the "maybe_interesting" member of 'struct type_and_oid_list' is not initialized to '1'. This member was added in 6333e7ae0b (path-walk: mark trees and blobs as UNINTERESTING, 2024-12-20) in a way to help when creating packfiles for a small commit range using the sparse path algorithm (enabled by pack.useSparse=true). The idea here is that the list is marked as "maybe_interesting" if an object is added that does not have the UNINTERESTING flag on it. Later, this is checked again in case all objects in the list were marked UNINTERESTING after that point in time. In this case, the algorithm skips the list as there is no reason to visit it. 
This leads to the problem where the "maybe_interesting" member was not appropriately initialized when the list is created from pending objects. Initializing this in the correct places fixes the bug. To reduce risk of similar bugs around initializing this structure, a follow-up change will make initializing lists use a shared method. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- path-walk.c | 2 ++ t/t7700-repack.sh | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/path-walk.c b/path-walk.c index 2d4ddbadd50f78..1215ed398f4fb6 100644 --- a/path-walk.c +++ b/path-walk.c @@ -385,6 +385,7 @@ static int setup_pending_objects(struct path_walk_info *info, list->type = OBJ_TREE; strmap_put(&ctx->paths_to_lists, path, list); } + list->maybe_interesting = 1; oid_array_append(&list->oids, &obj->oid); free(path); } else { @@ -404,6 +405,7 @@ static int setup_pending_objects(struct path_walk_info *info, list->type = OBJ_BLOB; strmap_put(&ctx->paths_to_lists, path, list); } + list->maybe_interesting = 1; oid_array_append(&list->oids, &obj->oid); } else { /* assume a root tree, such as a lightweight tag. */ diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh index 611755cc139b96..73b78bdd887d80 100755 --- a/t/t7700-repack.sh +++ b/t/t7700-repack.sh @@ -838,4 +838,67 @@ test_expect_success '-n overrides repack.updateServerInfo=true' ' test_server_info_missing ' +test_expect_success 'pending objects are repacked appropriately' ' + test_when_finished rm -rf pending && + git init pending && + + ( + cd pending && + + # Commit file, a/b/c and never change them. + mkdir -p a/b && + echo singleton >file && + echo stuff >a/b/c && + echo more >a/d && + git add file a && + git commit -m "single blobs" && + + # Files a/d and a/e will not be singletons. + echo d >a/d && + echo e >a/e && + git add a && + git commit -m "more blobs" && + + # This use of a sparse index helps to force + # test that the cache-tree is walked, too. 
+ git sparse-checkout set --sparse-index a x && + + # Create staged changes: + # * a/e now has multiple versions. + # * a/i now has only one version. + echo f >a/d && + echo h >a/e && + echo i >a/i && + git add a && + + # Stage and unstage a change to make use of + # resolve-undo cache and how that impacts fsck. + mkdir x && + echo y >x/y && + git add x && + xy=$(git rev-parse :x/y) && + git rm --cached x/y && + + # The blob for x/y must persist through repacks, + # but fsck currently ignores the REUC extension + # for finding links to the blob. + cat >expect <<-EOF && + dangling blob $xy + EOF + + # Bring the loose objects into a packfile to avoid + # leftovers in next test. Without this, the loose + # objects persist and the test succeeds for other + # reasons. + git repack -adf && + git fsck >out && + test_cmp expect out && + + # Test path walk version with pack.useSparse. + git -c pack.useSparse=true repack -adf --path-walk && + git fsck >out && + test_cmp expect out + ) +' + test_done From 93afe9b060dedd94c20feb8da553cef0e4301176 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 25 Aug 2025 12:49:57 +0000 Subject: [PATCH 04/10] path-walk: create initializer for path lists The previous change fixed a bug in 'git repack -adf --path-walk' that was due to an update to how path lists are initialized and missing some important cases when processing the pending objects. This change takes the three critical places where path lists are initialized and combines them into a static method. This simplifies the callers somewhat while also helping to avoid a missed update in the future. The other places where a path list (struct type_and_oid_list) is initialized is for the following "fixed" lists: * Tag objects. * Commit objects. * Root trees. * Tagged trees. * Tagged blobs. These lists are created and consumed in different ways, with only the root trees being passed into the logic that cares about the "maybe_interesting" bit. 
It is appropriate to keep these uses separate. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- path-walk.c | 57 +++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/path-walk.c b/path-walk.c index 1215ed398f4fb6..f1ceed99e94ca9 100644 --- a/path-walk.c +++ b/path-walk.c @@ -105,6 +105,24 @@ static void push_to_stack(struct path_walk_context *ctx, prio_queue_put(&ctx->path_stack, xstrdup(path)); } +static void add_path_to_list(struct path_walk_context *ctx, + const char *path, + enum object_type type, + struct object_id *oid, + int interesting) +{ + struct type_and_oid_list *list = strmap_get(&ctx->paths_to_lists, path); + + if (!list) { + CALLOC_ARRAY(list, 1); + list->type = type; + strmap_put(&ctx->paths_to_lists, path, list); + } + + list->maybe_interesting |= interesting; + oid_array_append(&list->oids, oid); +} + static int add_tree_entries(struct path_walk_context *ctx, const char *base_path, struct object_id *oid) @@ -129,7 +147,6 @@ static int add_tree_entries(struct path_walk_context *ctx, init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size); while (tree_entry(&desc, &entry)) { - struct type_and_oid_list *list; struct object *o; /* Not actually true, but we will ignore submodules later. */ enum object_type type = S_ISDIR(entry.mode) ? 
OBJ_TREE : OBJ_BLOB; @@ -190,17 +207,10 @@ static int add_tree_entries(struct path_walk_context *ctx, continue; } - if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) { - CALLOC_ARRAY(list, 1); - list->type = type; - strmap_put(&ctx->paths_to_lists, path.buf, list); - } - push_to_stack(ctx, path.buf); - - if (!(o->flags & UNINTERESTING)) - list->maybe_interesting = 1; + add_path_to_list(ctx, path.buf, type, &entry.oid, + !(o->flags & UNINTERESTING)); - oid_array_append(&list->oids, &entry.oid); + push_to_stack(ctx, path.buf); } free_tree_buffer(tree); @@ -377,16 +387,9 @@ static int setup_pending_objects(struct path_walk_info *info, if (!info->trees) continue; if (pending->path) { - struct type_and_oid_list *list; char *path = *pending->path ? xstrfmt("%s/", pending->path) : xstrdup(""); - if (!(list = strmap_get(&ctx->paths_to_lists, path))) { - CALLOC_ARRAY(list, 1); - list->type = OBJ_TREE; - strmap_put(&ctx->paths_to_lists, path, list); - } - list->maybe_interesting = 1; - oid_array_append(&list->oids, &obj->oid); + add_path_to_list(ctx, path, OBJ_TREE, &obj->oid, 1); free(path); } else { /* assume a root tree, such as a lightweight tag. */ @@ -397,20 +400,10 @@ static int setup_pending_objects(struct path_walk_info *info, case OBJ_BLOB: if (!info->blobs) continue; - if (pending->path) { - struct type_and_oid_list *list; - char *path = pending->path; - if (!(list = strmap_get(&ctx->paths_to_lists, path))) { - CALLOC_ARRAY(list, 1); - list->type = OBJ_BLOB; - strmap_put(&ctx->paths_to_lists, path, list); - } - list->maybe_interesting = 1; - oid_array_append(&list->oids, &obj->oid); - } else { - /* assume a root tree, such as a lightweight tag. 
*/ + if (pending->path) + add_path_to_list(ctx, pending->path, OBJ_BLOB, &obj->oid, 1); + else oid_array_append(&tagged_blobs->oids, &obj->oid); - } break; case OBJ_COMMIT: From 0eeacde50e71cc320016f0bcf9f8b17d5168cbfd Mon Sep 17 00:00:00 2001 From: David Aguilar Date: Tue, 26 Aug 2025 16:35:25 -0700 Subject: [PATCH 05/10] Makefile: build libgit-rs and libgit-sys serially "make -jN" with INCLUDE_LIBGIT_RS enabled causes cargo lock warnings and can trigger ld errors during the build. The build errors are caused by two inner "make" invocations getting triggered concurrently: one inside libgit-sys and another inside libgit-rs. Make libgit-rs depend on libgit-sys so that "make" prevents them from running concurrently. Apply the same logic to the test invocations. Use cargo's "--manifest-path" option instead of "cd" in the recipes. Signed-off-by: David Aguilar Acked-by: Kyle Lippincott Signed-off-by: Junio C Hamano --- Makefile | 11 +++++------ t/Makefile | 14 ++++---------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 70d1543b6b8688..13ac35a151c6a6 100644 --- a/Makefile +++ b/Makefile @@ -3946,13 +3946,12 @@ unit-tests: $(UNIT_TEST_PROGS) $(CLAR_TEST_PROG) t/helper/test-tool$X $(MAKE) -C t/ unit-tests .PHONY: libgit-sys libgit-rs -libgit-sys libgit-rs: - $(QUIET)(\ - cd contrib/$@ && \ - cargo build \ - ) +libgit-sys: + $(QUIET)cargo build --manifest-path contrib/libgit-sys/Cargo.toml +libgit-rs: libgit-sys + $(QUIET)cargo build --manifest-path contrib/libgit-rs/Cargo.toml ifdef INCLUDE_LIBGIT_RS -all:: libgit-sys libgit-rs +all:: libgit-rs endif LIBGIT_PUB_OBJS += contrib/libgit-sys/public_symbol_export.o diff --git a/t/Makefile b/t/Makefile index 791e0a097893e9..29dd226c7dcd90 100644 --- a/t/Makefile +++ b/t/Makefile @@ -190,15 +190,9 @@ perf: .PHONY: libgit-sys-test libgit-rs-test libgit-sys-test: - $(QUIET)(\ - cd ../contrib/libgit-sys && \ - cargo test \ - ) -libgit-rs-test: - $(QUIET)(\ - cd ../contrib/libgit-rs 
&& \ - cargo test \ - ) + $(QUIET)cargo test --manifest-path ../contrib/libgit-sys/Cargo.toml +libgit-rs-test: libgit-sys-test + $(QUIET)cargo test --manifest-path ../contrib/libgit-rs/Cargo.toml ifdef INCLUDE_LIBGIT_RS -all:: libgit-sys-test libgit-rs-test +all:: libgit-rs-test endif From 681f26bccc017371ae6ee20db55e3edb52420a25 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 15 Aug 2025 16:12:53 +0000 Subject: [PATCH 06/10] ls-files: conditionally leave index sparse When running 'git ls-files' with a pathspec, the index entries get filtered according to that pathspec before iterating over them in show_files(). In 78087097b8 (ls-files: add --sparse option, 2021-12-22), this iteration was prefixed with a check for the '--sparse' option which allows the command to output directory entries; this created a pre-loop call to ensure_full_index(). However, when a user runs 'git ls-files' where the pathspec matches directories that are recursively matched in the sparse-checkout, there are not any sparse directories that match the pathspec so they would not be written to the output. The expansion in this case is just a performance drop for no behavior difference. Replace this global check to expand the index with a check inside the loop for a matched sparse directory. If we see one, then expand the index and continue from the current location. This is safe since the previous entries in the index did not have any sparse directories and thus would remain stable in this expansion. A test in t1092 confirms that this changes the behavior. 
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/ls-files.c | 13 ++++++++++--- t/t1092-sparse-checkout-compatibility.sh | 13 +++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/builtin/ls-files.c b/builtin/ls-files.c index be74f0a03b20f3..c20394bcb76d37 100644 --- a/builtin/ls-files.c +++ b/builtin/ls-files.c @@ -413,14 +413,21 @@ static void show_files(struct repository *repo, struct dir_struct *dir) if (!(show_cached || show_stage || show_deleted || show_modified)) return; - if (!show_sparse_dirs) - ensure_full_index(repo->index); - for (i = 0; i < repo->index->cache_nr; i++) { const struct cache_entry *ce = repo->index->cache[i]; struct stat st; int stat_err; + if (S_ISSPARSEDIR(ce->ce_mode) && !show_sparse_dirs) { + /* + * This is the first time we've hit a sparse dir, + * so expansion will leave the first 'i' entries + * alone. + */ + ensure_full_index(repo->index); + ce = repo->index->cache[i]; + } + construct_fullname(&fullname, repo, ce); if ((dir->flags & DIR_SHOW_IGNORED) && diff --git a/t/t1092-sparse-checkout-compatibility.sh b/t/t1092-sparse-checkout-compatibility.sh index d8101139b40aa0..b0f691c151a7d0 100755 --- a/t/t1092-sparse-checkout-compatibility.sh +++ b/t/t1092-sparse-checkout-compatibility.sh @@ -1506,6 +1506,8 @@ test_expect_success 'sparse-index is not expanded' ' ensure_not_expanded reset --hard && ensure_not_expanded restore -s rename-out-to-out -- deep/deeper1 && + ensure_not_expanded ls-files deep/deeper1 && + echo >>sparse-index/README.md && ensure_not_expanded add -A && echo >>sparse-index/extra.txt && @@ -1607,6 +1609,17 @@ test_expect_success 'describe tested on all' ' test_all_match git describe --dirty ' +test_expect_success 'ls-files filtering and expansion' ' + init_repos && + + # This filtering will hit a sparse directory midway + # through the iteration. + test_all_match git ls-files deep && + + # This pathspec will filter the index to only a sparse + # directory. 
+ test_all_match git ls-files folder1 +' test_expect_success 'sparse-index is not expanded: describe' ' init_repos && From 32f74582bc298621a05ab5733810ff0300b69715 Mon Sep 17 00:00:00 2001 From: Toon Claes Date: Tue, 5 Aug 2025 11:33:56 +0200 Subject: [PATCH 07/10] last-modified: new subcommand to show when files were last modified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to git-blame(1), introduce a new subcommand git-last-modified(1). This command shows the most recent modification to paths in a tree. It does so by expanding the tree at a given commit, taking note of the current state of each path, and then walking backwards through history looking for commits where each path changed into its final commit ID. Based-on-patch-by: Jeff King Improved-by: Ævar Arnfjörð Bjarmason Signed-off-by: Toon Claes Signed-off-by: Junio C Hamano --- .gitignore | 1 + Documentation/git-last-modified.adoc | 54 +++++ Documentation/meson.build | 1 + Makefile | 1 + builtin.h | 1 + builtin/last-modified.c | 281 +++++++++++++++++++++++++++ command-list.txt | 1 + git.c | 1 + meson.build | 1 + t/meson.build | 1 + t/t8020-last-modified.sh | 210 ++++++++++++++++++++ 11 files changed, 553 insertions(+) create mode 100644 Documentation/git-last-modified.adoc create mode 100644 builtin/last-modified.c create mode 100755 t/t8020-last-modified.sh diff --git a/.gitignore b/.gitignore index 04c444404e4ba8..a36ee944433574 100644 --- a/.gitignore +++ b/.gitignore @@ -87,6 +87,7 @@ /git-init-db /git-interpret-trailers /git-instaweb +/git-last-modified /git-log /git-ls-files /git-ls-remote diff --git a/Documentation/git-last-modified.adoc b/Documentation/git-last-modified.adoc new file mode 100644 index 00000000000000..602843e09598a5 --- /dev/null +++ b/Documentation/git-last-modified.adoc @@ -0,0 +1,54 @@ +git-last-modified(1) +==================== + +NAME +---- +git-last-modified - EXPERIMENTAL: Show when files were last modified + + +SYNOPSIS 
+-------- +[synopsis] +git last-modified [--recursive] [--show-trees] [<revision-range>] [[--] <path>...] + +DESCRIPTION +----------- + +Shows which commit last modified each of the relevant files and subdirectories. +A commit renaming a path, or changing its mode is also taken into account. + +THIS COMMAND IS EXPERIMENTAL. THE BEHAVIOR MAY CHANGE. + +OPTIONS +------- + +`-r`:: +`--recursive`:: + Instead of showing tree entries, step into subtrees and show all entries + inside them recursively. + +`-t`:: +`--show-trees`:: + Show tree entries even when recursing into them. It has no effect + without `--recursive`. + +`<revision-range>`:: + Only traverse commits in the specified revision range. When no + `<revision-range>` is specified, it defaults to `HEAD` (i.e. the whole + history leading to the current commit). For a complete list of ways to + spell `<revision-range>`, see the 'Specifying Ranges' section of + linkgit:gitrevisions[7]. + +`[--] <path>...`:: + For each _<path>_ given, the commit which last modified it is returned. + Without an optional path parameter, all files and subdirectories + in the path traversal are included in the output. + +SEE ALSO +-------- +linkgit:git-blame[1], +linkgit:git-log[1]. 
+ +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Documentation/meson.build b/Documentation/meson.build index 4404c623f006db..a8ac5285f0abed 100644 --- a/Documentation/meson.build +++ b/Documentation/meson.build @@ -74,6 +74,7 @@ manpages = { 'git-init.adoc' : 1, 'git-instaweb.adoc' : 1, 'git-interpret-trailers.adoc' : 1, + 'git-last-modified.adoc' : 1, 'git-log.adoc' : 1, 'git-ls-files.adoc' : 1, 'git-ls-remote.adoc' : 1, diff --git a/Makefile b/Makefile index 5f7dd79dfa6ecf..b5ce55a70320dd 100644 --- a/Makefile +++ b/Makefile @@ -1265,6 +1265,7 @@ BUILTIN_OBJS += builtin/hook.o BUILTIN_OBJS += builtin/index-pack.o BUILTIN_OBJS += builtin/init-db.o BUILTIN_OBJS += builtin/interpret-trailers.o +BUILTIN_OBJS += builtin/last-modified.o BUILTIN_OBJS += builtin/log.o BUILTIN_OBJS += builtin/ls-files.o BUILTIN_OBJS += builtin/ls-remote.o diff --git a/builtin.h b/builtin.h index bff13e3069b4af..6ed6759ec4e037 100644 --- a/builtin.h +++ b/builtin.h @@ -176,6 +176,7 @@ int cmd_hook(int argc, const char **argv, const char *prefix, struct repository int cmd_index_pack(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_init_db(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_interpret_trailers(int argc, const char **argv, const char *prefix, struct repository *repo); +int cmd_last_modified(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_log_reflog(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_log(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_ls_files(int argc, const char **argv, const char *prefix, struct repository *repo); diff --git a/builtin/last-modified.c b/builtin/last-modified.c new file mode 100644 index 00000000000000..364493ac694c58 --- /dev/null +++ b/builtin/last-modified.c @@ -0,0 +1,281 @@ +#include "git-compat-util.h" +#include "builtin.h" +#include "commit.h" +#include "config.h" 
+#include "diff.h" +#include "diffcore.h" +#include "environment.h" +#include "hashmap.h" +#include "hex.h" +#include "log-tree.h" +#include "object-name.h" +#include "object.h" +#include "parse-options.h" +#include "quote.h" +#include "repository.h" +#include "revision.h" + +struct last_modified_entry { + struct hashmap_entry hashent; + struct object_id oid; + const char path[FLEX_ARRAY]; +}; + +static int last_modified_entry_hashcmp(const void *unused UNUSED, + const struct hashmap_entry *hent1, + const struct hashmap_entry *hent2, + const void *path) +{ + const struct last_modified_entry *ent1 = + container_of(hent1, const struct last_modified_entry, hashent); + const struct last_modified_entry *ent2 = + container_of(hent2, const struct last_modified_entry, hashent); + return strcmp(ent1->path, path ? path : ent2->path); +} + +struct last_modified { + struct hashmap paths; + struct rev_info rev; + bool recursive; + bool show_trees; +}; + +static void last_modified_release(struct last_modified *lm) +{ + hashmap_clear_and_free(&lm->paths, struct last_modified_entry, hashent); + release_revisions(&lm->rev); +} + +struct last_modified_callback_data { + struct last_modified *lm; + struct commit *commit; +}; + +static void add_path_from_diff(struct diff_queue_struct *q, + struct diff_options *opt UNUSED, void *data) +{ + struct last_modified *lm = data; + + for (int i = 0; i < q->nr; i++) { + struct diff_filepair *p = q->queue[i]; + struct last_modified_entry *ent; + const char *path = p->two->path; + + FLEX_ALLOC_STR(ent, path, path); + oidcpy(&ent->oid, &p->two->oid); + hashmap_entry_init(&ent->hashent, strhash(ent->path)); + hashmap_add(&lm->paths, &ent->hashent); + } +} + +static int populate_paths_from_revs(struct last_modified *lm) +{ + int num_interesting = 0; + struct diff_options diffopt; + + /* + * Create a copy of `struct diff_options`. In this copy a callback is + * set that when called adds entries to `paths` in `struct last_modified`. 
+ * This copy is used to diff the tree of the target revision against an + * empty tree. This results in all paths in the target revision being + * listed. After `paths` is populated, we don't need this copy no more. + */ + memcpy(&diffopt, &lm->rev.diffopt, sizeof(diffopt)); + copy_pathspec(&diffopt.pathspec, &lm->rev.diffopt.pathspec); + diffopt.output_format = DIFF_FORMAT_CALLBACK; + diffopt.format_callback = add_path_from_diff; + diffopt.format_callback_data = lm; + + for (size_t i = 0; i < lm->rev.pending.nr; i++) { + struct object_array_entry *obj = lm->rev.pending.objects + i; + + if (obj->item->flags & UNINTERESTING) + continue; + + if (num_interesting++) + return error(_("last-modified can only operate on one tree at a time")); + + diff_tree_oid(lm->rev.repo->hash_algo->empty_tree, + &obj->item->oid, "", &diffopt); + diff_flush(&diffopt); + } + clear_pathspec(&diffopt.pathspec); + + return 0; +} + +static void last_modified_emit(struct last_modified *lm, + const char *path, const struct commit *commit) + +{ + if (commit->object.flags & BOUNDARY) + putchar('^'); + printf("%s\t", oid_to_hex(&commit->object.oid)); + + if (lm->rev.diffopt.line_termination) + write_name_quoted(path, stdout, '\n'); + else + printf("%s%c", path, '\0'); +} + +static void mark_path(const char *path, const struct object_id *oid, + struct last_modified_callback_data *data) +{ + struct last_modified_entry *ent; + + /* Is it even a path that we are interested in? */ + ent = hashmap_get_entry_from_hash(&data->lm->paths, strhash(path), path, + struct last_modified_entry, hashent); + if (!ent) + return; + + /* + * Is it arriving at a version of interest, or is it from a side branch + * which did not contribute to the final state? 
+ */ + if (!oideq(oid, &ent->oid)) + return; + + last_modified_emit(data->lm, path, data->commit); + + hashmap_remove(&data->lm->paths, &ent->hashent, path); + free(ent); +} + +static void last_modified_diff(struct diff_queue_struct *q, + struct diff_options *opt UNUSED, void *cbdata) +{ + struct last_modified_callback_data *data = cbdata; + + for (int i = 0; i < q->nr; i++) { + struct diff_filepair *p = q->queue[i]; + switch (p->status) { + case DIFF_STATUS_DELETED: + /* + * There's no point in feeding a deletion, as it could + * not have resulted in our current state, which + * actually has the file. + */ + break; + + default: + /* + * Otherwise, we care only that we somehow arrived at + * a final oid state. Note that this covers some + * potentially controversial areas, including: + * + * 1. A rename or copy will be found, as it is the + * first time the content has arrived at the given + * path. + * + * 2. Even a non-content modification like a mode or + * type change will trigger it. + * + * We take the inclusive approach for now, and find + * anything which impacts the path. Options to tweak + * the behavior (e.g., to "--follow" the content across + * renames) can come later. 
+ */ + mark_path(p->two->path, &p->two->oid, data); + break; + } + } +} + +static int last_modified_run(struct last_modified *lm) +{ + struct last_modified_callback_data data = { .lm = lm }; + + lm->rev.diffopt.output_format = DIFF_FORMAT_CALLBACK; + lm->rev.diffopt.format_callback = last_modified_diff; + lm->rev.diffopt.format_callback_data = &data; + + prepare_revision_walk(&lm->rev); + + while (hashmap_get_size(&lm->paths)) { + data.commit = get_revision(&lm->rev); + if (!data.commit) + BUG("paths remaining beyond boundary in last-modified"); + + if (data.commit->object.flags & BOUNDARY) { + diff_tree_oid(lm->rev.repo->hash_algo->empty_tree, + &data.commit->object.oid, "", + &lm->rev.diffopt); + diff_flush(&lm->rev.diffopt); + } else { + log_tree_commit(&lm->rev, data.commit); + } + } + + return 0; +} + +static int last_modified_init(struct last_modified *lm, struct repository *r, + const char *prefix, int argc, const char **argv) +{ + hashmap_init(&lm->paths, last_modified_entry_hashcmp, NULL, 0); + + repo_init_revisions(r, &lm->rev, prefix); + lm->rev.def = "HEAD"; + lm->rev.combine_merges = 1; + lm->rev.show_root_diff = 1; + lm->rev.boundary = 1; + lm->rev.no_commit_id = 1; + lm->rev.diff = 1; + lm->rev.diffopt.flags.recursive = lm->recursive; + lm->rev.diffopt.flags.tree_in_recursive = lm->show_trees; + + argc = setup_revisions(argc, argv, &lm->rev, NULL); + if (argc > 1) { + error(_("unknown last-modified argument: %s"), argv[1]); + return argc; + } + + if (populate_paths_from_revs(lm) < 0) + return error(_("unable to setup last-modified")); + + return 0; +} + +int cmd_last_modified(int argc, const char **argv, const char *prefix, + struct repository *repo) +{ + int ret; + struct last_modified lm = { 0 }; + + const char * const last_modified_usage[] = { + N_("git last-modified [--recursive] [--show-trees] " + "[<revision-range>] [[--] <path>...]"), + NULL + }; + + struct option last_modified_options[] = { + OPT_BOOL('r', "recursive", &lm.recursive, + N_("recurse into
subtrees")), + OPT_BOOL('t', "show-trees", &lm.show_trees, + N_("show tree entries when recursing into subtrees")), + OPT_END() + }; + + argc = parse_options(argc, argv, prefix, last_modified_options, + last_modified_usage, + PARSE_OPT_KEEP_ARGV0 | PARSE_OPT_KEEP_UNKNOWN_OPT); + + repo_config(repo, git_default_config, NULL); + + ret = last_modified_init(&lm, repo, prefix, argc, argv); + if (ret > 0) + usage_with_options(last_modified_usage, + last_modified_options); + if (ret) + goto out; + + ret = last_modified_run(&lm); + if (ret) + goto out; + +out: + last_modified_release(&lm); + + return ret; +} diff --git a/command-list.txt b/command-list.txt index b7ade3ab9f3319..b715777b248a0d 100644 --- a/command-list.txt +++ b/command-list.txt @@ -124,6 +124,7 @@ git-index-pack plumbingmanipulators git-init mainporcelain init git-instaweb ancillaryinterrogators complete git-interpret-trailers purehelpers +git-last-modified plumbinginterrogators git-log mainporcelain info git-ls-files plumbinginterrogators git-ls-remote plumbinginterrogators diff --git a/git.c b/git.c index 07a5fe39fb69f0..76a0b2a1a44d39 100644 --- a/git.c +++ b/git.c @@ -565,6 +565,7 @@ static struct cmd_struct commands[] = { { "init", cmd_init_db }, { "init-db", cmd_init_db }, { "interpret-trailers", cmd_interpret_trailers, RUN_SETUP_GENTLY }, + { "last-modified", cmd_last_modified, RUN_SETUP }, { "log", cmd_log, RUN_SETUP }, { "ls-files", cmd_ls_files, RUN_SETUP }, { "ls-remote", cmd_ls_remote, RUN_SETUP_GENTLY }, diff --git a/meson.build b/meson.build index 9bc1826cb69e9b..77a3416b1c2384 100644 --- a/meson.build +++ b/meson.build @@ -607,6 +607,7 @@ builtin_sources = [ 'builtin/index-pack.c', 'builtin/init-db.c', 'builtin/interpret-trailers.c', + 'builtin/last-modified.c', 'builtin/log.c', 'builtin/ls-files.c', 'builtin/ls-remote.c', diff --git a/t/meson.build b/t/meson.build index 660d780dcc62d6..904455e3ab7fe1 100644 --- a/t/meson.build +++ b/t/meson.build @@ -961,6 +961,7 @@ integration_tests = [ 
't8012-blame-colors.sh', 't8013-blame-ignore-revs.sh', 't8014-blame-ignore-fuzzy.sh', + 't8020-last-modified.sh', 't9001-send-email.sh', 't9002-column.sh', 't9003-help-autocorrect.sh', diff --git a/t/t8020-last-modified.sh b/t/t8020-last-modified.sh new file mode 100755 index 00000000000000..5eb4cef0359212 --- /dev/null +++ b/t/t8020-last-modified.sh @@ -0,0 +1,210 @@ +#!/bin/sh + +test_description='last-modified tests' + +. ./test-lib.sh + +test_expect_success 'setup' ' + test_commit 1 file && + mkdir a && + test_commit 2 a/file && + mkdir a/b && + test_commit 3 a/b/file +' + +test_expect_success 'cannot run last-modified on two trees' ' + test_must_fail git last-modified HEAD HEAD~1 +' + +check_last_modified() { + local indir= && + while test $# != 0 + do + case "$1" in + -C) + indir="$2" + shift + ;; + *) + break + ;; + esac && + shift + done && + + cat >expect && + test_when_finished "rm -f tmp.*" && + git ${indir:+-C "$indir"} last-modified "$@" >tmp.1 && + git name-rev --annotate-stdin --name-only --tags \ + <tmp.1 >tmp.2 && + tr '\t' ' ' <tmp.2 >actual && + test_cmp expect actual +} + +test_expect_success 'last-modified non-recursive' ' + check_last_modified <<-\EOF + 3 a + 1 file + EOF +' + +test_expect_success 'last-modified recursive' ' + check_last_modified -r <<-\EOF + 3 a/b/file + 2 a/file + 1 file + EOF +' + +test_expect_success 'last-modified recursive with show-trees' ' + check_last_modified -r -t <<-\EOF + 3 a + 3 a/b + 3 a/b/file + 2 a/file + 1 file + EOF +' + +test_expect_success 'last-modified non-recursive with show-trees' ' + check_last_modified -t <<-\EOF + 3 a + 1 file + EOF +' + +test_expect_success 'last-modified subdir' ' + check_last_modified a <<-\EOF + 3 a + EOF +' + +test_expect_success 'last-modified subdir recursive' ' + check_last_modified -r a <<-\EOF + 3 a/b/file + 2 a/file + EOF +' + +test_expect_success 'last-modified from non-HEAD commit' ' + check_last_modified HEAD^ <<-\EOF + 2 a + 1 file + EOF +' + +test_expect_success 'last-modified from
subdir defaults to root' ' + check_last_modified -C a <<-\EOF + 3 a + 1 file + EOF +' + +test_expect_success 'last-modified from subdir uses relative pathspecs' ' + check_last_modified -C a -r b <<-\EOF + 3 a/b/file + EOF +' + +test_expect_success 'limit last-modified traversal by count' ' + check_last_modified -1 <<-\EOF + 3 a + ^2 file + EOF +' + +test_expect_success 'limit last-modified traversal by commit' ' + check_last_modified HEAD~2..HEAD <<-\EOF + 3 a + ^1 file + EOF +' + +test_expect_success 'only last-modified files in the current tree' ' + git rm -rf a && + git commit -m "remove a" && + check_last_modified <<-\EOF + 1 file + EOF +' + +test_expect_success 'cross merge boundaries in blaming' ' + git checkout HEAD^0 && + git rm -rf . && + test_commit m1 && + git checkout HEAD^ && + git rm -rf . && + test_commit m2 && + git merge m1 && + check_last_modified <<-\EOF + m2 m2.t + m1 m1.t + EOF +' + +test_expect_success 'last-modified merge for resolved conflicts' ' + git checkout HEAD^0 && + git rm -rf . && + test_commit c1 conflict && + git checkout HEAD^ && + git rm -rf . && + test_commit c2 conflict && + test_must_fail git merge c1 && + test_commit resolved conflict && + check_last_modified conflict <<-\EOF + resolved conflict + EOF +' + + +# Consider `file` with this content through history: +# +# A---B---B-------B---B +# \ / +# C---D +test_expect_success 'last-modified merge ignores content from branch' ' + git checkout HEAD^0 && + git rm -rf . && + test_commit a1 file A && + test_commit a2 file B && + test_commit a3 file C && + test_commit a4 file D && + git checkout a2 && + git merge --no-commit --no-ff a4 && + git checkout a2 -- file && + git merge --continue && + check_last_modified <<-\EOF + a2 file + EOF +' + +# Consider `file` with this content through history: +# +# A---B---B---C---D---B---B +# \ / +# B-------B +test_expect_success 'last-modified merge undoes changes' ' + git checkout HEAD^0 && + git rm -rf . 
&& + test_commit b1 file A && + test_commit b2 file B && + test_commit b3 file C && + test_commit b4 file D && + git checkout b2 && + test_commit b5 file2 2 && + git checkout b4 && + git merge --no-commit --no-ff b5 && + git checkout b2 -- file && + git merge --continue && + check_last_modified <<-\EOF + b5 file2 + b2 file + EOF +' + +test_expect_success 'last-modified complains about unknown arguments' ' + test_must_fail git last-modified --foo 2>err && + grep "unknown last-modified argument: --foo" err +' + +test_done From 97d5301c54152d91a4e47449f759567f83140d4f Mon Sep 17 00:00:00 2001 From: Toon Claes Date: Tue, 5 Aug 2025 11:33:57 +0200 Subject: [PATCH 08/10] t/perf: add last-modified perf script This just runs some simple last-modified commands. We already test correctness in the regular suite, so this is just about finding performance regressions from one version to another. Based-on-patch-by: Jeff King Signed-off-by: Toon Claes Signed-off-by: Junio C Hamano --- t/meson.build | 1 + t/perf/p8020-last-modified.sh | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100755 t/perf/p8020-last-modified.sh diff --git a/t/meson.build b/t/meson.build index 904455e3ab7fe1..b74125b0479394 100644 --- a/t/meson.build +++ b/t/meson.build @@ -1155,6 +1155,7 @@ benchmarks = [ 'perf/p7820-grep-engines.sh', 'perf/p7821-grep-engines-fixed.sh', 'perf/p7822-grep-perl-character.sh', + 'perf/p8020-last-modified.sh', 'perf/p9210-scalar.sh', 'perf/p9300-fast-import-export.sh', ] diff --git a/t/perf/p8020-last-modified.sh b/t/perf/p8020-last-modified.sh new file mode 100755 index 00000000000000..cb1f98d3db9f4e --- /dev/null +++ b/t/perf/p8020-last-modified.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +test_description='last-modified perf tests' +. 
./perf-lib.sh + +test_perf_default_repo + +test_perf 'top-level last-modified' ' + git last-modified HEAD +' + +test_perf 'top-level recursive last-modified' ' + git last-modified -r HEAD +' + +test_perf 'subdir last-modified' ' + git ls-tree -d HEAD >subtrees && + path="$(head -n 1 subtrees | cut -f2)" && + git last-modified -r HEAD -- "$path" +' + +test_done From 8d9a7cdfda4c883e83d6ea7b57d0a1d989a7d439 Mon Sep 17 00:00:00 2001 From: Toon Claes Date: Tue, 5 Aug 2025 11:33:58 +0200 Subject: [PATCH 09/10] last-modified: use Bloom filters when available Our 'git last-modified' performs a revision walk, and computes a diff at each point in the walk to figure out whether a given revision changed any of the paths it considers interesting. When changed-path Bloom filters are available, we can avoid computing many such diffs. Before computing a diff, we first check if any of the remaining paths of interest were possibly changed at a given commit by consulting its Bloom filter. If any of them are, we are resigned to compute the diff. If none of those queries returned "maybe", we know that the given commit doesn't contain any changed paths which are interesting to us. So, we can avoid computing it in this case. 
Comparing the perf test results on git.git: Test HEAD~ HEAD ------------------------------------------------------------------------------------ 8020.1: top-level last-modified 4.49(4.34+0.11) 2.22(2.05+0.09) -50.6% 8020.2: top-level recursive last-modified 5.64(5.45+0.11) 5.62(5.30+0.11) -0.4% 8020.3: subdir last-modified 0.11(0.06+0.04) 0.07(0.03+0.04) -36.4% Based-on-patch-by: Taylor Blau Signed-off-by: Toon Claes Signed-off-by: Junio C Hamano --- builtin/last-modified.c | 48 +++++++++++++++++++++++++++++++++++++++-- commit-graph.c | 7 +++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/builtin/last-modified.c b/builtin/last-modified.c index 364493ac694c58..82c573982775be 100644 --- a/builtin/last-modified.c +++ b/builtin/last-modified.c @@ -1,5 +1,7 @@ #include "git-compat-util.h" +#include "bloom.h" #include "builtin.h" +#include "commit-graph.h" #include "commit.h" #include "config.h" #include "diff.h" @@ -18,6 +20,7 @@ struct last_modified_entry { struct hashmap_entry hashent; struct object_id oid; + struct bloom_key key; const char path[FLEX_ARRAY]; }; @@ -42,6 +45,12 @@ struct last_modified { static void last_modified_release(struct last_modified *lm) { + struct hashmap_iter iter; + struct last_modified_entry *ent; + + hashmap_for_each_entry(&lm->paths, &iter, ent, hashent) + bloom_key_clear(&ent->key); + hashmap_clear_and_free(&lm->paths, struct last_modified_entry, hashent); release_revisions(&lm->rev); } @@ -63,6 +72,9 @@ static void add_path_from_diff(struct diff_queue_struct *q, FLEX_ALLOC_STR(ent, path, path); oidcpy(&ent->oid, &p->two->oid); + if (lm->rev.bloom_filter_settings) + bloom_key_fill(&ent->key, path, strlen(path), + lm->rev.bloom_filter_settings); hashmap_entry_init(&ent->hashent, strhash(ent->path)); hashmap_add(&lm->paths, &ent->hashent); } @@ -139,6 +151,7 @@ static void mark_path(const char *path, const struct object_id *oid, last_modified_emit(data->lm, path, data->commit); hashmap_remove(&data->lm->paths, 
&ent->hashent, path); + bloom_key_clear(&ent->key); free(ent); } @@ -182,6 +195,30 @@ static void last_modified_diff(struct diff_queue_struct *q, } } +static bool maybe_changed_path(struct last_modified *lm, struct commit *origin) +{ + struct bloom_filter *filter; + struct last_modified_entry *ent; + struct hashmap_iter iter; + + if (!lm->rev.bloom_filter_settings) + return true; + + if (commit_graph_generation(origin) == GENERATION_NUMBER_INFINITY) + return true; + + filter = get_bloom_filter(lm->rev.repo, origin); + if (!filter) + return true; + + hashmap_for_each_entry(&lm->paths, &iter, ent, hashent) { + if (bloom_filter_contains(filter, &ent->key, + lm->rev.bloom_filter_settings)) + return true; + } + return false; +} + static int last_modified_run(struct last_modified *lm) { struct last_modified_callback_data data = { .lm = lm }; @@ -202,9 +239,14 @@ static int last_modified_run(struct last_modified *lm) &data.commit->object.oid, "", &lm->rev.diffopt); diff_flush(&lm->rev.diffopt); - } else { - log_tree_commit(&lm->rev, data.commit); + + break; } + + if (!maybe_changed_path(lm, data.commit)) + continue; + + log_tree_commit(&lm->rev, data.commit); } return 0; @@ -231,6 +273,8 @@ static int last_modified_init(struct last_modified *lm, struct repository *r, return argc; } + lm->rev.bloom_filter_settings = get_bloom_filter_settings(lm->rev.repo); + if (populate_paths_from_revs(lm) < 0) return error(_("unable to setup last-modified")); diff --git a/commit-graph.c b/commit-graph.c index bd7b6f5338bd9d..dc1f29dd2f34a3 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -820,7 +820,12 @@ int corrected_commit_dates_enabled(struct repository *r) struct bloom_filter_settings *get_bloom_filter_settings(struct repository *r) { - struct commit_graph *g = r->objects->commit_graph; + struct commit_graph *g; + + if (!prepare_commit_graph(r)) + return NULL; + + g = r->objects->commit_graph; while (g) { if (g->bloom_filter_settings) return g->bloom_filter_settings; From 
4975ec3473b4bc61bc8a3df1ef29d0b7e7959e87 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 8 Sep 2025 14:54:20 -0700 Subject: [PATCH 10/10] The seventh batch Signed-off-by: Junio C Hamano --- Documentation/RelNotes/2.52.0.adoc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Documentation/RelNotes/2.52.0.adoc b/Documentation/RelNotes/2.52.0.adoc index fa72515358ec09..60660314911cc1 100644 --- a/Documentation/RelNotes/2.52.0.adoc +++ b/Documentation/RelNotes/2.52.0.adoc @@ -14,6 +14,9 @@ UI, Workflows & Features * A new subcommand "git repo" gives users a way to grab various repository characteristics. + * A new command "git last-modified" has been added to show the closest + ancestor commit that touched each path. + Performance, Internal Implementation, Development Support etc. -------------------------------------------------------------- @@ -40,6 +43,10 @@ Performance, Internal Implementation, Development Support etc. * Discord has been added to the first contribution documentation as another way to ask for help. + * Inspired by Ezekiel's recent effort to showcase Rust interface, the + hash function implementation used to hash lines has been updated + to the one used for ELF symbol lookup by Glibc. + Fixes since v2.51 ----------------- @@ -130,6 +137,20 @@ including security updates, are included in this release. instead of `gitgitgadget/git`. (merge 37001cdbc4 ds/doc-ggg-pr-fork-clarify later to maint). + * Makefile tried to run multiple "cargo build" which would not work + very well; serialize their execution to work around it. + (merge 0eeacde50e da/cargo-serialize later to maint). + + * "git repack --path-walk" lost objects in some corner cases, which + has been corrected. + (merge 93afe9b060 ds/path-walk-repack-fix later to maint). + + * "git ls-files <pathspec>..."
should not necessarily have to expand + the index fully if a sparsified directory is excluded by the + pathspec; the code is taught to expand the index on demand to avoid + this. + (merge 681f26bccc ds/ls-files-lazy-unsparse later to maint). + * Other code cleanup, docfix, build fix, etc. (merge 823d537fa7 kh/doc-git-log-markup-fix later to maint). (merge cf7efa4f33 rj/t6137-cygwin-fix later to maint).