Merge branch 'tc/last-modified'

A new command "git last-modified" has been added to show the closest
ancestor commit that touched each path.

* tc/last-modified:
  last-modified: use Bloom filters when available
  t/perf: add last-modified perf script
  last-modified: new subcommand to show when files were last modified
main
Junio C Hamano 2025-09-08 14:54:35 -07:00
commit 95a8428323
13 changed files with 627 additions and 1 deletions

1
.gitignore vendored
View File

@ -87,6 +87,7 @@
/git-init-db
/git-interpret-trailers
/git-instaweb
/git-last-modified
/git-log
/git-ls-files
/git-ls-remote

View File

@ -0,0 +1,54 @@
git-last-modified(1)
====================

NAME
----
git-last-modified - EXPERIMENTAL: Show when files were last modified


SYNOPSIS
--------
[synopsis]
git last-modified [--recursive] [--show-trees] [<revision-range>] [[--] <path>...]

DESCRIPTION
-----------

Shows which commit last modified each of the relevant files and subdirectories.
A commit renaming a path, or changing its mode is also taken into account.

THIS COMMAND IS EXPERIMENTAL. THE BEHAVIOR MAY CHANGE.

OPTIONS
-------

`-r`::
`--recursive`::
Instead of showing tree entries, step into subtrees and show all entries
inside them recursively.

`-t`::
`--show-trees`::
Show tree entries even when recursing into them. It has no effect
without `--recursive`.

`<revision-range>`::
Only traverse commits in the specified revision range. When no
`<revision-range>` is specified, it defaults to `HEAD` (i.e. the whole
history leading to the current commit). For a complete list of ways to
spell `<revision-range>`, see the 'Specifying Ranges' section of
linkgit:gitrevisions[7].

`[--] <path>...`::
For each _<path>_ given, the commit which last modified it is returned.
Without an optional path parameter, all files and subdirectories
in the path traversal are included in the output.

SEE ALSO
--------
linkgit:git-blame[1],
linkgit:git-log[1].

GIT
---
Part of the linkgit:git[1] suite

View File

@ -74,6 +74,7 @@ manpages = {
'git-init.adoc' : 1,
'git-instaweb.adoc' : 1,
'git-interpret-trailers.adoc' : 1,
'git-last-modified.adoc' : 1,
'git-log.adoc' : 1,
'git-ls-files.adoc' : 1,
'git-ls-remote.adoc' : 1,

View File

@ -1265,6 +1265,7 @@ BUILTIN_OBJS += builtin/hook.o
BUILTIN_OBJS += builtin/index-pack.o
BUILTIN_OBJS += builtin/init-db.o
BUILTIN_OBJS += builtin/interpret-trailers.o
BUILTIN_OBJS += builtin/last-modified.o
BUILTIN_OBJS += builtin/log.o
BUILTIN_OBJS += builtin/ls-files.o
BUILTIN_OBJS += builtin/ls-remote.o

View File

@ -176,6 +176,7 @@ int cmd_hook(int argc, const char **argv, const char *prefix, struct repository
int cmd_index_pack(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_init_db(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_interpret_trailers(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_last_modified(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_log_reflog(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_log(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_ls_files(int argc, const char **argv, const char *prefix, struct repository *repo);

326
builtin/last-modified.c Normal file
View File

@ -0,0 +1,326 @@
#include "git-compat-util.h"
#include "bloom.h"
#include "builtin.h"
#include "commit-graph.h"
#include "commit.h"
#include "config.h"
#include "environment.h"
#include "diff.h"
#include "diffcore.h"
#include "environment.h"
#include "hashmap.h"
#include "hex.h"
#include "log-tree.h"
#include "object-name.h"
#include "object.h"
#include "parse-options.h"
#include "quote.h"
#include "repository.h"
#include "revision.h"

/*
 * One path whose last-modifying commit has not been reported yet.
 * Entries live in the `paths` hashmap of `struct last_modified`.
 */
struct last_modified_entry {
	/* Hashmap linkage; the hash is strhash() of `path`. */
	struct hashmap_entry hashent;

	/* Object id the path has in the tree we are reporting on. */
	struct object_id oid;

	/* Bloom key for `path`; only filled when Bloom settings exist. */
	struct bloom_key key;

	/* The path itself, allocated inline with FLEX_ALLOC_STR(). */
	const char path[FLEX_ARRAY];
};

/*
 * Hashmap comparison callback for `struct last_modified_entry`.
 *
 * The path of `hent1` is compared against the lookup key `path` when
 * one is supplied, and against the path of `hent2` otherwise. Returns
 * the strcmp() result, i.e. 0 when the paths are equal.
 */
static int last_modified_entry_hashcmp(const void *unused UNUSED,
				       const struct hashmap_entry *hent1,
				       const struct hashmap_entry *hent2,
				       const void *path)
{
	const struct last_modified_entry *lhs =
		container_of(hent1, const struct last_modified_entry, hashent);
	const char *rhs_path = path;

	if (!rhs_path) {
		const struct last_modified_entry *rhs =
			container_of(hent2, const struct last_modified_entry,
				     hashent);
		rhs_path = rhs->path;
	}

	return strcmp(lhs->path, rhs_path);
}

/* State shared by the whole "git last-modified" invocation. */
struct last_modified {
	/*
	 * Paths still waiting for their commit; holds
	 * `struct last_modified_entry` items, removed as they are emitted.
	 */
	struct hashmap paths;

	/* Revision walk (and diff) machinery used to visit commits. */
	struct rev_info rev;

	/* Set by -r/--recursive: step into subtrees. */
	bool recursive;

	/* Set by -t/--show-trees: also show tree entries when recursing. */
	bool show_trees;
};

static void last_modified_release(struct last_modified *lm)
{
struct hashmap_iter iter;
struct last_modified_entry *ent;

hashmap_for_each_entry(&lm->paths, &iter, ent, hashent)
bloom_key_clear(&ent->key);

hashmap_clear_and_free(&lm->paths, struct last_modified_entry, hashent);
release_revisions(&lm->rev);
}

/* Payload handed to last_modified_diff() through the diff callback. */
struct last_modified_callback_data {
	/* Global command state, including the pending paths. */
	struct last_modified *lm;

	/* Commit currently being examined by last_modified_run(). */
	struct commit *commit;
};

/*
 * Diff format callback: register every queued path as a pending entry.
 *
 * `data` is the `struct last_modified` whose `paths` map is filled. Each
 * entry records the post-image path and object id of the filepair and,
 * when Bloom filter settings are available, a Bloom key for the path.
 */
static void add_path_from_diff(struct diff_queue_struct *q,
			       struct diff_options *opt UNUSED, void *data)
{
	struct last_modified *lm = data;
	int idx = 0;

	while (idx < q->nr) {
		struct diff_filepair *pair = q->queue[idx++];
		const char *name = pair->two->path;
		struct last_modified_entry *entry;

		FLEX_ALLOC_STR(entry, path, name);
		oidcpy(&entry->oid, &pair->two->oid);

		if (lm->rev.bloom_filter_settings) {
			bloom_key_fill(&entry->key, name, strlen(name),
				       lm->rev.bloom_filter_settings);
		}

		hashmap_entry_init(&entry->hashent, strhash(entry->path));
		hashmap_add(&lm->paths, &entry->hashent);
	}
}

static int populate_paths_from_revs(struct last_modified *lm)
{
int num_interesting = 0;
struct diff_options diffopt;

/*
* Create a copy of `struct diff_options`. In this copy a callback is
* set that when called adds entries to `paths` in `struct last_modified`.
* This copy is used to diff the tree of the target revision against an
* empty tree. This results in all paths in the target revision being
* listed. After `paths` is populated, we don't need this copy no more.
*/
memcpy(&diffopt, &lm->rev.diffopt, sizeof(diffopt));
copy_pathspec(&diffopt.pathspec, &lm->rev.diffopt.pathspec);
diffopt.output_format = DIFF_FORMAT_CALLBACK;
diffopt.format_callback = add_path_from_diff;
diffopt.format_callback_data = lm;

for (size_t i = 0; i < lm->rev.pending.nr; i++) {
struct object_array_entry *obj = lm->rev.pending.objects + i;

if (obj->item->flags & UNINTERESTING)
continue;

if (num_interesting++)
return error(_("last-modified can only operate on one tree at a time"));

diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
&obj->item->oid, "", &diffopt);
diff_flush(&diffopt);
}
clear_pathspec(&diffopt.pathspec);

return 0;
}

/*
 * Print one result line: "<commit-oid>\t<path>".
 *
 * A '^' is printed first when `commit` carries the BOUNDARY flag. When
 * the diff line termination is 0 the raw path is followed by a NUL byte;
 * otherwise the path goes through write_name_quoted() and ends in a
 * newline.
 */
static void last_modified_emit(struct last_modified *lm,
			       const char *path, const struct commit *commit)
{
	const int nul_terminated = !lm->rev.diffopt.line_termination;

	if (commit->object.flags & BOUNDARY)
		putchar('^');
	printf("%s\t", oid_to_hex(&commit->object.oid));

	if (nul_terminated) {
		printf("%s%c", path, '\0');
		return;
	}

	write_name_quoted(path, stdout, '\n');
}

static void mark_path(const char *path, const struct object_id *oid,
struct last_modified_callback_data *data)
{
struct last_modified_entry *ent;

/* Is it even a path that we are interested in? */
ent = hashmap_get_entry_from_hash(&data->lm->paths, strhash(path), path,
struct last_modified_entry, hashent);
if (!ent)
return;

/*
* Is it arriving at a version of interest, or is it from a side branch
* which did not contribute to the final state?
*/
if (!oideq(oid, &ent->oid))
return;

last_modified_emit(data->lm, path, data->commit);

hashmap_remove(&data->lm->paths, &ent->hashent, path);
bloom_key_clear(&ent->key);
free(ent);
}

/*
 * Diff format callback used during the revision walk: hand every
 * non-deleted filepair of the current commit's diff to mark_path().
 * `cbdata` is a `struct last_modified_callback_data`.
 */
static void last_modified_diff(struct diff_queue_struct *q,
			       struct diff_options *opt UNUSED, void *cbdata)
{
	struct last_modified_callback_data *data = cbdata;
	int idx;

	for (idx = 0; idx < q->nr; idx++) {
		struct diff_filepair *pair = q->queue[idx];

		/*
		 * A deletion cannot have produced our current state, which
		 * actually has the file, so there is no point feeding it.
		 */
		if (pair->status == DIFF_STATUS_DELETED)
			continue;

		/*
		 * For everything else we only care that we somehow arrived
		 * at the final oid state. This deliberately includes some
		 * potentially controversial cases:
		 *
		 *   1. A rename or copy is found, as it is the first time
		 *      the content arrived at the given path.
		 *
		 *   2. Even a non-content modification such as a mode or
		 *      type change triggers it.
		 *
		 * The inclusive approach is taken for now: anything that
		 * impacts the path counts. Options to tweak the behavior
		 * (e.g., to "--follow" the content across renames) can come
		 * later.
		 */
		mark_path(pair->two->path, &pair->two->oid, data);
	}
}

static bool maybe_changed_path(struct last_modified *lm, struct commit *origin)
{
struct bloom_filter *filter;
struct last_modified_entry *ent;
struct hashmap_iter iter;

if (!lm->rev.bloom_filter_settings)
return true;

if (commit_graph_generation(origin) == GENERATION_NUMBER_INFINITY)
return true;

filter = get_bloom_filter(lm->rev.repo, origin);
if (!filter)
return true;

hashmap_for_each_entry(&lm->paths, &iter, ent, hashent) {
if (bloom_filter_contains(filter, &ent->key,
lm->rev.bloom_filter_settings))
return true;
}
return false;
}

static int last_modified_run(struct last_modified *lm)
{
struct last_modified_callback_data data = { .lm = lm };

lm->rev.diffopt.output_format = DIFF_FORMAT_CALLBACK;
lm->rev.diffopt.format_callback = last_modified_diff;
lm->rev.diffopt.format_callback_data = &data;

prepare_revision_walk(&lm->rev);

while (hashmap_get_size(&lm->paths)) {
data.commit = get_revision(&lm->rev);
if (!data.commit)
BUG("paths remaining beyond boundary in last-modified");

if (data.commit->object.flags & BOUNDARY) {
diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
&data.commit->object.oid, "",
&lm->rev.diffopt);
diff_flush(&lm->rev.diffopt);

break;
}

if (!maybe_changed_path(lm, data.commit))
continue;

log_tree_commit(&lm->rev, data.commit);
}

return 0;
}

/*
 * Prepare `lm` for a run: set up the pending-path map, configure the
 * revision walk from `argv`, and collect the paths to report on.
 *
 * Returns 0 on success, a positive value (the number of arguments left
 * over) when setup_revisions() did not consume every argument, and the
 * (negative) result of error() when the paths cannot be collected.
 */
static int last_modified_init(struct last_modified *lm, struct repository *r,
			      const char *prefix, int argc, const char **argv)
{
	int remaining;

	hashmap_init(&lm->paths, last_modified_entry_hashcmp, NULL, 0);
	repo_init_revisions(r, &lm->rev, prefix);

	/* Walk configuration. */
	lm->rev.def = "HEAD";
	lm->rev.boundary = 1;
	lm->rev.no_commit_id = 1;

	/* Diff configuration. */
	lm->rev.diff = 1;
	lm->rev.combine_merges = 1;
	lm->rev.show_root_diff = 1;
	lm->rev.diffopt.flags.recursive = lm->recursive;
	lm->rev.diffopt.flags.tree_in_recursive = lm->show_trees;

	remaining = setup_revisions(argc, argv, &lm->rev, NULL);
	if (remaining > 1) {
		error(_("unknown last-modified argument: %s"), argv[1]);
		return remaining;
	}

	lm->rev.bloom_filter_settings = get_bloom_filter_settings(lm->rev.repo);

	if (populate_paths_from_revs(lm) < 0)
		return error(_("unable to setup last-modified"));

	return 0;
}

/*
 * Entry point of "git last-modified".
 *
 * Parses -r/--recursive and -t/--show-trees, leaving argv[0] and any
 * unknown options in place for last_modified_init() to process. A
 * positive result from initialization shows the usage message. The
 * return value is the result of initialization or, if that succeeded,
 * of the run.
 */
int cmd_last_modified(int argc, const char **argv, const char *prefix,
		      struct repository *repo)
{
	struct last_modified lm = { 0 };
	int ret;

	const char * const last_modified_usage[] = {
		N_("git last-modified [--recursive] [--show-trees] "
		   "[<revision-range>] [[--] <path>...]"),
		NULL
	};

	struct option last_modified_options[] = {
		OPT_BOOL('r', "recursive", &lm.recursive,
			 N_("recurse into subtrees")),
		OPT_BOOL('t', "show-trees", &lm.show_trees,
			 N_("show tree entries when recursing into subtrees")),
		OPT_END()
	};

	argc = parse_options(argc, argv, prefix, last_modified_options,
			     last_modified_usage,
			     PARSE_OPT_KEEP_ARGV0 | PARSE_OPT_KEEP_UNKNOWN_OPT);

	repo_config(repo, git_default_config, NULL);

	ret = last_modified_init(&lm, repo, prefix, argc, argv);
	if (ret > 0)
		usage_with_options(last_modified_usage,
				   last_modified_options);

	/* Only walk history when initialization succeeded. */
	if (!ret)
		ret = last_modified_run(&lm);

	last_modified_release(&lm);

	return ret;
}

View File

@ -124,6 +124,7 @@ git-index-pack plumbingmanipulators
git-init mainporcelain init
git-instaweb ancillaryinterrogators complete
git-interpret-trailers purehelpers
git-last-modified plumbinginterrogators
git-log mainporcelain info
git-ls-files plumbinginterrogators
git-ls-remote plumbinginterrogators

View File

@ -812,7 +812,12 @@ int corrected_commit_dates_enabled(struct repository *r)

struct bloom_filter_settings *get_bloom_filter_settings(struct repository *r)
{
struct commit_graph *g = r->objects->commit_graph;
struct commit_graph *g;

if (!prepare_commit_graph(r))
return NULL;

g = r->objects->commit_graph;
while (g) {
if (g->bloom_filter_settings)
return g->bloom_filter_settings;

1
git.c
View File

@ -565,6 +565,7 @@ static struct cmd_struct commands[] = {
{ "init", cmd_init_db },
{ "init-db", cmd_init_db },
{ "interpret-trailers", cmd_interpret_trailers, RUN_SETUP_GENTLY },
{ "last-modified", cmd_last_modified, RUN_SETUP },
{ "log", cmd_log, RUN_SETUP },
{ "ls-files", cmd_ls_files, RUN_SETUP },
{ "ls-remote", cmd_ls_remote, RUN_SETUP_GENTLY },

View File

@ -607,6 +607,7 @@ builtin_sources = [
'builtin/index-pack.c',
'builtin/init-db.c',
'builtin/interpret-trailers.c',
'builtin/last-modified.c',
'builtin/log.c',
'builtin/ls-files.c',
'builtin/ls-remote.c',

View File

@ -951,6 +951,7 @@ integration_tests = [
't8012-blame-colors.sh',
't8013-blame-ignore-revs.sh',
't8014-blame-ignore-fuzzy.sh',
't8020-last-modified.sh',
't9001-send-email.sh',
't9002-column.sh',
't9003-help-autocorrect.sh',
@ -1144,6 +1145,7 @@ benchmarks = [
'perf/p7820-grep-engines.sh',
'perf/p7821-grep-engines-fixed.sh',
'perf/p7822-grep-perl-character.sh',
'perf/p8020-last-modified.sh',
'perf/p9210-scalar.sh',
'perf/p9300-fast-import-export.sh',
]

22
t/perf/p8020-last-modified.sh Executable file
View File

@ -0,0 +1,22 @@
#!/bin/sh

test_description='last-modified perf tests'
. ./perf-lib.sh

test_perf_default_repo

test_perf 'top-level last-modified' '
git last-modified HEAD
'

test_perf 'top-level recursive last-modified' '
git last-modified -r HEAD
'

test_perf 'subdir last-modified' '
git ls-tree -d HEAD >subtrees &&
path="$(head -n 1 subtrees | cut -f2)" &&
git last-modified -r HEAD -- "$path"
'

test_done

210
t/t8020-last-modified.sh Executable file
View File

@ -0,0 +1,210 @@
#!/bin/sh

test_description='last-modified tests'

. ./test-lib.sh

test_expect_success 'setup' '
test_commit 1 file &&
mkdir a &&
test_commit 2 a/file &&
mkdir a/b &&
test_commit 3 a/b/file
'

test_expect_success 'cannot run last-modified on two trees' '
test_must_fail git last-modified HEAD HEAD~1
'

# Run "git last-modified" and compare its output with the expectation
# given on stdin.
#
# Usage: check_last_modified [-C <dir>] [<last-modified args>...] <<EOF
#
# An optional leading "-C <dir>" makes git run in <dir>; all remaining
# arguments are passed to "git last-modified" unchanged. The expected
# output is read from stdin into "expect". The actual output is passed
# through "git name-rev --annotate-stdin --name-only --tags", has its tabs
# turned into spaces, and is then compared to "expect" with test_cmp.
# The tmp.* files are removed when the test finishes.
check_last_modified() {
local indir= &&
while test $# != 0
do
case "$1" in
-C)
indir="$2"
shift
;;
*)
break
;;
esac &&
shift
done &&

cat >expect &&
test_when_finished "rm -f tmp.*" &&
git ${indir:+-C "$indir"} last-modified "$@" >tmp.1 &&
git name-rev --annotate-stdin --name-only --tags \
<tmp.1 >tmp.2 &&
tr '\t' ' ' <tmp.2 >actual &&
test_cmp expect actual
}

test_expect_success 'last-modified non-recursive' '
check_last_modified <<-\EOF
3 a
1 file
EOF
'

test_expect_success 'last-modified recursive' '
check_last_modified -r <<-\EOF
3 a/b/file
2 a/file
1 file
EOF
'

test_expect_success 'last-modified recursive with show-trees' '
check_last_modified -r -t <<-\EOF
3 a
3 a/b
3 a/b/file
2 a/file
1 file
EOF
'

test_expect_success 'last-modified non-recursive with show-trees' '
check_last_modified -t <<-\EOF
3 a
1 file
EOF
'

test_expect_success 'last-modified subdir' '
check_last_modified a <<-\EOF
3 a
EOF
'

test_expect_success 'last-modified subdir recursive' '
check_last_modified -r a <<-\EOF
3 a/b/file
2 a/file
EOF
'

test_expect_success 'last-modified from non-HEAD commit' '
check_last_modified HEAD^ <<-\EOF
2 a
1 file
EOF
'

test_expect_success 'last-modified from subdir defaults to root' '
check_last_modified -C a <<-\EOF
3 a
1 file
EOF
'

test_expect_success 'last-modified from subdir uses relative pathspecs' '
check_last_modified -C a -r b <<-\EOF
3 a/b/file
EOF
'

test_expect_success 'limit last-modified traversal by count' '
check_last_modified -1 <<-\EOF
3 a
^2 file
EOF
'

test_expect_success 'limit last-modified traversal by commit' '
check_last_modified HEAD~2..HEAD <<-\EOF
3 a
^1 file
EOF
'

test_expect_success 'only last-modified files in the current tree' '
git rm -rf a &&
git commit -m "remove a" &&
check_last_modified <<-\EOF
1 file
EOF
'

test_expect_success 'cross merge boundaries in blaming' '
git checkout HEAD^0 &&
git rm -rf . &&
test_commit m1 &&
git checkout HEAD^ &&
git rm -rf . &&
test_commit m2 &&
git merge m1 &&
check_last_modified <<-\EOF
m2 m2.t
m1 m1.t
EOF
'

test_expect_success 'last-modified merge for resolved conflicts' '
git checkout HEAD^0 &&
git rm -rf . &&
test_commit c1 conflict &&
git checkout HEAD^ &&
git rm -rf . &&
test_commit c2 conflict &&
test_must_fail git merge c1 &&
test_commit resolved conflict &&
check_last_modified conflict <<-\EOF
resolved conflict
EOF
'


# Consider `file` with this content through history:
#
# A---B---B-------B---B
# \ /
# C---D
test_expect_success 'last-modified merge ignores content from branch' '
git checkout HEAD^0 &&
git rm -rf . &&
test_commit a1 file A &&
test_commit a2 file B &&
test_commit a3 file C &&
test_commit a4 file D &&
git checkout a2 &&
git merge --no-commit --no-ff a4 &&
git checkout a2 -- file &&
git merge --continue &&
check_last_modified <<-\EOF
a2 file
EOF
'

# Consider `file` with this content through history:
#
# A---B---B---C---D---B---B
# \ /
# B-------B
test_expect_success 'last-modified merge undoes changes' '
git checkout HEAD^0 &&
git rm -rf . &&
test_commit b1 file A &&
test_commit b2 file B &&
test_commit b3 file C &&
test_commit b4 file D &&
git checkout b2 &&
test_commit b5 file2 2 &&
git checkout b4 &&
git merge --no-commit --no-ff b5 &&
git checkout b2 -- file &&
git merge --continue &&
check_last_modified <<-\EOF
b5 file2
b2 file
EOF
'

test_expect_success 'last-modified complains about unknown arguments' '
test_must_fail git last-modified --foo 2>err &&
grep "unknown last-modified argument: --foo" err
'

test_done