From 9bb4abe6cd1b25107e6cd49af7a200242fd91f90 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 7 Aug 2025 22:52:56 +0200 Subject: [PATCH 1/3] combine-diff: zero memory used for callback filepairs In commit 25e5e2bf85 (combine-diff: support format_callback, 2011-08-19), the combined-diff code learned how to make a multi-sourced `diff_filepair` to pass to a diff callback. When we create each filepair, we do not bother to fill in many of the fields, because they would make no sense (e.g. there can be no rename score or broken_pair flag because we do not go through the diffcore filters). However, we did not even bother to zero them, leading to random values. Let's make sure everything is blank with xcalloc(), just as the regular diff code does. We would potentially want to set the `status` flag to something non-zero, but it is not clear to what. Possibly a new DIFF_STATUS_COMBINED would make sense, as this is not strictly a modification, nor does it fit any other category. Since it is not yet clear what callers would want, this patch simply leaves it as `0`, the same empty flag that is seen when `diffcore_std` is not used at all. 
Signed-off-by: Jeff King Signed-off-by: Toon Claes Signed-off-by: Junio C Hamano --- combine-diff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/combine-diff.c b/combine-diff.c index dfae9f7995..5d6bdf4364 100644 --- a/combine-diff.c +++ b/combine-diff.c @@ -1315,7 +1315,7 @@ static struct diff_filepair *combined_pair(struct combine_diff_path *p, struct diff_filepair *pair; struct diff_filespec *pool; - pair = xmalloc(sizeof(*pair)); + CALLOC_ARRAY(pair, 1); CALLOC_ARRAY(pool, st_add(num_parent, 1)); pair->one = pool + 1; pair->two = pool; From 2a43e0e5503f52fd4c06faddf6c83b5678dedfe3 Mon Sep 17 00:00:00 2001 From: Toon Claes Date: Thu, 7 Aug 2025 22:52:57 +0200 Subject: [PATCH 2/3] within_depth: fix return for empty path The within_depth() function is used to check whether pathspecs limited by a max-depth parameter are acceptable. It takes a path to check, a maximum depth, and a "base" depth. It counts the components in the path (by counting slashes), adds them to the base, and compares them to the maximum. However, if the base does not have any slashes at all, we always return `true`. If the base depth is 0, then this is correct; no matter what the maximum is, we are always within it. However, if the base depth is greater than 0, then we might return an erroneous result. This ends up not causing any user-visible bugs in the current code. The call sites in dir.c always pass a base depth of 0, so are unaffected. But tree_entry_interesting() uses this function differently: it will pass the prefix of the current entry, along with a `1` if the entry is a directory, in essence checking whether items inside the entry would be of interest. It turns out not to make a difference in behavior, but the reasoning is complex. 
Given a tree like: file a/file a/b/file walking the tree and calling tree_entry_interesting() will yield the following results: (with max_depth=0): file: yes a: yes a/file: no a/b: no (with max_depth=1): file: yes a: yes a/file: yes a/b: no So we have inconsistent behavior in considering directories interesting. If they are at the edge of our depth but at the root, we will recurse into them, but then find all of their entries uninteresting (e.g., in the first case, we will look at "a" but find "a/*" uninteresting). But if they are at the edge of our depth and not at the root, then we will not recurse (in the second example, we do not even bother entering "a/b"). This turns out not to matter because the only caller which uses max-depth pathspecs is cmd_grep(), which only cares about blob entries. From its perspective, it is exactly the same to not recurse into a subtree, or to recurse and find that it contains no matching entries. Not recursing is merely an optimization. It is debatable whether tree_entry_interesting() should consider such an entry interesting. The only caller does not care if it sees the tree itself, and can benefit from the optimization. But if we add a "max-depth" limiter to regular diffs, then a diff with DIFF_OPT_TREE_IN_RECURSIVE would probably want to show the tree itself, but not what it contains. This patch just fixes within_depth(), which means we consider such entries uninteresting (and makes the current caller happy). If we want to change that in the future, then this fix is still the correct first step, as the current behavior is simply inconsistent. As a result, tree_entry_interesting() now behaves as follows on the first example: (with max_depth=0): file: yes a: no a/file: no a/b: no This means we no longer descend into "a/" only to discover that all "a/*" entries are uninteresting; instead, we stop at the tree entry itself. 
Based-on-patch-by: Jeff King Signed-off-by: Toon Claes Signed-off-by: Junio C Hamano --- Makefile | 1 + dir.c | 2 +- t/meson.build | 1 + t/unit-tests/u-dir.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 t/unit-tests/u-dir.c diff --git a/Makefile b/Makefile index 70d1543b6b..b5fce1205d 100644 --- a/Makefile +++ b/Makefile @@ -1356,6 +1356,7 @@ THIRD_PARTY_SOURCES += $(UNIT_TEST_DIR)/clar/% THIRD_PARTY_SOURCES += $(UNIT_TEST_DIR)/clar/clar/% CLAR_TEST_SUITES += u-ctype +CLAR_TEST_SUITES += u-dir CLAR_TEST_SUITES += u-example-decorate CLAR_TEST_SUITES += u-hash CLAR_TEST_SUITES += u-hashmap diff --git a/dir.c b/dir.c index a374972b62..2ee108eeb6 100644 --- a/dir.c +++ b/dir.c @@ -277,7 +277,7 @@ int within_depth(const char *name, int namelen, if (depth > max_depth) return 0; } - return 1; + return depth <= max_depth; } /* diff --git a/t/meson.build b/t/meson.build index d052fc3e23..56ea96f04a 100644 --- a/t/meson.build +++ b/t/meson.build @@ -1,5 +1,6 @@ clar_test_suites = [ 'unit-tests/u-ctype.c', + 'unit-tests/u-dir.c', 'unit-tests/u-example-decorate.c', 'unit-tests/u-hash.c', 'unit-tests/u-hashmap.c', diff --git a/t/unit-tests/u-dir.c b/t/unit-tests/u-dir.c new file mode 100644 index 0000000000..2d0adaa39e --- /dev/null +++ b/t/unit-tests/u-dir.c @@ -0,0 +1,47 @@ +#include "unit-test.h" +#include "dir.h" + +#define TEST_WITHIN_DEPTH(path, depth, max_depth, expect) do { \ + int actual = within_depth(path, strlen(path), \ + depth, max_depth); \ + if (actual != expect) \ + cl_failf("path '%s' with depth '%d' and max-depth '%d': expected %d, got %d", \ + path, depth, max_depth, expect, actual); \ + } while (0) + +void test_dir__within_depth(void) +{ + /* depth = 0; max_depth = 0 */ + TEST_WITHIN_DEPTH("", 0, 0, 1); + TEST_WITHIN_DEPTH("file", 0, 0, 1); + TEST_WITHIN_DEPTH("a", 0, 0, 1); + TEST_WITHIN_DEPTH("a/file", 0, 0, 0); + TEST_WITHIN_DEPTH("a/b", 0, 0, 0); + TEST_WITHIN_DEPTH("a/b/file", 0, 
0, 0); + + /* depth = 0; max_depth = 1 */ + TEST_WITHIN_DEPTH("", 0, 1, 1); + TEST_WITHIN_DEPTH("file", 0, 1, 1); + TEST_WITHIN_DEPTH("a", 0, 1, 1); + TEST_WITHIN_DEPTH("a/file", 0, 1, 1); + TEST_WITHIN_DEPTH("a/b", 0, 1, 1); + TEST_WITHIN_DEPTH("a/b/file", 0, 1, 0); + + /* depth = 1; max_depth = 1 */ + TEST_WITHIN_DEPTH("", 1, 1, 1); + TEST_WITHIN_DEPTH("file", 1, 1, 1); + TEST_WITHIN_DEPTH("a", 1, 1, 1); + TEST_WITHIN_DEPTH("a/file", 1, 1, 0); + TEST_WITHIN_DEPTH("a/b", 1, 1, 0); + TEST_WITHIN_DEPTH("a/b/file", 1, 1, 0); + + /* depth = 1; max_depth = 0 */ + TEST_WITHIN_DEPTH("", 1, 0, 0); + TEST_WITHIN_DEPTH("file", 1, 0, 0); + TEST_WITHIN_DEPTH("a", 1, 0, 0); + TEST_WITHIN_DEPTH("a/file", 1, 0, 0); + TEST_WITHIN_DEPTH("a/b", 1, 0, 0); + TEST_WITHIN_DEPTH("a/b/file", 1, 0, 0); + + +} From a1dfa5448d583bbfd1ec45642a4495ad499970c9 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 7 Aug 2025 22:52:58 +0200 Subject: [PATCH 3/3] diff: teach tree-diff a max-depth parameter When you are doing a tree-diff, there are basically two options: do not recurse into subtrees at all, or recurse indefinitely. While most callers would want to always recurse and see full pathnames, some may want the efficiency of looking only at a particular level of the tree. This is currently easy to do for the top-level (just turn off recursion), but you cannot say "show me what changed in subdir/, but do not recurse". This patch adds a max-depth parameter which is measured from the closest pathspec match, so that you can do: git log --raw --max-depth=1 -- a/b/c and see the raw output for a/b/c/, but not those of a/b/c/d/ (instead of the raw output you would see for a/b/c/d). 
Co-authored-by: Toon Claes Signed-off-by: Toon Claes Signed-off-by: Junio C Hamano --- Documentation/diff-options.adoc | 28 ++++++++ diff-lib.c | 5 ++ diff.c | 24 +++++++ diff.h | 8 +++ t/meson.build | 1 + t/t4072-diff-max-depth.sh | 116 ++++++++++++++++++++++++++++++++ tree-diff.c | 78 ++++++++++++++++++++- 7 files changed, 257 insertions(+), 3 deletions(-) create mode 100755 t/t4072-diff-max-depth.sh diff --git a/Documentation/diff-options.adoc b/Documentation/diff-options.adoc index 640eb6e7db..18a9020389 100644 --- a/Documentation/diff-options.adoc +++ b/Documentation/diff-options.adoc @@ -887,5 +887,33 @@ endif::git-format-patch[] reverted with `--ita-visible-in-index`. Both options are experimental and could be removed in future. +--max-depth=<depth>:: + For each pathspec given on command line, descend at most `<depth>` + levels of directories. A value of `-1` means no limit. + Cannot be combined with wildcards in the pathspec. + Given a tree containing `foo/bar/baz`, the following list shows the + matches generated by each set of options: ++ +-- + - `--max-depth=0 -- foo`: `foo` + + - `--max-depth=1 -- foo`: `foo/bar` + + - `--max-depth=1 -- foo/bar`: `foo/bar/baz` + + - `--max-depth=1 -- foo foo/bar`: `foo/bar/baz` + + - `--max-depth=2 -- foo`: `foo/bar/baz` +-- ++ +If no pathspec is given, the depth is measured as if all +top-level entries were specified. Note that this is different +than measuring from the root, in that `--max-depth=0` would +still return `foo`. This allows you to still limit depth while +asking for a subset of the top-level entries. ++ +Note that this option is only supported for diffs between tree objects, +not against the index or working tree. + For more detailed explanation on these common options, see also linkgit:gitdiffcore[7]. 
diff --git a/diff-lib.c b/diff-lib.c index 244468dd1a..b8f8f3bc31 100644 --- a/diff-lib.c +++ b/diff-lib.c @@ -115,6 +115,9 @@ void run_diff_files(struct rev_info *revs, unsigned int option) uint64_t start = getnanotime(); struct index_state *istate = revs->diffopt.repo->index; + if (revs->diffopt.max_depth_valid) + die(_("max-depth is not supported for worktree diffs")); + diff_set_mnemonic_prefix(&revs->diffopt, "i/", "w/"); refresh_fsmonitor(istate); @@ -560,6 +563,8 @@ static int diff_cache(struct rev_info *revs, opts.dst_index = NULL; opts.pathspec = &revs->diffopt.pathspec; opts.pathspec->recursive = 1; + if (revs->diffopt.max_depth_valid) + die(_("max-depth is not supported for index diffs")); init_tree_desc(&t, &tree->object.oid, tree->buffer, tree->size); return unpack_trees(1, &t, &opts); diff --git a/diff.c b/diff.c index 90e8003dd1..434627f249 100644 --- a/diff.c +++ b/diff.c @@ -4988,6 +4988,9 @@ void diff_setup_done(struct diff_options *options) options->filter = ~filter_bit[DIFF_STATUS_FILTER_AON]; options->filter &= ~options->filter_not; } + + if (options->pathspec.has_wildcard && options->max_depth_valid) + die("max-depth cannot be used with wildcard pathspecs"); } int parse_long_opt(const char *opt, const char **argv, @@ -5622,6 +5625,23 @@ static int diff_opt_rotate_to(const struct option *opt, const char *arg, int uns return 0; } +static int diff_opt_max_depth(const struct option *opt, + const char *arg, int unset) +{ + struct diff_options *options = opt->value; + + BUG_ON_OPT_NEG(unset); + + if (!git_parse_int(arg, &options->max_depth)) + return error(_("invalid value for '%s': '%s'"), + "--max-depth", arg); + + options->flags.recursive = 1; + options->max_depth_valid = options->max_depth >= 0; + + return 0; +} + /* * Consider adding new flags to __git_diff_common_options * in contrib/completion/git-completion.bash @@ -5894,6 +5914,10 @@ struct option *add_diff_options(const struct option *opts, OPT_CALLBACK_F(0, "diff-filter", options, 
N_("[(A|C|D|M|R|T|U|X|B)...[*]]"), N_("select files by diff type"), PARSE_OPT_NONEG, diff_opt_diff_filter), + OPT_CALLBACK_F(0, "max-depth", options, N_(""), + N_("maximum tree depth to recurse"), + PARSE_OPT_NONEG, diff_opt_max_depth), + { .type = OPTION_CALLBACK, .long_name = "output", diff --git a/diff.h b/diff.h index 62e5768a9a..bbced5f745 100644 --- a/diff.h +++ b/diff.h @@ -404,6 +404,14 @@ struct diff_options { struct strmap *additional_path_headers; int no_free; + + /* + * The value '0' is a valid max-depth (for no recursion), and value '-1' + * also (for unlimited recursion), so the extra "valid" flag is used to + * determined whether the user specified option --max-depth. + */ + int max_depth; + int max_depth_valid; }; unsigned diff_filter_bit(char status); diff --git a/t/meson.build b/t/meson.build index 56ea96f04a..74d72bc531 100644 --- a/t/meson.build +++ b/t/meson.build @@ -503,6 +503,7 @@ integration_tests = [ 't4069-remerge-diff.sh', 't4070-diff-pairs.sh', 't4071-diff-minimal.sh', + 't4072-diff-max-depth.sh', 't4100-apply-stat.sh', 't4101-apply-nonl.sh', 't4102-apply-rename.sh', diff --git a/t/t4072-diff-max-depth.sh b/t/t4072-diff-max-depth.sh new file mode 100755 index 0000000000..0fbf1321f7 --- /dev/null +++ b/t/t4072-diff-max-depth.sh @@ -0,0 +1,116 @@ +#!/bin/sh + +test_description='check that diff --max-depth will limit recursion' +. ./test-lib.sh + +make_dir() { + mkdir -p "$1" && + echo "$2" >"$1/file" +} + +make_files() { + echo "$1" >file && + make_dir one "$1" && + make_dir one/two "$1" && + make_dir one/two/three "$1" +} + +test_expect_success 'setup' ' + git commit --allow-empty -m empty && + git tag empty && + make_files added && + git add . && + git commit -m added && + make_files modified && + git add . && + git commit -m modified && + make_files index && + git add . 
&& + make_files worktree +' + +test_expect_success '--max-depth is disallowed with wildcard pathspecs' ' + test_must_fail git diff-tree --max-depth=0 HEAD^ HEAD -- "f*" +' + +check_one() { + type=$1; shift + args=$1; shift + path=$1; shift + depth=$1; shift + test_expect_${expect:-success} "diff-$type $args, path=$path, depth=$depth" " + for i in $*; do echo \$i; done >expect && + git diff-$type --max-depth=$depth --name-only $args -- $path >actual && + test_cmp expect actual + " +} + +# For tree comparisons, we expect to see subtrees at the boundary +# get their own entry. +check_trees() { + check_one tree "$*" '' 0 file one + check_one tree "$*" '' 1 file one/file one/two + check_one tree "$*" '' 2 file one/file one/two/file one/two/three + check_one tree "$*" '' 3 file one/file one/two/file one/two/three/file + check_one tree "$*" '' -1 file one/file one/two/file one/two/three/file + check_one tree "$*" one 0 one + check_one tree "$*" one 1 one/file one/two + check_one tree "$*" one 2 one/file one/two/file one/two/three + check_one tree "$*" one 3 one/file one/two/file one/two/three/file + check_one tree "$*" one/two 0 one/two + check_one tree "$*" one/two 1 one/two/file one/two/three + check_one tree "$*" one/two 2 one/two/file one/two/three/file + check_one tree "$*" one/two 2 one/two/file one/two/three/file + check_one tree "$*" one/two/three 0 one/two/three + check_one tree "$*" one/two/three 1 one/two/three/file +} + +# But for index comparisons, we do not store subtrees at all, so we do not +# expect them. 
+check_index() { + check_one "$@" '' 0 file + check_one "$@" '' 1 file one/file + check_one "$@" '' 2 file one/file one/two/file + check_one "$@" '' 3 file one/file one/two/file one/two/three/file + check_one "$@" one 0 + check_one "$@" one 1 one/file + check_one "$@" one 2 one/file one/two/file + check_one "$@" one 3 one/file one/two/file one/two/three/file + check_one "$@" one/two 0 + check_one "$@" one/two 1 one/two/file + check_one "$@" one/two 2 one/two/file one/two/three/file + check_one "$@" one/two/three 0 + check_one "$@" one/two/three 1 one/two/three/file + + # Value '-1' for '--max-depth is the same as recursion without limit, + # and thus should always succeed. + local expect= + check_one "$@" '' -1 file one/file one/two/file one/two/three/file +} + +# Check as a modification... +check_trees HEAD^ HEAD +# ...and as an addition... +check_trees empty HEAD +# ...and as a deletion. +check_trees HEAD empty + +# We currently only implement max-depth for trees. +expect=failure +# Check index against a tree +check_index index "--cached HEAD" +# and index against the worktree +check_index files "" +expect= + +test_expect_success 'find shortest path within embedded pathspecs' ' + cat >expect <<-\EOF && + one/file + one/two/file + one/two/three/file + EOF + git diff-tree --max-depth=2 --name-only HEAD^ HEAD -- one one/two >actual && + test_cmp expect actual +' + +test_done diff --git a/tree-diff.c b/tree-diff.c index e00fc2f450..5988148b60 100644 --- a/tree-diff.c +++ b/tree-diff.c @@ -13,6 +13,7 @@ #include "tree-walk.h" #include "environment.h" #include "repository.h" +#include "dir.h" /* * Some mode bits are also used internally for computations. 
@@ -48,6 +49,73 @@ free((x)); \ } while(0) +/* Returns true if and only if "dir" is a leading directory of "path" */ +static int is_dir_prefix(const char *path, const char *dir, int dirlen) +{ + return !strncmp(path, dir, dirlen) && + (!path[dirlen] || path[dirlen] == '/'); +} + +static int check_recursion_depth(const struct strbuf *name, + const struct pathspec *ps, + int max_depth) +{ + int i; + + if (!ps->nr) + return within_depth(name->buf, name->len, 1, max_depth); + + /* + * We look through the pathspecs in reverse-sorted order, because we + * want to find the longest match first (e.g., "a/b" is better for + * checking depth than "a/b/c"). + */ + for (i = ps->nr - 1; i >= 0; i--) { + const struct pathspec_item *item = ps->items+i; + + /* + * If the name to match is longer than the pathspec, then we + * are only interested if the pathspec matches and we are + * within the allowed depth. + */ + if (name->len >= item->len) { + if (!is_dir_prefix(name->buf, item->match, item->len)) + continue; + return within_depth(name->buf + item->len, + name->len - item->len, + 1, max_depth); + } + + /* + * Otherwise, our name is shorter than the pathspec. We need to + * check if it is a prefix of the pathspec; if so, we must + * always recurse in order to process further (the resulting + * paths we find might or might not match our pathspec, but we + * cannot know until we recurse). + */ + if (is_dir_prefix(item->match, name->buf, name->len)) + return 1; + } + return 0; +} + +static int should_recurse(const struct strbuf *name, struct diff_options *opt) +{ + if (!opt->flags.recursive) + return 0; + if (!opt->max_depth_valid) + return 1; + + /* + * We catch this during diff_setup_done, but let's double-check + * against any internal munging. 
+ */ + if (opt->pathspec.has_wildcard) + BUG("wildcard pathspecs are incompatible with max-depth"); + + return check_recursion_depth(name, &opt->pathspec, opt->max_depth); +} + static void ll_diff_tree_paths( struct combine_diff_path ***tail, const struct object_id *oid, const struct object_id **parents_oid, int nparent, @@ -170,9 +238,13 @@ static void emit_path(struct combine_diff_path ***tail, mode = 0; } - if (opt->flags.recursive && isdir) { - recurse = 1; - emitthis = opt->flags.tree_in_recursive; + if (isdir) { + strbuf_add(base, path, pathlen); + if (should_recurse(base, opt)) { + recurse = 1; + emitthis = opt->flags.tree_in_recursive; + } + strbuf_setlen(base, old_baselen); } if (emitthis) {