Merge branch 'ly/changed-paths-traversal'

Lift the limitation to use changed-path filter in "git log" so that
it can be used for a pathspec with multiple literal paths.

* ly/changed-paths-traversal:
  bloom: optimize multiple pathspec items in revision
  revision: make helper for pathspec to bloom keyvec
  bloom: replace struct bloom_key * with struct bloom_keyvec
  bloom: rename function operates on bloom_key
  bloom: add test helper to return murmur3 hash
maint
Junio C Hamano 2025-07-23 15:45:15 -07:00
commit f22d4ac4fd
8 changed files with 207 additions and 103 deletions

View File

@ -1311,7 +1311,7 @@ static void add_bloom_key(struct blame_bloom_data *bd,
} }


bd->keys[bd->nr] = xmalloc(sizeof(struct bloom_key)); bd->keys[bd->nr] = xmalloc(sizeof(struct bloom_key));
fill_bloom_key(path, strlen(path), bd->keys[bd->nr], bd->settings); bloom_key_fill(bd->keys[bd->nr], path, strlen(path), bd->settings);
bd->nr++; bd->nr++;
} }



84
bloom.c
View File

@ -107,7 +107,7 @@ int load_bloom_filter_from_graph(struct commit_graph *g,
* Not considered to be cryptographically secure. * Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/ */
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len) static uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{ {
const uint32_t c1 = 0xcc9e2d51; const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593; const uint32_t c2 = 0x1b873593;
@ -221,9 +221,7 @@ static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
return seed; return seed;
} }


void fill_bloom_key(const char *data, void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
size_t len,
struct bloom_key *key,
const struct bloom_filter_settings *settings) const struct bloom_filter_settings *settings)
{ {
int i; int i;
@ -243,7 +241,7 @@ void fill_bloom_key(const char *data,
key->hashes[i] = hash0 + i * hash1; key->hashes[i] = hash0 + i * hash1;
} }


void clear_bloom_key(struct bloom_key *key) void bloom_key_clear(struct bloom_key *key)
{ {
FREE_AND_NULL(key->hashes); FREE_AND_NULL(key->hashes);
} }
@ -280,6 +278,55 @@ void deinit_bloom_filters(void)
deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter); deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
} }


struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
const struct bloom_filter_settings *settings)
{
struct bloom_keyvec *vec;
const char *p;
size_t sz;
size_t nr = 1;

p = path;
while (*p) {
/*
* At this point, the path is normalized to use Unix-style
* path separators. This is required due to how the
* changed-path Bloom filters store the paths.
*/
if (*p == '/')
nr++;
p++;
}

sz = sizeof(struct bloom_keyvec);
sz += nr * sizeof(struct bloom_key);
vec = (struct bloom_keyvec *)xcalloc(1, sz);
if (!vec)
return NULL;
vec->count = nr;

bloom_key_fill(&vec->key[0], path, len, settings);
nr = 1;
p = path + len - 1;
while (p > path) {
if (*p == '/') {
bloom_key_fill(&vec->key[nr++], path, p - path, settings);
}
p--;
}
assert(nr == vec->count);
return vec;
}

void bloom_keyvec_free(struct bloom_keyvec *vec)
{
if (!vec)
return;
for (size_t nr = 0; nr < vec->count; nr++)
bloom_key_clear(&vec->key[nr]);
free(vec);
}

static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED, static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
const struct hashmap_entry *eptr, const struct hashmap_entry *eptr,
const struct hashmap_entry *entry_or_key, const struct hashmap_entry *entry_or_key,
@ -500,9 +547,9 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,


hashmap_for_each_entry(&pathmap, &iter, e, entry) { hashmap_for_each_entry(&pathmap, &iter, e, entry) {
struct bloom_key key; struct bloom_key key;
fill_bloom_key(e->path, strlen(e->path), &key, settings); bloom_key_fill(&key, e->path, strlen(e->path), settings);
add_key_to_filter(&key, filter, settings); add_key_to_filter(&key, filter, settings);
clear_bloom_key(&key); bloom_key_clear(&key);
} }


cleanup: cleanup:
@ -540,3 +587,26 @@ int bloom_filter_contains(const struct bloom_filter *filter,


return 1; return 1;
} }

int bloom_filter_contains_vec(const struct bloom_filter *filter,
const struct bloom_keyvec *vec,
const struct bloom_filter_settings *settings)
{
int ret = 1;

for (size_t nr = 0; ret > 0 && nr < vec->count; nr++)
ret = bloom_filter_contains(filter, &vec->key[nr], settings);

return ret;
}

uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
int version)
{
assert(version == 1 || version == 2);

if (version == 2)
return murmur3_seeded_v2(seed, data, len);
else
return murmur3_seeded_v1(seed, data, len);
}

56
bloom.h
View File

@ -74,24 +74,40 @@ struct bloom_key {
uint32_t *hashes; uint32_t *hashes;
}; };


/*
* A bloom_keyvec is a vector of bloom_keys, which
* can be used to store multiple keys for a single
* pathspec item.
*/
struct bloom_keyvec {
size_t count;
struct bloom_key key[FLEX_ARRAY];
};

int load_bloom_filter_from_graph(struct commit_graph *g, int load_bloom_filter_from_graph(struct commit_graph *g,
struct bloom_filter *filter, struct bloom_filter *filter,
uint32_t graph_pos); uint32_t graph_pos);


/* void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
* Calculate the murmur3 32-bit hash value for the given data
* using the given seed.
* Produces a uniformly distributed hash value.
* Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len);

void fill_bloom_key(const char *data,
size_t len,
struct bloom_key *key,
const struct bloom_filter_settings *settings); const struct bloom_filter_settings *settings);
void clear_bloom_key(struct bloom_key *key); void bloom_key_clear(struct bloom_key *key);

/*
* bloom_keyvec_new - Allocate and populate a bloom_keyvec with keys for the
* given path.
*
* This function splits the input path by '/' and generates a bloom key for each
* prefix, in reverse order of specificity. For example, given the input
* "a/b/c", it will generate bloom keys for:
* - "a/b/c"
* - "a/b"
* - "a"
*
* The resulting keys are stored in a newly allocated bloom_keyvec.
*/
struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
const struct bloom_filter_settings *settings);
void bloom_keyvec_free(struct bloom_keyvec *vec);


void add_key_to_filter(const struct bloom_key *key, void add_key_to_filter(const struct bloom_key *key,
struct bloom_filter *filter, struct bloom_filter *filter,
@ -137,4 +153,18 @@ int bloom_filter_contains(const struct bloom_filter *filter,
const struct bloom_key *key, const struct bloom_key *key,
const struct bloom_filter_settings *settings); const struct bloom_filter_settings *settings);


/*
* bloom_filter_contains_vec - Check if all keys in a key vector are in the
* Bloom filter.
*
* Returns 1 if **all** keys in the vector are present in the filter,
* 0 if **any** key is not present.
*/
int bloom_filter_contains_vec(const struct bloom_filter *filter,
const struct bloom_keyvec *v,
const struct bloom_filter_settings *settings);

uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
int version);

#endif #endif

View File

@ -1172,12 +1172,13 @@ static int bloom_filter_check(struct rev_info *rev,
return 0; return 0;


while (!result && range) { while (!result && range) {
fill_bloom_key(range->path, strlen(range->path), &key, rev->bloom_filter_settings); bloom_key_fill(&key, range->path, strlen(range->path),
rev->bloom_filter_settings);


if (bloom_filter_contains(filter, &key, rev->bloom_filter_settings)) if (bloom_filter_contains(filter, &key, rev->bloom_filter_settings))
result = 1; result = 1;


clear_bloom_key(&key); bloom_key_clear(&key);
range = range->next; range = range->next;
} }



View File

@ -675,24 +675,48 @@ static int forbid_bloom_filters(struct pathspec *spec)
{ {
if (spec->has_wildcard) if (spec->has_wildcard)
return 1; return 1;
if (spec->nr > 1)
return 1;
if (spec->magic & ~PATHSPEC_LITERAL) if (spec->magic & ~PATHSPEC_LITERAL)
return 1; return 1;
if (spec->nr && (spec->items[0].magic & ~PATHSPEC_LITERAL)) for (size_t nr = 0; nr < spec->nr; nr++)
return 1; if (spec->items[nr].magic & ~PATHSPEC_LITERAL)
return 1;


return 0; return 0;
} }


static void release_revisions_bloom_keyvecs(struct rev_info *revs);

static int convert_pathspec_to_bloom_keyvec(struct bloom_keyvec **out,
const struct pathspec_item *pi,
const struct bloom_filter_settings *settings)
{
char *path_alloc = NULL;
const char *path;
size_t len;
int res = 0;

/* remove single trailing slash from path, if needed */
if (pi->len > 0 && pi->match[pi->len - 1] == '/') {
path_alloc = xmemdupz(pi->match, pi->len - 1);
path = path_alloc;
} else
path = pi->match;

len = strlen(path);
if (!len) {
res = -1;
goto cleanup;
}

*out = bloom_keyvec_new(path, len, settings);

cleanup:
free(path_alloc);
return res;
}

static void prepare_to_use_bloom_filter(struct rev_info *revs) static void prepare_to_use_bloom_filter(struct rev_info *revs)
{ {
struct pathspec_item *pi;
char *path_alloc = NULL;
const char *path, *p;
size_t len;
int path_component_nr = 1;

if (!revs->commits) if (!revs->commits)
return; return;


@ -708,48 +732,14 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
if (!revs->pruning.pathspec.nr) if (!revs->pruning.pathspec.nr)
return; return;


pi = &revs->pruning.pathspec.items[0]; revs->bloom_keyvecs_nr = revs->pruning.pathspec.nr;
CALLOC_ARRAY(revs->bloom_keyvecs, revs->bloom_keyvecs_nr);


/* remove single trailing slash from path, if needed */ for (int i = 0; i < revs->pruning.pathspec.nr; i++) {
if (pi->len > 0 && pi->match[pi->len - 1] == '/') { if (convert_pathspec_to_bloom_keyvec(&revs->bloom_keyvecs[i],
path_alloc = xmemdupz(pi->match, pi->len - 1); &revs->pruning.pathspec.items[i],
path = path_alloc; revs->bloom_filter_settings))
} else goto fail;
path = pi->match;

len = strlen(path);
if (!len) {
revs->bloom_filter_settings = NULL;
free(path_alloc);
return;
}

p = path;
while (*p) {
/*
* At this point, the path is normalized to use Unix-style
* path separators. This is required due to how the
* changed-path Bloom filters store the paths.
*/
if (*p == '/')
path_component_nr++;
p++;
}

revs->bloom_keys_nr = path_component_nr;
ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);

fill_bloom_key(path, len, &revs->bloom_keys[0],
revs->bloom_filter_settings);
path_component_nr = 1;

p = path + len - 1;
while (p > path) {
if (*p == '/')
fill_bloom_key(path, p - path,
&revs->bloom_keys[path_component_nr++],
revs->bloom_filter_settings);
p--;
} }


if (trace2_is_enabled() && !bloom_filter_atexit_registered) { if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
@ -757,14 +747,18 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
bloom_filter_atexit_registered = 1; bloom_filter_atexit_registered = 1;
} }


free(path_alloc); return;

fail:
revs->bloom_filter_settings = NULL;
release_revisions_bloom_keyvecs(revs);
} }


static int check_maybe_different_in_bloom_filter(struct rev_info *revs, static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
struct commit *commit) struct commit *commit)
{ {
struct bloom_filter *filter; struct bloom_filter *filter;
int result = 1, j; int result = 0;


if (!revs->repo->objects->commit_graph) if (!revs->repo->objects->commit_graph)
return -1; return -1;
@ -779,10 +773,10 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
return -1; return -1;
} }


for (j = 0; result && j < revs->bloom_keys_nr; j++) { for (size_t nr = 0; !result && nr < revs->bloom_keyvecs_nr; nr++) {
result = bloom_filter_contains(filter, result = bloom_filter_contains_vec(filter,
&revs->bloom_keys[j], revs->bloom_keyvecs[nr],
revs->bloom_filter_settings); revs->bloom_filter_settings);
} }


if (result) if (result)
@ -823,7 +817,7 @@ static int rev_compare_tree(struct rev_info *revs,
return REV_TREE_SAME; return REV_TREE_SAME;
} }


if (revs->bloom_keys_nr && !nth_parent) { if (revs->bloom_keyvecs_nr && !nth_parent) {
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);


if (bloom_ret == 0) if (bloom_ret == 0)
@ -850,7 +844,7 @@ static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit,
if (!t1) if (!t1)
return 0; return 0;


if (!nth_parent && revs->bloom_keys_nr) { if (!nth_parent && revs->bloom_keyvecs_nr) {
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
if (!bloom_ret) if (!bloom_ret)
return 1; return 1;
@ -3202,6 +3196,14 @@ static void release_revisions_mailmap(struct string_list *mailmap)


static void release_revisions_topo_walk_info(struct topo_walk_info *info); static void release_revisions_topo_walk_info(struct topo_walk_info *info);


static void release_revisions_bloom_keyvecs(struct rev_info *revs)
{
for (size_t nr = 0; nr < revs->bloom_keyvecs_nr; nr++)
bloom_keyvec_free(revs->bloom_keyvecs[nr]);
FREE_AND_NULL(revs->bloom_keyvecs);
revs->bloom_keyvecs_nr = 0;
}

static void free_void_commit_list(void *list) static void free_void_commit_list(void *list)
{ {
free_commit_list(list); free_commit_list(list);
@ -3230,11 +3232,7 @@ void release_revisions(struct rev_info *revs)
clear_decoration(&revs->treesame, free); clear_decoration(&revs->treesame, free);
line_log_free(revs); line_log_free(revs);
oidset_clear(&revs->missing_commits); oidset_clear(&revs->missing_commits);

release_revisions_bloom_keyvecs(revs);
for (int i = 0; i < revs->bloom_keys_nr; i++)
clear_bloom_key(&revs->bloom_keys[i]);
FREE_AND_NULL(revs->bloom_keys);
revs->bloom_keys_nr = 0;
} }


static void add_child(struct rev_info *revs, struct commit *parent, struct commit *child) static void add_child(struct rev_info *revs, struct commit *parent, struct commit *child)

View File

@ -62,7 +62,7 @@ struct repository;
struct rev_info; struct rev_info;
struct string_list; struct string_list;
struct saved_parents; struct saved_parents;
struct bloom_key; struct bloom_keyvec;
struct bloom_filter_settings; struct bloom_filter_settings;
struct option; struct option;
struct parse_opt_ctx_t; struct parse_opt_ctx_t;
@ -360,8 +360,8 @@ struct rev_info {


/* Commit graph bloom filter fields */ /* Commit graph bloom filter fields */
/* The bloom filter key(s) for the pathspec */ /* The bloom filter key(s) for the pathspec */
struct bloom_key *bloom_keys; struct bloom_keyvec **bloom_keyvecs;
int bloom_keys_nr; int bloom_keyvecs_nr;


/* /*
* The bloom filter settings used to generate the key. * The bloom filter settings used to generate the key.

View File

@ -12,13 +12,13 @@ static struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
static void add_string_to_filter(const char *data, struct bloom_filter *filter) { static void add_string_to_filter(const char *data, struct bloom_filter *filter) {
struct bloom_key key; struct bloom_key key;


fill_bloom_key(data, strlen(data), &key, &settings); bloom_key_fill(&key, data, strlen(data), &settings);
printf("Hashes:"); printf("Hashes:");
for (size_t i = 0; i < settings.num_hashes; i++) for (size_t i = 0; i < settings.num_hashes; i++)
printf("0x%08x|", key.hashes[i]); printf("0x%08x|", key.hashes[i]);
printf("\n"); printf("\n");
add_key_to_filter(&key, filter, &settings); add_key_to_filter(&key, filter, &settings);
clear_bloom_key(&key); bloom_key_clear(&key);
} }


static void print_bloom_filter(struct bloom_filter *filter) { static void print_bloom_filter(struct bloom_filter *filter) {
@ -61,13 +61,13 @@ int cmd__bloom(int argc, const char **argv)
uint32_t hashed; uint32_t hashed;
if (argc < 3) if (argc < 3)
usage(bloom_usage); usage(bloom_usage);
hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2])); hashed = test_bloom_murmur3_seeded(0, argv[2], strlen(argv[2]), 2);
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
} }


if (!strcmp(argv[1], "get_murmur3_seven_highbit")) { if (!strcmp(argv[1], "get_murmur3_seven_highbit")) {
uint32_t hashed; uint32_t hashed;
hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7); hashed = test_bloom_murmur3_seeded(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7, 2);
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
} }



View File

@ -66,8 +66,9 @@ sane_unset GIT_TRACE2_CONFIG_PARAMS


setup () { setup () {
rm -f "$TRASH_DIRECTORY/trace.perf" && rm -f "$TRASH_DIRECTORY/trace.perf" &&
git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom && eval git -c core.commitGraph=false log --pretty="format:%s" "$1" >log_wo_bloom &&
GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom eval "GIT_TRACE2_PERF=\"$TRASH_DIRECTORY/trace.perf\"" \
git -c core.commitGraph=true log --pretty="format:%s" "$1" >log_w_bloom
} }


test_bloom_filters_used () { test_bloom_filters_used () {
@ -138,10 +139,6 @@ test_expect_success 'git log with --walk-reflogs does not use Bloom filters' '
test_bloom_filters_not_used "--walk-reflogs -- A" test_bloom_filters_not_used "--walk-reflogs -- A"
' '


test_expect_success 'git log -- multiple path specs does not use Bloom filters' '
test_bloom_filters_not_used "-- file4 A/file1"
'

test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' ' test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' '
test_bloom_filters_not_used "-- ." test_bloom_filters_not_used "-- ."
' '
@ -151,9 +148,17 @@ test_expect_success 'git log with wildcard that resolves to a single path uses B
test_bloom_filters_used "-- *renamed" test_bloom_filters_used "-- *renamed"
' '


test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' ' test_expect_success 'git log with multiple literal paths uses Bloom filter' '
test_bloom_filters_not_used "-- *" && test_bloom_filters_used "-- file4 A/file1" &&
test_bloom_filters_not_used "-- file*" test_bloom_filters_used "-- *" &&
test_bloom_filters_used "-- file*"
'

test_expect_success 'git log with path contains a wildcard does not use Bloom filter' '
test_bloom_filters_not_used "-- file\*" &&
test_bloom_filters_not_used "-- A/\* file4" &&
test_bloom_filters_not_used "-- file4 A/\*" &&
test_bloom_filters_not_used "-- * A/\*"
' '


test_expect_success 'setup - add commit-graph to the chain without Bloom filters' ' test_expect_success 'setup - add commit-graph to the chain without Bloom filters' '