Merge branch 'en/batch-prefetch'

In a lazy clone, "git cherry" and "git grep" often fetch necessary
blob objects one by one from promisor remotes.  It has been corrected
to collect necessary object names and fetch them in bulk to gain
reasonable performance.

* en/batch-prefetch:
  grep: prefetch necessary blobs
  builtin/log: prefetch necessary blobs for `git cherry`
  patch-ids.h: add missing trailing parenthesis in documentation comment
  promisor-remote: document caller filtering contract
main
Junio C Hamano 2026-05-27 14:15:43 +09:00
commit 6d2ba7ead7
6 changed files with 371 additions and 1 deletions

View File

@ -28,9 +28,12 @@
#include "object-file.h"
#include "object-name.h"
#include "odb.h"
#include "oid-array.h"
#include "oidset.h"
#include "packfile.h"
#include "pager.h"
#include "path.h"
#include "promisor-remote.h"
#include "read-cache-ll.h"
#include "write-or-die.h"

@ -692,6 +695,144 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
return hit;
}

static void collect_blob_oids_for_tree(struct repository *repo,
const struct pathspec *pathspec,
struct tree_desc *tree,
struct strbuf *base,
int tn_len,
struct oidset *blob_oids)
{
struct name_entry entry;
int old_baselen = base->len;
struct strbuf name = STRBUF_INIT;
enum interesting match = entry_not_interesting;

while (tree_entry(tree, &entry)) {
if (match != all_entries_interesting) {
strbuf_addstr(&name, base->buf + tn_len);
match = tree_entry_interesting(repo->index,
&entry, &name,
pathspec);
strbuf_reset(&name);

if (match == all_entries_not_interesting)
break;
if (match == entry_not_interesting)
continue;
}

strbuf_add(base, entry.path, tree_entry_len(&entry));

if (S_ISREG(entry.mode)) {
if (!odb_has_object(repo->objects, &entry.oid, 0))
oidset_insert(blob_oids, &entry.oid);
} else if (S_ISDIR(entry.mode)) {
enum object_type type;
struct tree_desc sub_tree;
void *data;
unsigned long size;

data = odb_read_object(repo->objects, &entry.oid,
&type, &size);
if (!data)
die(_("unable to read tree (%s)"),
oid_to_hex(&entry.oid));

strbuf_addch(base, '/');
init_tree_desc(&sub_tree, &entry.oid, data, size);
collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
base, tn_len, blob_oids);
free(data);
}
/*
* ...no else clause for S_ISGITLINK: submodules have their
* own promisor configuration and would need separate fetches
* anyway.
*/

strbuf_setlen(base, old_baselen);
}

strbuf_release(&name);
}

static void collect_blob_oids_for_treeish(struct grep_opt *opt,
const struct pathspec *pathspec,
const struct object_id *tree_ish_oid,
const char *name,
struct oidset *blob_oids)
{
struct tree_desc tree;
void *data;
unsigned long size;
struct strbuf base = STRBUF_INIT;
int len;

data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
OBJ_TREE, &size, NULL);

if (!data)
return;

len = name ? strlen(name) : 0;
if (len) {
strbuf_add(&base, name, len);
strbuf_addch(&base, ':');
}
init_tree_desc(&tree, tree_ish_oid, data, size);

collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
&base, base.len, blob_oids);

strbuf_release(&base);
free(data);
}

static void prefetch_grep_blobs(struct grep_opt *opt,
const struct pathspec *pathspec,
const struct object_array *list)
{
struct oidset blob_oids = OIDSET_INIT;

/* Exit if we're not in a partial clone */
if (!repo_has_promisor_remote(opt->repo))
return;

/* For each tree, gather the blobs in it */
for (int i = 0; i < list->nr; i++) {
struct object *real_obj;

obj_read_lock();
real_obj = deref_tag(opt->repo, list->objects[i].item,
NULL, 0);
obj_read_unlock();

if (real_obj &&
(real_obj->type == OBJ_COMMIT ||
real_obj->type == OBJ_TREE))
collect_blob_oids_for_treeish(opt, pathspec,
&real_obj->oid,
list->objects[i].name,
&blob_oids);
}

/* Prefetch the blobs we found */
if (oidset_size(&blob_oids)) {
struct oid_array to_fetch = OID_ARRAY_INIT;
struct oidset_iter iter;
const struct object_id *oid;

oidset_iter_init(&blob_oids, &iter);
while ((oid = oidset_iter_next(&iter)))
oid_array_append(&to_fetch, oid);

promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);

oid_array_clear(&to_fetch);
}
oidset_clear(&blob_oids);
}

static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
struct object *obj, const char *name, const char *path)
{
@ -732,6 +873,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
int hit = 0;
const unsigned int nr = list->nr;

prefetch_grep_blobs(opt, pathspec, list);

for (i = 0; i < nr; i++) {
struct object *real_obj;


View File

@ -21,10 +21,12 @@
#include "color.h"
#include "commit.h"
#include "diff.h"
#include "diffcore.h"
#include "diff-merges.h"
#include "revision.h"
#include "log-tree.h"
#include "oid-array.h"
#include "oidset.h"
#include "tag.h"
#include "reflog-walk.h"
#include "patch-ids.h"
@ -43,9 +45,11 @@
#include "utf8.h"

#include "commit-reach.h"
#include "promisor-remote.h"
#include "range-diff.h"
#include "tmp-objdir.h"
#include "tree.h"
#include "userdiff.h"
#include "write-or-die.h"

#define MAIL_DEFAULT_WRAP 72
@ -2602,6 +2606,131 @@ static void print_commit(char sign, struct commit *commit, int verbose,
}
}

/*
* Enumerate blob OIDs from a single commit's diff, inserting them into blobs.
* Skips files whose userdiff driver explicitly declares binary status
* (drv->binary > 0), since patch-ID uses oid_to_hex() for those and
* never reads blob content. Use userdiff_find_by_path() since
* diff_filespec_load_driver() is static in diff.c.
*
* Clean up with diff_queue_clear() (from diffcore.h).
*/
static void collect_diff_blob_oids(struct commit *commit,
struct diff_options *opts,
struct oidset *blobs)
{
struct diff_queue_struct *q;

/*
* Merge commits are filtered out by patch_id_defined() in patch-ids.c,
* so we'll never be called with one.
*/
assert(!commit->parents || !commit->parents->next);

if (commit->parents)
diff_tree_oid(&commit->parents->item->object.oid,
&commit->object.oid, "", opts);
else
diff_root_tree_oid(&commit->object.oid, "", opts);
diffcore_std(opts);

q = &diff_queued_diff;
for (int i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
struct userdiff_driver *drv;

/* Skip binary files */
drv = userdiff_find_by_path(opts->repo->index, p->one->path);
if (drv && drv->binary > 0)
continue;

if (DIFF_FILE_VALID(p->one) &&
odb_read_object_info_extended(opts->repo->objects,
&p->one->oid, NULL,
OBJECT_INFO_FOR_PREFETCH))
oidset_insert(blobs, &p->one->oid);
if (DIFF_FILE_VALID(p->two) &&
odb_read_object_info_extended(opts->repo->objects,
&p->two->oid, NULL,
OBJECT_INFO_FOR_PREFETCH))
oidset_insert(blobs, &p->two->oid);
}
diff_queue_clear(q);
}

static int always_match(const void *cmp_data UNUSED,
const struct hashmap_entry *entry1 UNUSED,
const struct hashmap_entry *entry2 UNUSED,
const void *keydata UNUSED)
{
return 0;
}

/*
* Prefetch blobs for git cherry in partial clones.
*
* Called between the revision walk (which builds the head-side
* commit list) and the has_commit_patch_id() comparison loop.
*
* Uses a cmpfn-swap trick to avoid reading blobs: temporarily
* replaces the hashmap's comparison function with a trivial
* always-match function, so hashmap_get()/hashmap_get_next() match
* any entry with the same oidhash bucket. These are the set of oids
* that would trigger patch_id_neq() during normal lookup and cause
* blobs to be read on demand, and we want to prefetch them all at
* once instead.
*/
static void prefetch_cherry_blobs(struct repository *repo,
struct commit_list *list,
struct patch_ids *ids)
{
struct oidset blobs = OIDSET_INIT;
hashmap_cmp_fn original_cmpfn;

/* Exit if we're not in a partial clone */
if (!repo_has_promisor_remote(repo))
return;

/* Save original cmpfn, replace with always_match */
original_cmpfn = ids->patches.cmpfn;
ids->patches.cmpfn = always_match;

/* Find header-only collisions, gather blobs from those commits */
for (struct commit_list *l = list; l; l = l->next) {
struct commit *c = l->item;
bool match_found = false;
for (struct patch_id *cur = patch_id_iter_first(c, ids);
cur;
cur = patch_id_iter_next(cur, ids)) {
match_found = true;
collect_diff_blob_oids(cur->commit, &ids->diffopts,
&blobs);
}
if (match_found)
collect_diff_blob_oids(c, &ids->diffopts, &blobs);
}

/* Restore original cmpfn */
ids->patches.cmpfn = original_cmpfn;

/* If we have any blobs to fetch, fetch them */
if (oidset_size(&blobs)) {
struct oid_array to_fetch = OID_ARRAY_INIT;
struct oidset_iter iter;
const struct object_id *oid;

oidset_iter_init(&blobs, &iter);
while ((oid = oidset_iter_next(&iter)))
oid_array_append(&to_fetch, oid);

promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);

oid_array_clear(&to_fetch);
}

oidset_clear(&blobs);
}

int cmd_cherry(int argc,
const char **argv,
const char *prefix,
@ -2673,6 +2802,8 @@ int cmd_cherry(int argc,
commit_list_insert(commit, &list);
}

prefetch_cherry_blobs(the_repository, list, &ids);

for (struct commit_list *l = list; l; l = l->next) {
char sign = '+';


View File

@ -37,7 +37,7 @@ int has_commit_patch_id(struct commit *commit, struct patch_ids *);
* struct patch_id *cur;
* for (cur = patch_id_iter_first(commit, ids);
* cur;
* cur = patch_id_iter_next(cur, ids) {
* cur = patch_id_iter_next(cur, ids)) {
* ... look at cur->commit
* }
*/

View File

@ -29,6 +29,17 @@ int repo_has_promisor_remote(struct repository *r);
* Fetches all requested objects from all promisor remotes, trying them one at
* a time until all objects are fetched.
*
* Callers are responsible for filtering out OIDs that are already present
* locally before calling this function: every supplied OID is sent in the
* fetch request, even if the object already exists in the local object
* store. (Only after a fetch failure does this function fall back to
* stripping already-present OIDs from the list before trying the next
* configured promisor remote.) Callers should also deduplicate the OIDs.
*
* To test for local presence without triggering a lazy fetch (which would
* defeat the purpose of batching), use odb_has_object(..., 0) or
* odb_read_object_info_extended() with OBJECT_INFO_FOR_PREFETCH.
*
* If oid_nr is 0, this function returns immediately.
*/
void promisor_remote_get_direct(struct repository *repo,

View File

@ -78,4 +78,31 @@ test_expect_success 'cherry ignores whitespace' '
test_cmp expect actual
'

# Reuse the expect file from the previous test, in a partial clone
test_expect_success 'cherry in partial clone does bulk prefetch' '
test_config uploadpack.allowfilter 1 &&
test_config uploadpack.allowanysha1inwant 1 &&
test_when_finished "rm -rf copy" &&

git clone --bare --filter=blob:none file://"$(pwd)" copy &&
(
cd copy &&
GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
test_cmp ../expect actual &&

grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
test_line_count = 1 fetches &&
test_trace2_data promisor fetch_count 4 <trace.output &&

# A second invocation should not refetch any blobs, since
# the prefetch is expected to filter out OIDs that are
# already present locally.
GIT_TRACE2_EVENT="$(pwd)/trace2.output" git cherry upstream-with-space feature-without-space >actual &&
test_cmp ../expect actual &&

! grep "child_start.*fetch.negotiationAlgorithm" trace2.output &&
! grep "\"key\":\"fetch_count\"" trace2.output
)
'

test_done

View File

@ -1929,4 +1929,62 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
test_cmp expected actual
'

test_expect_success 'grep of revision in partial clone batches prefetch and honors pathspec' '
test_when_finished "rm -rf grep-partial-src grep-partial" &&

git init grep-partial-src &&
(
cd grep-partial-src &&
git config uploadpack.allowfilter 1 &&
git config uploadpack.allowanysha1inwant 1 &&
mkdir a b &&
echo "needle in haystack" >a/matches.txt &&
echo "nothing to see here" >a/nomatch.txt &&
echo "needle again" >b/matches.md &&
git add . &&
git commit -m "initial"
) &&

git clone --no-checkout --filter=blob:none \
"file://$(pwd)/grep-partial-src" grep-partial &&

# All three blobs are missing immediately after a blobless clone.
git -C grep-partial rev-list --quiet --objects \
--missing=print HEAD >missing &&
test_line_count = 3 missing &&

# A pathspec-limited grep should prefetch only the two blobs
# in a/. It should fetch both blobs in one batched request.
GIT_TRACE2_EVENT="$(pwd)/grep-trace-pathspec" \
git -C grep-partial grep -c "needle" HEAD -- "a/*.txt" >result &&

# Only a/matches.txt contains "needle" among the matched paths.
test_line_count = 1 result &&

# Exactly the two a/*.txt blobs should have been requested, and
# the server packed those two objects in the response.
test_trace2_data promisor fetch_count 2 <grep-trace-pathspec &&
test_trace2_data pack-objects written 2 <grep-trace-pathspec &&

# b/matches.md should still be missing locally.
git -C grep-partial rev-list --quiet --objects \
--missing=print HEAD >missing &&
test_line_count = 1 missing &&

# A second grep without a pathspec must recurse into both
# subdirectories, but should request only the still-missing blob
# from the promisor.
GIT_TRACE2_EVENT="$(pwd)/grep-trace-all" \
git -C grep-partial grep -c "needle" HEAD >result &&

test_line_count = 2 result &&
test_trace2_data promisor fetch_count 1 <grep-trace-all &&
test_trace2_data pack-objects written 1 <grep-trace-all &&

# Everything is local now.
git -C grep-partial rev-list --quiet --objects \
--missing=print HEAD >missing &&
test_line_count = 0 missing
'

test_done