grep: prefetch necessary blobs
In partial clones, `git grep` fetches necessary blobs on demand one at a time, which can be very slow. To avoid this, when running in a partial clone, add an extra preliminary walk over the tree, similar to grep_tree(), which collects the blobs of interest and then prefetches them. Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> main
parent
463c1bfc2b
commit
854061ea54
143
builtin/grep.c
143
builtin/grep.c
|
|
@ -28,9 +28,12 @@
|
|||
#include "object-file.h"
|
||||
#include "object-name.h"
|
||||
#include "odb.h"
|
||||
#include "oid-array.h"
|
||||
#include "oidset.h"
|
||||
#include "packfile.h"
|
||||
#include "pager.h"
|
||||
#include "path.h"
|
||||
#include "promisor-remote.h"
|
||||
#include "read-cache-ll.h"
|
||||
#include "write-or-die.h"
|
||||
|
||||
|
|
@ -692,6 +695,144 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
|
|||
return hit;
|
||||
}
|
||||
|
||||
static void collect_blob_oids_for_tree(struct repository *repo,
|
||||
const struct pathspec *pathspec,
|
||||
struct tree_desc *tree,
|
||||
struct strbuf *base,
|
||||
int tn_len,
|
||||
struct oidset *blob_oids)
|
||||
{
|
||||
struct name_entry entry;
|
||||
int old_baselen = base->len;
|
||||
struct strbuf name = STRBUF_INIT;
|
||||
enum interesting match = entry_not_interesting;
|
||||
|
||||
while (tree_entry(tree, &entry)) {
|
||||
if (match != all_entries_interesting) {
|
||||
strbuf_addstr(&name, base->buf + tn_len);
|
||||
match = tree_entry_interesting(repo->index,
|
||||
&entry, &name,
|
||||
pathspec);
|
||||
strbuf_reset(&name);
|
||||
|
||||
if (match == all_entries_not_interesting)
|
||||
break;
|
||||
if (match == entry_not_interesting)
|
||||
continue;
|
||||
}
|
||||
|
||||
strbuf_add(base, entry.path, tree_entry_len(&entry));
|
||||
|
||||
if (S_ISREG(entry.mode)) {
|
||||
if (!odb_has_object(repo->objects, &entry.oid, 0))
|
||||
oidset_insert(blob_oids, &entry.oid);
|
||||
} else if (S_ISDIR(entry.mode)) {
|
||||
enum object_type type;
|
||||
struct tree_desc sub_tree;
|
||||
void *data;
|
||||
unsigned long size;
|
||||
|
||||
data = odb_read_object(repo->objects, &entry.oid,
|
||||
&type, &size);
|
||||
if (!data)
|
||||
die(_("unable to read tree (%s)"),
|
||||
oid_to_hex(&entry.oid));
|
||||
|
||||
strbuf_addch(base, '/');
|
||||
init_tree_desc(&sub_tree, &entry.oid, data, size);
|
||||
collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
|
||||
base, tn_len, blob_oids);
|
||||
free(data);
|
||||
}
|
||||
/*
|
||||
* ...no else clause for S_ISGITLINK: submodules have their
|
||||
* own promisor configuration and would need separate fetches
|
||||
* anyway.
|
||||
*/
|
||||
|
||||
strbuf_setlen(base, old_baselen);
|
||||
}
|
||||
|
||||
strbuf_release(&name);
|
||||
}
|
||||
|
||||
static void collect_blob_oids_for_treeish(struct grep_opt *opt,
|
||||
const struct pathspec *pathspec,
|
||||
const struct object_id *tree_ish_oid,
|
||||
const char *name,
|
||||
struct oidset *blob_oids)
|
||||
{
|
||||
struct tree_desc tree;
|
||||
void *data;
|
||||
unsigned long size;
|
||||
struct strbuf base = STRBUF_INIT;
|
||||
int len;
|
||||
|
||||
data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
|
||||
OBJ_TREE, &size, NULL);
|
||||
|
||||
if (!data)
|
||||
return;
|
||||
|
||||
len = name ? strlen(name) : 0;
|
||||
if (len) {
|
||||
strbuf_add(&base, name, len);
|
||||
strbuf_addch(&base, ':');
|
||||
}
|
||||
init_tree_desc(&tree, tree_ish_oid, data, size);
|
||||
|
||||
collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
|
||||
&base, base.len, blob_oids);
|
||||
|
||||
strbuf_release(&base);
|
||||
free(data);
|
||||
}
|
||||
|
||||
static void prefetch_grep_blobs(struct grep_opt *opt,
|
||||
const struct pathspec *pathspec,
|
||||
const struct object_array *list)
|
||||
{
|
||||
struct oidset blob_oids = OIDSET_INIT;
|
||||
|
||||
/* Exit if we're not in a partial clone */
|
||||
if (!repo_has_promisor_remote(opt->repo))
|
||||
return;
|
||||
|
||||
/* For each tree, gather the blobs in it */
|
||||
for (int i = 0; i < list->nr; i++) {
|
||||
struct object *real_obj;
|
||||
|
||||
obj_read_lock();
|
||||
real_obj = deref_tag(opt->repo, list->objects[i].item,
|
||||
NULL, 0);
|
||||
obj_read_unlock();
|
||||
|
||||
if (real_obj &&
|
||||
(real_obj->type == OBJ_COMMIT ||
|
||||
real_obj->type == OBJ_TREE))
|
||||
collect_blob_oids_for_treeish(opt, pathspec,
|
||||
&real_obj->oid,
|
||||
list->objects[i].name,
|
||||
&blob_oids);
|
||||
}
|
||||
|
||||
/* Prefetch the blobs we found */
|
||||
if (oidset_size(&blob_oids)) {
|
||||
struct oid_array to_fetch = OID_ARRAY_INIT;
|
||||
struct oidset_iter iter;
|
||||
const struct object_id *oid;
|
||||
|
||||
oidset_iter_init(&blob_oids, &iter);
|
||||
while ((oid = oidset_iter_next(&iter)))
|
||||
oid_array_append(&to_fetch, oid);
|
||||
|
||||
promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);
|
||||
|
||||
oid_array_clear(&to_fetch);
|
||||
}
|
||||
oidset_clear(&blob_oids);
|
||||
}
|
||||
|
||||
static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
|
||||
struct object *obj, const char *name, const char *path)
|
||||
{
|
||||
|
|
@ -732,6 +873,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
|
|||
int hit = 0;
|
||||
const unsigned int nr = list->nr;
|
||||
|
||||
prefetch_grep_blobs(opt, pathspec, list);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct object *real_obj;
|
||||
|
||||
|
|
|
|||
|
|
@ -1929,4 +1929,62 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
|
|||
test_cmp expected actual
|
||||
'
|
||||
|
||||
test_expect_success 'grep of revision in partial clone batches prefetch and honors pathspec' '
	test_when_finished "rm -rf grep-partial-src grep-partial" &&

	# Source repository: configured to serve filtered fetches, holding
	# three blobs spread over two directories.
	git init grep-partial-src &&
	git -C grep-partial-src config uploadpack.allowfilter 1 &&
	git -C grep-partial-src config uploadpack.allowanysha1inwant 1 &&
	mkdir grep-partial-src/a grep-partial-src/b &&
	echo "needle in haystack" >grep-partial-src/a/matches.txt &&
	echo "nothing to see here" >grep-partial-src/a/nomatch.txt &&
	echo "needle again" >grep-partial-src/b/matches.md &&
	git -C grep-partial-src add . &&
	git -C grep-partial-src commit -m "initial" &&

	# Blobless clone without a checkout.
	git clone --no-checkout --filter=blob:none \
		"file://$(pwd)/grep-partial-src" grep-partial &&

	# Right after cloning, every one of the three blobs is absent.
	git -C grep-partial rev-list --quiet --objects --missing=print \
		HEAD >missing &&
	test_line_count = 3 missing &&

	# Step 1: grep limited to a/*.txt.  Only the two blobs under a/
	# may be prefetched, and they must arrive in one batched request.
	GIT_TRACE2_EVENT="$(pwd)/grep-trace-pathspec" \
		git -C grep-partial grep -c "needle" HEAD -- "a/*.txt" >result &&

	# Of the paths selected, a/matches.txt alone contains "needle".
	test_line_count = 1 result &&

	# Two objects were requested from the promisor, and the server
	# wrote exactly two objects into its response pack.
	test_trace2_data promisor fetch_count 2 <grep-trace-pathspec &&
	test_trace2_data pack-objects written 2 <grep-trace-pathspec &&

	# The blob for b/matches.md has not been fetched yet.
	git -C grep-partial rev-list --quiet --objects --missing=print \
		HEAD >missing &&
	test_line_count = 1 missing &&

	# Step 2: grep with no pathspec.  Both subdirectories are walked,
	# yet only the one blob that is still missing may be requested.
	GIT_TRACE2_EVENT="$(pwd)/grep-trace-all" \
		git -C grep-partial grep -c "needle" HEAD >result &&

	test_line_count = 2 result &&
	test_trace2_data promisor fetch_count 1 <grep-trace-all &&
	test_trace2_data pack-objects written 1 <grep-trace-all &&

	# Nothing is missing any more.
	git -C grep-partial rev-list --quiet --objects --missing=print \
		HEAD >missing &&
	test_line_count = 0 missing
'
|
||||
|
||||
test_done
|
||||
|
|
|
|||
Loading…
Reference in New Issue