From 3a1345af289703a04ff4754f9167ed3cfa2dfdce Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:52:45 -0500 Subject: [PATCH 01/11] for_each_alternate_ref: handle failure from real_pathdup() In older versions of git, if real_path() failed to resolve the alternate object store path, we would die() with an error. However, since 4ac9006f8 (real_path: have callers use real_pathdup and strbuf_realpath, 2016-12-12) we use the real_pathdup() function, which may return NULL. Since we don't check the return value, we can segfault. This is hard to trigger in practice, since we check that the path is accessible before creating the alternate_object_database struct. But it could be removed racily, or we could see a transient filesystem error. We could restore the original behavior by switching back to xstrdup(real_path()). However, dying is probably not the best option here. This whole function is best-effort already; there might not even be a repository around the shared objects at all. And if the alternate store has gone away, there are no objects to show. So let's just quietly return, as we would if we failed to open "refs/", or if upload-pack failed to start, etc. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- transport.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transport.c b/transport.c index d72e089484..9ce0ee96ba 100644 --- a/transport.c +++ b/transport.c @@ -1222,6 +1222,8 @@ static int refs_from_alternate_cb(struct alternate_object_database *e, struct alternate_refs_data *cb = data; other = real_pathdup(e->path); + if (!other) + return 0; len = strlen(other); while (other[len-1] == '/') From ece657f39939d067265eaf57e519a20019bcf794 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:52:50 -0500 Subject: [PATCH 02/11] for_each_alternate_ref: stop trimming trailing slashes The real_pathdup() function will have removed extra slashes for us already (on top of the normalize_path() done when we created the alternate_object_database struct in the first place). Incidentally, this also fixes the case where the path is just "/", which would read off the start of the array. That doesn't seem possible to trigger in practice, though, as link_alt_odb_entry() blindly eats trailing slashes, including a bare "/". Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- transport.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/transport.c b/transport.c index 9ce0ee96ba..6fba9e95b2 100644 --- a/transport.c +++ b/transport.c @@ -1226,8 +1226,6 @@ static int refs_from_alternate_cb(struct alternate_object_database *e, return 0; len = strlen(other); - while (other[len-1] == '/') - other[--len] = '\0'; if (len < 8 || memcmp(other + len - 8, "/objects", 8)) goto out; /* Is this a git repository with refs? */ From 5e8c968c6465d35c9047ab3ed522cb08d46386f5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:52:54 -0500 Subject: [PATCH 03/11] for_each_alternate_ref: use strbuf for path allocation We have a string with ".../objects" pointing to the alternate object store, and overwrite bits of it to look at other paths in the (potential) git repository holding it. This works because the only path we care about is "refs", which is shorter than "objects". Using a strbuf to hold the path lets us get rid of some magic numbers, and makes it more obvious that the memory operations are safe. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- transport.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/transport.c b/transport.c index 6fba9e95b2..6b7847131f 100644 --- a/transport.c +++ b/transport.c @@ -1214,34 +1214,34 @@ struct alternate_refs_data { static int refs_from_alternate_cb(struct alternate_object_database *e, void *data) { - char *other; - size_t len; + struct strbuf path = STRBUF_INIT; + size_t base_len; struct remote *remote; struct transport *transport; const struct ref *extra; struct alternate_refs_data *cb = data; - other = real_pathdup(e->path); - if (!other) - return 0; - len = strlen(other); + if (!strbuf_realpath(&path, e->path, 0)) + goto out; + if (!strbuf_strip_suffix(&path, "/objects")) + goto out; + base_len = path.len; - if (len < 8 || memcmp(other + len - 8, "/objects", 8)) - goto out; /* Is this a git repository with refs? */ - memcpy(other + len - 8, "/refs", 6); - if (!is_directory(other)) + strbuf_addstr(&path, "/refs"); + if (!is_directory(path.buf)) goto out; - other[len - 8] = '\0'; - remote = remote_get(other); - transport = transport_get(remote, other); + strbuf_setlen(&path, base_len); + + remote = remote_get(path.buf); + transport = transport_get(remote, path.buf); for (extra = transport_get_remote_refs(transport); extra; extra = extra->next) cb->fn(extra, cb->data); transport_disconnect(transport); out: - free(other); + strbuf_release(&path); return 0; } From 2429d63a46b141bb5006b8c1ea82e2d0163ab626 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:52:57 -0500 Subject: [PATCH 04/11] for_each_alternate_ref: pass name/oid instead of ref struct Breaking down the fields in the interface makes it easier to change the backend of for_each_alternate_ref to something that doesn't use "struct ref" internally. The only field that callers actually look at is the oid, anyway. The refname is kept in the interface as a plausible thing for future code to want. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/receive-pack.c | 6 ++++-- fetch-pack.c | 12 ++++++++---- transport.c | 2 +- transport.h | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index 1dbb8a0692..d21332d9e7 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -277,10 +277,12 @@ static int show_one_alternate_sha1(const unsigned char sha1[20], void *unused) return 0; } -static void collect_one_alternate_ref(const struct ref *ref, void *data) +static void collect_one_alternate_ref(const char *refname, + const struct object_id *oid, + void *data) { struct sha1_array *sa = data; - sha1_array_append(sa, ref->old_oid.hash); + sha1_array_append(sa, oid->hash); } static void write_head_info(void) diff --git a/fetch-pack.c b/fetch-pack.c index 601f0779a1..54f84c5733 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -253,9 +253,11 @@ static void send_request(struct fetch_pack_args *args, write_or_die(fd, buf->buf, buf->len); } -static void insert_one_alternate_ref(const struct ref *ref, void *unused) +static void insert_one_alternate_ref(const char *refname, + const struct object_id *oid, + void *unused) { - rev_list_insert_ref(NULL, ref->old_oid.hash); + rev_list_insert_ref(NULL, oid->hash); } #define INITIAL_FLUSH 16 @@ -619,9 +621,11 @@ static void filter_refs(struct fetch_pack_args *args, *refs = newlist; } -static void mark_alternate_complete(const struct ref *ref, void *unused) +static void mark_alternate_complete(const char *refname, + const struct object_id *oid, + void *unused) { - mark_complete(ref->old_oid.hash); + mark_complete(oid->hash); } static int everything_local(struct fetch_pack_args *args, diff --git a/transport.c b/transport.c index 6b7847131f..c87147046f 100644 --- a/transport.c +++ b/transport.c @@ -1238,7 +1238,7 @@ static int refs_from_alternate_cb(struct alternate_object_database *e, for (extra = transport_get_remote_refs(transport); extra; extra = extra->next) - cb->fn(extra, cb->data); + cb->fn(extra->name, &extra->old_oid, cb->data); transport_disconnect(transport); out: strbuf_release(&path); diff --git a/transport.h b/transport.h index e597b31b38..bc5571574b 100644 --- a/transport.h +++ b/transport.h @@ -255,6 +255,6 @@ int transport_refs_pushed(struct ref *ref); void transport_print_push_status(const char *dest, struct ref *refs, int verbose, int porcelain, unsigned int *reject_reasons); -typedef void alternate_ref_fn(const struct ref *, void *); +typedef void alternate_ref_fn(const char *refname, const struct object_id *oid, void *); extern void for_each_alternate_ref(alternate_ref_fn, void *); #endif From a10a17877bcce27b5677b5ac9f1a4e58d5bd8023 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:53:00 -0500 Subject: [PATCH 05/11] for_each_alternate_ref: replace transport code with for-each-ref The current method for getting the refs from an alternate is to run upload-pack in the alternate and parse its output using the normal transport code. This works and is reasonably short, but it has a very bad memory footprint when there are a lot of refs in the alternate. There are two problems: 1. It reads in all of the refs before passing any back to us. Which means that our peak memory usage has to store every ref (including duplicates for peeled variants), even if our callback could determine that some are not interesting (e.g., because they point to the same sha1 as another ref). 2. It allocates a "struct ref" for each one. Among other things, this contains 3 separate 20-byte oids, along with the name and various pointers. That can add up, especially if the callback is only interested in the sha1 (which it can store in a sha1_array as just 20 bytes). On a particularly pathological case, where the alternate had over 80 million refs pointing to only around 60,000 unique objects, the peak heap usage of "git clone --reference" grew to over 25GB. This patch instead calls git-for-each-ref in the alternate repository, and passes each line to the callback as we read it. That drops the peak heap of the same command to 50MB. I considered and rejected a few alternatives. We could read all of the refs in the alternate using our own ref code, just as we do with submodules. However, as memory footprint is one of the concerns here, we want to avoid loading those refs into our own memory as a whole. It's possible that this will be a better technique in the future when the ref code can more easily iterate without loading all of packed-refs into memory. Another option is to keep calling upload-pack, and just parse its output ourselves in a streaming fashion. Besides for-each-ref being simpler (we get to define the format ourselves, and don't have to deal with speaking the git protocol), it's more flexible for possible future changes. For instance, it might be useful for the caller to be able to limit the set of "interesting" alternate refs. The motivating example is one where many "forks" of a particular repository share object storage, and the shared storage has refs for each fork (which is why so many of the refs are duplicates; each fork has the same tags). A plausible future optimization would be to ask for the alternate refs for just _one_ fork (if you had some out-of-band way of knowing which was the most interesting or important for the current operation). Similarly, no callbacks actually care about the symref value of alternate refs, and as before, this patch ignores them entirely. However, if we wanted to add them, for-each-ref's "%(symref)" is going to be more flexible than upload-pack, because the latter only handles the HEAD symref due to historical constraints. There is one potential downside, though: unlike upload-pack, our for-each-ref command doesn't report the peeled value of refs. The existing code calls the alternate_ref_fn callback twice for tags: once for the tag, and once for the peeled value with the refname set to "ref^{}". For the callers in fetch-pack, this doesn't matter at all. We immediately peel each tag down to a commit either way (so there's a slight improvement, as do not bother passing the redundant data over the pipe). For the caller in receive-pack, it means we will not advertise the peeled values of tags in our alternate. However, we also don't advertise peeled values for our _own_ tags, so this is actually making things more consistent. It's unclear whether receive-pack advertising peeled values is a win or not. On one hand, giving more information to the other side may let it omit some objects from the push. On the other hand, for tags which both sides have, they simply bloat the advertisement. The upload-pack advertisement of git.git is about 30% larger than the receive-pack advertisement due to its peeled information. This patch omits the peeled information from for_each_alternate_ref entirely, and leaves it up to the caller whether they want to dig up the information. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- transport.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/transport.c b/transport.c index c87147046f..48864b3a9e 100644 --- a/transport.c +++ b/transport.c @@ -1206,6 +1206,42 @@ literal_copy: return xstrdup(url); } +static void read_alternate_refs(const char *path, + alternate_ref_fn *cb, + void *data) +{ + struct child_process cmd = CHILD_PROCESS_INIT; + struct strbuf line = STRBUF_INIT; + FILE *fh; + + cmd.git_cmd = 1; + argv_array_pushf(&cmd.args, "--git-dir=%s", path); + argv_array_push(&cmd.args, "for-each-ref"); + argv_array_push(&cmd.args, "--format=%(objectname) %(refname)"); + cmd.env = local_repo_env; + cmd.out = -1; + + if (start_command(&cmd)) + return; + + fh = xfdopen(cmd.out, "r"); + while (strbuf_getline_lf(&line, fh) != EOF) { + struct object_id oid; + + if (get_oid_hex(line.buf, &oid) || + line.buf[GIT_SHA1_HEXSZ] != ' ') { + warning("invalid line while parsing alternate refs: %s", + line.buf); + break; + } + + cb(line.buf + GIT_SHA1_HEXSZ + 1, &oid, data); + } + + fclose(fh); + finish_command(&cmd); +} + struct alternate_refs_data { alternate_ref_fn *fn; void *data; @@ -1216,9 +1252,6 @@ static int refs_from_alternate_cb(struct alternate_object_database *e, { struct strbuf path = STRBUF_INIT; size_t base_len; - struct remote *remote; - struct transport *transport; - const struct ref *extra; struct alternate_refs_data *cb = data; if (!strbuf_realpath(&path, e->path, 0)) @@ -1233,13 +1266,8 @@ static int refs_from_alternate_cb(struct alternate_object_database *e, goto out; strbuf_setlen(&path, base_len); - remote = remote_get(path.buf); - transport = transport_get(remote, path.buf); - for (extra = transport_get_remote_refs(transport); - extra; - extra = extra->next) - cb->fn(extra->name, &extra->old_oid, cb->data); - transport_disconnect(transport); + read_alternate_refs(path.buf, cb->fn, cb->data); + out: strbuf_release(&path); return 0; From 41a078c60b82bad4edf9d1bd8e826aae5f020ee5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:53:03 -0500 Subject: [PATCH 06/11] fetch-pack: cache results of for_each_alternate_ref We may run for_each_alternate_ref() twice, once in find_common() and once in everything_local(). This operation can be expensive, because it involves running a sub-process which must freshly load all of the alternate's refs from disk. Let's cache and reuse the results between the two calls. We can make some optimizations based on the particular use pattern in fetch-pack to keep our memory usage down. The first is that we only care about the sha1s, not the refs themselves. So it's OK to store only the sha1s, and to suppress duplicates. The natural fit would therefore be a sha1_array. However, sha1_array's de-duplication happens only after it has read and sorted all entries. It still stores each duplicate. For an alternate with a large number of refs pointing to the same commits, this is a needless expense. Instead, we'd prefer to eliminate duplicates before putting them in the cache, which implies using a hash. We can further note that fetch-pack will call parse_object() on each alternate sha1. We can therefore keep our cache as a set of pointers to "struct object". That gives us a place to put our "already seen" bit with an optimized hash lookup. And as a bonus, the object stores the sha1 for us, so pointer-to-object is all we need. There are two extra optimizations I didn't do here: - we actually store an array of pointer-to-object. Technically we could just walk the obj_hash table looking for entries with the ALTERNATE flag set (because our use case doesn't care about the order here). But that hash table may be mostly composed of non-ALTERNATE entries, so we'd waste time walking over them. So it would be a slight win in memory use, but a loss in CPU. - the items we pull out of the cache are actual "struct object"s, but then we feed "obj->sha1" to our sub-functions, which promptly call parse_object(). This second parse is cheap, because it starts with lookup_object() and will bail immediately when it sees we've already parsed the object. We could save the extra hash lookup, but it would involve refactoring the functions we call. It may or may not be worth the trouble. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- fetch-pack.c | 52 ++++++++++++++++++++++++++++++++++++++++++---------- object.h | 2 +- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/fetch-pack.c b/fetch-pack.c index 54f84c5733..e0f5d5ce87 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -35,6 +35,7 @@ static const char *alternate_shallow_file; #define COMMON_REF (1U << 2) #define SEEN (1U << 3) #define POPPED (1U << 4) +#define ALTERNATE (1U << 5) static int marked; @@ -67,6 +68,41 @@ static inline void print_verbose(const struct fetch_pack_args *args, fputc('\n', stderr); } +struct alternate_object_cache { + struct object **items; + size_t nr, alloc; +}; + +static void cache_one_alternate(const char *refname, + const struct object_id *oid, + void *vcache) +{ + struct alternate_object_cache *cache = vcache; + struct object *obj = parse_object(oid->hash); + + if (!obj || (obj->flags & ALTERNATE)) + return; + + obj->flags |= ALTERNATE; + ALLOC_GROW(cache->items, cache->nr + 1, cache->alloc); + cache->items[cache->nr++] = obj; +} + +static void for_each_cached_alternate(void (*cb)(struct object *)) +{ + static int initialized; + static struct alternate_object_cache cache; + size_t i; + + if (!initialized) { + for_each_alternate_ref(cache_one_alternate, &cache); + initialized = 1; + } + + for (i = 0; i < cache.nr; i++) + cb(cache.items[i]); +} + static void rev_list_push(struct commit *commit, int mark) { if (!(commit->object.flags & mark)) { @@ -253,11 +289,9 @@ static void send_request(struct fetch_pack_args *args, write_or_die(fd, buf->buf, buf->len); } -static void insert_one_alternate_ref(const char *refname, - const struct object_id *oid, - void *unused) +static void insert_one_alternate_object(struct object *obj) { - rev_list_insert_ref(NULL, oid->hash); + rev_list_insert_ref(NULL, obj->oid.hash); } #define INITIAL_FLUSH 16 @@ -300,7 +334,7 @@ static int find_common(struct fetch_pack_args *args, marked = 1; for_each_ref(rev_list_insert_ref_oid, NULL); - for_each_alternate_ref(insert_one_alternate_ref, NULL); + for_each_cached_alternate(insert_one_alternate_object); fetching = 0; for ( ; refs ; refs = refs->next) { @@ -621,11 +655,9 @@ static void filter_refs(struct fetch_pack_args *args, *refs = newlist; } -static void mark_alternate_complete(const char *refname, - const struct object_id *oid, - void *unused) +static void mark_alternate_complete(struct object *obj) { - mark_complete(oid->hash); + mark_complete(obj->oid.hash); } static int everything_local(struct fetch_pack_args *args, @@ -661,7 +693,7 @@ static int everything_local(struct fetch_pack_args *args, if (!args->deepen) { for_each_ref(mark_complete_oid, NULL); - for_each_alternate_ref(mark_alternate_complete, NULL); + for_each_cached_alternate(mark_alternate_complete); commit_list_sort_by_date(&complete); if (cutoff) mark_recent_complete_commits(args, cutoff); diff --git a/object.h b/object.h index 614a006756..f52957dcb3 100644 --- a/object.h +++ b/object.h @@ -29,7 +29,7 @@ struct object_array { /* * object flag allocation: * revision.h: 0---------10 26 - * fetch-pack.c: 0---4 + * fetch-pack.c: 0---5 * walker.c: 0-2 * upload-pack.c: 4 11----------------19 * builtin/blame.c: 12-13 From 29c2bd5fa8cb97eedcd463d49cfc7e753feb3145 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:53:07 -0500 Subject: [PATCH 07/11] add oidset API This is similar to many of our uses of sha1-array, but it overcomes one limitation of a sha1-array: when you are de-duplicating a large input with relatively few unique entries, sha1-array uses 20 bytes per non-unique entry. Whereas this set will use memory linear in the number of unique entries (albeit a few more than 20 bytes due to hashmap overhead). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- Makefile | 1 + oidset.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ oidset.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 oidset.c create mode 100644 oidset.h diff --git a/Makefile b/Makefile index 8e4081e061..a5433978e3 100644 --- a/Makefile +++ b/Makefile @@ -781,6 +781,7 @@ LIB_OBJS += notes-cache.o LIB_OBJS += notes-merge.o LIB_OBJS += notes-utils.o LIB_OBJS += object.o +LIB_OBJS += oidset.o LIB_OBJS += pack-bitmap.o LIB_OBJS += pack-bitmap-write.o LIB_OBJS += pack-check.o diff --git a/oidset.c b/oidset.c new file mode 100644 index 0000000000..ac169f05d3 --- /dev/null +++ b/oidset.c @@ -0,0 +1,49 @@ +#include "cache.h" +#include "oidset.h" + +struct oidset_entry { + struct hashmap_entry hash; + struct object_id oid; +}; + +static int oidset_hashcmp(const void *va, const void *vb, + const void *vkey) +{ + const struct oidset_entry *a = va, *b = vb; + const struct object_id *key = vkey; + return oidcmp(&a->oid, key ? key : &b->oid); +} + +int oidset_contains(const struct oidset *set, const struct object_id *oid) +{ + struct hashmap_entry key; + + if (!set->map.cmpfn) + return 0; + + hashmap_entry_init(&key, sha1hash(oid->hash)); + return !!hashmap_get(&set->map, &key, oid); +} + +int oidset_insert(struct oidset *set, const struct object_id *oid) +{ + struct oidset_entry *entry; + + if (!set->map.cmpfn) + hashmap_init(&set->map, oidset_hashcmp, 0); + + if (oidset_contains(set, oid)) + return 1; + + entry = xmalloc(sizeof(*entry)); + hashmap_entry_init(&entry->hash, sha1hash(oid->hash)); + oidcpy(&entry->oid, oid); + + hashmap_add(&set->map, entry); + return 0; +} + +void oidset_clear(struct oidset *set) +{ + hashmap_free(&set->map, 1); +} diff --git a/oidset.h b/oidset.h new file mode 100644 index 0000000000..b7eaab5b88 --- /dev/null +++ b/oidset.h @@ -0,0 +1,45 @@ +#ifndef OIDSET_H +#define OIDSET_H + +/** + * This API is similar to sha1-array, in that it maintains a set of object ids + * in a memory-efficient way. The major differences are: + * + * 1. It uses a hash, so we can do online duplicate removal, rather than + * sort-and-uniq at the end. This can reduce memory footprint if you have + * a large list of oids with many duplicates. + * + * 2. The per-unique-oid memory footprint is slightly higher due to hash + * table overhead. + */ + +/** + * A single oidset; should be zero-initialized (or use OIDSET_INIT). + */ +struct oidset { + struct hashmap map; +}; + +#define OIDSET_INIT { { NULL } } + +/** + * Returns true iff `set` contains `oid`. + */ +int oidset_contains(const struct oidset *set, const struct object_id *oid); + +/** + * Insert the oid into the set; a copy is made, so "oid" does not need + * to persist after this function is called. + * + * Returns 1 if the oid was already in the set, 0 otherwise. This can be used + * to perform an efficient check-and-add. + */ +int oidset_insert(struct oidset *set, const struct object_id *oid); + +/** + * Remove all entries from the oidset, freeing any resources associated with + * it. + */ +void oidset_clear(struct oidset *set); + +#endif /* OIDSET_H */ From ab6eea6f7b9a5289d72c05476da19ab2bb457fd3 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:53:10 -0500 Subject: [PATCH 08/11] receive-pack: use oidset to de-duplicate .have lines If you have an alternate object store with a very large number of refs, the peak memory usage of the sha1_array can grow high, even if most of them are duplicates that end up not being printed at all. The similar for_each_alternate_ref() code-paths in fetch-pack solve this by using flags in "struct object" to de-duplicate (and so are relying on obj_hash at the core). But we don't have a "struct object" at all in this case. We could call lookup_unknown_object() to get one, but if our goal is reducing memory footprint, it's not great: - an unknown object is as large as the largest object type (a commit), which is bigger than an oidset entry - we can free the memory after our ref advertisement, but "struct object" entries persist forever (and the receive-pack may hang around for a long time, as the bottleneck is often client upload bandwidth). So let's use an oidset. Note that unlike a sha1-array it doesn't sort the output as a side effect. However, our output is at least stable, because for_each_alternate_ref() will give us the sha1s in ref-sorted order. In one particularly pathological case with an alternate that has 60,000 unique refs out of 80 million total, this reduced the peak heap usage of "git receive-pack . Signed-off-by: Junio C Hamano --- builtin/receive-pack.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index d21332d9e7..a4926fcfb4 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -21,6 +21,7 @@ #include "sigchain.h" #include "fsck.h" #include "tmp-objdir.h" +#include "oidset.h" static const char * const receive_pack_usage[] = { N_("git receive-pack "), @@ -271,27 +272,24 @@ static int show_ref_cb(const char *path_full, const struct object_id *oid, return 0; } -static int show_one_alternate_sha1(const unsigned char sha1[20], void *unused) +static void show_one_alternate_ref(const char *refname, + const struct object_id *oid, + void *data) { - show_ref(".have", sha1); - return 0; -} + struct oidset *seen = data; -static void collect_one_alternate_ref(const char *refname, - const struct object_id *oid, - void *data) -{ - struct sha1_array *sa = data; - sha1_array_append(sa, oid->hash); + if (oidset_insert(seen, oid)) + return; + + show_ref(".have", oid->hash); } static void write_head_info(void) { - struct sha1_array sa = SHA1_ARRAY_INIT; + static struct oidset seen = OIDSET_INIT; - for_each_alternate_ref(collect_one_alternate_ref, &sa); - sha1_array_for_each_unique(&sa, show_one_alternate_sha1, NULL); - sha1_array_clear(&sa); + for_each_alternate_ref(show_one_alternate_ref, &seen); + oidset_clear(&seen); for_each_ref(show_ref_cb, NULL); if (!sent_capabilities) show_ref("capabilities^{}", null_sha1); From fea6c47f2f106c3617eaf52f973a1a07629ebf46 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:53:13 -0500 Subject: [PATCH 09/11] receive-pack: fix misleading namespace/.have comment The comment claims that we handle alternate ".have" lines through this function, but that hasn't been the case since 85f251045 (write_head_info(): handle "extra refs" locally, 2012-01-06). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/receive-pack.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index a4926fcfb4..1821ad5fa6 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -261,10 +261,7 @@ static int show_ref_cb(const char *path_full, const struct object_id *oid, /* * Advertise refs outside our current namespace as ".have" * refs, so that the client can use them to minimize data - * transfer but will otherwise ignore them. This happens to - * cover ".have" that are thrown in by add_one_alternate_ref() - * to mark histories that are complete in our alternates as - * well. + * transfer but will otherwise ignore them. */ if (!path) path = ".have"; From 8b24b9e76534b6664d4ea1afa67db0fbb495c925 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:53:16 -0500 Subject: [PATCH 10/11] receive-pack: treat namespace .have lines like alternates Namely, de-duplicate them. We use the same set as the alternates, since we call them both ".have" (i.e., there is no value in showing one versus the other). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/receive-pack.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index 1821ad5fa6..c23b0cce86 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -251,8 +251,9 @@ static void show_ref(const char *path, const unsigned char *sha1) } static int show_ref_cb(const char *path_full, const struct object_id *oid, - int flag, void *unused) + int flag, void *data) { + struct oidset *seen = data; const char *path = strip_namespace(path_full); if (ref_is_hidden(path, path_full)) @@ -263,8 +264,11 @@ static int show_ref_cb(const char *path_full, const struct object_id *oid, * refs, so that the client can use them to minimize data * transfer but will otherwise ignore them. */ - if (!path) + if (!path) { + if (oidset_insert(seen, oid)) + return 0; path = ".have"; + } show_ref(path, oid->hash); return 0; } @@ -287,7 +291,7 @@ static void write_head_info(void) for_each_alternate_ref(show_one_alternate_ref, &seen); oidset_clear(&seen); - for_each_ref(show_ref_cb, NULL); + for_each_ref(show_ref_cb, &seen); if (!sent_capabilities) show_ref("capabilities^{}", null_sha1); From 63d428e656edcd670fa87e74136726096ff3de6f Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 8 Feb 2017 15:53:19 -0500 Subject: [PATCH 11/11] receive-pack: avoid duplicates between our refs and alternates We de-duplicate ".have" refs among themselves, but never check if they are duplicates of our local refs. It's not unreasonable that they would be if we are a "--shared" or "--reference" clone of a similar repository; we'd have all the same tags. We can handle this by inserting our local refs into the oidset, but obviously not suppressing duplicates (since the refnames are important). Note that this also switches the order in which we advertise refs, processing ours first and then any alternates. The order shouldn't matter (and arguably showing our refs first makes more sense). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/receive-pack.c | 4 +++- t/t5400-send-pack.sh | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index c23b0cce86..9ed8fbbfad 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -268,6 +268,8 @@ static int show_ref_cb(const char *path_full, const struct object_id *oid, if (oidset_insert(seen, oid)) return 0; path = ".have"; + } else { + oidset_insert(seen, oid); } show_ref(path, oid->hash); return 0; @@ -289,9 +291,9 @@ static void write_head_info(void) { static struct oidset seen = OIDSET_INIT; + for_each_ref(show_ref_cb, &seen); for_each_alternate_ref(show_one_alternate_ref, &seen); oidset_clear(&seen); - for_each_ref(show_ref_cb, &seen); if (!sent_capabilities) show_ref("capabilities^{}", null_sha1); diff --git a/t/t5400-send-pack.sh b/t/t5400-send-pack.sh index 305ca7a930..3331e0f534 100755 --- a/t/t5400-send-pack.sh +++ b/t/t5400-send-pack.sh @@ -255,4 +255,42 @@ test_expect_success 'deny pushing to delete current branch' ' ) ' +extract_ref_advertisement () { + perl -lne ' + # \\ is there to skip capabilities after \0 + /push< ([^\\]+)/ or next; + exit 0 if $1 eq "0000"; + print $1; + ' +} + +test_expect_success 'receive-pack de-dupes .have lines' ' + git init shared && + git -C shared commit --allow-empty -m both && + git clone -s shared fork && + ( + cd shared && + git checkout -b only-shared && + git commit --allow-empty -m only-shared && + git update-ref refs/heads/foo HEAD + ) && + + # Notable things in this expectation: + # - local refs are not de-duped + # - .have does not duplicate locals + # - .have does not duplicate itself + local=$(git -C fork rev-parse HEAD) && + shared=$(git -C shared rev-parse only-shared) && + cat >expect <<-EOF && + $local refs/heads/master + $local refs/remotes/origin/HEAD + $local refs/remotes/origin/master + $shared .have + EOF + + GIT_TRACE_PACKET=$(pwd)/trace git push fork HEAD:foo && + extract_ref_advertisement refs && + test_cmp expect refs +' + test_done