Browse Source

packfile.c: speed up loading lots of packfiles

When loading packfiles on start-up, we traverse the internal packfile
list once per file to avoid reloading packfiles that have already
been loaded. This check runs in quadratic time, so for poorly
maintained repos with a large number of packfiles, it can be pretty
slow.

Add a hashmap containing the packfile names as we load them so that
the average runtime cost of checking for already-loaded packs becomes
constant.

Add a perf test to p5303 to show speed-up.

The existing p5303 test runtimes are dominated by other factors and do
not show an appreciable speed-up. The new test in p5303 clearly exposes
a speed-up in bad cases. In this test we create 10,000 packfiles and
measure the start-up time of git rev-parse, which does little else
besides load in the packs.

Here are the numbers for the new p5303 test:

Test                         HEAD^             HEAD
---------------------------------------------------------------------
5303.12: load 10,000 packs   1.03(0.92+0.10)   0.12(0.02+0.09) -88.3%

Signed-off-by: Colin Stolley <cstolley@runbox.com>
Helped-by: Jeff King <peff@peff.net>
[jc: squashed the change to call hashmap in install_packed_git() by peff]
Signed-off-by: Junio C Hamano <gitster@pobox.com>
maint
Colin Stolley 5 years ago committed by Junio C Hamano
parent
commit
ec48540fe8
  1. 21
      object-store.h
  2. 3
      object.c
  3. 19
      packfile.c
  4. 18
      t/perf/p5303-many-packs.sh

21
object-store.h

@@ -60,6 +60,7 @@ struct oid_array *odb_loose_cache(struct object_directory *odb,
void odb_clear_loose_cache(struct object_directory *odb);

struct packed_git {
struct hashmap_entry packmap_ent;
struct packed_git *next;
struct list_head mru;
struct pack_window *windows;
@@ -88,6 +89,20 @@ struct packed_git {

struct multi_pack_index;

/*
 * Comparison callback for the pack_map hashmap.
 *
 * Compares the pack_name of the packed_git that embeds `entry` against
 * a second name: `keydata` when it is non-NULL, otherwise the pack_name
 * of the packed_git that embeds `entry2`.
 *
 * Returns the strcmp() result, i.e. 0 when the two names are identical.
 * The first argument (comparison data) is not used.
 */
static inline int pack_map_entry_cmp(const void *unused_cmp_data,
				     const struct hashmap_entry *entry,
				     const struct hashmap_entry *entry2,
				     const void *keydata)
{
	const struct packed_git *stored =
		container_of(entry, const struct packed_git, packmap_ent);
	const char *other_name = keydata;

	if (!other_name) {
		/* No explicit key: take the name from the second entry. */
		const struct packed_git *other =
			container_of(entry2, const struct packed_git,
				     packmap_ent);
		other_name = other->pack_name;
	}

	return strcmp(stored->pack_name, other_name);
}

struct raw_object_store {
/*
* Set of all object directories; the main directory is first (and
@@ -131,6 +146,12 @@ struct raw_object_store {
/* A most-recently-used ordered version of the packed_git list. */
struct list_head packed_git_mru;

/*
* A map of packfiles to packed_git structs for tracking which
* packs have been loaded already.
*/
struct hashmap pack_map;

/*
* A fast, rough count of the number of objects in the repository.
* These two fields are not meant for direct access. Use

3
object.c

@@ -479,6 +479,7 @@ struct raw_object_store *raw_object_store_new(void)

memset(o, 0, sizeof(*o));
INIT_LIST_HEAD(&o->packed_git_mru);
hashmap_init(&o->pack_map, pack_map_entry_cmp, NULL, 0);
return o;
}

@@ -518,6 +519,8 @@ void raw_object_store_clear(struct raw_object_store *o)
INIT_LIST_HEAD(&o->packed_git_mru);
close_object_store(o);
o->packed_git = NULL;

hashmap_free(&o->pack_map);
}

void parsed_object_pool_clear(struct parsed_object_pool *o)

19
packfile.c

@@ -757,6 +757,9 @@ void install_packed_git(struct repository *r, struct packed_git *pack)

pack->next = r->objects->packed_git;
r->objects->packed_git = pack;

hashmap_entry_init(&pack->packmap_ent, strhash(pack->pack_name));
hashmap_add(&r->objects->pack_map, &pack->packmap_ent);
}

void (*report_garbage)(unsigned seen_bits, const char *path);
@@ -856,20 +859,18 @@ static void prepare_pack(const char *full_name, size_t full_name_len,

if (strip_suffix_mem(full_name, &base_len, ".idx") &&
!(data->m && midx_contains_pack(data->m, file_name))) {
/* Don't reopen a pack we already have. */
for (p = data->r->objects->packed_git; p; p = p->next) {
size_t len;
if (strip_suffix(p->pack_name, ".pack", &len) &&
len == base_len &&
!memcmp(p->pack_name, full_name, len))
break;
}
struct hashmap_entry hent;
char *pack_name = xstrfmt("%.*s.pack", (int)base_len, full_name);
unsigned int hash = strhash(pack_name);
hashmap_entry_init(&hent, hash);

if (!p) {
/* Don't reopen a pack we already have. */
if (!hashmap_get(&data->r->objects->pack_map, &hent, pack_name)) {
p = add_packed_git(full_name, full_name_len, data->local);
if (p)
install_packed_git(data->r, p);
}
free(pack_name);
}

if (!report_garbage)

18
t/perf/p5303-many-packs.sh

@@ -84,4 +84,22 @@ do
'
done

# Measure pack loading with 10,000 packs.
#
# Setup: feed fast-import a stream of 10,000 one-line blobs, each
# followed by a "checkpoint" command, with fastimport.unpackLimit=0.
test_expect_success 'generate lots of packs' '
	for i in $(test_seq 10000)
	do
		printf "blob\ndata <<EOF\nblob %s\nEOF\ncheckpoint\n" "$i"
	done |
	git -c fastimport.unpackLimit=0 fast-import
'

# The purpose of this test is to evaluate load time for a large number
# of packs while doing as little other work as possible.
#
# rev-parse only has to verify that HEAD names a commit, so the timing
# is intended to reflect pack loading rather than object traversal.
# NOTE(review): the blob-only import above creates no commit, so HEAD
# presumably comes from setup earlier in this script -- confirm.
test_perf "load 10,000 packs" '
git rev-parse --verify "HEAD^{commit}"
'

test_done

Loading…
Cancel
Save