From ca5381d43e8595f592d8d7ecfc9bb0bfa5e52f6d Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Feb 2006 11:55:51 -0800 Subject: [PATCH] pack-objects: finishing touches. This introduces --no-reuse-delta option to disable reusing of existing delta, which is a large part of the optimization introduced by this series. This may become necessary if repeated repacking makes delta chain too long. With this, the output of the command becomes identical to that of the older implementation. But the performance suffers greatly. It still allows reusing non-deltified representations; there is no point uncompressing and recompressing the whole text. It also adds a couple more statistics output, while squelching it under -q flag, which the last round forgot to do. $ time old-git-pack-objects --stdout >/dev/null /dev/null /dev/null --- Documentation/git-pack-objects.txt | 21 +++++- pack-objects.c | 102 ++++++++++++++++++++--------- 2 files changed, 91 insertions(+), 32 deletions(-) diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 2d67d3911e..4cb2e83faa 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -8,7 +8,10 @@ git-pack-objects - Create a packed archive of objects. SYNOPSIS -------- -'git-pack-objects' [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list +[verse] +'git-pack-objects' [-q] [--no-reuse-delta] [--non-empty] + [--local] [--incremental] [--window=N] [--depth=N] + {--stdout | base-name} < object-list DESCRIPTION @@ -32,6 +35,10 @@ Placing both in the pack/ subdirectory of $GIT_OBJECT_DIRECTORY (or any of the directories on $GIT_ALTERNATE_OBJECT_DIRECTORIES) enables git to read from such an archive. +In a packed archive, an object is either stored as a compressed +whole, or as a difference from some other object. The latter is +often called a delta. + OPTIONS ------- @@ -74,6 +81,18 @@ base-name:: Only create a packed archive if it would contain at least one object. +-q:: + This flag makes the command not to report its progress + on the standard error stream. + +--no-reuse-delta:: + When creating a packed archive in a repository that + has existing packs, the command reuses existing deltas. + This sometimes results in a slightly suboptimal pack. + This flag tells the command not to reuse existing deltas + but compute them from scratch. + + Author ------ Written by Linus Torvalds diff --git a/pack-objects.c b/pack-objects.c index 70fb2afeb8..38e1c9921b 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -5,7 +5,7 @@ #include "csum-file.h" #include -static const char pack_usage[] = "git-pack-objects [-q] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list"; +static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list"; struct object_entry { unsigned char sha1[20]; @@ -14,10 +14,11 @@ struct object_entry { unsigned int depth; /* delta depth */ unsigned int hash; /* name hint hash */ enum object_type type; + unsigned char edge; /* reused delta chain points at this entry. */ + enum object_type in_pack_type; /* could be delta */ unsigned long delta_size; /* delta data size (uncompressed) */ struct object_entry *delta; /* delta base object */ struct packed_git *in_pack; /* already in pack */ - enum object_type in_pack_type; /* could be delta */ unsigned int in_pack_offset; }; @@ -36,6 +37,7 @@ struct object_entry { static unsigned char object_list_sha1[20]; static int non_empty = 0; +static int no_reuse_delta = 0; static int local = 0; static int incremental = 0; static struct object_entry **sorted_by_sha, **sorted_by_type; @@ -75,7 +77,9 @@ static int pack_revindex_hashsz = 0; * stats */ static int written = 0; +static int written_delta = 0; static int reused = 0; +static int reused_delta = 0; static int pack_revindex_ix(struct packed_git *p) { @@ -227,10 +231,23 @@ static unsigned long write_object(struct sha1file *f, struct object_entry *entry unsigned char header[10]; unsigned hdrlen, datalen; enum object_type obj_type; + int to_reuse = 0; obj_type = entry->type; - if (!entry->in_pack || - (obj_type != entry->in_pack_type)) { + if (! entry->in_pack) + to_reuse = 0; /* can't reuse what we don't have */ + else if (obj_type == OBJ_DELTA) + to_reuse = 1; /* check_object() decided it for us */ + else if (obj_type != entry->in_pack_type) + to_reuse = 0; /* pack has delta which is unusable */ + else if (entry->delta) + to_reuse = 0; /* we want to pack afresh */ + else + to_reuse = 1; /* we have it in-pack undeltified, + * and we do not need to deltify it. + */ + + if (! to_reuse) { buf = read_sha1_file(entry->sha1, type, &size); if (!buf) die("unable to read %s", sha1_to_hex(entry->sha1)); @@ -266,8 +283,12 @@ static unsigned long write_object(struct sha1file *f, struct object_entry *entry sha1write(f, buf, datalen); unuse_packed_git(p); hdrlen = 0; /* not really */ + if (obj_type == OBJ_DELTA) + reused_delta++; reused++; } + if (obj_type == OBJ_DELTA) + written_delta++; written++; return hdrlen + datalen; } @@ -294,7 +315,6 @@ static void write_pack_file(void) int i; struct sha1file *f; unsigned long offset; - unsigned long mb; struct pack_header hdr; if (!base_name) @@ -357,10 +377,9 @@ static int add_object_entry(unsigned char *sha1, unsigned int hash) unsigned int idx = nr_objects; struct object_entry *entry; struct packed_git *p; - unsigned int found_offset; - struct packed_git *found_pack; + unsigned int found_offset = 0; + struct packed_git *found_pack = NULL; - found_pack = NULL; for (p = packed_git; p; p = p->next) { struct pack_entry e; if (find_pack_entry_one(sha1, &e, p)) { @@ -420,32 +439,39 @@ static void check_object(struct object_entry *entry) char type[20]; if (entry->in_pack) { + unsigned char base[20]; + unsigned long size; + struct object_entry *base_entry; + + /* We want in_pack_type even if we do not reuse delta. + * There is no point not reusing non-delta representations. + */ + check_reuse_pack_delta(entry->in_pack, + entry->in_pack_offset, + base, &size, + &entry->in_pack_type); + /* Check if it is delta, and the base is also an object * we are going to pack. If so we will reuse the existing * delta. */ - unsigned char base[20]; - unsigned long size; - struct object_entry *base_entry; - if (!check_reuse_pack_delta(entry->in_pack, - entry->in_pack_offset, - base, &size, - &entry->in_pack_type) && + if (!no_reuse_delta && + entry->in_pack_type == OBJ_DELTA && (base_entry = locate_object_entry(base))) { - /* We do not know depth at this point, but it - * does not matter. Getting delta_chain_length - * with packed_object_info_detail() is not so - * expensive, so we could do that later if we - * wanted to. Calling sha1_object_info to get - * the true size (and later an uncompressed - * representation) of deeply deltified object - * is quite expensive. + + /* Depth value does not matter - find_deltas() + * will never consider reused delta as the + * base object to deltify other objects + * against, in order to avoid circular deltas. */ - entry->depth = 1; - /* uncompressed size */ + + /* uncompressed size of the delta data */ entry->size = entry->delta_size = size; entry->delta = base_entry; entry->type = OBJ_DELTA; + + base_entry->edge = 1; + return; } /* Otherwise we would do the usual */ @@ -568,6 +594,13 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de if (cur_entry->type != old_entry->type) return -1; + /* If the current object is at edge, take the depth the objects + * that depend on the current object into account -- otherwise + * they would become too deep. + */ + if (cur_entry->edge) + max_depth /= 4; + size = cur_entry->size; if (size < 50) return -1; @@ -627,7 +660,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (entry->delta) /* This happens if we decided to reuse existing - * delta from a pack. + * delta from a pack. "!no_reuse_delta &&" is implied. */ continue; @@ -636,6 +669,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) n->data = read_sha1_file(entry->sha1, type, &size); if (size != entry->size) die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size); + j = window; while (--j > 0) { unsigned int other_idx = idx + j; @@ -664,7 +698,7 @@ static void prepare_pack(int window, int depth) fprintf(stderr, "Packing %d objects", nr_objects); get_object_details(); if (progress) - fprintf(stderr, "."); + fputc('.', stderr); sorted_by_type = create_sorted_list(type_size_sort); if (window && depth) @@ -694,8 +728,9 @@ static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout) } } - fprintf(stderr, "Reusing %d objects pack %s\n", nr_objects, - sha1_to_hex(sha1)); + if (progress) + fprintf(stderr, "Reusing %d objects pack %s\n", nr_objects, + sha1_to_hex(sha1)); if (pack_to_stdout) { if (copy_fd(ifd, 1)) @@ -775,6 +810,10 @@ int main(int argc, char **argv) progress = 0; continue; } + if (!strcmp("--no-reuse-delta", arg)) { + no_reuse_delta = 1; + continue; + } if (!strcmp("--stdout", arg)) { pack_to_stdout = 1; continue; @@ -850,7 +889,8 @@ int main(int argc, char **argv) puts(sha1_to_hex(object_list_sha1)); } } - fprintf(stderr, "Total %d, written %d, reused %d\n", - nr_objects, written, reused); + if (progress) + fprintf(stderr, "Total %d, written %d (delta %d), reused %d (delta %d)\n", + nr_objects, written, written_delta, reused, reused_delta); return 0; }