From 57059091fad25427bce9b3d47e073ce0518d164b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:28 -0400 Subject: [PATCH 01/27] get rid of num_packed_objects() The coming index format change doesn't allow for the number of objects to be determined from the size of the index file directly. Instead, Let's initialize a field in the packed_git structure with the object count when the index is validated since the count is always known at that point. While at it let's reorder some struct packed_git fields to avoid padding due to needed 64-bit alignment for some of them. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-count-objects.c | 2 +- builtin-fsck.c | 2 +- builtin-pack-objects.c | 4 ++-- cache.h | 8 ++++---- pack-check.c | 4 ++-- sha1_file.c | 17 ++++++----------- sha1_name.c | 2 +- 7 files changed, 17 insertions(+), 22 deletions(-) diff --git a/builtin-count-objects.c b/builtin-count-objects.c index 6263d8af29..ff90ebd465 100644 --- a/builtin-count-objects.c +++ b/builtin-count-objects.c @@ -111,7 +111,7 @@ int cmd_count_objects(int ac, const char **av, const char *prefix) for (p = packed_git; p; p = p->next) { if (!p->pack_local) continue; - packed += num_packed_objects(p); + packed += p->num_objects; num_pack++; } printf("count: %lu\n", loose); diff --git a/builtin-fsck.c b/builtin-fsck.c index 4d8b66c344..44a02d3120 100644 --- a/builtin-fsck.c +++ b/builtin-fsck.c @@ -653,7 +653,7 @@ int cmd_fsck(int argc, char **argv, const char *prefix) verify_pack(p, 0); for (p = packed_git; p; p = p->next) { - uint32_t i, num = num_packed_objects(p); + uint32_t i, num = p->num_objects; for (i = 0; i < num; i++) fsck_sha1(nth_packed_object_sha1(p, i)); } diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 45ac3e482a..6bff17b130 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -164,7 +164,7 @@ static int cmp_offset(const void *a_, const void *b_) static void prepare_pack_revindex(struct pack_revindex *rix) { struct packed_git *p = rix->p; - int num_ent = num_packed_objects(p); + int num_ent = p->num_objects; int i; const char *index = p->index_data; @@ -198,7 +198,7 @@ static struct revindex_entry * find_packed_object(struct packed_git *p, prepare_pack_revindex(rix); revindex = rix->revindex; lo = 0; - hi = num_packed_objects(p) + 1; + hi = p->num_objects + 1; do { int mi = (lo + hi) / 2; if (revindex[mi].offset == ofs) { diff --git a/cache.h b/cache.h index eb57507b80..5b67f4c989 100644 --- a/cache.h +++ b/cache.h @@ -376,11 +376,12 @@ struct pack_window { extern struct packed_git { struct packed_git *next; struct pack_window *windows; - const void *index_data; - off_t index_size; off_t pack_size; - time_t mtime; + const void *index_data; + size_t index_size; + uint32_t num_objects; int index_version; + time_t mtime; int pack_fd; int pack_local; unsigned char sha1[20]; @@ -431,7 +432,6 @@ extern void pack_report(void); extern unsigned char* use_pack(struct packed_git *, struct pack_window **, off_t, unsigned int *); extern void unuse_pack(struct pack_window **); extern struct packed_git *add_packed_git(const char *, int, int); -extern uint32_t num_packed_objects(const struct packed_git *p); extern const unsigned char *nth_packed_object_sha1(const struct packed_git *, uint32_t); extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *); extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *); diff --git a/pack-check.c b/pack-check.c index f58083d11e..d04536bbff 100644 --- a/pack-check.c +++ b/pack-check.c @@ -40,7 +40,7 @@ static int verify_packfile(struct packed_git *p, * have verified that nr_objects matches between idx and pack, * we do not do scan-streaming check on the pack file. */ - nr_objects = num_packed_objects(p); + nr_objects = p->num_objects; for (i = 0, err = 0; i < nr_objects; i++) { const unsigned char *sha1; void *data; @@ -79,7 +79,7 @@ static void show_pack_info(struct packed_git *p) { uint32_t nr_objects, i, chain_histogram[MAX_CHAIN]; - nr_objects = num_packed_objects(p); + nr_objects = p->num_objects; memset(chain_histogram, 0, sizeof(chain_histogram)); for (i = 0; i < nr_objects; i++) { diff --git a/sha1_file.c b/sha1_file.c index 4304fe9bbc..d9ca69a916 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -494,6 +494,7 @@ static int check_packed_git_idx(const char *path, struct packed_git *p) p->index_version = 1; p->index_data = idx_map; p->index_size = idx_size; + p->num_objects = nr; return 0; } @@ -605,11 +606,11 @@ static int open_packed_git_1(struct packed_git *p) p->pack_name, ntohl(hdr.hdr_version)); /* Verify the pack matches its index. */ - if (num_packed_objects(p) != ntohl(hdr.hdr_entries)) + if (p->num_objects != ntohl(hdr.hdr_entries)) return error("packfile %s claims to have %u objects" - " while index size indicates %u objects", - p->pack_name, ntohl(hdr.hdr_entries), - num_packed_objects(p)); + " while index indicates %u objects", + p->pack_name, ntohl(hdr.hdr_entries), + p->num_objects); if (lseek(p->pack_fd, p->pack_size - sizeof(sha1), SEEK_SET) == -1) return error("end of packfile %s is unavailable", p->pack_name); if (read_in_full(p->pack_fd, sha1, sizeof(sha1)) != sizeof(sha1)) @@ -1526,18 +1527,12 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset, return data; } -uint32_t num_packed_objects(const struct packed_git *p) -{ - /* See check_packed_git_idx() */ - return (uint32_t)((p->index_size - 20 - 20 - 4*256) / 24); -} - const unsigned char *nth_packed_object_sha1(const struct packed_git *p, uint32_t n) { const unsigned char *index = p->index_data; index += 4 * 256; - if (num_packed_objects(p) <= n) + if (n >= p->num_objects) return NULL; return index + 24 * n + 4; } diff --git a/sha1_name.c b/sha1_name.c index 267ea3f3ed..b0b12bbe9d 100644 --- a/sha1_name.c +++ b/sha1_name.c @@ -76,7 +76,7 @@ static int find_short_packed_object(int len, const unsigned char *match, unsigne prepare_packed_git(); for (p = packed_git; p && found < 2; p = p->next) { - uint32_t num = num_packed_objects(p); + uint32_t num = p->num_objects; uint32_t first = 0, last = num; while (first < last) { uint32_t mid = (first + last) / 2; From 8723f216263ba4a0f06be7b93fada863c0931e09 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:29 -0400 Subject: [PATCH 02/27] make overflow test on delta base offset work regardless of variable size This patch introduces the MSB() macro to obtain the desired number of most significant bits from a given variable independently of the variable type. It is then used to better implement the overflow test on the OBJ_OFS_DELTA base offset variable with the property of always working correctly regardless of the type/size of that variable. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 2 +- builtin-unpack-objects.c | 2 +- git-compat-util.h | 8 ++++++++ index-pack.c | 2 +- sha1_file.c | 2 +- 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 6bff17b130..ee607a0d2c 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -1014,7 +1014,7 @@ static void check_object(struct object_entry *entry) ofs = c & 127; while (c & 128) { ofs += 1; - if (!ofs || ofs & ~(~0UL >> 7)) + if (!ofs || MSB(ofs, 7)) die("delta base offset overflow in pack for %s", sha1_to_hex(entry->sha1)); c = buf[used_0++]; diff --git a/builtin-unpack-objects.c b/builtin-unpack-objects.c index 3956c56334..63f7db6831 100644 --- a/builtin-unpack-objects.c +++ b/builtin-unpack-objects.c @@ -209,7 +209,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size, base_offset = c & 127; while (c & 128) { base_offset += 1; - if (!base_offset || base_offset & ~(~0UL >> 7)) + if (!base_offset || MSB(base_offset, 7)) die("offset value overflow for delta base object"); pack = fill(1); c = *pack; diff --git a/git-compat-util.h b/git-compat-util.h index 139fc19108..bcfcb35ecf 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -13,6 +13,14 @@ #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) +#ifdef __GNUC__ +#define TYPEOF(x) (__typeof__(x)) +#else +#define TYPEOF(x) +#endif + +#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits)))) + #if !defined(__APPLE__) && !defined(__FreeBSD__) #define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */ #define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */ diff --git a/index-pack.c b/index-pack.c index 3c768fbc63..0e54aa6844 100644 --- a/index-pack.c +++ b/index-pack.c @@ -249,7 +249,7 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_ base_offset = c & 127; while (c & 128) { base_offset += 1; - if (!base_offset || base_offset & ~(~0UL >> 7)) + if (!base_offset || MSB(base_offset, 7)) bad_object(obj->offset, "offset value overflow for delta base object"); p = fill(1); c = *p; diff --git a/sha1_file.c b/sha1_file.c index d9ca69a916..ebdd497bad 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1150,7 +1150,7 @@ static off_t get_delta_base(struct packed_git *p, base_offset = c & 127; while (c & 128) { base_offset += 1; - if (!base_offset || base_offset & ~(~0UL >> 7)) + if (!base_offset || MSB(base_offset, 7)) die("offset value overflow for delta base object"); c = base_info[used++]; base_offset = (base_offset << 7) + (c & 127); From d7dd02231f75604e388afb905f7bf8afd1bf4b24 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:30 -0400 Subject: [PATCH 03/27] add overflow tests on pack offset variables Change a few size and offset variables to more appropriate type, then add overflow tests on those offsets. This prevents any bad data to be generated/processed if off_t happens to not be large enough to handle some big packs. Better be safe than sorry. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 19 +++++++++++++------ builtin-unpack-objects.c | 17 +++++++++++------ index-pack.c | 14 ++++++++++---- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index ee607a0d2c..d0be879443 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -369,7 +369,7 @@ static int revalidate_loose_object(struct object_entry *entry, return check_loose_inflate(map, mapsize, size); } -static off_t write_object(struct sha1file *f, +static unsigned long write_object(struct sha1file *f, struct object_entry *entry) { unsigned long size; @@ -503,16 +503,23 @@ static off_t write_one(struct sha1file *f, struct object_entry *e, off_t offset) { + unsigned long size; + + /* offset is non zero if object is written already. */ if (e->offset || e->preferred_base) - /* offset starts from header size and cannot be zero - * if it is written already. - */ return offset; - /* if we are deltified, write out its base object first. */ + + /* if we are deltified, write out base object first. */ if (e->delta) offset = write_one(f, e->delta, offset); + e->offset = offset; - return offset + write_object(f, e); + size = write_object(f, e); + + /* make sure off_t is sufficiently large not to wrap */ + if (offset > offset + size) + die("pack too large for current definition of off_t"); + return offset + size; } static void write_pack_file(void) diff --git a/builtin-unpack-objects.c b/builtin-unpack-objects.c index 63f7db6831..f821906460 100644 --- a/builtin-unpack-objects.c +++ b/builtin-unpack-objects.c @@ -13,7 +13,8 @@ static const char unpack_usage[] = "git-unpack-objects [-n] [-q] [-r] < pack-fil /* We always read in 4kB chunks. */ static unsigned char buffer[4096]; -static unsigned long offset, len, consumed_bytes; +static unsigned int offset, len; +static off_t consumed_bytes; static SHA_CTX ctx; /* @@ -49,6 +50,10 @@ static void use(int bytes) die("used more bytes than were available"); len -= bytes; offset += bytes; + + /* make sure off_t is sufficiently large not to wrap */ + if (consumed_bytes > consumed_bytes + bytes) + die("pack too large for current definition of off_t"); consumed_bytes += bytes; } @@ -88,17 +93,17 @@ static void *get_data(unsigned long size) struct delta_info { unsigned char base_sha1[20]; - unsigned long base_offset; + unsigned nr; + off_t base_offset; unsigned long size; void *delta; - unsigned nr; struct delta_info *next; }; static struct delta_info *delta_list; static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1, - unsigned long base_offset, + off_t base_offset, void *delta, unsigned long size) { struct delta_info *info = xmalloc(sizeof(*info)); @@ -113,7 +118,7 @@ static void add_delta_to_list(unsigned nr, unsigned const char *base_sha1, } struct obj_info { - unsigned long offset; + off_t offset; unsigned char sha1[20]; }; @@ -200,7 +205,7 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size, } else { unsigned base_found = 0; unsigned char *pack, c; - unsigned long base_offset; + off_t base_offset; unsigned lo, mid, hi; pack = fill(1); diff --git a/index-pack.c b/index-pack.c index 0e54aa6844..66fb0bcedf 100644 --- a/index-pack.c +++ b/index-pack.c @@ -12,7 +12,7 @@ static const char index_pack_usage[] = struct object_entry { - unsigned long offset; + off_t offset; unsigned long size; unsigned int hdr_size; enum object_type type; @@ -22,7 +22,7 @@ struct object_entry union delta_base { unsigned char sha1[20]; - unsigned long offset; + off_t offset; }; /* @@ -83,7 +83,8 @@ static unsigned display_progress(unsigned n, unsigned total, unsigned last_pc) /* We always read in 4kB chunks. */ static unsigned char input_buffer[4096]; -static unsigned long input_offset, input_len, consumed_bytes; +static unsigned int input_offset, input_len; +static off_t consumed_bytes; static SHA_CTX input_ctx; static int input_fd, output_fd, pack_fd; @@ -129,6 +130,10 @@ static void use(int bytes) die("used more bytes than were available"); input_len -= bytes; input_offset += bytes; + + /* make sure off_t is sufficiently large not to wrap */ + if (consumed_bytes > consumed_bytes + bytes) + die("pack too large for current definition of off_t"); consumed_bytes += bytes; } @@ -216,7 +221,8 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size) static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base) { unsigned char *p, c; - unsigned long size, base_offset; + unsigned long size; + off_t base_offset; unsigned shift; obj->offset = consumed_bytes; From 78d1e84fe5586ed45740b915b52e6856bb3f1c44 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:31 -0400 Subject: [PATCH 04/27] compute a CRC32 for each object as stored in a pack The most important optimization for performance when repacking is the ability to reuse data from a previous pack as is and bypass any delta or even SHA1 computation by simply copying the raw data from one pack to another directly. The problem with this is that any data corruption within a copied object would go unnoticed and the new (repacked) pack would be self-consistent with its own checksum despite containing a corrupted object. This is a real issue that already happened at least once in the past. In some attempt to prevent this, we validate the copied data by inflating it and making sure no error is signaled by zlib. But this is still not perfect as a significant portion of a pack content is made of object headers and references to delta base objects which are not deflated and therefore not validated when repacking actually making the pack data reuse still not as safe as it could be. Of course a full SHA1 validation could be performed, but that implies full data inflating and delta replaying which is extremely costly, which cost the data reuse optimization was designed to avoid in the first place. So the best solution to this is simply to store a CRC32 of the raw pack data for each object in the pack index. This way any object in a pack can be validated before being copied as is in another pack, including header and any other non deflated data. Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia: Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very short messages. He wrote "Briefly, the problem is that, for very short packets, Adler32 is guaranteed to give poor coverage of the available bits. Don't take my word for it, ask Mark Adler. :-)" The problem is that sum A does not wrap for short messages. The maximum value of A for a 128-byte message is 32640, which is below the value 65521 used by the modulo operation. An extended explanation can be found in RFC 3309, which mandates the use of CRC32 instead of Adler-32 for SCTP, the Stream Control Transmission Protocol. In the context of a GIT pack, we have lots of small objects, especially deltas, which are likely to be quite small and in a size range for which Adler32 is dimed not to be sufficient. Another advantage of CRC32 is the possibility for recovery from certain types of small corruptions like single bit errors which are the most probable type of corruptions. OK what this patch does is to compute the CRC32 of each object written to a pack within pack-objects. It is not written to the index yet and it is obviously not validated when reusing pack data yet either. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 6 ++++++ csum-file.c | 14 ++++++++++++++ csum-file.h | 4 ++++ 3 files changed, 24 insertions(+) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index d0be879443..03e36f0183 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -44,6 +44,7 @@ struct object_entry { * be used as the base objectto delta huge * objects against. */ + uint32_t crc32; /* crc of raw pack data for this object */ }; /* @@ -381,6 +382,9 @@ static unsigned long write_object(struct sha1file *f, enum object_type obj_type; int to_reuse = 0; + if (!pack_to_stdout) + crc32_begin(f); + obj_type = entry->type; if (! entry->in_pack) to_reuse = 0; /* can't reuse what we don't have */ @@ -496,6 +500,8 @@ static unsigned long write_object(struct sha1file *f, if (entry->delta) written_delta++; written++; + if (!pack_to_stdout) + entry->crc32 = crc32_end(f); return hdrlen + datalen; } diff --git a/csum-file.c b/csum-file.c index b7174c6c05..7c806ada48 100644 --- a/csum-file.c +++ b/csum-file.c @@ -49,6 +49,8 @@ int sha1close(struct sha1file *f, unsigned char *result, int update) int sha1write(struct sha1file *f, void *buf, unsigned int count) { + if (f->do_crc) + f->crc32 = crc32(f->crc32, buf, count); while (count) { unsigned offset = f->offset; unsigned left = sizeof(f->buffer) - offset; @@ -91,6 +93,7 @@ struct sha1file *sha1create(const char *fmt, ...) f->fd = fd; f->error = 0; f->offset = 0; + f->do_crc = 0; SHA1_Init(&f->ctx); return f; } @@ -111,6 +114,7 @@ struct sha1file *sha1fd(int fd, const char *name) f->fd = fd; f->error = 0; f->offset = 0; + f->do_crc = 0; SHA1_Init(&f->ctx); return f; } @@ -143,4 +147,14 @@ int sha1write_compressed(struct sha1file *f, void *in, unsigned int size) return size; } +void crc32_begin(struct sha1file *f) +{ + f->crc32 = crc32(0, Z_NULL, 0); + f->do_crc = 1; +} +uint32_t crc32_end(struct sha1file *f) +{ + f->do_crc = 0; + return f->crc32; +} diff --git a/csum-file.h b/csum-file.h index 3ad1a992a7..7e1339189d 100644 --- a/csum-file.h +++ b/csum-file.h @@ -7,6 +7,8 @@ struct sha1file { unsigned int offset, namelen; SHA_CTX ctx; char name[PATH_MAX]; + int do_crc; + uint32_t crc32; unsigned char buffer[8192]; }; @@ -15,5 +17,7 @@ extern struct sha1file *sha1create(const char *fmt, ...) __attribute__((format ( extern int sha1close(struct sha1file *, unsigned char *, int); extern int sha1write(struct sha1file *, void *, unsigned int); extern int sha1write_compressed(struct sha1file *, void *, unsigned int); +extern void crc32_begin(struct sha1file *); +extern uint32_t crc32_end(struct sha1file *); #endif From ee5743ce191d4f39f976e76fe4c12bf8fc67a590 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:32 -0400 Subject: [PATCH 05/27] compute object CRC32 with index-pack Same as previous patch but for index-pack. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- index-pack.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/index-pack.c b/index-pack.c index 66fb0bcedf..d33f723365 100644 --- a/index-pack.c +++ b/index-pack.c @@ -15,6 +15,7 @@ struct object_entry off_t offset; unsigned long size; unsigned int hdr_size; + uint32_t crc32; enum object_type type; enum object_type real_type; unsigned char sha1[20]; @@ -86,6 +87,7 @@ static unsigned char input_buffer[4096]; static unsigned int input_offset, input_len; static off_t consumed_bytes; static SHA_CTX input_ctx; +static uint32_t input_crc32; static int input_fd, output_fd, pack_fd; /* Discard current buffer used content. */ @@ -128,6 +130,7 @@ static void use(int bytes) { if (bytes > input_len) die("used more bytes than were available"); + input_crc32 = crc32(input_crc32, input_buffer + input_offset, bytes); input_len -= bytes; input_offset += bytes; @@ -224,8 +227,10 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_ unsigned long size; off_t base_offset; unsigned shift; + void *data; obj->offset = consumed_bytes; + input_crc32 = crc32(0, Z_NULL, 0); p = fill(1); c = *p; @@ -276,7 +281,9 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_ } obj->hdr_size = consumed_bytes - obj->offset; - return unpack_entry_data(obj->offset, obj->size); + data = unpack_entry_data(obj->offset, obj->size); + obj->crc32 = input_crc32; + return data; } static void *get_data_from_pack(struct object_entry *obj) @@ -521,7 +528,7 @@ static void parse_pack_objects(unsigned char *sha1) fputc('\n', stderr); } -static int write_compressed(int fd, void *in, unsigned int size) +static int write_compressed(int fd, void *in, unsigned int size, uint32_t *obj_crc) { z_stream stream; unsigned long maxsize; @@ -542,6 +549,7 @@ static int write_compressed(int fd, void *in, unsigned int size) size = stream.total_out; write_or_die(fd, out, size); + *obj_crc = crc32(*obj_crc, out, size); free(out); return size; } @@ -562,8 +570,10 @@ static void append_obj_to_pack(const unsigned char *sha1, void *buf, } header[n++] = c; write_or_die(output_fd, header, n); + obj[0].crc32 = crc32(0, Z_NULL, 0); + obj[0].crc32 = crc32(obj[0].crc32, header, n); obj[1].offset = obj[0].offset + n; - obj[1].offset += write_compressed(output_fd, buf, size); + obj[1].offset += write_compressed(output_fd, buf, size, &obj[0].crc32); hashcpy(obj->sha1, sha1); } From c553ca25bd60dc9fd50b8bc7bd329601b81cee66 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:33 -0400 Subject: [PATCH 06/27] pack-objects: learn about pack index version 2 Pack index version 2 goes as follows: - 8 bytes of header with signature and version. - 256 entries of 4-byte first-level fan-out table. - Table of sorted 20-byte SHA1 records for each object in pack. - Table of 4-byte CRC32 entries for raw pack object data. - Table of 4-byte offset entries for objects in the pack if offset is representable with 31 bits or less, otherwise it is an index in the next table with top bit set. - Table of 8-byte offset entries indexed from previous table for offsets which are 32 bits or more (optional). - 20-byte SHA1 checksum of sorted object names. - 20-byte SHA1 checksum of the above. The object SHA1 table is all contiguous so future pack format that would contain this table directly won't require big changes to the code. It is also tighter for slightly better cache locality when looking up entries. Support for large packs exceeding 31 bits in size won't impose an index size bloat for packs within that range that don't need a 64-bit offset. And because newer objects which are likely to be the most frequently used are located at the beginning of the pack, they won't pay the 64-bit offset lookup at run time either even if the pack is large. Right now an index version 2 is created only when the biggest offset in a pack reaches 31 bits. It might be a good idea to always use index version 2 eventually to benefit from the CRC32 it contains when reusing pack data while repacking. [jc: with the "oops" fix to keep track of the last offset correctly] Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 99 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 03e36f0183..8cf2871751 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -169,13 +169,33 @@ static void prepare_pack_revindex(struct pack_revindex *rix) int i; const char *index = p->index_data; - index += 4 * 256; rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1)); - for (i = 0; i < num_ent; i++) { - uint32_t hl = *((uint32_t *)(index + 24 * i)); - rix->revindex[i].offset = ntohl(hl); - rix->revindex[i].nr = i; + index += 4 * 256; + + if (p->index_version > 1) { + const uint32_t *off_32 = + (uint32_t *)(index + 8 + p->num_objects * (20 + 4)); + const uint32_t *off_64 = off_32 + p->num_objects; + for (i = 0; i < num_ent; i++) { + uint32_t off = ntohl(*off_32++); + if (!(off & 0x80000000)) { + rix->revindex[i].offset = off; + } else { + rix->revindex[i].offset = + ((uint64_t)ntohl(*off_64++)) << 32; + rix->revindex[i].offset |= + ntohl(*off_64++); + } + rix->revindex[i].nr = i; + } + } else { + for (i = 0; i < num_ent; i++) { + uint32_t hl = *((uint32_t *)(index + 24 * i)); + rix->revindex[i].offset = ntohl(hl); + rix->revindex[i].nr = i; + } } + /* This knows the pack format -- the 20-byte trailer * follows immediately after the last object data. */ @@ -528,11 +548,11 @@ static off_t write_one(struct sha1file *f, return offset + size; } -static void write_pack_file(void) +static off_t write_pack_file(void) { uint32_t i; struct sha1file *f; - off_t offset; + off_t offset, last_obj_offset = 0; struct pack_header hdr; unsigned last_percent = 999; int do_progress = progress; @@ -555,6 +575,7 @@ static void write_pack_file(void) if (!nr_result) goto done; for (i = 0; i < nr_objects; i++) { + last_obj_offset = offset; offset = write_one(f, objects + i, offset); if (do_progress) { unsigned percent = written * 100 / nr_result; @@ -572,9 +593,11 @@ static void write_pack_file(void) if (written != nr_result) die("wrote %u objects while expecting %u", written, nr_result); sha1close(f, pack_file_sha1, 1); + + return last_obj_offset; } -static void write_index_file(void) +static void write_index_file(off_t last_obj_offset) { uint32_t i; struct sha1file *f = sha1create("%s-%s.%s", base_name, @@ -582,6 +605,18 @@ static void write_index_file(void) struct object_entry **list = sorted_by_sha; struct object_entry **last = list + nr_result; uint32_t array[256]; + uint32_t index_version; + + /* if last object's offset is >= 2^31 we should use index V2 */ + index_version = (last_obj_offset >> 31) ? 2 : 1; + + /* index versions 2 and above need a header */ + if (index_version >= 2) { + struct pack_idx_header hdr; + hdr.idx_signature = htonl(PACK_IDX_SIGNATURE); + hdr.idx_version = htonl(index_version); + sha1write(f, &hdr, sizeof(hdr)); + } /* * Write the first-level table (the list is sorted, @@ -607,10 +642,49 @@ static void write_index_file(void) list = sorted_by_sha; for (i = 0; i < nr_result; i++) { struct object_entry *entry = *list++; - uint32_t offset = htonl(entry->offset); - sha1write(f, &offset, 4); + if (index_version < 2) { + uint32_t offset = htonl(entry->offset); + sha1write(f, &offset, 4); + } sha1write(f, entry->sha1, 20); } + + if (index_version >= 2) { + unsigned int nr_large_offset = 0; + + /* write the crc32 table */ + list = sorted_by_sha; + for (i = 0; i < nr_objects; i++) { + struct object_entry *entry = *list++; + uint32_t crc32_val = htonl(entry->crc32); + sha1write(f, &crc32_val, 4); + } + + /* write the 32-bit offset table */ + list = sorted_by_sha; + for (i = 0; i < nr_objects; i++) { + struct object_entry *entry = *list++; + uint32_t offset = (entry->offset <= 0x7fffffff) ? + entry->offset : (0x80000000 | nr_large_offset++); + offset = htonl(offset); + sha1write(f, &offset, 4); + } + + /* write the large offset table */ + list = sorted_by_sha; + while (nr_large_offset) { + struct object_entry *entry = *list++; + uint64_t offset = entry->offset; + if (offset > 0x7fffffff) { + uint32_t split[2]; + split[0] = htonl(offset >> 32); + split[1] = htonl(offset & 0xffffffff); + sha1write(f, split, 8); + nr_large_offset--; + } + } + } + sha1write(f, pack_file_sha1, 20); sha1close(f, NULL, 1); } @@ -1698,6 +1772,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (reuse_cached_pack(object_list_sha1)) ; else { + off_t last_obj_offset; if (nr_result) prepare_pack(window, depth); if (progress == 1 && pack_to_stdout) { @@ -1707,9 +1782,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) signal(SIGALRM, SIG_IGN ); progress_update = 0; } - write_pack_file(); + last_obj_offset = write_pack_file(); if (!pack_to_stdout) { - write_index_file(); + write_index_file(last_obj_offset); puts(sha1_to_hex(object_list_sha1)); } } From d1a46a9eab2ce136240fe2e8db34b36338052e97 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:34 -0400 Subject: [PATCH 07/27] index-pack: learn about pack index version 2 Like previous patch but for index-pack. [ There is quite some code duplication between pack-objects and index-pack for generating a pack index (and fast-import as well I suppose). This should be reworked into a common function eventually. But not now. ] Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- index-pack.c | 66 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/index-pack.c b/index-pack.c index d33f723365..a833f640f6 100644 --- a/index-pack.c +++ b/index-pack.c @@ -686,9 +686,10 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1) { struct sha1file *f; struct object_entry **sorted_by_sha, **list, **last; - unsigned int array[256]; + uint32_t array[256]; int i, fd; SHA_CTX ctx; + uint32_t index_version; if (nr_objects) { sorted_by_sha = @@ -699,7 +700,6 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1) sorted_by_sha[i] = &objects[i]; qsort(sorted_by_sha, nr_objects, sizeof(sorted_by_sha[0]), sha1_compare); - } else sorted_by_sha = list = last = NULL; @@ -718,6 +718,17 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1) die("unable to create %s: %s", index_name, strerror(errno)); f = sha1fd(fd, index_name); + /* if last object's offset is >= 2^31 we should use index V2 */ + index_version = (objects[nr_objects-1].offset >> 31) ? 2 : 1; + + /* index versions 2 and above need a header */ + if (index_version >= 2) { + struct pack_idx_header hdr; + hdr.idx_signature = htonl(PACK_IDX_SIGNATURE); + hdr.idx_version = htonl(index_version); + sha1write(f, &hdr, sizeof(hdr)); + } + /* * Write the first-level table (the list is sorted, * but we use a 256-entry lookup to be able to avoid @@ -734,24 +745,61 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1) array[i] = htonl(next - sorted_by_sha); list = next; } - sha1write(f, array, 256 * sizeof(int)); + sha1write(f, array, 256 * 4); - /* recompute the SHA1 hash of sorted object names. - * currently pack-objects does not do this, but that - * can be fixed. - */ + /* compute the SHA1 hash of sorted object names. */ SHA1_Init(&ctx); + /* * Write the actual SHA1 entries.. */ list = sorted_by_sha; for (i = 0; i < nr_objects; i++) { struct object_entry *obj = *list++; - unsigned int offset = htonl(obj->offset); - sha1write(f, &offset, 4); + if (index_version < 2) { + uint32_t offset = htonl(obj->offset); + sha1write(f, &offset, 4); + } sha1write(f, obj->sha1, 20); SHA1_Update(&ctx, obj->sha1, 20); } + + if (index_version >= 2) { + unsigned int nr_large_offset = 0; + + /* write the crc32 table */ + list = sorted_by_sha; + for (i = 0; i < nr_objects; i++) { + struct object_entry *obj = *list++; + uint32_t crc32_val = htonl(obj->crc32); + sha1write(f, &crc32_val, 4); + } + + /* write the 32-bit offset table */ + list = sorted_by_sha; + for (i = 0; i < nr_objects; i++) { + struct object_entry *obj = *list++; + uint32_t offset = (obj->offset <= 0x7fffffff) ? + obj->offset : (0x80000000 | nr_large_offset++); + offset = htonl(offset); + sha1write(f, &offset, 4); + } + + /* write the large offset table */ + list = sorted_by_sha; + while (nr_large_offset) { + struct object_entry *obj = *list++; + uint64_t offset = obj->offset; + if (offset > 0x7fffffff) { + uint32_t split[2]; + split[0] = htonl(offset >> 32); + split[1] = htonl(offset & 0xffffffff); + sha1write(f, split, 8); + nr_large_offset--; + } + } + } + sha1write(f, sha1, 20); sha1close(f, NULL, 1); free(sorted_by_sha); From 74e34e1fca2ed9998581cc94073bc2dd28bbb8f3 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:35 -0400 Subject: [PATCH 08/27] sha1_file.c: learn about index version 2 With this patch, packs larger than 4GB are usable, even on a 32-bit machine (at least on Linux). If off_t is not large enough to deal with a large pack then die() is called instead of attempting to use the pack and producing garbage. This was tested with a 8GB pack specially created for the occasion on a 32-bit machine. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- sha1_file.c | 118 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 29 deletions(-) diff --git a/sha1_file.c b/sha1_file.c index ebdd497bad..0be9737bd1 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -437,7 +437,7 @@ static int check_packed_git_idx(const char *path, struct packed_git *p) void *idx_map; struct pack_idx_header *hdr; size_t idx_size; - uint32_t nr, i, *index; + uint32_t version, nr, i, *index; int fd = open(path, O_RDONLY); struct stat st; @@ -455,21 +455,23 @@ static int check_packed_git_idx(const char *path, struct packed_git *p) idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0); close(fd); - /* a future index format would start with this, as older git - * binaries would fail the non-monotonic index check below. - * give a nicer warning to the user if we can. - */ hdr = idx_map; if (hdr->idx_signature == htonl(PACK_IDX_SIGNATURE)) { - munmap(idx_map, idx_size); - return error("index file %s is a newer version" - " and is not supported by this binary" - " (try upgrading GIT to a newer version)", - path); - } + version = ntohl(hdr->idx_version); + if (version < 2 || version > 2) { + munmap(idx_map, idx_size); + return error("index file %s is version %d" + " and is not supported by this binary" + " (try upgrading GIT to a newer version)", + path, version); + } + } else + version = 1; nr = 0; index = idx_map; + if (version > 1) + index += 2; /* skip index header */ for (i = 0; i < 256; i++) { uint32_t n = ntohl(index[i]); if (n < nr) { @@ -479,19 +481,48 @@ static int check_packed_git_idx(const char *path, struct packed_git *p) nr = n; } - /* - * Total size: - * - 256 index entries 4 bytes each - * - 24-byte entries * nr (20-byte sha1 + 4-byte offset) - * - 20-byte SHA1 of the packfile - * - 20-byte SHA1 file checksum - */ - if (idx_size != 4*256 + nr * 24 + 20 + 20) { - munmap(idx_map, idx_size); - return error("wrong index file size in %s", path); + if (version == 1) { + /* + * Total size: + * - 256 index entries 4 bytes each + * - 24-byte entries * nr (20-byte sha1 + 4-byte offset) + * - 20-byte SHA1 of the packfile + * - 20-byte SHA1 file checksum + */ + if (idx_size != 4*256 + nr * 24 + 20 + 20) { + munmap(idx_map, idx_size); + return error("wrong index file size in %s", path); + } + } else if (version == 2) { + /* + * Minimum size: + * - 8 bytes of header + * - 256 index entries 4 bytes each + * - 20-byte sha1 entry * nr + * - 4-byte crc entry * nr + * - 4-byte offset entry * nr + * - 20-byte SHA1 of the packfile + * - 20-byte SHA1 file checksum + * And after the 4-byte offset table might be a + * variable sized table containing 8-byte entries + * for offsets larger than 2^31. + */ + unsigned long min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20; + if (idx_size < min_size || idx_size > min_size + (nr - 1)*8) { + munmap(idx_map, idx_size); + return error("wrong index file size in %s", path); + } + if (idx_size != min_size) { + /* make sure we can deal with large pack offsets */ + off_t x = 0x7fffffffUL, y = 0xffffffffUL; + if (x > (x + 1) || y > (y + 1)) { + munmap(idx_map, idx_size); + return error("pack too large for current definition of off_t in %s", path); + } + } } - p->index_version = 1; + p->index_version = version; p->index_data = idx_map; p->index_size = idx_size; p->num_objects = nr; @@ -1531,27 +1562,56 @@ const unsigned char *nth_packed_object_sha1(const struct packed_git *p, uint32_t n) { const unsigned char *index = p->index_data; - index += 4 * 256; if (n >= p->num_objects) return NULL; - return index + 24 * n + 4; + index += 4 * 256; + if (p->index_version == 1) { + return index + 24 * n + 4; + } else { + index += 8; + return index + 20 * n; + } +} + +static off_t nth_packed_object_offset(const struct packed_git *p, uint32_t n) +{ + const unsigned char *index = p->index_data; + index += 4 * 256; + if (p->index_version == 1) { + return ntohl(*((uint32_t *)(index + 24 * n))); + } else { + uint32_t off; + index += 8 + p->num_objects * (20 + 4); + off = ntohl(*((uint32_t *)(index + 4 * n))); + if (!(off & 0x80000000)) + return off; + index += p->num_objects * 4 + (off & 0x7fffffff) * 8; + return (((uint64_t)ntohl(*((uint32_t *)(index + 0)))) << 32) | + ntohl(*((uint32_t *)(index + 4))); + } } off_t find_pack_entry_one(const unsigned char *sha1, struct packed_git *p) { const uint32_t *level1_ofs = p->index_data; - int hi = ntohl(level1_ofs[*sha1]); - int lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1])); const unsigned char *index = p->index_data; + unsigned hi, lo; + if (p->index_version > 1) { + level1_ofs += 2; + index += 8; + } index += 4 * 256; + hi = ntohl(level1_ofs[*sha1]); + lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1])); do { - int mi = (lo + hi) / 2; - int cmp = hashcmp(index + 24 * mi + 4, sha1); + unsigned mi = (lo + hi) / 2; + unsigned x = (p->index_version > 1) ? (mi * 20) : (mi * 24 + 4); + int cmp = hashcmp(index + x, sha1); if (!cmp) - return ntohl(*((uint32_t *)((char *)index + (24 * mi)))); + return nth_packed_object_offset(p, mi); if (cmp > 0) hi = mi; else From 32637cdf4a6a8d806feaa6b018bc4e4c7e2eb366 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:36 -0400 Subject: [PATCH 09/27] show-index.c: learn about index v2 When index v2 is encountered, the CRC32 of each object is also displayed in parenthesis at the end of the line. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- show-index.c | 68 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/show-index.c b/show-index.c index a30a2de5d1..57ed9e87b7 100644 --- a/show-index.c +++ b/show-index.c @@ -1,14 +1,26 @@ #include "cache.h" +#include "pack.h" int main(int argc, char **argv) { int i; unsigned nr; - unsigned int entry[6]; + unsigned int version; static unsigned int top_index[256]; - if (fread(top_index, sizeof(top_index), 1, stdin) != 1) - die("unable to read index"); + if (fread(top_index, 2 * 4, 1, stdin) != 1) + die("unable to read header"); + if (top_index[0] == htonl(PACK_IDX_SIGNATURE)) { + version = ntohl(top_index[1]); + if (version < 2 || version > 2) + die("unknown index version"); + if (fread(top_index, 256 * 4, 1, stdin) != 1) + die("unable to read index"); + } else { + version = 1; + if (fread(&top_index[2], 254 * 4, 1, stdin) != 1) + die("unable to read index"); + } nr = 0; for (i = 0; i < 256; i++) { unsigned n = ntohl(top_index[i]); @@ -16,13 +28,51 @@ int main(int argc, char **argv) die("corrupt index file"); nr = n; } - for (i = 0; i < nr; i++) { - unsigned offset; + if (version == 1) { + for (i = 0; i < nr; i++) { + unsigned int offset, entry[6]; - if (fread(entry, 24, 1, stdin) != 1) - die("unable to read entry %u/%u", i, nr); - offset = ntohl(entry[0]); - printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1))); + if (fread(entry, 4 + 20, 1, stdin) != 1) + die("unable to read entry %u/%u", i, nr); + offset = ntohl(entry[0]); + printf("%u %s\n", offset, sha1_to_hex((void *)(entry+1))); + } + } else { + unsigned off64_nr = 0; + struct { + unsigned char sha1[20]; + uint32_t crc; + uint32_t off; + } *entries = xmalloc(nr * sizeof(entries[0])); + for (i = 0; i < nr; i++) + if (fread(entries[i].sha1, 20, 1, stdin) != 1) + die("unable to read sha1 %u/%u", i, nr); + for (i = 0; i < nr; i++) + if (fread(&entries[i].crc, 4, 1, stdin) != 1) + die("unable to read crc %u/%u", i, nr); + for (i = 0; i < nr; i++) + if (fread(&entries[i].off, 4, 1, stdin) != 1) + die("unable to read 32b offset %u/%u", i, nr); + for (i = 0; i < nr; i++) { + uint64_t offset; + uint32_t off = ntohl(entries[i].off); + if (!(off & 0x80000000)) { + offset = off; + } else { + uint32_t off64[2]; + if ((off & 0x7fffffff) != off64_nr) + die("inconsistent 64b offset index"); + if (fread(off64, 8, 1, stdin) != 1) + die("unable to read 64b offset %u", off64_nr); + offset = (((uint64_t)ntohl(off64[0])) << 32) | + ntohl(off64[1]); + off64_nr++; + } + printf("%llu %s (%08x)\n", (unsigned long long) offset, + sha1_to_hex(entries[i].sha1), + ntohl(entries[i].crc)); + } + free(entries); } return 0; } From 8c681e07c91b544756e2631493ebc15abd1e8589 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:37 -0400 Subject: [PATCH 10/27] pack-redundant.c: learn about index v2 Initially the conversion was made using nth_packed_object_sha1() which made this file completely index version agnostic. Unfortunately the overhead was quite significant so I went back to raw index walking but with selectable base and step values which brought back similar performances as the original. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- pack-redundant.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/pack-redundant.c b/pack-redundant.c index 40e579b2d9..87077e150c 100644 --- a/pack-redundant.c +++ b/pack-redundant.c @@ -247,16 +247,19 @@ static struct pack_list * pack_list_difference(const struct pack_list *A, static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2) { - int p1_off, p2_off; + unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step; const unsigned char *p1_base, *p2_base; struct llist_item *p1_hint = NULL, *p2_hint = NULL; - p1_off = p2_off = 256 * 4 + 4; p1_base = p1->pack->index_data; p2_base = p2->pack->index_data; + p1_base += 256 * 4 + ((p1->pack->index_version < 2) ? 4 : 8); + p2_base += 256 * 4 + ((p2->pack->index_version < 2) ? 4 : 8); + p1_step = (p1->pack->index_version < 2) ? 24 : 20; + p2_step = (p2->pack->index_version < 2) ? 24 : 20; - while (p1_off <= p1->pack->index_size - 3 * 20 && - p2_off <= p2->pack->index_size - 3 * 20) + while (p1_off < p1->pack->num_objects * p1_step && + p2_off < p2->pack->num_objects * p2_step) { int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off); /* cmp ~ p1 - p2 */ @@ -265,14 +268,14 @@ static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2) p1_base + p1_off, p1_hint); p2_hint = llist_sorted_remove(p2->unique_objects, p1_base + p1_off, p2_hint); - p1_off+=24; - p2_off+=24; + p1_off += p1_step; + p2_off += p2_step; continue; } if (cmp < 0) { /* p1 has the object, p2 doesn't */ - p1_off+=24; + p1_off += p1_step; } else { /* p2 has the object, p1 doesn't */ - p2_off+=24; + p2_off += p2_step; } } } @@ -352,28 +355,31 @@ static int is_superset(struct pack_list *pl, struct llist *list) static size_t sizeof_union(struct packed_git *p1, struct packed_git *p2) { size_t ret = 0; - int p1_off, p2_off; + unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step; const unsigned char *p1_base, *p2_base; - p1_off = p2_off = 256 * 4 + 4; p1_base = p1->index_data; p2_base = p2->index_data; + p1_base += 256 * 4 + ((p1->index_version < 2) ? 4 : 8); + p2_base += 256 * 4 + ((p2->index_version < 2) ? 4 : 8); + p1_step = (p1->index_version < 2) ? 24 : 20; + p2_step = (p2->index_version < 2) ? 24 : 20; - while (p1_off <= p1->index_size - 3 * 20 && - p2_off <= p2->index_size - 3 * 20) + while (p1_off < p1->num_objects * p1_step && + p2_off < p2->num_objects * p2_step) { int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off); /* cmp ~ p1 - p2 */ if (cmp == 0) { ret++; - p1_off+=24; - p2_off+=24; + p1_off += p1_step; + p2_off += p2_step; continue; } if (cmp < 0) { /* p1 has the object, p2 doesn't */ - p1_off+=24; + p1_off += p1_step; } else { /* p2 has the object, p1 doesn't */ - p2_off+=24; + p2_off += p2_step; } } return ret; @@ -535,7 +541,7 @@ static void scan_alt_odb_packs(void) static struct pack_list * add_pack(struct packed_git *p) { struct pack_list l; - size_t off; + unsigned long off = 0, step; const unsigned char *base; if (!p->pack_local && !(alt_odb || verbose)) @@ -544,11 +550,12 @@ static struct pack_list * add_pack(struct packed_git *p) l.pack = p; llist_init(&l.all_objects); - off = 256 * 4 + 4; base = p->index_data; - while (off <= p->index_size - 3 * 20) { + base += 256 * 4 + ((p->index_version < 2) ? 4 : 8); + step = (p->index_version < 2) ? 24 : 20; + while (off < p->num_objects * step) { llist_insert_back(l.all_objects, base + off); - off += 24; + off += step; } /* this list will be pruned in cmp_two_packs later */ l.unique_objects = llist_copy(l.all_objects); From 4ba7d711539122c21bd44af06b6cab4fc4f65a74 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 17:32:03 -0400 Subject: [PATCH 11/27] allow forcing index v2 and 64-bit offset treshold This is necessary for testing the new capabilities in some automated way without having an actual 4GB+ pack. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 20 +++++++++++++++++--- index-pack.c | 18 +++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 8cf2871751..099dea0e1e 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -597,6 +597,9 @@ static off_t write_pack_file(void) return last_obj_offset; } +static uint32_t index_default_version = 1; +static uint32_t index_off32_limit = 0x7fffffff; + static void write_index_file(off_t last_obj_offset) { uint32_t i; @@ -608,7 +611,7 @@ static void write_index_file(off_t last_obj_offset) uint32_t index_version; /* if last object's offset is >= 2^31 we should use index V2 */ - index_version = (last_obj_offset >> 31) ? 2 : 1; + index_version = (last_obj_offset >> 31) ? 2 : index_default_version; /* index versions 2 and above need a header */ if (index_version >= 2) { @@ -664,7 +667,7 @@ static void write_index_file(off_t last_obj_offset) list = sorted_by_sha; for (i = 0; i < nr_objects; i++) { struct object_entry *entry = *list++; - uint32_t offset = (entry->offset <= 0x7fffffff) ? + uint32_t offset = (entry->offset <= index_off32_limit) ? entry->offset : (0x80000000 | nr_large_offset++); offset = htonl(offset); sha1write(f, &offset, 4); @@ -675,7 +678,7 @@ static void write_index_file(off_t last_obj_offset) while (nr_large_offset) { struct object_entry *entry = *list++; uint64_t offset = entry->offset; - if (offset > 0x7fffffff) { + if (offset > index_off32_limit) { uint32_t split[2]; split[0] = htonl(offset >> 32); split[1] = htonl(offset & 0xffffffff); @@ -1714,6 +1717,17 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) rp_av[1] = "--objects-edge"; continue; } + if (!prefixcmp(arg, "--index-version=")) { + char *c; + index_default_version = strtoul(arg + 16, &c, 10); + if (index_default_version > 2) + die("bad %s", arg); + if (*c == ',') + index_off32_limit = strtoul(c+1, &c, 0); + if (*c || index_off32_limit & 0x80000000) + die("bad %s", arg); + continue; + } usage(pack_usage); } diff --git a/index-pack.c b/index-pack.c index a833f640f6..7aad261d48 100644 --- a/index-pack.c +++ b/index-pack.c @@ -671,6 +671,9 @@ static void readjust_pack_header_and_sha1(unsigned char *sha1) write_or_die(output_fd, sha1, 20); } +static uint32_t index_default_version = 1; +static uint32_t index_off32_limit = 0x7fffffff; + static int sha1_compare(const void *_a, const void *_b) { struct object_entry *a = *(struct object_entry **)_a; @@ -719,7 +722,7 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1) f = sha1fd(fd, index_name); /* if last object's offset is >= 2^31 we should use index V2 */ - index_version = (objects[nr_objects-1].offset >> 31) ? 2 : 1; + index_version = (objects[nr_objects-1].offset >> 31) ? 2 : index_default_version; /* index versions 2 and above need a header */ if (index_version >= 2) { @@ -779,7 +782,7 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1) list = sorted_by_sha; for (i = 0; i < nr_objects; i++) { struct object_entry *obj = *list++; - uint32_t offset = (obj->offset <= 0x7fffffff) ? + uint32_t offset = (obj->offset <= index_off32_limit) ? obj->offset : (0x80000000 | nr_large_offset++); offset = htonl(offset); sha1write(f, &offset, 4); @@ -790,7 +793,7 @@ static const char *write_index_file(const char *index_name, unsigned char *sha1) while (nr_large_offset) { struct object_entry *obj = *list++; uint64_t offset = obj->offset; - if (offset > 0x7fffffff) { + if (offset > index_off32_limit) { uint32_t split[2]; split[0] = htonl(offset >> 32); split[1] = htonl(offset & 0xffffffff); @@ -929,6 +932,15 @@ int main(int argc, char **argv) if (index_name || (i+1) >= argc) usage(index_pack_usage); index_name = argv[++i]; + } else if (!prefixcmp(arg, "--index-version=")) { + char *c; + index_default_version = strtoul(arg + 16, &c, 10); + if (index_default_version > 2) + die("bad %s", arg); + if (*c == ',') + index_off32_limit = strtoul(c+1, &c, 0); + if (*c || index_off32_limit & 0x80000000) + die("bad %s", arg); } else usage(index_pack_usage); continue; From 91ecbeca48e675fef8141451edb2c86577f9d63c Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 10 Apr 2007 00:15:41 -0400 Subject: [PATCH 12/27] validate reused pack data with CRC when possible This replaces the inflate validation with a CRC validation when reusing data from a pack which uses index version 2. That makes repacking much safer against corruptions, and it should be a bit faster too. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 46 +++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 099dea0e1e..687b4b5a99 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -233,12 +233,6 @@ static struct revindex_entry * find_packed_object(struct packed_git *p, die("internal error: pack revindex corrupt"); } -static off_t find_packed_object_size(struct packed_git *p, off_t ofs) -{ - struct revindex_entry *entry = find_packed_object(p, ofs); - return entry[1].offset - ofs; -} - static const unsigned char *find_packed_object_name(struct packed_git *p, off_t ofs) { @@ -321,6 +315,28 @@ static int check_pack_inflate(struct packed_git *p, stream.total_in == len) ? 0 : -1; } +static int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, + off_t offset, off_t len, unsigned int nr) +{ + const uint32_t *index_crc; + uint32_t data_crc = crc32(0, Z_NULL, 0); + + do { + unsigned int avail; + void *data = use_pack(p, w_curs, offset, &avail); + if (avail > len) + avail = len; + data_crc = crc32(data_crc, data, avail); + offset += avail; + len -= avail; + } while (len); + + index_crc = p->index_data; + index_crc += 2 + 256 + p->num_objects * (20/4) + nr; + + return data_crc != ntohl(*index_crc); +} + static void copy_pack_data(struct sha1file *f, struct packed_git *p, struct pack_window **w_curs, @@ -485,6 +501,7 @@ static unsigned long write_object(struct sha1file *f, else { struct packed_git *p = entry->in_pack; struct pack_window *w_curs = NULL; + struct revindex_entry *revidx; off_t offset; if (entry->delta) { @@ -507,12 +524,17 @@ static unsigned long write_object(struct sha1file *f, hdrlen += 20; } - offset = entry->in_pack_offset + entry->in_pack_header_size; - datalen = find_packed_object_size(p, entry->in_pack_offset) - - entry->in_pack_header_size; - if (!pack_to_stdout && check_pack_inflate(p, &w_curs, - offset, datalen, entry->size)) - die("corrupt delta in pack %s", sha1_to_hex(entry->sha1)); + offset = entry->in_pack_offset; + revidx = find_packed_object(p, offset); + datalen = revidx[1].offset - offset; + if (!pack_to_stdout && p->index_version > 1 && + check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) + die("bad packed object CRC for %s", sha1_to_hex(entry->sha1)); + offset += entry->in_pack_header_size; + datalen -= entry->in_pack_header_size; + if (!pack_to_stdout && p->index_version == 1 && + check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) + die("corrupt packed object for %s", sha1_to_hex(entry->sha1)); copy_pack_data(f, p, &w_curs, offset, datalen); unuse_pack(&w_curs); reused++; From 2dca1af44806924946828e2fdc82ea408353d286 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 11 Apr 2007 13:59:51 -0400 Subject: [PATCH 13/27] simple random data generator for tests Reliance on /dev/urandom produces test vectors that are, well, random. This can cause problems impossible to track down when the data is different from one test invokation to another. The goal is not to have random data to test, but rather to have a convenient way to create sets of large files with non compressible and non deltifiable data in a reproducible way. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- .gitignore | 1 + Makefile | 7 +++++-- test-genrandom.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 test-genrandom.c diff --git a/.gitignore b/.gitignore index 9229e918cd..fa7ac93594 100644 --- a/.gitignore +++ b/.gitignore @@ -149,6 +149,7 @@ test-chmtime test-date test-delta test-dump-cache-tree +test-genrandom test-match-trees common-cmds.h *.tar.gz diff --git a/Makefile b/Makefile index a77d31de98..bd0ba95b1a 100644 --- a/Makefile +++ b/Makefile @@ -932,7 +932,7 @@ endif export NO_SVN_TESTS -test: all test-chmtime$X +test: all test-chmtime$X test-genrandom$X $(MAKE) -C t/ all test-date$X: test-date.c date.o ctype.o @@ -953,6 +953,9 @@ test-match-trees$X: test-match-trees.o $(GITLIBS) test-chmtime$X: test-chmtime.c $(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $< +test-genrandom$X: test-genrandom.c + $(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $< + check-sha1:: test-sha1$X ./test-sha1.sh @@ -1041,7 +1044,7 @@ dist-doc: clean: rm -f *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \ - test-chmtime$X $(LIB_FILE) $(XDIFF_LIB) + test-chmtime$X test-genrandom$X $(LIB_FILE) $(XDIFF_LIB) rm -f $(ALL_PROGRAMS) $(BUILT_INS) git$X rm -f *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags rm -rf autom4te.cache diff --git a/test-genrandom.c b/test-genrandom.c new file mode 100644 index 0000000000..8cefe6cfed --- /dev/null +++ b/test-genrandom.c @@ -0,0 +1,34 @@ +/* + * Simple random data generator used to create reproducible test files. + * This is inspired from POSIX.1-2001 implementation example for rand(). + * Copyright (C) 2007 by Nicolas Pitre, licensed under the GPL version 2. + */ + +#include +#include + +int main(int argc, char *argv[]) +{ + unsigned long count, next = 0; + unsigned char *c; + + if (argc < 2 || argc > 3) { + fprintf( stderr, "Usage: %s []", argv[0]); + return 1; + } + + c = (unsigned char *) argv[1]; + do { + next = next * 11 + *c; + } while (*c++); + + count = (argc == 3) ? strtoul(argv[2], NULL, 0) : -1L; + + while (count--) { + next = next * 1103515245 + 12345; + if (putchar((next >> 16) & 0xff) == EOF) + return -1; + } + + return 0; +} From 39551b6926ef869e15af41f4eaa9356da98b2b5e Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 11 Apr 2007 13:35:13 -0400 Subject: [PATCH 14/27] use test-genrandom in tests instead of /dev/urandom This way tests are completely deterministic and possibly more portable. Signed-off-by: Nicolas Pitre --- t/t5301-sliding-window.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/t5301-sliding-window.sh b/t/t5301-sliding-window.sh index a6dbb04a86..fce77f1255 100755 --- a/t/t5301-sliding-window.sh +++ b/t/t5301-sliding-window.sh @@ -12,7 +12,7 @@ test_expect_success \ for i in a b c do echo $i >$i && - dd if=/dev/urandom bs=32k count=1 >>$i && + test-genrandom "$i" 32768 >>$i && git-update-index --add $i || return 1 done && echo d >d && cat c >>d && git-update-index --add d && From 6e5417769c587c8b1f3412e19a1f80645fe82a3c Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 10 Apr 2007 16:26:10 -0400 Subject: [PATCH 15/27] tests for various pack index features This is a fairly complete list of tests for various aspects of pack index versions 1 and 2. Tests on index v2 include 32-bit and 64-bit offsets, as well as a nice demonstration of the flawed repacking integrity checks that index version 2 intend to solve over index version 1 with the per object CRC. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- t/t5302-pack-index.sh | 146 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100755 t/t5302-pack-index.sh diff --git a/t/t5302-pack-index.sh b/t/t5302-pack-index.sh new file mode 100755 index 0000000000..232e5f1964 --- /dev/null +++ b/t/t5302-pack-index.sh @@ -0,0 +1,146 @@ +#!/bin/sh +# +# Copyright (c) 2007 Nicolas Pitre +# + +test_description='pack index with 64-bit offsets and object CRC' +. ./test-lib.sh + +test_expect_success \ + 'setup' \ + 'rm -rf .git + git-init && + for i in `seq -w 100` + do + echo $i >file_$i && + test-genrandom "$i" 8192 >>file_$i && + git-update-index --add file_$i || return 1 + done && + echo 101 >file_101 && tail -c 8192 file_100 >>file_101 && + git-update-index --add file_101 && + tree=`git-write-tree` && + commit=`git-commit-tree $tree obj-list && + git-update-ref HEAD $commit' + +test_expect_success \ + 'pack-objects with index version 1' \ + 'pack1=$(git-pack-objects --index-version=1 test-1 blob_1 && + chmod +w ".git/objects/pack/pack-${pack1}.pack" && + dd of=".git/objects/pack/pack-${pack1}.pack" seek=$(($delta_offs + 1)) \ + if=".git/objects/pack/pack-${pack1}.idx" skip=$((256 * 4 + 4)) \ + bs=1 count=20 conv=notrunc && + git-cat-file blob "$delta_sha1" > blob_2 )' + +test_expect_failure \ + '[index v1] 3) corrupted delta happily returned wrong data' \ + 'cmp blob_1 blob_2' + +test_expect_failure \ + '[index v1] 4) confirm that the pack is actually corrupted' \ + 'git-fsck --full $commit' + +test_expect_success \ + '[index v1] 5) pack-objects happily reuses corrupted data' \ + 'pack4=$(git-pack-objects test-4 blob_3 && + chmod +w ".git/objects/pack/pack-${pack1}.pack" && + dd of=".git/objects/pack/pack-${pack1}.pack" seek=$(($delta_offs + 1)) \ + if=".git/objects/pack/pack-${pack1}.idx" skip=$((8 + 256 * 4)) \ + bs=1 count=20 conv=notrunc && + git-cat-file blob "$delta_sha1" > blob_4 )' + +test_expect_failure \ + '[index v2] 3) corrupted delta happily returned wrong data' \ + 'cmp blob_3 blob_4' + +test_expect_failure \ + '[index v2] 4) confirm that the pack is actually corrupted' \ + 'git-fsck --full $commit' + +test_expect_failure \ + '[index v2] 5) pack-objects refuses to reuse corrupted data' \ + 'git-pack-objects test-5 Date: Tue, 10 Apr 2007 22:54:36 -0400 Subject: [PATCH 16/27] clean up add_object_entry() This function used to call locate_object_entry_hash() _twice_ per added object while only once should suffice. Let's reorganize that code a bit. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 52 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 687b4b5a99..bc5f2329a8 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -781,12 +781,19 @@ static unsigned name_hash(const char *name) static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude) { - uint32_t idx = nr_objects; struct object_entry *entry; - struct packed_git *p; + struct packed_git *p, *found_pack = NULL; off_t found_offset = 0; - struct packed_git *found_pack = NULL; - int ix, status = 0; + int ix; + + ix = nr_objects ? locate_object_entry_hash(sha1) : -1; + if (ix >= 0) { + if (exclude) { + entry = objects + object_ix[ix] - 1; + entry->preferred_base = 1; + } + return 0; + } if (!exclude) { for (p = packed_git; p; p = p->next) { @@ -803,43 +810,34 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud } } } - if ((entry = locate_object_entry(sha1)) != NULL) - goto already_added; - if (idx >= nr_alloc) { - nr_alloc = (idx + 1024) * 3 / 2; + if (nr_objects >= nr_alloc) { + nr_alloc = (nr_alloc + 1024) * 3 / 2; objects = xrealloc(objects, nr_alloc * sizeof(*entry)); } - entry = objects + idx; - nr_objects = idx + 1; + + entry = objects + nr_objects++; memset(entry, 0, sizeof(*entry)); hashcpy(entry->sha1, sha1); entry->hash = hash; + if (exclude) + entry->preferred_base = 1; + if (found_pack) { + entry->in_pack = found_pack; + entry->in_pack_offset = found_offset; + } if (object_ix_hashsz * 3 <= nr_objects * 4) rehash_objects(); - else { - ix = locate_object_entry_hash(entry->sha1); - if (0 <= ix) - die("internal error in object hashing."); - object_ix[-1 - ix] = idx + 1; - } - status = 1; + else + object_ix[-1 - ix] = nr_objects; - already_added: if (progress_update) { fprintf(stderr, "Counting objects...%u\r", nr_objects); progress_update = 0; } - if (exclude) - entry->preferred_base = 1; - else { - if (found_pack) { - entry->in_pack = found_pack; - entry->in_pack_offset = found_offset; - } - } - return status; + + return 1; } struct pbase_tree_cache { From 8a5a8d6c97e36dbd95361eab1109e4380fe45df4 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:28:10 -0400 Subject: [PATCH 17/27] pack-objects: optimize preferred base handling a bit Let's avoid some cycles when there is no base to test against, and avoid unnecessary object lookups. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index bc5f2329a8..62a011e2e9 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -959,22 +959,21 @@ static void add_pbase_object(struct tree_desc *tree, const char *fullname) { struct name_entry entry; + int cmp; while (tree_entry(tree,&entry)) { - unsigned long size; - enum object_type type; - - if (tree_entry_len(entry.path, entry.sha1) != cmplen || - memcmp(entry.path, name, cmplen) || - !has_sha1_file(entry.sha1) || - (type = sha1_object_info(entry.sha1, &size)) < 0) + cmp = tree_entry_len(entry.path, entry.sha1) != cmplen ? 1 : + memcmp(name, entry.path, cmplen); + if (cmp > 0) continue; + if (cmp < 0) + return; if (name[cmplen] != '/') { unsigned hash = name_hash(fullname); add_object_entry(entry.sha1, hash, 1); return; } - if (type == OBJ_TREE) { + if (S_ISDIR(entry.mode)) { struct tree_desc sub; struct pbase_tree_cache *tree; const char *down = name+cmplen+1; @@ -1034,15 +1033,15 @@ static int check_pbase_path(unsigned hash) static void add_preferred_base_object(const char *name, unsigned hash) { struct pbase_tree *it; - int cmplen = name_cmp_len(name); + int cmplen; - if (check_pbase_path(hash)) + if (!num_preferred_base || check_pbase_path(hash)) return; + cmplen = name_cmp_len(name); for (it = pbase_tree; it; it = it->next) { if (cmplen == 0) { - hash = name_hash(""); - add_object_entry(it->pcache.sha1, hash, 1); + add_object_entry(it->pcache.sha1, 0, 1); } else { struct tree_desc tree; @@ -1587,9 +1586,7 @@ static void read_object_list_from_stdin(void) static void show_commit(struct commit *commit) { - unsigned hash = name_hash(""); - add_preferred_base_object("", hash); - add_object_entry(commit->object.sha1, hash, 0); + add_object_entry(commit->object.sha1, 0, 0); } static void show_object(struct object_array_entry *p) From adcc70950e594065050c375ace8a039678d2e31f Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:28:52 -0400 Subject: [PATCH 18/27] pack-objects: equal objects in size should delta against newer objects Before finding best delta combinations, we sort objects by name hash, then by size, then by their position in memory. Then we walk the list backwards to test delta candidates. We hope that a bigger size usually means a newer objects. But a bigger address in memory does not mean a newer object. So the last comparison must be reversed. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 62a011e2e9..869ca1ab26 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -1276,7 +1276,7 @@ static int type_size_sort(const struct object_entry *a, const struct object_entr return -1; if (a->size > b->size) return 1; - return a < b ? -1 : (a > b); + return a > b ? -1 : (a < b); /* newest last */ } struct unpacked { From 898b14cedc353de95945fcc56e14f463c3066bf0 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:29:16 -0400 Subject: [PATCH 19/27] pack-objects: rework check_delta_limit usage Objects that have delta "children" from pack data reuse must consider the depth of their deepest child when they try to deltify themselves for those children not to become too deep. However, in the context of a "thin" pack, the delta children depth was skipped entirely on the presumption that the pack was always going to be exploded on the receiving end, hence the delta length wasn't an issue. Now that we keep received packs as is and reuse pack data when repacking, those packs do contain delta chains that are longer than expected. Worse, those delta chain may even grow longer when the pack is further repacked into another thin pack for a subsequent transmission. So this patch restores strict delta length even for thin packs, and it moves check_delta_limit() usage directly in the delta loop where it is needed. This way the delta_limit can be removed from struct object_entry as well. Oh and the initial value was wrong too. The progress_interval() function was moved to a more logical location in the process. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 76 ++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 869ca1ab26..d44b8f4c01 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -27,7 +27,6 @@ struct object_entry { * nonzero if already written. */ unsigned int depth; /* delta depth */ - unsigned int delta_limit; /* base adjustment for in-pack delta */ unsigned int hash; /* name hint hash */ enum object_type type; enum object_type in_pack_type; /* could be delta */ @@ -1172,19 +1171,6 @@ static void check_object(struct object_entry *entry) sha1_to_hex(entry->sha1)); } -static unsigned int check_delta_limit(struct object_entry *me, unsigned int n) -{ - struct object_entry *child = me->delta_child; - unsigned int m = n; - while (child) { - unsigned int c = check_delta_limit(child, n + 1); - if (m < c) - m = c; - child = child->delta_sibling; - } - return m; -} - static void get_object_details(void) { uint32_t i; @@ -1193,23 +1179,6 @@ static void get_object_details(void) prepare_pack_ix(); for (i = 0, entry = objects; i < nr_objects; i++, entry++) check_object(entry); - - if (nr_objects == nr_result) { - /* - * Depth of objects that depend on the entry -- this - * is subtracted from depth-max to break too deep - * delta chain because of delta data reusing. - * However, we loosen this restriction when we know we - * are creating a thin pack -- it will have to be - * expanded on the other end anyway, so do not - * artificially cut the delta chain and let it go as - * deep as it wants. - */ - for (i = 0, entry = objects; i < nr_objects; i++, entry++) - if (!entry->delta && entry->delta_child) - entry->delta_limit = - check_delta_limit(entry, 1); - } } typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *); @@ -1322,16 +1291,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, trg_entry->in_pack_type != OBJ_OFS_DELTA) return 0; - /* - * If the current object is at pack edge, take the depth the - * objects that depend on the current object into account -- - * otherwise they would become too deep. - */ - if (trg_entry->delta_child) { - if (max_depth <= trg_entry->delta_limit) - return 0; - max_depth -= trg_entry->delta_limit; - } + /* Let's not bust the allowed depth. */ if (src_entry->depth >= max_depth) return 0; @@ -1378,9 +1338,17 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, return 1; } -static void progress_interval(int signum) +static unsigned int check_delta_limit(struct object_entry *me, unsigned int n) { - progress_update = 1; + struct object_entry *child = me->delta_child; + unsigned int m = n; + while (child) { + unsigned int c = check_delta_limit(child, n + 1); + if (m < c) + m = c; + child = child->delta_sibling; + } + return m; } static void find_deltas(struct object_entry **list, int window, int depth) @@ -1389,6 +1357,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) unsigned int array_size = window * sizeof(struct unpacked); struct unpacked *array; unsigned last_percent = 999; + int max_depth; if (!nr_objects) return; @@ -1429,6 +1398,18 @@ static void find_deltas(struct object_entry **list, int window, int depth) n->data = NULL; n->entry = entry; + /* + * If the current object is at pack edge, take the depth the + * objects that depend on the current object into account + * otherwise they would become too deep. + */ + max_depth = depth; + if (entry->delta_child) { + max_depth -= check_delta_limit(entry, 0); + if (max_depth <= 0) + goto next; + } + j = window; while (--j > 0) { uint32_t other_idx = idx + j; @@ -1438,9 +1419,10 @@ static void find_deltas(struct object_entry **list, int window, int depth) m = array + other_idx; if (!m->entry) break; - if (try_delta(n, m, depth) < 0) + if (try_delta(n, m, max_depth) < 0) break; } + /* if we made n a delta, and if n is already at max * depth, leaving it in the window is pointless. we * should evict it first. @@ -1448,6 +1430,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (entry->delta && depth <= entry->depth) continue; + next: idx++; if (idx >= window) idx = 0; @@ -1525,6 +1508,11 @@ static int reuse_cached_pack(unsigned char *sha1) return 1; } +static void progress_interval(int signum) +{ + progress_update = 1; +} + static void setup_progress_signal(void) { struct sigaction sa; From 9668cf59a83d1aa881036818abf29cc2ea9e291b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:29:54 -0400 Subject: [PATCH 20/27] pack-objects: clean up list sorting Get rid of sort_comparator() as it impose a run time double indirect function call for little compile time type checking gain. Also get rid of create_sorted_list() as it only has one user which would as well be just fine doing its sorting locally. Eventually the list of deltifiable objects might be shorter than the whole object list. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 53 ++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index d44b8f4c01..15119d63d9 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -66,7 +66,7 @@ static int local; static int incremental; static int allow_ofs_delta; -static struct object_entry **sorted_by_sha, **sorted_by_type; +static struct object_entry **sorted_by_sha; static struct object_entry *objects; static uint32_t nr_objects, nr_alloc, nr_result; static const char *base_name; @@ -1181,31 +1181,10 @@ static void get_object_details(void) check_object(entry); } -typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *); - -static entry_sort_t current_sort; - -static int sort_comparator(const void *_a, const void *_b) -{ - struct object_entry *a = *(struct object_entry **)_a; - struct object_entry *b = *(struct object_entry **)_b; - return current_sort(a,b); -} - -static struct object_entry **create_sorted_list(entry_sort_t sort) -{ - struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *)); - uint32_t i; - - for (i = 0; i < nr_objects; i++) - list[i] = objects + i; - current_sort = sort; - qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator); - return list; -} - -static int sha1_sort(const struct object_entry *a, const struct object_entry *b) +static int sha1_sort(const void *_a, const void *_b) { + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; return hashcmp(a->sha1, b->sha1); } @@ -1222,13 +1201,15 @@ static struct object_entry **create_final_object_list(void) if (!objects[i].preferred_base) list[j++] = objects + i; } - current_sort = sha1_sort; - qsort(list, nr_result, sizeof(struct object_entry *), sort_comparator); + qsort(list, nr_result, sizeof(struct object_entry *), sha1_sort); return list; } -static int type_size_sort(const struct object_entry *a, const struct object_entry *b) +static int type_size_sort(const void *_a, const void *_b) { + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; + if (a->type < b->type) return -1; if (a->type > b->type) @@ -1448,10 +1429,20 @@ static void find_deltas(struct object_entry **list, int window, int depth) static void prepare_pack(int window, int depth) { + struct object_entry **delta_list; + uint32_t i; + get_object_details(); - sorted_by_type = create_sorted_list(type_size_sort); - if (window && depth) - find_deltas(sorted_by_type, window+1, depth); + + if (!window || !depth) + return; + + delta_list = xmalloc(nr_objects * sizeof(*delta_list)); + for (i = 0; i < nr_objects; i++) + delta_list[i] = objects + i; + qsort(delta_list, nr_objects, sizeof(*delta_list), type_size_sort); + find_deltas(delta_list, window+1, depth); + free(delta_list); } static int reuse_cached_pack(unsigned char *sha1) From f7ae6a930a3c5d501439cdba98417af747350738 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:30:15 -0400 Subject: [PATCH 21/27] pack-objects: get rid of reuse_cached_pack This capability is practically never useful, and therefore never tested, because it is fairly unlikely that the requested pack will be already available. Furthermore it is of little gain over the ability to reuse existing pack data. In fact the ability to change delta type on the fly when reusing delta data is a nice thing that has almost no cost and allows greater backward compatibility with a client's capabilities than if the client is blindly sent a whole pack without any discrimination. And this "feature" is simply in the way of other cleanups. Let's get rid of it. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 86 +++++++----------------------------------- 1 file changed, 14 insertions(+), 72 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 15119d63d9..c2f7c30817 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -1445,60 +1445,6 @@ static void prepare_pack(int window, int depth) free(delta_list); } -static int reuse_cached_pack(unsigned char *sha1) -{ - static const char cache[] = "pack-cache/pack-%s.%s"; - char *cached_pack, *cached_idx; - int ifd, ofd, ifd_ix = -1; - - cached_pack = git_path(cache, sha1_to_hex(sha1), "pack"); - ifd = open(cached_pack, O_RDONLY); - if (ifd < 0) - return 0; - - if (!pack_to_stdout) { - cached_idx = git_path(cache, sha1_to_hex(sha1), "idx"); - ifd_ix = open(cached_idx, O_RDONLY); - if (ifd_ix < 0) { - close(ifd); - return 0; - } - } - - if (progress) - fprintf(stderr, "Reusing %u objects pack %s\n", nr_objects, - sha1_to_hex(sha1)); - - if (pack_to_stdout) { - if (copy_fd(ifd, 1)) - exit(1); - close(ifd); - } - else { - char name[PATH_MAX]; - snprintf(name, sizeof(name), - "%s-%s.%s", base_name, sha1_to_hex(sha1), "pack"); - ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666); - if (ofd < 0) - die("unable to open %s (%s)", name, strerror(errno)); - if (copy_fd(ifd, ofd)) - exit(1); - close(ifd); - - snprintf(name, sizeof(name), - "%s-%s.%s", base_name, sha1_to_hex(sha1), "idx"); - ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666); - if (ofd < 0) - die("unable to open %s (%s)", name, strerror(errno)); - if (copy_fd(ifd_ix, ofd)) - exit(1); - close(ifd_ix); - puts(sha1_to_hex(sha1)); - } - - return 1; -} - static void progress_interval(int signum) { progress_update = 1; @@ -1618,6 +1564,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) SHA_CTX ctx; int depth = 10; struct object_entry **list; + off_t last_obj_offset; int use_internal_rev_list = 0; int thin = 0; uint32_t i; @@ -1779,24 +1726,19 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (progress && (nr_objects != nr_result)) fprintf(stderr, "Result has %u objects.\n", nr_result); - if (reuse_cached_pack(object_list_sha1)) - ; - else { - off_t last_obj_offset; - if (nr_result) - prepare_pack(window, depth); - if (progress == 1 && pack_to_stdout) { - /* the other end usually displays progress itself */ - struct itimerval v = {{0,},}; - setitimer(ITIMER_REAL, &v, NULL); - signal(SIGALRM, SIG_IGN ); - progress_update = 0; - } - last_obj_offset = write_pack_file(); - if (!pack_to_stdout) { - write_index_file(last_obj_offset); - puts(sha1_to_hex(object_list_sha1)); - } + if (nr_result) + prepare_pack(window, depth); + if (progress == 1 && pack_to_stdout) { + /* the other end usually displays progress itself */ + struct itimerval v = {{0,},}; + setitimer(ITIMER_REAL, &v, NULL); + signal(SIGALRM, SIG_IGN ); + progress_update = 0; + } + last_obj_offset = write_pack_file(); + if (!pack_to_stdout) { + write_index_file(last_obj_offset); + puts(sha1_to_hex(object_list_sha1)); } if (progress) fprintf(stderr, "Total %u (delta %u), reused %u (delta %u)\n", From 81a216a5d6a12976b20d9a39829562f280ae96f2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:31:05 -0400 Subject: [PATCH 22/27] pack-objects: get rid of create_final_object_list() Because we don't have to know the SHA1 h(hence the name) of the pack up front anymore, let's get rid of yet another global sorted object list and sort them only in write_index_file(), then compute the object list SHA1 on the fly. This has the advantage of saving another chunk of memory, and the sorted list SHA1 won't be computed needlessly on servers during a fetch. Of course the cunning plan is also to make write_index_file() much like the function with the same name in index-pack.c for an eventual easy sharing. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 127 +++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 55 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index c2f7c30817..7af1776673 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -59,17 +59,16 @@ struct object_entry { * heuristics. */ -static unsigned char object_list_sha1[20]; static int non_empty; static int no_reuse_delta; static int local; static int incremental; static int allow_ofs_delta; -static struct object_entry **sorted_by_sha; static struct object_entry *objects; static uint32_t nr_objects, nr_alloc, nr_result; -static const char *base_name; +static const char *pack_tmp_name, *idx_tmp_name; +static char tmpname[PATH_MAX]; static unsigned char pack_file_sha1[20]; static int progress = 1; static volatile sig_atomic_t progress_update; @@ -578,13 +577,19 @@ static off_t write_pack_file(void) unsigned last_percent = 999; int do_progress = progress; - if (!base_name) { + if (pack_to_stdout) { f = sha1fd(1, ""); do_progress >>= 1; + } else { + int fd; + snprintf(tmpname, sizeof(tmpname), "tmp_pack_XXXXXX"); + fd = mkstemp(tmpname); + if (fd < 0) + die("unable to create %s: %s\n", tmpname, strerror(errno)); + pack_tmp_name = xstrdup(tmpname); + f = sha1fd(fd, pack_tmp_name); } - else - f = sha1create("%s-%s.%s", base_name, - sha1_to_hex(object_list_sha1), "pack"); + if (do_progress) fprintf(stderr, "Writing %u objects.\n", nr_result); @@ -618,18 +623,46 @@ static off_t write_pack_file(void) return last_obj_offset; } +static int sha1_sort(const void *_a, const void *_b) +{ + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; + return hashcmp(a->sha1, b->sha1); +} + static uint32_t index_default_version = 1; static uint32_t index_off32_limit = 0x7fffffff; -static void write_index_file(off_t last_obj_offset) +static void write_index_file(off_t last_obj_offset, unsigned char *sha1) { - uint32_t i; - struct sha1file *f = sha1create("%s-%s.%s", base_name, - sha1_to_hex(object_list_sha1), "idx"); - struct object_entry **list = sorted_by_sha; - struct object_entry **last = list + nr_result; + struct sha1file *f; + struct object_entry **sorted_by_sha, **list, **last; uint32_t array[256]; - uint32_t index_version; + uint32_t i, index_version; + SHA_CTX ctx; + int fd; + + snprintf(tmpname, sizeof(tmpname), "tmp_idx_XXXXXX"); + fd = mkstemp(tmpname); + if (fd < 0) + die("unable to create %s: %s\n", tmpname, strerror(errno)); + idx_tmp_name = xstrdup(tmpname); + f = sha1fd(fd, idx_tmp_name); + + if (nr_result) { + uint32_t j = 0; + sorted_by_sha = + xcalloc(nr_result, sizeof(struct object_entry *)); + for (i = 0; i < nr_objects; i++) + if (!objects[i].preferred_base) + sorted_by_sha[j++] = objects + i; + if (j != nr_result) + die("listed %u objects while expecting %u", j, nr_result); + qsort(sorted_by_sha, nr_result, sizeof(*sorted_by_sha), sha1_sort); + list = sorted_by_sha; + last = sorted_by_sha + nr_result; + } else + sorted_by_sha = list = last = NULL; /* if last object's offset is >= 2^31 we should use index V2 */ index_version = (last_obj_offset >> 31) ? 2 : index_default_version; @@ -660,9 +693,10 @@ static void write_index_file(off_t last_obj_offset) } sha1write(f, array, 256 * 4); - /* - * Write the actual SHA1 entries.. - */ + /* Compute the SHA1 hash of sorted object names. */ + SHA1_Init(&ctx); + + /* Write the actual SHA1 entries. */ list = sorted_by_sha; for (i = 0; i < nr_result; i++) { struct object_entry *entry = *list++; @@ -671,6 +705,7 @@ static void write_index_file(off_t last_obj_offset) sha1write(f, &offset, 4); } sha1write(f, entry->sha1, 20); + SHA1_Update(&ctx, entry->sha1, 20); } if (index_version >= 2) { @@ -711,6 +746,8 @@ static void write_index_file(off_t last_obj_offset) sha1write(f, pack_file_sha1, 20); sha1close(f, NULL, 1); + free(sorted_by_sha); + SHA1_Final(sha1, &ctx); } static int locate_object_entry_hash(const unsigned char *sha1) @@ -789,6 +826,8 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud if (ix >= 0) { if (exclude) { entry = objects + object_ix[ix] - 1; + if (!entry->preferred_base) + nr_result--; entry->preferred_base = 1; } return 0; @@ -821,6 +860,8 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud entry->hash = hash; if (exclude) entry->preferred_base = 1; + else + nr_result++; if (found_pack) { entry->in_pack = found_pack; entry->in_pack_offset = found_offset; @@ -1181,30 +1222,6 @@ static void get_object_details(void) check_object(entry); } -static int sha1_sort(const void *_a, const void *_b) -{ - const struct object_entry *a = *(struct object_entry **)_a; - const struct object_entry *b = *(struct object_entry **)_b; - return hashcmp(a->sha1, b->sha1); -} - -static struct object_entry **create_final_object_list(void) -{ - struct object_entry **list; - uint32_t i, j; - - for (i = nr_result = 0; i < nr_objects; i++) - if (!objects[i].preferred_base) - nr_result++; - list = xmalloc(nr_result * sizeof(struct object_entry *)); - for (i = j = 0; i < nr_objects; i++) { - if (!objects[i].preferred_base) - list[j++] = objects + i; - } - qsort(list, nr_result, sizeof(struct object_entry *), sha1_sort); - return list; -} - static int type_size_sort(const void *_a, const void *_b) { const struct object_entry *a = *(struct object_entry **)_a; @@ -1561,13 +1578,12 @@ static void get_object_list(int ac, const char **av) int cmd_pack_objects(int argc, const char **argv, const char *prefix) { - SHA_CTX ctx; int depth = 10; - struct object_entry **list; - off_t last_obj_offset; int use_internal_rev_list = 0; int thin = 0; uint32_t i; + off_t last_obj_offset; + const char *base_name = NULL; const char **rp_av; int rp_ac_alloc = 64; int rp_ac; @@ -1712,20 +1728,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (progress) fprintf(stderr, "Done counting %u objects.\n", nr_objects); - sorted_by_sha = create_final_object_list(); if (non_empty && !nr_result) return 0; - - SHA1_Init(&ctx); - list = sorted_by_sha; - for (i = 0; i < nr_result; i++) { - struct object_entry *entry = *list++; - SHA1_Update(&ctx, entry->sha1, 20); - } - SHA1_Final(object_list_sha1, &ctx); if (progress && (nr_objects != nr_result)) fprintf(stderr, "Result has %u objects.\n", nr_result); - if (nr_result) prepare_pack(window, depth); if (progress == 1 && pack_to_stdout) { @@ -1737,7 +1743,18 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) } last_obj_offset = write_pack_file(); if (!pack_to_stdout) { - write_index_file(last_obj_offset); + unsigned char object_list_sha1[20]; + write_index_file(last_obj_offset, object_list_sha1); + snprintf(tmpname, sizeof(tmpname), "%s-%s.pack", + base_name, sha1_to_hex(object_list_sha1)); + if (rename(pack_tmp_name, tmpname)) + die("unable to rename temporary pack file: %s", + strerror(errno)); + snprintf(tmpname, sizeof(tmpname), "%s-%s.idx", + base_name, sha1_to_hex(object_list_sha1)); + if (rename(idx_tmp_name, tmpname)) + die("unable to rename temporary index file: %s", + strerror(errno)); puts(sha1_to_hex(object_list_sha1)); } if (progress) From a3fbf4dfe1bf2386add261dc7c2809b652b5f9ae Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:31:31 -0400 Subject: [PATCH 23/27] pack-objects: make in_pack_header_size a variable of its own It currently aliases delta_size on the principle that reused deltas won't go through the whole delta matching loop hence delta_size was unused. This is not true if given delta doesn't find its base in the pack though. But we need that information even for whole object data reuse. Well in short the current state looks awful and is prone to bugs. It just works fine now because try_delta() tests trg_entry->delta before using trg_entry->delta_size, but that is a bit subtle and I was wondering for a while why things just worked fine... even if I'm guilty of having introduced this abomination myself in the first place. Let's do the sensible thing instead with no ambiguity, which is to have a separate variable for in_pack_header_size. This might even help future optimizations. While at it, let's reorder some struct object_entry members so they all align well with their own width, regardless of the architecture or the size of off_t. Some memory saving is to be expected with this alone. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 7af1776673..7100a76cd2 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -22,28 +22,26 @@ git-pack-objects [{ -q | --progress | --all-progress }] \n\ struct object_entry { unsigned char sha1[20]; + uint32_t crc32; /* crc of raw pack data for this object */ + off_t offset; /* offset into the final pack file */ unsigned long size; /* uncompressed size */ - off_t offset; /* offset into the final pack file; - * nonzero if already written. - */ - unsigned int depth; /* delta depth */ unsigned int hash; /* name hint hash */ - enum object_type type; - enum object_type in_pack_type; /* could be delta */ - unsigned long delta_size; /* delta data size (uncompressed) */ -#define in_pack_header_size delta_size /* only when reusing pack data */ - struct object_entry *delta; /* delta base object */ + unsigned int depth; /* delta depth */ struct packed_git *in_pack; /* already in pack */ off_t in_pack_offset; + struct object_entry *delta; /* delta base object */ struct object_entry *delta_child; /* deltified objects who bases me */ struct object_entry *delta_sibling; /* other deltified objects who * uses the same base as me */ - int preferred_base; /* we do not pack this, but is encouraged to - * be used as the base objectto delta huge - * objects against. - */ - uint32_t crc32; /* crc of raw pack data for this object */ + unsigned long delta_size; /* delta data size (uncompressed) */ + enum object_type type; + enum object_type in_pack_type; /* could be delta */ + unsigned char in_pack_header_size; + unsigned char preferred_base; /* we do not pack this, but is available + * to be used as the base objectto delta + * objects against. + */ }; /* From 54dab52ae8518da67e271b5b3a1f91af1fd5e314 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:31:56 -0400 Subject: [PATCH 24/27] add get_size_from_delta() ... which consists of existing code split out of packed_delta_info() for other callers to use it as well. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- cache.h | 1 + sha1_file.c | 73 ++++++++++++++++++++++++++++------------------------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/cache.h b/cache.h index 5b67f4c989..63399c7c6b 100644 --- a/cache.h +++ b/cache.h @@ -436,6 +436,7 @@ extern const unsigned char *nth_packed_object_sha1(const struct packed_git *, ui extern off_t find_pack_entry_one(const unsigned char *, struct packed_git *); extern void *unpack_entry(struct packed_git *, off_t, enum object_type *, unsigned long *); extern unsigned long unpack_object_header_gently(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep); +extern unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t); extern const char *packed_object_info_detail(struct packed_git *, off_t, unsigned long *, unsigned long *, unsigned int *, unsigned char *); /* Dumb servers support */ diff --git a/sha1_file.c b/sha1_file.c index 0be9737bd1..5dac4666b6 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1160,6 +1160,43 @@ static void *unpack_sha1_file(void *map, unsigned long mapsize, enum object_type return unpack_sha1_rest(&stream, hdr, *size, sha1); } +unsigned long get_size_from_delta(struct packed_git *p, + struct pack_window **w_curs, + off_t curpos) +{ + const unsigned char *data; + unsigned char delta_head[20], *in; + z_stream stream; + int st; + + memset(&stream, 0, sizeof(stream)); + stream.next_out = delta_head; + stream.avail_out = sizeof(delta_head); + + inflateInit(&stream); + do { + in = use_pack(p, w_curs, curpos, &stream.avail_in); + stream.next_in = in; + st = inflate(&stream, Z_FINISH); + curpos += stream.next_in - in; + } while ((st == Z_OK || st == Z_BUF_ERROR) && + stream.total_out < sizeof(delta_head)); + inflateEnd(&stream); + if ((st != Z_STREAM_END) && stream.total_out != sizeof(delta_head)) + die("delta data unpack-initial failed"); + + /* Examine the initial part of the delta to figure out + * the result size. + */ + data = delta_head; + + /* ignore base size */ + get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); + + /* Read the result size */ + return get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); +} + static off_t get_delta_base(struct packed_git *p, struct pack_window **w_curs, off_t *curpos, @@ -1223,40 +1260,8 @@ static int packed_delta_info(struct packed_git *p, * based on a base with a wrong size. This saves tons of * inflate() calls. */ - if (sizep) { - const unsigned char *data; - unsigned char delta_head[20], *in; - z_stream stream; - int st; - - memset(&stream, 0, sizeof(stream)); - stream.next_out = delta_head; - stream.avail_out = sizeof(delta_head); - - inflateInit(&stream); - do { - in = use_pack(p, w_curs, curpos, &stream.avail_in); - stream.next_in = in; - st = inflate(&stream, Z_FINISH); - curpos += stream.next_in - in; - } while ((st == Z_OK || st == Z_BUF_ERROR) - && stream.total_out < sizeof(delta_head)); - inflateEnd(&stream); - if ((st != Z_STREAM_END) && - stream.total_out != sizeof(delta_head)) - die("delta data unpack-initial failed"); - - /* Examine the initial part of the delta to figure out - * the result size. - */ - data = delta_head; - - /* ignore base size */ - get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); - - /* Read the result size */ - *sizep = get_delta_hdr_size(&data, delta_head+sizeof(delta_head)); - } + if (sizep) + *sizep = get_size_from_delta(p, w_curs, curpos); return type; } From 5c49c11686df9d1c27a194349d0b2092e6446f42 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:32:13 -0400 Subject: [PATCH 25/27] pack-objects: better check_object() performances With large amount of objects, check_object() is really trashing the pack sliding map and the filesystem cache. It has a completely random access pattern especially with old objects where delta replay jumps back and forth all over the pack. This patch improves things by: 1) sorting objects by their offset in pack before calling check_object() so the pack access pattern is linear; 2) recording the object type at add_object_entry() time since it is already known in most cases; 3) recording the pack offset even for preferred_base objects; 4) avoid calling sha1_object_info() if all possible. This limits pack accesses to the bare minimum and makes them perfectly linear. In the process check_object() was made more clear (to me at least). Note: I thought about walking the sorted_by_offset list backward in get_object_details() so if a pack happens to be larger than the available file cache, then the cache would have been populated with useful data from the beginning of the pack already when find_deltas() is called. Strangely, testing (on Linux) showed absolutely no performance difference. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 216 +++++++++++++++++++++++++---------------- 1 file changed, 131 insertions(+), 85 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 7100a76cd2..19fae4c917 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -813,7 +813,8 @@ static unsigned name_hash(const char *name) return hash; } -static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude) +static int add_object_entry(const unsigned char *sha1, enum object_type type, + unsigned hash, int exclude) { struct object_entry *entry; struct packed_git *p, *found_pack = NULL; @@ -831,19 +832,19 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud return 0; } - if (!exclude) { - for (p = packed_git; p; p = p->next) { - off_t offset = find_pack_entry_one(sha1, p); - if (offset) { - if (incremental) - return 0; - if (local && !p->pack_local) - return 0; - if (!found_pack) { - found_offset = offset; - found_pack = p; - } + for (p = packed_git; p; p = p->next) { + off_t offset = find_pack_entry_one(sha1, p); + if (offset) { + if (!found_pack) { + found_offset = offset; + found_pack = p; } + if (exclude) + break; + if (incremental) + return 0; + if (local && !p->pack_local) + return 0; } } @@ -856,6 +857,8 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud memset(entry, 0, sizeof(*entry)); hashcpy(entry->sha1, sha1); entry->hash = hash; + if (type) + entry->type = type; if (exclude) entry->preferred_base = 1; else @@ -1008,7 +1011,9 @@ static void add_pbase_object(struct tree_desc *tree, return; if (name[cmplen] != '/') { unsigned hash = name_hash(fullname); - add_object_entry(entry.sha1, hash, 1); + add_object_entry(entry.sha1, + S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB, + hash, 1); return; } if (S_ISDIR(entry.mode)) { @@ -1079,7 +1084,7 @@ static void add_preferred_base_object(const char *name, unsigned hash) cmplen = name_cmp_len(name); for (it = pbase_tree; it; it = it->next) { if (cmplen == 0) { - add_object_entry(it->pcache.sha1, 0, 1); + add_object_entry(it->pcache.sha1, OBJ_TREE, 0, 1); } else { struct tree_desc tree; @@ -1121,87 +1126,105 @@ static void add_preferred_base(unsigned char *sha1) static void check_object(struct object_entry *entry) { - if (entry->in_pack && !entry->preferred_base) { + if (entry->in_pack) { struct packed_git *p = entry->in_pack; struct pack_window *w_curs = NULL; - unsigned long size, used; + const unsigned char *base_ref = NULL; + struct object_entry *base_entry; + unsigned long used, used_0; unsigned int avail; - unsigned char *buf; - struct object_entry *base_entry = NULL; + off_t ofs; + unsigned char *buf, c; buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); - /* We want in_pack_type even if we do not reuse delta. + /* + * We want in_pack_type even if we do not reuse delta. * There is no point not reusing non-delta representations. */ used = unpack_object_header_gently(buf, avail, - &entry->in_pack_type, &size); + &entry->in_pack_type, + &entry->size); - /* Check if it is delta, and the base is also an object - * we are going to pack. If so we will reuse the existing - * delta. + /* + * Determine if this is a delta and if so whether we can + * reuse it or not. Otherwise let's find out as cheaply as + * possible what the actual type and size for this object is. */ - if (!no_reuse_delta) { - unsigned char c; - const unsigned char *base_name; - off_t ofs; - unsigned long used_0; - /* there is at least 20 bytes left in the pack */ - switch (entry->in_pack_type) { - case OBJ_REF_DELTA: - base_name = use_pack(p, &w_curs, - entry->in_pack_offset + used, NULL); - used += 20; - break; - case OBJ_OFS_DELTA: - buf = use_pack(p, &w_curs, - entry->in_pack_offset + used, NULL); - used_0 = 0; - c = buf[used_0++]; - ofs = c & 127; - while (c & 128) { - ofs += 1; - if (!ofs || MSB(ofs, 7)) - die("delta base offset overflow in pack for %s", - sha1_to_hex(entry->sha1)); - c = buf[used_0++]; - ofs = (ofs << 7) + (c & 127); - } - if (ofs >= entry->in_pack_offset) - die("delta base offset out of bound for %s", - sha1_to_hex(entry->sha1)); - ofs = entry->in_pack_offset - ofs; - base_name = find_packed_object_name(p, ofs); - used += used_0; - break; - default: - base_name = NULL; - } - if (base_name) - base_entry = locate_object_entry(base_name); - } - unuse_pack(&w_curs); - entry->in_pack_header_size = used; - - if (base_entry) { - - /* Depth value does not matter - find_deltas() - * will never consider reused delta as the - * base object to deltify other objects - * against, in order to avoid circular deltas. - */ - - /* uncompressed size of the delta data */ - entry->size = size; - entry->delta = base_entry; + switch (entry->in_pack_type) { + default: + /* Not a delta hence we've already got all we need. */ entry->type = entry->in_pack_type; + entry->in_pack_header_size = used; + unuse_pack(&w_curs); + return; + case OBJ_REF_DELTA: + if (!no_reuse_delta && !entry->preferred_base) + base_ref = use_pack(p, &w_curs, + entry->in_pack_offset + used, NULL); + entry->in_pack_header_size = used + 20; + break; + case OBJ_OFS_DELTA: + buf = use_pack(p, &w_curs, + entry->in_pack_offset + used, NULL); + used_0 = 0; + c = buf[used_0++]; + ofs = c & 127; + while (c & 128) { + ofs += 1; + if (!ofs || MSB(ofs, 7)) + die("delta base offset overflow in pack for %s", + sha1_to_hex(entry->sha1)); + c = buf[used_0++]; + ofs = (ofs << 7) + (c & 127); + } + if (ofs >= entry->in_pack_offset) + die("delta base offset out of bound for %s", + sha1_to_hex(entry->sha1)); + ofs = entry->in_pack_offset - ofs; + if (!no_reuse_delta && !entry->preferred_base) + base_ref = find_packed_object_name(p, ofs); + entry->in_pack_header_size = used + used_0; + break; + } + if (base_ref && (base_entry = locate_object_entry(base_ref))) { + /* + * If base_ref was set above that means we wish to + * reuse delta data, and we even found that base + * in the list of objects we want to pack. Goodie! + * + * Depth value does not matter - find_deltas() will + * never consider reused delta as the base object to + * deltify other objects against, in order to avoid + * circular deltas. + */ + entry->type = entry->in_pack_type; + entry->delta = base_entry; entry->delta_sibling = base_entry->delta_child; base_entry->delta_child = entry; - + unuse_pack(&w_curs); return; } - /* Otherwise we would do the usual */ + + if (entry->type) { + /* + * This must be a delta and we already know what the + * final object type is. Let's extract the actual + * object size from the delta header. + */ + entry->size = get_size_from_delta(p, &w_curs, + entry->in_pack_offset + entry->in_pack_header_size); + unuse_pack(&w_curs); + return; + } + + /* + * No choice but to fall back to the recursive delta walk + * with sha1_object_info() to find about the object type + * at this point... + */ + unuse_pack(&w_curs); } entry->type = sha1_object_info(entry->sha1, &entry->size); @@ -1210,14 +1233,37 @@ static void check_object(struct object_entry *entry) sha1_to_hex(entry->sha1)); } +static int pack_offset_sort(const void *_a, const void *_b) +{ + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; + + /* avoid filesystem trashing with loose objects */ + if (!a->in_pack && !b->in_pack) + return hashcmp(a->sha1, b->sha1); + + if (a->in_pack < b->in_pack) + return -1; + if (a->in_pack > b->in_pack) + return 1; + return a->in_pack_offset < b->in_pack_offset ? -1 : + (a->in_pack_offset > b->in_pack_offset); +} + static void get_object_details(void) { uint32_t i; - struct object_entry *entry; + struct object_entry **sorted_by_offset; + + sorted_by_offset = xcalloc(nr_objects, sizeof(struct object_entry *)); + for (i = 0; i < nr_objects; i++) + sorted_by_offset[i] = objects + i; + qsort(sorted_by_offset, nr_objects, sizeof(*sorted_by_offset), pack_offset_sort); prepare_pack_ix(); - for (i = 0, entry = objects; i < nr_objects; i++, entry++) - check_object(entry); + for (i = 0; i < nr_objects; i++) + check_object(sorted_by_offset[i]); + free(sorted_by_offset); } static int type_size_sort(const void *_a, const void *_b) @@ -1520,20 +1566,20 @@ static void read_object_list_from_stdin(void) hash = name_hash(line+41); add_preferred_base_object(line+41, hash); - add_object_entry(sha1, hash, 0); + add_object_entry(sha1, 0, hash, 0); } } static void show_commit(struct commit *commit) { - add_object_entry(commit->object.sha1, 0, 0); + add_object_entry(commit->object.sha1, OBJ_COMMIT, 0, 0); } static void show_object(struct object_array_entry *p) { unsigned hash = name_hash(p->name); add_preferred_base_object(p->name, hash); - add_object_entry(p->item->sha1, hash, 0); + add_object_entry(p->item->sha1, p->item->type, hash, 0); } static void show_edge(struct commit *commit) From e4d58311ba1a6cefa2ac5a3d95918af0d9b43588 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 19 Apr 2007 22:28:02 -0400 Subject: [PATCH 26/27] pack-objects: remove obsolete comments The sorted-by-sha ans sorted-by-type arrays are no more. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 19fae4c917..c72e07a2bb 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -49,22 +49,15 @@ struct object_entry { * expanded). nr_objects & nr_alloc controls this array. They are stored * in the order we see -- typically rev-list --objects order that gives us * nice "minimum seek" order. - * - * sorted-by-sha ans sorted-by-type are arrays of pointers that point at - * elements in the objects array. The former is used to build the pack - * index (lists object names in the ascending order to help offset lookup), - * and the latter is used to group similar things together by try_delta() - * heuristics. */ +static struct object_entry *objects; +static uint32_t nr_objects, nr_alloc, nr_result; static int non_empty; static int no_reuse_delta; static int local; static int incremental; static int allow_ofs_delta; - -static struct object_entry *objects; -static uint32_t nr_objects, nr_alloc, nr_result; static const char *pack_tmp_name, *idx_tmp_name; static char tmpname[PATH_MAX]; static unsigned char pack_file_sha1[20]; @@ -76,8 +69,7 @@ static int num_preferred_base; /* * The object names in objects array are hashed with this hashtable, - * to help looking up the entry by object name. Binary search from - * sorted_by_sha is also possible but this was easier to code and faster. + * to help looking up the entry by object name. * This hashtable is built after all the objects are seen. */ static int *object_ix; From be18c1fe1291ed839aef817283a880e09e222be5 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 19 Apr 2007 22:16:53 -0400 Subject: [PATCH 27/27] document --index-version for index-pack and pack-objects Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- Documentation/git-index-pack.txt | 5 +++++ Documentation/git-pack-objects.txt | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/git-index-pack.txt b/Documentation/git-index-pack.txt index 2229ee86b7..b7a49b9f58 100644 --- a/Documentation/git-index-pack.txt +++ b/Documentation/git-index-pack.txt @@ -68,6 +68,11 @@ OPTIONS message can later be searched for within all .keep files to locate any which have outlived their usefulness. +--index-version=[,]:: + This is intended to be used by the test suite only. It allows + to force the version for the generated pack index, and to force + 64-bit index entries on objects located above the given offset. + Note ---- diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index fdc6f97289..d9e11c6534 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -138,6 +138,11 @@ base-name:: length, this option typically shrinks the resulting packfile by 3-5 per-cent. +--index-version=[,]:: + This is intended to be used by the test suite only. It allows + to force the version for the generated pack index, and to force + 64-bit index entries on objects located above the given offset. + Author ------