Browse Source

make pack data reuse compatible with both delta types

This is the missing part to git-pack-objects allowing it to reuse delta
data to/from any of the two delta types.  It can reuse delta from any
type, and it outputs base offsets when --allow-delta-base-offset is
provided and the base is also included in the pack.  Otherwise it
outputs base sha1 references just like it always did.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
maint
Nicolas Pitre 19 years ago committed by Junio C Hamano
parent
commit
780e6e735b
  1. 204
      builtin-pack-objects.c
  2. 3
      pack.h
  3. 19
      sha1_file.c

204
builtin-pack-objects.c

@ -29,6 +29,7 @@ struct object_entry { @@ -29,6 +29,7 @@ struct object_entry {
enum object_type type;
enum object_type in_pack_type; /* could be delta */
unsigned long delta_size; /* delta data size (uncompressed) */
#define in_pack_header_size delta_size /* only when reusing pack data */
struct object_entry *delta; /* delta base object */
struct packed_git *in_pack; /* already in pack */
unsigned int in_pack_offset;
@ -86,17 +87,25 @@ static int object_ix_hashsz; @@ -86,17 +87,25 @@ static int object_ix_hashsz;
* Pack index for existing packs give us easy access to the offsets into
* corresponding pack file where each object's data starts, but the entries
* do not store the size of the compressed representation (uncompressed
* size is easily available by examining the pack entry header). We build
* a hashtable of existing packs (pack_revindex), and keep reverse index
* here -- pack index file is sorted by object name mapping to offset; this
* pack_revindex[].revindex array is an ordered list of offsets, so if you
* know the offset of an object, next offset is where its packed
* representation ends.
* size is easily available by examining the pack entry header). It is
* also rather expensive to find the sha1 for an object given its offset.
*
* We build a hashtable of existing packs (pack_revindex), and keep reverse
* index here -- pack index file is sorted by object name mapping to offset;
* this pack_revindex[].revindex array is a list of offset/index_nr pairs
* ordered by offset, so if you know the offset of an object, next offset
* is where its packed representation ends and the index_nr can be used to
* get the object sha1 from the main index.
*/
struct revindex_entry {
unsigned int offset;
unsigned int nr;
};
struct pack_revindex {
struct packed_git *p;
unsigned long *revindex;
} *pack_revindex = NULL;
struct revindex_entry *revindex;
};
static struct pack_revindex *pack_revindex;
static int pack_revindex_hashsz;

/*
@ -143,14 +152,9 @@ static void prepare_pack_ix(void) @@ -143,14 +152,9 @@ static void prepare_pack_ix(void)

static int cmp_offset(const void *a_, const void *b_)
{
unsigned long a = *(unsigned long *) a_;
unsigned long b = *(unsigned long *) b_;
if (a < b)
return -1;
else if (a == b)
return 0;
else
return 1;
const struct revindex_entry *a = a_;
const struct revindex_entry *b = b_;
return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0;
}

/*
@ -163,25 +167,27 @@ static void prepare_pack_revindex(struct pack_revindex *rix) @@ -163,25 +167,27 @@ static void prepare_pack_revindex(struct pack_revindex *rix)
int i;
void *index = p->index_base + 256;

rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1));
rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
for (i = 0; i < num_ent; i++) {
unsigned int hl = *((unsigned int *)((char *) index + 24*i));
rix->revindex[i] = ntohl(hl);
rix->revindex[i].offset = ntohl(hl);
rix->revindex[i].nr = i;
}
/* This knows the pack format -- the 20-byte trailer
* follows immediately after the last object data.
*/
rix->revindex[num_ent] = p->pack_size - 20;
qsort(rix->revindex, num_ent, sizeof(unsigned long), cmp_offset);
rix->revindex[num_ent].offset = p->pack_size - 20;
rix->revindex[num_ent].nr = -1;
qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
}

static unsigned long find_packed_object_size(struct packed_git *p,
unsigned long ofs)
static struct revindex_entry * find_packed_object(struct packed_git *p,
unsigned int ofs)
{
int num;
int lo, hi;
struct pack_revindex *rix;
unsigned long *revindex;
struct revindex_entry *revindex;
num = pack_revindex_ix(p);
if (num < 0)
die("internal error: pack revindex uninitialized");
@ -193,10 +199,10 @@ static unsigned long find_packed_object_size(struct packed_git *p, @@ -193,10 +199,10 @@ static unsigned long find_packed_object_size(struct packed_git *p,
hi = num_packed_objects(p) + 1;
do {
int mi = (lo + hi) / 2;
if (revindex[mi] == ofs) {
return revindex[mi+1] - ofs;
if (revindex[mi].offset == ofs) {
return revindex + mi;
}
else if (ofs < revindex[mi])
else if (ofs < revindex[mi].offset)
hi = mi;
else
lo = mi + 1;
@ -204,6 +210,20 @@ static unsigned long find_packed_object_size(struct packed_git *p, @@ -204,6 +210,20 @@ static unsigned long find_packed_object_size(struct packed_git *p,
die("internal error: pack revindex corrupt");
}

static unsigned long find_packed_object_size(struct packed_git *p,
unsigned long ofs)
{
struct revindex_entry *entry = find_packed_object(p, ofs);
return entry[1].offset - ofs;
}

static unsigned char *find_packed_object_name(struct packed_git *p,
unsigned long ofs)
{
struct revindex_entry *entry = find_packed_object(p, ofs);
return (unsigned char *)(p->index_base + 256) + 24 * entry->nr + 4;
}

static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
{
unsigned long othersize, delta_size;
@ -249,6 +269,10 @@ static int encode_header(enum object_type type, unsigned long size, unsigned cha @@ -249,6 +269,10 @@ static int encode_header(enum object_type type, unsigned long size, unsigned cha
return n;
}

/*
* we are going to reuse the existing object data as is. make
* sure it is not corrupt.
*/
static int check_inflate(unsigned char *data, unsigned long len, unsigned long expect)
{
z_stream stream;
@ -280,32 +304,6 @@ static int check_inflate(unsigned char *data, unsigned long len, unsigned long e @@ -280,32 +304,6 @@ static int check_inflate(unsigned char *data, unsigned long len, unsigned long e
return st;
}

/*
* we are going to reuse the existing pack entry data. make
* sure it is not corrupt.
*/
static int revalidate_pack_entry(struct object_entry *entry, unsigned char *data, unsigned long len)
{
enum object_type type;
unsigned long size, used;

if (pack_to_stdout)
return 0;

/* the caller has already called use_packed_git() for us,
* so it is safe to access the pack data from mmapped location.
* make sure the entry inflates correctly.
*/
used = unpack_object_header_gently(data, len, &type, &size);
if (!used)
return -1;
if (type == OBJ_REF_DELTA)
used += 20; /* skip base object name */
data += used;
len -= used;
return check_inflate(data, len, entry->size);
}

static int revalidate_loose_object(struct object_entry *entry,
unsigned char *map,
unsigned long mapsize)
@ -339,7 +337,7 @@ static unsigned long write_object(struct sha1file *f, @@ -339,7 +337,7 @@ static unsigned long write_object(struct sha1file *f,
obj_type = entry->type;
if (! entry->in_pack)
to_reuse = 0; /* can't reuse what we don't have */
else if (obj_type == OBJ_REF_DELTA)
else if (obj_type == OBJ_REF_DELTA || obj_type == OBJ_OFS_DELTA)
to_reuse = 1; /* check_object() decided it for us */
else if (obj_type != entry->in_pack_type)
to_reuse = 0; /* pack has delta which is unusable */
@ -415,18 +413,38 @@ static unsigned long write_object(struct sha1file *f, @@ -415,18 +413,38 @@ static unsigned long write_object(struct sha1file *f,
}
else {
struct packed_git *p = entry->in_pack;
use_packed_git(p);

datalen = find_packed_object_size(p, entry->in_pack_offset);
buf = (char *) p->pack_base + entry->in_pack_offset;
if (entry->delta) {
obj_type = (allow_ofs_delta && entry->delta->offset) ?
OBJ_OFS_DELTA : OBJ_REF_DELTA;
reused_delta++;
}
hdrlen = encode_header(obj_type, entry->size, header);
sha1write(f, header, hdrlen);
if (obj_type == OBJ_OFS_DELTA) {
unsigned long ofs = entry->offset - entry->delta->offset;
unsigned pos = sizeof(header) - 1;
header[pos] = ofs & 127;
while (ofs >>= 7)
header[--pos] = 128 | (--ofs & 127);
sha1write(f, header + pos, sizeof(header) - pos);
hdrlen += sizeof(header) - pos;
} else if (obj_type == OBJ_REF_DELTA) {
sha1write(f, entry->delta->sha1, 20);
hdrlen += 20;
}

if (revalidate_pack_entry(entry, buf, datalen))
use_packed_git(p);
buf = (char *) p->pack_base
+ entry->in_pack_offset
+ entry->in_pack_header_size;
datalen = find_packed_object_size(p, entry->in_pack_offset)
- entry->in_pack_header_size;
//fprintf(stderr, "reusing %d at %d header %d size %d\n", obj_type, entry->in_pack_offset, entry->in_pack_header_size, datalen);
if (!pack_to_stdout && check_inflate(buf, datalen, entry->size))
die("corrupt delta in pack %s", sha1_to_hex(entry->sha1));
sha1write(f, buf, datalen);
unuse_packed_git(p);
hdrlen = 0; /* not really */
if (obj_type == OBJ_REF_DELTA)
reused_delta++;
reused++;
}
if (entry->delta)
@ -914,26 +932,64 @@ static void check_object(struct object_entry *entry) @@ -914,26 +932,64 @@ static void check_object(struct object_entry *entry)
char type[20];

if (entry->in_pack && !entry->preferred_base) {
unsigned char base[20];
unsigned long size;
struct object_entry *base_entry;
struct packed_git *p = entry->in_pack;
unsigned long left = p->pack_size - entry->in_pack_offset;
unsigned long size, used;
unsigned char *buf;
struct object_entry *base_entry = NULL;

use_packed_git(p);
buf = p->pack_base;
buf += entry->in_pack_offset;

/* We want in_pack_type even if we do not reuse delta.
* There is no point not reusing non-delta representations.
*/
check_reuse_pack_delta(entry->in_pack,
entry->in_pack_offset,
base, &size,
&entry->in_pack_type);
used = unpack_object_header_gently(buf, left,
&entry->in_pack_type, &size);
if (!used || left - used <= 20)
die("corrupt pack for %s", sha1_to_hex(entry->sha1));

/* Check if it is delta, and the base is also an object
* we are going to pack. If so we will reuse the existing
* delta.
*/
if (!no_reuse_delta &&
entry->in_pack_type == OBJ_REF_DELTA &&
(base_entry = locate_object_entry(base)) &&
(!base_entry->preferred_base)) {
if (!no_reuse_delta) {
unsigned char c, *base_name;
unsigned long ofs;
/* there is at least 20 bytes left in the pack */
switch (entry->in_pack_type) {
case OBJ_REF_DELTA:
base_name = buf + used;
used += 20;
break;
case OBJ_OFS_DELTA:
c = buf[used++];
ofs = c & 127;
while (c & 128) {
ofs += 1;
if (!ofs || ofs & ~(~0UL >> 7))
die("delta base offset overflow in pack for %s",
sha1_to_hex(entry->sha1));
c = buf[used++];
ofs = (ofs << 7) + (c & 127);
}
if (ofs >= entry->in_pack_offset)
die("delta base offset out of bound for %s",
sha1_to_hex(entry->sha1));
ofs = entry->in_pack_offset - ofs;
base_name = find_packed_object_name(p, ofs);
break;
default:
base_name = NULL;
}
if (base_name)
base_entry = locate_object_entry(base_name);
}
unuse_packed_git(p);
entry->in_pack_header_size = used;

if (base_entry && !base_entry->preferred_base) {

/* Depth value does not matter - find_deltas()
* will never consider reused delta as the
@ -942,9 +998,9 @@ static void check_object(struct object_entry *entry) @@ -942,9 +998,9 @@ static void check_object(struct object_entry *entry)
*/

/* uncompressed size of the delta data */
entry->size = entry->delta_size = size;
entry->size = size;
entry->delta = base_entry;
entry->type = OBJ_REF_DELTA;
entry->type = entry->in_pack_type;

entry->delta_sibling = base_entry->delta_child;
base_entry->delta_child = entry;
@ -1500,7 +1556,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) @@ -1500,7 +1556,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
continue;
}
if (!strcmp("--delta-base-offset", arg)) {
allow_ofs_delta = no_reuse_delta = 1;
allow_ofs_delta = 1;
continue;
}
if (!strcmp("--stdout", arg)) {

3
pack.h

@ -16,7 +16,4 @@ struct pack_header { @@ -16,7 +16,4 @@ struct pack_header {
};

extern int verify_pack(struct packed_git *, int);
extern int check_reuse_pack_delta(struct packed_git *, unsigned long,
unsigned char *, unsigned long *,
enum object_type *);
#endif

19
sha1_file.c

@ -1000,25 +1000,6 @@ static unsigned long unpack_object_header(struct packed_git *p, unsigned long of @@ -1000,25 +1000,6 @@ static unsigned long unpack_object_header(struct packed_git *p, unsigned long of
return offset + used;
}

int check_reuse_pack_delta(struct packed_git *p, unsigned long offset,
unsigned char *base, unsigned long *sizep,
enum object_type *kindp)
{
unsigned long ptr;
int status = -1;

use_packed_git(p);
ptr = offset;
ptr = unpack_object_header(p, ptr, kindp, sizep);
if (*kindp != OBJ_REF_DELTA)
goto done;
hashcpy(base, (unsigned char *) p->pack_base + ptr);
status = 0;
done:
unuse_packed_git(p);
return status;
}

void packed_object_info_detail(struct packed_git *p,
unsigned long offset,
char *type,

Loading…
Cancel
Save