|
|
|
#ifndef PACK_OBJECTS_H
|
|
|
|
#define PACK_OBJECTS_H
|
|
|
|
|
|
|
|
/*
|
|
|
|
* basic object info
|
|
|
|
* -----------------
|
|
|
|
* idx.oid is filled up before delta searching starts. idx.crc32 is
|
|
|
|
* only valid after the object is written out and will be used for
|
|
|
|
* generating the index. idx.offset will be both gradually set and
|
|
|
|
* used in writing phase (base objects get offset first, then deltas
|
|
|
|
* refer to them)
|
|
|
|
*
|
|
|
|
* "size" is the uncompressed object size. Compressed size of the raw
|
|
|
|
* data for an object in a pack is not stored anywhere but is computed
|
|
|
|
* and made available when reverse .idx is made.
|
|
|
|
*
|
|
|
|
* "hash" contains a path name hash which is used for sorting the
|
|
|
|
* delta list and also during delta searching. Once prepare_pack()
|
|
|
|
* returns it's no longer needed.
|
|
|
|
*
|
|
|
|
* source pack info
|
|
|
|
* ----------------
|
|
|
|
* The (in_pack, in_pack_offset) tuple contains the location of the
|
|
|
|
* object in the source pack. in_pack_header_size allows quickly
|
|
|
|
* skipping the header and going straight to the zlib stream.
|
|
|
|
*
|
|
|
|
* "type" and "in_pack_type" both describe object type. in_pack_type
|
|
|
|
* may contain a delta type, while type is always the canonical type.
|
|
|
|
*
|
|
|
|
* deltas
|
|
|
|
* ------
|
|
|
|
* Delta links (delta, delta_child and delta_sibling) are created to
|
|
|
|
* reflect that delta graph from the source pack then updated or added
|
|
|
|
* during delta searching phase when we find better deltas.
|
|
|
|
*
|
|
|
|
* delta_child and delta_sibling are last needed in
|
|
|
|
* compute_write_order(). "delta" and "delta_size" must remain valid
|
|
|
|
* at object writing phase in case the delta is not cached.
|
|
|
|
*
|
|
|
|
* If a delta is cached in memory and is compressed, delta_data points
|
|
|
|
* to the data and z_delta_size contains the compressed size. If it's
|
|
|
|
* uncompressed [1], z_delta_size must be zero. delta_size is always
|
|
|
|
* the uncompressed size and must be valid even if the delta is not
|
|
|
|
* cached.
|
|
|
|
*
|
|
|
|
* [1] during try_delta phase we don't bother with compressing because
|
|
|
|
* the delta could be quickly replaced with a better one.
|
|
|
|
*/
|
|
|
|
struct object_entry {
|
|
|
|
struct pack_idx_entry idx;
|
|
|
|
unsigned long size; /* uncompressed size */
|
|
|
|
struct packed_git *in_pack; /* already in pack */
|
|
|
|
off_t in_pack_offset;
|
|
|
|
struct object_entry *delta; /* delta base object */
|
|
|
|
struct object_entry *delta_child; /* deltified objects who bases me */
|
|
|
|
struct object_entry *delta_sibling; /* other deltified objects who
|
|
|
|
* uses the same base as me
|
|
|
|
*/
|
|
|
|
void *delta_data; /* cached delta (uncompressed) */
|
|
|
|
unsigned long delta_size; /* delta data size (uncompressed) */
|
|
|
|
unsigned long z_delta_size; /* delta data size (compressed) */
|
|
|
|
enum object_type type;
|
|
|
|
enum object_type in_pack_type; /* could be delta */
|
|
|
|
uint32_t hash; /* name hint hash */
|
pack-objects: implement bitmap writing
This commit extends more the functionality of `pack-objects` by allowing
it to write out a `.bitmap` index next to any written packs, together
with the `.idx` index that currently gets written.
If bitmap writing is enabled for a given repository (either by calling
`pack-objects` with the `--write-bitmap-index` flag or by having
`pack.writebitmaps` set to `true` in the config) and pack-objects is
writing a packfile that would normally be indexed (i.e. not piping to
stdout), we will attempt to write the corresponding bitmap index for the
packfile.
Bitmap index writing happens after the packfile and its index has been
successfully written to disk (`finish_tmp_packfile`). The process is
performed in several steps:
1. `bitmap_writer_set_checksum`: this call stores the partial
checksum for the packfile being written; the checksum will be
written in the resulting bitmap index to verify its integrity
2. `bitmap_writer_build_type_index`: this call uses the array of
`struct object_entry` that has just been sorted when writing out
the actual packfile index to disk to generate 4 type-index bitmaps
(one for each object type).
These bitmaps have their nth bit set if the given object is of
the bitmap's type. E.g. the nth bit of the Commits bitmap will be
1 if the nth object in the packfile index is a commit.
This is a very cheap operation because the bitmap writing code has
access to the metadata stored in the `struct object_entry` array,
and hence the real type for each object in the packfile.
3. `bitmap_writer_reuse_bitmaps`: if there exists an existing bitmap
index for one of the packfiles we're trying to repack, this call
will efficiently rebuild the existing bitmaps so they can be
reused on the new index. All the existing bitmaps will be stored
in a `reuse` hash table, and the commit selection phase will
prioritize these when selecting, as they can be written directly
to the new index without having to perform a revision walk to
fill the bitmap. This can greatly speed up the repack of a
repository that already has bitmaps.
4. `bitmap_writer_select_commits`: if bitmap writing is enabled for
a given `pack-objects` run, the sequence of commits generated
during the Counting Objects phase will be stored in an array.
We then use that array to build up the list of selected commits.
Writing a bitmap in the index for each object in the repository
would be cost-prohibitive, so we use a simple heuristic to pick
the commits that will be indexed with bitmaps.
The current heuristics are a simplified version of JGit's
original implementation. We select a higher density of commits
depending on their age: the 100 most recent commits are always
selected, after that we pick 1 commit of each 100, and the gap
increases as the commits grow older. On top of that, we make sure
that every single branch that has not been merged (all the tips
that would be required from a clone) gets their own bitmap, and
when selecting commits between a gap, we tend to prioritize the
commit with the most parents.
Do note that there is no right/wrong way to perform commit
selection; different selection algorithms will result in
different commits being selected, but there's no such thing as
"missing a commit". The bitmap walker algorithm implemented in
`prepare_bitmap_walk` is able to adapt to missing bitmaps by
performing manual walks that complete the bitmap: the ideal
selection algorithm, however, would select the commits that are
more likely to be used as roots for a walk in the future (e.g.
the tips of each branch, and so on) to ensure a bitmap for them
is always available.
5. `bitmap_writer_build`: this is the computationally expensive part
of bitmap generation. Based on the list of commits that were
selected in the previous step, we perform several incremental
walks to generate the bitmap for each commit.
The walks begin from the oldest commit, and are built up
incrementally for each branch. E.g. consider this dag where A, B,
C, D, E, F are the selected commits, and a, b, c, e are a chunk
of simplified history that will not receive bitmaps.
A---a---B--b--C--c--D
\
E--e--F
We start by building the bitmap for A, using A as the root for a
revision walk and marking all the objects that are reachable
until the walk is over. Once this bitmap is stored, we reuse the
bitmap walker to perform the walk for B, assuming that once we
reach A again, the walk will be terminated because A has already
been SEEN on the previous walk.
This process is repeated for C, and D, but when we try to
generate the bitmaps for E, we can reuse neither the current walk
nor the bitmap we have generated so far.
What we do now is resetting both the walk and clearing the
bitmap, and performing the walk from scratch using E as the
origin. This new walk, however, does not need to be completed.
Once we hit B, we can lookup the bitmap we have already stored
for that commit and OR it with the existing bitmap we've composed
so far, allowing us to limit the walk early.
After all the bitmaps have been generated, another iteration
through the list of commits is performed to find the best XOR
offsets for compression before writing them to disk. Because of
the incremental nature of these bitmaps, XORing one of them with
its predecesor results in a minimal "bitmap delta" most of the
time. We can write this delta to the on-disk bitmap index, and
then re-compose the original bitmaps by XORing them again when
loaded.
This is a phase very similar to pack-object's `find_delta` (using
bitmaps instead of objects, of course), except the heuristics
have been greatly simplified: we only check the 10 bitmaps before
any given one to find best compressing one. This gives good
results in practice, because there is locality in the ordering of
the objects (and therefore bitmaps) in the packfile.
6. `bitmap_writer_finish`: the last step in the process is
serializing to disk all the bitmap data that has been generated
in the two previous steps.
The bitmap is written to a tmp file and then moved atomically to
its final destination, using the same process as
`pack-write.c:write_idx_file`.
Signed-off-by: Vicent Marti <tanoku@gmail.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
11 years ago
|
|
|
unsigned int in_pack_pos;
|
|
|
|
unsigned char in_pack_header_size;
|
|
|
|
unsigned preferred_base:1; /*
|
|
|
|
* we do not pack this, but is available
|
|
|
|
* to be used as the base object to delta
|
|
|
|
* objects against.
|
|
|
|
*/
|
|
|
|
unsigned no_try_delta:1;
|
|
|
|
unsigned tagged:1; /* near the very tip of refs */
|
|
|
|
unsigned filled:1; /* assigned write-order */
|
pack-objects: break delta cycles before delta-search phase
We do not allow cycles in the delta graph of a pack (i.e., A
is a delta of B which is a delta of A) for the obvious
reason that you cannot actually access any of the objects in
such a case.
There's a last-ditch attempt to notice cycles during the
write phase, during which we issue a warning to the user and
write one of the objects out in full. However, this is
"last-ditch" for two reasons:
1. By this time, it's too late to find another delta for
the object, so the resulting pack is larger than it
otherwise could be.
2. The warning is there because this is something that
_shouldn't_ ever happen. If it does, then either:
a. a pack we are reusing deltas from had its own
cycle
b. we are reusing deltas from multiple packs, and
we found a cycle among them (i.e., A is a delta of
B in one pack, but B is a delta of A in another,
and we choose to use both deltas).
c. there is a bug in the delta-search code
So this code serves as a final check that none of these
things has happened, warns the user, and prevents us
from writing a bogus pack.
Right now, (2b) should never happen because of the static
ordering of packs in want_object_in_pack(). If two objects
have a delta relationship, then they must be in the same
pack, and therefore we will find them from that same pack.
However, a future patch would like to change that static
ordering, which will make (2b) a common occurrence. In
preparation, we should be able to handle those kinds of
cycles better. This patch does by introducing a
cycle-breaking step during the get_object_details() phase,
when we are deciding which deltas can be reused. That gives
us the chance to feed the objects into the delta search as
if the cycle did not exist.
We'll leave the detection and warning in the write_object()
phase in place, as it still serves as a check for case (2c).
This does mean we will stop warning for (2a). That case is
caused by bogus input packs, and we ideally would warn the
user about it. However, since those cycles show up after
picking reusable deltas, they look the same as (2b) to us;
our new code will break the cycles early and the last-ditch
check will never see them.
We could do analysis on any cycles that we find to
distinguish the two cases (i.e., it is a bogus pack if and
only if every delta in the cycle is in the same pack), but
we don't need to. If there is a cycle inside a pack, we'll
run into problems not only reusing the delta, but accessing
the object data at all. So when we try to dig up the actual
size of the object, we'll hit that same cycle and kick in
our usual complain-and-try-another-source code.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
9 years ago
|
|
|
|
|
|
|
/*
|
|
|
|
* State flags for depth-first search used for analyzing delta cycles.
|
pack-objects: enforce --depth limit in reused deltas
Since 898b14c (pack-objects: rework check_delta_limit usage,
2007-04-16), we check the delta depth limit only when
figuring out whether we should make a new delta. We don't
consider it at all when reusing deltas, which means that
packing once with --depth=250, and then again with
--depth=50, the second pack may still contain chains larger
than 50.
This is generally considered a feature, as the results of
earlier high-depth repacks are carried forward, used for
serving fetches, etc. However, since we started using
cross-pack deltas in c9af708b1 (pack-objects: use mru list
when iterating over packs, 2016-08-11), we are no longer
bounded by the length of an existing delta chain in a single
pack.
Here's one particular pathological case: a sequence of N
packs, each with 2 objects, the base of which is stored as a
delta in a previous pack. If we chain all the deltas
together, we have a cycle of length N. We break the cycle,
but the tip delta is still at depth N-1.
This is less unlikely than it might sound. See the included
test for a reconstruction based on real-world actions. I
ran into such a case in the wild, where a client was rapidly
sending packs, and we had accumulated 10,000 before doing a
server-side repack. The pack that "git repack" tried to
generate had a very deep chain, which caused pack-objects to
run out of stack space in the recursive write_one().
This patch bounds the length of delta chains in the output
pack based on --depth, regardless of whether they are caused
by cross-pack deltas or existed in the input packs. This
fixes the problem, but does have two possible downsides:
1. High-depth aggressive repacks followed by "normal"
repacks will throw away the high-depth chains.
In the long run this is probably OK; investigation
showed that high-depth repacks aren't actually
beneficial, and we dropped the aggressive depth default
to match the normal case in 07e7dbf0d (gc: default
aggressive depth to 50, 2016-08-11).
2. If you really do want to store high-depth deltas on
disk, they may be discarded and new delta computed when
serving a fetch, unless you set pack.depth to match
your high-depth size.
The implementation uses the existing search for delta
cycles. That lets us compute the depth of any node based on
the depth of its base, because we know the base is DFS_DONE
by the time we look at it (modulo any cycles in the graph,
but we know there cannot be any because we break them as we
see them).
There is some subtlety worth mentioning, though. We record
the depth of each object as we compute it. It might seem
like we could save the per-object storage space by just
keeping track of the depth of our traversal (i.e., have
break_delta_chains() report how deep it went). But we may
visit an object through multiple delta paths, and on
subsequent paths we want to know its depth immediately,
without having to walk back down to its final base (doing so
would make our graph walk quadratic rather than linear).
Likewise, one could try to record the depth not from the
base, but from our starting point (i.e., start
recursion_depth at 0, and pass "recursion_depth + 1" to each
invocation of break_delta_chains()). And then when
recursion_depth gets too big, we know that we must cut the
delta chain. But that technique is wrong if we do not visit
the nodes in topological order. In a chain A->B->C, it
if we visit "C", then "B", then "A", we will never recurse
deeper than 1 link (because we see at each node that we have
already visited it).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
8 years ago
|
|
|
*
|
|
|
|
* The depth is measured in delta-links to the base (so if A is a delta
|
|
|
|
* against B, then A has a depth of 1, and B a depth of 0).
|
pack-objects: break delta cycles before delta-search phase
We do not allow cycles in the delta graph of a pack (i.e., A
is a delta of B which is a delta of A) for the obvious
reason that you cannot actually access any of the objects in
such a case.
There's a last-ditch attempt to notice cycles during the
write phase, during which we issue a warning to the user and
write one of the objects out in full. However, this is
"last-ditch" for two reasons:
1. By this time, it's too late to find another delta for
the object, so the resulting pack is larger than it
otherwise could be.
2. The warning is there because this is something that
_shouldn't_ ever happen. If it does, then either:
a. a pack we are reusing deltas from had its own
cycle
b. we are reusing deltas from multiple packs, and
we found a cycle among them (i.e., A is a delta of
B in one pack, but B is a delta of A in another,
and we choose to use both deltas).
c. there is a bug in the delta-search code
So this code serves as a final check that none of these
things has happened, warns the user, and prevents us
from writing a bogus pack.
Right now, (2b) should never happen because of the static
ordering of packs in want_object_in_pack(). If two objects
have a delta relationship, then they must be in the same
pack, and therefore we will find them from that same pack.
However, a future patch would like to change that static
ordering, which will make (2b) a common occurrence. In
preparation, we should be able to handle those kinds of
cycles better. This patch does by introducing a
cycle-breaking step during the get_object_details() phase,
when we are deciding which deltas can be reused. That gives
us the chance to feed the objects into the delta search as
if the cycle did not exist.
We'll leave the detection and warning in the write_object()
phase in place, as it still serves as a check for case (2c).
This does mean we will stop warning for (2a). That case is
caused by bogus input packs, and we ideally would warn the
user about it. However, since those cycles show up after
picking reusable deltas, they look the same as (2b) to us;
our new code will break the cycles early and the last-ditch
check will never see them.
We could do analysis on any cycles that we find to
distinguish the two cases (i.e., it is a bogus pack if and
only if every delta in the cycle is in the same pack), but
we don't need to. If there is a cycle inside a pack, we'll
run into problems not only reusing the delta, but accessing
the object data at all. So when we try to dig up the actual
size of the object, we'll hit that same cycle and kick in
our usual complain-and-try-another-source code.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
9 years ago
|
|
|
*/
|
|
|
|
enum {
|
|
|
|
DFS_NONE = 0,
|
|
|
|
DFS_ACTIVE,
|
|
|
|
DFS_DONE
|
|
|
|
} dfs_state;
|
pack-objects: enforce --depth limit in reused deltas
Since 898b14c (pack-objects: rework check_delta_limit usage,
2007-04-16), we check the delta depth limit only when
figuring out whether we should make a new delta. We don't
consider it at all when reusing deltas, which means that
packing once with --depth=250, and then again with
--depth=50, the second pack may still contain chains larger
than 50.
This is generally considered a feature, as the results of
earlier high-depth repacks are carried forward, used for
serving fetches, etc. However, since we started using
cross-pack deltas in c9af708b1 (pack-objects: use mru list
when iterating over packs, 2016-08-11), we are no longer
bounded by the length of an existing delta chain in a single
pack.
Here's one particular pathological case: a sequence of N
packs, each with 2 objects, the base of which is stored as a
delta in a previous pack. If we chain all the deltas
together, we have a cycle of length N. We break the cycle,
but the tip delta is still at depth N-1.
This is less unlikely than it might sound. See the included
test for a reconstruction based on real-world actions. I
ran into such a case in the wild, where a client was rapidly
sending packs, and we had accumulated 10,000 before doing a
server-side repack. The pack that "git repack" tried to
generate had a very deep chain, which caused pack-objects to
run out of stack space in the recursive write_one().
This patch bounds the length of delta chains in the output
pack based on --depth, regardless of whether they are caused
by cross-pack deltas or existed in the input packs. This
fixes the problem, but does have two possible downsides:
1. High-depth aggressive repacks followed by "normal"
repacks will throw away the high-depth chains.
In the long run this is probably OK; investigation
showed that high-depth repacks aren't actually
beneficial, and we dropped the aggressive depth default
to match the normal case in 07e7dbf0d (gc: default
aggressive depth to 50, 2016-08-11).
2. If you really do want to store high-depth deltas on
disk, they may be discarded and new delta computed when
serving a fetch, unless you set pack.depth to match
your high-depth size.
The implementation uses the existing search for delta
cycles. That lets us compute the depth of any node based on
the depth of its base, because we know the base is DFS_DONE
by the time we look at it (modulo any cycles in the graph,
but we know there cannot be any because we break them as we
see them).
There is some subtlety worth mentioning, though. We record
the depth of each object as we compute it. It might seem
like we could save the per-object storage space by just
keeping track of the depth of our traversal (i.e., have
break_delta_chains() report how deep it went). But we may
visit an object through multiple delta paths, and on
subsequent paths we want to know its depth immediately,
without having to walk back down to its final base (doing so
would make our graph walk quadratic rather than linear).
Likewise, one could try to record the depth not from the
base, but from our starting point (i.e., start
recursion_depth at 0, and pass "recursion_depth + 1" to each
invocation of break_delta_chains()). And then when
recursion_depth gets too big, we know that we must cut the
delta chain. But that technique is wrong if we do not visit
the nodes in topological order. In a chain A->B->C, it
if we visit "C", then "B", then "A", we will never recurse
deeper than 1 link (because we see at each node that we have
already visited it).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
8 years ago
|
|
|
int depth;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct packing_data {
|
|
|
|
struct object_entry *objects;
|
|
|
|
uint32_t nr_objects, nr_alloc;
|
|
|
|
|
|
|
|
int32_t *index;
|
|
|
|
uint32_t index_size;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct object_entry *packlist_alloc(struct packing_data *pdata,
|
|
|
|
const unsigned char *sha1,
|
|
|
|
uint32_t index_pos);
|
|
|
|
|
|
|
|
struct object_entry *packlist_find(struct packing_data *pdata,
|
|
|
|
const unsigned char *sha1,
|
|
|
|
uint32_t *index_pos);
|
|
|
|
|
|
|
|
static inline uint32_t pack_name_hash(const char *name)
|
|
|
|
{
|
|
|
|
uint32_t c, hash = 0;
|
|
|
|
|
|
|
|
if (!name)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This effectively just creates a sortable number from the
|
|
|
|
* last sixteen non-whitespace characters. Last characters
|
|
|
|
* count "most", so things that end in ".c" sort together.
|
|
|
|
*/
|
|
|
|
while ((c = *name++) != 0) {
|
|
|
|
if (isspace(c))
|
|
|
|
continue;
|
|
|
|
hash = (hash >> 2) + (c << 24);
|
|
|
|
}
|
|
|
|
return hash;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|