From 4e7b65ba8e7b4174c4ab249b64c6cb3ad0009732 Mon Sep 17 00:00:00 2001 From: Taylor Blau <me@ttaylorr.com> Date: Mon, 24 Oct 2022 14:43:03 -0400 Subject: [PATCH 1/4] builtin/repack.c: pass "out" to `prepare_pack_objects` `builtin/repack.c`'s `prepare_pack_objects()` is used to prepare a set of arguments to a `pack-objects` process which will generate a desired pack. A future patch will add an `--expire-to` option which allows `git repack` to write a cruft pack containing the pruned objects out to a separate repository. Prepare for this by teaching that function to write packs to an arbitrary location specified by the caller. All existing callers of `prepare_pack_objects()` will pass `packtmp` for `out`, retaining the existing behavior. Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- builtin/repack.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/builtin/repack.c b/builtin/repack.c index a5bacc7797..0a7bd57636 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -188,7 +188,8 @@ static void remove_redundant_pack(const char *dir_name, const char *base_name) } static void prepare_pack_objects(struct child_process *cmd, - const struct pack_objects_args *args) + const struct pack_objects_args *args, + const char *out) { strvec_push(&cmd->args, "pack-objects"); if (args->window) @@ -211,7 +212,7 @@ static void prepare_pack_objects(struct child_process *cmd, strvec_push(&cmd->args, "--quiet"); if (delta_base_offset) strvec_push(&cmd->args, "--delta-base-offset"); - strvec_push(&cmd->args, packtmp); + strvec_push(&cmd->args, out); cmd->git_cmd = 1; cmd->out = -1; } @@ -275,7 +276,7 @@ static void repack_promisor_objects(const struct pack_objects_args *args, FILE *out; struct strbuf line = STRBUF_INIT; - prepare_pack_objects(&cmd, args); + prepare_pack_objects(&cmd, args, packtmp); cmd.in = -1; /* @@ -673,7 +674,7 @@ static int write_cruft_pack(const struct pack_objects_args *args, FILE *in, *out; int ret; - prepare_pack_objects(&cmd, args); + prepare_pack_objects(&cmd, args, packtmp); strvec_push(&cmd.args, "--cruft"); if (cruft_expiration) @@ -861,7 +862,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) sigchain_push_common(remove_pack_on_signal); - prepare_pack_objects(&cmd, &po_args); + prepare_pack_objects(&cmd, &po_args, packtmp); show_progress = !po_args.quiet && isatty(2); From eddad3686080553bb1e6ffa6d9394912f9427823 Mon Sep 17 00:00:00 2001 From: Taylor Blau <me@ttaylorr.com> Date: Mon, 24 Oct 2022 14:43:06 -0400 Subject: [PATCH 2/4] builtin/repack.c: pass "cruft_expiration" to `write_cruft_pack` `builtin/repack.c`'s `write_cruft_pack()` is used to generate the cruft pack when `--cruft` is supplied. It uses a static variable "cruft_expiration" which is filled in by option parsing. A future patch will add an `--expire-to` option which allows `git repack` to write a cruft pack containing the pruned objects out to a separate repository. In order to implement this functionality, some callers will have to pass a value for `cruft_expiration` different than the one filled out by option parsing. Prepare for this by teaching `write_cruft_pack` to take a "cruft_expiration" parameter, instead of reading a single static variable. The (sole) existing caller of `write_cruft_pack()` will pass the value for "cruft_expiration" filled in by option parsing, retaining existing behavior. This means that we can make the variable local to `cmd_repack()`, and eliminate the static declaration. Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- builtin/repack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/builtin/repack.c b/builtin/repack.c index 0a7bd57636..1184e8c257 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -32,7 +32,6 @@ static int write_bitmaps = -1; static int use_delta_islands; static int run_update_server_info = 1; static char *packdir, *packtmp_name, *packtmp; -static char *cruft_expiration; static const char *const git_repack_usage[] = { N_("git repack [<options>]"), @@ -664,6 +663,7 @@ static int write_midx_included_packs(struct string_list *include, static int write_cruft_pack(const struct pack_objects_args *args, const char *pack_prefix, + const char *cruft_expiration, struct string_list *names, struct string_list *existing_packs, struct string_list *existing_kept_packs) @@ -746,6 +746,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) struct pack_objects_args cruft_po_args = {NULL}; int geometric_factor = 0; int write_midx = 0; + const char *cruft_expiration = NULL; struct option builtin_repack_options[] = { OPT_BIT('a', NULL, &pack_everything, @@ -985,7 +986,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) cruft_po_args.local = po_args.local; cruft_po_args.quiet = po_args.quiet; - ret = write_cruft_pack(&cruft_po_args, pack_prefix, &names, + ret = write_cruft_pack(&cruft_po_args, pack_prefix, + cruft_expiration, &names, &existing_nonkept_packs, &existing_kept_packs); if (ret) From c12cda479eb103bc364e299a2e5654a7165df3cd Mon Sep 17 00:00:00 2001 From: Taylor Blau <me@ttaylorr.com> Date: Mon, 24 Oct 2022 14:43:09 -0400 Subject: [PATCH 3/4] builtin/repack.c: write cruft packs to arbitrary locations In the following commit, a new write_cruft_pack() caller will be added which wants to write a cruft pack to an arbitrary location. Prepare for this by adding a parameter which controls the destination of the cruft pack. For now, provide "packtmp" so that this commit does not change any behavior. Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- builtin/repack.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/builtin/repack.c b/builtin/repack.c index 1184e8c257..432fa3f677 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -662,6 +662,7 @@ static int write_midx_included_packs(struct string_list *include, } static int write_cruft_pack(const struct pack_objects_args *args, + const char *destination, const char *pack_prefix, const char *cruft_expiration, struct string_list *names, @@ -673,8 +674,10 @@ static int write_cruft_pack(const struct pack_objects_args *args, struct string_list_item *item; FILE *in, *out; int ret; + const char *scratch; + int local = skip_prefix(destination, packdir, &scratch); - prepare_pack_objects(&cmd, args, packtmp); + prepare_pack_objects(&cmd, args, destination); strvec_push(&cmd.args, "--cruft"); if (cruft_expiration) @@ -714,7 +717,12 @@ static int write_cruft_pack(const struct pack_objects_args *args, if (line.len != the_hash_algo->hexsz) die(_("repack: Expecting full hex object ID lines only " "from pack-objects.")); - string_list_append(names, line.buf); + /* + * avoid putting packs written outside of the repository in the + * list of names + */ + if (local) + string_list_append(names, line.buf); } fclose(out); @@ -986,7 +994,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) cruft_po_args.local = po_args.local; cruft_po_args.quiet = po_args.quiet; - ret = write_cruft_pack(&cruft_po_args, pack_prefix, + ret = write_cruft_pack(&cruft_po_args, packtmp, pack_prefix, cruft_expiration, &names, &existing_nonkept_packs, &existing_kept_packs); From 91badeba32d31d4dcc695a8888e5b697b4c3d90c Mon Sep 17 00:00:00 2001 From: Taylor Blau <me@ttaylorr.com> Date: Mon, 24 Oct 2022 14:43:12 -0400 Subject: [PATCH 4/4] builtin/repack.c: implement `--expire-to` for storing pruned objects When pruning objects with `--cruft`, `git repack` offers some flexibility when selecting the set of which objects are pruned via the `--cruft-expiration` option. This is useful for expiring objects which are older than the grace period, making races where to-be-pruned objects become reachable and then ancestors of freshly pushed objects, leaving the repository in a corrupt state after pruning substantially less likely [1]. But in practice, such races are impossible to avoid entirely, no matter how long the grace period is. To prevent this race, it is often advisable to temporarily put a repository into a read-only state. But in practice, this is not always practical, and so some middle ground would be nice. This patch introduces a new option, `--expire-to`, which teaches `git repack` to write an additional cruft pack containing just the objects which were pruned from the repository. The caller can specify a directory outside of the current repository as the destination for this second cruft pack. This makes it possible to prune objects from a repository, while still holding onto a supplemental copy of them outside of the original repository. Having this copy on-disk makes it substantially easier to recover objects when the aforementioned race is encountered. `--expire-to` is implemented in a somewhat convoluted manner, which is to take advantage of the fact that the first time `write_cruft_pack()` is called, it adds the name of the cruft pack to the `names` string list. That means the second time we call `write_cruft_pack()`, objects in the previously-written cruft pack will be excluded. As long as the caller ensures that no objects are expired during the second pass, this is sufficient to generate a cruft pack containing all objects which don't appear in any of the new packs written by `git repack`, including the cruft pack. In other words, all of the objects which are about to be pruned from the repository. It is important to note that the destination in `--expire-to` does not necessarily need to be a Git repository (though it can be) Notably, the expired packs do not contain all ancestors of expired objects. So if the source repository contains something like: <unreachable> / C1 --- C2 \ refs/heads/master where C2 is unreachable, but has a parent (C1) which is reachable, and C2 would be pruned, then the expiry pack will contain only C2, not C1. [1]: https://lore.kernel.org/git/20190319001829.GL29661@sigill.intra.peff.net/ Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- Documentation/git-repack.txt | 6 ++ builtin/repack.c | 40 ++++++++++++ t/t7700-repack.sh | 121 +++++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+) diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt index 0bf13893d8..4017157949 100644 --- a/Documentation/git-repack.txt +++ b/Documentation/git-repack.txt @@ -74,6 +74,12 @@ to the new separate pack will be written. immediately instead of waiting for the next `git gc` invocation. Only useful with `--cruft -d`. +--expire-to=<dir>:: + Write a cruft pack containing pruned objects (if any) to the + directory `<dir>`. This option is useful for keeping a copy of + any pruned objects in a separate directory as a backup. Only + useful with `--cruft -d`. + -l:: Pass the `--local` option to 'git pack-objects'. See linkgit:git-pack-objects[1]. diff --git a/builtin/repack.c b/builtin/repack.c index 432fa3f677..a70962ee03 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -702,6 +702,10 @@ static int write_cruft_pack(const struct pack_objects_args *args, * By the time it is read here, it contains only the pack(s) * that were just written, which is exactly the set of packs we * want to consider kept. + * + * If `--expire-to` is given, the double-use served by `names` + * ensures that the pack written to `--expire-to` excludes any + * objects contained in the cruft pack. */ in = xfdopen(cmd.in, "w"); for_each_string_list_item(item, names) @@ -755,6 +759,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) int geometric_factor = 0; int write_midx = 0; const char *cruft_expiration = NULL; + const char *expire_to = NULL; struct option builtin_repack_options[] = { OPT_BIT('a', NULL, &pack_everything, @@ -804,6 +809,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) N_("find a geometric progression with factor <N>")), OPT_BOOL('m', "write-midx", &write_midx, N_("write a multi-pack index of the resulting packs")), + OPT_STRING(0, "expire-to", &expire_to, N_("dir"), + N_("pack prefix to store a pack containing pruned objects")), OPT_END() }; @@ -1000,6 +1007,39 @@ int cmd_repack(int argc, const char **argv, const char *prefix) &existing_kept_packs); if (ret) return ret; + + if (delete_redundant && expire_to) { + /* + * If `--expire-to` is given with `-d`, it's possible + * that we're about to prune some objects. With cruft + * packs, pruning is implicit: any objects from existing + * packs that weren't picked up by new packs are removed + * when their packs are deleted. + * + * Generate an additional cruft pack, with one twist: + * `names` now includes the name of the cruft pack + * written in the previous step. So the contents of + * _this_ cruft pack exclude everything contained in the + * existing cruft pack (that is, all of the unreachable + * objects which are no older than + * `--cruft-expiration`). + * + * To make this work, cruft_expiration must become NULL + * so that this cruft pack doesn't actually prune any + * objects. If it were non-NULL, this call would always + * generate an empty pack (since every object not in the + * cruft pack generated above will have an mtime older + * than the expiration). + */ + ret = write_cruft_pack(&cruft_po_args, expire_to, + pack_prefix, + NULL, + &names, + &existing_nonkept_packs, + &existing_kept_packs); + if (ret) + return ret; + } } string_list_sort(&names); diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh index ca45c4cd2c..17ee6fc2cc 100755 --- a/t/t7700-repack.sh +++ b/t/t7700-repack.sh @@ -482,4 +482,125 @@ test_expect_success '-n overrides repack.updateServerInfo=true' ' test_server_info_missing ' +test_expect_success '--expire-to stores pruned objects (now)' ' + git init expire-to-now && + ( + cd expire-to-now && + + git branch -M main && + + test_commit base && + + git checkout -b cruft && + test_commit --no-tag cruft && + + git rev-list --objects --no-object-names main..cruft >moved.raw && + sort moved.raw >moved.want && + + git rev-list --all --objects --no-object-names >expect.raw && + sort expect.raw >expect && + + git checkout main && + git branch -D cruft && + git reflog expire --all --expire=all && + + git init --bare expired.git && + git repack -d \ + --cruft --cruft-expiration="now" \ + --expire-to="expired.git/objects/pack/pack" && + + expired="$(ls expired.git/objects/pack/pack-*.idx)" && + test_path_is_file "${expired%.idx}.mtimes" && + + # Since the `--cruft-expiration` is "now", the effective + # behavior is to move _all_ unreachable objects out to + # the location in `--expire-to`. + git show-index <$expired >expired.raw && + cut -d" " -f2 expired.raw | sort >expired.objects && + git rev-list --all --objects --no-object-names \ + >remaining.objects && + + # ...in other words, the combined contents of this + # repository and expired.git should be the same as the + # set of objects we started with. + cat expired.objects remaining.objects | sort >actual && + test_cmp expect actual && + + # The "moved" objects (i.e., those in expired.git) + # should be the same as the cruft objects which were + # expired in the previous step. + test_cmp moved.want expired.objects + ) +' + +test_expect_success '--expire-to stores pruned objects (5.minutes.ago)' ' + git init expire-to-5.minutes.ago && + ( + cd expire-to-5.minutes.ago && + + git branch -M main && + + test_commit base && + + # Create two classes of unreachable objects, one which + # is older than 5 minutes (stale), and another which is + # newer (recent). + for kind in stale recent + do + git checkout -b $kind main && + test_commit --no-tag $kind || return 1 + done && + + git rev-list --objects --no-object-names main..stale >in && + stale="$(git pack-objects $objdir/pack/pack <in)" && + mtime="$(test-tool chmtime --get =-600 $objdir/pack/pack-$stale.pack)" && + + # expect holds the set of objects we expect to find in + # this repository after repacking + git rev-list --objects --no-object-names recent >expect.raw && + sort expect.raw >expect && + + # moved.want holds the set of objects we expect to find + # in expired.git + git rev-list --objects --no-object-names main..stale >out && + sort out >moved.want && + + git checkout main && + git branch -D stale recent && + git reflog expire --all --expire=all && + git prune-packed && + + git init --bare expired.git && + git repack -d \ + --cruft --cruft-expiration=5.minutes.ago \ + --expire-to="expired.git/objects/pack/pack" && + + # Some of the remaining objects in this repository are + # unreachable, so use `cat-file --batch-all-objects` + # instead of `rev-list` to get their names + git cat-file --batch-all-objects --batch-check="%(objectname)" \ + >remaining.objects && + sort remaining.objects >actual && + test_cmp expect actual && + + ( + cd expired.git && + + expired="$(ls objects/pack/pack-*.mtimes)" && + test-tool pack-mtimes $(basename $expired) >out && + cut -d" " -f1 out | sort >../moved.got && + + # Ensure that there are as many objects with the + # expected mtime as were moved to expired.git. + # + # In other words, ensure that the recorded + # mtimes of any moved objects was written + # correctly. + grep " $mtime$" out >matching && + test_line_count = $(wc -l <../moved.want) matching + ) && + test_cmp moved.want moved.got + ) +' + test_done