From 4e7b65ba8e7b4174c4ab249b64c6cb3ad0009732 Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Mon, 24 Oct 2022 14:43:03 -0400
Subject: [PATCH 1/4] builtin/repack.c: pass "out" to `prepare_pack_objects`

`builtin/repack.c`'s `prepare_pack_objects()` is used to prepare a set
of arguments to a `pack-objects` process which will generate a desired
pack.

A future patch will add an `--expire-to` option which allows `git
repack` to write a cruft pack containing the pruned objects out to a
separate repository. Prepare for this by teaching that function to write
packs to an arbitrary location specified by the caller.

All existing callers of `prepare_pack_objects()` will pass `packtmp` for
`out`, retaining the existing behavior.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/repack.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/builtin/repack.c b/builtin/repack.c
index a5bacc7797..0a7bd57636 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -188,7 +188,8 @@ static void remove_redundant_pack(const char *dir_name, const char *base_name)
 }
 
 static void prepare_pack_objects(struct child_process *cmd,
-				 const struct pack_objects_args *args)
+				 const struct pack_objects_args *args,
+				 const char *out)
 {
 	strvec_push(&cmd->args, "pack-objects");
 	if (args->window)
@@ -211,7 +212,7 @@ static void prepare_pack_objects(struct child_process *cmd,
 		strvec_push(&cmd->args,  "--quiet");
 	if (delta_base_offset)
 		strvec_push(&cmd->args,  "--delta-base-offset");
-	strvec_push(&cmd->args, packtmp);
+	strvec_push(&cmd->args, out);
 	cmd->git_cmd = 1;
 	cmd->out = -1;
 }
@@ -275,7 +276,7 @@ static void repack_promisor_objects(const struct pack_objects_args *args,
 	FILE *out;
 	struct strbuf line = STRBUF_INIT;
 
-	prepare_pack_objects(&cmd, args);
+	prepare_pack_objects(&cmd, args, packtmp);
 	cmd.in = -1;
 
 	/*
@@ -673,7 +674,7 @@ static int write_cruft_pack(const struct pack_objects_args *args,
 	FILE *in, *out;
 	int ret;
 
-	prepare_pack_objects(&cmd, args);
+	prepare_pack_objects(&cmd, args, packtmp);
 
 	strvec_push(&cmd.args, "--cruft");
 	if (cruft_expiration)
@@ -861,7 +862,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 
 	sigchain_push_common(remove_pack_on_signal);
 
-	prepare_pack_objects(&cmd, &po_args);
+	prepare_pack_objects(&cmd, &po_args, packtmp);
 
 	show_progress = !po_args.quiet && isatty(2);
 

From eddad3686080553bb1e6ffa6d9394912f9427823 Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Mon, 24 Oct 2022 14:43:06 -0400
Subject: [PATCH 2/4] builtin/repack.c: pass "cruft_expiration" to
 `write_cruft_pack`

`builtin/repack.c`'s `write_cruft_pack()` is used to generate the cruft
pack when `--cruft` is supplied. It uses a static variable
"cruft_expiration" which is filled in by option parsing.

A future patch will add an `--expire-to` option which allows `git
repack` to write a cruft pack containing the pruned objects out to a
separate repository. In order to implement this functionality, some
callers will have to pass a value for `cruft_expiration` different than
the one filled out by option parsing.

Prepare for this by teaching `write_cruft_pack` to take a
"cruft_expiration" parameter, instead of reading a single static
variable.

The (sole) existing caller of `write_cruft_pack()` will pass the value
for "cruft_expiration" filled in by option parsing, retaining existing
behavior. This means that we can make the variable local to
`cmd_repack()`, and eliminate the static declaration.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/repack.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/builtin/repack.c b/builtin/repack.c
index 0a7bd57636..1184e8c257 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -32,7 +32,6 @@ static int write_bitmaps = -1;
 static int use_delta_islands;
 static int run_update_server_info = 1;
 static char *packdir, *packtmp_name, *packtmp;
-static char *cruft_expiration;
 
 static const char *const git_repack_usage[] = {
 	N_("git repack [<options>]"),
@@ -664,6 +663,7 @@ static int write_midx_included_packs(struct string_list *include,
 
 static int write_cruft_pack(const struct pack_objects_args *args,
 			    const char *pack_prefix,
+			    const char *cruft_expiration,
 			    struct string_list *names,
 			    struct string_list *existing_packs,
 			    struct string_list *existing_kept_packs)
@@ -746,6 +746,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	struct pack_objects_args cruft_po_args = {NULL};
 	int geometric_factor = 0;
 	int write_midx = 0;
+	const char *cruft_expiration = NULL;
 
 	struct option builtin_repack_options[] = {
 		OPT_BIT('a', NULL, &pack_everything,
@@ -985,7 +986,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 		cruft_po_args.local = po_args.local;
 		cruft_po_args.quiet = po_args.quiet;
 
-		ret = write_cruft_pack(&cruft_po_args, pack_prefix, &names,
+		ret = write_cruft_pack(&cruft_po_args, pack_prefix,
+				       cruft_expiration, &names,
 				       &existing_nonkept_packs,
 				       &existing_kept_packs);
 		if (ret)

From c12cda479eb103bc364e299a2e5654a7165df3cd Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Mon, 24 Oct 2022 14:43:09 -0400
Subject: [PATCH 3/4] builtin/repack.c: write cruft packs to arbitrary
 locations

In the following commit, a new write_cruft_pack() caller will be added
which wants to write a cruft pack to an arbitrary location. Prepare for
this by adding a parameter which controls the destination of the cruft
pack.

For now, provide "packtmp" so that this commit does not change any
behavior.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/repack.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/builtin/repack.c b/builtin/repack.c
index 1184e8c257..432fa3f677 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -662,6 +662,7 @@ static int write_midx_included_packs(struct string_list *include,
 }
 
 static int write_cruft_pack(const struct pack_objects_args *args,
+			    const char *destination,
 			    const char *pack_prefix,
 			    const char *cruft_expiration,
 			    struct string_list *names,
@@ -673,8 +674,10 @@ static int write_cruft_pack(const struct pack_objects_args *args,
 	struct string_list_item *item;
 	FILE *in, *out;
 	int ret;
+	const char *scratch;
+	int local = skip_prefix(destination, packdir, &scratch);
 
-	prepare_pack_objects(&cmd, args, packtmp);
+	prepare_pack_objects(&cmd, args, destination);
 
 	strvec_push(&cmd.args, "--cruft");
 	if (cruft_expiration)
@@ -714,7 +717,12 @@ static int write_cruft_pack(const struct pack_objects_args *args,
 		if (line.len != the_hash_algo->hexsz)
 			die(_("repack: Expecting full hex object ID lines only "
 			      "from pack-objects."));
-		string_list_append(names, line.buf);
+		/*
+		 * avoid putting packs written outside of the repository in the
+		 * list of names
+		 */
+		if (local)
+			string_list_append(names, line.buf);
 	}
 	fclose(out);
 
@@ -986,7 +994,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 		cruft_po_args.local = po_args.local;
 		cruft_po_args.quiet = po_args.quiet;
 
-		ret = write_cruft_pack(&cruft_po_args, pack_prefix,
+		ret = write_cruft_pack(&cruft_po_args, packtmp, pack_prefix,
 				       cruft_expiration, &names,
 				       &existing_nonkept_packs,
 				       &existing_kept_packs);

From 91badeba32d31d4dcc695a8888e5b697b4c3d90c Mon Sep 17 00:00:00 2001
From: Taylor Blau <me@ttaylorr.com>
Date: Mon, 24 Oct 2022 14:43:12 -0400
Subject: [PATCH 4/4] builtin/repack.c: implement `--expire-to` for storing
 pruned objects

When pruning objects with `--cruft`, `git repack` offers some
flexibility when selecting the set of which objects are pruned via the
`--cruft-expiration` option.

This is useful for expiring objects which are older than the grace
period, making races where to-be-pruned objects become reachable and
then ancestors of freshly pushed objects, leaving the repository in a
corrupt state after pruning substantially less likely [1].

But in practice, such races are impossible to avoid entirely, no matter
how long the grace period is. To prevent this race, it is often
advisable to temporarily put a repository into a read-only state. But in
practice, this is not always practical, and so some middle ground would
be nice.

This patch introduces a new option, `--expire-to`, which teaches `git
repack` to write an additional cruft pack containing just the objects
which were pruned from the repository. The caller can specify a
directory outside of the current repository as the destination for this
second cruft pack.

This makes it possible to prune objects from a repository, while still
holding onto a supplemental copy of them outside of the original
repository. Having this copy on-disk makes it substantially easier to
recover objects when the aforementioned race is encountered.

`--expire-to` is implemented in a somewhat convoluted manner, which is
to take advantage of the fact that the first time `write_cruft_pack()`
is called, it adds the name of the cruft pack to the `names` string
list. That means the second time we call `write_cruft_pack()`, objects
in the previously-written cruft pack will be excluded.

As long as the caller ensures that no objects are expired during the
second pass, this is sufficient to generate a cruft pack containing all
objects which don't appear in any of the new packs written by `git
repack`, including the cruft pack. In other words, all of the objects
which are about to be pruned from the repository.

It is important to note that the destination in `--expire-to` does not
necessarily need to be a Git repository (though it can be) Notably, the
expired packs do not contain all ancestors of expired objects. So if the
source repository contains something like:

              <unreachable>
             /
    C1 --- C2
      \
       refs/heads/master

where C2 is unreachable, but has a parent (C1) which is reachable, and
C2 would be pruned, then the expiry pack will contain only C2, not C1.

[1]: https://lore.kernel.org/git/20190319001829.GL29661@sigill.intra.peff.net/

Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/git-repack.txt |   6 ++
 builtin/repack.c             |  40 ++++++++++++
 t/t7700-repack.sh            | 121 +++++++++++++++++++++++++++++++++++
 3 files changed, 167 insertions(+)

diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt
index 0bf13893d8..4017157949 100644
--- a/Documentation/git-repack.txt
+++ b/Documentation/git-repack.txt
@@ -74,6 +74,12 @@ to the new separate pack will be written.
 	immediately instead of waiting for the next `git gc` invocation.
 	Only useful with `--cruft -d`.
 
+--expire-to=<dir>::
+	Write a cruft pack containing pruned objects (if any) to the
+	directory `<dir>`. This option is useful for keeping a copy of
+	any pruned objects in a separate directory as a backup. Only
+	useful with `--cruft -d`.
+
 -l::
 	Pass the `--local` option to 'git pack-objects'. See
 	linkgit:git-pack-objects[1].
diff --git a/builtin/repack.c b/builtin/repack.c
index 432fa3f677..a70962ee03 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -702,6 +702,10 @@ static int write_cruft_pack(const struct pack_objects_args *args,
 	 * By the time it is read here, it contains only the pack(s)
 	 * that were just written, which is exactly the set of packs we
 	 * want to consider kept.
+	 *
+	 * If `--expire-to` is given, the double-use served by `names`
+	 * ensures that the pack written to `--expire-to` excludes any
+	 * objects contained in the cruft pack.
 	 */
 	in = xfdopen(cmd.in, "w");
 	for_each_string_list_item(item, names)
@@ -755,6 +759,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 	int geometric_factor = 0;
 	int write_midx = 0;
 	const char *cruft_expiration = NULL;
+	const char *expire_to = NULL;
 
 	struct option builtin_repack_options[] = {
 		OPT_BIT('a', NULL, &pack_everything,
@@ -804,6 +809,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 			    N_("find a geometric progression with factor <N>")),
 		OPT_BOOL('m', "write-midx", &write_midx,
 			   N_("write a multi-pack index of the resulting packs")),
+		OPT_STRING(0, "expire-to", &expire_to, N_("dir"),
+			   N_("pack prefix to store a pack containing pruned objects")),
 		OPT_END()
 	};
 
@@ -1000,6 +1007,39 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 				       &existing_kept_packs);
 		if (ret)
 			return ret;
+
+		if (delete_redundant && expire_to) {
+			/*
+			 * If `--expire-to` is given with `-d`, it's possible
+			 * that we're about to prune some objects. With cruft
+			 * packs, pruning is implicit: any objects from existing
+			 * packs that weren't picked up by new packs are removed
+			 * when their packs are deleted.
+			 *
+			 * Generate an additional cruft pack, with one twist:
+			 * `names` now includes the name of the cruft pack
+			 * written in the previous step. So the contents of
+			 * _this_ cruft pack exclude everything contained in the
+			 * existing cruft pack (that is, all of the unreachable
+			 * objects which are no older than
+			 * `--cruft-expiration`).
+			 *
+			 * To make this work, cruft_expiration must become NULL
+			 * so that this cruft pack doesn't actually prune any
+			 * objects. If it were non-NULL, this call would always
+			 * generate an empty pack (since every object not in the
+			 * cruft pack generated above will have an mtime older
+			 * than the expiration).
+			 */
+			ret = write_cruft_pack(&cruft_po_args, expire_to,
+					       pack_prefix,
+					       NULL,
+					       &names,
+					       &existing_nonkept_packs,
+					       &existing_kept_packs);
+			if (ret)
+				return ret;
+		}
 	}
 
 	string_list_sort(&names);
diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
index ca45c4cd2c..17ee6fc2cc 100755
--- a/t/t7700-repack.sh
+++ b/t/t7700-repack.sh
@@ -482,4 +482,125 @@ test_expect_success '-n overrides repack.updateServerInfo=true' '
 	test_server_info_missing
 '
 
+test_expect_success '--expire-to stores pruned objects (now)' '
+	git init expire-to-now &&
+	(
+		cd expire-to-now &&
+
+		git branch -M main &&
+
+		test_commit base &&
+
+		git checkout -b cruft &&
+		test_commit --no-tag cruft &&
+
+		git rev-list --objects --no-object-names main..cruft >moved.raw &&
+		sort moved.raw >moved.want &&
+
+		git rev-list --all --objects --no-object-names >expect.raw &&
+		sort expect.raw >expect &&
+
+		git checkout main &&
+		git branch -D cruft &&
+		git reflog expire --all --expire=all &&
+
+		git init --bare expired.git &&
+		git repack -d \
+			--cruft --cruft-expiration="now" \
+			--expire-to="expired.git/objects/pack/pack" &&
+
+		expired="$(ls expired.git/objects/pack/pack-*.idx)" &&
+		test_path_is_file "${expired%.idx}.mtimes" &&
+
+		# Since the `--cruft-expiration` is "now", the effective
+		# behavior is to move _all_ unreachable objects out to
+		# the location in `--expire-to`.
+		git show-index <$expired >expired.raw &&
+		cut -d" " -f2 expired.raw | sort >expired.objects &&
+		git rev-list --all --objects --no-object-names \
+			>remaining.objects &&
+
+		# ...in other words, the combined contents of this
+		# repository and expired.git should be the same as the
+		# set of objects we started with.
+		cat expired.objects remaining.objects | sort >actual &&
+		test_cmp expect actual &&
+
+		# The "moved" objects (i.e., those in expired.git)
+		# should be the same as the cruft objects which were
+		# expired in the previous step.
+		test_cmp moved.want expired.objects
+	)
+'
+
+test_expect_success '--expire-to stores pruned objects (5.minutes.ago)' '
+	git init expire-to-5.minutes.ago &&
+	(
+		cd expire-to-5.minutes.ago &&
+
+		git branch -M main &&
+
+		test_commit base &&
+
+		# Create two classes of unreachable objects, one which
+		# is older than 5 minutes (stale), and another which is
+		# newer (recent).
+		for kind in stale recent
+		do
+			git checkout -b $kind main &&
+			test_commit --no-tag $kind || return 1
+		done &&
+
+		git rev-list --objects --no-object-names main..stale >in &&
+		stale="$(git pack-objects $objdir/pack/pack <in)" &&
+		mtime="$(test-tool chmtime --get =-600 $objdir/pack/pack-$stale.pack)" &&
+
+		# expect holds the set of objects we expect to find in
+		# this repository after repacking
+		git rev-list --objects --no-object-names recent >expect.raw &&
+		sort expect.raw >expect &&
+
+		# moved.want holds the set of objects we expect to find
+		# in expired.git
+		git rev-list --objects --no-object-names main..stale >out &&
+		sort out >moved.want &&
+
+		git checkout main &&
+		git branch -D stale recent &&
+		git reflog expire --all --expire=all &&
+		git prune-packed &&
+
+		git init --bare expired.git &&
+		git repack -d \
+			--cruft --cruft-expiration=5.minutes.ago \
+			--expire-to="expired.git/objects/pack/pack" &&
+
+		# Some of the remaining objects in this repository are
+		# unreachable, so use `cat-file --batch-all-objects`
+		# instead of `rev-list` to get their names
+		git cat-file --batch-all-objects --batch-check="%(objectname)" \
+			>remaining.objects &&
+		sort remaining.objects >actual &&
+		test_cmp expect actual &&
+
+		(
+			cd expired.git &&
+
+			expired="$(ls objects/pack/pack-*.mtimes)" &&
+			test-tool pack-mtimes $(basename $expired) >out &&
+			cut -d" " -f1 out | sort >../moved.got &&
+
+			# Ensure that there are as many objects with the
+			# expected mtime as were moved to expired.git.
+			#
+			# In other words, ensure that the recorded
+			# mtimes of any moved objects was written
+			# correctly.
+			grep " $mtime$" out >matching &&
+			test_line_count = $(wc -l <../moved.want) matching
+		) &&
+		test_cmp moved.want moved.got
+	)
+'
+
 test_done