From 29ef1f27fed21b5b7d3c996a01f1364e7e841917 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 5 Aug 2021 13:25:24 +0200 Subject: [PATCH 1/5] revision: separate walk and unsorted flags The `--no-walk` flag supports two modes: either it sorts the revisions given as input input or it doesn't. This is reflected in a single `no_walk` flag, which reflects one of the three states "walk", "don't walk but without sorting" and "don't walk but with sorting". Split up the flag into two separate bits, one indicating whether we should walk or not and one indicating whether the input should be sorted or not. This will allow us to more easily introduce a new flag `--unsorted-input`, which only impacts the sorting bit. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/log.c | 2 +- builtin/revert.c | 3 ++- revision.c | 9 +++++---- revision.h | 7 ++----- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/builtin/log.c b/builtin/log.c index 3d7717ba5c..f75d87e8d7 100644 --- a/builtin/log.c +++ b/builtin/log.c @@ -637,7 +637,7 @@ int cmd_show(int argc, const char **argv, const char *prefix) repo_init_revisions(the_repository, &rev, prefix); rev.diff = 1; rev.always_show_header = 1; - rev.no_walk = REVISION_WALK_NO_WALK_SORTED; + rev.no_walk = 1; rev.diffopt.stat_width = -1; /* Scale to real terminal size */ memset(&opt, 0, sizeof(opt)); diff --git a/builtin/revert.c b/builtin/revert.c index 237f2f18d4..2e13660e4b 100644 --- a/builtin/revert.c +++ b/builtin/revert.c @@ -191,7 +191,8 @@ static int run_sequencer(int argc, const char **argv, struct replay_opts *opts) struct setup_revision_opt s_r_opt; opts->revs = xmalloc(sizeof(*opts->revs)); repo_init_revisions(the_repository, opts->revs, NULL); - opts->revs->no_walk = REVISION_WALK_NO_WALK_UNSORTED; + opts->revs->no_walk = 1; + opts->revs->unsorted_input = 1; if (argc < 2) usage_with_options(usage_str, options); if (!strcmp(argv[1], "-")) diff --git a/revision.c b/revision.c index cddd0542a6..86bbcd10d2 100644 --- a/revision.c +++ b/revision.c @@ -2651,16 +2651,17 @@ static int handle_revision_pseudo_opt(const char *submodule, } else if (!strcmp(arg, "--not")) { *flags ^= UNINTERESTING | BOTTOM; } else if (!strcmp(arg, "--no-walk")) { - revs->no_walk = REVISION_WALK_NO_WALK_SORTED; + revs->no_walk = 1; } else if (skip_prefix(arg, "--no-walk=", &optarg)) { /* * Detached form ("--no-walk X" as opposed to "--no-walk=X") * not allowed, since the argument is optional. */ + revs->no_walk = 1; if (!strcmp(optarg, "sorted")) - revs->no_walk = REVISION_WALK_NO_WALK_SORTED; + revs->unsorted_input = 0; else if (!strcmp(optarg, "unsorted")) - revs->no_walk = REVISION_WALK_NO_WALK_UNSORTED; + revs->unsorted_input = 1; else return error("invalid argument to --no-walk"); } else if (!strcmp(arg, "--do-walk")) { @@ -3584,7 +3585,7 @@ int prepare_revision_walk(struct rev_info *revs) if (!revs->reflog_info) prepare_to_use_bloom_filter(revs); - if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED) + if (!revs->unsorted_input) commit_list_sort_by_date(&revs->commits); if (revs->no_walk) return 0; diff --git a/revision.h b/revision.h index fbb068da9f..0c65a760ee 100644 --- a/revision.h +++ b/revision.h @@ -79,10 +79,6 @@ struct rev_cmdline_info { } *rev; }; -#define REVISION_WALK_WALK 0 -#define REVISION_WALK_NO_WALK_SORTED 1 -#define REVISION_WALK_NO_WALK_UNSORTED 2 - struct oidset; struct topo_walk_info; @@ -129,7 +125,8 @@ struct rev_info { /* Traversal flags */ unsigned int dense:1, prune:1, - no_walk:2, + no_walk:1, + unsorted_input:1, remove_empty_trees:1, simplify_history:1, show_pulls:1, From f45022dc2fd692fd024f2eb41a86a66f19013d43 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 9 Aug 2021 10:11:50 +0200 Subject: [PATCH 2/5] connected: do not sort input revisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to compute whether objects reachable from a set of tips are all connected, we do a revision walk with these tips as positive references and `--not --all`. `--not --all` will cause the revision walk to load all preexisting references as uninteresting, which can be very expensive in repositories with many references. Benchmarking the git-rev-list(1) command highlights that by far the most expensive single phase is initial sorting of the input revisions: after all references have been loaded, we first sort commits by author date. In a real-world repository with about 2.2 million references, it makes up about 40% of the total runtime of git-rev-list(1). Ultimately, the connectivity check shouldn't really bother about the order of input revisions at all. We only care whether we can actually walk all objects until we hit the cut-off point. So sorting the input is a complete waste of time. Introduce a new "--unsorted-input" flag to git-rev-list(1) which will cause it to not sort the commits and adjust the connectivity check to always pass the flag. This results in the following speedups, executed in a clone of gitlab-org/gitlab [1]: Benchmark #1: git rev-list --objects --quiet --not --all --not $(cat newrev) Time (mean ± σ): 7.639 s ± 0.065 s [User: 7.304 s, System: 0.335 s] Range (min … max): 7.543 s … 7.742 s 10 runs Benchmark #2: git rev-list --unsorted-input --objects --quiet --not --all --not $newrev Time (mean ± σ): 4.995 s ± 0.044 s [User: 4.657 s, System: 0.337 s] Range (min … max): 4.909 s … 5.048 s 10 runs Summary 'git rev-list --unsorted-input --objects --quiet --not --all --not $(cat newrev)' ran 1.53 ± 0.02 times faster than 'git rev-list --objects --quiet --not --all --not $newrev' [1]: https://gitlab.com/gitlab-org/gitlab.git. Note that not all refs are visible to clients. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- Documentation/rev-list-options.txt | 8 +++++++- connected.c | 1 + revision.c | 9 +++++++++ t/t6000-rev-list-misc.sh | 31 ++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt index 24569b06d1..b7bd27e171 100644 --- a/Documentation/rev-list-options.txt +++ b/Documentation/rev-list-options.txt @@ -968,6 +968,11 @@ list of the missing objects. Object IDs are prefixed with a ``?'' character. objects. endif::git-rev-list[] +--unsorted-input:: + Show commits in the order they were given on the command line instead + of sorting them in reverse chronological order by commit time. Cannot + be combined with `--no-walk` or `--no-walk=sorted`. + --no-walk[=(sorted|unsorted)]:: Only show the given commits, but do not traverse their ancestors. This has no effect if a range is specified. If the argument @@ -975,7 +980,8 @@ endif::git-rev-list[] given on the command line. Otherwise (if `sorted` or no argument was given), the commits are shown in reverse chronological order by commit time. - Cannot be combined with `--graph`. + Cannot be combined with `--graph`. Cannot be combined with + `--unsorted-input` if `sorted` or no argument was given. --do-walk:: Overrides a previous `--no-walk`. diff --git a/connected.c b/connected.c index b18299fdf0..b5f9523a5f 100644 --- a/connected.c +++ b/connected.c @@ -106,6 +106,7 @@ no_promisor_pack_found: if (opt->progress) strvec_pushf(&rev_list.args, "--progress=%s", _("Checking connectivity")); + strvec_push(&rev_list.args, "--unsorted-input"); rev_list.git_cmd = 1; rev_list.env = opt->env; diff --git a/revision.c b/revision.c index 86bbcd10d2..47541407d2 100644 --- a/revision.c +++ b/revision.c @@ -2256,6 +2256,10 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg } else if (!strcmp(arg, "--author-date-order")) { revs->sort_order = REV_SORT_BY_AUTHOR_DATE; revs->topo_order = 1; + } else if (!strcmp(arg, "--unsorted-input")) { + if (revs->no_walk) + die(_("--unsorted-input is incompatible with --no-walk")); + revs->unsorted_input = 1; } else if (!strcmp(arg, "--early-output")) { revs->early_output = 100; revs->topo_order = 1; @@ -2651,8 +2655,13 @@ static int handle_revision_pseudo_opt(const char *submodule, } else if (!strcmp(arg, "--not")) { *flags ^= UNINTERESTING | BOTTOM; } else if (!strcmp(arg, "--no-walk")) { + if (!revs->no_walk && revs->unsorted_input) + die(_("--no-walk is incompatible with --unsorted-input")); revs->no_walk = 1; } else if (skip_prefix(arg, "--no-walk=", &optarg)) { + if (!revs->no_walk && revs->unsorted_input) + die(_("--no-walk is incompatible with --unsorted-input")); + /* * Detached form ("--no-walk X" as opposed to "--no-walk=X") * not allowed, since the argument is optional. diff --git a/t/t6000-rev-list-misc.sh b/t/t6000-rev-list-misc.sh index 12def7bcbf..ef849e5bc8 100755 --- a/t/t6000-rev-list-misc.sh +++ b/t/t6000-rev-list-misc.sh @@ -169,4 +169,35 @@ test_expect_success 'rev-list --count --objects' ' test_line_count = $count actual ' +test_expect_success 'rev-list --unsorted-input results in different sorting' ' + git rev-list --unsorted-input HEAD HEAD~ >first && + git rev-list --unsorted-input HEAD~ HEAD >second && + ! test_cmp first second && + sort first >first.sorted && + sort second >second.sorted && + test_cmp first.sorted second.sorted +' + +test_expect_success 'rev-list --unsorted-input incompatible with --no-walk' ' + cat >expect <<-EOF && + fatal: --no-walk is incompatible with --unsorted-input + EOF + test_must_fail git rev-list --unsorted-input --no-walk HEAD 2>error && + test_cmp expect error && + test_must_fail git rev-list --unsorted-input --no-walk=sorted HEAD 2>error && + test_cmp expect error && + test_must_fail git rev-list --unsorted-input --no-walk=unsorted HEAD 2>error && + test_cmp expect error && + + cat >expect <<-EOF && + fatal: --unsorted-input is incompatible with --no-walk + EOF + test_must_fail git rev-list --no-walk --unsorted-input HEAD 2>error && + test_cmp expect error && + test_must_fail git rev-list --no-walk=sorted --unsorted-input HEAD 2>error && + test_cmp expect error && + test_must_fail git rev-list --no-walk=unsorted --unsorted-input HEAD 2>error && + test_cmp expect error +' + test_done From bf9c0cbddbcd730e4312ba5e19f8b8a2edd65bb3 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 9 Aug 2021 10:11:54 +0200 Subject: [PATCH 3/5] revision: stop retrieving reference twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When queueing up references for the revision walk, `handle_one_ref()` will resolve the reference's object ID via `get_reference()` and then queue the ID as pending object via `add_pending_oid()`. But given that `add_pending_oid()` is only a thin wrapper around `add_pending_object()` which fist calls `get_reference()`, we effectively resolve the reference twice and thus duplicate some of the work. Fix the issue by instead calling `add_pending_object()` directly, which takes the already-resolved object as input. In a repository with lots of refs, this translates into a near 10% speedup: Benchmark #1: HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $newrev Time (mean ± σ): 5.015 s ± 0.038 s [User: 4.698 s, System: 0.316 s] Range (min … max): 4.970 s … 5.089 s 10 runs Benchmark #2: HEAD: rev-list --unsorted-input --objects --quiet --not --all --not $newrev Time (mean ± σ): 4.606 s ± 0.029 s [User: 4.260 s, System: 0.345 s] Range (min … max): 4.565 s … 4.657 s 10 runs Summary 'HEAD: rev-list --unsorted-input --objects --quiet --not --all --not $newrev' ran 1.09 ± 0.01 times faster than 'HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $newrev' Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- revision.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/revision.c b/revision.c index 47541407d2..80a59896b9 100644 --- a/revision.c +++ b/revision.c @@ -1534,7 +1534,7 @@ static int handle_one_ref(const char *path, const struct object_id *oid, object = get_reference(cb->all_revs, path, oid, cb->all_flags); add_rev_cmdline(cb->all_revs, object, path, REV_CMD_REF, cb->all_flags); - add_pending_oid(cb->all_revs, path, oid, cb->all_flags); + add_pending_object(cb->all_revs, object, path); return 0; } From 809ea28f809e52d3204b597637b2f5e072c140f8 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 9 Aug 2021 10:11:59 +0200 Subject: [PATCH 4/5] commit-graph: split out function to search commit position The function `find_commit_in_graph()` assumes that the caller has passed an object which was already determined to be a commit given that it will access the commit's graph position, which is stored in a commit slab. In a subsequent patch, we want to search for an object ID though without knowing whether it is a commit or not, which is not currently possible. Split out the logic to search the commit graph for a given object ID to prepare for this change. This commit also renames the function to `find_commit_pos_in_graph()`, which more accurately reflects what this function does. Furthermore, in order to allow for the searched object ID to be const, we need to adjust `bsearch_graph()`'s signature to accept a constant object ID as input, too. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- commit-graph.c | 55 +++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 3860a0d847..8c4c7262c8 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -723,7 +723,7 @@ void close_commit_graph(struct raw_object_store *o) o->commit_graph = NULL; } -static int bsearch_graph(struct commit_graph *g, struct object_id *oid, uint32_t *pos) +static int bsearch_graph(struct commit_graph *g, const struct object_id *oid, uint32_t *pos) { return bsearch_hash(oid->hash, g->chunk_oid_fanout, g->chunk_oid_lookup, g->hash_len, pos); @@ -864,25 +864,30 @@ static int fill_commit_in_graph(struct repository *r, return 1; } -static int find_commit_in_graph(struct commit *item, struct commit_graph *g, uint32_t *pos) +static int search_commit_pos_in_graph(const struct object_id *id, struct commit_graph *g, uint32_t *pos) +{ + struct commit_graph *cur_g = g; + uint32_t lex_index; + + while (cur_g && !bsearch_graph(cur_g, id, &lex_index)) + cur_g = cur_g->base_graph; + + if (cur_g) { + *pos = lex_index + cur_g->num_commits_in_base; + return 1; + } + + return 0; +} + +static int find_commit_pos_in_graph(struct commit *item, struct commit_graph *g, uint32_t *pos) { uint32_t graph_pos = commit_graph_position(item); if (graph_pos != COMMIT_NOT_FROM_GRAPH) { *pos = graph_pos; return 1; } else { - struct commit_graph *cur_g = g; - uint32_t lex_index; - - while (cur_g && !bsearch_graph(cur_g, &(item->object.oid), &lex_index)) - cur_g = cur_g->base_graph; - - if (cur_g) { - *pos = lex_index + cur_g->num_commits_in_base; - return 1; - } - - return 0; + return search_commit_pos_in_graph(&item->object.oid, g, pos); } } @@ -895,7 +900,7 @@ static int parse_commit_in_graph_one(struct repository *r, if (item->object.parsed) return 1; - if (find_commit_in_graph(item, g, &pos)) + if (find_commit_pos_in_graph(item, g, &pos)) return fill_commit_in_graph(r, item, g, pos); return 0; @@ -921,7 +926,7 @@ void load_commit_graph_info(struct repository *r, struct commit *item) uint32_t pos; if (!prepare_commit_graph(r)) return; - if (find_commit_in_graph(item, r->objects->commit_graph, &pos)) + if (find_commit_pos_in_graph(item, r->objects->commit_graph, &pos)) fill_commit_graph_info(item, r->objects->commit_graph, pos); } @@ -1091,9 +1096,9 @@ static int write_graph_chunk_data(struct hashfile *f, edge_value += ctx->new_num_commits_in_base; else if (ctx->new_base_graph) { uint32_t pos; - if (find_commit_in_graph(parent->item, - ctx->new_base_graph, - &pos)) + if (find_commit_pos_in_graph(parent->item, + ctx->new_base_graph, + &pos)) edge_value = pos; } @@ -1122,9 +1127,9 @@ static int write_graph_chunk_data(struct hashfile *f, edge_value += ctx->new_num_commits_in_base; else if (ctx->new_base_graph) { uint32_t pos; - if (find_commit_in_graph(parent->item, - ctx->new_base_graph, - &pos)) + if (find_commit_pos_in_graph(parent->item, + ctx->new_base_graph, + &pos)) edge_value = pos; } @@ -1235,9 +1240,9 @@ static int write_graph_chunk_extra_edges(struct hashfile *f, edge_value += ctx->new_num_commits_in_base; else if (ctx->new_base_graph) { uint32_t pos; - if (find_commit_in_graph(parent->item, - ctx->new_base_graph, - &pos)) + if (find_commit_pos_in_graph(parent->item, + ctx->new_base_graph, + &pos)) edge_value = pos; } From f559d6d45e7e58ae1f922213948723de77ea77bd Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 9 Aug 2021 10:12:03 +0200 Subject: [PATCH 5/5] revision: avoid hitting packfiles when commits are in commit-graph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When queueing references in git-rev-list(1), we try to optimize parsing of commits via the commit-graph. To do so, we first look up the object's type, and if it is a commit we call `repo_parse_commit()` instead of `parse_object()`. This is quite inefficient though given that we're always uncompressing the object header in order to determine the type. Instead, we can opportunistically search the commit-graph for the object ID: in case it's found, we know it's a commit and can directly fill in the commit object without having to uncompress the object header. Expose a new function `lookup_commit_in_graph()`, which tries to find a commit in the commit-graph by ID, and convert `get_reference()` to use this function. This provides a big performance win in cases where we load references in a repository with lots of references pointing to commits. The following has been executed in a real-world repository with about 2.2 million refs: Benchmark #1: HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $newrev Time (mean ± σ): 4.458 s ± 0.044 s [User: 4.115 s, System: 0.342 s] Range (min … max): 4.409 s … 4.534 s 10 runs Benchmark #2: HEAD: rev-list --unsorted-input --objects --quiet --not --all --not $newrev Time (mean ± σ): 3.089 s ± 0.015 s [User: 2.768 s, System: 0.321 s] Range (min … max): 3.061 s … 3.105 s 10 runs Summary 'HEAD: rev-list --unsorted-input --objects --quiet --not --all --not $newrev' ran 1.44 ± 0.02 times faster than 'HEAD~: rev-list --unsorted-input --objects --quiet --not --all --not $newrev' Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- commit-graph.c | 24 ++++++++++++++++++++++++ commit-graph.h | 8 ++++++++ revision.c | 18 ++++++++---------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 8c4c7262c8..00614acd65 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -891,6 +891,30 @@ static int find_commit_pos_in_graph(struct commit *item, struct commit_graph *g, } } +struct commit *lookup_commit_in_graph(struct repository *repo, const struct object_id *id) +{ + struct commit *commit; + uint32_t pos; + + if (!repo->objects->commit_graph) + return NULL; + if (!search_commit_pos_in_graph(id, repo->objects->commit_graph, &pos)) + return NULL; + if (!repo_has_object_file(repo, id)) + return NULL; + + commit = lookup_commit(repo, id); + if (!commit) + return NULL; + if (commit->object.parsed) + return commit; + + if (!fill_commit_in_graph(repo, commit, repo->objects->commit_graph, pos)) + return NULL; + + return commit; +} + static int parse_commit_in_graph_one(struct repository *r, struct commit_graph *g, struct commit *item) diff --git a/commit-graph.h b/commit-graph.h index 96c24fb577..04a94e1830 100644 --- a/commit-graph.h +++ b/commit-graph.h @@ -40,6 +40,14 @@ int open_commit_graph(const char *graph_file, int *fd, struct stat *st); */ int parse_commit_in_graph(struct repository *r, struct commit *item); +/* + * Look up the given commit ID in the commit-graph. This will only return a + * commit if the ID exists both in the graph and in the object database such + * that we don't return commits whose object has been pruned. Otherwise, this + * function returns `NULL`. + */ +struct commit *lookup_commit_in_graph(struct repository *repo, const struct object_id *id); + /* * It is possible that we loaded commit contents from the commit buffer, * but we also want to ensure the commit-graph content is correctly diff --git a/revision.c b/revision.c index 80a59896b9..0dabb5a0bc 100644 --- a/revision.c +++ b/revision.c @@ -360,20 +360,18 @@ static struct object *get_reference(struct rev_info *revs, const char *name, unsigned int flags) { struct object *object; + struct commit *commit; /* - * If the repository has commit graphs, repo_parse_commit() avoids - * reading the object buffer, so use it whenever possible. + * If the repository has commit graphs, we try to opportunistically + * look up the object ID in those graphs. Like this, we can avoid + * parsing commit data from disk. */ - if (oid_object_info(revs->repo, oid, NULL) == OBJ_COMMIT) { - struct commit *c = lookup_commit(revs->repo, oid); - if (!repo_parse_commit(revs->repo, c)) - object = (struct object *) c; - else - object = NULL; - } else { + commit = lookup_commit_in_graph(revs->repo, oid); + if (commit) + object = &commit->object; + else object = parse_object(revs->repo, oid); - } if (!object) { if (revs->ignore_missing)