Merge branch 'ds/push-sparse-tree-walk'

"git pack-objects" learned another algorithm to compute the set of
objects to send, that trades the resulting packfile off to save
traversal cost to favor small pushes.

* ds/push-sparse-tree-walk:
  pack-objects: create GIT_TEST_PACK_SPARSE
  pack-objects: create pack.useSparse setting
  revision: implement sparse algorithm
  list-objects: consume sparse tree walk
  revision: add mark_tree_uninteresting_sparse
maint
Junio C Hamano 2019-02-06 22:05:24 -08:00
commit 5fda343321
12 changed files with 379 additions and 18 deletions

View File

@ -105,6 +105,15 @@ pack.useBitmaps::
true. You should not generally need to turn this off unless
you are debugging pack bitmaps.

pack.useSparse::
When true, git will default to using the '--sparse' option in
'git pack-objects' when the '--revs' option is present. This
algorithm only walks trees that appear in paths that introduce new
objects. This can have significant performance benefits when
computing a pack to send a small change. However, it is possible
that extra objects are added to the pack-file if the included
commits contain certain types of direct renames.

pack.writeBitmaps (deprecated)::
This is a deprecated synonym for `repack.writeBitmaps`.


View File

@ -14,7 +14,7 @@ SYNOPSIS
[--local] [--incremental] [--window=<n>] [--depth=<n>]
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
[--stdout [--filter=<filter-spec>] | base-name]
[--shallow] [--keep-true-parents] < object-list
[--shallow] [--keep-true-parents] [--sparse] < object-list


DESCRIPTION
@ -196,6 +196,15 @@ depth is 4095.
Add --no-reuse-object if you want to force a uniform compression
level on all data no matter the source.

--sparse::
Use the "sparse" algorithm to determine which objects to include in
the pack, when combined with the "--revs" option. This algorithm
only walks trees that appear in paths that introduce new objects.
This can have significant performance benefits when computing
a pack to send a small change. However, it is possible that extra
objects are added to the pack-file if the included commits contain
certain types of direct renames.

--thin::
Create a "thin" pack by omitting the common objects between a
sender and a receiver in order to reduce network transfer. This

View File

@ -658,7 +658,7 @@ static void bisect_common(struct rev_info *revs)
if (prepare_revision_walk(revs))
die("revision walk setup failed");
if (revs->tree_objects)
mark_edges_uninteresting(revs, NULL);
mark_edges_uninteresting(revs, NULL, 0);
}

static void exit_if_skipped_commits(struct commit_list *tried,

View File

@ -84,6 +84,7 @@ static unsigned long pack_size_limit;
static int depth = 50;
static int delta_search_threads;
static int pack_to_stdout;
static int sparse;
static int thin;
static int num_preferred_base;
static struct progress *progress_state;
@ -2703,6 +2704,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
use_bitmap_index_default = git_config_bool(k, v);
return 0;
}
if (!strcmp(k, "pack.usesparse")) {
sparse = git_config_bool(k, v);
return 0;
}
if (!strcmp(k, "pack.threads")) {
delta_search_threads = git_config_int(k, v);
if (delta_search_threads < 0)
@ -3130,7 +3135,7 @@ static void get_object_list(int ac, const char **av)

if (prepare_revision_walk(&revs))
die(_("revision walk setup failed"));
mark_edges_uninteresting(&revs, show_edge);
mark_edges_uninteresting(&revs, show_edge, sparse);

if (!fn_show_object)
fn_show_object = show_object;
@ -3287,6 +3292,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
{ OPTION_CALLBACK, 0, "unpack-unreachable", NULL, N_("time"),
N_("unpack unreachable objects newer than <time>"),
PARSE_OPT_OPTARG, option_parse_unpack_unreachable },
OPT_BOOL(0, "sparse", &sparse,
N_("use the sparse reachability algorithm")),
OPT_BOOL(0, "thin", &thin,
N_("create thin packs")),
OPT_BOOL(0, "shallow", &shallow,
@ -3319,6 +3326,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)

read_replace_refs = 0;

sparse = git_env_bool("GIT_TEST_PACK_SPARSE", 0);
reset_pack_idx_option(&pack_idx_opts);
git_config(git_pack_config, NULL);


View File

@ -546,7 +546,7 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
if (prepare_revision_walk(&revs))
die("revision walk setup failed");
if (revs.tree_objects)
mark_edges_uninteresting(&revs, show_edge);
mark_edges_uninteresting(&revs, show_edge, 0);

if (bisect_list) {
int reaches, all;

View File

@ -1933,7 +1933,7 @@ int cmd_main(int argc, const char **argv)
pushing = 0;
if (prepare_revision_walk(&revs))
die("revision walk setup failed");
mark_edges_uninteresting(&revs, NULL);
mark_edges_uninteresting(&revs, NULL, 0);
objects_to_send = get_delta(&revs, ref_lock);
finish_all_active_slots();


View File

@ -226,25 +226,73 @@ static void mark_edge_parents_uninteresting(struct commit *commit,
}
}

void mark_edges_uninteresting(struct rev_info *revs, show_edge_fn show_edge)
static void add_edge_parents(struct commit *commit,
struct rev_info *revs,
show_edge_fn show_edge,
struct oidset *set)
{
struct commit_list *parents;

for (parents = commit->parents; parents; parents = parents->next) {
struct commit *parent = parents->item;
struct tree *tree = get_commit_tree(parent);

if (!tree)
continue;

oidset_insert(set, &tree->object.oid);

if (!(parent->object.flags & UNINTERESTING))
continue;
tree->object.flags |= UNINTERESTING;

if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
parent->object.flags |= SHOWN;
show_edge(parent);
}
}
}

void mark_edges_uninteresting(struct rev_info *revs,
show_edge_fn show_edge,
int sparse)
{
struct commit_list *list;
int i;

for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item;
if (sparse) {
struct oidset set;
oidset_init(&set, 16);

if (commit->object.flags & UNINTERESTING) {
mark_tree_uninteresting(revs->repo,
get_commit_tree(commit));
if (revs->edge_hint_aggressive && !(commit->object.flags & SHOWN)) {
commit->object.flags |= SHOWN;
show_edge(commit);
}
continue;
for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item;
struct tree *tree = get_commit_tree(commit);

if (commit->object.flags & UNINTERESTING)
tree->object.flags |= UNINTERESTING;

oidset_insert(&set, &tree->object.oid);
add_edge_parents(commit, revs, show_edge, &set);
}

mark_trees_uninteresting_sparse(revs->repo, &set);
oidset_clear(&set);
} else {
for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item;
if (commit->object.flags & UNINTERESTING) {
mark_tree_uninteresting(revs->repo,
get_commit_tree(commit));
if (revs->edge_hint_aggressive && !(commit->object.flags & SHOWN)) {
commit->object.flags |= SHOWN;
show_edge(commit);
}
continue;
}
mark_edge_parents_uninteresting(commit, revs, show_edge);
}
mark_edge_parents_uninteresting(commit, revs, show_edge);
}

if (revs->edge_hint_aggressive) {
for (i = 0; i < revs->cmdline.nr; i++) {
struct object *obj = revs->cmdline.rev[i].item;

View File

@ -10,7 +10,9 @@ typedef void (*show_object_fn)(struct object *, const char *, void *);
void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *);

typedef void (*show_edge_fn)(struct commit *);
void mark_edges_uninteresting(struct rev_info *, show_edge_fn);
void mark_edges_uninteresting(struct rev_info *revs,
show_edge_fn show_edge,
int sparse);

struct oidset;
struct list_objects_filter_options;

View File

@ -27,6 +27,7 @@
#include "commit-reach.h"
#include "commit-graph.h"
#include "prio-queue.h"
#include "hashmap.h"

volatile show_early_output_fn_t show_early_output;

@ -99,6 +100,148 @@ void mark_tree_uninteresting(struct repository *r, struct tree *tree)
mark_tree_contents_uninteresting(r, tree);
}

struct path_and_oids_entry {
struct hashmap_entry ent;
char *path;
struct oidset trees;
};

static int path_and_oids_cmp(const void *hashmap_cmp_fn_data,
const struct path_and_oids_entry *e1,
const struct path_and_oids_entry *e2,
const void *keydata)
{
return strcmp(e1->path, e2->path);
}

static void paths_and_oids_init(struct hashmap *map)
{
hashmap_init(map, (hashmap_cmp_fn) path_and_oids_cmp, NULL, 0);
}

static void paths_and_oids_clear(struct hashmap *map)
{
struct hashmap_iter iter;
struct path_and_oids_entry *entry;
hashmap_iter_init(map, &iter);

while ((entry = (struct path_and_oids_entry *)hashmap_iter_next(&iter))) {
oidset_clear(&entry->trees);
free(entry->path);
}

hashmap_free(map, 1);
}

static void paths_and_oids_insert(struct hashmap *map,
const char *path,
const struct object_id *oid)
{
int hash = strhash(path);
struct path_and_oids_entry key;
struct path_and_oids_entry *entry;

hashmap_entry_init(&key, hash);

/* use a shallow copy for the lookup */
key.path = (char *)path;
oidset_init(&key.trees, 0);

if (!(entry = (struct path_and_oids_entry *)hashmap_get(map, &key, NULL))) {
entry = xcalloc(1, sizeof(struct path_and_oids_entry));
hashmap_entry_init(entry, hash);
entry->path = xstrdup(key.path);
oidset_init(&entry->trees, 16);
hashmap_put(map, entry);
}

oidset_insert(&entry->trees, oid);
}

static void add_children_by_path(struct repository *r,
struct tree *tree,
struct hashmap *map)
{
struct tree_desc desc;
struct name_entry entry;

if (!tree)
return;

if (parse_tree_gently(tree, 1) < 0)
return;

init_tree_desc(&desc, tree->buffer, tree->size);
while (tree_entry(&desc, &entry)) {
switch (object_type(entry.mode)) {
case OBJ_TREE:
paths_and_oids_insert(map, entry.path, &entry.oid);

if (tree->object.flags & UNINTERESTING) {
struct tree *child = lookup_tree(r, &entry.oid);
if (child)
child->object.flags |= UNINTERESTING;
}
break;
case OBJ_BLOB:
if (tree->object.flags & UNINTERESTING) {
struct blob *child = lookup_blob(r, &entry.oid);
if (child)
child->object.flags |= UNINTERESTING;
}
break;
default:
/* Subproject commit - not in this repository */
break;
}
}

free_tree_buffer(tree);
}

void mark_trees_uninteresting_sparse(struct repository *r,
struct oidset *trees)
{
unsigned has_interesting = 0, has_uninteresting = 0;
struct hashmap map;
struct hashmap_iter map_iter;
struct path_and_oids_entry *entry;
struct object_id *oid;
struct oidset_iter iter;

oidset_iter_init(trees, &iter);
while ((!has_interesting || !has_uninteresting) &&
(oid = oidset_iter_next(&iter))) {
struct tree *tree = lookup_tree(r, oid);

if (!tree)
continue;

if (tree->object.flags & UNINTERESTING)
has_uninteresting = 1;
else
has_interesting = 1;
}

/* Do not walk unless we have both types of trees. */
if (!has_uninteresting || !has_interesting)
return;

paths_and_oids_init(&map);

oidset_iter_init(trees, &iter);
while ((oid = oidset_iter_next(&iter))) {
struct tree *tree = lookup_tree(r, oid);
add_children_by_path(r, tree, &map);
}

hashmap_iter_init(&map, &map_iter);
while ((entry = hashmap_iter_next(&map_iter)))
mark_trees_uninteresting_sparse(r, &entry->trees);

paths_and_oids_clear(&map);
}

struct commit_stack {
struct commit **items;
size_t nr, alloc;

View File

@ -67,6 +67,7 @@ struct rev_cmdline_info {
#define REVISION_WALK_NO_WALK_SORTED 1
#define REVISION_WALK_NO_WALK_UNSORTED 2

struct oidset;
struct topo_walk_info;

struct rev_info {
@ -327,6 +328,7 @@ void put_revision_mark(const struct rev_info *revs,

void mark_parents_uninteresting(struct commit *commit);
void mark_tree_uninteresting(struct repository *r, struct tree *tree);
void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees);

void show_object_with_name(FILE *, struct object *, const char *);


View File

@ -358,6 +358,10 @@ GIT_TEST_INDEX_VERSION=<n> exercises the index read/write code path
for the index version specified. Can be set to any valid version
(currently 2, 3, or 4).

GIT_TEST_PACK_SPARSE=<boolean> if enabled will default the pack-objects
builtin to use the sparse object walk. This can still be overridden by
the --no-sparse command-line argument.

GIT_TEST_PRELOAD_INDEX=<boolean> exercises the preload-index code path
by overriding the minimum number of cache entries required per thread.


136
t/t5322-pack-objects-sparse.sh Executable file
View File

@ -0,0 +1,136 @@
#!/bin/sh

test_description='pack-objects object selection using sparse algorithm'
. ./test-lib.sh

test_expect_success 'setup repo' '
test_commit initial &&
for i in $(test_seq 1 3)
do
mkdir f$i &&
for j in $(test_seq 1 3)
do
mkdir f$i/f$j &&
echo $j >f$i/f$j/data.txt
done
done &&
git add . &&
git commit -m "Initialized trees" &&
for i in $(test_seq 1 3)
do
git checkout -b topic$i master &&
echo change-$i >f$i/f$i/data.txt &&
git commit -a -m "Changed f$i/f$i/data.txt"
done &&
cat >packinput.txt <<-EOF &&
topic1
^topic2
^topic3
EOF
git rev-parse \
topic1 \
topic1^{tree} \
topic1:f1 \
topic1:f1/f1 \
topic1:f1/f1/data.txt | sort >expect_objects.txt
'

test_expect_success 'non-sparse pack-objects' '
git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
git index-pack -o nonsparse.idx nonsparse.pack &&
git show-index <nonsparse.idx | awk "{print \$2}" >nonsparse_objects.txt &&
test_cmp expect_objects.txt nonsparse_objects.txt
'

test_expect_success 'sparse pack-objects' '
git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
git index-pack -o sparse.idx sparse.pack &&
git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
test_cmp expect_objects.txt sparse_objects.txt
'

test_expect_success 'duplicate a folder from f3 and commit to topic1' '
git checkout topic1 &&
echo change-3 >f3/f3/data.txt &&
git commit -a -m "Changed f3/f3/data.txt" &&
git rev-parse \
topic1~1 \
topic1~1^{tree} \
topic1^{tree} \
topic1 \
topic1:f1 \
topic1:f1/f1 \
topic1:f1/f1/data.txt | sort >required_objects.txt
'

test_expect_success 'non-sparse pack-objects' '
git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
git index-pack -o nonsparse.idx nonsparse.pack &&
git show-index <nonsparse.idx | awk "{print \$2}" >nonsparse_objects.txt &&
comm -1 -2 required_objects.txt nonsparse_objects.txt >nonsparse_required_objects.txt &&
test_cmp required_objects.txt nonsparse_required_objects.txt
'

test_expect_success 'sparse pack-objects' '
git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
git index-pack -o sparse.idx sparse.pack &&
git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
comm -1 -2 required_objects.txt sparse_objects.txt >sparse_required_objects.txt &&
test_cmp required_objects.txt sparse_required_objects.txt
'

# Demonstrate that the algorithms differ when we copy a tree wholesale
# from one folder to another.

test_expect_success 'duplicate a folder from f1 into f3' '
mkdir f3/f4 &&
cp -r f1/f1/* f3/f4 &&
git add f3/f4 &&
git commit -m "Copied f1/f1 to f3/f4" &&
cat >packinput.txt <<-EOF &&
topic1
^topic1~1
EOF
git rev-parse \
topic1 \
topic1^{tree} \
topic1:f3 | sort >required_objects.txt
'

test_expect_success 'non-sparse pack-objects' '
git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
git index-pack -o nonsparse.idx nonsparse.pack &&
git show-index <nonsparse.idx | awk "{print \$2}" >nonsparse_objects.txt &&
comm -1 -2 required_objects.txt nonsparse_objects.txt >nonsparse_required_objects.txt &&
test_cmp required_objects.txt nonsparse_required_objects.txt
'

test_expect_success 'sparse pack-objects' '
git rev-parse \
topic1 \
topic1^{tree} \
topic1:f3 \
topic1:f3/f4 \
topic1:f3/f4/data.txt | sort >expect_sparse_objects.txt &&
git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
git index-pack -o sparse.idx sparse.pack &&
git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
test_cmp expect_sparse_objects.txt sparse_objects.txt
'

test_expect_success 'pack.useSparse enables algorithm' '
git config pack.useSparse true &&
git pack-objects --stdout --revs <packinput.txt >sparse.pack &&
git index-pack -o sparse.idx sparse.pack &&
git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
test_cmp expect_sparse_objects.txt sparse_objects.txt
'

test_expect_success 'pack.useSparse overridden' '
git pack-objects --stdout --revs --no-sparse <packinput.txt >sparse.pack &&
git index-pack -o sparse.idx sparse.pack &&
git show-index <sparse.idx | awk "{print \$2}" >sparse_objects.txt &&
test_cmp required_objects.txt sparse_objects.txt
'

test_done