bloom: add version 2 of the changed-path Bloom filter hash

What this patch does, file by file:

  * bloom.c / bloom.h: add murmur3_seeded_v2(), which reads every input
    byte through an (unsigned char) cast.  The previous implementation
    is kept as the file-local murmur3_seeded_v1().  fill_bloom_key()
    uses the v2 hash when settings->hash_version == 2 and the v1 hash
    otherwise.
  * commit-graph.c: store the on-disk hash version directly in
    bloom_filter_settings, make write_commit_graph() warn and return 0
    when commitGraph.changedPathsVersion is outside -1..2, and key the
    "we already have changed-paths" check on g->bloom_filter_settings.
  * Documentation: list 2 as an accepted commitGraph.changedPathsVersion.
  * t/helper/test-bloom.c, t0095, t4216: add a seven-byte high-bit hash
    check and version 1 / version 2 / autodetect coverage.

NOTE(review): murmur3_seeded_v2() stores len / sizeof(uint32_t) in an
"int" although "len" is a size_t; whether a path can ever be long
enough for that to matter is not visible from this patch - confirm.

diff --git a/Documentation/config/commitgraph.txt b/Documentation/config/commitgraph.txt
index e68cdededa..7f8c9d6638 100644
--- a/Documentation/config/commitgraph.txt
+++ b/Documentation/config/commitgraph.txt
@@ -15,7 +15,7 @@ commitGraph.readChangedPaths::
 
 commitGraph.changedPathsVersion::
 	Specifies the version of the changed-path Bloom filters that Git will read and
-	write. May be -1, 0 or 1. Note that values greater than 1 may be
+	write. May be -1, 0, 1, or 2. Note that values greater than 1 may be
 	incompatible with older versions of Git which do not yet understand
 	those versions. Use caution when operating in a mixed-version
 	environment.
@@ -31,4 +31,7 @@ filters when instructed to write.
 If 1, Git will only read version 1 Bloom filters, and will write version 1
 Bloom filters.
 +
+If 2, Git will only read version 2 Bloom filters, and will write version 2
+Bloom filters.
++
 See linkgit:git-commit-graph[1] for more information.
diff --git a/bloom.c b/bloom.c
index c24489dbcf..323d8012b8 100644
--- a/bloom.c
+++ b/bloom.c
@@ -100,7 +100,64 @@ int load_bloom_filter_from_graph(struct commit_graph *g,
  * Not considered to be cryptographically secure.
  * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
  */
-uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
+uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
+{
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+	const uint32_t r1 = 15;
+	const uint32_t r2 = 13;
+	const uint32_t m = 5;
+	const uint32_t n = 0xe6546b64;
+	int i;
+	uint32_t k1 = 0;
+	const char *tail;
+
+	int len4 = len / sizeof(uint32_t);
+
+	uint32_t k;
+	for (i = 0; i < len4; i++) {
+		uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
+		uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
+		uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
+		uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
+		k = byte1 | byte2 | byte3 | byte4;
+		k *= c1;
+		k = rotate_left(k, r1);
+		k *= c2;
+
+		seed ^= k;
+		seed = rotate_left(seed, r2) * m + n;
+	}
+
+	tail = (data + len4 * sizeof(uint32_t));
+
+	switch (len & (sizeof(uint32_t) - 1)) {
+	case 3:
+		k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
+		/*-fallthrough*/
+	case 2:
+		k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
+		/*-fallthrough*/
+	case 1:
+		k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
+		k1 *= c1;
+		k1 = rotate_left(k1, r1);
+		k1 *= c2;
+		seed ^= k1;
+		break;
+	}
+
+	seed ^= (uint32_t)len;
+	seed ^= (seed >> 16);
+	seed *= 0x85ebca6b;
+	seed ^= (seed >> 13);
+	seed *= 0xc2b2ae35;
+	seed ^= (seed >> 16);
+
+	return seed;
+}
+
+static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
 {
 	const uint32_t c1 = 0xcc9e2d51;
 	const uint32_t c2 = 0x1b873593;
@@ -165,8 +222,14 @@ void fill_bloom_key(const char *data,
 	int i;
 	const uint32_t seed0 = 0x293ae76f;
 	const uint32_t seed1 = 0x7e646e2c;
-	const uint32_t hash0 = murmur3_seeded(seed0, data, len);
-	const uint32_t hash1 = murmur3_seeded(seed1, data, len);
+	uint32_t hash0, hash1;
+	if (settings->hash_version == 2) {
+		hash0 = murmur3_seeded_v2(seed0, data, len);
+		hash1 = murmur3_seeded_v2(seed1, data, len);
+	} else {
+		hash0 = murmur3_seeded_v1(seed0, data, len);
+		hash1 = murmur3_seeded_v1(seed1, data, len);
+	}
 
 	key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
 	for (i = 0; i < settings->num_hashes; i++)
diff --git a/bloom.h b/bloom.h
index 052a993aab..bfe389e29c 100644
--- a/bloom.h
+++ b/bloom.h
@@ -8,9 +8,11 @@ struct commit_graph;
 struct bloom_filter_settings {
 	/*
 	 * The version of the hashing technique being used.
-	 * We currently only support version = 1 which is
+	 * The newest version is 2, which is
 	 * the seeded murmur3 hashing technique implemented
-	 * in bloom.c.
+	 * in bloom.c. Bloom filters of version 1 were created
+	 * with prior versions of Git, which had a bug in the
+	 * implementation of the hash function.
 	 */
 	uint32_t hash_version;
 
@@ -81,7 +83,7 @@ int load_bloom_filter_from_graph(struct commit_graph *g,
  * Not considered to be cryptographically secure.
  * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
  */
-uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
+uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len);
 
 void fill_bloom_key(const char *data,
 		    size_t len,
diff --git a/commit-graph.c b/commit-graph.c
index d6fb714f32..ce4524e2b0 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -344,7 +344,6 @@ static int graph_read_bloom_data(const unsigned char *chunk_start,
 				 size_t chunk_size, void *data)
 {
 	struct commit_graph *g = data;
-	uint32_t hash_version;
 
 	if (chunk_size < BLOOMDATA_CHUNK_HEADER_SIZE) {
 		warning(_("ignoring too-small changed-path chunk"
@@ -356,10 +355,9 @@ static int graph_read_bloom_data(const unsigned char *chunk_start,
 
 	g->chunk_bloom_data = chunk_start;
 	g->chunk_bloom_data_size = chunk_size;
-	hash_version = get_be32(chunk_start);
 
 	g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
-	g->bloom_filter_settings->hash_version = hash_version;
+	g->bloom_filter_settings->hash_version = get_be32(chunk_start);
 	g->bloom_filter_settings->num_hashes = get_be32(chunk_start + 4);
 	g->bloom_filter_settings->bits_per_entry = get_be32(chunk_start + 8);
 	g->bloom_filter_settings->max_changed_paths = DEFAULT_BLOOM_MAX_CHANGES;
@@ -2498,6 +2496,13 @@ int write_commit_graph(struct object_directory *odb,
 	}
 	if (!commit_graph_compatible(r))
 		return 0;
+	if (r->settings.commit_graph_changed_paths_version < -1
+	    || r->settings.commit_graph_changed_paths_version > 2) {
+		warning(_("attempting to write a commit-graph, but "
+			  "'commitGraph.changedPathsVersion' (%d) is not supported"),
+			r->settings.commit_graph_changed_paths_version);
+		return 0;
+	}
 
 	CALLOC_ARRAY(ctx, 1);
 	ctx->r = r;
@@ -2540,7 +2545,7 @@ int write_commit_graph(struct object_directory *odb,
 	g = ctx->r->objects->commit_graph;
 
 	/* We have changed-paths already. Keep them in the next graph */
-	if (g && g->chunk_bloom_data) {
+	if (g && g->bloom_filter_settings) {
 		ctx->changed_paths = 1;
 
 		/* don't propagate the hash_version unless unspecified */
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
index 1281e66876..eefc1668c7 100644
--- a/t/helper/test-bloom.c
+++ b/t/helper/test-bloom.c
@@ -49,6 +49,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid)
 
 static const char *bloom_usage = "\n"
 "  test-tool bloom get_murmur3 <string>\n"
+"  test-tool bloom get_murmur3_seven_highbit\n"
 "  test-tool bloom generate_filter <string> [<string>...]\n"
 "  test-tool bloom get_filter_for_commit <commit-hex>\n";
 
@@ -63,7 +64,13 @@ int cmd__bloom(int argc, const char **argv)
 		uint32_t hashed;
 		if (argc < 3)
 			usage(bloom_usage);
-		hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
+		hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2]));
+		printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
+	}
+
+	if (!strcmp(argv[1], "get_murmur3_seven_highbit")) {
+		uint32_t hashed;
+		hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7);
 		printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
 	}
 
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
index b567383eb8..c8d84ab606 100755
--- a/t/t0095-bloom.sh
+++ b/t/t0095-bloom.sh
@@ -29,6 +29,14 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' '
 	test_cmp expect actual
 '
 
+test_expect_success 'compute unseeded murmur3 hash for test string 3' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0xa183ccfd
+	EOF
+	test-tool bloom get_murmur3_seven_highbit >actual &&
+	test_cmp expect actual
+'
+
 test_expect_success 'compute bloom key for empty string' '
 	cat >expect <<-\EOF &&
 	Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004|
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index 49d1113171..cc6e5733f6 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -488,14 +488,49 @@ test_expect_success 'merge graph layers with incompatible Bloom settings' '
 	test_must_be_empty err
 '
 
+# chosen to be the same under all Unicode normalization forms
+CENT=$(printf "\302\242")
+
+test_expect_success 'ensure Bloom filter with incompatible versions are ignored' '
+	rm "$repo/$graph" &&
+
+	git -C $repo log --oneline --no-decorate -- $CENT >expect &&
+
+	# Compute v1 Bloom filters for commits at the bottom.
+	git -C $repo rev-parse HEAD^ >in &&
+	git -C $repo commit-graph write --stdin-commits --changed-paths \
+		--split <in &&
+
+	# Compute v2 Bloom filters for the rest of the commits at the top.
+	git -C $repo rev-parse HEAD >in &&
+	git -C $repo -c commitGraph.changedPathsVersion=2 commit-graph write \
+		--stdin-commits --changed-paths --split=no-merge <in &&
+
+	test_line_count = 2 $repo/$chain &&
+
+	git -C $repo log --oneline --no-decorate -- $CENT >actual 2>err &&
+	test_cmp expect actual &&
+
+	layer="$(head -n 1 $repo/$chain)" &&
+	cat >expect.err <<-EOF &&
+	warning: disabling Bloom filters for commit-graph layer $SQ$layer$SQ due to incompatible settings
+	EOF
+	test_cmp expect.err err &&
+
+	# Merge the two layers with incompatible bloom filter versions,
+	# ensuring that the v2 filters are used.
+	>trace2.txt &&
+	GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
+		git -C $repo -c commitGraph.changedPathsVersion=2 commit-graph write --reachable --changed-paths 2>err &&
+	grep "disabling Bloom filters for commit-graph layer .$layer." err &&
+	grep "{\"hash_version\":2,\"num_hashes\":7,\"bits_per_entry\":10,\"max_changed_paths\":512" trace2.txt
+'
+
 get_first_changed_path_filter () {
 	test-tool read-graph bloom-filters >filters.dat &&
 	head -n 1 filters.dat
 }
 
-# chosen to be the same under all Unicode normalization forms
-CENT=$(printf "\302\242")
-
 test_expect_success 'set up repo with high bit path, version 1 changed-path' '
 	git init highbit1 &&
 	test_commit -C highbit1 c1 "$CENT" &&
@@ -539,6 +574,120 @@ test_expect_success 'version 1 changed-path used when version 1 requested' '
 	)
 '
 
+test_expect_success 'version 1 changed-path not used when version 2 requested' '
+	(
+		cd highbit1 &&
+		git config --add commitGraph.changedPathsVersion 2 &&
+		test_bloom_filters_not_used "-- another$CENT"
+	)
+'
+
+test_expect_success 'version 1 changed-path used when autodetect requested' '
+	(
+		cd highbit1 &&
+		git config --add commitGraph.changedPathsVersion -1 &&
+		test_bloom_filters_used "-- another$CENT"
+	)
+'
+
+test_expect_success 'when writing another commit graph, preserve existing version 1 of changed-path' '
+	test_commit -C highbit1 c1double "$CENT$CENT" &&
+	git -C highbit1 commit-graph write --reachable --changed-paths &&
+	(
+		cd highbit1 &&
+		git config --add commitGraph.changedPathsVersion -1 &&
+		echo "options: bloom(1,10,7) read_generation_data" >expect &&
+		test-tool read-graph >full &&
+		grep options full >actual &&
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'set up repo with high bit path, version 2 changed-path' '
+	git init highbit2 &&
+	git -C highbit2 config --add commitGraph.changedPathsVersion 2 &&
+	test_commit -C highbit2 c2 "$CENT" &&
+	git -C highbit2 commit-graph write --reachable --changed-paths
+'
+
+test_expect_success 'check value of version 2 changed-path' '
+	(
+		cd highbit2 &&
+		echo "c01f" >expect &&
+		get_first_changed_path_filter >actual &&
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'setup make another commit' '
+	# "git log" does not use Bloom filters for root commits - see how, in
+	# revision.c, rev_compare_tree() (the only code path that eventually calls
+	# get_bloom_filter()) is only called by try_to_simplify_commit() when the commit
+	# has one parent. Therefore, make another commit so that we perform the tests on
+	# a non-root commit.
+	test_commit -C highbit2 anotherc2 "another$CENT"
+'
+
+test_expect_success 'version 2 changed-path used when version 2 requested' '
+	(
+		cd highbit2 &&
+		test_bloom_filters_used "-- another$CENT"
+	)
+'
+
+test_expect_success 'version 2 changed-path not used when version 1 requested' '
+	(
+		cd highbit2 &&
+		git config --add commitGraph.changedPathsVersion 1 &&
+		test_bloom_filters_not_used "-- another$CENT"
+	)
+'
+
+test_expect_success 'version 2 changed-path used when autodetect requested' '
+	(
+		cd highbit2 &&
+		git config --add commitGraph.changedPathsVersion -1 &&
+		test_bloom_filters_used "-- another$CENT"
+	)
+'
+
+test_expect_success 'when writing another commit graph, preserve existing version 2 of changed-path' '
+	test_commit -C highbit2 c2double "$CENT$CENT" &&
+	git -C highbit2 commit-graph write --reachable --changed-paths &&
+	(
+		cd highbit2 &&
+		git config --add commitGraph.changedPathsVersion -1 &&
+		echo "options: bloom(2,10,7) read_generation_data" >expect &&
+		test-tool read-graph >full &&
+		grep options full >actual &&
+		test_cmp expect actual
+	)
+'
+
+test_expect_success 'when writing commit graph, do not reuse changed-path of another version' '
+	git init doublewrite &&
+	test_commit -C doublewrite c "$CENT" &&
+	git -C doublewrite config --add commitGraph.changedPathsVersion 1 &&
+	git -C doublewrite commit-graph write --reachable --changed-paths &&
+	for v in -2 3
+	do
+		git -C doublewrite config --add commitGraph.changedPathsVersion $v &&
+		git -C doublewrite commit-graph write --reachable --changed-paths 2>err &&
+		cat >expect <<-EOF &&
+		warning: attempting to write a commit-graph, but ${SQ}commitGraph.changedPathsVersion${SQ} ($v) is not supported
+		EOF
+		test_cmp expect err || return 1
+	done &&
+	git -C doublewrite config --add commitGraph.changedPathsVersion 2 &&
+	git -C doublewrite commit-graph write --reachable --changed-paths &&
+	(
+		cd doublewrite &&
+		echo "c01f" >expect &&
+		get_first_changed_path_filter >actual &&
+		test_cmp expect actual
+	)
+'
+
 corrupt_graph () {
 	test_when_finished "rm -rf $graph" &&
 	git commit-graph write --reachable --changed-paths &&