Browse Source

Merge branch 'jt/path-filter-fix' into seen

* jt/path-filter-fix:
  commit-graph: new filter ver. that fixes murmur3
  repo-settings: introduce commitgraph.changedPathsVersion
  t4216: test changed path filters with high bit paths
seen
Junio C Hamano 2 years ago
parent
commit
a658abdd93
  1. 16
      Documentation/config/commitgraph.txt
  2. 65
      bloom.c
  3. 8
      bloom.h
  4. 29
      commit-graph.c
  5. 2
      oss-fuzz/fuzz-commit-graph.c
  6. 6
      repo-settings.c
  7. 2
      repository.h
  8. 9
      t/helper/test-bloom.c
  9. 8
      t/t0095-bloom.sh
  10. 65
      t/t4216-log-bloom.sh

16
Documentation/config/commitgraph.txt

@ -9,6 +9,16 @@ commitGraph.maxNewFilters:: @@ -9,6 +9,16 @@ commitGraph.maxNewFilters::
commit-graph write` (c.f., linkgit:git-commit-graph[1]).

commitGraph.readChangedPaths::
If true, then git will use the changed-path Bloom filters in the
commit-graph file (if it exists, and they are present). Defaults to
true. See linkgit:git-commit-graph[1] for more information.
Deprecated. Equivalent to changedPathsVersion=1 if true, and
changedPathsVersion=0 if false.

commitGraph.changedPathsVersion::
Specifies the version of the changed-path Bloom filters that Git will read and
write. May be 0, 1, or 2. Any changed-path Bloom filters on disk that do not
match the version set in this config variable will be ignored.
+
Defaults to 1.
+
If 0, git will write version 1 Bloom filters when instructed to write.
+
See linkgit:git-commit-graph[1] for more information.

65
bloom.c

@ -66,7 +66,64 @@ static int load_bloom_filter_from_graph(struct commit_graph *g, @@ -66,7 +66,64 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
* Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
const uint32_t r1 = 15;
const uint32_t r2 = 13;
const uint32_t m = 5;
const uint32_t n = 0xe6546b64;
int i;
uint32_t k1 = 0;
const char *tail;

int len4 = len / sizeof(uint32_t);

uint32_t k;
for (i = 0; i < len4; i++) {
uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
k = byte1 | byte2 | byte3 | byte4;
k *= c1;
k = rotate_left(k, r1);
k *= c2;

seed ^= k;
seed = rotate_left(seed, r2) * m + n;
}

tail = (data + len4 * sizeof(uint32_t));

switch (len & (sizeof(uint32_t) - 1)) {
case 3:
k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
/*-fallthrough*/
case 2:
k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
/*-fallthrough*/
case 1:
k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
k1 *= c1;
k1 = rotate_left(k1, r1);
k1 *= c2;
seed ^= k1;
break;
}

seed ^= (uint32_t)len;
seed ^= (seed >> 16);
seed *= 0x85ebca6b;
seed ^= (seed >> 13);
seed *= 0xc2b2ae35;
seed ^= (seed >> 16);

return seed;
}

static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
{
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
@ -131,8 +188,10 @@ void fill_bloom_key(const char *data, @@ -131,8 +188,10 @@ void fill_bloom_key(const char *data,
int i;
const uint32_t seed0 = 0x293ae76f;
const uint32_t seed1 = 0x7e646e2c;
const uint32_t hash0 = murmur3_seeded(seed0, data, len);
const uint32_t hash1 = murmur3_seeded(seed1, data, len);
const uint32_t hash0 = (settings->hash_version == 2
? murmur3_seeded_v2 : murmur3_seeded_v1)(seed0, data, len);
const uint32_t hash1 = (settings->hash_version == 2
? murmur3_seeded_v2 : murmur3_seeded_v1)(seed1, data, len);

key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
for (i = 0; i < settings->num_hashes; i++)

8
bloom.h

@ -7,9 +7,11 @@ struct repository; @@ -7,9 +7,11 @@ struct repository;
struct bloom_filter_settings {
/*
* The version of the hashing technique being used.
* We currently only support version = 1 which is
* The newest version is 2, which is
* the seeded murmur3 hashing technique implemented
* in bloom.c.
* in bloom.c. Bloom filters of version 1 were created
* with prior versions of Git, which had a bug in the
* implementation of the hash function.
*/
uint32_t hash_version;

@ -75,7 +77,7 @@ struct bloom_key { @@ -75,7 +77,7 @@ struct bloom_key {
* Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len);

void fill_bloom_key(const char *data,
size_t len,

29
commit-graph.c

@ -305,15 +305,21 @@ static int graph_read_oid_lookup(const unsigned char *chunk_start, @@ -305,15 +305,21 @@ static int graph_read_oid_lookup(const unsigned char *chunk_start,
return 0;
}

struct graph_read_bloom_data_data {
struct commit_graph *g;
int commit_graph_changed_paths_version;
};

static int graph_read_bloom_data(const unsigned char *chunk_start,
size_t chunk_size, void *data)
{
struct commit_graph *g = data;
struct graph_read_bloom_data_data *d = data;
struct commit_graph *g = d->g;
uint32_t hash_version;
g->chunk_bloom_data = chunk_start;
hash_version = get_be32(chunk_start);

if (hash_version != 1)
if (hash_version != d->commit_graph_changed_paths_version)
return 0;

g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
@ -402,11 +408,16 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s, @@ -402,11 +408,16 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s,
graph->read_generation_data = 1;
}

if (s->commit_graph_read_changed_paths) {
if (s->commit_graph_changed_paths_version == 1
|| s->commit_graph_changed_paths_version == 2) {
struct graph_read_bloom_data_data data = {
.g = graph,
.commit_graph_changed_paths_version = s->commit_graph_changed_paths_version
};
pair_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES,
&graph->chunk_bloom_indexes);
read_chunk(cf, GRAPH_CHUNKID_BLOOMDATA,
graph_read_bloom_data, graph);
graph_read_bloom_data, &data);
}

if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
@ -2372,6 +2383,14 @@ int write_commit_graph(struct object_directory *odb, @@ -2372,6 +2383,14 @@ int write_commit_graph(struct object_directory *odb,
ctx->write_generation_data = (get_configured_generation_version(r) == 2);
ctx->num_generation_data_overflows = 0;

if (r->settings.commit_graph_changed_paths_version < 0
|| r->settings.commit_graph_changed_paths_version > 2) {
warning(_("attempting to write a commit-graph, but 'commitgraph.changedPathsVersion' (%d) is not supported"),
r->settings.commit_graph_changed_paths_version);
return 0;
}
bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version == 2
? 2 : 1;
bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY",
bloom_settings.bits_per_entry);
bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES",
@ -2401,7 +2420,7 @@ int write_commit_graph(struct object_directory *odb, @@ -2401,7 +2420,7 @@ int write_commit_graph(struct object_directory *odb,
g = ctx->r->objects->commit_graph;

/* We have changed-paths already. Keep them in the next graph */
if (g && g->chunk_bloom_data) {
if (g && g->bloom_filter_settings) {
ctx->changed_paths = 1;
ctx->bloom_settings = g->bloom_filter_settings;
}

2
oss-fuzz/fuzz-commit-graph.c

@ -19,7 +19,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) @@ -19,7 +19,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
* possible.
*/
the_repository->settings.commit_graph_generation_version = 2;
the_repository->settings.commit_graph_read_changed_paths = 1;
the_repository->settings.commit_graph_changed_paths_version = 1;
g = parse_commit_graph(&the_repository->settings, (void *)data, size);
repo_clear(the_repository);
free_commit_graph(g);

6
repo-settings.c

@ -24,6 +24,7 @@ void prepare_repo_settings(struct repository *r) @@ -24,6 +24,7 @@ void prepare_repo_settings(struct repository *r)
int value;
const char *strval;
int manyfiles;
int readChangedPaths;

if (!r->gitdir)
BUG("Cannot add settings for uninitialized repository");
@ -54,7 +55,10 @@ void prepare_repo_settings(struct repository *r) @@ -54,7 +55,10 @@ void prepare_repo_settings(struct repository *r)
/* Commit graph config or default, does not cascade (simple) */
repo_cfg_bool(r, "core.commitgraph", &r->settings.core_commit_graph, 1);
repo_cfg_int(r, "commitgraph.generationversion", &r->settings.commit_graph_generation_version, 2);
repo_cfg_bool(r, "commitgraph.readchangedpaths", &r->settings.commit_graph_read_changed_paths, 1);
repo_cfg_bool(r, "commitgraph.readchangedpaths", &readChangedPaths, 1);
repo_cfg_int(r, "commitgraph.changedpathsversion",
&r->settings.commit_graph_changed_paths_version,
readChangedPaths ? 1 : 0);
repo_cfg_bool(r, "gc.writecommitgraph", &r->settings.gc_write_commit_graph, 1);
repo_cfg_bool(r, "fetch.writecommitgraph", &r->settings.fetch_write_commit_graph, 0);


2
repository.h

@ -29,7 +29,7 @@ struct repo_settings { @@ -29,7 +29,7 @@ struct repo_settings {

int core_commit_graph;
int commit_graph_generation_version;
int commit_graph_read_changed_paths;
int commit_graph_changed_paths_version;
int gc_write_commit_graph;
int fetch_write_commit_graph;
int command_requires_full_index;

9
t/helper/test-bloom.c

@ -50,6 +50,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid) @@ -50,6 +50,7 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid)

static const char *bloom_usage = "\n"
" test-tool bloom get_murmur3 <string>\n"
" test-tool bloom get_murmur3_seven_highbit\n"
" test-tool bloom generate_filter <string> [<string>...]\n"
" test-tool bloom get_filter_for_commit <commit-hex>\n";

@ -64,7 +65,13 @@ int cmd__bloom(int argc, const char **argv) @@ -64,7 +65,13 @@ int cmd__bloom(int argc, const char **argv)
uint32_t hashed;
if (argc < 3)
usage(bloom_usage);
hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2]));
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
}

if (!strcmp(argv[1], "get_murmur3_seven_highbit")) {
uint32_t hashed;
hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7);
printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
}


8
t/t0095-bloom.sh

@ -29,6 +29,14 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' ' @@ -29,6 +29,14 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' '
test_cmp expect actual
'

test_expect_success 'compute unseeded murmur3 hash for test string 3' '
cat >expect <<-\EOF &&
Murmur3 Hash with seed=0:0xa183ccfd
EOF
test-tool bloom get_murmur3_seven_highbit >actual &&
test_cmp expect actual
'

test_expect_success 'compute bloom key for empty string' '
cat >expect <<-\EOF &&
Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004|

65
t/t4216-log-bloom.sh

@ -404,4 +404,69 @@ test_expect_success 'Bloom generation backfills empty commits' ' @@ -404,4 +404,69 @@ test_expect_success 'Bloom generation backfills empty commits' '
)
'

get_bdat_offset () {
perl -0777 -ne \
'print unpack("N", "$1") if /BDAT\0\0\0\0(....)/ or exit 1' \
.git/objects/info/commit-graph
}

get_first_changed_path_filter () {
BDAT_OFFSET=$(get_bdat_offset) &&
perl -0777 -ne \
'print unpack("H*", substr($_, '$BDAT_OFFSET' + 12, 2))' \
.git/objects/info/commit-graph
}

# chosen to be the same under all Unicode normalization forms
CENT=$(printf "\xc2\xa2")

test_expect_success 'set up repo with high bit path, version 1 changed-path' '
git init highbit1 &&
test_commit -C highbit1 c1 "$CENT" &&
git -C highbit1 commit-graph write --reachable --changed-paths
'

test_expect_success 'check value of version 1 changed-path' '
(cd highbit1 &&
printf "52a9" >expect &&
get_first_changed_path_filter >actual &&
test_cmp expect actual)
'

test_expect_success 'version 1 changed-path used when version 1 requested' '
(cd highbit1 &&
test_bloom_filters_used "-- $CENT")
'

test_expect_success 'version 1 changed-path not used when version 2 requested' '
(cd highbit1 &&
git config --add commitgraph.changedPathsVersion 2 &&
test_bloom_filters_not_used "-- $CENT")
'

test_expect_success 'set up repo with high bit path, version 2 changed-path' '
git init highbit2 &&
git -C highbit2 config --add commitgraph.changedPathsVersion 2 &&
test_commit -C highbit2 c2 "$CENT" &&
git -C highbit2 commit-graph write --reachable --changed-paths
'

test_expect_success 'check value of version 2 changed-path' '
(cd highbit2 &&
printf "c01f" >expect &&
get_first_changed_path_filter >actual &&
test_cmp expect actual)
'

test_expect_success 'version 2 changed-path used when version 2 requested' '
(cd highbit2 &&
test_bloom_filters_used "-- $CENT")
'

test_expect_success 'version 2 changed-path not used when version 1 requested' '
(cd highbit2 &&
git config --add commitgraph.changedPathsVersion 1 &&
test_bloom_filters_not_used "-- $CENT")
'

test_done

Loading…
Cancel
Save