git/bloom.h

171 lines
4.9 KiB
C

#ifndef BLOOM_H
#define BLOOM_H
struct commit;
struct repository;
struct commit_graph;
/*
 * Parameters that govern how Bloom filters are hashed and sized.
 */
struct bloom_filter_settings {
	/*
	 * Which revision of the hashing scheme is in use. Version 2 is
	 * the most recent: the seeded murmur3 hash implemented in
	 * bloom.c. Version 1 filters were produced by earlier releases
	 * of Git whose hash function implementation contained a bug.
	 */
	uint32_t hash_version;

	/*
	 * How many times each path is hashed; equivalently, how many
	 * bit positions together decide whether a path counts as
	 * present in the Bloom filter.
	 */
	uint32_t num_hashes;

	/*
	 * Lower bound on the number of bits spent per entry. A filter
	 * holding 'n' entries occupies the smallest number of 8-bit
	 * words able to hold n * bits_per_entry bits.
	 */
	uint32_t bits_per_entry;

	/*
	 * Upper limit on the changed paths of a single commit; past
	 * this limit the Bloom filter is declared too large.
	 *
	 * This value is not stored in the commit-graph file.
	 */
	uint32_t max_changed_paths;
};
/* Default value for bloom_filter_settings.max_changed_paths. */
#define DEFAULT_BLOOM_MAX_CHANGES 512

/*
 * Positional initializer for struct bloom_filter_settings, in member
 * order: hash_version, num_hashes, bits_per_entry, max_changed_paths.
 */
#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES }

/* Number of bits in one filter word (filters are sized in 8-bit words). */
#define BITS_PER_WORD 8

/*
 * Size in bytes of three uint32_t values.
 * NOTE(review): presumably the fixed header of the Bloom data chunk;
 * confirm against the reader/writer code.
 *
 * The replacement list is parenthesized so that the macro behaves as a
 * single operand inside larger expressions, e.g.
 * `size / BLOOMDATA_CHUNK_HEADER_SIZE`.
 */
#define BLOOMDATA_CHUNK_HEADER_SIZE (3 * sizeof(uint32_t))
/*
 * A single Bloom filter: a data segment against which hash values
 * are tested. 'len' gives the number of entries held in 'data'.
 */
struct bloom_filter {
	/* The filter's data segment. */
	unsigned char *data;

	/* Number of entries stored in 'data'. */
	size_t len;

	/*
	 * NOTE(review): presumably the hash version the filter was
	 * built with (cf. bloom_filter_settings.hash_version) -- confirm.
	 */
	int version;

	/*
	 * NOTE(review): presumably an allocation to release when the
	 * filter is torn down -- confirm in bloom.c.
	 */
	void *to_free;
};
/*
 * The k hash values computed for one string.
 *
 * Computing them once and keeping them in a bloom_key lets the same
 * key be tested against many bloom_filters. How many hashes there are
 * comes from the Bloom filter settings; that count is identical for
 * every Bloom filter and key used with the loaded version of the
 * commit graph file and its Bloom data chunks.
 */
struct bloom_key {
	/* Array of hash values; its length is given by the settings. */
	uint32_t *hashes;
};
/*
 * A vector of bloom_keys. It allows several keys to be kept for one
 * pathspec item.
 */
struct bloom_keyvec {
	/* Number of elements in 'key'. */
	size_t count;

	/* The keys themselves, stored inline as a flexible array. */
	struct bloom_key key[FLEX_ARRAY];
};
/*
 * Load a Bloom filter from commit-graph 'g' into 'filter', selected by
 * 'graph_pos'.
 *
 * NOTE(review): presumably 'graph_pos' is the commit's position within
 * the graph and the return value signals whether a filter was loaded;
 * confirm both against the definition in bloom.c.
 */
int load_bloom_filter_from_graph(
	struct commit_graph *g,
	struct bloom_filter *filter,
	uint32_t graph_pos);
/*
 * Fill 'key' from the 'len' bytes at 'data', using 'settings'.
 *
 * NOTE(review): presumably this computes the hash values stored in
 * key->hashes, with the number of hashes taken from 'settings';
 * confirm in bloom.c.
 */
void bloom_key_fill(
	struct bloom_key *key,
	const char *data,
	size_t len,
	const struct bloom_filter_settings *settings);
/*
 * Clear 'key'.
 * NOTE(review): presumably releases the storage behind key->hashes;
 * confirm in bloom.c.
 */
void bloom_key_clear(
	struct bloom_key *key);
/*
 * bloom_keyvec_new - Build a freshly allocated bloom_keyvec holding the
 * keys for a path.
 *
 * The path is cut at every '/' and one bloom key is produced per
 * prefix, starting with the full path and ending with its first
 * component. For the input "a/b/c" the keys are, in order:
 *   - "a/b/c"
 *   - "a/b"
 *   - "a"
 *
 * The keys are returned inside the newly allocated bloom_keyvec.
 */
struct bloom_keyvec *bloom_keyvec_new(
	const char *path,
	size_t len,
	const struct bloom_filter_settings *settings);
/*
 * Free 'vec'.
 * NOTE(review): presumably the counterpart of bloom_keyvec_new(),
 * releasing the vector together with its keys; confirm in bloom.c.
 */
void bloom_keyvec_free(
	struct bloom_keyvec *vec);
/*
 * Add 'key' to 'filter' according to 'settings'. The key is only read;
 * the filter is the object being modified.
 */
void add_key_to_filter(
	const struct bloom_key *key,
	struct bloom_filter *filter,
	const struct bloom_filter_settings *settings);
/*
 * Set up / tear down the Bloom filter machinery.
 * NOTE(review): what state these manage is not visible in this header;
 * see the definitions in bloom.c.
 */
void init_bloom_filters(void);

void deinit_bloom_filters(void);
/*
 * Status values reported through the 'computed' out-parameter of
 * get_or_compute_bloom_filter(). Each constant occupies its own bit.
 *
 * NOTE(review): the precise meaning of each value is defined by the
 * code that sets it in bloom.c; the names suggest not computed,
 * computed, truncated (too large / empty) and upgraded -- confirm.
 */
enum bloom_filter_computed {
	BLOOM_NOT_COMPUTED = 0x01,
	BLOOM_COMPUTED     = 0x02,
	BLOOM_TRUNC_LARGE  = 0x04,
	BLOOM_TRUNC_EMPTY  = 0x08,
	BLOOM_UPGRADED     = 0x10,
};
/*
 * Return the Bloom filter for commit 'c' in repository 'r'.
 *
 * Unlike get_bloom_filter(), this is the entry point for callers that
 * want to inspect Bloom filters with incompatible hash versions.
 *
 * NOTE(review): presumably a missing filter is computed only when
 * 'compute_if_not_present' is non-zero, and '*computed' receives
 * bloom_filter_computed status bits -- confirm in bloom.c.
 */
struct bloom_filter *get_or_compute_bloom_filter(
	struct repository *r,
	struct commit *c,
	int compute_if_not_present,
	const struct bloom_filter_settings *settings,
	enum bloom_filter_computed *computed);
/*
 * Look up the Bloom filter that belongs to commit "c".
 *
 * `get_bloom_filter()` returns NULL when any of these holds:
 *
 *   - the repository has no commit-graph,
 *   - the repository disables reading from the commit-graph,
 *   - no Bloom filter has been computed for the given commit,
 *   - a Bloom filter exists for commit "c" but cannot be read because
 *     it uses an incompatible version of murmur3.
 *
 * In every other case the matching Bloom filter is returned.
 *
 * Callers that need to inspect Bloom filters with incompatible hash
 * versions should call get_or_compute_bloom_filter() instead.
 */
struct bloom_filter *get_bloom_filter(
	struct repository *r,
	struct commit *c);
/*
 * Test 'key' against 'filter' using 'settings'. Neither the filter
 * nor the key is modified.
 *
 * NOTE(review): the exact return values are defined in bloom.c;
 * confirm there before relying on anything beyond zero/non-zero.
 */
int bloom_filter_contains(
	const struct bloom_filter *filter,
	const struct bloom_key *key,
	const struct bloom_filter_settings *settings);
/*
 * bloom_filter_contains_vec - Test every key of a key vector against
 * the Bloom filter.
 *
 * Returns 1 only when **all** keys in 'v' are present in the filter;
 * returns 0 as soon as **any** key is absent.
 */
int bloom_filter_contains_vec(
	const struct bloom_filter *filter,
	const struct bloom_keyvec *v,
	const struct bloom_filter_settings *settings);
/*
 * Hash the 'len' bytes at 'data' with the given 'seed' and return the
 * 32-bit result.
 *
 * NOTE(review): judging by the name this exposes the seeded murmur3
 * hash for tests, with 'version' selecting the hash version; confirm
 * in bloom.c.
 */
uint32_t test_bloom_murmur3_seeded(
	uint32_t seed,
	const char *data,
	size_t len,
	int version);
#endif