From 550d7b7c89a9cf80794c72e8c7d036164a5b1927 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 10 Apr 2026 14:12:38 +0200 Subject: [PATCH] cbtree: allow using arbitrary wrapper structures for nodes The cbtree subsystem allows the user to store arbitrary data in a prefix-free set of strings. This is used by us to store object IDs in a way that we can easily iterate through them in lexicograph order, and so that we can easily perform lookups with shortened object IDs. In its current form, it is not easily possible to store arbitrary data with the tree nodes. There are a couple of approaches such a caller could try to use, but none of them really work: - One may embed the `struct cb_node` in a custom structure. This does not work though as `struct cb_node` contains a flex array, and embedding such a struct in another struct is forbidden. - One may use a `union` over `struct cb_node` and ones own data type, which _is_ allowed even if the struct contains a flex array. This does not work though, as the compiler may align members of the struct so that the node key would not immediately start where the flex array starts. - One may allocate `struct cb_node` such that it has room for both its key and the custom data. This has the downside though that if the custom data is itself a pointer to allocated memory, then the leak checker will not consider the pointer to be alive anymore. Refactor the cbtree to drop the flex array and instead take in an explicit offset for where to find the key, which allows the caller to embed `struct cb_node` is a wrapper struct. Note that this change has the downside that we now have a bit of padding in our structure, which grows the size from 60 to 64 bytes on a 64 bit system. On the other hand though, it allows us to get rid of the memory copies that we previously had to do to ensure proper alignment. This seems like a reasonable tradeoff. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- cbtree.c | 25 ++++++++++++++++++------- cbtree.h | 17 +++++++++-------- oidtree.c | 33 ++++++++++++++------------------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/cbtree.c b/cbtree.c index 4ab794bddc..8f5edbb80a 100644 --- a/cbtree.c +++ b/cbtree.c @@ -7,6 +7,11 @@ #include "git-compat-util.h" #include "cbtree.h" +static inline uint8_t *cb_node_key(struct cb_tree *t, struct cb_node *node) +{ + return (uint8_t *) node + t->key_offset; +} + static struct cb_node *cb_node_of(const void *p) { return (struct cb_node *)((uintptr_t)p - 1); @@ -33,6 +38,7 @@ struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen) uint8_t c; int newdirection; struct cb_node **wherep, *p; + uint8_t *node_key, *p_key; assert(!((uintptr_t)node & 1)); /* allocations must be aligned */ @@ -41,23 +47,26 @@ struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen) return NULL; /* success */ } + node_key = cb_node_key(t, node); + /* see if a node already exists */ - p = cb_internal_best_match(t->root, node->k, klen); + p = cb_internal_best_match(t->root, node_key, klen); + p_key = cb_node_key(t, p); /* find first differing byte */ for (newbyte = 0; newbyte < klen; newbyte++) { - if (p->k[newbyte] != node->k[newbyte]) + if (p_key[newbyte] != node_key[newbyte]) goto different_byte_found; } return p; /* element exists, let user deal with it */ different_byte_found: - newotherbits = p->k[newbyte] ^ node->k[newbyte]; + newotherbits = p_key[newbyte] ^ node_key[newbyte]; newotherbits |= newotherbits >> 1; newotherbits |= newotherbits >> 2; newotherbits |= newotherbits >> 4; newotherbits = (newotherbits & ~(newotherbits >> 1)) ^ 255; - c = p->k[newbyte]; + c = p_key[newbyte]; newdirection = (1 + (newotherbits | c)) >> 8; node->byte = newbyte; @@ -78,7 +87,7 @@ different_byte_found: break; if (q->byte == newbyte && q->otherbits > newotherbits) break; - c = q->byte < klen ? node->k[q->byte] : 0; + c = q->byte < klen ? node_key[q->byte] : 0; direction = (1 + (q->otherbits | c)) >> 8; wherep = q->child + direction; } @@ -93,7 +102,7 @@ struct cb_node *cb_lookup(struct cb_tree *t, const uint8_t *k, size_t klen) { struct cb_node *p = cb_internal_best_match(t->root, k, klen); - return p && !memcmp(p->k, k, klen) ? p : NULL; + return p && !memcmp(cb_node_key(t, p), k, klen) ? p : NULL; } static int cb_descend(struct cb_node *p, cb_iter fn, void *arg) @@ -115,6 +124,7 @@ int cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen, struct cb_node *p = t->root; struct cb_node *top = p; size_t i = 0; + uint8_t *p_key; if (!p) return 0; /* empty tree */ @@ -130,8 +140,9 @@ int cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen, top = p; } + p_key = cb_node_key(t, p); for (i = 0; i < klen; i++) { - if (p->k[i] != kpfx[i]) + if (p_key[i] != kpfx[i]) return 0; /* "best" match failed */ } diff --git a/cbtree.h b/cbtree.h index c374b1b3db..4647d4a32f 100644 --- a/cbtree.h +++ b/cbtree.h @@ -6,9 +6,9 @@ * * This is adapted to store arbitrary data (not just NUL-terminated C strings * and allocates no memory internally. The user needs to allocate - * "struct cb_node" and fill cb_node.k[] with arbitrary match data - * for memcmp. - * If "klen" is variable, then it should be embedded into "c_node.k[]" + * "struct cb_node" and provide `key_offset` to indicate where the key can be + * found relative to the `struct cb_node` for memcmp. + * If "klen" is variable, then it should be embedded into the key. * Recursion is bound by the maximum value of "klen" used. */ #ifndef CBTREE_H @@ -23,18 +23,19 @@ struct cb_node { */ uint32_t byte; uint8_t otherbits; - uint8_t k[FLEX_ARRAY]; /* arbitrary data, unaligned */ }; struct cb_tree { struct cb_node *root; + ptrdiff_t key_offset; }; -#define CBTREE_INIT { 0 } - -static inline void cb_init(struct cb_tree *t) +static inline void cb_init(struct cb_tree *t, + ptrdiff_t key_offset) { - struct cb_tree blank = CBTREE_INIT; + struct cb_tree blank = { + .key_offset = key_offset, + }; memcpy(t, &blank, sizeof(*t)); } diff --git a/oidtree.c b/oidtree.c index ab9fe7ec7a..117649753f 100644 --- a/oidtree.c +++ b/oidtree.c @@ -6,9 +6,14 @@ #include "oidtree.h" #include "hash.h" +struct oidtree_node { + struct cb_node base; + struct object_id key; +}; + void oidtree_init(struct oidtree *ot) { - cb_init(&ot->tree); + cb_init(&ot->tree, offsetof(struct oidtree_node, key)); mem_pool_init(&ot->mem_pool, 0); } @@ -22,20 +27,13 @@ void oidtree_clear(struct oidtree *ot) void oidtree_insert(struct oidtree *ot, const struct object_id *oid) { - struct cb_node *on; - struct object_id k; + struct oidtree_node *on; if (!oid->algo) BUG("oidtree_insert requires oid->algo"); - on = mem_pool_alloc(&ot->mem_pool, sizeof(*on) + sizeof(*oid)); - - /* - * Clear the padding and copy the result in separate steps to - * respect the 4-byte alignment needed by struct object_id. - */ - oidcpy(&k, oid); - memcpy(on->k, &k, sizeof(k)); + on = mem_pool_alloc(&ot->mem_pool, sizeof(*on)); + oidcpy(&on->key, oid); /* * n.b. Current callers won't get us duplicates, here. If a @@ -43,7 +41,7 @@ void oidtree_insert(struct oidtree *ot, const struct object_id *oid) * that won't be freed until oidtree_clear. Currently it's not * worth maintaining a free list */ - cb_insert(&ot->tree, on, sizeof(*oid)); + cb_insert(&ot->tree, &on->base, sizeof(*oid)); } bool oidtree_contains(struct oidtree *ot, const struct object_id *oid) @@ -73,21 +71,18 @@ struct oidtree_each_data { static int iter(struct cb_node *n, void *cb_data) { + struct oidtree_node *node = container_of(n, struct oidtree_node, base); struct oidtree_each_data *data = cb_data; - struct object_id k; - /* Copy to provide 4-byte alignment needed by struct object_id. */ - memcpy(&k, n->k, sizeof(k)); - - if (data->algo != GIT_HASH_UNKNOWN && data->algo != k.algo) + if (data->algo != GIT_HASH_UNKNOWN && data->algo != node->key.algo) return 0; if (data->last_nibble_at) { - if ((k.hash[*data->last_nibble_at] ^ data->last_byte) & 0xf0) + if ((node->key.hash[*data->last_nibble_at] ^ data->last_byte) & 0xf0) return 0; } - return data->cb(&k, data->cb_data); + return data->cb(&node->key, data->cb_data); } int oidtree_each(struct oidtree *ot, const struct object_id *prefix,