cbtree: allow using arbitrary wrapper structures for nodes

The cbtree subsystem allows the user to store arbitrary data in a
prefix-free set of strings. This is used by us to store object IDs in a
way that we can easily iterate through them in lexicograph order, and so
that we can easily perform lookups with shortened object IDs.

In its current form, it is not easily possible to store arbitrary data
with the tree nodes. There are a couple of approaches such a caller
could try to use, but none of them really work:

  - One may embed the `struct cb_node` in a custom structure. This does
    not work though as `struct cb_node` contains a flex array, and
    embedding such a struct in another struct is forbidden.

  - One may use a `union` over `struct cb_node` and ones own data type,
    which _is_ allowed even if the struct contains a flex array. This
    does not work though, as the compiler may align members of the
    struct so that the node key would not immediately start where the
    flex array starts.

  - One may allocate `struct cb_node` such that it has room for both its
    key and the custom data. This has the downside though that if the
    custom data is itself a pointer to allocated memory, then the leak
    checker will not consider the pointer to be alive anymore.

Refactor the cbtree to drop the flex array and instead take in an
explicit offset for where to find the key, which allows the caller to
embed `struct cb_node` is a wrapper struct.

Note that this change has the downside that we now have a bit of padding
in our structure, which grows the size from 60 to 64 bytes on a 64 bit
system. On the other hand though, it allows us to get rid of the memory
copies that we previously had to do to ensure proper alignment. This
seems like a reasonable tradeoff.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
main
Patrick Steinhardt 2026-04-10 14:12:38 +02:00 committed by Junio C Hamano
parent 197c8a85e3
commit 550d7b7c89
3 changed files with 41 additions and 34 deletions

View File

@ -7,6 +7,11 @@
#include "git-compat-util.h"
#include "cbtree.h"

static inline uint8_t *cb_node_key(struct cb_tree *t, struct cb_node *node)
{
return (uint8_t *) node + t->key_offset;
}

static struct cb_node *cb_node_of(const void *p)
{
return (struct cb_node *)((uintptr_t)p - 1);
@ -33,6 +38,7 @@ struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen)
uint8_t c;
int newdirection;
struct cb_node **wherep, *p;
uint8_t *node_key, *p_key;

assert(!((uintptr_t)node & 1)); /* allocations must be aligned */

@ -41,23 +47,26 @@ struct cb_node *cb_insert(struct cb_tree *t, struct cb_node *node, size_t klen)
return NULL; /* success */
}

node_key = cb_node_key(t, node);

/* see if a node already exists */
p = cb_internal_best_match(t->root, node->k, klen);
p = cb_internal_best_match(t->root, node_key, klen);
p_key = cb_node_key(t, p);

/* find first differing byte */
for (newbyte = 0; newbyte < klen; newbyte++) {
if (p->k[newbyte] != node->k[newbyte])
if (p_key[newbyte] != node_key[newbyte])
goto different_byte_found;
}
return p; /* element exists, let user deal with it */

different_byte_found:
newotherbits = p->k[newbyte] ^ node->k[newbyte];
newotherbits = p_key[newbyte] ^ node_key[newbyte];
newotherbits |= newotherbits >> 1;
newotherbits |= newotherbits >> 2;
newotherbits |= newotherbits >> 4;
newotherbits = (newotherbits & ~(newotherbits >> 1)) ^ 255;
c = p->k[newbyte];
c = p_key[newbyte];
newdirection = (1 + (newotherbits | c)) >> 8;

node->byte = newbyte;
@ -78,7 +87,7 @@ different_byte_found:
break;
if (q->byte == newbyte && q->otherbits > newotherbits)
break;
c = q->byte < klen ? node->k[q->byte] : 0;
c = q->byte < klen ? node_key[q->byte] : 0;
direction = (1 + (q->otherbits | c)) >> 8;
wherep = q->child + direction;
}
@ -93,7 +102,7 @@ struct cb_node *cb_lookup(struct cb_tree *t, const uint8_t *k, size_t klen)
{
struct cb_node *p = cb_internal_best_match(t->root, k, klen);

return p && !memcmp(p->k, k, klen) ? p : NULL;
return p && !memcmp(cb_node_key(t, p), k, klen) ? p : NULL;
}

static int cb_descend(struct cb_node *p, cb_iter fn, void *arg)
@ -115,6 +124,7 @@ int cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen,
struct cb_node *p = t->root;
struct cb_node *top = p;
size_t i = 0;
uint8_t *p_key;

if (!p)
return 0; /* empty tree */
@ -130,8 +140,9 @@ int cb_each(struct cb_tree *t, const uint8_t *kpfx, size_t klen,
top = p;
}

p_key = cb_node_key(t, p);
for (i = 0; i < klen; i++) {
if (p->k[i] != kpfx[i])
if (p_key[i] != kpfx[i])
return 0; /* "best" match failed */
}


View File

@ -6,9 +6,9 @@
*
* This is adapted to store arbitrary data (not just NUL-terminated C strings
* and allocates no memory internally. The user needs to allocate
* "struct cb_node" and fill cb_node.k[] with arbitrary match data
* for memcmp.
* If "klen" is variable, then it should be embedded into "c_node.k[]"
* "struct cb_node" and provide `key_offset` to indicate where the key can be
* found relative to the `struct cb_node` for memcmp.
* If "klen" is variable, then it should be embedded into the key.
* Recursion is bound by the maximum value of "klen" used.
*/
#ifndef CBTREE_H
@ -23,18 +23,19 @@ struct cb_node {
*/
uint32_t byte;
uint8_t otherbits;
uint8_t k[FLEX_ARRAY]; /* arbitrary data, unaligned */
};

struct cb_tree {
struct cb_node *root;
ptrdiff_t key_offset;
};

#define CBTREE_INIT { 0 }

static inline void cb_init(struct cb_tree *t)
static inline void cb_init(struct cb_tree *t,
ptrdiff_t key_offset)
{
struct cb_tree blank = CBTREE_INIT;
struct cb_tree blank = {
.key_offset = key_offset,
};
memcpy(t, &blank, sizeof(*t));
}


View File

@ -6,9 +6,14 @@
#include "oidtree.h"
#include "hash.h"

struct oidtree_node {
struct cb_node base;
struct object_id key;
};

void oidtree_init(struct oidtree *ot)
{
cb_init(&ot->tree);
cb_init(&ot->tree, offsetof(struct oidtree_node, key));
mem_pool_init(&ot->mem_pool, 0);
}

@ -22,20 +27,13 @@ void oidtree_clear(struct oidtree *ot)

void oidtree_insert(struct oidtree *ot, const struct object_id *oid)
{
struct cb_node *on;
struct object_id k;
struct oidtree_node *on;

if (!oid->algo)
BUG("oidtree_insert requires oid->algo");

on = mem_pool_alloc(&ot->mem_pool, sizeof(*on) + sizeof(*oid));

/*
* Clear the padding and copy the result in separate steps to
* respect the 4-byte alignment needed by struct object_id.
*/
oidcpy(&k, oid);
memcpy(on->k, &k, sizeof(k));
on = mem_pool_alloc(&ot->mem_pool, sizeof(*on));
oidcpy(&on->key, oid);

/*
* n.b. Current callers won't get us duplicates, here. If a
@ -43,7 +41,7 @@ void oidtree_insert(struct oidtree *ot, const struct object_id *oid)
* that won't be freed until oidtree_clear. Currently it's not
* worth maintaining a free list
*/
cb_insert(&ot->tree, on, sizeof(*oid));
cb_insert(&ot->tree, &on->base, sizeof(*oid));
}

bool oidtree_contains(struct oidtree *ot, const struct object_id *oid)
@ -73,21 +71,18 @@ struct oidtree_each_data {

static int iter(struct cb_node *n, void *cb_data)
{
struct oidtree_node *node = container_of(n, struct oidtree_node, base);
struct oidtree_each_data *data = cb_data;
struct object_id k;

/* Copy to provide 4-byte alignment needed by struct object_id. */
memcpy(&k, n->k, sizeof(k));

if (data->algo != GIT_HASH_UNKNOWN && data->algo != k.algo)
if (data->algo != GIT_HASH_UNKNOWN && data->algo != node->key.algo)
return 0;

if (data->last_nibble_at) {
if ((k.hash[*data->last_nibble_at] ^ data->last_byte) & 0xf0)
if ((node->key.hash[*data->last_nibble_at] ^ data->last_byte) & 0xf0)
return 0;
}

return data->cb(&k, data->cb_data);
return data->cb(&node->key, data->cb_data);
}

int oidtree_each(struct oidtree *ot, const struct object_id *prefix,