object-file: extract logic to approximate object count

In "builtin/gc.c" we have some logic that checks whether we need to
repack objects. This is done by counting the number of objects that we
have and checking whether it exceeds a certain threshold. We don't
really need an accurate object count though, which is why we only
open a single object directory shard and then extrapolate from there.

Extract this logic into a new function that is owned by the loose object
database source. This is done to prepare for a subsequent change, where
we'll introduce object counting on the object database source level.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
maint
Patrick Steinhardt 2026-03-12 09:42:58 +01:00 committed by Junio C Hamano
parent dd587cd59e
commit 222fddeaa4
3 changed files with 62 additions and 27 deletions

View File

@ -467,37 +467,18 @@ out:
static int too_many_loose_objects(int limit)
{
/*
* Quickly check if a "gc" is needed, by estimating how
* many loose objects there are. Because SHA-1 is evenly
* distributed, we can check only one and get a reasonable
* estimate.
* This is weird, but stems from legacy behaviour: the GC auto
* threshold was always essentially interpreted as if it was rounded up
* to the next multiple of 256, so we retain this behaviour for now.
*/
DIR *dir;
struct dirent *ent;
int auto_threshold;
int num_loose = 0;
int needed = 0;
const unsigned hexsz_loose = the_hash_algo->hexsz - 2;
char *path;
int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
unsigned long loose_count;

path = repo_git_path(the_repository, "objects/17");
dir = opendir(path);
free(path);
if (!dir)
if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
&loose_count) < 0)
return 0;

auto_threshold = DIV_ROUND_UP(limit, 256);
while ((ent = readdir(dir)) != NULL) {
if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
ent->d_name[hexsz_loose] != '\0')
continue;
if (++num_loose > auto_threshold) {
needed = 1;
break;
}
}
closedir(dir);
return needed;
return loose_count > auto_threshold;
}

static struct packed_git *find_base_packs(struct string_list *packs,

View File

@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source,
NULL, NULL, &data);
}

/*
 * Estimate how many loose objects live in `source` by sampling the single
 * shard directory "<source->path>/17" and scaling the number of matching
 * entries by 256.
 *
 * Only entries whose name consists of exactly `hexsz - 2` lowercase hex
 * digits are counted; everything else in the directory is ignored.
 *
 * On success the estimate is stored in `*out` and 0 is returned. A shard
 * directory that does not exist is treated as success with an estimate of 0.
 * Any other failure to open the directory is reported via error_errno(), its
 * return value is passed through, and `*out` is left untouched.
 */
int odb_source_loose_approximate_object_count(struct odb_source *source,
					      unsigned long *out)
{
	static const char hex_digits[] = "0123456789abcdef";
	const unsigned name_len = source->odb->repo->hash_algo->hexsz - 2;
	char *shard_path = xstrfmt("%s/17", source->path);
	unsigned long seen = 0;
	struct dirent *entry;
	DIR *shard;
	int result = 0;

	shard = opendir(shard_path);
	if (!shard) {
		/* A missing shard simply means there is nothing to count. */
		if (errno == ENOENT)
			*out = 0;
		else
			result = error_errno("cannot open object shard '%s'", shard_path);
		free(shard_path);
		return result;
	}

	for (entry = readdir(shard); entry; entry = readdir(shard)) {
		const char *name = entry->d_name;

		/* Count only names that are hex digits of the expected length. */
		if (strspn(name, hex_digits) == name_len && !name[name_len])
			seen++;
	}

	closedir(shard);
	free(shard_path);

	/* One shard was sampled; extrapolate by the factor of 256. */
	*out = seen * 256;
	return 0;
}

static int append_loose_object(const struct object_id *oid,
const char *path UNUSED,
void *data)

View File

@ -139,6 +139,19 @@ int odb_source_loose_for_each_object(struct odb_source *source,
void *cb_data,
unsigned flags);

/*
* Approximate the number of loose objects in this source.
*
* The object count is approximated by opening a single sharding directory for
* loose objects and scanning its contents. The number of entries found is then
* multiplied by 256 to extrapolate to the whole source. This should generally
* work as a reasonable estimate given that the object hash is supposed to be
* indistinguishable from random.
*
* The estimate is written to `out`. A sharding directory that does not exist
* is not an error and yields an estimate of 0. If the directory cannot be
* opened for any other reason, `out` is left untouched.
*
* Returns 0 on success, a negative error code otherwise.
*/
int odb_source_loose_approximate_object_count(struct odb_source *source,
unsigned long *out);

/**
* format_object_header() is a thin wrapper around s xsnprintf() that
* writes the initial "<type> <obj-len>" part of the loose object