Merge branch 'ds/name-hash-tweaks'

"git pack-objects" and its wrapper "git repack" learned an option
to use an alternative path-hash function to improve delta-base
selection to produce a packfile with deeper history than window
size.

* ds/name-hash-tweaks:
  pack-objects: prevent name hash version change
  test-tool: add helper for name-hash values
  p5313: add size comparison test
  pack-objects: add GIT_TEST_NAME_HASH_VERSION
  repack: add --name-hash-version option
  pack-objects: add --name-hash-version option
  pack-objects: create new name-hash function version
maint
Junio C Hamano 2025-02-12 10:08:51 -08:00
commit aae91a86fb
22 changed files with 389 additions and 16 deletions

View File

@ -15,7 +15,8 @@ SYNOPSIS
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
[--cruft] [--cruft-expiration=<time>]
[--stdout [--filter=<filter-spec>] | <base-name>]
[--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
[--shallow] [--keep-true-parents] [--[no-]sparse]
[--name-hash-version=<n>] < <object-list>


DESCRIPTION
@ -345,6 +346,35 @@ raise an error.
Restrict delta matches based on "islands". See DELTA ISLANDS
below.

--name-hash-version=<n>::
While performing delta compression, Git groups objects that may be
similar based on heuristics using the path to that object. While
grouping objects by an exact path match is good for paths with
many versions, there are benefits for finding delta pairs across
different full paths. Git collects objects by type and then by a
"name hash" of the path and then by size, hoping to group objects
that will compress well together.
+
The default name hash version is `1`, which prioritizes hash locality by
considering the final bytes of the path as providing the maximum magnitude
to the hash function. This version excels at distinguishing short paths
and finding renames across directories. However, the hash function depends
primarily on the final 16 bytes of the path. If there are many paths in
the repo that have the same final 16 bytes and differ only by parent
directory, then this name-hash may lead to too many collisions and cause
poor results. At the moment, this version is required when writing
reachability bitmap files with `--write-bitmap-index`.
+
The name hash version `2` has similar locality features as version `1`,
except it considers each path component separately and overlays the hashes
with a shift. This still prioritizes the final bytes of the path, but also
"salts" the lower bits of the hash using the parent directory names. This
method allows for some of the locality benefits of version `1` while
breaking most of the collisions from a similarly-named file appearing in
many different directories. At the moment, this version is not allowed
when writing reachability bitmap files with `--write-bitmap-index` and it
will be automatically changed to version `1`.


DELTA ISLANDS
-------------

View File

@ -9,7 +9,9 @@ git-repack - Pack unpacked objects in a repository
SYNOPSIS
--------
[verse]
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>] [--write-midx]
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
[--write-midx] [--name-hash-version=<n>]

DESCRIPTION
-----------
@ -249,6 +251,11 @@ linkgit:git-multi-pack-index[1]).
Write a multi-pack index (see linkgit:git-multi-pack-index[1])
containing the non-redundant packs.

--name-hash-version=<n>::
Provide this argument to the underlying `git pack-objects` process.
See linkgit:git-pack-objects[1] for full details.


CONFIGURATION
-------------


View File

@ -813,6 +813,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
TEST_BUILTINS_OBJS += test-match-trees.o
TEST_BUILTINS_OBJS += test-mergesort.o
TEST_BUILTINS_OBJS += test-mktemp.o
TEST_BUILTINS_OBJS += test-name-hash.o
TEST_BUILTINS_OBJS += test-online-cpus.o
TEST_BUILTINS_OBJS += test-pack-mtimes.o
TEST_BUILTINS_OBJS += test-parse-options.o

View File

@ -269,6 +269,43 @@ struct configured_exclusion {
static struct oidmap configured_exclusions;

static struct oidset excluded_by_config;
static int name_hash_version = -1;

/**
* Check whether the name_hash_version chosen by user input is appropriate,
* and also validate whether it is compatible with other features.
*/
static void validate_name_hash_version(void)
{
if (name_hash_version < 1 || name_hash_version > 2)
die(_("invalid --name-hash-version option: %d"), name_hash_version);
if (write_bitmap_index && name_hash_version != 1) {
warning(_("currently, --write-bitmap-index requires --name-hash-version=1"));
name_hash_version = 1;
}
}

static inline uint32_t pack_name_hash_fn(const char *name)
{
static int seen_version = -1;

if (seen_version < 0)
seen_version = name_hash_version;
else if (seen_version != name_hash_version)
BUG("name hash version changed from %d to %d mid-process",
seen_version, name_hash_version);

switch (name_hash_version) {
case 1:
return pack_name_hash(name);

case 2:
return pack_name_hash_v2((const unsigned char *)name);

default:
BUG("invalid name-hash version: %d", name_hash_version);
}
}

/*
* stats
@ -1689,7 +1726,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
return 0;
}

create_object_entry(oid, type, pack_name_hash(name),
create_object_entry(oid, type, pack_name_hash_fn(name),
exclude, name && no_try_delta(name),
found_pack, found_offset);
return 1;
@ -1903,7 +1940,7 @@ static void add_preferred_base_object(const char *name)
{
struct pbase_tree *it;
size_t cmplen;
unsigned hash = pack_name_hash(name);
unsigned hash = pack_name_hash_fn(name);

if (!num_preferred_base || check_pbase_path(hash))
return;
@ -3415,7 +3452,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
* here using a now in order to perhaps improve the delta selection
* process.
*/
oe->hash = pack_name_hash(name);
oe->hash = pack_name_hash_fn(name);
oe->no_try_delta = name && no_try_delta(name);

stdin_packs_hints_nr++;
@ -3565,7 +3602,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
entry = packlist_find(&to_pack, oid);
if (entry) {
if (name) {
entry->hash = pack_name_hash(name);
entry->hash = pack_name_hash_fn(name);
entry->no_try_delta = no_try_delta(name);
}
} else {
@ -3588,7 +3625,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
return;
}

entry = create_object_entry(oid, type, pack_name_hash(name),
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
0, name && no_try_delta(name),
pack, offset);
}
@ -4068,6 +4105,15 @@ static int get_object_list_from_bitmap(struct rev_info *revs)
if (!(bitmap_git = prepare_bitmap_walk(revs, 0)))
return -1;

/*
* For now, force the name-hash version to be 1 since that
* is the version implied by the bitmap format. Later, the
* format can include this version explicitly in its format,
* allowing readers to know the version that was used during
* the bitmap write.
*/
name_hash_version = 1;

if (pack_options_allow_reuse())
reuse_partial_packfile_from_bitmap(bitmap_git,
&reuse_packfiles,
@ -4443,6 +4489,8 @@ int cmd_pack_objects(int argc,
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
N_("protocol"),
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
OPT_INTEGER(0, "name-hash-version", &name_hash_version,
N_("use the specified name-hash function to group similar objects")),
OPT_END(),
};

@ -4598,6 +4646,11 @@ int cmd_pack_objects(int argc,
if (pack_to_stdout || !rev_list_all)
write_bitmap_index = 0;

if (name_hash_version < 0)
name_hash_version = (int)git_env_ulong("GIT_TEST_NAME_HASH_VERSION", 1);

validate_name_hash_version();

if (use_delta_islands)
strvec_push(&rp, "--topo-order");


View File

@ -41,7 +41,9 @@ static int run_update_server_info = 1;
static char *packdir, *packtmp_name, *packtmp;

static const char *const git_repack_usage[] = {
N_("git repack [<options>]"),
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
"[--write-midx] [--name-hash-version=<n>]"),
NULL
};

@ -60,6 +62,7 @@ struct pack_objects_args {
int no_reuse_object;
int quiet;
int local;
int name_hash_version;
struct list_objects_filter_options filter_options;
};

@ -308,6 +311,8 @@ static void prepare_pack_objects(struct child_process *cmd,
strvec_pushf(&cmd->args, "--no-reuse-delta");
if (args->no_reuse_object)
strvec_pushf(&cmd->args, "--no-reuse-object");
if (args->name_hash_version)
strvec_pushf(&cmd->args, "--name-hash-version=%d", args->name_hash_version);
if (args->local)
strvec_push(&cmd->args, "--local");
if (args->quiet)
@ -1205,6 +1210,8 @@ int cmd_repack(int argc,
N_("pass --no-reuse-delta to git-pack-objects")),
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
N_("pass --no-reuse-object to git-pack-objects")),
OPT_INTEGER(0, "name-hash-version", &po_args.name_hash_version,
N_("specify the name hash version to use for grouping similar objects by path")),
OPT_NEGBIT('n', NULL, &run_update_server_info,
N_("do not run git-update-server-info"), 1),
OPT__QUIET(&po_args.quiet, N_("be quiet")),

View File

@ -208,6 +208,34 @@ static inline uint32_t pack_name_hash(const char *name)
return hash;
}

static inline uint32_t pack_name_hash_v2(const unsigned char *name)
{
uint32_t hash = 0, base = 0, c;

if (!name)
return 0;

while ((c = *name++)) {
if (isspace(c))
continue;
if (c == '/') {
base = (base >> 6) ^ hash;
hash = 0;
} else {
/*
* 'c' is only a single byte. Reverse it and move
* it to the top of the hash, moving the rest to
* less-significant bits.
*/
c = (c & 0xF0) >> 4 | (c & 0x0F) << 4;
c = (c & 0xCC) >> 2 | (c & 0x33) << 2;
c = (c & 0xAA) >> 1 | (c & 0x55) << 1;
hash = (hash >> 2) + (c << 24);
}
}
return (base >> 6) ^ hash;
}

static inline enum object_type oe_type(const struct object_entry *e)
{
return e->type_valid ? e->type_ : OBJ_BAD;

View File

@ -471,6 +471,10 @@ a test and then fails then the whole test run will abort. This can help to make
sure the expected tests are executed and not silently skipped when their
dependency breaks or is simply not present in a new environment.

GIT_TEST_NAME_HASH_VERSION=<int>, when set, causes 'git pack-objects' to
assume '--name-hash-version=<n>'.


Naming Tests
------------


View File

@ -34,6 +34,7 @@ test_tool_sources = [
'test-match-trees.c',
'test-mergesort.c',
'test-mktemp.c',
'test-name-hash.c',
'test-online-cpus.c',
'test-pack-mtimes.c',
'test-parse-options.c',

23
t/helper/test-name-hash.c Normal file
View File

@ -0,0 +1,23 @@
/*
* test-name-hash.c: Read a list of paths over stdin and report on their
* name-hash and full name-hash.
*/

#include "test-tool.h"
#include "git-compat-util.h"
#include "pack-objects.h"
#include "strbuf.h"

int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
{
struct strbuf line = STRBUF_INIT;

while (!strbuf_getline(&line, stdin)) {
printf("%10u ", pack_name_hash(line.buf));
printf("%10u ", pack_name_hash_v2((unsigned const char *)line.buf));
printf("%s\n", line.buf);
}

strbuf_release(&line);
return 0;
}

View File

@ -44,6 +44,7 @@ static struct test_cmd cmds[] = {
{ "match-trees", cmd__match_trees },
{ "mergesort", cmd__mergesort },
{ "mktemp", cmd__mktemp },
{ "name-hash", cmd__name_hash },
{ "online-cpus", cmd__online_cpus },
{ "pack-mtimes", cmd__pack_mtimes },
{ "parse-options", cmd__parse_options },

View File

@ -37,6 +37,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
int cmd__match_trees(int argc, const char **argv);
int cmd__mergesort(int argc, const char **argv);
int cmd__mktemp(int argc, const char **argv);
int cmd__name_hash(int argc, const char **argv);
int cmd__online_cpus(int argc, const char **argv);
int cmd__pack_mtimes(int argc, const char **argv);
int cmd__parse_options(int argc, const char **argv);

70
t/perf/p5313-pack-objects.sh Executable file
View File

@ -0,0 +1,70 @@
#!/bin/sh

test_description='Tests pack performance using bitmaps'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

test_expect_success 'create rev input' '
cat >in-thin <<-EOF &&
$(git rev-parse HEAD)
^$(git rev-parse HEAD~1)
EOF

cat >in-big <<-EOF &&
$(git rev-parse HEAD)
^$(git rev-parse HEAD~1000)
EOF

cat >in-shallow <<-EOF
$(git rev-parse HEAD)
--shallow $(git rev-parse HEAD)
EOF
'

for version in 1 2
do
export version

test_perf "thin pack with version $version" '
git pack-objects --thin --stdout --revs --sparse \
--name-hash-version=$version <in-thin >out
'

test_size "thin pack size with version $version" '
test_file_size out
'

test_perf "big pack with version $version" '
git pack-objects --stdout --revs --sparse \
--name-hash-version=$version <in-big >out
'

test_size "big pack size with version $version" '
test_file_size out
'

test_perf "shallow fetch pack with version $version" '
git pack-objects --stdout --revs --sparse --shallow \
--name-hash-version=$version <in-shallow >out
'

test_size "shallow pack size with version $version" '
test_file_size out
'

test_perf "repack with version $version" '
git repack -adf --name-hash-version=$version
'

test_size "repack size with version $version" '
gitdir=$(git rev-parse --git-dir) &&
pack=$(ls $gitdir/objects/pack/pack-*.pack) &&
test_file_size "$pack"
'
done

test_done

31
t/perf/p5314-name-hash.sh Executable file
View File

@ -0,0 +1,31 @@
#!/bin/sh

test_description='Tests pack performance using bitmaps'
. ./perf-lib.sh

GIT_TEST_PASSING_SANITIZE_LEAK=0
export GIT_TEST_PASSING_SANITIZE_LEAK

test_perf_large_repo

test_size 'paths at head' '
git ls-tree -r --name-only HEAD >path-list &&
wc -l <path-list &&
test-tool name-hash <path-list >name-hashes
'

for version in 1 2
do
test_size "distinct hash value: v$version" '
awk "{ print \$$version; }" <name-hashes | sort | \
uniq -c >name-hash-count &&
wc -l <name-hash-count
'

test_size "maximum multiplicity: v$version" '
sort -nr <name-hash-count | head -n 1 | \
awk "{ print \$1; }"
'
done

test_done

View File

@ -45,7 +45,6 @@ rebase
remote
remote-ext
remote-fd
repack
reset
restore
rev-parse

View File

@ -689,4 +689,38 @@ do
'
done

test_expect_success 'valid and invalid --name-hash-versions' '
sane_unset GIT_TEST_NAME_HASH_VERSION &&

# Valid values are hard to verify other than "do not fail".
# Performance tests will be more valuable to validate these versions.
# Negative values are converted to version 1.
for value in -1 1 2
do
git pack-objects base --all --name-hash-version=$value || return 1
done &&

# Invalid values have clear post-conditions.
for value in 0 3
do
test_must_fail git pack-objects base --all --name-hash-version=$value 2>err &&
test_grep "invalid --name-hash-version option" err || return 1
done
'

# The following test is not necessarily a permanent choice, but since we do not
# have a "name hash version" bit in the .bitmap file format, we cannot write the
# hash values into the .bitmap file without risking breakage later.
#
# TODO: Make these compatible in the future and replace this test with the
# expected behavior when both are specified.
test_expect_success '--name-hash-version=2 and --write-bitmap-index are incompatible' '
git pack-objects base --all --name-hash-version=2 --write-bitmap-index 2>err &&
test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err &&

# --stdout option silently removes --write-bitmap-index
git pack-objects --stdout --all --name-hash-version=2 --write-bitmap-index >out 2>err &&
! test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err
'

test_done

View File

@ -26,6 +26,36 @@ has_any () {
grep -Ff "$1" "$2"
}

# Since name-hash values are stored in the .bitmap files, add a test
# that checks that the name-hash calculations are stable across versions.
# Not exhaustive, but these hashing algorithms would be hard to change
# without causing deviations here.
test_expect_success 'name-hash value stability' '
cat >names <<-\EOF &&
first
second
third
a/one-long-enough-for-collisions
b/two-long-enough-for-collisions
many/parts/to/this/path/enough/to/collide/in/v2
enough/parts/to/this/path/enough/to/collide/in/v2
EOF

test-tool name-hash <names >out &&

cat >expect <<-\EOF &&
2582249472 1763573760 first
2289942528 1188134912 second
2300837888 1130758144 third
2544516325 3963087891 a/one-long-enough-for-collisions
2544516325 4013419539 b/two-long-enough-for-collisions
1420111091 1709547268 many/parts/to/this/path/enough/to/collide/in/v2
1420111091 1709547268 enough/parts/to/this/path/enough/to/collide/in/v2
EOF

test_cmp expect out
'

test_bitmap_cases () {
writeLookupTable=false
for i in "$@"
@ -419,7 +449,10 @@ test_bitmap_cases () {
cat >expect <<-\EOF &&
error: missing value for '\''pack.preferbitmaptips'\''
EOF
git repack -adb 2>actual &&

# Disable name hash version adjustment due to stderr comparison.
GIT_TEST_NAME_HASH_VERSION=1 \
git repack -adb 2>actual &&
test_cmp expect actual
)
'

View File

@ -208,7 +208,8 @@ test_expect_success 'bitmapPseudoMerge.stableThreshold creates stable groups' '
'

test_expect_success 'out of order thresholds are rejected' '
test_must_fail git \
# Disable the test var to remove a stderr message.
test_must_fail env GIT_TEST_NAME_HASH_VERSION=1 git \
-c bitmapPseudoMerge.test.pattern="refs/*" \
-c bitmapPseudoMerge.test.threshold=1.month.ago \
-c bitmapPseudoMerge.test.stableThreshold=1.week.ago \

View File

@ -1237,7 +1237,12 @@ test_expect_success 'all boundary commits are excluded' '
test_tick &&
git merge otherside &&
ad=$(git log --no-walk --format=%ad HEAD) &&
git bundle create twoside-boundary.bdl main --since="$ad" &&

# If the a different name hash function is used here, then no delta
# pair is found and the bundle does not expand to three objects
# when fixing the thin object.
GIT_TEST_NAME_HASH_VERSION=1 \
git bundle create twoside-boundary.bdl main --since="$ad" &&
test_bundle_object_count --thin twoside-boundary.bdl 3
'


View File

@ -246,7 +246,11 @@ test_expect_success 'create bundle with --since option' '
EOF
test_cmp expect actual &&

git bundle create since.bdl \
# If a different name hash function is used, then one fewer
# delta base is found and this counts a different number
# of objects after performing --fix-thin.
GIT_TEST_NAME_HASH_VERSION=1 \
git bundle create since.bdl \
--since "Thu Apr 7 15:27:00 2005 -0700" \
--all &&


View File

@ -1093,7 +1093,9 @@ test_expect_success 'submodule update --quiet passes quietness to fetch with a s
) &&
git clone super4 super5 &&
(cd super5 &&
git submodule update --quiet --init --depth=1 submodule3 >out 2>err &&
# This test var can mess with the stderr output checked in this test.
GIT_TEST_NAME_HASH_VERSION=1 \
git submodule update --quiet --init --depth=1 submodule3 >out 2>err &&
test_must_be_empty out &&
test_must_be_empty err
) &&

View File

@ -308,7 +308,10 @@ test_expect_success 'no bitmaps created if .keep files present' '
keep=${pack%.pack}.keep &&
test_when_finished "rm -f \"\$keep\"" &&
>"$keep" &&
git -C bare.git repack -ad 2>stderr &&

# Disable --name-hash-version test due to stderr comparison.
GIT_TEST_NAME_HASH_VERSION=1 \
git -C bare.git repack -ad 2>stderr &&
test_must_be_empty stderr &&
find bare.git/objects/pack/ -type f -name "*.bitmap" >actual &&
test_must_be_empty actual
@ -319,7 +322,10 @@ test_expect_success 'auto-bitmaps do not complain if unavailable' '
blob=$(test-tool genrandom big $((1024*1024)) |
git -C bare.git hash-object -w --stdin) &&
git -C bare.git update-ref refs/tags/big $blob &&
git -C bare.git repack -ad 2>stderr &&

# Disable --name-hash-version test due to stderr comparison.
GIT_TEST_NAME_HASH_VERSION=1 \
git -C bare.git repack -ad 2>stderr &&
test_must_be_empty stderr &&
find bare.git/objects/pack -type f -name "*.bitmap" >actual &&
test_must_be_empty actual
@ -776,6 +782,12 @@ test_expect_success 'repack -ad cleans up old .tmp-* packs' '
test_must_be_empty tmpfiles
'

test_expect_success '--name-hash-version option passes through to pack-objects' '
GIT_TRACE2_EVENT="$(pwd)/hash-trace.txt" \
git repack -a --name-hash-version=2 &&
test_subcommand_flex git pack-objects --name-hash-version=2 <hash-trace.txt
'

test_expect_success 'setup for update-server-info' '
git init update-server-info &&
test_commit -C update-server-info message

View File

@ -1896,6 +1896,32 @@ test_subcommand () {
fi
}

# Check that the given subcommand was run with the given set of
# arguments in order (but with possible extra arguments).
#
# test_subcommand_flex [!] <command> <args>... < <trace>
#
# If the first parameter passed is !, this instead checks that
# the given command was not called.
#
test_subcommand_flex () {
local negate=
if test "$1" = "!"
then
negate=t
shift
fi

local expr="$(printf '"%s".*' "$@")"

if test -n "$negate"
then
! grep "\[$expr\]"
else
grep "\[$expr\]"
fi
}

# Check that the given command was invoked as part of the
# trace2-format trace on stdin.
#