From 3a1fd7b09cd927774d1d72830cd49cc1432ef1e8 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:07 +0000 Subject: [PATCH 01/15] repository: require Rust support for interoperability We'll be implementing some of our interoperability code, like the loose object map, in Rust. While the code currently compiles with the old loose object map format, which is written entirely in C, we'll soon replace that with the Rust-based implementation. Require the use of Rust for compatibility mode and die if it is not supported. Because the repo argument is not used when Rust is missing, mark it MAYBE_UNUSED to silence the compiler warning, which we do not care about. Add a prerequisite in our tests, RUST, that checks if Rust functionality is available and use it in the tests that handle interoperability. This is technically a regression in functionality compared to our existing state, but pack index v3 is not yet implemented and thus the functionality is mostly quite broken, which is why we've recently marked this functionality as experimental. We don't believe anyone is getting real use out of the interoperability code in its current state, so no actual users should be negatively impacted by this change. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- repository.c | 8 ++- t/t1006-cat-file.sh | 82 +++++++++++++++++++++---------- t/t1016-compatObjectFormat.sh | 6 +++ t/t1500-rev-parse.sh | 2 +- t/t9305-fast-import-signatures.sh | 4 +- t/t9350-fast-export.sh | 4 +- t/test-lib.sh | 4 ++ 7 files changed, 77 insertions(+), 33 deletions(-) diff --git a/repository.c b/repository.c index 6faf5c7398..186d2c1028 100644 --- a/repository.c +++ b/repository.c @@ -3,6 +3,7 @@ #include "repository.h" #include "odb.h" #include "config.h" +#include "gettext.h" #include "object.h" #include "lockfile.h" #include "path.h" @@ -190,13 +191,18 @@ void repo_set_hash_algo(struct repository *repo, int hash_algo) repo->hash_algo = &hash_algos[hash_algo]; } -void repo_set_compat_hash_algo(struct repository *repo, int algo) +void repo_set_compat_hash_algo(struct repository *repo MAYBE_UNUSED, int algo) { +#ifdef WITH_RUST if (hash_algo_by_ptr(repo->hash_algo) == algo) BUG("hash_algo and compat_hash_algo match"); repo->compat_hash_algo = algo ? 
&hash_algos[algo] : NULL; if (repo->compat_hash_algo) repo_read_loose_object_map(repo); +#else + if (algo) + die(_("compatibility hash algorithm support requires Rust")); +#endif } void repo_set_ref_storage_format(struct repository *repo, diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index 1f61b666a7..29a9503523 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -241,10 +241,16 @@ hello_content="Hello World" hello_size=$(strlen "$hello_content") hello_oid=$(echo_without_newline "$hello_content" | git hash-object --stdin) -test_expect_success "setup" ' +test_expect_success "setup part 1" ' git config core.repositoryformatversion 1 && - git config extensions.objectformat $test_hash_algo && - git config extensions.compatobjectformat $test_compat_hash_algo && + git config extensions.objectformat $test_hash_algo +' + +test_expect_success RUST 'compat setup' ' + git config extensions.compatobjectformat $test_compat_hash_algo +' + +test_expect_success 'setup part 2' ' echo_without_newline "$hello_content" > hello && git update-index --add hello && echo_without_newline "$hello_content" > "path with spaces" && @@ -273,9 +279,13 @@ run_blob_tests () { ' } -hello_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $hello_oid) run_blob_tests $hello_oid -run_blob_tests $hello_compat_oid + +if test_have_prereq RUST +then + hello_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $hello_oid) + run_blob_tests $hello_compat_oid +fi test_expect_success '--batch-check without %(rest) considers whole line' ' echo "$hello_oid blob $hello_size" >expect && @@ -286,62 +296,76 @@ test_expect_success '--batch-check without %(rest) considers whole line' ' ' tree_oid=$(git write-tree) -tree_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $tree_oid) tree_size=$((2 * $(test_oid rawsz) + 13 + 24)) -tree_compat_size=$((2 * $(test_oid --hash=compat rawsz) + 13 + 24)) tree_pretty_content="100644 blob $hello_oid 
hello${LF}100755 blob $hello_oid path with spaces${LF}" -tree_compat_pretty_content="100644 blob $hello_compat_oid hello${LF}100755 blob $hello_compat_oid path with spaces${LF}" run_tests 'tree' $tree_oid "" $tree_size "" "$tree_pretty_content" -run_tests 'tree' $tree_compat_oid "" $tree_compat_size "" "$tree_compat_pretty_content" run_tests 'blob' "$tree_oid:hello" "100644" $hello_size "" "$hello_content" $hello_oid -run_tests 'blob' "$tree_compat_oid:hello" "100644" $hello_size "" "$hello_content" $hello_compat_oid run_tests 'blob' "$tree_oid:path with spaces" "100755" $hello_size "" "$hello_content" $hello_oid -run_tests 'blob' "$tree_compat_oid:path with spaces" "100755" $hello_size "" "$hello_content" $hello_compat_oid + +if test_have_prereq RUST +then + tree_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $tree_oid) + tree_compat_size=$((2 * $(test_oid --hash=compat rawsz) + 13 + 24)) + tree_compat_pretty_content="100644 blob $hello_compat_oid hello${LF}100755 blob $hello_compat_oid path with spaces${LF}" + + run_tests 'tree' $tree_compat_oid "" $tree_compat_size "" "$tree_compat_pretty_content" + run_tests 'blob' "$tree_compat_oid:hello" "100644" $hello_size "" "$hello_content" $hello_compat_oid + run_tests 'blob' "$tree_compat_oid:path with spaces" "100755" $hello_size "" "$hello_content" $hello_compat_oid +fi commit_message="Initial commit" commit_oid=$(echo_without_newline "$commit_message" | git commit-tree $tree_oid) -commit_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $commit_oid) commit_size=$(($(test_oid hexsz) + 137)) -commit_compat_size=$(($(test_oid --hash=compat hexsz) + 137)) commit_content="tree $tree_oid author $GIT_AUTHOR_NAME <$GIT_AUTHOR_EMAIL> $GIT_AUTHOR_DATE committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE $commit_message" -commit_compat_content="tree $tree_compat_oid +run_tests 'commit' $commit_oid "" $commit_size "$commit_content" "$commit_content" + +if 
test_have_prereq RUST +then + commit_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $commit_oid) + commit_compat_size=$(($(test_oid --hash=compat hexsz) + 137)) + commit_compat_content="tree $tree_compat_oid author $GIT_AUTHOR_NAME <$GIT_AUTHOR_EMAIL> $GIT_AUTHOR_DATE committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE $commit_message" -run_tests 'commit' $commit_oid "" $commit_size "$commit_content" "$commit_content" -run_tests 'commit' $commit_compat_oid "" $commit_compat_size "$commit_compat_content" "$commit_compat_content" + run_tests 'commit' $commit_compat_oid "" $commit_compat_size "$commit_compat_content" "$commit_compat_content" +fi tag_header_without_oid="type blob tag hellotag tagger $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL>" tag_header_without_timestamp="object $hello_oid $tag_header_without_oid" -tag_compat_header_without_timestamp="object $hello_compat_oid -$tag_header_without_oid" tag_description="This is a tag" tag_content="$tag_header_without_timestamp 0 +0000 -$tag_description" -tag_compat_content="$tag_compat_header_without_timestamp 0 +0000 - $tag_description" tag_oid=$(echo_without_newline "$tag_content" | git hash-object -t tag --stdin -w) tag_size=$(strlen "$tag_content") -tag_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $tag_oid) -tag_compat_size=$(strlen "$tag_compat_content") - run_tests 'tag' $tag_oid "" $tag_size "$tag_content" "$tag_content" -run_tests 'tag' $tag_compat_oid "" $tag_compat_size "$tag_compat_content" "$tag_compat_content" + +if test_have_prereq RUST +then + tag_compat_header_without_timestamp="object $hello_compat_oid +$tag_header_without_oid" + tag_compat_content="$tag_compat_header_without_timestamp 0 +0000 + +$tag_description" + + tag_compat_oid=$(git rev-parse --output-object-format=$test_compat_hash_algo $tag_oid) + tag_compat_size=$(strlen "$tag_compat_content") + + run_tests 'tag' $tag_compat_oid "" $tag_compat_size "$tag_compat_content" 
"$tag_compat_content" +fi test_expect_success "Reach a blob from a tag pointing to it" ' echo_without_newline "$hello_content" >expect && @@ -590,7 +614,8 @@ flush" } batch_tests $hello_oid $tree_oid $tree_size $commit_oid $commit_size "$commit_content" $tag_oid $tag_size "$tag_content" -batch_tests $hello_compat_oid $tree_compat_oid $tree_compat_size $commit_compat_oid $commit_compat_size "$commit_compat_content" $tag_compat_oid $tag_compat_size "$tag_compat_content" + +test_have_prereq RUST && batch_tests $hello_compat_oid $tree_compat_oid $tree_compat_size $commit_compat_oid $commit_compat_size "$commit_compat_content" $tag_compat_oid $tag_compat_size "$tag_compat_content" test_expect_success FUNNYNAMES 'setup with newline in input' ' @@ -1226,7 +1251,10 @@ test_expect_success 'batch-check with a submodule' ' test_unconfig extensions.compatobjectformat && printf "160000 commit $(test_oid deadbeef)\tsub\n" >tree-with-sub && tree=$(git mktree actual <<-EOF && $tree:sub diff --git a/t/t1016-compatObjectFormat.sh b/t/t1016-compatObjectFormat.sh index a9af8b2396..af3ceac3f5 100755 --- a/t/t1016-compatObjectFormat.sh +++ b/t/t1016-compatObjectFormat.sh @@ -8,6 +8,12 @@ test_description='Test how well compatObjectFormat works' . ./test-lib.sh . "$TEST_DIRECTORY"/lib-gpg.sh +if ! 
test_have_prereq RUST +then + skip_all='interoperability requires a Git built with Rust' + test_done +fi + # All of the follow variables must be defined in the environment: # GIT_AUTHOR_NAME # GIT_AUTHOR_EMAIL diff --git a/t/t1500-rev-parse.sh b/t/t1500-rev-parse.sh index 7739ab611b..98c5a772bd 100755 --- a/t/t1500-rev-parse.sh +++ b/t/t1500-rev-parse.sh @@ -208,7 +208,7 @@ test_expect_success 'rev-parse --show-object-format in repo' ' ' -test_expect_success 'rev-parse --show-object-format in repo with compat mode' ' +test_expect_success RUST 'rev-parse --show-object-format in repo with compat mode' ' mkdir repo && ( sane_unset GIT_DEFAULT_HASH && diff --git a/t/t9305-fast-import-signatures.sh b/t/t9305-fast-import-signatures.sh index c2b4271658..63c0a2b5c4 100755 --- a/t/t9305-fast-import-signatures.sh +++ b/t/t9305-fast-import-signatures.sh @@ -70,7 +70,7 @@ test_expect_success GPGSSH 'strip SSH signature with --signed-commits=strip' ' test_must_be_empty log ' -test_expect_success GPG 'setup a commit with dual OpenPGP signatures on its SHA-1 and SHA-256 formats' ' +test_expect_success RUST,GPG 'setup a commit with dual OpenPGP signatures on its SHA-1 and SHA-256 formats' ' # Create a signed SHA-256 commit git init --object-format=sha256 explicit-sha256 && git -C explicit-sha256 config extensions.compatObjectFormat sha1 && @@ -91,7 +91,7 @@ test_expect_success GPG 'setup a commit with dual OpenPGP signatures on its SHA- test_grep -E "^gpgsig-sha256 " out ' -test_expect_success GPG 'strip both OpenPGP signatures with --signed-commits=warn-strip' ' +test_expect_success RUST,GPG 'strip both OpenPGP signatures with --signed-commits=warn-strip' ' git -C explicit-sha256 fast-export --signed-commits=verbatim dual-signed >output && test_grep -E "^gpgsig sha1 openpgp" output && test_grep -E "^gpgsig sha256 openpgp" output && diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh index 3d153a4805..784d68b6e5 100755 --- a/t/t9350-fast-export.sh +++ 
b/t/t9350-fast-export.sh @@ -972,7 +972,7 @@ test_expect_success 'fast-export handles --end-of-options' ' test_cmp expect actual ' -test_expect_success GPG 'setup a commit with dual signatures on its SHA-1 and SHA-256 formats' ' +test_expect_success GPG,RUST 'setup a commit with dual signatures on its SHA-1 and SHA-256 formats' ' # Create a signed SHA-256 commit git init --object-format=sha256 explicit-sha256 && git -C explicit-sha256 config extensions.compatObjectFormat sha1 && @@ -993,7 +993,7 @@ test_expect_success GPG 'setup a commit with dual signatures on its SHA-1 and SH test_grep -E "^gpgsig-sha256 " out ' -test_expect_success GPG 'export and import of doubly signed commit' ' +test_expect_success GPG,RUST 'export and import of doubly signed commit' ' git -C explicit-sha256 fast-export --signed-commits=verbatim dual-signed >output && test_grep -E "^gpgsig sha1 openpgp" output && test_grep -E "^gpgsig sha256 openpgp" output && diff --git a/t/test-lib.sh b/t/test-lib.sh index ef0ab7ec2d..3499a83806 100644 --- a/t/test-lib.sh +++ b/t/test-lib.sh @@ -1890,6 +1890,10 @@ test_lazy_prereq LONG_IS_64BIT ' test 8 -le "$(build_option sizeof-long)" ' +test_lazy_prereq RUST ' + test "$(build_option rust)" = enabled +' + test_lazy_prereq TIME_IS_64BIT 'test-tool date is64bit' test_lazy_prereq TIME_T_IS_64BIT 'test-tool date time_t-is64bit' From 447480a5a6356b79ab82624390376b13800f140f Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:08 +0000 Subject: [PATCH 02/15] conversion: don't crash when no destination algo When we set up a repository that doesn't have a compatibility hash algorithm, we set the destination algorithm object to NULL. In such a case, we want to silently do nothing instead of crashing, so simply treat the operation as a no-op and copy the object ID. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- object-file-convert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object-file-convert.c b/object-file-convert.c index 7ab875afe6..e44c821084 100644 --- a/object-file-convert.c +++ b/object-file-convert.c @@ -23,7 +23,7 @@ int repo_oid_to_algop(struct repository *repo, const struct object_id *src, const struct git_hash_algo *from = src->algo ? &hash_algos[src->algo] : repo->hash_algo; - if (from == to) { + if (from == to || !to) { if (src != dest) oidcpy(dest, src); return 0; From f97c49398b24a544a88e0ed5f230fd4714bb8d1c Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:09 +0000 Subject: [PATCH 03/15] hash: use uint32_t for object_id algorithm We currently use an int for this value, but we'll define this structure from Rust in a future commit and we want to ensure that our data types are exactly identical. To make that possible, use a uint32_t for the hash algorithm. Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- hash.c | 6 +++--- hash.h | 10 +++++----- oidtree.c | 2 +- repository.c | 6 +++--- repository.h | 4 ++-- serve.c | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/hash.c b/hash.c index 4a04ecb50e..81b4f87027 100644 --- a/hash.c +++ b/hash.c @@ -241,7 +241,7 @@ const char *empty_tree_oid_hex(const struct git_hash_algo *algop) return oid_to_hex_r(buf, algop->empty_tree); } -int hash_algo_by_name(const char *name) +uint32_t hash_algo_by_name(const char *name) { if (!name) return GIT_HASH_UNKNOWN; @@ -251,7 +251,7 @@ int hash_algo_by_name(const char *name) return GIT_HASH_UNKNOWN; } -int hash_algo_by_id(uint32_t format_id) +uint32_t hash_algo_by_id(uint32_t format_id) { for (size_t i = 1; i < GIT_HASH_NALGOS; i++) if (format_id == hash_algos[i].format_id) @@ -259,7 +259,7 @@ int hash_algo_by_id(uint32_t format_id) return GIT_HASH_UNKNOWN; } -int hash_algo_by_length(size_t len) +uint32_t hash_algo_by_length(size_t len) { for (size_t 
i = 1; i < GIT_HASH_NALGOS; i++) if (len == hash_algos[i].rawsz) diff --git a/hash.h b/hash.h index fae966b23c..99c9c2a0a8 100644 --- a/hash.h +++ b/hash.h @@ -211,7 +211,7 @@ static inline void git_SHA256_Clone(git_SHA256_CTX *dst, const git_SHA256_CTX *s struct object_id { unsigned char hash[GIT_MAX_RAWSZ]; - int algo; /* XXX requires 4-byte alignment */ + uint32_t algo; /* XXX requires 4-byte alignment */ }; #define GET_OID_QUIETLY 01 @@ -344,13 +344,13 @@ static inline void git_hash_final_oid(struct object_id *oid, struct git_hash_ctx * Return a GIT_HASH_* constant based on the name. Returns GIT_HASH_UNKNOWN if * the name doesn't match a known algorithm. */ -int hash_algo_by_name(const char *name); +uint32_t hash_algo_by_name(const char *name); /* Identical, except based on the format ID. */ -int hash_algo_by_id(uint32_t format_id); +uint32_t hash_algo_by_id(uint32_t format_id); /* Identical, except based on the length. */ -int hash_algo_by_length(size_t len); +uint32_t hash_algo_by_length(size_t len); /* Identical, except for a pointer to struct git_hash_algo. 
*/ -static inline int hash_algo_by_ptr(const struct git_hash_algo *p) +static inline uint32_t hash_algo_by_ptr(const struct git_hash_algo *p) { size_t i; for (i = 0; i < GIT_HASH_NALGOS; i++) { diff --git a/oidtree.c b/oidtree.c index 151568f74f..324de94934 100644 --- a/oidtree.c +++ b/oidtree.c @@ -10,7 +10,7 @@ struct oidtree_iter_data { oidtree_iter fn; void *arg; size_t *last_nibble_at; - int algo; + uint32_t algo; uint8_t last_byte; }; diff --git a/repository.c b/repository.c index 186d2c1028..ebe719de3c 100644 --- a/repository.c +++ b/repository.c @@ -39,7 +39,7 @@ struct repository *the_repository = &the_repo; static void set_default_hash_algo(struct repository *repo) { const char *hash_name; - int algo; + uint32_t algo; hash_name = getenv("GIT_TEST_DEFAULT_HASH_ALGO"); if (!hash_name) @@ -186,12 +186,12 @@ void repo_set_gitdir(struct repository *repo, repo->gitdir, "index"); } -void repo_set_hash_algo(struct repository *repo, int hash_algo) +void repo_set_hash_algo(struct repository *repo, uint32_t hash_algo) { repo->hash_algo = &hash_algos[hash_algo]; } -void repo_set_compat_hash_algo(struct repository *repo MAYBE_UNUSED, int algo) +void repo_set_compat_hash_algo(struct repository *repo MAYBE_UNUSED, uint32_t algo) { #ifdef WITH_RUST if (hash_algo_by_ptr(repo->hash_algo) == algo) diff --git a/repository.h b/repository.h index 5808a5d610..c0a3543b24 100644 --- a/repository.h +++ b/repository.h @@ -193,8 +193,8 @@ struct set_gitdir_args { void repo_set_gitdir(struct repository *repo, const char *root, const struct set_gitdir_args *extra_args); void repo_set_worktree(struct repository *repo, const char *path); -void repo_set_hash_algo(struct repository *repo, int algo); -void repo_set_compat_hash_algo(struct repository *repo, int compat_algo); +void repo_set_hash_algo(struct repository *repo, uint32_t algo); +void repo_set_compat_hash_algo(struct repository *repo, uint32_t compat_algo); void repo_set_ref_storage_format(struct repository *repo, enum 
ref_storage_format format); void initialize_repository(struct repository *repo); diff --git a/serve.c b/serve.c index 53ecab3b42..49a6e39b1d 100644 --- a/serve.c +++ b/serve.c @@ -14,7 +14,7 @@ static int advertise_sid = -1; static int advertise_object_info = -1; -static int client_hash_algo = GIT_HASH_SHA1_LEGACY; +static uint32_t client_hash_algo = GIT_HASH_SHA1_LEGACY; static int always_advertise(struct repository *r UNUSED, struct strbuf *value UNUSED) From 0404613aa08457f89cae5c26932e4e32661b32f7 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:10 +0000 Subject: [PATCH 04/15] rust: add an ObjectID struct We'd like to be able to write some Rust code that can work with object IDs. Add a structure here that's identical to struct object_id in C, for easy use in sharing across the FFI boundary. We will use this structure in several places in hot paths, such as index-pack or pack-objects when converting between algorithms, so prioritize efficient interchange over a more idiomatic Rust approach. Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- Makefile | 1 + src/hash.rs | 21 +++++++++++++++++++++ src/lib.rs | 1 + src/meson.build | 1 + 4 files changed, 24 insertions(+) create mode 100644 src/hash.rs diff --git a/Makefile b/Makefile index 562e637fa0..2a926a375b 100644 --- a/Makefile +++ b/Makefile @@ -1528,6 +1528,7 @@ CLAR_TEST_OBJS += $(UNIT_TEST_DIR)/unit-test.o UNIT_TEST_OBJS += $(UNIT_TEST_DIR)/test-lib.o +RUST_SOURCES += src/hash.rs RUST_SOURCES += src/lib.rs RUST_SOURCES += src/varint.rs diff --git a/src/hash.rs b/src/hash.rs new file mode 100644 index 0000000000..0219391820 --- /dev/null +++ b/src/hash.rs @@ -0,0 +1,21 @@ +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation: version 2 of the License, dated June 1991. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, see . + +pub const GIT_MAX_RAWSZ: usize = 32; + +/// A binary object ID. +#[repr(C)] +#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct ObjectID { + pub hash: [u8; GIT_MAX_RAWSZ], + pub algo: u32, +} diff --git a/src/lib.rs b/src/lib.rs index 9da70d8b57..cf7c962509 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1 +1,2 @@ +pub mod hash; pub mod varint; diff --git a/src/meson.build b/src/meson.build index 25b9ad5a14..c77041a3fa 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,4 +1,5 @@ libgit_rs_sources = [ + 'hash.rs', 'lib.rs', 'varint.rs', ] From 76ee08578ea6478015e7408f8376686d9f0411d6 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:11 +0000 Subject: [PATCH 05/15] rust: add a hash algorithm abstraction This works very similarly to the existing one in C except that it doesn't provide any functionality to hash an object. We don't need that right now, and the use of those function pointers does make it substantially more difficult to write a bit-for-bit identical structure across the C/Rust interface, so omit them for now. Instead of the more customary "&self", use "self", because the former is the size of a pointer and the latter is the size of an integer on most systems. Don't define an unknown value but use an Option for that instead. Update the object ID structure to allow slicing the data appropriately for the algorithm. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- src/hash.rs | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/src/hash.rs b/src/hash.rs index 0219391820..0ec0ab0490 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -10,8 +10,25 @@ // You should have received a copy of the GNU General Public License along // with this program; if not, see . +use std::error::Error; +use std::fmt::{self, Debug, Display}; + pub const GIT_MAX_RAWSZ: usize = 32; +/// An error indicating an invalid hash algorithm. +/// +/// The contained `u32` is the same as the `algo` field in `ObjectID`. +#[derive(Debug, Copy, Clone)] +pub struct InvalidHashAlgorithm(pub u32); + +impl Display for InvalidHashAlgorithm { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid hash algorithm {}", self.0) + } +} + +impl Error for InvalidHashAlgorithm {} + /// A binary object ID. #[repr(C)] #[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)] @@ -19,3 +36,145 @@ pub struct ObjectID { pub hash: [u8; GIT_MAX_RAWSZ], pub algo: u32, } + +#[allow(dead_code)] +impl ObjectID { + pub fn as_slice(&self) -> Result<&[u8], InvalidHashAlgorithm> { + match HashAlgorithm::from_u32(self.algo) { + Some(algo) => Ok(&self.hash[0..algo.raw_len()]), + None => Err(InvalidHashAlgorithm(self.algo)), + } + } + + pub fn as_mut_slice(&mut self) -> Result<&mut [u8], InvalidHashAlgorithm> { + match HashAlgorithm::from_u32(self.algo) { + Some(algo) => Ok(&mut self.hash[0..algo.raw_len()]), + None => Err(InvalidHashAlgorithm(self.algo)), + } + } +} + +/// A hash algorithm, +#[repr(C)] +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub enum HashAlgorithm { + SHA1 = 1, + SHA256 = 2, +} + +#[allow(dead_code)] +impl HashAlgorithm { + const SHA1_NULL_OID: ObjectID = ObjectID { + hash: [0u8; 32], + algo: Self::SHA1 as u32, + }; + const SHA256_NULL_OID: ObjectID = ObjectID { + hash: [0u8; 32], + algo: Self::SHA256 as u32, + }; + + const SHA1_EMPTY_TREE: 
ObjectID = ObjectID { + hash: *b"\x4b\x82\x5d\xc6\x42\xcb\x6e\xb9\xa0\x60\xe5\x4b\xf8\xd6\x92\x88\xfb\xee\x49\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + algo: Self::SHA1 as u32, + }; + const SHA256_EMPTY_TREE: ObjectID = ObjectID { + hash: *b"\x6e\xf1\x9b\x41\x22\x5c\x53\x69\xf1\xc1\x04\xd4\x5d\x8d\x85\xef\xa9\xb0\x57\xb5\x3b\x14\xb4\xb9\xb9\x39\xdd\x74\xde\xcc\x53\x21", + algo: Self::SHA256 as u32, + }; + + const SHA1_EMPTY_BLOB: ObjectID = ObjectID { + hash: *b"\xe6\x9d\xe2\x9b\xb2\xd1\xd6\x43\x4b\x8b\x29\xae\x77\x5a\xd8\xc2\xe4\x8c\x53\x91\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + algo: Self::SHA1 as u32, + }; + const SHA256_EMPTY_BLOB: ObjectID = ObjectID { + hash: *b"\x47\x3a\x0f\x4c\x3b\xe8\xa9\x36\x81\xa2\x67\xe3\xb1\xe9\xa7\xdc\xda\x11\x85\x43\x6f\xe1\x41\xf7\x74\x91\x20\xa3\x03\x72\x18\x13", + algo: Self::SHA256 as u32, + }; + + /// Return a hash algorithm based on the internal integer ID used by Git. + /// + /// Returns `None` if the algorithm doesn't indicate a valid algorithm. + pub const fn from_u32(algo: u32) -> Option { + match algo { + 1 => Some(HashAlgorithm::SHA1), + 2 => Some(HashAlgorithm::SHA256), + _ => None, + } + } + + /// Return a hash algorithm based on the internal integer ID used by Git. + /// + /// Returns `None` if the algorithm doesn't indicate a valid algorithm. + pub const fn from_format_id(algo: u32) -> Option { + match algo { + 0x73686131 => Some(HashAlgorithm::SHA1), + 0x73323536 => Some(HashAlgorithm::SHA256), + _ => None, + } + } + + /// The name of this hash algorithm as a string suitable for the configuration file. + pub const fn name(self) -> &'static str { + match self { + HashAlgorithm::SHA1 => "sha1", + HashAlgorithm::SHA256 => "sha256", + } + } + + /// The format ID of this algorithm for binary formats. + /// + /// Note that when writing this to a data format, it should be written in big-endian format + /// explicitly. 
+ pub const fn format_id(self) -> u32 { + match self { + HashAlgorithm::SHA1 => 0x73686131, + HashAlgorithm::SHA256 => 0x73323536, + } + } + + /// The length of binary object IDs in this algorithm in bytes. + pub const fn raw_len(self) -> usize { + match self { + HashAlgorithm::SHA1 => 20, + HashAlgorithm::SHA256 => 32, + } + } + + /// The length of object IDs in this algorithm in hexadecimal characters. + pub const fn hex_len(self) -> usize { + self.raw_len() * 2 + } + + /// The number of bytes which is processed by one iteration of this algorithm's compression + /// function. + pub const fn block_size(self) -> usize { + match self { + HashAlgorithm::SHA1 => 64, + HashAlgorithm::SHA256 => 64, + } + } + + /// The object ID representing the empty blob. + pub const fn empty_blob(self) -> &'static ObjectID { + match self { + HashAlgorithm::SHA1 => &Self::SHA1_EMPTY_BLOB, + HashAlgorithm::SHA256 => &Self::SHA256_EMPTY_BLOB, + } + } + + /// The object ID representing the empty tree. + pub const fn empty_tree(self) -> &'static ObjectID { + match self { + HashAlgorithm::SHA1 => &Self::SHA1_EMPTY_TREE, + HashAlgorithm::SHA256 => &Self::SHA256_EMPTY_TREE, + } + } + + /// The object ID which is all zeros. + pub const fn null_oid(self) -> &'static ObjectID { + match self { + HashAlgorithm::SHA1 => &Self::SHA1_NULL_OID, + HashAlgorithm::SHA256 => &Self::SHA256_NULL_OID, + } + } +} From eae08d8bf7ee7b045371b5a64dc581e902703e58 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:12 +0000 Subject: [PATCH 06/15] hash: add a function to look up hash algo structs In C, it's easy for us to look up a hash algorithm structure by its offset by simply indexing the hash_algos array. However, in Rust, we sometimes need a pointer to pass to a C function, but we have our own hash algorithm abstraction. To get one from the other, let's provide a simple function that looks up the C structure from the offset and expose it in Rust. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- hash.c | 7 +++++++ hash.h | 1 + src/hash.rs | 14 ++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/hash.c b/hash.c index 81b4f87027..97fd473607 100644 --- a/hash.c +++ b/hash.c @@ -241,6 +241,13 @@ const char *empty_tree_oid_hex(const struct git_hash_algo *algop) return oid_to_hex_r(buf, algop->empty_tree); } +const struct git_hash_algo *hash_algo_ptr_by_number(uint32_t algo) +{ + if (algo >= GIT_HASH_NALGOS) + return NULL; + return &hash_algos[algo]; +} + uint32_t hash_algo_by_name(const char *name) { if (!name) diff --git a/hash.h b/hash.h index 99c9c2a0a8..709d7585a5 100644 --- a/hash.h +++ b/hash.h @@ -340,6 +340,7 @@ static inline void git_hash_final_oid(struct object_id *oid, struct git_hash_ctx ctx->algop->final_oid_fn(oid, ctx); } +const struct git_hash_algo *hash_algo_ptr_by_number(uint32_t algo); /* * Return a GIT_HASH_* constant based on the name. Returns GIT_HASH_UNKNOWN if * the name doesn't match a known algorithm. diff --git a/src/hash.rs b/src/hash.rs index 0ec0ab0490..70bb8095e8 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -12,6 +12,7 @@ use std::error::Error; use std::fmt::{self, Debug, Display}; +use std::os::raw::c_void; pub const GIT_MAX_RAWSZ: usize = 32; @@ -177,4 +178,17 @@ impl HashAlgorithm { HashAlgorithm::SHA256 => &Self::SHA256_NULL_OID, } } + + /// A pointer to the C `struct git_hash_algo` for interoperability with C. + pub fn hash_algo_ptr(self) -> *const c_void { + unsafe { c::hash_algo_ptr_by_number(self as u32) } + } +} + +pub mod c { + use std::os::raw::c_void; + + extern "C" { + pub fn hash_algo_ptr_by_number(n: u32) -> *const c_void; + } } From a8dfbc133a37df79fe49f71c43b8c07d666f1079 Mon Sep 17 00:00:00 2001 From: "brian m. 
carlson" Date: Mon, 17 Nov 2025 22:16:13 +0000 Subject: [PATCH 07/15] rust: add additional helpers for ObjectID Right now, users can internally access the contents of the ObjectID struct, which can lead to data that is not valid, such as invalid algorithms or non-zero-padded hash values. These can cause problems down the line as we use them more. Add a constructor for ObjectID that allows us to set these values and also provide an accessor for the algorithm so that we can access it. In addition, provide useful Display and Debug implementations that can format our data in a useful way. Now that we have the ability to work with these various components in a nice way, add some tests as well to make sure that ObjectID and HashAlgorithm work together as expected. Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- src/hash.rs | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/src/hash.rs b/src/hash.rs index 70bb8095e8..e1fa568661 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -32,7 +32,7 @@ impl Error for InvalidHashAlgorithm {} /// A binary object ID. #[repr(C)] -#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)] +#[derive(Clone, Ord, PartialOrd, Eq, PartialEq)] pub struct ObjectID { pub hash: [u8; GIT_MAX_RAWSZ], pub algo: u32, @@ -40,6 +40,27 @@ pub struct ObjectID { #[allow(dead_code)] impl ObjectID { + /// Return a new object ID with the given algorithm and hash. + /// + /// `hash` must be exactly the proper length for `algo` and this function panics if it is not. + /// The extra internal storage of `hash`, if any, is zero filled. + pub fn new(algo: HashAlgorithm, hash: &[u8]) -> Self { + let mut data = [0u8; GIT_MAX_RAWSZ]; + // This verifies that the length of `hash` is correct. + data[0..algo.raw_len()].copy_from_slice(hash); + Self { + hash: data, + algo: algo as u32, + } + } + + /// Return the algorithm for this object ID. 
+ /// + /// If the algorithm set internally is not valid, this function panics. + pub fn algo(&self) -> Result { + HashAlgorithm::from_u32(self.algo).ok_or(InvalidHashAlgorithm(self.algo)) + } + pub fn as_slice(&self) -> Result<&[u8], InvalidHashAlgorithm> { match HashAlgorithm::from_u32(self.algo) { Some(algo) => Ok(&self.hash[0..algo.raw_len()]), @@ -55,6 +76,41 @@ impl ObjectID { } } +impl Display for ObjectID { + /// Format this object ID as a hex object ID. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let hash = self.as_slice().unwrap(); + for x in hash { + write!(f, "{:02x}", x)?; + } + Ok(()) + } +} + +impl Debug for ObjectID { + /// Format this object ID as a hex object ID with a colon and name appended to it. + /// + /// ``` + /// assert_eq!( + /// format!("{:?}", HashAlgorithm::SHA256.null_oid()), + /// "0000000000000000000000000000000000000000000000000000000000000000:sha256" + /// ); + /// ``` + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let hash = match self.as_slice() { + Ok(hash) => hash, + Err(_) => &self.hash, + }; + for x in hash { + write!(f, "{:02x}", x)?; + } + match self.algo() { + Ok(algo) => write!(f, ":{}", algo.name()), + Err(e) => write!(f, ":invalid-hash-algo-{}", e.0), + } + } +} + /// A hash algorithm, #[repr(C)] #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] @@ -192,3 +248,78 @@ pub mod c { pub fn hash_algo_ptr_by_number(n: u32) -> *const c_void; } } + +#[cfg(test)] +mod tests { + use super::HashAlgorithm; + + fn all_algos() -> &'static [HashAlgorithm] { + &[HashAlgorithm::SHA1, HashAlgorithm::SHA256] + } + + #[test] + fn format_id_round_trips() { + for algo in all_algos() { + assert_eq!( + *algo, + HashAlgorithm::from_format_id(algo.format_id()).unwrap() + ); + } + } + + #[test] + fn offset_round_trips() { + for algo in all_algos() { + assert_eq!(*algo, HashAlgorithm::from_u32(*algo as u32).unwrap()); + } + } + + #[test] + fn slices_have_correct_length() { + for algo in all_algos() { 
+ for oid in [algo.null_oid(), algo.empty_blob(), algo.empty_tree()] { + assert_eq!(oid.as_slice().unwrap().len(), algo.raw_len()); + } + } + } + + #[test] + fn object_ids_format_correctly() { + let entries = &[ + ( + HashAlgorithm::SHA1.null_oid(), + "0000000000000000000000000000000000000000", + "0000000000000000000000000000000000000000:sha1", + ), + ( + HashAlgorithm::SHA1.empty_blob(), + "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391", + "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391:sha1", + ), + ( + HashAlgorithm::SHA1.empty_tree(), + "4b825dc642cb6eb9a060e54bf8d69288fbee4904", + "4b825dc642cb6eb9a060e54bf8d69288fbee4904:sha1", + ), + ( + HashAlgorithm::SHA256.null_oid(), + "0000000000000000000000000000000000000000000000000000000000000000", + "0000000000000000000000000000000000000000000000000000000000000000:sha256", + ), + ( + HashAlgorithm::SHA256.empty_blob(), + "473a0f4c3be8a93681a267e3b1e9a7dcda1185436fe141f7749120a303721813", + "473a0f4c3be8a93681a267e3b1e9a7dcda1185436fe141f7749120a303721813:sha256", + ), + ( + HashAlgorithm::SHA256.empty_tree(), + "6ef19b41225c5369f1c104d45d8d85efa9b057b53b14b4b9b939dd74decc5321", + "6ef19b41225c5369f1c104d45d8d85efa9b057b53b14b4b9b939dd74decc5321:sha256", + ), + ]; + for (oid, display, debug) in entries { + assert_eq!(format!("{}", oid), *display); + assert_eq!(format!("{:?}", oid), *debug); + } + } +} From 0bbba5f98f864380de6e72e073294ee2a3ca07ad Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:14 +0000 Subject: [PATCH 08/15] csum-file: define hashwrite's count as a uint32_t We want to call this code from Rust and ensure that the types are the same for compatibility, which is easiest to do if the type is a fixed size. Since unsigned int is 32 bits on all the platforms we care about, define it as a uint32_t instead. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- csum-file.c | 2 +- csum-file.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csum-file.c b/csum-file.c index 6e21e3cac8..3d3047c776 100644 --- a/csum-file.c +++ b/csum-file.c @@ -110,7 +110,7 @@ void discard_hashfile(struct hashfile *f) free_hashfile(f); } -void hashwrite(struct hashfile *f, const void *buf, unsigned int count) +void hashwrite(struct hashfile *f, const void *buf, uint32_t count) { while (count) { unsigned left = f->buffer_len - f->offset; diff --git a/csum-file.h b/csum-file.h index 07ae11024a..ecce9d27b0 100644 --- a/csum-file.h +++ b/csum-file.h @@ -63,7 +63,7 @@ void free_hashfile(struct hashfile *f); */ int finalize_hashfile(struct hashfile *, unsigned char *, enum fsync_component, unsigned int); void discard_hashfile(struct hashfile *); -void hashwrite(struct hashfile *, const void *, unsigned int); +void hashwrite(struct hashfile *, const void *, uint32_t); void hashflush(struct hashfile *f); void crc32_begin(struct hashfile *); uint32_t crc32_end(struct hashfile *); From f00c4ace1e9b43591f08dac324a0e7ef3172854d Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:15 +0000 Subject: [PATCH 09/15] write-or-die: add an fsync component for the object map We'll soon be writing out an object map using the hashfile code. Add an fsync component to allow us to handle fsyncing it correctly. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- write-or-die.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/write-or-die.h b/write-or-die.h index 65a5c42a47..ff0408bd84 100644 --- a/write-or-die.h +++ b/write-or-die.h @@ -21,6 +21,7 @@ enum fsync_component { FSYNC_COMPONENT_COMMIT_GRAPH = 1 << 3, FSYNC_COMPONENT_INDEX = 1 << 4, FSYNC_COMPONENT_REFERENCE = 1 << 5, + FSYNC_COMPONENT_OBJECT_MAP = 1 << 6, }; #define FSYNC_COMPONENTS_OBJECTS (FSYNC_COMPONENT_LOOSE_OBJECT | \ @@ -44,7 +45,8 @@ enum fsync_component { FSYNC_COMPONENT_PACK_METADATA | \ FSYNC_COMPONENT_COMMIT_GRAPH | \ FSYNC_COMPONENT_INDEX | \ - FSYNC_COMPONENT_REFERENCE) + FSYNC_COMPONENT_REFERENCE | \ + FSYNC_COMPONENT_OBJECT_MAP) #ifndef FSYNC_COMPONENTS_PLATFORM_DEFAULT #define FSYNC_COMPONENTS_PLATFORM_DEFAULT FSYNC_COMPONENTS_DEFAULT From f29070cb25863cdff181c5cd1f5f9ad6a0c1caed Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:16 +0000 Subject: [PATCH 10/15] hash: expose hash context functions to Rust We'd like to be able to hash our data in Rust using the same contexts as in C. However, we need our helper functions to not be inline so they can be linked into the binary appropriately. In addition, to avoid managing memory manually and since we don't know the size of the hash context structure, we want to have simple alloc and free functions we can use to make sure a context can be easily dynamically created. Expose the helper functions and create alloc, free, and init functions we can call. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- hash.c | 35 +++++++++++++++++++++++++++++++++++ hash.h | 27 +++++++-------------------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/hash.c b/hash.c index 97fd473607..553f2008ea 100644 --- a/hash.c +++ b/hash.c @@ -248,6 +248,41 @@ const struct git_hash_algo *hash_algo_ptr_by_number(uint32_t algo) return &hash_algos[algo]; } +struct git_hash_ctx *git_hash_alloc(void) +{ + return xmalloc(sizeof(struct git_hash_ctx)); +} + +void git_hash_free(struct git_hash_ctx *ctx) +{ + free(ctx); +} + +void git_hash_init(struct git_hash_ctx *ctx, const struct git_hash_algo *algop) +{ + algop->init_fn(ctx); +} + +void git_hash_clone(struct git_hash_ctx *dst, const struct git_hash_ctx *src) +{ + src->algop->clone_fn(dst, src); +} + +void git_hash_update(struct git_hash_ctx *ctx, const void *in, size_t len) +{ + ctx->algop->update_fn(ctx, in, len); +} + +void git_hash_final(unsigned char *hash, struct git_hash_ctx *ctx) +{ + ctx->algop->final_fn(hash, ctx); +} + +void git_hash_final_oid(struct object_id *oid, struct git_hash_ctx *ctx) +{ + ctx->algop->final_oid_fn(oid, ctx); +} + uint32_t hash_algo_by_name(const char *name) { if (!name) diff --git a/hash.h b/hash.h index 709d7585a5..d51efce1d3 100644 --- a/hash.h +++ b/hash.h @@ -320,27 +320,14 @@ struct git_hash_algo { }; extern const struct git_hash_algo hash_algos[GIT_HASH_NALGOS]; -static inline void git_hash_clone(struct git_hash_ctx *dst, const struct git_hash_ctx *src) -{ - src->algop->clone_fn(dst, src); -} - -static inline void git_hash_update(struct git_hash_ctx *ctx, const void *in, size_t len) -{ - ctx->algop->update_fn(ctx, in, len); -} - -static inline void git_hash_final(unsigned char *hash, struct git_hash_ctx *ctx) -{ - ctx->algop->final_fn(hash, ctx); -} - -static inline void git_hash_final_oid(struct object_id *oid, struct git_hash_ctx *ctx) -{ - ctx->algop->final_oid_fn(oid, ctx); -} - +void git_hash_init(struct git_hash_ctx *ctx, const struct 
git_hash_algo *algop); +void git_hash_clone(struct git_hash_ctx *dst, const struct git_hash_ctx *src); +void git_hash_update(struct git_hash_ctx *ctx, const void *in, size_t len); +void git_hash_final(unsigned char *hash, struct git_hash_ctx *ctx); +void git_hash_final_oid(struct object_id *oid, struct git_hash_ctx *ctx); const struct git_hash_algo *hash_algo_ptr_by_number(uint32_t algo); +struct git_hash_ctx *git_hash_alloc(void); +void git_hash_free(struct git_hash_ctx *ctx); /* * Return a GIT_HASH_* constant based on the name. Returns GIT_HASH_UNKNOWN if * the name doesn't match a known algorithm. From ddeec7a34fd355e65cf7d1d111833aba77d122c5 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:17 +0000 Subject: [PATCH 11/15] rust: add a build.rs script for tests Cargo uses the build.rs script to determine how to compile and link a binary. The only binary we're generating, however, is for our tests, but in a future commit, we're going to link against libgit.a for some functionality and we'll need to make sure the test binaries are complete. Add a build.rs file for this case and specify the files we're going to be linking against. Because we cannot specify different dependencies when building our static library versus our tests, update the Makefile to specify these dependencies for our static library to avoid race conditions during build. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- Makefile | 2 +- build.rs | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 build.rs diff --git a/Makefile b/Makefile index 2a926a375b..cab07944e4 100644 --- a/Makefile +++ b/Makefile @@ -2958,7 +2958,7 @@ scalar$X: scalar.o GIT-LDFLAGS $(GITLIBS) $(LIB_FILE): $(LIB_OBJS) $(QUIET_AR)$(RM) $@ && $(AR) $(ARFLAGS) $@ $^ -$(RUST_LIB): Cargo.toml $(RUST_SOURCES) +$(RUST_LIB): Cargo.toml $(RUST_SOURCES) $(LIB_FILE) $(QUIET_CARGO)cargo build $(CARGO_ARGS) .PHONY: rust diff --git a/build.rs b/build.rs new file mode 100644 index 0000000000..3724b3a930 --- /dev/null +++ b/build.rs @@ -0,0 +1,17 @@ +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation: version 2 of the License, dated June 1991. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, see . + +fn main() { + println!("cargo:rustc-link-search=."); + println!("cargo:rustc-link-lib=git"); + println!("cargo:rustc-link-lib=z"); +} From f15c9f93d593b849f21f2f0bfcebafb98548238e Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:18 +0000 Subject: [PATCH 12/15] rust: add functionality to hash an object In a future commit, we'll want to hash some data when dealing with an object map. Let's make this easy by creating a structure to hash objects and calling into the C functions as necessary to perform the hashing. For now, we only implement safe hashing, but in the future we could add unsafe hashing if we want. Implement Clone and Drop to appropriately manage our memory. 
Additionally implement Write to make it easy to use with other formats that implement this trait. While we're at it, add some tests for the various hashing cases. Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- src/hash.rs | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/src/hash.rs b/src/hash.rs index e1fa568661..dea2998de4 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -12,6 +12,7 @@ use std::error::Error; use std::fmt::{self, Debug, Display}; +use std::io::{self, Write}; use std::os::raw::c_void; pub const GIT_MAX_RAWSZ: usize = 32; @@ -111,6 +112,100 @@ impl Debug for ObjectID { } } +/// A trait to implement hashing with a cryptographic algorithm. +pub trait CryptoDigest { + /// Return true if this digest is safe for use with untrusted data, false otherwise. + fn is_safe(&self) -> bool; + + /// Update the digest with the specified data. + fn update(&mut self, data: &[u8]); + + /// Return an object ID, consuming the hasher. + fn into_oid(self) -> ObjectID; + + /// Return a hash as a `Vec`, consuming the hasher. + fn into_vec(self) -> Vec; +} + +/// A structure to hash data with a cryptographic hash algorithm. +/// +/// Instances of this class are safe for use with untrusted data, provided Git has been compiled +/// with a collision-detecting implementation of SHA-1. +pub struct CryptoHasher { + algo: HashAlgorithm, + ctx: *mut c_void, +} + +impl CryptoHasher { + /// Create a new hasher with the algorithm specified with `algo`. + /// + /// This hasher is safe to use on untrusted data. If SHA-1 is selected and Git was compiled + /// with a collision-detecting implementation of SHA-1, then this function will use that + /// implementation and detect any attempts at a collision. 
+ pub fn new(algo: HashAlgorithm) -> Self { + let ctx = unsafe { c::git_hash_alloc() }; + unsafe { c::git_hash_init(ctx, algo.hash_algo_ptr()) }; + Self { algo, ctx } + } +} + +impl CryptoDigest for CryptoHasher { + /// Return true if this digest is safe for use with untrusted data, false otherwise. + fn is_safe(&self) -> bool { + true + } + + /// Update the hasher with the specified data. + fn update(&mut self, data: &[u8]) { + unsafe { c::git_hash_update(self.ctx, data.as_ptr() as *const c_void, data.len()) }; + } + + /// Return an object ID, consuming the hasher. + fn into_oid(self) -> ObjectID { + let mut oid = ObjectID { + hash: [0u8; 32], + algo: self.algo as u32, + }; + unsafe { c::git_hash_final_oid(&mut oid as *mut ObjectID as *mut c_void, self.ctx) }; + oid + } + + /// Return a hash as a `Vec`, consuming the hasher. + fn into_vec(self) -> Vec { + let mut v = vec![0u8; self.algo.raw_len()]; + unsafe { c::git_hash_final(v.as_mut_ptr(), self.ctx) }; + v + } +} + +impl Clone for CryptoHasher { + fn clone(&self) -> Self { + let ctx = unsafe { c::git_hash_alloc() }; + unsafe { c::git_hash_clone(ctx, self.ctx) }; + Self { + algo: self.algo, + ctx, + } + } +} + +impl Drop for CryptoHasher { + fn drop(&mut self) { + unsafe { c::git_hash_free(self.ctx) }; + } +} + +impl Write for CryptoHasher { + fn write(&mut self, data: &[u8]) -> io::Result { + self.update(data); + Ok(data.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + /// A hash algorithm, #[repr(C)] #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] @@ -239,6 +334,11 @@ impl HashAlgorithm { pub fn hash_algo_ptr(self) -> *const c_void { unsafe { c::hash_algo_ptr_by_number(self as u32) } } + + /// Create a hasher for this algorithm. 
+ pub fn hasher(self) -> CryptoHasher { + CryptoHasher::new(self) + } } pub mod c { @@ -246,12 +346,21 @@ pub mod c { extern "C" { pub fn hash_algo_ptr_by_number(n: u32) -> *const c_void; + pub fn unsafe_hash_algo(algop: *const c_void) -> *const c_void; + pub fn git_hash_alloc() -> *mut c_void; + pub fn git_hash_free(ctx: *mut c_void); + pub fn git_hash_init(dst: *mut c_void, algop: *const c_void); + pub fn git_hash_clone(dst: *mut c_void, src: *const c_void); + pub fn git_hash_update(ctx: *mut c_void, inp: *const c_void, len: usize); + pub fn git_hash_final(hash: *mut u8, ctx: *mut c_void); + pub fn git_hash_final_oid(hash: *mut c_void, ctx: *mut c_void); } } #[cfg(test)] mod tests { - use super::HashAlgorithm; + use super::{CryptoDigest, HashAlgorithm, ObjectID}; + use std::io::Write; fn all_algos() -> &'static [HashAlgorithm] { &[HashAlgorithm::SHA1, HashAlgorithm::SHA256] @@ -322,4 +431,36 @@ mod tests { assert_eq!(format!("{:?}", oid), *debug); } } + + #[test] + fn hasher_works_correctly() { + for algo in all_algos() { + let tests: &[(&[u8], &ObjectID)] = &[ + (b"blob 0\0", algo.empty_blob()), + (b"tree 0\0", algo.empty_tree()), + ]; + for (data, oid) in tests { + let mut h = algo.hasher(); + assert!(h.is_safe()); + // Test that this works incrementally. + h.update(&data[0..2]); + h.update(&data[2..]); + + let h2 = h.clone(); + + let actual_oid = h.into_oid(); + assert_eq!(**oid, actual_oid); + + let v = h2.into_vec(); + assert_eq!((*oid).as_slice().unwrap(), &v); + + let mut h = algo.hasher(); + h.write_all(&data[0..2]).unwrap(); + h.write_all(&data[2..]).unwrap(); + + let actual_oid = h.into_oid(); + assert_eq!(**oid, actual_oid); + } + } + } } From ca05bfbae07af8d2c3b710b04be55115cc8f7036 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:19 +0000 Subject: [PATCH 13/15] rust: add a new binary object map format Our current loose object format has a few problems. 
First, it is not efficient: the list of object IDs is not sorted and even if it were, there would not be an efficient way to look up objects in both algorithms. Second, we need to store mappings for things which are not technically loose objects but are not packed objects, either, and so cannot be stored in a pack index. These kinds of things include shallows, their parents, and their trees, as well as submodules. Yet we also need to implement a sensible way to store the kind of object so that we can prune unneeded entries. For instance, if the user has updated the shallows, we can remove the old values. For these reasons, introduce a new binary object map format. The careful reader will notice that it resembles very closely the pack index v3 format. Add an in-memory object map as well, and allow writing to a batched map, which can then be written later as one of the binary object maps. Include several tests for round tripping and data lookup across algorithms. Note that the use of this code elsewhere in Git will involve some C code and some C-compatible code in Rust that will be introduced in a future commit. Thus, for example, we ignore the fact that if there is no current batch and the caller asks for data to be written, this code does nothing, mostly because this code also does not involve itself with opening or manipulating files. The C code that we will add later will implement this functionality at a higher level and take care of this, since the code which is necessary for writing to the object store is deeply involved with our C abstractions and it would require extensive work (which would not be especially valuable at this point) to port those to Rust. Signed-off-by: brian m. 
carlson Signed-off-by: Junio C Hamano --- Documentation/gitformat-loose.adoc | 78 +++ Makefile | 1 + src/lib.rs | 1 + src/loose.rs | 913 +++++++++++++++++++++++++++++ src/meson.build | 1 + 5 files changed, 994 insertions(+) create mode 100644 src/loose.rs diff --git a/Documentation/gitformat-loose.adoc b/Documentation/gitformat-loose.adoc index 947993663e..b0b569761b 100644 --- a/Documentation/gitformat-loose.adoc +++ b/Documentation/gitformat-loose.adoc @@ -10,6 +10,7 @@ SYNOPSIS -------- [verse] $GIT_DIR/objects/[0-9a-f][0-9a-f]/* +$GIT_DIR/objects/object-map/map-*.map DESCRIPTION ----------- @@ -48,6 +49,83 @@ stored under Similarly, a blob containing the contents `abc` would have the uncompressed data of `blob 3\0abc`. +== Loose object mapping + +When the `compatObjectFormat` option is used, Git needs to store a mapping +between the repository's main algorithm and the compatibility algorithm for +loose objects as well as some auxiliary information. + +The mapping consists of a set of files under `$GIT_DIR/objects/object-map` +ending in `.map`. The portion of the filename before the extension is that of +the main hash checksum (that is, the one specified in +`extensions.objectformat`) in hex format. + +`git gc` will repack existing entries into one file, removing any unnecessary +objects, such as obsolete shallow entries or loose objects that have been +packed. + +The file format is as follows. All values are in network byte order and all +4-byte and 8-byte values must be 4-byte aligned in the file, so the NUL padding +may be required in some cases. Git always uses the smallest number of NUL +bytes (including zero) that is required for the padding in order to make +writing files deterministic. + +- A header appears at the beginning and consists of the following: + * A 4-byte mapping signature: `LMAP` + * 4-byte version number: 1 + * 4-byte length of the header section (including reserved entries but + excluding any NUL padding). 
+ * 4-byte number of objects declared in this map file. + * 4-byte number of object formats declared in this map file. + * For each object format: + ** 4-byte format identifier (e.g., `sha1` for SHA-1) + ** 4-byte length in bytes of shortened object names (that is, prefixes of + the full object names). This is the shortest possible length needed to + make names in the shortened object name table unambiguous. + ** 8-byte integer, recording where tables relating to this format + are stored in this map file, as an offset from the beginning. + * 8-byte offset to the trailer from the beginning of this file. + * The remainder of the header section is reserved for future use. + Readers must ignore unrecognized data here. +- Zero or more NUL bytes. These are used to improve the alignment of the + 4-byte quantities below. +- Tables for the first object format: + * A sorted table of shortened object names. These are prefixes of the names + of all objects in this file, packed together to reduce the cache footprint + of the binary search for a specific object name. + * A sorted table of full object names. + * A table of 4-byte metadata values. +- Zero or more NUL bytes. +- Tables for subsequent object formats: + * A sorted table of shortened object names. These are prefixes of the names + of all objects in this file, packed together without offset values to + reduce the cache footprint of the binary search for a specific object name. + * A table of full object names in the order specified by the first object format. + * A table of 4-byte values mapping object name order to the order of the + first object format. For an object in the table of sorted shortened object + names, the value at the corresponding index in this table is the index in + the previous table for that same object. + * Zero or more NUL bytes. +- The trailer consists of the following: + * Hash checksum of all of the above using the main hash.
+ +The lower six bits of each metadata table contain a type field indicating the +reason that this object is stored: + +0:: + Reserved. +1:: + This object is stored as a loose object in the repository. +2:: + This object is a shallow entry. The mapping refers to a shallow value + returned by a remote server. +3:: + This object is a submodule entry. The mapping refers to the commit stored + representing a submodule. + +Other data may be stored in this field in the future. Bits that are not used +must be zero. + GIT --- Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index cab07944e4..ecca75f431 100644 --- a/Makefile +++ b/Makefile @@ -1530,6 +1530,7 @@ UNIT_TEST_OBJS += $(UNIT_TEST_DIR)/test-lib.o RUST_SOURCES += src/hash.rs RUST_SOURCES += src/lib.rs +RUST_SOURCES += src/loose.rs RUST_SOURCES += src/varint.rs GIT-VERSION-FILE: FORCE diff --git a/src/lib.rs b/src/lib.rs index cf7c962509..442f9433dc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,3 @@ pub mod hash; +pub mod loose; pub mod varint; diff --git a/src/loose.rs b/src/loose.rs new file mode 100644 index 0000000000..24accf9c33 --- /dev/null +++ b/src/loose.rs @@ -0,0 +1,913 @@ +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation: version 2 of the License, dated June 1991. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, see . + +use crate::hash::{HashAlgorithm, ObjectID, GIT_MAX_RAWSZ}; +use std::collections::BTreeMap; +use std::convert::TryInto; +use std::io::{self, Write}; + +/// The type of object stored in the map. 
+/// +/// If this value is `Reserved`, then it is never written to disk and is used primarily to store +/// certain hard-coded objects, like the empty tree, empty blob, or null object ID. +/// +/// If this value is `LooseObject`, then this represents a loose object. `Shallow` represents a +/// shallow commit, its parent, or its tree. `Submodule` represents a submodule commit. +#[repr(C)] +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)] +pub enum MapType { + Reserved = 0, + LooseObject = 1, + Shallow = 2, + Submodule = 3, +} + +impl MapType { + pub fn from_u32(n: u32) -> Option { + match n { + 0 => Some(Self::Reserved), + 1 => Some(Self::LooseObject), + 2 => Some(Self::Shallow), + 3 => Some(Self::Submodule), + _ => None, + } + } +} + +/// The value of an object stored in a `ObjectMemoryMap`. +/// +/// This keeps the object ID to which the key is mapped and its kind together. +struct MappedObject { + oid: ObjectID, + kind: MapType, +} + +/// Memory storage for a loose object. +struct ObjectMemoryMap { + to_compat: BTreeMap, + to_storage: BTreeMap, + compat: HashAlgorithm, + storage: HashAlgorithm, +} + +impl ObjectMemoryMap { + /// Create a new `ObjectMemoryMap`. + /// + /// The storage and compatibility `HashAlgorithm` instances are used to store the object IDs in + /// the correct map. + fn new(storage: HashAlgorithm, compat: HashAlgorithm) -> Self { + Self { + to_compat: BTreeMap::new(), + to_storage: BTreeMap::new(), + compat, + storage, + } + } + + fn len(&self) -> usize { + self.to_compat.len() + } + + /// Write this map to an interface implementing `std::io::Write`. 
+ fn write(&self, wrtr: W) -> io::Result<()> { + const VERSION_NUMBER: u32 = 1; + const NUM_OBJECT_FORMATS: u32 = 2; + const PADDING: [u8; 4] = [0u8; 4]; + + let mut wrtr = wrtr; + let header_size: u32 = (4 * 5) + (4 + 4 + 8) * NUM_OBJECT_FORMATS + 8; + + wrtr.write_all(b"LMAP")?; + wrtr.write_all(&VERSION_NUMBER.to_be_bytes())?; + wrtr.write_all(&header_size.to_be_bytes())?; + wrtr.write_all(&(self.to_compat.len() as u32).to_be_bytes())?; + wrtr.write_all(&NUM_OBJECT_FORMATS.to_be_bytes())?; + + let storage_short_len = self.find_short_name_len(&self.to_compat, self.storage); + let compat_short_len = self.find_short_name_len(&self.to_storage, self.compat); + + let storage_npadding = Self::required_nul_padding(self.to_compat.len(), storage_short_len); + let compat_npadding = Self::required_nul_padding(self.to_compat.len(), compat_short_len); + + let mut offset: u64 = header_size as u64; + + for (algo, len, npadding) in &[ + (self.storage, storage_short_len, storage_npadding), + (self.compat, compat_short_len, compat_npadding), + ] { + wrtr.write_all(&algo.format_id().to_be_bytes())?; + wrtr.write_all(&(*len as u32).to_be_bytes())?; + + offset += *npadding; + wrtr.write_all(&offset.to_be_bytes())?; + + offset += self.to_compat.len() as u64 * (*len as u64 + algo.raw_len() as u64 + 4); + } + + wrtr.write_all(&offset.to_be_bytes())?; + + let order_map: BTreeMap<&ObjectID, usize> = self + .to_compat + .keys() + .enumerate() + .map(|(i, oid)| (oid, i)) + .collect(); + + wrtr.write_all(&PADDING[0..storage_npadding as usize])?; + for oid in self.to_compat.keys() { + wrtr.write_all(&oid.as_slice().unwrap()[0..storage_short_len])?; + } + for oid in self.to_compat.keys() { + wrtr.write_all(oid.as_slice().unwrap())?; + } + for meta in self.to_compat.values() { + wrtr.write_all(&(meta.kind as u32).to_be_bytes())?; + } + + wrtr.write_all(&PADDING[0..compat_npadding as usize])?; + for oid in self.to_storage.keys() { + wrtr.write_all(&oid.as_slice().unwrap()[0..compat_short_len])?; 
+ } + for meta in self.to_compat.values() { + wrtr.write_all(meta.oid.as_slice().unwrap())?; + } + for meta in self.to_storage.values() { + wrtr.write_all(&(order_map[&meta.oid] as u32).to_be_bytes())?; + } + + Ok(()) + } + + fn required_nul_padding(nitems: usize, short_len: usize) -> u64 { + let shortened_table_len = nitems as u64 * short_len as u64; + let misalignment = shortened_table_len & 3; + // If the value is 0, return 0; otherwise, return the difference from 4. + (4 - misalignment) & 3 + } + + fn last_matching_offset(a: &ObjectID, b: &ObjectID, algop: HashAlgorithm) -> usize { + for i in 0..=algop.raw_len() { + if a.hash[i] != b.hash[i] { + return i; + } + } + algop.raw_len() + } + + fn find_short_name_len( + &self, + map: &BTreeMap, + algop: HashAlgorithm, + ) -> usize { + if map.len() <= 1 { + return 1; + } + let mut len = 1; + let mut iter = map.keys(); + let mut cur = match iter.next() { + Some(cur) => cur, + None => return len, + }; + for item in iter { + let offset = Self::last_matching_offset(cur, item, algop); + if offset >= len { + len = offset + 1; + } + cur = item; + } + if len > algop.raw_len() { + algop.raw_len() + } else { + len + } + } +} + +struct ObjectFormatData { + data_off: usize, + shortened_len: usize, + full_off: usize, + mapping_off: Option, +} + +pub struct MmapedObjectMapIter<'a> { + offset: usize, + algos: Vec, + source: &'a MmapedObjectMap<'a>, +} + +impl<'a> Iterator for MmapedObjectMapIter<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.offset >= self.source.nitems { + return None; + } + let offset = self.offset; + self.offset += 1; + let v: Vec = self + .algos + .iter() + .cloned() + .filter_map(|algo| self.source.oid_from_offset(offset, algo)) + .collect(); + if v.len() != self.algos.len() { + return None; + } + Some(v) + } +} + +#[allow(dead_code)] +pub struct MmapedObjectMap<'a> { + memory: &'a [u8], + nitems: usize, + meta_off: usize, + obj_formats: BTreeMap, + main_algo: HashAlgorithm, +} + 
+#[derive(Debug)] +#[allow(dead_code)] +enum MmapedParseError { + HeaderTooSmall, + InvalidSignature, + InvalidVersion, + UnknownAlgorithm, + OffsetTooLarge, + TooFewObjectFormats, + UnalignedData, + InvalidTrailerOffset, +} + +#[allow(dead_code)] +impl<'a> MmapedObjectMap<'a> { + fn new( + slice: &'a [u8], + hash_algo: HashAlgorithm, + ) -> Result, MmapedParseError> { + let object_format_header_size = 4 + 4 + 8; + let trailer_offset_size = 8; + let header_size: usize = + 4 + 4 + 4 + 4 + 4 + object_format_header_size * 2 + trailer_offset_size; + if slice.len() < header_size { + return Err(MmapedParseError::HeaderTooSmall); + } + if slice[0..4] != *b"LMAP" { + return Err(MmapedParseError::InvalidSignature); + } + if Self::u32_at_offset(slice, 4) != 1 { + return Err(MmapedParseError::InvalidVersion); + } + let _ = Self::u32_at_offset(slice, 8) as usize; + let nitems = Self::u32_at_offset(slice, 12) as usize; + let nobj_formats = Self::u32_at_offset(slice, 16) as usize; + if nobj_formats < 2 { + return Err(MmapedParseError::TooFewObjectFormats); + } + let mut offset = 20; + let mut meta_off = None; + let mut data = BTreeMap::new(); + for i in 0..nobj_formats { + if offset + object_format_header_size + trailer_offset_size > slice.len() { + return Err(MmapedParseError::HeaderTooSmall); + } + let format_id = Self::u32_at_offset(slice, offset); + let shortened_len = Self::u32_at_offset(slice, offset + 4) as usize; + let data_off = Self::u64_at_offset(slice, offset + 8); + + let algo = HashAlgorithm::from_format_id(format_id) + .ok_or(MmapedParseError::UnknownAlgorithm)?; + let data_off: usize = data_off + .try_into() + .map_err(|_| MmapedParseError::OffsetTooLarge)?; + + // Every object format must have these entries. 
+ let shortened_table_len = shortened_len + .checked_mul(nitems) + .ok_or(MmapedParseError::OffsetTooLarge)?; + let full_off = data_off + .checked_add(shortened_table_len) + .ok_or(MmapedParseError::OffsetTooLarge)?; + Self::verify_aligned(full_off)?; + Self::verify_valid(slice, full_off as u64)?; + + let full_length = algo + .raw_len() + .checked_mul(nitems) + .ok_or(MmapedParseError::OffsetTooLarge)?; + let off = full_length + .checked_add(full_off) + .ok_or(MmapedParseError::OffsetTooLarge)?; + Self::verify_aligned(off)?; + Self::verify_valid(slice, off as u64)?; + + // This is for the metadata for the first object format and for the order mapping for + // other object formats. + let meta_size = nitems + .checked_mul(4) + .ok_or(MmapedParseError::OffsetTooLarge)?; + let meta_end = off + .checked_add(meta_size) + .ok_or(MmapedParseError::OffsetTooLarge)?; + Self::verify_valid(slice, meta_end as u64)?; + + let mut mapping_off = None; + if i == 0 { + meta_off = Some(off); + } else { + mapping_off = Some(off); + } + + data.insert( + algo, + ObjectFormatData { + data_off, + shortened_len, + full_off, + mapping_off, + }, + ); + offset += object_format_header_size; + } + let trailer = Self::u64_at_offset(slice, offset); + Self::verify_aligned(trailer as usize)?; + Self::verify_valid(slice, trailer)?; + let end = trailer + .checked_add(hash_algo.raw_len() as u64) + .ok_or(MmapedParseError::OffsetTooLarge)?; + if end != slice.len() as u64 { + return Err(MmapedParseError::InvalidTrailerOffset); + } + match meta_off { + Some(meta_off) => Ok(MmapedObjectMap { + memory: slice, + nitems, + meta_off, + obj_formats: data, + main_algo: hash_algo, + }), + None => Err(MmapedParseError::TooFewObjectFormats), + } + } + + fn iter(&self) -> MmapedObjectMapIter<'_> { + let mut algos = Vec::with_capacity(self.obj_formats.len()); + algos.push(self.main_algo); + for algo in self.obj_formats.keys().cloned() { + if algo != self.main_algo { + algos.push(algo); + } + } + MmapedObjectMapIter { 
+ offset: 0, + algos, + source: self, + } + } + + /// Treats `sl` as if it were a set of slices of `wanted.len()` bytes, and searches for + /// `wanted` within it. + /// + /// If found, returns the index of the matching `wanted.len()`-byte chunk within `sl`. + /// + /// ``` + /// let sl = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + /// + /// assert_eq!(MmapedObjectMap::binary_search_slice(sl, &[2, 3]), Some(1)); + /// assert_eq!(MmapedObjectMap::binary_search_slice(sl, &[6, 7]), Some(4)); + /// assert_eq!(MmapedObjectMap::binary_search_slice(sl, &[1, 2]), None); + /// assert_eq!(MmapedObjectMap::binary_search_slice(sl, &[10, 20]), None); + /// ``` + fn binary_search_slice(sl: &[u8], wanted: &[u8]) -> Option { + let len = wanted.len(); + let res = sl.binary_search_by(|item| { + // We would like element_offset, but that is currently nightly only. Instead, do a + // pointer subtraction to find the index. + let index = unsafe { (item as *const u8).offset_from(sl.as_ptr()) } as usize; + // Now we have the index of this object. Round it down to the nearest full-sized + // chunk to find the actual offset where this starts. + let index = index - (index % len); + // Compute the comparison of that value instead, which will provide the expected + // result. + sl[index..index + wanted.len()].cmp(wanted) + }); + res.ok().map(|offset| offset / len) + } + + /// Look up `oid`, which may be in any object format present in the map. + /// + /// If this object is in the map, return its index in the table for the main algorithm. + fn look_up_object(&self, oid: &ObjectID) -> Option { + let oid_algo = HashAlgorithm::from_u32(oid.algo)?; + let params = self.obj_formats.get(&oid_algo)?; + let short_table = + &self.memory[params.data_off..params.data_off + (params.shortened_len * self.nitems)]; + let index = Self::binary_search_slice( + short_table, + &oid.as_slice().unwrap()[0..params.shortened_len], + )?; + match params.mapping_off { + Some(from_off) => { + // oid is in a compatibility algorithm. Find the mapping index.
+ let mapped = Self::u32_at_offset(self.memory, from_off + index * 4) as usize; + if mapped >= self.nitems { + return None; + } + let oid_offset = params.full_off + mapped * oid_algo.raw_len(); + if self.memory[oid_offset..oid_offset + oid_algo.raw_len()] + != *oid.as_slice().unwrap() + { + return None; + } + Some(mapped) + } + None => { + // oid is in the main algorithm. Find the object ID in the main map to confirm + // it's correct. + let oid_offset = params.full_off + index * oid_algo.raw_len(); + if self.memory[oid_offset..oid_offset + oid_algo.raw_len()] + != *oid.as_slice().unwrap() + { + return None; + } + Some(index) + } + } + } + + #[allow(dead_code)] + fn map_object(&self, oid: &ObjectID, algo: HashAlgorithm) -> Option { + let main = self.look_up_object(oid)?; + let meta = MapType::from_u32(Self::u32_at_offset(self.memory, self.meta_off + (main * 4)))?; + Some(MappedObject { + oid: self.oid_from_offset(main, algo)?, + kind: meta, + }) + } + + fn map_oid(&self, oid: &ObjectID, algo: HashAlgorithm) -> Option { + if algo as u32 == oid.algo { + return Some(oid.clone()); + } + + let main = self.look_up_object(oid)?; + self.oid_from_offset(main, algo) + } + + fn oid_from_offset(&self, offset: usize, algo: HashAlgorithm) -> Option { + let aparams = self.obj_formats.get(&algo)?; + + let mut hash = [0u8; GIT_MAX_RAWSZ]; + let len = algo.raw_len(); + let oid_off = aparams.full_off + (offset * len); + hash[0..len].copy_from_slice(&self.memory[oid_off..oid_off + len]); + Some(ObjectID { + hash, + algo: algo as u32, + }) + } + + fn u32_at_offset(slice: &[u8], offset: usize) -> u32 { + u32::from_be_bytes(slice[offset..offset + 4].try_into().unwrap()) + } + + fn u64_at_offset(slice: &[u8], offset: usize) -> u64 { + u64::from_be_bytes(slice[offset..offset + 8].try_into().unwrap()) + } + + fn verify_aligned(offset: usize) -> Result<(), MmapedParseError> { + if (offset & 3) != 0 { + return Err(MmapedParseError::UnalignedData); + } + Ok(()) + } + + fn verify_valid(slice: 
&[u8], offset: u64) -> Result<(), MmapedParseError> { + if offset >= slice.len() as u64 { + return Err(MmapedParseError::OffsetTooLarge); + } + Ok(()) + } +} + +/// A map for loose and other non-packed object IDs that maps between a storage and compatibility +/// hash algorithm. +/// +/// In addition to the in-memory option, there is an optional batched storage, which can be used to +/// write objects to disk in an efficient way. +pub struct ObjectMap { + mem: ObjectMemoryMap, + batch: Option, +} + +impl ObjectMap { + /// Create a new `ObjectMap` with the given hash algorithms. + /// + /// This initializes the memory map to automatically map the empty tree, empty blob, and null + /// object ID. + pub fn new(storage: HashAlgorithm, compat: HashAlgorithm) -> Self { + let mut map = ObjectMemoryMap::new(storage, compat); + for (main, compat) in &[ + (storage.empty_tree(), compat.empty_tree()), + (storage.empty_blob(), compat.empty_blob()), + (storage.null_oid(), compat.null_oid()), + ] { + map.to_storage.insert( + (*compat).clone(), + MappedObject { + oid: (*main).clone(), + kind: MapType::Reserved, + }, + ); + map.to_compat.insert( + (*main).clone(), + MappedObject { + oid: (*compat).clone(), + kind: MapType::Reserved, + }, + ); + } + Self { + mem: map, + batch: None, + } + } + + pub fn hash_algo(&self) -> HashAlgorithm { + self.mem.storage + } + + /// Start a batch for efficient writing. + /// + /// If there is already a batch started, this does nothing and the existing batch is retained. + pub fn start_batch(&mut self) { + if self.batch.is_none() { + self.batch = Some(ObjectMemoryMap::new(self.mem.storage, self.mem.compat)); + } + } + + pub fn batch_len(&self) -> Option { + self.batch.as_ref().map(|b| b.len()) + } + + /// If a batch exists, write it to the writer. + pub fn finish_batch(&mut self, w: W) -> io::Result<()> { + if let Some(txn) = self.batch.take() { + txn.write(w)?; + } + Ok(()) + } + + /// If a batch exists, discard it without writing it.
+ pub fn abort_batch(&mut self) { + self.batch = None; + } + + /// Return whether there is a batch already started. + /// + /// If you just want a batch to exist and don't care whether one has already been started, you + /// may simply call `start_batch` unconditionally. + pub fn has_batch(&self) -> bool { + self.batch.is_some() + } + + /// Insert an object into the map. + /// + /// If `write` is true and there is a batch started, write the object into the batch as well as + /// into the memory map. + pub fn insert(&mut self, oid1: &ObjectID, oid2: &ObjectID, kind: MapType, write: bool) { + let (compat_oid, storage_oid) = + if HashAlgorithm::from_u32(oid1.algo) == Some(self.mem.compat) { + (oid1, oid2) + } else { + (oid2, oid1) + }; + Self::insert_into(&mut self.mem, storage_oid, compat_oid, kind); + if write { + if let Some(ref mut batch) = self.batch { + Self::insert_into(batch, storage_oid, compat_oid, kind); + } + } + } + + fn insert_into( + map: &mut ObjectMemoryMap, + storage: &ObjectID, + compat: &ObjectID, + kind: MapType, + ) { + map.to_compat.insert( + storage.clone(), + MappedObject { + oid: compat.clone(), + kind, + }, + ); + map.to_storage.insert( + compat.clone(), + MappedObject { + oid: storage.clone(), + kind, + }, + ); + } + + #[allow(dead_code)] + fn map_object(&self, oid: &ObjectID, algo: HashAlgorithm) -> Option<&MappedObject> { + let map = if algo == self.mem.storage { + &self.mem.to_storage + } else { + &self.mem.to_compat + }; + map.get(oid) + } + + #[allow(dead_code)] + fn map_oid<'a, 'b: 'a>( + &'b self, + oid: &'a ObjectID, + algo: HashAlgorithm, + ) -> Option<&'a ObjectID> { + if algo as u32 == oid.algo { + return Some(oid); + } + let entry = self.map_object(oid, algo); + entry.map(|obj| &obj.oid) + } +} + +#[cfg(test)] +mod tests { + use super::{MapType, MmapedObjectMap, ObjectMap, ObjectMemoryMap}; + use crate::hash::{CryptoDigest, CryptoHasher, HashAlgorithm, ObjectID}; + use std::convert::TryInto; + use std::io::{self, Cursor, Write}; 
+ + struct TrailingWriter { + curs: Cursor>, + hasher: CryptoHasher, + } + + impl TrailingWriter { + fn new() -> Self { + Self { + curs: Cursor::new(Vec::new()), + hasher: CryptoHasher::new(HashAlgorithm::SHA256), + } + } + + fn finalize(mut self) -> Vec { + let _ = self.hasher.flush(); + let mut v = self.curs.into_inner(); + v.extend(self.hasher.into_vec()); + v + } + } + + impl Write for TrailingWriter { + fn write(&mut self, data: &[u8]) -> io::Result { + self.hasher.write_all(data)?; + self.curs.write_all(data)?; + Ok(data.len()) + } + + fn flush(&mut self) -> io::Result<()> { + self.hasher.flush()?; + self.curs.flush()?; + Ok(()) + } + } + + fn sha1_oid(b: &[u8]) -> ObjectID { + assert_eq!(b.len(), 20); + let mut data = [0u8; 32]; + data[0..20].copy_from_slice(b); + ObjectID { + hash: data, + algo: HashAlgorithm::SHA1 as u32, + } + } + + fn sha256_oid(b: &[u8]) -> ObjectID { + assert_eq!(b.len(), 32); + ObjectID { + hash: b.try_into().unwrap(), + algo: HashAlgorithm::SHA256 as u32, + } + } + + #[allow(clippy::type_complexity)] + fn test_entries() -> &'static [(&'static str, &'static [u8], &'static [u8], MapType, bool)] { + // These are all example blobs containing the content in the first argument. 
+ &[ + ("abc", b"\xf2\xba\x8f\x84\xab\x5c\x1b\xce\x84\xa7\xb4\x41\xcb\x19\x59\xcf\xc7\x09\x3b\x7f", b"\xc1\xcf\x6e\x46\x50\x77\x93\x0e\x88\xdc\x51\x36\x64\x1d\x40\x2f\x72\xa2\x29\xdd\xd9\x96\xf6\x27\xd6\x0e\x96\x39\xea\xba\x35\xa6", MapType::LooseObject, false), + ("def", b"\x0c\x00\x38\x32\xe7\xbf\xa9\xca\x8b\x5c\x20\x35\xc9\xbd\x68\x4a\x5f\x26\x23\xbc", b"\x8a\x90\x17\x26\x48\x4d\xb0\xf2\x27\x9f\x30\x8d\x58\x96\xd9\x6b\xf6\x3a\xd6\xde\x95\x7c\xa3\x8a\xdc\x33\x61\x68\x03\x6e\xf6\x63", MapType::Shallow, true), + ("ghi", b"\x45\xa8\x2e\x29\x5c\x52\x47\x31\x14\xc5\x7c\x18\xf4\xf5\x23\x68\xdf\x2a\x3c\xfd", b"\x6e\x47\x4c\x74\xf5\xd7\x78\x14\xc7\xf7\xf0\x7c\x37\x80\x07\x90\x53\x42\xaf\x42\x81\xe6\x86\x8d\x33\x46\x45\x4b\xb8\x63\xab\xc3", MapType::Submodule, false), + ("jkl", b"\x45\x32\x8c\x36\xff\x2e\x9b\x9b\x4e\x59\x2c\x84\x7d\x3f\x9a\x7f\xd9\xb3\xe7\x16", b"\xc3\xee\xf7\x54\xa2\x1e\xc6\x9d\x43\x75\xbe\x6f\x18\x47\x89\xa8\x11\x6f\xd9\x66\xfc\x67\xdc\x31\xd2\x11\x15\x42\xc8\xd5\xa0\xaf", MapType::LooseObject, true), + ] + } + + fn test_map(write_all: bool) -> Box { + let mut map = Box::new(ObjectMap::new(HashAlgorithm::SHA256, HashAlgorithm::SHA1)); + + map.start_batch(); + + for (_blob_content, sha1, sha256, kind, swap) in test_entries() { + let s256 = sha256_oid(sha256); + let s1 = sha1_oid(sha1); + let write = write_all || (*kind as u32 & 2) == 0; + if *swap { + // Insert the item into the batch arbitrarily based on the type. This tests that + // we can specify either order and we'll do the right thing. 
+ map.insert(&s256, &s1, *kind, write); + } else { + map.insert(&s1, &s256, *kind, write); + } + } + + map + } + + #[test] + fn can_read_and_write_format() { + for full in &[true, false] { + let mut map = test_map(*full); + let mut wrtr = TrailingWriter::new(); + map.finish_batch(&mut wrtr).unwrap(); + + assert!(!map.has_batch()); + + let data = wrtr.finalize(); + MmapedObjectMap::new(&data, HashAlgorithm::SHA256).unwrap(); + } + } + + #[test] + fn looks_up_from_mmaped() { + let mut map = test_map(true); + let mut wrtr = TrailingWriter::new(); + map.finish_batch(&mut wrtr).unwrap(); + + assert!(!map.has_batch()); + + let data = wrtr.finalize(); + let entries = test_entries(); + let map = MmapedObjectMap::new(&data, HashAlgorithm::SHA256).unwrap(); + + for (_, sha1, sha256, kind, _) in entries { + let s256 = sha256_oid(sha256); + let s1 = sha1_oid(sha1); + + let res = map.map_object(&s256, HashAlgorithm::SHA1).unwrap(); + assert_eq!(res.oid, s1); + assert_eq!(res.kind, *kind); + let res = map.map_oid(&s256, HashAlgorithm::SHA1).unwrap(); + assert_eq!(res, s1); + + let res = map.map_object(&s256, HashAlgorithm::SHA256).unwrap(); + assert_eq!(res.oid, s256); + assert_eq!(res.kind, *kind); + let res = map.map_oid(&s256, HashAlgorithm::SHA256).unwrap(); + assert_eq!(res, s256); + + let res = map.map_object(&s1, HashAlgorithm::SHA256).unwrap(); + assert_eq!(res.oid, s256); + assert_eq!(res.kind, *kind); + let res = map.map_oid(&s1, HashAlgorithm::SHA256).unwrap(); + assert_eq!(res, s256); + + let res = map.map_object(&s1, HashAlgorithm::SHA1).unwrap(); + assert_eq!(res.oid, s1); + assert_eq!(res.kind, *kind); + let res = map.map_oid(&s1, HashAlgorithm::SHA1).unwrap(); + assert_eq!(res, s1); + } + + for octet in &[0x00u8, 0x6d, 0x6e, 0x8a, 0xff] { + let missing_oid = ObjectID { + hash: [*octet; 32], + algo: HashAlgorithm::SHA256 as u32, + }; + + assert!(map.map_object(&missing_oid, HashAlgorithm::SHA1).is_none()); + assert!(map.map_oid(&missing_oid, 
HashAlgorithm::SHA1).is_none()); + + assert_eq!( + map.map_oid(&missing_oid, HashAlgorithm::SHA256).unwrap(), + missing_oid + ); + } + } + + #[test] + fn binary_searches_slices_correctly() { + let sl = &[ + 0, 1, 2, 15, 14, 13, 18, 10, 2, 20, 20, 20, 21, 21, 0, 21, 21, 1, 21, 21, 21, 21, 21, + 22, 22, 23, 24, + ]; + + let expected: &[(&[u8], Option)] = &[ + (&[0, 1, 2], Some(0)), + (&[15, 14, 13], Some(1)), + (&[18, 10, 2], Some(2)), + (&[20, 20, 20], Some(3)), + (&[21, 21, 0], Some(4)), + (&[21, 21, 1], Some(5)), + (&[21, 21, 21], Some(6)), + (&[21, 21, 22], Some(7)), + (&[22, 23, 24], Some(8)), + (&[2, 15, 14], None), + (&[0, 21, 21], None), + (&[21, 21, 23], None), + (&[22, 22, 23], None), + (&[0xff, 0xff, 0xff], None), + (&[0, 0, 0], None), + ]; + + for (wanted, value) in expected { + assert_eq!(MmapedObjectMap::binary_search_slice(sl, wanted), *value); + } + } + + #[test] + fn looks_up_oid_correctly() { + let map = test_map(false); + let entries = test_entries(); + + let s256 = sha256_oid(entries[0].2); + let s1 = sha1_oid(entries[0].1); + + let missing_oid = ObjectID { + hash: [0xffu8; 32], + algo: HashAlgorithm::SHA256 as u32, + }; + + let res = map.map_object(&s256, HashAlgorithm::SHA1).unwrap(); + assert_eq!(res.oid, s1); + assert_eq!(res.kind, MapType::LooseObject); + let res = map.map_oid(&s256, HashAlgorithm::SHA1).unwrap(); + assert_eq!(*res, s1); + + let res = map.map_object(&s1, HashAlgorithm::SHA256).unwrap(); + assert_eq!(res.oid, s256); + assert_eq!(res.kind, MapType::LooseObject); + let res = map.map_oid(&s1, HashAlgorithm::SHA256).unwrap(); + assert_eq!(*res, s256); + + assert!(map.map_object(&missing_oid, HashAlgorithm::SHA1).is_none()); + assert!(map.map_oid(&missing_oid, HashAlgorithm::SHA1).is_none()); + + assert_eq!( + *map.map_oid(&missing_oid, HashAlgorithm::SHA256).unwrap(), + missing_oid + ); + } + + #[test] + fn looks_up_known_oids_correctly() { + let map = test_map(false); + + let funcs: &[&dyn Fn(HashAlgorithm) -> &'static ObjectID] 
= &[ + &|h: HashAlgorithm| h.empty_tree(), + &|h: HashAlgorithm| h.empty_blob(), + &|h: HashAlgorithm| h.null_oid(), + ]; + + for f in funcs { + let s256 = f(HashAlgorithm::SHA256); + let s1 = f(HashAlgorithm::SHA1); + + let res = map.map_object(s256, HashAlgorithm::SHA1).unwrap(); + assert_eq!(res.oid, *s1); + assert_eq!(res.kind, MapType::Reserved); + let res = map.map_oid(s256, HashAlgorithm::SHA1).unwrap(); + assert_eq!(*res, *s1); + + let res = map.map_object(s1, HashAlgorithm::SHA256).unwrap(); + assert_eq!(res.oid, *s256); + assert_eq!(res.kind, MapType::Reserved); + let res = map.map_oid(s1, HashAlgorithm::SHA256).unwrap(); + assert_eq!(*res, *s256); + } + } + + #[test] + fn nul_padding() { + assert_eq!(ObjectMemoryMap::required_nul_padding(1, 1), 3); + assert_eq!(ObjectMemoryMap::required_nul_padding(2, 1), 2); + assert_eq!(ObjectMemoryMap::required_nul_padding(3, 1), 1); + assert_eq!(ObjectMemoryMap::required_nul_padding(2, 2), 0); + + assert_eq!(ObjectMemoryMap::required_nul_padding(39, 3), 3); + } +} diff --git a/src/meson.build b/src/meson.build index c77041a3fa..1eea068519 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,6 +1,7 @@ libgit_rs_sources = [ 'hash.rs', 'lib.rs', + 'loose.rs', 'varint.rs', ] From 44ed7c18860bcddbea524744394a52a2f13efc33 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:20 +0000 Subject: [PATCH 14/15] rust: add a small wrapper around the hashfile code Our new binary object map code avoids needing to be intimately involved with file handling by simply writing data to an object implement Write. This makes it very easy to test by writing to a Cursor wrapping a Vec for tests, and thus decouples it from intimate knowledge about how we handle files. However, we will actually want to write our data to an actual file, since that's the most practical way to persist data. Implement a wrapper around the hashfile code that implements the Write trait so that we can write our object map into a file. 
Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- Makefile | 1 + src/csum_file.rs | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/meson.build | 1 + 4 files changed, 84 insertions(+) create mode 100644 src/csum_file.rs diff --git a/Makefile b/Makefile index ecca75f431..b1a5169e01 100644 --- a/Makefile +++ b/Makefile @@ -1528,6 +1528,7 @@ CLAR_TEST_OBJS += $(UNIT_TEST_DIR)/unit-test.o UNIT_TEST_OBJS += $(UNIT_TEST_DIR)/test-lib.o +RUST_SOURCES += src/csum_file.rs RUST_SOURCES += src/hash.rs RUST_SOURCES += src/lib.rs RUST_SOURCES += src/loose.rs diff --git a/src/csum_file.rs b/src/csum_file.rs new file mode 100644 index 0000000000..7f2c6c4fcb --- /dev/null +++ b/src/csum_file.rs @@ -0,0 +1,81 @@ +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation: version 2 of the License, dated June 1991. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, see <https://www.gnu.org/licenses/>. + +use crate::hash::{HashAlgorithm, GIT_MAX_RAWSZ}; +use std::ffi::CStr; +use std::io::{self, Write}; +use std::os::raw::c_void; + +/// A writer that can write files identified by their hash or containing a trailing hash. +pub struct HashFile { + ptr: *mut c_void, + algo: HashAlgorithm, +} + +impl HashFile { + /// Create a new HashFile. + /// + /// The hash used will be `algo`, the name of the file should be in `name`, and an open file + /// descriptor pointing to that file should be in `fd`.
+ pub fn new(algo: HashAlgorithm, fd: i32, name: &CStr) -> HashFile { + HashFile { + ptr: unsafe { c::hashfd(algo.hash_algo_ptr(), fd, name.as_ptr()) }, + algo, + } + } + + /// Finalize this HashFile instance. + /// + /// Returns the hash computed over the data. + pub fn finalize(self, component: u32, flags: u32) -> Vec { + let mut result = vec![0u8; GIT_MAX_RAWSZ]; + unsafe { c::finalize_hashfile(self.ptr, result.as_mut_ptr(), component, flags) }; + result.truncate(self.algo.raw_len()); + result + } +} + +impl Write for HashFile { + fn write(&mut self, data: &[u8]) -> io::Result { + for chunk in data.chunks(u32::MAX as usize) { + unsafe { + c::hashwrite( + self.ptr, + chunk.as_ptr() as *const c_void, + chunk.len() as u32, + ) + }; + } + Ok(data.len()) + } + + fn flush(&mut self) -> io::Result<()> { + unsafe { c::hashflush(self.ptr) }; + Ok(()) + } +} + +pub mod c { + use std::os::raw::{c_char, c_int, c_void}; + + extern "C" { + pub fn hashfd(algop: *const c_void, fd: i32, name: *const c_char) -> *mut c_void; + pub fn hashwrite(f: *mut c_void, data: *const c_void, len: u32); + pub fn hashflush(f: *mut c_void); + pub fn finalize_hashfile( + f: *mut c_void, + data: *mut u8, + component: u32, + flags: u32, + ) -> c_int; + } +} diff --git a/src/lib.rs b/src/lib.rs index 442f9433dc..0c598298b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +pub mod csum_file; pub mod hash; pub mod loose; pub mod varint; diff --git a/src/meson.build b/src/meson.build index 1eea068519..45739957b4 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,4 +1,5 @@ libgit_rs_sources = [ + 'csum_file.rs', 'hash.rs', 'lib.rs', 'loose.rs', From 68aace560b3ed2dba8ca36c34773e26c11b3adca Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Mon, 17 Nov 2025 22:16:21 +0000 Subject: [PATCH 15/15] object-file-convert: always make sure object ID algo is valid In some cases, we zero-initialize our object IDs, which sets the algo member to zero as well, which is not a valid algorithm number. 
This is a bad practice, but we typically paper over it in many cases by simply substituting the repository's hash algorithm. However, our new Rust loose object map code doesn't handle this gracefully and can't find object IDs when the algorithm is zero because they don't compare equal to those with the correct algo field. In addition, the comparison code doesn't have any knowledge of what the main algorithm is because that's global state, so we can't adjust the comparison. To make our code function properly and to avoid propagating these bad entries, if we get a source object ID with a zero algo, just make a copy of it with the fixed algorithm. This has the benefit of also fixing the object IDs if we're in a single algorithm mode as well. Signed-off-by: brian m. carlson Signed-off-by: Junio C Hamano --- object-file-convert.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/object-file-convert.c b/object-file-convert.c index e44c821084..f8dce94811 100644 --- a/object-file-convert.c +++ b/object-file-convert.c @@ -13,7 +13,7 @@ #include "gpg-interface.h" #include "object-file-convert.h" -int repo_oid_to_algop(struct repository *repo, const struct object_id *src, +int repo_oid_to_algop(struct repository *repo, const struct object_id *srcoid, const struct git_hash_algo *to, struct object_id *dest) { /* @@ -21,7 +21,15 @@ int repo_oid_to_algop(struct repository *repo, const struct object_id *src, * default hash algorithm for that object. */ const struct git_hash_algo *from = - src->algo ? &hash_algos[src->algo] : repo->hash_algo; + srcoid->algo ? &hash_algos[srcoid->algo] : repo->hash_algo; + struct object_id temp; + const struct object_id *src = srcoid; + + if (!srcoid->algo) { + oidcpy(&temp, srcoid); + temp.algo = hash_algo_by_ptr(repo->hash_algo); + src = &temp; + } if (from == to || !to) { if (src != dest)