From 413e96f41e61d90d266dbfb963f107168995a94b Mon Sep 17 00:00:00 2001 From: Mikhail Klyushin Date: Thu, 14 Jan 2021 10:39:10 +0000 Subject: [PATCH 01/63] git-gui: fix typo in russian locale MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed typo in russian locale: издекса -> индекса Signed-off-by: Mikhail Klyushin Signed-off-by: Pratyush Yadav --- po/ru.po | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/po/ru.po b/po/ru.po index 161ee1ac8c..7aebaf809d 100644 --- a/po/ru.po +++ b/po/ru.po @@ -331,7 +331,7 @@ msgstr "Добавить изменённые файлы в индекс" #: git-gui.sh:2936 msgid "Unstage From Commit" -msgstr "Убрать из издекса" +msgstr "Убрать из индекса" #: git-gui.sh:2942 lib/index.tcl:521 msgid "Revert Changes" From eb9071912f5ca370d7e30e88434ffa10c182ed81 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 5 Feb 2021 14:30:36 +0000 Subject: [PATCH 02/63] commit-graph: anonymize data in chunk_write_fn In preparation for creating an API around file formats using chunks and tables of contents, prepare the commit-graph write code to use prototypes that will match this new API. Specifically, convert chunk_write_fn to take a "void *data" parameter instead of the commit-graph-specific "struct write_commit_graph_context" pointer. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index f3bde2ad95..fae7d1b639 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1040,8 +1040,9 @@ struct write_commit_graph_context { }; static int write_graph_chunk_fanout(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; int i, count = 0; struct commit **list = ctx->commits.list; @@ -1066,8 +1067,9 @@ static int write_graph_chunk_fanout(struct hashfile *f, } static int write_graph_chunk_oids(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; struct commit **list = ctx->commits.list; int count; for (count = 0; count < ctx->commits.nr; count++, list++) { @@ -1085,8 +1087,9 @@ static const unsigned char *commit_to_sha1(size_t index, void *table) } static int write_graph_chunk_data(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; uint32_t num_extra_edges = 0; @@ -1187,8 +1190,9 @@ static int write_graph_chunk_data(struct hashfile *f, } static int write_graph_chunk_generation_data(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; int i, num_generation_data_overflows = 0; for (i = 0; i < ctx->commits.nr; i++) { @@ -1208,8 +1212,9 @@ static int write_graph_chunk_generation_data(struct hashfile *f, } static int write_graph_chunk_generation_data_overflow(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; int i; for (i = 0; i < ctx->commits.nr; i++) { struct commit *c = ctx->commits.list[i]; @@ -1226,8 +1231,9 @@ static int write_graph_chunk_generation_data_overflow(struct hashfile *f, } static int write_graph_chunk_extra_edges(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct 
write_commit_graph_context *ctx = data; struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; struct commit_list *parent; @@ -1280,8 +1286,9 @@ static int write_graph_chunk_extra_edges(struct hashfile *f, } static int write_graph_chunk_bloom_indexes(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; uint32_t cur_pos = 0; @@ -1315,8 +1322,9 @@ static void trace2_bloom_filter_settings(struct write_commit_graph_context *ctx) } static int write_graph_chunk_bloom_data(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; struct commit **list = ctx->commits.list; struct commit **last = ctx->commits.list + ctx->commits.nr; @@ -1737,8 +1745,9 @@ static int write_graph_chunk_base_1(struct hashfile *f, } static int write_graph_chunk_base(struct hashfile *f, - struct write_commit_graph_context *ctx) + void *data) { + struct write_commit_graph_context *ctx = data; int num = write_graph_chunk_base_1(f, ctx->new_base_graph); if (num != ctx->num_commit_graphs_after - 1) { @@ -1750,7 +1759,7 @@ static int write_graph_chunk_base(struct hashfile *f, } typedef int (*chunk_write_fn)(struct hashfile *f, - struct write_commit_graph_context *ctx); + void *data); struct chunk_info { uint32_t id; From 5712d62ccf236eab22a0f0bf196be98ca4a0bf45 Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:16:55 +0530 Subject: [PATCH 03/63] t7001: modernize test formatting Some tests in this script are formatted using a very old style: test_expect_success \ 'title' \ 'body line 1 && body line 2' Update the formatting to the modern style: test_expect_success 'title' ' body line 1 && body line 2 ' Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 188 +++++++++++++++++++++++++------------------------- 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index 63d5f41a12..4bbb51e578 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -3,74 +3,74 @@ test_description='git mv in subdirs' . ./test-lib.sh -test_expect_success \ - 'prepare reference tree' \ - 'mkdir path0 path1 && +test_expect_success 'prepare reference tree' ' + mkdir path0 path1 && cp "$TEST_DIRECTORY"/../COPYING path0/COPYING && git add path0/COPYING && - git commit -m add -a' + git commit -m add -a +' -test_expect_success \ - 'moving the file out of subdirectory' \ - 'cd path0 && git mv COPYING ../path1/COPYING' +test_expect_success 'moving the file out of subdirectory' ' + cd path0 && git mv COPYING ../path1/COPYING +' # in path0 currently -test_expect_success \ - 'commiting the change' \ - 'cd .. && git commit -m move-out -a' +test_expect_success 'commiting the change' ' + cd .. 
&& git commit -m move-out -a +' -test_expect_success \ - 'checking the commit' \ - 'git diff-tree -r -M --name-status HEAD^ HEAD >actual && - grep "^R100..*path0/COPYING..*path1/COPYING" actual' +test_expect_success 'checking the commit' ' + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + grep "^R100..*path0/COPYING..*path1/COPYING" actual +' -test_expect_success \ - 'moving the file back into subdirectory' \ - 'cd path0 && git mv ../path1/COPYING COPYING' +test_expect_success 'moving the file back into subdirectory' ' + cd path0 && git mv ../path1/COPYING COPYING +' # in path0 currently -test_expect_success \ - 'commiting the change' \ - 'cd .. && git commit -m move-in -a' +test_expect_success 'commiting the change' ' + cd .. && git commit -m move-in -a +' -test_expect_success \ - 'checking the commit' \ - 'git diff-tree -r -M --name-status HEAD^ HEAD >actual && - grep "^R100..*path1/COPYING..*path0/COPYING" actual' +test_expect_success 'checking the commit' ' + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + grep "^R100..*path1/COPYING..*path0/COPYING" actual +' -test_expect_success \ - 'mv --dry-run does not move file' \ - 'git mv -n path0/COPYING MOVED && +test_expect_success 'mv --dry-run does not move file' ' + git mv -n path0/COPYING MOVED && test -f path0/COPYING && - test ! -f MOVED' + test ! -f MOVED +' -test_expect_success \ - 'checking -k on non-existing file' \ - 'git mv -k idontexist path0' +test_expect_success 'checking -k on non-existing file' ' + git mv -k idontexist path0 +' -test_expect_success \ - 'checking -k on untracked file' \ - 'touch untracked1 && +test_expect_success 'checking -k on untracked file' ' + touch untracked1 && git mv -k untracked1 path0 && test -f untracked1 && - test ! -f path0/untracked1' + test ! -f path0/untracked1 +' -test_expect_success \ - 'checking -k on multiple untracked files' \ - 'touch untracked2 && +test_expect_success 'checking -k on multiple untracked files' ' + touch untracked2 && git mv -k untracked1 untracked2 path0 && test -f untracked1 && test -f untracked2 && test ! -f path0/untracked1 && - test ! -f path0/untracked2' + test ! -f path0/untracked2 +' -test_expect_success \ - 'checking -f on untracked file with existing target' \ - 'touch path0/untracked1 && +test_expect_success 'checking -f on untracked file with existing target' ' + touch path0/untracked1 && test_must_fail git mv -f untracked1 path0 && test ! 
-f .git/index.lock && test -f untracked1 && - test -f path0/untracked1' + test -f path0/untracked1 +' # clean up the mess in case bad things happen rm -f idontexist untracked1 untracked2 \ @@ -78,79 +78,79 @@ rm -f idontexist untracked1 untracked2 \ .git/index.lock rmdir path1 -test_expect_success \ - 'moving to absent target with trailing slash' \ - 'test_must_fail git mv path0/COPYING no-such-dir/ && +test_expect_success 'moving to absent target with trailing slash' ' + test_must_fail git mv path0/COPYING no-such-dir/ && test_must_fail git mv path0/COPYING no-such-dir// && git mv path0/ no-such-dir/ && - test_path_is_dir no-such-dir' + test_path_is_dir no-such-dir +' -test_expect_success \ - 'clean up' \ - 'git reset --hard' +test_expect_success 'clean up' ' + git reset --hard +' -test_expect_success \ - 'moving to existing untracked target with trailing slash' \ - 'mkdir path1 && +test_expect_success 'moving to existing untracked target with trailing slash' ' + mkdir path1 && git mv path0/ path1/ && - test_path_is_dir path1/path0/' + test_path_is_dir path1/path0/ +' -test_expect_success \ - 'moving to existing tracked target with trailing slash' \ - 'mkdir path2 && +test_expect_success 'moving to existing tracked target with trailing slash' ' + mkdir path2 && >path2/file && git add path2/file && git mv path1/path0/ path2/ && - test_path_is_dir path2/path0/' + test_path_is_dir path2/path0/ +' -test_expect_success \ - 'clean up' \ - 'git reset --hard' +test_expect_success 'clean up' ' + git reset --hard +' -test_expect_success \ - 'adding another file' \ - 'cp "$TEST_DIRECTORY"/../README.md path0/README && +test_expect_success 'adding another file' ' + cp "$TEST_DIRECTORY"/../README.md path0/README && git add path0/README && - git commit -m add2 -a' + git commit -m add2 -a +' -test_expect_success \ - 'moving whole subdirectory' \ - 'git mv path0 path2' +test_expect_success 'moving whole subdirectory' ' + git mv path0 path2 +' -test_expect_success \ - 'commiting the change' \ - 'git commit -m dir-move -a' +test_expect_success 'commiting the change' ' + git commit -m dir-move -a +' -test_expect_success \ - 'checking the commit' \ - 'git diff-tree -r -M --name-status HEAD^ HEAD >actual && +test_expect_success 'checking the commit' ' + git diff-tree -r -M --name-status HEAD^ HEAD >actual && grep "^R100..*path0/COPYING..*path2/COPYING" actual && - grep "^R100..*path0/README..*path2/README" actual' + grep "^R100..*path0/README..*path2/README" actual +' -test_expect_success \ - 'succeed when source is a prefix of destination' \ - 'git mv path2/COPYING path2/COPYING-renamed' +test_expect_success 'succeed when source is a prefix of destination' ' + git mv path2/COPYING path2/COPYING-renamed +' -test_expect_success \ - 'moving whole subdirectory into subdirectory' \ - 'git mv path2 path1' +test_expect_success 'moving whole subdirectory into subdirectory' ' + git mv path2 path1 +' -test_expect_success \ - 'commiting the change' \ - 'git commit -m dir-move -a' +test_expect_success 'commiting the change' ' + git commit -m dir-move -a +' -test_expect_success \ - 'checking the commit' \ - 'git diff-tree -r -M --name-status HEAD^ HEAD >actual && +test_expect_success 'checking the commit' ' + git diff-tree -r -M --name-status HEAD^ HEAD >actual && grep "^R100..*path2/COPYING..*path1/path2/COPYING" actual && - grep "^R100..*path2/README..*path1/path2/README" actual' + grep "^R100..*path2/README..*path1/path2/README" actual +' -test_expect_success \ - 'do not move directory over existing directory' \ - 
'mkdir path0 && mkdir path0/path2 && test_must_fail git mv path2 path0' +test_expect_success 'do not move directory over existing directory' ' + mkdir path0 && mkdir path0/path2 && test_must_fail git mv path2 path0 +' -test_expect_success \ - 'move into "."' \ - 'git mv path1/path2/ .' +test_expect_success 'move into "."' ' + git mv path1/path2/ . +' test_expect_success "Michael Cassar's test case" ' rm -fr .git papers partA && From a76d90670af643d5ff3112d4fd88b27bd28b7d49 Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:16:56 +0530 Subject: [PATCH 04/63] t7001: indent with TABs instead of spaces Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 120 +++++++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index 4bbb51e578..7503233814 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -4,72 +4,72 @@ test_description='git mv in subdirs' . ./test-lib.sh test_expect_success 'prepare reference tree' ' - mkdir path0 path1 && - cp "$TEST_DIRECTORY"/../COPYING path0/COPYING && - git add path0/COPYING && - git commit -m add -a + mkdir path0 path1 && + cp "$TEST_DIRECTORY"/../COPYING path0/COPYING && + git add path0/COPYING && + git commit -m add -a ' test_expect_success 'moving the file out of subdirectory' ' - cd path0 && git mv COPYING ../path1/COPYING + cd path0 && git mv COPYING ../path1/COPYING ' # in path0 currently test_expect_success 'commiting the change' ' - cd .. && git commit -m move-out -a + cd .. && git commit -m move-out -a ' test_expect_success 'checking the commit' ' - git diff-tree -r -M --name-status HEAD^ HEAD >actual && - grep "^R100..*path0/COPYING..*path1/COPYING" actual + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + grep "^R100..*path0/COPYING..*path1/COPYING" actual ' test_expect_success 'moving the file back into subdirectory' ' - cd path0 && git mv ../path1/COPYING COPYING + cd path0 && git mv ../path1/COPYING COPYING ' # in path0 currently test_expect_success 'commiting the change' ' - cd .. && git commit -m move-in -a + cd .. && git commit -m move-in -a ' test_expect_success 'checking the commit' ' - git diff-tree -r -M --name-status HEAD^ HEAD >actual && - grep "^R100..*path1/COPYING..*path0/COPYING" actual + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + grep "^R100..*path1/COPYING..*path0/COPYING" actual ' test_expect_success 'mv --dry-run does not move file' ' - git mv -n path0/COPYING MOVED && - test -f path0/COPYING && - test ! -f MOVED + git mv -n path0/COPYING MOVED && + test -f path0/COPYING && + test ! -f MOVED ' test_expect_success 'checking -k on non-existing file' ' - git mv -k idontexist path0 + git mv -k idontexist path0 ' test_expect_success 'checking -k on untracked file' ' - touch untracked1 && - git mv -k untracked1 path0 && - test -f untracked1 && - test ! -f path0/untracked1 + touch untracked1 && + git mv -k untracked1 path0 && + test -f untracked1 && + test ! -f path0/untracked1 ' test_expect_success 'checking -k on multiple untracked files' ' - touch untracked2 && - git mv -k untracked1 untracked2 path0 && - test -f untracked1 && - test -f untracked2 && - test ! -f path0/untracked1 && - test ! -f path0/untracked2 + touch untracked2 && + git mv -k untracked1 untracked2 path0 && + test -f untracked1 && + test -f untracked2 && + test ! -f path0/untracked1 && + test ! 
-f path0/untracked2 ' test_expect_success 'checking -f on untracked file with existing target' ' - touch path0/untracked1 && - test_must_fail git mv -f untracked1 path0 && - test ! -f .git/index.lock && - test -f untracked1 && - test -f path0/untracked1 + touch path0/untracked1 && + test_must_fail git mv -f untracked1 path0 && + test ! -f .git/index.lock && + test -f untracked1 && + test -f path0/untracked1 ' # clean up the mess in case bad things happen @@ -79,77 +79,77 @@ rm -f idontexist untracked1 untracked2 \ rmdir path1 test_expect_success 'moving to absent target with trailing slash' ' - test_must_fail git mv path0/COPYING no-such-dir/ && - test_must_fail git mv path0/COPYING no-such-dir// && - git mv path0/ no-such-dir/ && - test_path_is_dir no-such-dir + test_must_fail git mv path0/COPYING no-such-dir/ && + test_must_fail git mv path0/COPYING no-such-dir// && + git mv path0/ no-such-dir/ && + test_path_is_dir no-such-dir ' test_expect_success 'clean up' ' - git reset --hard + git reset --hard ' test_expect_success 'moving to existing untracked target with trailing slash' ' - mkdir path1 && - git mv path0/ path1/ && - test_path_is_dir path1/path0/ + mkdir path1 && + git mv path0/ path1/ && + test_path_is_dir path1/path0/ ' test_expect_success 'moving to existing tracked target with trailing slash' ' - mkdir path2 && - >path2/file && git add path2/file && - git mv path1/path0/ path2/ && - test_path_is_dir path2/path0/ + mkdir path2 && + >path2/file && git add path2/file && + git mv path1/path0/ path2/ && + test_path_is_dir path2/path0/ ' test_expect_success 'clean up' ' - git reset --hard + git reset --hard ' test_expect_success 'adding another file' ' - cp "$TEST_DIRECTORY"/../README.md path0/README && - git add path0/README && - git commit -m add2 -a + cp "$TEST_DIRECTORY"/../README.md path0/README && + git add path0/README && + git commit -m add2 -a ' test_expect_success 'moving whole subdirectory' ' - git mv path0 path2 + git mv path0 path2 ' test_expect_success 'commiting the change' ' - git commit -m dir-move -a + git commit -m dir-move -a ' test_expect_success 'checking the commit' ' - git diff-tree -r -M --name-status HEAD^ HEAD >actual && - grep "^R100..*path0/COPYING..*path2/COPYING" actual && - grep "^R100..*path0/README..*path2/README" actual + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + grep "^R100..*path0/COPYING..*path2/COPYING" actual && + grep "^R100..*path0/README..*path2/README" actual ' test_expect_success 'succeed when source is a prefix of destination' ' - git mv path2/COPYING path2/COPYING-renamed + git mv path2/COPYING path2/COPYING-renamed ' test_expect_success 'moving whole subdirectory into subdirectory' ' - git mv path2 path1 + git mv path2 path1 ' test_expect_success 'commiting the change' ' - git commit -m dir-move -a + git commit -m dir-move -a ' test_expect_success 'checking the commit' ' - git diff-tree -r -M --name-status HEAD^ HEAD >actual && - grep "^R100..*path2/COPYING..*path1/path2/COPYING" actual && - grep "^R100..*path2/README..*path1/path2/README" actual + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + grep "^R100..*path2/COPYING..*path1/path2/COPYING" actual && + grep "^R100..*path2/README..*path1/path2/README" actual ' test_expect_success 'do not move directory over existing directory' ' - mkdir path0 && mkdir path0/path2 && test_must_fail git mv path2 path0 + mkdir path0 && mkdir path0/path2 && test_must_fail git mv path2 path0 ' test_expect_success 'move into "."' ' - git mv path1/path2/ . + git mv path1/path2/ . 
' test_expect_success "Michael Cassar's test case" ' From 9b46e9c9cc6787cbf88d18c143006316928e3aa3 Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:16:57 +0530 Subject: [PATCH 05/63] t7001: remove unnecessary blank lines Some tests use a deprecated style in which there are unnecessary blank lines after the opening quote of the test body and before the closing quote. So we should remove these unnecessary blank lines. Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index 7503233814..f63802442b 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -182,7 +182,6 @@ test_expect_success "Sergey Vlasov's test case" ' ' test_expect_success 'absolute pathname' '( - rm -fr mine && mkdir mine && cd mine && @@ -196,12 +195,9 @@ test_expect_success 'absolute pathname' '( ! test -d sub && test -d in && git ls-files --error-unmatch in/file - - )' test_expect_success 'absolute pathname outside should fail' '( - rm -fr mine && mkdir mine && cd mine && @@ -216,7 +212,6 @@ test_expect_success 'absolute pathname outside should fail' '( test -d sub && ! test -d ../in && git ls-files --error-unmatch sub/file - )' test_expect_success 'git mv to move multiple sources into a directory' ' @@ -232,7 +227,6 @@ test_expect_success 'git mv to move multiple sources into a directory' ' ' test_expect_success 'git mv should not change sha1 of moved cache entry' ' - rm -fr .git && git init && echo 1 >dirty && @@ -243,7 +237,6 @@ test_expect_success 'git mv should not change sha1 of moved cache entry' ' echo 2 >dirty2 && git mv dirty2 dirty && [ "$entry" = "$(git ls-files --stage dirty | cut -f 1)" ] - ' rm -f dirty dirty2 @@ -266,7 +259,6 @@ test_expect_success 'git mv error on conflicted file' ' ' test_expect_success 'git mv should overwrite symlink to a file' ' - rm -fr .git && git init && echo 1 >moved && @@ -279,13 +271,11 @@ test_expect_success 'git mv should overwrite symlink to a file' ' test "$(cat symlink)" = 1 && git update-index --refresh && git diff-files --quiet - ' rm -f moved symlink test_expect_success 'git mv should overwrite file with a symlink' ' - rm -fr .git && git init && echo 1 >moved && @@ -296,11 +286,9 @@ test_expect_success 'git mv should overwrite file with a symlink' ' ! test -e symlink && git update-index --refresh && git diff-files --quiet - ' test_expect_success SYMLINKS 'check moved symlink' ' - test -h moved ' From 9bcaeb71a63e3f13b859202813d8c95047902b27 Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:16:58 +0530 Subject: [PATCH 06/63] t7001: modernize subshell formatting Some test use an old style for formatting subshells: (command && ... Update them to the modern style: ( command && ... 
Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 68 ++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index f63802442b..a4a14a3b2e 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -181,38 +181,42 @@ test_expect_success "Sergey Vlasov's test case" ' git mv ab a ' -test_expect_success 'absolute pathname' '( - rm -fr mine && - mkdir mine && - cd mine && - test_create_repo one && - cd one && - mkdir sub && - >sub/file && - git add sub/file && +test_expect_success 'absolute pathname' ' + ( + rm -fr mine && + mkdir mine && + cd mine && + test_create_repo one && + cd one && + mkdir sub && + >sub/file && + git add sub/file && - git mv sub "$(pwd)/in" && - ! test -d sub && - test -d in && - git ls-files --error-unmatch in/file -)' + git mv sub "$(pwd)/in" && + ! test -d sub && + test -d in && + git ls-files --error-unmatch in/file + ) +' -test_expect_success 'absolute pathname outside should fail' '( - rm -fr mine && - mkdir mine && - cd mine && - out=$(pwd) && - test_create_repo one && - cd one && - mkdir sub && - >sub/file && - git add sub/file && +test_expect_success 'absolute pathname outside should fail' ' + ( + rm -fr mine && + mkdir mine && + cd mine && + out=$(pwd) && + test_create_repo one && + cd one && + mkdir sub && + >sub/file && + git add sub/file && - test_must_fail git mv sub "$out/out" && - test -d sub && - ! test -d ../in && - git ls-files --error-unmatch sub/file -)' + test_must_fail git mv sub "$out/out" && + test -d sub && + ! test -d ../in && + git ls-files --error-unmatch sub/file + ) +' test_expect_success 'git mv to move multiple sources into a directory' ' rm -fr .git && git init && @@ -503,14 +507,16 @@ test_expect_success 'moving a submodule in nested directories' ' test_expect_success 'moving nested submodules' ' git commit -am "cleanup commit" && mkdir sub_nested_nested && - (cd sub_nested_nested && + ( + cd sub_nested_nested && touch nested_level2 && git init && git add . && git commit -m "nested level 2" ) && mkdir sub_nested && - (cd sub_nested && + ( + cd sub_nested && touch nested_level1 && git init && git add . && From dd72154149a48120f135de28b0202ac5de34d17b Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:16:59 +0530 Subject: [PATCH 07/63] t7001: remove whitespace after redirect operators According to Documentation/CodingGuidelines, there should be no whitespace after redirect operators. So, we should remove these whitespaces after redirect operators. 
Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index a4a14a3b2e..02fbc90dea 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -156,9 +156,9 @@ test_expect_success "Michael Cassar's test case" ' rm -fr .git papers partA && git init && mkdir -p papers/unsorted papers/all-papers partA && - echo a > papers/unsorted/Thesis.pdf && - echo b > partA/outline.txt && - echo c > papers/unsorted/_another && + echo a >papers/unsorted/Thesis.pdf && + echo b >partA/outline.txt && + echo c >papers/unsorted/_another && git add papers partA && T1=$(git write-tree) && From 368d2782499af9fbe9db73aed2ffbba53772374a Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:17:00 +0530 Subject: [PATCH 08/63] t7001: avoid using `cd` outside of subshells Avoid using `cd` outside of subshells since, if the test fails, there is no guarantee that the current working directory is the expected one, which may cause subsequent tests to run in the wrong directory. While at it, make some other tests more concise by replacing simple subshells with `git -C`. Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 43 +++++++++++-------------------------------- 1 file changed, 11 insertions(+), 32 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index 02fbc90dea..efbcba12d3 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -11,12 +11,12 @@ test_expect_success 'prepare reference tree' ' ' test_expect_success 'moving the file out of subdirectory' ' - cd path0 && git mv COPYING ../path1/COPYING + git -C path0 mv COPYING ../path1/COPYING ' # in path0 currently test_expect_success 'commiting the change' ' - cd .. && git commit -m move-out -a + git commit -m move-out -a ' test_expect_success 'checking the commit' ' @@ -25,12 +25,12 @@ test_expect_success 'checking the commit' ' ' test_expect_success 'moving the file back into subdirectory' ' - cd path0 && git mv ../path1/COPYING COPYING + git -C path0 mv ../path1/COPYING COPYING ' # in path0 currently test_expect_success 'commiting the change' ' - cd .. && git commit -m move-in -a + git commit -m move-in -a ' test_expect_success 'checking the commit' ' @@ -328,10 +328,7 @@ test_expect_success 'git mv moves a submodule with a .git directory and no .gitm git mv sub mod/sub && ! test -e sub && [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && - ( - cd mod/sub && - git status - ) && + git -C mod/sub status && git update-index --refresh && git diff-files --quiet ' @@ -351,10 +348,7 @@ test_expect_success 'git mv moves a submodule with a .git directory and .gitmodu git mv sub mod/sub && ! test -e sub && [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && - ( - cd mod/sub && - git status - ) && + git -C mod/sub status && echo mod/sub >expected && git config -f .gitmodules submodule.sub.path >actual && test_cmp expected actual && @@ -368,16 +362,10 @@ test_expect_success 'git mv moves a submodule with gitfile' ' git submodule update && entry="$(git ls-files --stage sub | cut -f 1)" && mkdir mod && - ( - cd mod && - git mv ../sub/ . - ) && + git -C mod mv ../sub/ . && ! 
test -e sub && [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && - ( - cd mod/sub && - git status - ) && + git -C mod/sub status && echo mod/sub >expected && git config -f .gitmodules submodule.sub.path >actual && test_cmp expected actual && @@ -396,10 +384,7 @@ test_expect_success 'mv does not complain when no .gitmodules file is found' ' test_must_be_empty actual.err && ! test -e sub && [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && - ( - cd mod/sub && - git status - ) && + git -C mod/sub status && git update-index --refresh && git diff-files --quiet ' @@ -420,10 +405,7 @@ test_expect_success 'mv will error out on a modified .gitmodules file unless sta test_must_be_empty actual.err && ! test -e sub && [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && - ( - cd mod/sub && - git status - ) && + git -C mod/sub status && git update-index --refresh && git diff-files --quiet ' @@ -441,10 +423,7 @@ test_expect_success 'mv issues a warning when section is not found in .gitmodule test_i18ncmp expect.err actual.err && ! test -e sub && [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && - ( - cd mod/sub && - git status - ) && + git -C mod/sub status && git update-index --refresh && git diff-files --quiet ' From d2ecddc9816e6e0b9732046be9e926b42d4b3226 Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:17:01 +0530 Subject: [PATCH 09/63] t7001: use '>' rather than 'touch' Use `>` rather than `touch` to create an empty file when the timestamp isn't relevant to the test. Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index efbcba12d3..cd67fe0bd1 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -49,14 +49,14 @@ test_expect_success 'checking -k on non-existing file' ' ' test_expect_success 'checking -k on untracked file' ' - touch untracked1 && + >untracked1 && git mv -k untracked1 path0 && test -f untracked1 && test ! -f path0/untracked1 ' test_expect_success 'checking -k on multiple untracked files' ' - touch untracked2 && + >untracked2 && git mv -k untracked1 untracked2 path0 && test -f untracked1 && test -f untracked2 && @@ -65,7 +65,7 @@ test_expect_success 'checking -k on multiple untracked files' ' ' test_expect_success 'checking -f on untracked file with existing target' ' - touch path0/untracked1 && + >path0/untracked1 && test_must_fail git mv -f untracked1 path0 && test ! -f .git/index.lock && test -f untracked1 && @@ -488,7 +488,7 @@ test_expect_success 'moving nested submodules' ' mkdir sub_nested_nested && ( cd sub_nested_nested && - touch nested_level2 && + >nested_level2 && git init && git add . && git commit -m "nested level 2" @@ -496,7 +496,7 @@ test_expect_success 'moving nested submodules' ' mkdir sub_nested && ( cd sub_nested && - touch nested_level1 && + >nested_level1 && git init && git add . && git commit -m "nested level 1" && From 5d683c3f4bcef667ca128d3d737fd36118d49a3d Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:17:02 +0530 Subject: [PATCH 10/63] t7001: put each command on a separate line Modern practice is to avoid multiple commands per line, and instead place each command on its own line. 
Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index cd67fe0bd1..f55d18ed9c 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -145,7 +145,9 @@ test_expect_success 'checking the commit' ' ' test_expect_success 'do not move directory over existing directory' ' - mkdir path0 && mkdir path0/path2 && test_must_fail git mv path2 path0 + mkdir path0 && + mkdir path0/path2 && + test_must_fail git mv path2 path0 ' test_expect_success 'move into "."' ' From 39252c833e0402c011dfb940a87db9d0a64ddd0c Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:17:03 +0530 Subject: [PATCH 11/63] t7001: use here-docs instead of echo Change from old style to current style by taking advantage of here-docs instead of echo commands. Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index f55d18ed9c..c8c22c09e5 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -228,7 +228,10 @@ test_expect_success 'git mv to move multiple sources into a directory' ' git add dir/?.txt && git mv dir/a.txt dir/b.txt other && git ls-files >actual && - { echo other/a.txt; echo other/b.txt; } >expect && + cat >expect <<-\EOF && + other/a.txt + other/b.txt + EOF test_cmp expect actual ' From 488acf15dfa3edec67fa0397d6d09b247c0c46ae Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 12 Feb 2021 01:17:04 +0530 Subject: [PATCH 12/63] t7001: use `test` rather than `[` According to Documentation/CodingGuidelines, we should use "test" rather than "[ ... ]" in shell scripts, so let's replace the "[ ... ]" with "test" in the t7001 test script. Signed-off-by: Shubham Verma Reviewed-by: Eric Sunshine Signed-off-by: Junio C Hamano --- t/t7001-mv.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/t/t7001-mv.sh b/t/t7001-mv.sh index c8c22c09e5..63d779d886 100755 --- a/t/t7001-mv.sh +++ b/t/t7001-mv.sh @@ -242,10 +242,10 @@ test_expect_success 'git mv should not change sha1 of moved cache entry' ' git add dirty && entry="$(git ls-files --stage dirty | cut -f 1)" && git mv dirty dirty2 && - [ "$entry" = "$(git ls-files --stage dirty2 | cut -f 1)" ] && + test "$entry" = "$(git ls-files --stage dirty2 | cut -f 1)" && echo 2 >dirty2 && git mv dirty2 dirty && - [ "$entry" = "$(git ls-files --stage dirty | cut -f 1)" ] + test "$entry" = "$(git ls-files --stage dirty | cut -f 1)" ' rm -f dirty dirty2 @@ -332,7 +332,7 @@ test_expect_success 'git mv moves a submodule with a .git directory and no .gitm mkdir mod && git mv sub mod/sub && ! test -e sub && - [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && + test "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" && git -C mod/sub status && git update-index --refresh && git diff-files --quiet @@ -352,7 +352,7 @@ test_expect_success 'git mv moves a submodule with a .git directory and .gitmodu mkdir mod && git mv sub mod/sub && ! test -e sub && - [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && + test "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" && git -C mod/sub status && echo mod/sub >expected && git config -f .gitmodules submodule.sub.path >actual && @@ -369,7 +369,7 @@ test_expect_success 'git mv moves a submodule with gitfile' ' mkdir mod && git -C mod mv ../sub/ . && ! 
test -e sub && - [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && + test "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" && git -C mod/sub status && echo mod/sub >expected && git config -f .gitmodules submodule.sub.path >actual && @@ -388,7 +388,7 @@ test_expect_success 'mv does not complain when no .gitmodules file is found' ' git mv sub mod/sub 2>actual.err && test_must_be_empty actual.err && ! test -e sub && - [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && + test "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" && git -C mod/sub status && git update-index --refresh && git diff-files --quiet @@ -409,7 +409,7 @@ test_expect_success 'mv will error out on a modified .gitmodules file unless sta git mv sub mod/sub 2>actual.err && test_must_be_empty actual.err && ! test -e sub && - [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && + test "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" && git -C mod/sub status && git update-index --refresh && git diff-files --quiet @@ -427,7 +427,7 @@ test_expect_success 'mv issues a warning when section is not found in .gitmodule git mv sub mod/sub 2>actual.err && test_i18ncmp expect.err actual.err && ! test -e sub && - [ "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" ] && + test "$entry" = "$(git ls-files --stage mod/sub | cut -f 1)" && git -C mod/sub status && git update-index --refresh && git diff-files --quiet From f15eb7c1cf2674df69c1ff8aebdc5536a580c65e Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 3 Feb 2021 20:03:46 +0000 Subject: [PATCH 13/63] diffcore-rename: no point trying to find a match better than exact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit diffcore_rename() had some code to avoid having destination paths that already had an exact rename detected from being re-checked for other renames. Source paths, however, were re-checked because we wanted to allow the possibility of detecting copies. But if copy detection isn't turned on, then this merely amounts to attempting to find a better-than-exact match, which naturally ends up being an expensive no-op. In particular, copy detection is never turned on by the merge machinery. 
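
As a minimal, self-contained sketch of the effect (toy code, not the patch itself; the struct, the counts and all names here are invented for illustration), the guard below skips sources that an exact rename has already claimed whenever copy detection is off, so they are never re-scored against every destination:

	#include <stdio.h>

	struct src { int rename_used; };	/* stands in for rename_src[i].p->one */

	int main(void)
	{
		struct src sources[] = { {1}, {0}, {1}, {0}, {0} };	/* two exact renames found */
		int num_sources = 5, num_destinations = 4;
		int want_copies = 0;		/* the merge machinery never asks for copies */
		long comparisons = 0;

		for (int dst = 0; dst < num_destinations; dst++) {
			for (int s = 0; s < num_sources; s++) {
				if (sources[s].rename_used && !want_copies)
					continue;	/* nothing better than exact can be found */
				comparisons++;		/* stands in for one similarity estimate */
			}
		}
		printf("similarity estimates performed: %ld\n", comparisons);	/* 12 instead of 20 */
		return 0;
	}

The same check, expressed against the real data structures, is what the hunk below adds to the inner loop of diffcore_rename().
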
For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 14.263 s ± 0.053 s 14.119 s ± 0.101 s mega-renames: 5504.231 s ± 5.150 s 1802.044 s ± 0.828 s just-one-mega: 158.534 s ± 0.498 s 51.391 s ± 0.028 s Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 8fe6c9384b..8b118628b4 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -463,9 +463,11 @@ void diffcore_rename(struct diff_options *options) struct diff_score *mx; int i, j, rename_count, skip_unmodified = 0; int num_destinations, dst_cnt; + int num_sources, want_copies; struct progress *progress = NULL; trace2_region_enter("diff", "setup", options->repo); + want_copies = (detect_rename == DIFF_DETECT_COPY); if (!minimum_score) minimum_score = DEFAULT_RENAME_SCORE; @@ -502,7 +504,7 @@ void diffcore_rename(struct diff_options *options) p->one->rename_used++; register_rename_src(p); } - else if (detect_rename == DIFF_DETECT_COPY) { + else if (want_copies) { /* * Increment the "rename_used" score by * one, to indicate ourselves as a user. @@ -532,12 +534,15 @@ void diffcore_rename(struct diff_options *options) * files still remain as options for rename/copies!) */ num_destinations = (rename_dst_nr - rename_count); + num_sources = rename_src_nr; + if (!want_copies) + num_sources -= rename_count; /* All done? */ - if (!num_destinations) + if (!num_destinations || !num_sources) goto cleanup; - switch (too_many_rename_candidates(num_destinations, rename_src_nr, + switch (too_many_rename_candidates(num_destinations, num_sources, options)) { case 1: goto cleanup; @@ -553,7 +558,7 @@ void diffcore_rename(struct diff_options *options) if (options->show_rename_progress) { progress = start_delayed_progress( _("Performing inexact rename detection"), - (uint64_t)num_destinations * (uint64_t)rename_src_nr); + (uint64_t)num_destinations * (uint64_t)num_sources); } mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations), @@ -573,6 +578,9 @@ void diffcore_rename(struct diff_options *options) struct diff_filespec *one = rename_src[j].p->one; struct diff_score this_src; + if (one->rename_used && !want_copies) + continue; + if (skip_unmodified && diff_unmodified_pair(rename_src[j].p)) continue; @@ -594,7 +602,7 @@ void diffcore_rename(struct diff_options *options) } dst_cnt++; display_progress(progress, - (uint64_t)dst_cnt * (uint64_t)rename_src_nr); + (uint64_t)dst_cnt * (uint64_t)num_sources); } stop_progress(&progress); @@ -602,7 +610,7 @@ void diffcore_rename(struct diff_options *options) STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare); rename_count += find_renames(mx, dst_cnt, minimum_score, 0); - if (detect_rename == DIFF_DETECT_COPY) + if (want_copies) rename_count += find_renames(mx, dst_cnt, minimum_score, 1); free(mx); trace2_region_leave("diff", "inexact renames", options->repo); From 727331dce16d88735a5e3a8050b1520d03c6e77a Mon Sep 17 00:00:00 2001 From: Hariom Verma Date: Sat, 13 Feb 2021 01:52:40 +0000 Subject: [PATCH 14/63] t6300: use function to test trailer options Add a function to test trailer options. This will make tests look cleaner, as well as will make it easier to add new tests for trailers in the future. 
Mentored-by: Christian Couder Mentored-by: Heba Waly Signed-off-by: Hariom Verma Signed-off-by: Junio C Hamano --- t/t6300-for-each-ref.sh | 88 +++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/t/t6300-for-each-ref.sh b/t/t6300-for-each-ref.sh index ca62e764b5..a8faddd18a 100755 --- a/t/t6300-for-each-ref.sh +++ b/t/t6300-for-each-ref.sh @@ -814,53 +814,57 @@ test_expect_success 'set up trailers for next test' ' EOF ' -test_expect_success '%(trailers:unfold) unfolds trailers' ' - { - unfold expect && - git for-each-ref --format="%(trailers:unfold)" refs/heads/main >actual && - test_cmp expect actual && - git for-each-ref --format="%(contents:trailers:unfold)" refs/heads/main >actual && - test_cmp expect actual -' +test_trailer_option () { + title=$1 option=$2 + cat >expect + test_expect_success "$title" ' + git for-each-ref --format="%($option)" refs/heads/main >actual && + test_cmp expect actual && + git for-each-ref --format="%(contents:$option)" refs/heads/main >actual && + test_cmp expect actual + ' +} -test_expect_success '%(trailers:only) shows only "key: value" trailers' ' - { - grep -v patch.description expect && - git for-each-ref --format="%(trailers:only)" refs/heads/main >actual && - test_cmp expect actual && - git for-each-ref --format="%(contents:trailers:only)" refs/heads/main >actual && - test_cmp expect actual -' +test_trailer_option '%(trailers:unfold) unfolds trailers' \ + 'trailers:unfold' <<-EOF + $(unfold expect && - git for-each-ref --format="%(trailers:only,unfold)" refs/heads/main >actual && - test_cmp expect actual && - git for-each-ref --format="%(trailers:unfold,only)" refs/heads/main >actual && - test_cmp actual actual && - git for-each-ref --format="%(contents:trailers:only,unfold)" refs/heads/main >actual && - test_cmp expect actual && - git for-each-ref --format="%(contents:trailers:unfold,only)" refs/heads/main >actual && - test_cmp actual actual -' + EOF -test_expect_success '%(trailers) rejects unknown trailers arguments' ' - # error message cannot be checked under i18n - cat >expect <<-EOF && +test_trailer_option '%(trailers:only) shows only "key: value" trailers' \ + 'trailers:only' <<-EOF + $(grep -v patch.description expect + test_expect_success "$title" ' + # error message cannot be checked under i18n + test_must_fail git for-each-ref --format="%($option)" refs/heads/main 2>actual && + test_i18ncmp expect actual && + test_must_fail git for-each-ref --format="%(contents:$option)" refs/heads/main 2>actual && + test_i18ncmp expect actual + ' +} + +test_failing_trailer_option '%(trailers) rejects unknown trailers arguments' \ + 'trailers:unsupported' <<-\EOF fatal: unknown %(trailers) argument: unsupported EOF - test_must_fail git for-each-ref --format="%(trailers:unsupported)" 2>actual && - test_i18ncmp expect actual && - test_must_fail git for-each-ref --format="%(contents:trailers:unsupported)" 2>actual && - test_i18ncmp expect actual -' test_expect_success 'if arguments, %(contents:trailers) shows error if colon is missing' ' cat >expect <<-EOF && From 90563aedcab92b75d4b5f6d7aa43d6a98aaccde6 Mon Sep 17 00:00:00 2001 From: Hariom Verma Date: Sat, 13 Feb 2021 01:52:41 +0000 Subject: [PATCH 15/63] pretty.c: refactor trailer logic to `format_set_trailers_options()` Refactored trailers formatting logic inside pretty.c to a new function `format_set_trailers_options()`. This new function returns the non-zero in case of unusual. The caller handles the non-zero by "goto trailers_out". 
This change will allow us to reuse the same logic in other places. Mentored-by: Christian Couder Mentored-by: Heba Waly Signed-off-by: Hariom Verma Signed-off-by: Junio C Hamano --- pretty.c | 89 +++++++++++++++++++++++++++++++------------------------- pretty.h | 11 +++++++ 2 files changed, 61 insertions(+), 39 deletions(-) diff --git a/pretty.c b/pretty.c index 3922f6f9f2..59cefdddf6 100644 --- a/pretty.c +++ b/pretty.c @@ -1148,6 +1148,54 @@ static int format_trailer_match_cb(const struct strbuf *key, void *ud) return 0; } +int format_set_trailers_options(struct process_trailer_options *opts, + struct string_list *filter_list, + struct strbuf *sepbuf, + struct strbuf *kvsepbuf, + const char **arg) +{ + for (;;) { + const char *argval; + size_t arglen; + + if (match_placeholder_arg_value(*arg, "key", arg, &argval, &arglen)) { + uintptr_t len = arglen; + + if (!argval) + return -1; + + if (len && argval[len - 1] == ':') + len--; + string_list_append(filter_list, argval)->util = (char *)len; + + opts->filter = format_trailer_match_cb; + opts->filter_data = filter_list; + opts->only_trailers = 1; + } else if (match_placeholder_arg_value(*arg, "separator", arg, &argval, &arglen)) { + char *fmt; + + strbuf_reset(sepbuf); + fmt = xstrndup(argval, arglen); + strbuf_expand(sepbuf, fmt, strbuf_expand_literal_cb, NULL); + free(fmt); + opts->separator = sepbuf; + } else if (match_placeholder_arg_value(*arg, "key_value_separator", arg, &argval, &arglen)) { + char *fmt; + + strbuf_reset(kvsepbuf); + fmt = xstrndup(argval, arglen); + strbuf_expand(kvsepbuf, fmt, strbuf_expand_literal_cb, NULL); + free(fmt); + opts->key_value_separator = kvsepbuf; + } else if (!match_placeholder_bool_arg(*arg, "only", arg, &opts->only_trailers) && + !match_placeholder_bool_arg(*arg, "unfold", arg, &opts->unfold) && + !match_placeholder_bool_arg(*arg, "keyonly", arg, &opts->key_only) && + !match_placeholder_bool_arg(*arg, "valueonly", arg, &opts->value_only)) + break; + } + return 0; +} + static size_t format_commit_one(struct strbuf *sb, /* in UTF-8 */ const char *placeholder, void *context) @@ -1425,45 +1473,8 @@ static size_t format_commit_one(struct strbuf *sb, /* in UTF-8 */ if (*arg == ':') { arg++; - for (;;) { - const char *argval; - size_t arglen; - - if (match_placeholder_arg_value(arg, "key", &arg, &argval, &arglen)) { - uintptr_t len = arglen; - - if (!argval) - goto trailer_out; - - if (len && argval[len - 1] == ':') - len--; - string_list_append(&filter_list, argval)->util = (char *)len; - - opts.filter = format_trailer_match_cb; - opts.filter_data = &filter_list; - opts.only_trailers = 1; - } else if (match_placeholder_arg_value(arg, "separator", &arg, &argval, &arglen)) { - char *fmt; - - strbuf_reset(&sepbuf); - fmt = xstrndup(argval, arglen); - strbuf_expand(&sepbuf, fmt, strbuf_expand_literal_cb, NULL); - free(fmt); - opts.separator = &sepbuf; - } else if (match_placeholder_arg_value(arg, "key_value_separator", &arg, &argval, &arglen)) { - char *fmt; - - strbuf_reset(&kvsepbuf); - fmt = xstrndup(argval, arglen); - strbuf_expand(&kvsepbuf, fmt, strbuf_expand_literal_cb, NULL); - free(fmt); - opts.key_value_separator = &kvsepbuf; - } else if (!match_placeholder_bool_arg(arg, "only", &arg, &opts.only_trailers) && - !match_placeholder_bool_arg(arg, "unfold", &arg, &opts.unfold) && - !match_placeholder_bool_arg(arg, "keyonly", &arg, &opts.key_only) && - !match_placeholder_bool_arg(arg, "valueonly", &arg, &opts.value_only)) - break; - } + if (format_set_trailers_options(&opts, &filter_list, &sepbuf, 
&kvsepbuf, &arg)) + goto trailer_out; } if (*arg == ')') { format_trailers_from_commit(sb, msg + c->subject_off, &opts); diff --git a/pretty.h b/pretty.h index 7ce6c0b437..7369cf7e14 100644 --- a/pretty.h +++ b/pretty.h @@ -6,6 +6,7 @@ struct commit; struct strbuf; +struct process_trailer_options; /* Commit formats */ enum cmit_fmt { @@ -142,4 +143,14 @@ int commit_format_is_empty(enum cmit_fmt); /* Make subject of commit message suitable for filename */ void format_sanitized_subject(struct strbuf *sb, const char *msg, size_t len); +/* + * Set values of fields in "struct process_trailer_options" + * according to trailers arguments. + */ +int format_set_trailers_options(struct process_trailer_options *opts, + struct string_list *filter_list, + struct strbuf *sepbuf, + struct strbuf *kvsepbuf, + const char **arg); + #endif /* PRETTY_H */ From 636a0aeedfee5f3cce2ebf1fcc174a49ab9bb0c3 Mon Sep 17 00:00:00 2001 From: Hariom Verma Date: Sat, 13 Feb 2021 01:52:42 +0000 Subject: [PATCH 16/63] pretty.c: capture invalid trailer argument As we would like to use this trailers logic in the ref-filter, it's nice to get an invalid trailer argument. This will allow us to print precise error message while using `format_set_trailers_options()` in ref-filter. For capturing the invalid argument, we changed the working of `format_set_trailers_options()` a little bit. Original logic does "break" and fell through in mainly 2 cases - 1. unknown/invalid argument 2. end of the arg string But now instead of "break", we capture invalid argument and return non-zero. And non-zero is handled by the caller. (We prepared the caller to handle non-zero in the previous commit). Capturing invalid arguments this way will also affects the working of current logic. As at the end of the arg string it will return non-zero. So in order to make things correct, introduced an additional conditional statement i.e if encounter ")", do 'break'. 
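
To make that parsing contract concrete, here is a small standalone sketch (toy code, not pretty.c; parse_options, copy_token and the sample strings are invented, and value-taking forms such as only=yes are ignored for brevity) of the behaviour described above: scan comma-separated options, stop cleanly at ')', and hand the first unrecognized token back to the caller instead of silently falling through:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static char *copy_token(const char *s, size_t len)
	{
		char *out = malloc(len + 1);
		memcpy(out, s, len);
		out[len] = '\0';
		return out;
	}

	/* returns 0 on success, -1 (and *invalid_arg) on an unknown option */
	static int parse_options(const char *arg, char **invalid_arg)
	{
		static const char *known[] = { "only", "unfold", "keyonly", "valueonly" };
		while (*arg != ')') {
			size_t len = strcspn(arg, ",)");
			size_t i;
			for (i = 0; i < sizeof(known) / sizeof(known[0]); i++)
				if (len == strlen(known[i]) && !strncmp(arg, known[i], len))
					break;
			if (i == sizeof(known) / sizeof(known[0])) {
				*invalid_arg = copy_token(arg, len);	/* caller frees */
				return -1;
			}
			arg += len;
			if (*arg == ',')
				arg++;
		}
		return 0;
	}

	int main(void)
	{
		char *bad = NULL;
		int ret;

		ret = parse_options("only,unfold)", &bad);
		printf("ok: %d\n", ret);			/* ok: 0 */

		ret = parse_options("only,bogus)", &bad);
		printf("bad: %d %s\n", ret, bad ? bad : "");	/* bad: -1 bogus */

		free(bad);
		return 0;
	}
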
Mentored-by: Christian Couder Mentored-by: Heba Waly Signed-off-by: Hariom Verma Signed-off-by: Junio C Hamano --- pretty.c | 17 +++++++++++++---- pretty.h | 3 ++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pretty.c b/pretty.c index 59cefdddf6..ed16b32df9 100644 --- a/pretty.c +++ b/pretty.c @@ -1152,12 +1152,16 @@ int format_set_trailers_options(struct process_trailer_options *opts, struct string_list *filter_list, struct strbuf *sepbuf, struct strbuf *kvsepbuf, - const char **arg) + const char **arg, + char **invalid_arg) { for (;;) { const char *argval; size_t arglen; + if (**arg == ')') + break; + if (match_placeholder_arg_value(*arg, "key", arg, &argval, &arglen)) { uintptr_t len = arglen; @@ -1190,8 +1194,13 @@ int format_set_trailers_options(struct process_trailer_options *opts, } else if (!match_placeholder_bool_arg(*arg, "only", arg, &opts->only_trailers) && !match_placeholder_bool_arg(*arg, "unfold", arg, &opts->unfold) && !match_placeholder_bool_arg(*arg, "keyonly", arg, &opts->key_only) && - !match_placeholder_bool_arg(*arg, "valueonly", arg, &opts->value_only)) - break; + !match_placeholder_bool_arg(*arg, "valueonly", arg, &opts->value_only)) { + if (invalid_arg) { + size_t len = strcspn(*arg, ",)"); + *invalid_arg = xstrndup(*arg, len); + } + return -1; + } } return 0; } @@ -1473,7 +1482,7 @@ static size_t format_commit_one(struct strbuf *sb, /* in UTF-8 */ if (*arg == ':') { arg++; - if (format_set_trailers_options(&opts, &filter_list, &sepbuf, &kvsepbuf, &arg)) + if (format_set_trailers_options(&opts, &filter_list, &sepbuf, &kvsepbuf, &arg, NULL)) goto trailer_out; } if (*arg == ')') { diff --git a/pretty.h b/pretty.h index 7369cf7e14..d902cdd70a 100644 --- a/pretty.h +++ b/pretty.h @@ -151,6 +151,7 @@ int format_set_trailers_options(struct process_trailer_options *opts, struct string_list *filter_list, struct strbuf *sepbuf, struct strbuf *kvsepbuf, - const char **arg); + const char **arg, + char **invalid_arg); #endif /* PRETTY_H */ From ee82a487f68a7c86551fb59f6176812d63be61b6 Mon Sep 17 00:00:00 2001 From: Hariom Verma Date: Sat, 13 Feb 2021 01:52:43 +0000 Subject: [PATCH 17/63] ref-filter: use pretty.c logic for trailers Now, ref-filter is using pretty.c logic for setting trailer options. New to ref-filter: :key= - only show trailers with specified key. :valueonly[=val] - only show the value part. :separator= - inserted between trailer lines. :key_value_separator= - inserted between key and value in trailer lines Enhancement to existing options(now can take value and its optional): :only[=val] :unfold[=val] 'val' can be: true, on, yes or false, off, no. Mentored-by: Christian Couder Mentored-by: Heba Waly Signed-off-by: Hariom Verma Signed-off-by: Junio C Hamano --- Documentation/git-for-each-ref.txt | 8 +-- ref-filter.c | 36 ++++++----- t/t6300-for-each-ref.sh | 95 ++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 21 deletions(-) diff --git a/Documentation/git-for-each-ref.txt b/Documentation/git-for-each-ref.txt index 2962f85a50..2ae2478de7 100644 --- a/Documentation/git-for-each-ref.txt +++ b/Documentation/git-for-each-ref.txt @@ -260,11 +260,9 @@ contents:lines=N:: The first `N` lines of the message. Additionally, the trailers as interpreted by linkgit:git-interpret-trailers[1] -are obtained as `trailers` (or by using the historical alias -`contents:trailers`). Non-trailer lines from the trailer block can be omitted -with `trailers:only`. 
Whitespace-continuations can be removed from trailers so -that each trailer appears on a line by itself with its full content with -`trailers:unfold`. Both can be used together as `trailers:unfold,only`. +are obtained as `trailers[:options]` (or by using the historical alias +`contents:trailers[:options]`). For valid [:option] values see `trailers` +section of linkgit:git-log[1]. For sorting purposes, fields with numeric values sort in numeric order (`objectsize`, `authordate`, `committerdate`, `creatordate`, `taggerdate`). diff --git a/ref-filter.c b/ref-filter.c index ee337df232..4dc4882cc7 100644 --- a/ref-filter.c +++ b/ref-filter.c @@ -67,6 +67,12 @@ struct refname_atom { int lstrip, rstrip; }; +static struct ref_trailer_buf { + struct string_list filter_list; + struct strbuf sepbuf; + struct strbuf kvsepbuf; +} ref_trailer_buf = {STRING_LIST_INIT_NODUP, STRBUF_INIT, STRBUF_INIT}; + static struct expand_data { struct object_id oid; enum object_type type; @@ -313,28 +319,26 @@ static int subject_atom_parser(const struct ref_format *format, struct used_atom static int trailers_atom_parser(const struct ref_format *format, struct used_atom *atom, const char *arg, struct strbuf *err) { - struct string_list params = STRING_LIST_INIT_DUP; - int i; - atom->u.contents.trailer_opts.no_divider = 1; if (arg) { - string_list_split(¶ms, arg, ',', -1); - for (i = 0; i < params.nr; i++) { - const char *s = params.items[i].string; - if (!strcmp(s, "unfold")) - atom->u.contents.trailer_opts.unfold = 1; - else if (!strcmp(s, "only")) - atom->u.contents.trailer_opts.only_trailers = 1; - else { - strbuf_addf(err, _("unknown %%(trailers) argument: %s"), s); - string_list_clear(¶ms, 0); - return -1; - } + const char *argbuf = xstrfmt("%s)", arg); + char *invalid_arg = NULL; + + if (format_set_trailers_options(&atom->u.contents.trailer_opts, + &ref_trailer_buf.filter_list, + &ref_trailer_buf.sepbuf, + &ref_trailer_buf.kvsepbuf, + &argbuf, &invalid_arg)) { + if (!invalid_arg) + strbuf_addf(err, _("expected %%(trailers:key=)")); + else + strbuf_addf(err, _("unknown %%(trailers) argument: %s"), invalid_arg); + free((char *)invalid_arg); + return -1; } } atom->u.contents.option = C_TRAILERS; - string_list_clear(¶ms, 0); return 0; } diff --git a/t/t6300-for-each-ref.sh b/t/t6300-for-each-ref.sh index a8faddd18a..cac7f443d0 100755 --- a/t/t6300-for-each-ref.sh +++ b/t/t6300-for-each-ref.sh @@ -837,6 +837,24 @@ test_trailer_option '%(trailers:only) shows only "key: value" trailers' \ EOF +test_trailer_option '%(trailers:only=no,only=true) shows only "key: value" trailers' \ + 'trailers:only=no,only=true' <<-EOF + $(grep -v patch.description + + EOF + +test_trailer_option '%(trailers:key=foo) is case insensitive' \ + 'trailers:key=SiGned-oFf-bY' <<-EOF + Signed-off-by: A U Thor + + EOF + +test_trailer_option '%(trailers:key=foo:) trailing colon also works' \ + 'trailers:key=Signed-off-by:' <<-EOF + Signed-off-by: A U Thor + + EOF + +test_trailer_option '%(trailers:key=foo) multiple keys' \ + 'trailers:key=Reviewed-by:,key=Signed-off-by' <<-EOF + Reviewed-by: A U Thor + Signed-off-by: A U Thor + + EOF + +test_trailer_option '%(trailers:key=nonexistent) becomes empty' \ + 'trailers:key=Shined-off-by:' <<-EOF + + EOF + +test_trailer_option '%(trailers:key=foo) handles multiple lines even if folded' \ + 'trailers:key=Acked-by' <<-EOF + $(grep -v patch.description + $(grep patch.description + + EOF + +test_trailer_option '%(trailers:separator) changes separator' \ + 
'trailers:separator=%x2C,key=Reviewed-by,key=Signed-off-by:' <<-EOF + Reviewed-by: A U Thor ,Signed-off-by: A U Thor + EOF + +test_trailer_option '%(trailers:key_value_separator) changes key-value separator' \ + 'trailers:key_value_separator=%x2C,key=Reviewed-by,key=Signed-off-by:' <<-EOF + Reviewed-by,A U Thor + Signed-off-by,A U Thor + + EOF + +test_trailer_option '%(trailers:separator,key_value_separator) changes both separators' \ + 'trailers:separator=%x2C,key_value_separator=%x2C,key=Reviewed-by,key=Signed-off-by:' <<-EOF + Reviewed-by,A U Thor ,Signed-off-by,A U Thor + EOF + test_failing_trailer_option () { title=$1 option=$2 cat >expect @@ -866,6 +956,11 @@ test_failing_trailer_option '%(trailers) rejects unknown trailers arguments' \ fatal: unknown %(trailers) argument: unsupported EOF +test_failing_trailer_option '%(trailers:key) without value is error' \ + 'trailers:key' <<-\EOF + fatal: expected %(trailers:key=) + EOF + test_expect_success 'if arguments, %(contents:trailers) shows error if colon is missing' ' cat >expect <<-EOF && fatal: unrecognized %(contents) argument: trailersonly From 829514c5151deee1b37cdeaf451bf28219602126 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Feb 2021 07:35:01 +0000 Subject: [PATCH 18/63] diffcore-rename: filter rename_src list when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have to look at each entry in rename_src a total of rename_dst_nr times. When we're not detecting copies, any exact renames or ignorable rename paths will just be skipped over. While checking that these can be skipped over is a relatively cheap check, it's still a waste of time to do that check more than once, let alone rename_dst_nr times. When rename_src_nr is a few thousand times bigger than the number of relevant sources (such as when cherry-picking a commit that only touched a handful of files, but from a side of history that has different names for some high level directories), this time can add up. First make an initial pass over the rename_src array and move all the relevant entries to the front, so that we can iterate over just those relevant entries. For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 14.119 s ± 0.101 s 13.815 s ± 0.062 s mega-renames: 1802.044 s ± 0.828 s 1799.937 s ± 0.493 s just-one-mega: 51.391 s ± 0.028 s 51.289 s ± 0.019 s Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 59 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 8b118628b4..6fd0c4a2f4 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -454,6 +454,54 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i return count; } +static void remove_unneeded_paths_from_src(int detecting_copies) +{ + int i, new_num_src; + + if (detecting_copies) + return; /* nothing to remove */ + if (break_idx) + return; /* culling incompatible with break detection */ + + /* + * Note on reasons why we cull unneeded sources but not destinations: + * 1) Pairings are stored in rename_dst (not rename_src), which we + * need to keep around. So, we just can't cull rename_dst even + * if we wanted to. But doing so wouldn't help because... 
+ * + * 2) There is a matrix pairwise comparison that follows the + * "Performing inexact rename detection" progress message. + * Iterating over the destinations is done in the outer loop, + * hence we only iterate over each of those once and we can + * easily skip the outer loop early if the destination isn't + * relevant. That's only one check per destination path to + * skip. + * + * By contrast, the sources are iterated in the inner loop; if + * we check whether a source can be skipped, then we'll be + * checking it N separate times, once for each destination. + * We don't want to have to iterate over known-not-needed + * sources N times each, so avoid that by removing the sources + * from rename_src here. + */ + for (i = 0, new_num_src = 0; i < rename_src_nr; i++) { + /* + * renames are stored in rename_dst, so if a rename has + * already been detected using this source, we can just + * remove the source knowing rename_dst has its info. + */ + if (rename_src[i].p->one->rename_used) + continue; + + if (new_num_src < i) + memcpy(&rename_src[new_num_src], &rename_src[i], + sizeof(struct diff_rename_src)); + new_num_src++; + } + + rename_src_nr = new_num_src; +} + void diffcore_rename(struct diff_options *options) { int detect_rename = options->detect_rename; @@ -529,14 +577,10 @@ void diffcore_rename(struct diff_options *options) if (minimum_score == MAX_SCORE) goto cleanup; - /* - * Calculate how many renames are left (but all the source - * files still remain as options for rename/copies!) - */ + /* Calculate how many renames are left */ num_destinations = (rename_dst_nr - rename_count); + remove_unneeded_paths_from_src(want_copies); num_sources = rename_src_nr; - if (!want_copies) - num_sources -= rename_count; /* All done? */ if (!num_destinations || !num_sources) @@ -578,8 +622,7 @@ void diffcore_rename(struct diff_options *options) struct diff_filespec *one = rename_src[j].p->one; struct diff_score this_src; - if (one->rename_used && !want_copies) - continue; + assert(!one->rename_used || want_copies || break_idx); if (skip_unmodified && diff_unmodified_pair(rename_src[j].p)) From f3845257a55aa5c949ce2543d53fe1f28b5072df Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Feb 2021 07:51:46 +0000 Subject: [PATCH 19/63] t4001: add a test comparing basename similarity and content similarity Add a simple test where a removed file is similar to two different added files; one of them has the same basename, and the other has a slightly higher content similarity. In the current test, content similarity is weighted higher than filename similarity. Subsequent commits will add a new rule that weighs a mixture of filename similarity and content similarity in a manner that will change the outcome of this testcase. 
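For illustration, here is a minimal shell sketch of the scenario this test
encodes (assuming a throwaway repository with user.name and user.email set;
the /tmp path and the commit messages are arbitrary):

    git init /tmp/rename-demo && cd /tmp/rename-demo   # hypothetical scratch repo
    mkdir subdir
    printf 'line%s\n' 1 2 3 4 5 6 7 8 9 10 >subdir/file.txt
    git add subdir/file.txt && git commit -q -m 'base txt'

    git rm -q subdir/file.txt
    printf 'line%s\n' 1 2 3 4 5 6 7 8 >file.txt    # same basename, ~78% similar
    printf 'line%s\n' 1 2 3 4 5 6 7 8 9 >file.md   # different basename, ~88% similar
    git add file.txt file.md && git commit -q -m rename

    # As of this patch, content similarity alone decides the pairing, so
    # subdir/file.txt is reported as renamed to file.md (R088 below).
    git diff-tree -r -M --name-status HEAD^ HEAD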
Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- t/t4001-diff-rename.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/t/t4001-diff-rename.sh b/t/t4001-diff-rename.sh index c16486a9d4..0f97858197 100755 --- a/t/t4001-diff-rename.sh +++ b/t/t4001-diff-rename.sh @@ -262,4 +262,27 @@ test_expect_success 'diff-tree -l0 defaults to a big rename limit, not zero' ' grep "myotherfile.*myfile" actual ' +test_expect_success 'basename similarity vs best similarity' ' + mkdir subdir && + test_write_lines line1 line2 line3 line4 line5 \ + line6 line7 line8 line9 line10 >subdir/file.txt && + git add subdir/file.txt && + git commit -m "base txt" && + + git rm subdir/file.txt && + test_write_lines line1 line2 line3 line4 line5 \ + line6 line7 line8 >file.txt && + test_write_lines line1 line2 line3 line4 line5 \ + line6 line7 line8 line9 >file.md && + git add file.txt file.md && + git commit -a -m "rename" && + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + # subdir/file.txt is 88% similar to file.md and 78% similar to file.txt + cat >expected <<-\EOF && + R088 subdir/file.txt file.md + A file.txt + EOF + test_cmp expected actual +' + test_done From a35df3371c2e2e9b407ff8c950169e74f6bf4add Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Feb 2021 07:51:47 +0000 Subject: [PATCH 20/63] diffcore-rename: compute basenames of source and dest candidates We want to make use of unique basenames among remaining source and destination files to help inform rename detection, so that more likely pairings can be checked first. (src/moduleA/foo.txt and source/module/A/foo.txt are likely related if there are no other 'foo.txt' files among the remaining deleted and added files.) Add a new function, not yet used, which creates a map of the unique basenames within rename_src and another within rename_dst, together with the indices within rename_src/rename_dst where those basenames show up. Non-unique basenames still show up in the map, but have an invalid index (-1). This function was inspired by the fact that in real world repositories, files are often moved across directories without changing names. Here are some sample repositories and the percentage of their historical renames (as of early 2020) that preserved basenames: * linux: 76% * gcc: 64% * gecko: 79% * webkit: 89% These statistics alone don't prove that an optimization in this area will help or how much it will help, since there are also unpaired adds and deletes, restrictions on which basenames we consider, etc., but it certainly motivated the idea to try something in this area. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/diffcore-rename.c b/diffcore-rename.c index 6fd0c4a2f4..e51f33a218 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -367,6 +367,69 @@ static int find_exact_renames(struct diff_options *options) return renames; } +static const char *get_basename(const char *filename) +{ + /* + * gitbasename() has to worry about special drives, multiple + * directory separator characters, trailing slashes, NULL or + * empty strings, etc. We only work on filenames as stored in + * git, and thus get to ignore all those complications. + */ + const char *base = strrchr(filename, '/'); + return base ? 
base + 1 : filename; +} + +MAYBE_UNUSED +static int find_basename_matches(struct diff_options *options, + int minimum_score) +{ + int i; + struct strintmap sources; + struct strintmap dests; + + /* + * Create maps of basename -> fullname(s) for remaining sources and + * dests. + */ + strintmap_init_with_options(&sources, -1, NULL, 0); + strintmap_init_with_options(&dests, -1, NULL, 0); + for (i = 0; i < rename_src_nr; ++i) { + char *filename = rename_src[i].p->one->path; + const char *base; + + /* exact renames removed in remove_unneeded_paths_from_src() */ + assert(!rename_src[i].p->one->rename_used); + + /* Record index within rename_src (i) if basename is unique */ + base = get_basename(filename); + if (strintmap_contains(&sources, base)) + strintmap_set(&sources, base, -1); + else + strintmap_set(&sources, base, i); + } + for (i = 0; i < rename_dst_nr; ++i) { + char *filename = rename_dst[i].p->two->path; + const char *base; + + if (rename_dst[i].is_rename) + continue; /* involved in exact match already. */ + + /* Record index within rename_dst (i) if basename is unique */ + base = get_basename(filename); + if (strintmap_contains(&dests, base)) + strintmap_set(&dests, base, -1); + else + strintmap_set(&dests, base, i); + } + + /* TODO: Make use of basenames source and destination basenames */ + + strintmap_clear(&sources); + strintmap_clear(&dests); + + return 0; +} + #define NUM_CANDIDATE_PER_DST 4 static void record_if_better(struct diff_score m[], struct diff_score *o) { From da09f651277a982daa28227a13cd48d15b7245e1 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Feb 2021 07:51:48 +0000 Subject: [PATCH 21/63] diffcore-rename: complete find_basename_matches() It is not uncommon in real world repositories for the majority of file renames to not change the basename of the file; i.e. most "renames" are just a move of files into different directories. We can make use of this to avoid comparing all rename source candidates with all rename destination candidates, by first comparing sources to destinations with the same basenames. If two files with the same basename are sufficiently similar, we record the rename; if not, we include those files in the more exhaustive matrix comparison. This means we are adding a set of preliminary additional comparisons, but for each file we only compare it with at most one other file. For example, if there was a include/media/device.h that was deleted and a src/module/media/device.h that was added, and there are no other device.h files in the remaining sets of added and deleted files after exact rename detection, then these two files would be compared in the preliminary step. This commit does not yet actually employ this new optimization, it merely adds a function which can be used for this purpose. The next commit will do the necessary plumbing to make use of it. Note that this optimization might give us different results than without the optimization, because it's possible that despite files with the same basename being sufficiently similar to be considered a rename, there's an even better match between files without the same basename. I think that is okay for four reasons: (1) it's easy to explain to the users what happened if it does ever occur (or even for them to intuitively figure out), (2) as the next patch will show it provides such a large performance boost that it's worth the tradeoff, and (3) it's somewhat unlikely that despite having unique matching basenames that other files serve as better matches. 
Reason (4) takes a full paragraph to explain... If the previous three reasons aren't enough, consider what rename detection already does. Break detection is not the default, meaning that if files have the same _fullname_, then they are considered related even if they are 0% similar. In fact, in such a case, we don't even bother comparing the files to see if they are similar let alone comparing them to all other files to see what they are most similar to. Basically, we override content similarity based on sufficient filename similarity. Without the filename similarity (currently implemented as an exact match of filename), we swing the pendulum the opposite direction and say that filename similarity is irrelevant and compare a full N x M matrix of sources and destinations to find out which have the most similar contents. This optimization just adds another form of filename similarity comparison, but augments it with a file content similarity check as well. Basically, if two files have the same basename and are sufficiently similar to be considered a rename, mark them as such without comparing the two to all other rename candidates. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 82 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index e51f33a218..266d4fae48 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -383,9 +383,53 @@ MAYBE_UNUSED static int find_basename_matches(struct diff_options *options, int minimum_score) { - int i; + /* + * When I checked in early 2020, over 76% of file renames in linux + * just moved files to a different directory but kept the same + * basename. gcc did that with over 64% of renames, gecko did it + * with over 79%, and WebKit did it with over 89%. + * + * Therefore we can bypass the normal exhaustive NxM matrix + * comparison of similarities between all potential rename sources + * and destinations by instead using file basename as a hint (i.e. + * the portion of the filename after the last '/'), checking for + * similarity between files with the same basename, and if we find + * a pair that are sufficiently similar, record the rename pair and + * exclude those two from the NxM matrix. + * + * This *might* cause us to find a less than optimal pairing (if + * there is another file that we are even more similar to but has a + * different basename). Given the huge performance advantage + * basename matching provides, and given the frequency with which + * people use the same basename in real world projects, that's a + * trade-off we are willing to accept when doing just rename + * detection. + * + * If someone wants copy detection that implies they are willing to + * spend more cycles to find similarities between files, so it may + * be less likely that this heuristic is wanted. If someone is + * doing break detection, that means they do not want filename + * similarity to imply any form of content similiarity, and thus + * this heuristic would definitely be incompatible. + */ + + int i, renames = 0; struct strintmap sources; struct strintmap dests; + struct hashmap_iter iter; + struct strmap_entry *entry; + + /* + * The prefeteching stuff wants to know if it can skip prefetching + * blobs that are unmodified...and will then do a little extra work + * to verify that the oids are indeed different before prefetching. 
+ * Unmodified blobs are only relevant when doing copy detection; + * when limiting to rename detection, diffcore_rename[_extended]() + * will never be called with unmodified source paths fed to us, so + * the extra work necessary to check if rename_src entries are + * unmodified would be a small waste. + */ + int skip_unmodified = 0; /* * Create maps of basename -> fullname(s) for remaining sources and @@ -422,12 +466,44 @@ static int find_basename_matches(struct diff_options *options, strintmap_set(&dests, base, i); } - /* TODO: Make use of basenames source and destination basenames */ + /* Now look for basename matchups and do similarity estimation */ + strintmap_for_each_entry(&sources, &iter, entry) { + const char *base = entry->key; + intptr_t src_index = (intptr_t)entry->value; + intptr_t dst_index; + if (src_index == -1) + continue; + + if (0 <= (dst_index = strintmap_get(&dests, base))) { + struct diff_filespec *one, *two; + int score; + + /* Estimate the similarity */ + one = rename_src[src_index].p->one; + two = rename_dst[dst_index].p->two; + score = estimate_similarity(options->repo, one, two, + minimum_score, skip_unmodified); + + /* If sufficiently similar, record as rename pair */ + if (score < minimum_score) + continue; + record_rename_pair(dst_index, src_index, score); + renames++; + + /* + * Found a rename so don't need text anymore; if we + * didn't find a rename, the filespec_blob would get + * re-used when doing the matrix of comparisons. + */ + diff_free_filespec_blob(one); + diff_free_filespec_blob(two); + } + } strintmap_clear(&sources); strintmap_clear(&dests); - return 0; + return renames; } #define NUM_CANDIDATE_PER_DST 4 From bd24aa2f97a08fdd5a4982bc6268f70c6bb7b747 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Feb 2021 07:51:49 +0000 Subject: [PATCH 22/63] diffcore-rename: guide inexact rename detection based on basenames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of the new find_basename_matches() function added in the last two patches, to find renames more rapidly in cases where we can match up files based on basenames. As a quick reminder (see the last two commit messages for more details), this means for example that docs/extensions.txt and docs/config/extensions.txt are considered likely renames if there are no remaining 'extensions.txt' files elsewhere among the added and deleted files, and if a similarity check confirms they are similar, then they are marked as a rename without looking for a better similarity match among other files. This is a behavioral change, as covered in more detail in the previous commit message. We do not use this heuristic together with either break or copy detection. The point of break detection is to say that filename similarity does not imply file content similarity, and we only want to know about file content similarity. The point of copy detection is to use more resources to check for additional similarities, while this is an optimization that uses far less resources but which might also result in finding slightly fewer similarities. So the idea behind this optimization goes against both of those features, and will be turned off for both. 
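Rerunning the t4001 scenario sketched earlier in this series shows the
behavioral change (assuming the same throwaway repository as before; the
expected output mirrors the updated test below):

    # The unique-basename pass now pairs subdir/file.txt with file.txt first:
    git diff-tree -r -M --name-status HEAD^ HEAD
    #   A      file.md
    #   R078   subdir/file.txt   file.txt

    # The hunk below also reads GIT_BASENAME_FACTOR to scale the extra
    # similarity bar for this preliminary pass (outcome not asserted here):
    GIT_BASENAME_FACTOR=95 git diff-tree -r -M --name-status HEAD^ HEAD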
For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 13.815 s ± 0.062 s 13.294 s ± 0.103 s mega-renames: 1799.937 s ± 0.493 s 187.248 s ± 0.882 s just-one-mega: 51.289 s ± 0.019 s 5.557 s ± 0.017 s Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 53 ++++++++++++++++++++++++++++++++++++++---- t/t4001-diff-rename.sh | 7 +++--- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 266d4fae48..41558185ae 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -379,7 +379,6 @@ static const char *get_basename(const char *filename) return base ? base + 1 : filename; } -MAYBE_UNUSED static int find_basename_matches(struct diff_options *options, int minimum_score) { @@ -716,11 +715,55 @@ void diffcore_rename(struct diff_options *options) if (minimum_score == MAX_SCORE) goto cleanup; - /* Calculate how many renames are left */ - num_destinations = (rename_dst_nr - rename_count); - remove_unneeded_paths_from_src(want_copies); num_sources = rename_src_nr; + if (want_copies || break_idx) { + /* + * Cull sources: + * - remove ones corresponding to exact renames + */ + trace2_region_enter("diff", "cull after exact", options->repo); + remove_unneeded_paths_from_src(want_copies); + trace2_region_leave("diff", "cull after exact", options->repo); + } else { + /* Determine minimum score to match basenames */ + double factor = 0.5; + char *basename_factor = getenv("GIT_BASENAME_FACTOR"); + int min_basename_score; + + if (basename_factor) + factor = strtol(basename_factor, NULL, 10)/100.0; + assert(factor >= 0.0 && factor <= 1.0); + min_basename_score = minimum_score + + (int)(factor * (MAX_SCORE - minimum_score)); + + /* + * Cull sources: + * - remove ones involved in renames (found via exact match) + */ + trace2_region_enter("diff", "cull after exact", options->repo); + remove_unneeded_paths_from_src(want_copies); + trace2_region_leave("diff", "cull after exact", options->repo); + + /* Utilize file basenames to quickly find renames. */ + trace2_region_enter("diff", "basename matches", options->repo); + rename_count += find_basename_matches(options, + min_basename_score); + trace2_region_leave("diff", "basename matches", options->repo); + + /* + * Cull sources, again: + * - remove ones involved in renames (found via basenames) + */ + trace2_region_enter("diff", "cull basename", options->repo); + remove_unneeded_paths_from_src(want_copies); + trace2_region_leave("diff", "cull basename", options->repo); + } + + /* Calculate how many rename destinations are left */ + num_destinations = (rename_dst_nr - rename_count); + num_sources = rename_src_nr; /* rename_src_nr reflects lower number */ + /* All done? */ if (!num_destinations || !num_sources) goto cleanup; @@ -751,7 +794,7 @@ void diffcore_rename(struct diff_options *options) struct diff_score *m; if (rename_dst[i].is_rename) - continue; /* dealt with exact match already. 
*/ + continue; /* exact or basename match already handled */ m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST]; for (j = 0; j < NUM_CANDIDATE_PER_DST; j++) diff --git a/t/t4001-diff-rename.sh b/t/t4001-diff-rename.sh index 0f97858197..99a5d1bd1c 100755 --- a/t/t4001-diff-rename.sh +++ b/t/t4001-diff-rename.sh @@ -277,10 +277,11 @@ test_expect_success 'basename similarity vs best similarity' ' git add file.txt file.md && git commit -a -m "rename" && git diff-tree -r -M --name-status HEAD^ HEAD >actual && - # subdir/file.txt is 88% similar to file.md and 78% similar to file.txt + # subdir/file.txt is 88% similar to file.md, 78% similar to file.txt, + # but since same basenames are checked first... cat >expected <<-\EOF && - R088 subdir/file.txt file.md - A file.txt + A file.md + R078 subdir/file.txt file.txt EOF test_cmp expected actual ' From 07c9a7fcb50e7af9d350edc82d5f906a6eb6bd62 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Feb 2021 07:51:50 +0000 Subject: [PATCH 23/63] gitdiffcore doc: mention new preliminary step for rename detection The last few patches have introduced a new preliminary step when rename detection is on but both break detection and copy detection are off. Document this new step. While we're at it, add a testcase that checks the new behavior as well. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- Documentation/gitdiffcore.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/gitdiffcore.txt b/Documentation/gitdiffcore.txt index c970d9fe43..80fcf95424 100644 --- a/Documentation/gitdiffcore.txt +++ b/Documentation/gitdiffcore.txt @@ -168,6 +168,26 @@ a similarity score different from the default of 50% by giving a number after the "-M" or "-C" option (e.g. "-M8" to tell it to use 8/10 = 80%). +Note that when rename detection is on but both copy and break +detection are off, rename detection adds a preliminary step that first +checks if files are moved across directories while keeping their +filename the same. If there is a file added to a directory whose +contents is sufficiently similar to a file with the same name that got +deleted from a different directory, it will mark them as renames and +exclude them from the later quadratic step (the one that pairwise +compares all unmatched files to find the "best" matches, determined by +the highest content similarity). So, for example, if a deleted +docs/ext.txt and an added docs/config/ext.txt are similar enough, they +will be marked as a rename and prevent an added docs/ext.md that may +be even more similar to the deleted docs/ext.txt from being considered +as the rename destination in the later step. For this reason, the +preliminary "match same filename" step uses a bit higher threshold to +mark a file pair as a rename and stop considering other candidates for +better matches. At most, one comparison is done per file in this +preliminary pass; so if there are several remaining ext.txt files +throughout the directory hierarchy after exact rename detection, this +preliminary step will be skipped for those files. + Note. When the "-C" option is used with `--find-copies-harder` option, 'git diff-{asterisk}' commands feed unmodified filepairs to diffcore mechanism as well as modified ones. 
This lets the copy From f78cf976172fbad79a9ca4fc158f384fb868f177 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sun, 14 Feb 2021 07:51:51 +0000 Subject: [PATCH 24/63] merge-ort: call diffcore_rename() directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to pass additional information to diffcore_rename() (or some variant thereof) without plumbing that extra information through diff_tree_oid() and diffcore_std(). Further, since we will need to gather additional special information related to diffs and are walking the trees anyway in collect_merge_info(), it seems odd to have diff_tree_oid()/diffcore_std() repeat those tree walks. And there may be times where we can avoid traversing into a subtree in collect_merge_info() (based on additional information at our disposal), that the basic diff logic would be unable to take advantage of. For all these reasons, just create the add and delete pairs ourself and then call diffcore_rename() directly. This change is primarily about enabling future optimizations; the advantage of avoiding extra tree traversals is small compared to the cost of rename detection, and the advantage of avoiding the extra tree traversals is somewhat offset by the extra time spent in collect_merge_info() collecting the additional data anyway. However... For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 13.294 s ± 0.103 s 12.775 s ± 0.062 s mega-renames: 187.248 s ± 0.882 s 188.754 s ± 0.284 s just-one-mega: 5.557 s ± 0.017 s 5.599 s ± 0.019 s Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- merge-ort.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/merge-ort.c b/merge-ort.c index 931b91438c..603d30c521 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -535,6 +535,23 @@ static void setup_path_info(struct merge_options *opt, result->util = mi; } +static void add_pair(struct merge_options *opt, + struct name_entry *names, + const char *pathname, + unsigned side, + unsigned is_add /* if false, is_delete */) +{ + struct diff_filespec *one, *two; + struct rename_info *renames = &opt->priv->renames; + int names_idx = is_add ? side : 0; + + one = alloc_filespec(pathname); + two = alloc_filespec(pathname); + fill_filespec(is_add ? 
two : one, + &names[names_idx].oid, 1, names[names_idx].mode); + diff_queue(&renames->pairs[side], one, two); +} + static void collect_rename_info(struct merge_options *opt, struct name_entry *names, const char *dirname, @@ -544,6 +561,7 @@ static void collect_rename_info(struct merge_options *opt, unsigned match_mask) { struct rename_info *renames = &opt->priv->renames; + unsigned side; /* Update dirs_removed, as needed */ if (dirmask == 1 || dirmask == 3 || dirmask == 5) { @@ -554,6 +572,21 @@ static void collect_rename_info(struct merge_options *opt, if (sides & 2) strset_add(&renames->dirs_removed[2], fullname); } + + if (filemask == 0 || filemask == 7) + return; + + for (side = MERGE_SIDE1; side <= MERGE_SIDE2; ++side) { + unsigned side_mask = (1 << side); + + /* Check for deletion on side */ + if ((filemask & 1) && !(filemask & side_mask)) + add_pair(opt, names, fullname, side, 0 /* delete */); + + /* Check for addition on side */ + if (!(filemask & 1) && (filemask & side_mask)) + add_pair(opt, names, fullname, side, 1 /* add */); + } } static int collect_merge_info_callback(int n, @@ -2079,6 +2112,27 @@ static int process_renames(struct merge_options *opt, return clean_merge; } +static void resolve_diffpair_statuses(struct diff_queue_struct *q) +{ + /* + * A simplified version of diff_resolve_rename_copy(); would probably + * just use that function but it's static... + */ + int i; + struct diff_filepair *p; + + for (i = 0; i < q->nr; ++i) { + p = q->queue[i]; + p->status = 0; /* undecided */ + if (!DIFF_FILE_VALID(p->one)) + p->status = DIFF_STATUS_ADDED; + else if (!DIFF_FILE_VALID(p->two)) + p->status = DIFF_STATUS_DELETED; + else if (DIFF_PAIR_RENAME(p)) + p->status = DIFF_STATUS_RENAMED; + } +} + static int compare_pairs(const void *a_, const void *b_) { const struct diff_filepair *a = *((const struct diff_filepair **)a_); @@ -2089,8 +2143,6 @@ static int compare_pairs(const void *a_, const void *b_) /* Call diffcore_rename() to compute which files have changed on given side */ static void detect_regular_renames(struct merge_options *opt, - struct tree *merge_base, - struct tree *side, unsigned side_index) { struct diff_options diff_opts; @@ -2108,11 +2160,11 @@ static void detect_regular_renames(struct merge_options *opt, diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; diff_setup_done(&diff_opts); + diff_queued_diff = renames->pairs[side_index]; trace2_region_enter("diff", "diffcore_rename", opt->repo); - diff_tree_oid(&merge_base->object.oid, &side->object.oid, "", - &diff_opts); - diffcore_std(&diff_opts); + diffcore_rename(&diff_opts); trace2_region_leave("diff", "diffcore_rename", opt->repo); + resolve_diffpair_statuses(&diff_queued_diff); if (diff_opts.needed_rename_limit > renames->needed_limit) renames->needed_limit = diff_opts.needed_rename_limit; @@ -2212,8 +2264,8 @@ static int detect_and_process_renames(struct merge_options *opt, memset(&combined, 0, sizeof(combined)); trace2_region_enter("merge", "regular renames", opt->repo); - detect_regular_renames(opt, merge_base, side1, MERGE_SIDE1); - detect_regular_renames(opt, merge_base, side2, MERGE_SIDE2); + detect_regular_renames(opt, MERGE_SIDE1); + detect_regular_renames(opt, MERGE_SIDE2); trace2_region_leave("merge", "regular renames", opt->repo); trace2_region_enter("merge", "directory renames", opt->repo); From eb10e637cf04a19b9a3d5d7c904d71bc9e4ea408 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:40 +0000 Subject: [PATCH 25/63] p7519: do not rely on "xargs -d" in test Convert the test to 
use a more portable method to update the mtime on a large number of
files under version control.

The Mac version of xargs does not support the "-d" option.  Likewise,
the "-0" and "--null" options are not portable.

Furthermore, use `test-tool chmtime` rather than `touch` to update the
mtime to ensure that it is actually updated (especially on file systems
with only whole second resolution).

Signed-off-by: Jeff Hostetler
Reviewed-by: Taylor Blau
Signed-off-by: Junio C Hamano
---
 t/perf/p7519-fsmonitor.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/t/perf/p7519-fsmonitor.sh b/t/perf/p7519-fsmonitor.sh
index 9b43342806..6677e0ef7a 100755
--- a/t/perf/p7519-fsmonitor.sh
+++ b/t/perf/p7519-fsmonitor.sh
@@ -164,8 +164,18 @@ test_fsmonitor_suite() {
 		git status -uall
 	'
 
+	# Update the mtimes on up to 100k files to make status think
+	# that they are dirty.  For simplicity, omit any files with
+	# LFs (i.e. anything that ls-files thinks it needs to dquote).
+	# Then fully backslash-quote the paths to capture any
+	# whitespace so that they pass thru xargs properly.
+	#
 	test_perf_w_drop_caches "status (dirty) ($DESC)" '
-		git ls-files | head -100000 | xargs -d "\n" touch -h &&
+		git ls-files | \
+			head -100000 | \
+			grep -v \" | \
+			sed '\''s/\(.\)/\\\1/g'\'' | \
+			xargs test-tool chmtime -300 &&
 		git status
 	'
 
From 0917763d67b99c22d980a10e94102047cc4e9a73 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Wed, 3 Feb 2021 15:34:41 +0000
Subject: [PATCH 26/63] p7519: fix watchman watch-list test on Windows

Only use the final portion of the test trash directory file name when
verifying that Watchman was started.

On Windows and under the SDK, $GIT_WORK_TREE is a cygwin-style path
with forward slashes and a "/c/" drive name.  However `watchman
watch-list` reports a proper Windows-style pathname with drive letters
and backslashes.  This causes the grep to fail.

Since we don't really care about the full pathname (and we really don't
want to bother with normalizing them), just see if the test-name
portion of the path is found.

Signed-off-by: Jeff Hostetler
Reviewed-by: Taylor Blau
Signed-off-by: Junio C Hamano
---
 t/perf/p7519-fsmonitor.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/perf/p7519-fsmonitor.sh b/t/perf/p7519-fsmonitor.sh
index 6677e0ef7a..21d525541d 100755
--- a/t/perf/p7519-fsmonitor.sh
+++ b/t/perf/p7519-fsmonitor.sh
@@ -101,7 +101,7 @@ test_expect_success "one time repo setup" '
 	# If Watchman exists, watch the work tree and attempt a query.
 	if test_have_prereq WATCHMAN; then
 		watchman watch "$GIT_WORK_TREE" &&
-		watchman watch-list | grep -q -F "$GIT_WORK_TREE"
+		watchman watch-list | grep -q -F "p7519-fsmonitor"
 	fi
 '
 
From a7556c3bde48b9266f11794a20d12719269becd3 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Wed, 3 Feb 2021 15:34:42 +0000
Subject: [PATCH 27/63] p7519: move watchman cleanup earlier in the test

Shut down Watchman after the Watchman-based tests and before the block
of "no fsmonitor" tests.  This helps ensure that Watchman cannot affect
the test results of the non-fsmonitor block.
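As a usage note, the suite can be driven directly with the knobs documented
at the top of the script (a hypothetical invocation: the hook script path is
an assumption, it requires Watchman plus a built git source tree, and
dropping OS caches may need elevated privileges):

    cd t/perf
    # GIT_PERF_7519_FSMONITOR lists one or more fsmonitor hook scripts to
    # exercise; GIT_PERF_7519_DROP_CACHE drops OS caches between tests.
    GIT_PERF_7519_FSMONITOR="$HOME/fsmonitor-watchman.sh" \
    GIT_PERF_7519_DROP_CACHE=1 \
    ./p7519-fsmonitor.sh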
Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/perf/p7519-fsmonitor.sh | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/t/perf/p7519-fsmonitor.sh b/t/perf/p7519-fsmonitor.sh index 21d525541d..78e7d2c03f 100755 --- a/t/perf/p7519-fsmonitor.sh +++ b/t/perf/p7519-fsmonitor.sh @@ -208,6 +208,11 @@ test_fsmonitor_suite() { ' } +# +# Run a full set of perf tests using each Hook-based fsmonitor provider, +# such as Watchman. +# + if test -n "$GIT_PERF_7519_FSMONITOR"; then for INTEGRATION_PATH in $GIT_PERF_7519_FSMONITOR; do test_expect_success "setup for fsmonitor $INTEGRATION_PATH" 'setup_for_fsmonitor' @@ -218,14 +223,6 @@ else test_fsmonitor_suite fi -test_expect_success "setup without fsmonitor" ' - unset INTEGRATION_SCRIPT && - git config --unset core.fsmonitor && - git update-index --no-fsmonitor -' - -test_fsmonitor_suite - if test_have_prereq WATCHMAN then watchman watch-del "$GIT_WORK_TREE" >/dev/null 2>&1 && @@ -235,4 +232,16 @@ then watchman shutdown-server >/dev/null 2>&1 fi +# +# Run a full set of perf tests with the fsmonitor feature disabled. +# + +test_expect_success "setup without fsmonitor" ' + unset INTEGRATION_SCRIPT && + git config --unset core.fsmonitor && + git update-index --no-fsmonitor +' + +test_fsmonitor_suite + test_done From 4f2009dce2d270ded8bb94417becf6cc2143c70c Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:43 +0000 Subject: [PATCH 28/63] p7519: add trace logging during perf test Add optional trace logging to allow us to better compare performance of various fsmonitor providers and compare results with non-fsmonitor runs. Currently, this includes Trace2 logging, but may be extended to include other trace targets, such as GIT_TRACE_FSMONITOR if desired. Using this logging helped me explain an odd behavior on MacOS where the kernel was dropping events and causing the hook to Watchman to timeout. Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/perf/.gitignore | 1 + t/perf/Makefile | 4 ++-- t/perf/p7519-fsmonitor.sh | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/t/perf/.gitignore b/t/perf/.gitignore index 982eb8e3a9..72f5d0d314 100644 --- a/t/perf/.gitignore +++ b/t/perf/.gitignore @@ -1,3 +1,4 @@ /build/ /test-results/ +/test-trace/ /trash directory*/ diff --git a/t/perf/Makefile b/t/perf/Makefile index fcb0e8865e..2465770a78 100644 --- a/t/perf/Makefile +++ b/t/perf/Makefile @@ -7,10 +7,10 @@ perf: pre-clean ./run pre-clean: - rm -rf test-results + rm -rf test-results test-trace clean: - rm -rf build "trash directory".* test-results + rm -rf build "trash directory".* test-results test-trace test-lint: $(MAKE) -C .. test-lint diff --git a/t/perf/p7519-fsmonitor.sh b/t/perf/p7519-fsmonitor.sh index 78e7d2c03f..aa0ea0e2ec 100755 --- a/t/perf/p7519-fsmonitor.sh +++ b/t/perf/p7519-fsmonitor.sh @@ -32,6 +32,8 @@ test_description="Test core.fsmonitor" # # GIT_PERF_7519_DROP_CACHE: if set, the OS caches are dropped between tests # +# GIT_PERF_7519_TRACE: if set, enable trace logging during the test. +# Trace logs will be grouped by fsmonitor provider. 
test_perf_large_repo test_checkout_worktree @@ -70,6 +72,32 @@ then fi fi +trace_start() { + if test -n "$GIT_PERF_7519_TRACE" + then + name="$1" + TEST_TRACE_DIR="$TEST_OUTPUT_DIRECTORY/test-trace/p7519/" + echo "Writing trace logging to $TEST_TRACE_DIR" + + mkdir -p "$TEST_TRACE_DIR" + + # Start Trace2 logging and any other GIT_TRACE_* logs that you + # want for this named test case. + + GIT_TRACE2_PERF="$TEST_TRACE_DIR/$name.trace2perf" + export GIT_TRACE2_PERF + + >"$GIT_TRACE2_PERF" + fi +} + +trace_stop() { + if test -n "$GIT_PERF_7519_TRACE" + then + unset GIT_TRACE2_PERF + fi +} + test_expect_success "one time repo setup" ' # set untrackedCache depending on the environment if test -n "$GIT_PERF_7519_UNTRACKED_CACHE" @@ -213,6 +241,7 @@ test_fsmonitor_suite() { # such as Watchman. # +trace_start fsmonitor-watchman if test -n "$GIT_PERF_7519_FSMONITOR"; then for INTEGRATION_PATH in $GIT_PERF_7519_FSMONITOR; do test_expect_success "setup for fsmonitor $INTEGRATION_PATH" 'setup_for_fsmonitor' @@ -231,11 +260,13 @@ then # preventing the removal of the trash directory watchman shutdown-server >/dev/null 2>&1 fi +trace_stop # # Run a full set of perf tests with the fsmonitor feature disabled. # +trace_start fsmonitor-disabled test_expect_success "setup without fsmonitor" ' unset INTEGRATION_SCRIPT && git config --unset core.fsmonitor && @@ -243,5 +274,6 @@ test_expect_success "setup without fsmonitor" ' ' test_fsmonitor_suite +trace_stop test_done From 8c4b7503d033d8ba20771a1af8e077a79a18ce49 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:44 +0000 Subject: [PATCH 29/63] preload-index: log the number of lstat calls to trace2 Report the total number of calls made to lstat() inside preload_index(). FSMonitor improves the performance of commands like `git status` by avoiding scanning the disk for changed files. This can be seen in `preload_index()`. Let's measure this. 
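A quick way to see the new counter (a minimal sketch; the trace path is
arbitrary and only needs to be writable):

    GIT_TRACE2_PERF=/tmp/status.perf git status
    grep 'preload/sum_lstat' /tmp/status.perf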
Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- preload-index.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/preload-index.c b/preload-index.c index ed6eaa4738..e5529a5863 100644 --- a/preload-index.c +++ b/preload-index.c @@ -31,6 +31,7 @@ struct thread_data { struct pathspec pathspec; struct progress_data *progress; int offset, nr; + int t2_nr_lstat; }; static void *preload_thread(void *_data) @@ -73,6 +74,7 @@ static void *preload_thread(void *_data) continue; if (threaded_has_symlink_leading_path(&cache, ce->name, ce_namelen(ce))) continue; + p->t2_nr_lstat++; if (lstat(ce->name, &st)) continue; if (ie_match_stat(index, ce, &st, CE_MATCH_RACY_IS_DIRTY|CE_MATCH_IGNORE_FSMONITOR)) @@ -98,6 +100,7 @@ void preload_index(struct index_state *index, int threads, i, work, offset; struct thread_data data[MAX_PARALLEL]; struct progress_data pd; + int t2_sum_lstat = 0; if (!HAVE_THREADS || !core_preload_index) return; @@ -107,6 +110,9 @@ void preload_index(struct index_state *index, threads = 2; if (threads < 2) return; + + trace2_region_enter("index", "preload", NULL); + trace_performance_enter(); if (threads > MAX_PARALLEL) threads = MAX_PARALLEL; @@ -141,10 +147,14 @@ void preload_index(struct index_state *index, struct thread_data *p = data+i; if (pthread_join(p->pthread, NULL)) die("unable to join threaded lstat"); + t2_sum_lstat += p->t2_nr_lstat; } stop_progress(&pd.progress); trace_performance_leave("preload index"); + + trace2_data_intmax("index", NULL, "preload/sum_lstat", t2_sum_lstat); + trace2_region_leave("index", "preload", NULL); } int repo_read_index_preload(struct repository *repo, From a98e0f2d317818bfcb7f3c9cf53dfad2334e8ca7 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:45 +0000 Subject: [PATCH 30/63] read-cache: log the number of lstat calls to trace2 Report the total number of calls made to lstat() inside of refresh_index(). FSMonitor improves the performance of commands like `git status` by avoiding scanning the disk for changed files. This can be seen in `refresh_index()`. Let's measure this. Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- read-cache.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/read-cache.c b/read-cache.c index ecf6f68994..893cc41e1d 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1364,7 +1364,8 @@ int add_index_entry(struct index_state *istate, struct cache_entry *ce, int opti static struct cache_entry *refresh_cache_ent(struct index_state *istate, struct cache_entry *ce, unsigned int options, int *err, - int *changed_ret) + int *changed_ret, + int *t2_did_lstat) { struct stat st; struct cache_entry *updated; @@ -1406,6 +1407,8 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, return NULL; } + if (t2_did_lstat) + *t2_did_lstat = 1; if (lstat(ce->name, &st) < 0) { if (ignore_missing && errno == ENOENT) return ce; @@ -1519,6 +1522,7 @@ int refresh_index(struct index_state *istate, unsigned int flags, const char *added_fmt; const char *unmerged_fmt; struct progress *progress = NULL; + int t2_sum_lstat = 0; if (flags & REFRESH_PROGRESS && isatty(2)) progress = start_delayed_progress(_("Refresh index"), @@ -1536,11 +1540,13 @@ int refresh_index(struct index_state *istate, unsigned int flags, * we only have to do the special cases that are left. 
*/ preload_index(istate, pathspec, 0); + trace2_region_enter("index", "refresh", NULL); for (i = 0; i < istate->cache_nr; i++) { struct cache_entry *ce, *new_entry; int cache_errno = 0; int changed = 0; int filtered = 0; + int t2_did_lstat = 0; ce = istate->cache[i]; if (ignore_submodules && S_ISGITLINK(ce->ce_mode)) @@ -1566,7 +1572,10 @@ int refresh_index(struct index_state *istate, unsigned int flags, if (filtered) continue; - new_entry = refresh_cache_ent(istate, ce, options, &cache_errno, &changed); + new_entry = refresh_cache_ent(istate, ce, options, + &cache_errno, &changed, + &t2_did_lstat); + t2_sum_lstat += t2_did_lstat; if (new_entry == ce) continue; if (progress) @@ -1602,6 +1611,8 @@ int refresh_index(struct index_state *istate, unsigned int flags, replace_index_entry(istate, i, new_entry); } + trace2_data_intmax("index", NULL, "refresh/sum_lstat", t2_sum_lstat); + trace2_region_leave("index", "refresh", NULL); if (progress) { display_progress(progress, istate->cache_nr); stop_progress(&progress); @@ -1614,7 +1625,7 @@ struct cache_entry *refresh_cache_entry(struct index_state *istate, struct cache_entry *ce, unsigned int options) { - return refresh_cache_ent(istate, ce, options, NULL, NULL); + return refresh_cache_ent(istate, ce, options, NULL, NULL, NULL); } From 15268d12bef12a40753767882088f4fcfefb3b2b Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:46 +0000 Subject: [PATCH 31/63] read-cache: log the number of scanned files to trace2 Report the number of files in the working directory that were read and their hashes verified in `refresh_index()`. FSMonitor improves the performance of commands like `git status` by avoiding scanning the disk for changed files. Let's measure this. Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- read-cache.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/read-cache.c b/read-cache.c index 893cc41e1d..c9dd7f4015 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1365,7 +1365,8 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, struct cache_entry *ce, unsigned int options, int *err, int *changed_ret, - int *t2_did_lstat) + int *t2_did_lstat, + int *t2_did_scan) { struct stat st; struct cache_entry *updated; @@ -1445,6 +1446,8 @@ static struct cache_entry *refresh_cache_ent(struct index_state *istate, } } + if (t2_did_scan) + *t2_did_scan = 1; if (ie_modified(istate, ce, &st, options)) { if (err) *err = EINVAL; @@ -1523,6 +1526,7 @@ int refresh_index(struct index_state *istate, unsigned int flags, const char *unmerged_fmt; struct progress *progress = NULL; int t2_sum_lstat = 0; + int t2_sum_scan = 0; if (flags & REFRESH_PROGRESS && isatty(2)) progress = start_delayed_progress(_("Refresh index"), @@ -1547,6 +1551,7 @@ int refresh_index(struct index_state *istate, unsigned int flags, int changed = 0; int filtered = 0; int t2_did_lstat = 0; + int t2_did_scan = 0; ce = istate->cache[i]; if (ignore_submodules && S_ISGITLINK(ce->ce_mode)) @@ -1574,8 +1579,9 @@ int refresh_index(struct index_state *istate, unsigned int flags, new_entry = refresh_cache_ent(istate, ce, options, &cache_errno, &changed, - &t2_did_lstat); + &t2_did_lstat, &t2_did_scan); t2_sum_lstat += t2_did_lstat; + t2_sum_scan += t2_did_scan; if (new_entry == ce) continue; if (progress) @@ -1612,6 +1618,7 @@ int refresh_index(struct index_state *istate, unsigned int flags, replace_index_entry(istate, i, new_entry); } trace2_data_intmax("index", NULL, 
"refresh/sum_lstat", t2_sum_lstat); + trace2_data_intmax("index", NULL, "refresh/sum_scan", t2_sum_scan); trace2_region_leave("index", "refresh", NULL); if (progress) { display_progress(progress, istate->cache_nr); @@ -1625,7 +1632,7 @@ struct cache_entry *refresh_cache_entry(struct index_state *istate, struct cache_entry *ce, unsigned int options) { - return refresh_cache_ent(istate, ce, options, NULL, NULL, NULL); + return refresh_cache_ent(istate, ce, options, NULL, NULL, NULL, NULL); } From 940b94f35cf1858adf23fe939bcbbe73147ca1f3 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:47 +0000 Subject: [PATCH 32/63] fsmonitor: log invocation of FSMonitor hook to trace2 Let's measure the time taken to request and receive FSMonitor data via the hook API and the size of the response. Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- fsmonitor.c | 29 ++++++++++++++++++++++++++++- fsmonitor.h | 5 +++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/fsmonitor.c b/fsmonitor.c index ca031c3abb..7a2be24cd4 100644 --- a/fsmonitor.c +++ b/fsmonitor.c @@ -142,6 +142,7 @@ void write_fsmonitor_extension(struct strbuf *sb, struct index_state *istate) static int query_fsmonitor(int version, const char *last_update, struct strbuf *query_result) { struct child_process cp = CHILD_PROCESS_INIT; + int result; if (!core_fsmonitor) return -1; @@ -152,7 +153,33 @@ static int query_fsmonitor(int version, const char *last_update, struct strbuf * cp.use_shell = 1; cp.dir = get_git_work_tree(); - return capture_command(&cp, query_result, 1024); + trace2_region_enter("fsm_hook", "query", NULL); + + result = capture_command(&cp, query_result, 1024); + + if (result) + trace2_data_intmax("fsm_hook", NULL, "query/failed", result); + else { + trace2_data_intmax("fsm_hook", NULL, "query/response-length", + query_result->len); + + if (fsmonitor_is_trivial_response(query_result)) + trace2_data_intmax("fsm_hook", NULL, + "query/trivial-response", 1); + } + + trace2_region_leave("fsm_hook", "query", NULL); + + return result; +} + +int fsmonitor_is_trivial_response(const struct strbuf *query_result) +{ + static char trivial_response[3] = { '\0', '/', '\0' }; + int is_trivial = !memcmp(trivial_response, + &query_result->buf[query_result->len - 3], 3); + + return is_trivial; } static void fsmonitor_refresh_callback(struct index_state *istate, const char *name) diff --git a/fsmonitor.h b/fsmonitor.h index 739318ab6d..7f1794b90b 100644 --- a/fsmonitor.h +++ b/fsmonitor.h @@ -44,6 +44,11 @@ void tweak_fsmonitor(struct index_state *istate); */ void refresh_fsmonitor(struct index_state *istate); +/* + * Does the received result contain the "trivial" response? + */ +int fsmonitor_is_trivial_response(const struct strbuf *query_result); + /* * Set the given cache entries CE_FSMONITOR_VALID bit. 
This should be * called any time the cache entry has been updated to reflect the From 29fbbf43a031cefb6fe6e3f723d78f82af4979b0 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:48 +0000 Subject: [PATCH 33/63] fsmonitor: log FSMN token when reading and writing the index Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- fsmonitor.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fsmonitor.c b/fsmonitor.c index 7a2be24cd4..3105dc370a 100644 --- a/fsmonitor.c +++ b/fsmonitor.c @@ -87,7 +87,11 @@ int read_fsmonitor_extension(struct index_state *istate, const void *data, BUG("fsmonitor_dirty has more entries than the index (%"PRIuMAX" > %u)", (uintmax_t)istate->fsmonitor_dirty->bit_size, istate->cache_nr); - trace_printf_key(&trace_fsmonitor, "read fsmonitor extension successful"); + trace2_data_string("index", NULL, "extension/fsmn/read/token", + istate->fsmonitor_last_update); + trace_printf_key(&trace_fsmonitor, + "read fsmonitor extension successful '%s'", + istate->fsmonitor_last_update); return 0; } @@ -133,7 +137,11 @@ void write_fsmonitor_extension(struct strbuf *sb, struct index_state *istate) put_be32(&ewah_size, sb->len - ewah_start); memcpy(sb->buf + fixup, &ewah_size, sizeof(uint32_t)); - trace_printf_key(&trace_fsmonitor, "write fsmonitor extension successful"); + trace2_data_string("index", NULL, "extension/fsmn/write/token", + istate->fsmonitor_last_update); + trace_printf_key(&trace_fsmonitor, + "write fsmonitor extension successful '%s'", + istate->fsmonitor_last_update); } /* From ff03836b9d696617d699da2e4108057f9a48b9ef Mon Sep 17 00:00:00 2001 From: Kevin Willford Date: Wed, 3 Feb 2021 15:34:49 +0000 Subject: [PATCH 34/63] fsmonitor: allow all entries for a folder to be invalidated Allow fsmonitor to report directory changes by reporting paths with a trailing slash. Signed-off-by: Jeff Hostetler Signed-off-by: Kevin Willford Signed-off-by: Johannes Schindelin Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- fsmonitor.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/fsmonitor.c b/fsmonitor.c index 3105dc370a..64deeda597 100644 --- a/fsmonitor.c +++ b/fsmonitor.c @@ -190,13 +190,34 @@ int fsmonitor_is_trivial_response(const struct strbuf *query_result) return is_trivial; } -static void fsmonitor_refresh_callback(struct index_state *istate, const char *name) +static void fsmonitor_refresh_callback(struct index_state *istate, char *name) { - int pos = index_name_pos(istate, name, strlen(name)); + int i, len = strlen(name); + if (name[len - 1] == '/') { - if (pos >= 0) { - struct cache_entry *ce = istate->cache[pos]; - ce->ce_flags &= ~CE_FSMONITOR_VALID; + /* + * TODO We should binary search to find the first path with + * TODO this directory prefix. Then linearly update entries + * TODO while the prefix matches. Taking care to search without + * TODO the trailing slash -- because '/' sorts after a few + * TODO interesting special chars, like '.' and ' '. 
+ */ + + /* Mark all entries for the folder invalid */ + for (i = 0; i < istate->cache_nr; i++) { + if (istate->cache[i]->ce_flags & CE_FSMONITOR_VALID && + starts_with(istate->cache[i]->name, name)) + istate->cache[i]->ce_flags &= ~CE_FSMONITOR_VALID; + } + /* Need to remove the / from the path for the untracked cache */ + name[len - 1] = '\0'; + } else { + int pos = index_name_pos(istate, name, strlen(name)); + + if (pos >= 0) { + struct cache_entry *ce = istate->cache[pos]; + ce->ce_flags &= ~CE_FSMONITOR_VALID; + } } /* From fcd19b09f8b2cddffe18dc4d2f974fce94dc27b0 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 3 Feb 2021 15:34:50 +0000 Subject: [PATCH 35/63] fsmonitor: refactor initialization of fsmonitor_last_update token Isolate and document initialization of `istate->fsmonitor_last_update`. This field should contain a fsmonitor-specific opaque token, but we need to initialize it before we can actually talk to a fsmonitor process, so we create a generic default value. Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- fsmonitor.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/fsmonitor.c b/fsmonitor.c index 64deeda597..e12214b300 100644 --- a/fsmonitor.c +++ b/fsmonitor.c @@ -343,16 +343,45 @@ void refresh_fsmonitor(struct index_state *istate) istate->fsmonitor_last_update = strbuf_detach(&last_update_token, NULL); } +/* + * The caller wants to turn on FSMonitor. And when the caller writes + * the index to disk, a FSMonitor extension should be included. This + * requires that `istate->fsmonitor_last_update` not be NULL. But we + * have not actually talked to a FSMonitor process yet, so we don't + * have an initial value for this field. + * + * For a protocol V1 FSMonitor process, this field is a formatted + * "nanoseconds since epoch" field. However, for a protocol V2 + * FSMonitor process, this field is an opaque token. + * + * Historically, `add_fsmonitor()` has initialized this field to the + * current time for protocol V1 processes. There are lots of race + * conditions here, but that code has shipped... + * + * The only true solution is to use a V2 FSMonitor and get a current + * or default token value (that it understands), but we cannot do that + * until we have actually talked to an instance of the FSMonitor process + * (but the protocol requires that we send a token first...). + * + * For simplicity, just initialize like we have a V1 process and require + * that V2 processes adapt. 
+ */ +static void initialize_fsmonitor_last_update(struct index_state *istate) +{ + struct strbuf last_update = STRBUF_INIT; + + strbuf_addf(&last_update, "%"PRIu64"", getnanotime()); + istate->fsmonitor_last_update = strbuf_detach(&last_update, NULL); +} + void add_fsmonitor(struct index_state *istate) { unsigned int i; - struct strbuf last_update = STRBUF_INIT; if (!istate->fsmonitor_last_update) { trace_printf_key(&trace_fsmonitor, "add fsmonitor"); istate->cache_changed |= FSMONITOR_CHANGED; - strbuf_addf(&last_update, "%"PRIu64"", getnanotime()); - istate->fsmonitor_last_update = strbuf_detach(&last_update, NULL); + initialize_fsmonitor_last_update(istate); /* reset the fsmonitor state */ for (i = 0; i < istate->cache_nr; i++) From b9a43869c9f96d3577d6f568c1bda1940c8f0e31 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Wed, 3 Feb 2021 01:28:12 +0530 Subject: [PATCH 36/63] git-gui: remove lines starting with the comment character The comment character is specified by the config variable 'core.commentchar'. Any lines starting with this character is considered a comment and should not be included in the final commit message. Teach git-gui to filter out lines in the commit message that start with the comment character using git-stripspace. If the config is not set, '#' is taken as the default. Also add a message educating users about the comment character. Signed-off-by: Pratyush Yadav --- git-gui.sh | 5 +++++ lib/commit.tcl | 18 ++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/git-gui.sh b/git-gui.sh index 201524c34e..236bc4e61d 100755 --- a/git-gui.sh +++ b/git-gui.sh @@ -875,6 +875,7 @@ set default_config(merge.summary) false set default_config(merge.verbosity) 2 set default_config(user.name) {} set default_config(user.email) {} +set default_config(core.commentchar) "#" set default_config(gui.encoding) [encoding system] set default_config(gui.matchtrackingbranch) false @@ -3436,6 +3437,10 @@ proc trace_commit_type {varname args} { merge {set txt [mc "Merge Commit Message:"]} * {set txt [mc "Commit Message:"]} } + + set comment_char [get_config core.commentchar] + set txt [string cat $txt \ + [mc " (Lines starting with '$comment_char' will be ignored)"]] $ui_coml conf -text $txt } trace add variable commit_type write trace_commit_type diff --git a/lib/commit.tcl b/lib/commit.tcl index 11379f8ad3..23d67d4651 100644 --- a/lib/commit.tcl +++ b/lib/commit.tcl @@ -141,6 +141,20 @@ proc setup_commit_encoding {msg_wt {quiet 0}} { } } +proc strip_msg {msg} { + set cmd [concat [list | ] [_git_cmd stripspace] --strip-comments] + _trace_exec $cmd + set fd [open $cmd r+] + fconfigure $fd -translation binary -encoding utf-8 + + puts -nonewline $fd $msg + close $fd w + set result [read $fd] + close $fd + + return $result +} + proc commit_tree {} { global HEAD commit_type file_states ui_comm repo_config global pch_error @@ -207,8 +221,8 @@ You must stage at least 1 file before you can commit. # -- A message is required. # - set msg [string trim [$ui_comm get 1.0 end]] - regsub -all -line {[ \t\r]+$} $msg {} msg + set msg [strip_msg [$ui_comm get 1.0 end]] + if {$msg eq {}} { error_popup [mc "Please supply a commit message. From 570df42610a971b80046846d7f262007bec23dd6 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:24 +0000 Subject: [PATCH 37/63] chunk-format: create chunk format write API In anticipation of combining the logic from the commit-graph and multi-pack-index file formats, create a new chunk-format API. 
Use a 'struct chunkfile' pointer to keep track of data that has been registered for writes. This struct is anonymous outside of chunk-format.c to ensure no user attempts to interfere with the data. The next change will use this API in commit-graph.c, but the general approach is: 1. initialize the chunkfile with init_chunkfile(f). 2. add chunks in the intended writing order with add_chunk(). 3. write any header information to the hashfile f. 4. write the chunkfile data using write_chunkfile(). 5. free the chunkfile struct using free_chunkfile(). Helped-by: Taylor Blau Helped-by: Junio C Hamano Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Makefile | 1 + chunk-format.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ chunk-format.h | 21 ++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 chunk-format.c create mode 100644 chunk-format.h diff --git a/Makefile b/Makefile index 7b64106930..50a7663841 100644 --- a/Makefile +++ b/Makefile @@ -854,6 +854,7 @@ LIB_OBJS += bundle.o LIB_OBJS += cache-tree.o LIB_OBJS += chdir-notify.o LIB_OBJS += checkout.o +LIB_OBJS += chunk-format.o LIB_OBJS += color.o LIB_OBJS += column.o LIB_OBJS += combine-diff.o diff --git a/chunk-format.c b/chunk-format.c new file mode 100644 index 0000000000..6c9b52b70c --- /dev/null +++ b/chunk-format.c @@ -0,0 +1,90 @@ +#include "cache.h" +#include "chunk-format.h" +#include "csum-file.h" + +/* + * When writing a chunk-based file format, collect the chunks in + * an array of chunk_info structs. The size stores the _expected_ + * amount of data that will be written by write_fn. + */ +struct chunk_info { + uint32_t id; + uint64_t size; + chunk_write_fn write_fn; +}; + +struct chunkfile { + struct hashfile *f; + + struct chunk_info *chunks; + size_t chunks_nr; + size_t chunks_alloc; +}; + +struct chunkfile *init_chunkfile(struct hashfile *f) +{ + struct chunkfile *cf = xcalloc(1, sizeof(*cf)); + cf->f = f; + return cf; +} + +void free_chunkfile(struct chunkfile *cf) +{ + if (!cf) + return; + free(cf->chunks); + free(cf); +} + +int get_num_chunks(struct chunkfile *cf) +{ + return cf->chunks_nr; +} + +void add_chunk(struct chunkfile *cf, + uint32_t id, + size_t size, + chunk_write_fn fn) +{ + ALLOC_GROW(cf->chunks, cf->chunks_nr + 1, cf->chunks_alloc); + + cf->chunks[cf->chunks_nr].id = id; + cf->chunks[cf->chunks_nr].write_fn = fn; + cf->chunks[cf->chunks_nr].size = size; + cf->chunks_nr++; +} + +int write_chunkfile(struct chunkfile *cf, void *data) +{ + int i; + uint64_t cur_offset = hashfile_total(cf->f); + + /* Add the table of contents to the current offset */ + cur_offset += (cf->chunks_nr + 1) * CHUNK_TOC_ENTRY_SIZE; + + for (i = 0; i < cf->chunks_nr; i++) { + hashwrite_be32(cf->f, cf->chunks[i].id); + hashwrite_be64(cf->f, cur_offset); + + cur_offset += cf->chunks[i].size; + } + + /* Trailing entry marks the end of the chunks */ + hashwrite_be32(cf->f, 0); + hashwrite_be64(cf->f, cur_offset); + + for (i = 0; i < cf->chunks_nr; i++) { + off_t start_offset = hashfile_total(cf->f); + int result = cf->chunks[i].write_fn(cf->f, data); + + if (result) + return result; + + if (hashfile_total(cf->f) - start_offset != cf->chunks[i].size) + BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead", + cf->chunks[i].size, cf->chunks[i].id, + hashfile_total(cf->f) - start_offset); + } + + return 0; +} diff --git a/chunk-format.h b/chunk-format.h new file mode 100644 index 0000000000..ce598b66d9 --- /dev/null +++ b/chunk-format.h @@ -0,0 +1,21 @@ +#ifndef 
CHUNK_FORMAT_H +#define CHUNK_FORMAT_H + +#include "git-compat-util.h" + +struct hashfile; +struct chunkfile; + +#define CHUNK_TOC_ENTRY_SIZE (sizeof(uint32_t) + sizeof(uint64_t)) + +struct chunkfile *init_chunkfile(struct hashfile *f); +void free_chunkfile(struct chunkfile *cf); +int get_num_chunks(struct chunkfile *cf); +typedef int (*chunk_write_fn)(struct hashfile *f, void *data); +void add_chunk(struct chunkfile *cf, + uint32_t id, + size_t size, + chunk_write_fn fn); +int write_chunkfile(struct chunkfile *cf, void *data); + +#endif From 47410aa8370fdfc67d379fc808ac2316aef1d2c5 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:25 +0000 Subject: [PATCH 38/63] commit-graph: use chunk-format write API The commit-graph write logic is ready to make use of the chunk-format write API. Each chunk write method is already in the correct prototype. We only need to use the 'struct chunkfile' pointer and the correct API calls. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 117 +++++++++++++++---------------------------------- 1 file changed, 36 insertions(+), 81 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index fae7d1b639..a889130cc8 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -19,6 +19,7 @@ #include "shallow.h" #include "json-writer.h" #include "trace2.h" +#include "chunk-format.h" void git_test_write_commit_graph_or_die(void) { @@ -44,7 +45,6 @@ void git_test_write_commit_graph_or_die(void) #define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */ #define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */ #define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */ -#define MAX_NUM_CHUNKS 9 #define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16) @@ -1758,27 +1758,17 @@ static int write_graph_chunk_base(struct hashfile *f, return 0; } -typedef int (*chunk_write_fn)(struct hashfile *f, - void *data); - -struct chunk_info { - uint32_t id; - uint64_t size; - chunk_write_fn write_fn; -}; - static int write_commit_graph_file(struct write_commit_graph_context *ctx) { uint32_t i; int fd; struct hashfile *f; struct lock_file lk = LOCK_INIT; - struct chunk_info chunks[MAX_NUM_CHUNKS + 1]; const unsigned hashsz = the_hash_algo->rawsz; struct strbuf progress_title = STRBUF_INIT; int num_chunks = 3; - uint64_t chunk_offset; struct object_id file_hash; + struct chunkfile *cf; if (ctx->split) { struct strbuf tmp_file = STRBUF_INIT; @@ -1824,76 +1814,50 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf); } - chunks[0].id = GRAPH_CHUNKID_OIDFANOUT; - chunks[0].size = GRAPH_FANOUT_SIZE; - chunks[0].write_fn = write_graph_chunk_fanout; - chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP; - chunks[1].size = hashsz * ctx->commits.nr; - chunks[1].write_fn = write_graph_chunk_oids; - chunks[2].id = GRAPH_CHUNKID_DATA; - chunks[2].size = (hashsz + 16) * ctx->commits.nr; - chunks[2].write_fn = write_graph_chunk_data; + cf = init_chunkfile(f); + + add_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, GRAPH_FANOUT_SIZE, + write_graph_chunk_fanout); + add_chunk(cf, GRAPH_CHUNKID_OIDLOOKUP, hashsz * ctx->commits.nr, + write_graph_chunk_oids); + add_chunk(cf, GRAPH_CHUNKID_DATA, (hashsz + 16) * ctx->commits.nr, + write_graph_chunk_data); if (git_env_bool(GIT_TEST_COMMIT_GRAPH_NO_GDAT, 0)) ctx->write_generation_data = 0; - if (ctx->write_generation_data) { - chunks[num_chunks].id = GRAPH_CHUNKID_GENERATION_DATA; - chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr; - chunks[num_chunks].write_fn = 
write_graph_chunk_generation_data; - num_chunks++; - } - if (ctx->num_generation_data_overflows) { - chunks[num_chunks].id = GRAPH_CHUNKID_GENERATION_DATA_OVERFLOW; - chunks[num_chunks].size = sizeof(timestamp_t) * ctx->num_generation_data_overflows; - chunks[num_chunks].write_fn = write_graph_chunk_generation_data_overflow; - num_chunks++; - } - if (ctx->num_extra_edges) { - chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES; - chunks[num_chunks].size = 4 * ctx->num_extra_edges; - chunks[num_chunks].write_fn = write_graph_chunk_extra_edges; - num_chunks++; - } + if (ctx->write_generation_data) + add_chunk(cf, GRAPH_CHUNKID_GENERATION_DATA, + sizeof(uint32_t) * ctx->commits.nr, + write_graph_chunk_generation_data); + if (ctx->num_generation_data_overflows) + add_chunk(cf, GRAPH_CHUNKID_GENERATION_DATA_OVERFLOW, + sizeof(timestamp_t) * ctx->num_generation_data_overflows, + write_graph_chunk_generation_data_overflow); + if (ctx->num_extra_edges) + add_chunk(cf, GRAPH_CHUNKID_EXTRAEDGES, + 4 * ctx->num_extra_edges, + write_graph_chunk_extra_edges); if (ctx->changed_paths) { - chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES; - chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr; - chunks[num_chunks].write_fn = write_graph_chunk_bloom_indexes; - num_chunks++; - chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA; - chunks[num_chunks].size = sizeof(uint32_t) * 3 - + ctx->total_bloom_filter_data_size; - chunks[num_chunks].write_fn = write_graph_chunk_bloom_data; - num_chunks++; + add_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES, + sizeof(uint32_t) * ctx->commits.nr, + write_graph_chunk_bloom_indexes); + add_chunk(cf, GRAPH_CHUNKID_BLOOMDATA, + sizeof(uint32_t) * 3 + + ctx->total_bloom_filter_data_size, + write_graph_chunk_bloom_data); } - if (ctx->num_commit_graphs_after > 1) { - chunks[num_chunks].id = GRAPH_CHUNKID_BASE; - chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1); - chunks[num_chunks].write_fn = write_graph_chunk_base; - num_chunks++; - } - - chunks[num_chunks].id = 0; - chunks[num_chunks].size = 0; + if (ctx->num_commit_graphs_after > 1) + add_chunk(cf, GRAPH_CHUNKID_BASE, + hashsz * (ctx->num_commit_graphs_after - 1), + write_graph_chunk_base); hashwrite_be32(f, GRAPH_SIGNATURE); hashwrite_u8(f, GRAPH_VERSION); hashwrite_u8(f, oid_version()); - hashwrite_u8(f, num_chunks); + hashwrite_u8(f, get_num_chunks(cf)); hashwrite_u8(f, ctx->num_commit_graphs_after - 1); - chunk_offset = 8 + (num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH; - for (i = 0; i <= num_chunks; i++) { - uint32_t chunk_write[3]; - - chunk_write[0] = htonl(chunks[i].id); - chunk_write[1] = htonl(chunk_offset >> 32); - chunk_write[2] = htonl(chunk_offset & 0xffffffff); - hashwrite(f, chunk_write, 12); - - chunk_offset += chunks[i].size; - } - if (ctx->report_progress) { strbuf_addf(&progress_title, Q_("Writing out commit graph in %d pass", @@ -1905,17 +1869,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) num_chunks * ctx->commits.nr); } - for (i = 0; i < num_chunks; i++) { - uint64_t start_offset = f->total + f->offset; - - if (chunks[i].write_fn(f, ctx)) - return -1; - - if (f->total + f->offset != start_offset + chunks[i].size) - BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead", - chunks[i].size, chunks[i].id, - f->total + f->offset - start_offset); - } + write_chunkfile(cf, ctx); stop_progress(&ctx->progress); strbuf_release(&progress_title); @@ -1932,6 +1886,7 @@ static int write_commit_graph_file(struct write_commit_graph_context 
*ctx) close_commit_graph(ctx->r->objects); finalize_hashfile(f, file_hash.hash, CSUM_HASH_IN_STREAM | CSUM_FSYNC); + free_chunkfile(cf); if (ctx->split) { FILE *chainf = fdopen_lock_file(&lk, "w"); From 577dc49696afee67fb507e0bd72be4c8677b83c2 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:26 +0000 Subject: [PATCH 39/63] midx: rename pack_info to write_midx_context In an effort to streamline our chunk-based file formats, align some of the code structure in write_midx_internal() to be similar to the patterns in write_commit_graph_file(). Specifically, let's create a "struct write_midx_context" that can be used as a data parameter to abstract function types. This change only renames "struct pack_info" to "struct write_midx_context" and the names of instances from "packs" to "ctx". In future changes, we will expand the data inside "struct write_midx_context" and align our chunk-writing method with the chunk-format API. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 124 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/midx.c b/midx.c index 79c282b070..561f65a63a 100644 --- a/midx.c +++ b/midx.c @@ -451,7 +451,7 @@ static int pack_info_compare(const void *_a, const void *_b) return strcmp(a->pack_name, b->pack_name); } -struct pack_list { +struct write_midx_context { struct pack_info *info; uint32_t nr; uint32_t alloc; @@ -463,37 +463,37 @@ struct pack_list { static void add_pack_to_midx(const char *full_path, size_t full_path_len, const char *file_name, void *data) { - struct pack_list *packs = (struct pack_list *)data; + struct write_midx_context *ctx = data; if (ends_with(file_name, ".idx")) { - display_progress(packs->progress, ++packs->pack_paths_checked); - if (packs->m && midx_contains_pack(packs->m, file_name)) + display_progress(ctx->progress, ++ctx->pack_paths_checked); + if (ctx->m && midx_contains_pack(ctx->m, file_name)) return; - ALLOC_GROW(packs->info, packs->nr + 1, packs->alloc); + ALLOC_GROW(ctx->info, ctx->nr + 1, ctx->alloc); - packs->info[packs->nr].p = add_packed_git(full_path, - full_path_len, - 0); + ctx->info[ctx->nr].p = add_packed_git(full_path, + full_path_len, + 0); - if (!packs->info[packs->nr].p) { + if (!ctx->info[ctx->nr].p) { warning(_("failed to add packfile '%s'"), full_path); return; } - if (open_pack_index(packs->info[packs->nr].p)) { + if (open_pack_index(ctx->info[ctx->nr].p)) { warning(_("failed to open pack-index '%s'"), full_path); - close_pack(packs->info[packs->nr].p); - FREE_AND_NULL(packs->info[packs->nr].p); + close_pack(ctx->info[ctx->nr].p); + FREE_AND_NULL(ctx->info[ctx->nr].p); return; } - packs->info[packs->nr].pack_name = xstrdup(file_name); - packs->info[packs->nr].orig_pack_int_id = packs->nr; - packs->info[packs->nr].expired = 0; - packs->nr++; + ctx->info[ctx->nr].pack_name = xstrdup(file_name); + ctx->info[ctx->nr].orig_pack_int_id = ctx->nr; + ctx->info[ctx->nr].expired = 0; + ctx->nr++; } } @@ -801,7 +801,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * uint32_t i; struct hashfile *f = NULL; struct lock_file lk; - struct pack_list packs; + struct write_midx_context ctx = { 0 }; uint32_t *pack_perm = NULL; uint64_t written = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; @@ -820,40 +820,40 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * midx_name); if (m) - packs.m = m; + ctx.m = m; else - packs.m = load_multi_pack_index(object_dir, 1); + 
ctx.m = load_multi_pack_index(object_dir, 1); - packs.nr = 0; - packs.alloc = packs.m ? packs.m->num_packs : 16; - packs.info = NULL; - ALLOC_ARRAY(packs.info, packs.alloc); + ctx.nr = 0; + ctx.alloc = ctx.m ? ctx.m->num_packs : 16; + ctx.info = NULL; + ALLOC_ARRAY(ctx.info, ctx.alloc); - if (packs.m) { - for (i = 0; i < packs.m->num_packs; i++) { - ALLOC_GROW(packs.info, packs.nr + 1, packs.alloc); + if (ctx.m) { + for (i = 0; i < ctx.m->num_packs; i++) { + ALLOC_GROW(ctx.info, ctx.nr + 1, ctx.alloc); - packs.info[packs.nr].orig_pack_int_id = i; - packs.info[packs.nr].pack_name = xstrdup(packs.m->pack_names[i]); - packs.info[packs.nr].p = NULL; - packs.info[packs.nr].expired = 0; - packs.nr++; + ctx.info[ctx.nr].orig_pack_int_id = i; + ctx.info[ctx.nr].pack_name = xstrdup(ctx.m->pack_names[i]); + ctx.info[ctx.nr].p = NULL; + ctx.info[ctx.nr].expired = 0; + ctx.nr++; } } - packs.pack_paths_checked = 0; + ctx.pack_paths_checked = 0; if (flags & MIDX_PROGRESS) - packs.progress = start_delayed_progress(_("Adding packfiles to multi-pack-index"), 0); + ctx.progress = start_delayed_progress(_("Adding packfiles to multi-pack-index"), 0); else - packs.progress = NULL; + ctx.progress = NULL; - for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &packs); - stop_progress(&packs.progress); + for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &ctx); + stop_progress(&ctx.progress); - if (packs.m && packs.nr == packs.m->num_packs && !packs_to_drop) + if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop) goto cleanup; - entries = get_sorted_entries(packs.m, packs.info, packs.nr, &nr_entries); + entries = get_sorted_entries(ctx.m, ctx.info, ctx.nr, &nr_entries); for (i = 0; i < nr_entries; i++) { if (entries[i].offset > 0x7fffffff) @@ -862,19 +862,19 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * large_offsets_needed = 1; } - QSORT(packs.info, packs.nr, pack_info_compare); + QSORT(ctx.info, ctx.nr, pack_info_compare); if (packs_to_drop && packs_to_drop->nr) { int drop_index = 0; int missing_drops = 0; - for (i = 0; i < packs.nr && drop_index < packs_to_drop->nr; i++) { - int cmp = strcmp(packs.info[i].pack_name, + for (i = 0; i < ctx.nr && drop_index < packs_to_drop->nr; i++) { + int cmp = strcmp(ctx.info[i].pack_name, packs_to_drop->items[drop_index].string); if (!cmp) { drop_index++; - packs.info[i].expired = 1; + ctx.info[i].expired = 1; } else if (cmp > 0) { error(_("did not see pack-file %s to drop"), packs_to_drop->items[drop_index].string); @@ -882,7 +882,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * missing_drops++; i--; } else { - packs.info[i].expired = 0; + ctx.info[i].expired = 0; } } @@ -898,19 +898,19 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * * * pack_perm[old_id] = new_id */ - ALLOC_ARRAY(pack_perm, packs.nr); - for (i = 0; i < packs.nr; i++) { - if (packs.info[i].expired) { + ALLOC_ARRAY(pack_perm, ctx.nr); + for (i = 0; i < ctx.nr; i++) { + if (ctx.info[i].expired) { dropped_packs++; - pack_perm[packs.info[i].orig_pack_int_id] = PACK_EXPIRED; + pack_perm[ctx.info[i].orig_pack_int_id] = PACK_EXPIRED; } else { - pack_perm[packs.info[i].orig_pack_int_id] = i - dropped_packs; + pack_perm[ctx.info[i].orig_pack_int_id] = i - dropped_packs; } } - for (i = 0; i < packs.nr; i++) { - if (!packs.info[i].expired) - pack_name_concat_len += strlen(packs.info[i].pack_name) + 1; + for (i = 0; i < ctx.nr; i++) { + if (!ctx.info[i].expired) + pack_name_concat_len += 
strlen(ctx.info[i].pack_name) + 1; } if (pack_name_concat_len % MIDX_CHUNK_ALIGNMENT) @@ -921,19 +921,19 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf); FREE_AND_NULL(midx_name); - if (packs.m) - close_midx(packs.m); + if (ctx.m) + close_midx(ctx.m); cur_chunk = 0; num_chunks = large_offsets_needed ? 5 : 4; - if (packs.nr - dropped_packs == 0) { + if (ctx.nr - dropped_packs == 0) { error(_("no pack files to index.")); result = 1; goto cleanup; } - written = write_midx_header(f, num_chunks, packs.nr - dropped_packs); + written = write_midx_header(f, num_chunks, ctx.nr - dropped_packs); chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES; chunk_offsets[cur_chunk] = written + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH; @@ -990,7 +990,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * switch (chunk_ids[i]) { case MIDX_CHUNKID_PACKNAMES: - written += write_midx_pack_names(f, packs.info, packs.nr); + written += write_midx_pack_names(f, ctx.info, ctx.nr); break; case MIDX_CHUNKID_OIDFANOUT: @@ -1027,15 +1027,15 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * commit_lock_file(&lk); cleanup: - for (i = 0; i < packs.nr; i++) { - if (packs.info[i].p) { - close_pack(packs.info[i].p); - free(packs.info[i].p); + for (i = 0; i < ctx.nr; i++) { + if (ctx.info[i].p) { + close_pack(ctx.info[i].p); + free(ctx.info[i].p); } - free(packs.info[i].pack_name); + free(ctx.info[i].pack_name); } - free(packs.info); + free(ctx.info); free(entries); free(pack_perm); free(midx_name); From b4d941420bce15cd48bccabc1faa90477bda2fae Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:27 +0000 Subject: [PATCH 40/63] midx: use context in write_midx_pack_names() In an effort to align the write_midx_internal() to use the chunk-format API, start converting chunk writing methods to match chunk_write_fn. The first case is to convert write_midx_pack_names() to take "void *data". We already have the necessary data in "struct write_midx_context", so this conversion is rather mechanical. 
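For illustration only (not part of this change or the series), the overall write-path flow that these conversions feed into can be sketched as follows. The struct, chunk id and function names (my_ctx, MY_CHUNK_ID, write_my_chunk, write_my_file) are hypothetical; init_chunkfile(), add_chunk(), write_chunkfile(), free_chunkfile(), hashfd(), hashwrite_be32() and finalize_hashfile() are the calls that actually appear in this series.

/*
 * Hypothetical sketch of the write path (steps 1-5 from the
 * chunk-format write API patch). Assumes git's internal headers.
 */
#include "cache.h"
#include "chunk-format.h"
#include "csum-file.h"

#define MY_CHUNK_ID 0x4d594944 /* "MYID", illustrative */

struct my_ctx {
	uint32_t nr;       /* number of records to write */
	uint32_t *records; /* data backing the chunk */
};

/* Matches chunk_write_fn: recover the context from 'void *data'. */
static int write_my_chunk(struct hashfile *f, void *data)
{
	struct my_ctx *ctx = data;
	uint32_t i;

	for (i = 0; i < ctx->nr; i++)
		hashwrite_be32(f, ctx->records[i]);
	return 0;
}

static int write_my_file(int fd, const char *name, struct my_ctx *ctx)
{
	struct hashfile *f = hashfd(fd, name);
	struct chunkfile *cf = init_chunkfile(f);	/* 1. initialize */
	int res;

	add_chunk(cf, MY_CHUNK_ID,			/* 2. declare chunks */
		  (size_t)ctx->nr * sizeof(uint32_t),
		  write_my_chunk);

	/* 3. any file header would be written to 'f' here */

	res = write_chunkfile(cf, ctx);			/* 4. TOC, then chunk data */

	finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
	free_chunkfile(cf);				/* 5. release */
	return res;
}

The declared size must match what the writer emits, since write_chunkfile() compares the hashfile position against it after each chunk.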
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/midx.c b/midx.c index 561f65a63a..88452b0443 100644 --- a/midx.c +++ b/midx.c @@ -643,27 +643,26 @@ static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m, return deduplicated_entries; } -static size_t write_midx_pack_names(struct hashfile *f, - struct pack_info *info, - uint32_t num_packs) +static size_t write_midx_pack_names(struct hashfile *f, void *data) { + struct write_midx_context *ctx = data; uint32_t i; unsigned char padding[MIDX_CHUNK_ALIGNMENT]; size_t written = 0; - for (i = 0; i < num_packs; i++) { + for (i = 0; i < ctx->nr; i++) { size_t writelen; - if (info[i].expired) + if (ctx->info[i].expired) continue; - if (i && strcmp(info[i].pack_name, info[i - 1].pack_name) <= 0) + if (i && strcmp(ctx->info[i].pack_name, ctx->info[i - 1].pack_name) <= 0) BUG("incorrect pack-file order: %s before %s", - info[i - 1].pack_name, - info[i].pack_name); + ctx->info[i - 1].pack_name, + ctx->info[i].pack_name); - writelen = strlen(info[i].pack_name) + 1; - hashwrite(f, info[i].pack_name, writelen); + writelen = strlen(ctx->info[i].pack_name) + 1; + hashwrite(f, ctx->info[i].pack_name, writelen); written += writelen; } @@ -990,7 +989,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * switch (chunk_ids[i]) { case MIDX_CHUNKID_PACKNAMES: - written += write_midx_pack_names(f, ctx.info, ctx.nr); + written += write_midx_pack_names(f, &ctx); break; case MIDX_CHUNKID_OIDFANOUT: From 31bda9a237b363574bd8a5bd98f86bdeb6ced1e6 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:28 +0000 Subject: [PATCH 41/63] midx: add entries to write_midx_context In an effort to align write_midx_internal() with the chunk-format API, continue to group necessary data into "struct write_midx_context". This change collects the "struct pack_midx_entry *entries" list and its count into the context. Update write_midx_oid_fanout() and write_midx_oid_lookup() to take the context directly, as these are easy conversions with this new data. Only the callers of write_midx_object_offsets() and write_midx_large_offsets() are updated here, since additional data in the context before those methods can match chunk_write_fn. 
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/midx.c b/midx.c index 88452b0443..4520ef82b9 100644 --- a/midx.c +++ b/midx.c @@ -458,6 +458,9 @@ struct write_midx_context { struct multi_pack_index *m; struct progress *progress; unsigned pack_paths_checked; + + struct pack_midx_entry *entries; + uint32_t entries_nr; }; static void add_pack_to_midx(const char *full_path, size_t full_path_len, @@ -678,11 +681,11 @@ static size_t write_midx_pack_names(struct hashfile *f, void *data) } static size_t write_midx_oid_fanout(struct hashfile *f, - struct pack_midx_entry *objects, - uint32_t nr_objects) + void *data) { - struct pack_midx_entry *list = objects; - struct pack_midx_entry *last = objects + nr_objects; + struct write_midx_context *ctx = data; + struct pack_midx_entry *list = ctx->entries; + struct pack_midx_entry *last = ctx->entries + ctx->entries_nr; uint32_t count = 0; uint32_t i; @@ -706,18 +709,19 @@ static size_t write_midx_oid_fanout(struct hashfile *f, return MIDX_CHUNK_FANOUT_SIZE; } -static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len, - struct pack_midx_entry *objects, - uint32_t nr_objects) +static size_t write_midx_oid_lookup(struct hashfile *f, + void *data) { - struct pack_midx_entry *list = objects; + struct write_midx_context *ctx = data; + unsigned char hash_len = the_hash_algo->rawsz; + struct pack_midx_entry *list = ctx->entries; uint32_t i; size_t written = 0; - for (i = 0; i < nr_objects; i++) { + for (i = 0; i < ctx->entries_nr; i++) { struct pack_midx_entry *obj = list++; - if (i < nr_objects - 1) { + if (i < ctx->entries_nr - 1) { struct pack_midx_entry *next = list; if (oidcmp(&obj->oid, &next->oid) >= 0) BUG("OIDs not in order: %s >= %s", @@ -805,8 +809,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * uint64_t written = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; - uint32_t nr_entries, num_large_offsets = 0; - struct pack_midx_entry *entries = NULL; + uint32_t num_large_offsets = 0; struct progress *progress = NULL; int large_offsets_needed = 0; int pack_name_concat_len = 0; @@ -852,12 +855,12 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop) goto cleanup; - entries = get_sorted_entries(ctx.m, ctx.info, ctx.nr, &nr_entries); + ctx.entries = get_sorted_entries(ctx.m, ctx.info, ctx.nr, &ctx.entries_nr); - for (i = 0; i < nr_entries; i++) { - if (entries[i].offset > 0x7fffffff) + for (i = 0; i < ctx.entries_nr; i++) { + if (ctx.entries[i].offset > 0x7fffffff) num_large_offsets++; - if (entries[i].offset > 0xffffffff) + if (ctx.entries[i].offset > 0xffffffff) large_offsets_needed = 1; } @@ -947,10 +950,10 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * cur_chunk++; chunk_ids[cur_chunk] = MIDX_CHUNKID_OBJECTOFFSETS; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * the_hash_algo->rawsz; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + ctx.entries_nr * the_hash_algo->rawsz; cur_chunk++; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_CHUNK_OFFSET_WIDTH; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH; if (large_offsets_needed) { chunk_ids[cur_chunk] = MIDX_CHUNKID_LARGEOFFSETS; @@ 
-993,19 +996,19 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * break; case MIDX_CHUNKID_OIDFANOUT: - written += write_midx_oid_fanout(f, entries, nr_entries); + written += write_midx_oid_fanout(f, &ctx); break; case MIDX_CHUNKID_OIDLOOKUP: - written += write_midx_oid_lookup(f, the_hash_algo->rawsz, entries, nr_entries); + written += write_midx_oid_lookup(f, &ctx); break; case MIDX_CHUNKID_OBJECTOFFSETS: - written += write_midx_object_offsets(f, large_offsets_needed, pack_perm, entries, nr_entries); + written += write_midx_object_offsets(f, large_offsets_needed, pack_perm, ctx.entries, ctx.entries_nr); break; case MIDX_CHUNKID_LARGEOFFSETS: - written += write_midx_large_offsets(f, num_large_offsets, entries, nr_entries); + written += write_midx_large_offsets(f, num_large_offsets, ctx.entries, ctx.entries_nr); break; default: @@ -1035,7 +1038,7 @@ cleanup: } free(ctx.info); - free(entries); + free(ctx.entries); free(pack_perm); free(midx_name); return result; From 7a3ada1192bd251e530a668f63a65b4befa076d0 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:29 +0000 Subject: [PATCH 42/63] midx: add pack_perm to write_midx_context In an effort to align write_midx_internal() with the chunk-format API, continue to group necessary data into "struct write_midx_context". This change collects the "uint32_t *pack_perm" and large_offsets_needed bit into the context. Update write_midx_object_offsets() to match chunk_write_fn. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/midx.c b/midx.c index 4520ef82b9..cd994e333e 100644 --- a/midx.c +++ b/midx.c @@ -461,6 +461,9 @@ struct write_midx_context { struct pack_midx_entry *entries; uint32_t entries_nr; + + uint32_t *pack_perm; + unsigned large_offsets_needed:1; }; static void add_pack_to_midx(const char *full_path, size_t full_path_len, @@ -736,27 +739,27 @@ static size_t write_midx_oid_lookup(struct hashfile *f, return written; } -static size_t write_midx_object_offsets(struct hashfile *f, int large_offset_needed, - uint32_t *perm, - struct pack_midx_entry *objects, uint32_t nr_objects) +static size_t write_midx_object_offsets(struct hashfile *f, + void *data) { - struct pack_midx_entry *list = objects; + struct write_midx_context *ctx = data; + struct pack_midx_entry *list = ctx->entries; uint32_t i, nr_large_offset = 0; size_t written = 0; - for (i = 0; i < nr_objects; i++) { + for (i = 0; i < ctx->entries_nr; i++) { struct pack_midx_entry *obj = list++; - if (perm[obj->pack_int_id] == PACK_EXPIRED) + if (ctx->pack_perm[obj->pack_int_id] == PACK_EXPIRED) BUG("object %s is in an expired pack with int-id %d", oid_to_hex(&obj->oid), obj->pack_int_id); - hashwrite_be32(f, perm[obj->pack_int_id]); + hashwrite_be32(f, ctx->pack_perm[obj->pack_int_id]); - if (large_offset_needed && obj->offset >> 31) + if (ctx->large_offsets_needed && obj->offset >> 31) hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++); - else if (!large_offset_needed && obj->offset >> 32) + else if (!ctx->large_offsets_needed && obj->offset >> 32) BUG("object %s requires a large offset (%"PRIx64") but the MIDX is not writing large offsets!", oid_to_hex(&obj->oid), obj->offset); @@ -805,13 +808,11 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * struct hashfile *f = NULL; struct lock_file lk; struct write_midx_context ctx = { 0 }; - uint32_t *pack_perm = 
NULL; uint64_t written = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; uint32_t num_large_offsets = 0; struct progress *progress = NULL; - int large_offsets_needed = 0; int pack_name_concat_len = 0; int dropped_packs = 0; int result = 0; @@ -857,11 +858,12 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * ctx.entries = get_sorted_entries(ctx.m, ctx.info, ctx.nr, &ctx.entries_nr); + ctx.large_offsets_needed = 0; for (i = 0; i < ctx.entries_nr; i++) { if (ctx.entries[i].offset > 0x7fffffff) num_large_offsets++; if (ctx.entries[i].offset > 0xffffffff) - large_offsets_needed = 1; + ctx.large_offsets_needed = 1; } QSORT(ctx.info, ctx.nr, pack_info_compare); @@ -900,13 +902,13 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * * * pack_perm[old_id] = new_id */ - ALLOC_ARRAY(pack_perm, ctx.nr); + ALLOC_ARRAY(ctx.pack_perm, ctx.nr); for (i = 0; i < ctx.nr; i++) { if (ctx.info[i].expired) { dropped_packs++; - pack_perm[ctx.info[i].orig_pack_int_id] = PACK_EXPIRED; + ctx.pack_perm[ctx.info[i].orig_pack_int_id] = PACK_EXPIRED; } else { - pack_perm[ctx.info[i].orig_pack_int_id] = i - dropped_packs; + ctx.pack_perm[ctx.info[i].orig_pack_int_id] = i - dropped_packs; } } @@ -927,7 +929,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * close_midx(ctx.m); cur_chunk = 0; - num_chunks = large_offsets_needed ? 5 : 4; + num_chunks = ctx.large_offsets_needed ? 5 : 4; if (ctx.nr - dropped_packs == 0) { error(_("no pack files to index.")); @@ -954,7 +956,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * cur_chunk++; chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH; - if (large_offsets_needed) { + if (ctx.large_offsets_needed) { chunk_ids[cur_chunk] = MIDX_CHUNKID_LARGEOFFSETS; cur_chunk++; @@ -1004,7 +1006,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * break; case MIDX_CHUNKID_OBJECTOFFSETS: - written += write_midx_object_offsets(f, large_offsets_needed, pack_perm, ctx.entries, ctx.entries_nr); + written += write_midx_object_offsets(f, &ctx); break; case MIDX_CHUNKID_LARGEOFFSETS: @@ -1039,7 +1041,7 @@ cleanup: free(ctx.info); free(ctx.entries); - free(pack_perm); + free(ctx.pack_perm); free(midx_name); return result; } From 980f525c3cee7e272136eeb6e988a66f5f8a0bd8 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:30 +0000 Subject: [PATCH 43/63] midx: add num_large_offsets to write_midx_context In an effort to align write_midx_internal() with the chunk-format API, continue to group necessary data into "struct write_midx_context". This change collects the "uint32_t num_large_offsets" into the context. With this new data, write_midx_large_offsets() now matches the chunk_write_fn type. 
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/midx.c b/midx.c index cd994e333e..5be081f229 100644 --- a/midx.c +++ b/midx.c @@ -464,6 +464,7 @@ struct write_midx_context { uint32_t *pack_perm; unsigned large_offsets_needed:1; + uint32_t num_large_offsets; }; static void add_pack_to_midx(const char *full_path, size_t full_path_len, @@ -772,11 +773,14 @@ static size_t write_midx_object_offsets(struct hashfile *f, return written; } -static size_t write_midx_large_offsets(struct hashfile *f, uint32_t nr_large_offset, - struct pack_midx_entry *objects, uint32_t nr_objects) +static size_t write_midx_large_offsets(struct hashfile *f, + void *data) { - struct pack_midx_entry *list = objects, *end = objects + nr_objects; + struct write_midx_context *ctx = data; + struct pack_midx_entry *list = ctx->entries; + struct pack_midx_entry *end = ctx->entries + ctx->entries_nr; size_t written = 0; + uint32_t nr_large_offset = ctx->num_large_offsets; while (nr_large_offset) { struct pack_midx_entry *obj; @@ -811,7 +815,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * uint64_t written = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; - uint32_t num_large_offsets = 0; struct progress *progress = NULL; int pack_name_concat_len = 0; int dropped_packs = 0; @@ -861,7 +864,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * ctx.large_offsets_needed = 0; for (i = 0; i < ctx.entries_nr; i++) { if (ctx.entries[i].offset > 0x7fffffff) - num_large_offsets++; + ctx.num_large_offsets++; if (ctx.entries[i].offset > 0xffffffff) ctx.large_offsets_needed = 1; } @@ -961,7 +964,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * cur_chunk++; chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + - num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH; + ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH; } chunk_ids[cur_chunk] = 0; @@ -1010,7 +1013,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * break; case MIDX_CHUNKID_LARGEOFFSETS: - written += write_midx_large_offsets(f, num_large_offsets, ctx.entries, ctx.entries_nr); + written += write_midx_large_offsets(f, &ctx); break; default: From 0ccd713cb6c4dafd36bdf85384becbb9b38504ba Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:31 +0000 Subject: [PATCH 44/63] midx: return success/failure in chunk write methods Historically, the chunk-writing methods in midx.c have returned the amount of data written so the writer method could compare this with the table of contents. This presents with some interesting issues: 1. If a chunk writing method has a bug that miscalculates the written bytes, then we can satisfy the table of contents without actually writing the right amount of data to the hashfile. The commit-graph writing code checks the hashfile struct directly for a more robust verification. 2. There is no way for a chunk writing method to gracefully fail. Returning an int presents an opportunity to fail without a die(). 3. The current pattern doesn't match chunk_write_fn type exactly, so we cannot share code with commit-graph.c For these reasons, convert the midx chunk writer methods to return an 'int'. Since none of them fail at the moment, they all return 0. 
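As an editorial aside (not part of this change), a writer can now report a problem instead of dying; write_chunkfile() stops at the first non-zero return and hands that value back to its caller, as seen in chunk-format.c earlier in the series. The function below and its error condition are purely illustrative.

/* Hypothetical sketch: a chunk writer that fails without die(). */
static int write_example_offsets(struct hashfile *f, void *data)
{
	struct write_midx_context *ctx = data;
	uint32_t i;

	for (i = 0; i < ctx->entries_nr; i++) {
		if (ctx->entries[i].offset > 0xffffffff)
			return error(_("offset does not fit in 32 bits"));
		hashwrite_be32(f, (uint32_t)ctx->entries[i].offset);
	}
	return 0;
}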
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 63 +++++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 36 deletions(-) diff --git a/midx.c b/midx.c index 5be081f229..c92a6c47be 100644 --- a/midx.c +++ b/midx.c @@ -650,7 +650,7 @@ static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m, return deduplicated_entries; } -static size_t write_midx_pack_names(struct hashfile *f, void *data) +static int write_midx_pack_names(struct hashfile *f, void *data) { struct write_midx_context *ctx = data; uint32_t i; @@ -678,14 +678,13 @@ static size_t write_midx_pack_names(struct hashfile *f, void *data) if (i < MIDX_CHUNK_ALIGNMENT) { memset(padding, 0, sizeof(padding)); hashwrite(f, padding, i); - written += i; } - return written; + return 0; } -static size_t write_midx_oid_fanout(struct hashfile *f, - void *data) +static int write_midx_oid_fanout(struct hashfile *f, + void *data) { struct write_midx_context *ctx = data; struct pack_midx_entry *list = ctx->entries; @@ -710,17 +709,16 @@ static size_t write_midx_oid_fanout(struct hashfile *f, list = next; } - return MIDX_CHUNK_FANOUT_SIZE; + return 0; } -static size_t write_midx_oid_lookup(struct hashfile *f, - void *data) +static int write_midx_oid_lookup(struct hashfile *f, + void *data) { struct write_midx_context *ctx = data; unsigned char hash_len = the_hash_algo->rawsz; struct pack_midx_entry *list = ctx->entries; uint32_t i; - size_t written = 0; for (i = 0; i < ctx->entries_nr; i++) { struct pack_midx_entry *obj = list++; @@ -734,19 +732,17 @@ static size_t write_midx_oid_lookup(struct hashfile *f, } hashwrite(f, obj->oid.hash, (int)hash_len); - written += hash_len; } - return written; + return 0; } -static size_t write_midx_object_offsets(struct hashfile *f, - void *data) +static int write_midx_object_offsets(struct hashfile *f, + void *data) { struct write_midx_context *ctx = data; struct pack_midx_entry *list = ctx->entries; uint32_t i, nr_large_offset = 0; - size_t written = 0; for (i = 0; i < ctx->entries_nr; i++) { struct pack_midx_entry *obj = list++; @@ -766,20 +762,17 @@ static size_t write_midx_object_offsets(struct hashfile *f, obj->offset); else hashwrite_be32(f, (uint32_t)obj->offset); - - written += MIDX_CHUNK_OFFSET_WIDTH; } - return written; + return 0; } -static size_t write_midx_large_offsets(struct hashfile *f, - void *data) +static int write_midx_large_offsets(struct hashfile *f, + void *data) { struct write_midx_context *ctx = data; struct pack_midx_entry *list = ctx->entries; struct pack_midx_entry *end = ctx->entries + ctx->entries_nr; - size_t written = 0; uint32_t nr_large_offset = ctx->num_large_offsets; while (nr_large_offset) { @@ -795,12 +788,12 @@ static size_t write_midx_large_offsets(struct hashfile *f, if (!(offset >> 31)) continue; - written += hashwrite_be64(f, offset); + hashwrite_be64(f, offset); nr_large_offset--; } - return written; + return 0; } static int write_midx_internal(const char *object_dir, struct multi_pack_index *m, @@ -812,7 +805,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * struct hashfile *f = NULL; struct lock_file lk; struct write_midx_context ctx = { 0 }; - uint64_t written = 0; + uint64_t header_size = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; struct progress *progress = NULL; @@ -940,10 +933,10 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * goto cleanup; } - written = 
write_midx_header(f, num_chunks, ctx.nr - dropped_packs); + header_size = write_midx_header(f, num_chunks, ctx.nr - dropped_packs); chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES; - chunk_offsets[cur_chunk] = written + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH; + chunk_offsets[cur_chunk] = header_size + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH; cur_chunk++; chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDFANOUT; @@ -981,39 +974,37 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * hashwrite_be32(f, chunk_ids[i]); hashwrite_be64(f, chunk_offsets[i]); - - written += MIDX_CHUNKLOOKUP_WIDTH; } if (flags & MIDX_PROGRESS) progress = start_delayed_progress(_("Writing chunks to multi-pack-index"), num_chunks); for (i = 0; i < num_chunks; i++) { - if (written != chunk_offsets[i]) + if (f->total + f->offset != chunk_offsets[i]) BUG("incorrect chunk offset (%"PRIu64" != %"PRIu64") for chunk id %"PRIx32, chunk_offsets[i], - written, + f->total + f->offset, chunk_ids[i]); switch (chunk_ids[i]) { case MIDX_CHUNKID_PACKNAMES: - written += write_midx_pack_names(f, &ctx); + write_midx_pack_names(f, &ctx); break; case MIDX_CHUNKID_OIDFANOUT: - written += write_midx_oid_fanout(f, &ctx); + write_midx_oid_fanout(f, &ctx); break; case MIDX_CHUNKID_OIDLOOKUP: - written += write_midx_oid_lookup(f, &ctx); + write_midx_oid_lookup(f, &ctx); break; case MIDX_CHUNKID_OBJECTOFFSETS: - written += write_midx_object_offsets(f, &ctx); + write_midx_object_offsets(f, &ctx); break; case MIDX_CHUNKID_LARGEOFFSETS: - written += write_midx_large_offsets(f, &ctx); + write_midx_large_offsets(f, &ctx); break; default: @@ -1025,9 +1016,9 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * } stop_progress(&progress); - if (written != chunk_offsets[num_chunks]) + if (hashfile_total(f) != chunk_offsets[num_chunks]) BUG("incorrect final offset %"PRIu64" != %"PRIu64, - written, + hashfile_total(f), chunk_offsets[num_chunks]); finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM); From c1442410d869cd5fb2c0dd79aa1a7c152b99b0f9 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:32 +0000 Subject: [PATCH 45/63] midx: drop chunk progress during write Most expensive operations in write_midx_internal() use the context struct's progress member, and these indicate the process of the expensive operations within the chunk writing methods. However, there is a competing progress struct that counts the progress over all chunks. This is not very helpful compared to the others, so drop it. This also reduces our barriers to combining the chunk writing code with chunk-format.c. 
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/midx.c b/midx.c index c92a6c47be..4f4aa351e6 100644 --- a/midx.c +++ b/midx.c @@ -808,7 +808,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * uint64_t header_size = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; - struct progress *progress = NULL; int pack_name_concat_len = 0; int dropped_packs = 0; int result = 0; @@ -976,9 +975,6 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * hashwrite_be64(f, chunk_offsets[i]); } - if (flags & MIDX_PROGRESS) - progress = start_delayed_progress(_("Writing chunks to multi-pack-index"), - num_chunks); for (i = 0; i < num_chunks; i++) { if (f->total + f->offset != chunk_offsets[i]) BUG("incorrect chunk offset (%"PRIu64" != %"PRIu64") for chunk id %"PRIx32, @@ -1011,10 +1007,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * BUG("trying to write unknown chunk id %"PRIx32, chunk_ids[i]); } - - display_progress(progress, i + 1); } - stop_progress(&progress); if (hashfile_total(f) != chunk_offsets[num_chunks]) BUG("incorrect final offset %"PRIu64" != %"PRIu64, From 63a8f0e9b9d9ce636738094391619c35856f7665 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:33 +0000 Subject: [PATCH 46/63] midx: use chunk-format API in write_midx_internal() The chunk-format API allows writing the table of contents and all chunks using the anonymous 'struct chunkfile' type. We only need to convert our local chunk logic to this API for the multi-pack-index writes to share that logic with the commit-graph file writes. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 106 +++++++++++---------------------------------------------- 1 file changed, 20 insertions(+), 86 deletions(-) diff --git a/midx.c b/midx.c index 4f4aa351e6..d2fd9c10fe 100644 --- a/midx.c +++ b/midx.c @@ -11,6 +11,7 @@ #include "trace2.h" #include "run-command.h" #include "repository.h" +#include "chunk-format.h" #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ #define MIDX_VERSION 1 @@ -21,7 +22,6 @@ #define MIDX_HEADER_SIZE 12 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + the_hash_algo->rawsz) -#define MIDX_MAX_CHUNKS 5 #define MIDX_CHUNK_ALIGNMENT 4 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */ #define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */ @@ -799,18 +799,15 @@ static int write_midx_large_offsets(struct hashfile *f, static int write_midx_internal(const char *object_dir, struct multi_pack_index *m, struct string_list *packs_to_drop, unsigned flags) { - unsigned char cur_chunk, num_chunks = 0; char *midx_name; uint32_t i; struct hashfile *f = NULL; struct lock_file lk; struct write_midx_context ctx = { 0 }; - uint64_t header_size = 0; - uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; - uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; int pack_name_concat_len = 0; int dropped_packs = 0; int result = 0; + struct chunkfile *cf; midx_name = get_midx_filename(object_dir); if (safe_create_leading_directories(midx_name)) @@ -923,98 +920,35 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * if (ctx.m) close_midx(ctx.m); - cur_chunk = 0; - num_chunks = ctx.large_offsets_needed ? 
5 : 4; - if (ctx.nr - dropped_packs == 0) { error(_("no pack files to index.")); result = 1; goto cleanup; } - header_size = write_midx_header(f, num_chunks, ctx.nr - dropped_packs); + cf = init_chunkfile(f); - chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES; - chunk_offsets[cur_chunk] = header_size + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH; + add_chunk(cf, MIDX_CHUNKID_PACKNAMES, pack_name_concat_len, + write_midx_pack_names); + add_chunk(cf, MIDX_CHUNKID_OIDFANOUT, MIDX_CHUNK_FANOUT_SIZE, + write_midx_oid_fanout); + add_chunk(cf, MIDX_CHUNKID_OIDLOOKUP, + ctx.entries_nr * the_hash_algo->rawsz, + write_midx_oid_lookup); + add_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS, + ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH, + write_midx_object_offsets); - cur_chunk++; - chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDFANOUT; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + pack_name_concat_len; + if (ctx.large_offsets_needed) + add_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS, + ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH, + write_midx_large_offsets); - cur_chunk++; - chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDLOOKUP; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + MIDX_CHUNK_FANOUT_SIZE; - - cur_chunk++; - chunk_ids[cur_chunk] = MIDX_CHUNKID_OBJECTOFFSETS; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + ctx.entries_nr * the_hash_algo->rawsz; - - cur_chunk++; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH; - if (ctx.large_offsets_needed) { - chunk_ids[cur_chunk] = MIDX_CHUNKID_LARGEOFFSETS; - - cur_chunk++; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + - ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH; - } - - chunk_ids[cur_chunk] = 0; - - for (i = 0; i <= num_chunks; i++) { - if (i && chunk_offsets[i] < chunk_offsets[i - 1]) - BUG("incorrect chunk offsets: %"PRIu64" before %"PRIu64, - chunk_offsets[i - 1], - chunk_offsets[i]); - - if (chunk_offsets[i] % MIDX_CHUNK_ALIGNMENT) - BUG("chunk offset %"PRIu64" is not properly aligned", - chunk_offsets[i]); - - hashwrite_be32(f, chunk_ids[i]); - hashwrite_be64(f, chunk_offsets[i]); - } - - for (i = 0; i < num_chunks; i++) { - if (f->total + f->offset != chunk_offsets[i]) - BUG("incorrect chunk offset (%"PRIu64" != %"PRIu64") for chunk id %"PRIx32, - chunk_offsets[i], - f->total + f->offset, - chunk_ids[i]); - - switch (chunk_ids[i]) { - case MIDX_CHUNKID_PACKNAMES: - write_midx_pack_names(f, &ctx); - break; - - case MIDX_CHUNKID_OIDFANOUT: - write_midx_oid_fanout(f, &ctx); - break; - - case MIDX_CHUNKID_OIDLOOKUP: - write_midx_oid_lookup(f, &ctx); - break; - - case MIDX_CHUNKID_OBJECTOFFSETS: - write_midx_object_offsets(f, &ctx); - break; - - case MIDX_CHUNKID_LARGEOFFSETS: - write_midx_large_offsets(f, &ctx); - break; - - default: - BUG("trying to write unknown chunk id %"PRIx32, - chunk_ids[i]); - } - } - - if (hashfile_total(f) != chunk_offsets[num_chunks]) - BUG("incorrect final offset %"PRIu64" != %"PRIu64, - hashfile_total(f), - chunk_offsets[num_chunks]); + write_midx_header(f, get_num_chunks(cf), ctx.nr - dropped_packs); + write_chunkfile(cf, &ctx); finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM); + free_chunkfile(cf); commit_lock_file(&lk); cleanup: From 5f0879f54b1f5cda348528a49af57a9c3fd620f9 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:34 +0000 Subject: [PATCH 47/63] chunk-format: create read chunk API Add the capability to read the table of contents, then pair the chunks with necessary logic using read_chunk_fn 
pointers. Callers will be added in future changes, but the typical outline will be: 1. initialize a 'struct chunkfile' with init_chunkfile(NULL). 2. call read_table_of_contents(). 3. for each chunk to parse, a. call pair_chunk() to assign a pointer with the chunk position, or b. call read_chunk() to run a callback on the chunk start and size. 4. call free_chunkfile() to clear the 'struct chunkfile' data. We are re-using the anonymous 'struct chunkfile' data, as it is internal to the chunk-format API. This gives it essentially two modes: write and read. If the same struct instance was used for both reads and writes, then there would be failures. Helped-by: Junio C Hamano Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- chunk-format.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ chunk-format.h | 47 +++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/chunk-format.c b/chunk-format.c index 6c9b52b70c..2c1fecf1c3 100644 --- a/chunk-format.c +++ b/chunk-format.c @@ -11,6 +11,8 @@ struct chunk_info { uint32_t id; uint64_t size; chunk_write_fn write_fn; + + const void *start; }; struct chunkfile { @@ -88,3 +90,81 @@ int write_chunkfile(struct chunkfile *cf, void *data) return 0; } + +int read_table_of_contents(struct chunkfile *cf, + const unsigned char *mfile, + size_t mfile_size, + uint64_t toc_offset, + int toc_length) +{ + uint32_t chunk_id; + const unsigned char *table_of_contents = mfile + toc_offset; + + ALLOC_GROW(cf->chunks, toc_length, cf->chunks_alloc); + + while (toc_length--) { + uint64_t chunk_offset, next_chunk_offset; + + chunk_id = get_be32(table_of_contents); + chunk_offset = get_be64(table_of_contents + 4); + + if (!chunk_id) { + error(_("terminating chunk id appears earlier than expected")); + return 1; + } + + table_of_contents += CHUNK_TOC_ENTRY_SIZE; + next_chunk_offset = get_be64(table_of_contents + 4); + + if (next_chunk_offset < chunk_offset || + next_chunk_offset > mfile_size - the_hash_algo->rawsz) { + error(_("improper chunk offset(s) %"PRIx64" and %"PRIx64""), + chunk_offset, next_chunk_offset); + return -1; + } + + cf->chunks[cf->chunks_nr].id = chunk_id; + cf->chunks[cf->chunks_nr].start = mfile + chunk_offset; + cf->chunks[cf->chunks_nr].size = next_chunk_offset - chunk_offset; + cf->chunks_nr++; + } + + chunk_id = get_be32(table_of_contents); + if (chunk_id) { + error(_("final chunk has non-zero id %"PRIx32""), chunk_id); + return -1; + } + + return 0; +} + +static int pair_chunk_fn(const unsigned char *chunk_start, + size_t chunk_size, + void *data) +{ + const unsigned char **p = data; + *p = chunk_start; + return 0; +} + +int pair_chunk(struct chunkfile *cf, + uint32_t chunk_id, + const unsigned char **p) +{ + return read_chunk(cf, chunk_id, pair_chunk_fn, p); +} + +int read_chunk(struct chunkfile *cf, + uint32_t chunk_id, + chunk_read_fn fn, + void *data) +{ + int i; + + for (i = 0; i < cf->chunks_nr; i++) { + if (cf->chunks[i].id == chunk_id) + return fn(cf->chunks[i].start, cf->chunks[i].size, data); + } + + return CHUNK_NOT_FOUND; +} diff --git a/chunk-format.h b/chunk-format.h index ce598b66d9..9ccbe00377 100644 --- a/chunk-format.h +++ b/chunk-format.h @@ -8,6 +8,20 @@ struct chunkfile; #define CHUNK_TOC_ENTRY_SIZE (sizeof(uint32_t) + sizeof(uint64_t)) +/* + * Initialize a 'struct chunkfile' for writing _or_ reading a file + * with the chunk format. + * + * If writing a file, supply a non-NULL 'struct hashfile *' that will + * be used to write. 
+ * + * If reading a file, use a NULL 'struct hashfile *' and then call + * read_table_of_contents(). Supply the memory-mapped data to the + * pair_chunk() or read_chunk() methods, as appropriate. + * + * DO NOT MIX THESE MODES. Use different 'struct chunkfile' instances + * for reading and writing. + */ struct chunkfile *init_chunkfile(struct hashfile *f); void free_chunkfile(struct chunkfile *cf); int get_num_chunks(struct chunkfile *cf); @@ -18,4 +32,37 @@ void add_chunk(struct chunkfile *cf, chunk_write_fn fn); int write_chunkfile(struct chunkfile *cf, void *data); +int read_table_of_contents(struct chunkfile *cf, + const unsigned char *mfile, + size_t mfile_size, + uint64_t toc_offset, + int toc_length); + +#define CHUNK_NOT_FOUND (-2) + +/* + * Find 'chunk_id' in the given chunkfile and assign the + * given pointer to the position in the mmap'd file where + * that chunk begins. + * + * Returns CHUNK_NOT_FOUND if the chunk does not exist. + */ +int pair_chunk(struct chunkfile *cf, + uint32_t chunk_id, + const unsigned char **p); + +typedef int (*chunk_read_fn)(const unsigned char *chunk_start, + size_t chunk_size, void *data); +/* + * Find 'chunk_id' in the given chunkfile and call the + * given chunk_read_fn method with the information for + * that chunk. + * + * Returns CHUNK_NOT_FOUND if the chunk does not exist. + */ +int read_chunk(struct chunkfile *cf, + uint32_t chunk_id, + chunk_read_fn fn, + void *data); + #endif From 2692c2f6fd1cc9f74e433cb05d5756575682b9ab Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:35 +0000 Subject: [PATCH 48/63] commit-graph: use chunk-format read API Instead of parsing the table of contents directly, use the chunk-format API methods read_table_of_contents() and pair_chunk(). While the current implementation loses the duplicate-chunk detection, that will be added in a future change. 
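For reference (an editorial sketch, not part of this change), the read-side outline from the previous patch reduces to the caller below. The file layout, chunk id and function name are hypothetical; init_chunkfile(NULL), read_table_of_contents(), pair_chunk(), CHUNK_NOT_FOUND and free_chunkfile() are the real API elements used by this patch and, in the next one, by midx.c.

/*
 * Hypothetical sketch of the read path. 'mfile' and 'mfile_size'
 * describe a memory-mapped file whose table of contents starts at
 * 'toc_offset' and has 'toc_length' entries.
 */
static int parse_my_file(const unsigned char *mfile, size_t mfile_size,
			 uint64_t toc_offset, int toc_length)
{
	const unsigned char *my_chunk = NULL;
	struct chunkfile *cf = init_chunkfile(NULL);	/* read mode */
	int ret = -1;

	if (read_table_of_contents(cf, mfile, mfile_size,
				   toc_offset, toc_length))
		goto done;

	if (pair_chunk(cf, 0x4d594944 /* "MYID" */, &my_chunk) ==
	    CHUNK_NOT_FOUND) {
		error(_("file is missing the required MYID chunk"));
		goto done;
	}

	/* ... parse the data at 'my_chunk' ... */
	ret = 0;
done:
	free_chunkfile(cf);
	return ret;
}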
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 159 ++++++++++++++-------------------------- t/t5318-commit-graph.sh | 2 +- 2 files changed, 55 insertions(+), 106 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index a889130cc8..76514a879e 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -59,8 +59,7 @@ void git_test_write_commit_graph_or_die(void) #define GRAPH_HEADER_SIZE 8 #define GRAPH_FANOUT_SIZE (4 * 256) -#define GRAPH_CHUNKLOOKUP_WIDTH 12 -#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * GRAPH_CHUNKLOOKUP_WIDTH \ +#define GRAPH_MIN_SIZE (GRAPH_HEADER_SIZE + 4 * CHUNK_TOC_ENTRY_SIZE \ + GRAPH_FANOUT_SIZE + the_hash_algo->rawsz) #define CORRECTED_COMMIT_DATE_OFFSET_OVERFLOW (1ULL << 31) @@ -298,15 +297,43 @@ static int verify_commit_graph_lite(struct commit_graph *g) return 0; } +static int graph_read_oid_lookup(const unsigned char *chunk_start, + size_t chunk_size, void *data) +{ + struct commit_graph *g = data; + g->chunk_oid_lookup = chunk_start; + g->num_commits = chunk_size / g->hash_len; + return 0; +} + +static int graph_read_bloom_data(const unsigned char *chunk_start, + size_t chunk_size, void *data) +{ + struct commit_graph *g = data; + uint32_t hash_version; + g->chunk_bloom_data = chunk_start; + hash_version = get_be32(chunk_start); + + if (hash_version != 1) + return 0; + + g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings)); + g->bloom_filter_settings->hash_version = hash_version; + g->bloom_filter_settings->num_hashes = get_be32(chunk_start + 4); + g->bloom_filter_settings->bits_per_entry = get_be32(chunk_start + 8); + g->bloom_filter_settings->max_changed_paths = DEFAULT_BLOOM_MAX_CHANGES; + + return 0; +} + struct commit_graph *parse_commit_graph(struct repository *r, void *graph_map, size_t graph_size) { - const unsigned char *data, *chunk_lookup; - uint32_t i; + const unsigned char *data; struct commit_graph *graph; - uint64_t next_chunk_offset; uint32_t graph_signature; unsigned char graph_version, hash_version; + struct chunkfile *cf = NULL; if (!graph_map) return NULL; @@ -347,7 +374,7 @@ struct commit_graph *parse_commit_graph(struct repository *r, graph->data_len = graph_size; if (graph_size < GRAPH_HEADER_SIZE + - (graph->num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH + + (graph->num_chunks + 1) * CHUNK_TOC_ENTRY_SIZE + GRAPH_FANOUT_SIZE + the_hash_algo->rawsz) { error(_("commit-graph file is too small to hold %u chunks"), graph->num_chunks); @@ -355,108 +382,28 @@ struct commit_graph *parse_commit_graph(struct repository *r, return NULL; } - chunk_lookup = data + 8; - next_chunk_offset = get_be64(chunk_lookup + 4); - for (i = 0; i < graph->num_chunks; i++) { - uint32_t chunk_id; - uint64_t chunk_offset = next_chunk_offset; - int chunk_repeated = 0; + cf = init_chunkfile(NULL); - chunk_id = get_be32(chunk_lookup + 0); + if (read_table_of_contents(cf, graph->data, graph_size, + GRAPH_HEADER_SIZE, graph->num_chunks)) + goto free_and_return; - chunk_lookup += GRAPH_CHUNKLOOKUP_WIDTH; - next_chunk_offset = get_be64(chunk_lookup + 4); + pair_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, + (const unsigned char **)&graph->chunk_oid_fanout); + read_chunk(cf, GRAPH_CHUNKID_OIDLOOKUP, graph_read_oid_lookup, graph); + pair_chunk(cf, GRAPH_CHUNKID_DATA, &graph->chunk_commit_data); + pair_chunk(cf, GRAPH_CHUNKID_EXTRAEDGES, &graph->chunk_extra_edges); + pair_chunk(cf, GRAPH_CHUNKID_BASE, &graph->chunk_base_graphs); + pair_chunk(cf, GRAPH_CHUNKID_GENERATION_DATA, + &graph->chunk_generation_data); + pair_chunk(cf, 
GRAPH_CHUNKID_GENERATION_DATA_OVERFLOW, + &graph->chunk_generation_data_overflow); - if (chunk_offset > graph_size - the_hash_algo->rawsz) { - error(_("commit-graph improper chunk offset %08x%08x"), (uint32_t)(chunk_offset >> 32), - (uint32_t)chunk_offset); - goto free_and_return; - } - - switch (chunk_id) { - case GRAPH_CHUNKID_OIDFANOUT: - if (graph->chunk_oid_fanout) - chunk_repeated = 1; - else - graph->chunk_oid_fanout = (uint32_t*)(data + chunk_offset); - break; - - case GRAPH_CHUNKID_OIDLOOKUP: - if (graph->chunk_oid_lookup) - chunk_repeated = 1; - else { - graph->chunk_oid_lookup = data + chunk_offset; - graph->num_commits = (next_chunk_offset - chunk_offset) - / graph->hash_len; - } - break; - - case GRAPH_CHUNKID_DATA: - if (graph->chunk_commit_data) - chunk_repeated = 1; - else - graph->chunk_commit_data = data + chunk_offset; - break; - - case GRAPH_CHUNKID_GENERATION_DATA: - if (graph->chunk_generation_data) - chunk_repeated = 1; - else - graph->chunk_generation_data = data + chunk_offset; - break; - - case GRAPH_CHUNKID_GENERATION_DATA_OVERFLOW: - if (graph->chunk_generation_data_overflow) - chunk_repeated = 1; - else - graph->chunk_generation_data_overflow = data + chunk_offset; - break; - - case GRAPH_CHUNKID_EXTRAEDGES: - if (graph->chunk_extra_edges) - chunk_repeated = 1; - else - graph->chunk_extra_edges = data + chunk_offset; - break; - - case GRAPH_CHUNKID_BASE: - if (graph->chunk_base_graphs) - chunk_repeated = 1; - else - graph->chunk_base_graphs = data + chunk_offset; - break; - - case GRAPH_CHUNKID_BLOOMINDEXES: - if (graph->chunk_bloom_indexes) - chunk_repeated = 1; - else if (r->settings.commit_graph_read_changed_paths) - graph->chunk_bloom_indexes = data + chunk_offset; - break; - - case GRAPH_CHUNKID_BLOOMDATA: - if (graph->chunk_bloom_data) - chunk_repeated = 1; - else if (r->settings.commit_graph_read_changed_paths) { - uint32_t hash_version; - graph->chunk_bloom_data = data + chunk_offset; - hash_version = get_be32(data + chunk_offset); - - if (hash_version != 1) - break; - - graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings)); - graph->bloom_filter_settings->hash_version = hash_version; - graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4); - graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8); - graph->bloom_filter_settings->max_changed_paths = DEFAULT_BLOOM_MAX_CHANGES; - } - break; - } - - if (chunk_repeated) { - error(_("commit-graph chunk id %08x appears multiple times"), chunk_id); - goto free_and_return; - } + if (r->settings.commit_graph_read_changed_paths) { + pair_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES, + &graph->chunk_bloom_indexes); + read_chunk(cf, GRAPH_CHUNKID_BLOOMDATA, + graph_read_bloom_data, graph); } if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) { @@ -473,9 +420,11 @@ struct commit_graph *parse_commit_graph(struct repository *r, if (verify_commit_graph_lite(graph)) goto free_and_return; + free_chunkfile(cf); return graph; free_and_return: + free_chunkfile(cf); free(graph->bloom_filter_settings); free(graph); return NULL; diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh index fa27df579a..c7da741284 100755 --- a/t/t5318-commit-graph.sh +++ b/t/t5318-commit-graph.sh @@ -564,7 +564,7 @@ test_expect_success 'detect bad hash version' ' test_expect_success 'detect low chunk count' ' corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\01" \ - "missing the .* chunk" + "final chunk has non-zero id" ' test_expect_success 'detect missing OID fanout 
chunk' ' From 6ab3b8b8b8dc94c8e4caefb1d368cc704e70d38b Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:36 +0000 Subject: [PATCH 49/63] midx: use chunk-format read API Instead of parsing the table of contents directly, use the chunk-format API methods read_table_of_contents() and pair_chunk(). In particular, we can use the return value of pair_chunk() to generate an error when a required chunk is missing. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 71 +++++++++++++------------------------ t/t5319-multi-pack-index.sh | 6 ++-- 2 files changed, 28 insertions(+), 49 deletions(-) diff --git a/midx.c b/midx.c index d2fd9c10fe..d7ea0d1375 100644 --- a/midx.c +++ b/midx.c @@ -28,7 +28,6 @@ #define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */ #define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */ #define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */ -#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t)) #define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256) #define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t)) #define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t)) @@ -53,6 +52,19 @@ static char *get_midx_filename(const char *object_dir) return xstrfmt("%s/pack/multi-pack-index", object_dir); } +static int midx_read_oid_fanout(const unsigned char *chunk_start, + size_t chunk_size, void *data) +{ + struct multi_pack_index *m = data; + m->chunk_oid_fanout = (uint32_t *)chunk_start; + + if (chunk_size != 4 * 256) { + error(_("multi-pack-index OID fanout is of the wrong size")); + return 1; + } + return 0; +} + struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local) { struct multi_pack_index *m = NULL; @@ -64,6 +76,7 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local char *midx_name = get_midx_filename(object_dir); uint32_t i; const char *cur_pack_name; + struct chunkfile *cf = NULL; fd = git_open(midx_name); @@ -113,58 +126,23 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local m->num_packs = get_be32(m->data + MIDX_BYTE_NUM_PACKS); - for (i = 0; i < m->num_chunks; i++) { - uint32_t chunk_id = get_be32(m->data + MIDX_HEADER_SIZE + - MIDX_CHUNKLOOKUP_WIDTH * i); - uint64_t chunk_offset = get_be64(m->data + MIDX_HEADER_SIZE + 4 + - MIDX_CHUNKLOOKUP_WIDTH * i); + cf = init_chunkfile(NULL); - if (chunk_offset >= m->data_len) - die(_("invalid chunk offset (too large)")); + if (read_table_of_contents(cf, m->data, midx_size, + MIDX_HEADER_SIZE, m->num_chunks)) + goto cleanup_fail; - switch (chunk_id) { - case MIDX_CHUNKID_PACKNAMES: - m->chunk_pack_names = m->data + chunk_offset; - break; - - case MIDX_CHUNKID_OIDFANOUT: - m->chunk_oid_fanout = (uint32_t *)(m->data + chunk_offset); - break; - - case MIDX_CHUNKID_OIDLOOKUP: - m->chunk_oid_lookup = m->data + chunk_offset; - break; - - case MIDX_CHUNKID_OBJECTOFFSETS: - m->chunk_object_offsets = m->data + chunk_offset; - break; - - case MIDX_CHUNKID_LARGEOFFSETS: - m->chunk_large_offsets = m->data + chunk_offset; - break; - - case 0: - die(_("terminating multi-pack-index chunk id appears earlier than expected")); - break; - - default: - /* - * Do nothing on unrecognized chunks, allowing future - * extensions to add optional chunks. 
- */ - break; - } - } - - if (!m->chunk_pack_names) + if (pair_chunk(cf, MIDX_CHUNKID_PACKNAMES, &m->chunk_pack_names) == CHUNK_NOT_FOUND) die(_("multi-pack-index missing required pack-name chunk")); - if (!m->chunk_oid_fanout) + if (read_chunk(cf, MIDX_CHUNKID_OIDFANOUT, midx_read_oid_fanout, m) == CHUNK_NOT_FOUND) die(_("multi-pack-index missing required OID fanout chunk")); - if (!m->chunk_oid_lookup) + if (pair_chunk(cf, MIDX_CHUNKID_OIDLOOKUP, &m->chunk_oid_lookup) == CHUNK_NOT_FOUND) die(_("multi-pack-index missing required OID lookup chunk")); - if (!m->chunk_object_offsets) + if (pair_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS, &m->chunk_object_offsets) == CHUNK_NOT_FOUND) die(_("multi-pack-index missing required object offsets chunk")); + pair_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS, &m->chunk_large_offsets); + m->num_objects = ntohl(m->chunk_oid_fanout[255]); m->pack_names = xcalloc(m->num_packs, sizeof(*m->pack_names)); @@ -190,6 +168,7 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local cleanup_fail: free(m); free(midx_name); + free(cf); if (midx_map) munmap(midx_map, midx_size); if (0 <= fd) diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index 297de502a9..ad4e878b65 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -314,12 +314,12 @@ test_expect_success 'verify bad OID version' ' test_expect_success 'verify truncated chunk count' ' corrupt_midx_and_verify $MIDX_BYTE_CHUNK_COUNT "\01" $objdir \ - "missing required" + "final chunk has non-zero id" ' test_expect_success 'verify extended chunk count' ' corrupt_midx_and_verify $MIDX_BYTE_CHUNK_COUNT "\07" $objdir \ - "terminating multi-pack-index chunk id appears earlier than expected" + "terminating chunk id appears earlier than expected" ' test_expect_success 'verify missing required chunk' ' @@ -329,7 +329,7 @@ test_expect_success 'verify missing required chunk' ' test_expect_success 'verify invalid chunk offset' ' corrupt_midx_and_verify $MIDX_BYTE_CHUNK_OFFSET "\01" $objdir \ - "invalid chunk offset (too large)" + "improper chunk offset(s)" ' test_expect_success 'verify packnames out of order' ' From 329fac3a366244ceac599ab63cd338bcc6a1dcb4 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:37 +0000 Subject: [PATCH 50/63] midx: use 64-bit multiplication for chunk sizes When calculating the sizes of certain chunks, we should use 64-bit multiplication always. This allows us to properly predict the chunk sizes without risk of overflow. Other possible overflows were discovered by evaluating each multiplication in midx.c and ensuring that at least one side of the operator was of type size_t or off_t. 
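To see why the cast matters, here is a small standalone C sketch (not from the patch; the entry count and record width are made-up values chosen only so that the product exceeds 32 bits, and it assumes a platform where `size_t` is 64-bit):

    #include <inttypes.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t entries_nr = 300000000;  /* hypothetical number of entries */
        uint32_t record_width = 20;       /* hypothetical bytes per entry */

        /* Both operands are 32-bit, so the product wraps before it is widened. */
        uint64_t wrong = entries_nr * record_width;

        /* Casting one operand first forces the multiplication into 64 bits. */
        uint64_t right = (size_t)entries_nr * record_width;

        printf("wrong: %" PRIu64 "\nright: %" PRIu64 "\n", wrong, right);
        return 0;
    }

The hunks below apply the same idea by keeping a `size_t` or `off_t` operand on one side of every chunk-size multiplication.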
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- midx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/midx.c b/midx.c index d7ea0d1375..5c7f2ed233 100644 --- a/midx.c +++ b/midx.c @@ -244,7 +244,7 @@ static off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos) const unsigned char *offset_data; uint32_t offset32; - offset_data = m->chunk_object_offsets + pos * MIDX_CHUNK_OFFSET_WIDTH; + offset_data = m->chunk_object_offsets + (off_t)pos * MIDX_CHUNK_OFFSET_WIDTH; offset32 = get_be32(offset_data + sizeof(uint32_t)); if (m->chunk_large_offsets && offset32 & MIDX_LARGE_OFFSET_NEEDED) { @@ -260,7 +260,8 @@ static off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos) static uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos) { - return get_be32(m->chunk_object_offsets + pos * MIDX_CHUNK_OFFSET_WIDTH); + return get_be32(m->chunk_object_offsets + + (off_t)pos * MIDX_CHUNK_OFFSET_WIDTH); } static int nth_midxed_pack_entry(struct repository *r, @@ -912,15 +913,15 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * add_chunk(cf, MIDX_CHUNKID_OIDFANOUT, MIDX_CHUNK_FANOUT_SIZE, write_midx_oid_fanout); add_chunk(cf, MIDX_CHUNKID_OIDLOOKUP, - ctx.entries_nr * the_hash_algo->rawsz, + (size_t)ctx.entries_nr * the_hash_algo->rawsz, write_midx_oid_lookup); add_chunk(cf, MIDX_CHUNKID_OBJECTOFFSETS, - ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH, + (size_t)ctx.entries_nr * MIDX_CHUNK_OFFSET_WIDTH, write_midx_object_offsets); if (ctx.large_offsets_needed) add_chunk(cf, MIDX_CHUNKID_LARGEOFFSETS, - ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH, + (size_t)ctx.num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH, write_midx_large_offsets); write_midx_header(f, get_num_chunks(cf), ctx.nr - dropped_packs); From 5387fefadc121cb142e513557997415f2e75188a Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:38 +0000 Subject: [PATCH 51/63] chunk-format: restore duplicate chunk checks Before refactoring into the chunk-format API, the commit-graph parsing logic included checks for duplicate chunks. It is unlikely that we would desire a chunk-based file format that allows duplicate chunk IDs in the table of contents, so add duplicate checks into read_table_of_contents(). Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- chunk-format.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/chunk-format.c b/chunk-format.c index 2c1fecf1c3..da191e59a2 100644 --- a/chunk-format.c +++ b/chunk-format.c @@ -97,6 +97,7 @@ int read_table_of_contents(struct chunkfile *cf, uint64_t toc_offset, int toc_length) { + int i; uint32_t chunk_id; const unsigned char *table_of_contents = mfile + toc_offset; @@ -123,6 +124,14 @@ int read_table_of_contents(struct chunkfile *cf, return -1; } + for (i = 0; i < cf->chunks_nr; i++) { + if (cf->chunks[i].id == chunk_id) { + error(_("duplicate chunk ID %"PRIx32" found"), + chunk_id); + return -1; + } + } + cf->chunks[cf->chunks_nr].id = chunk_id; cf->chunks[cf->chunks_nr].start = mfile + chunk_offset; cf->chunks[cf->chunks_nr].size = next_chunk_offset - chunk_offset; From a43a2e6c2af63e5b93444c8ed8ac252927c2f0f3 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 18 Feb 2021 14:07:39 +0000 Subject: [PATCH 52/63] chunk-format: add technical docs The chunk-based file format is now an API in the code, but we should also take time to document it as a file format. 
Specifically, it matches the CHUNK LOOKUP sections of the commit-graph and multi-pack-index files, but there are some commonalities that should be grouped in this document. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/technical/chunk-format.txt | 116 ++++++++++++++++++ .../technical/commit-graph-format.txt | 3 + Documentation/technical/pack-format.txt | 3 + 3 files changed, 122 insertions(+) create mode 100644 Documentation/technical/chunk-format.txt diff --git a/Documentation/technical/chunk-format.txt b/Documentation/technical/chunk-format.txt new file mode 100644 index 0000000000..593614fced --- /dev/null +++ b/Documentation/technical/chunk-format.txt @@ -0,0 +1,116 @@ +Chunk-based file formats +======================== + +Some file formats in Git use a common concept of "chunks" to describe +sections of the file. This allows structured access to a large file by +scanning a small "table of contents" for the remaining data. This common +format is used by the `commit-graph` and `multi-pack-index` files. See +link:technical/pack-format.html[the `multi-pack-index` format] and +link:technical/commit-graph-format.html[the `commit-graph` format] for +how they use the chunks to describe structured data. + +A chunk-based file format begins with some header information custom to +that format. That header should include enough information to identify +the file type, format version, and number of chunks in the file. From this +information, that file can determine the start of the chunk-based region. + +The chunk-based region starts with a table of contents describing where +each chunk starts and ends. This consists of (C+1) rows of 12 bytes each, +where C is the number of chunks. Consider the following table: + + | Chunk ID (4 bytes) | Chunk Offset (8 bytes) | + |--------------------|------------------------| + | ID[0] | OFFSET[0] | + | ... | ... | + | ID[C] | OFFSET[C] | + | 0x0000 | OFFSET[C+1] | + +Each row consists of a 4-byte chunk identifier (ID) and an 8-byte offset. +Each integer is stored in network-byte order. + +The chunk identifier `ID[i]` is a label for the data stored within this +file from `OFFSET[i]` (inclusive) to `OFFSET[i+1]` (exclusive). Thus, the +size of the `i`th chunk is equal to the difference between `OFFSET[i+1]` +and `OFFSET[i]`. This requires that the chunk data appears contiguously +in the same order as the table of contents. + +The final entry in the table of contents must be four zero bytes. This +confirms that the table of contents is ending and provides the offset for +the end of the chunk-based data. + +Note: The chunk-based format expects that the file contains _at least_ a +trailing hash after `OFFSET[C+1]`. + +Functions for working with chunk-based file formats are declared in +`chunk-format.h`. Using these methods provides extra checks that assist +developers when creating new file formats. + +Writing chunk-based file formats +-------------------------------- + +To write a chunk-based file format, create a `struct chunkfile` by +calling `init_chunkfile()` and pass a `struct hashfile` pointer. The +caller is responsible for opening the `hashfile` and writing header +information so the file format is identifiable before the chunk-based +format begins. + +Then, call `add_chunk()` for each chunk that is intended to be written. This +populates the `chunkfile` with information about the order and size of +each chunk to write. Provide a `chunk_write_fn` function pointer to +perform the write of the chunk data upon request.
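As a rough illustration of the flow described so far (plus the `write_chunkfile()` and `free_chunkfile()` calls covered next), here is a sketch of a writer for a format with a single chunk. It is not code from this patch: the chunk id `0x44415441` ("DATA"), the `demo_*` names and the payload are invented, error handling is omitted, and it assumes the chunk-format API exactly as documented here plus `hashwrite()` from `csum-file.h`:

    #include "git-compat-util.h"
    #include "csum-file.h"
    #include "chunk-format.h"

    struct demo_ctx {
        const unsigned char *payload;
        size_t payload_len;
    };

    /* chunk_write_fn: write one chunk's body to the hashfile, return 0 on success. */
    static int write_demo_data(struct hashfile *f, void *data)
    {
        struct demo_ctx *ctx = data;

        hashwrite(f, ctx->payload, ctx->payload_len);
        return 0;
    }

    /* 'f' already has the format's own header written by the caller. */
    static int write_demo_chunks(struct hashfile *f, struct demo_ctx *ctx)
    {
        struct chunkfile *cf = init_chunkfile(f);
        int ret;

        add_chunk(cf, 0x44415441 /* "DATA" */, ctx->payload_len, write_demo_data);
        ret = write_chunkfile(cf, ctx);
        free_chunkfile(cf);
        return ret;
    }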
+ +Call `write_chunkfile()` to write the table of contents to the `hashfile` +followed by each of the chunks. This will verify that each chunk wrote +the expected amount of data so the table of contents is correct. + +Finally, call `free_chunkfile()` to clear the `struct chunkfile` data. The +caller is responsible for finalizing the `hashfile` by writing the trailing +hash and closing the file. + +Reading chunk-based file formats +-------------------------------- + +To read a chunk-based file format, the file must be opened as a +memory-mapped region. The chunk-format API expects that the entire file +is mapped as a contiguous memory region. + +Initialize a `struct chunkfile` pointer with `init_chunkfile(NULL)`. + +After reading the header information from the beginning of the file, +including the chunk count, call `read_table_of_contents()` to populate +the `struct chunkfile` with the list of chunks, their offsets, and their +sizes. + +Extract the data information for each chunk using `pair_chunk()` or +`read_chunk()`: + +* `pair_chunk()` assigns a given pointer with the location inside the + memory-mapped file corresponding to that chunk's offset. If the chunk + does not exist, then the pointer is not modified. + +* `read_chunk()` takes a `chunk_read_fn` function pointer and calls it + with the appropriate initial pointer and size information. The function + is not called if the chunk does not exist. Use this method to read chunks + if you need to perform immediate parsing or if you need to execute logic + based on the size of the chunk. + +After calling these methods, call `free_chunkfile()` to clear the +`struct chunkfile` data. This will not close the memory-mapped region. +Callers are expected to own that data for the timeframe the pointers into +the region are needed. + +Examples +-------- + +These file formats use the chunk-format API, and can be used as examples +for future formats: + +* *commit-graph:* see `write_commit_graph_file()` and `parse_commit_graph()` + in `commit-graph.c` for how the chunk-format API is used to write and + parse the commit-graph file format documented in + link:technical/commit-graph-format.html[the commit-graph file format]. + +* *multi-pack-index:* see `write_midx_internal()` and `load_multi_pack_index()` + in `midx.c` for how the chunk-format API is used to write and + parse the multi-pack-index file format documented in + link:technical/pack-format.html[the multi-pack-index file format]. diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt index b6658eff18..87971c27dd 100644 --- a/Documentation/technical/commit-graph-format.txt +++ b/Documentation/technical/commit-graph-format.txt @@ -61,6 +61,9 @@ CHUNK LOOKUP: the length using the next chunk position if necessary.) Each chunk ID appears at most once. + The CHUNK LOOKUP matches the table of contents from + link:technical/chunk-format.html[the chunk-based file format]. + The remaining data in the body is described one chunk at a time, and these chunks may be given in any order. Chunks are required unless otherwise specified. diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt index f96b2e605f..2fb1e60d29 100644 --- a/Documentation/technical/pack-format.txt +++ b/Documentation/technical/pack-format.txt @@ -301,6 +301,9 @@ CHUNK LOOKUP: (Chunks are provided in file-order, so you can infer the length using the next chunk position if necessary.) 
+ The CHUNK LOOKUP matches the table of contents from + link:technical/chunk-format.html[the chunk-based file format]. + The remaining data in the body is described one chunk at a time, and these chunks may be given in any order. Chunks are required unless otherwise specified. From 726b25a91ba0e8f26f83c8d39ad16351b7bdb510 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 22 Feb 2021 11:20:06 -0800 Subject: [PATCH 53/63] http: allow custom index-pack args Currently, when fetching, packfiles referenced by URIs are run through index-pack without any arguments other than --stdin and --keep, no matter what arguments are used for the packfile that is inline in the fetch response. As a preparation for ensuring that all packs (whether inline or not) use the same index-pack arguments, teach the http subsystem to allow custom index-pack arguments. http-fetch has been updated to use the new API. For now, it passes --keep alone instead of --keep with a process ID, but this is only temporary because http-fetch itself will be taught to accept index-pack parameters (instead of using a hardcoded constant) in a subsequent commit. Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- http-fetch.c | 6 +++++- http.c | 15 ++++++++------- http.h | 10 +++++----- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/http-fetch.c b/http-fetch.c index c4ccc5fea9..2d1d9d054f 100644 --- a/http-fetch.c +++ b/http-fetch.c @@ -43,6 +43,9 @@ static int fetch_using_walker(const char *raw_url, int get_verbosely, return rc; } +static const char *index_pack_args[] = + {"index-pack", "--stdin", "--keep", NULL}; + static void fetch_single_packfile(struct object_id *packfile_hash, const char *url) { struct http_pack_request *preq; @@ -55,7 +58,8 @@ static void fetch_single_packfile(struct object_id *packfile_hash, if (preq == NULL) die("couldn't create http pack request"); preq->slot->results = &results; - preq->generate_keep = 1; + preq->index_pack_args = index_pack_args; + preq->preserve_index_pack_stdout = 1; if (start_active_slot(preq->slot)) { run_active_slot(preq->slot); diff --git a/http.c b/http.c index 8b23a546af..f8ea28bb2e 100644 --- a/http.c +++ b/http.c @@ -2259,6 +2259,9 @@ void release_http_pack_request(struct http_pack_request *preq) free(preq); } +static const char *default_index_pack_args[] = + {"index-pack", "--stdin", NULL}; + int finish_http_pack_request(struct http_pack_request *preq) { struct child_process ip = CHILD_PROCESS_INIT; @@ -2270,17 +2273,15 @@ int finish_http_pack_request(struct http_pack_request *preq) tmpfile_fd = xopen(preq->tmpfile.buf, O_RDONLY); - strvec_push(&ip.args, "index-pack"); - strvec_push(&ip.args, "--stdin"); ip.git_cmd = 1; ip.in = tmpfile_fd; - if (preq->generate_keep) { - strvec_pushf(&ip.args, "--keep=git %"PRIuMAX, - (uintmax_t)getpid()); + ip.argv = preq->index_pack_args ? preq->index_pack_args + : default_index_pack_args; + + if (preq->preserve_index_pack_stdout) ip.out = 0; - } else { + else ip.no_stdout = 1; - } if (run_command(&ip)) { ret = -1; diff --git a/http.h b/http.h index 5de792ef3f..bf3d1270ad 100644 --- a/http.h +++ b/http.h @@ -218,12 +218,12 @@ struct http_pack_request { char *url; /* - * If this is true, finish_http_pack_request() will pass "--keep" to - * index-pack, resulting in the creation of a keep file, and will not - * suppress its stdout (that is, the "keep\t\n" line will be - * printed to stdout). + * index-pack command to run. Must be terminated by NULL. + * + * If NULL, defaults to {"index-pack", "--stdin", NULL}. 
*/ - unsigned generate_keep : 1; + const char **index_pack_args; + unsigned preserve_index_pack_stdout : 1; FILE *packfile; struct strbuf tmpfile; From 27e35ba6c6b532ad6fc88b554d437b6892b4e915 Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 22 Feb 2021 11:20:07 -0800 Subject: [PATCH 54/63] http-fetch: allow custom index-pack args This is the next step in teaching fetch-pack to pass its index-pack arguments when processing packfiles referenced by URIs. The "--keep" in fetch-pack.c will be replaced with a full message in a subsequent commit. Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- Documentation/git-http-fetch.txt | 10 ++++++++-- fetch-pack.c | 3 +++ http-fetch.c | 20 +++++++++++++++----- t/t5550-http-fetch-dumb.sh | 5 ++++- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Documentation/git-http-fetch.txt b/Documentation/git-http-fetch.txt index 4deb4893f5..9fa17b60e4 100644 --- a/Documentation/git-http-fetch.txt +++ b/Documentation/git-http-fetch.txt @@ -41,11 +41,17 @@ commit-id:: ['\t'] --packfile=:: - Instead of a commit id on the command line (which is not expected in + For internal use only. Instead of a commit id on the command + line (which is not expected in this case), 'git http-fetch' fetches the packfile directly at the given URL and uses index-pack to generate corresponding .idx and .keep files. The hash is used to determine the name of the temporary file and is - arbitrary. The output of index-pack is printed to stdout. + arbitrary. The output of index-pack is printed to stdout. Requires + --index-pack-args. + +--index-pack-args=:: + For internal use only. The command to run on the contents of the + downloaded pack. Arguments are URL-encoded separated by spaces. --recover:: Verify that everything reachable from target is fetched. 
Used after diff --git a/fetch-pack.c b/fetch-pack.c index 876f90c759..aeac010b0b 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -1645,6 +1645,9 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, strvec_pushf(&cmd.args, "--packfile=%.*s", (int) the_hash_algo->hexsz, packfile_uris.items[i].string); + strvec_push(&cmd.args, "--index-pack-arg=index-pack"); + strvec_push(&cmd.args, "--index-pack-arg=--stdin"); + strvec_push(&cmd.args, "--index-pack-arg=--keep"); strvec_push(&cmd.args, uri); cmd.git_cmd = 1; cmd.no_stdin = 1; diff --git a/http-fetch.c b/http-fetch.c index 2d1d9d054f..fa642462a9 100644 --- a/http-fetch.c +++ b/http-fetch.c @@ -3,6 +3,7 @@ #include "exec-cmd.h" #include "http.h" #include "walker.h" +#include "strvec.h" static const char http_fetch_usage[] = "git http-fetch " "[-c] [-t] [-a] [-v] [--recover] [-w ref] [--stdin | --packfile=hash | commit-id] url"; @@ -43,11 +44,9 @@ static int fetch_using_walker(const char *raw_url, int get_verbosely, return rc; } -static const char *index_pack_args[] = - {"index-pack", "--stdin", "--keep", NULL}; - static void fetch_single_packfile(struct object_id *packfile_hash, - const char *url) { + const char *url, + const char **index_pack_args) { struct http_pack_request *preq; struct slot_results results; int ret; @@ -90,6 +89,7 @@ int cmd_main(int argc, const char **argv) int packfile = 0; int nongit; struct object_id packfile_hash; + struct strvec index_pack_args = STRVEC_INIT; setup_git_directory_gently(&nongit); @@ -116,6 +116,8 @@ int cmd_main(int argc, const char **argv) packfile = 1; if (parse_oid_hex(p, &packfile_hash, &end) || *end) die(_("argument to --packfile must be a valid hash (got '%s')"), p); + } else if (skip_prefix(argv[arg], "--index-pack-arg=", &p)) { + strvec_push(&index_pack_args, p); } arg++; } @@ -128,10 +130,18 @@ int cmd_main(int argc, const char **argv) git_config(git_default_config, NULL); if (packfile) { - fetch_single_packfile(&packfile_hash, argv[arg]); + if (!index_pack_args.nr) + die(_("--packfile requires --index-pack-args")); + + fetch_single_packfile(&packfile_hash, argv[arg], + index_pack_args.v); + return 0; } + if (index_pack_args.nr) + die(_("--index-pack-args can only be used with --packfile")); + if (commits_on_stdin) { commits = walker_targets_stdin(&commit_id, &write_ref); } else { diff --git a/t/t5550-http-fetch-dumb.sh b/t/t5550-http-fetch-dumb.sh index 483578b2d7..358b322e05 100755 --- a/t/t5550-http-fetch-dumb.sh +++ b/t/t5550-http-fetch-dumb.sh @@ -224,7 +224,10 @@ test_expect_success 'http-fetch --packfile' ' git init packfileclient && p=$(cd "$HTTPD_DOCUMENT_ROOT_PATH"/repo_pack.git && ls objects/pack/pack-*.pack) && - git -C packfileclient http-fetch --packfile=$ARBITRARY "$HTTPD_URL"/dumb/repo_pack.git/$p >out && + git -C packfileclient http-fetch --packfile=$ARBITRARY \ + --index-pack-arg=index-pack --index-pack-arg=--stdin \ + --index-pack-arg=--keep \ + "$HTTPD_URL"/dumb/repo_pack.git/$p >out && grep "^keep.[0-9a-f]\{16,\}$" out && cut -c6- out >packhash && From b664e9ffa153189dae9b88f32d1c5fedcf85056a Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 22 Feb 2021 11:20:08 -0800 Subject: [PATCH 55/63] fetch-pack: with packfile URIs, use index-pack arg Unify the index-pack arguments used when processing the inline pack and when downloading packfiles referenced by URIs. This is done by teaching get_pack() to also store the index-pack arguments whenever at least one packfile URI is given, and then when processing the packfile URI(s), using the stored arguments. 
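Conceptually, the hand-off amounts to the following standalone sketch built on git's `strvec` API (an illustration, not code from the patch; the argument values in the comment are hypothetical):

    #include "git-compat-util.h"
    #include "strvec.h"

    /*
     * get_pack() records the exact index-pack command it spawned, e.g.
     * { "index-pack", "--stdin", "--keep=fetch-pack 1234 on host" },
     * and the packfile-URI loop later replays it by turning every entry
     * into an --index-pack-arg= option for the git http-fetch child.
     */
    static void add_index_pack_args(struct strvec *child_args,
                                    const struct strvec *index_pack_args)
    {
        size_t i;

        for (i = 0; i < index_pack_args->nr; i++)
            strvec_pushf(child_args, "--index-pack-arg=%s",
                         index_pack_args->v[i]);
    }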
Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- fetch-pack.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/fetch-pack.c b/fetch-pack.c index aeac010b0b..dd0a6c4b34 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -797,12 +797,13 @@ static void write_promisor_file(const char *keep_name, } /* - * Pass 1 as "only_packfile" if the pack received is the only pack in this - * fetch request (that is, if there were no packfile URIs provided). + * If packfile URIs were provided, pass a non-NULL pointer to index_pack_args. + * The strings to pass as the --index-pack-arg arguments to http-fetch will be + * stored there. (It must be freed by the caller.) */ static int get_pack(struct fetch_pack_args *args, int xd[2], struct string_list *pack_lockfiles, - int only_packfile, + struct strvec *index_pack_args, struct ref **sought, int nr_sought) { struct async demux; @@ -845,7 +846,7 @@ static int get_pack(struct fetch_pack_args *args, strvec_push(&cmd.args, alternate_shallow_file); } - if (do_keep || args->from_promisor) { + if (do_keep || args->from_promisor || index_pack_args) { if (pack_lockfiles) cmd.out = -1; cmd_name = "index-pack"; @@ -863,7 +864,7 @@ static int get_pack(struct fetch_pack_args *args, "--keep=fetch-pack %"PRIuMAX " on %s", (uintmax_t)getpid(), hostname); } - if (only_packfile && args->check_self_contained_and_connected) + if (!index_pack_args && args->check_self_contained_and_connected) strvec_push(&cmd.args, "--check-self-contained-and-connected"); else /* @@ -901,7 +902,7 @@ static int get_pack(struct fetch_pack_args *args, : transfer_fsck_objects >= 0 ? transfer_fsck_objects : 0) { - if (args->from_promisor || !only_packfile) + if (args->from_promisor || index_pack_args) /* * We cannot use --strict in index-pack because it * checks both broken objects and links, but we only @@ -913,6 +914,13 @@ static int get_pack(struct fetch_pack_args *args, fsck_msg_types.buf); } + if (index_pack_args) { + int i; + + for (i = 0; i < cmd.args.nr; i++) + strvec_push(index_pack_args, cmd.args.v[i]); + } + cmd.in = demux.out; cmd.git_cmd = 1; if (start_command(&cmd)) @@ -1084,7 +1092,7 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args, alternate_shallow_file = setup_temporary_shallow(si->shallow); else alternate_shallow_file = NULL; - if (get_pack(args, fd, pack_lockfiles, 1, sought, nr_sought)) + if (get_pack(args, fd, pack_lockfiles, NULL, sought, nr_sought)) die(_("git fetch-pack: fetch failed.")); all_done: @@ -1535,6 +1543,7 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, int seen_ack = 0; struct string_list packfile_uris = STRING_LIST_INIT_DUP; int i; + struct strvec index_pack_args = STRVEC_INIT; negotiator = &negotiator_alloc; fetch_negotiator_init(r, negotiator); @@ -1624,7 +1633,8 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, receive_packfile_uris(&reader, &packfile_uris); process_section_header(&reader, "packfile", 0); if (get_pack(args, fd, pack_lockfiles, - !packfile_uris.nr, sought, nr_sought)) + packfile_uris.nr ? 
&index_pack_args : NULL, + sought, nr_sought)) die(_("git fetch-pack: fetch failed.")); do_check_stateless_delimiter(args, &reader); @@ -1636,6 +1646,7 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, } for (i = 0; i < packfile_uris.nr; i++) { + int j; struct child_process cmd = CHILD_PROCESS_INIT; char packname[GIT_MAX_HEXSZ + 1]; const char *uri = packfile_uris.items[i].string + @@ -1645,9 +1656,9 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, strvec_pushf(&cmd.args, "--packfile=%.*s", (int) the_hash_algo->hexsz, packfile_uris.items[i].string); - strvec_push(&cmd.args, "--index-pack-arg=index-pack"); - strvec_push(&cmd.args, "--index-pack-arg=--stdin"); - strvec_push(&cmd.args, "--index-pack-arg=--keep"); + for (j = 0; j < index_pack_args.nr; j++) + strvec_pushf(&cmd.args, "--index-pack-arg=%s", + index_pack_args.v[j]); strvec_push(&cmd.args, uri); cmd.git_cmd = 1; cmd.no_stdin = 1; @@ -1683,6 +1694,7 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, packname)); } string_list_clear(&packfile_uris, 0); + strvec_clear(&index_pack_args); if (negotiator) negotiator->release(negotiator); From 5476e1efded571e374cd97c7d69f17962ba1c44f Mon Sep 17 00:00:00 2001 From: Jonathan Tan Date: Mon, 22 Feb 2021 11:20:09 -0800 Subject: [PATCH 56/63] fetch-pack: print and use dangling .gitmodules Teach index-pack to print dangling .gitmodules links after its "keep" or "pack" line instead of declaring an error, and teach fetch-pack to check such lines printed. This allows the tree side of the .gitmodules link to be in one packfile and the blob side to be in another without failing the fsck check, because it is now fetch-pack which checks such objects after all packfiles have been downloaded and indexed (and not index-pack on an individual packfile, as it is before this commit). Signed-off-by: Jonathan Tan Signed-off-by: Junio C Hamano --- Documentation/git-index-pack.txt | 7 ++- builtin/index-pack.c | 25 +++++++++- builtin/receive-pack.c | 2 +- fetch-pack.c | 78 +++++++++++++++++++++++++++----- fsck.c | 5 ++ fsck.h | 2 + pack-write.c | 8 +++- pack.h | 2 +- t/t5702-protocol-v2.sh | 58 ++++++++++++++++++++++-- 9 files changed, 165 insertions(+), 22 deletions(-) diff --git a/Documentation/git-index-pack.txt b/Documentation/git-index-pack.txt index af0c26232c..e74a4a1eda 100644 --- a/Documentation/git-index-pack.txt +++ b/Documentation/git-index-pack.txt @@ -78,7 +78,12 @@ OPTIONS Die if the pack contains broken links. For internal use only. --fsck-objects:: - Die if the pack contains broken objects. For internal use only. + For internal use only. ++ +Die if the pack contains broken objects. If the pack contains a tree +pointing to a .gitmodules blob that does not exist, prints the hash of +that blob (for the caller to check) after the hash that goes into the +name of the pack/idx file (see "Notes"). --threads=:: Specifies the number of threads to spawn when resolving diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 557bd2f348..0444febeee 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -1693,6 +1693,22 @@ static void show_pack_info(int stat_only) } } +static int print_dangling_gitmodules(struct fsck_options *o, + const struct object_id *oid, + enum object_type object_type, + int msg_type, const char *message) +{ + /* + * NEEDSWORK: Plumb the MSG_ID (from fsck.c) here and use it + * instead of relying on this string check. 
+ */ + if (starts_with(message, "gitmodulesMissing")) { + printf("%s\n", oid_to_hex(oid)); + return 0; + } + return fsck_error_function(o, oid, object_type, msg_type, message); +} + int cmd_index_pack(int argc, const char **argv, const char *prefix) { int i, fix_thin_pack = 0, verify = 0, stat_only = 0; @@ -1888,8 +1904,13 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) else close(input_fd); - if (do_fsck_object && fsck_finish(&fsck_options)) - die(_("fsck error in pack objects")); + if (do_fsck_object) { + struct fsck_options fo = fsck_options; + + fo.error_func = print_dangling_gitmodules; + if (fsck_finish(&fo)) + die(_("fsck error in pack objects")); + } free(objects); strbuf_release(&index_name_buf); diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index d49d050e6e..ed2c9b42e9 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -2275,7 +2275,7 @@ static const char *unpack(int err_fd, struct shallow_info *si) status = start_command(&child); if (status) return "index-pack fork failed"; - pack_lockfile = index_pack_lockfile(child.out); + pack_lockfile = index_pack_lockfile(child.out, NULL); close(child.out); status = finish_command(&child); if (status) diff --git a/fetch-pack.c b/fetch-pack.c index dd0a6c4b34..f9def5ac74 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -796,6 +796,26 @@ static void write_promisor_file(const char *keep_name, strbuf_release(&promisor_name); } +static void parse_gitmodules_oids(int fd, struct oidset *gitmodules_oids) +{ + int len = the_hash_algo->hexsz + 1; /* hash + NL */ + + do { + char hex_hash[GIT_MAX_HEXSZ + 1]; + int read_len = read_in_full(fd, hex_hash, len); + struct object_id oid; + const char *end; + + if (!read_len) + return; + if (read_len != len) + die("invalid length read %d", read_len); + if (parse_oid_hex(hex_hash, &oid, &end) || *end != '\n') + die("invalid hash"); + oidset_insert(gitmodules_oids, &oid); + } while (1); +} + /* * If packfile URIs were provided, pass a non-NULL pointer to index_pack_args. * The strings to pass as the --index-pack-arg arguments to http-fetch will be @@ -804,7 +824,8 @@ static void write_promisor_file(const char *keep_name, static int get_pack(struct fetch_pack_args *args, int xd[2], struct string_list *pack_lockfiles, struct strvec *index_pack_args, - struct ref **sought, int nr_sought) + struct ref **sought, int nr_sought, + struct oidset *gitmodules_oids) { struct async demux; int do_keep = args->keep_pack; @@ -812,6 +833,7 @@ static int get_pack(struct fetch_pack_args *args, struct pack_header header; int pass_header = 0; struct child_process cmd = CHILD_PROCESS_INIT; + int fsck_objects = 0; int ret; memset(&demux, 0, sizeof(demux)); @@ -846,8 +868,15 @@ static int get_pack(struct fetch_pack_args *args, strvec_push(&cmd.args, alternate_shallow_file); } - if (do_keep || args->from_promisor || index_pack_args) { - if (pack_lockfiles) + if (fetch_fsck_objects >= 0 + ? fetch_fsck_objects + : transfer_fsck_objects >= 0 + ? transfer_fsck_objects + : 0) + fsck_objects = 1; + + if (do_keep || args->from_promisor || index_pack_args || fsck_objects) { + if (pack_lockfiles || fsck_objects) cmd.out = -1; cmd_name = "index-pack"; strvec_push(&cmd.args, cmd_name); @@ -897,11 +926,7 @@ static int get_pack(struct fetch_pack_args *args, strvec_pushf(&cmd.args, "--pack_header=%"PRIu32",%"PRIu32, ntohl(header.hdr_version), ntohl(header.hdr_entries)); - if (fetch_fsck_objects >= 0 - ? fetch_fsck_objects - : transfer_fsck_objects >= 0 - ? 
transfer_fsck_objects - : 0) { + if (fsck_objects) { if (args->from_promisor || index_pack_args) /* * We cannot use --strict in index-pack because it @@ -925,10 +950,15 @@ static int get_pack(struct fetch_pack_args *args, cmd.git_cmd = 1; if (start_command(&cmd)) die(_("fetch-pack: unable to fork off %s"), cmd_name); - if (do_keep && pack_lockfiles) { - char *pack_lockfile = index_pack_lockfile(cmd.out); + if (do_keep && (pack_lockfiles || fsck_objects)) { + int is_well_formed; + char *pack_lockfile = index_pack_lockfile(cmd.out, &is_well_formed); + + if (!is_well_formed) + die(_("fetch-pack: invalid index-pack output")); if (pack_lockfile) string_list_append_nodup(pack_lockfiles, pack_lockfile); + parse_gitmodules_oids(cmd.out, gitmodules_oids); close(cmd.out); } @@ -963,6 +993,22 @@ static int cmp_ref_by_name(const void *a_, const void *b_) return strcmp(a->name, b->name); } +static void fsck_gitmodules_oids(struct oidset *gitmodules_oids) +{ + struct oidset_iter iter; + const struct object_id *oid; + struct fsck_options fo = FSCK_OPTIONS_STRICT; + + if (!oidset_size(gitmodules_oids)) + return; + + oidset_iter_init(gitmodules_oids, &iter); + while ((oid = oidset_iter_next(&iter))) + register_found_gitmodules(oid); + if (fsck_finish(&fo)) + die("fsck failed"); +} + static struct ref *do_fetch_pack(struct fetch_pack_args *args, int fd[2], const struct ref *orig_ref, @@ -977,6 +1023,7 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args, int agent_len; struct fetch_negotiator negotiator_alloc; struct fetch_negotiator *negotiator; + struct oidset gitmodules_oids = OIDSET_INIT; negotiator = &negotiator_alloc; fetch_negotiator_init(r, negotiator); @@ -1092,8 +1139,10 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args, alternate_shallow_file = setup_temporary_shallow(si->shallow); else alternate_shallow_file = NULL; - if (get_pack(args, fd, pack_lockfiles, NULL, sought, nr_sought)) + if (get_pack(args, fd, pack_lockfiles, NULL, sought, nr_sought, + &gitmodules_oids)) die(_("git fetch-pack: fetch failed.")); + fsck_gitmodules_oids(&gitmodules_oids); all_done: if (negotiator) @@ -1544,6 +1593,7 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, struct string_list packfile_uris = STRING_LIST_INIT_DUP; int i; struct strvec index_pack_args = STRVEC_INIT; + struct oidset gitmodules_oids = OIDSET_INIT; negotiator = &negotiator_alloc; fetch_negotiator_init(r, negotiator); @@ -1634,7 +1684,7 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, process_section_header(&reader, "packfile", 0); if (get_pack(args, fd, pack_lockfiles, packfile_uris.nr ? 
&index_pack_args : NULL, - sought, nr_sought)) + sought, nr_sought, &gitmodules_oids)) die(_("git fetch-pack: fetch failed.")); do_check_stateless_delimiter(args, &reader); @@ -1677,6 +1727,8 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, packname[the_hash_algo->hexsz] = '\0'; + parse_gitmodules_oids(cmd.out, &gitmodules_oids); + close(cmd.out); if (finish_command(&cmd)) @@ -1696,6 +1748,8 @@ static struct ref *do_fetch_pack_v2(struct fetch_pack_args *args, string_list_clear(&packfile_uris, 0); strvec_clear(&index_pack_args); + fsck_gitmodules_oids(&gitmodules_oids); + if (negotiator) negotiator->release(negotiator); diff --git a/fsck.c b/fsck.c index f82e2fe9e3..49ef6569e8 100644 --- a/fsck.c +++ b/fsck.c @@ -1243,6 +1243,11 @@ int fsck_error_function(struct fsck_options *o, return 1; } +void register_found_gitmodules(const struct object_id *oid) +{ + oidset_insert(&gitmodules_found, oid); +} + int fsck_finish(struct fsck_options *options) { int ret = 0; diff --git a/fsck.h b/fsck.h index 69cf715e79..d75b723bd5 100644 --- a/fsck.h +++ b/fsck.h @@ -62,6 +62,8 @@ int fsck_walk(struct object *obj, void *data, struct fsck_options *options); int fsck_object(struct object *obj, void *data, unsigned long size, struct fsck_options *options); +void register_found_gitmodules(const struct object_id *oid); + /* * Some fsck checks are context-dependent, and may end up queued; run this * after completing all fsck_object() calls in order to resolve any remaining diff --git a/pack-write.c b/pack-write.c index 3513665e1e..f66ea8e5a1 100644 --- a/pack-write.c +++ b/pack-write.c @@ -272,7 +272,7 @@ void fixup_pack_header_footer(int pack_fd, fsync_or_die(pack_fd, pack_name); } -char *index_pack_lockfile(int ip_out) +char *index_pack_lockfile(int ip_out, int *is_well_formed) { char packname[GIT_MAX_HEXSZ + 6]; const int len = the_hash_algo->hexsz + 6; @@ -286,11 +286,17 @@ char *index_pack_lockfile(int ip_out) */ if (read_in_full(ip_out, packname, len) == len && packname[len-1] == '\n') { const char *name; + + if (is_well_formed) + *is_well_formed = 1; packname[len-1] = 0; if (skip_prefix(packname, "keep\t", &name)) return xstrfmt("%s/pack/pack-%s.keep", get_object_directory(), name); + return NULL; } + if (is_well_formed) + *is_well_formed = 0; return NULL; } diff --git a/pack.h b/pack.h index 9fc0945ac9..09cffec395 100644 --- a/pack.h +++ b/pack.h @@ -85,7 +85,7 @@ int verify_pack_index(struct packed_git *); int verify_pack(struct repository *, struct packed_git *, verify_fn fn, struct progress *, uint32_t); off_t write_pack_header(struct hashfile *f, uint32_t); void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t); -char *index_pack_lockfile(int fd); +char *index_pack_lockfile(int fd, int *is_well_formed); /* * The "hdr" output buffer should be at least this big, which will handle sizes diff --git a/t/t5702-protocol-v2.sh b/t/t5702-protocol-v2.sh index 7d5b17909b..b1bc73a9a9 100755 --- a/t/t5702-protocol-v2.sh +++ b/t/t5702-protocol-v2.sh @@ -847,8 +847,9 @@ test_expect_success 'part of packfile response provided as URI' ' test -f hfound && test -f h2found && - # Ensure that there are exactly 6 files (3 .pack and 3 .idx). 
- ls http_child/.git/objects/pack/* >filelist && + # Ensure that there are exactly 3 packfiles with associated .idx + ls http_child/.git/objects/pack/*.pack \ + http_child/.git/objects/pack/*.idx >filelist && test_line_count = 6 filelist ' @@ -901,8 +902,9 @@ test_expect_success 'packfile-uri with transfer.fsckobjects' ' -c fetch.uriprotocols=http,https \ clone "$HTTPD_URL/smart/http_parent" http_child && - # Ensure that there are exactly 4 files (2 .pack and 2 .idx). - ls http_child/.git/objects/pack/* >filelist && + # Ensure that there are exactly 2 packfiles with associated .idx + ls http_child/.git/objects/pack/*.pack \ + http_child/.git/objects/pack/*.idx >filelist && test_line_count = 4 filelist ' @@ -936,6 +938,54 @@ test_expect_success 'packfile-uri with transfer.fsckobjects fails on bad object' test_i18ngrep "invalid author/committer line - missing email" error ' +test_expect_success 'packfile-uri with transfer.fsckobjects succeeds when .gitmodules is separate from tree' ' + P="$HTTPD_DOCUMENT_ROOT_PATH/http_parent" && + rm -rf "$P" http_child && + + git init "$P" && + git -C "$P" config "uploadpack.allowsidebandall" "true" && + + echo "[submodule libfoo]" >"$P/.gitmodules" && + echo "path = include/foo" >>"$P/.gitmodules" && + echo "url = git://example.com/git/lib.git" >>"$P/.gitmodules" && + git -C "$P" add .gitmodules && + git -C "$P" commit -m x && + + configure_exclusion "$P" .gitmodules >h && + + sane_unset GIT_TEST_SIDEBAND_ALL && + git -c protocol.version=2 -c transfer.fsckobjects=1 \ + -c fetch.uriprotocols=http,https \ + clone "$HTTPD_URL/smart/http_parent" http_child && + + # Ensure that there are exactly 2 packfiles with associated .idx + ls http_child/.git/objects/pack/*.pack \ + http_child/.git/objects/pack/*.idx >filelist && + test_line_count = 4 filelist +' + +test_expect_success 'packfile-uri with transfer.fsckobjects fails when .gitmodules separate from tree is invalid' ' + P="$HTTPD_DOCUMENT_ROOT_PATH/http_parent" && + rm -rf "$P" http_child err && + + git init "$P" && + git -C "$P" config "uploadpack.allowsidebandall" "true" && + + echo "[submodule \"..\"]" >"$P/.gitmodules" && + echo "path = include/foo" >>"$P/.gitmodules" && + echo "url = git://example.com/git/lib.git" >>"$P/.gitmodules" && + git -C "$P" add .gitmodules && + git -C "$P" commit -m x && + + configure_exclusion "$P" .gitmodules >h && + + sane_unset GIT_TEST_SIDEBAND_ALL && + test_must_fail git -c protocol.version=2 -c transfer.fsckobjects=1 \ + -c fetch.uriprotocols=http,https \ + clone "$HTTPD_URL/smart/http_parent" http_child 2>err && + test_i18ngrep "disallowed submodule name" err +' + # DO NOT add non-httpd-specific tests here, because the last part of this # test script is only executed when httpd is available and enabled. From 00f68732e5371d6cc5bb17fbd2fa99c8786571da Mon Sep 17 00:00:00 2001 From: Han-Wen Nienhuys Date: Tue, 23 Feb 2021 16:57:23 +0000 Subject: [PATCH 57/63] doc/reftable: document how to handle windows On Windows we can't delete or overwrite files opened by other processes. Here we sketch how to handle this situation. We propose to use a random element in the filename. It's possible to design an alternate solution based on counters, but that would assign semantics to the filenames that complicates implementation. 
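For illustration, a standalone sketch of building such a name follows (the eight-hex-digit nonce, the use of `rand()` and the field widths are arbitrary choices for the example; the proposal only requires that the extra element make each filename unique):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* Format "<min>-<max>-<random>.ref", e.g. "00000002-00000002-1a2b3c4d.ref". */
    static void reftable_table_name(char *buf, size_t len,
                                    uint64_t min_update_index,
                                    uint64_t max_update_index)
    {
        uint32_t nonce = (uint32_t)rand();

        snprintf(buf, len, "%08" PRIu64 "-%08" PRIu64 "-%08" PRIx32 ".ref",
                 min_update_index, max_update_index, nonce);
    }

    int main(void)
    {
        char name[64];

        srand((unsigned int)time(NULL));
        reftable_table_name(name, sizeof(name), 2, 2);
        puts(name);
        return 0;
    }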
Signed-off-by: Han-Wen Nienhuys Signed-off-by: Junio C Hamano --- Documentation/technical/reftable.txt | 42 +++++++++++++++++----------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/Documentation/technical/reftable.txt b/Documentation/technical/reftable.txt index 8095ab2590..3ef169af27 100644 --- a/Documentation/technical/reftable.txt +++ b/Documentation/technical/reftable.txt @@ -872,17 +872,11 @@ A repository must set its `$GIT_DIR/config` to configure reftable: Layout ^^^^^^ -A collection of reftable files are stored in the `$GIT_DIR/reftable/` -directory: - -.... -00000001-00000001.log -00000002-00000002.ref -00000003-00000003.ref -.... - -where reftable files are named by a unique name such as produced by the -function `${min_update_index}-${max_update_index}.ref`. +A collection of reftable files are stored in the `$GIT_DIR/reftable/` directory. +Their names should have a random element, such that each filename is globally +unique; this helps avoid spurious failures on Windows, where open files cannot +be removed or overwritten. It is suggested to use +`${min_update_index}-${max_update_index}-${random}.ref` as a naming convention. Log-only files use the `.log` extension, while ref-only and mixed ref and log files use `.ref`. extension. @@ -893,9 +887,9 @@ current files, one per line, in order, from oldest (base) to newest .... $ cat .git/reftable/tables.list -00000001-00000001.log -00000002-00000002.ref -00000003-00000003.ref +00000001-00000001-RANDOM1.log +00000002-00000002-RANDOM2.ref +00000003-00000003-RANDOM3.ref .... Readers must read `$GIT_DIR/reftable/tables.list` to determine which @@ -940,7 +934,7 @@ new reftable and atomically appending it to the stack: 3. Select `update_index` to be most recent file's `max_update_index + 1`. 4. Prepare temp reftable `tmp_XXXXXX`, including log entries. -5. Rename `tmp_XXXXXX` to `${update_index}-${update_index}.ref`. +5. Rename `tmp_XXXXXX` to `${update_index}-${update_index}-${random}.ref`. 6. Copy `tables.list` to `tables.list.lock`, appending file from (5). 7. Rename `tables.list.lock` to `tables.list`. @@ -993,7 +987,7 @@ prevents other processes from trying to compact these files. should always be the case, assuming that other processes are adhering to the locking protocol. 7. Rename `${min_update_index}-${max_update_index}_XXXXXX` to -`${min_update_index}-${max_update_index}.ref`. +`${min_update_index}-${max_update_index}-${random}.ref`. 8. Write the new stack to `tables.list.lock`, replacing `B` and `C` with the file from (4). 9. Rename `tables.list.lock` to `tables.list`. @@ -1005,6 +999,22 @@ This strategy permits compactions to proceed independently of updates. Each reftable (compacted or not) is uniquely identified by its name, so open reftables can be cached by their name. +Windows +^^^^^^^ + +On Windows, and other systems that do not allow deleting or renaming open +files, compaction may succeed, but other readers may prevent obsolete tables +from being deleted. + +On these platforms, the following strategy can be followed: on closing a +reftable stack, reload `tables.list`, and delete any tables no longer mentioned +in `tables.list`. + +Irregular program exit may still leave unused files behind. In this case, a +cleanup operation can read `tables.list`, note its modification timestamp, and +delete any unreferenced `*.ref` files that are older.
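One possible shape of the cleanup pass described above, sketched with POSIX calls for brevity (the helper names are invented, error handling is minimal, the file list is assumed to fit in a fixed buffer, and a real implementation on Windows would use the equivalent Win32 directory and file APIs):

    #include <dirent.h>
    #include <limits.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Return 1 if 'name' appears as a full line of the tables.list content. */
    static int is_referenced(const char *list, const char *name)
    {
        size_t len = strlen(name);
        const char *p = list;

        while ((p = strstr(p, name))) {
            if ((p == list || p[-1] == '\n') &&
                (p[len] == '\n' || p[len] == '\0'))
                return 1;
            p += len;
        }
        return 0;
    }

    /* Delete *.ref files that are unreferenced and older than tables.list. */
    static void cleanup_stale_tables(const char *reftable_dir)
    {
        char path[PATH_MAX];
        char list[65536];
        struct stat list_st, st;
        struct dirent *e;
        size_t n;
        FILE *fp;
        DIR *d;

        snprintf(path, sizeof(path), "%s/tables.list", reftable_dir);
        if (stat(path, &list_st) || !(fp = fopen(path, "r")))
            return;
        n = fread(list, 1, sizeof(list) - 1, fp);
        list[n] = '\0';
        fclose(fp);

        if (!(d = opendir(reftable_dir)))
            return;
        while ((e = readdir(d))) {
            const char *dot = strrchr(e->d_name, '.');

            if (!dot || strcmp(dot, ".ref") || is_referenced(list, e->d_name))
                continue;
            snprintf(path, sizeof(path), "%s/%s", reftable_dir, e->d_name);
            /* Unreferenced tables older than tables.list are considered stale. */
            if (!stat(path, &st) && st.st_mtime < list_st.st_mtime)
                unlink(path);
        }
        closedir(d);
    }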
+ + Alternatives considered ~~~~~~~~~~~~~~~~~~~~~~~ From c4ff24bbb354377a6a7937744fbbef2898243fc7 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Wed, 24 Feb 2021 12:12:22 -0500 Subject: [PATCH 58/63] commit-graph.c: display correct number of chunks when writing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When writing a commit-graph, a progress meter is shown which indicates the number of pieces of data to write (one per commit in each chunk). In 47410aa837 (commit-graph: use chunk-format write API, 2021-02-18), the number of chunks became tracked by the new chunk-format API. But a stray local variable was left behind from when write_commit_graph_file() used to keep track of the same. Since this was no longer updated after 47410aa837, the progress meter appeared broken: $ git commit-graph write --reachable Expanding reachable commits in commit graph: 837569, done. Writing out commit graph in 3 passes: 166% (4187845/2512707), done. Drop the local variable and rely instead on the chunk-format API to tell us the correct number of chunks. Reported-by: SZEDER Gábor Signed-off-by: Taylor Blau Acked-by: Derrick Stolee Signed-off-by: Junio C Hamano --- commit-graph.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 76514a879e..b9efeddeab 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1715,7 +1715,6 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) struct lock_file lk = LOCK_INIT; const unsigned hashsz = the_hash_algo->rawsz; struct strbuf progress_title = STRBUF_INIT; - int num_chunks = 3; struct object_id file_hash; struct chunkfile *cf; @@ -1811,11 +1810,11 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx) strbuf_addf(&progress_title, Q_("Writing out commit graph in %d pass", "Writing out commit graph in %d passes", - num_chunks), - num_chunks); + get_num_chunks(cf)), + get_num_chunks(cf)); ctx->progress = start_delayed_progress( progress_title.buf, - num_chunks * ctx->commits.nr); + get_num_chunks(cf) * ctx->commits.nr); } write_chunkfile(cf, ctx); From 6347d649bcddf531f82d400103e23d99ea8f2fd4 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 24 Feb 2021 14:31:57 +0000 Subject: [PATCH 59/63] dir: fix malloc of root untracked_cache_dir Use FLEX_ALLOC_STR() to allocate the `struct untracked_cache_dir` for the root directory. Get rid of unsafe code that might fail to initialize the `name` field (if FLEX_ARRAY is not 1). This will make it clear that we intend to have a structure with an empty string following it. A problem was observed on Windows where the length of the memset() was too short, so the first byte of the name field was not zeroed. This resulted in the name field having garbage from a previous use of that area of memory. The record for the root directory was then written to the untracked-cache extension in the index. This garbage would then be visible to future commands when they reloaded the untracked-cache extension. Since the directory record for the root directory had garbage in the `name` field, the `t/helper/test-tool dump-untracked-cache` tool printed this garbage as the path prefix (rather than '/') for each directory in the untracked cache as it recursed. 
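The pitfall is easier to see in a standalone example (the struct below merely mimics the shape of the real one, and plain `calloc()`/`memcpy()` stand in for what git's `FLEX_ALLOC_STR()` macro expands to; none of this is code from the patch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct demo_dir {
        int valid;
        char name[];   /* flexible array member: contributes nothing to sizeof */
    };

    int main(void)
    {
        /*
         * Broken pattern: sizeof(*broken) covers no bytes of 'name', so the
         * trailing string is neither allocated nor zero-initialized.
         */
        struct demo_dir *broken = malloc(sizeof(*broken));
        memset(broken, 0, sizeof(*broken));   /* name[0] is never written */

        /*
         * FLEX_ALLOC_STR-style fix: allocate the struct and its trailing
         * string as one zeroed block, then copy the string (the empty
         * string, in the case of the root directory).
         */
        const char *str = "";
        struct demo_dir *fixed = calloc(1, sizeof(*fixed) + strlen(str) + 1);

        memcpy(fixed->name, str, strlen(str) + 1);
        printf("root name: \"%s\"\n", fixed->name);

        free(broken);
        free(fixed);
        return 0;
    }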
Signed-off-by: Jeff Hostetler Reviewed-by: Taylor Blau Signed-off-by: Junio C Hamano --- dir.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dir.c b/dir.c index d637461da5..188d9d220b 100644 --- a/dir.c +++ b/dir.c @@ -2730,11 +2730,8 @@ static struct untracked_cache_dir *validate_untracked_cache(struct dir_struct *d return NULL; } - if (!dir->untracked->root) { - const int len = sizeof(*dir->untracked->root); - dir->untracked->root = xmalloc(len); - memset(dir->untracked->root, 0, len); - } + if (!dir->untracked->root) + FLEX_ALLOC_STR(dir->untracked->root, name, ""); /* Validate $GIT_DIR/info/exclude and core.excludesfile */ root = dir->untracked->root; From f279894d283101e8e7427347a5469a8731820872 Mon Sep 17 00:00:00 2001 From: Neeraj Singh Date: Thu, 18 Feb 2021 02:48:26 +0000 Subject: [PATCH 60/63] read-cache: make the index write buffer size 128K Writing an index 8K at a time invokes the OS filesystem and caching code very frequently, introducing noticeable overhead while writing large indexes. When experimenting with different write buffer sizes on Windows writing the Windows OS repo index (260MB), most of the benefit came by bumping the index write buffer size to 64K. I picked 128K to ensure that we're past the knee of the curve. With this change, the time under do_write_index for an index with 3M files goes from ~1.02s to ~0.72s. Signed-off-by: Neeraj Singh Acked-by: Jeff Hostetler Signed-off-by: Junio C Hamano --- read-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/read-cache.c b/read-cache.c index 29144cf879..a5b2779b95 100644 --- a/read-cache.c +++ b/read-cache.c @@ -2447,7 +2447,7 @@ int repo_index_has_changes(struct repository *repo, } } -#define WRITE_BUFFER_SIZE 8192 +#define WRITE_BUFFER_SIZE (128 * 1024) static unsigned char write_buffer[WRITE_BUFFER_SIZE]; static unsigned long write_buffer_len; From cdc986a7c2e86fde9e7945bbc40335f7ab2a7c5e Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 1 Mar 2021 09:19:37 -0800 Subject: [PATCH 61/63] Revert "commit-graph: when incompatible with graphs, indicate why" This reverts commit c85eec7fc37e1ca79072f263ae6ea1ee305ba38c, as it is a bit overzealous, we are in prerelease freeze, and we want to have enough time to get this right and cook in 'next'. cf. 
<8735xgkvuo.fsf@evledraar.gmail.com> --- commit-graph.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/commit-graph.c b/commit-graph.c index 8fd4804343..031641014f 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -205,24 +205,16 @@ static int commit_graph_compatible(struct repository *r) if (read_replace_refs) { prepare_replace_object(r); - if (hashmap_get_size(&r->objects->replace_map->map)) { - warning(_("repository contains replace objects; " - "skipping commit-graph")); + if (hashmap_get_size(&r->objects->replace_map->map)) return 0; - } } prepare_commit_graft(r); if (r->parsed_objects && - (r->parsed_objects->grafts_nr || r->parsed_objects->substituted_parent)) { - warning(_("repository contains (deprecated) grafts; " - "skipping commit-graph")); + (r->parsed_objects->grafts_nr || r->parsed_objects->substituted_parent)) return 0; - } - if (is_repository_shallow(r)) { - warning(_("repository is shallow; skipping commit-graph")); + if (is_repository_shallow(r)) return 0; - } return 1; } From ec125d1bc1053df7e4103f71ebd24fe48dd624a2 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Mon, 1 Mar 2021 14:02:42 -0800 Subject: [PATCH 62/63] Hopefully the last batch before -rc1 Signed-off-by: Junio C Hamano --- Documentation/RelNotes/2.31.0.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Documentation/RelNotes/2.31.0.txt b/Documentation/RelNotes/2.31.0.txt index ef8b0d158e..04bd5b70a9 100644 --- a/Documentation/RelNotes/2.31.0.txt +++ b/Documentation/RelNotes/2.31.0.txt @@ -197,6 +197,31 @@ Performance, Internal Implementation, Development Support etc. * The code to implement "git merge-base --independent" was poorly done and was kept from the very beginning of the feature. + * Preliminary changes to fsmonitor integration. + + * Performance optimization work on the rename detection continues. + + * The common code to deal with "chunked file format" that is shared + by the multi-pack-index and commit-graph files have been factored + out, to help codepaths for both filetypes to become more robust. + + * The approach to "fsck" the incoming objects in "index-pack" is + attractive for performance reasons (we have them already in core, + inflated and ready to be inspected), but fundamentally cannot be + applied fully when we receive more than one pack stream, as a tree + object in one pack may refer to a blob object in another pack as + ".gitmodules", when we want to inspect blobs that are used as + ".gitmodules" file, for example. Teach "index-pack" to emit + objects that must be inspected later and check them in the calling + "fetch-pack" process. + + * The logic to handle "trailer" related placeholders in the + "--format=" mechanisms in the "log" family and "for-each-ref" + family is getting unified. + + * Raise the buffer size used when writing the index file out from + (obviously too small) 8kB to (clearly sufficiently large) 128kB. + Fixes since v2.30 ----------------- @@ -318,6 +343,12 @@ Fixes since v2.30 corrected. (merge 20e416409f jc/push-delete-nothing later to maint). + * Test script modernization. + (merge 488acf15df sv/t7001-modernize later to maint). + + * An under-allocation for the untracked cache data has been corrected. + (merge 6347d649bc jh/untracked-cache-fix later to maint). + * Other code cleanup, docfix, build fix, etc. (merge e3f5da7e60 sg/t7800-difftool-robustify later to maint). (merge 9d336655ba js/doc-proto-v2-response-end later to maint). 
From f01623b2c9d14207e497b21ebc6b3ec4afaf4b46 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 2 Mar 2021 22:41:13 -0800 Subject: [PATCH 63/63] Git 2.31-rc1 Signed-off-by: Junio C Hamano --- GIT-VERSION-GEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GIT-VERSION-GEN b/GIT-VERSION-GEN index 6fb66d3e55..7b99fe5dc5 100755 --- a/GIT-VERSION-GEN +++ b/GIT-VERSION-GEN @@ -1,7 +1,7 @@ #!/bin/sh GVF=GIT-VERSION-FILE -DEF_VER=v2.31.0-rc0 +DEF_VER=v2.31.0-rc1 LF=' '