From 0b20f1a26604a4f43f4728acc56c6ffc6a976220 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 12:54:10 -0500 Subject: [PATCH 1/6] t1450: refactor loose-object removal Commit 90cf590f5 (fsck: optionally show more helpful info for broken links, 2016-07-17) added a remove_loose_object() helper, but we already had a remove_object() helper that did the same thing. Let's combine these into one. The implementations had a few subtle differences, so I've tried to take the best of both: - the original used "sed", but the newer version avoids spawning an extra process - the original processed "$*", which was nonsense, as it assumed only a single sha1. Use "$1" to make that more clear. - the newer version ran an extra rev-parse, but it was not necessary; it's sole caller already converted the argument into a raw sha1 - the original used "rm -f", whereas the new one uses "rm". The latter is better because it may notice a bug or other unexpected failure in the test. (The original does check that the object exists before we remove it, which is good, but that's a subset of the possible unexpected conditions). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- t/t1450-fsck.sh | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index ee7d4736db..3297d4cb2f 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -43,13 +43,13 @@ test_expect_success 'HEAD is part of refs, valid objects appear valid' ' test_expect_success 'setup: helpers for corruption tests' ' sha1_file() { - echo "$*" | sed "s#..#.git/objects/&/#" + remainder=${1#??} && + firsttwo=${1%$remainder} && + echo ".git/objects/$firsttwo/$remainder" } && remove_object() { - file=$(sha1_file "$*") && - test -e "$file" && - rm -f "$file" + rm "$(sha1_file "$1")" } ' @@ -535,13 +535,6 @@ test_expect_success 'fsck --connectivity-only' ' ) ' -remove_loose_object () { - sha1="$(git rev-parse "$1")" && - remainder=${sha1#??} && - firsttwo=${sha1%$remainder} && - rm .git/objects/$firsttwo/$remainder -} - test_expect_success 'fsck --name-objects' ' rm -rf name-objects && git init name-objects && @@ -550,7 +543,7 @@ test_expect_success 'fsck --name-objects' ' test_commit julius caesar.t && test_commit augustus && test_commit caesar && - remove_loose_object $(git rev-parse julius:caesar.t) && + remove_object $(git rev-parse julius:caesar.t) && test_must_fail git fsck --name-objects >out && tree=$(git rev-parse --verify julius:) && grep "$tree (\(refs/heads/master\|HEAD\)@{[0-9]*}:" out From 771e7d578ee93753f5ac0ed346effd0af3d5a4b4 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 12:54:39 -0500 Subject: [PATCH 2/6] sha1_file: fix error message for alternate objects When we fail to open a corrupt loose object, we report an error and mention the filename via sha1_file_name(). However, that function will always give us a path in the local repository, whereas the corrupt object may have come from an alternate. The result is a very misleading error message. Teach the open_sha1_file() and stat_sha1_file() helpers to pass back the path they found, so that we can report it correctly. Note that the pointers we return go to static storage (e.g., from sha1_file_name()), which is slightly dangerous. However, these helpers are static local helpers, and the names are used for immediately generating error messages. The simplicity is an acceptable tradeoff for the danger. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- sha1_file.c | 46 +++++++++++++++++++++++++++++++--------------- t/t1450-fsck.sh | 10 ++++++++++ 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/sha1_file.c b/sha1_file.c index 9c86d1924a..6c8e4b43dd 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1613,39 +1613,54 @@ int git_open(const char *name) } } -static int stat_sha1_file(const unsigned char *sha1, struct stat *st) +/* + * Find "sha1" as a loose object in the local repository or in an alternate. + * Returns 0 on success, negative on failure. + * + * The "path" out-parameter will give the path of the object we found (if any). + * Note that it may point to static storage and is only valid until another + * call to sha1_file_name(), etc. + */ +static int stat_sha1_file(const unsigned char *sha1, struct stat *st, + const char **path) { struct alternate_object_database *alt; - if (!lstat(sha1_file_name(sha1), st)) + *path = sha1_file_name(sha1); + if (!lstat(*path, st)) return 0; prepare_alt_odb(); errno = ENOENT; for (alt = alt_odb_list; alt; alt = alt->next) { - const char *path = alt_sha1_path(alt, sha1); - if (!lstat(path, st)) + *path = alt_sha1_path(alt, sha1); + if (!lstat(*path, st)) return 0; } return -1; } -static int open_sha1_file(const unsigned char *sha1) +/* + * Like stat_sha1_file(), but actually open the object and return the + * descriptor. See the caveats on the "path" parameter above. + */ +static int open_sha1_file(const unsigned char *sha1, const char **path) { int fd; struct alternate_object_database *alt; int most_interesting_errno; - fd = git_open(sha1_file_name(sha1)); + *path = sha1_file_name(sha1); + fd = git_open(*path); if (fd >= 0) return fd; most_interesting_errno = errno; prepare_alt_odb(); for (alt = alt_odb_list; alt; alt = alt->next) { - const char *path = alt_sha1_path(alt, sha1); - fd = git_open(path); + *path = alt_sha1_path(alt, sha1); + fd = git_open(*path); if (fd >= 0) return fd; if (most_interesting_errno == ENOENT) @@ -1657,10 +1672,11 @@ static int open_sha1_file(const unsigned char *sha1) void *map_sha1_file(const unsigned char *sha1, unsigned long *size) { + const char *path; void *map; int fd; - fd = open_sha1_file(sha1); + fd = open_sha1_file(sha1, &path); map = NULL; if (fd >= 0) { struct stat st; @@ -1669,7 +1685,7 @@ void *map_sha1_file(const unsigned char *sha1, unsigned long *size) *size = xsize_t(st.st_size); if (!*size) { /* mmap() is forbidden on empty files */ - error("object file %s is empty", sha1_file_name(sha1)); + error("object file %s is empty", path); return NULL; } map = xmmap(NULL, *size, PROT_READ, MAP_PRIVATE, fd, 0); @@ -2789,8 +2805,9 @@ static int sha1_loose_object_info(const unsigned char *sha1, * object even exists. */ if (!oi->typep && !oi->typename && !oi->sizep) { + const char *path; struct stat st; - if (stat_sha1_file(sha1, &st) < 0) + if (stat_sha1_file(sha1, &st, &path) < 0) return -1; if (oi->disk_sizep) *oi->disk_sizep = st.st_size; @@ -2986,6 +3003,8 @@ void *read_sha1_file_extended(const unsigned char *sha1, { void *data; const struct packed_git *p; + const char *path; + struct stat st; const unsigned char *repl = lookup_replace_object_extended(sha1, flag); errno = 0; @@ -3001,12 +3020,9 @@ void *read_sha1_file_extended(const unsigned char *sha1, die("replacement %s not found for %s", sha1_to_hex(repl), sha1_to_hex(sha1)); - if (has_loose_object(repl)) { - const char *path = sha1_file_name(sha1); - + if (!stat_sha1_file(repl, &st, &path)) die("loose object %s (stored in %s) is corrupt", sha1_to_hex(repl), path); - } if ((p = has_packed_and_bad(repl)) != NULL) die("packed object %s (stored in %s) is corrupt", diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 3297d4cb2f..f95174c9d8 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -550,4 +550,14 @@ test_expect_success 'fsck --name-objects' ' ) ' +test_expect_success 'alternate objects are correctly blamed' ' + test_when_finished "rm -rf alt.git .git/objects/info/alternates" && + git init --bare alt.git && + echo "../../alt.git/objects" >.git/objects/info/alternates && + mkdir alt.git/objects/12 && + >alt.git/objects/12/34567890123456789012345678901234567890 && + test_must_fail git fsck >out 2>&1 && + grep alt.git out +' + test_done From 118e6cead47fafa17db12fdec9b997535f4f0fc6 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 12:55:55 -0500 Subject: [PATCH 3/6] t1450: test fsck of packed objects The code paths in fsck for packed and loose objects are quite different, and it is not immediately obvious that the packed case behaves well. In particular: 1. The fsck_loose() function always returns "0" to tell the iterator to keep checking more objects. Whereas fsck_obj_buffer() (which handles packed objects) returns -1. This is OK, because the callback machinery for verify_pack() does not stop when it sees a non-zero return. 2. The fsck_loose() function sets the ERROR_OBJECT bit when fsck_obj() fails, whereas fsck_obj_buffer() sets it only when it sees a corrupt object. This turns out not to matter. We don't actually do anything with this bit except exit the program with a non-zero code, and that is handled already by the non-zero return from the function. So there are no bugs here, but it was certainly confusing to me. And we do not test either of the properties in t1450 (neither that a non-corruption error will caused a non-zero exit for a packed object, nor that we keep going after seeing the first error). Let's test both of those conditions, so that we'll notice if any of those assumptions becomes invalid. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- t/t1450-fsck.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index f95174c9d8..c39d421209 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -560,4 +560,25 @@ test_expect_success 'alternate objects are correctly blamed' ' grep alt.git out ' +test_expect_success 'fsck errors in packed objects' ' + git cat-file commit HEAD >basis && + sed "s/one && + sed "s/two && + one=$(git hash-object -t commit -w one) && + two=$(git hash-object -t commit -w two) && + pack=$( + { + echo $one && + echo $two + } | git pack-objects .git/objects/pack/pack + ) && + test_when_finished "rm -f .git/objects/pack/pack-$pack.*" && + remove_object $one && + remove_object $two && + test_must_fail git fsck 2>out && + grep "error in commit $one.* - bad name" out && + grep "error in commit $two.* - bad name" out && + ! grep corrupt out +' + test_done From f6371f9210418f1beabc85b097e2a3470aeeb54d Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 12:58:16 -0500 Subject: [PATCH 4/6] sha1_file: add read_loose_object() function It's surprisingly hard to ask the sha1_file code to open a _specific_ incarnation of a loose object. Most of the functions take a sha1, and loop over the various object types (packed versus loose) and locations (local versus alternates) at a low level. However, some tools like fsck need to look at a specific file. This patch gives them a function they can use to open the loose object at a given path. The implementation unfortunately ends up repeating bits of related functions, but there's not a good way around it without some major refactoring of the whole sha1_file stack. We need to mmap the specific file, then partially read the zlib stream to know whether we're streaming or not, and then finally either stream it or copy the data to a buffer. We can do that by assembling some of the more arcane internal sha1_file functions, but we end up having to essentially reimplement unpack_sha1_file(), along with the streaming bits of check_sha1_signature(). Still, most of the ugliness is contained in the new function, and the interface is clean enough that it may be reusable (though it seems unlikely anything but git-fsck would care about opening a specific file). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- cache.h | 13 +++++ sha1_file.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 143 insertions(+), 3 deletions(-) diff --git a/cache.h b/cache.h index a50a61a197..ed45895706 100644 --- a/cache.h +++ b/cache.h @@ -1139,6 +1139,19 @@ extern int finalize_object_file(const char *tmpfile, const char *filename); extern int has_sha1_pack(const unsigned char *sha1); +/* + * Open the loose object at path, check its sha1, and return the contents, + * type, and size. If the object is a blob, then "contents" may return NULL, + * to allow streaming of large blobs. + * + * Returns 0 on success, negative on error (details may be written to stderr). + */ +int read_loose_object(const char *path, + const unsigned char *expected_sha1, + enum object_type *type, + unsigned long *size, + void **contents); + /* * Return true iff we have an object named sha1, whether local or in * an alternate object database, and whether packed or loose. This diff --git a/sha1_file.c b/sha1_file.c index 6c8e4b43dd..ca1c90e5d3 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -1670,13 +1670,21 @@ static int open_sha1_file(const unsigned char *sha1, const char **path) return -1; } -void *map_sha1_file(const unsigned char *sha1, unsigned long *size) +/* + * Map the loose object at "path" if it is not NULL, or the path found by + * searching for a loose object named "sha1". + */ +static void *map_sha1_file_1(const char *path, + const unsigned char *sha1, + unsigned long *size) { - const char *path; void *map; int fd; - fd = open_sha1_file(sha1, &path); + if (path) + fd = git_open(path); + else + fd = open_sha1_file(sha1, &path); map = NULL; if (fd >= 0) { struct stat st; @@ -1695,6 +1703,11 @@ void *map_sha1_file(const unsigned char *sha1, unsigned long *size) return map; } +void *map_sha1_file(const unsigned char *sha1, unsigned long *size) +{ + return map_sha1_file_1(NULL, sha1, size); +} + unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep) { @@ -3792,3 +3805,117 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags) } return r ? r : pack_errors; } + +static int check_stream_sha1(git_zstream *stream, + const char *hdr, + unsigned long size, + const char *path, + const unsigned char *expected_sha1) +{ + git_SHA_CTX c; + unsigned char real_sha1[GIT_SHA1_RAWSZ]; + unsigned char buf[4096]; + unsigned long total_read; + int status = Z_OK; + + git_SHA1_Init(&c); + git_SHA1_Update(&c, hdr, stream->total_out); + + /* + * We already read some bytes into hdr, but the ones up to the NUL + * do not count against the object's content size. + */ + total_read = stream->total_out - strlen(hdr) - 1; + + /* + * This size comparison must be "<=" to read the final zlib packets; + * see the comment in unpack_sha1_rest for details. + */ + while (total_read <= size && + (status == Z_OK || status == Z_BUF_ERROR)) { + stream->next_out = buf; + stream->avail_out = sizeof(buf); + if (size - total_read < stream->avail_out) + stream->avail_out = size - total_read; + status = git_inflate(stream, Z_FINISH); + git_SHA1_Update(&c, buf, stream->next_out - buf); + total_read += stream->next_out - buf; + } + git_inflate_end(stream); + + if (status != Z_STREAM_END) { + error("corrupt loose object '%s'", sha1_to_hex(expected_sha1)); + return -1; + } + + git_SHA1_Final(real_sha1, &c); + if (hashcmp(expected_sha1, real_sha1)) { + error("sha1 mismatch for %s (expected %s)", path, + sha1_to_hex(expected_sha1)); + return -1; + } + + return 0; +} + +int read_loose_object(const char *path, + const unsigned char *expected_sha1, + enum object_type *type, + unsigned long *size, + void **contents) +{ + int ret = -1; + int fd = -1; + void *map = NULL; + unsigned long mapsize; + git_zstream stream; + char hdr[32]; + + *contents = NULL; + + map = map_sha1_file_1(path, NULL, &mapsize); + if (!map) { + error_errno("unable to mmap %s", path); + goto out; + } + + if (unpack_sha1_header(&stream, map, mapsize, hdr, sizeof(hdr)) < 0) { + error("unable to unpack header of %s", path); + goto out; + } + + *type = parse_sha1_header(hdr, size); + if (*type < 0) { + error("unable to parse header of %s", path); + git_inflate_end(&stream); + goto out; + } + + if (*type == OBJ_BLOB) { + if (check_stream_sha1(&stream, hdr, *size, path, expected_sha1) < 0) + goto out; + } else { + *contents = unpack_sha1_rest(&stream, hdr, *size, expected_sha1); + if (!*contents) { + error("unable to unpack contents of %s", path); + git_inflate_end(&stream); + goto out; + } + if (check_sha1_signature(expected_sha1, *contents, + *size, typename(*type))) { + error("sha1 mismatch for %s (expected %s)", path, + sha1_to_hex(expected_sha1)); + free(*contents); + goto out; + } + } + + ret = 0; /* everything checks out */ + +out: + if (map) + munmap(map, mapsize); + if (fd >= 0) + close(fd); + return ret; +} From c68b489e56431cf27f7719913ab09ddc62f95912 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 12:59:44 -0500 Subject: [PATCH 5/6] fsck: parse loose object paths directly When we iterate over the list of loose objects to check, we get the actual path of each object. But we then throw it away and pass just the sha1 to fsck_sha1(), which will do a fresh lookup. Usually it would find the same object, but it may not if an object exists both as a loose and a packed object. We may end up checking the packed object twice, and never look at the loose one. In practice this isn't too terrible, because if fsck doesn't complain, it means you have at least one good copy. But since the point of fsck is to look for corruption, we should be thorough. The new read_loose_object() interface can help us get the data from disk, and then we replace parse_object() with parse_object_buffer(). As a bonus, our error messages now mention the path to a corrupted object, which should make it easier to track down errors when they do happen. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/fsck.c | 46 +++++++++++++++++++++++++++++++++------------- t/t1450-fsck.sh | 16 ++++++++++++++++ 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/builtin/fsck.c b/builtin/fsck.c index f01b81eebf..4b91ee95e6 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -362,18 +362,6 @@ static int fsck_obj(struct object *obj) return 0; } -static int fsck_sha1(const unsigned char *sha1) -{ - struct object *obj = parse_object(sha1); - if (!obj) { - errors_found |= ERROR_OBJECT; - return error("%s: object corrupt or missing", - sha1_to_hex(sha1)); - } - obj->flags |= HAS_OBJ; - return fsck_obj(obj); -} - static int fsck_obj_buffer(const unsigned char *sha1, enum object_type type, unsigned long size, void *buffer, int *eaten) { @@ -488,9 +476,41 @@ static void get_default_heads(void) } } +static struct object *parse_loose_object(const unsigned char *sha1, + const char *path) +{ + struct object *obj; + void *contents; + enum object_type type; + unsigned long size; + int eaten; + + if (read_loose_object(path, sha1, &type, &size, &contents) < 0) + return NULL; + + if (!contents && type != OBJ_BLOB) + die("BUG: read_loose_object streamed a non-blob"); + + obj = parse_object_buffer(sha1, type, size, contents, &eaten); + + if (!eaten) + free(contents); + return obj; +} + static int fsck_loose(const unsigned char *sha1, const char *path, void *data) { - if (fsck_sha1(sha1)) + struct object *obj = parse_loose_object(sha1, path); + + if (!obj) { + errors_found |= ERROR_OBJECT; + error("%s: object corrupt or missing: %s", + sha1_to_hex(sha1), path); + return 0; /* keep checking other objects */ + } + + obj->flags = HAS_OBJ; + if (fsck_obj(obj)) errors_found |= ERROR_OBJECT; return 0; } diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index c39d421209..455c186fe2 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -581,4 +581,20 @@ test_expect_success 'fsck errors in packed objects' ' ! grep corrupt out ' +test_expect_success 'fsck finds problems in duplicate loose objects' ' + rm -rf broken-duplicate && + git init broken-duplicate && + ( + cd broken-duplicate && + test_commit duplicate && + # no "-d" here, so we end up with duplicates + git repack && + # now corrupt the loose copy + file=$(sha1_file "$(git rev-parse HEAD)") && + rm "$file" && + echo broken >"$file" && + test_must_fail git fsck + ) +' + test_done From cce044df7f2392d0c6cb21d6dca94f01ff838727 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 13 Jan 2017 13:00:25 -0500 Subject: [PATCH 6/6] fsck: detect trailing garbage in all object types When a loose tree or commit is read by fsck (or any git program), unpack_sha1_rest() checks whether there is extra cruft at the end of the object file, after the zlib data. Blobs that are streamed, however, do not have this check. For normal git operations, it's not a big deal. We know the sha1 and size checked out, so we have the object bytes we wanted. The trailing garbage doesn't affect what we're trying to do. But since the point of fsck is to find corruption or other problems, it should be more thorough. This patch teaches its loose-sha1 reader to detect extra bytes after the zlib stream and complain. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- sha1_file.c | 5 +++++ t/t1450-fsck.sh | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/sha1_file.c b/sha1_file.c index ca1c90e5d3..b2c6648085 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -3847,6 +3847,11 @@ static int check_stream_sha1(git_zstream *stream, error("corrupt loose object '%s'", sha1_to_hex(expected_sha1)); return -1; } + if (stream->avail_in) { + error("garbage at end of loose object '%s'", + sha1_to_hex(expected_sha1)); + return -1; + } git_SHA1_Final(real_sha1, &c); if (hashcmp(expected_sha1, real_sha1)) { diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 455c186fe2..8975b4d1bc 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -597,4 +597,26 @@ test_expect_success 'fsck finds problems in duplicate loose objects' ' ) ' +test_expect_success 'fsck detects trailing loose garbage (commit)' ' + git cat-file commit HEAD >basis && + echo bump-commit-sha1 >>basis && + commit=$(git hash-object -w -t commit basis) && + file=$(sha1_file $commit) && + test_when_finished "remove_object $commit" && + chmod +w "$file" && + echo garbage >>"$file" && + test_must_fail git fsck 2>out && + test_i18ngrep "garbage.*$commit" out +' + +test_expect_success 'fsck detects trailing loose garbage (blob)' ' + blob=$(echo trailing | git hash-object -w --stdin) && + file=$(sha1_file $blob) && + test_when_finished "remove_object $blob" && + chmod +w "$file" && + echo garbage >>"$file" && + test_must_fail git fsck 2>out && + test_i18ngrep "garbage.*$blob" out +' + test_done