|
|
|
/*
 * csum-file.c
 *
 * Copyright (C) 2005 Linus Torvalds
 *
 * Simple file write infrastructure for writing SHA1-summed
 * files. Useful when you write a file that you want to be
 * able to verify hasn't been messed with afterwards.
 */
|
|
|
|
#include "cache.h"
|
|
|
|
#include "progress.h"
|
|
|
|
#include "csum-file.h"
|
|
|
|
|
|
|
|
static void flush(struct hashfile *f, const void *buf, unsigned int count)
|
|
|
|
{
|
|
|
|
if (0 <= f->check_fd && count) {
|
|
|
|
unsigned char check_buffer[8192];
|
|
|
|
ssize_t ret = read_in_full(f->check_fd, check_buffer, count);
|
|
|
|
|
|
|
|
if (ret < 0)
|
|
|
|
die_errno("%s: sha1 file read error", f->name);
|
|
|
|
if (ret != count)
|
|
|
|
die("%s: sha1 file truncated", f->name);
|
|
|
|
if (memcmp(buf, check_buffer, count))
|
|
|
|
die("sha1 file '%s' validation error", f->name);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
int ret = xwrite(f->fd, buf, count);
|
|
|
|
if (ret > 0) {
|
|
|
|
f->total += ret;
|
|
|
|
display_throughput(f->tp, f->total);
|
|
|
|
buf = (char *) buf + ret;
|
|
|
|
count -= ret;
|
|
|
|
if (count)
|
|
|
|
continue;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (!ret)
|
|
|
|
die("sha1 file '%s' write error. Out of diskspace", f->name);
|
|
|
|
die_errno("sha1 file '%s' write error", f->name);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void hashflush(struct hashfile *f)
|
|
|
|
{
|
|
|
|
unsigned offset = f->offset;
|
|
|
|
|
|
|
|
if (offset) {
|
|
|
|
the_hash_algo->update_fn(&f->ctx, f->buffer, offset);
|
|
|
|
flush(f, f->buffer, offset);
|
|
|
|
f->offset = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int finalize_hashfile(struct hashfile *f, unsigned char *result, unsigned int flags)
|
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
hashflush(f);
|
|
|
|
the_hash_algo->final_fn(f->buffer, &f->ctx);
|
|
|
|
if (result)
|
|
|
|
hashcpy(result, f->buffer);
|
|
|
|
if (flags & CSUM_HASH_IN_STREAM)
|
|
|
|
flush(f, f->buffer, the_hash_algo->rawsz);
|
|
|
|
if (flags & CSUM_FSYNC)
|
|
|
|
fsync_or_die(f->fd, f->name);
|
|
|
|
if (flags & CSUM_CLOSE) {
|
|
|
|
if (close(f->fd))
|
|
|
|
die_errno("%s: sha1 file error on close", f->name);
|
|
|
|
fd = 0;
|
|
|
|
} else
|
|
|
|
fd = f->fd;
|
|
|
|
if (0 <= f->check_fd) {
|
|
|
|
char discard;
|
|
|
|
int cnt = read_in_full(f->check_fd, &discard, 1);
|
|
|
|
if (cnt < 0)
|
|
|
|
die_errno("%s: error when reading the tail of sha1 file",
|
|
|
|
f->name);
|
|
|
|
if (cnt)
|
|
|
|
die("%s: sha1 file has trailing garbage", f->name);
|
|
|
|
if (close(f->check_fd))
|
|
|
|
die_errno("%s: sha1 file error on close", f->name);
|
|
|
|
}
|
|
|
|
free(f);
|
|
|
|
return fd;
|
|
|
|
}
|
|
|
|
|
|
|
|
void hashwrite(struct hashfile *f, const void *buf, unsigned int count)
|
|
|
|
{
|
|
|
|
while (count) {
|
|
|
|
unsigned offset = f->offset;
|
|
|
|
unsigned left = sizeof(f->buffer) - offset;
|
|
|
|
unsigned nr = count > left ? left : count;
|
|
|
|
const void *data;
|
|
|
|
|
|
|
|
if (f->do_crc)
|
|
|
|
f->crc32 = crc32(f->crc32, buf, nr);
|
|
|
|
|
|
|
|
if (nr == sizeof(f->buffer)) {
|
|
|
|
/* process full buffer directly without copy */
|
|
|
|
data = buf;
|
|
|
|
} else {
|
|
|
|
memcpy(f->buffer + offset, buf, nr);
|
|
|
|
data = f->buffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
count -= nr;
|
|
|
|
offset += nr;
|
|
|
|
buf = (char *) buf + nr;
|
|
|
|
left -= nr;
|
|
|
|
if (!left) {
|
|
|
|
the_hash_algo->update_fn(&f->ctx, data, offset);
|
|
|
|
flush(f, data, offset);
|
|
|
|
offset = 0;
|
|
|
|
}
|
|
|
|
f->offset = offset;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Create a hashfile writing to `fd`, with no progress meter attached
 * (hashfd_throughput() called with a NULL progress pointer).
 */
struct hashfile *hashfd(int fd, const char *name)
{
	struct hashfile *created = hashfd_throughput(fd, name, NULL);

	return created;
}
|
|
|
|
|
|
|
|
struct hashfile *hashfd_check(const char *name)
|
|
|
|
{
|
|
|
|
int sink, check;
|
|
|
|
struct hashfile *f;
|
|
|
|
|
|
|
|
sink = open("/dev/null", O_WRONLY);
|
|
|
|
if (sink < 0)
|
|
|
|
die_errno("unable to open /dev/null");
|
|
|
|
check = open(name, O_RDONLY);
|
|
|
|
if (check < 0)
|
|
|
|
die_errno("unable to open '%s'", name);
|
|
|
|
f = hashfd(sink, name);
|
|
|
|
f->check_fd = check;
|
|
|
|
return f;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct hashfile *hashfd_throughput(int fd, const char *name, struct progress *tp)
|
|
|
|
{
|
|
|
|
struct hashfile *f = xmalloc(sizeof(*f));
|
|
|
|
f->fd = fd;
|
|
|
|
f->check_fd = -1;
|
|
|
|
f->offset = 0;
|
|
|
|
f->total = 0;
|
|
|
|
f->tp = tp;
|
|
|
|
f->name = name;
|
compute a CRC32 for each object as stored in a pack
The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.
The problem with this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object. This is a
real issue that already happened at least once in the past.
In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib. But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.
Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.
So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index. This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.
Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia:
Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
short messages. He wrote "Briefly, the problem is that, for very short
packets, Adler32 is guaranteed to give poor coverage of the available
bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
that sum A does not wrap for short messages. The maximum value of A for
a 128-byte message is 32640, which is below the value 65521 used by the
modulo operation. An extended explanation can be found in RFC 3309,
which mandates the use of CRC32 instead of Adler-32 for SCTP, the
Stream Control Transmission Protocol.
In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is dimed not to be sufficient. Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.
OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects. It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
18 years ago
|
|
|
f->do_crc = 0;
|
|
|
|
the_hash_algo->init_fn(&f->ctx);
|
|
|
|
return f;
|
|
|
|
}
|
|
|
|
|
|
|
|
void hashfile_checkpoint(struct hashfile *f, struct hashfile_checkpoint *checkpoint)
|
|
|
|
{
|
|
|
|
hashflush(f);
|
|
|
|
checkpoint->offset = f->total;
|
|
|
|
checkpoint->ctx = f->ctx;
|
|
|
|
}
|
|
|
|
|
|
|
|
int hashfile_truncate(struct hashfile *f, struct hashfile_checkpoint *checkpoint)
|
|
|
|
{
|
|
|
|
off_t offset = checkpoint->offset;
|
|
|
|
|
|
|
|
if (ftruncate(f->fd, offset) ||
|
|
|
|
lseek(f->fd, offset, SEEK_SET) != offset)
|
|
|
|
return -1;
|
|
|
|
f->total = offset;
|
|
|
|
f->ctx = checkpoint->ctx;
|
|
|
|
f->offset = 0; /* hashflush() was called in checkpoint */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void crc32_begin(struct hashfile *f)
|
compute a CRC32 for each object as stored in a pack
The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.
The problem with this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object. This is a
real issue that already happened at least once in the past.
In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib. But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.
Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.
So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index. This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.
Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia:
Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
short messages. He wrote "Briefly, the problem is that, for very short
packets, Adler32 is guaranteed to give poor coverage of the available
bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
that sum A does not wrap for short messages. The maximum value of A for
a 128-byte message is 32640, which is below the value 65521 used by the
modulo operation. An extended explanation can be found in RFC 3309,
which mandates the use of CRC32 instead of Adler-32 for SCTP, the
Stream Control Transmission Protocol.
In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is dimed not to be sufficient. Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.
OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects. It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
18 years ago
|
|
|
{
|
|
|
|
f->crc32 = crc32(0, NULL, 0);
|
compute a CRC32 for each object as stored in a pack
The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.
The problem with this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object. This is a
real issue that already happened at least once in the past.
In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib. But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.
Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.
So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index. This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.
Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia:
Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
short messages. He wrote "Briefly, the problem is that, for very short
packets, Adler32 is guaranteed to give poor coverage of the available
bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
that sum A does not wrap for short messages. The maximum value of A for
a 128-byte message is 32640, which is below the value 65521 used by the
modulo operation. An extended explanation can be found in RFC 3309,
which mandates the use of CRC32 instead of Adler-32 for SCTP, the
Stream Control Transmission Protocol.
In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is dimed not to be sufficient. Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.
OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects. It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
18 years ago
|
|
|
f->do_crc = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t crc32_end(struct hashfile *f)
|
compute a CRC32 for each object as stored in a pack
The most important optimization for performance when repacking is the
ability to reuse data from a previous pack as is and bypass any delta
or even SHA1 computation by simply copying the raw data from one pack
to another directly.
The problem with this is that any data corruption within a copied object
would go unnoticed and the new (repacked) pack would be self-consistent
with its own checksum despite containing a corrupted object. This is a
real issue that already happened at least once in the past.
In some attempt to prevent this, we validate the copied data by inflating
it and making sure no error is signaled by zlib. But this is still not
perfect as a significant portion of a pack content is made of object
headers and references to delta base objects which are not deflated and
therefore not validated when repacking actually making the pack data reuse
still not as safe as it could be.
Of course a full SHA1 validation could be performed, but that implies
full data inflating and delta replaying which is extremely costly, which
cost the data reuse optimization was designed to avoid in the first place.
So the best solution to this is simply to store a CRC32 of the raw pack
data for each object in the pack index. This way any object in a pack can
be validated before being copied as is in another pack, including header
and any other non deflated data.
Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia:
Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very
short messages. He wrote "Briefly, the problem is that, for very short
packets, Adler32 is guaranteed to give poor coverage of the available
bits. Don't take my word for it, ask Mark Adler. :-)" The problem is
that sum A does not wrap for short messages. The maximum value of A for
a 128-byte message is 32640, which is below the value 65521 used by the
modulo operation. An extended explanation can be found in RFC 3309,
which mandates the use of CRC32 instead of Adler-32 for SCTP, the
Stream Control Transmission Protocol.
In the context of a GIT pack, we have lots of small objects, especially
deltas, which are likely to be quite small and in a size range for which
Adler32 is dimed not to be sufficient. Another advantage of CRC32 is the
possibility for recovery from certain types of small corruptions like
single bit errors which are the most probable type of corruptions.
OK what this patch does is to compute the CRC32 of each object written to
a pack within pack-objects. It is not written to the index yet and it is
obviously not validated when reusing pack data yet either.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
18 years ago
|
|
|
{
|
|
|
|
f->do_crc = 0;
|
|
|
|
return f->crc32;
|
|
|
|
}
|