Merge branch 'ps/reftable-optimize-io'
Low-level I/O optimization for reftable.

* ps/reftable-optimize-io:
  reftable/stack: fix race in up-to-date check
  reftable/stack: unconditionally reload stack after commit
  reftable/blocksource: use mmap to read tables
  reftable/blocksource: refactor code to match our coding style
  reftable/stack: use stat info to avoid re-reading stack list
  reftable/stack: refactor reloading to use file descriptor
  reftable/stack: refactor stack reloading to have common exit path

commit 4d5a46ecb1
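The blocksource side of this series replaces per-block pread(2) calls with a single read-only mmap of the whole table file, as the first diff below shows. As a rough standalone illustration of that pattern (this is not the reftable code; the mmap_source struct and function names are made up for the example), note that the file descriptor can be closed right after mapping, because POSIX keeps an existing mapping valid after close(2):

#include <assert.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* Illustrative stand-in for a file-backed block source. */
struct mmap_source {
        uint64_t size;
        unsigned char *data;
};

/* Map the whole file read-only; the descriptor is not needed afterwards. */
static int mmap_source_open(struct mmap_source *src, const char *path)
{
        struct stat st;
        int fd;

        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -1;
        if (fstat(fd, &st) < 0) {
                close(fd);
                return -1;
        }

        src->size = st.st_size;
        src->data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        close(fd);
        return src->data == MAP_FAILED ? -1 : 0;
}

/* Reading a block is now just pointer arithmetic into the mapping. */
static const unsigned char *mmap_source_block(struct mmap_source *src,
                                              uint64_t off, uint64_t len)
{
        assert(off + len <= src->size);
        return src->data + off;
}

static void mmap_source_close(struct mmap_source *src)
{
        munmap(src->data, src->size);
}

int main(int argc, char **argv)
{
        struct mmap_source src;

        if (argc != 2 || mmap_source_open(&src, argv[1]) < 0)
                return 1;
        printf("%llu bytes, first byte 0x%02x\n",
               (unsigned long long)src.size, *mmap_source_block(&src, 0, 1));
        mmap_source_close(&src);
        return 0;
}

Handing out pointers into the mapping is what lets file_return_block() in the diff below become a no-op: there is no per-block allocation left to free.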
reftable/blocksource.c

@@ -76,8 +76,8 @@ struct reftable_block_source malloc_block_source(void)
 }
 
 struct file_block_source {
-        int fd;
         uint64_t size;
+        unsigned char *data;
 };
 
 static uint64_t file_size(void *b)
@@ -87,19 +87,12 @@ static uint64_t file_size(void *b)
 
 static void file_return_block(void *b, struct reftable_block *dest)
 {
-        if (dest->len)
-                memset(dest->data, 0xff, dest->len);
-        reftable_free(dest->data);
 }
 
-static void file_close(void *b)
+static void file_close(void *v)
 {
-        int fd = ((struct file_block_source *)b)->fd;
+        struct file_block_source *b = v;
 
-        if (fd > 0) {
-                close(fd);
-                ((struct file_block_source *)b)->fd = 0;
-        }
-
+        munmap(b->data, b->size);
         reftable_free(b);
 }
@@ -108,9 +101,7 @@ static int file_read_block(void *v, struct reftable_block *dest, uint64_t off,
 {
         struct file_block_source *b = v;
         assert(off + size <= b->size);
-        dest->data = reftable_malloc(size);
-        if (pread_in_full(b->fd, dest->data, size, off) != size)
-                return -1;
+        dest->data = b->data + off;
         dest->len = size;
         return size;
 }
@@ -125,26 +116,26 @@ static struct reftable_block_source_vtable file_vtable = {
 int reftable_block_source_from_file(struct reftable_block_source *bs,
                                     const char *name)
 {
-        struct stat st = { 0 };
-        int err = 0;
-        int fd = open(name, O_RDONLY);
-        struct file_block_source *p = NULL;
+        struct file_block_source *p;
+        struct stat st;
+        int fd;
+
+        fd = open(name, O_RDONLY);
         if (fd < 0) {
-                if (errno == ENOENT) {
+                if (errno == ENOENT)
                         return REFTABLE_NOT_EXIST_ERROR;
-                }
                 return -1;
         }
 
-        err = fstat(fd, &st);
-        if (err < 0) {
+        if (fstat(fd, &st) < 0) {
                 close(fd);
                 return REFTABLE_IO_ERROR;
         }
 
-        p = reftable_calloc(sizeof(struct file_block_source));
+        p = reftable_calloc(sizeof(*p));
         p->size = st.st_size;
-        p->fd = fd;
+        p->data = xmmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+        close(fd);
 
         assert(!bs->ops);
         bs->ops = &file_vtable;
reftable/stack.c (204 lines changed)

@@ -66,6 +66,7 @@ int reftable_new_stack(struct reftable_stack **dest, const char *dir,
         strbuf_addstr(&list_file_name, "/tables.list");
 
         p->list_file = strbuf_detach(&list_file_name, NULL);
+        p->list_fd = -1;
         p->reftable_dir = xstrdup(dir);
         p->config = config;
 
@@ -175,6 +176,12 @@ void reftable_stack_destroy(struct reftable_stack *st)
                 st->readers_len = 0;
                 FREE_AND_NULL(st->readers);
         }
+
+        if (st->list_fd >= 0) {
+                close(st->list_fd);
+                st->list_fd = -1;
+        }
+
         FREE_AND_NULL(st->list_file);
         FREE_AND_NULL(st->reftable_dir);
         reftable_free(st);
@@ -304,69 +311,134 @@ static int tv_cmp(struct timeval *a, struct timeval *b)
 static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
                                              int reuse_open)
 {
-        struct timeval deadline = { 0 };
-        int err = gettimeofday(&deadline, NULL);
+        char **names = NULL, **names_after = NULL;
+        struct timeval deadline;
         int64_t delay = 0;
-        int tries = 0;
+        int tries = 0, err;
+        int fd = -1;
+
+        err = gettimeofday(&deadline, NULL);
         if (err < 0)
-                return err;
+                goto out;
 
         deadline.tv_sec += 3;
 
         while (1) {
-                char **names = NULL;
-                char **names_after = NULL;
-                struct timeval now = { 0 };
-                int err = gettimeofday(&now, NULL);
-                int err2 = 0;
-                if (err < 0) {
-                        return err;
-                }
-
-                /* Only look at deadlines after the first few times. This
-                   simplifies debugging in GDB */
-                tries++;
-                if (tries > 3 && tv_cmp(&now, &deadline) >= 0) {
-                        break;
-                }
-
-                err = read_lines(st->list_file, &names);
-                if (err < 0) {
-                        free_names(names);
-                        return err;
-                }
-                err = reftable_stack_reload_once(st, names, reuse_open);
-                if (err == 0) {
-                        free_names(names);
-                        break;
-                }
-                if (err != REFTABLE_NOT_EXIST_ERROR) {
-                        free_names(names);
-                        return err;
-                }
-
-                /* err == REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
-                   writer. Check if there was one by checking if the name list
-                   changed.
-                */
-                err2 = read_lines(st->list_file, &names_after);
-                if (err2 < 0) {
-                        free_names(names);
-                        return err2;
-                }
-
-                if (names_equal(names_after, names)) {
-                        free_names(names);
-                        free_names(names_after);
-                        return err;
+                struct timeval now;
+
+                err = gettimeofday(&now, NULL);
+                if (err < 0)
+                        goto out;
+
+                /*
+                 * Only look at deadlines after the first few times. This
+                 * simplifies debugging in GDB.
+                 */
+                tries++;
+                if (tries > 3 && tv_cmp(&now, &deadline) >= 0)
+                        goto out;
+
+                fd = open(st->list_file, O_RDONLY);
+                if (fd < 0) {
+                        if (errno != ENOENT) {
+                                err = REFTABLE_IO_ERROR;
+                                goto out;
+                        }
+
+                        names = reftable_calloc(sizeof(char *));
+                } else {
+                        err = fd_read_lines(fd, &names);
+                        if (err < 0)
+                                goto out;
                 }
+
+                err = reftable_stack_reload_once(st, names, reuse_open);
+                if (!err)
+                        break;
+                if (err != REFTABLE_NOT_EXIST_ERROR)
+                        goto out;
+
+                /*
+                 * REFTABLE_NOT_EXIST_ERROR can be caused by a concurrent
+                 * writer. Check if there was one by checking if the name list
+                 * changed.
+                 */
+                err = read_lines(st->list_file, &names_after);
+                if (err < 0)
+                        goto out;
+                if (names_equal(names_after, names)) {
+                        err = REFTABLE_NOT_EXIST_ERROR;
+                        goto out;
+                }
+
                 free_names(names);
+                names = NULL;
                 free_names(names_after);
+                names_after = NULL;
+                close(fd);
+                fd = -1;
 
                 delay = delay + (delay * rand()) / RAND_MAX + 1;
                 sleep_millisec(delay);
         }
 
-        return 0;
+out:
+        /*
+         * Invalidate the stat cache. It is sufficient to only close the file
+         * descriptor and keep the cached stat info because we never use the
+         * latter when the former is negative.
+         */
+        if (st->list_fd >= 0) {
+                close(st->list_fd);
+                st->list_fd = -1;
+        }
+
+        /*
+         * Cache stat information in case it provides a useful signal to us.
+         * According to POSIX, "The st_ino and st_dev fields taken together
+         * uniquely identify the file within the system." That being said,
+         * Windows is not POSIX compliant and we do not have these fields
+         * available. So the information we have there is insufficient to
+         * determine whether two file descriptors point to the same file.
+         *
+         * While we could fall back to using other signals like the file's
+         * mtime, those are not sufficient to avoid races. We thus refrain from
+         * using the stat cache on such systems and fall back to the secondary
+         * caching mechanism, which is to check whether contents of the file
+         * have changed.
+         *
+         * On other systems which are POSIX compliant we must keep the file
+         * descriptor open. This is to avoid a race condition where two
+         * processes access the reftable stack at the same point in time:
+         *
+         *   1. A reads the reftable stack and caches its stat info.
+         *
+         *   2. B updates the stack, appending a new table to "tables.list".
+         *      This will both use a new inode and result in a different file
+         *      size, thus invalidating A's cache in theory.
+         *
+         *   3. B decides to auto-compact the stack and merges two tables. The
+         *      file size now matches what A has cached again. Furthermore, the
+         *      filesystem may decide to recycle the inode number of the file
+         *      we have replaced in (2) because it is not in use anymore.
+         *
+         *   4. A reloads the reftable stack. Neither the inode number nor the
+         *      file size changed. If the timestamps did not change either then
+         *      we think the cached copy of our stack is up-to-date.
+         *
+         * By keeping the file descriptor open the inode number cannot be
+         * recycled, mitigating the race.
+         */
+        if (!err && fd >= 0 && !fstat(fd, &st->list_st) &&
+            st->list_st.st_dev && st->list_st.st_ino) {
+                st->list_fd = fd;
+                fd = -1;
+        }
+
+        if (fd >= 0)
+                close(fd);
+        free_names(names);
+        free_names(names_after);
+        return err;
 }
 
 /* -1 = error
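The comment in the new common exit path above explains why stat information is cached together with an open descriptor for "tables.list": the (st_dev, st_ino) pair identifies the file, and keeping the descriptor open prevents the inode number from being recycled. As a rough standalone sketch of that identity check (illustrative only, assuming a "tables.list" file in the current directory; this is not the reftable implementation):

#include <stdio.h>
#include <sys/stat.h>

/*
 * Compare cached stat info against a fresh stat(2) of the path. Because the
 * list file is only ever replaced wholesale, an unchanged (st_dev, st_ino)
 * pair means the contents cannot have changed either.
 */
static int list_file_changed(const struct stat *cached, const char *path)
{
        struct stat now;

        if (stat(path, &now) < 0)
                return 1; /* gone or unreadable: treat as changed */

        return cached->st_dev != now.st_dev ||
               cached->st_ino != now.st_ino;
}

int main(void)
{
        struct stat cached;

        if (stat("tables.list", &cached) < 0)
                return 1;
        /* ... time passes, another process may rename a new file over it ... */
        printf("changed: %d\n", list_file_changed(&cached, "tables.list"));
        return 0;
}

The actual check added to stack_uptodate() follows in the next hunk.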
@@ -375,8 +447,44 @@ static int reftable_stack_reload_maybe_reuse(struct reftable_stack *st,
 static int stack_uptodate(struct reftable_stack *st)
 {
         char **names = NULL;
-        int err = read_lines(st->list_file, &names);
+        int err;
         int i = 0;
+
+        /*
+         * When we have cached stat information available then we use it to
+         * verify whether the file has been rewritten.
+         *
+         * Note that we explicitly do not want to use `stat_validity_check()`
+         * and friends here because they may end up not comparing the `st_dev`
+         * and `st_ino` fields. These functions thus cannot guarantee that we
+         * indeed still have the same file.
+         */
+        if (st->list_fd >= 0) {
+                struct stat list_st;
+
+                if (stat(st->list_file, &list_st) < 0) {
+                        /*
+                         * It's fine for "tables.list" to not exist. In that
+                         * case, we have to refresh when the loaded stack has
+                         * any readers.
+                         */
+                        if (errno == ENOENT)
+                                return !!st->readers_len;
+                        return REFTABLE_IO_ERROR;
+                }
+
+                /*
+                 * When "tables.list" refers to the same file we can assume
+                 * that it didn't change. This is because we always use
+                 * rename(3P) to update the file and never write to it
+                 * directly.
+                 */
+                if (st->list_st.st_dev == list_st.st_dev &&
+                    st->list_st.st_ino == list_st.st_ino)
+                        return 0;
+        }
+
+        err = read_lines(st->list_file, &names);
         if (err < 0)
                 return err;
 
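The same-file shortcut above is only safe because "tables.list" is never rewritten in place: updates go through a temporary file that is renamed over it, so a surviving (st_dev, st_ino) pair implies unchanged contents. A rough sketch of such an atomic replacement (the helper name, ".lock" suffix, and table name here are illustrative, not the reftable implementation):

#include <stdio.h>
#include <unistd.h>

/*
 * Write the new list to a temporary file and rename(2) it over the target.
 * Readers see either the old file or the new one, never a partial write,
 * and every update produces a fresh inode.
 */
static int write_list_atomically(const char *path, const char *contents)
{
        char tmp[4096];
        FILE *f;

        snprintf(tmp, sizeof(tmp), "%s.lock", path);

        f = fopen(tmp, "w");
        if (!f)
                return -1;
        if (fputs(contents, f) == EOF) {
                fclose(f);
                unlink(tmp);
                return -1;
        }
        if (fclose(f) == EOF) {
                unlink(tmp);
                return -1;
        }

        return rename(tmp, path);
}

int main(void)
{
        /* Publish a single, hypothetical table name. */
        return write_list_atomically("tables.list", "table-00000001.ref\n") ? 1 : 0;
}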
@@ -559,7 +667,7 @@ int reftable_addition_commit(struct reftable_addition *add)
         add->new_tables = NULL;
         add->new_tables_len = 0;
 
-        err = reftable_stack_reload(add->stack);
+        err = reftable_stack_reload_maybe_reuse(add->stack, 1);
         if (err)
                 goto done;
 
reftable/stack.h

@@ -14,7 +14,10 @@ https://developers.google.com/open-source/licenses/bsd
 #include "reftable-stack.h"
 
 struct reftable_stack {
+        struct stat list_st;
         char *list_file;
+        int list_fd;
+
         char *reftable_dir;
         int disable_auto_compact;
 