Merge branch 'ds/sparse-lstat-caching'

The code to deal with modified paths that are out-of-cone in a
sparsely checked out working tree has been optimized.

* ds/sparse-lstat-caching:
  sparse-index: improve lstat caching of sparse paths
  sparse-index: count lstat() calls
  sparse-index: use strbuf in path_found()
  sparse-index: refactor path_found()
  sparse-checkout: refactor skip worktree retry logic
maint
Junio C Hamano 2024-07-08 14:53:11 -07:00
commit a43b001cce
1 changed files with 164 additions and 52 deletions

View File

@ -439,96 +439,208 @@ void ensure_correct_sparsity(struct index_state *istate)
ensure_full_index(istate);
}

static int path_found(const char *path, const char **dirname, size_t *dir_len,
int *dir_found)
struct path_found_data {
/**
* The path stored in 'dir', if non-empty, corresponds to the most-
* recent path that we checked where:
*
* 1. The path should be a directory, according to the index.
* 2. The path does not exist.
* 3. The parent path _does_ exist. (This may be the root of the
* working directory.)
*/
struct strbuf dir;
size_t lstat_count;
};

#define PATH_FOUND_DATA_INIT { \
.dir = STRBUF_INIT \
}

static void clear_path_found_data(struct path_found_data *data)
{
strbuf_release(&data->dir);
}

/**
* Return the length of the longest common substring that ends in a
* slash ('/') to indicate the longest common parent directory. Returns
* zero if no common directory exists.
*/
static size_t max_common_dir_prefix(const char *path1, const char *path2)
{
size_t common_prefix = 0;
for (size_t i = 0; path1[i] && path2[i]; i++) {
if (path1[i] != path2[i])
break;

/*
* If they agree at a directory separator, then add one
* to make sure it is included in the common prefix string.
*/
if (path1[i] == '/')
common_prefix = i + 1;
}

return common_prefix;
}

static int path_found(const char *path, struct path_found_data *data)
{
struct stat st;
char *newdir;
char *tmp;
size_t common_prefix;

/*
* If dirname corresponds to a directory that doesn't exist, and this
* path starts with dirname, then path can't exist.
* If data->dir is non-empty, then it contains a path that doesn't
* exist, including an ending slash ('/'). If it is a prefix of 'path',
* then we can return 0.
*/
if (!*dir_found && !memcmp(path, *dirname, *dir_len))
if (data->dir.len && !memcmp(path, data->dir.buf, data->dir.len))
return 0;

/*
* If path itself exists, return 1.
* Otherwise, we must check if the current path exists. If it does, then
* return 1. The cached directory will be skipped until we come across
* a missing path again.
*/
data->lstat_count++;
if (!lstat(path, &st))
return 1;

/*
* Otherwise, path does not exist so we'll return 0...but we'll first
* determine some info about its parent directory so we can avoid
* lstat calls for future cache entries.
* At this point, we know that 'path' doesn't exist, and we know that
* the parent directory of 'data->dir' does exist. Let's set 'data->dir'
* to be the top-most non-existing directory of 'path'. If the first
* parent of 'path' exists, then we will act as though 'path'
* corresponds to a directory (by adding a slash).
*/
newdir = strrchr(path, '/');
if (!newdir)
return 0; /* Didn't find a parent dir; just return 0 now. */
common_prefix = max_common_dir_prefix(path, data->dir.buf);

/*
* If path starts with directory (which we already lstat'ed and found),
* then no need to lstat parent directory again.
* At this point, 'path' and 'data->dir' have a common existing parent
* directory given by path[0..common_prefix] (which could have length 0).
* We "grow" the data->dir buffer by checking for existing directories
* along 'path'.
*/
if (*dir_found && *dirname && memcmp(path, *dirname, *dir_len))
return 0;

/* Free previous dirname, and cache path's dirname */
*dirname = path;
*dir_len = newdir - path + 1;
strbuf_setlen(&data->dir, common_prefix);
while (1) {
/* Find the next directory in 'path'. */
const char *rest = path + data->dir.len;
const char *next_slash = strchr(rest, '/');

tmp = xstrndup(path, *dir_len);
*dir_found = !lstat(tmp, &st);
free(tmp);
/*
* If there are no more slashes, then 'path' doesn't contain a
* non-existent _parent_ directory. Set 'data->dir' to be equal
* to 'path' plus an additional slash, so it can be used for
* caching in the future. The filename of 'path' is considered
* a non-existent directory.
*
* Note: if "{path}/" exists as a directory, then it will never
* appear as a prefix of other callers to this method, assuming
* the context from the clear_skip_worktree... methods. If this
* method is reused, then this must be reconsidered.
*/
if (!next_slash) {
strbuf_addstr(&data->dir, rest);
strbuf_addch(&data->dir, '/');
break;
}

/*
* Now that we have a slash, let's grow 'data->dir' to include
* this slash, then test if we should stop.
*/
strbuf_add(&data->dir, rest, next_slash - rest + 1);

/* If the parent dir doesn't exist, then stop here. */
data->lstat_count++;
if (lstat(data->dir.buf, &st))
return 0;
}

/*
* At this point, 'data->dir' is equal to 'path' plus a slash character,
* and the parent directory of 'path' definitely exists. Moreover, we
* know that 'path' doesn't exist, or we would have returned 1 earlier.
*/
return 0;
}

void clear_skip_worktree_from_present_files(struct index_state *istate)
static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate)
{
const char *last_dirname = NULL;
size_t dir_len = 0;
int dir_found = 1;
struct path_found_data data = PATH_FOUND_DATA_INIT;

int i;
int path_count[2] = {0, 0};
int restarted = 0;
int path_count = 0;
int to_restart = 0;

if (!core_apply_sparse_checkout ||
sparse_expect_files_outside_of_patterns)
return;

trace2_region_enter("index", "clear_skip_worktree_from_present_files",
trace2_region_enter("index", "clear_skip_worktree_from_present_files_sparse",
istate->repo);
restart:
for (i = 0; i < istate->cache_nr; i++) {
for (int i = 0; i < istate->cache_nr; i++) {
struct cache_entry *ce = istate->cache[i];

if (ce_skip_worktree(ce)) {
path_count[restarted]++;
if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) {
path_count++;
if (path_found(ce->name, &data)) {
if (S_ISSPARSEDIR(ce->ce_mode)) {
if (restarted)
BUG("ensure-full-index did not fully flatten?");
ensure_full_index(istate);
restarted = 1;
goto restart;
to_restart = 1;
break;
}
ce->ce_flags &= ~CE_SKIP_WORKTREE;
}
}
}

if (path_count[0])
trace2_data_intmax("index", istate->repo,
"sparse_path_count", path_count[0]);
if (restarted)
trace2_data_intmax("index", istate->repo,
"sparse_path_count_full", path_count[1]);
trace2_region_leave("index", "clear_skip_worktree_from_present_files",
trace2_data_intmax("index", istate->repo,
"sparse_path_count", path_count);
trace2_data_intmax("index", istate->repo,
"sparse_lstat_count", data.lstat_count);
trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse",
istate->repo);
clear_path_found_data(&data);
return to_restart;
}

static void clear_skip_worktree_from_present_files_full(struct index_state *istate)
{
struct path_found_data data = PATH_FOUND_DATA_INIT;

int path_count = 0;

trace2_region_enter("index", "clear_skip_worktree_from_present_files_full",
istate->repo);
for (int i = 0; i < istate->cache_nr; i++) {
struct cache_entry *ce = istate->cache[i];

if (S_ISSPARSEDIR(ce->ce_mode))
BUG("ensure-full-index did not fully flatten?");

if (ce_skip_worktree(ce)) {
path_count++;
if (path_found(ce->name, &data))
ce->ce_flags &= ~CE_SKIP_WORKTREE;
}
}

trace2_data_intmax("index", istate->repo,
"full_path_count", path_count);
trace2_data_intmax("index", istate->repo,
"full_lstat_count", data.lstat_count);
trace2_region_leave("index", "clear_skip_worktree_from_present_files_full",
istate->repo);
clear_path_found_data(&data);
}

void clear_skip_worktree_from_present_files(struct index_state *istate)
{
if (!core_apply_sparse_checkout ||
sparse_expect_files_outside_of_patterns)
return;

if (clear_skip_worktree_from_present_files_sparse(istate)) {
ensure_full_index(istate);
clear_skip_worktree_from_present_files_full(istate);
}
}

/*