From 532e21698634b4aa305981a6243bc8e3bd795ae1 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 28 Jun 2024 12:43:21 +0000 Subject: [PATCH 1/5] sparse-checkout: refactor skip worktree retry logic The clear_skip_worktree_from_present_files() method was introduced in af6a51875a (repo_read_index: clear SKIP_WORKTREE bit from files present in worktree, 2022-01-14) to help cases where sparse-checkout is enabled but some paths outside of the sparse-checkout also exist on disk. This operation can be slow as it needs to check path existence in a way not stored in the index, so caching was introduced in d79d299352 (Accelerate clear_skip_worktree_from_present_files() by caching, 2022-01-14). This check is particularly confusing in the presence of a sparse index, as a sparse tree entry corresponding to an existing directory must first be expanded to a full index before examining the paths within. This is currently implemented using a 'goto' and a boolean variable to ensure we restart only once. Even with that caching, it was noticed that this could take a long time to execute. 89aaab11a3 (index: add trace2 region for clear skip worktree, 2022-11-03) introduced trace2 regions to measure this time. Further, the way the loop repeats itself was slightly confusing and prone to breakage, so a BUG() statement was added in 8c7abdc596 (index: raise a bug if the index is materialised more than once, 2022-11-03) to be sure that the second run of the loop does not hit any sparse trees. One thing that can be confusing about the current setup is that the trace2 regions nest and it is not clear that a second loop is running after a sparse index is expanded. Here is an example of what the regions look like in a typical case: | region_enter | ... | label:clear_skip_worktree_from_present_files | region_enter | ... | ..label:update | region_leave | ... | ..label:update | region_enter | ... | ..label:ensure_full_index | region_enter | ... | ....label:update | region_leave | ... 
| ....label:update | region_leave | ... | ..label:ensure_full_index | data | ... | ..sparse_path_count:1 | data | ... | ..sparse_path_count_full:269538 | region_leave | ... | label:clear_skip_worktree_from_present_files One thing that is particularly difficult to understand about these regions is that most of the time is spent between the close of the ensure_full_index region and the reporting of the end data. This is because of the restart of the loop being within the same region as the first iteration of the loop. This change refactors the method into two separate methods that are traced separately. This will be more important later when we change other features of the methods, but for now the only functional change is the difference in the structure of the trace regions. After this change, the same telemetry section is split into three distinct chunks: | region_enter | ... | label:clear_skip_worktree_from_present_files_sparse | data | ... | ..sparse_path_count:1 | region_leave | ... | label:clear_skip_worktree_from_present_files_sparse | region_enter | ... | label:update | region_leave | ... | label:update | region_enter | ... | label:ensure_full_index | region_enter | ... | ..label:update | region_leave | ... | ..label:update | region_leave | ... | label:ensure_full_index | region_enter | ... | label:clear_skip_worktree_from_present_files_full | data | ... | ..full_path_count:269538 | region_leave | ... | label:clear_skip_worktree_from_present_files_full Here, we see the sparse loop terminating early with its first sparse path being a sparse directory containing a file. Then, that loop's region terminates before ensure_full_index begins (in this case, the cache-tree must also be computed). Then, _after_ the index is expanded, the full loop begins with its own region. 
Signed-off-by: Derrick Stolee Reviewed-by: Elijah Newren Signed-off-by: Junio C Hamano --- sparse-index.c | 77 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/sparse-index.c b/sparse-index.c index e48e40cae7..e0457c87ff 100644 --- a/sparse-index.c +++ b/sparse-index.c @@ -486,49 +486,78 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len, return 0; } -void clear_skip_worktree_from_present_files(struct index_state *istate) +static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate) { const char *last_dirname = NULL; size_t dir_len = 0; int dir_found = 1; - int i; - int path_count[2] = {0, 0}; - int restarted = 0; + int path_count = 0; + int to_restart = 0; - if (!core_apply_sparse_checkout || - sparse_expect_files_outside_of_patterns) - return; - - trace2_region_enter("index", "clear_skip_worktree_from_present_files", + trace2_region_enter("index", "clear_skip_worktree_from_present_files_sparse", istate->repo); -restart: - for (i = 0; i < istate->cache_nr; i++) { + for (int i = 0; i < istate->cache_nr; i++) { struct cache_entry *ce = istate->cache[i]; if (ce_skip_worktree(ce)) { - path_count[restarted]++; + path_count++; if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) { if (S_ISSPARSEDIR(ce->ce_mode)) { - if (restarted) - BUG("ensure-full-index did not fully flatten?"); - ensure_full_index(istate); - restarted = 1; - goto restart; + to_restart = 1; + break; } ce->ce_flags &= ~CE_SKIP_WORKTREE; } } } - if (path_count[0]) - trace2_data_intmax("index", istate->repo, - "sparse_path_count", path_count[0]); - if (restarted) - trace2_data_intmax("index", istate->repo, - "sparse_path_count_full", path_count[1]); - trace2_region_leave("index", "clear_skip_worktree_from_present_files", + trace2_data_intmax("index", istate->repo, + "sparse_path_count", path_count); + trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse", 
istate->repo); + return to_restart; +} + +static void clear_skip_worktree_from_present_files_full(struct index_state *istate) +{ + const char *last_dirname = NULL; + size_t dir_len = 0; + int dir_found = 1; + + int path_count = 0; + + trace2_region_enter("index", "clear_skip_worktree_from_present_files_full", + istate->repo); + for (int i = 0; i < istate->cache_nr; i++) { + struct cache_entry *ce = istate->cache[i]; + + if (S_ISSPARSEDIR(ce->ce_mode)) + BUG("ensure-full-index did not fully flatten?"); + + if (ce_skip_worktree(ce)) { + path_count++; + if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) + ce->ce_flags &= ~CE_SKIP_WORKTREE; + } + } + + trace2_data_intmax("index", istate->repo, + "full_path_count", path_count); + trace2_region_leave("index", "clear_skip_worktree_from_present_files_full", + istate->repo); +} + +void clear_skip_worktree_from_present_files(struct index_state *istate) +{ + if (!core_apply_sparse_checkout || + sparse_expect_files_outside_of_patterns) + return; + + if (clear_skip_worktree_from_present_files_sparse(istate)) { + ensure_full_index(istate); + clear_skip_worktree_from_present_files_full(istate); + } } /* From b746a85d9a0d2384d3219b3b53593ccb880f3124 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 28 Jun 2024 12:43:22 +0000 Subject: [PATCH 2/5] sparse-index: refactor path_found() In advance of changing the behavior of path_found(), take all of the intermediate data values and group them into a single struct. This simplifies the method prototype as well as the initialization. Future changes can be made directly to the struct and method without changing the callers with this approach. Note that the clear_path_found_data() method is currently empty, as there is nothing to free. This method is a placeholder for future changes that require a non-trivial implementation. Its stub is created now so consumers could call it now and not change in future changes. 
Signed-off-by: Derrick Stolee Reviewed-by: Elijah Newren Signed-off-by: Junio C Hamano --- sparse-index.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/sparse-index.c b/sparse-index.c index e0457c87ff..de6e727f5c 100644 --- a/sparse-index.c +++ b/sparse-index.c @@ -439,8 +439,22 @@ void ensure_correct_sparsity(struct index_state *istate) ensure_full_index(istate); } -static int path_found(const char *path, const char **dirname, size_t *dir_len, - int *dir_found) +struct path_found_data { + const char *dirname; + size_t dir_len; + int dir_found; +}; + +#define PATH_FOUND_DATA_INIT { \ + .dir_found = 1 \ +} + +static void clear_path_found_data(struct path_found_data *data) +{ + return; +} + +static int path_found(const char *path, struct path_found_data *data) { struct stat st; char *newdir; @@ -450,7 +464,7 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len, * If dirname corresponds to a directory that doesn't exist, and this * path starts with dirname, then path can't exist. */ - if (!*dir_found && !memcmp(path, *dirname, *dir_len)) + if (!data->dir_found && !memcmp(path, data->dirname, data->dir_len)) return 0; /* @@ -472,15 +486,16 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len, * If path starts with directory (which we already lstat'ed and found), * then no need to lstat parent directory again. 
*/ - if (*dir_found && *dirname && memcmp(path, *dirname, *dir_len)) + if (data->dir_found && data->dirname && + memcmp(path, data->dirname, data->dir_len)) return 0; /* Free previous dirname, and cache path's dirname */ - *dirname = path; - *dir_len = newdir - path + 1; + data->dirname = path; + data->dir_len = newdir - path + 1; - tmp = xstrndup(path, *dir_len); - *dir_found = !lstat(tmp, &st); + tmp = xstrndup(path, data->dir_len); + data->dir_found = !lstat(tmp, &st); free(tmp); return 0; @@ -488,9 +503,7 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len, static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate) { - const char *last_dirname = NULL; - size_t dir_len = 0; - int dir_found = 1; + struct path_found_data data = PATH_FOUND_DATA_INIT; int path_count = 0; int to_restart = 0; @@ -502,7 +515,7 @@ static int clear_skip_worktree_from_present_files_sparse(struct index_state *ist if (ce_skip_worktree(ce)) { path_count++; - if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) { + if (path_found(ce->name, &data)) { if (S_ISSPARSEDIR(ce->ce_mode)) { to_restart = 1; break; @@ -516,14 +529,13 @@ static int clear_skip_worktree_from_present_files_sparse(struct index_state *ist "sparse_path_count", path_count); trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse", istate->repo); + clear_path_found_data(&data); return to_restart; } static void clear_skip_worktree_from_present_files_full(struct index_state *istate) { - const char *last_dirname = NULL; - size_t dir_len = 0; - int dir_found = 1; + struct path_found_data data = PATH_FOUND_DATA_INIT; int path_count = 0; @@ -537,7 +549,7 @@ static void clear_skip_worktree_from_present_files_full(struct index_state *ista if (ce_skip_worktree(ce)) { path_count++; - if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) + if (path_found(ce->name, &data)) ce->ce_flags &= ~CE_SKIP_WORKTREE; } } @@ -546,6 +558,7 @@ static void 
clear_skip_worktree_from_present_files_full(struct index_state *ista "full_path_count", path_count); trace2_region_leave("index", "clear_skip_worktree_from_present_files_full", istate->repo); + clear_path_found_data(&data); } void clear_skip_worktree_from_present_files(struct index_state *istate) From 23dd6f8bcc11fc4a468f0863b64f3ebe27a173cd Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 28 Jun 2024 12:43:23 +0000 Subject: [PATCH 3/5] sparse-index: use strbuf in path_found() The path_found() method previously reused strings from the cache entries the calling methods were using. This prevents string manipulation in place and causes some odd reallocation before the final lstat() call in the method. Refactor the method to use strbufs and copy the path into the strbuf, but also only the parent directory and not the whole path. This looks like extra copying when assigning the path to the strbuf, but we save an allocation by dropping the 'tmp' string, and we are "reusing" the copy from 'tmp' to put the data in the strbuf. Signed-off-by: Derrick Stolee Reviewed-by: Elijah Newren Signed-off-by: Junio C Hamano --- sparse-index.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/sparse-index.c b/sparse-index.c index de6e727f5c..fec4f39336 100644 --- a/sparse-index.c +++ b/sparse-index.c @@ -440,31 +440,30 @@ void ensure_correct_sparsity(struct index_state *istate) } struct path_found_data { - const char *dirname; - size_t dir_len; + struct strbuf dir; int dir_found; }; #define PATH_FOUND_DATA_INIT { \ + .dir = STRBUF_INIT, \ .dir_found = 1 \ } static void clear_path_found_data(struct path_found_data *data) { - return; + strbuf_release(&data->dir); } static int path_found(const char *path, struct path_found_data *data) { struct stat st; char *newdir; - char *tmp; /* * If dirname corresponds to a directory that doesn't exist, and this * path starts with dirname, then path can't exist. 
*/ - if (!data->dir_found && !memcmp(path, data->dirname, data->dir_len)) + if (!data->dir_found && !memcmp(path, data->dir.buf, data->dir.len)) return 0; /* @@ -486,17 +485,15 @@ static int path_found(const char *path, struct path_found_data *data) * If path starts with directory (which we already lstat'ed and found), * then no need to lstat parent directory again. */ - if (data->dir_found && data->dirname && - memcmp(path, data->dirname, data->dir_len)) + if (data->dir_found && data->dir.buf && + memcmp(path, data->dir.buf, data->dir.len)) return 0; /* Free previous dirname, and cache path's dirname */ - data->dirname = path; - data->dir_len = newdir - path + 1; + strbuf_reset(&data->dir); + strbuf_add(&data->dir, path, newdir - path + 1); - tmp = xstrndup(path, data->dir_len); - data->dir_found = !lstat(tmp, &st); - free(tmp); + data->dir_found = !lstat(data->dir.buf, &st); return 0; } From c4e8c42c19ecc354abacd97c61c808383c0870fa Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 28 Jun 2024 12:43:24 +0000 Subject: [PATCH 4/5] sparse-index: count lstat() calls The clear_skip_worktree.. methods already report some statistics about how many cache entries are checked against path_found() due to having the skip-worktree bit set. However, due to path_found() performing some caching, this isn't the only information that would be helpful to report. Add a new lstat_count member to the path_found_data struct to count the number of times path_found() calls lstat(). This will be helpful to help explain performance problems in this method as well as to demonstrate future changes to the caching algorithm in a more concrete way than end-to-end timings. 
Signed-off-by: Derrick Stolee Reviewed-by: Elijah Newren Signed-off-by: Junio C Hamano --- sparse-index.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sparse-index.c b/sparse-index.c index fec4f39336..8577fa726b 100644 --- a/sparse-index.c +++ b/sparse-index.c @@ -442,6 +442,7 @@ void ensure_correct_sparsity(struct index_state *istate) struct path_found_data { struct strbuf dir; int dir_found; + size_t lstat_count; }; #define PATH_FOUND_DATA_INIT { \ @@ -469,6 +470,7 @@ static int path_found(const char *path, struct path_found_data *data) /* * If path itself exists, return 1. */ + data->lstat_count++; if (!lstat(path, &st)) return 1; @@ -493,6 +495,7 @@ static int path_found(const char *path, struct path_found_data *data) strbuf_reset(&data->dir); strbuf_add(&data->dir, path, newdir - path + 1); + data->lstat_count++; data->dir_found = !lstat(data->dir.buf, &st); return 0; @@ -524,6 +527,8 @@ static int clear_skip_worktree_from_present_files_sparse(struct index_state *ist trace2_data_intmax("index", istate->repo, "sparse_path_count", path_count); + trace2_data_intmax("index", istate->repo, + "sparse_lstat_count", data.lstat_count); trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse", istate->repo); clear_path_found_data(&data); @@ -553,6 +558,8 @@ static void clear_skip_worktree_from_present_files_full(struct index_state *ista trace2_data_intmax("index", istate->repo, "full_path_count", path_count); + trace2_data_intmax("index", istate->repo, + "full_lstat_count", data.lstat_count); trace2_region_leave("index", "clear_skip_worktree_from_present_files_full", istate->repo); clear_path_found_data(&data); From 114bff72ac030b9e9c931a9efd2bd0af8137692b Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 28 Jun 2024 12:43:25 +0000 Subject: [PATCH 5/5] sparse-index: improve lstat caching of sparse paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 
clear_skip_worktree_from_present_files() method was first introduced in af6a51875a (repo_read_index: clear SKIP_WORKTREE bit from files present in worktree, 2022-01-14) to allow better interaction with the working directory in the presence of paths outside of the sparse-checkout. The initial implementation would lstat() every single SKIP_WORKTREE path to see if it existed; if it ran across a sparse directory that existed (when a sparse index was in use), then it would expand the index and then check every SKIP_WORKTREE path. Since these lstat() calls were very expensive, this was improved in d79d299352 (Accelerate clear_skip_worktree_from_present_files() by caching, 2022-01-14) by caching directories that do not exist so it could avoid lstat()ing any files under such directories. However, there are some inefficiencies in that caching mechanism. The caching mechanism stored only the parent directory as not existing, even if a higher parent directory also does not exist. This means that wasted lstat() calls would occur when the paths passed to path_found() change immediate parent directories but within the same parent directory that does not exist. To create an example repository that demonstrates this problem, it helps to have a directory outside of the sparse-checkout that contains many deep paths. In particular, the first paths (in lexicographic order) underneath the sparse directory should have deep directory structures, maximizing the difference between the old caching algorithm that looks to a single parent and the new caching algorithm that looks to the top-most missing directory. The performance test script p2000-sparse-operations.sh takes the sample repository and copies its HEAD to several copies nested in directories of the form f<i>/f<j>/f<k> where i, j, and k are numbers from 1 to 4. The sparse-checkout cone is then selected as "f2/f4/".
Creating "f1/f1/" will trigger the behavior and also lead to some interesting cases for the caching algorithm since "f1/f1/" exists but "f1/f2/" and "f3/" do not. This is difficult to notice when running performance tests using the Git repository (or a blow-up of the Git repository, as in p2000-sparse-operations.sh) because Git has a very shallow directory structure. This change reorganizes the caching algorithm to focus on storing the highest level leading directory that does not exist; specifically this means that that directory's parent _does_ exist. By doing a little extra work on a path passed to path_found(), we can short-circuit all of the paths passed to path_found() afterwards that match a prefix with that non-existing directory. When in a repository where the first sparse file is likely to have a much deeper path than the first non-existing directory, this can realize significant gains. The details of this algorithm require careful attention, so the new implementation of path_found() has detailed comments, including the use of a new max_common_dir_prefix() method that may be of independent interest. It's worth noting that this is not universally positive, since we are doing extra lstat() calls to establish the exact path to cache. In the blow-up of the Git repository, we can see that the lstat count _increases_ from 28 to 31. However, these numbers were already artificially low. Contributor Elijah Newren created a publicly-available test repository that demonstrates the difference in these caching algorithms in the most extreme way. To test, follow these steps: git clone --sparse https://github.com/newren/gvfs-like-git-bomb cd gvfs-like-git-bomb ./runme.sh # NOTE: check scripts before running! At this point, assuming you do not have index.sparse=true set globally, the index has one million paths with the SKIP_WORKTREE bit and they will all be sent to path_found() in the sparse loop. 
You can measure this by running 'git status' with GIT_TRACE2_PERF=1: Sparse files in the index: 1,000,000 sparse_lstat_count (before): 200,000 sparse_lstat_count (after): 2 And here are the performance numbers: Benchmark 1: old Time (mean ± σ): 397.5 ms ± 4.1 ms Range (min … max): 391.2 ms … 404.8 ms 10 runs Benchmark 2: new Time (mean ± σ): 252.7 ms ± 3.1 ms Range (min … max): 249.4 ms … 259.5 ms 11 runs Summary 'new' ran 1.57 ± 0.02 times faster than 'old' By modifying this example further, we can demonstrate a more realistic example and include the sparse index expansion. Continue by creating this directory, confusing both caching algorithms somewhat: mkdir -p bomb/d/e/f/a/a Then re-run the 'git status' tests to see these statistics: Sparse files in the index: 1,000,000 sparse_lstat_count (before): 724,010 sparse_lstat_count (after): 106 Benchmark 1: old Time (mean ± σ): 753.0 ms ± 3.5 ms Range (min … max): 749.7 ms … 760.9 ms 10 runs Benchmark 2: new Time (mean ± σ): 201.4 ms ± 3.2 ms Range (min … max): 196.0 ms … 207.9 ms 14 runs Summary 'new' ran 3.74 ± 0.06 times faster than 'old' Note that if this repository had a sparse index enabled, the additional cost of expanding the sparse index affects the total time of these commands by over four seconds, significantly diminishing the benefit of the caching algorithm. Having existing paths outside of the sparse-checkout is a known performance issue for the sparse index and is a known trade-off for the performance benefits given when no such paths exist. 
Using an internal monorepo with over two million paths at HEAD and a typical sparse-checkout cone such that the sparse index contains ~190,000 entries (including over two thousand sparse trees), I was able to measure these lstat counts when one sparse directory actually exists on disk: Sparse files in expanded index: 1,841,997 full_lstat_count (before): 1,188,161 full_lstat_count (after): 4,404 This resulted in this absolute time change, on a warm disk: Time in full loop (before): 13.481 s Time in full loop (after): 0.081 s (These times were calculated on a Windows machine, where lstat() is slower than a similar Linux machine.) Helped-by: Elijah Newren Signed-off-by: Derrick Stolee Reviewed-by: Elijah Newren Signed-off-by: Junio C Hamano --- sparse-index.c | 114 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 24 deletions(-) diff --git a/sparse-index.c b/sparse-index.c index 8577fa726b..9913a6078c 100644 --- a/sparse-index.c +++ b/sparse-index.c @@ -440,14 +440,21 @@ void ensure_correct_sparsity(struct index_state *istate) } struct path_found_data { + /** + * The path stored in 'dir', if non-empty, corresponds to the most- + * recent path that we checked where: + * + * 1. The path should be a directory, according to the index. + * 2. The path does not exist. + * 3. The parent path _does_ exist. (This may be the root of the + * working directory.) + */ struct strbuf dir; - int dir_found; size_t lstat_count; }; #define PATH_FOUND_DATA_INIT { \ - .dir = STRBUF_INIT, \ - .dir_found = 1 \ + .dir = STRBUF_INIT \ } static void clear_path_found_data(struct path_found_data *data) @@ -455,49 +462,108 @@ static void clear_path_found_data(struct path_found_data *data) strbuf_release(&data->dir); } +/** + * Return the length of the longest common substring that ends in a + * slash ('/') to indicate the longest common parent directory. Returns + * zero if no common directory exists. 
+ */ +static size_t max_common_dir_prefix(const char *path1, const char *path2) +{ + size_t common_prefix = 0; + for (size_t i = 0; path1[i] && path2[i]; i++) { + if (path1[i] != path2[i]) + break; + + /* + * If they agree at a directory separator, then add one + * to make sure it is included in the common prefix string. + */ + if (path1[i] == '/') + common_prefix = i + 1; + } + + return common_prefix; +} + static int path_found(const char *path, struct path_found_data *data) { struct stat st; - char *newdir; + size_t common_prefix; /* - * If dirname corresponds to a directory that doesn't exist, and this - * path starts with dirname, then path can't exist. + * If data->dir is non-empty, then it contains a path that doesn't + * exist, including an ending slash ('/'). If it is a prefix of 'path', + * then we can return 0. */ - if (!data->dir_found && !memcmp(path, data->dir.buf, data->dir.len)) + if (data->dir.len && !memcmp(path, data->dir.buf, data->dir.len)) return 0; /* - * If path itself exists, return 1. + * Otherwise, we must check if the current path exists. If it does, then + * return 1. The cached directory will be skipped until we come across + * a missing path again. */ data->lstat_count++; if (!lstat(path, &st)) return 1; /* - * Otherwise, path does not exist so we'll return 0...but we'll first - * determine some info about its parent directory so we can avoid - * lstat calls for future cache entries. + * At this point, we know that 'path' doesn't exist, and we know that + * the parent directory of 'data->dir' does exist. Let's set 'data->dir' + * to be the top-most non-existing directory of 'path'. If the first + * parent of 'path' exists, then we will act as though 'path' + * corresponds to a directory (by adding a slash). */ - newdir = strrchr(path, '/'); - if (!newdir) - return 0; /* Didn't find a parent dir; just return 0 now. 
*/ + common_prefix = max_common_dir_prefix(path, data->dir.buf); /* - * If path starts with directory (which we already lstat'ed and found), - * then no need to lstat parent directory again. + * At this point, 'path' and 'data->dir' have a common existing parent + * directory given by path[0..common_prefix] (which could have length 0). + * We "grow" the data->dir buffer by checking for existing directories + * along 'path'. */ - if (data->dir_found && data->dir.buf && - memcmp(path, data->dir.buf, data->dir.len)) - return 0; - /* Free previous dirname, and cache path's dirname */ - strbuf_reset(&data->dir); - strbuf_add(&data->dir, path, newdir - path + 1); + strbuf_setlen(&data->dir, common_prefix); + while (1) { + /* Find the next directory in 'path'. */ + const char *rest = path + data->dir.len; + const char *next_slash = strchr(rest, '/'); - data->lstat_count++; - data->dir_found = !lstat(data->dir.buf, &st); + /* + * If there are no more slashes, then 'path' doesn't contain a + * non-existent _parent_ directory. Set 'data->dir' to be equal + * to 'path' plus an additional slash, so it can be used for + * caching in the future. The filename of 'path' is considered + * a non-existent directory. + * + * Note: if "{path}/" exists as a directory, then it will never + * appear as a prefix of other callers to this method, assuming + * the context from the clear_skip_worktree... methods. If this + * method is reused, then this must be reconsidered. + */ + if (!next_slash) { + strbuf_addstr(&data->dir, rest); + strbuf_addch(&data->dir, '/'); + break; + } + /* + * Now that we have a slash, let's grow 'data->dir' to include + * this slash, then test if we should stop. + */ + strbuf_add(&data->dir, rest, next_slash - rest + 1); + + /* If the parent dir doesn't exist, then stop here. 
*/ + data->lstat_count++; + if (lstat(data->dir.buf, &st)) + return 0; + } + + /* + * At this point, 'data->dir' is equal to 'path' plus a slash character, + * and the parent directory of 'path' definitely exists. Moreover, we + * know that 'path' doesn't exist, or we would have returned 1 earlier. + */ return 0; }