From 532e21698634b4aa305981a6243bc8e3bd795ae1 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 28 Jun 2024 12:43:21 +0000
Subject: [PATCH 1/5] sparse-checkout: refactor skip worktree retry logic

The clear_skip_worktree_from_present_files() method was introduced in
af6a51875a (repo_read_index: clear SKIP_WORKTREE bit from files present
in worktree, 2022-01-14) to help cases where sparse-checkout is enabled
but some paths outside of the sparse-checkout also exist on disk.  This
operation can be slow as it needs to check path existence in a way not
stored in the index, so caching was introduced in d79d299352 (Accelerate
clear_skip_worktree_from_present_files() by caching, 2022-01-14).

This check is particularly confusing in the presence of a sparse index,
as a sparse tree entry corresponding to an existing directory must first
be expanded to a full index before examining the paths within. This is
currently implemented using a 'goto' and a boolean variable to ensure we
restart only once.

Even with that caching, it was noticed that this could take a long time
to execute. 89aaab11a3 (index: add trace2 region for clear skip
worktree, 2022-11-03) introduced trace2 regions to measure this time.
Further, the way the loop repeats itself was slightly confusing and
prone to breakage, so a BUG() statement was added in 8c7abdc596 (index:
raise a bug if the index is materialised more than once, 2022-11-03) to
be sure that the second run of the loop does not hit any sparse trees.

One thing that can be confusing about the current setup is that the
trace2 regions nest and it is not clear that a second loop is running
after a sparse index is expanded. Here is an example of what the regions
look like in a typical case:

| region_enter | ... | label:clear_skip_worktree_from_present_files
| region_enter | ... | ..label:update
| region_leave | ... | ..label:update
| region_enter | ... | ..label:ensure_full_index
| region_enter | ... | ....label:update
| region_leave | ... | ....label:update
| region_leave | ... | ..label:ensure_full_index
| data         | ... | ..sparse_path_count:1
| data         | ... | ..sparse_path_count_full:269538
| region_leave | ... | label:clear_skip_worktree_from_present_files

One thing that is particularly difficult to understand about these
regions is that most of the time is spent between the close of the
ensure_full_index region and the reporting of the end data. This is
because of the restart of the loop being within the same region as the
first iteration of the loop.

This change refactors the method into two separate methods that are
traced separately. This will be more important later when we change
other features of the methods, but for now the only functional change is
the difference in the structure of the trace regions.

After this change, the same telemetry section is split into three
distinct chunks:

| region_enter | ... | label:clear_skip_worktree_from_present_files_sparse
| data         | ... | ..sparse_path_count:1
| region_leave | ... | label:clear_skip_worktree_from_present_files_sparse
| region_enter | ... | label:update
| region_leave | ... | label:update
| region_enter | ... | label:ensure_full_index
| region_enter | ... | ..label:update
| region_leave | ... | ..label:update
| region_leave | ... | label:ensure_full_index
| region_enter | ... | label:clear_skip_worktree_from_present_files_full
| data         | ... | ..full_path_count:269538
| region_leave | ... | label:clear_skip_worktree_from_present_files_full

Here, we see the sparse loop terminating early with its first sparse
path being a sparse directory containing a file. Then, that loop's
region terminates before ensure_full_index begins (in this case, the
cache-tree must also be computed). Then, _after_ the index is expanded,
the full loop begins with its own region.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Reviewed-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sparse-index.c | 77 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 24 deletions(-)

diff --git a/sparse-index.c b/sparse-index.c
index e48e40cae7..e0457c87ff 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -486,49 +486,78 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len,
 	return 0;
 }
 
-void clear_skip_worktree_from_present_files(struct index_state *istate)
+static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate)
 {
 	const char *last_dirname = NULL;
 	size_t dir_len = 0;
 	int dir_found = 1;
 
-	int i;
-	int path_count[2] = {0, 0};
-	int restarted = 0;
+	int path_count = 0;
+	int to_restart = 0;
 
-	if (!core_apply_sparse_checkout ||
-	    sparse_expect_files_outside_of_patterns)
-		return;
-
-	trace2_region_enter("index", "clear_skip_worktree_from_present_files",
+	trace2_region_enter("index", "clear_skip_worktree_from_present_files_sparse",
 			    istate->repo);
-restart:
-	for (i = 0; i < istate->cache_nr; i++) {
+	for (int i = 0; i < istate->cache_nr; i++) {
 		struct cache_entry *ce = istate->cache[i];
 
 		if (ce_skip_worktree(ce)) {
-			path_count[restarted]++;
+			path_count++;
 			if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) {
 				if (S_ISSPARSEDIR(ce->ce_mode)) {
-					if (restarted)
-						BUG("ensure-full-index did not fully flatten?");
-					ensure_full_index(istate);
-					restarted = 1;
-					goto restart;
+					to_restart = 1;
+					break;
 				}
 				ce->ce_flags &= ~CE_SKIP_WORKTREE;
 			}
 		}
 	}
 
-	if (path_count[0])
-		trace2_data_intmax("index", istate->repo,
-				   "sparse_path_count", path_count[0]);
-	if (restarted)
-		trace2_data_intmax("index", istate->repo,
-				   "sparse_path_count_full", path_count[1]);
-	trace2_region_leave("index", "clear_skip_worktree_from_present_files",
+	trace2_data_intmax("index", istate->repo,
+			   "sparse_path_count", path_count);
+	trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse",
 			    istate->repo);
+	return to_restart;
+}
+
+static void clear_skip_worktree_from_present_files_full(struct index_state *istate)
+{
+	const char *last_dirname = NULL;
+	size_t dir_len = 0;
+	int dir_found = 1;
+
+	int path_count = 0;
+
+	trace2_region_enter("index", "clear_skip_worktree_from_present_files_full",
+			    istate->repo);
+	for (int i = 0; i < istate->cache_nr; i++) {
+		struct cache_entry *ce = istate->cache[i];
+
+		if (S_ISSPARSEDIR(ce->ce_mode))
+			BUG("ensure-full-index did not fully flatten?");
+
+		if (ce_skip_worktree(ce)) {
+			path_count++;
+			if (path_found(ce->name, &last_dirname, &dir_len, &dir_found))
+				ce->ce_flags &= ~CE_SKIP_WORKTREE;
+		}
+	}
+
+	trace2_data_intmax("index", istate->repo,
+			   "full_path_count", path_count);
+	trace2_region_leave("index", "clear_skip_worktree_from_present_files_full",
+			    istate->repo);
+}
+
+void clear_skip_worktree_from_present_files(struct index_state *istate)
+{
+	if (!core_apply_sparse_checkout ||
+	    sparse_expect_files_outside_of_patterns)
+		return;
+
+	if (clear_skip_worktree_from_present_files_sparse(istate)) {
+		ensure_full_index(istate);
+		clear_skip_worktree_from_present_files_full(istate);
+	}
 }
 
 /*

From b746a85d9a0d2384d3219b3b53593ccb880f3124 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 28 Jun 2024 12:43:22 +0000
Subject: [PATCH 2/5] sparse-index: refactor path_found()

In advance of changing the behavior of path_found(), take all of the
intermediate data values and group them into a single struct. This
simplifies the method prototype as well as the initialization. Future
changes can be made directly to the struct and method without changing
the callers with this approach.

Note that the clear_path_found_data() method is currently empty, as
there is nothing to free. This method is a placeholder for future
changes that require a non-trivial implementation. Its stub is created
now so consumers could call it now and not change in future changes.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Reviewed-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sparse-index.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/sparse-index.c b/sparse-index.c
index e0457c87ff..de6e727f5c 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -439,8 +439,22 @@ void ensure_correct_sparsity(struct index_state *istate)
 		ensure_full_index(istate);
 }
 
-static int path_found(const char *path, const char **dirname, size_t *dir_len,
-		      int *dir_found)
+struct path_found_data {
+	const char *dirname;
+	size_t dir_len;
+	int dir_found;
+};
+
+#define PATH_FOUND_DATA_INIT { \
+	.dir_found = 1 \
+}
+
+static void clear_path_found_data(struct path_found_data *data)
+{
+	return;
+}
+
+static int path_found(const char *path, struct path_found_data *data)
 {
 	struct stat st;
 	char *newdir;
@@ -450,7 +464,7 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len,
 	 * If dirname corresponds to a directory that doesn't exist, and this
 	 * path starts with dirname, then path can't exist.
 	 */
-	if (!*dir_found && !memcmp(path, *dirname, *dir_len))
+	if (!data->dir_found && !memcmp(path, data->dirname, data->dir_len))
 		return 0;
 
 	/*
@@ -472,15 +486,16 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len,
 	 * If path starts with directory (which we already lstat'ed and found),
 	 * then no need to lstat parent directory again.
 	 */
-	if (*dir_found && *dirname && memcmp(path, *dirname, *dir_len))
+	if (data->dir_found && data->dirname &&
+	    memcmp(path, data->dirname, data->dir_len))
 		return 0;
 
 	/* Free previous dirname, and cache path's dirname */
-	*dirname = path;
-	*dir_len = newdir - path + 1;
+	data->dirname = path;
+	data->dir_len = newdir - path + 1;
 
-	tmp = xstrndup(path, *dir_len);
-	*dir_found = !lstat(tmp, &st);
+	tmp = xstrndup(path, data->dir_len);
+	data->dir_found = !lstat(tmp, &st);
 	free(tmp);
 
 	return 0;
@@ -488,9 +503,7 @@ static int path_found(const char *path, const char **dirname, size_t *dir_len,
 
 static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate)
 {
-	const char *last_dirname = NULL;
-	size_t dir_len = 0;
-	int dir_found = 1;
+	struct path_found_data data = PATH_FOUND_DATA_INIT;
 
 	int path_count = 0;
 	int to_restart = 0;
@@ -502,7 +515,7 @@ static int clear_skip_worktree_from_present_files_sparse(struct index_state *ist
 
 		if (ce_skip_worktree(ce)) {
 			path_count++;
-			if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) {
+			if (path_found(ce->name, &data)) {
 				if (S_ISSPARSEDIR(ce->ce_mode)) {
 					to_restart = 1;
 					break;
@@ -516,14 +529,13 @@ static int clear_skip_worktree_from_present_files_sparse(struct index_state *ist
 			   "sparse_path_count", path_count);
 	trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse",
 			    istate->repo);
+	clear_path_found_data(&data);
 	return to_restart;
 }
 
 static void clear_skip_worktree_from_present_files_full(struct index_state *istate)
 {
-	const char *last_dirname = NULL;
-	size_t dir_len = 0;
-	int dir_found = 1;
+	struct path_found_data data = PATH_FOUND_DATA_INIT;
 
 	int path_count = 0;
 
@@ -537,7 +549,7 @@ static void clear_skip_worktree_from_present_files_full(struct index_state *ista
 
 		if (ce_skip_worktree(ce)) {
 			path_count++;
-			if (path_found(ce->name, &last_dirname, &dir_len, &dir_found))
+			if (path_found(ce->name, &data))
 				ce->ce_flags &= ~CE_SKIP_WORKTREE;
 		}
 	}
@@ -546,6 +558,7 @@ static void clear_skip_worktree_from_present_files_full(struct index_state *ista
 			   "full_path_count", path_count);
 	trace2_region_leave("index", "clear_skip_worktree_from_present_files_full",
 			    istate->repo);
+	clear_path_found_data(&data);
 }
 
 void clear_skip_worktree_from_present_files(struct index_state *istate)

From 23dd6f8bcc11fc4a468f0863b64f3ebe27a173cd Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 28 Jun 2024 12:43:23 +0000
Subject: [PATCH 3/5] sparse-index: use strbuf in path_found()

The path_found() method previously reused strings from the cache entries
the calling methods were using. This prevents string manipulation in
place and causes some odd reallocation before the final lstat() call in
the method.

Refactor the method to use strbufs and copy the path into the strbuf,
but also only the parent directory and not the whole path. This looks
like extra copying when assigning the path to the strbuf, but we save an
allocation by dropping the 'tmp' string, and we are "reusing" the copy
from 'tmp' to put the data in the strbuf.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Reviewed-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sparse-index.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/sparse-index.c b/sparse-index.c
index de6e727f5c..fec4f39336 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -440,31 +440,30 @@ void ensure_correct_sparsity(struct index_state *istate)
 }
 
 struct path_found_data {
-	const char *dirname;
-	size_t dir_len;
+	struct strbuf dir;
 	int dir_found;
 };
 
 #define PATH_FOUND_DATA_INIT { \
+	.dir = STRBUF_INIT, \
 	.dir_found = 1 \
 }
 
 static void clear_path_found_data(struct path_found_data *data)
 {
-	return;
+	strbuf_release(&data->dir);
 }
 
 static int path_found(const char *path, struct path_found_data *data)
 {
 	struct stat st;
 	char *newdir;
-	char *tmp;
 
 	/*
 	 * If dirname corresponds to a directory that doesn't exist, and this
 	 * path starts with dirname, then path can't exist.
 	 */
-	if (!data->dir_found && !memcmp(path, data->dirname, data->dir_len))
+	if (!data->dir_found && !memcmp(path, data->dir.buf, data->dir.len))
 		return 0;
 
 	/*
@@ -486,17 +485,15 @@ static int path_found(const char *path, struct path_found_data *data)
 	 * If path starts with directory (which we already lstat'ed and found),
 	 * then no need to lstat parent directory again.
 	 */
-	if (data->dir_found && data->dirname &&
-	    memcmp(path, data->dirname, data->dir_len))
+	if (data->dir_found && data->dir.buf &&
+	    memcmp(path, data->dir.buf, data->dir.len))
 		return 0;
 
 	/* Free previous dirname, and cache path's dirname */
-	data->dirname = path;
-	data->dir_len = newdir - path + 1;
+	strbuf_reset(&data->dir);
+	strbuf_add(&data->dir, path, newdir - path + 1);
 
-	tmp = xstrndup(path, data->dir_len);
-	data->dir_found = !lstat(tmp, &st);
-	free(tmp);
+	data->dir_found = !lstat(data->dir.buf, &st);
 
 	return 0;
 }

From c4e8c42c19ecc354abacd97c61c808383c0870fa Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 28 Jun 2024 12:43:24 +0000
Subject: [PATCH 4/5] sparse-index: count lstat() calls

The clear_skip_worktree.. methods already report some statistics about
how many cache entries are checked against path_found() due to having
the skip-worktree bit set. However, due to path_found() performing some
caching, this isn't the only information that would be helpful to
report.

Add a new lstat_count member to the path_found_data struct to count the
number of times path_found() calls lstat(). This will be helpful to help
explain performance problems in this method as well as to demonstrate
future changes to the caching algorithm in a more concrete way than
end-to-end timings.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Reviewed-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sparse-index.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sparse-index.c b/sparse-index.c
index fec4f39336..8577fa726b 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -442,6 +442,7 @@ void ensure_correct_sparsity(struct index_state *istate)
 struct path_found_data {
 	struct strbuf dir;
 	int dir_found;
+	size_t lstat_count;
 };
 
 #define PATH_FOUND_DATA_INIT { \
@@ -469,6 +470,7 @@ static int path_found(const char *path, struct path_found_data *data)
 	/*
 	 * If path itself exists, return 1.
 	 */
+	data->lstat_count++;
 	if (!lstat(path, &st))
 		return 1;
 
@@ -493,6 +495,7 @@ static int path_found(const char *path, struct path_found_data *data)
 	strbuf_reset(&data->dir);
 	strbuf_add(&data->dir, path, newdir - path + 1);
 
+	data->lstat_count++;
 	data->dir_found = !lstat(data->dir.buf, &st);
 
 	return 0;
@@ -524,6 +527,8 @@ static int clear_skip_worktree_from_present_files_sparse(struct index_state *ist
 
 	trace2_data_intmax("index", istate->repo,
 			   "sparse_path_count", path_count);
+	trace2_data_intmax("index", istate->repo,
+			   "sparse_lstat_count", data.lstat_count);
 	trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse",
 			    istate->repo);
 	clear_path_found_data(&data);
@@ -553,6 +558,8 @@ static void clear_skip_worktree_from_present_files_full(struct index_state *ista
 
 	trace2_data_intmax("index", istate->repo,
 			   "full_path_count", path_count);
+	trace2_data_intmax("index", istate->repo,
+			   "full_lstat_count", data.lstat_count);
 	trace2_region_leave("index", "clear_skip_worktree_from_present_files_full",
 			    istate->repo);
 	clear_path_found_data(&data);

From 114bff72ac030b9e9c931a9efd2bd0af8137692b Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 28 Jun 2024 12:43:25 +0000
Subject: [PATCH 5/5] sparse-index: improve lstat caching of sparse paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The clear_skip_worktree_from_present_files() method was first introduced
in af6a51875a (repo_read_index: clear SKIP_WORKTREE bit from files
present in worktree, 2022-01-14) to allow better interaction with the
working directory in the presence of paths outside of the
sparse-checkout. The initial implementation would lstat() every single
SKIP_WORKTREE path to see if it existed; if it ran across a sparse
directory that existed (when a sparse index was in use), then it would
expand the index and then check every SKIP_WORKTREE path.

Since these lstat() calls were very expensive, this was improved in
d79d299352 (Accelerate clear_skip_worktree_from_present_files() by
caching, 2022-01-14) by caching directories that do not exist so it
could avoid lstat()ing any files under such directories. However, there
are some inefficiencies in that caching mechanism.

The caching mechanism stored only the parent directory as not existing,
even if a higher parent directory also does not exist. This means that
wasted lstat() calls would occur when the paths passed to path_found()
change immediate parent directories but within the same parent directory
that does not exist.

To create an example repository that demonstrates this problem, it helps
to have a directory outside of the sparse-checkout that contains many
deep paths. In particular, the first paths (in lexicographic order)
underneath the sparse directory should have deep directory structures,
maximizing the difference between the old caching algorithm that looks
to a single parent and the new caching algorithm that looks to the
top-most missing directory.

The performance test script p2000-sparse-operations.sh takes the sample
repository and copies its HEAD to several copies nested in directories
of the form f<i>/f<j>/f<k> where i, j, and k are numbers from 1 to 4.
The sparse-checkout cone is then selected as "f2/f4/". Creating "f1/f1/"
will trigger the behavior and also lead to some interesting cases for
the caching algorithm since "f1/f1/" exists but "f1/f2/" and "f3/" do
not.

This is difficult to notice when running performance tests using the Git
repository (or a blow-up of the Git repository, as in
p2000-sparse-operations.sh) because Git has a very shallow directory
structure.

This change reorganizes the caching algorithm to focus on storing the
highest level leading directory that does not exist; specifically this
means that that directory's parent _does_ exist. By doing a little extra
work on a path passed to path_found(), we can short-circuit all of the
paths passed to path_found() afterwards that match a prefix with that
non-existing directory. When in a repository where the first sparse file
is likely to have a much deeper path than the first non-existing
directory, this can realize significant gains.

The details of this algorithm require careful attention, so the new
implementation of path_found() has detailed comments, including the use
of a new max_common_dir_prefix() method that may be of independent
interest.

It's worth noting that this is not universally positive, since we are
doing extra lstat() calls to establish the exact path to cache. In the
blow-up of the Git repository, we can see that the lstat count
_increases_ from 28 to 31. However, these numbers were already
artificially low.

Contributor Elijah Newren created a publicly-available test repository
that demonstrates the difference in these caching algorithms in the most
extreme way. To test, follow these steps:

  git clone --sparse https://github.com/newren/gvfs-like-git-bomb
  cd gvfs-like-git-bomb
  ./runme.sh                   # NOTE: check scripts before running!

At this point, assuming you do not have index.sparse=true set globally,
the index has one million paths with the SKIP_WORKTREE bit and they will
all be sent to path_found() in the sparse loop. You can measure this by
running 'git status' with GIT_TRACE2_PERF=1:

    Sparse files in the index: 1,000,000
  sparse_lstat_count (before):   200,000
   sparse_lstat_count (after):         2

And here are the performance numbers:

  Benchmark 1: old
    Time (mean ± σ):     397.5 ms ±   4.1 ms
    Range (min … max):   391.2 ms … 404.8 ms    10 runs

  Benchmark 2: new
    Time (mean ± σ):     252.7 ms ±   3.1 ms
    Range (min … max):   249.4 ms … 259.5 ms    11 runs

  Summary
    'new' ran
      1.57 ± 0.02 times faster than 'old'

By modifying this example further, we can demonstrate a more realistic
example and include the sparse index expansion. Continue by creating
this directory, confusing both caching algorithms somewhat:

  mkdir -p bomb/d/e/f/a/a

Then re-run the 'git status' tests to see these statistics:

    Sparse files in the index: 1,000,000
  sparse_lstat_count (before):   724,010
   sparse_lstat_count (after):       106

  Benchmark 1: old
    Time (mean ± σ):     753.0 ms ±   3.5 ms
    Range (min … max):   749.7 ms … 760.9 ms    10 runs

  Benchmark 2: new
    Time (mean ± σ):     201.4 ms ±   3.2 ms
    Range (min … max):   196.0 ms … 207.9 ms    14 runs

  Summary
    'new' ran
      3.74 ± 0.06 times faster than 'old'

Note that if this repository had a sparse index enabled, the additional
cost of expanding the sparse index affects the total time of these
commands by over four seconds, significantly diminishing the benefit of
the caching algorithm. Having existing paths outside of the
sparse-checkout is a known performance issue for the sparse index and is
a known trade-off for the performance benefits given when no such paths
exist.

Using an internal monorepo with over two million paths at HEAD and a
typical sparse-checkout cone such that the sparse index contains
~190,000 entries (including over two thousand sparse trees), I was able
to measure these lstat counts when one sparse directory actually exists
on disk:

  Sparse files in expanded index: 1,841,997
       full_lstat_count (before): 1,188,161
       full_lstat_count  (after):     4,404

This resulted in this absolute time change, on a warm disk:

      Time in full loop (before): 13.481 s
      Time in full loop  (after):  0.081 s

(These times were calculated on a Windows machine, where lstat() is
slower than a similar Linux machine.)

Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Derrick Stolee <stolee@gmail.com>
Reviewed-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 sparse-index.c | 114 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 90 insertions(+), 24 deletions(-)

diff --git a/sparse-index.c b/sparse-index.c
index 8577fa726b..9913a6078c 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -440,14 +440,21 @@ void ensure_correct_sparsity(struct index_state *istate)
 }
 
 struct path_found_data {
+	/**
+	 * The path stored in 'dir', if non-empty, corresponds to the most-
+	 * recent path that we checked where:
+	 *
+	 *   1. The path should be a directory, according to the index.
+	 *   2. The path does not exist.
+	 *   3. The parent path _does_ exist. (This may be the root of the
+	 *      working directory.)
+	 */
 	struct strbuf dir;
-	int dir_found;
 	size_t lstat_count;
 };
 
 #define PATH_FOUND_DATA_INIT { \
-	.dir = STRBUF_INIT, \
-	.dir_found = 1 \
+	.dir = STRBUF_INIT \
 }
 
 static void clear_path_found_data(struct path_found_data *data)
@@ -455,49 +462,108 @@ static void clear_path_found_data(struct path_found_data *data)
 	strbuf_release(&data->dir);
 }
 
+/**
+ * Return the length of the longest common substring that ends in a
+ * slash ('/') to indicate the longest common parent directory. Returns
+ * zero if no common directory exists.
+ */
+static size_t max_common_dir_prefix(const char *path1, const char *path2)
+{
+	size_t common_prefix = 0;
+	for (size_t i = 0; path1[i] && path2[i]; i++) {
+		if (path1[i] != path2[i])
+			break;
+
+		/*
+		 * If they agree at a directory separator, then add one
+		 * to make sure it is included in the common prefix string.
+		 */
+		if (path1[i] == '/')
+			common_prefix = i + 1;
+	}
+
+	return common_prefix;
+}
+
 static int path_found(const char *path, struct path_found_data *data)
 {
 	struct stat st;
-	char *newdir;
+	size_t common_prefix;
 
 	/*
-	 * If dirname corresponds to a directory that doesn't exist, and this
-	 * path starts with dirname, then path can't exist.
+	 * If data->dir is non-empty, then it contains a path that doesn't
+	 * exist, including an ending slash ('/'). If it is a prefix of 'path',
+	 * then we can return 0.
 	 */
-	if (!data->dir_found && !memcmp(path, data->dir.buf, data->dir.len))
+	if (data->dir.len && !memcmp(path, data->dir.buf, data->dir.len))
 		return 0;
 
 	/*
-	 * If path itself exists, return 1.
+	 * Otherwise, we must check if the current path exists. If it does, then
+	 * return 1. The cached directory will be skipped until we come across
+	 * a missing path again.
 	 */
 	data->lstat_count++;
 	if (!lstat(path, &st))
 		return 1;
 
 	/*
-	 * Otherwise, path does not exist so we'll return 0...but we'll first
-	 * determine some info about its parent directory so we can avoid
-	 * lstat calls for future cache entries.
+	 * At this point, we know that 'path' doesn't exist, and we know that
+	 * the parent directory of 'data->dir' does exist. Let's set 'data->dir'
+	 * to be the top-most non-existing directory of 'path'. If the first
+	 * parent of 'path' exists, then we will act as though 'path'
+	 * corresponds to a directory (by adding a slash).
 	 */
-	newdir = strrchr(path, '/');
-	if (!newdir)
-		return 0; /* Didn't find a parent dir; just return 0 now. */
+	common_prefix = max_common_dir_prefix(path, data->dir.buf);
 
 	/*
-	 * If path starts with directory (which we already lstat'ed and found),
-	 * then no need to lstat parent directory again.
+	 * At this point, 'path' and 'data->dir' have a common existing parent
+	 * directory given by path[0..common_prefix] (which could have length 0).
+	 * We "grow" the data->dir buffer by checking for existing directories
+	 * along 'path'.
 	 */
-	if (data->dir_found && data->dir.buf &&
-	    memcmp(path, data->dir.buf, data->dir.len))
-		return 0;
 
-	/* Free previous dirname, and cache path's dirname */
-	strbuf_reset(&data->dir);
-	strbuf_add(&data->dir, path, newdir - path + 1);
+	strbuf_setlen(&data->dir, common_prefix);
+	while (1) {
+		/* Find the next directory in 'path'. */
+		const char *rest = path + data->dir.len;
+		const char *next_slash = strchr(rest, '/');
 
-	data->lstat_count++;
-	data->dir_found = !lstat(data->dir.buf, &st);
+		/*
+		 * If there are no more slashes, then 'path' doesn't contain a
+		 * non-existent _parent_ directory. Set 'data->dir' to be equal
+		 * to 'path' plus an additional slash, so it can be used for
+		 * caching in the future. The filename of 'path' is considered
+		 * a non-existent directory.
+		 *
+		 * Note: if "{path}/" exists as a directory, then it will never
+		 * appear as a prefix of other callers to this method, assuming
+		 * the context from the clear_skip_worktree... methods. If this
+		 * method is reused, then this must be reconsidered.
+		 */
+		if (!next_slash) {
+			strbuf_addstr(&data->dir, rest);
+			strbuf_addch(&data->dir, '/');
+			break;
+		}
 
+		/*
+		 * Now that we have a slash, let's grow 'data->dir' to include
+		 * this slash, then test if we should stop.
+		 */
+		strbuf_add(&data->dir, rest, next_slash - rest + 1);
+
+		/* If the parent dir doesn't exist, then stop here. */
+		data->lstat_count++;
+		if (lstat(data->dir.buf, &st))
+			return 0;
+	}
+
+	/*
+	 * At this point, 'data->dir' is equal to 'path' plus a slash character,
+	 * and the parent directory of 'path' definitely exists. Moreover, we
+	 * know that 'path' doesn't exist, or we would have returned 1 earlier.
+	 */
 	return 0;
 }