Browse Source

Merge branch 'jh/add-index-entry-optim'

"git checkout" that handles a lot of paths has been optimized by
reducing the number of unnecessary checks of paths in the
has_dir_name() function.

* jh/add-index-entry-optim:
  read-cache: speed up has_dir_name (part 2)
  read-cache: speed up has_dir_name (part 1)
  read-cache: speed up add_index_entry during checkout
  p0006-read-tree-checkout: perf test to time read-tree
  read-cache: add strcmp_offset function
maint
Junio C Hamano 8 years ago
parent
commit
6cbc478d83
  1. 1
      Makefile
  2. 1
      cache.h
  3. 139
      read-cache.c
  4. 1
      t/helper/.gitignore
  5. 22
      t/helper/test-strcmp-offset.c
  6. 67
      t/perf/p0006-read-tree-checkout.sh
  7. 1
      t/perf/repos/.gitignore
  8. 85
      t/perf/repos/inflate-repo.sh
  9. 110
      t/perf/repos/many-files.sh
  10. 21
      t/t0065-strcmp-offset.sh

1
Makefile

@ -639,6 +639,7 @@ TEST_PROGRAMS_NEED_X += test-scrap-cache-tree @@ -639,6 +639,7 @@ TEST_PROGRAMS_NEED_X += test-scrap-cache-tree
TEST_PROGRAMS_NEED_X += test-sha1
TEST_PROGRAMS_NEED_X += test-sha1-array
TEST_PROGRAMS_NEED_X += test-sigchain
TEST_PROGRAMS_NEED_X += test-strcmp-offset
TEST_PROGRAMS_NEED_X += test-string-list
TEST_PROGRAMS_NEED_X += test-submodule-config
TEST_PROGRAMS_NEED_X += test-subprocess

1
cache.h

@ -599,6 +599,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi @@ -599,6 +599,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
extern int discard_index(struct index_state *);
extern int unmerged_index(const struct index_state *);
extern int verify_path(const char *path);
extern int strcmp_offset(const char *s1, const char *s2, size_t *first_change);
extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
extern void adjust_dirname_case(struct index_state *istate, char *name);
extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);

139
read-cache.c

@ -887,9 +887,32 @@ static int has_file_name(struct index_state *istate, @@ -887,9 +887,32 @@ static int has_file_name(struct index_state *istate,
return retval;
}


/*
* Like strcmp(), but also return the offset of the first change.
* If strings are equal, return the length.
*/
int strcmp_offset(const char *s1, const char *s2, size_t *first_change)
{
size_t k;

if (!first_change)
return strcmp(s1, s2);

for (k = 0; s1[k] == s2[k]; k++)
if (s1[k] == '\0')
break;

*first_change = k;
return (unsigned char)s1[k] - (unsigned char)s2[k];
}

/*
* Do we have another file with a pathname that is a proper
* subset of the name we're trying to add?
*
* That is, is there another file in the index with a path
* that matches a sub-directory in the given entry?
*/
static int has_dir_name(struct index_state *istate,
const struct cache_entry *ce, int pos, int ok_to_replace)
@ -898,9 +921,51 @@ static int has_dir_name(struct index_state *istate, @@ -898,9 +921,51 @@ static int has_dir_name(struct index_state *istate,
int stage = ce_stage(ce);
const char *name = ce->name;
const char *slash = name + ce_namelen(ce);
size_t len_eq_last;
int cmp_last = 0;

/*
* We are frequently called during an iteration on a sorted
* list of pathnames and while building a new index. Therefore,
* there is a high probability that this entry will eventually
* be appended to the index, rather than inserted in the middle.
* If we can confirm that, we can avoid binary searches on the
* components of the pathname.
*
* Compare the entry's full path with the last path in the index.
*/
if (istate->cache_nr > 0) {
cmp_last = strcmp_offset(name,
istate->cache[istate->cache_nr - 1]->name,
&len_eq_last);
if (cmp_last > 0) {
if (len_eq_last == 0) {
/*
* The entry sorts AFTER the last one in the
* index and their paths have no common prefix,
* so there cannot be a F/D conflict.
*/
return retval;
} else {
/*
* The entry sorts AFTER the last one in the
* index, but has a common prefix. Fall through
* to the loop below to disect the entry's path
* and see where the difference is.
*/
}
} else if (cmp_last == 0) {
/*
* The entry exactly matches the last one in the
* index, but because of multiple stage and CE_REMOVE
* items, we fall through and let the regular search
* code handle it.
*/
}
}

for (;;) {
int len;
size_t len;

for (;;) {
if (*--slash == '/')
@ -910,6 +975,67 @@ static int has_dir_name(struct index_state *istate, @@ -910,6 +975,67 @@ static int has_dir_name(struct index_state *istate,
}
len = slash - name;

if (cmp_last > 0) {
/*
* (len + 1) is a directory boundary (including
* the trailing slash). And since the loop is
* decrementing "slash", the first iteration is
* the longest directory prefix; subsequent
* iterations consider parent directories.
*/

if (len + 1 <= len_eq_last) {
/*
* The directory prefix (including the trailing
* slash) also appears as a prefix in the last
* entry, so the remainder cannot collide (because
* strcmp said the whole path was greater).
*
* EQ: last: xxx/A
* this: xxx/B
*
* LT: last: xxx/file_A
* this: xxx/file_B
*/
return retval;
}

if (len > len_eq_last) {
/*
* This part of the directory prefix (excluding
* the trailing slash) is longer than the known
* equal portions, so this sub-directory cannot
* collide with a file.
*
* GT: last: xxxA
* this: xxxB/file
*/
return retval;
}

if (istate->cache_nr > 0 &&
ce_namelen(istate->cache[istate->cache_nr - 1]) > len) {
/*
* The directory prefix lines up with part of
* a longer file or directory name, but sorts
* after it, so this sub-directory cannot
* collide with a file.
*
* last: xxx/yy-file (because '-' sorts before '/')
* this: xxx/yy/abc
*/
return retval;
}

/*
* This is a possible collision. Fall through and
* let the regular search code handle it.
*
* last: xxx
* this: xxx/file
*/
}

pos = index_name_stage_pos(istate, name, len, stage);
if (pos >= 0) {
/*
@ -1001,7 +1127,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e @@ -1001,7 +1127,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e

if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
cache_tree_invalidate_path(istate, ce->name);
pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));

/*
* If this entry's path sorts after the last entry in the index,
* we can avoid searching for it.
*/
if (istate->cache_nr > 0 &&
strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
pos = -istate->cache_nr - 1;
else
pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));

/* existing match? Just replace it. */
if (pos >= 0) {

1
t/helper/.gitignore vendored

@ -28,6 +28,7 @@ @@ -28,6 +28,7 @@
/test-sha1
/test-sha1-array
/test-sigchain
/test-strcmp-offset
/test-string-list
/test-submodule-config
/test-subprocess

22
t/helper/test-strcmp-offset.c

@ -0,0 +1,22 @@ @@ -0,0 +1,22 @@
#include "cache.h"

int cmd_main(int argc, const char **argv)
{
int result;
size_t offset;

if (!argv[1] || !argv[2])
die("usage: %s <string1> <string2>", argv[0]);

result = strcmp_offset(argv[1], argv[2], &offset);

/*
* Because differnt CRTs behave differently, only rely on signs
* of the result values.
*/
result = (result < 0 ? -1 :
result > 0 ? 1 :
0);
printf("%d %"PRIuMAX"\n", result, (uintmax_t)offset);
return 0;
}

67
t/perf/p0006-read-tree-checkout.sh

@ -0,0 +1,67 @@ @@ -0,0 +1,67 @@
#!/bin/sh
#
# This test measures the performance of various read-tree
# and checkout operations. It is primarily interested in
# the algorithmic costs of index operations and recursive
# tree traversal -- and NOT disk I/O on thousands of files.

test_description="Tests performance of read-tree"

. ./perf-lib.sh

test_perf_default_repo

# If the test repo was generated by ./repos/many-files.sh
# then we know something about the data shape and branches,
# so we can isolate testing to the ballast-related commits
# and setup sparse-checkout so we don't have to populate
# the ballast files and directories.
#
# Otherwise, we make some general assumptions about the
# repo and consider the entire history of the current
# branch to be the ballast.

test_expect_success "setup repo" '
if git rev-parse --verify refs/heads/p0006-ballast^{commit}
then
echo Assuming synthetic repo from many-files.sh
git branch br_base master
git branch br_ballast p0006-ballast^
git branch br_ballast_alias p0006-ballast^
git branch br_ballast_plus_1 p0006-ballast
git config --local core.sparsecheckout 1
cat >.git/info/sparse-checkout <<-EOF
/*
!ballast/*
EOF
else
echo Assuming non-synthetic repo...
git branch br_base $(git rev-list HEAD | tail -n 1)
git branch br_ballast HEAD^ || error "no ancestor commit from current head"
git branch br_ballast_alias HEAD^
git branch br_ballast_plus_1 HEAD
fi &&
git checkout -q br_ballast &&
nr_files=$(git ls-files | wc -l)
'

test_perf "read-tree br_base br_ballast ($nr_files)" '
git read-tree -m br_base br_ballast -n
'

test_perf "switch between br_base br_ballast ($nr_files)" '
git checkout -q br_base &&
git checkout -q br_ballast
'

test_perf "switch between br_ballast br_ballast_plus_1 ($nr_files)" '
git checkout -q br_ballast_plus_1 &&
git checkout -q br_ballast
'

test_perf "switch between aliases ($nr_files)" '
git checkout -q br_ballast_alias &&
git checkout -q br_ballast
'

test_done

1
t/perf/repos/.gitignore vendored

@ -0,0 +1 @@ @@ -0,0 +1 @@
gen-*/

85
t/perf/repos/inflate-repo.sh

@ -0,0 +1,85 @@ @@ -0,0 +1,85 @@
#!/bin/sh
# Inflate the size of an EXISTING repo.
#
# This script should be run inside the worktree of a TEST repo.
# It will use the contents of the current HEAD to generate a
# commit containing copies of the current worktree such that the
# total size of the commit has at least <target_size> files.
#
# Usage: [-t target_size] [-b branch_name]

set -e

target_size=10000
branch_name=p0006-ballast
ballast=ballast

while test "$#" -ne 0
do
case "$1" in
-b)
shift;
test "$#" -ne 0 || { echo 'error: -b requires an argument' >&2; exit 1; }
branch_name=$1;
shift ;;
-t)
shift;
test "$#" -ne 0 || { echo 'error: -t requires an argument' >&2; exit 1; }
target_size=$1;
shift ;;
*)
echo "error: unknown option '$1'" >&2; exit 1 ;;
esac
done

git ls-tree -r HEAD >GEN_src_list
nr_src_files=$(cat GEN_src_list | wc -l)

src_branch=$(git symbolic-ref --short HEAD)

echo "Branch $src_branch initially has $nr_src_files files."

if test $target_size -le $nr_src_files
then
echo "Repository already exceeds target size $target_size."
rm GEN_src_list
exit 1
fi

# Create well-known branch and add 1 file change to start
# if off before the ballast.
git checkout -b $branch_name HEAD
echo "$target_size" > inflate-repo.params
git add inflate-repo.params
git commit -q -m params

# Create ballast for in our branch.
copy=1
nr_files=$nr_src_files
while test $nr_files -lt $target_size
do
sed -e "s| | $ballast/$copy/|" <GEN_src_list |
git update-index --index-info

nr_files=$(expr $nr_files + $nr_src_files)
copy=$(expr $copy + 1)
done
rm GEN_src_list
git commit -q -m "ballast"

# Modify 1 file and commit.
echo "$target_size" >> inflate-repo.params
git add inflate-repo.params
git commit -q -m "ballast plus 1"

nr_files=$(git ls-files | wc -l)

# Checkout master to put repo in canonical state (because
# the perf test may need to clone and enable sparse-checkout
# before attempting to checkout a commit with the ballast
# (because it may contain 100K directories and 1M files)).
git checkout $src_branch

echo "Repository inflated. Branch $branch_name has $nr_files files."

exit 0

110
t/perf/repos/many-files.sh

@ -0,0 +1,110 @@ @@ -0,0 +1,110 @@
#!/bin/sh
# Generate test data repository using the given parameters.
# When omitted, we create "gen-many-files-d-w-f.git".
#
# Usage: [-r repo] [-d depth] [-w width] [-f files]
#
# -r repo: path to the new repo to be generated
# -d depth: the depth of sub-directories
# -w width: the number of sub-directories at each level
# -f files: the number of files created in each directory
#
# Note that all files will have the same SHA-1 and each
# directory at a level will have the same SHA-1, so we
# will potentially have a large index, but not a large
# ODB.
#
# Ballast will be created under "ballast/".

EMPTY_BLOB=e69de29bb2d1d6434b8b29ae775ad8c2e48c5391

set -e

# (5, 10, 9) will create 999,999 ballast files.
# (4, 10, 9) will create 99,999 ballast files.
depth=5
width=10
files=9

while test "$#" -ne 0
do
case "$1" in
-r)
shift;
test "$#" -ne 0 || { echo 'error: -r requires an argument' >&2; exit 1; }
repo=$1;
shift ;;
-d)
shift;
test "$#" -ne 0 || { echo 'error: -d requires an argument' >&2; exit 1; }
depth=$1;
shift ;;
-w)
shift;
test "$#" -ne 0 || { echo 'error: -w requires an argument' >&2; exit 1; }
width=$1;
shift ;;
-f)
shift;
test "$#" -ne 0 || { echo 'error: -f requires an argument' >&2; exit 1; }
files=$1;
shift ;;
*)
echo "error: unknown option '$1'" >&2; exit 1 ;;
esac
done

# Inflate the index with thousands of empty files.
# usage: dir depth width files
fill_index() {
awk -v arg_dir=$1 -v arg_depth=$2 -v arg_width=$3 -v arg_files=$4 '
function make_paths(dir, depth, width, files, f, w) {
for (f = 1; f <= files; f++) {
print dir "/file" f
}
if (depth > 0) {
for (w = 1; w <= width; w++) {
make_paths(dir "/dir" w, depth - 1, width, files)
}
}
}
END { make_paths(arg_dir, arg_depth, arg_width, arg_files) }
' </dev/null |
sed "s/^/100644 $EMPTY_BLOB /" |
git update-index --index-info
return 0
}

[ -z "$repo" ] && repo=gen-many-files-$depth.$width.$files.git

mkdir $repo
cd $repo
git init .

# Create an initial commit just to define master.
touch many-files.empty
echo "$depth $width $files" >many-files.params
git add many-files.*
git commit -q -m params

# Create ballast for p0006 based upon the given params and
# inflate the index with thousands of empty files and commit.
git checkout -b p0006-ballast
fill_index "ballast" $depth $width $files
git commit -q -m "ballast"

nr_files=$(git ls-files | wc -l)

# Modify 1 file and commit.
echo "$depth $width $files" >>many-files.params
git add many-files.params
git commit -q -m "ballast plus 1"

# Checkout master to put repo in canonical state (because
# the perf test may need to clone and enable sparse-checkout
# before attempting to checkout a commit with the ballast
# (because it may contain 100K directories and 1M files)).
git checkout master

echo "Repository "$repo" ($depth, $width, $files) created. Ballast $nr_files."
exit 0

21
t/t0065-strcmp-offset.sh

@ -0,0 +1,21 @@ @@ -0,0 +1,21 @@
#!/bin/sh

test_description='Test strcmp_offset functionality'

. ./test-lib.sh

while read s1 s2 expect
do
test_expect_success "strcmp_offset($s1, $s2)" '
echo "$expect" >expect &&
test-strcmp-offset "$s1" "$s2" >actual &&
test_cmp expect actual
'
done <<-EOF
abc abc 0 3
abc def -1 0
abc abz -1 2
abc abcdef -1 3
EOF

test_done
Loading…
Cancel
Save