[PATCH] fix scalability problems with git-deltafy-script

The current version spins and exhausts memory while attempting to sort
all files from all revisions at once, and dies before doing any real
work.  This is especially noticeable when used on a big repository
like the imported bkcvs repo for the Linux kernel.

This patch batches the sort, which puts a bound on the resources needed
and lets the script make progress early.  It also includes some small
cleanups.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
maint
Nicolas Pitre 2005-06-21 10:18:00 -04:00 committed by Linus Torvalds
parent 69f956e104
commit 83ba99bc8c
1 changed files with 32 additions and 18 deletions

View File

@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash


# Example script to deltafy an entire GIT repository based on the commit list. # Example script to deltify an entire GIT repository based on the commit list.
# The most recent version of a file is the reference and previous versions # The most recent version of a file is the reference and previous versions
# are made delta against the best earlier version available. And so on for # are made delta against the best earlier version available. And so on for
# successive versions going back in time. This way the increasing delta # successive versions going back in time. This way the increasing delta
@ -25,37 +25,51 @@


set -e set -e


depth= max_depth=
[ "$1" == "-d" ] && depth="--max-depth=$2" && shift 2 [ "$1" == "-d" ] && max_depth="--max-depth=$2" && shift 2

overlap=30
max_behind="--max-behind=$overlap"


function process_list() { function process_list() {
if [ "$list" ]; then if [ "$list" ]; then
echo "Processing $curr_file" echo "Processing $curr_file"
echo "$head $list" | xargs git-mkdelta $depth --max-behind=30 -v echo "$list" | xargs git-mkdelta $max_depth $max_behind -v
fi fi
} }


rev_list=""
curr_file="" curr_file=""


git-rev-list HEAD | git-rev-list HEAD |
git-diff-tree -r -t --stdin | while true; do
awk '/^:/ { if ($5 == "M" || $5 == "N") print $4, $6; # Let's batch revisions into groups of 1000 to give it a chance to
if ($5 == "M") print $3, $6 }' | # scale with repositories containing long revision lists. We also
LC_ALL=C sort -s -k 2 | uniq | # overlap with the previous batch the size of mkdelta's look behind
while read sha1 file; do # value in order to account for the processing discontinuity.
if [ "$file" == "$curr_file" ]; then rev_list="$(echo -e -n "$rev_list" | tail --lines=$overlap)"
list="$list $sha1" for i in $(seq 1000); do
else read rev || break
process_list rev_list="$rev_list$rev\n"
curr_file="$file" done
list="" echo -e -n "$rev_list" |
head="$sha1" git-diff-tree -r -t --stdin |
fi awk '/^:/ { if ($5 == "M") printf "%s %s\n%s %s\n", $4, $6, $3, $6 }' |
LC_ALL=C sort -s -k 2 | uniq |
while read sha1 file; do
if [ "$file" == "$curr_file" ]; then
list="$list $sha1"
else
process_list
curr_file="$file"
list="$sha1"
fi
done
[ "$rev" ] || break
done done
process_list process_list


curr_file="root directory" curr_file="root directory"
head=""
list="$( list="$(
git-rev-list HEAD | git-rev-list HEAD |
while read commit; do while read commit; do