From 565e4b79816c1618161a5bd647982aa0daff81c4 Mon Sep 17 00:00:00 2001 From: "Strain, Roger L" Date: Fri, 28 Sep 2018 13:35:37 -0500 Subject: [PATCH 1/5] subtree: refactor split of a commit into standalone method In a particularly complex repo, subtree split was not creating compatible splits for pushing back to a separate repo. Addressing one of the issues requires recursive handling of parent commits that were not initially considered by the algorithm. This commit makes no functional changes, but relocates the code to be called recursively into a new method to simplify comparisons of later commits. Signed-off-by: Strain, Roger L Signed-off-by: Junio C Hamano --- contrib/subtree/git-subtree.sh | 78 ++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree.sh index d3f39a862a..2cd7b345b9 100755 --- a/contrib/subtree/git-subtree.sh +++ b/contrib/subtree/git-subtree.sh @@ -598,6 +598,47 @@ ensure_valid_ref_format () { die "'$1' does not look like a ref" } +process_split_commit () { + local rev="$1" + local parents="$2" + revcount=$(($revcount + 1)) + progress "$revcount/$revmax ($createcount)" + debug "Processing commit: $rev" + exists=$(cache_get "$rev") + if test -n "$exists" + then + debug " prior: $exists" + return + fi + createcount=$(($createcount + 1)) + debug " parents: $parents" + newparents=$(cache_get $parents) + debug " newparents: $newparents" + + tree=$(subtree_for_commit "$rev" "$dir") + debug " tree is: $tree" + + check_parents $parents + + # ugly. is there no better way to tell if this is a subtree + # vs. a mainline commit? Does it matter? + if test -z "$tree" + then + set_notree "$rev" + if test -n "$newparents" + then + cache_set "$rev" "$rev" + fi + return + fi + + newrev=$(copy_or_skip "$rev" "$tree" "$newparents") || exit $? 
+ debug " newrev is: $newrev" + cache_set "$rev" "$newrev" + cache_set latest_new "$newrev" + cache_set latest_old "$rev" +} + cmd_add () { if test -e "$dir" then @@ -706,42 +747,7 @@ cmd_split () { eval "$grl" | while read rev parents do - revcount=$(($revcount + 1)) - progress "$revcount/$revmax ($createcount)" - debug "Processing commit: $rev" - exists=$(cache_get "$rev") - if test -n "$exists" - then - debug " prior: $exists" - continue - fi - createcount=$(($createcount + 1)) - debug " parents: $parents" - newparents=$(cache_get $parents) - debug " newparents: $newparents" - - tree=$(subtree_for_commit "$rev" "$dir") - debug " tree is: $tree" - - check_parents $parents - - # ugly. is there no better way to tell if this is a subtree - # vs. a mainline commit? Does it matter? - if test -z "$tree" - then - set_notree "$rev" - if test -n "$newparents" - then - cache_set "$rev" "$rev" - fi - continue - fi - - newrev=$(copy_or_skip "$rev" "$tree" "$newparents") || exit $? - debug " newrev is: $newrev" - cache_set "$rev" "$newrev" - cache_set latest_new "$newrev" - cache_set latest_old "$rev" + process_split_commit "$rev" "$parents" done || exit $? latest_new=$(cache_get latest_new) From dd21d43b58ee3ce1b9801b5121fead0298961be1 Mon Sep 17 00:00:00 2001 From: "Strain, Roger L" Date: Fri, 28 Sep 2018 13:35:38 -0500 Subject: [PATCH 2/5] subtree: make --ignore-joins pay attention to adds Changes the behavior of --ignore-joins to always consider a subtree add commit, and ignore only splits and squashes. The --ignore-joins option is documented to ignore prior --rejoin commits. However, it additionally ignored subtree add commits generated when a subtree was initially added to a repo. Due to the logic which determines whether a commit is a mainline commit or a subtree commit (namely, the presence or absence of content in the subtree prefix) this causes commits before the initial add to appear to be part of the subtree. 
An --ignore-joins split would therefore consider those commits part of the subtree history and include them at the beginning of the synthetic history, causing the resulting hashes to be incorrect for all later commits. Signed-off-by: Strain, Roger L Signed-off-by: Junio C Hamano --- contrib/subtree/git-subtree.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree.sh index 2cd7b345b9..d8861f3065 100755 --- a/contrib/subtree/git-subtree.sh +++ b/contrib/subtree/git-subtree.sh @@ -340,7 +340,12 @@ find_existing_splits () { revs="$2" main= sub= - git log --grep="^git-subtree-dir: $dir/*\$" \ + local grep_format="^git-subtree-dir: $dir/*\$" + if test -n "$ignore_joins" + then + grep_format="^Add '$dir/' from commit '" + fi + git log --grep="$grep_format" \ --no-show-signature --pretty=format:'START %H%n%s%n%n%b%nEND%n' $revs | while read a b junk do @@ -730,12 +735,7 @@ cmd_split () { done fi - if test -n "$ignore_joins" - then - unrevs= - else - unrevs="$(find_existing_splits "$dir" "$revs")" - fi + unrevs="$(find_existing_splits "$dir" "$revs")" # We can't restrict rev-list to only $dir here, because some of our # parents have the $dir contents the root, and those won't match. From 315a84f9aa0e2e629b0680068646b0032518ebed Mon Sep 17 00:00:00 2001 From: "Strain, Roger L" Date: Fri, 28 Sep 2018 13:35:39 -0500 Subject: [PATCH 3/5] subtree: use commits before rejoins for splits Adds recursive evaluation of parent commits which were not part of the initial commit list when performing a split. Split expects all relevant commits to be reachable from the target commit but not reachable from any previous rejoins. However, a branch could be based on a commit prior to a rejoin, then later merged back into the current code. In this case, a parent to the commit will not be present in the initial list of commits, triggering an "incorrect order" warning. 
Previous behavior was to consider that commit to have no parent, creating an original commit containing all subtree content. This commit is not present in an existing subtree commit graph, changing commit hashes and making pushing to a subtree repo impossible. New behavior will recursively check these unexpected parent commits to track them back to either an earlier rejoin, or a true original commit. The generated synthetic commits will properly match previously-generated commits, allowing successful pushing to a prior subtree repo. Signed-off-by: Strain, Roger L Signed-off-by: Junio C Hamano --- contrib/subtree/git-subtree.sh | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree.sh index d8861f3065..23dd04cbe8 100755 --- a/contrib/subtree/git-subtree.sh +++ b/contrib/subtree/git-subtree.sh @@ -231,12 +231,14 @@ cache_miss () { } check_parents () { - missed=$(cache_miss "$@") + missed=$(cache_miss "$1") + local indent=$(($2 + 1)) for miss in $missed do if ! 
test -r "$cachedir/notree/$miss" then debug " incorrect order: $miss" + process_split_commit "$miss" "" "$indent" fi done } @@ -606,8 +608,20 @@ ensure_valid_ref_format () { process_split_commit () { local rev="$1" local parents="$2" - revcount=$(($revcount + 1)) - progress "$revcount/$revmax ($createcount)" + local indent=$3 + + if test $indent -eq 0 + then + revcount=$(($revcount + 1)) + else + # processing commit without normal parent information; + # fetch from repo + parents=$(git show -s --pretty=%P "$rev") + extracount=$(($extracount + 1)) + fi + + progress "$revcount/$revmax ($createcount) [$extracount]" + debug "Processing commit: $rev" exists=$(cache_get "$rev") if test -n "$exists" @@ -617,14 +631,13 @@ process_split_commit () { fi createcount=$(($createcount + 1)) debug " parents: $parents" + check_parents "$parents" "$indent" newparents=$(cache_get $parents) debug " newparents: $newparents" tree=$(subtree_for_commit "$rev" "$dir") debug " tree is: $tree" - check_parents $parents - # ugly. is there no better way to tell if this is a subtree # vs. a mainline commit? Does it matter? if test -z "$tree" @@ -744,10 +757,11 @@ cmd_split () { revmax=$(eval "$grl" | wc -l) revcount=0 createcount=0 + extracount=0 eval "$grl" | while read rev parents do - process_split_commit "$rev" "$parents" + process_split_commit "$rev" "$parents" 0 done || exit $? latest_new=$(cache_get latest_new) From 68f8ff81513fb3599ef3dfc3dd11da36d868e91b Mon Sep 17 00:00:00 2001 From: "Strain, Roger L" Date: Fri, 28 Sep 2018 13:35:40 -0500 Subject: [PATCH 4/5] subtree: improve decision on merges kept in split When multiple identical parents are detected for a commit being considered for copying, explicitly check whether one is the common merge base between the commits. If so, the other commit can be used as the identical parent; if not, a merge must be performed to maintain history. 
In some situations two parents of a merge commit may appear to both have identical subtree content with each other and the current commit. However, those parents can potentially come from different commit graphs. Previous behavior would simply select one of the identical parents to serve as the replacement for this commit, based on the order in which they were processed. New behavior compares the merge base between the commits to determine if a new merge commit is necessary to maintain history despite the identical content. Signed-off-by: Strain, Roger L Signed-off-by: Junio C Hamano --- contrib/subtree/git-subtree.sh | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree.sh index 23dd04cbe8..1c157dbd95 100755 --- a/contrib/subtree/git-subtree.sh +++ b/contrib/subtree/git-subtree.sh @@ -541,6 +541,7 @@ copy_or_skip () { nonidentical= p= gotparents= + copycommit= for parent in $newparents do ptree=$(toptree_for_commit $parent) || exit $? @@ -548,7 +549,24 @@ copy_or_skip () { if test "$ptree" = "$tree" then # an identical parent could be used in place of this rev. 
- identical="$parent" + if test -n "$identical" + then + # if a previous identical parent was found, check whether + # one is already an ancestor of the other + mergebase=$(git merge-base $identical $parent) + if test "$identical" = "$mergebase" + then + # current identical commit is an ancestor of parent + identical="$parent" + elif test "$parent" != "$mergebase" + then + # no common history; commit must be copied + copycommit=1 + fi + else + # first identical parent detected + identical="$parent" + fi else nonidentical="$parent" fi @@ -571,7 +589,6 @@ copy_or_skip () { fi done - copycommit= if test -n "$identical" && test -n "$nonidentical" then extras=$(git rev-list --count $identical..$nonidentical) From 19ad68d95d6f8104eca1e86f8d1dfae50c7fb268 Mon Sep 17 00:00:00 2001 From: Roger Strain Date: Fri, 12 Oct 2018 08:52:18 -0500 Subject: [PATCH 5/5] subtree: performance improvement for finding unexpected parent commits After testing a previous patch at larger scale, a performance issue was detected when using git show to locate parent revisions, with a single run of the git show command taking 2 seconds or longer in a complex repo. When the command is required tens or hundreds of times in a run of the script, the additional wait time is unacceptable. Replacing the command with git rev-parse resulted in significantly increased performance, with the command in question returning instantly. Signed-off-by: Roger Strain Signed-off-by: Junio C Hamano --- contrib/subtree/git-subtree.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree.sh index 1c157dbd95..147201dc6c 100755 --- a/contrib/subtree/git-subtree.sh +++ b/contrib/subtree/git-subtree.sh @@ -633,7 +633,7 @@ process_split_commit () { else # processing commit without normal parent information; # fetch from repo - parents=$(git show -s --pretty=%P "$rev") + parents=$(git rev-parse "$rev^@") extracount=$(($extracount + 1)) fi