From f15eb7c1cf2674df69c1ff8aebdc5536a580c65e Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Wed, 3 Feb 2021 20:03:46 +0000 Subject: [PATCH 1/8] diffcore-rename: no point trying to find a match better than exact MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit diffcore_rename() had some code to avoid having destination paths that already had an exact rename detected from being re-checked for other renames. Source paths, however, were re-checked because we wanted to allow the possibility of detecting copies. But if copy detection isn't turned on, then this merely amounts to attempting to find a better-than-exact match, which naturally ends up being an expensive no-op. In particular, copy detection is never turned on by the merge machinery. For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 14.263 s ± 0.053 s 14.119 s ± 0.101 s mega-renames: 5504.231 s ± 5.150 s 1802.044 s ± 0.828 s just-one-mega: 158.534 s ± 0.498 s 51.391 s ± 0.028 s Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- diffcore-rename.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 8fe6c9384b..8b118628b4 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -463,9 +463,11 @@ void diffcore_rename(struct diff_options *options) struct diff_score *mx; int i, j, rename_count, skip_unmodified = 0; int num_destinations, dst_cnt; + int num_sources, want_copies; struct progress *progress = NULL; trace2_region_enter("diff", "setup", options->repo); + want_copies = (detect_rename == DIFF_DETECT_COPY); if (!minimum_score) minimum_score = DEFAULT_RENAME_SCORE; @@ -502,7 +504,7 @@ void diffcore_rename(struct diff_options *options) p->one->rename_used++; register_rename_src(p); } - else if (detect_rename == DIFF_DETECT_COPY) { + else if (want_copies) { /* * Increment the "rename_used" score by * one, to indicate ourselves as a user. @@ -532,12 +534,15 @@ void diffcore_rename(struct diff_options *options) * files still remain as options for rename/copies!) */ num_destinations = (rename_dst_nr - rename_count); + num_sources = rename_src_nr; + if (!want_copies) + num_sources -= rename_count; /* All done? */ - if (!num_destinations) + if (!num_destinations || !num_sources) goto cleanup; - switch (too_many_rename_candidates(num_destinations, rename_src_nr, + switch (too_many_rename_candidates(num_destinations, num_sources, options)) { case 1: goto cleanup; @@ -553,7 +558,7 @@ void diffcore_rename(struct diff_options *options) if (options->show_rename_progress) { progress = start_delayed_progress( _("Performing inexact rename detection"), - (uint64_t)num_destinations * (uint64_t)rename_src_nr); + (uint64_t)num_destinations * (uint64_t)num_sources); } mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations), @@ -573,6 +578,9 @@ void diffcore_rename(struct diff_options *options) struct diff_filespec *one = rename_src[j].p->one; struct diff_score this_src; + if (one->rename_used && !want_copies) + continue; + if (skip_unmodified && diff_unmodified_pair(rename_src[j].p)) continue; @@ -594,7 +602,7 @@ void diffcore_rename(struct diff_options *options) } dst_cnt++; display_progress(progress, - (uint64_t)dst_cnt * (uint64_t)rename_src_nr); + (uint64_t)dst_cnt * (uint64_t)num_sources); } stop_progress(&progress); @@ -602,7 +610,7 @@ void diffcore_rename(struct diff_options *options) STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare); rename_count += find_renames(mx, dst_cnt, minimum_score, 0); - if (detect_rename == DIFF_DETECT_COPY) + if (want_copies) rename_count += find_renames(mx, dst_cnt, minimum_score, 1); free(mx); trace2_region_leave("diff", "inexact renames", options->repo); From 829514c5151deee1b37cdeaf451bf28219602126 Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Sun, 14 Feb 2021 07:35:01 +0000 Subject: [PATCH 2/8] diffcore-rename: filter rename_src list when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have to look at each entry in rename_src a total of rename_dst_nr times. When we're not detecting copies, any exact renames or ignorable rename paths will just be skipped over. While checking that these can be skipped over is a relatively cheap check, it's still a waste of time to do that check more than once, let alone rename_dst_nr times. When rename_src_nr is a few thousand times bigger than the number of relevant sources (such as when cherry-picking a commit that only touched a handful of files, but from a side of history that has different names for some high level directories), this time can add up. First make an initial pass over the rename_src array and move all the relevant entries to the front, so that we can iterate over just those relevant entries. For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 14.119 s ± 0.101 s 13.815 s ± 0.062 s mega-renames: 1802.044 s ± 0.828 s 1799.937 s ± 0.493 s just-one-mega: 51.391 s ± 0.028 s 51.289 s ± 0.019 s Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- diffcore-rename.c | 59 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 8b118628b4..6fd0c4a2f4 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -454,6 +454,54 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i return count; } +static void remove_unneeded_paths_from_src(int detecting_copies) +{ + int i, new_num_src; + + if (detecting_copies) + return; /* nothing to remove */ + if (break_idx) + return; /* culling incompatible with break detection */ + + /* + * Note on reasons why we cull unneeded sources but not destinations: + * 1) Pairings are stored in rename_dst (not rename_src), which we + * need to keep around. So, we just can't cull rename_dst even + * if we wanted to. But doing so wouldn't help because... + * + * 2) There is a matrix pairwise comparison that follows the + * "Performing inexact rename detection" progress message. + * Iterating over the destinations is done in the outer loop, + * hence we only iterate over each of those once and we can + * easily skip the outer loop early if the destination isn't + * relevant. That's only one check per destination path to + * skip. + * + * By contrast, the sources are iterated in the inner loop; if + * we check whether a source can be skipped, then we'll be + * checking it N separate times, once for each destination. + * We don't want to have to iterate over known-not-needed + * sources N times each, so avoid that by removing the sources + * from rename_src here. + */ + for (i = 0, new_num_src = 0; i < rename_src_nr; i++) { + /* + * renames are stored in rename_dst, so if a rename has + * already been detected using this source, we can just + * remove the source knowing rename_dst has its info. + */ + if (rename_src[i].p->one->rename_used) + continue; + + if (new_num_src < i) + memcpy(&rename_src[new_num_src], &rename_src[i], + sizeof(struct diff_rename_src)); + new_num_src++; + } + + rename_src_nr = new_num_src; +} + void diffcore_rename(struct diff_options *options) { int detect_rename = options->detect_rename; @@ -529,14 +577,10 @@ void diffcore_rename(struct diff_options *options) if (minimum_score == MAX_SCORE) goto cleanup; - /* - * Calculate how many renames are left (but all the source - * files still remain as options for rename/copies!) - */ + /* Calculate how many renames are left */ num_destinations = (rename_dst_nr - rename_count); + remove_unneeded_paths_from_src(want_copies); num_sources = rename_src_nr; - if (!want_copies) - num_sources -= rename_count; /* All done? */ if (!num_destinations || !num_sources) @@ -578,8 +622,7 @@ void diffcore_rename(struct diff_options *options) struct diff_filespec *one = rename_src[j].p->one; struct diff_score this_src; - if (one->rename_used && !want_copies) - continue; + assert(!one->rename_used || want_copies || break_idx); if (skip_unmodified && diff_unmodified_pair(rename_src[j].p)) From f3845257a55aa5c949ce2543d53fe1f28b5072df Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Sun, 14 Feb 2021 07:51:46 +0000 Subject: [PATCH 3/8] t4001: add a test comparing basename similarity and content similarity Add a simple test where a removed file is similar to two different added files; one of them has the same basename, and the other has a slightly higher content similarity. In the current test, content similarity is weighted higher than filename similarity. Subsequent commits will add a new rule that weighs a mixture of filename similarity and content similarity in a manner that will change the outcome of this testcase. Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- t/t4001-diff-rename.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/t/t4001-diff-rename.sh b/t/t4001-diff-rename.sh index c16486a9d4..0f97858197 100755 --- a/t/t4001-diff-rename.sh +++ b/t/t4001-diff-rename.sh @@ -262,4 +262,27 @@ test_expect_success 'diff-tree -l0 defaults to a big rename limit, not zero' ' grep "myotherfile.*myfile" actual ' +test_expect_success 'basename similarity vs best similarity' ' + mkdir subdir && + test_write_lines line1 line2 line3 line4 line5 \ + line6 line7 line8 line9 line10 >subdir/file.txt && + git add subdir/file.txt && + git commit -m "base txt" && + + git rm subdir/file.txt && + test_write_lines line1 line2 line3 line4 line5 \ + line6 line7 line8 >file.txt && + test_write_lines line1 line2 line3 line4 line5 \ + line6 line7 line8 line9 >file.md && + git add file.txt file.md && + git commit -a -m "rename" && + git diff-tree -r -M --name-status HEAD^ HEAD >actual && + # subdir/file.txt is 88% similar to file.md and 78% similar to file.txt + cat >expected <<-\EOF && + R088 subdir/file.txt file.md + A file.txt + EOF + test_cmp expected actual +' + test_done From a35df3371c2e2e9b407ff8c950169e74f6bf4add Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Sun, 14 Feb 2021 07:51:47 +0000 Subject: [PATCH 4/8] diffcore-rename: compute basenames of source and dest candidates We want to make use of unique basenames among remaining source and destination files to help inform rename detection, so that more likely pairings can be checked first. (src/moduleA/foo.txt and source/module/A/foo.txt are likely related if there are no other 'foo.txt' files among the remaining deleted and added files.) Add a new function, not yet used, which creates a map of the unique basenames within rename_src and another within rename_dst, together with the indices within rename_src/rename_dst where those basenames show up. Non-unique basenames still show up in the map, but have an invalid index (-1). This function was inspired by the fact that in real world repositories, files are often moved across directories without changing names. Here are some sample repositories and the percentage of their historical renames (as of early 2020) that preserved basenames: * linux: 76% * gcc: 64% * gecko: 79% * webkit: 89% These statistics alone don't prove that an optimization in this area will help or how much it will help, since there are also unpaired adds and deletes, restrictions on which basenames we consider, etc., but it certainly motivated the idea to try something in this area. Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- diffcore-rename.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/diffcore-rename.c b/diffcore-rename.c index 6fd0c4a2f4..e51f33a218 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -367,6 +367,69 @@ static int find_exact_renames(struct diff_options *options) return renames; } +static const char *get_basename(const char *filename) +{ + /* + * gitbasename() has to worry about special drives, multiple + * directory separator characters, trailing slashes, NULL or + * empty strings, etc. We only work on filenames as stored in + * git, and thus get to ignore all those complications. + */ + const char *base = strrchr(filename, '/'); + return base ? base + 1 : filename; +} + +MAYBE_UNUSED +static int find_basename_matches(struct diff_options *options, + int minimum_score) +{ + int i; + struct strintmap sources; + struct strintmap dests; + + /* + * Create maps of basename -> fullname(s) for remaining sources and + * dests. + */ + strintmap_init_with_options(&sources, -1, NULL, 0); + strintmap_init_with_options(&dests, -1, NULL, 0); + for (i = 0; i < rename_src_nr; ++i) { + char *filename = rename_src[i].p->one->path; + const char *base; + + /* exact renames removed in remove_unneeded_paths_from_src() */ + assert(!rename_src[i].p->one->rename_used); + + /* Record index within rename_src (i) if basename is unique */ + base = get_basename(filename); + if (strintmap_contains(&sources, base)) + strintmap_set(&sources, base, -1); + else + strintmap_set(&sources, base, i); + } + for (i = 0; i < rename_dst_nr; ++i) { + char *filename = rename_dst[i].p->two->path; + const char *base; + + if (rename_dst[i].is_rename) + continue; /* involved in exact match already. */ + + /* Record index within rename_dst (i) if basename is unique */ + base = get_basename(filename); + if (strintmap_contains(&dests, base)) + strintmap_set(&dests, base, -1); + else + strintmap_set(&dests, base, i); + } + + /* TODO: Make use of basenames source and destination basenames */ + + strintmap_clear(&sources); + strintmap_clear(&dests); + + return 0; +} + #define NUM_CANDIDATE_PER_DST 4 static void record_if_better(struct diff_score m[], struct diff_score *o) { From da09f651277a982daa28227a13cd48d15b7245e1 Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Sun, 14 Feb 2021 07:51:48 +0000 Subject: [PATCH 5/8] diffcore-rename: complete find_basename_matches() It is not uncommon in real world repositories for the majority of file renames to not change the basename of the file; i.e. most "renames" are just a move of files into different directories. We can make use of this to avoid comparing all rename source candidates with all rename destination candidates, by first comparing sources to destinations with the same basenames. If two files with the same basename are sufficiently similar, we record the rename; if not, we include those files in the more exhaustive matrix comparison. This means we are adding a set of preliminary additional comparisons, but for each file we only compare it with at most one other file. For example, if there was a include/media/device.h that was deleted and a src/module/media/device.h that was added, and there are no other device.h files in the remaining sets of added and deleted files after exact rename detection, then these two files would be compared in the preliminary step. This commit does not yet actually employ this new optimization, it merely adds a function which can be used for this purpose. The next commit will do the necessary plumbing to make use of it. Note that this optimization might give us different results than without the optimization, because it's possible that despite files with the same basename being sufficiently similar to be considered a rename, there's an even better match between files without the same basename. I think that is okay for four reasons: (1) it's easy to explain to the users what happened if it does ever occur (or even for them to intuitively figure out), (2) as the next patch will show it provides such a large performance boost that it's worth the tradeoff, and (3) it's somewhat unlikely that despite having unique matching basenames that other files serve as better matches. Reason (4) takes a full paragraph to explain... If the previous three reasons aren't enough, consider what rename detection already does. Break detection is not the default, meaning that if files have the same _fullname_, then they are considered related even if they are 0% similar. In fact, in such a case, we don't even bother comparing the files to see if they are similar let alone comparing them to all other files to see what they are most similar to. Basically, we override content similarity based on sufficient filename similarity. Without the filename similarity (currently implemented as an exact match of filename), we swing the pendulum the opposite direction and say that filename similarity is irrelevant and compare a full N x M matrix of sources and destinations to find out which have the most similar contents. This optimization just adds another form of filename similarity comparison, but augments it with a file content similarity check as well. Basically, if two files have the same basename and are sufficiently similar to be considered a rename, mark them as such without comparing the two to all other rename candidates. Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- diffcore-rename.c | 82 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index e51f33a218..266d4fae48 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -383,9 +383,53 @@ MAYBE_UNUSED static int find_basename_matches(struct diff_options *options, int minimum_score) { - int i; + /* + * When I checked in early 2020, over 76% of file renames in linux + * just moved files to a different directory but kept the same + * basename. gcc did that with over 64% of renames, gecko did it + * with over 79%, and WebKit did it with over 89%. + * + * Therefore we can bypass the normal exhaustive NxM matrix + * comparison of similarities between all potential rename sources + * and destinations by instead using file basename as a hint (i.e. + * the portion of the filename after the last '/'), checking for + * similarity between files with the same basename, and if we find + * a pair that are sufficiently similar, record the rename pair and + * exclude those two from the NxM matrix. + * + * This *might* cause us to find a less than optimal pairing (if + * there is another file that we are even more similar to but has a + * different basename). Given the huge performance advantage + * basename matching provides, and given the frequency with which + * people use the same basename in real world projects, that's a + * trade-off we are willing to accept when doing just rename + * detection. + * + * If someone wants copy detection that implies they are willing to + * spend more cycles to find similarities between files, so it may + * be less likely that this heuristic is wanted. If someone is + * doing break detection, that means they do not want filename + * similarity to imply any form of content similiarity, and thus + * this heuristic would definitely be incompatible. + */ + + int i, renames = 0; struct strintmap sources; struct strintmap dests; + struct hashmap_iter iter; + struct strmap_entry *entry; + + /* + * The prefeteching stuff wants to know if it can skip prefetching + * blobs that are unmodified...and will then do a little extra work + * to verify that the oids are indeed different before prefetching. + * Unmodified blobs are only relevant when doing copy detection; + * when limiting to rename detection, diffcore_rename[_extended]() + * will never be called with unmodified source paths fed to us, so + * the extra work necessary to check if rename_src entries are + * unmodified would be a small waste. + */ + int skip_unmodified = 0; /* * Create maps of basename -> fullname(s) for remaining sources and @@ -422,12 +466,44 @@ static int find_basename_matches(struct diff_options *options, strintmap_set(&dests, base, i); } - /* TODO: Make use of basenames source and destination basenames */ + /* Now look for basename matchups and do similarity estimation */ + strintmap_for_each_entry(&sources, &iter, entry) { + const char *base = entry->key; + intptr_t src_index = (intptr_t)entry->value; + intptr_t dst_index; + if (src_index == -1) + continue; + + if (0 <= (dst_index = strintmap_get(&dests, base))) { + struct diff_filespec *one, *two; + int score; + + /* Estimate the similarity */ + one = rename_src[src_index].p->one; + two = rename_dst[dst_index].p->two; + score = estimate_similarity(options->repo, one, two, + minimum_score, skip_unmodified); + + /* If sufficiently similar, record as rename pair */ + if (score < minimum_score) + continue; + record_rename_pair(dst_index, src_index, score); + renames++; + + /* + * Found a rename so don't need text anymore; if we + * didn't find a rename, the filespec_blob would get + * re-used when doing the matrix of comparisons. + */ + diff_free_filespec_blob(one); + diff_free_filespec_blob(two); + } + } strintmap_clear(&sources); strintmap_clear(&dests); - return 0; + return renames; } #define NUM_CANDIDATE_PER_DST 4 From bd24aa2f97a08fdd5a4982bc6268f70c6bb7b747 Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Sun, 14 Feb 2021 07:51:49 +0000 Subject: [PATCH 6/8] diffcore-rename: guide inexact rename detection based on basenames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make use of the new find_basename_matches() function added in the last two patches, to find renames more rapidly in cases where we can match up files based on basenames. As a quick reminder (see the last two commit messages for more details), this means for example that docs/extensions.txt and docs/config/extensions.txt are considered likely renames if there are no remaining 'extensions.txt' files elsewhere among the added and deleted files, and if a similarity check confirms they are similar, then they are marked as a rename without looking for a better similarity match among other files. This is a behavioral change, as covered in more detail in the previous commit message. We do not use this heuristic together with either break or copy detection. The point of break detection is to say that filename similarity does not imply file content similarity, and we only want to know about file content similarity. The point of copy detection is to use more resources to check for additional similarities, while this is an optimization that uses far less resources but which might also result in finding slightly fewer similarities. So the idea behind this optimization goes against both of those features, and will be turned off for both. For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 13.815 s ± 0.062 s 13.294 s ± 0.103 s mega-renames: 1799.937 s ± 0.493 s 187.248 s ± 0.882 s just-one-mega: 51.289 s ± 0.019 s 5.557 s ± 0.017 s Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- diffcore-rename.c | 53 ++++++++++++++++++++++++++++++++++++++---- t/t4001-diff-rename.sh | 7 +++--- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 266d4fae48..41558185ae 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -379,7 +379,6 @@ static const char *get_basename(const char *filename) return base ? base + 1 : filename; } -MAYBE_UNUSED static int find_basename_matches(struct diff_options *options, int minimum_score) { @@ -716,11 +715,55 @@ void diffcore_rename(struct diff_options *options) if (minimum_score == MAX_SCORE) goto cleanup; - /* Calculate how many renames are left */ - num_destinations = (rename_dst_nr - rename_count); - remove_unneeded_paths_from_src(want_copies); num_sources = rename_src_nr; + if (want_copies || break_idx) { + /* + * Cull sources: + * - remove ones corresponding to exact renames + */ + trace2_region_enter("diff", "cull after exact", options->repo); + remove_unneeded_paths_from_src(want_copies); + trace2_region_leave("diff", "cull after exact", options->repo); + } else { + /* Determine minimum score to match basenames */ + double factor = 0.5; + char *basename_factor = getenv("GIT_BASENAME_FACTOR"); + int min_basename_score; + + if (basename_factor) + factor = strtol(basename_factor, NULL, 10)/100.0; + assert(factor >= 0.0 && factor <= 1.0); + min_basename_score = minimum_score + + (int)(factor * (MAX_SCORE - minimum_score)); + + /* + * Cull sources: + * - remove ones involved in renames (found via exact match) + */ + trace2_region_enter("diff", "cull after exact", options->repo); + remove_unneeded_paths_from_src(want_copies); + trace2_region_leave("diff", "cull after exact", options->repo); + + /* Utilize file basenames to quickly find renames. */ + trace2_region_enter("diff", "basename matches", options->repo); + rename_count += find_basename_matches(options, + min_basename_score); + trace2_region_leave("diff", "basename matches", options->repo); + + /* + * Cull sources, again: + * - remove ones involved in renames (found via basenames) + */ + trace2_region_enter("diff", "cull basename", options->repo); + remove_unneeded_paths_from_src(want_copies); + trace2_region_leave("diff", "cull basename", options->repo); + } + + /* Calculate how many rename destinations are left */ + num_destinations = (rename_dst_nr - rename_count); + num_sources = rename_src_nr; /* rename_src_nr reflects lower number */ + /* All done? */ if (!num_destinations || !num_sources) goto cleanup; @@ -751,7 +794,7 @@ void diffcore_rename(struct diff_options *options) struct diff_score *m; if (rename_dst[i].is_rename) - continue; /* dealt with exact match already. */ + continue; /* exact or basename match already handled */ m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST]; for (j = 0; j < NUM_CANDIDATE_PER_DST; j++) diff --git a/t/t4001-diff-rename.sh b/t/t4001-diff-rename.sh index 0f97858197..99a5d1bd1c 100755 --- a/t/t4001-diff-rename.sh +++ b/t/t4001-diff-rename.sh @@ -277,10 +277,11 @@ test_expect_success 'basename similarity vs best similarity' ' git add file.txt file.md && git commit -a -m "rename" && git diff-tree -r -M --name-status HEAD^ HEAD >actual && - # subdir/file.txt is 88% similar to file.md and 78% similar to file.txt + # subdir/file.txt is 88% similar to file.md, 78% similar to file.txt, + # but since same basenames are checked first... cat >expected <<-\EOF && - R088 subdir/file.txt file.md - A file.txt + A file.md + R078 subdir/file.txt file.txt EOF test_cmp expected actual ' From 07c9a7fcb50e7af9d350edc82d5f906a6eb6bd62 Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Sun, 14 Feb 2021 07:51:50 +0000 Subject: [PATCH 7/8] gitdiffcore doc: mention new preliminary step for rename detection The last few patches have introduced a new preliminary step when rename detection is on but both break detection and copy detection are off. Document this new step. While we're at it, add a testcase that checks the new behavior as well. Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- Documentation/gitdiffcore.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/gitdiffcore.txt b/Documentation/gitdiffcore.txt index c970d9fe43..80fcf95424 100644 --- a/Documentation/gitdiffcore.txt +++ b/Documentation/gitdiffcore.txt @@ -168,6 +168,26 @@ a similarity score different from the default of 50% by giving a number after the "-M" or "-C" option (e.g. "-M8" to tell it to use 8/10 = 80%). +Note that when rename detection is on but both copy and break +detection are off, rename detection adds a preliminary step that first +checks if files are moved across directories while keeping their +filename the same. If there is a file added to a directory whose +contents is sufficiently similar to a file with the same name that got +deleted from a different directory, it will mark them as renames and +exclude them from the later quadratic step (the one that pairwise +compares all unmatched files to find the "best" matches, determined by +the highest content similarity). So, for example, if a deleted +docs/ext.txt and an added docs/config/ext.txt are similar enough, they +will be marked as a rename and prevent an added docs/ext.md that may +be even more similar to the deleted docs/ext.txt from being considered +as the rename destination in the later step. For this reason, the +preliminary "match same filename" step uses a bit higher threshold to +mark a file pair as a rename and stop considering other candidates for +better matches. At most, one comparison is done per file in this +preliminary pass; so if there are several remaining ext.txt files +throughout the directory hierarchy after exact rename detection, this +preliminary step will be skipped for those files. + Note. When the "-C" option is used with `--find-copies-harder` option, 'git diff-{asterisk}' commands feed unmodified filepairs to diffcore mechanism as well as modified ones. This lets the copy From f78cf976172fbad79a9ca4fc158f384fb868f177 Mon Sep 17 00:00:00 2001 From: Elijah Newren <newren@gmail.com> Date: Sun, 14 Feb 2021 07:51:51 +0000 Subject: [PATCH 8/8] merge-ort: call diffcore_rename() directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to pass additional information to diffcore_rename() (or some variant thereof) without plumbing that extra information through diff_tree_oid() and diffcore_std(). Further, since we will need to gather additional special information related to diffs and are walking the trees anyway in collect_merge_info(), it seems odd to have diff_tree_oid()/diffcore_std() repeat those tree walks. And there may be times where we can avoid traversing into a subtree in collect_merge_info() (based on additional information at our disposal), that the basic diff logic would be unable to take advantage of. For all these reasons, just create the add and delete pairs ourself and then call diffcore_rename() directly. This change is primarily about enabling future optimizations; the advantage of avoiding extra tree traversals is small compared to the cost of rename detection, and the advantage of avoiding the extra tree traversals is somewhat offset by the extra time spent in collect_merge_info() collecting the additional data anyway. However... For the testcases mentioned in commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2020-10-28), this change improves the performance as follows: Before After no-renames: 13.294 s ± 0.103 s 12.775 s ± 0.062 s mega-renames: 187.248 s ± 0.882 s 188.754 s ± 0.284 s just-one-mega: 5.557 s ± 0.017 s 5.599 s ± 0.019 s Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> --- merge-ort.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/merge-ort.c b/merge-ort.c index 931b91438c..603d30c521 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -535,6 +535,23 @@ static void setup_path_info(struct merge_options *opt, result->util = mi; } +static void add_pair(struct merge_options *opt, + struct name_entry *names, + const char *pathname, + unsigned side, + unsigned is_add /* if false, is_delete */) +{ + struct diff_filespec *one, *two; + struct rename_info *renames = &opt->priv->renames; + int names_idx = is_add ? side : 0; + + one = alloc_filespec(pathname); + two = alloc_filespec(pathname); + fill_filespec(is_add ? two : one, + &names[names_idx].oid, 1, names[names_idx].mode); + diff_queue(&renames->pairs[side], one, two); +} + static void collect_rename_info(struct merge_options *opt, struct name_entry *names, const char *dirname, @@ -544,6 +561,7 @@ static void collect_rename_info(struct merge_options *opt, unsigned match_mask) { struct rename_info *renames = &opt->priv->renames; + unsigned side; /* Update dirs_removed, as needed */ if (dirmask == 1 || dirmask == 3 || dirmask == 5) { @@ -554,6 +572,21 @@ static void collect_rename_info(struct merge_options *opt, if (sides & 2) strset_add(&renames->dirs_removed[2], fullname); } + + if (filemask == 0 || filemask == 7) + return; + + for (side = MERGE_SIDE1; side <= MERGE_SIDE2; ++side) { + unsigned side_mask = (1 << side); + + /* Check for deletion on side */ + if ((filemask & 1) && !(filemask & side_mask)) + add_pair(opt, names, fullname, side, 0 /* delete */); + + /* Check for addition on side */ + if (!(filemask & 1) && (filemask & side_mask)) + add_pair(opt, names, fullname, side, 1 /* add */); + } } static int collect_merge_info_callback(int n, @@ -2079,6 +2112,27 @@ static int process_renames(struct merge_options *opt, return clean_merge; } +static void resolve_diffpair_statuses(struct diff_queue_struct *q) +{ + /* + * A simplified version of diff_resolve_rename_copy(); would probably + * just use that function but it's static... + */ + int i; + struct diff_filepair *p; + + for (i = 0; i < q->nr; ++i) { + p = q->queue[i]; + p->status = 0; /* undecided */ + if (!DIFF_FILE_VALID(p->one)) + p->status = DIFF_STATUS_ADDED; + else if (!DIFF_FILE_VALID(p->two)) + p->status = DIFF_STATUS_DELETED; + else if (DIFF_PAIR_RENAME(p)) + p->status = DIFF_STATUS_RENAMED; + } +} + static int compare_pairs(const void *a_, const void *b_) { const struct diff_filepair *a = *((const struct diff_filepair **)a_); @@ -2089,8 +2143,6 @@ static int compare_pairs(const void *a_, const void *b_) /* Call diffcore_rename() to compute which files have changed on given side */ static void detect_regular_renames(struct merge_options *opt, - struct tree *merge_base, - struct tree *side, unsigned side_index) { struct diff_options diff_opts; @@ -2108,11 +2160,11 @@ static void detect_regular_renames(struct merge_options *opt, diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; diff_setup_done(&diff_opts); + diff_queued_diff = renames->pairs[side_index]; trace2_region_enter("diff", "diffcore_rename", opt->repo); - diff_tree_oid(&merge_base->object.oid, &side->object.oid, "", - &diff_opts); - diffcore_std(&diff_opts); + diffcore_rename(&diff_opts); trace2_region_leave("diff", "diffcore_rename", opt->repo); + resolve_diffpair_statuses(&diff_queued_diff); if (diff_opts.needed_rename_limit > renames->needed_limit) renames->needed_limit = diff_opts.needed_rename_limit; @@ -2212,8 +2264,8 @@ static int detect_and_process_renames(struct merge_options *opt, memset(&combined, 0, sizeof(combined)); trace2_region_enter("merge", "regular renames", opt->repo); - detect_regular_renames(opt, merge_base, side1, MERGE_SIDE1); - detect_regular_renames(opt, merge_base, side2, MERGE_SIDE2); + detect_regular_renames(opt, MERGE_SIDE1); + detect_regular_renames(opt, MERGE_SIDE2); trace2_region_leave("merge", "regular renames", opt->repo); trace2_region_enter("merge", "directory renames", opt->repo);