diffcore-rename: provide basic implementation of idx_possible_rename()
Add a new struct dir_rename_info with various values we need inside our idx_possible_rename() function introduced in the previous commit. Add a basic implementation for this function showing how we plan to use the variables, but which will just return early with a value of -1 (not found) when those variables are not set up. Future commits will do the work necessary to set up those other variables so that idx_possible_rename() does not always return -1. Reviewed-by: Derrick Stolee <dstolee@microsoft.com> Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>maint
parent
37a2514364
commit
bde8b9f34c
|
@ -367,6 +367,19 @@ static int find_exact_renames(struct diff_options *options)
|
||||||
return renames;
|
return renames;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct dir_rename_info {
|
||||||
|
struct strintmap idx_map;
|
||||||
|
struct strmap dir_rename_guess;
|
||||||
|
struct strmap *dir_rename_count;
|
||||||
|
unsigned setup;
|
||||||
|
};
|
||||||
|
|
||||||
|
static char *get_dirname(const char *filename)
|
||||||
|
{
|
||||||
|
char *slash = strrchr(filename, '/');
|
||||||
|
return slash ? xstrndup(filename, slash - filename) : xstrdup("");
|
||||||
|
}
|
||||||
|
|
||||||
static const char *get_basename(const char *filename)
|
static const char *get_basename(const char *filename)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
@ -379,14 +392,86 @@ static const char *get_basename(const char *filename)
|
||||||
return base ? base + 1 : filename;
|
return base ? base + 1 : filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int idx_possible_rename(char *filename)
|
static int idx_possible_rename(char *filename, struct dir_rename_info *info)
|
||||||
{
|
{
|
||||||
/* Unconditionally return -1, "not found", for now */
|
/*
|
||||||
|
* Our comparison of files with the same basename (see
|
||||||
|
* find_basename_matches() below), is only helpful when after exact
|
||||||
|
* rename detection we have exactly one file with a given basename
|
||||||
|
* among the rename sources and also only exactly one file with
|
||||||
|
* that basename among the rename destinations. When we have
|
||||||
|
* multiple files with the same basename in either set, we do not
|
||||||
|
* know which to compare against. However, there are some
|
||||||
|
* filenames that occur in large numbers (particularly
|
||||||
|
* build-related filenames such as 'Makefile', '.gitignore', or
|
||||||
|
* 'build.gradle' that potentially exist within every single
|
||||||
|
* subdirectory), and for performance we want to be able to quickly
|
||||||
|
* find renames for these files too.
|
||||||
|
*
|
||||||
|
* The reason basename comparisons are a useful heuristic was that it
|
||||||
|
* is common for people to move files across directories while keeping
|
||||||
|
* their filename the same. If we had a way of determining or even
|
||||||
|
* making a good educated guess about which directory these non-unique
|
||||||
|
* basename files had moved the file to, we could check it.
|
||||||
|
* Luckily...
|
||||||
|
*
|
||||||
|
* When an entire directory is in fact renamed, we have two factors
|
||||||
|
* helping us out:
|
||||||
|
* (a) the original directory disappeared giving us a hint
|
||||||
|
* about when we can apply an extra heuristic.
|
||||||
|
* (a) we often have several files within that directory and
|
||||||
|
* subdirectories that are renamed without changes
|
||||||
|
* So, rules for a heuristic:
|
||||||
|
* (0) If there basename matches are non-unique (the condition under
|
||||||
|
* which this function is called) AND
|
||||||
|
* (1) the directory in which the file was found has disappeared
|
||||||
|
* (i.e. dirs_removed is non-NULL and has a relevant entry) THEN
|
||||||
|
* (2) use exact renames of files within the directory to determine
|
||||||
|
* where the directory is likely to have been renamed to. IF
|
||||||
|
* there is at least one exact rename from within that
|
||||||
|
* directory, we can proceed.
|
||||||
|
* (3) If there are multiple places the directory could have been
|
||||||
|
* renamed to based on exact renames, ignore all but one of them.
|
||||||
|
* Just use the destination with the most renames going to it.
|
||||||
|
* (4) Check if applying that directory rename to the original file
|
||||||
|
* would result in a destination filename that is in the
|
||||||
|
* potential rename set. If so, return the index of the
|
||||||
|
* destination file (the index within rename_dst).
|
||||||
|
* (5) Compare the original file and returned destination for
|
||||||
|
* similarity, and if they are sufficiently similar, record the
|
||||||
|
* rename.
|
||||||
|
*
|
||||||
|
* This function, idx_possible_rename(), is only responsible for (4).
|
||||||
|
* The conditions/steps in (1)-(3) will be handled via setting up
|
||||||
|
* dir_rename_count and dir_rename_guess in a future
|
||||||
|
* initialize_dir_rename_info() function. Steps (0) and (5) are
|
||||||
|
* handled by the caller of this function.
|
||||||
|
*/
|
||||||
|
char *old_dir, *new_dir;
|
||||||
|
struct strbuf new_path = STRBUF_INIT;
|
||||||
|
int idx;
|
||||||
|
|
||||||
|
if (!info->setup)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
old_dir = get_dirname(filename);
|
||||||
|
new_dir = strmap_get(&info->dir_rename_guess, old_dir);
|
||||||
|
free(old_dir);
|
||||||
|
if (!new_dir)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
strbuf_addstr(&new_path, new_dir);
|
||||||
|
strbuf_addch(&new_path, '/');
|
||||||
|
strbuf_addstr(&new_path, get_basename(filename));
|
||||||
|
|
||||||
|
idx = strintmap_get(&info->idx_map, new_path.buf);
|
||||||
|
strbuf_release(&new_path);
|
||||||
|
return idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int find_basename_matches(struct diff_options *options,
|
static int find_basename_matches(struct diff_options *options,
|
||||||
int minimum_score)
|
int minimum_score,
|
||||||
|
struct dir_rename_info *info)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* When I checked in early 2020, over 76% of file renames in linux
|
* When I checked in early 2020, over 76% of file renames in linux
|
||||||
|
@ -494,7 +579,7 @@ static int find_basename_matches(struct diff_options *options,
|
||||||
dst_index = strintmap_get(&dests, base);
|
dst_index = strintmap_get(&dests, base);
|
||||||
if (src_index == -1 || dst_index == -1) {
|
if (src_index == -1 || dst_index == -1) {
|
||||||
src_index = i;
|
src_index = i;
|
||||||
dst_index = idx_possible_rename(filename);
|
dst_index = idx_possible_rename(filename, info);
|
||||||
}
|
}
|
||||||
if (dst_index == -1)
|
if (dst_index == -1)
|
||||||
continue;
|
continue;
|
||||||
|
@ -677,8 +762,10 @@ void diffcore_rename(struct diff_options *options)
|
||||||
int num_destinations, dst_cnt;
|
int num_destinations, dst_cnt;
|
||||||
int num_sources, want_copies;
|
int num_sources, want_copies;
|
||||||
struct progress *progress = NULL;
|
struct progress *progress = NULL;
|
||||||
|
struct dir_rename_info info;
|
||||||
|
|
||||||
trace2_region_enter("diff", "setup", options->repo);
|
trace2_region_enter("diff", "setup", options->repo);
|
||||||
|
info.setup = 0;
|
||||||
want_copies = (detect_rename == DIFF_DETECT_COPY);
|
want_copies = (detect_rename == DIFF_DETECT_COPY);
|
||||||
if (!minimum_score)
|
if (!minimum_score)
|
||||||
minimum_score = DEFAULT_RENAME_SCORE;
|
minimum_score = DEFAULT_RENAME_SCORE;
|
||||||
|
@ -774,7 +861,8 @@ void diffcore_rename(struct diff_options *options)
|
||||||
/* Utilize file basenames to quickly find renames. */
|
/* Utilize file basenames to quickly find renames. */
|
||||||
trace2_region_enter("diff", "basename matches", options->repo);
|
trace2_region_enter("diff", "basename matches", options->repo);
|
||||||
rename_count += find_basename_matches(options,
|
rename_count += find_basename_matches(options,
|
||||||
min_basename_score);
|
min_basename_score,
|
||||||
|
&info);
|
||||||
trace2_region_leave("diff", "basename matches", options->repo);
|
trace2_region_leave("diff", "basename matches", options->repo);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in New Issue