blame: use the fingerprint heuristic to match ignored lines

This commit integrates the fuzzy fingerprint heuristic into
guess_line_blames().

We actually make two passes.  The first pass uses the fuzzy algorithm to
find a match within the current diff chunk.  If that fails, the second
pass searches the entire parent file for the best match.

For an example of scanning the entire parent for a match, consider:

	commit-a 30) #include <sys/header_a.h>
	commit-b 31) #include <header_b.h>
	commit-c 32) #include <header_c.h>

Then commit X alphabetizes them:

	commit-X 30) #include <header_b.h>
	commit-X 31) #include <header_c.h>
	commit-X 32) #include <sys/header_a.h>

If we just check the parent's chunk (i.e. the first pass), we'd get:

	commit-b 30) #include <header_b.h>
	commit-c 31) #include <header_c.h>
	commit-X 32) #include <sys/header_a.h>

That's because commit X actually consists of two chunks: one chunk is
removing sys/header_a.h, then some context, and the second chunk is
adding sys/header_a.h.

If we scan the entire parent file, we get:

	commit-b 30) #include <header_b.h>
	commit-c 31) #include <header_c.h>
	commit-a 32) #include <sys/header_a.h>

Signed-off-by: Barret Rhoden <brho@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
maint
Barret Rhoden 2019-06-20 12:38:19 -04:00 committed by Junio C Hamano
parent 1d028dc682
commit a07a97760c
2 changed files with 55 additions and 8 deletions

60
blame.c
View File

@ -989,12 +989,19 @@ static void fill_origin_fingerprints(struct blame_origin *o, mmfile_t *file)
return; return;
o->num_lines = find_line_starts(&line_starts, o->file.ptr, o->num_lines = find_line_starts(&line_starts, o->file.ptr,
o->file.size); o->file.size);
/* TODO: Will fill in fingerprints in a future commit */ o->fingerprints = xcalloc(sizeof(struct fingerprint), o->num_lines);
get_line_fingerprints(o->fingerprints, o->file.ptr, line_starts,
0, o->num_lines);
free(line_starts); free(line_starts);
} }


static void drop_origin_fingerprints(struct blame_origin *o) static void drop_origin_fingerprints(struct blame_origin *o)
{ {
if (o->fingerprints) {
free_line_fingerprints(o->fingerprints, o->num_lines);
o->num_lines = 0;
FREE_AND_NULL(o->fingerprints);
}
} }


/* /*
@ -1572,9 +1579,34 @@ static int are_lines_adjacent(struct blame_line_tracker *first,
first->s_lno + 1 == second->s_lno; first->s_lno + 1 == second->s_lno;
} }


/*
 * Look through parent lines [from, from + nr_lines) for the line whose
 * fingerprint scores highest against target line t_idx.
 *
 * A candidate is only eligible when its similarity score is at least
 * FINGERPRINT_FILE_THRESHOLD.  Among equally scored candidates, the one
 * whose index is numerically closest to t_idx is kept; if the distances
 * are equal too, the later candidate replaces the earlier one.
 *
 * Returns the index of the chosen parent line, or -1 if no candidate was
 * eligible.
 */
static int scan_parent_range(struct fingerprint *p_fps,
			     struct fingerprint *t_fps, int t_idx,
			     int from, int nr_lines)
{
#define FINGERPRINT_FILE_THRESHOLD	10
	int top_score = FINGERPRINT_FILE_THRESHOLD;
	int top_line = -1;
	int end = from + nr_lines;
	int cand;

	for (cand = from; cand < end; cand++) {
		int score = fingerprint_similarity(&t_fps[t_idx],
						   &p_fps[cand]);

		/* Below the threshold, or worse than what we already hold */
		if (score < top_score)
			continue;

		/*
		 * From here on score >= top_score.  Accept the candidate if
		 * it is strictly better, if nothing has been accepted yet,
		 * or if it is at least as close to the target line number
		 * as the line currently held.
		 */
		if (score > top_score || top_line == -1 ||
		    abs(cand - t_idx) <= abs(top_line - t_idx)) {
			top_score = score;
			top_line = cand;
		}
	}
	return top_line;
}

/* /*
* This cheap heuristic assigns lines in the chunk to their relative location in * The first pass checks the blame entry (from the target) against the parent's
* the parent's chunk. Any additional lines are left with the target. * diff chunk. If that fails for a line, the second pass tries to match that
* line to any part of the parent file. That catches cases where a change was
* broken into two chunks by 'context.'
*/ */
static void guess_line_blames(struct blame_origin *parent, static void guess_line_blames(struct blame_origin *parent,
struct blame_origin *target, struct blame_origin *target,
@ -1583,11 +1615,22 @@ static void guess_line_blames(struct blame_origin *parent,
{ {
int i, best_idx, target_idx; int i, best_idx, target_idx;
int parent_slno = tlno + offset; int parent_slno = tlno + offset;
int *fuzzy_matches;


fuzzy_matches = fuzzy_find_matching_lines(parent, target,
tlno, parent_slno, same,
parent_len);
for (i = 0; i < same - tlno; i++) { for (i = 0; i < same - tlno; i++) {
target_idx = tlno + i; target_idx = tlno + i;
best_idx = target_idx + offset; if (fuzzy_matches && fuzzy_matches[i] >= 0) {
if (best_idx < parent_slno + parent_len) { best_idx = fuzzy_matches[i];
} else {
best_idx = scan_parent_range(parent->fingerprints,
target->fingerprints,
target_idx, 0,
parent->num_lines);
}
if (best_idx >= 0) {
line_blames[i].is_parent = 1; line_blames[i].is_parent = 1;
line_blames[i].s_lno = best_idx; line_blames[i].s_lno = best_idx;
} else { } else {
@ -1595,6 +1638,7 @@ static void guess_line_blames(struct blame_origin *parent,
line_blames[i].s_lno = target_idx; line_blames[i].s_lno = target_idx;
} }
} }
free(fuzzy_matches);
} }


/* /*
@ -2371,6 +2415,12 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
if (!porigin) if (!porigin)
continue; continue;
pass_blame_to_parent(sb, origin, porigin, 1); pass_blame_to_parent(sb, origin, porigin, 1);
/*
* Preemptively drop porigin so we can refresh the
* fingerprints if we use the parent again, which can
* occur if you ignore back-to-back commits.
*/
drop_origin_blob(porigin);
if (!origin->suspects) if (!origin->suspects)
goto finish; goto finish;
} }

View File

@ -3,9 +3,6 @@
test_description='git blame ignore fuzzy heuristic' test_description='git blame ignore fuzzy heuristic'
. ./test-lib.sh . ./test-lib.sh


# short circuit until blame has the fuzzy capabilities
test_done

pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/' pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/'


# Each test is composed of 4 variables: # Each test is composed of 4 variables: