xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c

Only the classic diff uses xdl_cleanup_records(). Move it,
xdl_clean_mmatch(), and the macros to xdiffi.c and call
xdl_cleanup_records() inside of xdl_do_classic_diff(). This better
organizes the code related to the classic diff.

Signed-off-by: Ezekiel Newren <ezekielnewren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Ezekiel Newren 2026-01-02 18:52:24 +00:00 committed by Junio C Hamano
parent efd314b8e8
commit 083774849b
2 changed files with 181 additions and 190 deletions

View File

@ -21,6 +21,7 @@
*/

#include "xinclude.h"
#include "compat/ivec.h"

static size_t get_hash(xdfile_t *xdf, long index)
{
@ -33,6 +34,14 @@ static size_t get_hash(xdfile_t *xdf, long index)
#define XDL_SNAKE_CNT 20
#define XDL_K_HEUR 4

/*
 * Ratio used by xdl_clean_mmatch(): a multimatch line is discarded only
 * when (multimatch count) * XDL_KPDIS_RUN < (multimatch + no-match count)
 * over the scanned run.
 */
#define XDL_KPDIS_RUN 4
/*
 * Upper bound applied in xdl_cleanup_records() to the occurrence
 * threshold derived from xdl_bogosqrt() of the record count.
 */
#define XDL_MAX_EQLIMIT 1024
/*
 * Maximum number of lines xdl_clean_mmatch() inspects on each side of
 * the line being examined.
 */
#define XDL_SIMSCAN_WINDOW 100

/*
 * Per-line classification stored in the temporary action arrays built
 * by xdl_cleanup_records().
 */
/* The line has no occurrence in the other file. */
#define DISCARD 0
/* The line occurs in the other file and is always kept. */
#define KEEP 1
/*
 * The line occurs at least "mlim" times in the other file (and
 * XDF_NEED_MINIMAL is not set); xdl_clean_mmatch() decides its fate.
 */
#define INVESTIGATE 2

typedef struct s_xdpsplit {
long i1, i2;
int min_lo, min_hi;
@ -311,6 +320,175 @@ int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
}


/*
 * Decide whether the multimatch line at index 'i' should be discarded.
 *
 * 'action' holds one DISCARD/KEEP/INVESTIGATE value per line, and
 * [s, e] is the inclusive range of indices that may be inspected.
 * The caller only invokes this for lines with action[i] == INVESTIGATE.
 *
 * Returns true when the line sits inside a run that has no-match
 * (DISCARD) lines on both sides and multimatch lines are sparse
 * enough in that run; returns false otherwise.
 */
static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
	/*
	 * Clamp the inspected range to XDL_SIMSCAN_WINDOW lines on each
	 * side of 'i'. Each scan below stops at the first KEEP line, but
	 * without the clamp a long run free of KEEP lines would make
	 * every call walk to the extremities, which is very costly on
	 * big files.
	 */
	long lo = (i - s > XDL_SIMSCAN_WINDOW) ? i - XDL_SIMSCAN_WINDOW : s;
	long hi = (e - i > XDL_SIMSCAN_WINDOW) ? i + XDL_SIMSCAN_WINDOW : e;
	/* The multimatch tallies start at 1 to account for line 'i' itself. */
	long nomatch_before = 0, multi_before = 1;
	long nomatch_after = 0, multi_after = 1;
	long pos;

	/*
	 * Walk backwards from 'i', tallying no-match (DISCARD) and
	 * multimatch (INVESTIGATE) lines until a KEEP line or the lower
	 * limit is reached.
	 */
	for (pos = i - 1; pos >= lo; pos--) {
		uint8_t kind = action[pos];

		if (kind == KEEP)
			break;
		if (kind == DISCARD)
			nomatch_before++;
		else if (kind == INVESTIGATE)
			multi_before++;
		else
			BUG("Illegal value for action[i - r]");
	}

	/*
	 * Only multimatch lines precede 'i': do not discard it. A
	 * multimatch line is dropped only when it appears in the middle
	 * of a run that also contains no-match lines.
	 */
	if (!nomatch_before)
		return false;

	/* Same tally, walking forwards from 'i'. */
	for (pos = i + 1; pos <= hi; pos++) {
		uint8_t kind = action[pos];

		if (kind == KEEP)
			break;
		if (kind == DISCARD)
			nomatch_after++;
		else if (kind == INVESTIGATE)
			multi_after++;
		else
			BUG("Illegal value for action[i + r]");
	}

	/* Only multimatch lines follow 'i': do not discard it. */
	if (!nomatch_after)
		return false;

	nomatch_after += nomatch_before;
	multi_after += multi_before;

	/* Discard only if multimatch lines are a small share of the run. */
	return multi_after * XDL_KPDIS_RUN < (multi_after + nomatch_after);
}

/*
 * Occurrence counters for one minimal-perfect-hash value, as filled in
 * by xdl_cleanup_records().
 */
struct xoccurrence {
	size_t file1;	/* number of records in xdf1 with this hash */
	size_t file2;	/* number of records in xdf2 with this hash */
};


/*
 * Instantiate the IVec container for struct xoccurrence;
 * xdl_cleanup_records() uses it as "struct IVec_xoccurrence".
 * NOTE(review): the exact expansion comes from compat/ivec.h -- confirm
 * there.
 */
DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence);


/*
 * Reduce the size of the problem before the diff proper runs.
 *
 * Records between xe->delta_start and the end of the non-trimmed area
 * of each file are classified by how often their hash occurs in the
 * other file. Records with no occurrence there are marked as changed
 * (discarded); records with many occurrences may also be discarded when
 * xdl_clean_mmatch() finds them inside a run of discardable lines. All
 * remaining records are appended to reference_index[] and counted in
 * nreff.
 *
 * Returns 0 on success, or -1 if a temporary array cannot be allocated.
 */
static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) {
	struct IVec_xoccurrence occ;
	uint8_t *action1 = NULL, *action2 = NULL;
	bool need_min = !!(flags & XDF_NEED_MINIMAL);
	/* Last index (inclusive) to examine in each file. */
	ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end;
	ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end;
	size_t mlim;
	long i;
	int ret = -1;

	IVEC_INIT(occ);
	ivec_zero(&occ, xe->mph_size);

	/* Count, per hash value, the occurrences in each file. */
	for (size_t j = 0; j < xe->xdf1.nrec; j++) {
		size_t slot = xe->xdf1.recs[j].minimal_perfect_hash;
		occ.ptr[slot].file1 += 1;
	}
	for (size_t j = 0; j < xe->xdf2.nrec; j++) {
		size_t slot = xe->xdf2.recs[j].minimal_perfect_hash;
		occ.ptr[slot].file2 += 1;
	}

	/*
	 * Temporary per-line classification used to decide whether
	 * changed[i] stays false or becomes true.
	 */
	if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1) ||
	    !XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1))
		goto cleanup;

	/*
	 * Classify every line of file 1 as DISCARD, KEEP or INVESTIGATE
	 * from the number of times it occurs in file 2.
	 */
	mlim = xdl_bogosqrt((long)xe->xdf1.nrec);
	if (mlim > XDL_MAX_EQLIMIT)
		mlim = XDL_MAX_EQLIMIT;
	for (i = xe->delta_start; i <= dend1; i++) {
		size_t hits = occ.ptr[xe->xdf1.recs[i].minimal_perfect_hash].file2;

		if (hits == 0)
			action1[i] = DISCARD;
		else if (hits >= mlim && !need_min)
			action1[i] = INVESTIGATE;
		else
			action1[i] = KEEP;
	}

	/* Likewise for file 2, against the occurrences in file 1. */
	mlim = xdl_bogosqrt((long)xe->xdf2.nrec);
	if (mlim > XDL_MAX_EQLIMIT)
		mlim = XDL_MAX_EQLIMIT;
	for (i = xe->delta_start; i <= dend2; i++) {
		size_t hits = occ.ptr[xe->xdf2.recs[i].minimal_perfect_hash].file1;

		if (hits == 0)
			action2[i] = DISCARD;
		else if (hits >= mlim && !need_min)
			action2[i] = INVESTIGATE;
		else
			action2[i] = KEEP;
	}

	/*
	 * Turn the classification into the final decision: kept lines
	 * are recorded in reference_index[], discarded lines get
	 * changed[i] set.
	 */
	xe->xdf1.nreff = 0;
	for (i = xe->delta_start; i <= dend1; i++) {
		bool keep = (action1[i] == KEEP);

		if (!keep && action1[i] == INVESTIGATE)
			keep = !xdl_clean_mmatch(action1, i, xe->delta_start, dend1);

		if (keep)
			/* changed[i] remains false */
			xe->xdf1.reference_index[xe->xdf1.nreff++] = i;
		else
			/* discard */
			xe->xdf1.changed[i] = true;
	}

	xe->xdf2.nreff = 0;
	for (i = xe->delta_start; i <= dend2; i++) {
		bool keep = (action2[i] == KEEP);

		if (!keep && action2[i] == INVESTIGATE)
			keep = !xdl_clean_mmatch(action2, i, xe->delta_start, dend2);

		if (keep)
			/* changed[i] remains false */
			xe->xdf2.reference_index[xe->xdf2.nreff++] = i;
		else
			/* discard */
			xe->xdf2.changed[i] = true;
	}

	ret = 0;

cleanup:
	xdl_free(action1);
	xdl_free(action2);
	ivec_free(&occ);

	return ret;
}


int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags)
{
long ndiags;
@ -318,6 +496,8 @@ int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags)
xdalgoenv_t xenv;
int res;

xdl_cleanup_records(xe, flags);

/*
* Allocate and setup K vectors to be used by the differential
* algorithm.

View File

@ -24,14 +24,6 @@
#include "compat/ivec.h"


/*
 * Ratio used by xdl_clean_mmatch(): a multimatch line is discarded only
 * when (multimatch count) * XDL_KPDIS_RUN < (multimatch + no-match count)
 * over the scanned run.
 */
#define XDL_KPDIS_RUN 4
/*
 * Upper bound applied in xdl_cleanup_records() to the occurrence
 * threshold derived from xdl_bogosqrt() of the record count.
 */
#define XDL_MAX_EQLIMIT 1024
/*
 * Maximum number of lines xdl_clean_mmatch() inspects on each side of
 * the line being examined.
 */
#define XDL_SIMSCAN_WINDOW 100

/*
 * Per-line classification stored in the temporary action arrays built
 * by xdl_cleanup_records().
 */
/* The line has no occurrence in the other file. */
#define DISCARD 0
/* The line occurs in the other file and is always kept. */
#define KEEP 1
/*
 * The line occurs at least "mlim" times in the other file (and
 * XDF_NEED_MINIMAL is not set); xdl_clean_mmatch() decides its fate.
 */
#define INVESTIGATE 2

typedef struct s_xdlclass {
struct s_xdlclass *next;
xrecord_t rec;
@ -50,8 +42,6 @@ typedef struct s_xdlclassifier {
} xdlclassifier_t;




static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
memset(cf, 0, sizeof(xdlclassifier_t));

@ -186,175 +176,6 @@ void xdl_free_env(xdfenv_t *xe) {
}


/*
 * Decide whether the multimatch line at index 'i' should be discarded.
 *
 * 'action' holds one DISCARD/KEEP/INVESTIGATE value per line, and
 * [s, e] is the inclusive range of indices that may be inspected.
 * The caller only invokes this for lines with action[i] == INVESTIGATE.
 *
 * Returns true when the line sits inside a run that has no-match
 * (DISCARD) lines on both sides and multimatch lines are sparse
 * enough in that run; returns false otherwise.
 */
static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
	/*
	 * Clamp the inspected range to XDL_SIMSCAN_WINDOW lines on each
	 * side of 'i'. Each scan below stops at the first KEEP line, but
	 * without the clamp a long run free of KEEP lines would make
	 * every call walk to the extremities, which is very costly on
	 * big files.
	 */
	long lo = (i - s > XDL_SIMSCAN_WINDOW) ? i - XDL_SIMSCAN_WINDOW : s;
	long hi = (e - i > XDL_SIMSCAN_WINDOW) ? i + XDL_SIMSCAN_WINDOW : e;
	/* The multimatch tallies start at 1 to account for line 'i' itself. */
	long nomatch_before = 0, multi_before = 1;
	long nomatch_after = 0, multi_after = 1;
	long pos;

	/*
	 * Walk backwards from 'i', tallying no-match (DISCARD) and
	 * multimatch (INVESTIGATE) lines until a KEEP line or the lower
	 * limit is reached.
	 */
	for (pos = i - 1; pos >= lo; pos--) {
		uint8_t kind = action[pos];

		if (kind == KEEP)
			break;
		if (kind == DISCARD)
			nomatch_before++;
		else if (kind == INVESTIGATE)
			multi_before++;
		else
			BUG("Illegal value for action[i - r]");
	}

	/*
	 * Only multimatch lines precede 'i': do not discard it. A
	 * multimatch line is dropped only when it appears in the middle
	 * of a run that also contains no-match lines.
	 */
	if (!nomatch_before)
		return false;

	/* Same tally, walking forwards from 'i'. */
	for (pos = i + 1; pos <= hi; pos++) {
		uint8_t kind = action[pos];

		if (kind == KEEP)
			break;
		if (kind == DISCARD)
			nomatch_after++;
		else if (kind == INVESTIGATE)
			multi_after++;
		else
			BUG("Illegal value for action[i + r]");
	}

	/* Only multimatch lines follow 'i': do not discard it. */
	if (!nomatch_after)
		return false;

	nomatch_after += nomatch_before;
	multi_after += multi_before;

	/* Discard only if multimatch lines are a small share of the run. */
	return multi_after * XDL_KPDIS_RUN < (multi_after + nomatch_after);
}

/*
 * Occurrence counters for one minimal-perfect-hash value, as filled in
 * by xdl_cleanup_records().
 */
struct xoccurrence {
	size_t file1;	/* number of records in xdf1 with this hash */
	size_t file2;	/* number of records in xdf2 with this hash */
};


/*
 * Instantiate the IVec container for struct xoccurrence;
 * xdl_cleanup_records() uses it as "struct IVec_xoccurrence".
 * NOTE(review): the exact expansion comes from compat/ivec.h -- confirm
 * there.
 */
DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence);


/*
 * Reduce the size of the problem before the diff proper runs.
 *
 * Records between xe->delta_start and the end of the non-trimmed area
 * of each file are classified by how often their hash occurs in the
 * other file. Records with no occurrence there are marked as changed
 * (discarded); records with many occurrences may also be discarded when
 * xdl_clean_mmatch() finds them inside a run of discardable lines. All
 * remaining records are appended to reference_index[] and counted in
 * nreff.
 *
 * Returns 0 on success, or -1 if a temporary array cannot be allocated.
 */
static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) {
	struct IVec_xoccurrence occ;
	uint8_t *action1 = NULL, *action2 = NULL;
	bool need_min = !!(flags & XDF_NEED_MINIMAL);
	/* Last index (inclusive) to examine in each file. */
	ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end;
	ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end;
	size_t mlim;
	long i;
	int ret = -1;

	IVEC_INIT(occ);
	ivec_zero(&occ, xe->mph_size);

	/* Count, per hash value, the occurrences in each file. */
	for (size_t j = 0; j < xe->xdf1.nrec; j++) {
		size_t slot = xe->xdf1.recs[j].minimal_perfect_hash;
		occ.ptr[slot].file1 += 1;
	}
	for (size_t j = 0; j < xe->xdf2.nrec; j++) {
		size_t slot = xe->xdf2.recs[j].minimal_perfect_hash;
		occ.ptr[slot].file2 += 1;
	}

	/*
	 * Temporary per-line classification used to decide whether
	 * changed[i] stays false or becomes true.
	 */
	if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1) ||
	    !XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1))
		goto cleanup;

	/*
	 * Classify every line of file 1 as DISCARD, KEEP or INVESTIGATE
	 * from the number of times it occurs in file 2.
	 */
	mlim = xdl_bogosqrt((long)xe->xdf1.nrec);
	if (mlim > XDL_MAX_EQLIMIT)
		mlim = XDL_MAX_EQLIMIT;
	for (i = xe->delta_start; i <= dend1; i++) {
		size_t hits = occ.ptr[xe->xdf1.recs[i].minimal_perfect_hash].file2;

		if (hits == 0)
			action1[i] = DISCARD;
		else if (hits >= mlim && !need_min)
			action1[i] = INVESTIGATE;
		else
			action1[i] = KEEP;
	}

	/* Likewise for file 2, against the occurrences in file 1. */
	mlim = xdl_bogosqrt((long)xe->xdf2.nrec);
	if (mlim > XDL_MAX_EQLIMIT)
		mlim = XDL_MAX_EQLIMIT;
	for (i = xe->delta_start; i <= dend2; i++) {
		size_t hits = occ.ptr[xe->xdf2.recs[i].minimal_perfect_hash].file1;

		if (hits == 0)
			action2[i] = DISCARD;
		else if (hits >= mlim && !need_min)
			action2[i] = INVESTIGATE;
		else
			action2[i] = KEEP;
	}

	/*
	 * Turn the classification into the final decision: kept lines
	 * are recorded in reference_index[], discarded lines get
	 * changed[i] set.
	 */
	xe->xdf1.nreff = 0;
	for (i = xe->delta_start; i <= dend1; i++) {
		bool keep = (action1[i] == KEEP);

		if (!keep && action1[i] == INVESTIGATE)
			keep = !xdl_clean_mmatch(action1, i, xe->delta_start, dend1);

		if (keep)
			/* changed[i] remains false */
			xe->xdf1.reference_index[xe->xdf1.nreff++] = i;
		else
			/* discard */
			xe->xdf1.changed[i] = true;
	}

	xe->xdf2.nreff = 0;
	for (i = xe->delta_start; i <= dend2; i++) {
		bool keep = (action2[i] == KEEP);

		if (!keep && action2[i] == INVESTIGATE)
			keep = !xdl_clean_mmatch(action2, i, xe->delta_start, dend2);

		if (keep)
			/* changed[i] remains false */
			xe->xdf2.reference_index[xe->xdf2.nreff++] = i;
		else
			/* discard */
			xe->xdf2.changed[i] = true;
	}

	ret = 0;

cleanup:
	xdl_free(action1);
	xdl_free(action2);
	ivec_free(&occ);

	return ret;
}


/*
* Early trim initial and terminal matching records.
*/
@ -414,19 +235,9 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
}

xe->mph_size = cf.count;
xdl_free_classifier(&cf);

xdl_trim_ends(xe);
if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
xdl_cleanup_records(xe, xpp->flags) < 0) {

xdl_free_ctx(&xe->xdf2);
xdl_free_ctx(&xe->xdf1);
xdl_free_classifier(&cf);
return -1;
}

xdl_free_classifier(&cf);

return 0;
}