xdiff: add macros DISCARD(0), KEEP(1), INVESTIGATE(2) in xprepare.c

This commit is refactor-only; no behavior is changed. A future commit
will use bool literals for changed[i].

The functions xdl_clean_mmatch() and xdl_cleanup_records() will be
cleaned up more in a future patch series. The changes to
xdl_cleanup_records() in this patch are only there to make it clear
why `char rchg` is refactored to `bool changed`.

Rename dis* to action* and replace numeric literals with macros.
The old names came from when dis* (which I think was short for discard)
was treated like a boolean, but over time it grew into a ternary state
machine. The result was confusing because dis* and rchg* both used 0/1
values with different meanings.

The new names and macros make the states explicit. nm is short for
number of matches, and mlim is a heuristic limit:

  nm == 0       -> action[i] = DISCARD     -> changed[i] = true
  0 < nm < mlim -> action[i] = KEEP        -> changed[i] = false
  nm >= mlim    -> action[i] = INVESTIGATE -> changed[i] = xdl_clean_mmatch()

When need_min is true, only DISCARD and KEEP occur because the limit
is effectively infinite.

Best-viewed-with: --color-words
Signed-off-by: Ezekiel Newren <ezekielnewren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
main
Ezekiel Newren 2025-09-26 22:41:58 +00:00 committed by Junio C Hamano
parent b7de64a6d6
commit e385e1b7d2
1 changed files with 69 additions and 37 deletions

View File

@ -29,6 +29,9 @@
#define XDL_GUESS_NLINES1 256 #define XDL_GUESS_NLINES1 256
#define XDL_GUESS_NLINES2 20 #define XDL_GUESS_NLINES2 20


#define DISCARD 0
#define KEEP 1
#define INVESTIGATE 2


typedef struct s_xdlclass { typedef struct s_xdlclass {
struct s_xdlclass *next; struct s_xdlclass *next;
@ -190,15 +193,15 @@ void xdl_free_env(xdfenv_t *xe) {
} }




static int xdl_clean_mmatch(char const *dis, long i, long s, long e) { static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
long r, rdis0, rpdis0, rdis1, rpdis1; long r, rdis0, rpdis0, rdis1, rpdis1;


/* /*
* Limits the window the is examined during the similar-lines * Limits the window that is examined during the similar-lines
* scan. The loops below stops when dis[i - r] == 1 (line that * scan. The loops below stops when action[i - r] == KEEP
* has no match), but there are corner cases where the loop * (line that has no match), but there are corner cases where
* proceed all the way to the extremities by causing huge * the loop proceed all the way to the extremities by causing
* performance penalties in case of big files. * huge performance penalties in case of big files.
*/ */
if (i - s > XDL_SIMSCAN_WINDOW) if (i - s > XDL_SIMSCAN_WINDOW)
s = i - XDL_SIMSCAN_WINDOW; s = i - XDL_SIMSCAN_WINDOW;
@ -207,40 +210,47 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {


/* /*
* Scans the lines before 'i' to find a run of lines that either * Scans the lines before 'i' to find a run of lines that either
* have no match (dis[j] == 0) or have multiple matches (dis[j] > 1). * have no match (action[j] == DISCARD) or have multiple matches
* Note that we always call this function with dis[i] > 1, so the * (action[j] == INVESTIGATE). Note that we always call this
* current line (i) is already a multimatch line. * function with action[i] == INVESTIGATE, so the current line
* (i) is already a multimatch line.
*/ */
for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) { for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
if (!dis[i - r]) if (action[i - r] == DISCARD)
rdis0++; rdis0++;
else if (dis[i - r] == 2) else if (action[i - r] == INVESTIGATE)
rpdis0++; rpdis0++;
else else if (action[i - r] == KEEP)
break; break;
else
BUG("Illegal value for action[i - r]");
} }
/* /*
* If the run before the line 'i' found only multimatch lines, we * If the run before the line 'i' found only multimatch lines,
* return 0 and hence we don't make the current line (i) discarded. * we return false and hence we don't make the current line (i)
* We want to discard multimatch lines only when they appear in the * discarded. We want to discard multimatch lines only when
* middle of runs with nomatch lines (dis[j] == 0). * they appear in the middle of runs with nomatch lines
* (action[j] == DISCARD).
*/ */
if (rdis0 == 0) if (rdis0 == 0)
return 0; return 0;
for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) { for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
if (!dis[i + r]) if (action[i + r] == DISCARD)
rdis1++; rdis1++;
else if (dis[i + r] == 2) else if (action[i + r] == INVESTIGATE)
rpdis1++; rpdis1++;
else else if (action[i + r] == KEEP)
break; break;
else
BUG("Illegal value for action[i + r]");
} }
/* /*
* If the run after the line 'i' found only multimatch lines, we * If the run after the line 'i' found only multimatch lines,
* return 0 and hence we don't make the current line (i) discarded. * we return false and hence we don't make the current line (i)
* discarded.
*/ */
if (rdis1 == 0) if (rdis1 == 0)
return 0; return false;
rdis1 += rdis0; rdis1 += rdis0;
rpdis1 += rpdis0; rpdis1 += rpdis0;


@ -251,26 +261,38 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
/* /*
* Try to reduce the problem complexity, discard records that have no * Try to reduce the problem complexity, discard records that have no
* matches on the other file. Also, lines that have multiple matches * matches on the other file. Also, lines that have multiple matches
* might be potentially discarded if they happear in a run of discardable. * might be potentially discarded if they appear in a run of discardable.
*/ */
static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
long i, nm, nreff, mlim; long i, nm, nreff, mlim;
xrecord_t *recs; xrecord_t *recs;
xdlclass_t *rcrec; xdlclass_t *rcrec;
char *dis, *dis1, *dis2; uint8_t *action1 = NULL, *action2 = NULL;
int need_min = !!(cf->flags & XDF_NEED_MINIMAL); bool need_min = !!(cf->flags & XDF_NEED_MINIMAL);
int ret = 0;


if (!XDL_CALLOC_ARRAY(dis, xdf1->nrec + xdf2->nrec + 2)) /*
return -1; * Create temporary arrays that will help us decide if
dis1 = dis; * changed[i] should remain 0 or become 1.
dis2 = dis1 + xdf1->nrec + 1; */
if (!XDL_CALLOC_ARRAY(action1, xdf1->nrec + 1)) {
ret = -1;
goto cleanup;
}
if (!XDL_CALLOC_ARRAY(action2, xdf2->nrec + 1)) {
ret = -1;
goto cleanup;
}


/*
* Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
*/
if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT) if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT)
mlim = XDL_MAX_EQLIMIT; mlim = XDL_MAX_EQLIMIT;
for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
rcrec = cf->rcrecs[recs->ha]; rcrec = cf->rcrecs[recs->ha];
nm = rcrec ? rcrec->len2 : 0; nm = rcrec ? rcrec->len2 : 0;
dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1; action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
} }


if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT) if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
@ -278,32 +300,42 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd
for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
rcrec = cf->rcrecs[recs->ha]; rcrec = cf->rcrecs[recs->ha];
nm = rcrec ? rcrec->len1 : 0; nm = rcrec ? rcrec->len1 : 0;
dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1; action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
} }


/*
* Use temporary arrays to decide if changed[i] should remain
* 0 or become 1.
*/
for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
i <= xdf1->dend; i++, recs++) { i <= xdf1->dend; i++, recs++) {
if (dis1[i] == 1 || if (action1[i] == KEEP ||
(dis1[i] == 2 && !xdl_clean_mmatch(dis1, i, xdf1->dstart, xdf1->dend))) { (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) {
xdf1->rindex[nreff++] = i; xdf1->rindex[nreff++] = i;
/* changed[i] remains 0, i.e. keep */
} else } else
xdf1->changed[i] = 1; xdf1->changed[i] = 1;
/* i.e. discard */
} }
xdf1->nreff = nreff; xdf1->nreff = nreff;


for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart];
i <= xdf2->dend; i++, recs++) { i <= xdf2->dend; i++, recs++) {
if (dis2[i] == 1 || if (action2[i] == KEEP ||
(dis2[i] == 2 && !xdl_clean_mmatch(dis2, i, xdf2->dstart, xdf2->dend))) { (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) {
xdf2->rindex[nreff++] = i; xdf2->rindex[nreff++] = i;
/* changed[i] remains 0, i.e. keep */
} else } else
xdf2->changed[i] = 1; xdf2->changed[i] = 1;
/* i.e. discard */
} }
xdf2->nreff = nreff; xdf2->nreff = nreff;


xdl_free(dis); cleanup:
xdl_free(action1);
xdl_free(action2);


return 0; return ret;
} }