xdiff: move xdl_cleanup_records() from xprepare.c to xdiffi.c

Only the classic diff uses xdl_cleanup_records(). Move it, xdl_clean_mmatch(), and the macros to xdiffi.c and call xdl_cleanup_records() inside of xdl_do_classic_diff(). This better organizes the code related to the classic diff. Signed-off-by: Ezekiel Newren <ezekielnewren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2026-01-02 18:52:24 +00:00 · 2026-01-02 18:52:24 +00:00 · 083774849b
parent efd314b8e8
commit 083774849b
2 changed files with 181 additions and 190 deletions
--- a/xdiff/xdiffi.c
+++ b/xdiff/xdiffi.c
@ -21,6 +21,7 @@
 */

 #include "xinclude.h"
+#include "compat/ivec.h"

 static size_t get_hash(xdfile_t *xdf, long index)
 {
@ -33,6 +34,14 @@ static size_t get_hash(xdfile_t *xdf, long index)
 #define XDL_SNAKE_CNT 20
 #define XDL_K_HEUR 4

+#define XDL_KPDIS_RUN 4 /* weight of the multimatch count in the final test of xdl_clean_mmatch() */
+#define XDL_MAX_EQLIMIT 1024 /* upper bound for the match limit (mlim) in xdl_cleanup_records() */
+#define XDL_SIMSCAN_WINDOW 100 /* max lines xdl_clean_mmatch() scans on each side of a line */
+
+#define DISCARD 0 /* record has no match in the other file */
+#define KEEP 1 /* record has matches and is below the match limit, or a minimal diff was requested */
+#define INVESTIGATE 2 /* record has at least mlim matches; xdl_clean_mmatch() decides its fate */
+
 typedef struct s_xdpsplit {
 	long i1, i2;
 	int min_lo, min_hi;
@ -311,6 +320,175 @@ int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
 }


+static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
+	long r, rdis0, rpdis0, rdis1, rpdis1;
+
+	/*
+	 * Decide whether line 'i', which the caller has classified as
+	 * INVESTIGATE, should be discarded: returns true to discard it
+	 * and false to keep it. 'action' holds the per-line
+	 * classification and [s, e] is the inclusive range of lines
+	 * that may be examined.
+	 *
+	 * Limit the window that is examined during the similar-lines
+	 * scan. The loops below stop when they reach a KEEP line (one
+	 * that has matches but is not a multimatch line), but there are
+	 * corner cases where they would otherwise proceed all the way
+	 * to the extremities, causing huge performance penalties in
+	 * case of big files.
+	 */
+	if (i - s > XDL_SIMSCAN_WINDOW)
+		s = i - XDL_SIMSCAN_WINDOW;
+	if (e - i > XDL_SIMSCAN_WINDOW)
+		e = i + XDL_SIMSCAN_WINDOW;
+
+	/*
+	 * Scan the lines before 'i' to find a run of lines that either
+	 * have no match (action[j] == DISCARD) or have multiple matches
+	 * (action[j] == INVESTIGATE). rdis0 counts the former and rpdis0
+	 * the latter. Note that we always call this function with
+	 * action[i] == INVESTIGATE, so the current line (i) is already
+	 * a multimatch line; that is why rpdis0 starts at 1.
+	 */
+	for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
+		if (action[i - r] == DISCARD)
+			rdis0++;
+		else if (action[i - r] == INVESTIGATE)
+			rpdis0++;
+		else if (action[i - r] == KEEP)
+			break;
+		else
+			BUG("Illegal value for action[i - r]");
+	}
+	/*
+	 * If the run before the line 'i' contained no nomatch line
+	 * (only multimatch lines, or nothing at all), we return false
+	 * and hence we don't make the current line (i) discarded. We
+	 * want to discard multimatch lines only when they appear in the
+	 * middle of runs with nomatch lines (action[j] == DISCARD).
+	 */
+	if (rdis0 == 0)
+		return 0;
+	for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
+		if (action[i + r] == DISCARD)
+			rdis1++;
+		else if (action[i + r] == INVESTIGATE)
+			rpdis1++;
+		else if (action[i + r] == KEEP)
+			break;
+		else
+			BUG("Illegal value for action[i + r]");
+	}
+	/*
+	 * If the run after the line 'i' contained no nomatch line, we
+	 * return false and hence we don't make the current line (i)
+	 * discarded. Otherwise the counts from both directions are
+	 * summed (line 'i' is counted once per direction, since rpdis0
+	 * and rpdis1 both start at 1) and the line is discarded only
+	 * when the multimatch count times XDL_KPDIS_RUN is smaller than
+	 * the multimatch count plus the nomatch count.
+	 */
+	if (rdis1 == 0)
+		return false;
+	rdis1 += rdis0;
+	rpdis1 += rpdis0;
+
+	return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
+}
+
+struct xoccurrence
+{
+	size_t file1, file2; /* how many records of xdf1 / xdf2 carry a given minimal_perfect_hash */
+};
+
+
+DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence);
+
+
+/*
+ * Try to reduce the problem complexity, discard records that have no
+ * matches on the other file. Also, lines that have multiple matches
+ * might be potentially discarded if they appear in a run of
+ * discardable lines.
+ *
+ * Only records from delta_start up to nrec - 1 - delta_end are
+ * classified. A kept record has its index appended to
+ * reference_index[] (nreff counts the entries); a discarded record
+ * gets changed[i] set to true. When flags contains XDF_NEED_MINIMAL
+ * no record is classified INVESTIGATE, so only records without any
+ * match are discarded.
+ *
+ * Returns 0 on success, or -1 when XDL_CALLOC_ARRAY() fails for one
+ * of the temporary arrays; on that path nothing has been written to
+ * xe yet.
+ *
+ * NOTE(review): the call added to xdl_do_classic_diff() ignores this
+ * return value - confirm that an allocation failure is handled there.
+ */
+static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) {
+	long i;
+	size_t nm, mlim;
+	xrecord_t *recs;
+	uint8_t *action1 = NULL, *action2 = NULL;
+	struct IVec_xoccurrence occ;
+	bool need_min = !!(flags & XDF_NEED_MINIMAL);
+	int ret = 0;
+	ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end;
+	ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end;
+
+	IVEC_INIT(occ);
+	ivec_zero(&occ, xe->mph_size); /* presumably mph_size zeroed counters, one per hash value - confirm in compat/ivec.h */
+
+	for (size_t j = 0; j < xe->xdf1.nrec; j++) {
+		size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash;
+		occ.ptr[mph1].file1 += 1; /* occurrences of this hash in file 1 */
+	}
+
+	for (size_t j = 0; j < xe->xdf2.nrec; j++) {
+		size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash;
+		occ.ptr[mph2].file2 += 1; /* occurrences of this hash in file 2 */
+	}
+
+	/*
+	 * Create temporary arrays that will help us decide if
+	 * changed[i] should remain false, or become true. They are
+	 * zero-filled and hold one entry more than the record count.
+	 */
+	if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1)) {
+		ret = -1;
+		goto cleanup;
+	}
+	if (!XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1)) {
+		ret = -1;
+		goto cleanup;
+	}
+
+	/*
+	 * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
+	 * A record that never occurs in the other file is DISCARD; one
+	 * that occurs there at least mlim times is INVESTIGATE, unless
+	 * a minimal diff was requested; every other record is KEEP.
+	 * mlim is xdl_bogosqrt() of the record count of the file being
+	 * classified, capped at XDL_MAX_EQLIMIT.
+	 */
+	if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT)
+		mlim = XDL_MAX_EQLIMIT;
+	for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) {
+		nm = occ.ptr[recs->minimal_perfect_hash].file2;
+		action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
+	}
+
+	if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT)
+		mlim = XDL_MAX_EQLIMIT;
+	for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) {
+		nm = occ.ptr[recs->minimal_perfect_hash].file1;
+		action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
+	}
+
+	/*
+	 * Use temporary arrays to decide if changed[i] should remain
+	 * false, or become true. KEEP records, and INVESTIGATE records
+	 * that xdl_clean_mmatch() declines to discard, are recorded in
+	 * reference_index[]; all others are marked as changed. (recs is
+	 * advanced by the two loops below but never read in them.)
+	 */
+	xe->xdf1.nreff = 0;
+	for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start];
+	     i <= dend1; i++, recs++) {
+		if (action1[i] == KEEP ||
+		    (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, dend1))) {
+			xe->xdf1.reference_index[xe->xdf1.nreff++] = i;
+			/* changed[i] remains false, i.e. keep */
+		} else
+			xe->xdf1.changed[i] = true;
+			/* i.e. discard */
+	}
+
+	xe->xdf2.nreff = 0;
+	for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start];
+	     i <= dend2; i++, recs++) {
+		if (action2[i] == KEEP ||
+		    (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, dend2))) {
+			xe->xdf2.reference_index[xe->xdf2.nreff++] = i;
+			/* changed[i] remains false, i.e. keep */
+		} else
+			xe->xdf2.changed[i] = true;
+			/* i.e. discard */
+	}
+
+cleanup:
+	xdl_free(action1);
+	xdl_free(action2);
+	ivec_free(&occ);
+
+	return ret;
+}
+
+
 int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags)
 {
 	long ndiags;
@ -318,6 +496,8 @@ int xdl_do_classic_diff(xdfenv_t *xe, uint64_t flags)
 	xdalgoenv_t xenv;
 	int res;

+	xdl_cleanup_records(xe, flags);
+
 	/*
 	 * Allocate and setup K vectors to be used by the differential
 	 * algorithm.
--- a/xdiff/xprepare.c
+++ b/xdiff/xprepare.c
@ -24,14 +24,6 @@
 #include "compat/ivec.h"


-#define XDL_KPDIS_RUN 4
-#define XDL_MAX_EQLIMIT 1024
-#define XDL_SIMSCAN_WINDOW 100
-
-#define DISCARD 0
-#define KEEP 1
-#define INVESTIGATE 2
-
 typedef struct s_xdlclass {
 	struct s_xdlclass *next;
 	xrecord_t rec;
@ -50,8 +42,6 @@ typedef struct s_xdlclassifier {
 } xdlclassifier_t;


-
-
 static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
 	memset(cf, 0, sizeof(xdlclassifier_t));

@ -186,175 +176,6 @@ void xdl_free_env(xdfenv_t *xe) {
 }


-static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
-	long r, rdis0, rpdis0, rdis1, rpdis1;
-
-	/*
-	 * Limits the window that is examined during the similar-lines
-	 * scan. The loops below stops when action[i - r] == KEEP
-	 * (line that has no match), but there are corner cases where
-	 * the loop proceed all the way to the extremities by causing
-	 * huge performance penalties in case of big files.
-	 */
-	if (i - s > XDL_SIMSCAN_WINDOW)
-		s = i - XDL_SIMSCAN_WINDOW;
-	if (e - i > XDL_SIMSCAN_WINDOW)
-		e = i + XDL_SIMSCAN_WINDOW;
-
-	/*
-	 * Scans the lines before 'i' to find a run of lines that either
-	 * have no match (action[j] == DISCARD) or have multiple matches
-	 * (action[j] == INVESTIGATE). Note that we always call this
-	 * function with action[i] == INVESTIGATE, so the current line
-	 * (i) is already a multimatch line.
-	 */
-	for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
-		if (action[i - r] == DISCARD)
-			rdis0++;
-		else if (action[i - r] == INVESTIGATE)
-			rpdis0++;
-		else if (action[i - r] == KEEP)
-			break;
-		else
-			BUG("Illegal value for action[i - r]");
-	}
-	/*
-	 * If the run before the line 'i' found only multimatch lines,
-	 * we return false and hence we don't make the current line (i)
-	 * discarded. We want to discard multimatch lines only when
-	 * they appear in the middle of runs with nomatch lines
-	 * (action[j] == DISCARD).
-	 */
-	if (rdis0 == 0)
-		return 0;
-	for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
-		if (action[i + r] == DISCARD)
-			rdis1++;
-		else if (action[i + r] == INVESTIGATE)
-			rpdis1++;
-		else if (action[i + r] == KEEP)
-			break;
-		else
-			BUG("Illegal value for action[i + r]");
-	}
-	/*
-	 * If the run after the line 'i' found only multimatch lines,
-	 * we return false and hence we don't make the current line (i)
-	 * discarded.
-	 */
-	if (rdis1 == 0)
-		return false;
-	rdis1 += rdis0;
-	rpdis1 += rpdis0;
-
-	return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
-}
-
-struct xoccurrence
-{
-	size_t file1, file2;
-};
-
-
-DEFINE_IVEC_TYPE(struct xoccurrence, xoccurrence);
-
-
-/*
- * Try to reduce the problem complexity, discard records that have no
- * matches on the other file. Also, lines that have multiple matches
- * might be potentially discarded if they appear in a run of discardable.
- */
-static int xdl_cleanup_records(xdfenv_t *xe, uint64_t flags) {
-	long i;
-	size_t nm, mlim;
-	xrecord_t *recs;
-	uint8_t *action1 = NULL, *action2 = NULL;
-	struct IVec_xoccurrence occ;
-	bool need_min = !!(flags & XDF_NEED_MINIMAL);
-	int ret = 0;
-	ptrdiff_t dend1 = xe->xdf1.nrec - 1 - xe->delta_end;
-	ptrdiff_t dend2 = xe->xdf2.nrec - 1 - xe->delta_end;
-
-	IVEC_INIT(occ);
-	ivec_zero(&occ, xe->mph_size);
-
-	for (size_t j = 0; j < xe->xdf1.nrec; j++) {
-		size_t mph1 = xe->xdf1.recs[j].minimal_perfect_hash;
-		occ.ptr[mph1].file1 += 1;
-	}
-
-	for (size_t j = 0; j < xe->xdf2.nrec; j++) {
-		size_t mph2 = xe->xdf2.recs[j].minimal_perfect_hash;
-		occ.ptr[mph2].file2 += 1;
-	}
-
-	/*
-	 * Create temporary arrays that will help us decide if
-	 * changed[i] should remain false, or become true.
-	 */
-	if (!XDL_CALLOC_ARRAY(action1, xe->xdf1.nrec + 1)) {
-		ret = -1;
-		goto cleanup;
-	}
-	if (!XDL_CALLOC_ARRAY(action2, xe->xdf2.nrec + 1)) {
-		ret = -1;
-		goto cleanup;
-	}
-
-	/*
-	 * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
-	 */
-	if ((mlim = xdl_bogosqrt((long)xe->xdf1.nrec)) > XDL_MAX_EQLIMIT)
-		mlim = XDL_MAX_EQLIMIT;
-	for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start]; i <= dend1; i++, recs++) {
-		nm = occ.ptr[recs->minimal_perfect_hash].file2;
-		action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
-	}
-
-	if ((mlim = xdl_bogosqrt((long)xe->xdf2.nrec)) > XDL_MAX_EQLIMIT)
-		mlim = XDL_MAX_EQLIMIT;
-	for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start]; i <= dend2; i++, recs++) {
-		nm = occ.ptr[recs->minimal_perfect_hash].file1;
-		action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
-	}
-
-	/*
-	 * Use temporary arrays to decide if changed[i] should remain
-	 * false, or become true.
-	 */
-	xe->xdf1.nreff = 0;
-	for (i = xe->delta_start, recs = &xe->xdf1.recs[xe->delta_start];
-	     i <= dend1; i++, recs++) {
-		if (action1[i] == KEEP ||
-		    (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xe->delta_start, dend1))) {
-			xe->xdf1.reference_index[xe->xdf1.nreff++] = i;
-			/* changed[i] remains false, i.e. keep */
-		} else
-			xe->xdf1.changed[i] = true;
-			/* i.e. discard */
-	}
-
-	xe->xdf2.nreff = 0;
-	for (i = xe->delta_start, recs = &xe->xdf2.recs[xe->delta_start];
-	     i <= dend2; i++, recs++) {
-		if (action2[i] == KEEP ||
-		    (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xe->delta_start, dend2))) {
-			xe->xdf2.reference_index[xe->xdf2.nreff++] = i;
-			/* changed[i] remains false, i.e. keep */
-		} else
-			xe->xdf2.changed[i] = true;
-			/* i.e. discard */
-	}
-
-cleanup:
-	xdl_free(action1);
-	xdl_free(action2);
-	ivec_free(&occ);
-
-	return ret;
-}
-
-
 /*
 * Early trim initial and terminal matching records.
 */
@ -414,19 +235,9 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 	}

 	xe->mph_size = cf.count;
+	xdl_free_classifier(&cf);

 	xdl_trim_ends(xe);
-	if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
-	    (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
-	    xdl_cleanup_records(xe, xpp->flags) < 0) {
-
-		xdl_free_ctx(&xe->xdf2);
-		xdl_free_ctx(&xe->xdf1);
-		xdl_free_classifier(&cf);
-		return -1;
-	}
-
-	xdl_free_classifier(&cf);

 	return 0;
 }