grep: optimize built-in grep by skipping lines that do not hit

The internal "grep" engine we use checks for hits line-by-line, instead of letting the underlying regexec()/fixmatch() routines scan for the first match from the rest of the buffer. This was a major source of overhead compared to the external grep. Introduce a "look-ahead" mechanism to find the next line that would potentially match by using regexec()/fixmatch() in the remainder of the text to skip unmatching lines, and use it when the query criteria is simple enough (i.e. punt for an advanced grep boolean expression like "lines that have both X and Y but not Z" for now) and we are not running under "-v" (aka "--invert-match") option. Note that "-L" (aka "--files-without-match") is not a reason to disable this optimization. Under the option, we are interested if the file has any hit at all, and that is what we determine reliably with or without the optimization. Signed-off-by: Junio C Hamano <gitster@pobox.com>
2010-01-10 22:39:36 -08:00 · 2010-01-10 22:39:36 -08:00 · a26345b608
parent cb572206d9
commit a26345b608
1 changed files with 75 additions and 0 deletions
--- a/grep.c
+++ b/grep.c
@ -608,6 +608,65 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
 	}
 }

+static int should_lookahead(struct grep_opt *opt)
+{
+	struct grep_pat *p;
+
+	if (opt->extended)
+		return 0; /* punt for too complex stuff */
+	if (opt->invert)
+		return 0;
+	for (p = opt->pattern_list; p; p = p->next) {
+		if (p->token != GREP_PATTERN)
+			return 0; /* punt for "header only" and stuff */
+	}
+	return 1;
+}
+
+static int look_ahead(struct grep_opt *opt,
+		      unsigned long *left_p,
+		      unsigned *lno_p,
+		      char **bol_p)
+{
+	unsigned lno = *lno_p;
+	char *bol = *bol_p;
+	struct grep_pat *p;
+	char *sp, *last_bol;
+	regoff_t earliest = -1;
+
+	for (p = opt->pattern_list; p; p = p->next) {
+		int hit;
+		regmatch_t m;
+
+		if (p->fixed)
+			hit = !fixmatch(p->pattern, bol, &m);
+		else
+			hit = !regexec(&p->regexp, bol, 1, &m, 0);
+		if (!hit || m.rm_so < 0 || m.rm_eo < 0)
+			continue;
+		if (earliest < 0 || m.rm_so < earliest)
+			earliest = m.rm_so;
+	}
+
+	if (earliest < 0) {
+		*bol_p = bol + *left_p;
+		*left_p = 0;
+		return 1;
+	}
+	for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
+		; /* find the beginning of the line */
+	last_bol = sp;
+
+	for (sp = bol; sp < last_bol; sp++) {
+		if (*sp == '\n')
+			lno++;
+	}
+	*left_p -= last_bol - bol;
+	*bol_p = last_bol;
+	*lno_p = lno;
+	return 0;
+}
+
 static int grep_buffer_1(struct grep_opt *opt, const char *name,
 			 char *buf, unsigned long size, int collect_hits)
 {
@ -617,6 +676,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 	unsigned last_hit = 0;
 	int binary_match_only = 0;
 	unsigned count = 0;
+	int try_lookahead = 0;
 	enum grep_context ctx = GREP_CONTEXT_HEAD;
 	xdemitconf_t xecfg;

@ -645,11 +705,26 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 			opt->priv = &xecfg;
 		}
 	}
+	try_lookahead = should_lookahead(opt);

 	while (left) {
 		char *eol, ch;
 		int hit;

+		/*
+		 * look_ahead() skips quicly to the line that possibly
+		 * has the next hit; don't call it if we need to do
+		 * something more than just skipping the current line
+		 * in response to an unmatch for the current line.  E.g.
+		 * inside a post-context window, we will show the current
+		 * line as a context around the previous hit when it
+		 * doesn't hit.
+		 */
+		if (try_lookahead
+		    && !(last_hit
+			 && lno <= last_hit + opt->post_context)
+		    && look_ahead(opt, &left, &lno, &bol))
+			break;
 		eol = end_of_line(bol, &left);
 		ch = *eol;
 		*eol = 0;