grep: disable lookahead on error
regexec(3) can fail. E.g. on macOS it fails if it is used with an UTF-8 locale to match a valid regex against a buffer containing invalid UTF-8 characters. git grep has two ways to search for matches in a file: Either it splits its contents into lines and matches them separately, or it matches the whole content and figures out line boundaries later. The latter is done by look_ahead() and it's quicker in the common case where most files don't contain a match. Fall back to line-by-line matching if look_ahead() encounters an regexec(3) error by propagating errors out of patmatch() and bailing out of look_ahead() if there is one. This way we at least can find matches in lines that contain only valid characters. That matches the behavior of grep(1) on macOS. pcre2match() dies if pcre2_jit_match() or pcre2_match() fail, but since we use the flag PCRE2_MATCH_INVALID_UTF it handles invalid UTF-8 characters gracefully. So implement the fall-back only for regexec(3) and leave the PCRE2 matching unchanged. Reported-by: David Gstir <david@sigma-star.at> Signed-off-by: René Scharfe <l.s.r@web.de> Tested-by: David Gstir <david@sigma-star.at> Signed-off-by: Taylor Blau <me@ttaylorr.com>maint
parent
34b6ce9b30
commit
ce025ae4f6
28
grep.c
28
grep.c
|
@ -906,15 +906,17 @@ static int patmatch(struct grep_pat *p,
|
|||
const char *line, const char *eol,
|
||||
regmatch_t *match, int eflags)
|
||||
{
|
||||
int hit;
|
||||
|
||||
if (p->pcre2_pattern)
|
||||
hit = !pcre2match(p, line, eol, match, eflags);
|
||||
else
|
||||
hit = !regexec_buf(&p->regexp, line, eol - line, 1, match,
|
||||
eflags);
|
||||
return !pcre2match(p, line, eol, match, eflags);
|
||||
|
||||
return hit;
|
||||
switch (regexec_buf(&p->regexp, line, eol - line, 1, match, eflags)) {
|
||||
case 0:
|
||||
return 1;
|
||||
case REG_NOMATCH:
|
||||
return 0;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static void strip_timestamp(const char *bol, const char **eol_p)
|
||||
|
@ -952,6 +954,8 @@ static int headerless_match_one_pattern(struct grep_pat *p,
|
|||
|
||||
again:
|
||||
hit = patmatch(p, bol, eol, pmatch, eflags);
|
||||
if (hit < 0)
|
||||
hit = 0;
|
||||
|
||||
if (hit && p->word_regexp) {
|
||||
if ((pmatch[0].rm_so < 0) ||
|
||||
|
@ -1461,6 +1465,8 @@ static int look_ahead(struct grep_opt *opt,
|
|||
regmatch_t m;
|
||||
|
||||
hit = patmatch(p, bol, bol + *left_p, &m, 0);
|
||||
if (hit < 0)
|
||||
return -1;
|
||||
if (!hit || m.rm_so < 0 || m.rm_eo < 0)
|
||||
continue;
|
||||
if (earliest < 0 || m.rm_so < earliest)
|
||||
|
@ -1655,9 +1661,13 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
|
|||
if (try_lookahead
|
||||
&& !(last_hit
|
||||
&& (show_function ||
|
||||
lno <= last_hit + opt->post_context))
|
||||
&& look_ahead(opt, &left, &lno, &bol))
|
||||
lno <= last_hit + opt->post_context))) {
|
||||
hit = look_ahead(opt, &left, &lno, &bol);
|
||||
if (hit < 0)
|
||||
try_lookahead = 0;
|
||||
else if (hit)
|
||||
break;
|
||||
}
|
||||
eol = end_of_line(bol, &left);
|
||||
|
||||
if ((ctx == GREP_CONTEXT_HEAD) && (eol == bol))
|
||||
|
|
|
@ -87,6 +87,7 @@ test_expect_success setup '
|
|||
# Still a no-op.
|
||||
function dummy() {}
|
||||
EOF
|
||||
printf "\200\nASCII\n" >invalid-utf8 &&
|
||||
if test_have_prereq FUNNYNAMES
|
||||
then
|
||||
echo unusual >"\"unusual\" pathname" &&
|
||||
|
@ -534,6 +535,14 @@ do
|
|||
test_cmp expected actual
|
||||
'
|
||||
|
||||
test_expect_success "grep $L searches past invalid lines on UTF-8 locale" '
|
||||
LC_ALL=en_US.UTF-8 git grep A. invalid-utf8 >actual &&
|
||||
cat >expected <<-EOF &&
|
||||
invalid-utf8:ASCII
|
||||
EOF
|
||||
test_cmp expected actual
|
||||
'
|
||||
|
||||
test_expect_success FUNNYNAMES "grep $L should quote unusual pathnames" '
|
||||
cat >expected <<-EOF &&
|
||||
${HC}"\"unusual\" pathname":unusual
|
||||
|
|
Loading…
Reference in New Issue