grep: don't use PCRE2?_UTF8 with "log --encoding=<non-utf8>"

Fix a bug introduced in 18547aacf5 ("grep/pcre: support utf-8", 2016-06-25) that was missed due to a blindspot in our tests, as discussed in the previous commit. I then blindly copied the same bug in 94da9193a6 ("grep: add support for PCRE v2", 2017-06-01) when adding the PCRE v2 code. We should not tell PCRE that we're processing UTF-8 just because we're dealing with non-ASCII. In the case of e.g. "log --encoding=<...>" under is_utf8_locale() the haystack might be in ISO-8859-1, and the needle might be in a non-UTF-8 encoding. Maybe we should be more strict here and die earlier? Should we also be converting the needle to the encoding in question, and failing if it's not a string that's valid in that encoding? Maybe. But for now matching this as non-UTF8 at least has some hope of producing sensible results, since we know that our default heuristic of assuming the text to be matched is in the user locale encoding isn't true when we've explicitly encoded it to be in a different encoding. Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2019-06-28 01:39:05 +02:00 · 2019-06-28 01:39:05 +02:00 · 44570188a0
parent 4e2443b181
commit 44570188a0
4 changed files with 10 additions and 8 deletions
--- a/grep.c
+++ b/grep.c
@ -388,11 +388,11 @@ static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
 	int options = PCRE_MULTILINE;

 	if (opt->ignore_case) {
-		if (has_non_ascii(p->pattern))
+		if (!opt->ignore_locale && has_non_ascii(p->pattern))
 			p->pcre1_tables = pcre_maketables();
 		options |= PCRE_CASELESS;
 	}
-	if (is_utf8_locale() && has_non_ascii(p->pattern))
+	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern))
 		options |= PCRE_UTF8;

 	p->pcre1_regexp = pcre_compile(p->pattern, options, &error, &erroffset,
@ -498,14 +498,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
 	p->pcre2_compile_context = NULL;

 	if (opt->ignore_case) {
-		if (has_non_ascii(p->pattern)) {
+		if (!opt->ignore_locale && has_non_ascii(p->pattern)) {
 			character_tables = pcre2_maketables(NULL);
 			p->pcre2_compile_context = pcre2_compile_context_create(NULL);
 			pcre2_set_character_tables(p->pcre2_compile_context, character_tables);
 		}
 		options |= PCRE2_CASELESS;
 	}
-	if (is_utf8_locale() && has_non_ascii(p->pattern))
+	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern))
 		options |= PCRE2_UTF;

 	p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,
--- a/grep.h
+++ b/grep.h
@ -173,6 +173,7 @@ struct grep_opt {
 	int funcbody;
 	int extended_regexp_option;
 	int pattern_type_option;
+	int ignore_locale;
 	char colors[NR_GREP_COLORS][COLOR_MAXLEN];
 	unsigned pre_context;
 	unsigned post_context;
--- a/revision.c
+++ b/revision.c
@ -28,6 +28,7 @@
 #include "commit-graph.h"
 #include "prio-queue.h"
 #include "hashmap.h"
+#include "utf8.h"

 volatile show_early_output_fn_t show_early_output;

@ -2655,6 +2656,8 @@ int setup_revisions(int argc, const char **argv, struct rev_info *revs, struct s

 	grep_commit_pattern_type(GREP_PATTERN_TYPE_UNSPECIFIED,
 				 &revs->grep_filter);
+	if (!is_encoding_utf8(get_log_output_encoding()))
+		revs->grep_filter.ignore_locale = 1;
 	compile_grep_patterns(&revs->grep_filter);

 	if (revs->reverse && revs->reflog_info)
--- a/t/t4210-log-i18n.sh
+++ b/t/t4210-log-i18n.sh
@ -59,10 +59,8 @@ test_expect_success 'log --grep does not find non-reencoded values (latin1)' '
 for engine in fixed basic extended perl
 do
 	prereq=
-	result=success
 	if test $engine = "perl"
 	then
-		result=failure
 		prereq="PCRE"
 	else
 		prereq=""
@ -72,7 +70,7 @@ do
 	then
 	    force_regex=.*
 	fi
-	test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" "
+	test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" "
 		cat >expect <<-\EOF &&
 		latin1
 		utf8
@ -86,7 +84,7 @@ do
 		test_must_be_empty actual
 	"

-	test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" "
+	test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" "
 		LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$invalid_e\" >actual &&
 		test_must_be_empty actual
 	"