grep/pcre2: fix an edge case concerning ascii patterns and UTF-8 data
If we attempt to grep non-ascii log message text with an ascii pattern, we
run into the following issue:
    $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
    grep: (standard input): binary file matches
So, to fix this, teach the grep code to use PCRE2_UTF as long as the log
output is encoded in UTF-8.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
			
			
				maint
			
			
		
							parent
							
								
									6a5c337922
								
							
						
					
					
						commit
						ae39ba431a
					
				
							
								
								
									
										6
									
								
								grep.c
								
								
								
								
							
							
						
						
									
										6
									
								
								grep.c
								
								
								
								
							|  | @ -382,8 +382,10 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt | |||
| 		} | ||||
| 		options |= PCRE2_CASELESS; | ||||
| 	} | ||||
| 	if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) && | ||||
| 	    !(!opt->ignore_case && (p->fixed || p->is_fixed))) | ||||
| 	if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) || | ||||
| 	    (!opt->ignore_locale && is_utf8_locale() && | ||||
| 	     has_non_ascii(p->pattern) && !(!opt->ignore_case && | ||||
| 					    (p->fixed || p->is_fixed)))) | ||||
| 		options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF); | ||||
|  | ||||
| #ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER | ||||
|  |  | |||
|  | @ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' ' | |||
| 	test_cmp expected actual | ||||
| ' | ||||
|  | ||||
| test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' ' | ||||
| 	cat >expected <<-\EOF && | ||||
| 	Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com> | ||||
| 	EOF | ||||
| 	test_write_lines "forth" >file4 && | ||||
| 	git add file4 && | ||||
| 	git commit --author="À Ú Thor <author@example.com>" -m sécond && | ||||
| 	git log -1 --color=always --perl-regexp --author=".*Thor" >log && | ||||
| 	grep Author log >actual.raw && | ||||
| 	test_decode_color <actual.raw >actual && | ||||
| 	test_cmp expected actual | ||||
| ' | ||||
|  | ||||
| test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' ' | ||||
| 	cat >expected <<-\EOF && | ||||
| 	Commit:     Ç<BOLD;RED> O Mîtter <committer@example.com><RESET> | ||||
| 	EOF | ||||
| 	test_write_lines "fifth" >file5 && | ||||
| 	git add file5 && | ||||
| 	GIT_COMMITTER_NAME="Ç O Mîtter" && | ||||
| 	GIT_COMMITTER_EMAIL="committer@example.com" && | ||||
| 	git -c i18n.commitEncoding=latin1 commit -m thïrd && | ||||
| 	git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log && | ||||
| 	grep Commit: log >actual.raw && | ||||
| 	test_decode_color <actual.raw >actual && | ||||
| 	test_cmp expected actual | ||||
| ' | ||||
|  | ||||
| test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' ' | ||||
| 	cat >expected <<-\EOF && | ||||
| 	    sé<BOLD;RED>con<RESET>d | ||||
| 	EOF | ||||
| 	git log -1 --color=always --perl-regexp --grep="con" >log && | ||||
| 	grep con log >actual.raw && | ||||
| 	test_decode_color <actual.raw >actual && | ||||
| 	test_cmp expected actual | ||||
| ' | ||||
|  | ||||
| test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' ' | ||||
| 	cat >expected <<-\EOF && | ||||
| 	    <BOLD;RED>thïrd<RESET> | ||||
| 	EOF | ||||
| 	git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log && | ||||
| 	grep "th.*rd" log >actual.raw && | ||||
| 	test_decode_color <actual.raw >actual && | ||||
| 	test_cmp expected actual | ||||
| ' | ||||
|  | ||||
| test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' ' | ||||
| 	printf "\\200\\n" >invalid-0x80 && | ||||
| 	echo "ævar" >expected && | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Hamza Mahfooz
						Hamza Mahfooz