You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
456 lines
11 KiB
456 lines
11 KiB
From 7a7c776a4e228d180e74614fd8c8afcad5d4bdf7 Mon Sep 17 00:00:00 2001 |
|
From: Jakub Martisko <jamartis@redhat.com> |
|
Date: Thu, 7 Jul 2016 12:53:26 +0200 |
|
Subject: [PATCH] coreutils-i18n-un-expand-BOM.patch |
|
|
|
--- |
|
src/expand-common.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++ |
|
src/expand-common.h | 12 ++++++ |
|
src/expand.c | 45 +++++++++++++++++++- |
|
src/unexpand.c | 43 ++++++++++++++++++- |
|
tests/expand/mb.sh | 71 ++++++++++++++++++++++++++++++++ |
|
tests/unexpand/mb.sh | 59 ++++++++++++++++++++++++++ |
|
6 files changed, 342 insertions(+), 2 deletions(-) |
|
|
|
diff --git a/src/expand-common.c b/src/expand-common.c |
|
index 4657e46..97cbb09 100644 |
|
--- a/src/expand-common.c |
|
+++ b/src/expand-common.c |
|
@@ -19,6 +19,7 @@ |
|
#include <assert.h> |
|
#include <stdio.h> |
|
#include <sys/types.h> |
|
+#include <mbfile.h> |
|
#include "system.h" |
|
#include "die.h" |
|
#include "error.h" |
|
@@ -126,6 +127,119 @@ set_increment_size (uintmax_t tabval) |
|
return ok; |
|
} |
|
|
|
+/* Switch the process to a UTF-8 locale by trying a fixed list of |
|
+   candidate names in order.  Return 0 as soon as setlocale accepts |
|
+   one, or 1 if none does; reporting the failure is left to the caller.  */ |
|
+extern int |
|
+set_utf_locale (void) |
|
+{ |
|
+  static const char *const predef_locales[] = |
|
+    { |
|
+      "C.UTF8", "en_US.UTF8", "en_GB.UTF8" |
|
+    }; |
|
+  const size_t predef_locales_count |
|
+    = sizeof predef_locales / sizeof predef_locales[0]; |
|
+ |
|
+  for (size_t i = 0; i < predef_locales_count; i++) |
|
+    { |
|
+      if (setlocale (LC_ALL, predef_locales[i]) != NULL) |
|
+        return 0; |
|
+    } |
|
+ |
|
+  return 1; |
|
+} |
|
+ |
|
+/* Return true if the name of the current LC_CTYPE locale mentions |
|
+   UTF-8 (spelled "utf8" or "utf-8", in any letter case).  Return false |
|
+   otherwise, or when setlocale cannot report the locale name.  */ |
|
+extern bool |
|
+check_utf_locale (void) |
|
+{ |
|
+  const char *ctype_name = setlocale (LC_CTYPE, NULL); |
|
+ |
|
+  if (!ctype_name) |
|
+    return false; |
|
+ |
|
+  return (strcasestr (ctype_name, "utf8") != NULL |
|
+          || strcasestr (ctype_name, "utf-8") != NULL); |
|
+} |
|
+ |
|
+/* Detect and consume a UTF-8 byte order mark (EF BB BF) at the current |
|
+   read position of FP. |
|
+ |
|
+   Return true when all three bytes were read; they are discarded and |
|
+   MBF->bufcount is 0.  Return false otherwise.  In that case the last |
|
+   byte obtained is handed back with ungetc, and any earlier bytes are |
|
+   stored in MBF->buf with MBF->bufcount set to how many were stored |
|
+   (presumably so the mbfile reader replays them first, since ungetc only |
|
+   guarantees one byte of pushback -- confirm against mbfile.h).  */ |
|
+extern bool |
|
+check_bom (FILE *fp, mb_file_t *mbf) |
|
+{ |
|
+  int first; |
|
+  int second; |
|
+  int third; |
|
+ |
|
+  first = fgetc (fp); |
|
+  mbf->bufcount = 0; |
|
+ |
|
+  if (first != 0xEF) |
|
+    { |
|
+      /* Not a BOM lead byte: return it unless the stream is empty.  */ |
|
+      if (first != EOF) |
|
+        ungetc (first, fp); |
|
+      return false; |
|
+    } |
|
+ |
|
+  second = fgetc (fp); |
|
+  if (second != 0xBB) |
|
+    { |
|
+      if (second == EOF) |
|
+        { |
|
+          /* The input ended right after EF.  */ |
|
+          ungetc (0xEF, fp); |
|
+          return false; |
|
+        } |
|
+      /* EF followed by some other byte: buffer EF, return the byte.  */ |
|
+      mbf->buf[0] = (unsigned char) 0xEF; |
|
+      mbf->bufcount = 1; |
|
+      ungetc (second, fp); |
|
+      return false; |
|
+    } |
|
+ |
|
+  third = fgetc (fp); |
|
+  if (third == 0xBF) |
|
+    { |
|
+      /* Complete EF BB BF sequence: nothing to give back.  */ |
|
+      mbf->bufcount = 0; |
|
+      return true; |
|
+    } |
|
+ |
|
+  if (third == EOF) |
|
+    { |
|
+      /* The input ended right after EF BB.  */ |
|
+      mbf->buf[0] = (unsigned char) 0xEF; |
|
+      mbf->bufcount = 1; |
|
+      ungetc (0xBB, fp); |
|
+      return false; |
|
+    } |
|
+ |
|
+  /* EF BB followed by a byte other than BF.  */ |
|
+  mbf->buf[0] = (unsigned char) 0xEF; |
|
+  mbf->buf[1] = (unsigned char) 0xBB; |
|
+  mbf->bufcount = 2; |
|
+  ungetc (third, fp); |
|
+  return false; |
|
+} |
|
+ |
|
+/* Write the three-byte UTF-8 byte order mark (EF BB BF) to stdout.  */ |
|
+extern void |
|
+print_bom (void) |
|
+{ |
|
+  static const unsigned char bom[] = { 0xEF, 0xBB, 0xBF }; |
|
+  fwrite (bom, 1, sizeof bom, stdout); |
|
+} |
|
+ |
|
/* Add the comma or blank separated list of tab stops STOPS |
|
to the list of tab stops. */ |
|
extern void |
|
diff --git a/src/expand-common.h b/src/expand-common.h |
|
index 8cb2079..763bfda 100644 |
|
--- a/src/expand-common.h |
|
+++ b/src/expand-common.h |
|
@@ -34,6 +34,18 @@ extern size_t max_column_width; |
|
/* The desired exit status. */ |
|
extern int exit_status; |
|
|
|
+/* Try to switch to a UTF-8 locale; return 0 on success, 1 on failure.  */ |
|
+extern int set_utf_locale (void); |
|
+ |
|
+/* Return true if the current LC_CTYPE locale name mentions UTF-8.  */ |
|
+extern bool check_utf_locale (void); |
|
+ |
|
+/* Consume a UTF-8 BOM at the start of FP; return true if one was found.  */ |
|
+extern bool check_bom (FILE *fp, mb_file_t *mbf); |
|
+ |
|
+/* Write the UTF-8 BOM (EF BB BF) to stdout.  */ |
|
+extern void print_bom (void); |
|
+ |
|
/* Add tab stop TABVAL to the end of 'tab_list'. */ |
|
extern void |
|
add_tab_stop (uintmax_t tabval); |
|
diff --git a/src/expand.c b/src/expand.c |
|
index 310b349..4136824 100644 |
|
--- a/src/expand.c |
|
+++ b/src/expand.c |
|
@@ -103,11 +103,33 @@ expand (void) |
|
FILE *fp = next_file (NULL); |
|
mb_file_t mbf; |
|
mbf_char_t c; |
|
+ /* True if the starting locale is utf8. */ |
|
+ bool using_utf_locale; |
|
+ |
|
+ /* True if the first file contains BOM header. */ |
|
+ bool found_bom; |
|
+ using_utf_locale=check_utf_locale(); |
|
|
|
if (!fp) |
|
return; |
|
- |
|
mbf_init (mbf, fp); |
|
+ found_bom=check_bom(fp,&mbf); |
|
+ |
|
+ if (using_utf_locale == false && found_bom == true) |
|
+ { |
|
+ /*try using some predefined locale */ |
|
+ |
|
+ if (set_utf_locale () != 0) |
|
+ { |
|
+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); |
|
+ } |
|
+ } |
|
+ |
|
+ |
|
+ if (found_bom == true) |
|
+ { |
|
+ print_bom(); |
|
+ } |
|
|
|
while (true) |
|
{ |
|
@@ -132,6 +154,27 @@ expand (void) |
|
if ((mb_iseof (c)) && (fp = next_file (fp))) |
|
{ |
|
mbf_init (mbf, fp); |
|
+ if (fp!=NULL) |
|
+ { |
|
+ if (check_bom(fp,&mbf)==true) |
|
+ { |
|
+ /*Not the first file - check BOM header*/ |
|
+ if (using_utf_locale==false && found_bom==false) |
|
+ { |
|
+ /*BOM header in subsequent file but not in the first one. */ |
|
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); |
|
+ } |
|
+ } |
|
+ else |
|
+ { |
|
+ if(using_utf_locale==false && found_bom==true) |
|
+ { |
|
+ /*First file contained BOM header - locale was switched to UTF |
|
+ *all subsequent files should contain BOM. */ |
|
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); |
|
+ } |
|
+ } |
|
+ } |
|
continue; |
|
} |
|
else |
|
diff --git a/src/unexpand.c b/src/unexpand.c |
|
index 863a90a..5681b58 100644 |
|
--- a/src/unexpand.c |
|
+++ b/src/unexpand.c |
|
@@ -116,16 +116,36 @@ unexpand (void) |
|
include characters other than spaces, so the blanks must be |
|
stored, not merely counted. */ |
|
mbf_char_t *pending_blank; |
|
+ /* True if the starting locale is utf8. */ |
|
+ bool using_utf_locale; |
|
+ |
|
+ /* True if the first file contains BOM header. */ |
|
+ bool found_bom; |
|
+ using_utf_locale=check_utf_locale(); |
|
|
|
if (!fp) |
|
return; |
|
+ mbf_init (mbf, fp); |
|
+ found_bom=check_bom(fp,&mbf); |
|
+ |
|
+ if (using_utf_locale == false && found_bom == true) |
|
+ { |
|
+ /*try using some predefined locale */ |
|
|
|
+ if (set_utf_locale () != 0) |
|
+ { |
|
+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); |
|
+ } |
|
+ } |
|
/* The worst case is a non-blank character, then one blank, then a |
|
tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so |
|
allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ |
|
pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); |
|
|
|
- mbf_init (mbf, fp); |
|
+ if (found_bom == true) |
|
+ { |
|
+ print_bom(); |
|
+ } |
|
|
|
while (true) |
|
{ |
|
@@ -169,6 +189,27 @@ unexpand (void) |
|
if ((mb_iseof (c)) && (fp = next_file (fp))) |
|
{ |
|
mbf_init (mbf, fp); |
|
+ if (fp!=NULL) |
|
+ { |
|
+ if (check_bom(fp,&mbf)==true) |
|
+ { |
|
+ /*Not the first file - check BOM header*/ |
|
+ if (using_utf_locale==false && found_bom==false) |
|
+ { |
|
+ /*BOM header in subsequent file but not in the first one. */ |
|
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); |
|
+ } |
|
+ } |
|
+ else |
|
+ { |
|
+ if(using_utf_locale==false && found_bom==true) |
|
+ { |
|
+ /*First file contained BOM header - locale was switched to UTF |
|
+ *all subsequent files should contain BOM. */ |
|
+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); |
|
+ } |
|
+ } |
|
+ } |
|
continue; |
|
} |
|
else |
|
diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh |
|
index 031be7a..1621c84 100755 |
|
--- a/tests/expand/mb.sh |
|
+++ b/tests/expand/mb.sh |
|
@@ -109,4 +109,75 @@ env printf '12345678 |
|
expand < in > out || fail=1 |
|
compare exp out > /dev/null 2>&1 || fail=1 |
|
|
|
+ |
|
+ |
|
+#BOM header test 1 |
|
+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+EOF |
|
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ |
|
+ |
|
+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+ äöü . öüä. ä xx |
|
+EOF |
|
+ |
|
+ |
|
+expand < in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LANG=C expand < in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LC_ALL=C expand < in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+ |
|
+printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_ |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+EOF |
|
+env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_ |
|
+ |
|
+ |
|
+printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_ |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+ äöü . öüä. ä xx |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+ äöü . öüä. ä xx |
|
+EOF |
|
+ |
|
+expand in1 in1 > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LANG=C expand in1 in1 > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LC_ALL=C expand in1 in1 > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
exit $fail |
|
diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh |
|
index 8d75652..9d4ee3e 100755 |
|
--- a/tests/unexpand/mb.sh |
|
+++ b/tests/unexpand/mb.sh |
|
@@ -111,3 +111,62 @@ env printf '12345678 |
|
|
|
unexpand -a < in > out || fail=1 |
|
compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+#BOM header test 1 |
|
+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+ äöü . öüä. ä xx |
|
+EOF |
|
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ |
|
+ |
|
+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+ äöü . öüä. ä xx |
|
+EOF |
|
+ |
|
+unexpand < in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LANG=C unexpand < in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LC_ALL=C unexpand < in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+ |
|
+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+ äöü . öüä. ä xx |
|
+1234567812345678123456781 |
|
+. . . . |
|
+a b c d |
|
+. . . . |
|
+ä ö ü ß |
|
+. . . . |
|
+ äöü . öüä. ä xx |
|
+EOF |
|
+ |
|
+ |
|
+unexpand in in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LANG=C unexpand in in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
+ |
|
+LC_ALL=C unexpand in in > out || fail=1 |
|
+compare exp out > /dev/null 2>&1 || fail=1 |
|
-- |
|
2.9.3 |
|
|
|
|