You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
565 lines
19 KiB
565 lines
19 KiB
diff --git a/src/cut.c b/src/cut.c |
|
index 7ab6be4..022d0ad 100644 |
|
--- a/src/cut.c |
|
+++ b/src/cut.c |
|
@@ -28,6 +28,11 @@ |
|
#include <assert.h> |
|
#include <getopt.h> |
|
#include <sys/types.h> |
|
+ |
|
+/* Get mbstate_t, mbrtowc(). */ |
|
+#if HAVE_WCHAR_H |
|
+# include <wchar.h> |
|
+#endif |
|
#include "system.h" |
|
|
|
#include "error.h" |
|
@@ -38,6 +43,18 @@ |
|
|
|
#include "set-fields.h" |
|
|
|
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC |
|
+ installation; work around this configuration error. */ |
|
+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 |
|
+# undef MB_LEN_MAX |
|
+# define MB_LEN_MAX 16 |
|
+#endif |
|
+ |
|
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ |
|
+#if HAVE_MBRTOWC && defined mbstate_t |
|
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) |
|
+#endif |
|
+ |
|
/* The official name of this program (e.g., no 'g' prefix). */ |
|
#define PROGRAM_NAME "cut" |
|
|
|
@@ -54,6 +71,52 @@ |
|
} \ |
|
while (0) |
|
|
|
+/* Top up BUF from STREAM when fewer than MB_LEN_MAX unread bytes remain and STREAM is neither at EOF nor in error: move the BUFLEN unread bytes at BUFPOS to the start of BUF, append up to BUFSIZ more bytes, and reset BUFPOS to BUF. Callers in this file size BUF as MB_LEN_MAX + BUFSIZ so the read cannot overflow it. */ |
|
+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \ |
|
+ do \ |
|
+ { \ |
|
+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \ |
|
+ { \ |
|
+ memmove (BUF, BUFPOS, BUFLEN); \ |
|
+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \ |
|
+ BUFPOS = BUF; \ |
|
+ } \ |
|
+ } \ |
|
+ while (0) |
|
+ |
|
+/* Decode the multibyte character at BUFPOS into WC and set MBLENGTH to its byte length. BUFPOS and BUFLEN are not advanced here; the caller does that. If BUFLEN is 0, WC is set to WEOF and nothing else is touched. |
|
+ If the byte sequence is invalid or incomplete, CONVFAIL is true, MBLENGTH is 1, STATE is restored, and WC is not assigned by this macro (presumably mbrtowc stores nothing on failure -- confirm), so callers must test CONVFAIL before trusting WC. Otherwise CONVFAIL is false. */ |
|
+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \ |
|
+ do \ |
|
+ { \ |
|
+ mbstate_t state_bak; \ |
|
+ \ |
|
+ if (BUFLEN < 1) \ |
|
+ { \ |
|
+ WC = WEOF; \ |
|
+ break; \ |
|
+ } \ |
|
+ \ |
|
+ /* Get a wide character, saving STATE so a failed conversion can be undone. */ \ |
|
+ CONVFAIL = false; \ |
|
+ state_bak = STATE; \ |
|
+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \ |
|
+ \ |
|
+ switch (MBLENGTH) \ |
|
+ { \ |
|
+ case (size_t)-1: \ |
|
+ case (size_t)-2: \ |
|
+ CONVFAIL = true; \ |
|
+ STATE = state_bak; \ |
|
+ /* Fall through: a failed conversion, like a zero return, consumes one byte. */ \ |
|
+ \ |
|
+ case 0: \ |
|
+ MBLENGTH = 1; \ |
|
+ break; \ |
|
+ } \ |
|
+ } \ |
|
+ while (0) |
|
+ |
|
|
|
/* Pointer inside RP. When checking if a byte or field is selected |
|
by a finite range, we check if it is between CURRENT_RP.LO |
|
@@ -61,6 +124,9 @@ |
|
CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */ |
|
static struct field_range_pair *current_rp; |
|
|
|
+/* Length in bytes of the delimiter given as argument to -d (1 for the default TAB). File-local, like the other option state in this file. */ |
|
+static size_t delimlen; |
|
+ |
|
/* This buffer is used to support the semantics of the -s option |
|
(or lack of same) when the specified field list includes (does |
|
not include) the first field. In both of those cases, the entire |
|
@@ -77,15 +143,25 @@ enum operating_mode |
|
{ |
|
undefined_mode, |
|
|
|
- /* Output characters that are in the given bytes. */ |
|
+ /* Output bytes that are at the given positions. */ |
|
byte_mode, |
|
|
|
+ /* Output characters that are at the given positions. */ |
|
+ character_mode, |
|
+ |
|
/* Output the given delimiter-separated fields. */ |
|
field_mode |
|
}; |
|
|
|
static enum operating_mode operating_mode; |
|
|
|
+/* If nonzero, when in byte mode, don't split multibyte characters. */ |
|
+static int byte_mode_character_aware; |
|
+ |
|
+/* If nonzero, use the single-byte code paths even when |
|
+ this program runs in a multibyte locale. */ |
|
+static int force_singlebyte_mode; |
|
+ |
|
/* If true do not output lines containing no delimiter characters. |
|
Otherwise, all such lines are printed. This option is valid only |
|
with field mode. */ |
|
@@ -97,6 +173,9 @@ static bool complement; |
|
|
|
/* The delimiter character for field mode. */ |
|
static unsigned char delim; |
|
+#if HAVE_WCHAR_H |
|
+static wchar_t wcdelim; |
|
+#endif |
|
|
|
/* The delimiter for each line/record. */ |
|
static unsigned char line_delim = '\n'; |
|
@@ -164,7 +243,7 @@ Print selected parts of lines from each FILE to standard output.\n\ |
|
-f, --fields=LIST select only these fields; also print any line\n\ |
|
that contains no delimiter character, unless\n\ |
|
the -s option is specified\n\ |
|
- -n (ignored)\n\ |
|
+ -n with -b: don't split multibyte characters\n\ |
|
"), stdout); |
|
fputs (_("\ |
|
--complement complement the set of selected bytes, characters\n\ |
|
@@ -280,6 +359,82 @@ cut_bytes (FILE *stream) |
|
} |
|
} |
|
|
|
+#if HAVE_MBRTOWC |
|
+/* This function is used in the following two cases. |
|
+ |
|
+ 1. Read from the stream STREAM, printing to standard output any selected |
|
+ characters. |
|
+ |
|
+ 2. Read from stream STREAM, printing to standard output any selected bytes, |
|
+ without splitting multibyte characters. */ |
|
+ |
|
+static void |
|
+cut_characters_or_cut_bytes_no_split (FILE *stream) |
|
+{ |
|
+ uintmax_t idx; /* Items (decoded characters, or single bytes on conversion failure) seen in the line so far. */ |
|
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ |
|
+ char *bufpos; /* Next read position of BUF. */ |
|
+ size_t buflen; /* The length of the byte sequence in buf. */ |
|
+ wint_t wc = 0; /* The decoded wide character; initialized because a failed conversion does not assign it. */ |
|
+ size_t mblength; /* Byte length of the multibyte character that |
|
+ was decoded into WC (1 on conversion failure). */ |
|
+ mbstate_t state; /* State of the stream. */ |
|
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */ |
|
+ /* Whether to begin printing delimiters between ranges for the current line. |
|
+ Set after we've begun printing data corresponding to the first range. */ |
|
+ bool print_delimiter = false; |
|
+ |
|
+ idx = 0; |
|
+ buflen = 0; |
|
+ bufpos = buf; |
|
+ memset (&state, '\0', sizeof(mbstate_t)); |
|
+ |
|
+ current_rp = frp; |
|
+ |
|
+ while (1) |
|
+ { |
|
+ REFILL_BUFFER (buf, bufpos, buflen, stream); |
|
+ |
|
+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail); |
|
+ /* When CONVFAIL is set WC is stale, so it must not be compared with LINE_DELIM below. */ |
|
+ |
|
+ if (wc == WEOF) |
|
+ { |
|
+ if (idx > 0) |
|
+ putchar (line_delim); |
|
+ break; |
|
+ } |
|
+ else if (!convfail && wc == line_delim) |
|
+ { |
|
+ putchar (line_delim); |
|
+ idx = 0; |
|
+ print_delimiter = false; |
|
+ current_rp = frp; |
|
+ } |
|
+ else |
|
+ { |
|
+ next_item (&idx); |
|
+ if (print_kth (idx)) |
|
+ { |
|
+ if (output_delimiter_specified) |
|
+ { |
|
+ if (print_delimiter && is_range_start_index (idx)) |
|
+ { |
|
+ fwrite (output_delimiter_string, sizeof (char), |
|
+ output_delimiter_length, stdout); |
|
+ } |
|
+ print_delimiter = true; |
|
+ } |
|
+ fwrite (bufpos, sizeof (char), mblength, stdout); |
|
+ } |
|
+ } |
|
+ |
|
+ buflen -= mblength; |
|
+ bufpos += mblength; |
|
+ } |
|
+} |
|
+#endif |
|
+ |
|
/* Read from stream STREAM, printing to standard output any selected fields. */ |
|
|
|
static void |
|
@@ -425,13 +580,211 @@ cut_fields (FILE *stream) |
|
} |
|
} |
|
|
|
+#if HAVE_MBRTOWC |
|
+static void |
|
+cut_fields_mb (FILE *stream) |
|
+{ |
|
+ int c; /* First byte of STREAM, read and pushed back to detect empty input. */ |
|
+ uintmax_t field_idx; /* Index of the field being read; starts at 1 on every line. */ |
|
+ int found_any_selected_field; /* Nonzero once a selected field was output on the current line. */ |
|
+ int buffer_first_field; /* Nonzero if field 1 must be buffered before deciding what to print. */ |
|
+ int empty_input; /* Nonzero if STREAM yielded no bytes at all. */ |
|
+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ |
|
+ char *bufpos; /* Next read position of BUF. */ |
|
+ size_t buflen; /* The length of the byte sequence in buf. */ |
|
+ wint_t wc = 0; /* The decoded wide character. */ |
|
+ size_t mblength; /* Byte length of the multibyte character that |
|
+ was decoded into WC (1 on conversion failure). */ |
|
+ mbstate_t state; /* State of the stream. */ |
|
+ bool convfail = false; /* true, when conversion failed. Otherwise false. */ |
|
+ |
|
+ current_rp = frp; |
|
+ |
|
+ found_any_selected_field = 0; |
|
+ field_idx = 1; |
|
+ bufpos = buf; |
|
+ buflen = 0; |
|
+ memset (&state, '\0', sizeof(mbstate_t)); |
|
+ |
|
+ c = getc (stream); |
|
+ empty_input = (c == EOF); |
|
+ if (c != EOF) |
|
+ { |
|
+ ungetc (c, stream); |
|
+ wc = 0; |
|
+ } |
|
+ else |
|
+ wc = WEOF; |
|
+ |
|
+ /* To support the semantics of the -s flag, we may have to buffer |
|
+ all of the first field to determine whether it is `delimited.' |
|
+ But that is unnecessary if all non-delimited lines must be printed |
|
+ and the first field has been selected, or if non-delimited lines |
|
+ must be suppressed and the first field has *not* been selected. |
|
+ That is because a non-delimited line has exactly one field. */ |
|
+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1)); |
|
+ |
|
+ while (1) |
|
+ { |
|
+ if (field_idx == 1 && buffer_first_field) |
|
+ { |
|
+ size_t len = 0; |
|
+ |
|
+ while (1) |
|
+ { |
|
+ REFILL_BUFFER (buf, bufpos, buflen, stream); |
|
+ |
|
+ GET_NEXT_WC_FROM_BUFFER |
|
+ (wc, bufpos, buflen, mblength, state, convfail); |
|
+ |
|
+ if (wc == WEOF) |
|
+ break; |
|
+ |
|
+ field_1_buffer = xrealloc (field_1_buffer, len + mblength); |
|
+ memcpy (field_1_buffer + len, bufpos, mblength); |
|
+ len += mblength; |
|
+ buflen -= mblength; |
|
+ bufpos += mblength; |
|
+ |
|
+ if (!convfail && (wc == line_delim || wc == wcdelim)) |
|
+ break; |
|
+ } |
|
+ |
|
+ if (len == 0 && wc == WEOF) |
|
+ break; |
|
+ |
|
+ /* If the first field extends to the end of line (it is not |
|
+ delimited) and we are printing all non-delimited lines, |
|
+ print this one. */ |
|
+ if (convfail || wc != wcdelim) |
|
+ { |
|
+ if (suppress_non_delimited) |
|
+ { |
|
+ /* Empty. */ |
|
+ } |
|
+ else |
|
+ { |
|
+ fwrite (field_1_buffer, sizeof (char), len, stdout); |
|
+ /* Make sure the output line is newline terminated. */ |
|
+ if (convfail || wc != line_delim) |
|
+ putchar (line_delim); |
|
+ } |
|
+ continue; |
|
+ } |
|
+ |
|
+ if (print_kth (1)) |
|
+ { |
|
+ /* Print the field, but not the trailing delimiter, which is MBLENGTH bytes long here (possibly more than one). */ |
|
+ fwrite (field_1_buffer, sizeof (char), len - mblength, stdout); |
|
+ found_any_selected_field = 1; |
|
+ } |
|
+ next_item (&field_idx); |
|
+ } |
|
+ |
|
+ if (wc != WEOF) |
|
+ { |
|
+ if (print_kth (field_idx)) |
|
+ { |
|
+ if (found_any_selected_field) |
|
+ { |
|
+ fwrite (output_delimiter_string, sizeof (char), |
|
+ output_delimiter_length, stdout); |
|
+ } |
|
+ found_any_selected_field = 1; |
|
+ } |
|
+ |
|
+ while (1) |
|
+ { |
|
+ REFILL_BUFFER (buf, bufpos, buflen, stream); |
|
+ |
|
+ GET_NEXT_WC_FROM_BUFFER |
|
+ (wc, bufpos, buflen, mblength, state, convfail); |
|
+ |
|
+ if (wc == WEOF) |
|
+ break; |
|
+ else if (!convfail && (wc == wcdelim || wc == line_delim)) |
|
+ { |
|
+ buflen -= mblength; |
|
+ bufpos += mblength; |
|
+ break; |
|
+ } |
|
+ |
|
+ if (print_kth (field_idx)) |
|
+ fwrite (bufpos, sizeof (char), mblength, stdout); |
|
+ |
|
+ buflen -= mblength; |
|
+ bufpos += mblength; |
|
+ } |
|
+ } |
|
+ |
|
+ if ((!convfail || wc == line_delim) && buflen < 1) |
|
+ wc = WEOF; |
|
+ |
|
+ if (!convfail && wc == wcdelim) |
|
+ next_item (&field_idx); |
|
+ else if (wc == WEOF || (!convfail && wc == line_delim)) |
|
+ { |
|
+ if (found_any_selected_field |
|
+ || (!empty_input && !(suppress_non_delimited && field_idx == 1))) |
|
+ putchar (line_delim); |
|
+ if (wc == WEOF) |
|
+ break; |
|
+ field_idx = 1; |
|
+ current_rp = frp; |
|
+ found_any_selected_field = 0; |
|
+ } |
|
+ } |
|
+} |
|
+#endif |
|
+ |
|
static void |
|
cut_stream (FILE *stream) |
|
{ |
|
- if (operating_mode == byte_mode) |
|
- cut_bytes (stream); |
|
+#if HAVE_MBRTOWC |
|
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) /* Multibyte locale: dispatch STREAM to the multibyte-aware routine for the operating mode. */ |
|
+ { |
|
+ switch (operating_mode) |
|
+ { |
|
+ case byte_mode: |
|
+ if (byte_mode_character_aware) /* Set by -n: do not split multibyte characters. */ |
|
+ cut_characters_or_cut_bytes_no_split (stream); |
|
+ else |
|
+ cut_bytes (stream); |
|
+ break; |
|
+ |
|
+ case character_mode: |
|
+ cut_characters_or_cut_bytes_no_split (stream); |
|
+ break; |
|
+ |
|
+ case field_mode: |
|
+ if (delimlen == 1) |
|
+ { |
|
+ /* With a single-byte delimiter in a UTF-8 locale, the byte-oriented |
|
+ cut_fields can be used as an optimization, because of the uniqueness of |
|
+ characters in UTF-8; that does not hold for encodings such as SJIS. */ |
|
+ char * loc = setlocale(LC_CTYPE, NULL); |
|
+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") || |
|
+ strstr (loc, "UTF8") || strstr (loc, "utf8"))) |
|
+ { |
|
+ cut_fields (stream); |
|
+ break; |
|
+ } |
|
+ } |
|
+ cut_fields_mb (stream); |
|
+ break; |
|
+ |
|
+ default: |
|
+ abort (); |
|
+ } |
|
+ } |
|
else |
|
- cut_fields (stream); |
|
+#endif |
|
+ { /* Single-byte locale, or single-byte mode forced: use the byte-oriented routines. */ |
|
+ if (operating_mode == field_mode) |
|
+ cut_fields (stream); |
|
+ else |
|
+ cut_bytes (stream); |
|
+ } |
|
} |
|
|
|
/* Process file FILE to standard output. |
|
@@ -483,6 +836,7 @@ main (int argc, char **argv) |
|
bool ok; |
|
bool delim_specified = false; |
|
char *spec_list_string IF_LINT ( = NULL); |
|
+ char mbdelim[MB_LEN_MAX + 1]; |
|
|
|
initialize_main (&argc, &argv); |
|
set_program_name (argv[0]); |
|
@@ -505,7 +859,6 @@ main (int argc, char **argv) |
|
switch (optc) |
|
{ |
|
case 'b': |
|
- case 'c': |
|
/* Build the byte list. */ |
|
if (operating_mode != undefined_mode) |
|
FATAL_ERROR (_("only one type of list may be specified")); |
|
@@ -513,6 +866,14 @@ main (int argc, char **argv) |
|
spec_list_string = optarg; |
|
break; |
|
|
|
+ case 'c': |
|
+ /* Build the character list. */ |
|
+ if (operating_mode != undefined_mode) |
|
+ FATAL_ERROR (_("only one type of list may be specified")); |
|
+ operating_mode = character_mode; |
|
+ spec_list_string = optarg; |
|
+ break; |
|
+ |
|
case 'f': |
|
/* Build the field list. */ |
|
if (operating_mode != undefined_mode) |
|
@@ -524,10 +885,38 @@ main (int argc, char **argv) |
|
case 'd': |
|
/* New delimiter. */ |
|
/* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */ |
|
- if (optarg[0] != '\0' && optarg[1] != '\0') |
|
- FATAL_ERROR (_("the delimiter must be a single character")); |
|
- delim = optarg[0]; |
|
- delim_specified = true; |
|
+ { |
|
+#if HAVE_MBRTOWC |
|
+ if(MB_CUR_MAX > 1) |
|
+ { |
|
+ mbstate_t state; |
|
+ |
|
+ memset (&state, '\0', sizeof(mbstate_t)); |
|
+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state); |
|
+ |
|
+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2) |
|
+ ++force_singlebyte_mode; |
|
+ else |
|
+ { |
|
+ delimlen = (delimlen < 1) ? 1 : delimlen; |
|
+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0') |
|
+ FATAL_ERROR (_("the delimiter must be a single character")); |
|
+ memcpy (mbdelim, optarg, delimlen); |
|
+ mbdelim[delimlen] = '\0'; |
|
+ if (delimlen == 1) |
|
+ delim = *optarg; |
|
+ } |
|
+ } |
|
+ |
|
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) |
|
+#endif |
|
+ { |
|
+ if (optarg[0] != '\0' && optarg[1] != '\0') |
|
+ FATAL_ERROR (_("the delimiter must be a single character")); |
|
+ delim = (unsigned char) optarg[0]; |
|
+ } |
|
+ delim_specified = true; |
|
+ } |
|
break; |
|
|
|
case OUTPUT_DELIMITER_OPTION: |
|
@@ -540,6 +929,7 @@ main (int argc, char **argv) |
|
break; |
|
|
|
case 'n': |
|
+ byte_mode_character_aware = 1; |
|
break; |
|
|
|
case 's': |
|
@@ -579,15 +969,34 @@ main (int argc, char **argv) |
|
| (complement ? SETFLD_COMPLEMENT : 0) ); |
|
|
|
if (!delim_specified) |
|
- delim = '\t'; |
|
+ { |
|
+ delim = '\t'; |
|
+#ifdef HAVE_MBRTOWC |
|
+ wcdelim = L'\t'; |
|
+ mbdelim[0] = '\t'; |
|
+ mbdelim[1] = '\0'; |
|
+ delimlen = 1; |
|
+#endif |
|
+ } |
|
|
|
if (output_delimiter_string == NULL) |
|
{ |
|
- static char dummy[2]; |
|
- dummy[0] = delim; |
|
- dummy[1] = '\0'; |
|
- output_delimiter_string = dummy; |
|
- output_delimiter_length = 1; |
|
+#ifdef HAVE_MBRTOWC |
|
+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) |
|
+ { |
|
+ output_delimiter_string = xstrdup(mbdelim); |
|
+ output_delimiter_length = delimlen; |
|
+ } |
|
+ |
|
+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) |
|
+#endif |
|
+ { |
|
+ static char dummy[2]; |
|
+ dummy[0] = delim; |
|
+ dummy[1] = '\0'; |
|
+ output_delimiter_string = dummy; |
|
+ output_delimiter_length = 1; |
|
+ } |
|
} |
|
|
|
if (optind == argc)
|
|
|