123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610 |
- /* cut - remove parts of lines of files
- Copyright (C) 1997-2018 Free Software Foundation, Inc.
- Copyright (C) 1984 David M. Ihnat
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>. */
- /* Written by David Ihnat. */
- /* POSIX changes, bug fixes, long-named options, and cleanup
- by David MacKenzie <djm@gnu.ai.mit.edu>.
- Rewrite cut_fields and cut_bytes -- Jim Meyering. */
- #include <config.h>
- #include <stdio.h>
- #include <assert.h>
- #include <getopt.h>
- #include <sys/types.h>
- #include "system.h"
- #include "error.h"
- #include "fadvise.h"
- #include "getndelim2.h"
- #include "hash.h"
- #include "xstrndup.h"
- #include "set-fields.h"
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "cut"

/* Author list handed to case_GETOPT_VERSION_CHAR in main.  */
#define AUTHORS \
  proper_name ("David M. Ihnat"), \
  proper_name ("David MacKenzie"), \
  proper_name ("Jim Meyering")

/* Report MESSAGE through error () and then call usage (EXIT_FAILURE),
   which does not return.

   MESSAGE is supplied as the argument of a "%s" format instead of
   being used as the format itself: every caller passes a translated
   string, and a '%' inside it must be printed literally rather than
   being parsed as a conversion specification.  */
#define FATAL_ERROR(Message)						\
  do									\
    {									\
      error (0, 0, "%s", (Message));					\
      usage (EXIT_FAILURE);						\
    }									\
  while (0)
/* Cursor into the array of range pairs that begins at FRP.  A byte or
   field index is tested against CURRENT_RP->lo and CURRENT_RP->hi;
   as soon as the index grows past CURRENT_RP->hi the cursor is moved
   on to the following pair (see next_item).  */
static struct field_range_pair *current_rp;

/* Holding area for the first field of a line.  It is needed to
   implement -s (and its absence): when the selection state of field 1
   does not already decide what happens to a line without delimiters,
   the whole first field has to be read before any of it is written,
   so that we can see whether a field delimiter or the end of the line
   follows it.  In every other case cut_fields streams its output
   without touching this buffer.  */
static char *field_1_buffer;

/* Allocated size, in bytes, of FIELD_1_BUFFER.  */
static size_t field_1_bufsize;
/* The kind of item that the LIST given on the command line selects.  */
enum operating_mode
  {
    /* No list option has been seen yet.  */
    undefined_mode,

    /* Select by byte position; main uses this for both -b and -c.  */
    byte_mode,

    /* Select delimiter-separated fields (-f).  */
    field_mode
  };

/* The mode chosen on the command line.  */
static enum operating_mode operating_mode;
/* Set by -s: lines that contain no field delimiter are dropped instead
   of being copied through.  Only meaningful in field mode.  */
static bool suppress_non_delimited;

/* Set by --complement: output every byte, character, or field
   _except_ the ones named in the list.  */
static bool complement;

/* Input field delimiter used in field mode.  */
static unsigned char delim;

/* Byte that terminates each input line/record.  */
static unsigned char line_delim = '\n';

/* Whether --output-delimiter=STRING appeared on the command line.  */
static bool output_delimiter_specified;

/* Number of bytes of OUTPUT_DELIMITER_STRING that are written.  */
static size_t output_delimiter_length;

/* String written between selected items.  When the option is absent
   it is a one-byte string holding the input delimiter.  */
static char *output_delimiter_string;

/* Whether standard input has been used as an input file.  */
static bool have_read_stdin;
/* Codes returned by getopt_long for long options that have no short
   spelling.  They start just above CHAR_MAX so that they can never be
   confused with a real option character.  */
enum
  {
    OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1,
    COMPLEMENT_OPTION
  };
- static struct option const longopts[] =
- {
- {"bytes", required_argument, NULL, 'b'},
- {"characters", required_argument, NULL, 'c'},
- {"fields", required_argument, NULL, 'f'},
- {"delimiter", required_argument, NULL, 'd'},
- {"only-delimited", no_argument, NULL, 's'},
- {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
- {"complement", no_argument, NULL, COMPLEMENT_OPTION},
- {"zero-terminated", no_argument, NULL, 'z'},
- {GETOPT_HELP_OPTION_DECL},
- {GETOPT_VERSION_OPTION_DECL},
- {NULL, 0, NULL, 0}
- };
- void
- usage (int status)
- {
- if (status != EXIT_SUCCESS)
- emit_try_help ();
- else
- {
- printf (_("\
- Usage: %s OPTION... [FILE]...\n\
- "),
- program_name);
- fputs (_("\
- Print selected parts of lines from each FILE to standard output.\n\
- "), stdout);
- emit_stdin_note ();
- emit_mandatory_arg_note ();
- fputs (_("\
- -b, --bytes=LIST select only these bytes\n\
- -c, --characters=LIST select only these characters\n\
- -d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
- "), stdout);
- fputs (_("\
- -f, --fields=LIST select only these fields; also print any line\n\
- that contains no delimiter character, unless\n\
- the -s option is specified\n\
- -n (ignored)\n\
- "), stdout);
- fputs (_("\
- --complement complement the set of selected bytes, characters\n\
- or fields\n\
- "), stdout);
- fputs (_("\
- -s, --only-delimited do not print lines not containing delimiters\n\
- --output-delimiter=STRING use STRING as the output delimiter\n\
- the default is to use the input delimiter\n\
- "), stdout);
- fputs (_("\
- -z, --zero-terminated line delimiter is NUL, not newline\n\
- "), stdout);
- fputs (HELP_OPTION_DESCRIPTION, stdout);
- fputs (VERSION_OPTION_DESCRIPTION, stdout);
- fputs (_("\
- \n\
- Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
- range, or many ranges separated by commas. Selected input is written\n\
- in the same order that it is read, and is written exactly once.\n\
- "), stdout);
- fputs (_("\
- Each range is one of:\n\
- \n\
- N N'th byte, character or field, counted from 1\n\
- N- from N'th byte, character or field, to end of line\n\
- N-M from N'th to M'th (included) byte, character or field\n\
- -M from first to M'th (included) byte, character or field\n\
- "), stdout);
- emit_ancillary_info (PROGRAM_NAME);
- }
- exit (status);
- }
- /* Increment *ITEM_IDX (i.e., a field or byte index),
- and if required CURRENT_RP. */
- static inline void
- next_item (uintmax_t *item_idx)
- {
- (*item_idx)++;
- if ((*item_idx) > current_rp->hi)
- current_rp++;
- }
- /* Return nonzero if the K'th field or byte is printable. */
- static inline bool
- print_kth (uintmax_t k)
- {
- return current_rp->lo <= k;
- }
- /* Return nonzero if K'th byte is the beginning of a range. */
- static inline bool
- is_range_start_index (uintmax_t k)
- {
- return k == current_rp->lo;
- }
- /* Read from stream STREAM, printing to standard output any selected bytes. */
- static void
- cut_bytes (FILE *stream)
- {
- uintmax_t byte_idx; /* Number of bytes in the line so far. */
- /* Whether to begin printing delimiters between ranges for the current line.
- Set after we've begun printing data corresponding to the first range. */
- bool print_delimiter;
- byte_idx = 0;
- print_delimiter = false;
- current_rp = frp;
- while (true)
- {
- int c; /* Each character from the file. */
- c = getc (stream);
- if (c == line_delim)
- {
- putchar (c);
- byte_idx = 0;
- print_delimiter = false;
- current_rp = frp;
- }
- else if (c == EOF)
- {
- if (byte_idx > 0)
- putchar (line_delim);
- break;
- }
- else
- {
- next_item (&byte_idx);
- if (print_kth (byte_idx))
- {
- if (output_delimiter_specified)
- {
- if (print_delimiter && is_range_start_index (byte_idx))
- {
- fwrite (output_delimiter_string, sizeof (char),
- output_delimiter_length, stdout);
- }
- print_delimiter = true;
- }
- putchar (c);
- }
- }
- }
- }
- /* Read from stream STREAM, printing to standard output any selected fields. */
- static void
- cut_fields (FILE *stream)
- {
- int c;
- uintmax_t field_idx = 1;
- bool found_any_selected_field = false;
- bool buffer_first_field;
- current_rp = frp;
- c = getc (stream);
- if (c == EOF)
- return;
- ungetc (c, stream);
- c = 0;
- /* To support the semantics of the -s flag, we may have to buffer
- all of the first field to determine whether it is 'delimited.'
- But that is unnecessary if all non-delimited lines must be printed
- and the first field has been selected, or if non-delimited lines
- must be suppressed and the first field has *not* been selected.
- That is because a non-delimited line has exactly one field. */
- buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
- while (1)
- {
- if (field_idx == 1 && buffer_first_field)
- {
- ssize_t len;
- size_t n_bytes;
- len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0,
- GETNLINE_NO_LIMIT, delim, line_delim, stream);
- if (len < 0)
- {
- free (field_1_buffer);
- field_1_buffer = NULL;
- if (ferror (stream) || feof (stream))
- break;
- xalloc_die ();
- }
- n_bytes = len;
- assert (n_bytes != 0);
- c = 0;
- /* If the first field extends to the end of line (it is not
- delimited) and we are printing all non-delimited lines,
- print this one. */
- if (to_uchar (field_1_buffer[n_bytes - 1]) != delim)
- {
- if (suppress_non_delimited)
- {
- /* Empty. */
- }
- else
- {
- fwrite (field_1_buffer, sizeof (char), n_bytes, stdout);
- /* Make sure the output line is newline terminated. */
- if (field_1_buffer[n_bytes - 1] != line_delim)
- putchar (line_delim);
- c = line_delim;
- }
- continue;
- }
- if (print_kth (1))
- {
- /* Print the field, but not the trailing delimiter. */
- fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout);
- /* With -d$'\n' don't treat the last '\n' as a delimiter. */
- if (delim == line_delim)
- {
- int last_c = getc (stream);
- if (last_c != EOF)
- {
- ungetc (last_c, stream);
- found_any_selected_field = true;
- }
- }
- else
- found_any_selected_field = true;
- }
- next_item (&field_idx);
- }
- int prev_c = c;
- if (print_kth (field_idx))
- {
- if (found_any_selected_field)
- {
- fwrite (output_delimiter_string, sizeof (char),
- output_delimiter_length, stdout);
- }
- found_any_selected_field = true;
- while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
- {
- putchar (c);
- prev_c = c;
- }
- }
- else
- {
- while ((c = getc (stream)) != delim && c != line_delim && c != EOF)
- {
- prev_c = c;
- }
- }
- /* With -d$'\n' don't treat the last '\n' as a delimiter. */
- if (delim == line_delim && c == delim)
- {
- int last_c = getc (stream);
- if (last_c != EOF)
- ungetc (last_c, stream);
- else
- c = last_c;
- }
- if (c == delim)
- next_item (&field_idx);
- else if (c == line_delim || c == EOF)
- {
- if (found_any_selected_field
- || !(suppress_non_delimited && field_idx == 1))
- {
- if (c == line_delim || prev_c != line_delim
- || delim == line_delim)
- putchar (line_delim);
- }
- if (c == EOF)
- break;
- field_idx = 1;
- current_rp = frp;
- found_any_selected_field = false;
- }
- }
- }
- static void
- cut_stream (FILE *stream)
- {
- if (operating_mode == byte_mode)
- cut_bytes (stream);
- else
- cut_fields (stream);
- }
- /* Process file FILE to standard output.
- Return true if successful. */
- static bool
- cut_file (char const *file)
- {
- FILE *stream;
- if (STREQ (file, "-"))
- {
- have_read_stdin = true;
- stream = stdin;
- }
- else
- {
- stream = fopen (file, "r");
- if (stream == NULL)
- {
- error (0, errno, "%s", quotef (file));
- return false;
- }
- }
- fadvise (stream, FADVISE_SEQUENTIAL);
- cut_stream (stream);
- if (ferror (stream))
- {
- error (0, errno, "%s", quotef (file));
- return false;
- }
- if (STREQ (file, "-"))
- clearerr (stream); /* Also clear EOF. */
- else if (fclose (stream) == EOF)
- {
- error (0, errno, "%s", quotef (file));
- return false;
- }
- return true;
- }
- int
- main (int argc, char **argv)
- {
- int optc;
- bool ok;
- bool delim_specified = false;
- char *spec_list_string IF_LINT ( = NULL);
- initialize_main (&argc, &argv);
- set_program_name (argv[0]);
- setlocale (LC_ALL, "");
- bindtextdomain (PACKAGE, LOCALEDIR);
- textdomain (PACKAGE);
- atexit (close_stdout);
- operating_mode = undefined_mode;
- /* By default, all non-delimited lines are printed. */
- suppress_non_delimited = false;
- delim = '\0';
- have_read_stdin = false;
- while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, NULL)) != -1)
- {
- switch (optc)
- {
- case 'b':
- case 'c':
- /* Build the byte list. */
- if (operating_mode != undefined_mode)
- FATAL_ERROR (_("only one type of list may be specified"));
- operating_mode = byte_mode;
- spec_list_string = optarg;
- break;
- case 'f':
- /* Build the field list. */
- if (operating_mode != undefined_mode)
- FATAL_ERROR (_("only one type of list may be specified"));
- operating_mode = field_mode;
- spec_list_string = optarg;
- break;
- case 'd':
- /* New delimiter. */
- /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */
- if (optarg[0] != '\0' && optarg[1] != '\0')
- FATAL_ERROR (_("the delimiter must be a single character"));
- delim = optarg[0];
- delim_specified = true;
- break;
- case OUTPUT_DELIMITER_OPTION:
- output_delimiter_specified = true;
- /* Interpret --output-delimiter='' to mean
- 'use the NUL byte as the delimiter.' */
- output_delimiter_length = (optarg[0] == '\0'
- ? 1 : strlen (optarg));
- output_delimiter_string = xstrdup (optarg);
- break;
- case 'n':
- break;
- case 's':
- suppress_non_delimited = true;
- break;
- case 'z':
- line_delim = '\0';
- break;
- case COMPLEMENT_OPTION:
- complement = true;
- break;
- case_GETOPT_HELP_CHAR;
- case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
- default:
- usage (EXIT_FAILURE);
- }
- }
- if (operating_mode == undefined_mode)
- FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
- if (delim_specified && operating_mode != field_mode)
- FATAL_ERROR (_("an input delimiter may be specified only\
- when operating on fields"));
- if (suppress_non_delimited && operating_mode != field_mode)
- FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
- \tonly when operating on fields"));
- set_fields (spec_list_string,
- ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS)
- | (complement ? SETFLD_COMPLEMENT : 0) );
- if (!delim_specified)
- delim = '\t';
- if (output_delimiter_string == NULL)
- {
- static char dummy[2];
- dummy[0] = delim;
- dummy[1] = '\0';
- output_delimiter_string = dummy;
- output_delimiter_length = 1;
- }
- if (optind == argc)
- ok = cut_file ("-");
- else
- for (ok = true; optind < argc; optind++)
- ok &= cut_file (argv[optind]);
- if (have_read_stdin && fclose (stdin) == EOF)
- {
- error (0, errno, "-");
- ok = false;
- }
- IF_LINT (reset_fields ());
- return ok ? EXIT_SUCCESS : EXIT_FAILURE;
- }
|