summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAssaf Gordon <assafgordon@gmail.com>2017-06-08 01:31:17 (GMT)
committerAssaf Gordon <assafgordon@gmail.com>2017-06-08 01:31:17 (GMT)
commitdff364989d2cfd2f8dd0e132f244a59998419bec (patch)
treedc45ab83968b5d0360178931976a7e7fea186283
parentb7f91b4c34cfa46753cdb7fe8eda57e8162e990c (diff)
downloaddatamash-dff364989d2cfd2f8dd0e132f244a59998419bec.zip
datamash-dff364989d2cfd2f8dd0e132f244a59998419bec.tar.gz
datamash-dff364989d2cfd2f8dd0e132f244a59998419bec.tar.bz2
datamash: add lines/fields option to 'check' operationdev/temp-master
Datamash will fail with non-zero exit code if the input does not have the expected number of lines/fields. Typical usage: $ seq 10 | paste - - | datamash check 2 fields && echo ok 5 lines, 2 fields ok $ seq 10 | paste - - | datamash check 6 fields && echo ok line 1 (2 fields): 1 2 datamash: check failed: line 1 has 2 fields (expecting 6) $ seq 10 | datamash check 11 lines datamash: check failed: input had 10 lines (expecting 11) * NEWS: Mention new options. * src/datamash.c (tabular_check_file): Implement additional checks. * src/op-parser.h (struct mode_check_params_t): New struct to hold options. (struct datamash_ops): Add new struct. * src/op-parser.c (parse_check_line_or_field, parse_mode_check): New functions to parse 'check' options. (parse_mode): Parse option in 'check' mode. * tests/datamash-parser.pl: Add parsing tests. * tests/datamash-check.pl: Add 'check' tests. * Makefile.am: Add 'datamash-check.pl' test script. * man/datamash.x: Add 'check' examples. * doc/datamash.texi: Expand 'check' section.
-rw-r--r--Makefile.am1
-rw-r--r--NEWS3
-rw-r--r--doc/datamash.texi56
-rw-r--r--man/datamash.x23
-rw-r--r--src/datamash.c32
-rw-r--r--src/op-parser.c73
-rw-r--r--src/op-parser.h14
-rwxr-xr-xtests/datamash-check.pl150
-rwxr-xr-xtests/datamash-parser.pl11
9 files changed, 357 insertions, 6 deletions
diff --git a/Makefile.am b/Makefile.am
index 4869aba..542bcd8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -123,6 +123,7 @@ TESTS = \
tests/datamash-stats.pl \
tests/datamash-transpose.pl \
tests/datamash-crosstab.pl \
+ tests/datamash-check.pl \
tests/datamash-pair-tests.pl \
tests/datamash-check-tabular.pl \
tests/datamash-sort-header.sh \
diff --git a/NEWS b/NEWS
index 452d45d..37f0227 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,9 @@
perc (percentile),
range (max-min of values in group/column)
+ Improved 'check' operation:
+ Expected number of lines/fields can be specified as parameter.
+
* Noteworthy changes in release 1.1.1 (2017-01-19) [stable]
diff --git a/doc/datamash.texi b/doc/datamash.texi
index 6be9454..b9d6dd4 100644
--- a/doc/datamash.texi
+++ b/doc/datamash.texi
@@ -1086,6 +1086,62 @@ datamash: check failed: line 3 has 2 fields (previous line had 3)
fail
@end example
+@subsection Expected number of lines/fields
+
+@option{check} accepts optional @var{lines} and @var{fields} and will
+return failure if the input does not have the requested number of lines/fields.
+
+@exdent The syntax is:
+
+@example
+datamash check [@var{N} lines] [@var{N} fields]
+@end example
+
+@exdent Usage examples:
+
+@example
+$ cat file.txt
+A 1 ww
+B 2 xx
+C 3 yy
+D 4 zz
+
+$ datamash check 4 lines < file.txt && echo ok
+4 lines, 3 fields
+ok
+
+$ datamash check 3 fields < file.txt && echo ok
+4 lines, 3 fields
+ok
+
+$ datamash check 4 lines 3 fields < file.txt && echo ok
+4 lines, 3 fields
+ok
+
+$ datamash check 7 fields < file.txt && echo ok
+line 1 (3 fields):
+ A 1 ww
+datamash: check failed: line 1 has 3 fields (expecting 7)
+
+$ datamash check 10 lines < file.txt && echo ok
+datamash: check failed: input had 4 lines (expecting 10)
+@end example
+
+For convenience, @var{line},@var{row},@var{rows}
+can be used instead of @var{lines};
+@var{field},@var{columns},@var{column},@var{col} can be used
+instead of @var{fields}.
+The following are all equivalent:
+
+@example
+datamash check 4 lines 10 fields < file.txt
+datamash check 4 rows 10 columns < file.txt
+datamash check 10 col 4 row < file.txt
+@end example
+
+
+@subsection checks in automation scripts
+
@cindex fail fast
@cindex shell scripts, check
@cindex check, in automation and shell scripts
diff --git a/man/datamash.x b/man/datamash.x
index ef18587..3357a72 100644
--- a/man/datamash.x
+++ b/man/datamash.x
@@ -37,10 +37,12 @@ transpose rows, columns of the input file
reverse field order in each line
.TP
-.B check
-verify the input file has same number of fields in all lines.
+.B check [N lines] [N fields]
+verify the input file has same number of fields in all lines,
+or the expected number of lines/fields.
number of lines and fields are printed to STDOUT. Exits with non-zero code
-and prints the offending line if there's a mismatch in the number of fields.
+and prints the offending line if there's a mismatch in the number of lines/
+fields.
.PP
@@ -453,8 +455,8 @@ $ sha1sum *.txt | datamash -Wf sha1 2
.SS "Check file structure"
-Check the structure of the input file (ensure all lines
-have the same number of fields):
+Check the structure of the input file: ensure all lines
+have the same number of fields, or expected number of lines/fields:
.PP
.nf
.RS
@@ -469,6 +471,17 @@ line 5 (2 fields):
13
datamash: check failed: line 5 has 2 fields (previous line had 3)
fail
+
+$ seq 10 | paste \- \- | datamash check 2 fields 5 lines
+5 lines, 2 fields
+
+$ seq 10 | paste \- \- | datamash check 4 fields
+line 1 (2 fields):
+ 1 2
+datamash: check failed: line 1 has 2 fields (expecting 4)
+
+$ seq 10 | paste \- \- | datamash check 7 lines
+datamash: check failed: input had 5 lines (expecting 7)
.RE
.fi
.PP
diff --git a/src/datamash.c b/src/datamash.c
index c0d58bf..c37dc7d 100644
--- a/src/datamash.c
+++ b/src/datamash.c
@@ -825,6 +825,9 @@ tabular_check_file ()
struct line_record_t lb1, lb2;
struct line_record_t *thisline, *prevline;
+ const uintmax_t n_lines = dm->mode_params.check_params.n_lines;
+ const uintmax_t n_fields = dm->mode_params.check_params.n_fields;
+
thisline = &lb1;
prevline = &lb2;
@@ -839,7 +842,26 @@ tabular_check_file ()
const size_t num_fields = line_record_num_fields (thisline);
- if (line_number>1 && num_fields != prev_num_fields)
+ /* Check if the number of fields is different than expected/requested
+ on the command line (e.g. with 'datamash check 6 fields') */
+ if (n_fields && n_fields != num_fields)
+ {
+ fprintf (stderr, _("line %"PRIuMAX" (%"PRIuMAX" fields):\n "),
+ (uintmax_t)(line_number), (uintmax_t)num_fields);
+ ignore_value (fwrite (line_record_buffer (thisline),
+ line_record_length (thisline), sizeof (char),
+ stderr));
+ fputc ('\n', stderr);
+ die (EXIT_FAILURE, 0, _("check failed: line " \
+ "%"PRIuMAX" has %"PRIuMAX" fields (expecting "\
+ "%"PRIuMAX")"),
+ (uintmax_t)line_number, (uintmax_t)num_fields,
+ (uintmax_t)n_fields);
+ }
+
+ /* Check if the number of fields changed from one line to the next
+ (only if no expected number of fields specified on the command line).*/
+ else if (line_number>1 && num_fields != prev_num_fields)
{
fprintf (stderr, _("line %"PRIuMAX" (%"PRIuMAX" fields):\n "),
(uintmax_t)(line_number-1), (uintmax_t)prev_num_fields);
@@ -864,6 +886,14 @@ tabular_check_file ()
SWAP_LINES (prevline, thisline);
}
+ /* Check if we read too many/few lines */
+ if (n_lines && n_lines != line_number)
+ {
+ die (EXIT_FAILURE, 0, _("check failed: input had %"PRIuMAX" lines " \
+ "(expecting %"PRIuMAX")"),
+ (uintmax_t)line_number, (uintmax_t)n_lines);
+ }
+
/* Print summary */
printf (ngettext ("%"PRIuMAX" line", "%"PRIuMAX" lines",
select_plural (line_number)), (uintmax_t)line_number);
diff --git a/src/op-parser.c b/src/op-parser.c
index 11035f1..db01209 100644
--- a/src/op-parser.c
+++ b/src/op-parser.c
@@ -25,6 +25,7 @@
#include "system.h"
#include "die.h"
+#include "ignore-value.h"
#include "op-scanner.h"
#include "op-defs.h"
#include "op-parser.h"
@@ -530,6 +531,75 @@ parse_mode_column_list (enum processing_mode pm)
quote (get_processing_mode_name (pm)));
}
+static bool
+parse_check_line_or_field (const char* s)
+{
+ if (STREQ (s,"lines") || STREQ (s,"line") \
+ || STREQ (s,"rows") || STREQ (s,"row"))
+ return true;
+ if (STREQ (s,"fields") || STREQ (s,"field") \
+ || STREQ (s,"columns") || STREQ (s,"column") || STREQ (s,"col"))
+ return false;
+
+ die (EXIT_FAILURE, 0, _("invalid option %s for operation check"), quote (s));
+}
+
+static void
+parse_mode_check ()
+{
+ bool set_lines = true; // false = set columns
+ uintmax_t value = 0;
+
+ uintmax_t n_lines = 0;
+ uintmax_t n_fields = 0;
+
+ enum TOKEN tok = scanner_peek_token ();
+ while (tok != TOK_END)
+ {
+ tok = scanner_get_token ();
+ if (tok == TOK_INTEGER)
+ {
+ value = scan_val_int;
+
+ ignore_value (scanner_get_token ());
+ set_lines = parse_check_line_or_field (scanner_identifier);
+ }
+ else
+ {
+ set_lines = parse_check_line_or_field (scanner_identifier);
+ tok = scanner_get_token ();
+ if (tok != TOK_INTEGER)
+ die (EXIT_FAILURE, 0, _("number expected after option in " \
+ "operation 'check'"));
+ value = scan_val_int;
+ }
+
+ if (value == 0)
+ die (EXIT_FAILURE, 0, _("invalid value zero for lines/fields in " \
+ "operation 'check'"));
+
+ if (set_lines)
+ {
+ if (n_lines>0)
+ die (EXIT_FAILURE, 0, _("number of lines/rows already set in " \
+ "operation 'check'"));
+ n_lines = value;
+ }
+ else
+ {
+ if (n_fields>0)
+ die (EXIT_FAILURE, 0, _("number of fields/columns already set in " \
+ "operation 'check'"));
+ n_fields = value;
+ }
+
+ tok = scanner_peek_token ();
+ }
+
+ dm->mode_params.check_params.n_lines = n_lines;
+ dm->mode_params.check_params.n_fields = n_fields;
+}
+
static void
parse_mode ()
{
@@ -542,7 +612,10 @@ parse_mode ()
case MODE_TRANSPOSE:
case MODE_NOOP:
case MODE_REVERSE:
+ break;
+
case MODE_TABULAR_CHECK:
+ parse_mode_check ();
break;
case MODE_REMOVE_DUPS:
diff --git a/src/op-parser.h b/src/op-parser.h
index 1f3cfc7..2e5c96c 100644
--- a/src/op-parser.h
+++ b/src/op-parser.h
@@ -39,6 +39,12 @@ struct op_column_t
enum field_operation op;
};
+struct mode_check_params_t
+{
+ uintmax_t n_lines; /* If not zero, require this number of lines */
+ uintmax_t n_fields; /* if not zero, require this number of fields */
+};
+
struct datamash_ops
{
enum processing_mode mode; /* the processing mode */
@@ -52,6 +58,14 @@ struct datamash_ops
struct fieldop *ops; /* field operations */
size_t num_ops;
size_t alloc_ops;
+
+ /* Additional parameters for mode operations
+ (i.e. ones relating to the operation mode,
+ not to specific field-ops) */
+ union
+ {
+ struct mode_check_params_t check_params;
+ } mode_params;
};
/* Parse the operations, return new datamash_ops structure.
diff --git a/tests/datamash-check.pl b/tests/datamash-check.pl
new file mode 100755
index 0000000..8f050fe
--- /dev/null
+++ b/tests/datamash-check.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/env perl
+=pod
+ Unit Tests for GNU Datamash - perform simple calculation on input data
+ Tests for 'check' operation mode
+
+ Copyright (C) 2013-2017 Assaf Gordon <assafgordon@gmail.com>
+
+ This file is part of GNU Datamash.
+
+ GNU Datamash is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ GNU Datamash is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GNU Datamash. If not, see <http://www.gnu.org/licenses/>.
+
+ Written by Assaf Gordon.
+=cut
+use strict;
+use warnings;
+use List::Util qw/max/;
+use Data::Dumper;
+
+# Until a better way comes along to auto-use Coreutils Perl modules
+# as in the coreutils' autotools system.
+use Coreutils;
+use CuSkip;
+use CuTmpdir qw(datamash);
+
+(my $program_name = $0) =~ s|.*/||;
+my $prog_bin = 'datamash';
+
+## Cross-Compiling portability hack:
+## under qemu/binfmt, argv[0] (which is used to report errors) will contain
+## the full path of the binary, if the binary is on the $PATH.
+## So we try to detect what is the actual returned value of the program
+## in case of an error.
+my $prog = `$prog_bin --foobar 2>&1 | head -n 1 | cut -f1 -d:`;
+chomp $prog if $prog;
+$prog = $prog_bin unless $prog;
+
+# Turn off localization of executable's output.
+@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
+
+
+my $in1=<<'EOF';
+A 1 !
+B 2 @
+C 3 #
+D 4 $
+E 5 %
+EOF
+
+my $in2=<<'EOF';
+A 1
+B
+C 3
+EOF
+
+my $in3=<<'EOF';
+A
+EOF
+
+
+my @Tests =
+(
+ # Simple check, no expected number of lines/fields
+ ['c1', 'check', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+
+ # Variations on command-line parsing
+ ['c2', 'check 3 field', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c3', 'check 3 fields', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c4', 'check 3 col', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c5', 'check 3 columns', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c6', 'check 3 column', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+
+ ['c7', 'check field 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c8', 'check fields 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c9', 'check col 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c10', 'check columns 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c11', 'check column 3', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+
+ ['c12', 'check 5 lines', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c13', 'check 5 line', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c14', 'check 5 rows', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c15', 'check 5 row', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+
+ ['c16', 'check lines 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c17', 'check line 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c18', 'check row 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+ ['c19', 'check rows 5', {IN_PIPE=>$in1}, {OUT=>"5 lines, 3 fields\n"}],
+
+
+ # Duplicated options
+ ['e1', 'check rows 5 lines 6', {IN_PIPE=>$in1}, {EXIT=>1},
+ {ERR=>"$prog: number of lines/rows already set in operation 'check'\n"}],
+ ['e2', 'check fields 6 fields 1', {IN_PIPE=>$in1}, {EXIT=>1},
+ {ERR=>"$prog: number of fields/columns already set in operation 'check'\n"}],
+
+ # Invalid values
+ ['e3', 'check 0 lines', {IN_PIPE=>$in1}, {EXIT=>1},
+ {ERR=>"$prog: invalid value zero for lines/fields in operation 'check'\n"}],
+ ['e4', 'check 0 fields', {IN_PIPE=>$in1}, {EXIT=>1},
+ {ERR=>"$prog: invalid value zero for lines/fields in operation 'check'\n"}],
+
+
+
+ # Check lines
+ ['c40', 'check 4 lines', {IN_PIPE=>$in1}, {EXIT=>1},
+ {ERR=>"$prog: check failed: input had 5 lines (expecting 4)\n"}],
+ ['c41', 'check 6 lines', {IN_PIPE=>$in1}, {EXIT=>1},
+ {ERR=>"$prog: check failed: input had 5 lines (expecting 6)\n"}],
+ ['c42', 'check 6 lines', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: check failed: input had 0 lines (expecting 6)\n"}],
+
+ # Check fields
+ ['c60', 'check 2 fields', {IN_PIPE=>$in1}, {EXIT=>1},
+ {ERR=>"line 1 (3 fields):\n" .
+ " A\t1\t!\n" .
+ "$prog: check failed: line 1 has 3 fields (expecting 2)\n"}],
+
+
+ # Check matrix structure, no expected number of fields
+ ['c61', 'check', {IN_PIPE=>$in2}, {EXIT=>1},
+ {ERR=>"line 1 (2 fields):\n" .
+ " A\t1\n" .
+ "line 2 (1 fields):\n" .
+ " B\n" .
+ "$prog: check failed: line 2 has 1 fields (previous line had 2)\n"}],
+
+ # With expected number of fields
+ ['c62', 'check 2 fields', {IN_PIPE=>$in2}, {EXIT=>1},
+ {ERR=>"line 2 (1 fields):\n" .
+ " B\n" .
+ "$prog: check failed: line 2 has 1 fields (expecting 2)\n"}],
+
+
+);
+
+my $save_temps = $ENV{SAVE_TEMPS};
+my $verbose = $ENV{VERBOSE};
+
+my $fail = run_tests ($program_name, $prog_bin, \@Tests, $save_temps, $verbose);
+exit $fail;
diff --git a/tests/datamash-parser.pl b/tests/datamash-parser.pl
index 92a35df..a9fbb2b 100755
--- a/tests/datamash-parser.pl
+++ b/tests/datamash-parser.pl
@@ -123,6 +123,17 @@ my @Tests =
['p27','sum 1,2 sum 3-5',
{IN_PIPE=>$in2}, {OUT=>$out2}],
+ # 'check' options
+ ['p30','check', {IN_PIPE=>""}, {OUT=>"0 lines, 0 fields\n"}],
+ ['p31','check foo', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
+ ['p32','check 10', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
+ ['p33','check lines lines', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
+ ['p34','check 1 line fields', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR_SUBST=>'s/.*//s'}],
+ ['p35','check 10 foo', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
+
+
+
# Field range with invalid syntax
['e20','sum 1-', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],
['e21','sum 1-x', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],