summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAssaf Gordon <assafgordon@gmail.com>2017-03-15 19:56:07 (GMT)
committerAssaf Gordon <assafgordon@gmail.com>2017-03-17 03:31:58 (GMT)
commit1042e8c83cb76f0e9afc103110799add7729c17b (patch)
treee1ced1444b5f0182cf231470012dded493544587
parentdc55ae560e6720b9e4e9a47edd5c19ab7c87cb0d (diff)
downloaddatamash-dev/percentiles2.zip
datamash-dev/percentiles2.tar.gz
datamash-dev/percentiles2.tar.bz2
percentile minor adjustments (dev/percentiles2)
* NEWS: Mention new operation.
* tests/datamash-tests.pl: Test 'perc' headers.
* tests/datamash-error-msgs.pl: Test 'perc' parameter error messages.
-rw-r--r--NEWS7
-rw-r--r--doc/datamash.texi3
-rw-r--r--src/datamash.c2
-rw-r--r--src/field-ops.c3
-rw-r--r--src/op-parser.c6
-rw-r--r--tests/datamash-error-msgs.pl12
-rwxr-xr-xtests/datamash-stats.pl34
-rwxr-xr-xtests/datamash-tests.pl6
8 files changed, 64 insertions, 9 deletions
diff --git a/NEWS b/NEWS
index 877659e..64996a5 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,10 @@
+* Noteworthy changes in release X.X.X
+
+** New Features
+
+ New operation: perc (percentile)
+
+
* Noteworthy changes in release 1.1.1 (2017-01-19) [stable]
** Bug fixes
diff --git a/doc/datamash.texi b/doc/datamash.texi
index 49924dc..fdbecab 100644
--- a/doc/datamash.texi
+++ b/doc/datamash.texi
@@ -181,7 +181,8 @@ all the values in the input file.
@code{collapse}, @code{countunique}
@item Group-by Statistical operations:
-@code{mean}, @code{median}, @code{q1}, @code{q3}, @code{iqr}, @code{perc}, @code{mode},
+@code{mean}, @code{mode},
+@code{median}, @code{q1}, @code{q3}, @code{iqr}, @code{perc},
@code{antimode}, @code{pstdev}, @code{sstdev}, @code{pvar}, @code{svar},
@code{mad}, @code{madraw}, @code{sskew}, @code{pskew}, @code{skurt},
@code{pkurt}, @code{jarque}, @code{dpo},
diff --git a/src/datamash.c b/src/datamash.c
index 73cbe34..8a1e1d0 100644
--- a/src/datamash.c
+++ b/src/datamash.c
@@ -197,7 +197,7 @@ which require a pair of fields (e.g. 'pcov 2:6').\n"), stdout);
fputs (_("Statistical Grouping operations:\n"),stdout);
fputs ("\
- mean, median, q1, q3, iqr, mode, antimode, pstdev, sstdev, pvar,\n\
+ mean, median, q1, q3, iqr, perc, mode, antimode, pstdev, sstdev, pvar,\n\
svar, mad, madraw, pskew, sskew, pkurt, skurt, dpo, jarque,\n\
scov, pcov, spearson, ppearson\n\
\n", stdout);
diff --git a/src/field-ops.c b/src/field-ops.c
index 16ab486..6ca0a20 100644
--- a/src/field-ops.c
+++ b/src/field-ops.c
@@ -793,7 +793,8 @@ field_op_summarize (struct fieldop *op)
case OP_PERCENTILE:
field_op_sort_values (op);
- numeric_result = percentile_value ( op->values, op->num_values, op->params.percentile / 100.0 );
+ numeric_result = percentile_value ( op->values, op->num_values,
+ op->params.percentile / 100.0 );
break;
case OP_PSTDEV:
diff --git a/src/op-parser.c b/src/op-parser.c
index 58cd557..52cfe3e 100644
--- a/src/op-parser.c
+++ b/src/op-parser.c
@@ -184,9 +184,9 @@ set_op_params (struct fieldop *op)
op->params.percentile = 95; /* default percentile */
if (_params_used==1)
op->params.percentile = _params[0].u;
- if (op->params.percentile==0)
- error (EXIT_FAILURE, 0, _("strbin bucket size must not be zero"));
- /* TODO: in the future, accept offset as well? */
+ if (op->params.percentile==0 || op->params.percentile>100)
+ error (EXIT_FAILURE, 0, _("invalid percentile value %" PRIuMAX),
+ op->params.percentile);
if (_params_used>1)
error (EXIT_FAILURE, 0, _("too many parameters for operation %s"),
quote (get_field_operation_name (op->op)));
diff --git a/tests/datamash-error-msgs.pl b/tests/datamash-error-msgs.pl
index dbd4a97..19695c8 100644
--- a/tests/datamash-error-msgs.pl
+++ b/tests/datamash-error-msgs.pl
@@ -180,6 +180,18 @@ my @Tests =
['e93','strbin:0 1', {IN_PIPE=>""}, {EXIT=>1},
{ERR=>"$prog: strbin bucket size must not be zero\n"}],
+ # values for percentile operation
+ ['e94','perc:0 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid percentile value 0\n"}],
+ ['e95','perc:101 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid percentile value 101\n"}],
+ ['e96','perc:foo 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid parameter foo for operation 'perc'\n"}],
+ ['e97','perc:-32 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid parameter - for operation 'perc'\n"}],
+ ['e98','perc:1:2 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: too many parameters for operation 'perc'\n"}],
+
);
my $save_temps = $ENV{SAVE_TEMPS};
diff --git a/tests/datamash-stats.pl b/tests/datamash-stats.pl
index 621d130..f840036 100755
--- a/tests/datamash-stats.pl
+++ b/tests/datamash-stats.pl
@@ -169,9 +169,11 @@ The datamash tests below should return the same results as these R commands:
q3=function(x) { quantile(x, prob=0.75) }
# Helper functions for percentiles
+ perc1=function(x) { quantile(x, prob=0.01) }
perc90=function(x) { quantile(x, prob=0.90) }
perc95=function(x) { quantile(x, prob=0.95) }
perc99=function(x) { quantile(x, prob=0.99) }
+ perc100=function(x) { quantile(x, prob=1) }
# Helper function for madraw
madraw=function(x) { mad(x,constant=1.0) }
@@ -333,7 +335,8 @@ my @Tests =
['perc90_6', 'perc:90 1' , {IN_PIPE=>$seq11}, {OUT => "29\n"}],
['perc90_7', 'perc:90 1' , {IN_PIPE=>$seq12}, {OUT => "30.8\n"}],
['perc90_8', 'perc:90 1' , {IN_PIPE=>$seq12_unsorted}, {OUT => "30.8\n"}],
- ['perc90_9', '--sort perc:90 1' , {IN_PIPE=>$seq12_unsorted}, {OUT => "30.8\n"}],
+ ['perc90_9', '--sort perc:90 1',
+ {IN_PIPE=>$seq12_unsorted}, {OUT => "30.8\n"}],
['perc90_10','perc:90 1' , {IN_PIPE=>$seq20}, {OUT => "111\n"},],
['perc90_11','perc:90 1' , {IN_PIPE=>$seq21}, {OUT => "84.2\n"},],
['perc90_12','perc:90 1' , {IN_PIPE=>$seq22}, {OUT => "70\n"},],
@@ -349,7 +352,8 @@ my @Tests =
['perc95_6', 'perc:95 1' , {IN_PIPE=>$seq11}, {OUT => "30\n"}],
['perc95_7', 'perc:95 1' , {IN_PIPE=>$seq12}, {OUT => "33.7\n"}],
['perc95_8', 'perc:95 1' , {IN_PIPE=>$seq12_unsorted}, {OUT => "33.7\n"}],
- ['perc95_9', '--sort perc:95 1' , {IN_PIPE=>$seq12_unsorted}, {OUT => "33.7\n"}],
+ ['perc95_9', '--sort perc:95 1',
+ {IN_PIPE=>$seq12_unsorted}, {OUT => "33.7\n"}],
['perc95_10','perc:95 1' , {IN_PIPE=>$seq20}, {OUT => "114.15\n"},],
['perc95_11','perc:95 1' , {IN_PIPE=>$seq21}, {OUT => "103.35\n"},],
['perc95_12','perc:95 1' , {IN_PIPE=>$seq22}, {OUT => "73\n"},],
@@ -365,12 +369,36 @@ my @Tests =
['perc99_6', 'perc:99 1' , {IN_PIPE=>$seq11}, {OUT => "30.8\n"}],
['perc99_7', 'perc:99 1' , {IN_PIPE=>$seq12}, {OUT => "36.34\n"}],
['perc99_8', 'perc:99 1' , {IN_PIPE=>$seq12_unsorted}, {OUT => "36.34\n"}],
- ['perc99_9', '--sort perc:99 1' , {IN_PIPE=>$seq12_unsorted}, {OUT => "36.34\n"}],
+ ['perc99_9', '--sort perc:99 1',
+ {IN_PIPE=>$seq12_unsorted}, {OUT => "36.34\n"}],
['perc99_10','perc:99 1' , {IN_PIPE=>$seq20}, {OUT => "118.02\n"},],
['perc99_11','perc:99 1' , {IN_PIPE=>$seq21}, {OUT => "120.49\n"},],
['perc99_12','perc:99 1' , {IN_PIPE=>$seq22}, {OUT => "73\n"},],
['perc99_13','perc:99 1' , {IN_PIPE=>$seq23}, {OUT => "11\n"},],
+ # Test edge cases: perc:1 and perc:100
+ ['perc1_1', 'perc:1 1', {IN_PIPE=>$seq20}, {OUT => "78\n"},],
+ ['perc100_1','perc:100 1', {IN_PIPE=>$seq20}, {OUT => "120\n"},],
+
+ # Sanity check: percentile:50 should be equal to 'median' op
+ ['perc50_1','perc:50 1' , {IN_PIPE=>$seq20}, {OUT => "100\n"},],
+ ['perc50_2','perc:50 1' , {IN_PIPE=>$seq21}, {OUT => "37\n"},],
+ ['perc50_3','perc:50 1' , {IN_PIPE=>$seq22}, {OUT => "67\n"},],
+ ['perc50_4','perc:50 1' , {IN_PIPE=>$seq23}, {OUT => "6\n"},],
+
+ # Sanity check: percentile:25 should be equal to 'q1' op
+ ['perc25_1','perc:25 1' , {IN_PIPE=>$seq20}, {OUT => "93\n"},],
+ ['perc25_2','perc:25 1' , {IN_PIPE=>$seq21}, {OUT => "23\n"},],
+ ['perc25_3','perc:25 1' , {IN_PIPE=>$seq22}, {OUT => "67\n"},],
+ ['perc25_4','perc:25 1' , {IN_PIPE=>$seq23}, {OUT => "4\n"},],
+
+ # Sanity check: percentile:75 should be equal to 'q3' op
+ ['perc75_10','perc:75 1' , {IN_PIPE=>$seq20}, {OUT => "107\n"},],
+ ['perc75_11','perc:75 1' , {IN_PIPE=>$seq21}, {OUT => "61.5\n"},],
+ ['perc75_12','perc:75 1' , {IN_PIPE=>$seq22}, {OUT => "70\n"},],
+ ['perc75_13','perc:75 1' , {IN_PIPE=>$seq23}, {OUT => "8\n"},],
+
+
# Test IQR
['iqr_1', 'iqr 1' , {IN_PIPE=>$seq1}, {OUT => "1.5\n"}],
['iqr_2', 'iqr 1' , {IN_PIPE=>$seq2}, {OUT => "1\n"}],
diff --git a/tests/datamash-tests.pl b/tests/datamash-tests.pl
index b2fa9b1..22450e8 100755
--- a/tests/datamash-tests.pl
+++ b/tests/datamash-tests.pl
@@ -602,6 +602,12 @@ my @Tests =
['hdr23', '-t: --header-in rmdup 1', {IN_PIPE=>""}, {OUT=>""}],
['hdr24', '-t: -H rmdup 1', {IN_PIPE=>""}, {OUT=>""}],
+ # percentile operation has special header handling (which includes
+ # the percentile value).
+ ['hdr25', '-W -H perc 2', {IN_PIPE=>$in_hdr1},
+ {OUT=>"perc:95(y)\n8.45\n"},],
+ ['hdr26', '-W -H perc:50 2', {IN_PIPE=>$in_hdr1},
+ {OUT=>"perc:50(y)\n4\n"}],
# Test single line per group
['sl1', '-t" " -g 1 mean 2', {IN_PIPE=>$in_g4},