summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAssaf Gordon <assafgordon@gmail.com>2017-11-14 19:48:53 (GMT)
committerAssaf Gordon <assafgordon@gmail.com>2017-11-14 19:55:28 (GMT)
commitabd7baa9a830fea5d7c7ffc7729c0ca64aa18f50 (patch)
treebad79f1f00acc0668b7c01ec144b4402ff8af65f
parentfd49457db9bec56addc9acafeee85df03c27358e (diff)
downloaddatamash-abd7baa9a830fea5d7c7ffc7729c0ca64aa18f50.zip
datamash-abd7baa9a830fea5d7c7ffc7729c0ca64aa18f50.tar.gz
datamash-abd7baa9a830fea5d7c7ffc7729c0ca64aa18f50.tar.bz2
datamash: new operation 'trimmean'
To calculate 20% trimmed mean: $ printf "%s\n" 13 3 7 33 3 9 | datamash trimmean:0.2 1 8 Using 'trimmean:0' is equivalent to 'mean'. Using 'trimmean:0.5' is equivalent to 'median' (for R compatability). Suggested by Khavish Bhundoo in https://lists.gnu.org/archive/html/bug-datamash/2017-10/msg00004.html . * NEWS: Mention new operation. * src/datamash.c (usage): Add new op. (print_column_headers): Print percent value in headers. * src/op-defs.{h,c}: Define new OP_TRIMMED_MEAN. * src/field-ops.{h,c}: Implement new OP_TRIMMED_MEAN. * src/op-parser.c: (set_op_params): Handle 'trimmean:N' paramater. * src/utils.{h,c} (trimmed_mean_value): New function. * contrib/bash-completion/datamash: Add new operation. * doc/datamash.texi, man/datamash.x: Mention new operation. * tests/datamash-error-msgs.pl: Test 'trimmean' parameter error messages. * tests/datamash-stats.pl: Add tests, compare against R results.
-rw-r--r--NEWS6
-rw-r--r--contrib/bash-completion/datamash2
-rw-r--r--doc/datamash.texi2
-rw-r--r--man/datamash.x6
-rw-r--r--src/datamash.c8
-rw-r--r--src/field-ops.c10
-rw-r--r--src/field-ops.h1
-rw-r--r--src/op-defs.c1
-rw-r--r--src/op-defs.h3
-rw-r--r--src/op-parser.c15
-rw-r--r--src/utils.c23
-rw-r--r--src/utils.h7
-rw-r--r--tests/datamash-error-msgs.pl10
-rwxr-xr-xtests/datamash-stats.pl80
14 files changed, 170 insertions, 4 deletions
diff --git a/NEWS b/NEWS
index 494c55d..e80e853 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,12 @@
New option: --output-delimiter=X overrides -t/-W.
+ New operation: trimmean (trimmed mean value).
+ To calculate 20% trimmed mean:
+ $ printf "%s\n" 13 3 7 33 3 9 | datamash trimmean:0.2 1
+ 8
+
+
** Bug fixes
Datamash now builds correctly with external OpenSSL libraries
diff --git a/contrib/bash-completion/datamash b/contrib/bash-completion/datamash
index 5811c83..85074a9 100644
--- a/contrib/bash-completion/datamash
+++ b/contrib/bash-completion/datamash
@@ -27,7 +27,7 @@ _datamash ()
local groupby_ops="sum min max absmin absmax range \
count first last rand \
unique collapse countunique \
-mean median q1 q3 iqr perc mode antimode \
+mean trimmean median q1 q3 iqr perc mode antimode \
pstdev sstdev pvar svar mad madraw \
pskew sskew pkurt skurt dpo jarque \
pcov scov ppearson spearson strbin"
diff --git a/doc/datamash.texi b/doc/datamash.texi
index a3d67c2..579ff64 100644
--- a/doc/datamash.texi
+++ b/doc/datamash.texi
@@ -420,6 +420,8 @@ number of unique/distinct values
@table @option
@item mean
mean of the values
+@item trimmean
+trimmed mean of the values
@item median
median value
@item q1
diff --git a/man/datamash.x b/man/datamash.x
index 3357a72..ee2deae 100644
--- a/man/datamash.x
+++ b/man/datamash.x
@@ -155,6 +155,12 @@ to R's \fBsd()\fR function).
mean of the values
.TP
+.B trimmean[:PERCENT]
+trimmed mean of the values. \fBPERCENT\fR should be between 0 and 0.5.
+(\fBtrimmean:0\fR is equivalent to \fBmean\fR. \fBtrimmean:0.5\fR is equivalent
+to \fBmedian\fR).
+
+.TP
.B median
median value
diff --git a/src/datamash.c b/src/datamash.c
index 733bc2c..0b23c93 100644
--- a/src/datamash.c
+++ b/src/datamash.c
@@ -203,8 +203,9 @@ which require a pair of fields (e.g. 'pcov 2:6').\n"), stdout);
fputs (_("Statistical Grouping operations:\n"),stdout);
fputs ("\
- mean, median, q1, q3, iqr, perc, mode, antimode, pstdev, sstdev, pvar,\n\
- svar, mad, madraw, pskew, sskew, pkurt, skurt, dpo, jarque,\n\
+ mean, trimmean, median, q1, q3, iqr, perc, mode, antimode, \n\
+ pstdev, sstdev, pvar, svar, mad, madraw,\n\
+ pskew, sskew, pkurt, skurt, dpo, jarque,\n\
scov, pcov, spearson, ppearson\n\
\n", stdout);
fputs ("\n", stdout);
@@ -458,6 +459,9 @@ print_column_headers ()
if (op->op == OP_PERCENTILE) {
printf (":%"PRIuMAX, (uintmax_t)op->params.percentile);
}
+ if (op->op == OP_TRIMMED_MEAN) {
+ printf (":%Lg", op->params.trimmed_mean);
+ }
printf ("(%s)", get_input_field_name (op->field));
diff --git a/src/field-ops.c b/src/field-ops.c
index 2d4d86a..5903c15 100644
--- a/src/field-ops.c
+++ b/src/field-ops.c
@@ -153,6 +153,8 @@ struct operation_data operations[] =
{NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
/* OP_FRACTION */
{NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
+ /* OP_TRIMMED_MEAN */
+ {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT},
{0, 0, NUMERIC_RESULT}
};
@@ -506,6 +508,7 @@ field_op_collect (struct fieldop *op,
case OP_S_COVARIANCE:
case OP_P_PEARSON_COR:
case OP_S_PEARSON_COR:
+ case OP_TRIMMED_MEAN:
field_op_add_value (op, num_value);
break;
@@ -692,6 +695,7 @@ field_op_summarize_empty (struct fieldop *op)
case OP_TRUNCATE:
case OP_FRACTION:
case OP_RANGE:
+ case OP_TRIMMED_MEAN:
numeric_result = nanl ("");
break;
@@ -822,6 +826,12 @@ field_op_summarize (struct fieldop *op)
op->params.percentile / 100.0 );
break;
+ case OP_TRIMMED_MEAN:
+ field_op_sort_values (op);
+ numeric_result = trimmed_mean_value ( op->values, op->num_values,
+ op->params.trimmed_mean);
+ break;
+
case OP_PSTDEV:
numeric_result = stdev_value ( op->values, op->num_values, DF_POPULATION);
break;
diff --git a/src/field-ops.h b/src/field-ops.h
index d347bb5..24c91d5 100644
--- a/src/field-ops.h
+++ b/src/field-ops.h
@@ -88,6 +88,7 @@ struct fieldop
long double bin_bucket_size;
size_t strbin_bucket_size;
size_t percentile;
+ long double trimmed_mean;
} params;
/* Collected Data */
diff --git a/src/op-defs.c b/src/op-defs.c
index ced98bf..4bb00fe 100644
--- a/src/op-defs.c
+++ b/src/op-defs.c
@@ -85,6 +85,7 @@ struct field_operation_definition field_operations[] =
{"round", OP_ROUND, MODE_PER_LINE},
{"trunc", OP_TRUNCATE, MODE_PER_LINE},
{"frac", OP_FRACTION, MODE_PER_LINE},
+ {"trimmean",OP_TRIMMED_MEAN, MODE_GROUPBY},
{NULL, OP_INVALID, MODE_INVALID}
};
diff --git a/src/op-defs.h b/src/op-defs.h
index 8fb75e2..01c4d52 100644
--- a/src/op-defs.h
+++ b/src/op-defs.h
@@ -75,7 +75,8 @@ enum field_operation
OP_CEIL, /* Ceiling */
OP_ROUND, /* Round */
OP_TRUNCATE, /* Truncate */
- OP_FRACTION /* Fraction */
+ OP_FRACTION, /* Fraction */
+ OP_TRIMMED_MEAN /* Trimmed Mean */
};
enum processing_mode
diff --git a/src/op-parser.c b/src/op-parser.c
index 82bd174..37d49a3 100644
--- a/src/op-parser.c
+++ b/src/op-parser.c
@@ -195,6 +195,21 @@ set_op_params (struct fieldop *op)
return;
}
+ if (op->op==OP_TRIMMED_MEAN)
+ {
+ op->params.trimmed_mean = 0; /* default trimmed mean = no trim */
+ if (_params_used==1)
+ op->params.trimmed_mean = _params[0].f;
+ if (op->params.trimmed_mean<0 || op->params.trimmed_mean>0.5)
+ die (EXIT_FAILURE, 0, _("invalid trim mean value %Lg " \
+ "(expected 0 <= X <= 0.5)"),
+ op->params.trimmed_mean);
+ if (_params_used>1)
+ die (EXIT_FAILURE, 0, _("too many parameters for operation %s"),
+ quote (get_field_operation_name (op->op)));
+ return;
+ }
+
/* All other operations do not take parameters */
if (_params_used>0)
die (EXIT_FAILURE, 0, _("too many parameters for operation %s"),
diff --git a/src/utils.c b/src/utils.c
index 516838f..0de5e0d 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -405,6 +405,29 @@ mode_value ( const long double * const values, size_t n, enum MODETYPE type)
return best_value;
}
+long double _GL_ATTRIBUTE_PURE
+trimmed_mean_value ( const long double * const values, size_t n,
+ const long double trimmed_mean_percent)
+{
+ assert (trimmed_mean_percent >= 0); /* LCOV_EXCL_LINE */
+ assert (trimmed_mean_percent <= 0.5); /* LCOV_EXCL_LINE */
+
+ /* For R compatability:
+ mean (x,trim=0.5) in R is equivalent to median (x). */
+ if (trimmed_mean_percent >= 0.5)
+ return median_value (values, n);
+
+ /* number of element to skip from each end */
+ size_t c = pos_zero (floorl (trimmed_mean_percent * n));
+
+ long double v = 0;
+ for (size_t i=c; i< (n-c); i++)
+ v += values[i];
+
+ return v / (long double)(n-c*2);
+}
+
+
int _GL_ATTRIBUTE_PURE
cmpstringp (const void *p1, const void *p2)
{
diff --git a/src/utils.h b/src/utils.h
index e6b3713..0678473 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -182,6 +182,13 @@ enum MODETYPE
long double
mode_value ( const long double * const values, size_t n, enum MODETYPE type);
+/*
+ Given an array of doubles, return the mode/anti-mode values.
+ */
+long double
+trimmed_mean_value ( const long double * const values, size_t n,
+ const long double trimmed_mean_percent);
+
/*
comparison functions, to be used with 'qsort ()'
diff --git a/tests/datamash-error-msgs.pl b/tests/datamash-error-msgs.pl
index 01c0923..52d0085 100644
--- a/tests/datamash-error-msgs.pl
+++ b/tests/datamash-error-msgs.pl
@@ -201,6 +201,16 @@ my @Tests =
['e102', '--output-delimiter "XX"', {IN_PIPE=>""}, {EXIT=>1},
{ERR=>"$prog: the delimiter must be a single character\n"}],
+ # values for trimmean operation
+ ['e103','trimmean:12 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid trim mean value 12 (expected 0 <= X <= 0.5)\n"}],
+ ['e104','trimmean:0.51 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid trim mean value 0.51 (expected 0 <= X <= 0.5)\n"}],
+ ['e105','trimmean:-32 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: invalid parameter - for operation 'trimmean'\n"}],
+ ['e106','trimmean:1:2 1', {IN_PIPE=>""}, {EXIT=>1},
+ {ERR=>"$prog: too many parameters for operation 'trimmean'\n"}],
+
);
my $save_temps = $ENV{SAVE_TEMPS};
diff --git a/tests/datamash-stats.pl b/tests/datamash-stats.pl
index 5f7ba73..1fd59fb 100755
--- a/tests/datamash-stats.pl
+++ b/tests/datamash-stats.pl
@@ -180,6 +180,14 @@ The datamash tests below should return the same results are thes R commands:
perc99=function(x) { quantile(x, prob=0.99) }
perc100=function(x) { quantile(x, prob=1) }
+ trimmean0=function(x){mean(x,trim=0)}
+ trimmean10=function(x){mean(x,trim=0.1)}
+ trimmean20=function(x){mean(x,trim=0.2)}
+ trimmean30=function(x){mean(x,trim=0.3)}
+ trimmean40=function(x){mean(x,trim=0.4)}
+ trimmean50=function(x){mean(x,trim=0.5)}
+
+
# Helper function for madraw
madraw=function(x) { mad(x,constant=1.0) }
@@ -243,6 +251,12 @@ The datamash tests below should return the same results are thes R commands:
test(perc90)
test(perc95)
test(perc99)
+ test(trimmean0)
+ test(trimmean10)
+ test(trimmean20)
+ test(trimmean30)
+ test(trimmean40)
+ test(trimmean50)
test(iqr)
test(smp.sd)
test(pop.sd)
@@ -421,6 +435,72 @@ my @Tests =
['perc75_13','perc:75 1' , {IN_PIPE=>$seq23}, {OUT => "8\n"},],
+ # Trimmed Mean:0
+ ['tmean0_1', 'trimmean:0 1' , {IN_PIPE=>$seq1}, {OUT => "2.5\n"}],
+ ['tmean0_2', 'trimmean:0 1' , {IN_PIPE=>$seq2}, {OUT => "2\n"}],
+ ['tmean0_3', 'trimmean:0 1' , {IN_PIPE=>$seq3}, {OUT => "2\n"}],
+ ['tmean0_4', 'trimmean:0 1' , {IN_PIPE=>$seq9}, {OUT => "11.111\n"}],
+ ['tmean0_5', 'trimmean:0 1' , {IN_PIPE=>$seq10}, {OUT => "12.9\n"}],
+ ['tmean0_6', 'trimmean:0 1' , {IN_PIPE=>$seq11}, {OUT => "14.545\n"}],
+ ['tmean0_7', 'trimmean:0 1' , {IN_PIPE=>$seq12}, {OUT => "16.416\n"}],
+ ['tmean0_8', 'trimmean:0 1' , {IN_PIPE=>$seq20}, {OUT => "100.06\n"},],
+ ['tmean0_9', 'trimmean:0 1' , {IN_PIPE=>$seq21}, {OUT => "45.32\n"},],
+ ['tmean0_10','trimmean:0 1' , {IN_PIPE=>$seq22}, {OUT => "67.45\n"},],
+ ['tmean0_11','trimmean:0 1' , {IN_PIPE=>$seq23}, {OUT => "6.125\n"},],
+
+ # Trimmed Mean:0.1
+ ['tmean1_1', 'trimmean:0.1 1' , {IN_PIPE=>$seq1}, {OUT => "2.5\n"}],
+ ['tmean1_2', 'trimmean:0.1 1' , {IN_PIPE=>$seq2}, {OUT => "2\n"}],
+ ['tmean1_3', 'trimmean:0.1 1' , {IN_PIPE=>$seq3}, {OUT => "2\n"}],
+ ['tmean1_4', 'trimmean:0.1 1' , {IN_PIPE=>$seq9}, {OUT => "11.111\n"}],
+ ['tmean1_5', 'trimmean:0.1 1' , {IN_PIPE=>$seq10}, {OUT => "12.25\n"}],
+ ['tmean1_6', 'trimmean:0.1 1' , {IN_PIPE=>$seq11}, {OUT => "14.111\n"}],
+ ['tmean1_7', 'trimmean:0.1 1' , {IN_PIPE=>$seq12}, {OUT => "15.8\n"}],
+ ['tmean1_8', 'trimmean:0.1 1' , {IN_PIPE=>$seq20}, {OUT => "100.275\n"},],
+ ['tmean1_9', 'trimmean:0.1 1' , {IN_PIPE=>$seq21}, {OUT => "42\n"},],
+ ['tmean1_10','trimmean:0.1 1' , {IN_PIPE=>$seq22}, {OUT => "67.45\n"},],
+ ['tmean1_11','trimmean:0.1 1' , {IN_PIPE=>$seq23}, {OUT => "6.076\n"},],
+
+ # Trimmed Mean:0.2
+ ['tmean2_1', 'trimmean:0.2 1' , {IN_PIPE=>$seq1}, {OUT => "2.5\n"}],
+ ['tmean2_2', 'trimmean:0.2 1' , {IN_PIPE=>$seq2}, {OUT => "2\n"}],
+ ['tmean2_3', 'trimmean:0.2 1' , {IN_PIPE=>$seq3}, {OUT => "2\n"}],
+ ['tmean2_4', 'trimmean:0.2 1' , {IN_PIPE=>$seq9}, {OUT => "10.714\n"}],
+ ['tmean2_5', 'trimmean:0.2 1' , {IN_PIPE=>$seq10}, {OUT => "12\n"}],
+ ['tmean2_6', 'trimmean:0.2 1' , {IN_PIPE=>$seq11}, {OUT => "13.571\n"}],
+ ['tmean2_7', 'trimmean:0.2 1' , {IN_PIPE=>$seq12}, {OUT => "15.5\n"}],
+ ['tmean2_8', 'trimmean:0.2 1' , {IN_PIPE=>$seq20}, {OUT => "100.516\n"},],
+ ['tmean2_9', 'trimmean:0.2 1' , {IN_PIPE=>$seq21}, {OUT => "40.1\n"},],
+ ['tmean2_10','trimmean:0.2 1' , {IN_PIPE=>$seq22}, {OUT => "67.6\n"},],
+ ['tmean2_11','trimmean:0.2 1' , {IN_PIPE=>$seq23}, {OUT => "6.053\n"},],
+
+ # Trimmed Mean:0.4
+ ['tmean4_1', 'trimmean:0.4 1' , {IN_PIPE=>$seq1}, {OUT => "2.5\n"}],
+ ['tmean4_2', 'trimmean:0.4 1' , {IN_PIPE=>$seq2}, {OUT => "2\n"}],
+ ['tmean4_3', 'trimmean:0.4 1' , {IN_PIPE=>$seq3}, {OUT => "2\n"}],
+ ['tmean4_4', 'trimmean:0.4 1' , {IN_PIPE=>$seq9}, {OUT => "10.333\n"}],
+ ['tmean4_5', 'trimmean:0.4 1' , {IN_PIPE=>$seq10}, {OUT => "12\n"}],
+ ['tmean4_6', 'trimmean:0.4 1' , {IN_PIPE=>$seq11}, {OUT => "13.666\n"}],
+ ['tmean4_7', 'trimmean:0.4 1' , {IN_PIPE=>$seq12}, {OUT => "15\n"}],
+ ['tmean4_8', 'trimmean:0.4 1' , {IN_PIPE=>$seq20}, {OUT => "100.55\n"},],
+ ['tmean4_9', 'trimmean:0.4 1' , {IN_PIPE=>$seq21}, {OUT => "37.3\n"},],
+ ['tmean4_10','trimmean:0.4 1' , {IN_PIPE=>$seq22}, {OUT => "67\n"},],
+ ['tmean4_11','trimmean:0.4 1' , {IN_PIPE=>$seq23}, {OUT => "6.067\n"},],
+
+ # Trimmed Mean:0.5
+ ['tmean5_1', 'trimmean:0.5 1' , {IN_PIPE=>$seq1}, {OUT => "2.5\n"}],
+ ['tmean5_2', 'trimmean:0.5 1' , {IN_PIPE=>$seq2}, {OUT => "2\n"}],
+ ['tmean5_3', 'trimmean:0.5 1' , {IN_PIPE=>$seq3}, {OUT => "2\n"}],
+ ['tmean5_4', 'trimmean:0.5 1' , {IN_PIPE=>$seq9}, {OUT => "11\n"}],
+ ['tmean5_5', 'trimmean:0.5 1' , {IN_PIPE=>$seq10}, {OUT => "12\n"}],
+ ['tmean5_6', 'trimmean:0.5 1' , {IN_PIPE=>$seq11}, {OUT => "13\n"}],
+ ['tmean5_7', 'trimmean:0.5 1' , {IN_PIPE=>$seq12}, {OUT => "15\n"}],
+ ['tmean5_8', 'trimmean:0.5 1' , {IN_PIPE=>$seq20}, {OUT => "100\n"},],
+ ['tmean5_9', 'trimmean:0.5 1' , {IN_PIPE=>$seq21}, {OUT => "37\n"},],
+ ['tmean5_10','trimmean:0.5 1' , {IN_PIPE=>$seq22}, {OUT => "67\n"},],
+ ['tmean5_11','trimmean:0.5 1' , {IN_PIPE=>$seq23}, {OUT => "6\n"},],
+
+
# Test IQR
['iqr_1', 'iqr 1' , {IN_PIPE=>$seq1}, {OUT => "1.5\n"}],
['iqr_2', 'iqr 1' , {IN_PIPE=>$seq2}, {OUT => "1\n"}],