#!/usr/bin/perl # # Calcurate statistics of values in each column. # Copyright (c) 2003-2010, Hiroyuki Ohsaki. # All rights reserved. # # $Id: stats,v 1.19 2010/08/25 07:37:27 oosaki Exp oosaki $ # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # usage: stats [-va] [-t type[,type...]] [file...] use File::Basename; use Getopt::Std; use List::Util; use Smart::Comments; use diagnostics; use strict; use warnings; our %STAT_TBL = ( count => \&count, sum => \&sum, total => \&sum, # backward compatibility min => \&min, max => \&max, mean => \&mean, median => \&median, mode => \&mode, var => \&variance, stddev => \&stddev, skew => \&skewness, kurt => \&kurtosis, cv => sub {$_ = shift; 100 * stddev($_) / mean($_)}, quant25 => sub {percentile(0.25, shift)}, quant50 => sub {percentile(0.5, shift)}, quant75 => sub {percentile(0.75, shift)}, conf90 => sub {conf_interval(90, shift)}, conf95 => sub {conf_interval(95, shift)}, conf99 => sub {conf_interval(99, shift)}, covar => \&covariance, corr => \&correlation, spearmn => \&rank_correlation, kendall => \&kendall_tau, ); our %IS_MULTI = qw(covar 1 corr 1 spearmn 1 kendall 1); sub usage { my $prog = basename($0); my $types = join('/', sort keys %STAT_TBL); die < 0); return 0; } sub count ($_) { my $listp = shift; return scalar(@{$listp}); } sub sum ($_) { my $listp = shift; my $sum = 0; $sum += $_ for (@{$listp}); return $sum; } sub min ($_) { my $listp = shift; my $min; for (@{$listp}) { $min = $_ if (!defined $min or $_ < $min); } return $min; } sub max ($_) { my $listp = shift; my $max; for (@{$listp}) { $max = $_ if (!defined $max or $_ > $max); } return $max; } sub mean ($) { my $listp = shift; return undef unless count($listp); return sum($listp) / count($listp); } sub median ($) { my $listp = shift; return (sort {$a <=> $b} @{$listp})[int($#{$listp} / 2)]; } sub mode ($) { my $listp = shift; my %counts; $counts{$_}++ for @{$listp}; return (sort {$counts{$b} <=> $counts{$a}} keys %counts)[0]; } sub moment ($$) { my ($n, $listp) = @_; my $mean = mean($listp); my @l = map {($_ - $mean)**$n} @{$listp}; return mean(\@l); } sub variance ($) { my $listp = shift; return moment(2, $listp); } sub stddev ($) { my $listp = shift; return sqrt(variance($listp)); } sub skewness ($) { my $listp = shift; return (moment(3, $listp) / moment(2, $listp)**1.5); } sub kurtosis ($) { my $listp = shift; return (moment(4, $listp) / moment(2, $listp)**2 - 3); } # return the P-percentile sub percentile ($$) { my ($p, $listp) = @_; return (sort {$a <=> $b} @{$listp})[int(count($listp) * $p)]; } # return the LEVEL% confidence interval sub conf_interval ($$) { my ($level, $listp) = @_; my %zval = (90 => 1.645, 95 => 1.960, 99 => 2.576, 99.9 => 3.29); return undef unless (exists $zval{$level}); return undef unless count($listp); return ($zval{$level} * stddev($listp) / sqrt(count($listp))); } sub covariance ($$) { my ($x, $y) = @_; my $meanx = mean($x); my $meany = mean($y); my @l = map {($x->[$_] - $meanx) * ($y->[$_] - $meany)} (0 .. $#{$x}); my $cov = mean(\@l); return $cov; } sub correlation ($$) { my ($x, $y) = @_; return covariance($x, $y) / sqrt(variance($x)) / sqrt(variance($y)); } sub rank ($) { my $listp = shift; my @sorted = sort {$a <=> $b} @{$listp}; my %rank_by_val; for (0 .. $#{$listp}) { $rank_by_val{$sorted[$_]} = $_ + 1; } return [map {$rank_by_val{$_}} @{$listp}]; } sub rank_correlation ($$) { my ($x, $y) = @_; my $x_rank = rank($x); my $y_rank = rank($y); return correlation($x_rank, $y_rank); } sub kendall_tau ($$) { my ($x, $y) = @_; my $x_rank = rank($x); my $y_rank = rank($y); my $ncordant = 0; for my $i (0 .. $#{$x_rank} - 1) { for my $j ($i + 1 .. $#{$x_rank}) { if (sign($x_rank->[$j] - $x_rank->[$i]) == sign($y_rank->[$j] - $y_rank->[$i])) { $ncordant++; } else { $ncordant--; } } } my $n = count($x); return $ncordant / ($n * ($n - 1) / 2); } sub format_number ($) { my $n = shift; return 'NaN' unless defined $n; return sprintf('%g', $n); } our ($opt_v, $opt_a, $opt_t); getopts('vat:') || usage; my $verbose = $opt_v; my @stat_types = $opt_t ? split(',', $opt_t) : qw(count sum min max mean median mode var stddev cv skew kurt quant25 quant50 quant75 conf90 conf95 conf99); @stat_types = sort keys %STAT_TBL if $opt_a; # load each column into @LIST my @col; while (<>) { chomp; next unless /^\s*([+-.\d]|NaN)/i; my @f = split(/\s+/, $_); for (0 .. $#f) { if ($f[$_] eq 'NaN') { $col[$_] = [] unless $col[$_]; } else { push(@{$col[$_]}, $f[$_]); } } } # display statistics for each column for my $type (@stat_types) { my @val; die "Unsupported statistic `$type'\n" unless (defined $STAT_TBL{$type}); if ($IS_MULTI{$type}) { for (0 .. $#col - 1) { push(@val, $STAT_TBL{$type}->($col[$_], $col[$_ + 1])); } } else { @val = map {$STAT_TBL{$type}->($_)} @col; } next unless @val; print "$type\t" if $verbose; print join("\t", map {format_number($_)} @val), "\n"; } __END__ =head1 NAME stats - Calcurate statistics of values in each column =head1 SYNOPSIS stats [-v] [-t type[,type...]] [file...] =head1 DESCRIPTION This manual page documents B. This program is for calcurating various statistics of values in each column. B can compute several statistics: the number of values, sum, minimum, maximum, mean, variance, standard deviation, coefficient of variation, 25%/50%/75% quantiles, and 90%/95%/99% confidence interlvals. B can also compute statistic for multiple data sets: covariance and correlation coefficient. B calcurates statistics of numbers in all lines, located at the same column position. For instance, if the input file has N colums, B displays statistics for these N columns. The input file format of B is straightforward; each column is seperated by whitespace, and each value is any valid Perl number. See C and C for details of whitespace characters and valid number formats. =head1 OPTIONS =over 4 =item I<-v> Display label for each statistic =item I<-t type[,type...]> Specify statistic to display. I must be one of the following keywordss. count the number of values sum sum min minimum max maximum mean sample mean var sample variance stddev standard deviation cv coefficient of variation quant25 25% quantile quant50 50% quantile quant75 75% quantile conf90 90% confidence interval conf95 95% confidence interval conf99 99% confidence interval covar covariance corr correlation coefficient =back =head1 EXAMPLE =over 4 =item - Read numbers from standard input $ seq 10 1 2 3 4 5 6 7 8 9 10 $ seq 10 | stats -v count 10.0000 sum 55.0000 min 1.0000 max 10.0000 mean 5.5000 var 8.2500 stddev 2.8723 cv 52.2233 quant25 3.0000 quant50 6.0000 quant75 8.0000 conf90 1.4941 conf95 1.7803 conf99 2.3398 =item - Select statistic to display $ seq 10 | stats -t mean 5.5000 Also, you can specify list of statistics to display. $ seq 1 10 | stats -t mean,var 5.5000 8.2500 =item - B displays statistics for multiple columns $ seq 10 | column -c 16 1 6 2 7 3 8 4 9 5 10 $ seq 10 | column -c 16 | stats -v count 5.0000 5.0000 sum 15.0000 40.0000 min 1.0000 6.0000 max 5.0000 10.0000 mean 3.0000 8.0000 var 2.0000 2.0000 stddev 1.4142 1.4142 cv 47.1405 17.6777 quant25 2.0000 7.0000 quant50 3.0000 8.0000 quant75 4.0000 9.0000 conf90 1.0404 1.0404 conf95 1.2396 1.2396 conf99 1.6292 1.6292 =item - Check file size statistics of some device driver % cd /usr/src/linux/net/ipv4 % ls -l *.c -rw-r--r-- 1 root staff 29313 Nov 29 2003 af_inet.c -rw-r--r-- 1 root staff 33868 Apr 14 2004 arp.c -rw-r--r-- 1 root staff 31912 Apr 14 2004 devinet.c -rw-r--r-- 1 root staff 15044 Aug 25 2003 fib_frontend.c -rw-r--r-- 1 root staff 21003 Aug 25 2003 fib_hash.c -rw-r--r-- 1 root staff 11564 Feb 18 2004 fib_rules.c -rw-r--r-- 1 root staff 25791 Aug 25 2003 fib_semantics.c -rw-r--r-- 1 root staff 27842 Apr 14 2004 icmp.c -rw-r--r-- 1 root staff 53725 Apr 14 2004 igmp.c -rw-r--r-- 1 root staff 14530 Oct 2 2001 inetpeer.c -rw-r--r-- 1 root staff 4131 Apr 13 2001 ip_forward.c -rw-r--r-- 1 root staff 16226 Jun 13 2003 ip_fragment.c -rw-r--r-- 1 root staff 32291 Nov 29 2003 ip_gre.c -rw-r--r-- 1 root staff 13138 Aug 3 2002 ip_input.c -rw-r--r-- 1 root staff 4349 Apr 13 2001 ip_nat_dumb.c -rw-r--r-- 1 root staff 14331 Nov 29 2002 ip_options.c -rw-r--r-- 1 root staff 25271 Nov 29 2003 ip_output.c -rw-r--r-- 1 root staff 25436 Apr 14 2004 ip_sockglue.c -rw-r--r-- 1 root staff 34923 Nov 29 2003 ipconfig.c -rw-r--r-- 1 root staff 22332 Nov 29 2003 ipip.c -rw-r--r-- 1 root staff 40373 Nov 29 2003 ipmr.c -rw-r--r-- 1 root staff 7526 Jun 13 2003 proc.c -rw-r--r-- 1 root staff 4345 May 20 2001 protocol.c -rw-r--r-- 1 root staff 16381 Aug 25 2003 raw.c -rw-r--r-- 1 root staff 65629 Nov 29 2003 route.c -rw-r--r-- 1 root staff 5198 Aug 3 2002 syncookies.c -rw-r--r-- 1 root staff 9608 Apr 14 2004 sysctl_net_ipv4.c -rw-r--r-- 1 root staff 70143 Feb 18 2004 tcp.c -rw-rw-r-- 1 root staff 15180 Aug 25 2003 tcp_diag.c -rw-r--r-- 1 root staff 120806 Apr 14 2004 tcp_input.c -rw-r--r-- 1 root staff 58649 Apr 14 2004 tcp_ipv4.c -rw-r--r-- 1 root staff 29731 Aug 25 2003 tcp_minisocks.c -rw-r--r-- 1 root staff 43280 Nov 29 2003 tcp_output.c -rw-r--r-- 1 root staff 17652 Jun 13 2003 tcp_timer.c -rw-r--r-- 1 root staff 26360 Apr 14 2004 udp.c -rw-r--r-- 1 root staff 1272 Aug 25 2003 utils.c % ls -l *.c | awk '{ print $5 }' | stats -v count 36.0000 sum 1010643.0000 min 1272.0000 max 136404.0000 mean 28073.4167 var 621397171.6319 stddev 24927.8393 cv 88.7952 quant25 14359.0000 quant50 25342.0000 quant75 34573.0000 conf90 6834.3826 conf95 8143.0942 conf99 10702.3523 =back =head1 AVAILABILITY The latest version of B is available at http://www.ispl.jp/~oosaki/software/stats/stats =head1 SEE ALSO perl(1), perlnumber(1), perlre(1) =head1 AUTHOR Hiroyuki Ohsaki =cut EOF