| #!/usr/bin/perl |
| |
| # This script is used to print some statistics about classification accuracy |
| # with a k-fold cross validation |
| |
| use strict; |
| |
| my $lambda = 50; # desired lambda for TCR calculation |
| |
| if ( scalar(@ARGV) < 1 ) { |
| print STDERR "Usage: model-statistics [validate]\n"; |
| exit 1; |
| } |
| |
| my (@fp1, @fn1, @tcr1); |
| |
| open (FILE, $ARGV[0]) || die $!; |
| while (<FILE>) { |
| my @x = split(/\s+/); |
| push (@fp1, $x[2] / ($x[0] + $x[2])); |
| push (@fn1, $x[3] / ($x[1] + $x[3])); |
| push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2])); |
| } |
| close (FILE); |
| |
| stat_analysis ("False positives", "pct", \@fp1); |
| stat_analysis ("False negatives", "pct", \@fn1); |
| stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1); |
| |
| sub stat_analysis { |
| my $title = shift; |
| my $pct = shift; |
| my $s1 = shift; |
| |
| # This is the number of degrees of freedom of the two sample sets (i.e. |
| # the number of samples in each set). |
| my $dof = scalar(@{$s1}); |
| |
| # Compute the mean and standard deviation of the first sample |
| # mean = 1/n * sum(s[i]) |
| my $mean_s1 = 0; |
| foreach my $i (1..$dof) { |
| $mean_s1 += $$s1[$i]; |
| } |
| $mean_s1 /= $dof; |
| |
| # var = 1/(n-1) * sum((mean - s[i])^2) |
| my $var_s1 = 0; |
| foreach my $i (1..$dof) { |
| $var_s1 += ($mean_s1 - $$s1[$i])**2; |
| } |
| $var_s1 /= $dof - 1; |
| |
| # std = sqrt(var) |
| my $std_s1 = sqrt($var_s1); |
| |
| # SA developers like percentage points instead of probabilities. |
| if ( $pct eq "pct" ) { |
| printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1; |
| } else { |
| printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1; |
| } |
| } |