blob: 588c11e65120d80b3560aabb9858b22a6bf95796 [file] [log] [blame]
#!/usr/bin/perl
# This script is used to print some statistics about classification accuracy
# with a k-fold cross validation
use strict;
use warnings;

# Weight applied to the third input column in the TCR denominator below.
my $lambda = 50; # desired lambda for TCR calculation

if ( scalar(@ARGV) < 1 ) {
    print STDERR "Usage: model-statistics [validate]\n";
    exit 1;
}

# One value is appended to each list per usable input line.
my (@fp1, @fn1, @tcr1);

# Three-argument open with a lexical handle: the file name can never be
# misinterpreted as an open mode, and the handle is scoped to this script.
my $path = $ARGV[0];
open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";

while (my $line = <$fh>) {
    # split ' ' discards leading whitespace, so an indented line does not
    # yield an empty first field that would shift every column by one.
    my @x = split(' ', $line);

    # Blank lines (e.g. a trailing newline at end of file) carry no data.
    next unless @x;

    # All three formulas below need four columns.
    # NOTE(review): from the formulas the columns look like
    # (true neg, true pos, false pos, false neg) counts -- confirm against
    # whatever produces this file.
    if ( @x < 4 ) {
        warn "$path line $.: expected 4 fields, got " . scalar(@x)
            . "; line skipped\n";
        next;
    }

    my $fp_den  = $x[0] + $x[2];
    my $fn_den  = $x[1] + $x[3];
    my $tcr_den = $x[3] + $lambda * $x[2];

    # A zero denominator was already fatal; report where it happened
    # instead of a bare "Illegal division by zero".
    if ( $fp_den == 0 || $fn_den == 0 || $tcr_den == 0 ) {
        die "$path line $.: zero denominator, cannot compute rates\n";
    }

    push (@fp1,  $x[2] / $fp_den);
    push (@fn1,  $x[3] / $fn_den);
    push (@tcr1, $x[1] / $tcr_den);
}
close($fh);

stat_analysis ("False positives", "pct", \@fp1);
stat_analysis ("False negatives", "pct", \@fn1);
stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);
# stat_analysis($title, $pct, $s1)
#
# Print the mean and the sample standard deviation of a list of numbers
# to STDOUT on a single line.
#
#   $title - label printed at the start of the output line
#   $pct   - "pct" prints both values multiplied by 100 (the mean followed
#            by a '%' sign); any other value prints them unscaled
#   $s1    - reference to an array of numeric samples
#
# An empty sample set produces a warning on STDERR and no output line.
# A single sample has no defined sample deviation (the n-1 divisor would be
# zero), so its std is printed as 0.
sub stat_analysis {
    my $title = shift;
    my $pct   = shift;
    my $s1    = shift;

    # Number of samples in the set.
    my $n = scalar(@{$s1});
    if ( $n == 0 ) {
        warn "$title: no samples, nothing to report\n";
        return;
    }

    # mean = 1/n * sum(s[i])
    # Iterate over the elements themselves: Perl arrays are 0-based, so an
    # index range of 1..n would skip the first sample and read past the end.
    my $mean_s1 = 0;
    foreach my $sample (@{$s1}) {
        $mean_s1 += $sample;
    }
    $mean_s1 /= $n;

    # var = 1/(n-1) * sum((mean - s[i])^2)
    my $var_s1 = 0;
    if ( $n > 1 ) {
        foreach my $sample (@{$s1}) {
            $var_s1 += ($mean_s1 - $sample)**2;
        }
        $var_s1 /= $n - 1;
    }

    # std = sqrt(var)
    my $std_s1 = sqrt($var_s1);

    # SA developers like percentage points instead of probabilities.
    if ( $pct eq "pct" ) {
        printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
    } else {
        printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
    }
    return;
}