| #!/usr/bin/perl |
| |
| # This script is used to do a statistical comparative analysis of |
| |
| use Statistics::Distributions; |
| use strict; |
| |
| my $alpha = 0.95; # acceptable confidence in not making a type 1 error |
| # i.e. assume that the means are different |
| my $lambda = 50; # desired lambda for TCR calculation |
| |
| if ( scalar(@ARGV) < 2 ) { |
| print STDERR "Usage: compare-models [validate1] [validate2]\n"; |
| exit 1; |
| } |
| |
| my (@fp1, @fn1, @tcr1); |
| |
| # allow dir names to be specified, too, for brevity |
| if (-f "$ARGV[0]/validate") { $ARGV[0] .= "/validate"; } |
| if (-f "$ARGV[1]/validate") { $ARGV[1] .= "/validate"; } |
| |
| open (FILE, $ARGV[0]) || die $!; |
| while (<FILE>) { |
| my @x = split(/\s+/); |
| push (@fp1, $x[2] / ($x[0] + $x[2])); |
| push (@fn1, $x[3] / ($x[1] + $x[3])); |
| push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2])); |
| } |
| close (FILE); |
| |
| my (@fp2, @fn2, @tcr2); |
| |
| open (FILE, $ARGV[1]) || die $!; |
| while (<FILE>) { |
| my @x = split(/\s+/); |
| push (@fp2, $x[2] / ($x[0] + $x[2])); |
| push (@fn2, $x[3] / ($x[1] + $x[3])); |
| push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2])); |
| } |
| close (FILE); |
| |
| stat_analysis ("False positives", "pct", \@fp1, \@fp2); |
| stat_analysis ("False negatives", "pct", \@fn1, \@fn2); |
| stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2); |
| |
| sub stat_analysis { |
| my $title = shift; |
| my $pct = shift; |
| my $s1 = shift; |
| my $s2 = shift; |
| |
| unless ( scalar(@$s1) == scalar(@$s1) ) { |
| print STDERR "Can't compute stats for $title. Samples are not paired.\n"; |
| return; |
| } |
| |
| # This is the number of degrees of freedom of the two sample sets (i.e. |
| # the number of samples in each set). |
| my $dof = scalar(@{$s1}); |
| |
| print "$title:\n"; |
| |
| # Compute the mean and standard deviation of the first sample |
| # mean = 1/n * sum(s[i]) |
| my $mean_s1 = 0; |
| foreach my $i (1..$dof) { |
| $mean_s1 += $$s1[$i]; |
| } |
| $mean_s1 /= $dof; |
| |
| # var = 1/(n-1) * sum((mean - s[i])^2) |
| my $var_s1 = 0; |
| foreach my $i (1..$dof) { |
| $var_s1 += ($mean_s1 - $$s1[$i])**2; |
| } |
| $var_s1 /= $dof - 1; |
| |
| # std = sqrt(var) |
| my $std_s1 = sqrt($var_s1); |
| |
| # Compute the mean and standard deviation of the second sample |
| # mean = 1/n * sum(s[i]) |
| my $mean_s2 = 0; |
| foreach my $i (1..$dof) { |
| $mean_s2 += $$s2[$i]; |
| } |
| $mean_s2 /= $dof; |
| |
| # var = 1/(n-1) * sum((mean - s[i])^2) |
| my $var_s2 = 0; |
| foreach my $i (1..$dof) { |
| $var_s2 += ($mean_s2 - $$s2[$i])**2; |
| } |
| $var_s2 /= $dof - 1; |
| |
| # std = sqrt(var) |
| my $std_s2 = sqrt($var_s2); |
| |
| # SA developers like percentage points instead of probabilities. |
| if ( $pct eq "pct" ) { |
| printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1; |
| printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2; |
| } else { |
| printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1; |
| printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2; |
| } |
| |
| # Compute the mean of the differences between the two samples |
| my $mean_d = 0; |
| foreach my $i (1..$dof) { |
| $mean_d += $$s1[$i] - $$s2[$i]; |
| } |
| $mean_d /= $dof; |
| |
| # Compute the variance of the differences between the two samples |
| my $var_d = 0; |
| foreach my $i (1..$dof) { |
| $var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2; |
| } |
| $var_d /= $dof - 1; |
| my $std_d = sqrt($var_d); |
| |
| # To determine whether two samples are from the same distribution |
| # (i.e. they have the same mean), we are going to use a paired sample |
| # t-test. You can find more information about this in Tom Mitchell's |
| # "Machine Learning" book. |
| |
| # Let t = mean / (var * sqrt(n)) |
| my $tstat; |
| if ( $var_d > 0 ) { |
| $tstat = $mean_d / sqrt($var_d / $dof); |
| } else { |
| $tstat = 0; |
| } |
| |
| # Now we find the critical value of t for the alpha% confidence |
| # interval. |
| my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2); |
| |
| # This is the probability that the two distributions are from different |
| # means. |
| my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2; |
| |
| # If the t statistic is less than the critical value, this means that |
| # we should accept the null hypothesis (the distributions have the same |
| # mean), otherwise it should be accepted. |
| if ( abs($tstat) < $tcrit ) { |
| printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha; |
| } else { |
| printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob; |
| } |
| |
| # This displays an estimate of the confidence interval around the |
| # estimated mean difference between the two samples. Bear in mind that |
| # the t statistic is working on the local differences and not the |
| # global difference. |
| if ( $pct eq "pct" ) { |
| printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit; |
| } else { |
| printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit; |
| } |
| |
| print "\n"; |
| } |