masses/compare-models - spamassassin - Git at Google

 #!/usr/bin/perl

 # This script is used to do a statistical comparative analysis of

 use Statistics::Distributions;
 use strict;

 my $alpha = 0.95; # acceptable confidence in not making a type 1 error
 		  # i.e. assume that the means are different
 my $lambda = 50;  # desired lambda for TCR calculation

 if ( scalar(@ARGV) < 2 ) {
 	print STDERR "Usage: compare-models [validate1] [validate2]\n";
 	exit 1;
 }

 my (@fp1, @fn1, @tcr1);

 # allow dir names to be specified, too, for brevity
 if (-f "$ARGV[0]/validate") { $ARGV[0] .= "/validate"; }
 if (-f "$ARGV[1]/validate") { $ARGV[1] .= "/validate"; }

 open (FILE, $ARGV[0]) || die $!;
 while (<FILE>) {
 	my @x = split(/\s+/);
 	push (@fp1, $x[2] / ($x[0] + $x[2]));
 	push (@fn1, $x[3] / ($x[1] + $x[3]));
 	push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
 }
 close (FILE);

 my (@fp2, @fn2, @tcr2);

 open (FILE, $ARGV[1]) || die $!;
 while (<FILE>) {
 	my @x = split(/\s+/);
 	push (@fp2, $x[2] / ($x[0] + $x[2]));
 	push (@fn2, $x[3] / ($x[1] + $x[3]));
 	push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
 }
 close (FILE);

 stat_analysis ("False positives", "pct", \@fp1, \@fp2);
 stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
 stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);

 sub stat_analysis {
 	my $title = shift;
 	my $pct = shift;
 	my $s1 = shift;
 	my $s2 = shift;

 	unless ( scalar(@$s1) == scalar(@$s1) ) {
 		print STDERR "Can't compute stats for $title.  Samples are not paired.\n";
 		return;
 	}

 	# This is the number of degrees of freedom of the two sample sets (i.e.
 	# the number of samples in each set).
 	my $dof = scalar(@{$s1});

 	print "$title:\n";

 	# Compute the mean and standard deviation of the first sample
 	# mean = 1/n * sum(s[i])
 	my $mean_s1 = 0;
 	foreach my $i (1..$dof) {
 		$mean_s1 += $$s1[$i];
 	}
 	$mean_s1 /= $dof;

 	# var = 1/(n-1) * sum((mean - s[i])^2)
 	my $var_s1 = 0;
 	foreach my $i (1..$dof) {
 		$var_s1 += ($mean_s1 - $$s1[$i])**2;
 	}
 	$var_s1 /= $dof - 1;

 	# std = sqrt(var)
 	my $std_s1 = sqrt($var_s1);

 	# Compute the mean and standard deviation of the second sample
 	# mean = 1/n * sum(s[i])
 	my $mean_s2 = 0;
 	foreach my $i (1..$dof) {
 		$mean_s2 += $$s2[$i];
 	}
 	$mean_s2 /= $dof;

 	# var = 1/(n-1) * sum((mean - s[i])^2)
 	my $var_s2 = 0;
 	foreach my $i (1..$dof) {
 		$var_s2 += ($mean_s2 - $$s2[$i])**2;
 	}
 	$var_s2 /= $dof - 1;

 	# std = sqrt(var)
 	my $std_s2 = sqrt($var_s2);

 	# SA developers like percentage points instead of probabilities.
 	if ( $pct eq "pct" ) {
 		printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
 		printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
 	} else {
 		printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
 		printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
 	}

 	# Compute the mean of the differences between the two samples
 	my $mean_d = 0;
 	foreach my $i (1..$dof) {
 		$mean_d += $$s1[$i] - $$s2[$i];
 	}
 	$mean_d /= $dof;

 	# Compute the variance of the differences between the two samples
 	my $var_d = 0;
 	foreach my $i (1..$dof) {
 		$var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
 	}
 	$var_d /= $dof - 1;
 	my $std_d = sqrt($var_d);

 	# To determine whether two samples are from the same distribution
 	# (i.e. they have the same mean), we are going to use a paired sample
 	# t-test.  You can find more information about this in Tom Mitchell's
 	# "Machine Learning" book.

 	# Let t = mean / (var * sqrt(n))
 	my $tstat;
 	if ( $var_d > 0 ) {
 		$tstat = $mean_d / sqrt($var_d / $dof);
 	} else {
 		$tstat = 0;
 	}

 	# Now we find the critical value of t for the alpha% confidence
 	# interval.
 	my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);

 	# This is the probability that the two distributions are from different
 	# means.
 	my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;

 	# If the t statistic is less than the critical value, this means that
 	# we should accept the null hypothesis (the distributions have the same
 	# mean), otherwise it should be accepted.
 	if ( abs($tstat) < $tcrit ) {
 		printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
 	} else {
 		printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
 	}

 	# This displays an estimate of the confidence interval around the
 	# estimated mean difference between the two samples.  Bear in mind that
 	# the t statistic is working on the local differences and not the
 	# global difference.
 	if ( $pct eq "pct" ) {
 		printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
 	} else {
 		printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
 	}

 	print "\n";
 }
	#!/usr/bin/perl

	# This script is used to do a statistical comparative analysis of

	use Statistics::Distributions;
	use strict;

	my $alpha = 0.95; # acceptable confidence in not making a type 1 error
	# i.e. assume that the means are different
	my $lambda = 50; # desired lambda for TCR calculation

	if ( scalar(@ARGV) < 2 ) {
	print STDERR "Usage: compare-models [validate1] [validate2]\n";
	exit 1;
	}

	my (@fp1, @fn1, @tcr1);

	# allow dir names to be specified, too, for brevity
	if (-f "$ARGV[0]/validate") { $ARGV[0] .= "/validate"; }
	if (-f "$ARGV[1]/validate") { $ARGV[1] .= "/validate"; }

	open (FILE, $ARGV[0]) \|\| die $!;
	while (<FILE>) {
	my @x = split(/\s+/);
	push (@fp1, $x[2] / ($x[0] + $x[2]));
	push (@fn1, $x[3] / ($x[1] + $x[3]));
	push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
	}
	close (FILE);

	my (@fp2, @fn2, @tcr2);

	open (FILE, $ARGV[1]) \|\| die $!;
	while (<FILE>) {
	my @x = split(/\s+/);
	push (@fp2, $x[2] / ($x[0] + $x[2]));
	push (@fn2, $x[3] / ($x[1] + $x[3]));
	push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
	}
	close (FILE);

	stat_analysis ("False positives", "pct", \@fp1, \@fp2);
	stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
	stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);

	sub stat_analysis {
	my $title = shift;
	my $pct = shift;
	my $s1 = shift;
	my $s2 = shift;

	unless ( scalar(@$s1) == scalar(@$s1) ) {
	print STDERR "Can't compute stats for $title. Samples are not paired.\n";
	return;
	}

	# This is the number of degrees of freedom of the two sample sets (i.e.
	# the number of samples in each set).
	my $dof = scalar(@{$s1});

	print "$title:\n";

	# Compute the mean and standard deviation of the first sample
	# mean = 1/n * sum(s[i])
	my $mean_s1 = 0;
	foreach my $i (1..$dof) {
	$mean_s1 += $$s1[$i];
	}
	$mean_s1 /= $dof;

	# var = 1/(n-1) * sum((mean - s[i])^2)
	my $var_s1 = 0;
	foreach my $i (1..$dof) {
	$var_s1 += ($mean_s1 - $$s1[$i])**2;
	}
	$var_s1 /= $dof - 1;

	# std = sqrt(var)
	my $std_s1 = sqrt($var_s1);

	# Compute the mean and standard deviation of the second sample
	# mean = 1/n * sum(s[i])
	my $mean_s2 = 0;
	foreach my $i (1..$dof) {
	$mean_s2 += $$s2[$i];
	}
	$mean_s2 /= $dof;

	# var = 1/(n-1) * sum((mean - s[i])^2)
	my $var_s2 = 0;
	foreach my $i (1..$dof) {
	$var_s2 += ($mean_s2 - $$s2[$i])**2;
	}
	$var_s2 /= $dof - 1;

	# std = sqrt(var)
	my $std_s2 = sqrt($var_s2);

	# SA developers like percentage points instead of probabilities.
	if ( $pct eq "pct" ) {
	printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100$mean_s1,100$std_s1;
	printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100$mean_s2,100$std_s2;
	} else {
	printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
	printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
	}

	# Compute the mean of the differences between the two samples
	my $mean_d = 0;
	foreach my $i (1..$dof) {
	$mean_d += $$s1[$i] - $$s2[$i];
	}
	$mean_d /= $dof;

	# Compute the variance of the differences between the two samples
	my $var_d = 0;
	foreach my $i (1..$dof) {
	$var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
	}
	$var_d /= $dof - 1;
	my $std_d = sqrt($var_d);

	# To determine whether two samples are from the same distribution
	# (i.e. they have the same mean), we are going to use a paired sample
	# t-test. You can find more information about this in Tom Mitchell's
	# "Machine Learning" book.

	# Let t = mean / (var * sqrt(n))
	my $tstat;
	if ( $var_d > 0 ) {
	$tstat = $mean_d / sqrt($var_d / $dof);
	} else {
	$tstat = 0;
	}

	# Now we find the critical value of t for the alpha% confidence
	# interval.
	my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);

	# This is the probability that the two distributions are from different
	# means.
	my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;

	# If the t statistic is less than the critical value, this means that
	# we should accept the null hypothesis (the distributions have the same
	# mean), otherwise it should be accepted.
	if ( abs($tstat) < $tcrit ) {
	printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
	} else {
	printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
	}

	# This displays an estimate of the confidence interval around the
	# estimated mean difference between the two samples. Bear in mind that
	# the t statistic is working on the local differences and not the
	# global difference.
	if ( $pct eq "pct" ) {
	printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100$mean_d, 100$std_d*$tcrit;
	} else {
	printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
	}

	print "\n";
	}