masses/model-statistics - spamassassin - Git at Google

 #!/usr/bin/perl

 # This script is used to print some statistics about classification accuracy
 # with a k-fold cross validation

 use strict;

 my $lambda = 50;  # desired lambda for TCR calculation

 if ( scalar(@ARGV) < 1 ) {
 	print STDERR "Usage: model-statistics [validate]\n";
 	exit 1;
 }

 # One entry per input line: false positive rate, false negative rate and
 # total cost ratio (TCR).
 my (@fp1, @fn1, @tcr1);

 # Three-argument open with a lexical handle, so that a file name beginning
 # with a mode character (">", "|", ...) cannot change how the file is opened.
 my $validate_file = $ARGV[0];
 open (my $validate_fh, '<', $validate_file)
 	or die "Cannot open $validate_file: $!\n";

 while (my $line = <$validate_fh>) {
 	# split ' ' ignores leading whitespace; a /\s+/ pattern would produce an
 	# empty first field on an indented line and shift every column by one.
 	my @x = split(' ', $line);
 	next unless @x;  # skip blank lines

 	# Columns 2 and 3 are the false positive and false negative counts;
 	# columns 0 and 1 are presumably the correctly classified counts of the
 	# corresponding classes -- TODO confirm against the file's producer.
 	my $fp_den  = $x[0] + $x[2];
 	my $fn_den  = $x[1] + $x[3];
 	my $tcr_den = $x[3] + $lambda * $x[2];

 	# Fail with the offending location instead of a bare
 	# "Illegal division by zero".
 	if ( !$fp_den || !$fn_den || !$tcr_den ) {
 		die "$validate_file line $.: zero denominator, cannot compute rates\n";
 	}

 	push (@fp1, $x[2] / $fp_den);
 	push (@fn1, $x[3] / $fn_den);
 	push (@tcr1, $x[1] / $tcr_den);
 }
 close ($validate_fh);

 stat_analysis ("False positives", "pct", \@fp1);
 stat_analysis ("False negatives", "pct", \@fn1);
 stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);

 # Print the mean and sample standard deviation of a list of values.
 #
 # Arguments:
 #   $title - label printed at the start of the output line
 #   $pct   - "pct" to scale both figures by 100 and print the mean with a
 #            trailing percent sign; any other value prints them unscaled
 #   $s1    - reference to the array of samples
 #
 # Prints one line to the currently selected output handle.  An empty sample
 # set produces a message on STDERR and no output line.
 sub stat_analysis {
 	my ($title, $pct, $s1) = @_;

 	# Number of samples in the set.
 	my $n = scalar(@{$s1});
 	if ( $n == 0 ) {
 		print STDERR "$title: no samples to analyse\n";
 		return;
 	}

 	# mean = 1/n * sum(s[i])
 	# Iterate over the elements themselves: Perl arrays are 0-based, so an
 	# index range of 1..n would skip the first sample and read past the last.
 	my $mean_s1 = 0;
 	foreach my $sample (@{$s1}) {
 		$mean_s1 += $sample;
 	}
 	$mean_s1 /= $n;

 	# var = 1/(n-1) * sum((mean - s[i])^2)
 	# With a single sample the n-1 divisor is zero, so the spread is
 	# reported as zero instead of dividing.
 	my $var_s1 = 0;
 	if ( $n > 1 ) {
 		foreach my $sample (@{$s1}) {
 			$var_s1 += ($mean_s1 - $sample)**2;
 		}
 		$var_s1 /= $n - 1;
 	}

 	# std = sqrt(var)
 	my $std_s1 = sqrt($var_s1);

 	# SA developers like percentage points instead of probabilities.
 	if ( $pct eq "pct" ) {
 		printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
 	} else {
 		printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
 	}
 }
	#!/usr/bin/perl

	# This script is used to print some statistics about classification accuracy
	# with a k-fold cross validation

	use strict;

	my $lambda = 50; # desired lambda for TCR calculation

	if ( scalar(@ARGV) < 1 ) {
		print STDERR "Usage: model-statistics [validate]\n";
		exit 1;
	}

	# One entry per input line: false positive rate, false negative rate and
	# total cost ratio (TCR).
	my (@fp1, @fn1, @tcr1);

	# Three-argument open with a lexical handle, so that a file name beginning
	# with a mode character (">", "|", ...) cannot change how the file is opened.
	my $validate_file = $ARGV[0];
	open (my $validate_fh, '<', $validate_file)
		or die "Cannot open $validate_file: $!\n";

	while (my $line = <$validate_fh>) {
		# split ' ' ignores leading whitespace; a /\s+/ pattern would produce an
		# empty first field on an indented line and shift every column by one.
		my @x = split(' ', $line);
		next unless @x;  # skip blank lines

		# Columns 2 and 3 are the false positive and false negative counts;
		# columns 0 and 1 are presumably the correctly classified counts of the
		# corresponding classes -- TODO confirm against the file's producer.
		my $fp_den  = $x[0] + $x[2];
		my $fn_den  = $x[1] + $x[3];
		my $tcr_den = $x[3] + $lambda * $x[2];

		# Fail with the offending location instead of a bare
		# "Illegal division by zero".
		if ( !$fp_den || !$fn_den || !$tcr_den ) {
			die "$validate_file line $.: zero denominator, cannot compute rates\n";
		}

		push (@fp1, $x[2] / $fp_den);
		push (@fn1, $x[3] / $fn_den);
		push (@tcr1, $x[1] / $tcr_den);
	}
	close ($validate_fh);

	stat_analysis ("False positives", "pct", \@fp1);
	stat_analysis ("False negatives", "pct", \@fn1);
	stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);

	# Print the mean and sample standard deviation of a list of values.
	#
	# Arguments:
	#   $title - label printed at the start of the output line
	#   $pct   - "pct" to scale both figures by 100 and print the mean with a
	#            trailing percent sign; any other value prints them unscaled
	#   $s1    - reference to the array of samples
	#
	# Prints one line to the currently selected output handle.  An empty sample
	# set produces a message on STDERR and no output line.
	sub stat_analysis {
		my ($title, $pct, $s1) = @_;

		# Number of samples in the set.
		my $n = scalar(@{$s1});
		if ( $n == 0 ) {
			print STDERR "$title: no samples to analyse\n";
			return;
		}

		# mean = 1/n * sum(s[i])
		# Iterate over the elements themselves: Perl arrays are 0-based, so an
		# index range of 1..n would skip the first sample and read past the last.
		my $mean_s1 = 0;
		foreach my $sample (@{$s1}) {
			$mean_s1 += $sample;
		}
		$mean_s1 /= $n;

		# var = 1/(n-1) * sum((mean - s[i])^2)
		# With a single sample the n-1 divisor is zero, so the spread is
		# reported as zero instead of dividing.
		my $var_s1 = 0;
		if ( $n > 1 ) {
			foreach my $sample (@{$s1}) {
				$var_s1 += ($mean_s1 - $sample)**2;
			}
			$var_s1 /= $n - 1;
		}

		# std = sqrt(var)
		my $std_s1 = sqrt($var_s1);

		# SA developers like percentage points instead of probabilities.
		if ( $pct eq "pct" ) {
			printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
		} else {
			printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
		}
	}