masses/bayes-testing/bayes-thresholds - spamassassin - Git at Google

 #!/usr/bin/perl -w
 #
 # Given a spam.log and nonspam.log from a "mass-check --bayes" run,
 # work out optimum spam/ham cutoffs to maximise efficiency.
 #
 # usage: bayes-threshold spam.log nonspam.log

 my $spam = $ARGV[0] || "spam.log";
 my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");

 my $nbuckets = 50;

 my $range_lo = 0.00;
 my $range_hi = 1.00;

 # shamelessly nicked from spambayes' testing infrastructure; a system to
 # compute the "cost" of a pair of thresholds and classifier.  I'm supporting
 # it here so our stats are (at least a little) comparable against theirs.
 #
 # Note that they use a different way to run 10PCV tests; they train with 1
 # bucket and test against 9, whereas the lit suggests doing the opposite,
 # which is what we do; so the stats may still be unportable due to us getting
 # better training and less testing.  TODO

 my $best_cutoff_fp_weight = 10.0;
 my $best_cutoff_fn_weight = 1.0;
 my $best_cutoff_unsure_weight = 0.2;

 %bux_sp = ();
 %bux_ns = ();

 my $step = ($range_hi - $range_lo) / $nbuckets;
 my $i;
 for ($i = $range_lo; $i <= $range_hi; $i += $step) {
   push (@buckets, $i);
   $bux_ns{$i} = $bux_sp{$i} = 0;
 }

 foreach my $file ($spam, $nonspam) {
   open (IN, "<$file") || die "Could not open file '$file': $!";

   my $isspam = 0; ($file eq $spam) and $isspam = 1;

   while (<IN>) {
     /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
     my $score = $2+0;

     my $bucket_id;
     foreach my $bucket (@buckets) {
       if ($score >= $bucket && $score < $bucket+$step) {
         $bucket_id = $bucket; last;
       }
     }

     if ($isspam) {
       $bux_sp{$bucket_id}++;
     } else {
       $bux_ns{$bucket_id}++;
     }
   }
 }

 my $max_sp = 0;
 my $max_ns = 0;
 my $tot_sp = 0;
 my $tot_ns = 0;
 foreach my $bucket (@buckets) {
   $tot_sp += $bux_sp{$bucket};
   if ($bux_sp{$bucket} > $max_sp)
                         { $max_sp = $bux_sp{$bucket}; }
   $tot_ns += $bux_ns{$bucket};
   if ($bux_ns{$bucket} > $max_ns)
                         { $max_ns = $bux_ns{$bucket}; }
 }

 my %results = ();
 for ($hamcutoff = 0; $hamcutoff < 1; $hamcutoff += $step) {
   for ($spamcutoff = 0; $spamcutoff < 1; $spamcutoff += $step) {
     my $fn = 0;
     my $fp = 0;
     my $unsure_sp = 0;
     my $unsure_ns = 0;

     for ($i = $range_lo; $i < $hamcutoff; $i += $step) {
       $fn += $bux_sp{$i};
     }
     # total up the unsures (between hamcutoff and spamcutoff)
     for ($i = $hamcutoff; $i < $spamcutoff; $i += $step) {
       $unsure_ns += $bux_ns{$i};
       $unsure_sp += $bux_sp{$i};
     }
     for ($i = $spamcutoff; $i < $range_hi; $i += $step) {
       $fp += $bux_ns{$i};
     }

     my $cost = ($fp * $best_cutoff_fp_weight)
 		+ ($fn * $best_cutoff_fn_weight)
 		+ (($unsure_ns+$unsure_sp) * $best_cutoff_unsure_weight);

     $results{"$hamcutoff $spamcutoff"} = {
       'hamcutoff' => $hamcutoff,
       'spamcutoff'=> $spamcutoff,
       'cost' => $cost,
       'unsure_ns' => $unsure_ns,
       'unsure_sp' => $unsure_sp,
       'fp' => $fp,
       'fn' => $fn
     };
   }
 }

 my $count = 10;
 foreach my $r (sort { $a->{cost} <=> $b->{cost} } values %results) {
   printf "Threshold optimization for hamcutoff=%3.2f, spamcutoff=%3.2f: cost=\$%5.2f\n",
 		$r->{hamcutoff}, $r->{spamcutoff}, $r->{cost};
   printf "Total ham:spam:   %d:%d\n", $tot_ns, $tot_sp;

   printf "FP: %5d %5.3f%%    ", $r->{fp}, ($r->{fp}*100) / $tot_ns;
   printf "FN: %5d %5.3f%%\n", $r->{fn}, ($r->{fn}*100) / $tot_sp;

   my $unsure = $r->{unsure_ns} + $r->{unsure_sp};
   printf "Unsure: %5d %5.3f%%     ", $unsure,
 				($unsure*100) / ($tot_sp+$tot_ns);
   printf "(ham: %5d %5.3f%%    ", $r->{unsure_ns},
 				($r->{unsure_ns}*100) / ($tot_ns);
   printf "spam: %5d %5.3f%%)\n", $r->{unsure_sp},
 				($r->{unsure_sp}*100) / ($tot_sp);

   # for TCR calc, treat "unsures" as ham
   # TODO: unsure_sp should probably be treated as spam, assuming
   # it'll fall in the 5.0-6.0 score range
   my $fn = $r->{unsure_sp} + $r->{fn};
   my $fp = $r->{fp};
   printf "TCRs:              l=1 %5.3f    l=5 %5.3f    l=9 %5.3f\n",
     tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 1),
     tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 5),
     tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 9);

   print "\n";
   last if ($count-- < 0);
 }


 sub tcr {
   my ($nspamspam, $nspamlegit, $nlegitspam, $nlegitlegit, $lambda) = @_;

   my $nlegit = $nlegitspam+$nlegitlegit;
   my $nspam = $nspamspam+$nspamlegit;

   my $werr = ($lambda * $nlegitspam + $nspamlegit)
 		  / ($lambda * $nlegit + $nspam);

   my $werr_base = $nspam
 		  / ($lambda * $nlegit + $nspam);

   $werr ||= 0.000001;     # avoid / by 0
   my $tcr = $werr_base / $werr;

 #my $sr = ($nspamspam / $nspam) * 100.0;
 #my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;

   $tcr;
 }
	#!/usr/bin/perl -w
	#
	# Given a spam.log and nonspam.log from a "mass-check --bayes" run,
	# work out optimum spam/ham cutoffs to maximise efficiency.
	#
	# usage: bayes-threshold spam.log nonspam.log

	my $spam = $ARGV[0] \|\| "spam.log";
	my $nonspam = $ARGV[1] \|\| (-f "good.log" ? "good.log" : "nonspam.log");

	my $nbuckets = 50;

	my $range_lo = 0.00;
	my $range_hi = 1.00;

	# shamelessly nicked from spambayes' testing infrastructure; a system to
	# compute the "cost" of a pair of thresholds and classifier. I'm supporting
	# it here so our stats are (at least a little) comparable against theirs.
	#
	# Note that they use a different way to run 10PCV tests; they train with 1
	# bucket and test against 9, whereas the lit suggests doing the opposite,
	# which is what we do; so the stats may still be unportable due to us getting
	# better training and less testing. TODO

	my $best_cutoff_fp_weight = 10.0;
	my $best_cutoff_fn_weight = 1.0;
	my $best_cutoff_unsure_weight = 0.2;

	%bux_sp = ();
	%bux_ns = ();

	my $step = ($range_hi - $range_lo) / $nbuckets;
	my $i;
	for ($i = $range_lo; $i <= $range_hi; $i += $step) {
	push (@buckets, $i);
	$bux_ns{$i} = $bux_sp{$i} = 0;
	}

	foreach my $file ($spam, $nonspam) {
	open (IN, "<$file") \|\| die "Could not open file '$file': $!";

	my $isspam = 0; ($file eq $spam) and $isspam = 1;

	while (<IN>) {
	/^(\.\|Y)\s.+bayes=([^\s,]+)/ or next;
	my $score = $2+0;

	my $bucket_id;
	foreach my $bucket (@buckets) {
	if ($score >= $bucket && $score < $bucket+$step) {
	$bucket_id = $bucket; last;
	}
	}

	if ($isspam) {
	$bux_sp{$bucket_id}++;
	} else {
	$bux_ns{$bucket_id}++;
	}
	}
	}

	my $max_sp = 0;
	my $max_ns = 0;
	my $tot_sp = 0;
	my $tot_ns = 0;
	foreach my $bucket (@buckets) {
	$tot_sp += $bux_sp{$bucket};
	if ($bux_sp{$bucket} > $max_sp)
	{ $max_sp = $bux_sp{$bucket}; }
	$tot_ns += $bux_ns{$bucket};
	if ($bux_ns{$bucket} > $max_ns)
	{ $max_ns = $bux_ns{$bucket}; }
	}

	my %results = ();
	for ($hamcutoff = 0; $hamcutoff < 1; $hamcutoff += $step) {
	for ($spamcutoff = 0; $spamcutoff < 1; $spamcutoff += $step) {
	my $fn = 0;
	my $fp = 0;
	my $unsure_sp = 0;
	my $unsure_ns = 0;

	for ($i = $range_lo; $i < $hamcutoff; $i += $step) {
	$fn += $bux_sp{$i};
	}
	# total up the unsures (between hamcutoff and spamcutoff)
	for ($i = $hamcutoff; $i < $spamcutoff; $i += $step) {
	$unsure_ns += $bux_ns{$i};
	$unsure_sp += $bux_sp{$i};
	}
	for ($i = $spamcutoff; $i < $range_hi; $i += $step) {
	$fp += $bux_ns{$i};
	}

	my $cost = ($fp * $best_cutoff_fp_weight)
	+ ($fn * $best_cutoff_fn_weight)
	+ (($unsure_ns+$unsure_sp) * $best_cutoff_unsure_weight);

	$results{"$hamcutoff $spamcutoff"} = {
	'hamcutoff' => $hamcutoff,
	'spamcutoff'=> $spamcutoff,
	'cost' => $cost,
	'unsure_ns' => $unsure_ns,
	'unsure_sp' => $unsure_sp,
	'fp' => $fp,
	'fn' => $fn
	};
	}
	}

	my $count = 10;
	foreach my $r (sort { $a->{cost} <=> $b->{cost} } values %results) {
	printf "Threshold optimization for hamcutoff=%3.2f, spamcutoff=%3.2f: cost=\$%5.2f\n",
	$r->{hamcutoff}, $r->{spamcutoff}, $r->{cost};
	printf "Total ham:spam: %d:%d\n", $tot_ns, $tot_sp;

	printf "FP: %5d %5.3f%% ", $r->{fp}, ($r->{fp}*100) / $tot_ns;
	printf "FN: %5d %5.3f%%\n", $r->{fn}, ($r->{fn}*100) / $tot_sp;

	my $unsure = $r->{unsure_ns} + $r->{unsure_sp};
	printf "Unsure: %5d %5.3f%% ", $unsure,
	($unsure*100) / ($tot_sp+$tot_ns);
	printf "(ham: %5d %5.3f%% ", $r->{unsure_ns},
	($r->{unsure_ns}*100) / ($tot_ns);
	printf "spam: %5d %5.3f%%)\n", $r->{unsure_sp},
	($r->{unsure_sp}*100) / ($tot_sp);

	# for TCR calc, treat "unsures" as ham
	# TODO: unsure_sp should probably be treated as spam, assuming
	# it'll fall in the 5.0-6.0 score range
	my $fn = $r->{unsure_sp} + $r->{fn};
	my $fp = $r->{fp};
	printf "TCRs: l=1 %5.3f l=5 %5.3f l=9 %5.3f\n",
	tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 1),
	tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 5),
	tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 9);

	print "\n";
	last if ($count-- < 0);
	}


	sub tcr {
	my ($nspamspam, $nspamlegit, $nlegitspam, $nlegitlegit, $lambda) = @_;

	my $nlegit = $nlegitspam+$nlegitlegit;
	my $nspam = $nspamspam+$nspamlegit;

	my $werr = ($lambda * $nlegitspam + $nspamlegit)
	/ ($lambda * $nlegit + $nspam);

	my $werr_base = $nspam
	/ ($lambda * $nlegit + $nspam);

	$werr \|\|= 0.000001; # avoid / by 0
	my $tcr = $werr_base / $werr;

	#my $sr = ($nspamspam / $nspam) * 100.0;
	#my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;

	$tcr;
	}