blob: a6c38ed5887f76533ef5296e8c3b0b4d94f9ccfb [file] [log] [blame]
#!/usr/bin/perl -w
#
# Given a spam.log and nonspam.log from a "mass-check --bayes" run,
# work out efficiency. This variant uses static thresholds that
# model more closely what SpamAssassin uses for scoring.
#
# usage: bayes-threshold spam.log nonspam.log
my $spam = $ARGV[0] || "spam.log";
my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
my $hamcutoff = 0.20;
my $spamcutoff = 0.80;
my $nbuckets = 50;
my $range_lo = 0.00;
my $range_hi = 1.00;
my $step = 0.02;
#my $step = ($range_hi - $range_lo) / $nbuckets; # unusable - round errors!
$hamcutoff += 0.0;
$spamcutoff += 0.0;
# shamelessly nicked from spambayes' testing infrastructure; a system to
# compute the "cost" of a pair of thresholds and classifier. I'm supporting
# it here so our stats are (at least a little) comparable against theirs.
my $best_cutoff_fp_weight = 10.0;
my $best_cutoff_fn_weight = 1.0;
my $best_cutoff_unsure_weight = 0.1;
%bux_sp = ();
%bux_ns = ();
my $i;
for ($i = 0; $i <= ($range_hi - $range_lo) / $step; $i++)
{
my $lvl = $range_lo + ($i * $step);
push (@buckets, $lvl);
$bux_ns{$lvl} = $bux_sp{$lvl} = 0;
}
foreach my $file ($spam, $nonspam) {
open (IN, "<$file") || die "Could not open file '$file': $!";
my $isspam = 0; ($file eq $spam) and $isspam = 1;
while (<IN>) {
/^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
my $score = $2+0;
if ($score == 1) { $score = 0.9999999999999; }
my $bucket_id;
foreach my $bucket (@buckets) {
if ($score >= $bucket && $score < $bucket+$step) {
$bucket_id = $bucket; last;
}
}
if (!defined $bucket_id) {
warn "no bucket for $score!";
}
if (!defined $bux_sp{$bucket_id}) {
warn "undef bucket at $bucket_id! (score=$bucket)";
}
if (!defined $bux_ns{$bucket_id}) {
warn "undef bucket at $bucket_id! (score=$bucket)";
}
if ($isspam) {
$bux_sp{$bucket_id}++;
} else {
$bux_ns{$bucket_id}++;
}
}
}
my $max_sp = 0;
my $max_ns = 0;
my $tot_sp = 0;
my $tot_ns = 0;
foreach my $bucket (@buckets) {
$tot_sp += $bux_sp{$bucket};
if ($bux_sp{$bucket} > $max_sp)
{ $max_sp = $bux_sp{$bucket}; }
$tot_ns += $bux_ns{$bucket};
if ($bux_ns{$bucket} > $max_ns)
{ $max_ns = $bux_ns{$bucket}; }
}
foreach my $cutoff (0.3, 0.2, 0.1, 0.04, 0.02) {
$hamcutoff = $cutoff;
$spamcutoff = 1.0 - $cutoff;
my %results = results_for_cutoff ($hamcutoff, $spamcutoff);
write_results (%results);
}
sub results_for_cutoff {
my %results = ();
my $fn = 0;
my $fp = 0;
my $unsure_sp = 0;
my $unsure_ns = 0;
for ($i = $range_lo; $i < $hamcutoff; $i += $step) {
foreach my $bucket (@buckets) {
if ($i >= $bucket && $i < $bucket+$step) {
$fn += $bux_sp{$bucket};
}
}
}
# total up the unsures (between hamcutoff and spamcutoff)
for ($i = $hamcutoff; $i <= $spamcutoff; $i += $step) {
foreach my $bucket (@buckets) {
if ($i >= $bucket && $i < $bucket+$step) {
$unsure_ns += $bux_ns{$bucket};
$unsure_sp += $bux_sp{$bucket};
}
}
}
for ($i = $spamcutoff; $i <= $range_hi; $i += $step) {
foreach my $bucket (@buckets) {
if ($i >= $bucket && $i < $bucket+$step) {
$fp += $bux_ns{$bucket};
}
}
}
my $cost = ($fp * $best_cutoff_fp_weight)
+ ($fn * $best_cutoff_fn_weight)
+ ($unsure_ns * $best_cutoff_unsure_weight)
+ ($unsure_sp * $best_cutoff_unsure_weight);
$results{"$hamcutoff $spamcutoff"} = {
'hamcutoff' => $hamcutoff,
'spamcutoff'=> $spamcutoff,
'cost' => $cost,
'unsure_ns' => $unsure_ns,
'unsure_sp' => $unsure_sp,
'fp' => $fp,
'fn' => $fn
};
return %results;
}
sub write_results {
my (%results) = @_;
foreach my $r (values %results) {
printf "Threshold optimization for hamcutoff=%3.2f, spamcutoff=%3.2f: cost=\$%5.2f\n",
$r->{hamcutoff}, $r->{spamcutoff}, $r->{cost};
printf "Total ham:spam: %d:%d\n", $tot_ns, $tot_sp;
printf "FP: %5d %5.3f%% ", $r->{fp}, ($r->{fp}*100) / $tot_ns;
printf "FN: %5d %5.3f%%\n", $r->{fn}, ($r->{fn}*100) / $tot_sp;
my $unsure = $r->{unsure_ns} + $r->{unsure_sp};
printf "Unsure: %5d %5.3f%% ", $unsure,
($unsure*100) / ($tot_sp+$tot_ns);
printf "(ham: %5d %5.3f%% ", $r->{unsure_ns},
($r->{unsure_ns}*100) / ($tot_ns);
printf "spam: %5d %5.3f%%)\n", $r->{unsure_sp},
($r->{unsure_sp}*100) / ($tot_sp);
# for TCR calc, treat "unsures" as ham
# TODO: unsure_sp should probably be treated as spam, assuming
# it'll fall in the 5.0-6.0 score range
my $fn = $r->{unsure_sp} + $r->{fn};
my $fp = $r->{fp};
printf "TCRs: l=1 %5.3f l=5 %5.3f l=9 %5.3f\n",
tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 1),
tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 5),
tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 9);
printf "SUMMARY: %3.2f/%3.2f fp %5d fn %5d uh %5d us %5d c %5.2f\n",
$r->{hamcutoff}, $r->{spamcutoff},
$r->{fp}, $r->{fn}, $r->{unsure_ns}, $r->{unsure_sp},
$r->{cost};
print "\n";
}
}
sub tcr {
my ($nspamspam, $nspamlegit, $nlegitspam, $nlegitlegit, $lambda) = @_;
my $nlegit = $nlegitspam+$nlegitlegit;
my $nspam = $nspamspam+$nspamlegit;
my $werr = ($lambda * $nlegitspam + $nspamlegit)
/ ($lambda * $nlegit + $nspam);
my $werr_base = $nspam
/ ($lambda * $nlegit + $nspam);
$werr ||= 0.000001; # avoid / by 0
my $tcr = $werr_base / $werr;
#my $sr = ($nspamspam / $nspam) * 100.0;
#my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
$tcr;
}