blob: 86d6c0ea2805f154bbc2300e5a855f41a7205cda [file] [log] [blame]
#!/usr/bin/perl -w
#
# Given a spam.log and nonspam.log from a "mass-check --bayes" run,
# work out optimum spam/ham cutoffs to maximise efficiency.
#
# usage: bayes-threshold spam.log nonspam.log
my $spam = $ARGV[0] || "spam.log";
my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
my $nbuckets = 50;
my $range_lo = 0.00;
my $range_hi = 1.00;
# shamelessly nicked from spambayes' testing infrastructure; a system to
# compute the "cost" of a pair of thresholds and classifier. I'm supporting
# it here so our stats are (at least a little) comparable against theirs.
#
# Note that they use a different way to run 10PCV tests; they train with 1
# bucket and test against 9, whereas the lit suggests doing the opposite,
# which is what we do; so the stats may still be unportable due to us getting
# better training and less testing. TODO
my $best_cutoff_fp_weight = 10.0;
my $best_cutoff_fn_weight = 1.0;
my $best_cutoff_unsure_weight = 0.2;
%bux_sp = ();
%bux_ns = ();
my $step = ($range_hi - $range_lo) / $nbuckets;
my $i;
for ($i = $range_lo; $i <= $range_hi; $i += $step) {
push (@buckets, $i);
$bux_ns{$i} = $bux_sp{$i} = 0;
}
foreach my $file ($spam, $nonspam) {
open (IN, "<$file") || die "Could not open file '$file': $!";
my $isspam = 0; ($file eq $spam) and $isspam = 1;
while (<IN>) {
/^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
my $score = $2+0;
my $bucket_id;
foreach my $bucket (@buckets) {
if ($score >= $bucket && $score < $bucket+$step) {
$bucket_id = $bucket; last;
}
}
if ($isspam) {
$bux_sp{$bucket_id}++;
} else {
$bux_ns{$bucket_id}++;
}
}
}
my $max_sp = 0;
my $max_ns = 0;
my $tot_sp = 0;
my $tot_ns = 0;
foreach my $bucket (@buckets) {
$tot_sp += $bux_sp{$bucket};
if ($bux_sp{$bucket} > $max_sp)
{ $max_sp = $bux_sp{$bucket}; }
$tot_ns += $bux_ns{$bucket};
if ($bux_ns{$bucket} > $max_ns)
{ $max_ns = $bux_ns{$bucket}; }
}
my %results = ();
for ($hamcutoff = 0; $hamcutoff < 1; $hamcutoff += $step) {
for ($spamcutoff = 0; $spamcutoff < 1; $spamcutoff += $step) {
my $fn = 0;
my $fp = 0;
my $unsure_sp = 0;
my $unsure_ns = 0;
for ($i = $range_lo; $i < $hamcutoff; $i += $step) {
$fn += $bux_sp{$i};
}
# total up the unsures (between hamcutoff and spamcutoff)
for ($i = $hamcutoff; $i < $spamcutoff; $i += $step) {
$unsure_ns += $bux_ns{$i};
$unsure_sp += $bux_sp{$i};
}
for ($i = $spamcutoff; $i < $range_hi; $i += $step) {
$fp += $bux_ns{$i};
}
my $cost = ($fp * $best_cutoff_fp_weight)
+ ($fn * $best_cutoff_fn_weight)
+ (($unsure_ns+$unsure_sp) * $best_cutoff_unsure_weight);
$results{"$hamcutoff $spamcutoff"} = {
'hamcutoff' => $hamcutoff,
'spamcutoff'=> $spamcutoff,
'cost' => $cost,
'unsure_ns' => $unsure_ns,
'unsure_sp' => $unsure_sp,
'fp' => $fp,
'fn' => $fn
};
}
}
my $count = 10;
foreach my $r (sort { $a->{cost} <=> $b->{cost} } values %results) {
printf "Threshold optimization for hamcutoff=%3.2f, spamcutoff=%3.2f: cost=\$%5.2f\n",
$r->{hamcutoff}, $r->{spamcutoff}, $r->{cost};
printf "Total ham:spam: %d:%d\n", $tot_ns, $tot_sp;
printf "FP: %5d %5.3f%% ", $r->{fp}, ($r->{fp}*100) / $tot_ns;
printf "FN: %5d %5.3f%%\n", $r->{fn}, ($r->{fn}*100) / $tot_sp;
my $unsure = $r->{unsure_ns} + $r->{unsure_sp};
printf "Unsure: %5d %5.3f%% ", $unsure,
($unsure*100) / ($tot_sp+$tot_ns);
printf "(ham: %5d %5.3f%% ", $r->{unsure_ns},
($r->{unsure_ns}*100) / ($tot_ns);
printf "spam: %5d %5.3f%%)\n", $r->{unsure_sp},
($r->{unsure_sp}*100) / ($tot_sp);
# for TCR calc, treat "unsures" as ham
# TODO: unsure_sp should probably be treated as spam, assuming
# it'll fall in the 5.0-6.0 score range
my $fn = $r->{unsure_sp} + $r->{fn};
my $fp = $r->{fp};
printf "TCRs: l=1 %5.3f l=5 %5.3f l=9 %5.3f\n",
tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 1),
tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 5),
tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 9);
print "\n";
last if ($count-- < 0);
}
sub tcr {
my ($nspamspam, $nspamlegit, $nlegitspam, $nlegitlegit, $lambda) = @_;
my $nlegit = $nlegitspam+$nlegitlegit;
my $nspam = $nspamspam+$nspamlegit;
my $werr = ($lambda * $nlegitspam + $nspamlegit)
/ ($lambda * $nlegit + $nspam);
my $werr_base = $nspam
/ ($lambda * $nlegit + $nspam);
$werr ||= 0.000001; # avoid / by 0
my $tcr = $werr_base / $werr;
#my $sr = ($nspamspam / $nspam) * 100.0;
#my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
$tcr;
}