| #!/usr/bin/perl -w |
| # |
| # Given a spam.log and nonspam.log from a "mass-check --bayes" run, |
| # work out efficiency. This variant uses static thresholds that |
| # model more closely what SpamAssassin uses for scoring. |
| # |
| # usage: bayes-threshold spam.log nonspam.log |
| |
| my $spam = $ARGV[0] || "spam.log"; |
| my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log"); |
| |
| my $hamcutoff = 0.20; |
| my $spamcutoff = 0.80; |
| |
| my $nbuckets = 50; |
| my $range_lo = 0.00; |
| my $range_hi = 1.00; |
| my $step = 0.02; |
| #my $step = ($range_hi - $range_lo) / $nbuckets; # unusable - round errors! |
| |
| $hamcutoff += 0.0; |
| $spamcutoff += 0.0; |
| |
| # shamelessly nicked from spambayes' testing infrastructure; a system to |
| # compute the "cost" of a pair of thresholds and classifier. I'm supporting |
| # it here so our stats are (at least a little) comparable against theirs. |
| |
| my $best_cutoff_fp_weight = 10.0; |
| my $best_cutoff_fn_weight = 1.0; |
| my $best_cutoff_unsure_weight = 0.1; |
| |
| %bux_sp = (); |
| %bux_ns = (); |
| |
| my $i; |
| for ($i = 0; $i <= ($range_hi - $range_lo) / $step; $i++) |
| { |
| my $lvl = $range_lo + ($i * $step); |
| push (@buckets, $lvl); |
| $bux_ns{$lvl} = $bux_sp{$lvl} = 0; |
| } |
| |
| foreach my $file ($spam, $nonspam) { |
| open (IN, "<$file") || die "Could not open file '$file': $!"; |
| |
| my $isspam = 0; ($file eq $spam) and $isspam = 1; |
| |
| while (<IN>) { |
| /^(\.|Y)\s.+bayes=([^\s,]+)/ or next; |
| my $score = $2+0; |
| if ($score == 1) { $score = 0.9999999999999; } |
| |
| my $bucket_id; |
| foreach my $bucket (@buckets) { |
| if ($score >= $bucket && $score < $bucket+$step) { |
| $bucket_id = $bucket; last; |
| } |
| } |
| |
| if (!defined $bucket_id) { |
| warn "no bucket for $score!"; |
| } |
| if (!defined $bux_sp{$bucket_id}) { |
| warn "undef bucket at $bucket_id! (score=$bucket)"; |
| } |
| if (!defined $bux_ns{$bucket_id}) { |
| warn "undef bucket at $bucket_id! (score=$bucket)"; |
| } |
| if ($isspam) { |
| $bux_sp{$bucket_id}++; |
| } else { |
| $bux_ns{$bucket_id}++; |
| } |
| } |
| } |
| |
| my $max_sp = 0; |
| my $max_ns = 0; |
| my $tot_sp = 0; |
| my $tot_ns = 0; |
| foreach my $bucket (@buckets) { |
| $tot_sp += $bux_sp{$bucket}; |
| if ($bux_sp{$bucket} > $max_sp) |
| { $max_sp = $bux_sp{$bucket}; } |
| $tot_ns += $bux_ns{$bucket}; |
| if ($bux_ns{$bucket} > $max_ns) |
| { $max_ns = $bux_ns{$bucket}; } |
| } |
| |
| foreach my $cutoff (0.3, 0.2, 0.1, 0.04, 0.02) { |
| $hamcutoff = $cutoff; |
| $spamcutoff = 1.0 - $cutoff; |
| |
| my %results = results_for_cutoff ($hamcutoff, $spamcutoff); |
| write_results (%results); |
| } |
| |
| sub results_for_cutoff { |
| my %results = (); |
| |
| my $fn = 0; |
| my $fp = 0; |
| my $unsure_sp = 0; |
| my $unsure_ns = 0; |
| |
| for ($i = $range_lo; $i < $hamcutoff; $i += $step) { |
| foreach my $bucket (@buckets) { |
| if ($i >= $bucket && $i < $bucket+$step) { |
| $fn += $bux_sp{$bucket}; |
| } |
| } |
| } |
| # total up the unsures (between hamcutoff and spamcutoff) |
| for ($i = $hamcutoff; $i <= $spamcutoff; $i += $step) { |
| foreach my $bucket (@buckets) { |
| if ($i >= $bucket && $i < $bucket+$step) { |
| $unsure_ns += $bux_ns{$bucket}; |
| $unsure_sp += $bux_sp{$bucket}; |
| } |
| } |
| } |
| for ($i = $spamcutoff; $i <= $range_hi; $i += $step) { |
| foreach my $bucket (@buckets) { |
| if ($i >= $bucket && $i < $bucket+$step) { |
| $fp += $bux_ns{$bucket}; |
| } |
| } |
| } |
| |
| my $cost = ($fp * $best_cutoff_fp_weight) |
| + ($fn * $best_cutoff_fn_weight) |
| + ($unsure_ns * $best_cutoff_unsure_weight) |
| + ($unsure_sp * $best_cutoff_unsure_weight); |
| |
| $results{"$hamcutoff $spamcutoff"} = { |
| 'hamcutoff' => $hamcutoff, |
| 'spamcutoff'=> $spamcutoff, |
| 'cost' => $cost, |
| 'unsure_ns' => $unsure_ns, |
| 'unsure_sp' => $unsure_sp, |
| 'fp' => $fp, |
| 'fn' => $fn |
| }; |
| |
| return %results; |
| } |
| |
| sub write_results { |
| my (%results) = @_; |
| foreach my $r (values %results) { |
| printf "Threshold optimization for hamcutoff=%3.2f, spamcutoff=%3.2f: cost=\$%5.2f\n", |
| $r->{hamcutoff}, $r->{spamcutoff}, $r->{cost}; |
| printf "Total ham:spam: %d:%d\n", $tot_ns, $tot_sp; |
| |
| printf "FP: %5d %5.3f%% ", $r->{fp}, ($r->{fp}*100) / $tot_ns; |
| printf "FN: %5d %5.3f%%\n", $r->{fn}, ($r->{fn}*100) / $tot_sp; |
| |
| my $unsure = $r->{unsure_ns} + $r->{unsure_sp}; |
| printf "Unsure: %5d %5.3f%% ", $unsure, |
| ($unsure*100) / ($tot_sp+$tot_ns); |
| printf "(ham: %5d %5.3f%% ", $r->{unsure_ns}, |
| ($r->{unsure_ns}*100) / ($tot_ns); |
| printf "spam: %5d %5.3f%%)\n", $r->{unsure_sp}, |
| ($r->{unsure_sp}*100) / ($tot_sp); |
| |
| # for TCR calc, treat "unsures" as ham |
| # TODO: unsure_sp should probably be treated as spam, assuming |
| # it'll fall in the 5.0-6.0 score range |
| my $fn = $r->{unsure_sp} + $r->{fn}; |
| my $fp = $r->{fp}; |
| printf "TCRs: l=1 %5.3f l=5 %5.3f l=9 %5.3f\n", |
| tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 1), |
| tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 5), |
| tcr ($tot_sp - $fn, $fn, $fp, $tot_ns - $fp, 9); |
| |
| printf "SUMMARY: %3.2f/%3.2f fp %5d fn %5d uh %5d us %5d c %5.2f\n", |
| $r->{hamcutoff}, $r->{spamcutoff}, |
| $r->{fp}, $r->{fn}, $r->{unsure_ns}, $r->{unsure_sp}, |
| $r->{cost}; |
| |
| print "\n"; |
| } |
| } |
| |
| |
| sub tcr { |
| my ($nspamspam, $nspamlegit, $nlegitspam, $nlegitlegit, $lambda) = @_; |
| |
| my $nlegit = $nlegitspam+$nlegitlegit; |
| my $nspam = $nspamspam+$nspamlegit; |
| |
| my $werr = ($lambda * $nlegitspam + $nspamlegit) |
| / ($lambda * $nlegit + $nspam); |
| |
| my $werr_base = $nspam |
| / ($lambda * $nlegit + $nspam); |
| |
| $werr ||= 0.000001; # avoid / by 0 |
| my $tcr = $werr_base / $werr; |
| |
| #my $sr = ($nspamspam / $nspam) * 100.0; |
| #my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; |
| |
| $tcr; |
| } |
| |