#!/usr/bin/perl -w
#
# Given a spam.log and nonspam.log from a "mass-check --bayes" run,
# draw a histogram of the score ranges.
#
# This now draws a detailed "zoom" view as well as the overall histogram,
# so the low-frequency FPs and FNs around the middle ground can be viewed.
# In addition, it does not show ham lines or spam lines, if those buckets
# got no hits.
#
# usage: draw-bayes-histogram [--spam=spam.log] [--nonspam=nonspam.log]
# [--nocollapse] [--nozoom] [--buckets=20]
#
# or: draw-bayes-histogram spam.log nonspam.log (backwards compatible)
use Getopt::Long;
# Command-line handling and histogram configuration.
#
# GetOptions is called without explicit destinations, so it stores each
# option in the package variable $opt_<name>; those are declared here.
our ($opt_spam, $opt_nonspam, $opt_nocollapse, $opt_nozoom, $opt_buckets);
GetOptions("spam=s", "nonspam=s", "nocollapse", "nozoom", "buckets=i")
    or die "usage: draw-bayes-histogram [--spam=spam.log] "
         . "[--nonspam=nonspam.log] [--nocollapse] [--nozoom] [--buckets=20]\n";

# Spam log: --spam wins, then the first positional argument (backwards
# compatible calling convention), then the default file name.  The
# "defined" guard avoids matching a regex against a missing argument.
my $spam = $opt_spam;
if (!$spam && defined $ARGV[0] && $ARGV[0] !~ /^\-/) { $spam = $ARGV[0]; }
if (!$spam) { $spam = "spam.log"; }

# Nonspam log: --nonspam wins, then the second positional argument,
# then the default file name.
my $nonspam = $opt_nonspam;
if (!$nonspam && defined $ARGV[1] && $ARGV[1] !~ /^\-/) { $nonspam = $ARGV[1]; }
if (!$nonspam) { $nonspam = "nonspam.log"; }

# Number of buckets the score range is divided into (0 or unset => 25).
# A negative count would yield a negative step width, which the bucket
# construction cannot handle, so reject it up front.
my $buckets = $opt_buckets || 25;
if ($buckets < 1) {
    die "draw-bayes-histogram: --buckets must be a positive integer\n";
}

# Magnification applied to the "zoom" (detail) column of the histogram.
my $zoomfactor = 20;

# Inclusive range of scores covered by the histogram.
my $range_lo = 0.0;
my $range_hi = 1.0;
# Build the list of buckets and zero the per-bucket hit counters.
#
# @buckets holds the lower bound of every bucket in ascending order;
# %bux_sp and %bux_ns count spam and nonspam hits, keyed by that bound.
%bux_sp = ();
%bux_ns = ();
my $step = ($range_hi - $range_lo) / $buckets;
my $i;
# Derive each lower bound from an integer index instead of repeatedly
# adding $step: accumulated floating-point error could otherwise push
# the final bound just past $range_hi and silently drop the top bucket,
# leaving scores equal to $range_hi with no bucket to land in.
for ($i = 0; $i <= $buckets; $i++) {
    # Pin the last bound to $range_hi exactly so that a score equal to
    # the top of the range always matches it.
    my $lower = ($i == $buckets) ? $range_hi : $range_lo + ($i * $step);
    push (@buckets, $lower);
    $bux_ns{$lower} = $bux_sp{$lower} = 0;
}
# Read both logs and tally every scored line into its bucket.
#
# A line is counted when it starts with "." or "Y", followed by
# whitespace, and later contains "bayes=<value>"; the value (up to the
# next whitespace or comma) is taken as the score.  Hits from the spam
# log go into %bux_sp, hits from the nonspam log into %bux_ns.
foreach my $file ($spam, $nonspam) {
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle is scoped to this iteration.
    open (my $in, '<', $file) or die "Could not open file '$file': $!";
    my $isspam = ($file eq $spam) ? 1 : 0;

    while (my $line = <$in>) {
        $line =~ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
        my $score = $2+0;

        # Find the bucket whose [lower, lower+step) interval holds the score.
        my $bucket_id;
        foreach my $bucket (@buckets) {
            if ($score >= $bucket && $score < $bucket+$step) {
                $bucket_id = $bucket; last;
            }
        }

        # A score outside every bucket used to be counted under an
        # undefined hash key, which is never displayed; report it and
        # skip the line instead.
        if (!defined $bucket_id) {
            warn "$file line $.: bayes score $score is outside the histogram range, ignored\n";
            next;
        }

        if ($isspam) {
            $bux_sp{$bucket_id}++;
        } else {
            $bux_ns{$bucket_id}++;
        }
    }

    close ($in);
}
# Work out, for spam and nonspam separately, the total number of hits
# and the largest single-bucket count.
my ($max_sp, $max_ns, $tot_sp, $tot_ns) = (0, 0, 0, 0);
for my $lower (@buckets) {
    my ($spam_hits, $ham_hits) = ($bux_sp{$lower}, $bux_ns{$lower});

    $tot_sp += $spam_hits;
    $max_sp = $spam_hits if $spam_hits > $max_sp;

    $tot_ns += $ham_hits;
    $max_ns = $ham_hits if $ham_hits > $max_ns;
}

# Width of the overall histogram bar; ten characters wider when the
# zoom column is switched off.
my $chars_in_line = $opt_nozoom ? 55 + 10 : 55;

# Hits represented by one bar character, chosen so the fullest bucket
# fills the whole bar.
my $scale_sp = ($max_sp / $chars_in_line);
my $scale_ns = ($max_ns / $chars_in_line);

# Replace zero scales and totals with a tiny value so the divisions
# performed while drawing never divide by zero.
for my $value ($scale_sp, $scale_ns, $tot_sp, $tot_ns) {
    $value = 0.000001 unless $value;
}
# Print the histogram: for every bucket a ham line followed by a spam
# line.  Each line shows the bucket's lower bound, the share of that
# log's hits that fell into the bucket, and the bar(s).
print STDOUT
"SCORE NUMHIT DETAIL OVERALL HISTOGRAM (. = ham, # = spam)\n";
# 0.000 (19.217%) ..........|....................
foreach my $bucket (@buckets) {
    # Ham first, then spam, so the two lines of a bucket stay adjacent.
    foreach my $series (
        [ $bux_ns{$bucket}, $scale_ns, $tot_ns, '.' ],
        [ $bux_sp{$bucket}, $scale_sp, $tot_sp, '#' ],
    ) {
        my ($hits, $scale, $total, $glyph) = @$series;

        # Empty buckets are collapsed (not printed) unless --nocollapse
        # was given.  The condition used to be "non-empty AND collapsing
        # enabled", which made --nocollapse suppress all output.
        next unless $hits != 0 || $opt_nocollapse;

        # Overall bar: one glyph per $scale hits, rounded to nearest.
        my $line = $glyph x int (($hits / $scale) + .5);

        if (!$opt_nozoom) {
            # Detail bar: the same count magnified by $zoomfactor, then
            # clipped and padded to exactly 10 columns so that
            # low-frequency buckets remain visible.
            my $zoomline = $glyph x int ((($hits * $zoomfactor) / $scale) + .5);
            $zoomline = sprintf ("%-10s", substr ($zoomline, 0, 10));
            $line = $zoomline.'|'.$line;
        }

        printf STDOUT "%3.3f (%6.3f%%) %s\n", $bucket,
            (($hits / $total) * 100.0), $line;
    }
}