#!/usr/bin/perl -w
#
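# Given a 'results' dir from a bayes-10pcv-driver run,
# graph a histogram of the score ranges using gnuplot.
# The bucketed counts are written to 'plot.data' and the graph to
# 'graph.png', both in the current directory.
#
# usage: graph-bayes-histogram [--buckets=100] ...dir/results .../dir2/results ...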
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
use Getopt::Long;
# ---- Command line ----------------------------------------------------------
# GetOptions() is called without an explicit destination, so Getopt::Long
# stores the value of --buckets=N in the package variable $opt_buckets.
my $usage =
    "usage: graph-bayes-histogram [--buckets=100] dir/results [dir2/results ...]\n";
our $opt_buckets;
GetOptions("buckets=i") or die $usage;
my $buckets = $opt_buckets || 100;
# A negative count would make $step negative; reject it up front.
$buckets > 0 or die "--buckets must be a positive integer\n$usage";
@ARGV or die $usage;

# ---- Bucket layout ---------------------------------------------------------
# @buckets, %bux_sp and %bux_ns are package globals because dofile() reads
# and updates them; $step is a file-scoped lexical that dofile() also sees.
my $range_lo = 0.0;
my $range_hi = 1.0;
our %bux_sp = ();    # spam count per bucket lower edge
our %bux_ns = ();    # nonspam count per bucket lower edge
our @buckets = ();   # lower edge of every bucket, ascending
my $step = ($range_hi - $range_lo) / $buckets;

# Generate exactly $buckets+1 edges (range_lo .. range_hi inclusive).  The
# loop is driven by an integer count rather than a floating-point comparison,
# so rounding drift can neither drop the final edge nor add an extra one.
# Edges are still produced by repeated addition of $step, which keeps
# "edge + step" equal to the next edge and so leaves no gaps between the
# half-open intervals that dofile() tests against.
my $edge = $range_lo;
for (0 .. $buckets) {
    push(@buckets, $edge);
    $edge += $step;
}

# ---- Collect one data set per results directory ----------------------------
# DATA stays a bareword handle because dofile() prints to it by that name.
open(DATA, '>', 'plot.data') or die "cannot write plot.data: $!";
my $setcount = 0;
my @dirs = ();
foreach my $dir (@ARGV) {
    # Start every data set from zeroed counters.
    %bux_ns = map { $_ => 0 } @buckets;
    %bux_sp = map { $_ => 0 } @buckets;
    dofile($setcount, "$dir/spam_all.log", "$dir/nonspam_all.log");
    push(@dirs, $dir);
    $setcount++;
}
close(DATA) or die "cannot close plot.data: $!";

# ---- Drive gnuplot ---------------------------------------------------------
open(my $gnuplot, '|-', 'gnuplot -') or die "cannot run gnuplot: $!";
print $gnuplot "
set xlabel 'P(spam)'
set ylabel 'Frequency'
set logscale y 2
set xrange [0.0:1.01]
set yrange []
set xtics 0,0.1,0.99
set terminal png crop
set out 'graph.png'
plot ";

# Two curves per directory: column 2 (ham) and column 3 (spam) of the data
# set whose index is the directory's position on the command line.  Using
# the position, not a hash keyed by name, keeps the index right even when
# the same directory is listed more than once.
my @text = ();
my $t = 0;
foreach my $s (0 .. $#dirs) {
    # A single quote inside a single-quoted gnuplot string is written as ''.
    (my $title = $dirs[$s]) =~ s/'/''/g;
    $t++;
    push(@text, " 'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $title'");
    $t++;
    push(@text, " 'plot.data' using 1:3 index $s with linesp lt $t pt $t t 'spam, $title'");
}
print $gnuplot join(", \\\n", @text), "\n";

# Closing a pipe waits for the child; a false return means either an I/O
# error ($! set) or a non-zero exit status from gnuplot ($? set).
close($gnuplot)
    or die($! ? "error closing pipe to gnuplot: $!\n"
              : "gnuplot exited with status $?\n");
exit;
# dofile($setcount, $spam, $nonspam)
#
# Reads the two log files, counts the bayes scores found in them per bucket,
# and appends one data set to the DATA filehandle.
#
#   $setcount - zero-based index of this data set; only used to shift the
#               x positions of its rows by 0.001 per set
#   $spam     - path of the log whose scores are counted in %bux_sp
#   $nonspam  - path of the log whose scores are counted in %bux_ns
#
# Relies on names set up at file scope: @buckets (bucket lower edges), $step
# (bucket width), the counter hashes %bux_sp / %bux_ns (keyed by bucket lower
# edge, expected to be zeroed by the caller) and the open output handle DATA.
# Dies if either log cannot be opened.
sub dofile {
    my ($setcount, $spam, $nonspam) = @_;

    # Pair each log with the counter hash it feeds, rather than comparing
    # file names to tell spam from nonspam.
    my @inputs = ([$spam, \%bux_sp], [$nonspam, \%bux_ns]);

    foreach my $input (@inputs) {
        my ($file, $counts) = @$input;
        open(my $in, '<', $file) or die "Could not open file '$file': $!";

        while (my $line = <$in>) {
            # Only lines that start with "." or "Y" followed by whitespace
            # and carry a "bayes=VALUE" field are counted.
            $line =~ /^(?:\.|Y)\s.+bayes=([^\s,]+)/ or next;
            my $score = $1 + 0;

            # Find the half-open interval [edge, edge+step) holding the score.
            my $bucket_id;
            foreach my $bucket (@buckets) {
                if ($score >= $bucket && $score < $bucket + $step) {
                    $bucket_id = $bucket;
                    last;
                }
            }

            # A score outside every bucket used to be counted under an
            # undefined hash key; report it and skip the line instead.
            unless (defined $bucket_id) {
                warn "$file line $.: bayes score $score is outside the bucket range, skipped\n";
                next;
            }
            $counts->{$bucket_id}++;
        }
        close($in);
    }

    # One row per bucket: "x ham_count spam_count".  x is shifted by 0.001
    # per set index (presumably so several sets do not overplot exactly).
    my $sideoffset = 0.001 * $setcount;
    foreach my $bucket (@buckets) {
        my $ns = $bux_ns{$bucket} || 0;
        my $sp = $bux_sp{$bucket} || 0;
        my $xpos = $bucket + $sideoffset;
        print DATA "$xpos $ns $sp\n";
    }
    # Two blank lines end the data set, so that gnuplot's "index N" can
    # select it.
    print DATA "\n\n";
}