blob: cb15d78b55a0304da5bcf00ff54a8fd380714733 [file] [log] [blame]
#!/usr/bin/perl -w
#
# Given a 'results' dir from a bayes-10pcv-driver run,
# graph a ROC curve of accuracy.
#
# usage: graph-accuracy-curve [--buckets=100] ...dir/results .../dir2/results ...
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
use Getopt::Long;
# Command-line options.  With no explicit destination, Getopt::Long stores
# the value of --buckets in the package variable $opt_buckets.
our $opt_buckets;
GetOptions("buckets=i")
    or die "usage: graph-accuracy-curve [--buckets=100] dir/results ...\n";

# Number of score buckets; an unset (or zero) value selects the default.
my $buckets = $opt_buckets || 100;

# A negative count would make $step negative, and the bucket loop below
# would then never reach $range_hi and never terminate.
$buckets > 0
    or die "--buckets must be a positive integer (got $buckets)\n";

# Scores from $range_lo up to $range_hi are split into buckets of width $step.
my $range_lo = 0.0;
my $range_hi = 1.0;

# Per-bucket message counts for the result set currently being processed,
# keyed by the bucket's lower bound: spam in %bux_sp, nonspam in %bux_ns.
%bux_sp = ();
%bux_ns = ();

my $step = ($range_hi - $range_lo) / $buckets;

# Loop counter shared with code further down the file.
my $i;

# @buckets holds the lower bound of every bucket.  The bounds are produced
# by repeated addition on purpose: other parts of the file regenerate the
# very same floating-point values to use as hash keys, so the way they are
# computed must not change.
for ($i = $range_lo; $i <= $range_hi; $i += $step) {
    push (@buckets, $i);
}
# Process every results directory named on the command line, writing one
# gnuplot data set per directory to plot.data.

# Without any directory there is nothing to plot; stop before plot.data is
# truncated and before gnuplot is handed an empty "plot" command.
die "usage: graph-accuracy-curve [--buckets=100] dir/results ...\n"
    unless @ARGV;

# dofile() prints to this handle by name, so the bareword DATA is kept.
open(DATA, '>', 'plot.data')
    or die "Could not open 'plot.data' for writing: $!";

my $setcount = 0;   # index of the next data set inside plot.data
my %tag = ();       # directory => index of its data set
my @dirs = ();      # directories in the order they were processed

foreach my $dir (@ARGV) {
    # Start each directory with zeroed counters, one per bucket bound.
    # Rebuilding the hashes also drops any stray keys left by a previous
    # directory.
    %bux_sp = map { $_ => 0 } @buckets;
    %bux_ns = map { $_ => 0 } @buckets;

    dofile($setcount, "$dir/spam_all.log", "$dir/nonspam_all.log");
    push (@dirs, $dir);
    $tag{$dir} = $setcount;
    $setcount++;
}

# Buffered write errors (e.g. a full disk) only surface when closing.
close(DATA) or die "Could not finish writing 'plot.data': $!";
# Feed a plot script to gnuplot on its standard input.  The script renders
# graph.png from plot.data, drawing one curve per processed directory.
open (OUT, "| gnuplot -") or die "cannot run gnuplot: $!";

# set xtics 0,0.1,0.99
print OUT "
set xlabel 'FPs'
set ylabel 'FNs'
set logscale xy 2
set xrange []
set yrange []
set terminal png size 1024,768 crop
set out 'graph.png'
plot ";

# One plot clause per directory.  "index" selects that directory's data set
# inside plot.data; line type and point type are numbered from 1 in the
# order the directories were processed.
my @text = ();
my $t = 0;
foreach my $dir (@dirs) {
    my $s = $tag{$dir};
    $t++;
    push (@text, " 'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $dir'");
}
print OUT join(", \\\n", @text);
print OUT "\n";

# Closing a piped handle waits for the child; a false return means either
# an I/O error ($! set) or a non-zero exit status from gnuplot ($? set).
unless (close(OUT)) {
    die $! ? "error closing pipe to gnuplot: $!\n"
           : "gnuplot exited with status " . ($? >> 8) . "\n";
}
exit;
# dofile($setcount, $spam, $nonspam)
#
# Reads the spam log and the nonspam log, counts messages per score bucket
# into %bux_sp / %bux_ns, and then appends one data set to the DATA handle:
# an "FP FN" line for every bucket bound used as a cutoff, followed by two
# blank lines (which separate data sets for gnuplot's "index").
#
#   $setcount - accepted for the caller's convenience; not used here
#   $spam     - path of the spam log file
#   $nonspam  - path of the nonspam log file
#
# Only lines that start with "." or "Y", followed by whitespace, and that
# contain a "bayes=VALUE" field are counted.  Dies if a log cannot be opened.
sub dofile {
    my ($setcount, $spam, $nonspam) = @_;

    foreach my $file ($spam, $nonspam) {
        open(my $in, '<', $file)
            or die "Could not open file '$file': $!";
        my $isspam = ($file eq $spam) ? 1 : 0;

        while (my $line = <$in>) {
            $line =~ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
            my $score = $2+0;

            # Find the bucket [bound, bound + step) that holds this score.
            my $bucket_id;
            foreach my $bucket (@buckets) {
                if ($score >= $bucket && $score < $bucket+$step) {
                    $bucket_id = $bucket;
                    last;
                }
            }

            # A score outside every bucket cannot be attributed to any
            # cutoff; skip it rather than counting it under an undef key.
            defined $bucket_id or next;

            if ($isspam) {
                $bux_sp{$bucket_id}++;
            } else {
                $bux_ns{$bucket_id}++;
            }
        }

        close($in);
    }

    # One data point per candidate cutoff.
    foreach my $bucket (@buckets) {
        my ($fp, $fn) = results_for_cutoff($bucket);
        print DATA "$fp $fn\n";
    }
    print DATA "\n\n";
}
# results_for_cutoff($cutoff)
#
# Returns the list ($fp, $fn) for classifying at the given score cutoff:
#
#   $fn - spam messages (%bux_sp) in buckets whose lower bound is below
#         the cutoff
#   $fp - nonspam messages (%bux_ns) in buckets whose lower bound is at or
#         above the cutoff
#
# $cutoff is expected to be one of the bucket bounds in @buckets.  A single
# pass over @buckets is enough, and the sub keeps its own loop variable
# instead of reusing the file-wide $i.
sub results_for_cutoff {
    my $cutoff = shift;
    my $fn = 0;
    my $fp = 0;

    foreach my $bucket (@buckets) {
        if ($bucket < $cutoff) {
            $fn += $bux_sp{$bucket} || 0;
        } else {
            $fp += $bux_ns{$bucket} || 0;
        }
    }

    return ($fp, $fn);
}