| #!/usr/bin/perl -w |
| # |
| # Given a 'results' dir from a bayes-10pcv-driver run, |
| # graph a ROC curve of accuracy. |
| # |
| # usage: graph-accuracy-curve [--buckets=100] ...dir/results .../dir2/results ... |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| |
| use Getopt::Long; |
| |
# Command-line option --buckets=N: how many equal-width score buckets span
# the range [$range_lo, $range_hi].
our $opt_buckets;
GetOptions("buckets=i" => \$opt_buckets)
  or die "usage: graph-accuracy-curve [--buckets=100] dir/results ...\n";

# A missing (or zero) bucket count falls back to the default of 100.  A
# negative count would make $step negative, and the bucket loop below would
# then never reach $range_hi, so reject it before doing any work.
my $buckets = $opt_buckets || 100;
($buckets > 0)
  or die "--buckets must be a positive integer, got '$buckets'\n";

my $range_lo = 0.0;
my $range_hi = 1.0;

# Per-bucket message counts, keyed by the bucket's lower bound: %bux_sp for
# spam, %bux_ns for nonspam.  Package globals, used by the subs in this file.
%bux_sp = ();
%bux_ns = ();

# Width of a single bucket.
my $step = ($range_hi - $range_lo) / $buckets;

# @buckets holds the lower bound of every bucket in ascending order.  The
# bounds are built by repeated addition of $step; other code in this file
# walks the range in exactly the same way and uses the resulting values as
# hash keys, so do not replace the accumulation with a multiplication here.
my $i;
for ($i = $range_lo; $i <= $range_hi; $i += $step) {
  push (@buckets, $i);
}
| |
# Without at least one results directory there is nothing to plot.
@ARGV
  or die "usage: graph-accuracy-curve [--buckets=100] dir/results ...\n";

# plot.data receives one data set per results directory; dofile() prints to
# the DATA handle, so the handle name must stay as it is.
open(DATA, '>', 'plot.data') or die "cannot write plot.data: $!";

my $setcount = 0;   # index of the next data set written to plot.data
my %tag = ();       # results dir => index of its data set
my @dirs = ();      # results dirs, in command-line order
foreach my $dir (@ARGV) {
  # Start every directory with zeroed spam/nonspam counters for each bucket.
  for ($i = $range_lo; $i <= $range_hi; $i += $step) {
    $bux_ns{$i} = $bux_sp{$i} = 0;
  }

  dofile($setcount, "$dir/spam_all.log", "$dir/nonspam_all.log");
  push (@dirs, $dir);
  $tag{$dir} = $setcount;
  $setcount++;
}

# Buffered write errors only surface at close time.
close(DATA) or die "error writing plot.data: $!";
| |
# Feed the plot commands to gnuplot on its standard input.
open(my $gnuplot, '|-', 'gnuplot -') or die "cannot run gnuplot: $!";

# set xtics 0,0.1,0.99

# Fixed preamble; the string deliberately ends in "plot " with no newline so
# that the per-directory curve specifications continue the same command.
print {$gnuplot} "
set xlabel 'FPs'
set ylabel 'FNs'
set logscale xy 2
set xrange []
set yrange []
set terminal png size 1024,768 crop
set out 'graph.png'

plot ";

# One curve per results directory, each reading its own data set (index $s)
# from plot.data and drawn with its own line type / point type ($t).
my @text = ();
my $t = 0;
foreach my $dir (@dirs) {
  my $s = $tag{$dir};

  # Inside a single-quoted gnuplot string a single quote is written by
  # doubling it; without this a directory name containing ' would end the
  # title early and break the plot command.
  (my $title = $dir) =~ s/'/''/g;

  $t++;
  push (@text, " 'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $title'");
}

print {$gnuplot} join(", \\\n", @text);
print {$gnuplot} "\n";

# close() on a pipe waits for the child; it returns false both on an I/O
# error ($! set) and when the child exited non-zero ($! is 0, status in $?).
unless (close($gnuplot)) {
  die($! ? "error closing pipe to gnuplot: $!\n"
         : "gnuplot exited with status " . ($? >> 8) . "\n");
}
exit;
| |
| |
# dofile($setcount, $spam, $nonspam)
#
# Reads the spam log $spam and the nonspam log $nonspam, counts the messages
# of each kind per score bucket in %bux_sp / %bux_ns, and then prints one
# "fp fn" line per bucket cutoff to the DATA filehandle, followed by two
# blank lines.
#
#   $setcount - accepted for the caller's benefit; not used in this sub
#   $spam     - path of the spam log
#   $nonspam  - path of the nonspam log
#
# Dies if either log cannot be opened.  Relies on the file-level @buckets,
# $step, %bux_sp, %bux_ns and on DATA already being open for writing.
sub dofile {
  my ($setcount, $spam, $nonspam) = @_;

  foreach my $file ($spam, $nonspam) {
    # Three-argument open: the path comes from the command line and must not
    # be able to smuggle in an open mode.
    open(my $in, '<', $file) or die "Could not open file '$file': $!";

    my $isspam = ($file eq $spam) ? 1 : 0;

    while (my $line = <$in>) {
      # Only lines starting with "." or "Y" that carry a bayes= value count.
      $line =~ /^(\.|Y)\s.+bayes=([^\s,]+)/ or next;
      my $score = $2+0;

      # Find the bucket whose half-open interval [bucket, bucket+step)
      # contains the score.
      my $bucket_id;
      foreach my $bucket (@buckets) {
        if ($score >= $bucket && $score < $bucket+$step) {
          $bucket_id = $bucket; last;
        }
      }

      # A score outside every bucket has nowhere to be counted.  Skip it
      # rather than bumping an undef hash key, which no cutoff ever reads.
      defined($bucket_id) or next;

      if ($isspam) {
        $bux_sp{$bucket_id}++;
      } else {
        $bux_ns{$bucket_id}++;
      }
    }

    close($in);
  }

  # One data point per possible cutoff; the blank lines terminate the set.
  foreach my $bucket (@buckets) {
    my ($fp, $fn) = results_for_cutoff($bucket);
    print DATA "$fp $fn\n";
  }
  print DATA "\n\n";
}
| |
# results_for_cutoff($cutoff)
#
# Returns the list ($fp, $fn) for a classification threshold of $cutoff:
#
#   $fn - sum of the spam counts (%bux_sp) of all buckets below $cutoff
#   $fp - sum of the nonspam counts (%bux_ns) of all buckets at or above it
#
# $cutoff is expected to be one of the bucket lower bounds in @buckets, which
# is how the caller in this file invokes it.  Reads the file-level @buckets,
# %bux_sp and %bux_ns; modifies nothing.
sub results_for_cutoff {
  my $cutoff = shift;
  my $fn = 0;
  my $fp = 0;

  # Every bucket lies on exactly one side of the cutoff, so one pass over
  # the bucket list is enough.
  foreach my $bucket (@buckets) {
    if ($bucket < $cutoff) {
      $fn += $bux_sp{$bucket};
    } else {
      $fp += $bux_ns{$bucket};
    }
  }

  return ($fp, $fn);
}
| |