blob: e56947ddf5b653b185023baa63cda0654ff40374 [file] [log] [blame]
#!/usr/bin/perl -w
#
# mk-roc-graphs -- create Receiver Operating Characteristic graphs
#
# Creates ROC curve graphs from a pair of SpamAssassin logs and the current
# score-set. See <http://home.comcast.net/~tom.fawcett/public_html/ROCCH/>
# for details on ROC curves.
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
use Getopt::Long;
our ($opt_cffile, $opt_spam, $opt_ham, $opt_count, $opt_lambda);
GetOptions("cffile=s", "count", "threshold=f", "spam=s",
"ham=s", "scoreset=i", "lambda=f");
$opt_cffile ||= "../rules";
$opt_spam ||= 'spam.log';
$opt_ham ||= 'ham.log';
$opt_count ||= 0;
$opt_scoreset = 0 if (!defined $opt_scoreset);
my $msgline;
our (%rules, %allrules);
my %spam_points = ();
my %ham_points = ();
my ($num_tests, $num_spam, $num_ham);
my $nybias = 10;
# lambda value for TCR equation, representing the cost of of an FP vs. the
# cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back
# to sender asking for token, 999 = blocking or deleting a message.
#
# We roughly aim for a value representing "moved to infrequently-read folder".
my $lambda = 50;
if ($opt_lambda) { $lambda = $opt_lambda; }
readscores();
readlogs();
if ($opt_count) {
$nybias = $nybias*($num_spam / $num_ham);
evaluate_reports();
}
else {
evaluate_scores();
}
exit 0;
# arguments are $isspam, $count, \@tests
sub log_line_count {
my $score = 0;
$score += $scores{$_} for @{$_[2]};
$score = (int($score * 10) / 10);
if ($_[0]) {
$num_spam++;
$spam_points{$score}++;
}
else {
$num_ham++;
$ham_points{$score}++;
}
}
sub readlogs {
my $count = 0;
$num_spam = $num_ham = 0;
# set handler for log lines
my $log_line = \&log_line_count;
foreach my $file ($opt_spam, $opt_ham) {
open (IN, "<$file") || die "Could not open file '$file': $!";
my $isspam = ($file eq $opt_spam);
my $caught; # 1st parameter of log line
my $rules; # 4th parameter of log line
while (defined($msgline = <IN>)) {
($caught, undef, undef, $rules) = split(' ', $msgline);
# only take lines starting with Y or .
next unless ($caught eq 'Y' || $caught eq '.') && $rules;
# get tests, but ignore unknown tests and subrules
my @tests;
foreach my $r (split(/,/, $rules)) {
my $hits = 1;
# Support compacted RULE(hitcount) format
if ($r =~ s/\((\d+)\)$//) {
$hits = $1;
}
next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
push @tests, $r for (1 .. $hits);
}
# run handler
$log_line->($isspam, $count, \@tests);
# increment line
$count++;
}
close IN;
}
$num_tests = $count;
}
sub readscores {
warn "Reading scores from \"$opt_cffile\"...\n";
system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
require "./tmp/rules.pl";
%allrules = %rules; # ensure it stays global
}
sub _evaluate_at_all_thresholds {
my $callback = shift;
my %u=();
my @scores = grep {defined} map {
if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; }
} sort { $a <=> $b } (keys %ham_points, keys %spam_points);
my $tot_h = $num_ham || 0.00001;
my $tot_s = $num_spam || 0.00001;
foreach my $threshold (@scores) {
my $fp = 0; # false positives; ham marked as spam
my $fn = 0; # false negatives; spam marked as ham
my $tp = 0; # true positives; spam marked as spam
my $tn = 0; # true negatives; ham marked as ham
foreach my $scr (@scores) {
my $nh = $ham_points{$scr} || 0;
my $ns = $spam_points{$scr} || 0;
if ($threshold > $scr) {
$tn += $nh;
$fn += $ns;
} else {
$fp += $nh;
$tp += $ns;
}
}
$callback->($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp);
}
}
sub evaluate_reports {
_evaluate_at_all_thresholds (sub {
my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
my $nh = $ham_points{$threshold} || 0;
my $ns = $spam_points{$threshold} || 0;
# $nh /= $tot_h;
# $ns /= $tot_s;
# $tn /= $tot_h;
# $tp /= $tot_s;
# $fp /= $tot_h;
# $fn /= $tot_s;
printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
printf "# Correctly non-spam: %6d %4.2f%%\n",
$tn, ($tn / $tot_h) * 100.0;
printf "# Correctly spam: %6d %4.2f%%\n",
$tp, ($tp / $tot_s) * 100.0;
printf "# False positives: %6d %4.2f%%\n",
$fp, ($fp / $tot_h) * 100.0;
printf "# False negatives: %6d %4.2f%%\n",
$fn, ($fn / $tot_s) * 100.0;
# convert to the TCR metrics used in the published lit
my $nspamspam = $tp;
my $nspamlegit = $fn;
my $nlegitspam = $fp;
my $nlegitlegit = $tn;
my $nlegit = $tot_h;
my $nspam = $tot_s;
my $werr = ($lambda * $nlegitspam + $nspamlegit)
/ ($lambda * $nlegit + $nspam);
my $werr_base = $nspam
/ ($lambda * $nlegit + $nspam);
$werr ||= 0.000001; # avoid / by 0
my $tcr = $werr_base / $werr;
my $sr = ($nspamspam / $nspam) * 100.0;
my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n",
$lambda, $tcr, $sr, $sp;
});
}
sub evaluate_scores {
_evaluate_at_all_thresholds (sub {
my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
my $nh = $ham_points{$threshold} || 0;
my $ns = $spam_points{$threshold} || 0;
$nh /= $tot_h;
$ns /= $tot_s;
$tn /= $tot_h;
$tp /= $tot_s;
$fp /= $tot_h;
$fn /= $tot_s;
printf "%f %.9f %.9f %.9f %.9f %.9f %.9f\n",
$threshold, $nh, $ns, $tn, $tp, $fp, $fn;
});
}
__DATA__
# Suggested commands to plot data from this script:
# ham score distribution:
./mk-roc-graphs --scoreset=3 > dat
gnuplot
set grid
plot [-5:40] [] "dat" using 1:2 with boxes
# spam score distribution:
./mk-roc-graphs --scoreset=3 > dat
gnuplot
set grid
plot [-5:40] [] "dat" using 1:3 with boxes
# plot a ROC curve:
./mk-roc-graphs --scoreset=3 > dat
gnuplot
set grid
plot [0:0.1] [0:0.1] "dat" using 6:7 with linesp
# For a ROC curve, as shown in the literature
./mk-roc-graphs --scoreset=3 > dat
set grid
set xlabel "False Positives (1.0 = 100%)"
set ylabel "True Positives (1.0 = 100%)"
plot "dat" using 6:5 with linesp
# same, but tighter:
./mk-roc-graphs --scoreset=3 > dat
gnuplot
set grid
plot [0:0.02] [0:0.02] "dat" using 6:7 with linesp
# and the piece de resistance:
./mk-roc-graphs --scoreset=0 > set0
./mk-roc-graphs --scoreset=1 > set1
./mk-roc-graphs --scoreset=2 > set2
./mk-roc-graphs --scoreset=3 > set3
gnuplot
set terminal png
set output "out.png"
set grid
set xlabel "False Positives (1.0 = 100% of ham marked as spam)"
set ylabel "False Negatives (1.0 = 100% of spam marked as ham)"
plot [0:0.1] [0:0.1] "set0" using 6:7 with linesp, \
"set1" using 6:7 with linesp, \
"set2" using 6:7 with linesp, \
"set3" using 6:7 with linesp