| #!/usr/bin/perl -w |
| # |
| # mk-roc-graphs -- create Receiver Operating Characteristic graphs |
| # |
| # Creates ROC curve graphs from a pair of SpamAssassin logs and the current |
| # score-set. See <http://home.comcast.net/~tom.fawcett/public_html/ROCCH/> |
| # for details on ROC curves. |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| use Getopt::Long; |
| our ($opt_cffile, $opt_spam, $opt_ham, $opt_count, $opt_lambda); |
| |
| GetOptions("cffile=s", "count", "threshold=f", "spam=s", |
| "ham=s", "scoreset=i", "lambda=f"); |
| |
| $opt_cffile ||= "../rules"; |
| $opt_spam ||= 'spam.log'; |
| $opt_ham ||= 'ham.log'; |
| $opt_count ||= 0; |
| |
| $opt_scoreset = 0 if (!defined $opt_scoreset); |
| |
| my $msgline; |
| |
| our (%rules, %allrules); |
| |
| my %spam_points = (); |
| my %ham_points = (); |
| my ($num_tests, $num_spam, $num_ham); |
| |
| my $nybias = 10; |
| |
| # lambda value for TCR equation, representing the cost of of an FP vs. the |
| # cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back |
| # to sender asking for token, 999 = blocking or deleting a message. |
| # |
| # We roughly aim for a value representing "moved to infrequently-read folder". |
| |
| my $lambda = 50; |
| if ($opt_lambda) { $lambda = $opt_lambda; } |
| |
| |
| readscores(); |
| readlogs(); |
| |
| if ($opt_count) { |
| $nybias = $nybias*($num_spam / $num_ham); |
| evaluate_reports(); |
| } |
| else { |
| evaluate_scores(); |
| } |
| exit 0; |
| |
| |
| # arguments are $isspam, $count, \@tests |
| sub log_line_count { |
| my $score = 0; |
| $score += $scores{$_} for @{$_[2]}; |
| |
| $score = (int($score * 10) / 10); |
| if ($_[0]) { |
| $num_spam++; |
| $spam_points{$score}++; |
| } |
| else { |
| $num_ham++; |
| $ham_points{$score}++; |
| } |
| } |
| |
| sub readlogs { |
| my $count = 0; |
| $num_spam = $num_ham = 0; |
| |
| # set handler for log lines |
| my $log_line = \&log_line_count; |
| |
| foreach my $file ($opt_spam, $opt_ham) { |
| open (IN, "<$file") || die "Could not open file '$file': $!"; |
| |
| my $isspam = ($file eq $opt_spam); |
| my $caught; # 1st parameter of log line |
| my $rules; # 4th parameter of log line |
| |
| while (defined($msgline = <IN>)) { |
| ($caught, undef, undef, $rules) = split(' ', $msgline); |
| |
| # only take lines starting with Y or . |
| next unless ($caught eq 'Y' || $caught eq '.') && $rules; |
| |
| # get tests, but ignore unknown tests and subrules |
| my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} } |
| split(/,/, $rules); |
| |
| # run handler |
| $log_line->($isspam, $count, \@tests); |
| |
| # increment line |
| $count++; |
| } |
| close IN; |
| } |
| $num_tests = $count; |
| } |
| |
| sub readscores { |
| warn "Reading scores from \"$opt_cffile\"...\n"; |
| system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die; |
| require "./tmp/rules.pl"; |
| %allrules = %rules; # ensure it stays global |
| } |
| |
| sub _evaluate_at_all_thresholds { |
| my $callback = shift; |
| |
| my %u=(); |
| my @scores = grep {defined} map { |
| if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; } |
| } sort { $a <=> $b } (keys %ham_points, keys %spam_points); |
| |
| my $tot_h = $num_ham || 0.00001; |
| my $tot_s = $num_spam || 0.00001; |
| |
| foreach my $threshold (@scores) { |
| my $fp = 0; # false positives; ham marked as spam |
| my $fn = 0; # false negatives; spam marked as ham |
| my $tp = 0; # true positives; spam marked as spam |
| my $tn = 0; # true negatives; ham marked as ham |
| foreach my $scr (@scores) { |
| my $nh = $ham_points{$scr} || 0; |
| my $ns = $spam_points{$scr} || 0; |
| if ($threshold > $scr) { |
| $tn += $nh; |
| $fn += $ns; |
| } else { |
| $fp += $nh; |
| $tp += $ns; |
| } |
| } |
| $callback->($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp); |
| } |
| } |
| |
| sub evaluate_reports { |
| _evaluate_at_all_thresholds (sub { |
| my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_; |
| my $nh = $ham_points{$threshold} || 0; |
| my $ns = $spam_points{$threshold} || 0; |
| # $nh /= $tot_h; |
| # $ns /= $tot_s; |
| # $tn /= $tot_h; |
| # $tp /= $tot_s; |
| # $fp /= $tot_h; |
| # $fn /= $tot_s; |
| |
| printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold); |
| printf "# Correctly non-spam: %6d %4.2f%%\n", |
| $tn, ($tn / $tot_h) * 100.0; |
| printf "# Correctly spam: %6d %4.2f%%\n", |
| $tp, ($tp / $tot_s) * 100.0; |
| printf "# False positives: %6d %4.2f%%\n", |
| $fp, ($fp / $tot_h) * 100.0; |
| printf "# False negatives: %6d %4.2f%%\n", |
| $fn, ($fn / $tot_s) * 100.0; |
| |
| # convert to the TCR metrics used in the published lit |
| my $nspamspam = $tp; |
| my $nspamlegit = $fn; |
| my $nlegitspam = $fp; |
| my $nlegitlegit = $tn; |
| my $nlegit = $tot_h; |
| my $nspam = $tot_s; |
| |
| my $werr = ($lambda * $nlegitspam + $nspamlegit) |
| / ($lambda * $nlegit + $nspam); |
| |
| my $werr_base = $nspam |
| / ($lambda * $nlegit + $nspam); |
| |
| $werr ||= 0.000001; # avoid / by 0 |
| my $tcr = $werr_base / $werr; |
| |
| my $sr = ($nspamspam / $nspam) * 100.0; |
| my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; |
| printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n", |
| $lambda, $tcr, $sr, $sp; |
| }); |
| } |
| |
| sub evaluate_scores { |
| _evaluate_at_all_thresholds (sub { |
| my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_; |
| my $nh = $ham_points{$threshold} || 0; |
| my $ns = $spam_points{$threshold} || 0; |
| $nh /= $tot_h; |
| $ns /= $tot_s; |
| $tn /= $tot_h; |
| $tp /= $tot_s; |
| $fp /= $tot_h; |
| $fn /= $tot_s; |
| printf "%f %.9f %.9f %.9f %.9f %.9f %.9f\n", |
| $threshold, $nh, $ns, $tn, $tp, $fp, $fn; |
| }); |
| } |
| |
| __DATA__ |
| |
| # Suggested commands to plot data from this script: |
| |
| # ham score distribution: |
| ./mk-roc-graphs --scoreset=3 > dat |
| gnuplot |
| set grid |
| plot [-5:40] [] "dat" using 1:2 with boxes |
| |
| # spam score distribution: |
| ./mk-roc-graphs --scoreset=3 > dat |
| gnuplot |
| set grid |
| plot [-5:40] [] "dat" using 1:3 with boxes |
| |
| # plot a ROC curve: |
| ./mk-roc-graphs --scoreset=3 > dat |
| gnuplot |
| set grid |
| plot [0:0.1] [0:0.1] "dat" using 6:7 with linesp |
| |
| # For a ROC curve, as shown in the literature |
| ./mk-roc-graphs --scoreset=3 > dat |
| set grid |
| set xlabel "False Positives (1.0 = 100%)" |
| set ylabel "True Positives (1.0 = 100%)" |
| plot "dat" using 6:5 with linesp |
| |
| # same, but tighter: |
| ./mk-roc-graphs --scoreset=3 > dat |
| gnuplot |
| set grid |
| plot [0:0.02] [0:0.02] "dat" using 6:7 with linesp |
| |
| # and the piece de resistance: |
| ./mk-roc-graphs --scoreset=0 > set0 |
| ./mk-roc-graphs --scoreset=1 > set1 |
| ./mk-roc-graphs --scoreset=2 > set2 |
| ./mk-roc-graphs --scoreset=3 > set3 |
| gnuplot |
| set terminal png |
| set output "out.png" |
| set grid |
| set xlabel "False Positives (1.0 = 100% of ham marked as spam)" |
| set ylabel "False Negatives (1.0 = 100% of spam marked as ham)" |
| plot [0:0.1] [0:0.1] "set0" using 6:7 with linesp, \ |
| "set1" using 6:7 with linesp, \ |
| "set2" using 6:7 with linesp, \ |
| "set3" using 6:7 with linesp |
| |
| |