masses/mk-roc-graphs - spamassassin - Git at Google

 #!/usr/bin/perl -w
 #
 # mk-roc-graphs -- create Receiver Operating Characteristic graphs
 #
 # Creates ROC curve graphs from a pair of SpamAssassin logs and the current
 # score-set.  See <http://home.comcast.net/~tom.fawcett/public_html/ROCCH/>
 # for details on ROC curves.
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 use Getopt::Long;
 our ($opt_cffile, $opt_spam, $opt_ham, $opt_count, $opt_lambda);

 GetOptions("cffile=s", "count", "threshold=f", "spam=s",
 	   "ham=s", "scoreset=i", "lambda=f");

 $opt_cffile ||= "../rules";
 $opt_spam ||= 'spam.log';
 $opt_ham ||= 'ham.log';
 $opt_count ||= 0;

 $opt_scoreset = 0 if (!defined $opt_scoreset);

 my $msgline;

 our (%rules, %allrules);

 my %spam_points = ();
 my %ham_points = ();
 my ($num_tests, $num_spam, $num_ham);

 my $nybias = 10;

 # lambda value for TCR equation, representing the cost of of an FP vs. the
 # cost of a FN.  Some example values are: 1 = tagged only, 9 = mailed back
 # to sender asking for token, 999 = blocking or deleting a message.
 #
 # We roughly aim for a value representing "moved to infrequently-read folder".

 my $lambda = 50;
 if ($opt_lambda) { $lambda = $opt_lambda; }


 readscores();
 readlogs();

 if ($opt_count) {
   $nybias = $nybias*($num_spam / $num_ham);
   evaluate_reports();
 }
 else {
   evaluate_scores();
 }
 exit 0;


 # arguments are $isspam, $count, \@tests
 sub log_line_count {
   my $score = 0;
   $score += $scores{$_} for @{$_[2]};

   $score = (int($score * 10) / 10);
   if ($_[0]) {
     $num_spam++;
     $spam_points{$score}++;
   }
   else {
     $num_ham++;
     $ham_points{$score}++;
   }
 }

 sub readlogs {
   my $count = 0;
   $num_spam = $num_ham = 0;

   # set handler for log lines
   my $log_line = \&log_line_count;

   foreach my $file ($opt_spam, $opt_ham) {
     open (IN, "<$file") || die "Could not open file '$file': $!";

     my $isspam = ($file eq $opt_spam);
     my $caught;			# 1st parameter of log line
     my $rules;			# 4th parameter of log line

     while (defined($msgline = <IN>)) {
       ($caught, undef, undef, $rules) = split(' ', $msgline);

       # only take lines starting with Y or .
       next unless ($caught eq 'Y' || $caught eq '.') && $rules;

       # get tests, but ignore unknown tests and subrules
       my @tests;
       foreach my $r (split(/,/, $rules)) {
         my $hits = 1;
         # Support compacted RULE(hitcount) format
         if ($r =~ s/\((\d+)\)$//) {
           $hits = $1;
         }
         next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
         push @tests, $r for (1 .. $hits);
       }

       # run handler
       $log_line->($isspam, $count, \@tests);

       # increment line
       $count++;
     }
     close IN;
   }
   $num_tests = $count;
 }

 sub readscores {
   warn "Reading scores from \"$opt_cffile\"...\n";
   system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
   require "./tmp/rules.pl";
   %allrules = %rules;           # ensure it stays global
 }

 sub _evaluate_at_all_thresholds {
   my $callback = shift;

   my %u=();
   my @scores = grep {defined} map {
     if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; }
   } sort { $a <=> $b } (keys %ham_points, keys %spam_points);

   my $tot_h = $num_ham || 0.00001;
   my $tot_s = $num_spam || 0.00001;

   foreach my $threshold (@scores) {
     my $fp = 0;     # false positives; ham marked as spam
     my $fn = 0;     # false negatives; spam marked as ham
     my $tp = 0;     # true positives; spam marked as spam
     my $tn = 0;     # true negatives; ham marked as ham
     foreach my $scr (@scores) {
       my $nh = $ham_points{$scr} || 0;
       my $ns = $spam_points{$scr} || 0;
       if ($threshold > $scr) {
         $tn += $nh;
         $fn += $ns;
       } else {
         $fp += $nh;
         $tp += $ns;
       }
     }
     $callback->($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp);
   }
 }

 sub evaluate_reports {
   _evaluate_at_all_thresholds (sub {
     my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
     my $nh = $ham_points{$threshold} || 0;
     my $ns = $spam_points{$threshold} || 0;
     # $nh /= $tot_h;
     # $ns /= $tot_s;
     # $tn /= $tot_h;
     # $tp /= $tot_s;
     # $fp /= $tot_h;
     # $fn /= $tot_s;

     printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
     printf "# Correctly non-spam: %6d  %4.2f%%\n",
         $tn, ($tn /  $tot_h) * 100.0;
     printf "# Correctly spam:     %6d  %4.2f%%\n",
         $tp, ($tp /  $tot_s) * 100.0;
     printf "# False positives:    %6d  %4.2f%%\n",
         $fp, ($fp /  $tot_h) * 100.0;
     printf "# False negatives:    %6d  %4.2f%%\n",
         $fn, ($fn /  $tot_s) * 100.0;

     # convert to the TCR metrics used in the published lit
     my $nspamspam = $tp;
     my $nspamlegit = $fn;
     my $nlegitspam = $fp;
     my $nlegitlegit = $tn;
     my $nlegit = $tot_h;
     my $nspam = $tot_s;

     my $werr = ($lambda * $nlegitspam + $nspamlegit)
                     / ($lambda * $nlegit + $nspam);

     my $werr_base = $nspam
                     / ($lambda * $nlegit + $nspam);

     $werr ||= 0.000001;     # avoid / by 0
     my $tcr = $werr_base / $werr;

     my $sr = ($nspamspam / $nspam) * 100.0;
     my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
     printf "# TCR(l=%s): %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%\n",
       $lambda, $tcr, $sr, $sp;
   });
 }

 sub evaluate_scores {
   _evaluate_at_all_thresholds (sub {
     my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
     my $nh = $ham_points{$threshold} || 0;
     my $ns = $spam_points{$threshold} || 0;
     $nh /= $tot_h;
     $ns /= $tot_s;
     $tn /= $tot_h;
     $tp /= $tot_s;
     $fp /= $tot_h;
     $fn /= $tot_s;
     printf "%f %.9f %.9f %.9f %.9f %.9f %.9f\n",
         $threshold, $nh, $ns, $tn, $tp, $fp, $fn;
   });
 }

 __DATA__

 # Suggested commands to plot data from this script:

 # ham score distribution:
 ./mk-roc-graphs --scoreset=3  > dat
 gnuplot
 set grid
 plot [-5:40] [] "dat" using 1:2 with boxes

 # spam score distribution:
 ./mk-roc-graphs --scoreset=3  > dat
 gnuplot
 set grid
 plot [-5:40] [] "dat" using 1:3 with boxes

 # plot a ROC curve:
 ./mk-roc-graphs --scoreset=3  > dat
 gnuplot
 set grid
 plot [0:0.1] [0:0.1] "dat" using 6:7 with linesp

 # For a ROC curve, as shown in the literature
 ./mk-roc-graphs --scoreset=3 > dat
 set grid
 set xlabel "False Positives (1.0 = 100%)"
 set ylabel "True Positives (1.0 = 100%)"
 plot "dat" using 6:5 with linesp

 # same, but tighter:
 ./mk-roc-graphs --scoreset=3  > dat
 gnuplot
 set grid
 plot [0:0.02] [0:0.02] "dat" using 6:7 with linesp

 # and the piece de resistance:
 ./mk-roc-graphs --scoreset=0  > set0
 ./mk-roc-graphs --scoreset=1  > set1
 ./mk-roc-graphs --scoreset=2  > set2
 ./mk-roc-graphs --scoreset=3  > set3
 gnuplot
 set terminal png
 set output "out.png"
 set grid
 set xlabel "False Positives (1.0 = 100% of ham marked as spam)"
 set ylabel "False Negatives (1.0 = 100% of spam marked as ham)"
 plot [0:0.1] [0:0.1] "set0" using 6:7 with linesp, \
     "set1" using 6:7 with linesp, \
     "set2" using 6:7 with linesp, \
     "set3" using 6:7 with linesp
	#!/usr/bin/perl -w
	#
	# mk-roc-graphs -- create Receiver Operating Characteristic graphs
	#
	# Creates ROC curve graphs from a pair of SpamAssassin logs and the current
	# score-set. See <http://home.comcast.net/~tom.fawcett/public_html/ROCCH/>
	# for details on ROC curves.
	#
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	use Getopt::Long;
	our ($opt_cffile, $opt_spam, $opt_ham, $opt_count, $opt_lambda);

	GetOptions("cffile=s", "count", "threshold=f", "spam=s",
	"ham=s", "scoreset=i", "lambda=f");

	$opt_cffile \|\|= "../rules";
	$opt_spam \|\|= 'spam.log';
	$opt_ham \|\|= 'ham.log';
	$opt_count \|\|= 0;

	$opt_scoreset = 0 if (!defined $opt_scoreset);

	my $msgline;

	our (%rules, %allrules);

	my %spam_points = ();
	my %ham_points = ();
	my ($num_tests, $num_spam, $num_ham);

	my $nybias = 10;

	# lambda value for TCR equation, representing the cost of of an FP vs. the
	# cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back
	# to sender asking for token, 999 = blocking or deleting a message.
	#
	# We roughly aim for a value representing "moved to infrequently-read folder".

	my $lambda = 50;
	if ($opt_lambda) { $lambda = $opt_lambda; }


	readscores();
	readlogs();

	if ($opt_count) {
	$nybias = $nybias*($num_spam / $num_ham);
	evaluate_reports();
	}
	else {
	evaluate_scores();
	}
	exit 0;


	# arguments are $isspam, $count, \@tests
	sub log_line_count {
	my $score = 0;
	$score += $scores{$_} for @{$_[2]};

	$score = (int($score * 10) / 10);
	if ($_[0]) {
	$num_spam++;
	$spam_points{$score}++;
	}
	else {
	$num_ham++;
	$ham_points{$score}++;
	}
	}

	sub readlogs {
	my $count = 0;
	$num_spam = $num_ham = 0;

	# set handler for log lines
	my $log_line = \&log_line_count;

	foreach my $file ($opt_spam, $opt_ham) {
	open (IN, "<$file") \|\| die "Could not open file '$file': $!";

	my $isspam = ($file eq $opt_spam);
	my $caught; # 1st parameter of log line
	my $rules; # 4th parameter of log line

	while (defined($msgline = <IN>)) {
	($caught, undef, undef, $rules) = split(' ', $msgline);

	# only take lines starting with Y or .
	next unless ($caught eq 'Y' \|\| $caught eq '.') && $rules;

	# get tests, but ignore unknown tests and subrules
	my @tests;
	foreach my $r (split(/,/, $rules)) {
	my $hits = 1;
	# Support compacted RULE(hitcount) format
	if ($r =~ s/\((\d+)\)$//) {
	$hits = $1;
	}
	next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
	push @tests, $r for (1 .. $hits);
	}

	# run handler
	$log_line->($isspam, $count, \@tests);

	# increment line
	$count++;
	}
	close IN;
	}
	$num_tests = $count;
	}

	sub readscores {
	warn "Reading scores from \"$opt_cffile\"...\n";
	system ("../build/parse-rules-for-masses -d \"$opt_cffile\" -s $opt_scoreset") and die;
	require "./tmp/rules.pl";
	%allrules = %rules; # ensure it stays global
	}

	sub _evaluate_at_all_thresholds {
	my $callback = shift;

	my %u=();
	my @scores = grep {defined} map {
	if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; }
	} sort { $a <=> $b } (keys %ham_points, keys %spam_points);

	my $tot_h = $num_ham \|\| 0.00001;
	my $tot_s = $num_spam \|\| 0.00001;

	foreach my $threshold (@scores) {
	my $fp = 0; # false positives; ham marked as spam
	my $fn = 0; # false negatives; spam marked as ham
	my $tp = 0; # true positives; spam marked as spam
	my $tn = 0; # true negatives; ham marked as ham
	foreach my $scr (@scores) {
	my $nh = $ham_points{$scr} \|\| 0;
	my $ns = $spam_points{$scr} \|\| 0;
	if ($threshold > $scr) {
	$tn += $nh;
	$fn += $ns;
	} else {
	$fp += $nh;
	$tp += $ns;
	}
	}
	$callback->($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp);
	}
	}

	sub evaluate_reports {
	_evaluate_at_all_thresholds (sub {
	my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
	my $nh = $ham_points{$threshold} \|\| 0;
	my $ns = $spam_points{$threshold} \|\| 0;
	# $nh /= $tot_h;
	# $ns /= $tot_s;
	# $tn /= $tot_h;
	# $tp /= $tot_s;
	# $fp /= $tot_h;
	# $fn /= $tot_s;

	printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
	printf "# Correctly non-spam: %6d %4.2f%%\n",
	$tn, ($tn / $tot_h) * 100.0;
	printf "# Correctly spam: %6d %4.2f%%\n",
	$tp, ($tp / $tot_s) * 100.0;
	printf "# False positives: %6d %4.2f%%\n",
	$fp, ($fp / $tot_h) * 100.0;
	printf "# False negatives: %6d %4.2f%%\n",
	$fn, ($fn / $tot_s) * 100.0;

	# convert to the TCR metrics used in the published lit
	my $nspamspam = $tp;
	my $nspamlegit = $fn;
	my $nlegitspam = $fp;
	my $nlegitlegit = $tn;
	my $nlegit = $tot_h;
	my $nspam = $tot_s;

	my $werr = ($lambda * $nlegitspam + $nspamlegit)
	/ ($lambda * $nlegit + $nspam);

	my $werr_base = $nspam
	/ ($lambda * $nlegit + $nspam);

	$werr \|\|= 0.000001; # avoid / by 0
	my $tcr = $werr_base / $werr;

	my $sr = ($nspamspam / $nspam) * 100.0;
	my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
	printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n",
	$lambda, $tcr, $sr, $sp;
	});
	}

	sub evaluate_scores {
	_evaluate_at_all_thresholds (sub {
	my ($threshold, $tot_h, $tot_s, $tn, $tp, $fn, $fp) = @_;
	my $nh = $ham_points{$threshold} \|\| 0;
	my $ns = $spam_points{$threshold} \|\| 0;
	$nh /= $tot_h;
	$ns /= $tot_s;
	$tn /= $tot_h;
	$tp /= $tot_s;
	$fp /= $tot_h;
	$fn /= $tot_s;
	printf "%f %.9f %.9f %.9f %.9f %.9f %.9f\n",
	$threshold, $nh, $ns, $tn, $tp, $fp, $fn;
	});
	}

	__DATA__

	# Suggested commands to plot data from this script:

	# ham score distribution:
	./mk-roc-graphs --scoreset=3 > dat
	gnuplot
	set grid
	plot [-5:40] [] "dat" using 1:2 with boxes

	# spam score distribution:
	./mk-roc-graphs --scoreset=3 > dat
	gnuplot
	set grid
	plot [-5:40] [] "dat" using 1:3 with boxes

	# plot a ROC curve:
	./mk-roc-graphs --scoreset=3 > dat
	gnuplot
	set grid
	plot [0:0.1] [0:0.1] "dat" using 6:7 with linesp

	# For a ROC curve, as shown in the literature
	./mk-roc-graphs --scoreset=3 > dat
	set grid
	set xlabel "False Positives (1.0 = 100%)"
	set ylabel "True Positives (1.0 = 100%)"
	plot "dat" using 6:5 with linesp

	# same, but tighter:
	./mk-roc-graphs --scoreset=3 > dat
	gnuplot
	set grid
	plot [0:0.02] [0:0.02] "dat" using 6:7 with linesp

	# and the piece de resistance:
	./mk-roc-graphs --scoreset=0 > set0
	./mk-roc-graphs --scoreset=1 > set1
	./mk-roc-graphs --scoreset=2 > set2
	./mk-roc-graphs --scoreset=3 > set3
	gnuplot
	set terminal png
	set output "out.png"
	set grid
	set xlabel "False Positives (1.0 = 100% of ham marked as spam)"
	set ylabel "False Negatives (1.0 = 100% of spam marked as ham)"
	plot [0:0.1] [0:0.1] "set0" using 6:7 with linesp, \
	"set1" using 6:7 with linesp, \
	"set2" using 6:7 with linesp, \
	"set3" using 6:7 with linesp