masses/fp-fn-statistics - spamassassin - Git at Google

 #!/usr/bin/perl -w
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 =head1 NAME

 fp-fn-statistics - Display statistics about the quality of scores

 =head1 SYNOPSIS

 fp-fn-statistics [options]

   Options:
     -c,--cffile=path	  Use path as the rules directory
     -s,--scoreset=n	  Use scoreset n
     -t,--threshold=n      Use a spam/ham threshold of n (default: 5)
     --lambda=n            Use a lambda value of n
     --spam=file           Location of mass-check spam log (spam.log)
     --ham=file            Location of mass-check ham log (ham.log)
     --fplog=file          File to which false positive logs should be saved
     --fnlog=file          File to which false negative logs should be saved

 =head1 DESCRIPTION

 B<fp-fn-statistics> first calculates the score each message from a
 masses.log would have under a new set of scores. It then aggregates
 the number of messages correctly and incorrectly found as spam and
 ham, and their average scores.

 In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as
 a result of the false positives and negatives mentioned above. This
 calculation takes into the value of lambda, which represents the cost
 of recovering a false positive, where 1 indicates a message is tagged
 only, 9 means the message is mailed back to sender asking for a token
 (TMDA style) and 999 means a message is delted. The default, 5,
 represents the message being moved to an infrequently read folder.

 B<fp-fn-statistics> can also save false positive and false negatives
 logs to a file for future analysis. If this is all you're doing, it
 could be accomplished a lot quicker with B<grep>, but why not reinvent
 the wheel?

 =cut

 use Getopt::Long;
 use strict;

 our ($opt_cffile, $opt_lambda, $opt_threshold, $opt_scoreset, $opt_spam, $opt_ham, $opt_fplog, $opt_fnlog);

 $opt_cffile = "../rules";
 $opt_threshold = 5;
 $opt_spam = 'spam.log';
 $opt_ham = 'ham.log';
 $opt_scoreset = 0;

 GetOptions("c|cffile=s" => \$opt_cffile,
 	   "lambda=f" => \$opt_lambda,
 	   "t|threshold=f" => \$opt_threshold,
 	   "spam=s" => \$opt_spam,
 	   "ham=s" => \$opt_ham,
 	   "s|scoreset=i" => \$opt_scoreset,
 	   "fplog=s" => \$opt_fplog,
 	   "fnlog=s" => \$opt_fnlog
 	  );

 # If desired, report false positives and false negatives for analysis
 if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
 if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }

 # lambda value for TCR equation, representing the cost of of an FP vs. the
 # cost of a FN.  Some example values are: 1 = tagged only, 9 = mailed back
 # to sender asking for token, 999 = blocking or deleting a message.
 #
 # We roughly aim for a value representing "moved to infrequently-read folder".

 my $lambda = 50;
 if ($opt_lambda) { $lambda = $opt_lambda; }

 our (%scores, %allrules, %rules);

 readscores();

 die "wrong scoreset in tmp/rules.pl" unless $allrules{_scoreset} == $opt_scoreset;

 print "Reading per-message hit stat logs and scores...\n";
 my ($num_spam, $num_ham);
 my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);

 readlogs();

 evaluate();

 # show memory usage before we exit
 # print "Running \"ps aux\"...\n";
 # open(PS, "ps aux|");
 # while(<PS>) {
 # print if $. == 1 || /\b$$\b/;
 # }
 # close(PS);

 exit 0;

 # arguments are $isspam, $count, \@tests, $msgline;
 sub log_line_count {
   my $score = 0;
   $score += $scores{$_} for @{$_[2]};

   if ($_[0]) {
     $num_spam++;
     if ($score >= $opt_threshold) {
       $ga_yy++;
       $yyscore += $score;
     }
     else {
       $ga_yn++;
       $ynscore += $score;
       if (defined $opt_fnlog) {
 	print FNLOG $_[3];
       }
     }
   }
   else {
     $num_ham++;
     if ($score >= $opt_threshold) {
       #print STDERR "FP: $id\n";
       $ga_ny++;
       $nyscore += $score;
       if (defined $opt_fplog) {
 	print FPLOG $_[3];
       }
     }
     else {
       $ga_nn++;
       $nnscore += $score;
     }
   }
 }

 sub readlogs {
   my $msgline;
   my $count = 0;
   $num_spam = $num_ham = 0;

   $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
   $yyscore = $ynscore = $nyscore = $nnscore = 0.0;

   foreach my $file ($opt_spam, $opt_ham) {
     open (IN, "<$file") || die "Could not open file '$file': $!";

     my $isspam = ($file eq $opt_spam);
     my $caught;			# 1st parameter of log line
     my $rules;			# 4th parameter of log line

     while (defined($msgline = <IN>)) {
       ($caught, undef, undef, $rules) = split(' ', $msgline);

       # only take lines starting with Y or .
       next unless ($caught eq 'Y' || $caught eq '.') && $rules;

       # get tests, but ignore unknown tests and subrules
       my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
 	split(/,/, $rules);

       # run handler
       log_line_count($isspam, $count, \@tests, $msgline);

       # increment line
       $count++;
     }
     close IN;
   }
 }

 sub readscores {
   print "Reading scores from \"$opt_cffile\"...\n";
   my $cmd = "../build/parse-rules-for-masses -o ./tmp/rules_$$.pl -d \"$opt_cffile\" -s $opt_scoreset";
   warn "[$cmd]\n";
   system ($cmd) and die;
   require "./tmp/rules_$$.pl";
   unlink "./tmp/rules_$$.pl";
   %allrules = %rules;           # ensure it stays global
 }

 sub evaluate {
    printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold);
    printf "# Correctly non-spam: %6d  %4.2f%%\n",
        $ga_nn, ($ga_nn /  $num_ham) * 100.0;
    printf "# Correctly spam:     %6d  %4.2f%%\n",
        $ga_yy, ($ga_yy /  $num_spam) * 100.0;
    printf "# False positives:    %6d  %4.2f%%\n",
        $ga_ny, ($ga_ny /  $num_ham) * 100.0;
    printf "# False negatives:    %6d  %4.2f%%\n",
        $ga_yn, ($ga_yn /  $num_spam) * 100.0;

   # convert to the TCR metrics used in the published lit
   my $nspamspam = $ga_yy;
   my $nspamlegit = $ga_yn;
   my $nlegitspam = $ga_ny;
   my $nlegitlegit = $ga_yn;
   my $nlegit = $num_ham;
   my $nspam = $num_spam;

   my $werr = ($lambda * $nlegitspam + $nspamlegit)
                   / ($lambda * $nlegit + $nspam);

   my $werr_base = $nspam
                   / ($lambda * $nlegit + $nspam);

   $werr ||= 0.000001;     # avoid / by 0
   my $tcr = $werr_base / $werr;

   my $sr = ($nspamspam / $nspam) * 100.0;
   my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
   printf "# TCR(l=%s): %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%\n",
     $lambda, $tcr, $sr, $sp;
 }
	#!/usr/bin/perl -w
	#
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	=head1 NAME

	fp-fn-statistics - Display statistics about the quality of scores

	=head1 SYNOPSIS

	fp-fn-statistics [options]

	Options:
	-c,--cffile=path Use path as the rules directory
	-s,--scoreset=n Use scoreset n
	-t,--threshold=n Use a spam/ham threshold of n (default: 5)
	--lambda=n Use a lambda value of n
	--spam=file Location of mass-check spam log (spam.log)
	--ham=file Location of mass-check ham log (ham.log)
	--fplog=file File to which false positive logs should be saved
	--fnlog=file File to which false negative logs should be saved

	=head1 DESCRIPTION

	B<fp-fn-statistics> first calculates the score each message from a
	masses.log would have under a new set of scores. It then aggregates
	the number of messages correctly and incorrectly found as spam and
	ham, and their average scores.

	In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as
	a result of the false positives and negatives mentioned above. This
	calculation takes into the value of lambda, which represents the cost
	of recovering a false positive, where 1 indicates a message is tagged
	only, 9 means the message is mailed back to sender asking for a token
	(TMDA style) and 999 means a message is delted. The default, 5,
	represents the message being moved to an infrequently read folder.

	B<fp-fn-statistics> can also save false positive and false negatives
	logs to a file for future analysis. If this is all you're doing, it
	could be accomplished a lot quicker with B<grep>, but why not reinvent
	the wheel?

	=cut

	use Getopt::Long;
	use strict;

	our ($opt_cffile, $opt_lambda, $opt_threshold, $opt_scoreset, $opt_spam, $opt_ham, $opt_fplog, $opt_fnlog);

	$opt_cffile = "../rules";
	$opt_threshold = 5;
	$opt_spam = 'spam.log';
	$opt_ham = 'ham.log';
	$opt_scoreset = 0;

	GetOptions("c\|cffile=s" => \$opt_cffile,
	"lambda=f" => \$opt_lambda,
	"t\|threshold=f" => \$opt_threshold,
	"spam=s" => \$opt_spam,
	"ham=s" => \$opt_ham,
	"s\|scoreset=i" => \$opt_scoreset,
	"fplog=s" => \$opt_fplog,
	"fnlog=s" => \$opt_fnlog
	);

	# If desired, report false positives and false negatives for analysis
	if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
	if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }

	# lambda value for TCR equation, representing the cost of of an FP vs. the
	# cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back
	# to sender asking for token, 999 = blocking or deleting a message.
	#
	# We roughly aim for a value representing "moved to infrequently-read folder".

	my $lambda = 50;
	if ($opt_lambda) { $lambda = $opt_lambda; }

	our (%scores, %allrules, %rules);

	readscores();

	die "wrong scoreset in tmp/rules.pl" unless $allrules{_scoreset} == $opt_scoreset;

	print "Reading per-message hit stat logs and scores...\n";
	my ($num_spam, $num_ham);
	my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);

	readlogs();

	evaluate();

	# show memory usage before we exit
	# print "Running \"ps aux\"...\n";
	# open(PS, "ps aux\|");
	# while(<PS>) {
	# print if $. == 1 \|\| /\b$$\b/;
	# }
	# close(PS);

	exit 0;

	# arguments are $isspam, $count, \@tests, $msgline;
	sub log_line_count {
	my $score = 0;
	$score += $scores{$_} for @{$_[2]};

	if ($_[0]) {
	$num_spam++;
	if ($score >= $opt_threshold) {
	$ga_yy++;
	$yyscore += $score;
	}
	else {
	$ga_yn++;
	$ynscore += $score;
	if (defined $opt_fnlog) {
	print FNLOG $_[3];
	}
	}
	}
	else {
	$num_ham++;
	if ($score >= $opt_threshold) {
	#print STDERR "FP: $id\n";
	$ga_ny++;
	$nyscore += $score;
	if (defined $opt_fplog) {
	print FPLOG $_[3];
	}
	}
	else {
	$ga_nn++;
	$nnscore += $score;
	}
	}
	}

	sub readlogs {
	my $msgline;
	my $count = 0;
	$num_spam = $num_ham = 0;

	$ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
	$yyscore = $ynscore = $nyscore = $nnscore = 0.0;

	foreach my $file ($opt_spam, $opt_ham) {
	open (IN, "<$file") \|\| die "Could not open file '$file': $!";

	my $isspam = ($file eq $opt_spam);
	my $caught; # 1st parameter of log line
	my $rules; # 4th parameter of log line

	while (defined($msgline = <IN>)) {
	($caught, undef, undef, $rules) = split(' ', $msgline);

	# only take lines starting with Y or .
	next unless ($caught eq 'Y' \|\| $caught eq '.') && $rules;

	# get tests, but ignore unknown tests and subrules
	my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
	split(/,/, $rules);

	# run handler
	log_line_count($isspam, $count, \@tests, $msgline);

	# increment line
	$count++;
	}
	close IN;
	}
	}

	sub readscores {
	print "Reading scores from \"$opt_cffile\"...\n";
	my $cmd = "../build/parse-rules-for-masses -o ./tmp/rules_$$.pl -d \"$opt_cffile\" -s $opt_scoreset";
	warn "[$cmd]\n";
	system ($cmd) and die;
	require "./tmp/rules_$$.pl";
	unlink "./tmp/rules_$$.pl";
	%allrules = %rules; # ensure it stays global
	}

	sub evaluate {
	printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold);
	printf "# Correctly non-spam: %6d %4.2f%%\n",
	$ga_nn, ($ga_nn / $num_ham) * 100.0;
	printf "# Correctly spam: %6d %4.2f%%\n",
	$ga_yy, ($ga_yy / $num_spam) * 100.0;
	printf "# False positives: %6d %4.2f%%\n",
	$ga_ny, ($ga_ny / $num_ham) * 100.0;
	printf "# False negatives: %6d %4.2f%%\n",
	$ga_yn, ($ga_yn / $num_spam) * 100.0;

	# convert to the TCR metrics used in the published lit
	my $nspamspam = $ga_yy;
	my $nspamlegit = $ga_yn;
	my $nlegitspam = $ga_ny;
	my $nlegitlegit = $ga_yn;
	my $nlegit = $num_ham;
	my $nspam = $num_spam;

	my $werr = ($lambda * $nlegitspam + $nspamlegit)
	/ ($lambda * $nlegit + $nspam);

	my $werr_base = $nspam
	/ ($lambda * $nlegit + $nspam);

	$werr \|\|= 0.000001; # avoid / by 0
	my $tcr = $werr_base / $werr;

	my $sr = ($nspamspam / $nspam) * 100.0;
	my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
	printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n",
	$lambda, $tcr, $sr, $sp;
	}