blob: a0ea9fa352b8a7183195fb14d0bc7fca0a641f1b [file] [log] [blame]
#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
=head1 NAME
fp-fn-statistics - Display statistics about the quality of scores
=head1 SYNOPSIS
fp-fn-statistics [options]
Options:
-c,--cffile=path Use path as the rules directory
-s,--scoreset=n Use scoreset n
-t,--threshold=n Use a spam/ham threshold of n (default: 5)
--lambda=n Use a lambda value of n
--spam=file Location of mass-check spam log (spam.log)
--ham=file Location of mass-check ham log (ham.log)
--fplog=file File to which false positive logs should be saved
--fnlog=file File to which false negative logs should be saved
=head1 DESCRIPTION
B<fp-fn-statistics> first calculates the score each message from a
masses.log would have under a new set of scores. It then aggregates
the number of messages correctly and incorrectly found as spam and
ham, and their average scores.
In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as
a result of the false positives and negatives mentioned above. This
calculation takes into the value of lambda, which represents the cost
of recovering a false positive, where 1 indicates a message is tagged
only, 9 means the message is mailed back to sender asking for a token
(TMDA style) and 999 means a message is delted. The default, 5,
represents the message being moved to an infrequently read folder.
B<fp-fn-statistics> can also save false positive and false negatives
logs to a file for future analysis. If this is all you're doing, it
could be accomplished a lot quicker with B<grep>, but why not reinvent
the wheel?
=cut
use Getopt::Long;
use strict;
our ($opt_cffile, $opt_lambda, $opt_threshold, $opt_scoreset, $opt_spam, $opt_ham, $opt_fplog, $opt_fnlog);
$opt_cffile = "../rules";
$opt_threshold = 5;
$opt_spam = 'spam.log';
$opt_ham = 'ham.log';
$opt_scoreset = 0;
GetOptions("c|cffile=s" => \$opt_cffile,
"lambda=f" => \$opt_lambda,
"t|threshold=f" => \$opt_threshold,
"spam=s" => \$opt_spam,
"ham=s" => \$opt_ham,
"s|scoreset=i" => \$opt_scoreset,
"fplog=s" => \$opt_fplog,
"fnlog=s" => \$opt_fnlog
);
# If desired, report false positives and false negatives for analysis
if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }
# lambda value for TCR equation, representing the cost of of an FP vs. the
# cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back
# to sender asking for token, 999 = blocking or deleting a message.
#
# We roughly aim for a value representing "moved to infrequently-read folder".
my $lambda = 50;
if ($opt_lambda) { $lambda = $opt_lambda; }
our (%scores, %allrules, %rules);
readscores();
die "wrong scoreset in tmp/rules.pl" unless $allrules{_scoreset} == $opt_scoreset;
print "Reading per-message hit stat logs and scores...\n";
my ($num_spam, $num_ham);
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
readlogs();
evaluate();
# show memory usage before we exit
# print "Running \"ps aux\"...\n";
# open(PS, "ps aux|");
# while(<PS>) {
# print if $. == 1 || /\b$$\b/;
# }
# close(PS);
exit 0;
# arguments are $isspam, $count, \@tests, $msgline;
sub log_line_count {
my $score = 0;
$score += $scores{$_} for @{$_[2]};
if ($_[0]) {
$num_spam++;
if ($score >= $opt_threshold) {
$ga_yy++;
$yyscore += $score;
}
else {
$ga_yn++;
$ynscore += $score;
if (defined $opt_fnlog) {
print FNLOG $_[3];
}
}
}
else {
$num_ham++;
if ($score >= $opt_threshold) {
#print STDERR "FP: $id\n";
$ga_ny++;
$nyscore += $score;
if (defined $opt_fplog) {
print FPLOG $_[3];
}
}
else {
$ga_nn++;
$nnscore += $score;
}
}
}
sub readlogs {
my $msgline;
my $count = 0;
$num_spam = $num_ham = 0;
$ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
$yyscore = $ynscore = $nyscore = $nnscore = 0.0;
foreach my $file ($opt_spam, $opt_ham) {
open (IN, "<$file") || die "Could not open file '$file': $!";
my $isspam = ($file eq $opt_spam);
my $caught; # 1st parameter of log line
my $rules; # 4th parameter of log line
while (defined($msgline = <IN>)) {
($caught, undef, undef, $rules) = split(' ', $msgline);
# only take lines starting with Y or .
next unless ($caught eq 'Y' || $caught eq '.') && $rules;
# get tests, but ignore unknown tests and subrules
my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
split(/,/, $rules);
# run handler
log_line_count($isspam, $count, \@tests, $msgline);
# increment line
$count++;
}
close IN;
}
}
sub readscores {
print "Reading scores from \"$opt_cffile\"...\n";
my $cmd = "../build/parse-rules-for-masses -o ./tmp/rules_$$.pl -d \"$opt_cffile\" -s $opt_scoreset";
warn "[$cmd]\n";
system ($cmd) and die;
require "./tmp/rules_$$.pl";
unlink "./tmp/rules_$$.pl";
%allrules = %rules; # ensure it stays global
}
sub evaluate {
printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold);
printf "# Correctly non-spam: %6d %4.2f%%\n",
$ga_nn, ($ga_nn / $num_ham) * 100.0;
printf "# Correctly spam: %6d %4.2f%%\n",
$ga_yy, ($ga_yy / $num_spam) * 100.0;
printf "# False positives: %6d %4.2f%%\n",
$ga_ny, ($ga_ny / $num_ham) * 100.0;
printf "# False negatives: %6d %4.2f%%\n",
$ga_yn, ($ga_yn / $num_spam) * 100.0;
# convert to the TCR metrics used in the published lit
my $nspamspam = $ga_yy;
my $nspamlegit = $ga_yn;
my $nlegitspam = $ga_ny;
my $nlegitlegit = $ga_yn;
my $nlegit = $num_ham;
my $nspam = $num_spam;
my $werr = ($lambda * $nlegitspam + $nspamlegit)
/ ($lambda * $nlegit + $nspam);
my $werr_base = $nspam
/ ($lambda * $nlegit + $nspam);
$werr ||= 0.000001; # avoid / by 0
my $tcr = $werr_base / $werr;
my $sr = ($nspamspam / $nspam) * 100.0;
my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n",
$lambda, $tcr, $sr, $sp;
}