| #!/usr/bin/perl -w |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| =head1 NAME |
| |
| fp-fn-statistics - Display statistics about the quality of scores |
| |
| =head1 SYNOPSIS |
| |
| fp-fn-statistics [options] |
| |
| Options: |
| -c,--cffile=path Use path as the rules directory |
| -s,--scoreset=n Use scoreset n |
| -t,--threshold=n Use a spam/ham threshold of n (default: 5) |
| --lambda=n Use a lambda value of n |
| --spam=file Location of mass-check spam log (spam.log) |
| --ham=file Location of mass-check ham log (ham.log) |
| --fplog=file File to which false positive logs should be saved |
| --fnlog=file File to which false negative logs should be saved |
| |
| =head1 DESCRIPTION |
| |
| B<fp-fn-statistics> first calculates the score each message from a |
| masses.log would have under a new set of scores. It then aggregates |
| the number of messages correctly and incorrectly found as spam and |
| ham, and their average scores. |
| |
| In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as |
| a result of the false positives and negatives mentioned above. This |
| calculation takes into the value of lambda, which represents the cost |
| of recovering a false positive, where 1 indicates a message is tagged |
| only, 9 means the message is mailed back to sender asking for a token |
| (TMDA style) and 999 means a message is delted. The default, 5, |
| represents the message being moved to an infrequently read folder. |
| |
| B<fp-fn-statistics> can also save false positive and false negatives |
| logs to a file for future analysis. If this is all you're doing, it |
| could be accomplished a lot quicker with B<grep>, but why not reinvent |
| the wheel? |
| |
| =cut |
| |
| use Getopt::Long; |
| use strict; |
| |
| our ($opt_cffile, $opt_lambda, $opt_threshold, $opt_scoreset, $opt_spam, $opt_ham, $opt_fplog, $opt_fnlog); |
| |
| $opt_cffile = "../rules"; |
| $opt_threshold = 5; |
| $opt_spam = 'spam.log'; |
| $opt_ham = 'ham.log'; |
| $opt_scoreset = 0; |
| |
| GetOptions("c|cffile=s" => \$opt_cffile, |
| "lambda=f" => \$opt_lambda, |
| "t|threshold=f" => \$opt_threshold, |
| "spam=s" => \$opt_spam, |
| "ham=s" => \$opt_ham, |
| "s|scoreset=i" => \$opt_scoreset, |
| "fplog=s" => \$opt_fplog, |
| "fnlog=s" => \$opt_fnlog |
| ); |
| |
| # If desired, report false positives and false negatives for analysis |
| if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); } |
| if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); } |
| |
| # lambda value for TCR equation, representing the cost of of an FP vs. the |
| # cost of a FN. Some example values are: 1 = tagged only, 9 = mailed back |
| # to sender asking for token, 999 = blocking or deleting a message. |
| # |
| # We roughly aim for a value representing "moved to infrequently-read folder". |
| |
| my $lambda = 50; |
| if ($opt_lambda) { $lambda = $opt_lambda; } |
| |
| our (%scores, %allrules, %rules); |
| |
| readscores(); |
| |
| die "wrong scoreset in tmp/rules.pl" unless $allrules{_scoreset} == $opt_scoreset; |
| |
| print "Reading per-message hit stat logs and scores...\n"; |
| my ($num_spam, $num_ham); |
| my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore); |
| |
| readlogs(); |
| |
| evaluate(); |
| |
| # show memory usage before we exit |
| # print "Running \"ps aux\"...\n"; |
| # open(PS, "ps aux|"); |
| # while(<PS>) { |
| # print if $. == 1 || /\b$$\b/; |
| # } |
| # close(PS); |
| |
| exit 0; |
| |
| # arguments are $isspam, $count, \@tests, $msgline; |
| sub log_line_count { |
| my $score = 0; |
| $score += $scores{$_} for @{$_[2]}; |
| |
| if ($_[0]) { |
| $num_spam++; |
| if ($score >= $opt_threshold) { |
| $ga_yy++; |
| $yyscore += $score; |
| } |
| else { |
| $ga_yn++; |
| $ynscore += $score; |
| if (defined $opt_fnlog) { |
| print FNLOG $_[3]; |
| } |
| } |
| } |
| else { |
| $num_ham++; |
| if ($score >= $opt_threshold) { |
| #print STDERR "FP: $id\n"; |
| $ga_ny++; |
| $nyscore += $score; |
| if (defined $opt_fplog) { |
| print FPLOG $_[3]; |
| } |
| } |
| else { |
| $ga_nn++; |
| $nnscore += $score; |
| } |
| } |
| } |
| |
| sub readlogs { |
| my $msgline; |
| my $count = 0; |
| $num_spam = $num_ham = 0; |
| |
| $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0; |
| $yyscore = $ynscore = $nyscore = $nnscore = 0.0; |
| |
| foreach my $file ($opt_spam, $opt_ham) { |
| open (IN, "<$file") || die "Could not open file '$file': $!"; |
| |
| my $isspam = ($file eq $opt_spam); |
| my $caught; # 1st parameter of log line |
| my $rules; # 4th parameter of log line |
| |
| while (defined($msgline = <IN>)) { |
| ($caught, undef, undef, $rules) = split(' ', $msgline); |
| |
| # only take lines starting with Y or . |
| next unless ($caught eq 'Y' || $caught eq '.') && $rules; |
| |
| # get tests, but ignore unknown tests and subrules |
| my @tests; |
| foreach my $r (split(/,/, $rules)) { |
| my $hits = 1; |
| # Support compacted RULE(hitcount) format |
| if ($r =~ s/\((\d+)\)$//) { |
| $hits = $1; |
| } |
| next unless (defined $scores{$r} && !$allrules{$r}->{issubrule}); |
| push @tests, $r for (1 .. $hits); |
| } |
| |
| # run handler |
| log_line_count($isspam, $count, \@tests, $msgline); |
| |
| # increment line |
| $count++; |
| } |
| close IN; |
| } |
| } |
| |
| sub readscores { |
| print "Reading scores from \"$opt_cffile\"...\n"; |
| my $cmd = "../build/parse-rules-for-masses -o ./tmp/rules_$$.pl -d \"$opt_cffile\" -s $opt_scoreset"; |
| warn "[$cmd]\n"; |
| system ($cmd) and die; |
| require "./tmp/rules_$$.pl"; |
| unlink "./tmp/rules_$$.pl"; |
| %allrules = %rules; # ensure it stays global |
| } |
| |
| sub evaluate { |
| printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_threshold); |
| printf "# Correctly non-spam: %6d %4.2f%%\n", |
| $ga_nn, ($ga_nn / $num_ham) * 100.0; |
| printf "# Correctly spam: %6d %4.2f%%\n", |
| $ga_yy, ($ga_yy / $num_spam) * 100.0; |
| printf "# False positives: %6d %4.2f%%\n", |
| $ga_ny, ($ga_ny / $num_ham) * 100.0; |
| printf "# False negatives: %6d %4.2f%%\n", |
| $ga_yn, ($ga_yn / $num_spam) * 100.0; |
| |
| # convert to the TCR metrics used in the published lit |
| my $nspamspam = $ga_yy; |
| my $nspamlegit = $ga_yn; |
| my $nlegitspam = $ga_ny; |
| my $nlegitlegit = $ga_yn; |
| my $nlegit = $num_ham; |
| my $nspam = $num_spam; |
| |
| my $werr = ($lambda * $nlegitspam + $nspamlegit) |
| / ($lambda * $nlegit + $nspam); |
| |
| my $werr_base = $nspam |
| / ($lambda * $nlegit + $nspam); |
| |
| $werr ||= 0.000001; # avoid / by 0 |
| my $tcr = $werr_base / $werr; |
| |
| my $sr = ($nspamspam / $nspam) * 100.0; |
| my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; |
| printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n", |
| $lambda, $tcr, $sr, $sp; |
| } |
| |