blob: 775843e1b12a0a783a2d8e2ef90e1ae8b65d0ea4 [file] [log] [blame]
#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
use strict;
use warnings;
use FindBin;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
our (
$opt_c, $opt_s, $opt_f, $opt_a, $opt_p, $opt_x, $opt_m, $opt_t, $opt_M,
$opt_X, $opt_L, $opt_l, $opt_i, $opt_T, $opt_o, $opt_S, $opt_P, $opt_d,
$opt_g
);
GetOptions("c|cffile=s" => \$opt_c,
"s|scoreset=i" => \$opt_s, # ,, pacify stupid emacs cperl mode
"f|falses" => \$opt_f,
"a|all" => \$opt_a,
"p|percentages" => \$opt_p,
"x|extended" => \$opt_x,
"m|matchrules=s" => \$opt_m,
"t|tflags=s" => \$opt_t,
"M|matchlogs=s" => \$opt_M,
"X|excludelogs=s" => \$opt_X,
"L|onlylanguage=s" => \$opt_L,
"l|alsolanguage=s" => \$opt_l,
"i|ig" => \$opt_i,
"T|times" => \$opt_T,
"o|overlaps" => \$opt_o,
"S|scoremap" => \$opt_S,
"P|promotion" => \$opt_P,
"d|xml" => \$opt_d,
"g|generated" => \$opt_g
);
=head1 NAME
hit-frequencies - Display statistics about tests hit by a mass-check run
=head1 SYNOPSIS
hit-frequencies [options] <spam-log> <ham-log>
Options:
-c,--cffile=path Use path as the rules directory
-s,--scoreset=n Use scoreset n
-f,--falses Count only false-positives/false-negatives
-a,--all Report all tests (including subrules)
-p,--percentages Report percentages instead of raw hits (implies -x)
-x,--extended "Extended" output, include RANK, S/O and SCORE
-m,--matchrules=re Print rules matching the regular expression
-t,--tflags=re Print only rules with tflags matching the regular expression
-M,--matchlogs=re Consider only logs matching the regular expression
-X,--excludelogs=re Exclude logs matching this regular expression
-L,--onlylanguage=lc Only print language specific tests for specified lang code (try 'all')
-l,--alsolanguage=lc Also print language specific tests for specified lang code (try 'all')
-i,--ig Use IG (information gain) for ranking
-T,--times Display rule times (implies -x, -p)
-o,--overlaps Display hit overlaps against all other rules
-S,--scoremap Display score-map of hits
-P,--promotion Flag rules that meet the promotion criteria
-d,--XML XML output (conflicts with -x, -p)
-g,--generated Include generated nightly scores for sandbox rules
=head1 DESCRIPTION
B<hit-frequencies> will read the mass-check logs F<spam.log> and
F<ham.log> or the logs given on the command line. The output will
contain a summary of the number of ham and spam messages and detailed
statistics for each rule. The output will include the following
columns:
=over 4
=item OVERALL
Number of times (or percentage with B<-p>) the rule hit on
all messages (spam or ham).
=item SPAM
Number of times (or percentage with B<-p>) the rule hit on
spam messages.
=item HAM
Number of times (or percentage with B<-p>) the rule hit on
ham messages.
=item FPOS
=item FNEG
Shown only with B<-f>, these refer to the number of times (or
percentage) the rule hit on messages that were found to be false
positives or false negatives.
=item S/O
Shown only with B<-x> or B<-p>, this is the number of spam hits
divided by total number of hits (C<S/O> refers to spam divided by
overall).
=item RANK
Shown only with B<-x> or B<-p>, and when B<-i> is not used, this is a
measure that attempts to indicate how I<good> or I<useful> a test
is. The higher it is, the better the test.
=item IG
Shown only with B<-i>, this is another measure that attempts to
indicate how I<useful> a test is.
=item SCORE
Shown only with B<-x> or B<-p>, this is the current score assigned to
the rule. If B<-g> is used, the scores generated for sandbox rules from
nightly masscheck results (72_scores.cf) will be included as well.
=item NAME
This is the rule's name.
=back
=head1 BUGS
Please report bugs to http://bugzilla.spamassassin.org/
=head1 SEE ALSO
L<mass-check(1)>, L<perceptron(1)>
=cut
if ($opt_l && $opt_L) {
pod2usage("-L/--alsolanguage and -l/--onlylanguage are mutually exclusive");
}
if ($opt_d && ($opt_x || $opt_p)) {
pod2usage("-d/--xml conflicts with -x/--extended and -p/--percentages");
}
$opt_s = 0 if ( !defined $opt_s );
if ($opt_p) {
$opt_x = 1;
}
if ($opt_d || $opt_T) {
$opt_x = $opt_p = 1;
}
# as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P
my $promote_so_min = 0.95;
my $promote_hitrate_min = 0.02;
my $promote_fprate_max = 1.00;
my $cffile = $opt_c || "$FindBin::Bin/../rules";
# "our" so that the require'd file can overwrite them
my $rules_pl_unparseable;
our %rules = ();
our %scores = ();
my %soratio = ();
my %freq_spam = ();
my %freq_ham = ();
my %hmap_spam = ();
my %hmap_ham = ();
my %scoremap_spam = ();
my %scoremap_ham = ();
my %freq = ();
my $num_spam = 0;
my $num_ham = 0;
my %ranking = ();
my $ok_lang = '';
my %meta_subrule_pairs = ();
my %rule_times = ();
readscores($cffile);
$ok_lang = lc ($opt_l || $opt_L || '');
if ($ok_lang eq 'all') { $ok_lang = '.'; }
if (($opt_t || $opt_o) && $rules_pl_unparseable) {
die "-t/-o require rules.pl to be parseable";
}
foreach my $key (keys %rules) {
if ($key eq '_scoreset') {
delete $rules{$key}; # bug 5683
next;
}
if ( ($opt_L && !$rules{$key}->{lang}) ||
($rules{$key}->{lang} &&
(!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/io)
) ) {
delete $rules{$key} ; next;
}
$freq_spam{$key} = 0;
$freq_ham{$key} = 0;
if ($opt_o) {
$hmap_spam{$key} = '';
$hmap_ham{$key} = '';
}
}
readlogs();
my $hdr_all = $num_spam + $num_ham;
my $hdr_spam = $num_spam;
my $hdr_ham = $num_ham;
my $sorting = $opt_i ? "IG" : "RANK";
if ($opt_d) {
$hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements
$hdr_spam = ($num_spam / $hdr_all) * 100.0;
$hdr_ham = ($num_ham / $hdr_all) * 100.0;
$opt_P = 1;
print qq{
<freqs>
<allmessages>
<count class='spam'>$num_spam</count>
<count class='ham'>$num_spam</count>
<pc class='spam'>$hdr_spam</pc>
<pc class='ham'>$hdr_spam</pc>
</allmessages>
};
}
elsif ($opt_p) {
printf "%7s %7s %7s %6s %6s %6s %s\n",
$opt_T?"MSECS":"OVERALL", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
"S/O", $sorting, "SCORE", "NAME";
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
0, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;
$hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements
$hdr_spam = ($num_spam / $hdr_all) * 100.0;
$hdr_ham = ($num_ham / $hdr_all) * 100.0;
$hdr_all = 100.0; # this is obvious
printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n",
0, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;
}
elsif ($opt_p) {
printf "%8s %7s %7s %6s %6s %6s %s\n",
"OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
"S/O", $sorting, "SCORE", "NAME";
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
$hdr_all, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;
$hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements
$hdr_spam = ($num_spam / $hdr_all) * 100.0;
$hdr_ham = ($num_ham / $hdr_all) * 100.0;
$hdr_all = 100.0; # this is obvious
printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n",
$hdr_all, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;
}
elsif ($opt_x) {
printf "%7s %7s %7s %6s %6s %6s %s\n",
"OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPOS%":"HAM%", "S/O", $sorting, "SCORE", "NAME";
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
$hdr_all, $hdr_spam, $hdr_ham,
soratio ($num_spam,$num_ham), 0, 0;
} else {
printf "%10s %10s %10s %s\n",
"OVERALL", $opt_f?"FNEG":"SPAM", $opt_f?"FPO":"HAM",
"NAME";
printf "%10d %10d %10d (all messages)\n",
$hdr_all, $hdr_spam, $hdr_ham;
}
my %done = ();
my @tests = ();
my $rank_hi = 0;
my $rank_lo = 9999999;
# variables for wanted/unwanted RANK
my %wanted;
my %unwanted;
my %isnice;
my %wranks;
my %uranks;
# rules that we want to look at
$freq{$_}++ for keys %freq_ham;
$freq{$_}++ for keys %freq_spam;
# if a rule exists in the config, we want it in the output too
$freq{$_}++ for keys %rules;
my $test;
foreach $test (keys %freq) {
my $parsed_rules_entry = $rules{$test};
# Allow nice T_ rules to be detected
my $no_t = $test;
if ($no_t =~ s/^T_//) {
if (defined $rules{$no_t}) {
$parsed_rules_entry = $rules{$no_t};
}
}
# do not require 'tmp/rules.pl' to have been built from the
# exact same ruleset version; this assumption screws up nightly
# mass-check reports if they are generated with a different SVN rev
# next unless (exists $rules{$test});
next if (!$opt_a && $test =~ /^__/);
next if $done{$test}; $done{$test} = 1;
push (@tests, $test);
my $isnice = 0;
if ($parsed_rules_entry) {
if ($parsed_rules_entry->{tflags} &&
$parsed_rules_entry->{tflags} =~ /\bnice\b/)
{
$isnice = 1;
}
}
$isnice{$test} = $isnice;
my $fs = $freq_spam{$test}; $fs ||= 0;
my $fn = $freq_ham{$test}; $fn ||= 0;
my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);
if ($isnice) {
$soratio = 1.0 - $soratio;
my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
}
if ($opt_i) {
# come up with a ranking
my $rank;
# IG system: from "Learning to Filter Unsolicited Commercial E-Mail",
# Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
# Boolean attributes (ie. the rules). Measures "the average reduction in
# the entropy of C (classification) given the value of X (the rule)". Makes
# a good ranking measure with a proper statistical basis. ;)
#
# Still would like to get an entropy measure in, too.
#
# sum P(X = x ^ C = c)
# IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- )
# c in [Ch, Cs] P(X = x) . P(C = c)
#
my $safe_nspam = $num_spam || 0.0000001;
my $safe_nham = $num_ham || 0.0000001;
my $num_all = ($num_spam + $num_ham);
my $safe_all = $num_all || 0.0000001;
my $f_all = $fs+$fn;
my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0)
my $px1 = ($f_all / $safe_all); # P(X = 1)
my $pccs = ($num_spam / $safe_all); # P(C = Cs)
my $pcch = ($num_ham / $safe_all); # P(C = Ch)
my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs)
my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch)
my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs)
my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch)
my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;
sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
my $safe_px0ccs = ($px0ccs || 0.0000001);
my $safe_px0cch = ($px0cch || 0.0000001);
my $safe_px1ccs = ($px1ccs || 0.0000001);
my $safe_px1cch = ($px1cch || 0.0000001);
$rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
$ranking{$test} = $rank;
$rank_hi = $rank if ($rank > $rank_hi);
$rank_lo = $rank if ($rank < $rank_lo);
}
else {
# RANK: basic wanted/unwanted ranking
#
# The rank of each test based on two ranks: (1) the number of wanted
# hits and (2) the number of unwanted hits. Each test is ranked
# positionally for both its wanted and unwanted hits (ties are
# allowed) and the two ranks are normalized to have the same range.
# Those two ranks are added together, producing a single RANK number
# that is then normalized to [0, 1]. The result is equivalent to:
#
# RANK(rule) = (percentile(wanted) + percentile(unwanted))/2
#
$wanted{$test} = $isnice ? $fn : $fs;
$unwanted{$test} = $isnice ? $fs : $fn;
# count number of ranks of each type
$wranks{$wanted{$test}} = 1;
$uranks{$unwanted{$test}} = 1;
}
}
# finish basic wanted/unwanted ranking
if (! $opt_i) {
my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
# first half of ranking is the wanted rank
my $position = 0;
my $last = undef;
for my $test (@wanted) {
$position++ if defined $last && $last != $wanted{$test};
$ranking{$test} += $position;
$last = $wanted{$test}
}
# second half of ranking is the unwanted rank
my $normalize = (scalar keys %wranks) / (scalar keys %uranks || 0.001);
$position = 0;
$last = undef;
for my $test (@unwanted) {
$position++ if defined $last && $last != $unwanted{$test};
$ranking{$test} += ($position * $normalize);
$last = $unwanted{$test};
$rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
$rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
}
}
{
# now normalise the rankings to [0, 1]
$rank_hi -= $rank_lo;
foreach my $test (@tests) {
$ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
}
}
if ($opt_T) {
read_timings();
}
foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
my $parsed_rules_entry = $rules{$test};
# Allow nice T_ rules to be detected
my $no_t = $test;
if ($no_t =~ s/^T_//) {
if (defined $rules{$no_t}) {
$parsed_rules_entry = $rules{$no_t};
}
}
# do not require 'tmp/rules.pl' to have been built from the
# exact same ruleset version; this assumption screws up nightly
# mass-check reports if they are generated with a different SVN rev
# next unless (exists $rules{$test});
next if (!$opt_a && $test =~ /^__/);
my $fs = $freq_spam{$test}; $fs ||= 0;
my $fn = $freq_ham{$test}; $fn ||= 0;
my $fa = $fs+$fn;
my $num_fs = $fs;
my $num_fn = $fn;
my $num_fa = $fa;
my $tflags = '';
if ($parsed_rules_entry) {
$tflags = $parsed_rules_entry->{tflags};
}
# match certain tests
next if ($opt_m && $test !~ m/$opt_m/);
# match tflags
next if ($opt_t && (!$tflags || $tflags !~ /$opt_t/));
if (!$opt_a && !$opt_t && $tflags) {
# not net tests
next if ($tflags =~ /\bnet\b/ && ($opt_s % 2 == 0));
# not userconf
# Jul 13 2005 jm: removed. this blocks SPF_PASS from showing up!
# why should userconf rules not be visible in freqs output?
# next if ($tflags =~ /\buserconf\b/);
}
# adjust based on corpora sizes (and cvt to % while we're at it)
my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }
if ($opt_p) {
my $denom = ($num_spam + $num_ham) || 0.000001; # avoid / by 0
$fa = ($fa / $denom) * 100.0;
$fs = $fsadj;
$fn = $fnadj;
}
my $soratio = $soratio{$test};
if (!defined $soratio) {
$soratio{$test} = soratio ($fsadj, $fnadj);
}
my $promotable;
if ($opt_P) {
$promotable = 1;
if ($isnice{$test}) {
if (($soratio{$test} > (1.0 - $promote_so_min))
|| ($fn < $promote_hitrate_min)
|| ($fs >= $promote_fprate_max))
{
$promotable = 0;
}
} else {
if (($soratio{$test} < $promote_so_min)
|| ($fs < $promote_hitrate_min)
|| ($fn >= $promote_fprate_max))
{
$promotable = 0;
}
}
}
my $promotable_str = $opt_P ? ($promotable ? '+ ' : '- ') : '';
if ($opt_d) {
print qq{
<rule>
<time>}.($rule_times{$test}||0).qq{</time>
<count class='all'>$num_fa</count>
<count class='spam'>$num_fs</count>
<count class='ham'>$num_fn</count>
<pc class='all'>}.sprintf("%.5f", $fa).qq{</pc>
<pc class='spam'>}.sprintf("%.5f", $fs).qq{</pc>
<pc class='ham'>}.sprintf("%.5f", $fn).qq{</pc>
<so>}.sprintf("%.8f", $soratio).qq{</so>
<rank>}.sprintf("%.8f", $ranking{$test}).qq{</rank>
<score set='$opt_s'>}.($scores{$test}||0).qq{</score>
<promotable>$promotable</promotable>
<test>$test</test>
};
} elsif ($opt_T) {
printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n",
$rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test},
$scores{$test}||0,
$promotable_str, $test;
} elsif ($opt_p) {
printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n",
$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0,
$promotable_str, $test;
} elsif ($opt_x) {
printf "%7d %7d %7d %7.3f %6.2f %6.2f %s%s\n",
$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0,
$promotable_str, $test;
} else {
printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test;
}
if ($opt_S) {
_print_scoremap("ham", $scoremap_ham{$test});
_print_scoremap("spam", $scoremap_spam{$test});
}
if ($opt_o) {
compute_overlaps_for_rule($test);
}
if ($opt_d) {
print qq{ </rule> };
}
}
if ($opt_d) {
print qq{
</freqs>
};
}
exit;
sub _print_scoremap {
my ($name, $smap) = @_;
if ($opt_d) {
print qq{ <scoremap class='$name'> };
}
$smap ||= { };
my @scores = (sort { $a <=> $b } keys %{$smap});
my $total = 0;
foreach my $score (@scores) {
$total += $smap->{$score};
}
foreach my $score (@scores) {
my $num = $smap->{$score};
my $pc = sprintf("%.4f", ($num / ($total||0.0001)) * 100);
if ($opt_d) {
print qq{
<si score='$score' pc='$pc' count='$num' /> };
}
else {
printf " scoremap %4s: %2d %6.2f%% %4d %s\n",
$name, $score, $pc, $num, _scoremap_graph($pc);
}
}
if ($opt_d) {
print qq{ </scoremap> };
} else {
print "\n";
}
}
sub _scoremap_graph {
my ($pc) = @_;
return '*' x ($pc * (40/100));
}
sub readlogs {
my $spam = $ARGV[0] || "spam.log";
my $ham = $ARGV[1] || "ham.log";
foreach my $file ($spam, $ham) {
open (IN, "<$file") || die "Could not open file '$file': $!";
my $isspam = ($file eq $spam);
my $caught;
my $restofline;
my $rules;
my $score;
# this is very speed-sensitive code. remove all possible
# conditionals using an eval('..').
my $evalstr = '
while (<IN>) {
';
if ($opt_M) {
$evalstr .= '
next unless /$opt_M/o;
';
}
if ($opt_X) {
$evalstr .= '
next if /$opt_X/o;
';
}
# note: doing the match with a regexp shaves off no less than
# 7 opcodes. nice!
# the additional split() is for this case:
# ". -20 /path time=1112116980,scantime=0,format=f,reuse=no"
# in other words, no hits. split(' ') cannot deal with this
# correctly, seeing (".", "-20", "/path", "time=...etc"). Work
# around this by using a literal / / regexp split to discard
# the csv stuff we don't want out of the rest of the line.
$evalstr .= '
($caught, $score, $restofline) = split(\' \', $_, 3);
next unless ($caught =~ /^[Y\.]$/ && $restofline);
(undef, $rules) = split(/ /, $restofline, 3);
my %freq_mesg = ();
';
if ($opt_f) {
$evalstr .= '
next if (!(($caught eq "Y") xor $isspam));
';
}
if ($opt_S) {
$evalstr .= '
$score = int $score;
';
}
$evalstr .= '
my @rules;
foreach my $r (split(/,/, $rules)) {
my $hits = 1;
# Support compacted RULE(hitcount) format
if ($r =~ s/\((\d+)\)$//) {
$hits = $1;
}
push @rules, $r for (1 .. $hits);
}
';
my $hmapstr = '';
my $smapstr = '';
if ($isspam) {
if ($opt_o) {
$hmapstr = '
if (!exists $hmap_spam{$r}) {
$hmap_spam{$r} = "";
}
vec ($hmap_spam{$r}, $num_spam, 1) = 1;
';
}
if ($opt_S) {
$smapstr = ' $scoremap_spam{$r}{$score}++; ';
}
$evalstr .= '
foreach my $r (@rules) {
$freq_spam{$r}++ unless $freq_mesg{$r}++;
'.$hmapstr.$smapstr.'
}
$num_spam++;
';
} else {
if ($opt_o) {
$hmapstr = '
if (!exists $hmap_ham{$r}) {
$hmap_ham{$r} = "";
}
vec ($hmap_ham{$r}, $num_ham, 1) = 1;
';
}
if ($opt_S) {
$smapstr = ' $scoremap_ham{$r}{$score}++; ';
}
$evalstr .= '
foreach my $r (@rules) {
$freq_ham{$r}++ unless $freq_mesg{$r}++;
'.$hmapstr.$smapstr.'
}
$num_ham++;
';
}
$evalstr .= '
}
';
# warn "JMD $evalstr";
eval $evalstr;
if ($@) {
die $@;
}
close IN;
}
# paranoia: remove zero length rules
delete $freq_spam{''};
delete $hmap_spam{''};
delete $freq_ham{''};
delete $hmap_ham{''};
}
sub compute_overlaps_for_rule {
my ($r1) = @_;
my %overlaps_ham1 = ();
my %overlaps_spam1 = ();
my %overlaps_ham2 = ();
my %overlaps_spam2 = ();
my %overlaps_ham1r = ();
my %overlaps_spam1r = ();
if ($freq_spam{$r1}) {
foreach my $r2 (keys %hmap_spam) {
next if $r1 eq $r2;
my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
$hmap_spam{$r2}, $hmap_spam{$r1});
if ($a1ina2 > 0)
{
$overlaps_spam1r{$r2} = $a1ina2;
if (exists $overlaps_spam1{$a1ina2})
{ $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }
if (exists $overlaps_spam2{$a2ina1})
{ $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
}
}
}
if ($freq_ham{$r1}) {
foreach my $r2 (keys %hmap_ham) {
next if $r1 eq $r2;
my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
$hmap_ham{$r2}, $hmap_ham{$r1});
if ($a1ina2 > 0)
{
$overlaps_ham1r{$r2} = $a1ina2;
if (exists $overlaps_ham1{$a1ina2})
{ $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }
if (exists $overlaps_ham2{$a2ina1})
{ $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
}
}
}
_print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam", \%overlaps_ham1r, "ham");
_print_overlap_ratios($r1, \%overlaps_ham1, \%overlaps_ham2, "ham", \%overlaps_spam1r, "spam");
}
sub _print_overlap_ratios {
my ($r1, $hash1, $hash2, $type, $hash_other_type, $other_type) = @_;
return unless defined $r1;
if ($opt_d) {
print qq{ <overlap class='$type'> };
}
my %other_type_rules = %$hash_other_type;
foreach my $ratio (sort { $b <=> $a } keys %$hash1) {
$ratio ||= 0;
last if ($ratio < 20 && $type eq 'spam'); # 20% cutoff for spam
#last if ($ratio < 1 && $type eq 'ham'); # follow ham all the way to zero for tuning analysis unless this is uncommented and set
my $rules = _prettify_overlap_rules($r1, $hash1->{$ratio});
next if ($rules eq '');
foreach my $r2 (split(' ', $rules)) {
$r2 =~ s/\[(.*?)\]$//;
my $reverse_ratio = $1 || 0;
next unless defined $r2;
## If the ham hits are negligible, and there are spam hits, then it's not useful for tuning - skip
#next if ($ratio < 2 && $type eq 'ham' && exists $other_type_rules{$r2} );
my $is_subrule = ($meta_subrule_pairs{"$r1.$r2"}
|| $meta_subrule_pairs{"$r2.$r1"});
if ($opt_d) {
print qq{
<overlappair>
<overlaprules ratio='$ratio'><r1>$r2</r1><r2>$r1</r2></overlaprules>
<overlaprules ratio='$reverse_ratio'><r1>$r1</r1><r2>$r2</r2></overlaprules>
</overlappair>
};
} else {
printf " overlap %4s: %3d%% of %s hits also hit %s; %3d%% of %s hits also hit %s%s%s\n",
$type, $ratio, $r1, $r2,
$reverse_ratio, $r2, $r1,
($is_subrule ? ' (meta rule and subrule)' : ''),
(exists $other_type_rules{$r2} ? " ($other_type $other_type_rules{$r2}%)" : " (no $other_type)")
;
}
}
}
if ($opt_d) {
print qq{ </overlap };
}
}
sub _prettify_overlap_rules {
my ($rule, $str) = @_;
my @rules = sort split(' ', $str);
if ($rules{$rule} && $rules{$rule}->{type} eq 'meta') {
# ignore meta-subrules that match the rule they make up.
# TODO: this is simplistic; it doesn't look to see if those subrules
# are in turn meta rules with further subrules that should be ignored.
# but it works well enough...
my $code = $rules{$rule}->{code};
@rules = grep {
my $tmp = $_;
$tmp =~ s/\[.*\]$//;
if ($code =~ /\b\Q${tmp}\E\b/) {
$meta_subrule_pairs{"$rule.$tmp"} = 1;
0;
} else {
1;
}
} @rules;
}
return join (' ', @rules);
}
sub _hmap_to_overlap_ratio {
my ($r1, $r2, $hmap1, $hmap2) = @_;
# my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n";
# count bits in each, so we can show when one is fully subsumed by another
# with perl's support for bitstring ops, we get C speed here, nice!
# no hits on either? this would normally give a 100% hitrate match,
# but that's misleading -- so hide it by giving it a 0% overlap.
#
# also, ignore cases where there are no hits on *one* of the rules,
# while there are hits on the other -- after all, if one rule doesn't
# have a single hit, it cannot overlap.
my $a1 = unpack("%32b*", $hmap1);
return (0,0) unless $a1;
my $a2 = unpack("%32b*", $hmap2);
return (0,0) unless $a2;
my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2));
# ceiling rather than truncate, so that small overlaps are not lost
my $a1_in_a2 = int ((($a1_and_a2 / ($a2 || 0.0001))*100) + 0.99);
my $a2_in_a1 = int ((($a1_and_a2 / ($a1 || 0.0001))*100) + 0.99);
return ($a1_in_a2, $a2_in_a1);
}
sub readscores {
my($cffile) = @_;
my $tmpf = "tmp/rules$$.pl";
my $genscores = "";
if ($opt_g) {
$genscores = "-d \"$FindBin::Bin/../rulesrc/scores\"";
}
if (system (
"$FindBin::Bin/../build/parse-rules-for-masses -d \"$cffile\" $genscores -s $opt_s -o $tmpf"
))
{
warn "parse-rules-for-masses failed!";
}
eval {
require "./$tmpf";
};
if ($@) {
warn "$tmpf is unparseable: $@";
$rules_pl_unparseable = 1;
# carry on anyway (for most uses); leave the tmp file behind for
# possible debugging
} else {
$rules_pl_unparseable = 0;
unlink $tmpf;
}
}
sub soratio {
my ($s, $n) = @_;
$s ||= 0;
$n ||= 0;
if ($s + $n > 0) {
return $s / ($s + $n);
} else {
return 0.5; # no results -> not effective
}
}
sub read_timings {
if (!open (IN, "<timing.log")) {
warn "hit-frequencies: cannot read 'timing.log', timings will be 0";
return;
}
my $ver = <IN>;
if ($ver !~ /^v1/) {
warn "hit-frequencies: unknown version in 'timing.log', timings will be 0";
close IN;
return;
}
while (<IN>) {
if (/^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
my ($name, $duration, $max, $runs) = ($1,$2,$3,$4);
$rule_times{$name} = ($duration / ($runs||0.00001)) * 1000;
}
}
close IN;
}