| #!/usr/bin/perl -w |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| use strict; |
| use warnings; |
| |
| use FindBin; |
| use Getopt::Long qw(:config bundling auto_help); |
| use Pod::Usage; |
| |
| our ( |
| $opt_c, $opt_s, $opt_f, $opt_a, $opt_p, $opt_x, $opt_m, $opt_t, $opt_M, |
| $opt_X, $opt_L, $opt_l, $opt_i, $opt_T, $opt_o, $opt_S, $opt_P, $opt_d, |
| $opt_g |
| ); |
| |
| GetOptions("c|cffile=s" => \$opt_c, |
| "s|scoreset=i" => \$opt_s, # ,, pacify stupid emacs cperl mode |
| "f|falses" => \$opt_f, |
| "a|all" => \$opt_a, |
| "p|percentages" => \$opt_p, |
| "x|extended" => \$opt_x, |
| "m|matchrules=s" => \$opt_m, |
| "t|tflags=s" => \$opt_t, |
| "M|matchlogs=s" => \$opt_M, |
| "X|excludelogs=s" => \$opt_X, |
| "L|onlylanguage=s" => \$opt_L, |
| "l|alsolanguage=s" => \$opt_l, |
| "i|ig" => \$opt_i, |
| "T|times" => \$opt_T, |
| "o|overlaps" => \$opt_o, |
| "S|scoremap" => \$opt_S, |
| "P|promotion" => \$opt_P, |
| "d|xml" => \$opt_d, |
| "g|generated" => \$opt_g |
| ); |
| |
| =head1 NAME |
| |
| hit-frequencies - Display statistics about tests hit by a mass-check run |
| |
| =head1 SYNOPSIS |
| |
| hit-frequencies [options] <spam-log> <ham-log> |
| |
| Options: |
| -c,--cffile=path Use path as the rules directory |
| -s,--scoreset=n Use scoreset n |
| -f,--falses Count only false-positives/false-negatives |
| -a,--all Report all tests (including subrules) |
| -p,--percentages Report percentages instead of raw hits (implies -x) |
| -x,--extended "Extended" output, include RANK, S/O and SCORE |
| -m,--matchrules=re Print rules matching the regular expression |
| -t,--tflags=re Print only rules with tflags matching the regular expression |
| -M,--matchlogs=re Consider only logs matching the regular expression |
| -X,--excludelogs=re Exclude logs matching this regular expression |
| -L,--onlylanguage=lc Only print language specific tests for specified lang code (try 'all') |
| -l,--alsolanguage=lc Also print language specific tests for specified lang code (try 'all') |
| -i,--ig Use IG (information gain) for ranking |
| -T,--times Display rule times (implies -x, -p) |
| -o,--overlaps Display hit overlaps against all other rules |
| -S,--scoremap Display score-map of hits |
| -P,--promotion Flag rules that meet the promotion criteria |
| -d,--XML XML output (conflicts with -x, -p) |
| -g,--generated Include generated nightly scores for sandbox rules |
| |
| =head1 DESCRIPTION |
| |
| B<hit-frequencies> will read the mass-check logs F<spam.log> and |
| F<ham.log> or the logs given on the command line. The output will |
| contain a summary of the number of ham and spam messages and detailed |
| statistics for each rule. The output will include the following |
| columns: |
| |
| =over 4 |
| |
| =item OVERALL |
| |
| Number of times (or percentage with B<-p>) the rule hit on |
| all messages (spam or ham). |
| |
| =item SPAM |
| |
| Number of times (or percentage with B<-p>) the rule hit on |
| spam messages. |
| |
| =item HAM |
| |
| Number of times (or percentage with B<-p>) the rule hit on |
| ham messages. |
| |
| =item FPOS |
| |
| =item FNEG |
| |
| Shown only with B<-f>, these refer to the number of times (or |
| percentage) the rule hit on messages that were found to be false |
| positives or false negatives. |
| |
| =item S/O |
| |
| Shown only with B<-x> or B<-p>, this is the number of spam hits |
| divided by total number of hits (C<S/O> refers to spam divided by |
| overall). |
| |
| =item RANK |
| |
| Shown only with B<-x> or B<-p>, and when B<-i> is not used, this is a |
| measure that attempts to indicate how I<good> or I<useful> a test |
| is. The higher it is, the better the test. |
| |
| =item IG |
| |
| Shown only with B<-i>, this is another measure that attempts to |
| indicate how I<useful> a test is. |
| |
| =item SCORE |
| |
| Shown only with B<-x> or B<-p>, this is the current score assigned to |
| the rule. If B<-g> is used, the scores generated for sandbox rules from |
| nightly masscheck results (72_scores.cf) will be included as well. |
| |
| =item NAME |
| |
| This is the rule's name. |
| |
| =back |
| |
| =head1 BUGS |
| |
| Please report bugs to http://bugzilla.spamassassin.org/ |
| |
| =head1 SEE ALSO |
| |
| L<mass-check(1)>, L<perceptron(1)> |
| |
| =cut |
| if ($opt_l && $opt_L) { |
| pod2usage("-L/--alsolanguage and -l/--onlylanguage are mutually exclusive"); |
| } |
| |
| if ($opt_d && ($opt_x || $opt_p)) { |
| pod2usage("-d/--xml conflicts with -x/--extended and -p/--percentages"); |
| } |
| |
| $opt_s = 0 if ( !defined $opt_s ); |
| |
| if ($opt_p) { |
| $opt_x = 1; |
| } |
| |
| if ($opt_d || $opt_T) { |
| $opt_x = $opt_p = 1; |
| } |
| |
| |
| # as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P |
| my $promote_so_min = 0.95; |
| my $promote_hitrate_min = 0.02; |
| my $promote_fprate_max = 1.00; |
| |
| |
| my $cffile = $opt_c || "$FindBin::Bin/../rules"; |
| |
| # "our" so that the require'd file can overwrite them |
| my $rules_pl_unparseable; |
| our %rules = (); |
| our %scores = (); |
| |
| my %soratio = (); |
| my %freq_spam = (); |
| my %freq_ham = (); |
| my %hmap_spam = (); |
| my %hmap_ham = (); |
| my %scoremap_spam = (); |
| my %scoremap_ham = (); |
| my %freq = (); |
| my $num_spam = 0; |
| my $num_ham = 0; |
| my %ranking = (); |
| my $ok_lang = ''; |
| my %meta_subrule_pairs = (); |
| |
| my %rule_times = (); |
| |
| readscores($cffile); |
| |
| $ok_lang = lc ($opt_l || $opt_L || ''); |
| if ($ok_lang eq 'all') { $ok_lang = '.'; } |
| |
| if (($opt_t || $opt_o) && $rules_pl_unparseable) { |
| die "-t/-o require rules.pl to be parseable"; |
| } |
| |
| foreach my $key (keys %rules) { |
| if ($key eq '_scoreset') { |
| delete $rules{$key}; # bug 5683 |
| next; |
| } |
| |
| if ( ($opt_L && !$rules{$key}->{lang}) || |
| ($rules{$key}->{lang} && |
| (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/io) |
| ) ) { |
| delete $rules{$key} ; next; |
| } |
| |
| $freq_spam{$key} = 0; |
| $freq_ham{$key} = 0; |
| if ($opt_o) { |
| $hmap_spam{$key} = ''; |
| $hmap_ham{$key} = ''; |
| } |
| } |
| |
| readlogs(); |
| |
| my $hdr_all = $num_spam + $num_ham; |
| my $hdr_spam = $num_spam; |
| my $hdr_ham = $num_ham; |
| |
| my $sorting = $opt_i ? "IG" : "RANK"; |
| |
| if ($opt_d) { |
| $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements |
| $hdr_spam = ($num_spam / $hdr_all) * 100.0; |
| $hdr_ham = ($num_ham / $hdr_all) * 100.0; |
| $opt_P = 1; |
| |
| print qq{ |
| |
| <freqs> |
| <allmessages> |
| <count class='spam'>$num_spam</count> |
| <count class='ham'>$num_spam</count> |
| <pc class='spam'>$hdr_spam</pc> |
| <pc class='ham'>$hdr_spam</pc> |
| </allmessages> |
| |
| }; |
| |
| } |
| elsif ($opt_p) { |
| printf "%7s %7s %7s %6s %6s %6s %s\n", |
| $opt_T?"MSECS":"OVERALL", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%", |
| "S/O", $sorting, "SCORE", "NAME"; |
| |
| printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", |
| 0, $hdr_spam, $hdr_ham, |
| soratio ($num_spam,$num_ham), 0, 0; |
| |
| $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements |
| $hdr_spam = ($num_spam / $hdr_all) * 100.0; |
| $hdr_ham = ($num_ham / $hdr_all) * 100.0; |
| $hdr_all = 100.0; # this is obvious |
| |
| printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", |
| 0, $hdr_spam, $hdr_ham, |
| soratio ($num_spam,$num_ham), 0, 0; |
| |
| } |
| elsif ($opt_p) { |
| printf "%8s %7s %7s %6s %6s %6s %s\n", |
| "OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%", |
| "S/O", $sorting, "SCORE", "NAME"; |
| |
| printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", |
| $hdr_all, $hdr_spam, $hdr_ham, |
| soratio ($num_spam,$num_ham), 0, 0; |
| |
| $hdr_all ||= 0.00001; # avoid div by 0 in the next 2 statements |
| $hdr_spam = ($num_spam / $hdr_all) * 100.0; |
| $hdr_ham = ($num_ham / $hdr_all) * 100.0; |
| $hdr_all = 100.0; # this is obvious |
| |
| printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", |
| $hdr_all, $hdr_spam, $hdr_ham, |
| soratio ($num_spam,$num_ham), 0, 0; |
| |
| } |
| elsif ($opt_x) { |
| printf "%7s %7s %7s %6s %6s %6s %s\n", |
| "OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPOS%":"HAM%", "S/O", $sorting, "SCORE", "NAME"; |
| printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", |
| $hdr_all, $hdr_spam, $hdr_ham, |
| soratio ($num_spam,$num_ham), 0, 0; |
| |
| } else { |
| printf "%10s %10s %10s %s\n", |
| "OVERALL", $opt_f?"FNEG":"SPAM", $opt_f?"FPO":"HAM", |
| "NAME"; |
| printf "%10d %10d %10d (all messages)\n", |
| $hdr_all, $hdr_spam, $hdr_ham; |
| } |
| |
| my %done = (); |
| my @tests = (); |
| my $rank_hi = 0; |
| my $rank_lo = 9999999; |
| |
| # variables for wanted/unwanted RANK |
| my %wanted; |
| my %unwanted; |
| my %isnice; |
| my %wranks; |
| my %uranks; |
| |
| # rules that we want to look at |
| $freq{$_}++ for keys %freq_ham; |
| $freq{$_}++ for keys %freq_spam; |
| # if a rule exists in the config, we want it in the output too |
| $freq{$_}++ for keys %rules; |
| |
| my $test; |
| foreach $test (keys %freq) { |
| my $parsed_rules_entry = $rules{$test}; |
| |
| # Allow nice T_ rules to be detected |
| my $no_t = $test; |
| if ($no_t =~ s/^T_//) { |
| if (defined $rules{$no_t}) { |
| $parsed_rules_entry = $rules{$no_t}; |
| } |
| } |
| |
| # do not require 'tmp/rules.pl' to have been built from the |
| # exact same ruleset version; this assumption screws up nightly |
| # mass-check reports if they are generated with a different SVN rev |
| # next unless (exists $rules{$test}); |
| |
| next if (!$opt_a && $test =~ /^__/); |
| |
| next if $done{$test}; $done{$test} = 1; |
| push (@tests, $test); |
| |
| my $isnice = 0; |
| if ($parsed_rules_entry) { |
| if ($parsed_rules_entry->{tflags} && |
| $parsed_rules_entry->{tflags} =~ /\bnice\b/) |
| { |
| $isnice = 1; |
| } |
| } |
| $isnice{$test} = $isnice; |
| |
| my $fs = $freq_spam{$test}; $fs ||= 0; |
| my $fn = $freq_ham{$test}; $fn ||= 0; |
| my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; |
| my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; |
| |
| my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj); |
| |
| if ($isnice) { |
| $soratio = 1.0 - $soratio; |
| my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp; |
| } |
| |
| if ($opt_i) { |
| # come up with a ranking |
| my $rank; |
| |
| # IG system: from "Learning to Filter Unsolicited Commercial E-Mail", |
| # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the |
| # Boolean attributes (ie. the rules). Measures "the average reduction in |
| # the entropy of C (classification) given the value of X (the rule)". Makes |
| # a good ranking measure with a proper statistical basis. ;) |
| # |
| # Still would like to get an entropy measure in, too. |
| # |
| # sum P(X = x ^ C = c) |
| # IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- ) |
| # c in [Ch, Cs] P(X = x) . P(C = c) |
| # |
| my $safe_nspam = $num_spam || 0.0000001; |
| my $safe_nham = $num_ham || 0.0000001; |
| |
| my $num_all = ($num_spam + $num_ham); |
| my $safe_all = $num_all || 0.0000001; |
| my $f_all = $fs+$fn; |
| |
| my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0) |
| my $px1 = ($f_all / $safe_all); # P(X = 1) |
| my $pccs = ($num_spam / $safe_all); # P(C = Cs) |
| my $pcch = ($num_ham / $safe_all); # P(C = Ch) |
| my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs) |
| my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch) |
| my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs) |
| my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch) |
| my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001; |
| my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001; |
| my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001; |
| my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001; |
| |
| sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931... |
| |
| my $safe_px0ccs = ($px0ccs || 0.0000001); |
| my $safe_px0cch = ($px0cch || 0.0000001); |
| my $safe_px1ccs = ($px1ccs || 0.0000001); |
| my $safe_px1cch = ($px1cch || 0.0000001); |
| $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) + |
| ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) + |
| ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) + |
| ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) ); |
| |
| $ranking{$test} = $rank; |
| $rank_hi = $rank if ($rank > $rank_hi); |
| $rank_lo = $rank if ($rank < $rank_lo); |
| } |
| else { |
| # RANK: basic wanted/unwanted ranking |
| # |
| # The rank of each test based on two ranks: (1) the number of wanted |
| # hits and (2) the number of unwanted hits. Each test is ranked |
| # positionally for both its wanted and unwanted hits (ties are |
| # allowed) and the two ranks are normalized to have the same range. |
| # Those two ranks are added together, producing a single RANK number |
| # that is then normalized to [0, 1]. The result is equivalent to: |
| # |
| # RANK(rule) = (percentile(wanted) + percentile(unwanted))/2 |
| # |
| $wanted{$test} = $isnice ? $fn : $fs; |
| $unwanted{$test} = $isnice ? $fs : $fn; |
| # count number of ranks of each type |
| $wranks{$wanted{$test}} = 1; |
| $uranks{$unwanted{$test}} = 1; |
| } |
| } |
| |
| # finish basic wanted/unwanted ranking |
| if (! $opt_i) { |
| my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; |
| my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; |
| |
| # first half of ranking is the wanted rank |
| my $position = 0; |
| my $last = undef; |
| for my $test (@wanted) { |
| $position++ if defined $last && $last != $wanted{$test}; |
| $ranking{$test} += $position; |
| $last = $wanted{$test} |
| } |
| |
| # second half of ranking is the unwanted rank |
| my $normalize = (scalar keys %wranks) / (scalar keys %uranks || 0.001); |
| $position = 0; |
| $last = undef; |
| for my $test (@unwanted) { |
| $position++ if defined $last && $last != $unwanted{$test}; |
| $ranking{$test} += ($position * $normalize); |
| $last = $unwanted{$test}; |
| $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi); |
| $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo); |
| } |
| } |
| |
| { |
| # now normalise the rankings to [0, 1] |
| $rank_hi -= $rank_lo; |
| foreach my $test (@tests) { |
| $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi); |
| } |
| } |
| |
| if ($opt_T) { |
| read_timings(); |
| } |
| |
| foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) { |
| my $parsed_rules_entry = $rules{$test}; |
| |
| # Allow nice T_ rules to be detected |
| my $no_t = $test; |
| if ($no_t =~ s/^T_//) { |
| if (defined $rules{$no_t}) { |
| $parsed_rules_entry = $rules{$no_t}; |
| } |
| } |
| |
| # do not require 'tmp/rules.pl' to have been built from the |
| # exact same ruleset version; this assumption screws up nightly |
| # mass-check reports if they are generated with a different SVN rev |
| # next unless (exists $rules{$test}); |
| |
| next if (!$opt_a && $test =~ /^__/); |
| |
| my $fs = $freq_spam{$test}; $fs ||= 0; |
| my $fn = $freq_ham{$test}; $fn ||= 0; |
| my $fa = $fs+$fn; |
| my $num_fs = $fs; |
| my $num_fn = $fn; |
| my $num_fa = $fa; |
| |
| my $tflags = ''; |
| if ($parsed_rules_entry) { |
| $tflags = $parsed_rules_entry->{tflags}; |
| } |
| |
| # match certain tests |
| next if ($opt_m && $test !~ m/$opt_m/); |
| # match tflags |
| next if ($opt_t && (!$tflags || $tflags !~ /$opt_t/)); |
| |
| if (!$opt_a && !$opt_t && $tflags) { |
| # not net tests |
| next if ($tflags =~ /\bnet\b/ && ($opt_s % 2 == 0)); |
| |
| # not userconf |
| # Jul 13 2005 jm: removed. this blocks SPF_PASS from showing up! |
| # why should userconf rules not be visible in freqs output? |
| # next if ($tflags =~ /\buserconf\b/); |
| } |
| |
| # adjust based on corpora sizes (and cvt to % while we're at it) |
| my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; |
| my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; |
| |
| if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; } |
| |
| if ($opt_p) { |
| my $denom = ($num_spam + $num_ham) || 0.000001; # avoid / by 0 |
| $fa = ($fa / $denom) * 100.0; |
| $fs = $fsadj; |
| $fn = $fnadj; |
| } |
| |
| my $soratio = $soratio{$test}; |
| if (!defined $soratio) { |
| $soratio{$test} = soratio ($fsadj, $fnadj); |
| } |
| |
| my $promotable; |
| if ($opt_P) { |
| $promotable = 1; |
| |
| if ($isnice{$test}) { |
| if (($soratio{$test} > (1.0 - $promote_so_min)) |
| || ($fn < $promote_hitrate_min) |
| || ($fs >= $promote_fprate_max)) |
| { |
| $promotable = 0; |
| } |
| } else { |
| if (($soratio{$test} < $promote_so_min) |
| || ($fs < $promote_hitrate_min) |
| || ($fn >= $promote_fprate_max)) |
| { |
| $promotable = 0; |
| } |
| } |
| } |
| my $promotable_str = $opt_P ? ($promotable ? '+ ' : '- ') : ''; |
| |
| if ($opt_d) { |
| print qq{ |
| <rule> |
| <time>}.($rule_times{$test}||0).qq{</time> |
| <count class='all'>$num_fa</count> |
| <count class='spam'>$num_fs</count> |
| <count class='ham'>$num_fn</count> |
| <pc class='all'>}.sprintf("%.5f", $fa).qq{</pc> |
| <pc class='spam'>}.sprintf("%.5f", $fs).qq{</pc> |
| <pc class='ham'>}.sprintf("%.5f", $fn).qq{</pc> |
| <so>}.sprintf("%.8f", $soratio).qq{</so> |
| <rank>}.sprintf("%.8f", $ranking{$test}).qq{</rank> |
| <score set='$opt_s'>}.($scores{$test}||0).qq{</score> |
| <promotable>$promotable</promotable> |
| <test>$test</test> |
| }; |
| |
| } elsif ($opt_T) { |
| printf "%7.5f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n", |
| $rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test}, |
| $scores{$test}||0, |
| $promotable_str, $test; |
| |
| } elsif ($opt_p) { |
| printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s%s\n", |
| $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, |
| $promotable_str, $test; |
| |
| } elsif ($opt_x) { |
| printf "%7d %7d %7d %7.3f %6.2f %6.2f %s%s\n", |
| $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0, |
| $promotable_str, $test; |
| |
| } else { |
| printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test; |
| } |
| |
| if ($opt_S) { |
| _print_scoremap("ham", $scoremap_ham{$test}); |
| _print_scoremap("spam", $scoremap_spam{$test}); |
| } |
| |
| if ($opt_o) { |
| compute_overlaps_for_rule($test); |
| } |
| |
| if ($opt_d) { |
| print qq{ </rule> }; |
| } |
| } |
| |
| if ($opt_d) { |
| print qq{ |
| </freqs> |
| }; |
| } |
| exit; |
| |
| |
| sub _print_scoremap { |
| my ($name, $smap) = @_; |
| |
| if ($opt_d) { |
| print qq{ <scoremap class='$name'> }; |
| } |
| |
| $smap ||= { }; |
| my @scores = (sort { $a <=> $b } keys %{$smap}); |
| |
| my $total = 0; |
| foreach my $score (@scores) { |
| $total += $smap->{$score}; |
| } |
| |
| foreach my $score (@scores) { |
| my $num = $smap->{$score}; |
| my $pc = sprintf("%.4f", ($num / ($total||0.0001)) * 100); |
| |
| if ($opt_d) { |
| print qq{ |
| <si score='$score' pc='$pc' count='$num' /> }; |
| |
| } |
| else { |
| printf " scoremap %4s: %2d %6.2f%% %4d %s\n", |
| $name, $score, $pc, $num, _scoremap_graph($pc); |
| |
| } |
| } |
| |
| if ($opt_d) { |
| print qq{ </scoremap> }; |
| |
| } else { |
| print "\n"; |
| } |
| } |
| |
| sub _scoremap_graph { |
| my ($pc) = @_; |
| return '*' x ($pc * (40/100)); |
| } |
| |
| |
| sub readlogs { |
| my $spam = $ARGV[0] || "spam.log"; |
| my $ham = $ARGV[1] || "ham.log"; |
| |
| foreach my $file ($spam, $ham) { |
| open (IN, "<$file") || die "Could not open file '$file': $!"; |
| |
| my $isspam = ($file eq $spam); |
| my $caught; |
| my $restofline; |
| my $rules; |
| my $score; |
| |
| # this is very speed-sensitive code. remove all possible |
| # conditionals using an eval('..'). |
| my $evalstr = ' |
| while (<IN>) { |
| '; |
| |
| if ($opt_M) { |
| $evalstr .= ' |
| next unless /$opt_M/o; |
| '; |
| } |
| if ($opt_X) { |
| $evalstr .= ' |
| next if /$opt_X/o; |
| '; |
| } |
| |
| # note: doing the match with a regexp shaves off no less than |
| # 7 opcodes. nice! |
| |
| # the additional split() is for this case: |
| # ". -20 /path time=1112116980,scantime=0,format=f,reuse=no" |
| # in other words, no hits. split(' ') cannot deal with this |
| # correctly, seeing (".", "-20", "/path", "time=...etc"). Work |
| # around this by using a literal / / regexp split to discard |
| # the csv stuff we don't want out of the rest of the line. |
| |
| |
| $evalstr .= ' |
| ($caught, $score, $restofline) = split(\' \', $_, 3); |
| next unless ($caught =~ /^[Y\.]$/ && $restofline); |
| (undef, $rules) = split(/ /, $restofline, 3); |
| my %freq_mesg = (); |
| '; |
| |
| if ($opt_f) { |
| $evalstr .= ' |
| next if (!(($caught eq "Y") xor $isspam)); |
| '; |
| } |
| |
| if ($opt_S) { |
| $evalstr .= ' |
| $score = int $score; |
| '; |
| } |
| |
| $evalstr .= ' |
| my @rules; |
| foreach my $r (split(/,/, $rules)) { |
| my $hits = 1; |
| # Support compacted RULE(hitcount) format |
| if ($r =~ s/\((\d+)\)$//) { |
| $hits = $1; |
| } |
| push @rules, $r for (1 .. $hits); |
| } |
| '; |
| |
| my $hmapstr = ''; |
| my $smapstr = ''; |
| if ($isspam) { |
| if ($opt_o) { |
| $hmapstr = ' |
| if (!exists $hmap_spam{$r}) { |
| $hmap_spam{$r} = ""; |
| } |
| vec ($hmap_spam{$r}, $num_spam, 1) = 1; |
| '; |
| } |
| |
| if ($opt_S) { |
| $smapstr = ' $scoremap_spam{$r}{$score}++; '; |
| } |
| |
| $evalstr .= ' |
| foreach my $r (@rules) { |
| $freq_spam{$r}++ unless $freq_mesg{$r}++; |
| '.$hmapstr.$smapstr.' |
| } |
| $num_spam++; |
| '; |
| } else { |
| if ($opt_o) { |
| $hmapstr = ' |
| if (!exists $hmap_ham{$r}) { |
| $hmap_ham{$r} = ""; |
| } |
| vec ($hmap_ham{$r}, $num_ham, 1) = 1; |
| '; |
| } |
| |
| if ($opt_S) { |
| $smapstr = ' $scoremap_ham{$r}{$score}++; '; |
| } |
| |
| $evalstr .= ' |
| foreach my $r (@rules) { |
| $freq_ham{$r}++ unless $freq_mesg{$r}++; |
| '.$hmapstr.$smapstr.' |
| } |
| $num_ham++; |
| '; |
| } |
| $evalstr .= ' |
| } |
| '; |
| |
| # warn "JMD $evalstr"; |
| eval $evalstr; |
| |
| if ($@) { |
| die $@; |
| } |
| |
| close IN; |
| } |
| # paranoia: remove zero length rules |
| delete $freq_spam{''}; |
| delete $hmap_spam{''}; |
| delete $freq_ham{''}; |
| delete $hmap_ham{''}; |
| } |
| |
| sub compute_overlaps_for_rule { |
| my ($r1) = @_; |
| |
| my %overlaps_ham1 = (); |
| my %overlaps_spam1 = (); |
| my %overlaps_ham2 = (); |
| my %overlaps_spam2 = (); |
| my %overlaps_ham1r = (); |
| my %overlaps_spam1r = (); |
| |
| if ($freq_spam{$r1}) { |
| foreach my $r2 (keys %hmap_spam) { |
| next if $r1 eq $r2; |
| |
| my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1, |
| $hmap_spam{$r2}, $hmap_spam{$r1}); |
| |
| if ($a1ina2 > 0) |
| { |
| $overlaps_spam1r{$r2} = $a1ina2; |
| |
| if (exists $overlaps_spam1{$a1ina2}) |
| { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; } |
| else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; } |
| |
| if (exists $overlaps_spam2{$a2ina1}) |
| { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; } |
| else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; } |
| } |
| } |
| } |
| |
| if ($freq_ham{$r1}) { |
| foreach my $r2 (keys %hmap_ham) { |
| next if $r1 eq $r2; |
| |
| my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2, |
| $hmap_ham{$r2}, $hmap_ham{$r1}); |
| |
| if ($a1ina2 > 0) |
| { |
| $overlaps_ham1r{$r2} = $a1ina2; |
| |
| if (exists $overlaps_ham1{$a1ina2}) |
| { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; } |
| else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; } |
| |
| if (exists $overlaps_ham2{$a2ina1}) |
| { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; } |
| else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; } |
| } |
| } |
| } |
| |
| _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam", \%overlaps_ham1r, "ham"); |
| _print_overlap_ratios($r1, \%overlaps_ham1, \%overlaps_ham2, "ham", \%overlaps_spam1r, "spam"); |
| } |
| |
| sub _print_overlap_ratios { |
| my ($r1, $hash1, $hash2, $type, $hash_other_type, $other_type) = @_; |
| |
| return unless defined $r1; |
| |
| if ($opt_d) { |
| print qq{ <overlap class='$type'> }; |
| } |
| |
| my %other_type_rules = %$hash_other_type; |
| |
| foreach my $ratio (sort { $b <=> $a } keys %$hash1) { |
| $ratio ||= 0; |
| last if ($ratio < 20 && $type eq 'spam'); # 20% cutoff for spam |
| #last if ($ratio < 1 && $type eq 'ham'); # follow ham all the way to zero for tuning analysis unless this is uncommented and set |
| my $rules = _prettify_overlap_rules($r1, $hash1->{$ratio}); |
| next if ($rules eq ''); |
| |
| foreach my $r2 (split(' ', $rules)) { |
| $r2 =~ s/\[(.*?)\]$//; |
| my $reverse_ratio = $1 || 0; |
| next unless defined $r2; |
| |
| ## If the ham hits are negligible, and there are spam hits, then it's not useful for tuning - skip |
| #next if ($ratio < 2 && $type eq 'ham' && exists $other_type_rules{$r2} ); |
| |
| my $is_subrule = ($meta_subrule_pairs{"$r1.$r2"} |
| || $meta_subrule_pairs{"$r2.$r1"}); |
| |
| if ($opt_d) { |
| print qq{ |
| <overlappair> |
| <overlaprules ratio='$ratio'><r1>$r2</r1><r2>$r1</r2></overlaprules> |
| <overlaprules ratio='$reverse_ratio'><r1>$r1</r1><r2>$r2</r2></overlaprules> |
| </overlappair> |
| }; |
| |
| } else { |
| printf " overlap %4s: %3d%% of %s hits also hit %s; %3d%% of %s hits also hit %s%s%s\n", |
| $type, $ratio, $r1, $r2, |
| $reverse_ratio, $r2, $r1, |
| ($is_subrule ? ' (meta rule and subrule)' : ''), |
| (exists $other_type_rules{$r2} ? " ($other_type $other_type_rules{$r2}%)" : " (no $other_type)") |
| ; |
| } |
| } |
| } |
| |
| if ($opt_d) { |
| print qq{ </overlap }; |
| } |
| } |
| |
| sub _prettify_overlap_rules { |
| my ($rule, $str) = @_; |
| |
| my @rules = sort split(' ', $str); |
| if ($rules{$rule} && $rules{$rule}->{type} eq 'meta') { |
| # ignore meta-subrules that match the rule they make up. |
| # TODO: this is simplistic; it doesn't look to see if those subrules |
| # are in turn meta rules with further subrules that should be ignored. |
| # but it works well enough... |
| |
| my $code = $rules{$rule}->{code}; |
| @rules = grep { |
| my $tmp = $_; |
| $tmp =~ s/\[.*\]$//; |
| if ($code =~ /\b\Q${tmp}\E\b/) { |
| $meta_subrule_pairs{"$rule.$tmp"} = 1; |
| 0; |
| } else { |
| 1; |
| } |
| } @rules; |
| } |
| return join (' ', @rules); |
| } |
| |
| sub _hmap_to_overlap_ratio { |
| my ($r1, $r2, $hmap1, $hmap2) = @_; |
| |
| # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n"; |
| |
| # count bits in each, so we can show when one is fully subsumed by another |
| # with perl's support for bitstring ops, we get C speed here, nice! |
| |
| # no hits on either? this would normally give a 100% hitrate match, |
| # but that's misleading -- so hide it by giving it a 0% overlap. |
| # |
| # also, ignore cases where there are no hits on *one* of the rules, |
| # while there are hits on the other -- after all, if one rule doesn't |
| # have a single hit, it cannot overlap. |
| |
| my $a1 = unpack("%32b*", $hmap1); |
| return (0,0) unless $a1; |
| my $a2 = unpack("%32b*", $hmap2); |
| return (0,0) unless $a2; |
| |
| my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2)); |
| |
| # ceiling rather than truncate, so that small overlaps are not lost |
| my $a1_in_a2 = int ((($a1_and_a2 / ($a2 || 0.0001))*100) + 0.99); |
| my $a2_in_a1 = int ((($a1_and_a2 / ($a1 || 0.0001))*100) + 0.99); |
| |
| return ($a1_in_a2, $a2_in_a1); |
| } |
| |
| |
| sub readscores { |
| my($cffile) = @_; |
| my $tmpf = "tmp/rules$$.pl"; |
| my $genscores = ""; |
| if ($opt_g) { |
| $genscores = "-d \"$FindBin::Bin/../rulesrc/scores\""; |
| } |
| if (system ( |
| "$FindBin::Bin/../build/parse-rules-for-masses -d \"$cffile\" $genscores -s $opt_s -o $tmpf" |
| )) |
| { |
| warn "parse-rules-for-masses failed!"; |
| } |
| eval { |
| require "./$tmpf"; |
| }; |
| if ($@) { |
| warn "$tmpf is unparseable: $@"; |
| $rules_pl_unparseable = 1; |
| # carry on anyway (for most uses); leave the tmp file behind for |
| # possible debugging |
| } else { |
| $rules_pl_unparseable = 0; |
| unlink $tmpf; |
| } |
| } |
| |
| sub soratio { |
| my ($s, $n) = @_; |
| |
| $s ||= 0; |
| $n ||= 0; |
| |
| if ($s + $n > 0) { |
| return $s / ($s + $n); |
| } else { |
| return 0.5; # no results -> not effective |
| } |
| } |
| |
| sub read_timings { |
| if (!open (IN, "<timing.log")) { |
| warn "hit-frequencies: cannot read 'timing.log', timings will be 0"; |
| return; |
| } |
| my $ver = <IN>; |
| if ($ver !~ /^v1/) { |
| warn "hit-frequencies: unknown version in 'timing.log', timings will be 0"; |
| close IN; |
| return; |
| } |
| while (<IN>) { |
| if (/^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) { |
| my ($name, $duration, $max, $runs) = ($1,$2,$3,$4); |
| $rule_times{$name} = ($duration / ($runs||0.00001)) * 1000; |
| } |
| } |
| close IN; |
| } |
| |