masses/hit-frequencies - spamassassin - Git at Google

 #!/usr/bin/perl -w
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 use strict;
 use warnings;

 use FindBin;
 use Getopt::Long qw(:config bundling auto_help);
 use Pod::Usage;

 our (
   $opt_c, $opt_s, $opt_f, $opt_a, $opt_p, $opt_x, $opt_m, $opt_t, $opt_M,
   $opt_X, $opt_L, $opt_l, $opt_i, $opt_T, $opt_o, $opt_S, $opt_P, $opt_d,
   $opt_g
 );

 GetOptions("c|cffile=s" => \$opt_c,
 	   "s|scoreset=i" => \$opt_s, # ,, pacify stupid emacs cperl mode
 	   "f|falses" => \$opt_f,
 	   "a|all" => \$opt_a,
 	   "p|percentages" => \$opt_p,
 	   "x|extended" => \$opt_x,
 	   "m|matchrules=s" => \$opt_m,
 	   "t|tflags=s" => \$opt_t,
 	   "M|matchlogs=s" => \$opt_M,
 	   "X|excludelogs=s" => \$opt_X,
 	   "L|onlylanguage=s" => \$opt_L,
 	   "l|alsolanguage=s" => \$opt_l,
 	   "i|ig" => \$opt_i,
 	   "T|times" => \$opt_T,
 	   "o|overlaps" => \$opt_o,
 	   "S|scoremap" => \$opt_S,
 	   "P|promotion" => \$opt_P,
 	   "d|xml" => \$opt_d,
 	   "g|generated" => \$opt_g
 	  );

 =head1 NAME

 hit-frequencies - Display statistics about tests hit by a mass-check run

 =head1 SYNOPSIS

 hit-frequencies [options] <spam-log> <ham-log>

  Options:
     -c,--cffile=path	  Use path as the rules directory
     -s,--scoreset=n	  Use scoreset n
     -f,--falses  	  Count only false-positives/false-negatives
     -a,--all		  Report all tests (including subrules)
     -p,--percentages	  Report percentages instead of raw hits (implies -x)
     -x,--extended	  "Extended" output, include RANK, S/O and SCORE
     -m,--matchrules=re    Print rules matching the regular expression
     -t,--tflags=re	  Print only rules with tflags matching the regular expression
     -M,--matchlogs=re     Consider only logs matching the regular expression
     -X,--excludelogs=re	  Exclude logs matching this regular expression
     -L,--onlylanguage=lc  Only print language specific tests for specified lang code (try 'all')
     -l,--alsolanguage=lc  Also print language specific tests for specified lang code (try 'all')
     -i,--ig               Use IG (information gain) for ranking
     -T,--times            Display rule times (implies -x, -p)
     -o,--overlaps         Display hit overlaps against all other rules
     -S,--scoremap         Display score-map of hits
     -P,--promotion        Flag rules that meet the promotion criteria
     -d,--XML              XML output (conflicts with -x, -p)
     -g,--generated        Include generated nightly scores for sandbox rules

 =head1 DESCRIPTION

 B<hit-frequencies> will read the mass-check logs F<spam.log> and
 F<ham.log> or the logs given on the command line. The output will
 contain a summary of the number of ham and spam messages and detailed
 statistics for each rule. The output will include the following
 columns:

 =over 4

 =item OVERALL

 Number of times (or percentage with B<-p>) the rule hit on
 all messages (spam or ham).

 =item SPAM

 Number of times (or percentage with B<-p>) the rule hit on
 spam messages.

 =item HAM

 Number of times (or percentage with B<-p>) the rule hit on
 ham messages.

 =item FPOS

 =item FNEG

 Shown only with B<-f>, these refer to the number of times (or
 percentage) the rule hit on messages that were found to be false
 positives or false negatives.

 =item S/O

 Shown only with B<-x> or B<-p>, this is the number of spam hits
 divided by total number of hits (C<S/O> refers to spam divided by
 overall).

 =item RANK

 Shown only with B<-x> or B<-p>, and when B<-i> is not used, this is a
 measure that attempts to indicate how I<good> or I<useful> a test
 is. The higher it is, the better the test.

 =item IG

 Shown only with B<-i>, this is another measure that attempts to
 indicate how I<useful> a test is.

 =item SCORE

 Shown only with B<-x> or B<-p>, this is the current score assigned to
 the rule. If B<-g> is used, the scores generated for sandbox rules from
 nightly masscheck results (72_scores.cf) will be included as well.

 =item NAME

 This is the rule's name.

 =back

 =head1 BUGS

 Please report bugs to http://bugzilla.spamassassin.org/

 =head1 SEE ALSO

 L<mass-check(1)>, L<perceptron(1)>

 =cut
 if ($opt_l && $opt_L) {
   pod2usage("-L/--alsolanguage and -l/--onlylanguage are mutually exclusive");
 }

 if ($opt_d && ($opt_x || $opt_p)) {
   pod2usage("-d/--xml conflicts with -x/--extended and -p/--percentages");
 }

 $opt_s = 0 if ( !defined $opt_s );

 if ($opt_p) {
   $opt_x = 1;
 }

 if ($opt_d || $opt_T) {
   $opt_x = $opt_p = 1;
 }


 # as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P
 my $promote_so_min      = 0.95;
 my $promote_hitrate_min = 0.02;
 my $promote_fprate_max  = 1.00;


 my $cffile = $opt_c || "$FindBin::Bin/../rules";

 # "our" so that the require'd file can overwrite them
 my $rules_pl_unparseable;
 our %rules = ();
 our %scores = ();

 my %soratio = ();
 my %freq_spam = ();
 my %freq_ham = ();
 my %hmap_spam = ();
 my %hmap_ham = ();
 my %scoremap_spam = ();
 my %scoremap_ham = ();
 my %freq = ();
 my $num_spam = 0;
 my $num_ham = 0;
 my %ranking = ();
 my $ok_lang = '';
 my %meta_subrule_pairs = ();

 my %rule_times = ();

 readscores($cffile);

 $ok_lang = lc ($opt_l || $opt_L || '');
 if ($ok_lang eq 'all') { $ok_lang = '.'; }

 if (($opt_t || $opt_o) && $rules_pl_unparseable) {
   die "-t/-o require rules.pl to be parseable";
 }

 foreach my $key (keys %rules) {
   if ($key eq '_scoreset') {
     delete $rules{$key}; # bug 5683
     next;
   }

   if ( ($opt_L && !$rules{$key}->{lang}) ||
        ($rules{$key}->{lang} &&
          (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/io)
      ) ) {
     delete $rules{$key} ; next;
   }

   $freq_spam{$key} = 0;
   $freq_ham{$key} = 0;
   if ($opt_o) {
     $hmap_spam{$key} = '';
     $hmap_ham{$key} = '';
   }
 }

 readlogs();

 my $hdr_all = $num_spam + $num_ham;
 my $hdr_spam = $num_spam;
 my $hdr_ham = $num_ham;

 my $sorting = $opt_i ? "IG" : "RANK";

 if ($opt_d) {
   $hdr_all ||= 0.00001;     # avoid div by 0 in the next 2 statements
   $hdr_spam = ($num_spam / $hdr_all) * 100.0;
   $hdr_ham = ($num_ham / $hdr_all) * 100.0;
   $opt_P = 1;

   print qq{

     <freqs>
       <allmessages>
         <count class='spam'>$num_spam</count>
         <count class='ham'>$num_spam</count>
         <pc class='spam'>$hdr_spam</pc>
         <pc class='ham'>$hdr_spam</pc>
       </allmessages>

   };

 }
 elsif ($opt_p) {
   printf "%7s  %7s  %7s  %6s  %6s  %6s  %s\n",
   	$opt_T?"MSECS":"OVERALL", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
         "S/O", $sorting, "SCORE", "NAME";

   printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
   	0, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;

   $hdr_all ||= 0.00001;     # avoid div by 0 in the next 2 statements
   $hdr_spam = ($num_spam / $hdr_all) * 100.0;
   $hdr_ham = ($num_ham / $hdr_all) * 100.0;
   $hdr_all = 100.0;             # this is obvious

   printf "%7.5f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
   	0, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;

 }
 elsif ($opt_p) {
   printf "%8s %7s  %7s  %6s  %6s  %6s  %s\n",
   	"OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
         "S/O", $sorting, "SCORE", "NAME";

   printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
   	$hdr_all, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;

   $hdr_all ||= 0.00001;     # avoid div by 0 in the next 2 statements
   $hdr_spam = ($num_spam / $hdr_all) * 100.0;
   $hdr_ham = ($num_ham / $hdr_all) * 100.0;
   $hdr_all = 100.0;             # this is obvious

   printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
   	$hdr_all, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;

 }
 elsif ($opt_x) {
   printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
   	"OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPOS%":"HAM%", "S/O", $sorting, "SCORE", "NAME";
   printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
   	$hdr_all, $hdr_spam, $hdr_ham,
         soratio ($num_spam,$num_ham), 0, 0;

 } else {
   printf "%10s  %10s  %10s  %s\n",
   	"OVERALL", $opt_f?"FNEG":"SPAM", $opt_f?"FPO":"HAM",
 	  "NAME";
   printf "%10d  %10d  %10d  (all messages)\n",
   	$hdr_all, $hdr_spam, $hdr_ham;
 }

 my %done = ();
 my @tests = ();
 my $rank_hi = 0;
 my $rank_lo = 9999999;

 # variables for wanted/unwanted RANK
 my %wanted;
 my %unwanted;
 my %isnice;
 my %wranks;
 my %uranks;

 # rules that we want to look at
 $freq{$_}++ for keys %freq_ham;
 $freq{$_}++ for keys %freq_spam;
 # if a rule exists in the config, we want it in the output too
 $freq{$_}++ for keys %rules;

 my $test;
 foreach $test (keys %freq) {
   my $parsed_rules_entry = $rules{$test};

   # Allow nice T_ rules to be detected
   my $no_t = $test;
   if ($no_t =~ s/^T_//) {
     if (defined $rules{$no_t}) {
       $parsed_rules_entry = $rules{$no_t};
     }
   }

   # do not require 'tmp/rules.pl' to have been built from the
   # exact same ruleset version; this assumption screws up nightly
   # mass-check reports if they are generated with a different SVN rev
   # next unless (exists $rules{$test});

   next if (!$opt_a && $test =~ /^__/);

   next if $done{$test}; $done{$test} = 1;
   push (@tests, $test);

   my $isnice = 0;
   if ($parsed_rules_entry) {
     if ($parsed_rules_entry->{tflags} &&
             $parsed_rules_entry->{tflags} =~ /\bnice\b/)
     {
       $isnice = 1;
     }
   }
   $isnice{$test} = $isnice;

   my $fs = $freq_spam{$test}; $fs ||= 0;
   my $fn = $freq_ham{$test}; $fn ||= 0;
   my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
   my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

   my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);

   if ($isnice) {
     $soratio = 1.0 - $soratio;
     my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
   }

   if ($opt_i) {
     # come up with a ranking
     my $rank;

     # IG system: from "Learning to Filter Unsolicited Commercial E-Mail",
     # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
     # Boolean attributes (ie. the rules). Measures "the average reduction in
     # the entropy of C (classification) given the value of X (the rule)". Makes
     # a good ranking measure with a proper statistical basis. ;)
     #
     # Still would like to get an entropy measure in, too.
     #
     #             sum                                    P(X = x ^ C = c)
     # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
     #           c in [Ch, Cs]                           P(X = x) . P(C = c)
     #
     my $safe_nspam = $num_spam || 0.0000001;
     my $safe_nham = $num_ham || 0.0000001;

     my $num_all = ($num_spam + $num_ham);
     my $safe_all = $num_all || 0.0000001;
     my $f_all = $fs+$fn;

     my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
     my $px1 = ($f_all / $safe_all);                      # P(X = 1)
     my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
     my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
     my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
     my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
     my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
     my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
     my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
     my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
     my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
     my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;

     sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...

     my $safe_px0ccs = ($px0ccs || 0.0000001);
     my $safe_px0cch = ($px0cch || 0.0000001);
     my $safe_px1ccs = ($px1ccs || 0.0000001);
     my $safe_px1cch = ($px1cch || 0.0000001);
     $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
                     ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
                     ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
                     ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );

     $ranking{$test} = $rank;
     $rank_hi = $rank if ($rank > $rank_hi);
     $rank_lo = $rank if ($rank < $rank_lo);
   }
   else {
     # RANK: basic wanted/unwanted ranking
     #
     # The rank of each test based on two ranks: (1) the number of wanted
     # hits and (2) the number of unwanted hits.  Each test is ranked
     # positionally for both its wanted and unwanted hits (ties are
     # allowed) and the two ranks are normalized to have the same range.
     # Those two ranks are added together, producing a single RANK number
     # that is then normalized to [0, 1].  The result is equivalent to:
     #
     # RANK(rule) = (percentile(wanted) + percentile(unwanted))/2
     #
     $wanted{$test} = $isnice ? $fn : $fs;
     $unwanted{$test} = $isnice ? $fs : $fn;
     # count number of ranks of each type
     $wranks{$wanted{$test}} = 1;
     $uranks{$unwanted{$test}} = 1;
   }
 }

 # finish basic wanted/unwanted ranking
 if (! $opt_i) {
   my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
   my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;

   # first half of ranking is the wanted rank
   my $position = 0;
   my $last = undef;
   for my $test (@wanted) {
     $position++ if defined $last && $last != $wanted{$test};
     $ranking{$test} += $position;
     $last = $wanted{$test}
   }

   # second half of ranking is the unwanted rank
   my $normalize = (scalar keys %wranks) / (scalar keys %uranks || 0.001);
   $position = 0;
   $last = undef;
   for my $test (@unwanted) {
     $position++ if defined $last && $last != $unwanted{$test};
     $ranking{$test} += ($position * $normalize);
     $last = $unwanted{$test};
     $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
     $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
   }
 }

 {
   # now normalise the rankings to [0, 1]
   $rank_hi -= $rank_lo;
   foreach my $test (@tests) {
     $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
   }
 }

 if ($opt_T) {
   read_timings();
 }

 foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
   my $parsed_rules_entry = $rules{$test};

   # Allow nice T_ rules to be detected
   my $no_t = $test;
   if ($no_t =~ s/^T_//) {
     if (defined $rules{$no_t}) {
       $parsed_rules_entry = $rules{$no_t};
     }
   }

   # do not require 'tmp/rules.pl' to have been built from the
   # exact same ruleset version; this assumption screws up nightly
   # mass-check reports if they are generated with a different SVN rev
   # next unless (exists $rules{$test});

   next if (!$opt_a && $test =~ /^__/);

   my $fs = $freq_spam{$test}; $fs ||= 0;
   my $fn = $freq_ham{$test}; $fn ||= 0;
   my $fa = $fs+$fn;
   my $num_fs = $fs;
   my $num_fn = $fn;
   my $num_fa = $fa;

   my $tflags = '';
   if ($parsed_rules_entry) {
     $tflags = $parsed_rules_entry->{tflags};
   }

   # match certain tests
   next if ($opt_m && $test !~ m/$opt_m/);
   # match tflags
   next if ($opt_t && (!$tflags || $tflags !~ /$opt_t/));

   if (!$opt_a && !$opt_t && $tflags) {
     # not net tests
     next if ($tflags =~ /\bnet\b/ && ($opt_s % 2 == 0));

     # not userconf
     # Jul 13 2005 jm: removed.  this blocks SPF_PASS from showing up!
     # why should userconf rules not be visible in freqs output?
     # next if ($tflags =~ /\buserconf\b/);
   }

   # adjust based on corpora sizes (and cvt to % while we're at it)
   my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
   my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

   if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }

   if ($opt_p) {
     my $denom = ($num_spam + $num_ham) || 0.000001; # avoid / by 0
     $fa = ($fa / $denom) * 100.0;
     $fs = $fsadj;
     $fn = $fnadj;
   }

   my $soratio = $soratio{$test};
   if (!defined $soratio) {
     $soratio{$test} = soratio ($fsadj, $fnadj);
   }

   my $promotable;
   if ($opt_P) {
     $promotable = 1;

     if ($isnice{$test}) {
       if (($soratio{$test} > (1.0 - $promote_so_min))
           || ($fn < $promote_hitrate_min)
           || ($fs >= $promote_fprate_max))
       {
         $promotable = 0;
       }
     } else {
       if (($soratio{$test} < $promote_so_min)
           || ($fs < $promote_hitrate_min)
           || ($fn >= $promote_fprate_max))
       {
         $promotable = 0;
       }
     }
   }
   my $promotable_str = $opt_P ? ($promotable ? '+ ' : '- ') : '';

   if ($opt_d) {
     print qq{
       <rule>
         <time>}.($rule_times{$test}||0).qq{</time>
         <count class='all'>$num_fa</count>
         <count class='spam'>$num_fs</count>
         <count class='ham'>$num_fn</count>
         <pc class='all'>}.sprintf("%.5f", $fa).qq{</pc>
         <pc class='spam'>}.sprintf("%.5f", $fs).qq{</pc>
         <pc class='ham'>}.sprintf("%.5f", $fn).qq{</pc>
         <so>}.sprintf("%.8f", $soratio).qq{</so>
         <rank>}.sprintf("%.8f", $ranking{$test}).qq{</rank>
         <score set='$opt_s'>}.($scores{$test}||0).qq{</score>
         <promotable>$promotable</promotable>
         <test>$test</test>
     };

   } elsif ($opt_T) {
     printf "%7.5f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s%s\n",
   	$rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test},
         $scores{$test}||0,
         $promotable_str, $test;

   } elsif ($opt_p) {
     printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s%s\n",
   	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0,
         $promotable_str, $test;

   } elsif ($opt_x) {
     printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s%s\n",
   	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0,
         $promotable_str, $test;

   } else {
     printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
   }

   if ($opt_S) {
     _print_scoremap("ham", $scoremap_ham{$test});
     _print_scoremap("spam", $scoremap_spam{$test});
   }

   if ($opt_o) {
     compute_overlaps_for_rule($test);
   }

   if ($opt_d) {
     print qq{ </rule> };
   }
 }

 if ($opt_d) {
   print qq{
     </freqs>
   };
 }
 exit;


 sub _print_scoremap {
   my ($name, $smap) = @_;

   if ($opt_d) {
     print qq{ <scoremap class='$name'> };
   }

   $smap ||= { };
   my @scores = (sort { $a <=> $b } keys %{$smap});

   my $total = 0;
   foreach my $score (@scores) {
     $total += $smap->{$score};
   }

   foreach my $score (@scores) {
     my $num = $smap->{$score};
     my $pc = sprintf("%.4f", ($num / ($total||0.0001)) * 100);

     if ($opt_d) {
       print qq{
         <si score='$score' pc='$pc' count='$num' /> };

     }
     else {
       printf "  scoremap %4s: %2d %6.2f%% %4d %s\n",
           $name, $score, $pc, $num, _scoremap_graph($pc);

     }
   }

   if ($opt_d) {
     print qq{ </scoremap> };

   } else {
     print "\n";
   }
 }

 sub _scoremap_graph {
   my ($pc) = @_;
   return '*' x ($pc * (40/100));
 }


 sub readlogs {
   my $spam = $ARGV[0] || "spam.log";
   my $ham = $ARGV[1] || "ham.log";

   foreach my $file ($spam, $ham) {
     open (IN, "<$file") || die "Could not open file '$file': $!";

     my $isspam = ($file eq $spam);
     my $caught;
     my $restofline;
     my $rules;
     my $score;

     # this is very speed-sensitive code.  remove all possible
     # conditionals using an eval('..').
     my $evalstr = '
       while (<IN>) {
     ';

     if ($opt_M) {
       $evalstr .= '
         next unless /$opt_M/o;
       ';
     }
     if ($opt_X) {
       $evalstr .= '
         next if /$opt_X/o;
       ';
     }

     # note: doing the match with a regexp shaves off no less than
     # 7 opcodes. nice!

     # the additional split() is for this case:
     # ".  -20 /path  time=1112116980,scantime=0,format=f,reuse=no"
     # in other words, no hits.  split(' ') cannot deal with this
     # correctly, seeing (".", "-20", "/path", "time=...etc").  Work
     # around this by using a literal / / regexp split to discard
     # the csv stuff we don't want out of the rest of the line.


     $evalstr .= '
         ($caught, $score, $restofline) = split(\' \', $_, 3);
         next unless ($caught =~ /^[Y\.]$/ && $restofline);
         (undef, $rules) = split(/ /, $restofline, 3);
 	my %freq_mesg = ();
     ';

     if ($opt_f) {
       $evalstr .= '
         next if (!(($caught eq "Y") xor $isspam));
       ';
     }

     if ($opt_S) {
       $evalstr .= '
         $score = int $score;
       ';
     }

     $evalstr .= '
       my @rules;
       foreach my $r (split(/,/, $rules)) {
         my $hits = 1;
         # Support compacted RULE(hitcount) format
         if ($r =~ s/\((\d+)\)$//) {
           $hits = $1;
         }
         push @rules, $r for (1 .. $hits);
       }
     ';

     my $hmapstr = '';
     my $smapstr = '';
     if ($isspam) {
       if ($opt_o) {
         $hmapstr = '
           if (!exists $hmap_spam{$r}) {
             $hmap_spam{$r} = "";
           }
           vec ($hmap_spam{$r}, $num_spam, 1) = 1;
         ';
       }

       if ($opt_S) {
         $smapstr = ' $scoremap_spam{$r}{$score}++; ';
       }

       $evalstr .= '
         foreach my $r (@rules) {
           $freq_spam{$r}++ unless $freq_mesg{$r}++;
           '.$hmapstr.$smapstr.'
         }
         $num_spam++;
       ';
     } else {
       if ($opt_o) {
         $hmapstr = '
           if (!exists $hmap_ham{$r}) {
             $hmap_ham{$r} = "";
           }
           vec ($hmap_ham{$r}, $num_ham, 1) = 1;
         ';
       }

       if ($opt_S) {
         $smapstr = ' $scoremap_ham{$r}{$score}++; ';
       }

       $evalstr .= '
         foreach my $r (@rules) {
           $freq_ham{$r}++ unless $freq_mesg{$r}++;
           '.$hmapstr.$smapstr.'
         }
         $num_ham++;
       ';
     }
     $evalstr .= '
       }
     ';

     # warn "JMD $evalstr";
     eval $evalstr;

     if ($@) {
       die $@;
     }

     close IN;
   }
   # paranoia: remove zero length rules
   delete $freq_spam{''};
   delete $hmap_spam{''};
   delete $freq_ham{''};
   delete $hmap_ham{''};
 }

 sub compute_overlaps_for_rule {
   my ($r1) = @_;

   my %overlaps_ham1 = ();
   my %overlaps_spam1 = ();
   my %overlaps_ham2 = ();
   my %overlaps_spam2 = ();
   my %overlaps_ham1r = ();
   my %overlaps_spam1r = ();

   if ($freq_spam{$r1}) {
     foreach my $r2 (keys %hmap_spam) {
       next if $r1 eq $r2;

       my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1,
                                       $hmap_spam{$r2}, $hmap_spam{$r1});

       if ($a1ina2 > 0)
       {
         $overlaps_spam1r{$r2} = $a1ina2;

         if (exists $overlaps_spam1{$a1ina2})
         { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
         else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; }

         if (exists $overlaps_spam2{$a2ina1})
         { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; }
         else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; }
       }
     }
   }

   if ($freq_ham{$r1}) {
     foreach my $r2 (keys %hmap_ham) {
       next if $r1 eq $r2;

       my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2,
                                       $hmap_ham{$r2}, $hmap_ham{$r1});

       if ($a1ina2 > 0)
       {
         $overlaps_ham1r{$r2} = $a1ina2;

         if (exists $overlaps_ham1{$a1ina2})
         { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; }
         else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; }

         if (exists $overlaps_ham2{$a2ina1})
         { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; }
         else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; }
       }
     }
   }

   _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam", \%overlaps_ham1r, "ham");
   _print_overlap_ratios($r1, \%overlaps_ham1, \%overlaps_ham2, "ham", \%overlaps_spam1r, "spam");
 }

 sub _print_overlap_ratios {
   my ($r1, $hash1, $hash2, $type, $hash_other_type, $other_type) = @_;

   return unless defined $r1;

   if ($opt_d) {
     print qq{ <overlap class='$type'> };
   }

   my %other_type_rules = %$hash_other_type;

   foreach my $ratio (sort { $b <=> $a } keys %$hash1) {
     $ratio ||= 0;
     last if ($ratio < 20 && $type eq 'spam');	# 20% cutoff for spam
     #last if ($ratio < 1 && $type eq 'ham');	# follow ham all the way to zero for tuning analysis unless this is uncommented and set
     my $rules = _prettify_overlap_rules($r1, $hash1->{$ratio});
     next if ($rules eq '');

     foreach my $r2 (split(' ', $rules)) {
       $r2 =~ s/\[(.*?)\]$//;
       my $reverse_ratio = $1 || 0;
       next unless defined $r2;

       ## If the ham hits are negligible, and there are spam hits, then it's not useful for tuning - skip
       #next if ($ratio < 2 && $type eq 'ham' && exists $other_type_rules{$r2} );

       my $is_subrule = ($meta_subrule_pairs{"$r1.$r2"}
                       || $meta_subrule_pairs{"$r2.$r1"});

       if ($opt_d) {
         print qq{
           <overlappair>
             <overlaprules ratio='$ratio'><r1>$r2</r1><r2>$r1</r2></overlaprules>
             <overlaprules ratio='$reverse_ratio'><r1>$r1</r1><r2>$r2</r2></overlaprules>
           </overlappair>
         };

       } else {
         printf "  overlap %4s: %3d%% of %s hits also hit %s; %3d%% of %s hits also hit %s%s%s\n",
                     $type, $ratio, $r1, $r2,
                     $reverse_ratio, $r2, $r1,
                     ($is_subrule ? ' (meta rule and subrule)' : ''),
                     (exists $other_type_rules{$r2} ? " ($other_type $other_type_rules{$r2}%)" : " (no $other_type)")
         ;
       }
     }
   }

   if ($opt_d) {
     print qq{ </overlap };
   }
 }

 sub _prettify_overlap_rules {
   my ($rule, $str) = @_;

   my @rules = sort split(' ', $str);
   if ($rules{$rule} && $rules{$rule}->{type} eq 'meta') {
     # ignore meta-subrules that match the rule they make up.
     # TODO: this is simplistic; it doesn't look to see if those subrules
     # are in turn meta rules with further subrules that should be ignored.
     # but it works well enough...

     my $code = $rules{$rule}->{code};
     @rules = grep {
       my $tmp = $_;
       $tmp =~ s/\[.*\]$//;
       if ($code =~ /\b\Q${tmp}\E\b/) {
         $meta_subrule_pairs{"$rule.$tmp"} = 1;
         0;
       } else {
         1;
       }
     } @rules;
   }
   return join (' ', @rules);
 }

 sub _hmap_to_overlap_ratio {
   my ($r1, $r2, $hmap1, $hmap2) = @_;

   # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n";

   # count bits in each, so we can show when one is fully subsumed by another
   # with perl's support for bitstring ops, we get C speed here, nice!

   # no hits on either? this would normally give a 100% hitrate match,
   # but that's misleading -- so hide it by giving it a 0% overlap.
   #
   # also, ignore cases where there are no hits on *one* of the rules,
   # while there are hits on the other -- after all, if one rule doesn't
   # have a single hit, it cannot overlap.

   my $a1 = unpack("%32b*", $hmap1);
   return (0,0) unless $a1;
   my $a2 = unpack("%32b*", $hmap2);
   return (0,0) unless $a2;

   my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2));

   # ceiling rather than truncate, so that small overlaps are not lost
   my $a1_in_a2 = int ((($a1_and_a2 / ($a2 || 0.0001))*100) + 0.99);
   my $a2_in_a1 = int ((($a1_and_a2 / ($a1 || 0.0001))*100) + 0.99);

   return ($a1_in_a2, $a2_in_a1);
 }


 sub readscores {
   my($cffile) = @_;
   my $tmpf = "tmp/rules$$.pl";
   my $genscores = "";
   if ($opt_g) {
     $genscores = "-d \"$FindBin::Bin/../rulesrc/scores\"";
   }
   if (system (
       "$FindBin::Bin/../build/parse-rules-for-masses -d \"$cffile\" $genscores -s $opt_s -o $tmpf"
     ))
   {
     warn "parse-rules-for-masses failed!";
   }
   eval {
     require "./$tmpf";
   };
   if ($@) {
     warn "$tmpf is unparseable: $@";
     $rules_pl_unparseable = 1;
     # carry on anyway (for most uses); leave the tmp file behind for
     # possible debugging
   } else {
     $rules_pl_unparseable = 0;
     unlink $tmpf;
   }
 }

 sub soratio {
   my ($s, $n) = @_;

   $s ||= 0;
   $n ||= 0;

   if ($s + $n > 0) {
       return $s / ($s + $n);
   } else {
       return 0.5;		# no results -> not effective
   }
 }

 sub read_timings {
   if (!open (IN, "<timing.log")) {
     warn "hit-frequencies: cannot read 'timing.log', timings will be 0";
     return;
   }
   my $ver = <IN>;
   if ($ver !~ /^v1/) {
     warn "hit-frequencies: unknown version in 'timing.log', timings will be 0";
     close IN;
     return;
   }
   while (<IN>) {
     if (/^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
       my ($name, $duration, $max, $runs) = ($1,$2,$3,$4);
       $rule_times{$name} = ($duration / ($runs||0.00001)) * 1000;
     }
   }
   close IN;
 }