masses/rule-qa/list-bad-rules - spamassassin - Git at Google

 #!/usr/bin/perl

 use strict;
 use warnings;

 use File::Temp ();
 use LWP::Simple;
 use URI::Escape;
 use Data::Dumper;

 my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; }
 my $MAKE_CACHE; $MAKE_CACHE = 1;        # turn this on by default, no harm

 # we allow promotion of rules that are "ifplugin" one of these
 my @def_plugins = map {
     s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs;
     $_;
   } <lib/Mail/SpamAssassin/Plugin/*.pm>;

 my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$";

 # number of days to look back; if a rule isn't listed as promotable on
 # all of these days, it won't be listed.  (we grant an exception for
 # new rules that didn't exist on previous days, however, so new rules
 # can be published quickly to handle sudden outbreaks without requiring
 # manual update work)
 my @DAYS_REQUIRED = (1, 2, 3);

 # S/O threshold required
 my $SO_THRESHOLD = .8;

 ###########################################################################

 print q{
 Bad performing rules, from the past 3 night's mass-checks.

 (Note: 'net' rules will be listed as 'no hits' unless you set 'tflags net'.
 This also applies for meta rules which use 'net' rules.)

 };

 my $cgi_url = "https://ruleqa.spamassassin.org/";
 my @doc = ();
 my $cache = 'ruleqa.cache.';
 my $submitters = '';

 my $url;        # tracks the last day used
 my $dayoffset = 0;
 foreach my $day (@DAYS_REQUIRED) {
   if (!$FROM_CACHE || !-f $cache.$day) {

 with_new_offset:
     $url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
     warn "HTTP get: $url\n";

     $doc[$day] = get ($url);
     if (!$doc[$day]) {
       die "HTTP get failed: $doc[$day]\n";
     }

     if ($MAKE_CACHE) {
       if (open(O, ">$cache$day")) {
         print O $doc[$day]; close O;
       }
     }
   }
   else {
     open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I;
   }

 ###########################################################################

   # the HTML looks like:
   #
   #   <span class="daterev_masscheck_description" class="mcviewing">
   #   ...
   #   <em><span class="mcsubmitters"> ....... </span></em>
   #   ...
   #   </span>
   #
   # in other words, the machine-parseable metadata is embedded in the HTML
   # as a microformat.

   if ($doc[$day] =~ m{
           <span\s+class="daterev_masscheck_description\smcviewing"
           .{0,400}
           <span\s+class="mcsubmitters">\s*(.*?)\s*</span>
         }sx)
   {
     my $daysubs = $1;

     # ignore days when the mass-check sets contain a --net log, since
     # it's the weekly --net run.  That generally contains a much
     # smaller set of logs (since it takes longer to run mass-check --net)
     # so the results are untrustworthy.
     if ($daysubs =~ /(?:^|\s)net-/) {
       warn "day $day contains a --net mass-check! offsetting by an extra day\n";
       $dayoffset++; goto with_new_offset;
     }

     ($submitters ne '') and $submitters .= "; ";
     $submitters .= "day $day: $daysubs";
   }
   else {
     loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
   }
 }

 ###########################################################################

 # <rule><test>__HIGHBITS</test><promo>0</promo>
 # <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so>
 # <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule>

 my $plist;
 foreach my $day (@DAYS_REQUIRED) {
   while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) {
     my $xml = $1;
     my $obj = { };

     while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
       $obj->{$1} = $2;
     }
     while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
       $obj->{$1} = uri_unescape($2);
     }

     my $name = $obj->{test};
     $obj->{detailhref} = $cgi_url.$obj->{detailhref};

     $plist->[$day]->{$name} = $obj;
   }

   if (!scalar keys %{$plist->[$day]}) {
     loghtml_die("no rules found? on day $day");
   }
 }

 ###########################################################################

 ## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump;

 # use SpamAssassin classes directly, so we can lint rules
 # as we go
 use lib 'lib';
 use Mail::SpamAssassin;

 my $mailsa = Mail::SpamAssassin->new({
     rules_filename => "rules",
     site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )),
     local_tests_only => 1,
     dont_copy_prefs => 1,
     config_tree_recurse => 1,
     keep_config_parsing_metadata => 1,
     # debug => 1,
 });

 # hack hack hack!!  we don't want to load plugin files twice,
 # and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm
 # to rules/*.pm, they would otherwise appear twice.
 foreach my $fname (<rules/*.pm>) {
   my $path = File::Spec->rel2abs($fname);
   $INC{$path} = 1;
   # warn "JMD $path";
 }

 my %rules_with_errors = ();
 my %killed_rules = ();
 my %count_rules = ();
 my %report_bad_subrules = ();

 $mailsa->{lint_callback} = sub {
   my %opts = @_;

   # ignore non-rule-issue lint failures
   return if ($opts{msg} =~ /(?:
         score\sset\sfor\snon-existent|
         description\sexists
     )/x);

   warn "demoting $opts{rule}: $opts{msg}";
   if ($opts{iserror}) {
     $rules_with_errors{$opts{rule}}++;
   }
 };

 $mailsa->lint_rules();

 # print "# active ruleset list, automatically generated from $cgi_url\n";
 # print "# with results from: $submitters\n";

 my @spcs = ($submitters =~ /\s+/g);
 if (scalar @spcs < 2) {
   die "not generating results; less than 3 submitter results available!\n";
 }

 # base most of our decisions off day 1 (last night's mass-checks).
 # note: meta rules must come before their __SUBRULES in this sort;
 # default lexical sort will do this.
 foreach my $plistkey (sort keys %{$plist->[1]}) {
   my $name = $plistkey;
   my $plistobj = $plist->[1]->{$plistkey};
   my $notes = '';

   # rules in sandboxes without a T_ prefix, will be renamed during the
   # ruleqa process... in other words, the output freqs line will talk
   # about rule "T_FOO".   if there's a rule "FOO" defined, assume that's
   # the one being talked about.
   my $no_t = $name;
   if ($no_t =~ s/^T_//) {
     if (defined $mailsa->{conf}->{scores}->{$no_t}) {
       $name = $no_t;
     }
   }

   # ignore rules that don't exist (if they have a desc or score,
   # they exist according to the Conf parser)
   next unless ($mailsa->{conf}->{descriptions}->{$name}
         || $mailsa->{conf}->{scores}->{$name});

   my $tfs = $mailsa->{conf}->{tflags}->{$name} || '';

   my $src = $mailsa->{conf}->{source_file}->{$name};
   if ( defined $src ) {
     $count_rules{$src}++;
   } else {
     $count_rules{'not_present'}++;
   }

   # skip rules of these tflags, we cannot judge them without more data
   if ($tfs =~ /\b(?:userconf|learn|net)\b/) {
     next;
   }

   # rules that fail lint
   next if $rules_with_errors{$name};

   # subrules with ok parent rules
   if ($name =~ /^__/ && !$report_bad_subrules{$name}) {
     # print "  # ignoring subrule $name: parent rules seem fine\n";
     next;
   }

   # certain tests need to be reversed for "nice" rules
   my $is_nice = 0;
   if ($tfs =~ /\bnice\b/) { $is_nice = 1; }

   my $valid = 1;    # number of nights the rule appears in
   my $so = $plist->[1]->{$plistkey}->{so};
   if (defined $plist->[2]->{$plistkey}->{so}) {
       $so += $plist->[2]->{$plistkey}->{so}; $valid++;
   }
   if (defined $plist->[3]->{$plistkey}->{so}) {
       $so += $plist->[3]->{$plistkey}->{so}; $valid++;
   }
   $so /= $valid;      # average across all 3
   my $adj_so;
   if ($is_nice) {
     $adj_so = 1.0 - $so;            # 0.0 => 1.0
   } else {
     $adj_so = $so;
   }
   next unless ($adj_so < $SO_THRESHOLD);

   my $target = ($is_nice ? 'hpc' : 'spc');
   my $spc = $plist->[1]->{$plistkey}->{$target};
   $spc += $plist->[2]->{$plistkey}->{$target} || 0;
   $spc += $plist->[3]->{$plistkey}->{$target} || 0;
   $spc /= $valid;

   $target = ($is_nice ? 'spc' : 'hpc');
   my $hpc = $plist->[1]->{$plistkey}->{$target};
   $hpc += $plist->[2]->{$plistkey}->{$target} || 0;
   $hpc += $plist->[3]->{$plistkey}->{$target} || 0;
   $hpc /= $valid;

   if ($spc <= 0.0001) {
     if ($hpc <= 0.0001) {
       badrule($name, "no hits at all");
     } else {
       badrule($name, "no hits of target type");
     }
     next;
   }

   badrule($name, "bad, avg S/O=".sprintf("%.2f",$so)." ".
                         "avg Spam%=".sprintf("%.2f",$spc)." ".
                         "avg Ham%=".sprintf("%.2f",$hpc)
               );
 }

 foreach my $srcfile (reverse sort keys %killed_rules) {
   my $set = $killed_rules{$srcfile};
   my $count = $count_rules{$srcfile};
   my $c_bad = scalar keys %{$set};

   print "\n$srcfile ($count rules, $c_bad bad):\n\n";

   foreach my $name (sort keys %{$set}) {
     my $reason = $set->{$name};
     print "  $name:  $reason\n";
   }
 }

 exit;

 sub badrule {
   my ($name, $reason) = @_;
   my $src = $mailsa->{conf}->{source_file}->{$name};
   if ( defined $src ) {
     $killed_rules{$src}->{$name} = $reason;
   } else {
     $killed_rules{'not_present'}->{$name} = $reason;
   }

   # if it's a subrule in a meta rule, note this
   # TODO: this only works reliably for lexically-previous meta rules;
   # that's ok for __SUBRULES used in META_RULES, since "M" < "_".
   if ($report_bad_subrules{$name}) {
     $killed_rules{$src}->{$name} .= "\n      # used in:$report_bad_subrules{$name}";
   }

   # if it's a meta rule, note that we can complain about its subrules too
   foreach my $r (split ' ', $mailsa->{conf}->{meta_dependencies}->{$name} || '') {
     $report_bad_subrules{$r} .= " ".$name;
   }
 }

 sub loghtml_die {
   die "$_[0]\nURL: $url\n";
 }
	#!/usr/bin/perl

	use strict;
	use warnings;

	use File::Temp ();
	use LWP::Simple;
	use URI::Escape;
	use Data::Dumper;

	my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; }
	my $MAKE_CACHE; $MAKE_CACHE = 1; # turn this on by default, no harm

	# we allow promotion of rules that are "ifplugin" one of these
	my @def_plugins = map {
	s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs;
	$_;
	} <lib/Mail/SpamAssassin/Plugin/*.pm>;

	my $PROMOTABLE_PLUGINS_RE = "^" . join("\|", @def_plugins) . "\$";

	# number of days to look back; if a rule isn't listed as promotable on
	# all of these days, it won't be listed. (we grant an exception for
	# new rules that didn't exist on previous days, however, so new rules
	# can be published quickly to handle sudden outbreaks without requiring
	# manual update work)
	my @DAYS_REQUIRED = (1, 2, 3);

	# S/O threshold required
	my $SO_THRESHOLD = .8;

	###########################################################################

	print q{
	Bad performing rules, from the past 3 night's mass-checks.

	(Note: 'net' rules will be listed as 'no hits' unless you set 'tflags net'.
	This also applies for meta rules which use 'net' rules.)

	};

	my $cgi_url = "https://ruleqa.spamassassin.org/";
	my @doc = ();
	my $cache = 'ruleqa.cache.';
	my $submitters = '';

	my $url; # tracks the last day used
	my $dayoffset = 0;
	foreach my $day (@DAYS_REQUIRED) {
	if (!$FROM_CACHE \|\| !-f $cache.$day) {

	with_new_offset:
	$url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
	warn "HTTP get: $url\n";

	$doc[$day] = get ($url);
	if (!$doc[$day]) {
	die "HTTP get failed: $doc[$day]\n";
	}

	if ($MAKE_CACHE) {
	if (open(O, ">$cache$day")) {
	print O $doc[$day]; close O;
	}
	}
	}
	else {
	open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I;
	}

	###########################################################################

	# the HTML looks like:
	#
	# <span class="daterev_masscheck_description" class="mcviewing">
	# ...
	# <em><span class="mcsubmitters"> ....... </span></em>
	# ...
	# </span>
	#
	# in other words, the machine-parseable metadata is embedded in the HTML
	# as a microformat.

	if ($doc[$day] =~ m{
	<span\s+class="daterev_masscheck_description\smcviewing"
	.{0,400}
	<span\s+class="mcsubmitters">\s(.?)\s*</span>
	}sx)
	{
	my $daysubs = $1;

	# ignore days when the mass-check sets contain a --net log, since
	# it's the weekly --net run. That generally contains a much
	# smaller set of logs (since it takes longer to run mass-check --net)
	# so the results are untrustworthy.
	if ($daysubs =~ /(?:^\|\s)net-/) {
	warn "day $day contains a --net mass-check! offsetting by an extra day\n";
	$dayoffset++; goto with_new_offset;
	}

	($submitters ne '') and $submitters .= "; ";
	$submitters .= "day $day: $daysubs";
	}
	else {
	loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
	}
	}

	###########################################################################

	# <rule><test>__HIGHBITS</test><promo>0</promo>
	# <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so>
	# <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule>

	my $plist;
	foreach my $day (@DAYS_REQUIRED) {
	while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) {
	my $xml = $1;
	my $obj = { };

	while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
	$obj->{$1} = $2;
	}
	while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
	$obj->{$1} = uri_unescape($2);
	}

	my $name = $obj->{test};
	$obj->{detailhref} = $cgi_url.$obj->{detailhref};

	$plist->[$day]->{$name} = $obj;
	}

	if (!scalar keys %{$plist->[$day]}) {
	loghtml_die("no rules found? on day $day");
	}
	}

	###########################################################################

	## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump;

	# use SpamAssassin classes directly, so we can lint rules
	# as we go
	use lib 'lib';
	use Mail::SpamAssassin;

	my $mailsa = Mail::SpamAssassin->new({
	rules_filename => "rules",
	site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )),
	local_tests_only => 1,
	dont_copy_prefs => 1,
	config_tree_recurse => 1,
	keep_config_parsing_metadata => 1,
	# debug => 1,
	});

	# hack hack hack!! we don't want to load plugin files twice,
	# and since the mkrules compiler copies from rulesrc/sandbox//.pm
	# to rules/*.pm, they would otherwise appear twice.
	foreach my $fname (<rules/*.pm>) {
	my $path = File::Spec->rel2abs($fname);
	$INC{$path} = 1;
	# warn "JMD $path";
	}

	my %rules_with_errors = ();
	my %killed_rules = ();
	my %count_rules = ();
	my %report_bad_subrules = ();

	$mailsa->{lint_callback} = sub {
	my %opts = @_;

	# ignore non-rule-issue lint failures
	return if ($opts{msg} =~ /(?:
	score\sset\sfor\snon-existent\|
	description\sexists
	)/x);

	warn "demoting $opts{rule}: $opts{msg}";
	if ($opts{iserror}) {
	$rules_with_errors{$opts{rule}}++;
	}
	};

	$mailsa->lint_rules();

	# print "# active ruleset list, automatically generated from $cgi_url\n";
	# print "# with results from: $submitters\n";

	my @spcs = ($submitters =~ /\s+/g);
	if (scalar @spcs < 2) {
	die "not generating results; less than 3 submitter results available!\n";
	}

	# base most of our decisions off day 1 (last night's mass-checks).
	# note: meta rules must come before their __SUBRULES in this sort;
	# default lexical sort will do this.
	foreach my $plistkey (sort keys %{$plist->[1]}) {
	my $name = $plistkey;
	my $plistobj = $plist->[1]->{$plistkey};
	my $notes = '';

	# rules in sandboxes without a T_ prefix, will be renamed during the
	# ruleqa process... in other words, the output freqs line will talk
	# about rule "T_FOO". if there's a rule "FOO" defined, assume that's
	# the one being talked about.
	my $no_t = $name;
	if ($no_t =~ s/^T_//) {
	if (defined $mailsa->{conf}->{scores}->{$no_t}) {
	$name = $no_t;
	}
	}

	# ignore rules that don't exist (if they have a desc or score,
	# they exist according to the Conf parser)
	next unless ($mailsa->{conf}->{descriptions}->{$name}
	\|\| $mailsa->{conf}->{scores}->{$name});

	my $tfs = $mailsa->{conf}->{tflags}->{$name} \|\| '';

	my $src = $mailsa->{conf}->{source_file}->{$name};
	if ( defined $src ) {
	$count_rules{$src}++;
	} else {
	$count_rules{'not_present'}++;
	}

	# skip rules of these tflags, we cannot judge them without more data
	if ($tfs =~ /\b(?:userconf\|learn\|net)\b/) {
	next;
	}

	# rules that fail lint
	next if $rules_with_errors{$name};

	# subrules with ok parent rules
	if ($name =~ /^__/ && !$report_bad_subrules{$name}) {
	# print " # ignoring subrule $name: parent rules seem fine\n";
	next;
	}

	# certain tests need to be reversed for "nice" rules
	my $is_nice = 0;
	if ($tfs =~ /\bnice\b/) { $is_nice = 1; }

	my $valid = 1; # number of nights the rule appears in
	my $so = $plist->[1]->{$plistkey}->{so};
	if (defined $plist->[2]->{$plistkey}->{so}) {
	$so += $plist->[2]->{$plistkey}->{so}; $valid++;
	}
	if (defined $plist->[3]->{$plistkey}->{so}) {
	$so += $plist->[3]->{$plistkey}->{so}; $valid++;
	}
	$so /= $valid; # average across all 3
	my $adj_so;
	if ($is_nice) {
	$adj_so = 1.0 - $so; # 0.0 => 1.0
	} else {
	$adj_so = $so;
	}
	next unless ($adj_so < $SO_THRESHOLD);

	my $target = ($is_nice ? 'hpc' : 'spc');
	my $spc = $plist->[1]->{$plistkey}->{$target};
	$spc += $plist->[2]->{$plistkey}->{$target} \|\| 0;
	$spc += $plist->[3]->{$plistkey}->{$target} \|\| 0;
	$spc /= $valid;

	$target = ($is_nice ? 'spc' : 'hpc');
	my $hpc = $plist->[1]->{$plistkey}->{$target};
	$hpc += $plist->[2]->{$plistkey}->{$target} \|\| 0;
	$hpc += $plist->[3]->{$plistkey}->{$target} \|\| 0;
	$hpc /= $valid;

	if ($spc <= 0.0001) {
	if ($hpc <= 0.0001) {
	badrule($name, "no hits at all");
	} else {
	badrule($name, "no hits of target type");
	}
	next;
	}

	badrule($name, "bad, avg S/O=".sprintf("%.2f",$so)." ".
	"avg Spam%=".sprintf("%.2f",$spc)." ".
	"avg Ham%=".sprintf("%.2f",$hpc)
	);
	}

	foreach my $srcfile (reverse sort keys %killed_rules) {
	my $set = $killed_rules{$srcfile};
	my $count = $count_rules{$srcfile};
	my $c_bad = scalar keys %{$set};

	print "\n$srcfile ($count rules, $c_bad bad):\n\n";

	foreach my $name (sort keys %{$set}) {
	my $reason = $set->{$name};
	print " $name: $reason\n";
	}
	}

	exit;

	sub badrule {
	my ($name, $reason) = @_;
	my $src = $mailsa->{conf}->{source_file}->{$name};
	if ( defined $src ) {
	$killed_rules{$src}->{$name} = $reason;
	} else {
	$killed_rules{'not_present'}->{$name} = $reason;
	}

	# if it's a subrule in a meta rule, note this
	# TODO: this only works reliably for lexically-previous meta rules;
	# that's ok for __SUBRULES used in META_RULES, since "M" < "_".
	if ($report_bad_subrules{$name}) {
	$killed_rules{$src}->{$name} .= "\n # used in:$report_bad_subrules{$name}";
	}

	# if it's a meta rule, note that we can complain about its subrules too
	foreach my $r (split ' ', $mailsa->{conf}->{meta_dependencies}->{$name} \|\| '') {
	$report_bad_subrules{$r} .= " ".$name;
	}
	}

	sub loghtml_die {
	die "$_[0]\nURL: $url\n";
	}