blob: 057b431ba83531fdadd26131929042431e65190a [file] [log] [blame]
#!/usr/bin/perl
use strict;
use warnings;
use File::Temp ();
use LWP::Simple;
use URI::Escape;
use Data::Dumper;
my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; }
my $MAKE_CACHE; $MAKE_CACHE = 1; # turn this on by default, no harm
# we allow promotion of rules that are "ifplugin" one of these
my @def_plugins = map {
s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs;
$_;
} <lib/Mail/SpamAssassin/Plugin/*.pm>;
my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$";
# number of days to look back; if a rule isn't listed as promotable on
# all of these days, it won't be listed. (we grant an exception for
# new rules that didn't exist on previous days, however, so new rules
# can be published quickly to handle sudden outbreaks without requiring
# manual update work)
my @DAYS_REQUIRED = (1, 2, 3);
# S/O threshold required
my $SO_THRESHOLD = .8;
###########################################################################
print q{
Bad performing rules, from the past 3 night's mass-checks.
(Note: 'net' rules will be listed as 'no hits' unless you set 'tflags net'.
This also applies for meta rules which use 'net' rules.)
};
my $cgi_url = "https://ruleqa.spamassassin.org/";
my @doc = ();
my $cache = 'ruleqa.cache.';
my $submitters = '';
my $url; # tracks the last day used
my $dayoffset = 0;
foreach my $day (@DAYS_REQUIRED) {
if (!$FROM_CACHE || !-f $cache.$day) {
with_new_offset:
$url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
warn "HTTP get: $url\n";
$doc[$day] = get ($url);
if (!$doc[$day]) {
die "HTTP get failed: $doc[$day]\n";
}
if ($MAKE_CACHE) {
if (open(O, ">$cache$day")) {
print O $doc[$day]; close O;
}
}
}
else {
open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I;
}
###########################################################################
# the HTML looks like:
#
# <span class="daterev_masscheck_description" class="mcviewing">
# ...
# <em><span class="mcsubmitters"> ....... </span></em>
# ...
# </span>
#
# in other words, the machine-parseable metadata is embedded in the HTML
# as a microformat.
if ($doc[$day] =~ m{
<span\s+class="daterev_masscheck_description\smcviewing"
.{0,400}
<span\s+class="mcsubmitters">\s*(.*?)\s*</span>
}sx)
{
my $daysubs = $1;
# ignore days when the mass-check sets contain a --net log, since
# it's the weekly --net run. That generally contains a much
# smaller set of logs (since it takes longer to run mass-check --net)
# so the results are untrustworthy.
if ($daysubs =~ /(?:^|\s)net-/) {
warn "day $day contains a --net mass-check! offsetting by an extra day\n";
$dayoffset++; goto with_new_offset;
}
($submitters ne '') and $submitters .= "; ";
$submitters .= "day $day: $daysubs";
}
else {
loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
}
}
###########################################################################
# <rule><test>__HIGHBITS</test><promo>0</promo>
# <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so>
# <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule>
my $plist;
foreach my $day (@DAYS_REQUIRED) {
while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) {
my $xml = $1;
my $obj = { };
while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
$obj->{$1} = $2;
}
while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
$obj->{$1} = uri_unescape($2);
}
my $name = $obj->{test};
$obj->{detailhref} = $cgi_url.$obj->{detailhref};
$plist->[$day]->{$name} = $obj;
}
if (!scalar keys %{$plist->[$day]}) {
loghtml_die("no rules found? on day $day");
}
}
###########################################################################
## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump;
# use SpamAssassin classes directly, so we can lint rules
# as we go
use lib 'lib';
use Mail::SpamAssassin;
my $mailsa = Mail::SpamAssassin->new({
rules_filename => "rules",
site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )),
local_tests_only => 1,
dont_copy_prefs => 1,
config_tree_recurse => 1,
keep_config_parsing_metadata => 1,
# debug => 1,
});
# hack hack hack!! we don't want to load plugin files twice,
# and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm
# to rules/*.pm, they would otherwise appear twice.
foreach my $fname (<rules/*.pm>) {
my $path = File::Spec->rel2abs($fname);
$INC{$path} = 1;
# warn "JMD $path";
}
my %rules_with_errors = ();
my %killed_rules = ();
my %count_rules = ();
my %report_bad_subrules = ();
$mailsa->{lint_callback} = sub {
my %opts = @_;
# ignore non-rule-issue lint failures
return if ($opts{msg} =~ /(?:
score\sset\sfor\snon-existent|
description\sexists
)/x);
warn "demoting $opts{rule}: $opts{msg}";
if ($opts{iserror}) {
$rules_with_errors{$opts{rule}}++;
}
};
$mailsa->lint_rules();
# print "# active ruleset list, automatically generated from $cgi_url\n";
# print "# with results from: $submitters\n";
my @spcs = ($submitters =~ /\s+/g);
if (scalar @spcs < 2) {
die "not generating results; less than 3 submitter results available!\n";
}
# base most of our decisions off day 1 (last night's mass-checks).
# note: meta rules must come before their __SUBRULES in this sort;
# default lexical sort will do this.
foreach my $plistkey (sort keys %{$plist->[1]}) {
my $name = $plistkey;
my $plistobj = $plist->[1]->{$plistkey};
my $notes = '';
# rules in sandboxes without a T_ prefix, will be renamed during the
# ruleqa process... in other words, the output freqs line will talk
# about rule "T_FOO". if there's a rule "FOO" defined, assume that's
# the one being talked about.
my $no_t = $name;
if ($no_t =~ s/^T_//) {
if (defined $mailsa->{conf}->{scores}->{$no_t}) {
$name = $no_t;
}
}
# ignore rules that don't exist (if they have a desc or score,
# they exist according to the Conf parser)
next unless ($mailsa->{conf}->{descriptions}->{$name}
|| $mailsa->{conf}->{scores}->{$name});
my $tfs = $mailsa->{conf}->{tflags}->{$name} || '';
my $src = $mailsa->{conf}->{source_file}->{$name};
if ( defined $src ) {
$count_rules{$src}++;
} else {
$count_rules{'not_present'}++;
}
# skip rules of these tflags, we cannot judge them without more data
if ($tfs =~ /\b(?:userconf|learn|net)\b/) {
next;
}
# rules that fail lint
next if $rules_with_errors{$name};
# subrules with ok parent rules
if ($name =~ /^__/ && !$report_bad_subrules{$name}) {
# print " # ignoring subrule $name: parent rules seem fine\n";
next;
}
# certain tests need to be reversed for "nice" rules
my $is_nice = 0;
if ($tfs =~ /\bnice\b/) { $is_nice = 1; }
my $valid = 1; # number of nights the rule appears in
my $so = $plist->[1]->{$plistkey}->{so};
if (defined $plist->[2]->{$plistkey}->{so}) {
$so += $plist->[2]->{$plistkey}->{so}; $valid++;
}
if (defined $plist->[3]->{$plistkey}->{so}) {
$so += $plist->[3]->{$plistkey}->{so}; $valid++;
}
$so /= $valid; # average across all 3
my $adj_so;
if ($is_nice) {
$adj_so = 1.0 - $so; # 0.0 => 1.0
} else {
$adj_so = $so;
}
next unless ($adj_so < $SO_THRESHOLD);
my $target = ($is_nice ? 'hpc' : 'spc');
my $spc = $plist->[1]->{$plistkey}->{$target};
$spc += $plist->[2]->{$plistkey}->{$target} || 0;
$spc += $plist->[3]->{$plistkey}->{$target} || 0;
$spc /= $valid;
$target = ($is_nice ? 'spc' : 'hpc');
my $hpc = $plist->[1]->{$plistkey}->{$target};
$hpc += $plist->[2]->{$plistkey}->{$target} || 0;
$hpc += $plist->[3]->{$plistkey}->{$target} || 0;
$hpc /= $valid;
if ($spc <= 0.0001) {
if ($hpc <= 0.0001) {
badrule($name, "no hits at all");
} else {
badrule($name, "no hits of target type");
}
next;
}
badrule($name, "bad, avg S/O=".sprintf("%.2f",$so)." ".
"avg Spam%=".sprintf("%.2f",$spc)." ".
"avg Ham%=".sprintf("%.2f",$hpc)
);
}
foreach my $srcfile (reverse sort keys %killed_rules) {
my $set = $killed_rules{$srcfile};
my $count = $count_rules{$srcfile};
my $c_bad = scalar keys %{$set};
print "\n$srcfile ($count rules, $c_bad bad):\n\n";
foreach my $name (sort keys %{$set}) {
my $reason = $set->{$name};
print " $name: $reason\n";
}
}
exit;
sub badrule {
my ($name, $reason) = @_;
my $src = $mailsa->{conf}->{source_file}->{$name};
if ( defined $src ) {
$killed_rules{$src}->{$name} = $reason;
} else {
$killed_rules{'not_present'}->{$name} = $reason;
}
# if it's a subrule in a meta rule, note this
# TODO: this only works reliably for lexically-previous meta rules;
# that's ok for __SUBRULES used in META_RULES, since "M" < "_".
if ($report_bad_subrules{$name}) {
$killed_rules{$src}->{$name} .= "\n # used in:$report_bad_subrules{$name}";
}
# if it's a meta rule, note that we can complain about its subrules too
foreach my $r (split ' ', $mailsa->{conf}->{meta_dependencies}->{$name} || '') {
$report_bad_subrules{$r} .= " ".$name;
}
}
sub loghtml_die {
die "$_[0]\nURL: $url\n";
}