blob: f1a10bdc95cec9e0a61b13251fecad517c7a4b96 [file] [log] [blame]
#!/usr/bin/perl
use strict;
use warnings;
use File::Temp ();
use LWP::Simple;
use URI::Escape;
use Data::Dumper;
my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; }
my $MAKE_CACHE; $MAKE_CACHE = 1; # turn this on by default, no harm
# we allow promotion of rules that are "ifplugin" one of these
my @def_plugins = map {
s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs;
$_;
} <lib/Mail/SpamAssassin/Plugin/*.pm>;
my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$";
# number of days to look back; if a rule isn't listed as promotable on
# all of these days, it won't be listed. (we grant an exception for
# new rules that didn't exist on previous days, however, so new rules
# can be published quickly to handle sudden outbreaks without requiring
# manual update work)
my @DAYS_REQUIRED = (1, 2, 3, 4, 5);
###########################################################################
my $cgi_url = "https://ruleqa.spamassassin.org/";
my @doc = ();
my $cache = 'ruleqa.cache.';
my $submitters = '';
my $last_net;
my %outputs;
if (!$FROM_CACHE || !-f "${cache}net" || (-M "${cache}net") > 0.5) {
my $neturl = $cgi_url."last-net?xml=1";
warn "HTTP get: $neturl\n";
$last_net = get ($neturl);
if (!$last_net) {
die "HTTP get failed: last-net\n";
}
if ($MAKE_CACHE) {
open(O, ">${cache}net"); print O $last_net; close O;
}
} else {
open(I, "<${cache}net") or die; $last_net = join('',<I>); close I;
}
if ($last_net =~ m{
<span\s+class="daterev_masscheck_description\smcviewing"
.{0,400}
<span\s+class="mcsubmitters">\s*(.*?)\s*</span>
}sx)
{
my $netsubs = $1;
($submitters ne '') and $submitters .= "; ";
$submitters .= "last-net: $netsubs";
} else {
loghtml_die("no 'mcviewing', 'mcsubmitters' microformats for last-net");
}
my $netlist;
while ($last_net =~ m!<rule>(.*?)</rule>!xg) {
my $xml = $1;
my $obj = { };
while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
$obj->{$1} = $2;
}
while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
$obj->{$1} = uri_unescape($2);
}
my $name = $obj->{test};
$obj->{detailhref} = $cgi_url.$obj->{detailhref};
$netlist->{$name} = $obj;
}
if (!scalar keys %{$netlist}) {
loghtml_die("no rules found? on last-net");
}
my $url; # tracks the last day used
my $dayoffset = 0;
foreach my $day (@DAYS_REQUIRED) {
if (!$FROM_CACHE || !-f $cache.$day || (-M $cache.$day) > 0.5) {
with_new_offset:
$url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
warn "HTTP get: $url\n";
$doc[$day] = get ($url);
if (!$doc[$day]) {
die "HTTP get failed: $doc[$day]\n";
}
if ($MAKE_CACHE) {
open(O, ">$cache$day"); print O $doc[$day]; close O;
}
}
else {
open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I;
}
###########################################################################
# the HTML looks like:
#
# <span class="daterev_masscheck_description" class="mcviewing">
# ...
# <em><span class="mcsubmitters"> ....... </span></em>
# ...
# </span>
#
# in other words, the machine-parseable metadata is embedded in the HTML
# as a microformat.
if ($doc[$day] =~ m{
<span\s+class="daterev_masscheck_description\smcviewing"
.{0,400}
<span\s+class="mcsubmitters">\s*(.*?)\s*</span>
}sx)
{
my $daysubs = $1;
# ignore days when the mass-check sets contain a --net log, since
# it's the weekly --net run. That generally contains a much
# smaller set of logs (since it takes longer to run mass-check --net)
# so the results are untrustworthy.
if ($daysubs =~ /(?:^|\s)net-/) {
warn "day $day contains a --net mass-check! offsetting by an extra day\n";
$dayoffset++; goto with_new_offset;
}
($submitters ne '') and $submitters .= "; ";
$submitters .= "day $day: $daysubs";
}
else {
loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
}
}
###########################################################################
# <rule><test>__HIGHBITS</test><promo>0</promo>
# <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so>
# <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule>
my $plist;
my %ruleslist;
foreach my $day (@DAYS_REQUIRED) {
while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) {
my $xml = $1;
my $obj = { };
while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
$obj->{$1} = $2;
}
while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
$obj->{$1} = uri_unescape($2);
}
my $name = $obj->{test};
$ruleslist{$name} = 1;
$obj->{detailhref} = $cgi_url.$obj->{detailhref};
$plist->[$day]->{$name} = $obj;
}
if (!scalar keys %{$plist->[$day]}) {
loghtml_die("no rules found? on day $day");
}
}
sub median_array {
my @vals = sort {$a <=> $b} @_;
my $len = @vals;
if($len%2) #odd?
{
return $vals[int($len/2)];
}
else #even
{
return ($vals[int($len/2)-1] + $vals[int($len/2)])/2;
}
}
###########################################################################
## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump;
# use SpamAssassin classes directly, so we can lint rules
# as we go
use lib 'lib';
use Mail::SpamAssassin;
my $mailsa = Mail::SpamAssassin->new({
rules_filename => "rules",
site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )),
local_tests_only => 1,
dont_copy_prefs => 1,
config_tree_recurse => 1,
keep_config_parsing_metadata => 1,
# debug => 1,
});
# hack hack hack!! we don't want to load plugin files twice,
# and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm
# to rules/*.pm, they would otherwise appear twice.
foreach my $fname (<rules/*.pm>) {
my $path = File::Spec->rel2abs($fname);
$INC{$path} = 1;
# warn "JMD $path";
}
my %rules_with_errors = ();
$mailsa->{lint_callback} = sub {
my %opts = @_;
# ignore non-rule-issue lint failures
return if ($opts{msg} =~ /(?:
score\sset\sfor\snon-existent|
description\sexists
)/x);
warn "demoting $opts{rule}: $opts{msg}";
if ($opts{iserror}) {
$rules_with_errors{$opts{rule}}++;
}
};
$mailsa->lint_rules();
print "# DO NOT EDIT: file generated by build/mkupdates/listpromotable\n";
print "# active ruleset list, automatically generated from $cgi_url\n";
print "# with results from: $submitters\n";
my @spcs = ($submitters =~ /\s+/g);
if (scalar @spcs < 2) {
die "not generating results; less than 3 submitter results available!\n";
}
foreach my $netrule (sort keys %{$netlist}) {
my $name = $netrule;
my $notes = '';
next if ($name =~ /^__/);
my $no_t = $name;
if ($no_t =~ s/^T_//) {
if (defined $mailsa->{conf}->{scores}->{$no_t}) {
$name = $no_t;
}
}
# now that it's ok to have sandbox rules without a T_ prefix,
# "T_" prefix implies "tflags nopublish"
next if ($name =~ /^T_/);
# ignore rules that don't exist (if they have a desc or score,
# they exist according to the Conf parser)
next unless ($mailsa->{conf}->{descriptions}->{$name}
|| $mailsa->{conf}->{scores}->{$name});
my $tfs = $mailsa->{conf}->{tflags}->{$name};
# "nopublish" tflags
if ($tfs) {
next if ($tfs =~ /\bnopublish\b/);
}
next if ($mailsa->{conf}->{testrules}->{$name});
if ($tfs && $tfs =~ /\b(net)\b/) {
$notes = "tflags ".$1;
goto publish;
}
next;
publish:
$outputs{$name} = $notes unless defined $outputs{$name};
}
foreach my $name (keys %ruleslist) {
next if $name =~ /^__/;
my $plistobj = $plist->[1]->{$name};
my $notes = '';
# rules in sandboxes without a T_ prefix, will be renamed during the
# ruleqa process... in other words, the output freqs line will talk
# about rule "T_FOO". if there's a rule "FOO" defined, assume that's
# the one being talked about.
my $no_t = $name;
if ($no_t =~ s/^T_//) {
if (defined $mailsa->{conf}->{scores}->{$no_t}) {
$name = $no_t;
}
}
# now that it's ok to have sandbox rules without a T_ prefix,
# "T_" prefix implies "tflags nopublish"
next if ($name =~ /^T_/);
# ignore rules that don't exist (if they have a desc or score,
# they exist according to the Conf parser)
next unless ($mailsa->{conf}->{descriptions}->{$name}
|| $mailsa->{conf}->{scores}->{$name});
# "nopublish" tflags
my $tfs = $mailsa->{conf}->{tflags}->{$name};
if ($tfs) {
next if ($tfs =~ /\bnopublish\b/);
if ($tfs =~ /\b(publish)\b/) {
$notes = "tflags ".$1;
goto publish;
}
}
# rule was from a file marked with "#testrules" (bug 5545)
# note: this is after "tflags publish" support, so you can override
# it on a rule-by-rule basis anyway
next if ($mailsa->{conf}->{testrules}->{$name});
# bug 6560, unless specifically declared #testrules;
# all of these tflags force publication;
# include "net", since otherwise this script has to be aware
# what day of the week it is for weekly net/non-net mass-checks!
# very messy. TODO?
if ($tfs && $tfs =~ /\b(userconf|learn|net)\b/) {
$notes = "tflags ".$1;
goto publish;
}
# only rules from "rulesrc" dirs
my $src = $mailsa->{conf}->{source_file}->{$name};
next if (!$src || $src !~ /rulesrc/);
# rules that fail lint
next if $rules_with_errors{$name};
# base active on DAYS_REQUIRED days of checks
# Find median for promo
my @promo_arr;
foreach my $day (@DAYS_REQUIRED) {
if (defined $plist->[$day]->{$name}) {
push (@promo_arr, $plist->[$day]->{$name}{promo});
}
if (defined $plist->[$day]->{"T_$name"}) {
push (@promo_arr, $plist->[$day]->{"T_$name"}{promo});
}
}
my $is_promo = median_array(@promo_arr);
next unless $is_promo gt 0;
# that require a plugin we won't have
my $skip = 0;
my $ifs = $mailsa->{conf}->{if_stack}->{$name};
while ($ifs && $ifs =~ /plugin\s*\((.+?)\)/g) {
my $pkg = $1;
# grep out the ones we *do* have, and do use in "ifplugin"
# lines in "rulesrc", here...
next; #JMD:
next if ($pkg =~ /${PROMOTABLE_PLUGINS_RE}/o);
print "\n# not publishing $name: needs $ifs\n";
$skip++;
}
next if $skip;
# don't output the ever-changing bits of data
# $notes = "spam=$plistobj->{spc} ham=$plistobj->{hpc} so=$plistobj->{so}";
$notes = "good enough";
publish:
$outputs{$name} = $notes unless defined $outputs{$name};
}
foreach my $key (sort(keys %outputs)) {
print "\n# ", $outputs{$key}, "\n", $key, "\n";
}
exit;
sub loghtml_die {
die "$_[0]\nURL: $url\n";
}