| #!/usr/bin/perl |
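| # |
| # listpromotable: generate the list of rules eligible for publication in |
| # SpamAssassin rule updates, based on recent mass-check results from |
| # https://ruleqa.spamassassin.org/ .  Run with "--fromcache" to reuse |
| # cached ruleqa data (less than half a day old) instead of re-fetching it. |
| # |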
| |
| use strict; |
| use warnings; |
| |
| use File::Temp (); |
| use File::Spec; # File::Spec->rel2abs() is used below |
| use LWP::Simple; |
| use URI::Escape; |
| use Data::Dumper; |
| |
| my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; } |
| my $MAKE_CACHE = 1; # always write the cache files; there's no harm in it |
| |
| # we allow promotion of rules that are inside an "ifplugin" block for |
| # one of these plugins |
| my @def_plugins = map { |
| s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs; |
| $_; |
| } <lib/Mail/SpamAssassin/Plugin/*.pm>; |
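| # (so lib/Mail/SpamAssassin/Plugin/DKIM.pm becomes |
| # Mail::SpamAssassin::Plugin::DKIM, and so on) |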
| |
| my $PROMOTABLE_PLUGINS_RE = "^(?:" . join("|", @def_plugins) . ")\$"; |
| |
| # number of days of ruleqa results to look back over; a rule is only |
| # published if the median of its per-day "promotable" flags across these |
| # days is positive. (new rules that didn't exist on earlier days are |
| # still considered, so they can be published quickly to handle sudden |
| # outbreaks without requiring manual update work) |
| my @DAYS_REQUIRED = (1, 2, 3, 4, 5); |
| |
| ########################################################################### |
| |
| my $cgi_url = "https://ruleqa.spamassassin.org/"; |
| my @doc = (); |
| my $cache = 'ruleqa.cache.'; |
| my $submitters = ''; |
| my $last_net; |
| my %outputs; |
| |
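| # fetch the most recent --net mass-check summary ("last-net"), unless |
| # --fromcache was given and a cache file less than half a day old exists |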
| if (!$FROM_CACHE || !-f "${cache}net" || (-M "${cache}net") > 0.5) { |
| my $neturl = $cgi_url."last-net?xml=1"; |
| warn "HTTP get: $neturl\n"; |
| $last_net = get ($neturl); |
| if (!$last_net) { |
| die "HTTP get failed: last-net\n"; |
| } |
| if ($MAKE_CACHE) { |
| open(O, ">${cache}net"); print O $last_net; close O; |
| } |
| } else { |
| open(I, "<${cache}net") or die; $last_net = join('',<I>); close I; |
| } |
| |
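| # the ruleqa HTML embeds machine-parseable metadata as a microformat |
| # (see the longer comment in the per-day loop below); pull the submitter |
| # list for the last --net mass-check out of the "mcsubmitters" span |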
| if ($last_net =~ m{ |
| <span\s+class="daterev_masscheck_description\smcviewing" |
| .{0,400} |
| <span\s+class="mcsubmitters">\s*(.*?)\s*</span> |
| }sx) |
| { |
| |
| my $netsubs = $1; |
| ($submitters ne '') and $submitters .= "; "; |
| $submitters .= "last-net: $netsubs"; |
| } else { |
| loghtml_die("no 'mcviewing', 'mcsubmitters' microformats for last-net"); |
| } |
| |
| my $netlist; |
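| # each rule's results appear as an XML-ish <rule> element; parse its |
| # child elements into a hash, URI-unescaping any marked with esc="1", |
| # and key the hashes by rule name |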
| while ($last_net =~ m!<rule>(.*?)</rule>!xg) { |
| my $xml = $1; |
| my $obj = { }; |
| |
| while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) { |
| $obj->{$1} = $2; |
| } |
| while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) { |
| $obj->{$1} = uri_unescape($2); |
| } |
| |
| my $name = $obj->{test}; |
| $obj->{detailhref} = $cgi_url.$obj->{detailhref}; |
| |
| $netlist->{$name} = $obj; |
| } |
| |
| if (!$netlist || !scalar keys %{$netlist}) { |
| loghtml_die("no rules found? on last-net"); |
| } |
| |
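| # fetch (or read from cache) the ruleqa results for each day we need; |
| # $dayoffset is bumped whenever a day turns out to hold the weekly --net |
| # mass-check, so the next day back is tried instead |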
| my $url; # most recently fetched ruleqa URL, reported by loghtml_die() |
| my $dayoffset = 0; |
| foreach my $day (@DAYS_REQUIRED) { |
| if (!$FROM_CACHE || !-f $cache.$day || (-M $cache.$day) > 0.5) { |
| |
| with_new_offset: |
| $url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1"; |
| warn "HTTP get: $url\n"; |
| |
| $doc[$day] = get ($url); |
| if (!$doc[$day]) { |
| die "HTTP get failed: $doc[$day]\n"; |
| } |
| |
| if ($MAKE_CACHE) { |
| open(O, ">$cache$day"); print O $doc[$day]; close O; |
| } |
| } |
| else { |
| open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I; |
| } |
| |
| ########################################################################### |
| |
| # the HTML looks like: |
| # |
| # <span class="daterev_masscheck_description" class="mcviewing"> |
| # ... |
| # <em><span class="mcsubmitters"> ....... </span></em> |
| # ... |
| # </span> |
| # |
| # in other words, the machine-parseable metadata is embedded in the HTML |
| # as a microformat. |
| |
| if ($doc[$day] =~ m{ |
| <span\s+class="daterev_masscheck_description\smcviewing" |
| .{0,400} |
| <span\s+class="mcsubmitters">\s*(.*?)\s*</span> |
| }sx) |
| { |
| my $daysubs = $1; |
| |
| # ignore days when the mass-check sets contain a --net log, since |
| # it's the weekly --net run. That generally contains a much |
| # smaller set of logs (since it takes longer to run mass-check --net) |
| # so the results are untrustworthy. |
| if ($daysubs =~ /(?:^|\s)net-/) { |
| warn "day $day contains a --net mass-check! offsetting by an extra day\n"; |
| $dayoffset++; goto with_new_offset; |
| } |
| |
| ($submitters ne '') and $submitters .= "; "; |
| $submitters .= "day $day: $daysubs"; |
| } |
| else { |
| loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day"); |
| } |
| } |
| |
| ########################################################################### |
| |
| # <rule><test>__HIGHBITS</test><promo>0</promo> |
| # <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so> |
| # <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule> |
| |
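| # parse each day's <rule> elements into $plist->[$day], keyed by rule |
| # name, and record every rule name seen in %ruleslist |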
| my $plist; |
| my %ruleslist; |
| foreach my $day (@DAYS_REQUIRED) { |
| while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) { |
| my $xml = $1; |
| my $obj = { }; |
| |
| while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) { |
| $obj->{$1} = $2; |
| } |
| while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) { |
| $obj->{$1} = uri_unescape($2); |
| } |
| |
| my $name = $obj->{test}; |
| $ruleslist{$name} = 1; |
| $obj->{detailhref} = $cgi_url.$obj->{detailhref}; |
| |
| $plist->[$day]->{$name} = $obj; |
| } |
| |
| if (!$plist->[$day] || !scalar keys %{$plist->[$day]}) { |
| loghtml_die("no rules found? on day $day"); |
| } |
| } |
| |
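| # return the median of a numeric list (the average of the two middle |
| # values for an even-length list); used to combine the per-day "promo" |
| # values across @DAYS_REQUIRED |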
| sub median_array { |
| my @vals = sort {$a <=> $b} @_; |
| my $len = @vals; |
| return 0 unless $len; # guard: avoid undef warnings on an empty list |
| if($len%2) #odd? |
| { |
| return $vals[int($len/2)]; |
| } |
| else #even |
| { |
| return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; |
| } |
| } |
| |
| ########################################################################### |
| |
| ## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump; |
| |
| # use SpamAssassin classes directly, so we can lint rules |
| # as we go |
| use lib 'lib'; |
| use Mail::SpamAssassin; |
| |
| my $mailsa = Mail::SpamAssassin->new({ |
| rules_filename => "rules", |
| site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )), |
| local_tests_only => 1, |
| dont_copy_prefs => 1, |
| config_tree_recurse => 1, |
| keep_config_parsing_metadata => 1, |
| # debug => 1, |
| }); |
| |
| # hack hack hack!! we don't want to load plugin files twice, |
| # and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm |
| # to rules/*.pm, they would otherwise appear twice. |
| foreach my $fname (<rules/*.pm>) { |
| my $path = File::Spec->rel2abs($fname); |
| $INC{$path} = 1; |
| # warn "JMD $path"; |
| } |
| |
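| # lint the rules as they are loaded; any rule that triggers a genuine |
| # lint error is recorded here and excluded from the promotable list below |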
| my %rules_with_errors = (); |
| |
| $mailsa->{lint_callback} = sub { |
| my %opts = @_; |
| |
| # ignore non-rule-issue lint failures |
| return if ($opts{msg} =~ /(?: |
| score\sset\sfor\snon-existent| |
| description\sexists |
| )/x); |
| |
| warn "demoting $opts{rule}: $opts{msg}"; |
| if ($opts{iserror}) { |
| $rules_with_errors{$opts{rule}}++; |
| } |
| }; |
| |
| $mailsa->lint_rules(); |
| |
| print "# DO NOT EDIT: file generated by build/mkupdates/listpromotable\n"; |
| print "# active ruleset list, automatically generated from $cgi_url\n"; |
| print "# with results from: $submitters\n"; |
| |
| # sanity check: refuse to publish anything unless submitter info from |
| # several mass-checks is available |
| my @spcs = ($submitters =~ /\s+/g); |
| if (scalar @spcs < 2) { |
| die "not generating results; fewer than 3 submitter results available!\n"; |
| } |
| |
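| # first pass: rules from the last --net mass-check. net rules are only |
| # mass-checked weekly, so any existing, publishable rule carrying |
| # "tflags net" is published regardless of its promo value |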
| foreach my $netrule (sort keys %{$netlist}) { |
| my $name = $netrule; |
| my $notes = ''; |
| |
| next if ($name =~ /^__/); |
| |
| my $no_t = $name; |
| if ($no_t =~ s/^T_//) { |
| if (defined $mailsa->{conf}->{scores}->{$no_t}) { |
| $name = $no_t; |
| } |
| } |
| |
| # now that it's ok to have sandbox rules without a T_ prefix, |
| # "T_" prefix implies "tflags nopublish" |
| next if ($name =~ /^T_/); |
| |
| # ignore rules that don't exist (if they have a desc or score, |
| # they exist according to the Conf parser) |
| next unless ($mailsa->{conf}->{descriptions}->{$name} |
| || $mailsa->{conf}->{scores}->{$name}); |
| |
| my $tfs = $mailsa->{conf}->{tflags}->{$name}; |
| |
| # "nopublish" tflags |
| if ($tfs) { |
| next if ($tfs =~ /\bnopublish\b/); |
| } |
| |
| next if ($mailsa->{conf}->{testrules}->{$name}); |
| |
| if ($tfs && $tfs =~ /\b(net)\b/) { |
| $notes = "tflags ".$1; |
| goto publish; |
| } |
| |
| next; |
| publish: |
| |
| $outputs{$name} = $notes unless defined $outputs{$name}; |
| } |
| |
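| # second pass: rules seen in the nightly mass-checks. rules are published |
| # if forced by certain tflags (publish, userconf, learn, net), or if they |
| # come from "rulesrc", pass lint, and have a positive median promo value |
| # over @DAYS_REQUIRED |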
| foreach my $name (keys %ruleslist) { |
| next if $name =~ /^__/; |
| |
| my $plistobj = $plist->[1]->{$name}; |
| my $notes = ''; |
| |
| # rules in sandboxes without a T_ prefix will be renamed during the |
| # ruleqa process... in other words, the output freqs line will talk |
| # about rule "T_FOO". if there's a rule "FOO" defined, assume that's |
| # the one being talked about. |
| my $no_t = $name; |
| if ($no_t =~ s/^T_//) { |
| if (defined $mailsa->{conf}->{scores}->{$no_t}) { |
| $name = $no_t; |
| } |
| } |
| |
| # now that it's ok to have sandbox rules without a T_ prefix, |
| # "T_" prefix implies "tflags nopublish" |
| next if ($name =~ /^T_/); |
| |
| # ignore rules that don't exist (if they have a desc or score, |
| # they exist according to the Conf parser) |
| next unless ($mailsa->{conf}->{descriptions}->{$name} |
| || $mailsa->{conf}->{scores}->{$name}); |
| |
| # "nopublish" tflags |
| my $tfs = $mailsa->{conf}->{tflags}->{$name}; |
| if ($tfs) { |
| next if ($tfs =~ /\bnopublish\b/); |
| |
| if ($tfs =~ /\b(publish)\b/) { |
| $notes = "tflags ".$1; |
| goto publish; |
| } |
| } |
| |
| # rule was from a file marked with "#testrules" (bug 5545) |
| # note: this is after "tflags publish" support, so you can override |
| # it on a rule-by-rule basis anyway |
| next if ($mailsa->{conf}->{testrules}->{$name}); |
| |
| # bug 6560: unless specifically declared #testrules, all of these tflags |
| # force publication. "net" is included, since otherwise this script |
| # would have to know what day of the week the weekly net/non-net |
| # mass-checks happen on! very messy. TODO? |
| if ($tfs && $tfs =~ /\b(userconf|learn|net)\b/) { |
| $notes = "tflags ".$1; |
| goto publish; |
| } |
| |
| # only rules from "rulesrc" dirs |
| my $src = $mailsa->{conf}->{source_file}->{$name}; |
| next if (!$src || $src !~ /rulesrc/); |
| |
| # rules that fail lint |
| next if $rules_with_errors{$name}; |
| |
| # base active on DAYS_REQUIRED days of checks |
| # Find median for promo |
| my @promo_arr; |
| foreach my $day (@DAYS_REQUIRED) { |
| if (defined $plist->[$day]->{$name}) { |
| push (@promo_arr, $plist->[$day]->{$name}{promo}); |
| } |
| if (defined $plist->[$day]->{"T_$name"}) { |
| push (@promo_arr, $plist->[$day]->{"T_$name"}{promo}); |
| } |
| } |
| |
| my $is_promo = median_array(@promo_arr); |
| |
| next unless $is_promo > 0; |
| |
| # skip rules that require a plugin we won't have in the published update |
| my $skip = 0; |
| my $ifs = $mailsa->{conf}->{if_stack}->{$name}; |
| while ($ifs && $ifs =~ /plugin\s*\((.+?)\)/g) { |
| my $pkg = $1; |
| # TODO: grep out the plugins we *do* ship and do use in "ifplugin" |
| # lines in "rulesrc"; for now this check is disabled by the |
| # unconditional "next" below |
| next; #JMD: |
| next if ($pkg =~ /${PROMOTABLE_PLUGINS_RE}/o); |
| print "\n# not publishing $name: needs $ifs\n"; |
| $skip++; |
| } |
| next if $skip; |
| |
| # don't output the ever-changing bits of data |
| # $notes = "spam=$plistobj->{spc} ham=$plistobj->{hpc} so=$plistobj->{so}"; |
| $notes = "good enough"; |
| |
| publish: |
| |
| $outputs{$name} = $notes unless defined $outputs{$name}; |
| } |
| |
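| # emit the final list: for each published rule, a comment line giving the |
| # reason it was published, followed by the rule name itself, e.g. (with a |
| # hypothetical rule name): |
| # |
| #   # tflags net |
| #   SOME_NET_RULE |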
| foreach my $key (sort(keys %outputs)) { |
| print "\n# ", $outputs{$key}, "\n", $key, "\n"; |
| } |
| exit; |
| |
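| # die, including the most recently fetched ruleqa URL, so that parse |
| # failures are easier to debug |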
| sub loghtml_die { |
| die "$_[0]\nURL: $url\n"; |
| } |