| #!/usr/bin/perl |
| |
| use strict; |
| use warnings; |
| |
| use File::Temp (); |
| use LWP::Simple; |
| use URI::Escape; |
| use Data::Dumper; |
| |
| my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; } |
| my $MAKE_CACHE; $MAKE_CACHE = 1; # turn this on by default, no harm |
| |
| # we allow promotion of rules that are "ifplugin" one of these |
| my @def_plugins = map { |
| s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs; |
| $_; |
| } <lib/Mail/SpamAssassin/Plugin/*.pm>; |
| |
| my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$"; |
| |
| # number of days to look back; if a rule isn't listed as promotable on |
| # all of these days, it won't be listed. (we grant an exception for |
| # new rules that didn't exist on previous days, however, so new rules |
| # can be published quickly to handle sudden outbreaks without requiring |
| # manual update work) |
| my @DAYS_REQUIRED = (1, 2, 3); |
| |
| # S/O threshold required |
| my $SO_THRESHOLD = .8; |
| |
| ########################################################################### |
| |
| print q{ |
| Bad performing rules, from the past 3 night's mass-checks. |
| |
| (Note: 'net' rules will be listed as 'no hits' unless you set 'tflags net'. |
| This also applies for meta rules which use 'net' rules.) |
| |
| }; |
| |
| my $cgi_url = "https://ruleqa.spamassassin.org/"; |
| my @doc = (); |
| my $cache = 'ruleqa.cache.'; |
| my $submitters = ''; |
| |
| my $url; # tracks the last day used |
| my $dayoffset = 0; |
| foreach my $day (@DAYS_REQUIRED) { |
| if (!$FROM_CACHE || !-f $cache.$day) { |
| |
| with_new_offset: |
| $url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1"; |
| warn "HTTP get: $url\n"; |
| |
| $doc[$day] = get ($url); |
| if (!$doc[$day]) { |
| die "HTTP get failed: $doc[$day]\n"; |
| } |
| |
| if ($MAKE_CACHE) { |
| if (open(O, ">$cache$day")) { |
| print O $doc[$day]; close O; |
| } |
| } |
| } |
| else { |
| open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I; |
| } |
| |
| ########################################################################### |
| |
| # the HTML looks like: |
| # |
| # <span class="daterev_masscheck_description" class="mcviewing"> |
| # ... |
| # <em><span class="mcsubmitters"> ....... </span></em> |
| # ... |
| # </span> |
| # |
| # in other words, the machine-parseable metadata is embedded in the HTML |
| # as a microformat. |
| |
| if ($doc[$day] =~ m{ |
| <span\s+class="daterev_masscheck_description\smcviewing" |
| .{0,400} |
| <span\s+class="mcsubmitters">\s*(.*?)\s*</span> |
| }sx) |
| { |
| my $daysubs = $1; |
| |
| # ignore days when the mass-check sets contain a --net log, since |
| # it's the weekly --net run. That generally contains a much |
| # smaller set of logs (since it takes longer to run mass-check --net) |
| # so the results are untrustworthy. |
| if ($daysubs =~ /(?:^|\s)net-/) { |
| warn "day $day contains a --net mass-check! offsetting by an extra day\n"; |
| $dayoffset++; goto with_new_offset; |
| } |
| |
| ($submitters ne '') and $submitters .= "; "; |
| $submitters .= "day $day: $daysubs"; |
| } |
| else { |
| loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day"); |
| } |
| } |
| |
| ########################################################################### |
| |
| # <rule><test>__HIGHBITS</test><promo>0</promo> |
| # <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so> |
| # <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule> |
| |
| my $plist; |
| foreach my $day (@DAYS_REQUIRED) { |
| while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) { |
| my $xml = $1; |
| my $obj = { }; |
| |
| while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) { |
| $obj->{$1} = $2; |
| } |
| while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) { |
| $obj->{$1} = uri_unescape($2); |
| } |
| |
| my $name = $obj->{test}; |
| $obj->{detailhref} = $cgi_url.$obj->{detailhref}; |
| |
| $plist->[$day]->{$name} = $obj; |
| } |
| |
| if (!scalar keys %{$plist->[$day]}) { |
| loghtml_die("no rules found? on day $day"); |
| } |
| } |
| |
| ########################################################################### |
| |
| ## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump; |
| |
| # use SpamAssassin classes directly, so we can lint rules |
| # as we go |
| use lib 'lib'; |
| use Mail::SpamAssassin; |
| |
| my $mailsa = Mail::SpamAssassin->new({ |
| rules_filename => "rules", |
| site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )), |
| local_tests_only => 1, |
| dont_copy_prefs => 1, |
| config_tree_recurse => 1, |
| keep_config_parsing_metadata => 1, |
| # debug => 1, |
| }); |
| |
| # hack hack hack!! we don't want to load plugin files twice, |
| # and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm |
| # to rules/*.pm, they would otherwise appear twice. |
| foreach my $fname (<rules/*.pm>) { |
| my $path = File::Spec->rel2abs($fname); |
| $INC{$path} = 1; |
| # warn "JMD $path"; |
| } |
| |
| my %rules_with_errors = (); |
| my %killed_rules = (); |
| my %count_rules = (); |
| my %report_bad_subrules = (); |
| |
| $mailsa->{lint_callback} = sub { |
| my %opts = @_; |
| |
| # ignore non-rule-issue lint failures |
| return if ($opts{msg} =~ /(?: |
| score\sset\sfor\snon-existent| |
| description\sexists |
| )/x); |
| |
| warn "demoting $opts{rule}: $opts{msg}"; |
| if ($opts{iserror}) { |
| $rules_with_errors{$opts{rule}}++; |
| } |
| }; |
| |
| $mailsa->lint_rules(); |
| |
| # print "# active ruleset list, automatically generated from $cgi_url\n"; |
| # print "# with results from: $submitters\n"; |
| |
| my @spcs = ($submitters =~ /\s+/g); |
| if (scalar @spcs < 2) { |
| die "not generating results; less than 3 submitter results available!\n"; |
| } |
| |
| # base most of our decisions off day 1 (last night's mass-checks). |
| # note: meta rules must come before their __SUBRULES in this sort; |
| # default lexical sort will do this. |
| foreach my $plistkey (sort keys %{$plist->[1]}) { |
| my $name = $plistkey; |
| my $plistobj = $plist->[1]->{$plistkey}; |
| my $notes = ''; |
| |
| # rules in sandboxes without a T_ prefix, will be renamed during the |
| # ruleqa process... in other words, the output freqs line will talk |
| # about rule "T_FOO". if there's a rule "FOO" defined, assume that's |
| # the one being talked about. |
| my $no_t = $name; |
| if ($no_t =~ s/^T_//) { |
| if (defined $mailsa->{conf}->{scores}->{$no_t}) { |
| $name = $no_t; |
| } |
| } |
| |
| # ignore rules that don't exist (if they have a desc or score, |
| # they exist according to the Conf parser) |
| next unless ($mailsa->{conf}->{descriptions}->{$name} |
| || $mailsa->{conf}->{scores}->{$name}); |
| |
| my $tfs = $mailsa->{conf}->{tflags}->{$name} || ''; |
| |
| my $src = $mailsa->{conf}->{source_file}->{$name}; |
| if ( defined $src ) { |
| $count_rules{$src}++; |
| } else { |
| $count_rules{'not_present'}++; |
| } |
| |
| # skip rules of these tflags, we cannot judge them without more data |
| if ($tfs =~ /\b(?:userconf|learn|net)\b/) { |
| next; |
| } |
| |
| # rules that fail lint |
| next if $rules_with_errors{$name}; |
| |
| # subrules with ok parent rules |
| if ($name =~ /^__/ && !$report_bad_subrules{$name}) { |
| # print " # ignoring subrule $name: parent rules seem fine\n"; |
| next; |
| } |
| |
| # certain tests need to be reversed for "nice" rules |
| my $is_nice = 0; |
| if ($tfs =~ /\bnice\b/) { $is_nice = 1; } |
| |
| my $valid = 1; # number of nights the rule appears in |
| my $so = $plist->[1]->{$plistkey}->{so}; |
| if (defined $plist->[2]->{$plistkey}->{so}) { |
| $so += $plist->[2]->{$plistkey}->{so}; $valid++; |
| } |
| if (defined $plist->[3]->{$plistkey}->{so}) { |
| $so += $plist->[3]->{$plistkey}->{so}; $valid++; |
| } |
| $so /= $valid; # average across all 3 |
| my $adj_so; |
| if ($is_nice) { |
| $adj_so = 1.0 - $so; # 0.0 => 1.0 |
| } else { |
| $adj_so = $so; |
| } |
| next unless ($adj_so < $SO_THRESHOLD); |
| |
| my $target = ($is_nice ? 'hpc' : 'spc'); |
| my $spc = $plist->[1]->{$plistkey}->{$target}; |
| $spc += $plist->[2]->{$plistkey}->{$target} || 0; |
| $spc += $plist->[3]->{$plistkey}->{$target} || 0; |
| $spc /= $valid; |
| |
| $target = ($is_nice ? 'spc' : 'hpc'); |
| my $hpc = $plist->[1]->{$plistkey}->{$target}; |
| $hpc += $plist->[2]->{$plistkey}->{$target} || 0; |
| $hpc += $plist->[3]->{$plistkey}->{$target} || 0; |
| $hpc /= $valid; |
| |
| if ($spc <= 0.0001) { |
| if ($hpc <= 0.0001) { |
| badrule($name, "no hits at all"); |
| } else { |
| badrule($name, "no hits of target type"); |
| } |
| next; |
| } |
| |
| badrule($name, "bad, avg S/O=".sprintf("%.2f",$so)." ". |
| "avg Spam%=".sprintf("%.2f",$spc)." ". |
| "avg Ham%=".sprintf("%.2f",$hpc) |
| ); |
| } |
| |
| foreach my $srcfile (reverse sort keys %killed_rules) { |
| my $set = $killed_rules{$srcfile}; |
| my $count = $count_rules{$srcfile}; |
| my $c_bad = scalar keys %{$set}; |
| |
| print "\n$srcfile ($count rules, $c_bad bad):\n\n"; |
| |
| foreach my $name (sort keys %{$set}) { |
| my $reason = $set->{$name}; |
| print " $name: $reason\n"; |
| } |
| } |
| |
| exit; |
| |
| sub badrule { |
| my ($name, $reason) = @_; |
| my $src = $mailsa->{conf}->{source_file}->{$name}; |
| if ( defined $src ) { |
| $killed_rules{$src}->{$name} = $reason; |
| } else { |
| $killed_rules{'not_present'}->{$name} = $reason; |
| } |
| |
| # if it's a subrule in a meta rule, note this |
| # TODO: this only works reliably for lexically-previous meta rules; |
| # that's ok for __SUBRULES used in META_RULES, since "M" < "_". |
| if ($report_bad_subrules{$name}) { |
| $killed_rules{$src}->{$name} .= "\n # used in:$report_bad_subrules{$name}"; |
| } |
| |
| # if it's a meta rule, note that we can complain about its subrules too |
| foreach my $r (split ' ', $mailsa->{conf}->{meta_dependencies}->{$name} || '') { |
| $report_bad_subrules{$r} .= " ".$name; |
| } |
| } |
| |
| sub loghtml_die { |
| die "$_[0]\nURL: $url\n"; |
| } |
| |