|  | #!/usr/bin/perl | 
|  |  | 
|  | use strict; | 
|  | use warnings; | 
|  |  | 
use Data::Dumper;
use File::Spec;
use File::Temp ();
use LWP::Simple;
use URI::Escape;
|  |  | 
# Command-line handling: "--fromcache" reuses previously downloaded
# ruleqa pages instead of re-fetching them over HTTP.
my $FROM_CACHE = (@ARGV && $ARGV[0] eq '--fromcache') ? 1 : undef;

# Always write fetched pages to the local cache; harmless, and it speeds
# up later --fromcache runs.
my $MAKE_CACHE = 1;
|  |  | 
# we allow promotion of rules that are "ifplugin" one of these plugins
# shipped in lib/Mail/SpamAssassin/Plugin/.  Map each .pm path to its
# package name, e.g. "lib/.../DKIM.pm" -> "Mail::SpamAssassin::Plugin::DKIM".
my @def_plugins = map {
    s{^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$}{Mail::SpamAssassin::Plugin::$1}gs;
    $_;
} <lib/Mail/SpamAssassin/Plugin/*.pm>;

# Anchored alternation of the promotable plugin names.
# bugfix: the alternatives must be wrapped in a non-capturing group;
# without it "^" binds only to the first name and "$" only to the last,
# so any other plugin name would also match as a mere substring.
my $PROMOTABLE_PLUGINS_RE = "^(?:" . join("|", @def_plugins) . ")\$";
|  |  | 
# number of days to look back; if a rule isn't listed as promotable on
# all of these days, it won't be listed.  (we grant an exception for
# new rules that didn't exist on previous days, however, so new rules
# can be published quickly to handle sudden outbreaks without requiring
# manual update work)
my @DAYS_REQUIRED = (1 .. 3);

# minimum averaged S/O (spam/overall) ratio a rule must reach before it
# is considered good enough to escape this report
my $SO_THRESHOLD = 0.8;
|  |  | 
|  | ########################################################################### | 
|  |  | 
|  | print q{ | 
|  | Bad performing rules, from the past 3 night's mass-checks. | 
|  |  | 
|  | (Note: 'net' rules will be listed as 'no hits' unless you set 'tflags net'. | 
|  | This also applies for meta rules which use 'net' rules.) | 
|  |  | 
|  | }; | 
|  |  | 
|  | my $cgi_url = "https://ruleqa.spamassassin.org/"; | 
|  | my @doc = (); | 
|  | my $cache = 'ruleqa.cache.'; | 
|  | my $submitters = ''; | 
|  |  | 
|  | my $url;        # tracks the last day used | 
|  | my $dayoffset = 0; | 
# Fetch (or load from cache) the ruleqa page for each day we need;
# $doc[$day] ends up holding the raw page text.
foreach my $day (@DAYS_REQUIRED) {
    if (!$FROM_CACHE || !-f $cache.$day) {

        # Retry target: if this day's mass-check turns out to be a --net
        # run (detected further below), we bump $dayoffset and jump back
        # here to fetch an earlier day's results instead.
with_new_offset:
        $url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
        warn "HTTP get: $url\n";

        $doc[$day] = get($url);
        if (!$doc[$day]) {
            # bugfix: LWP::Simple::get() returns undef on failure, so the
            # original message interpolated an empty value; report the URL.
            die "HTTP get failed: $url\n";
        }

        if ($MAKE_CACHE) {
            # best-effort cache write: failure to cache is not fatal
            if (open(my $out, '>', $cache.$day)) {
                print {$out} $doc[$day];
                close $out;
            }
        }
    }
    else {
        open(my $in, '<', $cache.$day)
            or die "cannot read cache file $cache$day: $!\n";
        $doc[$day] = join('', <$in>);
        close $in;
    }

    ###########################################################################

    # the HTML looks like:
    #
    #   <span class="daterev_masscheck_description mcviewing">
    #   ...
    #   <em><span class="mcsubmitters"> ....... </span></em>
    #   ...
    #   </span>
    #
    # in other words, the machine-parseable metadata is embedded in the HTML
    # as a microformat.

    if ($doc[$day] =~ m{
          <span\s+class="daterev_masscheck_description\smcviewing"
          .{0,400}
          <span\s+class="mcsubmitters">\s*(.*?)\s*</span>
        }sx)
    {
        my $daysubs = $1;

        # ignore days when the mass-check sets contain a --net log, since
        # it's the weekly --net run.  That generally contains a much
        # smaller set of logs (since it takes longer to run mass-check --net)
        # so the results are untrustworthy.
        if ($daysubs =~ /(?:^|\s)net-/) {
            warn "day $day contains a --net mass-check! offsetting by an extra day\n";
            $dayoffset++; goto with_new_offset;
        }

        ($submitters ne '') and $submitters .= "; ";
        $submitters .= "day $day: $daysubs";
    }
    else {
        loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
    }
}
|  |  | 
|  | ########################################################################### | 
|  |  | 
|  | # <rule><test>__HIGHBITS</test><promo>0</promo> | 
|  | # <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so> | 
|  | # <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule> | 
|  |  | 
|  | my $plist; | 
|  | foreach my $day (@DAYS_REQUIRED) { | 
|  | while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) { | 
|  | my $xml = $1; | 
|  | my $obj = { }; | 
|  |  | 
|  | while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) { | 
|  | $obj->{$1} = $2; | 
|  | } | 
|  | while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) { | 
|  | $obj->{$1} = uri_unescape($2); | 
|  | } | 
|  |  | 
|  | my $name = $obj->{test}; | 
|  | $obj->{detailhref} = $cgi_url.$obj->{detailhref}; | 
|  |  | 
|  | $plist->[$day]->{$name} = $obj; | 
|  | } | 
|  |  | 
|  | if (!scalar keys %{$plist->[$day]}) { | 
|  | loghtml_die("no rules found? on day $day"); | 
|  | } | 
|  | } | 
|  |  | 
|  | ########################################################################### | 
|  |  | 
|  | ## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump; | 
|  |  | 
|  | # use SpamAssassin classes directly, so we can lint rules | 
|  | # as we go | 
|  | use lib 'lib'; | 
|  | use Mail::SpamAssassin; | 
|  |  | 
|  | my $mailsa = Mail::SpamAssassin->new({ | 
|  | rules_filename => "rules", | 
|  | site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )), | 
|  | local_tests_only => 1, | 
|  | dont_copy_prefs => 1, | 
|  | config_tree_recurse => 1, | 
|  | keep_config_parsing_metadata => 1, | 
|  | # debug => 1, | 
|  | }); | 
|  |  | 
|  | # hack hack hack!!  we don't want to load plugin files twice, | 
|  | # and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm | 
|  | # to rules/*.pm, they would otherwise appear twice. | 
|  | foreach my $fname (<rules/*.pm>) { | 
|  | my $path = File::Spec->rel2abs($fname); | 
|  | $INC{$path} = 1; | 
|  | # warn "JMD $path"; | 
|  | } | 
|  |  | 
# Book-keeping for the lint pass and the final report.
my %rules_with_errors = ();     # rule name -> lint error count
my %killed_rules = ();          # source file -> { rule name -> reason }
my %count_rules = ();           # source file -> total number of rules seen
my %report_bad_subrules = ();   # subrule -> " META1 META2 ..." that use it

# Hook the lint pass so rule problems are recorded rather than just printed.
$mailsa->{lint_callback} = sub {
    my %info = @_;

    # ignore non-rule-issue lint failures
    return if ($info{msg} =~ /(?:
          score\sset\sfor\snon-existent|
          description\sexists
        )/x);

    warn "demoting $info{rule}: $info{msg}";
    $rules_with_errors{$info{rule}}++ if $info{iserror};
};

$mailsa->lint_rules();

# print "# active ruleset list, automatically generated from $cgi_url\n";
# print "# with results from: $submitters\n";

# Sanity check: bail out when too little submitter data was collected.
# NOTE(review): the condition counts whitespace runs in $submitters and
# requires at least 2, while the message says "less than 3 submitter
# results"; confirm the intended threshold.
my @whitespace_runs = ($submitters =~ /\s+/g);
if (scalar @whitespace_runs < 2) {
    die "not generating results; less than 3 submitter results available!\n";
}
|  |  | 
# base most of our decisions off day 1 (last night's mass-checks).
# note: meta rules must come before their __SUBRULES in this sort;
# default lexical sort will do this.
foreach my $plistkey (sort keys %{$plist->[1]}) {
my $name = $plistkey;
my $plistobj = $plist->[1]->{$plistkey};   # NOTE(review): unused below
my $notes = '';                            # NOTE(review): unused below

# rules in sandboxes without a T_ prefix, will be renamed during the
# ruleqa process... in other words, the output freqs line will talk
# about rule "T_FOO".   if there's a rule "FOO" defined, assume that's
# the one being talked about.
my $no_t = $name;
if ($no_t =~ s/^T_//) {
if (defined $mailsa->{conf}->{scores}->{$no_t}) {
$name = $no_t;
}
}

# ignore rules that don't exist (if they have a desc or score,
# they exist according to the Conf parser)
next unless ($mailsa->{conf}->{descriptions}->{$name}
|| $mailsa->{conf}->{scores}->{$name});

# the rule's tflags string, or '' when none are set
my $tfs = $mailsa->{conf}->{tflags}->{$name} || '';

# tally total rules per source file, for the report headings
my $src = $mailsa->{conf}->{source_file}->{$name};
if ( defined $src ) {
$count_rules{$src}++;
} else {
$count_rules{'not_present'}++;
}

# skip rules of these tflags, we cannot judge them without more data
if ($tfs =~ /\b(?:userconf|learn|net)\b/) {
next;
}

# rules that fail lint
next if $rules_with_errors{$name};

# subrules with ok parent rules
if ($name =~ /^__/ && !$report_bad_subrules{$name}) {
# print "  # ignoring subrule $name: parent rules seem fine\n";
next;
}

# certain tests need to be reversed for "nice" rules
my $is_nice = 0;
if ($tfs =~ /\bnice\b/) { $is_nice = 1; }

# average the S/O ratio over the nights the rule actually appeared in.
# NOTE(review): days 2 and 3 are hard-coded here and below; keep in
# sync with @DAYS_REQUIRED.
my $valid = 1;    # number of nights the rule appears in
my $so = $plist->[1]->{$plistkey}->{so};
if (defined $plist->[2]->{$plistkey}->{so}) {
$so += $plist->[2]->{$plistkey}->{so}; $valid++;
}
if (defined $plist->[3]->{$plistkey}->{so}) {
$so += $plist->[3]->{$plistkey}->{so}; $valid++;
}
$so /= $valid;      # average across the nights it appeared in
# for "nice" rules a low S/O is good, so invert before thresholding
my $adj_so;
if ($is_nice) {
$adj_so = 1.0 - $so;            # 0.0 => 1.0
} else {
$adj_so = $so;
}
next unless ($adj_so < $SO_THRESHOLD);

# averaged hit-rate on the rule's target corpus (spam%, or ham% for
# "nice" rules); nights where the rule is absent contribute 0
my $target = ($is_nice ? 'hpc' : 'spc');
my $spc = $plist->[1]->{$plistkey}->{$target};
$spc += $plist->[2]->{$plistkey}->{$target} || 0;
$spc += $plist->[3]->{$plistkey}->{$target} || 0;
$spc /= $valid;

# and the averaged hit-rate on the opposite corpus
$target = ($is_nice ? 'spc' : 'hpc');
my $hpc = $plist->[1]->{$plistkey}->{$target};
$hpc += $plist->[2]->{$plistkey}->{$target} || 0;
$hpc += $plist->[3]->{$plistkey}->{$target} || 0;
$hpc /= $valid;

# rules that hit (almost) nothing get a specific explanation
if ($spc <= 0.0001) {
if ($hpc <= 0.0001) {
badrule($name, "no hits at all");
} else {
badrule($name, "no hits of target type");
}
next;
}

# otherwise the rule hit something, but its averaged S/O was too low
badrule($name, "bad, avg S/O=".sprintf("%.2f",$so)." ".
"avg Spam%=".sprintf("%.2f",$spc)." ".
"avg Ham%=".sprintf("%.2f",$hpc)
);
}
|  |  | 
# Emit the report: one section per source file (reverse-lexical order,
# matching the original output), listing each bad rule and its reason.
foreach my $srcfile (reverse sort keys %killed_rules) {
    my $bad_rules = $killed_rules{$srcfile};
    my $total     = $count_rules{$srcfile};
    my $bad_count = scalar keys %{$bad_rules};

    print "\n$srcfile ($total rules, $bad_count bad):\n\n";

    for my $rulename (sort keys %{$bad_rules}) {
        print "  $rulename:  $bad_rules->{$rulename}\n";
    }
}

exit;
|  |  | 
# Record rule $name as "bad" for $reason, filed under the rule's source
# file (or 'not_present' when the Conf parser knows no source for it).
# Also propagates meta-rule -> subrule relationships so subrules of a bad
# meta rule can themselves be reported on a later call.
sub badrule {
    my ($name, $reason) = @_;

    my $src = $mailsa->{conf}->{source_file}->{$name};
    # bugfix: normalize the key once.  Previously the "used in:" append
    # below used the raw $src, so rules without a source file were
    # annotated under $killed_rules{undef} instead of 'not_present'.
    $src = 'not_present' unless defined $src;

    $killed_rules{$src}->{$name} = $reason;

    # if it's a subrule in a meta rule, note this
    # TODO: this only works reliably for lexically-previous meta rules;
    # that's ok for __SUBRULES used in META_RULES, since "M" < "_".
    if ($report_bad_subrules{$name}) {
        $killed_rules{$src}->{$name} .= "\n      # used in:$report_bad_subrules{$name}";
    }

    # if it's a meta rule, note that we can complain about its subrules too
    foreach my $r (split ' ', $mailsa->{conf}->{meta_dependencies}->{$name} || '') {
        $report_bad_subrules{$r} .= " ".$name;
    }
}
|  |  | 
# Fatal error helper: die with the given message plus the last URL
# fetched (file-scoped $url), for debugging context.
sub loghtml_die {
    my ($msg) = @_;
    die "$msg\nURL: $url\n";
}
|  |  |