| #!/usr/bin/perl |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| # any tests that get less than this % of matches on *both* spam or nonspam, are |
| # reported. |
| my $LOW_MATCHES_PERCENT = 0.03; |
| my $scoreset = 0; |
| |
| sub usage { |
| die " |
| lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores |
| |
| usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests |
| |
| This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios |
| from a mass-check logfile pair. |
| |
| The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies |
| -x -p' output). |
| |
| The 'falsefreqs' argument is frequencies of hits in false-positives and |
| false-negatives only ('hit-frequencies -x -p -f' output). |
| |
| "; |
| } |
| |
| my $opt_falsefreqs; |
| while ($#ARGV >= 0) { |
| $_ = shift @ARGV; |
| if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; } |
| elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; } |
| else { usage(); } |
| } |
| |
| print "BAD TESTS REPORT\n"; |
| readrules(); |
| print "\n" .((scalar keys %rulefile) + 1). " rules found.\n"; |
| print "\nRule file syntax issues:\n\n"; |
| lintrules(); |
| |
| if ($opt_falsefreqs) { |
| open (FALSE, "<$opt_falsefreqs"); |
| while (<FALSE>) { |
| if (!/^\s*([\d\.]+)/) { |
| my ($overall, $spam, $nons, $so, $score, $name) = split (' '); |
| next unless ($name =~ /\S/); |
| $falsefreqs_spam{$name} = $spam; |
| $falsefreqs_nons{$name} = $nons; |
| $falsefreqs_so{$name} = $so; |
| } |
| } |
| close FALSE; |
| } |
| |
| while (<>) { |
| if (!/^\s*([\d\.]+)/) { |
| $output{'a_header'} = $_; next; |
| } |
| |
| my $badrule; |
| my ($overall, $spam, $nons, $so, $score, $name) = split (' '); |
| next unless ($name =~ /\S/); |
| |
| my $ffspam = $falsefreqs_spam{$name}; |
| my $ffnons = $falsefreqs_nons{$name}; |
| my $ffso = $falsefreqs_so{$name}; |
| |
| my $tf = $tflags{$name}; |
| next if ($tf =~ /net/ && ($scoreset % 2) == 0); |
| next if ($tf =~ /userconf/); |
| |
| if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) { # sanity! |
| $badrule = 'no matches'; |
| |
| } else { |
| if ($score < 0.0) { |
| # negative score with more spams than nonspams? bad rule. |
| if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) { |
| $badrule = 'non-nice but -ve score'; |
| } |
| |
| if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) { |
| if ($ffso < 0.5) { |
| $badrule = 'fn'; |
| } else { |
| # ignore, the FNs are overridden by other tests so it doesn't |
| # affect the overall results. |
| } |
| } |
| |
| # low number of matches overall |
| if ($nons < $LOW_MATCHES_PERCENT) |
| { $badrule ||= ''; $badrule .= ', low matches'; } |
| |
| } elsif ($score > 0.0) { |
| # positive score with more nonspams than spams? bad. |
| if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) { |
| $badrule = 'nice but +ve score'; |
| } |
| |
| if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) { |
| if ($ffso > 0.5) { |
| $badrule = 'fp'; |
| } else { |
| # ignore, the FPs are overridden by other tests so it doesn't |
| # affect the overall results. |
| } |
| } |
| |
| # low number of matches overall |
| if ($spam < $LOW_MATCHES_PERCENT) |
| { $badrule ||= ''; $badrule .= ', low matches'; } |
| |
| } elsif ($score == 0.0) { |
| $badrule = 'score is 0'; |
| } |
| } |
| |
| if (defined $badrule) { |
| $badrule =~ s/^, //; chomp; |
| $output{$badrule} .= $_ . " ($badrule)\n"; |
| } |
| } |
| |
| # do all but 'no/low matches' first |
| print "\nHigh-priority issues:\n\n"; |
| foreach my $badness (sort keys %output) { |
| next if ($badness eq 'no matches'); |
| next if ($badness eq 'low matches'); |
| print $output{$badness}; |
| delete $output{$badness}; |
| } |
| |
| # now go back and do the other 2 (if they're there) |
| print "\nLow-priority issues:\n\n"; |
| foreach my $badness (sort keys %output) { |
| next unless defined ($output{$badness}); |
| print $output{$badness}; |
| delete $output{$badness}; |
| } |
| exit; |
| |
| |
| sub concat_rule_lang { |
| my $rule = shift; |
| my $lang = shift; |
| |
| if (defined $lang && $lang ne '') { |
| return "[$lang]_$rule"; |
| } else { |
| return $rule; |
| } |
| } |
| |
| # note: do not use parse-rules-for-masses here, we need to do linting instead |
| # of your average parse |
| sub readrules { |
| my @files = <../rules/[0-9]*.cf>; |
| my $file; |
| %rulesfound = (); |
| %langs = (); |
| foreach $file (@files) { |
| open (IN, "<$file"); |
| while (<IN>) { |
| s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/; |
| |
| # make all the foo-bar stuff foo_bar |
| 1 while s/^(\S+)-/\1_/g; |
| 1 while s/^(lang\s+\S+\s+\S+)-/\1_/g; |
| |
| my $lang = ''; |
| if (s/^lang\s+(\S+)\s+//) { |
| $lang = $1; $langs{$1} = 1; |
| } |
| |
| if (/^(header|rawbody|body|full|uri|meta|mimeheader)\s+(\S+)\s+/) { |
| $rulesfound{$2} = 1; |
| $rulefile{$2} ||= $file; |
| $scorefile{$1} = $file; |
| $score{$2} ||= 1.0; |
| $tflags{$2} ||= ''; |
| $descfile{$2} ||= $file; # a rule with no score or desc is OK |
| $description{$2}->{$lang} = undef; |
| |
| if (/^body\s+\S+\s+eval:/) { |
| # ignored |
| } elsif (/^body\s+\S+\s+(.*)$/) { |
| my $re = $1; |
| |
| # If there's a ( in a rule where it should be (?:, flag it. |
| # but ignore [abc(] ... |
| if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { |
| print "warning: non-(?:...) capture in regexp in $file: $_\n"; |
| } |
| if ($re =~ /\.[\*\+]/) { |
| print "warning: .* in regexp in $file: $_\n"; |
| } |
| if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) { |
| if ($1 > 120 || $2 > 120) { |
| print "warning: long .{n} in regexp in $file: $_\n"; |
| } |
| } |
| } |
| |
| } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) { |
| $rulesfound{$1} = 1; |
| $descfile{concat_rule_lang ($1, $lang)} ||= $file; |
| $descfile{$1} ||= $file; |
| $description{$1}->{$lang} = $2; |
| } elsif (/^tflags\s+(\S+)\s+(.+)$/) { |
| $rulesfound{$1} = 1; |
| $tflags{$1} = $2; |
| $tflagsfile{concat_rule_lang ($1, $lang)} = $file; |
| $tflagsfile{$1} = $file; |
| } elsif (/^score\s+(\S+)\s+(.+)$/) { |
| $rulesfound{$1} = 1; |
| $scorefile{concat_rule_lang ($1, $lang)} = $file; |
| $scorefile{$1} = $file; |
| $score{$1} = $2; |
| } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap| |
| clear_terse_report_template|terse_report| |
| required_score|ok_locales|ok_languages|test|lang| |
| spamphrase|whitelist_from|require_version| |
| clear_unsafe_report_template|unsafe_report| |
| (?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam| |
| (?:bayes_)?auto_learn |
| )/x) { |
| next; |
| } else { |
| print "warning: unknown rule in $file: $_\n"; |
| } |
| } |
| close IN; |
| } |
| @langsfound = sort keys %langs; |
| @rulesfound = sort keys %rulesfound; |
| } |
| |
| sub lintrules { |
| my %possible_renames = (); |
| |
| foreach my $rule (@rulesfound) { |
| my $match = $rule; |
| $match =~ s/_\d+[^_]+$//gs; # trim e.g. "_20K" |
| $match =~ s/[^A-Z]+//gs; # trim numbers etc. |
| |
| if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) { |
| $possible_renames{$match} .= " ".$rule; |
| } |
| $possible_rename_matches{$rule} = $match; |
| } |
| |
| foreach my $lang ('', @langsfound) { |
| foreach my $baserule (@rulesfound) { |
| next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ ); |
| |
| my $rule = concat_rule_lang ($baserule, $lang); |
| my $f = $descfile{$rule}; |
| my $warned = ''; |
| |
| if (defined $f && !defined ($rulefile{$rule}) |
| && !defined ($rulefile{$baserule})) |
| { |
| print "warning: $baserule has description, but no rule: $f\n"; |
| $warned .= ' lamedesc'; |
| } |
| |
| # Check our convention for rule length |
| if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) { |
| print "warning: $baserule has a name longer than 22 chars: $f\n"; |
| } |
| # Check our convention for rule length |
| if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) { |
| print "warning: $baserule has a description longer than 50 chars: $f\n"; |
| } |
| |
| # lang rule trumps normal rule |
| $f = $rulefile{$rule} || $rulefile{$baserule}; |
| # if the rule exists, and the language/rule description doesn't exist ... |
| if ( defined $f && !defined $description{$baserule}->{$lang} ) |
| { |
| print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n"; |
| $warned .= ' lamedesc'; |
| } |
| |
| |
| $f = $scorefile{$rule}; |
| if (defined $f && !defined ($rulefile{$rule}) |
| && !defined ($rulefile{$baserule})) |
| { |
| print "warning: $baserule has score, but no rule: $f\n"; |
| $warned .= ' lamescore'; |
| } |
| |
| my $r = $possible_rename_matches{$rule}; |
| if ($warned ne '' && defined $r) { |
| my @matches = split (' ', $possible_renames{$r}); |
| if (scalar @matches != 0) { |
| my $text = ''; |
| |
| # now try and figure out "nearby" rules with no description/score |
| foreach my $baser (@matches) { |
| my $blang; |
| if ($descfile{$rule} =~ /text_(\S\S)\./) { |
| $blang = $1; |
| } |
| my $r = concat_rule_lang ($baser, $blang); |
| #warn "$r $descfile{$r} $descfile{$baser}"; |
| next if ($warned =~ /lamedesc/ && (defined $descfile{$r})); |
| next if ($warned =~ /lamescore/ && (defined $scorefile{$r})); |
| $text .= " $baser"; |
| } |
| |
| if ($text ne '') { |
| print "warning: (possible renamed rule? $text)\n"; |
| } |
| } |
| } |
| } |
| } |
| } |
| |