blob: 92f952a293313401eea73019172179094db1a4a2 [file] [log] [blame]
#!/usr/bin/perl
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# any tests that get less than this % of matches on *both* spam or nonspam, are
# reported.
my $LOW_MATCHES_PERCENT = 0.03;
my $scoreset = 0;
sub usage {
die "
lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests
This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
from a mass-check logfile pair.
The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
-x -p' output).
The 'falsefreqs' argument is frequencies of hits in false-positives and
false-negatives only ('hit-frequencies -x -p -f' output).
";
}
my $opt_falsefreqs;
while ($#ARGV >= 0) {
$_ = shift @ARGV;
if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
else { usage(); }
}
print "BAD TESTS REPORT\n";
readrules();
print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";
print "\nRule file syntax issues:\n\n";
lintrules();
if ($opt_falsefreqs) {
open (FALSE, "<$opt_falsefreqs");
while (<FALSE>) {
if (!/^\s*([\d\.]+)/) {
my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
next unless ($name =~ /\S/);
$falsefreqs_spam{$name} = $spam;
$falsefreqs_nons{$name} = $nons;
$falsefreqs_so{$name} = $so;
}
}
close FALSE;
}
while (<>) {
if (!/^\s*([\d\.]+)/) {
$output{'a_header'} = $_; next;
}
my $badrule;
my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
next unless ($name =~ /\S/);
my $ffspam = $falsefreqs_spam{$name};
my $ffnons = $falsefreqs_nons{$name};
my $ffso = $falsefreqs_so{$name};
my $tf = $tflags{$name};
next if ($tf =~ /net/ && ($scoreset % 2) == 0);
next if ($tf =~ /userconf/);
if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) { # sanity!
$badrule = 'no matches';
} else {
if ($score < 0.0) {
# negative score with more spams than nonspams? bad rule.
if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
$badrule = 'non-nice but -ve score';
}
if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
if ($ffso < 0.5) {
$badrule = 'fn';
} else {
# ignore, the FNs are overridden by other tests so it doesn't
# affect the overall results.
}
}
# low number of matches overall
if ($nons < $LOW_MATCHES_PERCENT)
{ $badrule ||= ''; $badrule .= ', low matches'; }
} elsif ($score > 0.0) {
# positive score with more nonspams than spams? bad.
if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
$badrule = 'nice but +ve score';
}
if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
if ($ffso > 0.5) {
$badrule = 'fp';
} else {
# ignore, the FPs are overridden by other tests so it doesn't
# affect the overall results.
}
}
# low number of matches overall
if ($spam < $LOW_MATCHES_PERCENT)
{ $badrule ||= ''; $badrule .= ', low matches'; }
} elsif ($score == 0.0) {
$badrule = 'score is 0';
}
}
if (defined $badrule) {
$badrule =~ s/^, //; chomp;
$output{$badrule} .= $_ . " ($badrule)\n";
}
}
# do all but 'no/low matches' first
print "\nHigh-priority issues:\n\n";
foreach my $badness (sort keys %output) {
next if ($badness eq 'no matches');
next if ($badness eq 'low matches');
print $output{$badness};
delete $output{$badness};
}
# now go back and do the other 2 (if they're there)
print "\nLow-priority issues:\n\n";
foreach my $badness (sort keys %output) {
next unless defined ($output{$badness});
print $output{$badness};
delete $output{$badness};
}
exit;
sub concat_rule_lang {
my $rule = shift;
my $lang = shift;
if (defined $lang && $lang ne '') {
return "[$lang]_$rule";
} else {
return $rule;
}
}
# note: do not use parse-rules-for-masses here, we need to do linting instead
# of your average parse
sub readrules {
my @files = <../rules/[0-9]*.cf>;
my $file;
%rulesfound = ();
%langs = ();
foreach $file (@files) {
open (IN, "<$file");
while (<IN>) {
s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
# make all the foo-bar stuff foo_bar
1 while s/^(\S+)-/\1_/g;
1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;
my $lang = '';
if (s/^lang\s+(\S+)\s+//) {
$lang = $1; $langs{$1} = 1;
}
if (/^(header|rawbody|body|full|uri|meta|mimeheader)\s+(\S+)\s+/) {
$rulesfound{$2} = 1;
$rulefile{$2} ||= $file;
$scorefile{$1} = $file;
$score{$2} ||= 1.0;
$tflags{$2} ||= '';
$descfile{$2} ||= $file; # a rule with no score or desc is OK
$description{$2}->{$lang} = undef;
if (/^body\s+\S+\s+eval:/) {
# ignored
} elsif (/^body\s+\S+\s+(.*)$/) {
my $re = $1;
# If there's a ( in a rule where it should be (?:, flag it.
# but ignore [abc(] ...
if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) {
print "warning: non-(?:...) capture in regexp in $file: $_\n";
}
if ($re =~ /\.[\*\+]/) {
print "warning: .* in regexp in $file: $_\n";
}
if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
if ($1 > 120 || $2 > 120) {
print "warning: long .{n} in regexp in $file: $_\n";
}
}
}
} elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
$rulesfound{$1} = 1;
$descfile{concat_rule_lang ($1, $lang)} ||= $file;
$descfile{$1} ||= $file;
$description{$1}->{$lang} = $2;
} elsif (/^tflags\s+(\S+)\s+(.+)$/) {
$rulesfound{$1} = 1;
$tflags{$1} = $2;
$tflagsfile{concat_rule_lang ($1, $lang)} = $file;
$tflagsfile{$1} = $file;
} elsif (/^score\s+(\S+)\s+(.+)$/) {
$rulesfound{$1} = 1;
$scorefile{concat_rule_lang ($1, $lang)} = $file;
$scorefile{$1} = $file;
$score{$1} = $2;
} elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
clear_terse_report_template|terse_report|
required_score|ok_locales|ok_languages|test|lang|
spamphrase|(?:welcomelist|whitelist)_from|require_version|
clear_unsafe_report_template|unsafe_report|
(?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
(?:bayes_)?auto_learn
)/x) {
next;
} else {
print "warning: unknown rule in $file: $_\n";
}
}
close IN;
}
@langsfound = sort keys %langs;
@rulesfound = sort keys %rulesfound;
}
sub lintrules {
my %possible_renames = ();
foreach my $rule (@rulesfound) {
my $match = $rule;
$match =~ s/_\d+[^_]+$//gs; # trim e.g. "_20K"
$match =~ s/[^A-Z]+//gs; # trim numbers etc.
if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
$possible_renames{$match} .= " ".$rule;
}
$possible_rename_matches{$rule} = $match;
}
foreach my $lang ('', @langsfound) {
foreach my $baserule (@rulesfound) {
next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );
my $rule = concat_rule_lang ($baserule, $lang);
my $f = $descfile{$rule};
my $warned = '';
if (defined $f && !defined ($rulefile{$rule})
&& !defined ($rulefile{$baserule}))
{
print "warning: $baserule has description, but no rule: $f\n";
$warned .= ' lamedesc';
}
# Check our convention for rule length
if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
print "warning: $baserule has a name longer than 22 chars: $f\n";
}
# Check our convention for rule length
if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
print "warning: $baserule has a description longer than 50 chars: $f\n";
}
# lang rule trumps normal rule
$f = $rulefile{$rule} || $rulefile{$baserule};
# if the rule exists, and the language/rule description doesn't exist ...
if ( defined $f && !defined $description{$baserule}->{$lang} )
{
print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
$warned .= ' lamedesc';
}
$f = $scorefile{$rule};
if (defined $f && !defined ($rulefile{$rule})
&& !defined ($rulefile{$baserule}))
{
print "warning: $baserule has score, but no rule: $f\n";
$warned .= ' lamescore';
}
my $r = $possible_rename_matches{$rule};
if ($warned ne '' && defined $r) {
my @matches = split (' ', $possible_renames{$r});
if (scalar @matches != 0) {
my $text = '';
# now try and figure out "nearby" rules with no description/score
foreach my $baser (@matches) {
my $blang;
if ($descfile{$rule} =~ /text_(\S\S)\./) {
$blang = $1;
}
my $r = concat_rule_lang ($baser, $blang);
#warn "$r $descfile{$r} $descfile{$baser}";
next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
$text .= " $baser";
}
if ($text ne '') {
print "warning: (possible renamed rule? $text)\n";
}
}
}
}
}
}