masses/lint-rules-from-freqs - spamassassin - Git at Google

 #!/usr/bin/perl
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 # any tests that get less than this % of matches are reported as
 # "low matches": the figure compared is the nonspam hit figure for
 # negative-scoring tests and the spam hit figure for positive-scoring tests.
 my $LOW_MATCHES_PERCENT = 0.03;
 # score set number, overridden by the -s option; tests whose tflags mention
 # "net" are skipped while this is even.
 my $scoreset = 0;

 # Print the command-line help text and terminate.
 #
 # Takes no arguments. The text is handed to die(), so unless the call is
 # trapped by an eval it is written to STDERR and the program exits with a
 # failure status; the sub never returns normally.
 sub usage {
   die "
 lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores

 usage: ./lint-rules-from-freqs [-f falsefreqs] [-s scoreset] < freqs > badtests

 This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
 from a mass-check logfile pair.

 The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
 -x -p' output).

 The 'falsefreqs' argument is frequencies of hits in false-positives and
 false-negatives only ('hit-frequencies -x -p -f' output).

 The 'scoreset' argument is the score set number (default 0); when it is even,
 tests with the 'net' tflag are skipped.

 ";
 }

 # Main program. Parses the command line, lints the rule files, optionally
 # loads the false-positive/false-negative frequencies, then classifies every
 # data line of the 'freqs' report read from STDIN and prints the bad tests
 # grouped by priority.

 # -f FILE names the falsefreqs report, -s N selects the score set; any other
 # argument prints the usage text and exits.
 my $opt_falsefreqs;
 while (@ARGV) {
   my $arg = shift @ARGV;
   if    ($arg =~ /^-f/) { $opt_falsefreqs = shift @ARGV; }
   elsif ($arg =~ /^-s/) { $scoreset = shift @ARGV; }
   else                  { usage(); }
 }

 print "BAD TESTS REPORT\n";
 readrules();
 # %rulefile has one entry per defined rule, so its key count is the number
 # of rules (the count used to be reported one too high).
 print "\n" . (scalar keys %rulefile) . " rules found.\n";
 print "\nRule file syntax issues:\n\n";
 lintrules();

 if ($opt_falsefreqs) {
   open (my $false_fh, '<', $opt_falsefreqs)
     or die "cannot open falsefreqs file $opt_falsefreqs: $!\n";
   while (my $line = <$false_fh>) {
     # Data lines start with a number, exactly as in the main report below;
     # everything else is a header line and carries no frequencies.
     next unless $line =~ /^\s*[\d\.]+/;
     my ($overall, $spam, $nons, $so, $score, $name) = split (' ', $line);
     next unless (defined $name && $name =~ /\S/);
     $falsefreqs_spam{$name} = $spam;
     $falsefreqs_nons{$name} = $nons;
     $falsefreqs_so{$name} = $so;
   }
   close $false_fh;
 }

 while (my $line = <>) {
   # A line that does not start with a number is kept as the report header;
   # the key 'a_header' sorts ahead of every badness label used below.
   if ($line !~ /^\s*[\d\.]+/) {
     $output{'a_header'} = $line; next;
   }

   my $badrule;
   my ($overall, $spam, $nons, $so, $score, $name) = split (' ', $line);
   next unless (defined $name && $name =~ /\S/);

   # S/O ratio among false positives/negatives; undefined (numerically 0)
   # when no falsefreqs file was given or the test is not listed in it.
   my $ffso = $falsefreqs_so{$name};

   # tflags of the test as collected by readrules(); tests that are absent
   # from the rule files are treated as having no flags.
   my $tf = defined $tflags{$name} ? $tflags{$name} : '';
   next if ($tf =~ /net/ && ($scoreset % 2) == 0);
   next if ($tf =~ /userconf/);

   if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
     $badrule = 'no matches';

   } else {
     if ($score < 0.0) {
       # negative score with more spams than nonspams? bad rule.
       if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
         $badrule = 'non-nice but -ve score';
       }

       if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
         if ($ffso < 0.5) {
           $badrule = 'fn';
         } else {
           # ignore, the FNs are overridden by other tests so it doesn't
           # affect the overall results.
         }
       }

       # low number of matches overall
       if ($nons < $LOW_MATCHES_PERCENT)
                  { $badrule ||= ''; $badrule .= ', low matches'; }

     } elsif ($score > 0.0) {
       # positive score with more nonspams than spams? bad.
       if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
         $badrule = 'nice but +ve score';
       }

       if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
         if ($ffso > 0.5) {
           $badrule = 'fp';
         } else {
           # ignore, the FPs are overridden by other tests so it doesn't
           # affect the overall results.
         }
       }

       # low number of matches overall
       if ($spam < $LOW_MATCHES_PERCENT)
                  { $badrule ||= ''; $badrule .= ', low matches'; }

     } elsif ($score == 0.0) {
       $badrule = 'score is 0';
     }
   }

   if (defined $badrule) {
     # a label that consists only of the appended ", low matches" loses its
     # leading separator here
     $badrule =~ s/^, //; chomp $line;
     $output{$badrule} .= $line . " ($badrule)\n";
   }
 }

 # do all but 'no/low matches' first
 print "\nHigh-priority issues:\n\n";
 foreach my $badness (sort keys %output) {
   next if ($badness eq 'no matches');
   next if ($badness eq 'low matches');
   print $output{$badness};
   delete $output{$badness};
 }

 # now go back and do the other 2 (if they're there)
 print "\nLow-priority issues:\n\n";
 foreach my $badness (sort keys %output) {
   print $output{$badness};
   delete $output{$badness};
 }
 exit;


 # Build the table key for a rule in a given language.
 #
 #   $rule - rule name
 #   $lang - language code; may be undef or ''
 #
 # Returns "[lang]_rule" when a non-empty language is given, otherwise the
 # rule name unchanged.
 sub concat_rule_lang {
   my ($rule, $lang) = @_;

   return $rule unless defined $lang && $lang ne '';
   return "[$lang]_$rule";
 }

 # note: do not use parse-rules-for-masses here, we need to do linting instead
 # of your average parse
 #
 # Read every file matching ../rules/[0-9]*.cf and fill the package-level
 # tables the linter works from:
 #   %rulefile, %descfile, %scorefile, %tflagsfile - file an entry was seen in
 #   %score, %tflags, %description                 - the values themselves
 #   @rulesfound, @langsfound                      - sorted names / languages
 # While parsing, regexp style warnings for body rules and "unknown rule"
 # warnings are printed to STDOUT. A file that cannot be opened is reported
 # on STDERR and skipped. Takes no arguments.
 sub readrules {
   my @files = <../rules/[0-9]*.cf>;
   %rulesfound = ();
   %langs = ();

   foreach my $file (@files) {
     my $in;
     unless (open ($in, '<', $file)) {
       warn "warning: cannot read $file: $!\n";
       next;
     }

     while (<$in>) {
       # drop comments and surrounding whitespace, skip empty lines
       s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;

       # make all the foo-bar stuff foo_bar
       1 while s/^(\S+)-/${1}_/g;
       1 while s/^(lang\s+\S+\s+\S+)-/${1}_/g;

       # a "lang xx" prefix is stripped and remembered for this line only
       my $lang = '';
       if (s/^lang\s+(\S+)\s+//) {
         $lang = $1; $langs{$1} = 1;
       }

       if (/^(header|rawbody|body|full|uri|meta|mimeheader)\s+(\S+)\s+/) {
         my ($type, $name) = ($1, $2);
         $rulesfound{$name} = 1;
         $rulefile{$name} ||= $file;
         # NOTE(review): this is keyed by the rule *type* ("body", "header",
         # ...), not the rule name; kept as-is, confirm whether the rule name
         # was intended.
         $scorefile{$type} = $file;
         $score{$name} ||= 1.0;
         $tflags{$name} ||= '';
         $descfile{$name} ||= $file;       # a rule with no score or desc is OK
         # create the slot, but never discard a description that a 'describe'
         # line has already supplied for this rule
         $description{$name}->{$lang} = undef
           unless defined $description{$name}->{$lang};

         if (/^body\s+\S+\s+eval:/) {
           # ignored
         } elsif (/^body\s+\S+\s+(.*)$/) {
           my $re = $1;

           # If there's a ( in a rule where it should be (?:, flag it.
           # but ignore [abc(] ...
           if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) {
             print "warning: non-(?:...) capture in regexp in $file: $_\n";
           }
           if ($re =~ /\.[\*\+]/) {
             print "warning: .* in regexp in $file: $_\n";
           }
           if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
             if ($1 > 120 || $2 > 120) {
               print "warning: long .{n} in regexp in $file: $_\n";
             }
           }
         }

       } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
         my ($name, $text) = ($1, $2);
         $rulesfound{$name} = 1;
         $descfile{concat_rule_lang ($name, $lang)} ||= $file;
         $descfile{$name} ||= $file;
         $description{$name}->{$lang} = $text;
       } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
         my ($name, $flags) = ($1, $2);
         $rulesfound{$name} = 1;
         $tflags{$name} = $flags;
         $tflagsfile{concat_rule_lang ($name, $lang)} = $file;
         $tflagsfile{$name} = $file;
       } elsif (/^score\s+(\S+)\s+(.+)$/) {
         my ($name, $value) = ($1, $2);
         $rulesfound{$name} = 1;
         $scorefile{concat_rule_lang ($name, $lang)} = $file;
         $scorefile{$name} = $file;
         $score{$name} = $value;
       } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
                 clear_terse_report_template|terse_report|
                 required_score|ok_locales|ok_languages|test|lang|
                 spamphrase|whitelist_from|require_version|
                 clear_unsafe_report_template|unsafe_report|
                 (?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
                 (?:bayes_)?auto_learn
                 )/x) {
         # recognised setting that needs no linting
         next;
       } else {
         print "warning: unknown rule in $file: $_\n";
       }
     }
     close $in;
   }
   @langsfound = sort keys %langs;
   @rulesfound = sort keys %rulesfound;
 }

 # Cross-check the tables built by readrules() and print one "warning: ..."
 # line to STDOUT per inconsistency: a description or score without a rule,
 # a rule name longer than 22 characters, a description longer than 50
 # characters, and a rule without a description. Rules whose name starts
 # with "__" or "T_" are not checked. After a description/score warning,
 # similarly named rules are listed as possible renames.
 # Takes no arguments and reads/writes package-level tables only.
 sub lintrules {
   my %possible_renames = ();

   # Group the defined rules by "stem": the rule name with a trailing
   # "_20K"-style suffix removed and everything but capital letters dropped.
   for my $name (@rulesfound) {
     (my $stem = $name) =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
     $stem =~ s/[^A-Z]+//gs;                    # trim numbers etc.

     $possible_renames{$stem} .= " ".$name
       if defined ($rulefile{$name})
          && $possible_renames{$stem} !~ / \Q$name\E\b/;
     $possible_rename_matches{$name} = $stem;
   }

   for my $lang ('', @langsfound) {
     for my $baserule (@rulesfound) {
       next if $baserule =~ /^(?:__|T_)/;

       my $rule = concat_rule_lang ($baserule, $lang);
       my $warned = '';
       my $f = $descfile{$rule};

       if (defined $f
           && !(defined ($rulefile{$rule}) || defined ($rulefile{$baserule})))
       {
         print "warning: $baserule has description, but no rule: $f\n";
         $warned .= ' lamedesc';
       }

       # For the default language $rule and $baserule are the same key, so a
       # single lookup covers both the language and the non-language case.
       my $defined_here = defined ($rulefile{$rule});

       # Check our convention for rule name length
       if ($defined_here && length $baserule > 22) {
         print "warning: $baserule has a name longer than 22 chars: $f\n";
       }
       # Check our convention for description length
       if ($defined_here) {
         my $desc = $description{$baserule}->{$lang};
         if (defined $desc && length $desc > 50) {
           print "warning: $baserule has a description longer than 50 chars: $f\n";
         }
       }

       # lang rule trumps normal rule
       $f = $rulefile{$rule} || $rulefile{$baserule};
       # if the rule exists, and the language/rule description doesn't exist ...
       if (defined $f && !defined $description{$baserule}->{$lang}) {
         my $lang_note = $lang ne '' ? "lang $lang, " : "";
         print "warning: $baserule exists, ${lang_note}but has no description: $f\n";
         $warned .= ' lamedesc';
       }

       $f = $scorefile{$rule};
       if (defined $f
           && !(defined ($rulefile{$rule}) || defined ($rulefile{$baserule})))
       {
         print "warning: $baserule has score, but no rule: $f\n";
         $warned .= ' lamescore';
       }

       # Only rules that were warned about get rename suggestions.
       # NOTE(review): the stem table is filled with bare rule names, so this
       # lookup can only succeed when $rule carries no language prefix --
       # confirm that is intended.
       my $stem = $possible_rename_matches{$rule};
       next if $warned eq '';
       next unless defined $stem;

       # language taken from a "text_xx." part of the description file name
       my $blang = $descfile{$rule} =~ /text_(\S\S)\./ ? $1 : undef;

       # now try and figure out "nearby" rules with no description/score
       my $text = '';
       for my $candidate (split (' ', $possible_renames{$stem})) {
         my $key = concat_rule_lang ($candidate, $blang);
         next if ($warned =~ /lamedesc/ && (defined $descfile{$key}));
         next if ($warned =~ /lamescore/ && (defined $scorefile{$key}));
         $text .= " $candidate";
       }

       print "warning: (possible renamed rule? $text)\n" if $text ne '';
     }
   }
 }
	#!/usr/bin/perl
	#
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	# any tests that get less than this % of matches are reported as
	# "low matches": the figure compared is the nonspam hit figure for
	# negative-scoring tests and the spam hit figure for positive-scoring tests.
	my $LOW_MATCHES_PERCENT = 0.03;
	# score set number, overridden by the -s option; tests whose tflags mention
	# "net" are skipped while this is even.
	my $scoreset = 0;

	# Print the command-line help text and terminate.
	#
	# Takes no arguments. The text is handed to die(), so unless the call is
	# trapped by an eval it is written to STDERR and the program exits with a
	# failure status; the sub never returns normally.
	sub usage {
	  die "
	lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores

	usage: ./lint-rules-from-freqs [-f falsefreqs] [-s scoreset] < freqs > badtests

	This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
	from a mass-check logfile pair.

	The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
	-x -p' output).

	The 'falsefreqs' argument is frequencies of hits in false-positives and
	false-negatives only ('hit-frequencies -x -p -f' output).

	The 'scoreset' argument is the score set number (default 0); when it is even,
	tests with the 'net' tflag are skipped.

	";
	}

	# Main program. Parses the command line, lints the rule files, optionally
	# loads the false-positive/false-negative frequencies, then classifies every
	# data line of the 'freqs' report read from STDIN and prints the bad tests
	# grouped by priority.

	# -f FILE names the falsefreqs report, -s N selects the score set; any other
	# argument prints the usage text and exits.
	my $opt_falsefreqs;
	while (@ARGV) {
	  my $arg = shift @ARGV;
	  if    ($arg =~ /^-f/) { $opt_falsefreqs = shift @ARGV; }
	  elsif ($arg =~ /^-s/) { $scoreset = shift @ARGV; }
	  else                  { usage(); }
	}

	print "BAD TESTS REPORT\n";
	readrules();
	# %rulefile has one entry per defined rule, so its key count is the number
	# of rules (the count used to be reported one too high).
	print "\n" . (scalar keys %rulefile) . " rules found.\n";
	print "\nRule file syntax issues:\n\n";
	lintrules();

	if ($opt_falsefreqs) {
	  open (my $false_fh, '<', $opt_falsefreqs)
	    or die "cannot open falsefreqs file $opt_falsefreqs: $!\n";
	  while (my $line = <$false_fh>) {
	    # Data lines start with a number, exactly as in the main report below;
	    # everything else is a header line and carries no frequencies.
	    next unless $line =~ /^\s*[\d\.]+/;
	    my ($overall, $spam, $nons, $so, $score, $name) = split (' ', $line);
	    next unless (defined $name && $name =~ /\S/);
	    $falsefreqs_spam{$name} = $spam;
	    $falsefreqs_nons{$name} = $nons;
	    $falsefreqs_so{$name} = $so;
	  }
	  close $false_fh;
	}

	while (my $line = <>) {
	  # A line that does not start with a number is kept as the report header;
	  # the key 'a_header' sorts ahead of every badness label used below.
	  if ($line !~ /^\s*[\d\.]+/) {
	    $output{'a_header'} = $line; next;
	  }

	  my $badrule;
	  my ($overall, $spam, $nons, $so, $score, $name) = split (' ', $line);
	  next unless (defined $name && $name =~ /\S/);

	  # S/O ratio among false positives/negatives; undefined (numerically 0)
	  # when no falsefreqs file was given or the test is not listed in it.
	  my $ffso = $falsefreqs_so{$name};

	  # tflags of the test as collected by readrules(); tests that are absent
	  # from the rule files are treated as having no flags.
	  my $tf = defined $tflags{$name} ? $tflags{$name} : '';
	  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
	  next if ($tf =~ /userconf/);

	  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
	    $badrule = 'no matches';

	  } else {
	    if ($score < 0.0) {
	      # negative score with more spams than nonspams? bad rule.
	      if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
	        $badrule = 'non-nice but -ve score';
	      }

	      if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
	        if ($ffso < 0.5) {
	          $badrule = 'fn';
	        } else {
	          # ignore, the FNs are overridden by other tests so it doesn't
	          # affect the overall results.
	        }
	      }

	      # low number of matches overall
	      if ($nons < $LOW_MATCHES_PERCENT)
	                 { $badrule ||= ''; $badrule .= ', low matches'; }

	    } elsif ($score > 0.0) {
	      # positive score with more nonspams than spams? bad.
	      if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
	        $badrule = 'nice but +ve score';
	      }

	      if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
	        if ($ffso > 0.5) {
	          $badrule = 'fp';
	        } else {
	          # ignore, the FPs are overridden by other tests so it doesn't
	          # affect the overall results.
	        }
	      }

	      # low number of matches overall
	      if ($spam < $LOW_MATCHES_PERCENT)
	                 { $badrule ||= ''; $badrule .= ', low matches'; }

	    } elsif ($score == 0.0) {
	      $badrule = 'score is 0';
	    }
	  }

	  if (defined $badrule) {
	    # a label that consists only of the appended ", low matches" loses its
	    # leading separator here
	    $badrule =~ s/^, //; chomp $line;
	    $output{$badrule} .= $line . " ($badrule)\n";
	  }
	}

	# do all but 'no/low matches' first
	print "\nHigh-priority issues:\n\n";
	foreach my $badness (sort keys %output) {
	  next if ($badness eq 'no matches');
	  next if ($badness eq 'low matches');
	  print $output{$badness};
	  delete $output{$badness};
	}

	# now go back and do the other 2 (if they're there)
	print "\nLow-priority issues:\n\n";
	foreach my $badness (sort keys %output) {
	  print $output{$badness};
	  delete $output{$badness};
	}
	exit;


	# Build the table key for a rule in a given language.
	#
	#   $rule - rule name
	#   $lang - language code; may be undef or ''
	#
	# Returns "[lang]_rule" when a non-empty language is given, otherwise the
	# rule name unchanged.
	sub concat_rule_lang {
	  my ($rule, $lang) = @_;

	  return $rule unless defined $lang && $lang ne '';
	  return "[$lang]_$rule";
	}

	# note: do not use parse-rules-for-masses here, we need to do linting instead
	# of your average parse
	#
	# Read every file matching ../rules/[0-9]*.cf and fill the package-level
	# tables the linter works from:
	#   %rulefile, %descfile, %scorefile, %tflagsfile - file an entry was seen in
	#   %score, %tflags, %description                 - the values themselves
	#   @rulesfound, @langsfound                      - sorted names / languages
	# While parsing, regexp style warnings for body rules and "unknown rule"
	# warnings are printed to STDOUT. A file that cannot be opened is reported
	# on STDERR and skipped. Takes no arguments.
	sub readrules {
	  my @files = <../rules/[0-9]*.cf>;
	  %rulesfound = ();
	  %langs = ();

	  foreach my $file (@files) {
	    my $in;
	    unless (open ($in, '<', $file)) {
	      warn "warning: cannot read $file: $!\n";
	      next;
	    }

	    while (<$in>) {
	      # drop comments and surrounding whitespace, skip empty lines
	      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;

	      # make all the foo-bar stuff foo_bar
	      1 while s/^(\S+)-/${1}_/g;
	      1 while s/^(lang\s+\S+\s+\S+)-/${1}_/g;

	      # a "lang xx" prefix is stripped and remembered for this line only
	      my $lang = '';
	      if (s/^lang\s+(\S+)\s+//) {
	        $lang = $1; $langs{$1} = 1;
	      }

	      if (/^(header|rawbody|body|full|uri|meta|mimeheader)\s+(\S+)\s+/) {
	        my ($type, $name) = ($1, $2);
	        $rulesfound{$name} = 1;
	        $rulefile{$name} ||= $file;
	        # NOTE(review): this is keyed by the rule *type* ("body", "header",
	        # ...), not the rule name; kept as-is, confirm whether the rule name
	        # was intended.
	        $scorefile{$type} = $file;
	        $score{$name} ||= 1.0;
	        $tflags{$name} ||= '';
	        $descfile{$name} ||= $file;       # a rule with no score or desc is OK
	        # create the slot, but never discard a description that a 'describe'
	        # line has already supplied for this rule
	        $description{$name}->{$lang} = undef
	          unless defined $description{$name}->{$lang};

	        if (/^body\s+\S+\s+eval:/) {
	          # ignored
	        } elsif (/^body\s+\S+\s+(.*)$/) {
	          my $re = $1;

	          # If there's a ( in a rule where it should be (?:, flag it.
	          # but ignore [abc(] ...
	          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) {
	            print "warning: non-(?:...) capture in regexp in $file: $_\n";
	          }
	          if ($re =~ /\.[\*\+]/) {
	            print "warning: .* in regexp in $file: $_\n";
	          }
	          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
	            if ($1 > 120 || $2 > 120) {
	              print "warning: long .{n} in regexp in $file: $_\n";
	            }
	          }
	        }

	      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
	        my ($name, $text) = ($1, $2);
	        $rulesfound{$name} = 1;
	        $descfile{concat_rule_lang ($name, $lang)} ||= $file;
	        $descfile{$name} ||= $file;
	        $description{$name}->{$lang} = $text;
	      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
	        my ($name, $flags) = ($1, $2);
	        $rulesfound{$name} = 1;
	        $tflags{$name} = $flags;
	        $tflagsfile{concat_rule_lang ($name, $lang)} = $file;
	        $tflagsfile{$name} = $file;
	      } elsif (/^score\s+(\S+)\s+(.+)$/) {
	        my ($name, $value) = ($1, $2);
	        $rulesfound{$name} = 1;
	        $scorefile{concat_rule_lang ($name, $lang)} = $file;
	        $scorefile{$name} = $file;
	        $score{$name} = $value;
	      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
	                clear_terse_report_template|terse_report|
	                required_score|ok_locales|ok_languages|test|lang|
	                spamphrase|whitelist_from|require_version|
	                clear_unsafe_report_template|unsafe_report|
	                (?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
	                (?:bayes_)?auto_learn
	                )/x) {
	        # recognised setting that needs no linting
	        next;
	      } else {
	        print "warning: unknown rule in $file: $_\n";
	      }
	    }
	    close $in;
	  }
	  @langsfound = sort keys %langs;
	  @rulesfound = sort keys %rulesfound;
	}

	# Cross-check the tables built by readrules() and print one "warning: ..."
	# line to STDOUT per inconsistency: a description or score without a rule,
	# a rule name longer than 22 characters, a description longer than 50
	# characters, and a rule without a description. Rules whose name starts
	# with "__" or "T_" are not checked. After a description/score warning,
	# similarly named rules are listed as possible renames.
	# Takes no arguments and reads/writes package-level tables only.
	sub lintrules {
	  my %possible_renames = ();

	  # Group the defined rules by "stem": the rule name with a trailing
	  # "_20K"-style suffix removed and everything but capital letters dropped.
	  for my $name (@rulesfound) {
	    (my $stem = $name) =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
	    $stem =~ s/[^A-Z]+//gs;                    # trim numbers etc.

	    $possible_renames{$stem} .= " ".$name
	      if defined ($rulefile{$name})
	         && $possible_renames{$stem} !~ / \Q$name\E\b/;
	    $possible_rename_matches{$name} = $stem;
	  }

	  for my $lang ('', @langsfound) {
	    for my $baserule (@rulesfound) {
	      next if $baserule =~ /^(?:__|T_)/;

	      my $rule = concat_rule_lang ($baserule, $lang);
	      my $warned = '';
	      my $f = $descfile{$rule};

	      if (defined $f
	          && !(defined ($rulefile{$rule}) || defined ($rulefile{$baserule})))
	      {
	        print "warning: $baserule has description, but no rule: $f\n";
	        $warned .= ' lamedesc';
	      }

	      # For the default language $rule and $baserule are the same key, so a
	      # single lookup covers both the language and the non-language case.
	      my $defined_here = defined ($rulefile{$rule});

	      # Check our convention for rule name length
	      if ($defined_here && length $baserule > 22) {
	        print "warning: $baserule has a name longer than 22 chars: $f\n";
	      }
	      # Check our convention for description length
	      if ($defined_here) {
	        my $desc = $description{$baserule}->{$lang};
	        if (defined $desc && length $desc > 50) {
	          print "warning: $baserule has a description longer than 50 chars: $f\n";
	        }
	      }

	      # lang rule trumps normal rule
	      $f = $rulefile{$rule} || $rulefile{$baserule};
	      # if the rule exists, and the language/rule description doesn't exist ...
	      if (defined $f && !defined $description{$baserule}->{$lang}) {
	        my $lang_note = $lang ne '' ? "lang $lang, " : "";
	        print "warning: $baserule exists, ${lang_note}but has no description: $f\n";
	        $warned .= ' lamedesc';
	      }

	      $f = $scorefile{$rule};
	      if (defined $f
	          && !(defined ($rulefile{$rule}) || defined ($rulefile{$baserule})))
	      {
	        print "warning: $baserule has score, but no rule: $f\n";
	        $warned .= ' lamescore';
	      }

	      # Only rules that were warned about get rename suggestions.
	      # NOTE(review): the stem table is filled with bare rule names, so this
	      # lookup can only succeed when $rule carries no language prefix --
	      # confirm that is intended.
	      my $stem = $possible_rename_matches{$rule};
	      next if $warned eq '';
	      next unless defined $stem;

	      # language taken from a "text_xx." part of the description file name
	      my $blang = $descfile{$rule} =~ /text_(\S\S)\./ ? $1 : undef;

	      # now try and figure out "nearby" rules with no description/score
	      my $text = '';
	      for my $candidate (split (' ', $possible_renames{$stem})) {
	        my $key = concat_rule_lang ($candidate, $blang);
	        next if ($warned =~ /lamedesc/ && (defined $descfile{$key}));
	        next if ($warned =~ /lamescore/ && (defined $scorefile{$key}));
	        $text .= " $candidate";
	      }

	      print "warning: (possible renamed rule? $text)\n" if $text ne '';
	    }
	  }
	}