| #!/usr/bin/perl -w |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| use strict; |
| |
# usage()
#
# Abort the program with the command-line help text.  The text is raised
# through die(), so this function never returns to its caller.  The message
# ends in a newline, which stops perl from appending "at FILE line N".
sub usage {
    die "
parse-rules-for-masses: parse the SpamAssassin rules files for mass-checks,
                        evolving, and frequency analysis

usage: ./parse-rules-for-masses [-d rulesdir] [-o outputfile] [-s scoreset] [-x]

  rulesdir defaults to ../rules
  outputfile defaults to ./tmp/rules.pl
  scoreset defaults to 0
  -x do not include test rules files (ie 70_*)
";
}
| |
| use Getopt::Long; |
| use Data::Dumper; |
| |
# Command-line state.  These are package globals rather than lexicals
# because readrules() reads $scoreset and $skip_test_rules directly.
our (@rulesdirs, $outputfile, $scoreset, $skip_test_rules);

# GetOptions() returns false when it met an unknown or malformed option; show
# the usage text in that case instead of carrying on with a partial parse.
GetOptions(
    "d=s"      => \@rulesdirs,      # may be given more than once
    "o=s"      => \$outputfile,
    "s=i"      => \$scoreset,
    "x"        => \$skip_test_rules,
    "help|h|?" => sub { usage(); },
) or usage();

@rulesdirs = ("../rules") unless @rulesdirs;

if (!defined $outputfile) {
    $outputfile = "./tmp/rules.pl";
    # Best effort only: the directory may already exist, and writerules()
    # dies if the output file cannot be opened.
    mkdir("tmp", 0755);
}

$scoreset = 0 unless defined $scoreset;

# readrules() tests bit 0 of the scoreset for "net" rules and bit 1 for
# "learn" rules, and uses the value as an index into a multi-score "score"
# line, so anything outside 0..3 would pick up undefined scores.
if ($scoreset < 0 || $scoreset > 3) {
    die "parse-rules-for-masses: scoreset must be between 0 and 3 (got $scoreset)\n";
}

# $rules maps rule name => hash of attributes; the special '_scoreset' key
# records which scoreset the scores were taken from.
my $rules = { };
$rules->{_scoreset} = $scoreset;
readrules(@rulesdirs);

# $scores is a flat rule name => score view of the same data.
my $scores = { };
foreach my $key (keys %{$rules}) {
    next if $key eq '_scoreset';
    $scores->{$key} = $rules->{$key}->{score};
}

writerules($outputfile);
exit;
| |
# readrules(@dirs)
#
# Parse every *.cf file found directly inside each directory in @dirs (in
# sorted filename order per directory) and record the rules in the file-level
# $rules hash, keyed by rule name.  When $skip_test_rules is set, files whose
# path matches /70_/ are left out.  After all files are read, rules without a
# definition are dropped and defaults for score/mutable are filled in.
#
# Reads the globals $rules, $scoreset and $skip_test_rules; returns nothing
# useful.
sub readrules {
    my @dirs = @_;

    foreach my $indir (@dirs) {
        foreach my $file (sort glob("$indir/*.cf")) {
            next if $skip_test_rules && $file =~ /70_/;
            _read_rules_file($file);
        }
    }

    _finalize_rules();
}

# Parse one rules file into $rules.  An unreadable file is reported with a
# warning and skipped, so one bad file does not abort the whole run.
sub _read_rules_file {
    my ($file) = @_;

    my $fh;
    if (!open($fh, '<', $file)) {
        warn "parse-rules-for-masses: cannot read $file: $!\n";
        return;
    }

    my $scores_mutable = 1;    # always start off mutable in each file

    while (my $line = <$fh>) {
        # these markers appear in comments, so deal with them before comment
        # stripping takes place
        if ($line =~ /<\/gen:mutable>/i) {
            $scores_mutable = 0;
        }
        elsif ($line =~ /<gen:mutable>/i) {
            $scores_mutable = 1;
        }

        # Strip comments, but not at an escaped "\#": that is a literal hash
        # inside a rule, and cutting there would truncate the rule text.
        $line =~ s/(?<!\\)#.*$//;
        $line =~ s/\\#/#/g;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line eq '';

        # an optional "lang xx" prefix qualifies the rest of the line
        my $lang = '';
        if ($line =~ s/^lang\s+(\S+)\s+//) {
            $lang = $1;
        }

        if ($line =~ /^(header|rawbody|body|full|uri|askdns|meta|mimeheader|reuse)\s+(\S+)\s+(.*)$/) {
            my ($type, $name, $val) = ($1, $2, $3);

            # "reuse" should be skipped if we already have a rule
            next if $type eq 'reuse'
                 && exists $rules->{$name}
                 && exists $rules->{$name}->{type};

            my $rule = ($rules->{$name} ||= { });
            $rule->{type}      = $type;
            $rule->{lang}      = $lang;
            $rule->{issubrule} = ($name =~ /^__/) ? '1' : '0';
            # Do not clobber tflags recorded by a "tflags" line that came
            # before this definition; only supply the empty default.
            $rule->{tflags} = '' unless defined $rule->{tflags};
            $rule->{eval} = ($val =~ /\beval:(\w+)/) ? $1 : '0';
            if ($type eq 'meta') {
                # every word in the expression that is not a plain number
                # NOTE(review): a redefined meta rule appends to the earlier
                # list rather than replacing it -- confirm this is intended.
                my @depends = grep { !/^\d+$/ } ($val =~ m/(\w+)/g);
                push @{ $rule->{depends} }, @depends;
            }
            $rule->{code} = $val;
        }
        elsif ($line =~ /^describe\s+(\S+)\s+(.+)$/) {
            my ($name, $text) = ($1, $2);
            my $rule = ($rules->{$name} ||= { });
            if ($lang) {
                # a language-specific description never replaces one that is
                # already set
                $rule->{describe} ||= $text;
            }
            else {
                $rule->{describe} = $text;
            }
        }
        elsif ($line =~ /^tflags\s+(\S+)\s+(.+)$/) {
            my ($name, $flags) = ($1, $2);
            $rules->{$name} ||= { };
            $rules->{$name}->{tflags} = $flags;
        }
        elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
            my ($name, $score) = ($1, $2);
            $rules->{$name} ||= { };
            if ($score =~ /\s/) {    # there are multiple scores
                $score = (split(/\s+/, $score))[$scoreset];
            }
            $rules->{$name}->{score}   = $score;
            $rules->{$name}->{mutable} = $scores_mutable;
        }
    }

    close($fh);
    return;
}

# Drop entries that never got a rule definition and fill in the default
# score and mutable settings for everything else in $rules.
sub _finalize_rules {
    foreach my $rule (keys %{$rules}) {
        next if $rule eq '_scoreset';

        my $info = $rules->{$rule};
        if (!defined $info->{type}) {
            delete $rules->{$rule};    # no rule definition -> no rule
            next;
        }

        my $tflags = $info->{tflags};
        if (!defined $info->{score}) {
            # default score: 1.0, or 0.01 for T_ rules; negated for "nice"
            my $def = ($rule =~ /^T_/) ? 0.01 : 1.0;
            $info->{score} = ($tflags =~ /\bnice\b/) ? -$def : $def;
            $info->{no_score_found} = 1;
        }

        # ignore net rules in set 0 or set 2 (bit 0 clear)
        if ($tflags =~ /\bnet\b/ && ($scoreset & 1) == 0) {
            $info->{mutable} = 0;
            $info->{score}   = 0;
        }

        # ignore learn (bayes) rules in set 0 or set 1 (bit 1 clear)
        if ($tflags =~ /\blearn\b/ && ($scoreset & 2) == 0) {
            $info->{mutable} = 0;
            $info->{score}   = 0;
        }

        # if a rule didn't have a score specified, assume it's mutable
        if (!defined $info->{mutable}) {
            $info->{mutable} = 1;
        }

        # T_ test rules are never mutable; their score is left alone here
        # (it was defaulted to +/-0.01 above if none was given).
        if ($rule =~ /^T_/) {
            $info->{mutable} = 0;
        }
        elsif ($rule eq 'AWL') {    # ignore entirely
            $info->{mutable} = 0;
            $info->{score}   = 0;
        }
    }
    return;
}
| |
# writerules($outfile)
#
# Dump the file-level $rules and $scores structures to $outfile as Perl
# source: a "# dumped at" timestamp comment, the Data::Dumper output defining
# %rules and %scores, and a trailing "1;".  Missing parent directories of
# $outfile are created first.  Dies if the file cannot be opened or fully
# written.
sub writerules {
    my $outfile = shift;

    require File::Basename;
    require File::Path;

    # Create the output directory in-process; no shell is involved, so paths
    # containing spaces or shell metacharacters are handled correctly.  A
    # failure here is left for the open() below to report.
    my $outdir = File::Basename::dirname($outfile);
    if (!-d $outdir) {
        local $@;
        eval { File::Path::mkpath($outdir); };
    }

    open(my $out, '>', $outfile) or die "cannot write to $outfile: $!";

    # timestamp comment followed by a blank line
    print {$out} "# dumped at " . scalar(localtime) . "\n\n";

    # Purity makes the dump re-evaluable even with shared references;
    # localised so the setting does not leak to other Data::Dumper users.
    local $Data::Dumper::Purity = 1;
    print {$out} Data::Dumper->Dump([$rules, $scores], ['*rules', '*scores']);

    print {$out} "1;";

    # buffered write errors only surface when the handle is closed
    close($out) or die "cannot write to $outfile: $!";
    return;
}
| |