| #!/usr/bin/perl -w |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| =head1 NAME |
| |
| rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new |
| scores. |
| |
| =head1 SYNOPSIS |
| |
| rewrite-cf-with-new-scores [options] |
| |
| Options |
| --old-scores=file Read file containing the old SpamAssassin scores |
| --new-scores=file Read file containing the new SpamAssassin scores |
| -s,--scoreset n Rewrite scoreset n |
| --output=file Output rewritten score file to file |
| |
| Note: these options can be shortened (i.e. --old, --new, --out) as |
| long as they are unambiguous. |
| |
| =head1 DESCRIPTION |
| |
| B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores |
| file with the newly generated scores. Since SpamAssassin has four |
| different scoresets, which each need to be generated separately, this |
| tool is used to only change the correct scoreset. |
| |
| By default, the old scores are read from F<../rules/50_scores.cf> and |
| the new ones from F<perceptron.scores>. The output will be |
| F<50_scores.cf> by default. |
| |
| If no options are given, the script will look for command line options |
| in the following order: scoreset, old-scores, new-scores. In this |
| case, output will go to B<STDOUT>. |
| |
| The rules directory needs to be used to make sure scores are given for |
| the right tests. Rules not found in the rules directory will not be |
| given scores in the output. |
| |
| =head1 BUGS |
| |
| Please report bugs to http://bugzilla.spamassassin.org/ |
| |
| =head1 SEE ALSO |
| |
| L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)> |
| |
| =cut |
| |
| use strict; |
| use warnings; |
| |
| use Getopt::Long qw(:config auto_help); |
| use Pod::Usage; |
| |
| our ($opt_old, $opt_new, $opt_scoreset, $opt_out, $opt_cffile); |
| |
| GetOptions("old-scores=s" => \$opt_old, |
| "new-scores=s" => \$opt_new, |
| "cffile=s" => \$opt_cffile, |
| "s|scoreset=i" => \$opt_scoreset, |
| "output=s" => \$opt_out); |
| |
| # Backwards compatibility mode |
| |
| if (!defined($opt_old) && |
| !defined($opt_new) && |
| !defined($opt_scoreset) && |
| !defined($opt_out)) { |
| |
| ($opt_scoreset, $opt_old, $opt_new) = @ARGV; |
| } |
| |
| if (!defined $opt_out) { |
| $opt_out = "-"; #STDOUT |
| } |
| if (!defined $opt_cffile) { |
| $opt_cffile = '../rules'; |
| } |
| if (!defined $opt_scoreset) { |
| $opt_scoreset = 0; |
| } |
| |
| $opt_new ||= "perceptron.scores"; |
| $opt_old ||= "../rules/50_scores.cf"; |
| $opt_out ||= "50_scores.cf"; |
| |
| my $NUM_SCORESETS = 4; |
| my $ZERO_MINISCULE_SCORES = 0; |
| my $MINISCULE_THRESHOLD = 0.01; # points |
| |
| my $UNZERO_META_PREDICATES = 1; |
| my $IGNORE_ZEROED_TEST_RULES = 1; |
| |
| if ($opt_scoreset < 0 || $opt_scoreset >= $NUM_SCORESETS) { |
| pod2usage("scoreset $opt_scoreset out of range 0 - " . ($NUM_SCORESETS-1)); |
| } |
| |
| # Open output |
| open(OUT, ">$opt_out"); |
| |
| # scores are broken into three regions: |
| # 1. "pre" (stuff before generated mutable scores) |
| # 2. "gen" (first generated mutable scores section) |
| # 3. "end" (stuff after generated mutable scores) |
| # 4. "gen2" (any later generated mutable scores sections) |
| |
| # variables filled-out in read_rules() |
| our %rules; # rules data |
| |
| # variables filled-out in read_gascores() |
| my %gascores = (); # generated scores |
| |
| # variables filled-out in read_ranges_data() |
| my %range_zeroed_rules = (); # zeroed by score-ranges-from-freqs |
| |
| # variables filled-out in read_oldscores() |
| my $pre = ''; # stuff before first "gen" section |
| my $end = ''; # stuff after first "gen" section |
| my %oldscores; # old scores |
| my %comment; # "gen" rule comments |
| my %fixed; # scores that are fixed (non-gen) |
| my %gen2; # scores that are gen in the $end string |
| |
| # compiled output |
| my @gen_order = (); |
| my %gen_lines = (); |
| |
| # read stuff in |
| read_rules(); |
| read_gascores(); |
| read_oldscores(); |
| # TODO: read_ranges.data |
| read_ranges_data(); |
| |
| sub read_ranges_data { |
| # used to pick up rules zeroed by score-ranges-from-freqs, e.g. |
| # "ignoring 'FM_MORTGAGE6PLUS': score and range == 0" |
| # which results in: |
| # tmp/ranges.data:0.000 0.000 0 FM_MORTGAGE6PLUS |
| |
| open (IN, "<tmp/ranges.data") |
| or die "cannot read tmp/ranges.data"; |
| while (<IN>) { |
| /^(\S+) (\S+) (\S+) (\S+)/ or next; |
| my ($min, $max, $mutable, $name) = ($1,$2,$3,$4); |
| |
| if ($min == 0 && $max == 0 && $mutable == 0) { |
| $range_zeroed_rules{$name} = 1; |
| } |
| } |
| close IN; |
| } |
| |
| build_new_scores(); |
| |
| if ($ZERO_MINISCULE_SCORES) { |
| fixup_miniscule_scores(); |
| } |
| if ($UNZERO_META_PREDICATES) { |
| fixup_meta_predicates(); |
| } |
| |
| $end = sub_gen2($end); |
| |
| # write stuff out |
| print OUT $pre; |
| print_gen(); |
| print OUT $end; |
| exit; |
| |
| |
| sub read_rules { |
| my $tmpf = "./tmp/rules$$.pl"; |
| system "../build/parse-rules-for-masses ". |
| "-d \"$opt_cffile\" -s $opt_scoreset -o $tmpf" and die; |
| require $tmpf; |
| unlink $tmpf; |
| } |
| |
| sub read_gascores { |
| open (STDIN, "<$opt_new") or die "cannot open $opt_new"; |
| while (<STDIN>) { |
| next unless /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/; |
| my $name = $1; |
| my $score = $2; |
| |
| # various things we should be concerned about |
| if (!exists $rules{$name}) { |
| warn "$name is not defined in tmp/rules.pl\n"; |
| next; |
| } |
| if ($rules{$name}->{issubrule}) { |
| warn "$name is an indirect sub-rule in tmp/rules.pl\n"; |
| next; |
| } |
| if ($rules{$name} =~ /^__/) { |
| warn "$name has an indirect sub-rule \"__\" prefix\n"; |
| next; |
| } |
| if ($name eq '(null)') { |
| warn "$name is (null)\n"; |
| next; |
| } |
| |
| $gascores{$name} = $score; |
| } |
| } |
| |
| sub read_oldscores { |
| open (IN, "<$opt_old") or die "cannot open $opt_old"; |
| |
| # state of things |
| my $where = "pre"; # region of original scores file that we're in |
| my $seen_gen = 0; # have we seen the first <gen:mutable> tag? |
| |
| # read everything in |
| while (my $line = <IN>) { |
| if ($line =~ /<\/gen:mutable>/) { |
| $where = "end"; |
| } |
| |
| if ($where eq "pre") { |
| readline_fix($line); |
| $pre .= $line; |
| } |
| elsif ($where eq "gen") { |
| readline_gen($line); |
| } |
| elsif ($where eq "gen2") { |
| readline_gen2($line); |
| $end .= $line; |
| } |
| elsif ($where eq "end") { |
| readline_fix($line); |
| $end .= $line; |
| } |
| |
| if ($line =~ /<gen:mutable>/) { |
| if ($seen_gen) { |
| $where = "gen2"; |
| } |
| else { |
| $where = "gen"; |
| $seen_gen = 1; |
| } |
| } |
| } |
| } |
| |
| # used for both "pre" and "end" |
| sub readline_fix { |
| my ($line) = @_; |
| |
| my $comment; |
| if ($line =~ s/\s*#\s*(.*)//) { |
| $comment = $1; |
| } |
| if ($line =~ /^\s*score\s+(\S+)\s/) { |
| my (undef, $name, @scores) = split(' ', $line); |
| $fixed{$name}++; |
| $comment{$name} = $comment if $comment; |
| } |
| } |
| |
| sub readline_gen { |
| my ($line) = @_; |
| |
| my $comment; |
| if ($line =~ s/\s*#\s*(.*)//) { |
| $comment = $1; |
| $comment =~ s/ n=$opt_scoreset//; |
| } |
| if ($line =~ /^\s*score\s+(\S+)\s/) { |
| my (undef, $name, @scores) = split(' ', $line); |
| for (my $i = 1; $i < $NUM_SCORESETS; $i++) { |
| $scores[$i] = $scores[0] unless defined $scores[$i]; |
| } |
| @{$oldscores{$name}} = @scores; |
| $comment{$name} = $comment if $comment; |
| } |
| } |
| |
| sub readline_gen2 { |
| my ($line) = @_; |
| |
| my $comment; |
| if ($line =~ s/\s*#\s*(.*)//) { |
| $comment = $1; |
| $comment =~ s/ n=$opt_scoreset//; |
| } |
| if ($line =~ /^\s*score\s+(\S+)\s/) { |
| my (undef, $name, @scores) = split(' ', $line); |
| for (my $i = 1; $i < $NUM_SCORESETS; $i++) { |
| $scores[$i] = $scores[0] unless defined $scores[$i]; |
| } |
| @{$oldscores{$name}} = @scores; |
| $comment{$name} = $comment if $comment; |
| $gen2{$name}++; |
| } |
| } |
| |
| sub build_new_scores { |
| # we just consider scores for this set that are in the input or were in the |
| # "gen" region from the old scores, tmp/rules.pl is not considered here |
| my %gen; # rules to be printed in "gen" region |
| $gen{$_} = 1 for keys %gascores; # scores for this set from GA |
| $gen{$_} = 1 for keys %oldscores; # original scores in "gen" region |
| $gen{$_} = 1 for keys %range_zeroed_rules; # zeroed by range script |
| |
| # remove fixed scores |
| for (keys %fixed) { |
| delete $gen{$_}; |
| } |
| |
| # sort all generated rules by name |
| for my $name (sort keys %gen) { |
| next if ($rules{$name}->{lang}); # "lang es" rules etc. |
| next if ($rules{$name}->{issubrule}); # indirect sub-rules |
| next if ($name eq 'AWL'); # dynamic score |
| |
| my @scores = (); |
| my $comment = ''; |
| $comment = $comment{$name} if defined $comment{$name}; |
| |
| # use the old scores if they existed |
| @scores = @{$oldscores{$name}} if exists $oldscores{$name}; |
| |
| # set appropriate scoreset value |
| if (defined $gascores{$name}) { |
| $scores[$opt_scoreset] = $gascores{$name}; |
| delete $oldscores{$name}; |
| } |
| elsif ($range_zeroed_rules{$name}) { |
| if ($IGNORE_ZEROED_TEST_RULES && $name =~ /^T_/) { next; } |
| |
| # zeroed pre-evolver, by score-ranges-from-freqs |
| $scores[$opt_scoreset] = 0; |
| |
| if (defined $oldscores{$name}) { |
| $comment .= " n=$opt_scoreset"; |
| } |
| } |
| else { |
| # zero for current scoreset if there was no new score; |
| # when the perceptron does this for mutable rules, it means |
| # that score had a new score of 0 |
| $scores[$opt_scoreset] = 0; |
| |
| if (defined $oldscores{$name}) { |
| $comment .= " n=$opt_scoreset"; |
| #warn "$name has no GA score, but had a score before\n"; |
| } |
| } |
| |
| # sort and unique comment tags |
| my %unique; |
| $unique{$_} = 1 for split(' ', $comment); |
| $comment = join(' ', sort keys %unique); |
| |
| push (@gen_order, $name); |
| $gen_lines{$name} = { |
| scores => \@scores, |
| comment => $comment |
| }; |
| } |
| } |
| |
| sub new_score_line { |
| my ($name) = @_; |
| |
| # create new score line |
| my @scores = @{$gen_lines{$name}{scores}}; |
| my $comment = $gen_lines{$name}{comment}; |
| return sprintf("score %s %s%s", $name, |
| join(" ", generate_scores($name, @scores)), |
| ($comment) ? ' # ' . $comment : ''); |
| } |
| |
| sub print_gen { |
| print OUT "\n"; |
| foreach my $name (@gen_order) { |
| next if ($gen2{$name}); # will do that separately |
| print OUT new_score_line($name), "\n"; |
| } |
| print OUT "\n"; |
| } |
| |
| sub sub_gen2 { |
| my $end = shift; |
| |
| foreach my $name (keys %gen2) { |
| if ($end !~ s/^\s*score\s+${name}\s.+?$/ |
| new_score_line($name); |
| /em) |
| { |
| # we failed to sub it; output score in main gen:mutable block instead |
| delete $gen2{$name}; |
| } |
| } |
| $end; |
| } |
| |
| sub generate_scores { |
| my ($name, @scores) = @_; |
| |
| my $isnet = 0; |
| my $islearn = 0; |
| if (defined $rules{$name}->{tflags}) { |
| $isnet = ($rules{$name}->{tflags} =~ /\bnet\b/); |
| $islearn = ($rules{$name}->{tflags} =~ /\blearn\b/); |
| } |
| |
| # set defaults if not already set |
| if (!defined $scores[0]) { |
| warn "$name does not have a default score\n"; |
| $scores[0] ||= 0; |
| } |
| |
| my $flag = 1; |
| for (my $i = 1; $i < $NUM_SCORESETS; $i++) { |
| $scores[$i] = $scores[0] unless defined $scores[$i]; |
| $flag = 0 if ($scores[$i] != $scores[$i-1]); |
| }; |
| |
| # enforce rule/scoreset rules. |
| # net rules never have a non-zero score in sets 0 and 2 |
| for (my $i = 0; $i < $NUM_SCORESETS; $i++) { |
| if ($isnet && ($i & 1) == 0) { |
| $scores[$i] = 0; |
| $flag = 0 if ($i > 0 && $scores[$i] != $scores[$i-1]); |
| } |
| if ($islearn && ($i & 2) == 0) { |
| $scores[$i] = 0; |
| $flag = 0 if ($i > 0 && $scores[$i] != $scores[$i-1]); |
| } |
| } |
| |
| if ($flag) { |
| splice @scores, 1; |
| } |
| |
| return @scores; |
| } |
| |
| sub fixup_miniscule_scores { |
| my $num_fixed = 0; |
| |
| foreach my $name (@gen_order) { |
| my @scores = @{$gen_lines{$name}{scores}}; |
| if (abs($scores[$opt_scoreset]) < $MINISCULE_THRESHOLD) { |
| $scores[$opt_scoreset] = 0; |
| $num_fixed++; |
| } |
| @{$gen_lines{$name}{scores}} = @scores; |
| } |
| |
| warn "zeroed $num_fixed scores for being 'miniscule'.\n"; |
| } |
| |
| sub fixup_meta_predicates { |
| # this is the opposite of t/meta.t |
| |
| while (my ($name, $info) = each %rules) |
| { |
| next if ($name eq '_scoreset'); |
| |
| my $type = $info->{type} || "unknown"; |
| # look at meta rules that are not disabled |
| next unless ($type eq "meta" && ($name =~ /^__/ || $info->{score} != 0)); |
| |
| next unless ($info->{depends}); |
| |
| # test rules should not impose requirements on release rules; ignore |
| # any dependency requirements caused by T_ rules |
| next if $name =~ /^T_/; |
| |
| for my $depend (@{ $info->{depends} }) { |
| if (!exists $rules{$depend}) { |
| warn "$name depends on $depend which is nonexistent\n"; |
| next; |
| } |
| |
| # if dependency is a predicate, it'll run |
| next if $depend =~ /^__/; |
| |
| # not a generated rule? not our problem, then; t/meta.t will catch it |
| next unless (exists $gen_lines{$depend}); |
| |
| # ignore "tflags net" and "tflags learn" rules -- it is OK |
| # for those to have zero scores in some scoresets, for obvious |
| # reasons. |
| next if (defined $rules{$depend}->{tflags} && |
| $rules{$depend}->{tflags} =~ /\b(?:net|learn)\b/); |
| |
| # if dependency has a non-zero score, it'll run |
| my $depscore = $gen_lines{$depend}{scores}[$opt_scoreset]; |
| next if (defined $depscore && $depscore != 0); |
| |
| warn "fix meta dep: $name depends on $depend with 0 score, fixing at non-0\n"; |
| $gen_lines{$depend}{scores}[$opt_scoreset] = 0.001; |
| } |
| } |
| } |