build/mkrules - spamassassin - Git at Google

 #!/usr/bin/perl -w
 #
 # build/mkrules -- compile the SpamAssassin rules into installable form
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 # This is an implementation of
 # http://wiki.apache.org/spamassassin/RulesProjPromotion

 sub usage {
   die "build/mkrules [--src srcdir] [--exit_on_no_src] [--out outputdir]\n";
 }

 my $RULE_DEFINE_KEYWORDS_RE = qr{
         header|rawbody|body|full|uri
         |meta|mimeheader|urirhssub|uridnsbl
     }x;

 my $RULE_KEYWORDS_RE = qr{
         ${RULE_DEFINE_KEYWORDS_RE}|
         describe|tflags|reuse|score
     }x;

 my $fail_message = "";

 use strict;
 use File::Find;
 use File::Copy;
 use File::Basename;
 use Getopt::Long;

 # use SpamAssassin classes directly, so we can lint rules
 # as we go
 use lib 'lib';
 use Mail::SpamAssassin;

 our ( @opt_srcs, $opt_out, $opt_sandboxout, $opt_manifest,
     $opt_manifestskip, $opt_listpromotable, $opt_active,
     $opt_activeout, $default_file_header,
     $opt_rulemetadata, $opt_exit_on_no_src);

 GetOptions("src=s" => \@opt_srcs,
     "out=s",
     "sandboxout=s",
     "activeout=s",
     "active=s",
     "manifest=s",
     "manifestskip=s",
     "rulemetadata=s",
     "exit_on_no_src",
   );

 if (!@opt_srcs) {
   foreach ( 'rulescode', 'rulesrc' ) {
     if (-d $_) {
       # print "using default src $_\n";
       push(@opt_srcs, $_);
     }
   }
 }

 if (!$opt_manifest && -f "MANIFEST") {
   $opt_manifest = "MANIFEST";
 }

 if (!$opt_manifestskip && -f "MANIFEST.SKIP") {
   $opt_manifestskip = "MANIFEST.SKIP";
 }

 if (!$opt_active && -f "rules/active.list") {
   $opt_active = "rules/active.list";
 }

 if ($opt_exit_on_no_src) {
   my $foundone = 0;
   foreach my $src (@opt_srcs) {
     if (-d $src) { $foundone++; last; }
   }

   if (!$foundone) {
     print "no source directory found: exiting\n";
     exit 0;
   }
 }
 # else
 die "no src" unless (@opt_srcs >= 1);

 my $promolist;

   die "no out" unless ($opt_out);
   die "unreadable out" unless (-d $opt_out);
   die "unreadable active" unless (-f $opt_active);

 $opt_sandboxout  ||= "70_sandbox.cf";
 $opt_activeout   ||= "72_active.cf";

 # source files that need compilation, and their targets
 my $needs_compile = { };
 my $found_output = { };
 my $current_src;
 my $newest_src_mtime = 0;
 my $newest_out_mtime = 0;

 $default_file_header = join('', <DATA>);
 compile_utf8ify_function();

 foreach my $src (@opt_srcs) {
   if (!-d $src) {
     warn "WARNING: unreadable src '$src'\n";
     next;
   }
   $current_src = $src;
   File::Find::find ({
           wanted => \&src_wanted,
           no_chdir => 1
         }, $src);
 }

 # get mtimes of output files; we can be sure that all
 # output is under the "opt_out" dir, so recurse there
 File::Find::find ({
         wanted => \&out_wanted,
         no_chdir => 1
       }, $opt_out);

 # we must rebuild if a compiled .pm is missing, too
 my $found_all_pm_files = 1;
 foreach my $f (keys %{$needs_compile}) {
   next unless ($f =~ /\.pm$/i);
   if (!exists $found_output->{basename $f}) {
     $found_all_pm_files = 0;
   }
 }

 # check mtime on the active.list file, too
 {
   my @st = stat $opt_active;
   if ($st[9] && $st[9] > $newest_src_mtime) {
     $newest_src_mtime = $st[9];
   }
 }

 # check mtimes, and also require that the two required output files
 # really do exist
 if ($newest_src_mtime && $newest_out_mtime
     && $newest_src_mtime < $newest_out_mtime
     && -f $opt_out.'/'.$opt_sandboxout
     && -f $opt_out.'/'.$opt_activeout
     && $found_all_pm_files)
 {
   print "mkrules: no rules updated\n";
   exit 0;
 }

 my $rules = { };

 my $file_manifest = { };
 my $file_manifest_skip = [ ];
 if ($opt_manifest) {
   read_manifest($opt_manifest);
 }
 if ($opt_manifestskip) {
   read_manifest_skip($opt_manifestskip);
 }

 my $active_rules = { };
 read_active($opt_active);

 # context for the rules compiler
 my $seen_rules = { };
 my $renamed_rules = { };
 my $output_files = { };
 my $output_file_text = { };
 my $files_to_lint = { };
 my $entries_for_rule_name = { };

 # $COMMENTS is a "catch-all" "name", for lines that appear after the last line
 # that refers to a rule by name.  Those lines are not published by themselves;
 # they'll be published to all pubfiles found in the file.
 #
 # It's assumed they are comments, because they generally are, but could be all
 # sorts of unparseable lines.
 my $COMMENTS = '!comments!';

 # another "fake name" for lines that should always be published.  They'll
 # be published to the non-sandbox file.
 my $ALWAYS_PUBLISH = '!always_publish!';

 read_all_rules($needs_compile);
 read_rules_from_output_dir();
 compile_output_files();
 lint_output_files();
 write_output_files();

 # mkrules.t relies on the script exiting cleanly Bug #7302 and Bug #7692
 exit if ($ENV{'TEST_ACTIVE'}) ;

 die "$fail_message" if ( $fail_message =~ m/./) ;
 exit;

 # ---------------------------------------------------------------------------

 sub lint_output_files {
   foreach my $file (keys %{$files_to_lint}) {
     my $text = join("\n", "file start $file", $output_file_text->{$file}, "file end $file");
     if (lint_rule_text($text) != 0) {
       warn "\nERROR: LINT FAILED, suppressing output: $file\n\n";
       $fail_message =  $fail_message . "ERROR: LINT FAILED, suppressing output: $file\n";

       # don't suppress entirely, otherwise 'make distcheck'/'disttest'
       # will fail since the MANIFEST-listed output files will be
       # empty.

       # delete $output_file_text->{$file};
       $output_file_text->{$file} = '';
     }
   }
 }

 sub lint_rule_text {
   my ($text) = @_;

   # ensure we turn off slow/optional stuff for linting, but keep the essentials
   my $pretext = q{
     loadplugin Mail::SpamAssassin::Plugin::Check
     loadplugin Mail::SpamAssassin::Plugin::URIDNSBL
     util_rb_tld com # skip "need to run sa-update" warn
     use_bayes 0
   };

   my $mailsa = Mail::SpamAssassin->new({
       rules_filename => "./rules",
       # debug => 1,
       local_tests_only => 1,
       dont_copy_prefs => 1,
       config_text => $pretext.$text
   });

   my $errors = 0;
   $mailsa->{lint_callback} = sub {
     my %opts = @_;

     return if ($opts{msg} =~ /
           (?:score\sset\sfor\snon-existent|description\sexists)
       /x);

     warn "lint: $opts{msg}";
     if ($opts{iserror}) {
       $errors++;
     }
   };

   $mailsa->lint_rules();
   $mailsa->finish();
   return $errors;       # 0 means good
 }

 sub src_wanted {
   my $path = $File::Find::name;

   # record stat times of directories, too, to catch file additions/removals
   # in the source tree
   my @st = stat $path;
   if ($st[9] && $st[9] > $newest_src_mtime) {
     $newest_src_mtime = $st[9];
   }

   # only files from now on, though
   return if (!-f $path);

   # limit what will be copied from sandboxes
   return if ($path =~ /\bsandbox\b/ && !/(?:\d.*\.cf|\.pm)$/i);

   # don't use generated scores; they can be out of sync with what is currently
   # in the sandboxes or the most current active.list file at any given time
   return if ($path =~ /\bscores\b/);

   # a bit of sanity please - no svn metadata ;)
   return if ($path =~ /\.svn/);

   my $dir = $path;
   $dir =~ s/^${current_src}[\/\\\:]//s;
   $dir =~ s/([^\/\\\:]+)$//;
   my $filename = $1;


   my $f = "$current_src/$dir$filename";
   my $t;
   $t = "$opt_out/$filename";

   $needs_compile->{$f} = {
           f => $f,
           t => $t,
           dir => $dir,
           filename => $filename
         };
 }

 sub out_wanted {
   my $path = $File::Find::name;
   return unless (-f $path);
   return if ($path =~ /\.svn/);
   return unless ($path =~ /\.(?:cf|pm)$/i);

   my @st = stat $path;
   if ($st[9] && $st[9] > $newest_out_mtime) {
     $newest_out_mtime = $st[9];
   }

   my $dir = $path;
   $dir =~ s/^${current_src}[\/\\\:]//s;
   $dir =~ s/([^\/\\\:]+)$//;
   my $filename = $1;

   if ($path =~ /\.pm$/i) {
     $found_output->{$filename} = 1;
   }
 }

 # compile all the source files found by the src_wanted() sub, in sorted
 # order so that the order of precedence makes sense.
 sub read_all_rules {
   my ($sources) = @_;

   # deal with the perl modules first, so that later linting w/ loadplugin will
   # work appropriately.
   foreach my $f (sort {
                   my ($ae) = $a =~ /\.(cf|pm)$/;
                   my ($be) = $b =~ /\.(cf|pm)$/;
                   return $be cmp $ae || $a cmp $b;
                 } keys %$sources)
   {
     my $entry = $needs_compile->{$f};
     my $t = $entry->{t};

     # TODO: dependency checking optimization?
     ## my $needs_rebuild = 0;
     ## if (!-f $t || -M $t > -M $f) {
     ## # the source file is newer, or dest is not there
     ## $needs_rebuild = 1;
     ## }

     my $needs_rebuild = 1;

     if ($entry->{filename} =~ /\.pm$/) {
       plugin_file_compile($entry);
     }
     elsif ($entry->{dir} =~ /sandbox/) {
       rule_file_compile($f, $t, $entry->{filename},
                 { issandbox => 1 });
     }
     elsif ($entry->{dir} =~ /scores/) {
       rule_file_compile($f, $t, $entry->{filename},
                 { issandbox => 1, isscores => 1 });
     }
     elsif ($entry->{dir} =~ /extra/) {
       # 'extra' rulesets; not built by default (TODO)
       next;
     }
     else {
       # rules in "core" and "lang" are always copied
       if ($needs_rebuild) {
         rule_file_compile($f, $t, $entry->{filename}, { });
       }
     }
   }
 }

 ###########################################################################

 # Rules are compiled from source dir to output dir.
 #
 # Rules in "rules/active.list" are promoted to "72_active.cf"; rules not
 # listed there are relegated to "70_sandbox.cf".  There is code to allow
 # other filenames to be selected from the rulesrc .cf file, but I'm not
 # sure if it works anymore ;)
 #
 # Rules will be autorenamed, if there's a collision between a new rule name and
 # one that's already been output by the compiler in another source file. The
 # autorenaming is very simple -- portions of the current source path are
 # appended to the rule name, sanitised.

 sub rule_file_compile {
   my ($f, $t, $filename, $flags) = @_;
   my $issandbox = $flags->{issandbox};
   my $isscores = $flags->{isscores};

   open (IN, "<$f") or die "cannot read $f";

   # a fast parser for the config file format; don't need the
   # full deal here, and it must be fast, since it's run on every
   # "make" invocation

   my $rule_order = [ ];

   my $lastrule = $COMMENTS;

   if (!defined $rules->{$ALWAYS_PUBLISH}) {
     $rules->{$ALWAYS_PUBLISH} = rule_entry_create();
   }

   # zero or more "ifplugin" or "if" scopes
   my @current_conditionals = ();
   my $current_comments = '';

   while (<IN>) {
     my $orig = $_;

     s/#.*$//g; s/^\s+//; s/\s+$//;

     # drop comments/blank lines from output
     next if (/^$/);

     # save "lang" declarations
     my $lang = '';
     if (s/^lang\s+(\S+)\s+//) {
       $lang = $1;
     }

     if (/^(${RULE_KEYWORDS_RE})\s+(\S+)\s+(.*)$/)
     {
       # rule definitions
       my $type = $1;
       my $name = $2;
       my $val = $3;

       my $origname = $name;
       if ($issandbox) {
         $name = sandbox_rule_name_avoid_collisions($name, $f);
       }
       my $origname_w_T_prefix = $name;
       # non-sandbox rules always use the same name

       if (scalar @current_conditionals) {
         # ensure the current conditionals are used in the block name;
         # this ensures that we scope alternative (#ifdef-style) dupe
         # rule definitions in their own ifplugin scopes
         $name .= " ".join("", @current_conditionals);
         $name =~ s/\s+/ /gs; $name =~ s/ $//;
       }

       # track this as a rule-entry block for that rule name
       # (and it's T_ prefixed variant, if relevant)
       push @{$entries_for_rule_name->{$origname}}, $name;
       push @{$entries_for_rule_name->{$origname_w_T_prefix}}, $name;

       # comment "score" lines for sandbox rules (bug 5558)
       # use generated scores, though, if the rule is active
       if ($type eq 'score' && $issandbox &&
         !($isscores && $active_rules->{$name}))
       {
         $orig =~ s/^/#/g;
       }

       if (!$rules->{$name}) { $rules->{$name} = rule_entry_create(); }

       $rules->{$name}->{issandbox} = $issandbox;
       $rules->{$name}->{isscores} = $isscores;
       $rules->{$name}->{origname} = $origname;
       $rules->{$name}->{origname_w_T_prefix} = $origname_w_T_prefix;
       $rules->{$name}->{cond} = [@current_conditionals];
       $rules->{$name}->{text} .= $current_comments . $orig;
       $rules->{$name}->{plugin_dependencies} = {};

       # note if the conditional is a plugin reference, as we need to
       # ensure that "loadplugin" lines stay in the same place
       foreach my $c (@current_conditionals) {
         if ($c =~ /^ifplugin\s+(\S+)/) {
           $rules->{$name}->{plugin_dependencies}->{$1} = 1;
         } elsif ($c =~ /^if.*plugin/) {
           while ($c =~ /plugin\s*\(\s*(\S+)\s*\)/g) {
             $rules->{$name}->{plugin_dependencies}->{$1} = 1;
           }
         }
       }

       # note if we found the rule defn or not.  if we did not,
       # that means the rule was a code-tied rule, which should always
       # have its descriptions/scores/etc. published in "active".
       if ($type =~ /^${RULE_DEFINE_KEYWORDS_RE}$/x) {
         $rules->{$name}->{found_definition} = 1;
         $rules->{$name}->{srcfile} = $f;
         $rules->{$name}->{code} = $orig;
       }
       elsif ($type eq 'tflags') {
         # userconf rules are always published in "active"
         if ($val =~ /\buserconf\b/) {
           $rules->{$name}->{forceactive} = 1;
         }

         # record for rulemetadata code
         $val =~ s/\s+/ /gs;
         if ($rules->{$name}->{tflags}) {
           $rules->{$name}->{tflags} .= ' '.$val;
         } else {
           $rules->{$name}->{tflags} = $val;
         }
       }

       $current_comments = '';

       $lastrule = $name;
       push (@$rule_order, $name);
     }
     elsif (/^
         (pubfile|publish)
         \s+(\S+)\s*(.*?)$
       /x)
     {
       # preprocessor directives
       my $command = $1;
       my $name = $2;
       my $val = $3;

       my $origname = $name;

       # note: if we call sandbox_rule_name_avoid_collisions(), it'll
       # rename to 'T_RULENAME' -- which is exactly what we're trying
       # to avoid in 'publish RULENAME' lines!  so don't call it here.
       # if ($issandbox) {
       # $name = sandbox_rule_name_avoid_collisions($name, $f);
       # }

       if (!$rules->{$name}) { $rules->{$name} = rule_entry_create(); }
       $rules->{$name}->{origname} = $origname;
       $rules->{$name}->{origname_w_T_prefix} = $origname;

       if ($command eq 'publish') {
         # the 'publish' command defaults to "1", unless it explicitly
         # is set to "0".  iow: publish RULE_NAME [(0 | 1)]  [default: 1]
         if (!defined $val || $val eq '') { $val = '1'; }
       }
       elsif ($command eq 'pubfile') {
         if (!filename_in_manifest($opt_out.'/'.$val)) {
           warn "$val: WARNING: not listed in manifest file, using default\n";
           next;     # don't set 'pubfile' below
         }
       }

       $rules->{$name}->{$command} = $val;

       # if we see "publish NAMEOFRULE", that means the rule is
       # considered active
       if ($rules->{$name}->{publish}) {
         $rules->{$name}->{forceactive} = 1;
       }
     }
     elsif (/^
         (if|ifplugin)
         \s+(.*?)$
       /x)
     {
       push @current_conditionals, $orig;
     }
     elsif (/^else\b/x)
     {
       if (!scalar @current_conditionals) {
         warn "WARNING: 'else' without 'if'/'ifplugin' conditional\n";
       } else {
         my $cond = invert_conditional(pop @current_conditionals);
         push @current_conditionals, $cond;
       }
     }
     elsif (/^endif\b/x)
     {
       if (!scalar @current_conditionals) {
         warn "WARNING: 'endif' without 'if'/'ifplugin' conditional\n";
       } else {
         pop @current_conditionals;
       }
     }
     elsif (/^require_version\s*(\S+)\b/) {
       # silently ignored.  TODO?  (meh)
     }
     elsif (/^loadplugin\s*(\S+)\b/) {
       my $name = 'loadplugin_'.$1;

       unless ($rules->{$name}) {
         $rules->{$name} = rule_entry_create();
         $rules->{$name}->{origname} = $name;
         $rules->{$name}->{origname_w_T_prefix} = $name;
         $rules->{$name}->{issandbox} = $issandbox;
         $rules->{$name}->{iscommand} = 1;
       }

       if (/^loadplugin\s*\S+\s+(\S+)/) {
         my $fname = $1;
         my $fpath = dirname($f)."/".$fname;

         if (!-f $fpath) {
           warn "$f: WARNING: plugin code file '$fpath' not found, line ignored: $orig";
           next;
         }

         if ($fpath =~ /sandbox/i) {
           # Since this is a sandbox plugin, force its output to the sandbox area.
           $rules->{$name}->{sandbox_plugin} = 1;
         }

         # If a 'loadplugin' line is found, and the plugin .pm is not listed in
         # the MANIFEST file, this will mean that the .pm will not be copied
         # during "make dist".  This causes failures during "make disttest",
         # since the file does not exist.
         #
         # However, we do want to preserve these lines in the 'rules' dir, for
         # use during development -- without requiring that the .pm's be put
         # into MANIFEST -- ie. before the plugin is considered release-ready,
         # ie. sandbox plugins.
         #
         # fix: make it a "tryplugin" line instead; these are ignored if the
         # target file is nonexistent.

         if (!filename_in_manifest($opt_out.'/'.$fname)) {
           warn "$f: WARNING: '$opt_out/$fname' not listed in manifest file, making 'tryplugin': $orig";
           $orig =~ s/^\s*loadplugin\b/tryplugin/;
         }
       }

       $rules->{$name}->{text} .= $orig;
       unshift (@$rule_order, $name);
     }
     else {
       # an unhandled configuration line; "redirector_pattern",
       # "report", something like that.  This should be sent to
       # the active.cf output (or sandbox if it appeared in a sandbox
       # input file).

       # use the line itself as a key
       my $name = $_;
       /^\s*(\S+)/ and $name = $1;
       $name =~ s/\s+/ /gs;

       my $forceactive = 1;

       # always send 'test' lines to the sandbox files
       if (/^test\s*/) {
         $forceactive = 0;

         $name = $_;   # ensure we don't drag rules with us though!
         $name =~ s/\s+/ /gs;
       }

       if (scalar @current_conditionals) {
         $name = join("", @current_conditionals);
         $name =~ s/\s+/ /gs; $name =~ s/ $//;
       }

       if ($issandbox) {
         $name .= "_sandbox";
       }

       unless ($rules->{$name}) {
         $rules->{$name} = rule_entry_create();
         $rules->{$name}->{origname} = $name;
         $rules->{$name}->{origname_w_T_prefix} = $name;
       }
       $rules->{$name}->{cond} = [@current_conditionals];
       $rules->{$name}->{issandbox} = $issandbox;
       $rules->{$name}->{forceactive} = $forceactive;
       # $rules->{$name}->{forceactive} = 1;
       $rules->{$name}->{iscommand} = 1;

       # TODO: bug 6241: 'replace_rules' should be handled ok, but isn't

       # warn "unknown line in rules file '$f', saving to default: $orig";

       $rules->{$name}->{text} .= $orig;
       unshift (@$rule_order, $name);
     }
   }
   close IN;

   if ($current_comments) {
     $rules->{$COMMENTS}->{text} .= $current_comments;
   }

   # now append all the found text to the output file buffers
   copy_to_output_buffers($rule_order, $issandbox, $f, $filename);

   # ok; file complete.  now mark all those rules as "seen"; future
   # refs to those rule names will trigger an autorename.
   foreach my $name (@$rule_order) {
     $seen_rules->{$name} = 1;
   }
 }

 # this is only run if we're generating rulemetadata!
 sub read_rules_from_output_dir {
   return unless ($opt_rulemetadata);

   foreach my $file (<$opt_out/*.cf>) {
     next unless ($file =~ /\d\d_\S+\.cf$/);
     next if (pubfile_is_activeout($file));
     next if (pubfile_is_sandboxout($file));
     read_output_file($file);
   }
 }

 sub read_output_file {
   my ($file) = @_;
   open (IN, "<$file") or warn "cannot read $file";
   while (<IN>) {
     my $orig = $_;

     s/#.*$//g; s/^\s+//; s/\s+$//;

     # drop comments/blank lines from output
     next if (/^$/);

     # save "lang" declarations
     my $lang = '';
     if (s/^lang\s+(\S+)\s+//) {
       $lang = $1;
     }

     if (/^(${RULE_KEYWORDS_RE})\s+(\S+)\s+(.*)$/) {
       # rule definitions
       my $type = $1;
       my $name = $2;
       my $val = $3;

       # note: we only want to do this if --rulemetadata is in use!
       if (!$rules->{$name}) { $rules->{$name} = rule_entry_create(); }

       if ($type eq 'tflags') {
         $val =~ s/\s+/ /gs;
         if ($rules->{$name}->{tflags}) {
           $rules->{$name}->{tflags} .= ' '.$val;
         } else {
           $rules->{$name}->{tflags} = $val;
         }
       }

       if ($type =~ /^${RULE_DEFINE_KEYWORDS_RE}$/x) {
         $rules->{$name}->{srcfile} = $file;
         $rules->{$name}->{code} = $orig;
       }
     }
   }
   close IN;
 }

 sub copy_to_output_buffers {
   my ($rule_order, $issandbox, $f, $filename) = @_;

   # always output these two files, even if they're empty!
   foreach my $pubfile ($opt_out.'/'.$opt_sandboxout,
                 $opt_out.'/'.$opt_activeout)
   {
     $output_files->{$pubfile} = {
       header => $default_file_header
     };
   }

   my %already_done = ();
   my $copied_active = 0;
   my $copied_other = 0;
   foreach my $name (@$rule_order)
   {
     # only do each rule once, please ;)
     next if exists $already_done{$name};
     $already_done{$name} = undef;

     my $text = $rules->{$name}->{text};
     if (!$text) {
       next;     # nothing to write!
     }

     my $srcfile = $rules->{$name}->{srcfile};
     my $pubfile = pubfile_for_rule($rules, $rules->{$name}->{origname_w_T_prefix});
     my $is_active = 0;
     if (pubfile_is_activeout($pubfile)) {
       $is_active++;
     }

     my $cond = $rules->{$name}->{cond};
     if ($cond) {
       foreach my $pluginclass (keys %{$rules->{$name}->{plugin_dependencies}}) {
         my $ifplugin_text_name = "loadplugin_".($pluginclass || "");

         if ($rules->{$ifplugin_text_name}) {
           # if the plugin is a sandbox plugin, ensure it's not
           # sent to the active file
           if ($rules->{$ifplugin_text_name}->{sandbox_plugin}) {
             $pubfile = $opt_out.'/'.$opt_sandboxout;
             $is_active = 0;
           }

           # either way, ensure the "loadplugin" line, if there is one,
           # goes to the same file
           $rules->{$ifplugin_text_name}->{output_file} = $pubfile;
         }
       }

       # ensure we produce enough "endif"s to match however many
       # nested conditions there are
       my $endifs = "endif\n" x (scalar @{$cond});

       $rules->{$name}->{output_text} = "\n"
                 .join("", @{$cond})
                 .$text
                 .$endifs;

     } else {
       $rules->{$name}->{output_text} = $text;
     }

     # note the target file
     $rules->{$name}->{output_file} = $pubfile;

     $output_files->{$pubfile} = {
       header => $default_file_header
     };

     if ($is_active) {
       $copied_active++;
     } else {
       $copied_other++;
     }
   }

   print "$f: $copied_active active rules, ".
             "$copied_other other\n";
 }

 sub pubfile_for_rule {
   my ($rules, $name) = @_;

   my $pubfile;
   if ($rules->{$name}->{publish}) {
     # "publish NAMEOFRULE" => send it to active
     $pubfile = $opt_out.'/'.$opt_activeout;
   }

   # default: "70_sandbox.cf" or "72_active.cf"
   if (!$pubfile) {
     if ($active_rules->{$name}      # is active
         || $rules->{$name}->{forceactive}   # or is forced to be
         || (!$rules->{$name}->{found_definition} && !$rules->{$name}->{iscommand}
             && !$rules->{$name}->{isscores}))
                 # or is a rule-related setting in reference to an unknown rule
                 # but isn't a generated score
     {
       $pubfile = $opt_out.'/'.$opt_activeout;
     }
     elsif ($rules->{$name}->{issandbox}) {
       $pubfile = $opt_out.'/'.$opt_sandboxout;
     }
     else {
       warn "oops? inactive rule, non-sandbox, shouldn't be possible anymore";
       $pubfile = $opt_out.'/'.$opt_sandboxout;
     }
   }
   return $pubfile;
 }

 sub plugin_file_compile {
   my ($entry) = @_;

   return if $opt_listpromotable;
   # just copy the raw perl module over to the new area
   # we can't really rename to avoid conflicts since the loadplugin lines
   # are going to be all screwed up in that case.
   # jm: we always want to update the output file in case the input
   # has been changed!
   if (0 && -e $entry->{t}) {
     warn "The perl module ".$entry->{t}." already exists, can't copy from ".$entry->{f}."\n";
   }
   else {
     copy($entry->{f}, $entry->{t}) || warn "Couldn't copy ".$entry->{f}.": $!";
   }
 }

 ###########################################################################

 sub compile_output_files {
   my $always = $rules->{$ALWAYS_PUBLISH}->{output_text};

   # create all known output files
   foreach my $file (keys %$output_files) {
     $output_file_text->{$file} = $output_files->{$file}->{header};

     if ($always && pubfile_is_activeout($file)) {
       $output_file_text->{$file} .= $always;
     }
   }

   # this is a horrible kluge.
   # at this point in the game, we've lost the ordered list of rules, so the
   # loadplugin lines have no guarantee that they'll be loaded before the rules
   # that require them.  so we kluge the sort to always have loadplugin lines
   # appear at the very top of the array so we know they'll be listed before
   # anything else.
   my @rulenames = sort {
     if ($a =~ /^loadplugin_/) {
       return -1;
     }
     elsif ($b =~ /^loadplugin_/) {
       return 1;
     }
     return $a cmp $b;
   } keys %$rules;
   my %seen = ();

   # go through the rules looking for meta subrules we
   # may have forgotten; this happens if a non-subrule is
   # listed in active.list, the subrules will not be!  fix them
   # to appear in the same output file as the master rule.
   foreach my $rule (@rulenames) {
     fix_up_rule_dependencies($rule);
   }

   # now repeat, just for rules in the active set; their dependencies should
   # always be likewise promoted into the active set, overriding the prev step.
   foreach my $rule (@rulenames) {
     my $pubfile = $rules->{$rule}->{output_file};
     next unless ($pubfile && pubfile_is_activeout($pubfile));
     fix_up_rule_dependencies($rule);
   }

   my $rulemd = '';

   # output the known rules that are not meta subrules.
   foreach my $rule (@rulenames) {
     $rulemd .= get_rulemetadata_string($rule);      # all metadata strings

     next if ($rule =~ /^__/);
     my $pubfile = $rules->{$rule}->{output_file};
     my $text    = $rules->{$rule}->{output_text};
     next unless defined ($text);

 # DOS - bug 6297 - HACK HACK HACK HACK
 # this will probably screw up meta rules that do something like '&& !$rule'

     # avoid publishing 'tflags nopublish' rules
     if (pubfile_is_activeout($pubfile) && exists $rules->{$rule}->{tflags} &&
         $rules->{$rule}->{tflags} =~ /\bnopublish\b/)
     {
       print "omitting rule $rule due to tflags nopublish (tflags $rules->{$rule}->{tflags})\n";
       next;
     }

 # DOS - END HACK

     $output_file_text->{$pubfile} .= "##{ $rule\n".
                             $text.
                             "##} ".$rule."\n\n";
   }

   # now output all subrules (in a slightly more compact form)
   foreach my $rule (@rulenames) {
     next unless ($rule =~ /^__/);
     my $pubfile = $rules->{$rule}->{output_file};
     my $text    = $rules->{$rule}->{output_text};
     next unless defined ($text);

 # DOS - bug 6297 - HACK HACK HACK HACK
 # this will probably screw up meta rules that do something like '&& !$rule'

     # avoid publishing 'tflags nopublish' rules
     if (pubfile_is_activeout($pubfile) && exists $rules->{$rule}->{tflags} &&
         $rules->{$rule}->{tflags} =~ /\bnopublish\b/)
     {
       print "omitting rule $rule due to tflags nopublish (tflags $rules->{$rule}->{tflags})\n";
       next;
     }

 # DOS - END HACK

     $output_file_text->{$pubfile} .= $text;
   }

   # finally, finish off all output files
   foreach my $file (keys %$output_files) {
     # and get them lint-checked!
     $files_to_lint->{$file} = 1;
   }

   if ($opt_rulemetadata) {
     open (RULEMD, ">".$opt_rulemetadata)
             or die "cannot write rulemd to $opt_rulemetadata";
     print RULEMD "<?xml version='1.0' encoding='UTF-8'?>\n",
                     "<rulemds>", $rulemd, "</rulemds>\n";
     close RULEMD or die "cannot close rulemd to $opt_rulemetadata";
   }
 }

 # conditionally build a method to UTF-8-encode a string.  this is only required
 # for the rulemetadata XML output, so don't make it mandatory!
 sub compile_utf8ify_function {
   if (!eval '
       sub utf8ify { use Encode; return Encode::encode("UTF-8", $_[0]); } 1;
     ')
   {
     eval '
       sub utf8ify { die "unimplemented -- Encode module required!" } 1;
     '
   }
 }

 sub get_rulemetadata_string {
   my ($rule) = @_;

   return '' unless ($opt_rulemetadata);

   my $mod = 0;
   my $srcfile = '';
   my $code = '';
   my $name = $rule;

   # if we found a rule definition with a T_ prefix, use that data
   if (!$rules->{$name}->{srcfile} && $rules->{"T_".$name}->{srcfile}) {
     $name = "T_".$name;
   }

   if ($rules->{$name}->{srcfile}) {
     $srcfile = $rules->{$name}->{srcfile};
     if ($srcfile) {
       my @s = stat $srcfile;
       if (@s) { $mod = $s[9]; }
     }
   }

   if ($rules->{$name}->{code}) {
     $code = $rules->{$name}->{code};
     $code =~ s/\]\]>/\](defanged by mkrules)\]>/gs;     # ensure it's CDATA-safe
     $code = utf8ify($code);
   }

   my $tf = $rules->{$name}->{tflags} || '';

   return "<rulemetadata>".
             "<name>$rule</name>".
             "<src>$srcfile</src>".
             "<srcmtime>$mod</srcmtime>".
             # don't include <code> blocks; they bloat up the XML badly (to 800KB)
             # and make it very slow to parse later
             # "<code><![CDATA[$code]]></code>".
             "<tf>$tf</tf>".
           "</rulemetadata>\n";
 }

 sub fix_up_rule_dependencies {
   my $rule = shift;

   my $pubfile = $rules->{$rule}->{output_file};
   my $text    = $rules->{$rule}->{output_text};
   return unless $text;

   while ($text =~ /^\s*meta\s+(.*)$/mg) {
     my $line = $1;
     while ($line =~ /\b([_A-Za-z0-9]+)\b/g) {
       # force that subrule (if it exists) to output in the
       # same pubfile
       my $rule2 = $1;

       # deal with rules that changed name from "FOO" to "T_FOO"
       sed_renamed_rule_names(\$rule2);

       if (!$entries_for_rule_name->{$rule2}) {
         # we may not always have a rule entry, if the rule was from a non-sandbox
         # source
         # warn "cannot find entries_for_rule_name '$rule2'";
       }

       foreach my $entryname2 (@{$entries_for_rule_name->{$rule2}}) {
         next unless ($rules->{$entryname2} && $rules->{$entryname2}->{output_file});

         # don't do this if the subrule would be moved *out* of the
         # active file!
         my $pubfile2 = $rules->{$entryname2}->{output_file};
         next if (pubfile_is_activeout($pubfile2));

         $rules->{$entryname2}->{output_file} = $pubfile;
       }
     }
   }
 }

 sub pubfile_is_activeout {
   return 1 if ($_[0] && $_[0] =~ /\b\Q$opt_activeout\E$/);
   return 0;
 }

 sub pubfile_is_sandboxout {
   return 1 if ($_[0] && $_[0] =~ /\b\Q$opt_sandboxout\E$/);
   return 0;
 }

 sub write_output_files {
   foreach my $pubfile (sort keys %$output_files) {
     if (-f $pubfile) {
       unlink $pubfile or die "cannot remove output file '$pubfile'";
     }

     if (!filename_in_manifest($pubfile)) {
       warn "$pubfile: WARNING: not listed in manifest file\n";
     }

     my $text = $output_file_text->{$pubfile};
     if ($text) {
       open (OUT, ">".$pubfile) or die "cannot write to output file '$pubfile'";
       sed_renamed_rule_names(\$text);
       print OUT $text;
       close OUT or die "cannot close output file '$pubfile'";
       # print "$pubfile: written\n";        # too noisy
     }
     else {
       print "$pubfile: no rules promoted\n";

       # create an empty file anyway to satisfy MANIFEST
       open (OUT, ">".$pubfile) or die "cannot write to output file '$pubfile'";
       close OUT or die "cannot close output file '$pubfile'";
     }
   }
 }

 ###########################################################################

 sub rule_entry_create {
   return {
     text => '',
     publish => 0
   };
 }

 ###########################################################################

 sub sandbox_rule_name_avoid_collisions {
   my ($rule, $path) = @_;
   my $new;
   my $newreason;
   my $dowarn = 0;

   return $rule if $opt_listpromotable;
   return $rule if $active_rules->{$rule};
   return $rule if $rules->{$rule}->{forceactive};

   if ($rule !~ /^(?:T_|__)/) {
     $new = "T_".$rule;
     $newreason = "missing T_ prefix";
   }
   elsif (!exists $seen_rules->{$rule}) {
     return $rule;
   }
   else {
     $new = $path;
     $new =~ s/[^A-Za-z0-9]+/_/gs;
     $new =~ s/_+/_/gs;
     $new =~ s/^_//;
     $new =~ s/_$//;
     $new = $rule.'_'.$new;
     $newreason = "collision with existing rule";
     $dowarn = 1;
   }

   if (!$renamed_rules->{$new}) {
     $renamed_rules->{$new} = $rule;
     if ($dowarn) {
       warn "WARNING: $rule: renamed as $new due to $newreason\n";
     }
   }

   return $new;
 }

 sub sed_renamed_rule_names {
   my ($textref) = @_;
   foreach my $new (keys %{$renamed_rules}) {
     my $rule = $renamed_rules->{$new};
     $$textref =~ s/\b${rule}\b/${new}/gs;
   }
 }

 ###########################################################################

 sub invert_conditional {
   my $cond = shift;
   if ($cond =~ /^ \s* ifplugin \s+(.*?)$ /x) {
     return "if !plugin($1)\n";
   } elsif ($cond =~ /^ \s* if \s+(.*?)$ /x) {
     return "if !($1)\n";
   } else {
     warn "WARNING: cannot parse '$cond' for 'else'\n";
     return 'if 0';
   }
 }

 ###########################################################################

 sub read_manifest {
   my ($fname) = @_;
   parse_line_delimited_config_file($fname, sub {
       /^\s*(.*?)\s*$/ and $file_manifest->{$1} = 1;
     });
 }

 sub read_manifest_skip {
   my ($fname) = @_;
   parse_line_delimited_config_file($fname, sub {
       /^\s*(.*?)\s*$/ and push (@{$file_manifest_skip}, qr/$1/);
     });
 }

 sub read_active {
   my ($fname) = @_;
   parse_line_delimited_config_file($fname, sub {
       /^(\S+)/ and $active_rules->{$1} = 1;
     });
 }

 sub filename_in_manifest {
   my ($fname) = @_;
   return 1 if ($file_manifest->{$fname});
   foreach my $skipre (@{$file_manifest_skip}) {
     return 1 if ($fname =~ $skipre);
   }
   return 0;
 }

 sub parse_line_delimited_config_file {
   my ($fname, $callback) = @_;
   if (!open (IN, "<$fname")) {
     warn "cannot read $fname\n";
   } else {
     while (<IN>) {
       next if /^#/;
       $callback->();
     }
     close IN;
   }
 }


 __DATA__
 # SpamAssassin rules file
 #
 # Please don't modify this file as your changes will be overwritten with
 # the next update. Use @@LOCAL_RULES_DIR@@/local.cf instead.
 # See 'perldoc Mail::SpamAssassin::Conf' for details.
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>
 #
 ###########################################################################

 require_version @@VERSION@@