lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm - spamassassin - Git at Google

 # <@LICENSE>
 # Copyright 2006 Apache Software Foundation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 =head1 NAME

 Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset

 =head1 SYNOPSIS

 This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
 suitable for use in Rule2XSBody rules or other parallel matching algorithms.

 =cut

 package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;

 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Util qw(untaint_var);
 use Mail::SpamAssassin::Util::Progress;

 use Errno qw(ENOENT EACCES EEXIST);
 use Data::Dumper;

 use strict;
 use warnings;
 use bytes;
 use re 'taint';

 use vars qw(@ISA);
 @ISA = qw(Mail::SpamAssassin::Plugin);

 use constant DEBUG_RE_PARSING => 0;     # noisy!

 # a few settings that control what kind of bases are output.

 # treat all rules as lowercase for purposes of term extraction?
 # $main->{bases_must_be_casei} = 1;
 # $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/
 # $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
 # $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
 # $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
 # $main->{base_quiet} = 0;      # silences progress output

 # TODO: it would be nice to have a clean API to pass such settings
 # through to plugins instead of hanging them off $main

 ##############################################################################

 # testing purposes only
 my $fixup_re_test;
 #$fixup_re_test = 1; fixup_re("fr()|\\\\|"); die;
 #$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
 #$fixup_re_test = 1; fixup_re("\\33\$b"); die;
 #$fixup_re_test = 1; fixup_re("[link]"); die;
 #$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;

 ###########################################################################

 sub new {
   my $class = shift;
   my $mailsaobject = shift;
   $class = ref($class) || $class;
   my $self = $class->SUPER::new($mailsaobject);
   bless ($self, $class);

   $self->{show_progress} = !$mailsaobject->{base_quiet};

   # $self->test(); exit;
   return $self;
 }

 ###########################################################################

 sub finish_parsing_end {
   my ($self, $params) = @_;
   my $conf = $params->{conf};
   $self->extract_bases($conf);
 }

 sub extract_bases {
   my ($self, $conf) = @_;

   my $main = $conf->{main};
   if (!$main->{base_extract}) { return; }

   $self->{show_progress} and
         info("base extraction starting.  this can take a while...");

   $self->extract_set($conf, $conf->{body_tests}, 'body');
 }

 sub extract_set {
   my ($self, $conf, $test_set, $ruletype) = @_;

   foreach my $pri (keys %{$test_set}) {
     my $nicepri = $pri; $nicepri =~ s/-/neg/g;
     $self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
   }
 }

 ###########################################################################

 sub extract_set_pri {
   my ($self, $conf, $rules, $ruletype) = @_;

   my @good_bases;
   my @failed;
   my $yes = 0;
   my $no = 0;
   my $count = 0;
   my $start = time;
   $self->{main} = $conf->{main};	# for use in extract_hints()
   $self->{show_progress} and info ("extracting from rules of type $ruletype");

   # attempt to find good "base strings" (simplified regexp subsets) for each
   # regexp.  We try looking at the regexp from both ends, since there
   # may be a good long string of text at the end of the rule.

   # require this many chars in a base string + delimiters for it to be viable
   my $min_chars = 5;

   my $progress;
   $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
                 total => (scalar keys %{$rules} || 1),
                 itemtype => 'rules',
               });

   my $cached = { };
   my $cachefile;

   if ($self->{main}->{bases_cache_dir}) {
     $cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
     $cached = $self->read_cachefile($cachefile);
   }

 NEXT_RULE:
   foreach my $name (keys %{$rules}) {
     $self->{show_progress} and $progress and $progress->update(++$count);

     my $rule = $rules->{$name};
     my $cachekey = join "#", $name, $rule;

     my $cent = $cached->{rule_bases}->{$cachekey};
     if (defined $cent) {
       if (defined $cent->{g}) {
         dbg("zoom: YES (cached) $rule");
         foreach my $ent (@{$cent->{g}}) {
           # note: we have to copy these, since otherwise later
           # modifications corrupt the cached data
           push @good_bases, {
             base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
           };
         }
         $yes++;
       }
       else {
         dbg("zoom: NO (cached) $rule");
         push @failed, { orig => $rule };    # no need to cache this
         $no++;
       }
       next NEXT_RULE;
     }

     # ignore ReplaceTags rules
     # TODO: need cleaner way to do this
     goto NO if ($conf->{rules_to_replace}->{$name});

     my ($lossy, @bases);

     eval {  # catch die()s
       my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
       ($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
       1;
     } or do {
       my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
       dbg("zoom: giving up on regexp: $eval_stat");
     };

     # if any of the extracted hints in a set are too short, the entire
     # set is invalid; this is because each set of N hints represents just
     # 1 regexp.
     my $minlen;
     foreach my $str (@bases) {
       my $len = length fixup_re($str); # bug 6143: count decoded characters
       if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
       elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }
     }

     if ($minlen && @bases) {
       # dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");

       # figure out if we have e.g. ["foo", "foob", "foobar"]; in this
       # case, we only need to track ["foo"].
       my %subsumed;
       foreach my $base1 (@bases) {
         foreach my $base2 (@bases) {
           if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
             $subsumed{$base1} = 1; # base2 is inside base1; discard the longer
           }
         }
       }

       my @forcache;
       foreach my $base (@bases) {
         next if $subsumed{$base};
         push @good_bases, {
             base => $base, orig => $rule, name => "$name,[l=$lossy]"
           };
         # *separate* copies for cache -- we modify the @good_bases entry
         push @forcache, {
             base => $base, orig => $rule, name => "$name,[l=$lossy]"
           };
       }

       $cached->{rule_bases}->{$cachekey} = { g => \@forcache };
       $yes++;
     }
     else {
 NO:
       dbg("zoom: NO $rule");
       push @failed, { orig => $rule };
       $cached->{rule_bases}->{$cachekey} = { };
       $no++;
     }
   }

   $self->{show_progress} and $progress and $progress->final();

   dbg("zoom: $ruletype: found ".(scalar @good_bases).
       " usable base strings in $yes rules, skipped $no rules");

   # NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
   # ("food" =~ "foo" / "food") will return "food".  So therefore if a pattern
   # subsumes other patterns, we need to return hits for all of them.  We also
   # need to take care of the case where multiple regexps wind up sharing the
   # same base.
   #
   # Another gotcha, an exception to the subsumption rule; if one pattern isn't
   # entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
   # returned as two hits, correctly.  So we only have to be smart about the
   # full-subsumption case; overlapping is taken care of for us, by re2c.
   #
   # TODO: there's a bug here.  Since the code in extract_hints() has been
   # modified to support more complex regexps, we can no longer simply assume
   # that if pattern A is not contained in pattern B, that means that pattern B
   # doesn't subsume it.  Consider, for example, A="foo*bar" and
   # B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
   # that without running the A RE match itself somehow against B.
   # same issue remains with:
   #
   #   "foo?bar" / "fobar"
   #   "fo(?:o|oo|)bar" / "fobar"
   #   "fo(?:o|oo)?bar" / "fobar"
   #   "fo(?:o*|baz)bar" / "fobar"
   #   "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
   #
   # it's worse with this:
   #
   #   "fo(?:o|oo|)bar" / "foo*bar"
   #
   # basically, this is impossible to compute without reimplementing most of
   # re2c, and it appears the re2c developers don't plan to offer this:
   # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203

   $conf->{base_orig}->{$ruletype} = { };
   $conf->{base_string}->{$ruletype} = { };

   $count = 0;
   $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
                 total => (scalar @good_bases || 1),
                 itemtype => 'bases',
               });

   # this bit is annoyingly O(N^2).  Rewrite the data -- the @good_bases
   # array -- into a more efficient format, using arrays and with a little
   # bit of precomputation, to go (quite a bit) faster

   my @rewritten;
   foreach my $set1 (@good_bases) {
     my $base = $set1->{base};
     next if (!$base || !$set1->{name});
     push @rewritten, [
       $base,                # 0
       $set1->{name},        # 1
       $set1->{orig},        # 2
       length $base,         # 3
       qr/\Q$base\E/,        # 4
       0                     # 5, has_multiple flag
     ];
   }
   @good_bases = @rewritten;

   foreach my $set1 (@good_bases) {
     $self->{show_progress} and $progress and $progress->update(++$count);

     my $base1 = $set1->[0]; next unless $base1;
     my $name1 = $set1->[1];
     my $orig1 = $set1->[2];
     $conf->{base_orig}->{$ruletype}->{$name1} = $orig1;
     my $len1 = $set1->[3];

     foreach my $set2 (@good_bases) {
       next if ($set1 == $set2);

       my $base2 = $set2->[0]; next unless $base2;
       my $name2 = $set2->[1];

       # clobber exact dups; this can happen if a regexp outputs the
       # same base string multiple times
       if ($base1 eq $base2 &&
           $name1 eq $name2 &&
           $orig1 eq $set2->[2])
       {
         $set2->[0] = '';       # clobber
         next;
       }

       # skip if it's too short to contain the other base string
       next if ($len1 < $set2->[3]);

       # skip if either already contains the other rule's name
       # optimize: this can only happen if the base has more than
       # one rule already attached, ie [5]
       next if ($set2->[5] && $name2 =~ /(?: |^)\Q$name1\E(?: |$)/);

       # don't use $name1 here, since another base in the set2 loop
       # may have added $name2 since we set that
       next if ($set1->[5] && $set1->[1] =~ /(?: |^)\Q$name2\E(?: |$)/);

       # and finally check to see if it *does* contain the other base string
       next if ($base1 !~ $set2->[4]);

       # base2 is just a subset of base1
       # dbg("zoom: subsuming '$base2' ($name2) into '$base1': [1]=$set1->[1] [5]=$set1->[5]");
       $set1->[1] .= " ".$name2;
       $set1->[5] = 1;
     }
   }

   # we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
   # both contain "killed" for example, pointing at different rules, which
   # the above search hasn't found.  Collapse them here with a hash
   my %bases;
   foreach my $set (@good_bases) {
     my $base = $set->[0];
     next unless $base;

     if (defined $bases{$base}) {
       $bases{$base} .= " ".$set->[1];
     } else {
       $bases{$base} = $set->[1];
     }
   }
   undef @good_bases;

   foreach my $base (keys %bases) {
     # uniq the list, since there are probably dup rules listed
     my %u;
     for my $i (split ' ', $bases{$base}) {
       next if exists $u{$i}; undef $u{$i};
     }
     $conf->{base_string}->{$ruletype}->{$base} = join ' ', sort keys %u;
   }
   $self->{show_progress} and $progress and $progress->final();

   if ($cachefile) {
     $self->write_cachefile ($cachefile, $cached);
   }

   my $elapsed = time - $start;
   $self->{show_progress} and info ("$ruletype: ".
             (scalar keys %{$conf->{base_string}->{$ruletype}}).
             " base strings extracted in $elapsed seconds\n");
 }

 ###########################################################################

 # TODO:
 # NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
 #     => should extract 'scription' somehow
 # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
 #     => should understand alternations; tricky

 sub simplify_and_qr_regexp {
   my $self = shift;
   my $rule = shift;

   my $main = $self->{main};
   $rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);

   # remove the regexp modifiers, keep for later
   my $mods = '';
   while ($rule =~ s/^\(\?([a-z]*)\)//) { $mods .= $1; }

   # modifier removal
   while ($rule =~ s/^\(\?-([a-z]*)\)//) {
     foreach my $modchar (split '', $mods) {
       $mods =~ s/$modchar//g;
     }
   }

   my $lossy = 0;

   # now: simplify aspects of the regexp.  Bear in mind that we can
   # simplify as long as we cause the regexp to become more general;
   # more hits is OK, since false positives will be discarded afterwards
   # anyway.  Simplification that causes the regexp to *not* hit
   # stuff that the "real" rule would hit, however, is a bad thing.

   if ($main->{bases_must_be_casei}) {
     $rule = lc $rule;

     $lossy = 1;
     $mods =~ s/i// and $lossy = 0;

     # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
     $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and $lossy++;

     # always case-i: /A(?-i:ct)/ => /Act/
     $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs and $lossy++;

     # remove (?i)
     $rule =~ s/\(\?i\)//gs;
   }
   else {
     die "case-i" if $rule =~ /\(\?i\)/;
     die "case-i" if $mods =~ /i/;

     # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
     $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and die "case-i";

     # we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/
     $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
   }

   # remove /m and /s modifiers
   $mods =~ s/m// and $lossy++;
   $mods =~ s/s// and $lossy++;

   # remove (^|\b)'s
   # T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is
   $rule =~ s/\(\^\|\\b\)//gs and $lossy++;
   $rule =~ s/\(\$\|\\b\)//gs and $lossy++;
   $rule =~ s/\(\\b\|\^\)//gs and $lossy++;
   $rule =~ s/\(\\b\|\$\)//gs and $lossy++;

   # remove (?!credit)
   $rule =~ s/\(\?\![^\)]+\)//gs and $lossy++;

   # remove \b's
   $rule =~ s/(?<!\\)\\b//gs and $lossy++;

   # remove the "?=" trick
   # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
   $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;

   $mods .= "L" if $lossy;
   ($rule, $mods);
 }

 sub extract_hints {
   my $self = shift;
   my $rawrule = shift;
   my $rule = shift;
   my $mods = shift;

   my $main = $self->{main};
   my $orig = $rule;

   my $lossy = 0;
   $mods =~ s/L// and $lossy++;

   # if there are anchors, give up; we can't get much
   # faster than these anyway
   die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;

   # die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;
   # just remove end-of-string anchors; they're slow so could gain
   # from our speedup
   $rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;

   # simplify (?:..) to (..)
   $main->{bases_allow_noncapture_groups} or
             $rule =~ s/\(\?:/\(/g;

   # simplify some grouping arrangements so they're easier for us to parse
   # (foo)? => (foo|)
   $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
   # r? => (r|)
   $rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;

   my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
   untaint_var(\$tmpf);

   # attempt to find a safe regexp delimiter...
   # TODO: would prob be easier to just read this from $rawrule
   my $quos = "/"; if ($rule =~ m/\Q${quos}\E/) {
     $quos = "#"; if ($rule =~ m/\Q${quos}\E/) {
       $quos = "'"; if ($rule =~ m/\Q${quos}\E/) {
         $quos = "@"; if ($rule =~ m/\Q${quos}\E/) {
           $quos = "*"; if ($rule =~ m/\Q${quos}\E/) {
             $quos = "!";
           }
         }
       }
     }
   }
   print $tmpfh "use bytes; m".$quos.$rule.$quos.$mods
     or die "error writing to $tmpf: $!";
   close $tmpfh  or die "error closing $tmpf: $!";

   my $perl = $self->get_perl();
   local *IN;
   open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |")
     or die "cannot run $perl: ".exit_status_str($?,$!);

   my($inbuf,$nread,$fullstr); $fullstr = '';
   while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
   defined $nread  or die "error reading from pipe: $!";

   close IN      or die "error closing pipe: $!";
   unlink $tmpf  or die "cannot unlink $tmpf: $!";
   defined $fullstr  or warn "empty result from a pipe";

   # now parse the -Mre=debug output.
   # perl 5.10 format
   $fullstr =~ s/^.*\nFinal program:\n//gs;
   # perl 5.6/5.8 format
   $fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
   $fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
   # common to all
   $fullstr =~ s/\nOffsets:.*$//gs;

   # clean up every other line that doesn't start with a space
   $fullstr =~ s/^\S.*$//gm;

   if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
     die "failed to parse Mre=debug output: $fullstr m".$quos.$rule.$quos.$mods." $rawrule";
   }
   my $opsstr = $1;

   # what's left looks like this:
   #    1: EXACTF <v>(3)
   #    3: ANYOF[1ILil](14)
   #   14: EXACTF <a>(16)
   #   16: CURLY {2,7}(29)
   #   18:   ANYOF[A-Za-z](0)
   #   29: SPACE(30)
   #   30: EXACTF <http://>(33)
   #   33: END(0)
   #
   DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";

   my @ops;
   foreach my $op (split(/\n/s, $opsstr)) {
     next unless $op;
     if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*)(?:\(\d+\))?$/) {
       push @ops, [ $1, $2, $3 ];
     }
     elsif ($op =~ /^      (\s*)<(.*)>\.\.\.\s*$/) {
       #    5:   TRIE-EXACT[im](44)
       #         <message contained attachments that have been blocked by guin>...
       my $spcs = $1;
       # we could use the entire length here, but it's easier to trim to
       # the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
       # suite results will match, since the sa-update --list extraction will
       # be the same for all versions.  (The "..." trailer is important btw)
       my $str = substr ($2, 0, 55);
       push @ops, [ $spcs, '_moretrie', "<$str...>" ];
     }
     elsif ($op =~ /^      (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
       #    5:   TRIE-EXACT[am](21)
       #         <am> (21)
       #         <might> (12)
       push @ops, [ $1, '_moretrie', $2 ];
     }
     elsif ($op =~ /^ at .+ line \d+$/) {
       next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
     }
     else {
       warn "cannot parse '$op': $opsstr";
       next;
     }
   }

   # unroll the branches; returns a list of versions.
   # e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]
   my @unrolled;
   if ($main->{bases_split_out_alternations}) {
     @unrolled = $self->unroll_branches(0, \@ops);
   } else {
     @unrolled = ( \@ops );
   }

   # now find the longest DFA-friendly string in each unrolled version
   my @longests;
   foreach my $opsarray (@unrolled) {
     my $longestexact = '';
     my $buf = '';

     # use a closure to keep the code succinct
     my $add_candidate = sub {
       if (length $buf > length $longestexact) { $longestexact = $buf; }
       $buf = '';
     };

     my $prevop;
     foreach my $op (@{$opsarray}) {
       my ($spcs, $item, $args) = @{$op};

       next if ($item eq 'NOTHING');

       # EXACT == case-sensitive
       # EXACTF == case-i
       # we can do both, since we canonicalize to lc.
       if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
       {
         my $str = $1;
         $buf .= $str;
         if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
           # a high Unicode codepoint, interpreted by perl 5.8.x.  cut and stop
           $add_candidate->();
         }
         if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
           # perl 5.8.x truncates with a "..." here!  cut and stop
           $add_candidate->();
         }
       }
       # _moretrie == a TRIE-EXACT entry
       elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
       {
         $buf .= $1;
         if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
           # perl 5.8.x truncates with a "..." here!  cut and stop
           $add_candidate->();
         }
       }
       # /(?:foo|bar|baz){2}/ results in a CURLYX beforehand
       elsif ($item =~ /^EXACT/ &&
           $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
                     $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
           $args =~ /<(.*)>/)
       {
         $buf .= $1;
         if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
           # perl 5.8.x truncates with a "..." here!  cut and stop
           $add_candidate->();
         }
       }
       # CURLYX, for perl >= 5.9.5
       elsif ($item =~ /^_moretrie/ &&
           $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
                     $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
           $args =~ /<(.*)>/)
       {
         $buf .= $1;
         if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
           # perl 5.8.x truncates with a "..." here!  cut and stop
           $add_candidate->();
         }
       }
       else {
         # not an /^EXACT/; clear the buffer
         $add_candidate->();
         if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)
         {
           $lossy = 1;
           DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
         }
       }
       $prevop = $op;
     }
     $add_candidate->();

     if (!$longestexact) {
       die "no long-enough string found in $rawrule";
       # all unrolled versions must have a long string, otherwise
       # we cannot reliably match all variants of the rule
     } else {
       push @longests, ($main->{bases_must_be_casei}) ?
                             lc $longestexact : $longestexact;
     }
   }

   DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
   return ($lossy, @longests);
 }

 ###########################################################################

 sub unroll_branches {
   my ($self, $depth, $opslist) = @_;

   die "too deep" if ($depth++ > 5);

   my @ops = (@{$opslist});      # copy
   my @pre_branch_ops;
   my $branch_spcs;
   my $trie_spcs;
   my $open_spcs;

 # our input looks something like this 2-level structure:
 #  1: BOUND(2)
 #  2: EXACT <Dear >(5)
 #  5: BRANCH(9)
 #  6:   EXACT <IT>(8)
 #  8:   NALNUM(24)
 #  9: BRANCH(23)
 # 10:   EXACT <Int>(12)
 # 12:   BRANCH(14)
 # 13:     NOTHING(21)
 # 14:   BRANCH(17)
 # 15:     EXACT <a>(21)
 # 17:   BRANCH(20)
 # 18:     EXACT <er>(21)
 # 20:   TAIL(21)
 # 21:   EXACT <net>(24)
 # 23: TAIL(24)
 # 24: EXACT < shop>(27)
 # 27: END(0)
 #
 # or:
 #
 #  1: OPEN1(3)
 #  3:   BRANCH(6)
 #  4:     EXACT <v>(9)
 #  6:   BRANCH(9)
 #  7:     EXACT <\\/>(9)
 #  9: CLOSE1(11)
 # 11: CURLY {2,5}(14)
 # 13:   REG_ANY(0)
 # 14: EXACT < g r a >(17)
 # 17: ANYOF[a-z](28)
 # 28: END(0)
 #
 # or:
 #
 #  1: EXACT <i >(3)
 #  3: OPEN1(5)
 #  5:   TRIE-EXACT[am](21)
 #       <am> (21)
 #       <might> (12)
 # 12:     OPEN2(14)
 # 14:       TRIE-EXACT[ ](19)
 #           < be>
 #           <>
 # 19:     CLOSE2(21)
 # 21: CLOSE1(23)
 # 23: EXACT < c>(25)

   DEBUG_RE_PARSING and warn "starting parse";

   # this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform
   # it into the latter.  bit of a kludge to do this before the loop, but hey.
   # note that it doesn't fix the CLOSE1/END ordering to be correct
   if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
     my @newops = ([ "", "OPEN1", "" ]);
     foreach my $op (@ops) {
       push @newops, [ "  ".$op->[0], $op->[1], $op->[2] ];
     }
     push @newops, [ "", "CLOSE1", "" ];
     @ops = @newops;
   }

   # iterate until we start a branch set. using
   # /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."
   # just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
   # or a BRANCH, *then* unroll the most recent OPEN set.
   while (1) {
     my $op = shift @ops;
     last unless defined $op;

     my ($spcs, $item, $args) = @{$op};
     DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";

     if ($item =~ /^OPEN/) {
       $open_spcs = $spcs;
       next;         # next will be a BRANCH or TRIE

     } elsif ($item =~ /^TRIE/) {
       $trie_spcs = $spcs;
       last;

     } elsif ($item =~ /^BRANCH/) {
       $branch_spcs = $spcs;
       last;

     } elsif ($item =~ /^EXACT/ && defined $open_spcs) {
       # perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT
       push @pre_branch_ops, [ $open_spcs, $item, $args ];
       next;

     } elsif (defined $open_spcs) {
       # OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
       # ignore this OPEN block entirely and don't try to unroll it
       undef $open_spcs;

     } else {
       push @pre_branch_ops, $op;
     }
   }

   # no branches found?  we're done unrolling on this one!
   if (scalar @ops == 0) {
     return [ @pre_branch_ops ];
   }

   # otherwise we're at the start of a new branch set
   # /(foo|bar(baz|argh)boo)gab/
   my @alts;
   my @in_this_branch;

   DEBUG_RE_PARSING and warn "entering branch: ".
         "open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
         "branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
         "trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";

   # indentation level to remove from "normal" ops (using a s///)
   my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")."  ";
   my $trie_sub_spcs = "";
   while (1) {
     my $op = shift @ops;
     last unless defined $op;
     my ($spcs, $item, $args) = @{$op};
     DEBUG_RE_PARSING and warn "in:  [$spcs] $item $args";

     if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) {  # alt
       push @alts, [ @pre_branch_ops, @in_this_branch ];
       @in_this_branch = ();
       $open_sub_spcs = $branch_spcs."  ";
       $trie_sub_spcs = "";
       next;
     }
     elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
       push @alts, [ @pre_branch_ops, @in_this_branch ];
       undef $branch_spcs;
       $open_sub_spcs = "";
       $trie_sub_spcs = "";
       last;
     }
     elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
       if (scalar @in_this_branch > 0) {
         push @alts, [ @pre_branch_ops, @in_this_branch ];
       }
       # use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
       @in_this_branch = ( [ $open_spcs, $item, $args ] );
       $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")."  ";
       $trie_sub_spcs = "  ";
       next;
     }
     elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) {   # end
       push @alts, [ @pre_branch_ops, @in_this_branch ];
       undef $branch_spcs;
       undef $open_spcs;
       undef $trie_spcs;
       $open_sub_spcs = "";
       $trie_sub_spcs = "";
       last;
     }
     elsif ($item eq 'END') {  # of string
       push @alts, [ @pre_branch_ops, @in_this_branch ];
       undef $branch_spcs;
       undef $open_spcs;
       undef $trie_spcs;
       $open_sub_spcs = "";
       $trie_sub_spcs = "";
       last;
     }
     else {
       if ($open_sub_spcs) {
         # deindent the space-level to match the opening brace
         $spcs =~ s/^$open_sub_spcs//;
         # tries also add one more indent level in
         $spcs =~ s/^$trie_sub_spcs//;
       }
       push @in_this_branch, [ $spcs, $item, $args ];
       # note that we ignore ops at a deeper $spcs level entirely (until later!)
     }
   }

   if (defined $branch_spcs) {
     die "fell off end of string with a branch open: '$branch_spcs'";
   }

   # we're now after the branch set: /gab/
   # @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]
   foreach my $alt (@alts) {
     push @{$alt}, @ops;     # add all remaining ops to each one
     # note that this could include more (?:...); we don't care, since
     # those can be handled by recursing
   }

   # ok, parsed the entire ops list
   # @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]

   if (DEBUG_RE_PARSING) {
     print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
   }

   # now recurse, to unroll the remaining branches (if any exist)
   my @rets;
   foreach my $alt (@alts) {
     push @rets, $self->unroll_branches($depth, $alt);
   }

   if (DEBUG_RE_PARSING) {
     print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
   }

   return @rets;
 }

 ###########################################################################

 sub test {
   my ($self) = @_;

   $self->test_split_alt("foo", "/foo/");
   $self->test_split_alt("(foo)", "/foo/");
   $self->test_split_alt("foo(bar)baz", "/foobarbaz/");
   $self->test_split_alt("x(foo|)", "/xfoo/ /x/");
   $self->test_split_alt("fo(o|)", "/foo/ /fo/");
   $self->test_split_alt("(foo|bar)", "/foo/ /bar/");
   $self->test_split_alt("foo|bar", "/foo/ /bar/");
   $self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");
   $self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
   $self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");
   $self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
 }

 sub test_split_alt {
   my ($self, $in, $out) = @_;

   my @got = $self->split_alt($in);
   $out =~ s/^\///;
   $out =~ s/\/$//;
   my @want = split(/\/ \//, $out);

   my $failed = 0;
   if (scalar @want != scalar @got) {
     warn "FAIL: results count don't match";
     $failed++;
   }
   else {
     my %got = map { $_ => 1 } @got;
     foreach my $w (@want) {
       if (!$got{$w}) {
         warn "FAIL: '$w' not found";
         $failed++;
       }
     }
   }

   if ($failed) {
     print "want: /".join('/ /', @want)."/\n"  or die "error writing: $!";
     print "got:  /".join('/ /', @got)."/\n"   or die "error writing: $!";
     return 0;
   } else {
     print "ok\n"  or die "error writing: $!";
     return 1;
   }
 }

 ###########################################################################

 sub get_perl {
   my ($self) = @_;
   my $perl;

   # allow user override of the perl interpreter to use when
   # extracting base strings.
   # TODO: expose this via sa-compile command-line option
   my $fromconf = $self->{main}->{conf}->{re_parser_perl};

   if ($fromconf) {
     $perl = $fromconf;
   } elsif ($^X =~ m|^/|) {
     $perl = $^X;
   } else {
     use Config;
     $perl = $Config{perlpath};
     $perl =~ s|/[^/]*$|/$^X|;
   }
   untaint_var(\$perl);
   return $perl;
 }

 ###########################################################################

 sub read_cachefile {
   my ($self, $cachefile) = @_;
   local *IN;
   if (open(IN, "<".$cachefile)) {
     my($inbuf,$nread,$str); $str = '';
     while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }
     defined $nread  or die "error reading from $cachefile: $!";
     close IN  or die "error closing $cachefile: $!";

     untaint_var(\$str);
     my $VAR1;              # Data::Dumper
     if (eval $str) {
       return $VAR1;        # Data::Dumper's naming
     }
   }
   return { };
 }

 sub write_cachefile {
   my ($self, $cachefile, $cached) = @_;

   my $dump = Data::Dumper->new ([ $cached ]);
   $dump->Deepcopy(1);
   $dump->Purity(1);
   $dump->Indent(1);
   if (mkdir($self->{main}->{bases_cache_dir})) {
     # successfully created
   } elsif ($! == EEXIST) {
     dbg("zoom: ok, cache directory already existed");
   } else {
     warn "cannot create a directory: $!";
   }
   open(CACHE, ">$cachefile")  or warn "cannot write to $cachefile";
   print CACHE ($dump->Dump, ";1;")  or die "error writing: $!";
   close CACHE  or die "error closing $cachefile: $!";
 }

 =item my ($cleanregexp) = fixup_re($regexp);

 Converts encoded characters in a regular expression pattern into their
 equivalent characters

 =cut

 sub fixup_re {
   my $re = shift;

   if ($fixup_re_test) { print "INPUT: /$re/\n"  or die "error writing: $!" }

   my $output = "";
   my $TOK = qr([\"\\]);

   my $STATE;
   local ($1,$2);
   while ($re =~ /\G(.*?)($TOK)/gc) {
     my $pre = $1;
     my $tok = $2;

     if (length($pre)) {
       $output .= "\"$pre\"";
     }

     if ($tok eq '"') {
       $output .= '"\\""';
     }
     elsif ($tok eq '\\') {
       $re =~ /\G(x\{[^\}]+\}|\d+|.)/gc or die "\\ at end of string!";
       my $esc = $1;
       if ($esc eq '"') {
         $output .= '"\\""';
       } elsif ($esc eq '\\') {
         $output .= '"**BACKSLASH**"';   # avoid hairy escape-parsing
       } elsif ($esc =~ /^x\{(\S+)\}$/) {
         $output .= '"'.chr(hex($1)).'"';
       } elsif ($esc =~ /^\d+/) {
         $output .= '"'.chr(oct($esc)).'"';
       } else {
         $output .= "\"$esc\"";
       }
     }
     else {
       print "PRE: $pre\nTOK: $tok\n"  or die "error writing: $!";
     }
   }

   if (!defined(pos($re))) {
     # no matches
     $output .= "\"$re\"";
   }
   elsif (pos($re) <= length($re)) {
     $output .= fixup_re(substr($re, pos($re)));
   }

   $output =~ s/^""/"/;  # protect start and end quotes
   $output =~ s/(?<!\\)""$/"/;
   $output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
   $output =~ s/\*\*BACKSLASH\*\*/\\\\/gs;

   if ($fixup_re_test) { print "OUTPUT: $output\n"  or die "error writing: $!" }
   return $output;
 }

 1;