masses/rule-dev/seek-phrases-in-log - spamassassin - Git at Google

 #!/usr/bin/perl

 =head1 NAME

 seek-phrases-in-log - extract good-looking rules from a text-dump mc log

 =cut

 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 # ---------------------------------------------------------------------------

 sub usage {
   die "
 usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n] [ --maxreqpatlength n]
    [--rules] [--ruletype 'type'] [--ruleprefix FOO]
    [--maxtextread n] --ham hamlog --spam spamlog

 --reqhitrate: percentage hit-rate against spam required (default: 0.5)
    (multiple values can be specified, separated by spaces)
 --reqpatlength: required pattern length, in characters (default: 0)
 --maxreqpatlength: maximum pattern length, in characters (default: 1024)
 --maxtextread: bytes of message text examined (default: 32768)
 --rules: generate SpamAssassin rule output (default: 0)
 --ruleprefix: specify prefix string for rules (default: 'SEEK_')
 --ruletype 'type': generate rules of type: 'header', 'body'
 ";
 }

 # ---------------------------------------------------------------------------

 use warnings;
 use strict;
 use Getopt::Long qw(:config no_ignore_case);
 use Data::Dumper;
 use Digest::SHA qw(sha1_base64);

 sub logmsg;

 my %opt = ();
 $opt{reqhitrate} = 0.5;
 $opt{reqpatlength} = 0;
 $opt{maxreqpatlength} = 1024;
 $opt{maxtextread} = 32768;
 $opt{rules} = 0;
 $opt{ruleprefix} = 'SEEK_';
 $opt{ruletype} = 'body';

 my $fs;
 my $fh;
 my @files = ();
 GetOptions(
         "rules" => \$opt{rules},
         "ruleprefix=s" => \$opt{ruleprefix},
         "reqhitrate=s" => \$opt{reqhitrate},
         "reqpatlength=s" => \$opt{reqpatlength},
         "maxreqpatlength=s" => \$opt{maxreqpatlength},
         "ruletype=s" => \$opt{ruletype},
         "maxtextread=s" => \$opt{maxtextread},
         "phase2=s" => \$opt{phase2},
         "ham=s" => \$fh,
         "spam=s" => \$fs,
         'help' => \&usage
 ) or usage();

 usage() unless (($fs && $fh) || $opt{phase2});

 my @hitratethresholds = ();
 if ($opt{reqhitrate} =~ /\S\s+\S/) {        # multiple values
   @hitratethresholds = reverse sort {$a<=>$b} split (' ', $opt{reqhitrate});
   $opt{reqhitrate} = pop @hitratethresholds;  # the lowest
   # @hitratethresholds is now sorted, highest to lowest
 }

 # n-gram reading state
 my %word2sym = ('' => '');
 my %sym2word = ('' => '');
 my $sym_acc = 'a';      # symbols are represented using IDs from this counter
 my $msgcount = 0;

 # these are shared with the assembly stage (via $asmstate)
 my @text_string = ();
 my %ngram_count = ();
 my %msg_subset_hit = ();
 my $asmstate;

 if ($opt{phase2}) {
   $asmstate = load_state($opt{phase2});
 }
 else {
   logmsg "reading $fs...";
   open IN, "<$fs" or die "cannot open spam log $fs";
   while (<IN>) {
     /^text: (.*)$/ and proc_text_spam($1);
   }
   close IN;

   # only do this if we have read enough spam messages, otherwise we could
   # discard tokens for which reqhitrate has been achieved
   if ($msgcount > 2 * (100 / $opt{reqhitrate})) {
     discard_hapaxes();
   }

   logmsg "n-grams active: ".(scalar keys %ngram_count);

   logmsg "reading $fh...";
   open IN, "<$fh" or die "cannot open ham log $fh";
   while (<IN>) {
     /^text: (.*)$/ and proc_text_ham($1);
   }
   close IN;
   logmsg "n-grams active: ".(scalar keys %ngram_count);

   # move onto the next step; assembly.  free stuff we no longer need here
   undef %word2sym;     # free this, no longer needed
   # create the assembly state object
   $asmstate = { };

   filter_into_message_subsets();
   undef %sym2word;     # no longer needed after that
   write_state($asmstate, "assemble.state");
 }

 assemble_regexps();
 exit;

 # ---------------------------------------------------------------------------
 # PHASE 1: PARSING, NGRAM PROBABILITIES

 sub proc_text_spam {
   my ($text) = @_;

   # we only need to chop off the end of spam samples
   if (length($text) > $opt{maxtextread}) {
     $text = substr $text, 0, $opt{maxtextread};      # chop!
   }

   $text =~ s/  +/ /gs;                  # single spaces, please

   # we only need to save spam samples in memory, ignore ham samples
   push @text_string, $text;

   my $cp = pack "l", $msgcount;
   $msgcount++;

   (($msgcount % 1000) == 999) and discard_hapaxes();

   my %tokens = ();
   foreach my $line (split(/\[p\]/, $text)) {
     my $w1 = '';
     my $w2 = '';
     my $w3 = '';

     foreach my $w (split(' ', $line)) {
       # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }

       $w3 = $w2;
       $w2 = $w1;

       $w1 = $word2sym{$w};
       if (!$w1) {
         $word2sym{$w} = $w1 = $sym_acc;
         $sym2word{$sym_acc} = $w;
         $sym_acc++;
       }

       # simple bayesian N-grams to start
       $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
     }

     # deal with leftovers
     if ($w2 && $w1) {
       $tokens{"$w2.$w1"} = 1;
     }
   }

   foreach my $tok (keys %tokens) {
     #warn "JMD adding $tok => ".decode_sym2words($tok);
     # incr the counter for this token
     $ngram_count{$tok}++;
     $msg_subset_hit{$tok} .= $cp;    # the message subset hit by this tok
   }
 }

 sub discard_hapaxes {
   my $before = (scalar keys %ngram_count);
   foreach my $tok (keys %ngram_count) {
     if ($ngram_count{$tok} <= 1) {
       delete $ngram_count{$tok};
       delete $msg_subset_hit{$tok};
     }
   }
   my $after = (scalar keys %ngram_count);
   my $killed = ($before - $after);
   logmsg "shrunk dbs: $killed hapaxes killed, kept $after entries";
 }

 sub proc_text_ham {
   my ($text) = @_;

   my %tokens = ();
   foreach my $line (split(/\[p\]/, $text)) {
     my $w1 = '';
     my $w2 = '';
     my $w3 = '';

     foreach my $w (split(' ', $line)) {
       $w3 = $w2;
       $w2 = $w1;

       $w1 = $word2sym{$w};
       if (!$w1) {
         # since we're deleting, there's no need to add new words
         # to the dictionary; just use the final $sym_acc to mean
         # "unknown ham word", and don't increment it
         $w1 = $sym_acc;
       }

       $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = $tokens{".$w3"} = 1;
     }

     # deal with leftovers
     if ($w2 && $w1) {
       $tokens{"$w2.$w1"} = 1;
     }
   }

   foreach my $tok (keys %tokens) {
     # we're not tracking hits; we're killing false positives.
     # as soon as a single FP appears, kill all record of that token,
     # it cannot be used
     remove_fp_ing_token($tok);
   }
 }

 sub remove_fp_ing_token {
   my $tok = shift;
   #warn "JMD removing $tok => ".decode_sym2words($tok);
   delete $ngram_count{$tok};
   delete $msg_subset_hit{$tok};
   delete $ngram_count{".".$tok};
   delete $msg_subset_hit{".".$tok};
   delete $ngram_count{"..".$tok};
   delete $msg_subset_hit{"..".$tok};
 }

 sub filter_into_message_subsets {
   logmsg "filtering into message subsets...";

   $asmstate = { };

   # hash all msg_subset_hit lists; we don't need the full data, so this
   # saves space
   foreach my $id (keys %msg_subset_hit) {
     $msg_subset_hit{$id} = unpack("%32C*", $msg_subset_hit{$id});
   }

   # note: we don't care about stuff that hits *any* ham at all
   $asmstate->{msg_count_spam} = scalar @text_string;
   $asmstate->{msg_count_spam} ||= 0.000001;

   $asmstate->{all_patterns_for_set} = { };

   foreach my $id (keys %ngram_count) {
     my $count = $ngram_count{$id};
     my $bad;

     # must occur more than once!
     if (!defined $count || $count <= 1) {
       $bad++;
     }
     # require N% spam hits
     elsif (($count*100) / $asmstate->{msg_count_spam} < $opt{reqhitrate}) {
       $bad++;
     }

     if ($bad) {
       # we don't need to remember anything about this pattern after here
       remove_fp_ing_token($id);
       next;
     }

     my $set = $msg_subset_hit{$id};
     push @{$asmstate->{all_patterns_for_set}->{$set}}, decode_sym2words($id);
   }

   logmsg "message subsets found: ".(scalar
                                 keys %{$asmstate->{all_patterns_for_set}});

   $asmstate->{ngram_count} = \%ngram_count;
   $asmstate->{msg_subset_hit} = \%msg_subset_hit;
   $asmstate->{text_string} = \@text_string;
 }

 sub decode_sym2words {
   my $ids = shift;
   my $r;
   if ($ids =~ /^([^.]*)\.([^.]*)\.([^.]*)$/) {
     $r = "$sym2word{$1} $sym2word{$2} $sym2word{$3}";
   }
   elsif ($ids =~ /^([^.]*)\.([^.]*)$/) {
     $r = "$sym2word{$1} $sym2word{$2}";
   }
   else {
     warn "bad decode_sym2words format: '$ids'";
   }
   $r =~ s/^\s+//;
   return $r;
 }

 sub write_state {
   my ($state, $statefile) = @_;
   my $dump = Data::Dumper->new ([ $state ]);
   $dump->Deepcopy(1);
   $dump->Purity(1);
   $dump->Indent(1);
   my $text = $dump->Dump.";1;";
   open (OUT, ">$statefile") or die "cannot write $statefile";
   print OUT $text;
   close OUT or die "cannot write $statefile";
 }

 # ---------------------------------------------------------------------------
 # PHASE 2: REGEXP ASSEMBLY

 sub load_state {
   my $file = shift;
   if (open(IN, "<".$file)) {
     my $str = join("", <IN>); close IN;
     my $VAR1;                 # Data::Dumper
     if (eval $str) {
       return $VAR1;        # Data::Dumper's naming
     }
   }
   die "failed to load state from $file";
 }

 sub assemble_regexps {
   my %done_set = ();
   my @done_pats = ();

   logmsg "deduping and assembling regexps...";
   if (!$opt{rules}) {
     printf ("%6s  %6s  %6s   %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
   }

   $| = 1;
   my $count = 0;
   my $count_out = 0;
   foreach my $id (sort {
               $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
           } keys %{$asmstate->{ngram_count}})
   {
     my $set = $asmstate->{msg_subset_hit}->{$id};
     next if $done_set{$set}; $done_set{$set}++;

     if ($count++ % 200 == 0) {
       logmsg "working on message subset $count ($count_out)...";
     }

     # we now have several patterns.  see if we can expand them sideways
     # to make the pattern bigger, and collapse into a smaller number of
     # pats at the same time
     my @pats = collapse_pats($asmstate->{all_patterns_for_set}->{$set});
     # my @pats = @{$asmstate->{all_patterns_for_set}->{$set}};

     @pats = quote_all_metas(@pats);
     @pats = subsume_with_dotstars(@pats);
     @pats = expand_with_dots(@pats);
     @pats = ensure_reqpatlength(@pats);
     # at this point, patterns are all valid regexps

     # now check to see if any of these pats have been subsumed in an
     # already-output pattern (one with more hits!)
     my @pats_new = ();
     foreach my $pat (@pats) {
       my $subsumed = 0;
       foreach my $done (@done_pats, @pats_new) {
         # pattern == existing pattern, or existing pattern is contained by
         # pattern, or pattern is contained in existing pattern
         if ($pat eq $done || $pat =~ /\Q${done}\E/ || $done =~ /\Q${pat}\E/) {
           $subsumed=1;
           last;
         }
         # or one pattern contains the other (but interpreted as a regexp!)
         # this deals with /foo.{0,10} bar/ vs /foo ish bar/
         if ($pat =~ /$done/) {
           $subsumed=1;
           last;
         }
         if ($done =~ /$pat/) {
           $subsumed=1;
           last;
         }
       }
       next if $subsumed;

       # add to the current list
       push @pats_new, $pat;
     }

     @pats = @pats_new;
     push @done_pats, @pats_new;

     # if we have no non-subsumed pats at this stage, skip this set
     next unless @pats;

     if (($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} <
             $opt{reqhitrate})
     {
       # quit if we hit the final hitrate threshold
       return;
     }

     if (defined($hitratethresholds[0]) &&
         ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} < $hitratethresholds[0])
     {
       print "# passed hit-rate threshold: $hitratethresholds[0]\n";
       shift @hitratethresholds;
     }

     if ($opt{rules}) {
       printf "# %6.3f  %6.3f  %6.3f\n",
         1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0;

       # sort, to ensure ordering always remains the same
       foreach my $pat (sort @pats) {
         my $name = generate_rule_name($pat);

         if ($opt{ruletype} eq 'header') {
           # deal with header-specific munging.
           # "\[\\n\]" is the result of "[\n]", at this stage
           $pat =~ s/\Q\[\\n\]\E/\\n/gs;
           $pat =~ s/\Q\[\\t\]\E/\\t/gs;
         }

         if ($opt{ruletype} eq 'body') {
           $pat =~ s/^\s/\\s/;
         }

         if ($opt{ruletype} eq 'header') {
           print "$opt{ruletype} $opt{ruleprefix}${name}  ALL =~ /$pat/\n";
         } else {
           print "$opt{ruletype} $opt{ruleprefix}${name}  /$pat/\n";
         }
         $count_out++;
       }

     } else {
       my $pats = '/'.join ('/, /', map { s/\//\\\//gs; $_; } @pats).'/';
       printf "%6.3f  %6.3f  %6.3f  %s\n",
         1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0, $pats;
       $count_out++;
     }
   }
 }

 sub collapse_pats {
   my $pataryref = $_[0];
   my @ret = ();

   while (1) {
     my $pat = shift(@{$pataryref});
     last unless defined($pat);

     #warn "JMD collapse [$pat]";
     $pat =~ s/^\s+//;

     # TODO: optimise, second-slowest line
     my @hits = grep /\Q$pat\E/, @{$asmstate->{text_string}};

     if (scalar @hits == 0) {
       warn "supposed pattern /$pat/ is 0-hitter";
       warn "JMD strings:\n  ".join("\n  ", @{$asmstate->{text_string}})."\n";
       push @ret, "[BROKEN: 0 hitter]$pat";
       next;
     }

     # we don't have all day!
     my $pat_maxlen = $opt{maxtextread};
     my $s = $hits[0];

     # Now, expand the pattern using a BLAST-style algorithm

     # expand towards start of string
     while (1) {
       my $l = length($pat);
       last if ($l > $pat_maxlen);     # too long

       my $found;
       if ($s =~ /(.)\Q$pat\E/s && $s !~ /\[p\]\Q$pat\E/s) { $found = $1; }

       if (!defined $found) {
         # start of string.  break
         last;
       }

       # give up if there are a differing number of hits for the new pat
       my $newpat = $found.$pat;

       # note: we can search just in @hits, instead of in
       # @{$asmstate->{text_string}}, because there's no way we can
       # *increase* the hitrate in the corpus; we can only reduce it.
       # this is double-checked after these 2 expansion loops anyway
       if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }

       $pat = $newpat;     # and carry on
     }
     # warn "JMD $pat";

     # expand towards end of string
     while (1) {
       if (length($pat) > $pat_maxlen
          || $s =~ /\Q$pat\E\[p\]/s
          || $s !~ /\Q$pat\E(.)/s)
       {
         # end of string.  break
         last;
       }

       my $newpat = $pat.$1;
       if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }

       $pat = $newpat;     # and carry on
     }

     # double-check to ensure we haven't somehow INCREASED our hitrate
     # beyond the initial pattern
     if (scalar (grep /\Q$pat\E/, @{$asmstate->{text_string}}) != scalar @hits) {
       die "oops! went too far";
     }

     # now remove subsumed patterns
     @{$pataryref} = grep { $pat !~ /\Q$_\E/s } @{$pataryref};

     # warn "JMD $pat";
     # skip recording this if it's already inside one of the results
     next if grep { $_ =~ /\Q$pat\E/s } @ret;

     # also, remove cases where this pattern contains previous results
     @ret = grep { $pat !~ /\Q$_\E/s } @ret;

     # warn "JMD $pat";
     push (@ret, $pat);
   }

   return @ret;
 }

 sub quote_all_metas {
   return map {
     s/([!-+\`\?\^\~\\\/\|\.\{\}\(\)\[\]\@])/\\$1/gs;
     $_;
   } @_;
 }

 sub subsume_with_dotstars {
   my @working = @_;

   # attempt to resolve . and .*
   my @ret = ();

   while (1) {
     my $p1 = shift(@working);
     last unless defined($p1);

     # TODO: optimise, 4th-slowest line
     my @hits = grep /$p1/, @{$asmstate->{text_string}};

     if (scalar @hits == 0) {
       warn "supposed pattern /$p1/ is 0-hitter";
       push @ret, "[BROKEN: 0 hitter]$p1";
       next;
     }

     foreach my $p2 (@working) {
       next if ($p1 eq $p2);

       DOTLOOP: for my $dotstar
             ( ".", ".?", ".{0,3}", ".{0,5}", ".{0,20}", ".{0,40}" )
       {
         my $newpatcapture = $p1."(".$dotstar.")".$p2;
         my $newpat        = $p1.$dotstar.$p2;

         # drop the attempt if any of the existing hits becomes a miss,
         # or the .* includes a paragraph boundary in any of the hits
         foreach my $t (@hits) {
           next DOTLOOP if ($t !~ /$newpatcapture/);
           next DOTLOOP if ($1 =~ /\[p\]/);
         }
         next if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) != scalar @hits);

         # it works! yay

         # TODO: for /./, see if we can capture the options and
         # construct a /[oO0]/ char class instead
         $p1 = $newpat;
       }
     }

     # skip recording this if it's already inside one of the results
     next if grep { $_ =~ /\Q$p1\E/s } @ret;

     push (@ret, $p1);
   }
   return @ret;
 }

 sub expand_with_dots {
   my @working = @_;
   my @ret = ();

   my $pat_maxlen = $opt{maxtextread};
   while (1) {
     my $p1 = shift(@working);
     last unless defined($p1);

     # TODO: optimise, 5th-slowest line
     my @hits = grep /$p1/, @{$asmstate->{text_string}};

     if (scalar @hits == 0) {
       warn "supposed pattern /$p1/ is 0-hitter";
       push @ret, "[BROKEN: 0 hitter]$p1";
       next;
     }

     my $s = $hits[0];

     # expand towards end of string (with .)
     while (1) {
       last if (length($p1) >= $pat_maxlen);

       if ($s !~ /$p1\[p\]/s && $s =~ /$p1(.)/s)
       {
         my $extn = $1; quote_all_metas($extn);
         my $newpat = $p1.$extn;

         # TODO: optimise, 3rd-slowest line
         if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
           $p1 = $newpat; next;
         }
       }

       if ($s !~ /$p1.?\[p\]/s && $s =~ /$p1[^\\](.)/s)
       {
         my $extn = $1; quote_all_metas($extn);
         my $newpat = $p1.".".$extn;
         if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
           $p1 = $newpat; next;
         }
       }

       if ($s !~ /$p1.{0,2}\[p\]/s && $s =~ /$p1[^\\][^\\](.)/s)
       {
         my $extn = $1; quote_all_metas($extn);
         my $newpat = $p1."..".$extn;
         if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
           $p1 = $newpat; next;
         }
       }

       last;
     }

     push (@ret, $p1);
   }

   @ret;
 }

 sub ensure_reqpatlength {
   my @ret = @_;
   if ($opt{reqpatlength}) {
     @ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
     return () unless @ret;
   }
   return @ret;
 }

 sub generate_rule_name {
   my $str = shift;
   $str = sha1_base64($str);
   $str =~ s/^(.{6}).*$/$1/gs;
   $str =~ tr/a-z/A-Z/;
   $str =~ s/[^A-Z0-9]/_/gs;
   return $str;
 }

 # ---------------------------------------------------------------------------
 # used by all phases

 sub logmsg {
   warn "".(scalar localtime time).": $_[0]\n";
 }
	#!/usr/bin/perl

	=head1 NAME

	seek-phrases-in-log - extract good-looking rules from a text-dump mc log

	=cut

	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	# ---------------------------------------------------------------------------

	sub usage {
	die "
	usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n] [ --maxreqpatlength n]
	[--rules] [--ruletype 'type'] [--ruleprefix FOO]
	[--maxtextread n] --ham hamlog --spam spamlog

	--reqhitrate: percentage hit-rate against spam required (default: 0.5)
	(multiple values can be specified, separated by spaces)
	--reqpatlength: required pattern length, in characters (default: 0)
	--maxreqpatlength: maximum pattern length, in characters (default: 1024)
	--maxtextread: bytes of message text examined (default: 32768)
	--rules: generate SpamAssassin rule output (default: 0)
	--ruleprefix: specify prefix string for rules (default: 'SEEK_')
	--ruletype 'type': generate rules of type: 'header', 'body'
	";
	}

	# ---------------------------------------------------------------------------

	use warnings;
	use strict;
	use Getopt::Long qw(:config no_ignore_case);
	use Data::Dumper;
	use Digest::SHA qw(sha1_base64);

	sub logmsg;

	my %opt = ();
	$opt{reqhitrate} = 0.5;
	$opt{reqpatlength} = 0;
	$opt{maxreqpatlength} = 1024;
	$opt{maxtextread} = 32768;
	$opt{rules} = 0;
	$opt{ruleprefix} = 'SEEK_';
	$opt{ruletype} = 'body';

	my $fs;
	my $fh;
	my @files = ();
	GetOptions(
	"rules" => \$opt{rules},
	"ruleprefix=s" => \$opt{ruleprefix},
	"reqhitrate=s" => \$opt{reqhitrate},
	"reqpatlength=s" => \$opt{reqpatlength},
	"maxreqpatlength=s" => \$opt{maxreqpatlength},
	"ruletype=s" => \$opt{ruletype},
	"maxtextread=s" => \$opt{maxtextread},
	"phase2=s" => \$opt{phase2},
	"ham=s" => \$fh,
	"spam=s" => \$fs,
	'help' => \&usage
	) or usage();

	usage() unless (($fs && $fh) \|\| $opt{phase2});

	my @hitratethresholds = ();
	if ($opt{reqhitrate} =~ /\S\s+\S/) { # multiple values
	@hitratethresholds = reverse sort {$a<=>$b} split (' ', $opt{reqhitrate});
	$opt{reqhitrate} = pop @hitratethresholds; # the lowest
	# @hitratethresholds is now sorted, highest to lowest
	}

	# n-gram reading state
	my %word2sym = ('' => '');
	my %sym2word = ('' => '');
	my $sym_acc = 'a'; # symbols are represented using IDs from this counter
	my $msgcount = 0;

	# these are shared with the assembly stage (via $asmstate)
	my @text_string = ();
	my %ngram_count = ();
	my %msg_subset_hit = ();
	my $asmstate;

	if ($opt{phase2}) {
	$asmstate = load_state($opt{phase2});
	}
	else {
	logmsg "reading $fs...";
	open IN, "<$fs" or die "cannot open spam log $fs";
	while (<IN>) {
	/^text: (.*)$/ and proc_text_spam($1);
	}
	close IN;

	# only do this if we have read enough spam messages, otherwise we could
	# discard tokens for which reqhitrate has been achieved
	if ($msgcount > 2 * (100 / $opt{reqhitrate})) {
	discard_hapaxes();
	}

	logmsg "n-grams active: ".(scalar keys %ngram_count);

	logmsg "reading $fh...";
	open IN, "<$fh" or die "cannot open ham log $fh";
	while (<IN>) {
	/^text: (.*)$/ and proc_text_ham($1);
	}
	close IN;
	logmsg "n-grams active: ".(scalar keys %ngram_count);

	# move onto the next step; assembly. free stuff we no longer need here
	undef %word2sym; # free this, no longer needed
	# create the assembly state object
	$asmstate = { };

	filter_into_message_subsets();
	undef %sym2word; # no longer needed after that
	write_state($asmstate, "assemble.state");
	}

	assemble_regexps();
	exit;

	# ---------------------------------------------------------------------------
	# PHASE 1: PARSING, NGRAM PROBABILITIES

	sub proc_text_spam {
	my ($text) = @_;

	# we only need to chop off the end of spam samples
	if (length($text) > $opt{maxtextread}) {
	$text = substr $text, 0, $opt{maxtextread}; # chop!
	}

	$text =~ s/ +/ /gs; # single spaces, please

	# we only need to save spam samples in memory, ignore ham samples
	push @text_string, $text;

	my $cp = pack "l", $msgcount;
	$msgcount++;

	(($msgcount % 1000) == 999) and discard_hapaxes();

	my %tokens = ();
	foreach my $line (split(/\[p\]/, $text)) {
	my $w1 = '';
	my $w2 = '';
	my $w3 = '';

	foreach my $w (split(' ', $line)) {
	# if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }

	$w3 = $w2;
	$w2 = $w1;

	$w1 = $word2sym{$w};
	if (!$w1) {
	$word2sym{$w} = $w1 = $sym_acc;
	$sym2word{$sym_acc} = $w;
	$sym_acc++;
	}

	# simple bayesian N-grams to start
	$tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
	}

	# deal with leftovers
	if ($w2 && $w1) {
	$tokens{"$w2.$w1"} = 1;
	}
	}

	foreach my $tok (keys %tokens) {
	#warn "JMD adding $tok => ".decode_sym2words($tok);
	# incr the counter for this token
	$ngram_count{$tok}++;
	$msg_subset_hit{$tok} .= $cp; # the message subset hit by this tok
	}
	}

	sub discard_hapaxes {
	my $before = (scalar keys %ngram_count);
	foreach my $tok (keys %ngram_count) {
	if ($ngram_count{$tok} <= 1) {
	delete $ngram_count{$tok};
	delete $msg_subset_hit{$tok};
	}
	}
	my $after = (scalar keys %ngram_count);
	my $killed = ($before - $after);
	logmsg "shrunk dbs: $killed hapaxes killed, kept $after entries";
	}

	sub proc_text_ham {
	my ($text) = @_;

	my %tokens = ();
	foreach my $line (split(/\[p\]/, $text)) {
	my $w1 = '';
	my $w2 = '';
	my $w3 = '';

	foreach my $w (split(' ', $line)) {
	$w3 = $w2;
	$w2 = $w1;

	$w1 = $word2sym{$w};
	if (!$w1) {
	# since we're deleting, there's no need to add new words
	# to the dictionary; just use the final $sym_acc to mean
	# "unknown ham word", and don't increment it
	$w1 = $sym_acc;
	}

	$tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = $tokens{".$w3"} = 1;
	}

	# deal with leftovers
	if ($w2 && $w1) {
	$tokens{"$w2.$w1"} = 1;
	}
	}

	foreach my $tok (keys %tokens) {
	# we're not tracking hits; we're killing false positives.
	# as soon as a single FP appears, kill all record of that token,
	# it cannot be used
	remove_fp_ing_token($tok);
	}
	}

	sub remove_fp_ing_token {
	my $tok = shift;
	#warn "JMD removing $tok => ".decode_sym2words($tok);
	delete $ngram_count{$tok};
	delete $msg_subset_hit{$tok};
	delete $ngram_count{".".$tok};
	delete $msg_subset_hit{".".$tok};
	delete $ngram_count{"..".$tok};
	delete $msg_subset_hit{"..".$tok};
	}

	sub filter_into_message_subsets {
	logmsg "filtering into message subsets...";

	$asmstate = { };

	# hash all msg_subset_hit lists; we don't need the full data, so this
	# saves space
	foreach my $id (keys %msg_subset_hit) {
	$msg_subset_hit{$id} = unpack("%32C*", $msg_subset_hit{$id});
	}

	# note: we don't care about stuff that hits any ham at all
	$asmstate->{msg_count_spam} = scalar @text_string;
	$asmstate->{msg_count_spam} \|\|= 0.000001;

	$asmstate->{all_patterns_for_set} = { };

	foreach my $id (keys %ngram_count) {
	my $count = $ngram_count{$id};
	my $bad;

	# must occur more than once!
	if (!defined $count \|\| $count <= 1) {
	$bad++;
	}
	# require N% spam hits
	elsif (($count*100) / $asmstate->{msg_count_spam} < $opt{reqhitrate}) {
	$bad++;
	}

	if ($bad) {
	# we don't need to remember anything about this pattern after here
	remove_fp_ing_token($id);
	next;
	}

	my $set = $msg_subset_hit{$id};
	push @{$asmstate->{all_patterns_for_set}->{$set}}, decode_sym2words($id);
	}

	logmsg "message subsets found: ".(scalar
	keys %{$asmstate->{all_patterns_for_set}});

	$asmstate->{ngram_count} = \%ngram_count;
	$asmstate->{msg_subset_hit} = \%msg_subset_hit;
	$asmstate->{text_string} = \@text_string;
	}

	sub decode_sym2words {
	my $ids = shift;
	my $r;
	if ($ids =~ /^([^.])\.([^.])\.([^.]*)$/) {
	$r = "$sym2word{$1} $sym2word{$2} $sym2word{$3}";
	}
	elsif ($ids =~ /^([^.])\.([^.])$/) {
	$r = "$sym2word{$1} $sym2word{$2}";
	}
	else {
	warn "bad decode_sym2words format: '$ids'";
	}
	$r =~ s/^\s+//;
	return $r;
	}

	sub write_state {
	my ($state, $statefile) = @_;
	my $dump = Data::Dumper->new ([ $state ]);
	$dump->Deepcopy(1);
	$dump->Purity(1);
	$dump->Indent(1);
	my $text = $dump->Dump.";1;";
	open (OUT, ">$statefile") or die "cannot write $statefile";
	print OUT $text;
	close OUT or die "cannot write $statefile";
	}

	# ---------------------------------------------------------------------------
	# PHASE 2: REGEXP ASSEMBLY

	sub load_state {
	my $file = shift;
	if (open(IN, "<".$file)) {
	my $str = join("", <IN>); close IN;
	my $VAR1; # Data::Dumper
	if (eval $str) {
	return $VAR1; # Data::Dumper's naming
	}
	}
	die "failed to load state from $file";
	}

	sub assemble_regexps {
	my %done_set = ();
	my @done_pats = ();

	logmsg "deduping and assembling regexps...";
	if (!$opt{rules}) {
	printf ("%6s %6s %6s %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
	}

	$\| = 1;
	my $count = 0;
	my $count_out = 0;
	foreach my $id (sort {
	$asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
	} keys %{$asmstate->{ngram_count}})
	{
	my $set = $asmstate->{msg_subset_hit}->{$id};
	next if $done_set{$set}; $done_set{$set}++;

	if ($count++ % 200 == 0) {
	logmsg "working on message subset $count ($count_out)...";
	}

	# we now have several patterns. see if we can expand them sideways
	# to make the pattern bigger, and collapse into a smaller number of
	# pats at the same time
	my @pats = collapse_pats($asmstate->{all_patterns_for_set}->{$set});
	# my @pats = @{$asmstate->{all_patterns_for_set}->{$set}};

	@pats = quote_all_metas(@pats);
	@pats = subsume_with_dotstars(@pats);
	@pats = expand_with_dots(@pats);
	@pats = ensure_reqpatlength(@pats);
	# at this point, patterns are all valid regexps

	# now check to see if any of these pats have been subsumed in an
	# already-output pattern (one with more hits!)
	my @pats_new = ();
	foreach my $pat (@pats) {
	my $subsumed = 0;
	foreach my $done (@done_pats, @pats_new) {
	# pattern == existing pattern, or existing pattern is contained by
	# pattern, or pattern is contained in existing pattern
	if ($pat eq $done \|\| $pat =~ /\Q${done}\E/ \|\| $done =~ /\Q${pat}\E/) {
	$subsumed=1;
	last;
	}
	# or one pattern contains the other (but interpreted as a regexp!)
	# this deals with /foo.{0,10} bar/ vs /foo ish bar/
	if ($pat =~ /$done/) {
	$subsumed=1;
	last;
	}
	if ($done =~ /$pat/) {
	$subsumed=1;
	last;
	}
	}
	next if $subsumed;

	# add to the current list
	push @pats_new, $pat;
	}

	@pats = @pats_new;
	push @done_pats, @pats_new;

	# if we have no non-subsumed pats at this stage, skip this set
	next unless @pats;

	if (($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} <
	$opt{reqhitrate})
	{
	# quit if we hit the final hitrate threshold
	return;
	}

	if (defined($hitratethresholds[0]) &&
	($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} < $hitratethresholds[0])
	{
	print "# passed hit-rate threshold: $hitratethresholds[0]\n";
	shift @hitratethresholds;
	}

	if ($opt{rules}) {
	printf "# %6.3f %6.3f %6.3f\n",
	1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0;

	# sort, to ensure ordering always remains the same
	foreach my $pat (sort @pats) {
	my $name = generate_rule_name($pat);

	if ($opt{ruletype} eq 'header') {
	# deal with header-specific munging.
	# "\[\\n\]" is the result of "[\n]", at this stage
	$pat =~ s/\Q\[\\n\]\E/\\n/gs;
	$pat =~ s/\Q\[\\t\]\E/\\t/gs;
	}

	if ($opt{ruletype} eq 'body') {
	$pat =~ s/^\s/\\s/;
	}

	if ($opt{ruletype} eq 'header') {
	print "$opt{ruletype} $opt{ruleprefix}${name} ALL =~ /$pat/\n";
	} else {
	print "$opt{ruletype} $opt{ruleprefix}${name} /$pat/\n";
	}
	$count_out++;
	}

	} else {
	my $pats = '/'.join ('/, /', map { s/\//\\\//gs; $_; } @pats).'/';
	printf "%6.3f %6.3f %6.3f %s\n",
	1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0, $pats;
	$count_out++;
	}
	}
	}

	sub collapse_pats {
	my $pataryref = $_[0];
	my @ret = ();

	while (1) {
	my $pat = shift(@{$pataryref});
	last unless defined($pat);

	#warn "JMD collapse [$pat]";
	$pat =~ s/^\s+//;

	# TODO: optimise, second-slowest line
	my @hits = grep /\Q$pat\E/, @{$asmstate->{text_string}};

	if (scalar @hits == 0) {
	warn "supposed pattern /$pat/ is 0-hitter";
	warn "JMD strings:\n ".join("\n ", @{$asmstate->{text_string}})."\n";
	push @ret, "[BROKEN: 0 hitter]$pat";
	next;
	}

	# we don't have all day!
	my $pat_maxlen = $opt{maxtextread};
	my $s = $hits[0];

	# Now, expand the pattern using a BLAST-style algorithm

	# expand towards start of string
	while (1) {
	my $l = length($pat);
	last if ($l > $pat_maxlen); # too long

	my $found;
	if ($s =~ /(.)\Q$pat\E/s && $s !~ /\[p\]\Q$pat\E/s) { $found = $1; }

	if (!defined $found) {
	# start of string. break
	last;
	}

	# give up if there are a differing number of hits for the new pat
	my $newpat = $found.$pat;

	# note: we can search just in @hits, instead of in
	# @{$asmstate->{text_string}}, because there's no way we can
	# increase the hitrate in the corpus; we can only reduce it.
	# this is double-checked after these 2 expansion loops anyway
	if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }

	$pat = $newpat; # and carry on
	}
	# warn "JMD $pat";

	# expand towards end of string
	while (1) {
	if (length($pat) > $pat_maxlen
	\|\| $s =~ /\Q$pat\E\[p\]/s
	\|\| $s !~ /\Q$pat\E(.)/s)
	{
	# end of string. break
	last;
	}

	my $newpat = $pat.$1;
	if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }

	$pat = $newpat; # and carry on
	}

	# double-check to ensure we haven't somehow INCREASED our hitrate
	# beyond the initial pattern
	if (scalar (grep /\Q$pat\E/, @{$asmstate->{text_string}}) != scalar @hits) {
	die "oops! went too far";
	}

	# now remove subsumed patterns
	@{$pataryref} = grep { $pat !~ /\Q$_\E/s } @{$pataryref};

	# warn "JMD $pat";
	# skip recording this if it's already inside one of the results
	next if grep { $_ =~ /\Q$pat\E/s } @ret;

	# also, remove cases where this pattern contains previous results
	@ret = grep { $pat !~ /\Q$_\E/s } @ret;

	# warn "JMD $pat";
	push (@ret, $pat);
	}

	return @ret;
	}

	sub quote_all_metas {
	return map {
	s/([!-+\`\?\^\~\\\/\\|\.\{\}\(\)\[\]\@])/\\$1/gs;
	$_;
	} @_;
	}

	sub subsume_with_dotstars {
	my @working = @_;

	# attempt to resolve . and .*
	my @ret = ();

	while (1) {
	my $p1 = shift(@working);
	last unless defined($p1);

	# TODO: optimise, 4th-slowest line
	my @hits = grep /$p1/, @{$asmstate->{text_string}};

	if (scalar @hits == 0) {
	warn "supposed pattern /$p1/ is 0-hitter";
	push @ret, "[BROKEN: 0 hitter]$p1";
	next;
	}

	foreach my $p2 (@working) {
	next if ($p1 eq $p2);

	DOTLOOP: for my $dotstar
	( ".", ".?", ".{0,3}", ".{0,5}", ".{0,20}", ".{0,40}" )
	{
	my $newpatcapture = $p1."(".$dotstar.")".$p2;
	my $newpat = $p1.$dotstar.$p2;

	# drop the attempt if any of the existing hits becomes a miss,
	# or the .* includes a paragraph boundary in any of the hits
	foreach my $t (@hits) {
	next DOTLOOP if ($t !~ /$newpatcapture/);
	next DOTLOOP if ($1 =~ /\[p\]/);
	}
	next if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) != scalar @hits);

	# it works! yay

	# TODO: for /./, see if we can capture the options and
	# construct a /[oO0]/ char class instead
	$p1 = $newpat;
	}
	}

	# skip recording this if it's already inside one of the results
	next if grep { $_ =~ /\Q$p1\E/s } @ret;

	push (@ret, $p1);
	}
	return @ret;
	}

	sub expand_with_dots {
	my @working = @_;
	my @ret = ();

	my $pat_maxlen = $opt{maxtextread};
	while (1) {
	my $p1 = shift(@working);
	last unless defined($p1);

	# TODO: optimise, 5th-slowest line
	my @hits = grep /$p1/, @{$asmstate->{text_string}};

	if (scalar @hits == 0) {
	warn "supposed pattern /$p1/ is 0-hitter";
	push @ret, "[BROKEN: 0 hitter]$p1";
	next;
	}

	my $s = $hits[0];

	# expand towards end of string (with .)
	while (1) {
	last if (length($p1) >= $pat_maxlen);

	if ($s !~ /$p1\[p\]/s && $s =~ /$p1(.)/s)
	{
	my $extn = $1; quote_all_metas($extn);
	my $newpat = $p1.$extn;

	# TODO: optimise, 3rd-slowest line
	if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
	$p1 = $newpat; next;
	}
	}

	if ($s !~ /$p1.?\[p\]/s && $s =~ /$p1[^\\](.)/s)
	{
	my $extn = $1; quote_all_metas($extn);
	my $newpat = $p1.".".$extn;
	if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
	$p1 = $newpat; next;
	}
	}

	if ($s !~ /$p1.{0,2}\[p\]/s && $s =~ /$p1[^\\][^\\](.)/s)
	{
	my $extn = $1; quote_all_metas($extn);
	my $newpat = $p1."..".$extn;
	if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
	$p1 = $newpat; next;
	}
	}

	last;
	}

	push (@ret, $p1);
	}

	@ret;
	}

	sub ensure_reqpatlength {
	my @ret = @_;
	if ($opt{reqpatlength}) {
	@ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
	return () unless @ret;
	}
	return @ret;
	}

	sub generate_rule_name {
	my $str = shift;
	$str = sha1_base64($str);
	$str =~ s/^(.{6}).*$/$1/gs;
	$str =~ tr/a-z/A-Z/;
	$str =~ s/[^A-Z0-9]/_/gs;
	return $str;
	}

	# ---------------------------------------------------------------------------
	# used by all phases

	sub logmsg {
	warn "".(scalar localtime time).": $_[0]\n";
	}