masses/logs-to-c - spamassassin - Git at Google

 #!/usr/bin/perl -w
 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 =head1 NAME

 logs-to-c - Convert a mass-check log into perceptron format

 =head1 SYNOPSIS

 logs-to-c [options]

  Options:
     -c,--cffile=path	  Use path as the rules directory
     -s,--scoreset=n	  Use scoreset n
     --spam=file           Location of spam mass-check log
     --ham=file            Location of ham mass-check log

 =head1 DESCRIPTION

 B<logs-to-c> will read the mass-check logs F<spam.log> and F<ham.log>,
 or those specified by the B<--spam> and B<--ham> options, and convert them
 into the format needed by the perceptron. This format is simple for the
 perceptron to parse, but is not very readable to humans.

 =head1 BUGS

 Please report bugs to http://bugzilla.spamassassin.org/

 =head1 SEE ALSO

 L<mass-check(1)>, L<perceptron(1)>

 =cut

 use Getopt::Long qw(:config auto_help bundling);
 use strict;

 # Defaults for the command-line options.  GetOptions() is called without
 # explicit destinations, so it stores each value in the package variable
 # $opt_<name>; these therefore have to stay 'our' variables.
 our $opt_cffile = "../rules";
 our $opt_spam = 'spam.log';
 our $opt_ham = 'ham.log';
 our $opt_scoreset = 0;

 # 'bundling' is enabled in the Getopt::Long import, so single-dash options
 # are looked up one character at a time and never resolved by abbreviation.
 # The documented -c and -s short forms therefore need explicit aliases.
 # The first name of each spec still selects the $opt_<name> variable.
 GetOptions("cffile|c=s", "spam=s", "ham=s", "scoreset|s=i")
   or die "logs-to-c: invalid command line, try --help\n";

 my $is_spam = '';              # bit vector (vec) aligned with @tests_hit
 my @tests_hit = ();            # one packed list of rule hits per message
 my %mutable_tests = ();        # rule name => true if its score may be tuned

 # %allrules is filled by readscores() from %rules; %rules and %scores are
 # presumably defined by the rules file that readscores() require()s.
 our (%rules, %allrules, %scores);

 my (%ignored_rule, %range_lo, %range_hi);
 my %rule_to_index;

 readscores();

 print "Reading per-message hit stat logs and scores...\n";
 my ($num_tests, $num_spam, $num_ham);

 read_ranges();
 readlogs();

 print "Writing logs and current scores as C code...\n";
 writescores_c();

 # show memory usage before we exit
 # print "Running \"ps aux\"...\n";
 # open(PS, "ps aux|");
 # while(<PS>) {
 # print if $. == 1 || /\b$$\b/;
 # }
 # close(PS);

 exit 0;

 # State for the compact freeze/thaw encoding of per-message rule hits.
 # Each rule name is mapped to a small integer so that a message's hits can
 # be stored as a pack("w*") string, which uses far less memory than
 # arrayrefs or strings of comma-separated values.
 #
 # These declarations come after the 'exit 0' of the main flow, so only the
 # compile-time effect of 'my' is ever seen; a run-time initializer on these
 # lines would never execute.  $short_index therefore starts out undefined
 # and new_short() increments it to 1 for the first rule.  Indexes must
 # never be 0, because callers test $long_to_short{$name} for truth to
 # detect rules that have not been assigned an index yet.
 my $short_index;
 my %long_to_short;
 my @short_to_long;

 # Assign the next unused short index to a rule name and record the mapping
 # in both directions (%long_to_short and @short_to_long).
 #   $_[0]  - rule name
 # Returns the index that was assigned.
 sub new_short {
   my ($long_name) = @_;

   my $assigned = ++$short_index;
   $short_to_long[$assigned] = $long_name;
   $long_to_short{$long_name} = $assigned;

   return $assigned;
 }

 # Pack a list of rule names into a compact string of BER-compressed
 # integers ("w*"), allocating a short index for any name not seen before.
 # Uses less than half the memory of join on ',' and even better compared
 # to Storable::freeze.
 #   $_[0]  - reference to an array of rule names
 # Returns the packed string.
 sub freeze_tests {
   my ($tests) = @_;

   my @names = @{$tests};
   my @short_ids;
   foreach my $name (@names) {
     push @short_ids, ($long_to_short{$name} || new_short($name));
   }

   return pack("w*", @short_ids);
 }

 # Inverse of freeze_tests(): unpack a frozen string and translate every
 # short index back to its rule name through @short_to_long.
 #   $_[0]  - packed string produced with pack("w*", ...)
 # Returns the list of rule names.
 sub thaw_tests {
   my ($frozen) = @_;

   my @short_ids = unpack("w*", $frozen);
   return map { $short_to_long[$_] } @short_ids;
 }

 # Read the spam log ($opt_spam) and then the ham log ($opt_ham).  For every
 # accepted log line this records whether the message is spam (one bit in
 # $is_spam) and which scored, non-subrule tests hit (a packed string in
 # @tests_hit).  Sets $num_spam, $num_ham and $num_tests.
 # Dies if either log cannot be opened.
 sub readlogs {
   my $count = 0;
   $num_spam = $num_ham = 0;

   # The spam flag travels with the file name instead of being derived by
   # comparing names, so giving the same path for --spam and --ham does not
   # classify both passes as spam.
   foreach my $source ([$opt_spam, 1], [$opt_ham, 0]) {
     my ($file, $isspam) = @{$source};

     # three-argument open with a lexical handle: the name comes from the
     # command line and must not be able to change the open mode
     open (my $in, '<', $file) or die "Could not open file '$file': $!";

     while (defined(my $msgline = <$in>)) {
       # faster log-reading code from hit-frequencies.
       # the additional split() is for this case:
       # ".  -20 /path  time=1112116980,scantime=0,format=f,reuse=no"
       # in other words, no hits.  split(' ') cannot deal with this
       # correctly, seeing (".", "-20", "/path", "time=...etc").  Work
       # around this by using a literal / / regexp split to discard
       # the csv stuff we don't want out of the rest of the line.

       # $caught is the 1st field of the log line, $rules the 4th
       my ($caught, undef, $restofline) = split(' ', $msgline, 3);
       next unless (defined $caught && $caught =~ /^[Y\.]$/ && $restofline);
       my (undef, $rules) = split(/ /, $restofline, 3);
       $rules = '' unless defined $rules;   # short line: treat as no hits

       # get tests, but ignore unknown tests and subrules
       my @tests;
       foreach my $r (split(/,/, $rules)) {
         my $hits = 1;
         # Support compacted RULE(hitcount) format
         if ($r =~ s/\((\d+)\)$//) {
           $hits = $1;
         }
         next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
         push @tests, $r for (1 .. $hits);
       }

       if ($isspam) {
         $num_spam++;
         vec($is_spam, $count, 1) = 1;
       }
       else {
         $num_ham++;
         vec($is_spam, $count, 1) = 0;
       }

       # inlined copy of freeze_tests(\@tests), for speed
       $tests_hit[$count] = pack("w*", map
                   {
                     $long_to_short{$_} || new_short($_);
                   } @tests);

       # TODO: benchmark using foreach(), map() is often slower

       $count++;                  # one slot per accepted log line
     }
     close $in;
   }
   $num_tests = $count;
 }

 # Run ../build/parse-rules-for-masses over the rules directory
 # ($opt_cffile) for the selected scoreset, load the Perl file it writes
 # with require(), and copy %rules into %allrules.
 # Dies if the helper fails or its output cannot be loaded.
 sub readscores {
   print "Reading scores from \"$opt_cffile\"...\n";
   my $tmpf = "./tmp/rules$$.pl";

   # List-form system(): the rules path comes from the command line and is
   # passed as a single argument without going through a shell.
   my @cmd = ("../build/parse-rules-for-masses",
              "-d", $opt_cffile, "-s", $opt_scoreset, "-o", $tmpf);
   system(@cmd) == 0
     or die "command '@cmd' failed, wait status $?\n";

   # Remove the temporary file even when loading it fails.
   my $loaded = eval { require $tmpf; 1 };
   my $err = $@;
   unlink $tmpf;
   die $err unless $loaded;

   %allrules = %rules;           # ensure it stays global
 }

 # Assign a C array index to every rule in %scores, then write
 #   tmp/scores.data - one record per non-ignored rule (.index, nNAME,
 #                     bSCORE, mMUTABLE, lLOW, hHIGH), preceded by the
 #                     informational N/M totals, and
 #   tmp/scores.h    - C declarations sized for those rules,
 # and finally call writetests_c() with the largest number of mutable hits
 # seen on one message, plus one.
 # Fills %rule_to_index; may overwrite %range_lo, %range_hi and %scores.
 # Dies if an output file cannot be written.
 sub writescores_c {
   my $output = '';
   my $size = 0;
   my $mutable = 0;

     # jm: now, score-ranges-from-freqs has tflags to work from, so
     # it will always list all mutable tests.

   # non-ignored rules first, mutable ones before immutable, then by name
   my @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
                             ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
                             ($a cmp $b)} (keys %scores);

   my $max_hits_per_msg = 0;
   for my $msg (0 .. $num_tests - 1) {
     my $hits = grep {(! $ignored_rule{$_}) && $mutable_tests{$_}}
                     (thaw_tests($tests_hit[$msg]));
     if (($hits + 1) > $max_hits_per_msg) {
       $max_hits_per_msg = $hits + 1;
     }
   }

   for my $i (0 .. $#index_to_rule) {
     my $name = $index_to_rule[$i];
     $rule_to_index{$name} = $i;

     next if ($ignored_rule{$name});

     if ($mutable_tests{$name} == 0) {
       # an immutable rule gets a zero-width range at its current score
       $range_lo{$name} = $range_hi{$name} = $scores{$name};
     } else {
       $mutable++;
       if ($range_lo{$name} > $range_hi{$name}) {
         ($range_lo{$name},$range_hi{$name}) =
          ($range_hi{$name},$range_lo{$name});
       }
       #$range_lo{$name} ||= 0.1;
       #$range_hi{$name} ||= 1.5;

       # no default score found? start from the middle of the range
       if ($allrules{$name}->{no_score_found}) {
         $scores{$name} = ($range_hi{$name} + $range_lo{$name}) / 2.0;
       }
     }

     $output .= ".".$i."\n".
                 "n".$name."\n".
                 "b".$scores{$name}."\n".
                 "m".$mutable_tests{$name}."\n".
                 "l".$range_lo{$name}."\n".
                 "h".$range_hi{$name}."\n";
     $size++;
   }

   open (my $dat, '>', 'tmp/scores.data')
     or die "Could not write 'tmp/scores.data': $!";
   print {$dat} "N$size\n", "M$mutable\n", # informational only
    $output;
   close $dat or die "Could not close 'tmp/scores.data': $!";

   open (my $out, '>', 'tmp/scores.h')
     or die "Could not write 'tmp/scores.h': $!";
   print {$out} "
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>

 int num_scores = $size;
 int num_mutable = $mutable;
 unsigned char is_mutable[$size];
 double range_lo[$size];
 double range_hi[$size];
 double bestscores[$size];
 char *score_names[$size];
 double tmp_scores[$size][2];
 unsigned char ny_hit[$mutable];
 unsigned char yn_hit[$mutable];

 double lookup[$mutable];

 /* loadscores() is defined in tests.h */

 ";
   close $out or die "Could not close 'tmp/scores.h': $!";

   writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
 }

 # Collapse messages that have the same spam flag and the same set of
 # indexed, non-ignored hits into one slot each, then write
 #   tmp/tests.h    - C declarations sized for those slots, followed by the
 #                    C source held in this script's DATA section, and
 #   tmp/tests.data - one record per slot: .SLOT, nMUTABLE_HITS, sIS_SPAM,
 #                    tRULE_INDEX for each mutable hit, bBASE_SCORE (sum of
 #                    the immutable hits' scores) and cCOUNT (messages that
 #                    collapsed into the slot).
 #   $_[0]  - second dimension used for the C tests_hit array
 # Dies if an output file cannot be written, or if a message has as many
 # mutable hits as $_[0].
 sub writetests_c {
   my ($max_hits_per_msg) = @_;

   my %uniq_files;     # first message index of each slot => slot number
   my %count_keys;     # dedup key => number of messages sharing it
   my %file_key;       # first message index of each slot => dedup key

   for my $file (0 .. $num_tests - 1) {
     my @good_tests =
      grep {length($_) && (! $ignored_rule{$_}) &&
            (defined($rule_to_index{$_}))} (thaw_tests($tests_hit[$file]));

     @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));

     my $uniq_key = vec($is_spam, $file, 1) . " " . join(" ",@good_tests);

     if (exists($count_keys{$uniq_key})) {
       $count_keys{$uniq_key}++;
     } else {
       $count_keys{$uniq_key} = 1;
       $file_key{$file} = $uniq_key;
       $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
     }
   }

   my $num_nondup = scalar(keys(%uniq_files));

   open (my $top, '>', 'tmp/tests.h')
     or die "Could not write 'tmp/tests.h': $!";
   print {$top} "
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>

 int num_tests = $num_tests;
 int num_nondup = $num_nondup;
 int num_spam = $num_spam;
 int num_ham = $num_ham;
 int max_hits_per_msg = $max_hits_per_msg;
 unsigned char num_tests_hit[$num_nondup];
 unsigned char is_spam[$num_nondup];
 unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
 double scores[$num_nondup];
 double tmp_total[$num_nondup];
 int tests_count[$num_nondup];

 ";
   # append the C loader functions without disturbing the global $_
   print {$top} join ('', <DATA>);
   close $top or die "Could not close 'tmp/tests.h': $!";

   open (my $dat, '>', 'tmp/tests.data')
     or die "Could not write 'tmp/tests.data': $!";

   foreach my $file (sort {$a <=> $b} (keys %uniq_files)) {
     print {$dat} ".".$uniq_files{$file}."\n";

     my $out = "s".vec($is_spam, $file, 1)."\n";

     my $base_score = 0;
     my $num_tests_hit = 0;
     foreach my $test (thaw_tests($tests_hit[$file])) {
       next if ($test eq '');

       # this is not a log-worthy event anymore, since we have a lot
       # of T_ test rules that are ignored during perceptron runs
       next if ($ignored_rule{$test});

       if (!defined $rule_to_index{$test}) {
         warn "test with no C index: $test\n";
         next;
       }

       if ($mutable_tests{$test}) {
         $num_tests_hit++;
         $out .= "t".$rule_to_index{$test}."\n";

         if ($num_tests_hit >= $max_hits_per_msg) {
           die "Need to increase \$max_hits_per_msg";
         }
       } else {
         $base_score += $scores{$test};
       }
     }

     $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
     $out .= "c" . $count_keys{$file_key{$file}} . "\n";

     print {$dat} "n".$num_tests_hit."\n".$out;
   }
   close $dat or die "Could not close 'tmp/tests.data': $!";
 }

 # Read per-rule score ranges and mutability from tmp/ranges.data and
 # decide, for every rule, whether it is ignored, immutable or mutable.
 # Fills %range_lo, %range_hi, %ignored_rule and %mutable_tests, and
 # adjusts %scores so that each non-ignored rule listed in the ranges file
 # starts with a score that agrees with its range.
 # Dies if tmp/ranges.data is missing or cannot be opened.
 sub read_ranges {
   if (!-f 'tmp/ranges.data') {
     die "need to make 'tmp/ranges.data' first";
   }

   # True if the rule's tflags contain the given word.  A rule without
   # tflags simply has no flags; this avoids matching against undef.
   my $has_tflag = sub {
     my ($rule, $flag) = @_;
     my $info = $allrules{$rule};
     return 0 unless (ref $info && defined $info->{tflags});
     return ($info->{tflags} =~ m/\b\Q$flag\E\b/i) ? 1 : 0;
   };

   # A 'nice' rule whose score is exactly 1 (or 0.01 for a T_ rule) gets
   # the sign flipped; presumably those values are unset defaults.
   my $flip_nice_default = sub {
     my ($rule) = @_;
     return unless $has_tflag->($rule, 'nice');
     if ($scores{$rule} == 1) {
       $scores{$rule} = -1;
     } elsif (($scores{$rule} == 0.01) && ($rule =~ m/^T_/)) {
       $scores{$rule} = -0.01;
     }
   };

   # read ranges, and mutableness, from ranges.data.
   open (my $in, '<', 'tmp/ranges.data')
    	or die "need to run score-ranges-from-freqs first!";

   while (defined(my $line = <$in>)) {
     # line format: "<low> <high> <mutable flag> <rule name>"
     my ($lo, $hi, $mut, $t) = ($line =~ /^(\S+) (\S+) (\d+) (\S+)$/)
       or next;
     $range_lo{$t} = $lo+0;
     $range_hi{$t} = $hi+0;
     $mut += 0;

     if ($allrules{$t}->{issubrule}) {
       # warn "$t: ignoring, is sub-rule\n";    # no need to warn
       $ignored_rule{$t} = 1;
       $mutable_tests{$t} = 0;
       next;
     }
     if ($t =~ /^T_/) {
       # warn "$t: ignoring, is T_ test rule\n";    # no need to warn
       $ignored_rule{$t} = 1;
       $mutable_tests{$t} = 0;
       $range_lo{$t} = 0.01;    # clamp to insignificant range
       $range_hi{$t} = 0.01;
       next;
     }
     if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
       warn "$t: ignoring, score and range == 0\n";
       $ignored_rule{$t} = 1;
       $mutable_tests{$t} = 0;
       next;
     }

     $ignored_rule{$t} = 0;

     # mutable only if flagged so, the range has width, and not userconf
     if (!$mut) {
       $mutable_tests{$t} = 0;
     } elsif ($range_lo{$t} == $range_hi{$t}) {
       $mutable_tests{$t} = 0;
     } elsif ($has_tflag->($t, 'userconf')) {
       $mutable_tests{$t} = 0;
     } else {
       $mutable_tests{$t} = 1;
     }
     unless ($mutable_tests{$t} || $scores{$t}) {
       warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
       $ignored_rule{$t} = 1;
     }
   }
   close $in;

   # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
   foreach my $t (sort keys %allrules) {
     next if ($t eq '_scoreset');
     next if (exists($range_lo{$t}));

     if ($allrules{$t}->{issubrule}) {
       if (!$ignored_rule{$t}) {
         # warn "$t: ignoring, is sub-rule\n";  # no need to warn here
         $ignored_rule{$t} = 1;
       }
       $mutable_tests{$t} = 0;
       next;
     }
     if ($t =~ /^T_/) {
       if (!$ignored_rule{$t}) {
         # warn "$t: ignoring, is T_ test rule\n";  # no need to warn here
         $ignored_rule{$t} = 1;
         $range_lo{$t} = 0.01;    # clamp to insignificant range
         $range_hi{$t} = 0.01;
       }
       $mutable_tests{$t} = 0;
       next;
     }
     $ignored_rule{$t} = 0;
     unless (exists($mutable_tests{$t}) && !$has_tflag->($t, 'userconf')) {
       $mutable_tests{$t} = 0;
     }
     unless ($mutable_tests{$t} || $scores{$t}) {
       if (!$ignored_rule{$t}) {
         warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
         $ignored_rule{$t} = 1;
       }
     }
   }

   # make the starting scores agree with the ranges
   foreach my $t (keys %range_lo) {
     next if ($ignored_rule{$t});
     if ($mutable_tests{$t}) {
       $flip_nice_default->($t);
       # a mutable score must start strictly inside its range
       if ($scores{$t} >= $range_hi{$t}) {
         $scores{$t} = $range_hi{$t} - 0.001;
       } elsif ($scores{$t} <= $range_lo{$t}) {
         $scores{$t} = $range_lo{$t} + 0.001;
       }
     } else {
       # userconf rules keep whatever score they have
       next if $has_tflag->($t, 'userconf');
       if ($range_lo{$t} == $range_hi{$t}) {
         $scores{$t} = $range_lo{$t};
         next;
       }
       $flip_nice_default->($t);
       # an immutable score is clamped onto the range, ends included
       if ($scores{$t} > $range_hi{$t}) {
         $scores{$t} = $range_hi{$t};
       } elsif ($scores{$t} < $range_lo{$t}) {
         $scores{$t} = $range_lo{$t};
       }
     }
   }
 }


 __DATA__

 void loadtests (void) {
   FILE *fin = fopen ("tmp/tests.data", "r");
   char buf[256];
   int file = 0;
   int tnum = 0;

   while (fgets (buf, 255, fin) != NULL) {
     char cmd;
     long arg;
     float argd;

     cmd = (char) *buf;
     arg = strtol (buf+1, NULL, 10);
     argd = (float)strtod (buf+1, NULL);

     if (cmd == '.') {
       file = arg;

     } else if (cmd == 'n') {
       tnum = 0;
       num_tests_hit[file] = arg;

     } else if (cmd == 's') {
       is_spam[file] = arg;

     } else if (cmd == 'b') {
       scores[file] = argd;

     } else if (cmd == 't') {
       tests_hit[file][tnum] = arg; tnum++;

     } else if (cmd == 'c') {
       tests_count[file] = arg;

     }
   }
   fclose(fin);

   printf ("Read test results for %d messages (%d total).\n", file+1,
 	  num_tests);
 }

 /*
  * loadscores: read tmp/scores.data into the per-rule arrays declared in
  * scores.h.  Every line is a one-character command followed by a value:
  *   .N  select rule index N       nNAME  rule name
  *   bX  current score             mN     mutable flag
  *   lX  low end of the range      hX     high end of the range
  * Lines starting with any other character are skipped.
  * Exits with a diagnostic if the file cannot be opened or if a rule index
  * lies outside the arrays.
  */
 void loadscores (void) {
   FILE *fin = fopen ("tmp/scores.data", "r");
   char buf[256];
   int snum = 0;

   if (fin == NULL) {
     perror ("tmp/scores.data");
     exit (1);
   }

   while (fgets (buf, sizeof (buf), fin) != NULL) {
     char cmd = buf[0];
     long arg = strtol (buf+1, NULL, 10);
     float argd = (float)strtod (buf+1, NULL);
     char *str = buf+1;

     /* cut the value at the line end so a name is stored without it */
     str[strcspn (str, "\n")] = '\0';

     if (cmd == '.') {
       if (arg < 0 || arg >= num_scores) {
         fprintf (stderr, "tmp/scores.data: rule index %ld out of range\n",
                  arg);
         exit (1);
       }
       snum = (int) arg;

     } else if (cmd == 'b') {
       bestscores[snum] = argd;

     } else if (cmd == 'l') {
       range_lo[snum] = argd;

     } else if (cmd == 'h') {
       range_hi[snum] = argd;

     } else if (cmd == 'n') {
       score_names[snum] = strdup (str);	/* never freed */

     } else if (cmd == 'm') {
       is_mutable[snum] = arg;
     }
   }
   fclose(fin);

   printf ("Read scores for %d tests.\n", num_scores);
 }
	#!/usr/bin/perl -w
	#
	# <@LICENSE>
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to you under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at:
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# </@LICENSE>

	=head1 NAME

	logs-to-c - Convert a mass-check log into perceptron format

	=head1 SYNOPSIS

	logs-to-c [options]

	Options:
	-c,--cffile=path Use path as the rules directory
	-s,--scoreset=n Use scoreset n
	--spam=file Location of spam mass-check log
	--ham=file Location of ham mass-check log

	=head1 DESCRIPTION

	B<logs-to-c> will read the mass-check logs F<spam.log> and F<ham.log>,
	or those specified by the B<--spam> and B<--ham> options, and convert them
	into the format needed by the perceptron. This format is simple for the
	perceptron to parse, but is not very readable to humans.

	=head1 BUGS

	Please report bugs to http://bugzilla.spamassassin.org/

	=head1 SEE ALSO

	L<mass-check(1)>, L<perceptron(1)>

	=cut

	use Getopt::Long qw(:config auto_help bundling);
	use strict;

	# Defaults for the command-line options.  GetOptions() is called without
	# explicit destinations, so it stores each value in the package variable
	# $opt_<name>; these therefore have to stay 'our' variables.
	our $opt_cffile = "../rules";
	our $opt_spam = 'spam.log';
	our $opt_ham = 'ham.log';
	our $opt_scoreset = 0;

	# 'bundling' is enabled in the Getopt::Long import, so single-dash options
	# are looked up one character at a time and never resolved by abbreviation.
	# The documented -c and -s short forms therefore need explicit aliases.
	# The first name of each spec still selects the $opt_<name> variable.
	GetOptions("cffile|c=s", "spam=s", "ham=s", "scoreset|s=i")
	  or die "logs-to-c: invalid command line, try --help\n";

	my $is_spam = '';              # bit vector (vec) aligned with @tests_hit
	my @tests_hit = ();            # one packed list of rule hits per message
	my %mutable_tests = ();        # rule name => true if its score may be tuned

	# %allrules is filled by readscores() from %rules; %rules and %scores are
	# presumably defined by the rules file that readscores() require()s.
	our (%rules, %allrules, %scores);

	my (%ignored_rule, %range_lo, %range_hi);
	my %rule_to_index;

	readscores();

	print "Reading per-message hit stat logs and scores...\n";
	my ($num_tests, $num_spam, $num_ham);

	read_ranges();
	readlogs();

	print "Writing logs and current scores as C code...\n";
	writescores_c();

	# show memory usage before we exit
	# print "Running \"ps aux\"...\n";
	# open(PS, "ps aux|");
	# while(<PS>) {
	# print if $. == 1 || /\b$$\b/;
	# }
	# close(PS);

	exit 0;

	# State for the compact freeze/thaw encoding of per-message rule hits.
	# Each rule name is mapped to a small integer so that a message's hits can
	# be stored as a pack("w*") string, which uses far less memory than
	# arrayrefs or strings of comma-separated values.
	#
	# These declarations come after the 'exit 0' of the main flow, so only the
	# compile-time effect of 'my' is ever seen; a run-time initializer on these
	# lines would never execute.  $short_index therefore starts out undefined
	# and new_short() increments it to 1 for the first rule.  Indexes must
	# never be 0, because callers test $long_to_short{$name} for truth to
	# detect rules that have not been assigned an index yet.
	my $short_index;
	my %long_to_short;
	my @short_to_long;

	# Assign the next unused short index to a rule name and record the mapping
	# in both directions (%long_to_short and @short_to_long).
	#   $_[0]  - rule name
	# Returns the index that was assigned.
	sub new_short {
	  my ($long_name) = @_;

	  my $assigned = ++$short_index;
	  $short_to_long[$assigned] = $long_name;
	  $long_to_short{$long_name} = $assigned;

	  return $assigned;
	}

	# Pack a list of rule names into a compact string of BER-compressed
	# integers ("w*"), allocating a short index for any name not seen before.
	# Uses less than half the memory of join on ',' and even better compared
	# to Storable::freeze.
	#   $_[0]  - reference to an array of rule names
	# Returns the packed string.
	sub freeze_tests {
	  my ($tests) = @_;

	  my @names = @{$tests};
	  my @short_ids;
	  foreach my $name (@names) {
	    # plain '||': fall back to a fresh index for an unseen name
	    push @short_ids, ($long_to_short{$name} || new_short($name));
	  }

	  return pack("w*", @short_ids);
	}

	# Inverse of freeze_tests(): unpack a frozen string and translate every
	# short index back to its rule name through @short_to_long.
	#   $_[0]  - packed string produced with pack("w*", ...)
	# Returns the list of rule names.
	sub thaw_tests {
	  my ($frozen) = @_;

	  my @short_ids = unpack("w*", $frozen);
	  return map { $short_to_long[$_] } @short_ids;
	}

	# Read the spam log ($opt_spam) and then the ham log ($opt_ham).  For every
	# accepted log line this records whether the message is spam (one bit in
	# $is_spam) and which scored, non-subrule tests hit (a packed string in
	# @tests_hit).  Sets $num_spam, $num_ham and $num_tests.
	# Dies if either log cannot be opened.
	sub readlogs {
	  my $count = 0;
	  $num_spam = $num_ham = 0;

	  # The spam flag travels with the file name instead of being derived by
	  # comparing names, so giving the same path for --spam and --ham does not
	  # classify both passes as spam.
	  foreach my $source ([$opt_spam, 1], [$opt_ham, 0]) {
	    my ($file, $isspam) = @{$source};

	    # three-argument open with a lexical handle: the name comes from the
	    # command line and must not be able to change the open mode
	    open (my $in, '<', $file) or die "Could not open file '$file': $!";

	    while (defined(my $msgline = <$in>)) {
	      # faster log-reading code from hit-frequencies.
	      # the additional split() is for this case:
	      # ". -20 /path time=1112116980,scantime=0,format=f,reuse=no"
	      # in other words, no hits. split(' ') cannot deal with this
	      # correctly, seeing (".", "-20", "/path", "time=...etc"). Work
	      # around this by using a literal / / regexp split to discard
	      # the csv stuff we don't want out of the rest of the line.

	      # $caught is the 1st field of the log line, $rules the 4th
	      my ($caught, undef, $restofline) = split(' ', $msgline, 3);
	      next unless (defined $caught && $caught =~ /^[Y\.]$/ && $restofline);
	      my (undef, $rules) = split(/ /, $restofline, 3);
	      $rules = '' unless defined $rules;   # short line: treat as no hits

	      # get tests, but ignore unknown tests and subrules
	      my @tests;
	      foreach my $r (split(/,/, $rules)) {
	        my $hits = 1;
	        # Support compacted RULE(hitcount) format
	        if ($r =~ s/\((\d+)\)$//) {
	          $hits = $1;
	        }
	        next unless (defined $scores{$r} && !$allrules{$r}->{issubrule});
	        push @tests, $r for (1 .. $hits);
	      }

	      if ($isspam) {
	        $num_spam++;
	        vec($is_spam, $count, 1) = 1;
	      }
	      else {
	        $num_ham++;
	        vec($is_spam, $count, 1) = 0;
	      }

	      # inlined copy of freeze_tests(\@tests), for speed
	      $tests_hit[$count] = pack("w*", map
	                  {
	                    $long_to_short{$_} || new_short($_);
	                  } @tests);

	      # TODO: benchmark using foreach(), map() is often slower

	      $count++;                  # one slot per accepted log line
	    }
	    close $in;
	  }
	  $num_tests = $count;
	}

	# Run ../build/parse-rules-for-masses over the rules directory
	# ($opt_cffile) for the selected scoreset, load the Perl file it writes
	# with require(), and copy %rules into %allrules.
	# Dies if the helper fails or its output cannot be loaded.
	sub readscores {
	  print "Reading scores from \"$opt_cffile\"...\n";
	  my $tmpf = "./tmp/rules$$.pl";

	  # List-form system(): the rules path comes from the command line and is
	  # passed as a single argument without going through a shell.
	  my @cmd = ("../build/parse-rules-for-masses",
	             "-d", $opt_cffile, "-s", $opt_scoreset, "-o", $tmpf);
	  system(@cmd) == 0
	    or die "command '@cmd' failed, wait status $?\n";

	  # Remove the temporary file even when loading it fails.
	  my $loaded = eval { require $tmpf; 1 };
	  my $err = $@;
	  unlink $tmpf;
	  die $err unless $loaded;

	  %allrules = %rules;           # ensure it stays global
	}

	# Assign a C array index to every rule in %scores, then write
	#   tmp/scores.data - one record per non-ignored rule (.index, nNAME,
	#                     bSCORE, mMUTABLE, lLOW, hHIGH), preceded by the
	#                     informational N/M totals, and
	#   tmp/scores.h    - C declarations sized for those rules,
	# and finally call writetests_c() with the largest number of mutable hits
	# seen on one message, plus one.
	# Fills %rule_to_index; may overwrite %range_lo, %range_hi and %scores.
	# Dies if an output file cannot be written.
	sub writescores_c {
	  my $output = '';
	  my $size = 0;
	  my $mutable = 0;

	  # jm: now, score-ranges-from-freqs has tflags to work from, so
	  # it will always list all mutable tests.

	  # non-ignored rules first, mutable ones before immutable, then by name
	  my @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
	                            ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
	                            ($a cmp $b)} (keys %scores);

	  my $max_hits_per_msg = 0;
	  for my $msg (0 .. $num_tests - 1) {
	    my $hits = grep {(! $ignored_rule{$_}) && $mutable_tests{$_}}
	                    (thaw_tests($tests_hit[$msg]));
	    if (($hits + 1) > $max_hits_per_msg) {
	      $max_hits_per_msg = $hits + 1;
	    }
	  }

	  for my $i (0 .. $#index_to_rule) {
	    my $name = $index_to_rule[$i];
	    $rule_to_index{$name} = $i;

	    next if ($ignored_rule{$name});

	    if ($mutable_tests{$name} == 0) {
	      # an immutable rule gets a zero-width range at its current score
	      $range_lo{$name} = $range_hi{$name} = $scores{$name};
	    } else {
	      $mutable++;
	      if ($range_lo{$name} > $range_hi{$name}) {
	        ($range_lo{$name},$range_hi{$name}) =
	         ($range_hi{$name},$range_lo{$name});
	      }
	      #$range_lo{$name} ||= 0.1;
	      #$range_hi{$name} ||= 1.5;

	      # no default score found? start from the middle of the range
	      if ($allrules{$name}->{no_score_found}) {
	        $scores{$name} = ($range_hi{$name} + $range_lo{$name}) / 2.0;
	      }
	    }

	    $output .= ".".$i."\n".
	                "n".$name."\n".
	                "b".$scores{$name}."\n".
	                "m".$mutable_tests{$name}."\n".
	                "l".$range_lo{$name}."\n".
	                "h".$range_hi{$name}."\n";
	    $size++;
	  }

	  open (my $dat, '>', 'tmp/scores.data')
	    or die "Could not write 'tmp/scores.data': $!";
	  print {$dat} "N$size\n", "M$mutable\n", # informational only
	   $output;
	  close $dat or die "Could not close 'tmp/scores.data': $!";

	  open (my $out, '>', 'tmp/scores.h')
	    or die "Could not write 'tmp/scores.h': $!";
	  print {$out} "
	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>

	int num_scores = $size;
	int num_mutable = $mutable;
	unsigned char is_mutable[$size];
	double range_lo[$size];
	double range_hi[$size];
	double bestscores[$size];
	char *score_names[$size];
	double tmp_scores[$size][2];
	unsigned char ny_hit[$mutable];
	unsigned char yn_hit[$mutable];

	double lookup[$mutable];

	/* loadscores() is defined in tests.h */

	";
	  close $out or die "Could not close 'tmp/scores.h': $!";

	  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
	}

	# Collapse messages that have the same spam flag and the same set of
	# indexed, non-ignored hits into one slot each, then write
	#   tmp/tests.h    - C declarations sized for those slots, followed by the
	#                    C source held in this script's DATA section, and
	#   tmp/tests.data - one record per slot: .SLOT, nMUTABLE_HITS, sIS_SPAM,
	#                    tRULE_INDEX for each mutable hit, bBASE_SCORE (sum of
	#                    the immutable hits' scores) and cCOUNT (messages that
	#                    collapsed into the slot).
	#   $_[0]  - second dimension used for the C tests_hit array
	# Dies if an output file cannot be written, or if a message has as many
	# mutable hits as $_[0].
	sub writetests_c {
	  my ($max_hits_per_msg) = @_;

	  my %uniq_files;     # first message index of each slot => slot number
	  my %count_keys;     # dedup key => number of messages sharing it
	  my %file_key;       # first message index of each slot => dedup key

	  for my $file (0 .. $num_tests - 1) {
	    my @good_tests =
	     grep {length($_) && (! $ignored_rule{$_}) &&
	           (defined($rule_to_index{$_}))} (thaw_tests($tests_hit[$file]));

	    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));

	    my $uniq_key = vec($is_spam, $file, 1) . " " . join(" ",@good_tests);

	    if (exists($count_keys{$uniq_key})) {
	      $count_keys{$uniq_key}++;
	    } else {
	      $count_keys{$uniq_key} = 1;
	      $file_key{$file} = $uniq_key;
	      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
	    }
	  }

	  my $num_nondup = scalar(keys(%uniq_files));

	  open (my $top, '>', 'tmp/tests.h')
	    or die "Could not write 'tmp/tests.h': $!";
	  print {$top} "
	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>

	int num_tests = $num_tests;
	int num_nondup = $num_nondup;
	int num_spam = $num_spam;
	int num_ham = $num_ham;
	int max_hits_per_msg = $max_hits_per_msg;
	unsigned char num_tests_hit[$num_nondup];
	unsigned char is_spam[$num_nondup];
	unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
	double scores[$num_nondup];
	double tmp_total[$num_nondup];
	int tests_count[$num_nondup];

	";
	  # append the C loader functions without disturbing the global $_
	  print {$top} join ('', <DATA>);
	  close $top or die "Could not close 'tmp/tests.h': $!";

	  open (my $dat, '>', 'tmp/tests.data')
	    or die "Could not write 'tmp/tests.data': $!";

	  foreach my $file (sort {$a <=> $b} (keys %uniq_files)) {
	    print {$dat} ".".$uniq_files{$file}."\n";

	    my $out = "s".vec($is_spam, $file, 1)."\n";

	    my $base_score = 0;
	    my $num_tests_hit = 0;
	    foreach my $test (thaw_tests($tests_hit[$file])) {
	      next if ($test eq '');

	      # this is not a log-worthy event anymore, since we have a lot
	      # of T_ test rules that are ignored during perceptron runs
	      next if ($ignored_rule{$test});

	      if (!defined $rule_to_index{$test}) {
	        warn "test with no C index: $test\n";
	        next;
	      }

	      if ($mutable_tests{$test}) {
	        $num_tests_hit++;
	        $out .= "t".$rule_to_index{$test}."\n";

	        if ($num_tests_hit >= $max_hits_per_msg) {
	          die "Need to increase \$max_hits_per_msg";
	        }
	      } else {
	        $base_score += $scores{$test};
	      }
	    }

	    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
	    $out .= "c" . $count_keys{$file_key{$file}} . "\n";

	    print {$dat} "n".$num_tests_hit."\n".$out;
	  }
	  close $dat or die "Could not close 'tmp/tests.data': $!";
	}

	# Read per-rule score ranges and mutability from tmp/ranges.data and
	# decide, for every rule, whether it is ignored, immutable or mutable.
	# Fills %range_lo, %range_hi, %ignored_rule and %mutable_tests, and
	# adjusts %scores so that each non-ignored rule listed in the ranges file
	# starts with a score that agrees with its range.
	# Dies if tmp/ranges.data is missing or cannot be opened.
	sub read_ranges {
	  if (!-f 'tmp/ranges.data') {
	    die "need to make 'tmp/ranges.data' first";
	  }

	  # True if the rule's tflags contain the given word.  A rule without
	  # tflags simply has no flags; this avoids matching against undef.
	  my $has_tflag = sub {
	    my ($rule, $flag) = @_;
	    my $info = $allrules{$rule};
	    return 0 unless (ref $info && defined $info->{tflags});
	    return ($info->{tflags} =~ m/\b\Q$flag\E\b/i) ? 1 : 0;
	  };

	  # A 'nice' rule whose score is exactly 1 (or 0.01 for a T_ rule) gets
	  # the sign flipped; presumably those values are unset defaults.
	  my $flip_nice_default = sub {
	    my ($rule) = @_;
	    return unless $has_tflag->($rule, 'nice');
	    if ($scores{$rule} == 1) {
	      $scores{$rule} = -1;
	    } elsif (($scores{$rule} == 0.01) && ($rule =~ m/^T_/)) {
	      $scores{$rule} = -0.01;
	    }
	  };

	  # read ranges, and mutableness, from ranges.data.
	  open (my $in, '<', 'tmp/ranges.data')
	    or die "need to run score-ranges-from-freqs first!";

	  while (defined(my $line = <$in>)) {
	    # line format: "<low> <high> <mutable flag> <rule name>"
	    my ($lo, $hi, $mut, $t) = ($line =~ /^(\S+) (\S+) (\d+) (\S+)$/)
	      or next;
	    $range_lo{$t} = $lo+0;
	    $range_hi{$t} = $hi+0;
	    $mut += 0;

	    if ($allrules{$t}->{issubrule}) {
	      # warn "$t: ignoring, is sub-rule\n"; # no need to warn
	      $ignored_rule{$t} = 1;
	      $mutable_tests{$t} = 0;
	      next;
	    }
	    if ($t =~ /^T_/) {
	      # warn "$t: ignoring, is T_ test rule\n"; # no need to warn
	      $ignored_rule{$t} = 1;
	      $mutable_tests{$t} = 0;
	      $range_lo{$t} = 0.01; # clamp to insignificant range
	      $range_hi{$t} = 0.01;
	      next;
	    }
	    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
	      warn "$t: ignoring, score and range == 0\n";
	      $ignored_rule{$t} = 1;
	      $mutable_tests{$t} = 0;
	      next;
	    }

	    $ignored_rule{$t} = 0;

	    # mutable only if flagged so, the range has width, and not userconf
	    if (!$mut) {
	      $mutable_tests{$t} = 0;
	    } elsif ($range_lo{$t} == $range_hi{$t}) {
	      $mutable_tests{$t} = 0;
	    } elsif ($has_tflag->($t, 'userconf')) {
	      $mutable_tests{$t} = 0;
	    } else {
	      $mutable_tests{$t} = 1;
	    }
	    unless ($mutable_tests{$t} || $scores{$t}) {
	      warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
	      $ignored_rule{$t} = 1;
	    }
	  }
	  close $in;

	  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
	  foreach my $t (sort keys %allrules) {
	    next if ($t eq '_scoreset');
	    next if (exists($range_lo{$t}));

	    if ($allrules{$t}->{issubrule}) {
	      if (!$ignored_rule{$t}) {
	        # warn "$t: ignoring, is sub-rule\n"; # no need to warn here
	        $ignored_rule{$t} = 1;
	      }
	      $mutable_tests{$t} = 0;
	      next;
	    }
	    if ($t =~ /^T_/) {
	      if (!$ignored_rule{$t}) {
	        # warn "$t: ignoring, is T_ test rule\n"; # no need to warn here
	        $ignored_rule{$t} = 1;
	        $range_lo{$t} = 0.01; # clamp to insignificant range
	        $range_hi{$t} = 0.01;
	      }
	      $mutable_tests{$t} = 0;
	      next;
	    }
	    $ignored_rule{$t} = 0;
	    unless (exists($mutable_tests{$t}) && !$has_tflag->($t, 'userconf')) {
	      $mutable_tests{$t} = 0;
	    }
	    unless ($mutable_tests{$t} || $scores{$t}) {
	      if (!$ignored_rule{$t}) {
	        warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
	        $ignored_rule{$t} = 1;
	      }
	    }
	  }

	  # make the starting scores agree with the ranges
	  foreach my $t (keys %range_lo) {
	    next if ($ignored_rule{$t});
	    if ($mutable_tests{$t}) {
	      $flip_nice_default->($t);
	      # a mutable score must start strictly inside its range
	      if ($scores{$t} >= $range_hi{$t}) {
	        $scores{$t} = $range_hi{$t} - 0.001;
	      } elsif ($scores{$t} <= $range_lo{$t}) {
	        $scores{$t} = $range_lo{$t} + 0.001;
	      }
	    } else {
	      # userconf rules keep whatever score they have
	      next if $has_tflag->($t, 'userconf');
	      if ($range_lo{$t} == $range_hi{$t}) {
	        $scores{$t} = $range_lo{$t};
	        next;
	      }
	      $flip_nice_default->($t);
	      # an immutable score is clamped onto the range, ends included
	      if ($scores{$t} > $range_hi{$t}) {
	        $scores{$t} = $range_hi{$t};
	      } elsif ($scores{$t} < $range_lo{$t}) {
	        $scores{$t} = $range_lo{$t};
	      }
	    }
	  }
	}


	__DATA__

	void loadtests (void) {
	FILE *fin = fopen ("tmp/tests.data", "r");
	char buf[256];
	int file = 0;
	int tnum = 0;

	while (fgets (buf, 255, fin) != NULL) {
	char cmd;
	long arg;
	float argd;

	cmd = (char) *buf;
	arg = strtol (buf+1, NULL, 10);
	argd = (float)strtod (buf+1, NULL);

	if (cmd == '.') {
	file = arg;

	} else if (cmd == 'n') {
	tnum = 0;
	num_tests_hit[file] = arg;

	} else if (cmd == 's') {
	is_spam[file] = arg;

	} else if (cmd == 'b') {
	scores[file] = argd;

	} else if (cmd == 't') {
	tests_hit[file][tnum] = arg; tnum++;

	} else if (cmd == 'c') {
	tests_count[file] = arg;

	}
	}
	fclose(fin);

	printf ("Read test results for %d messages (%d total).\n", file+1,
	num_tests);
	}

	/*
	 * loadscores: read tmp/scores.data into the per-rule arrays declared in
	 * scores.h.  Every line is a one-character command followed by a value:
	 *   .N  select rule index N       nNAME  rule name
	 *   bX  current score             mN     mutable flag
	 *   lX  low end of the range      hX     high end of the range
	 * Lines starting with any other character are skipped.
	 * Exits with a diagnostic if the file cannot be opened or if a rule index
	 * lies outside the arrays.
	 */
	void loadscores (void) {
	  FILE *fin = fopen ("tmp/scores.data", "r");
	  char buf[256];
	  int snum = 0;

	  if (fin == NULL) {
	    perror ("tmp/scores.data");
	    exit (1);
	  }

	  while (fgets (buf, sizeof (buf), fin) != NULL) {
	    char cmd = buf[0];
	    long arg = strtol (buf+1, NULL, 10);
	    float argd = (float)strtod (buf+1, NULL);
	    char *str = buf+1;          /* must be a pointer into buf */

	    /* cut the value at the line end so a name is stored without it */
	    str[strcspn (str, "\n")] = '\0';

	    if (cmd == '.') {
	      if (arg < 0 || arg >= num_scores) {
	        fprintf (stderr, "tmp/scores.data: rule index %ld out of range\n",
	                 arg);
	        exit (1);
	      }
	      snum = (int) arg;

	    } else if (cmd == 'b') {
	      bestscores[snum] = argd;

	    } else if (cmd == 'l') {
	      range_lo[snum] = argd;

	    } else if (cmd == 'h') {
	      range_hi[snum] = argd;

	    } else if (cmd == 'n') {
	      score_names[snum] = strdup (str); /* never freed */

	    } else if (cmd == 'm') {
	      is_mutable[snum] = arg;
	    }
	  }
	  fclose(fin);

	  printf ("Read scores for %d tests.\n", num_scores);
	}