masses/mass-check - spamassassin - Git at Google

 #!/usr/bin/perl -w
 use strict;

 #
 # <@LICENSE>
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to you under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>

 sub aidbg;

 sub usage {
   my $status = shift;

   my $out = $status ? \*STDERR : \*STDOUT;
   print $out <<EOF;
 usage: mass-check [options] target ...

   -c=file       set configuration/rules directory
   -p=dir        set user-prefs directory
   -f=file       read list of targets from <file>
   -j=jobs       specify the number of processes to run simultaneously
   --net         turn on network checks!
   --mid         report Message-ID from each message
   --debug=LIST  report debugging information (default is all facilities, LIST
                 is a comma-separated list of facilities)
   --progress    show progress updates during check
   --noisy       show noisier progress updates during check
   --showdots    print a dot for each scanned message
   --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
   --rules=RE    Only test rules matching the given regexp RE
   --restart=N   restart all of the children after processing N messages
   --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
                 were encapsulated by servers matching the regexp RE
                 (default = extract all SpamAssassin-encapsulated mails)
   --lint        check rules for syntax before running
   --cf='config line'  Additional line of configuration

   client/server mode options
   --server host:port
                 use server mode, running on the given hostname and port
   --client host:port
   		use client mode, connecting to the given hostname and port
   --cs_max N
   		at most, only ever request (client)/give out (server) a
 		maximum of N messages (defaults to 1000)
   --cs_timeout N
   		in client mode, try to connect to the server every N seconds
 		defaults to 120
 		in server mode, timeout messages after N seconds
 		defaults to 300
   --cs_paths_only
 		only used in client mode.  when making requests of the
 		server, only ask for paths to the messages and not the
 		messages themselves.  useful when the client and server
 		have the same paths to the corpus data.

   log options
   -o            write all logs to stdout
   --loghits     log the text hit for patterns (useful for debugging)
   --loguris	log the URIs found
   --logmem	log the memory delta (only on Linux)
   --hamlog=log  use <log> as ham log ('ham.log' is default)
   --spamlog=log use <log> as spam log ('spam.log' is default)

   message selection options
   -n            no date sorting or spam/ham interleaving
   --cache	use cache information when selecting messages
   --cachedir=dir write cache info for --cache in this directory tree
   --all         don't skip big messages

   message selection options, can be specified for each target
   --after=N     only test mails received after time_t N (negative values
                 are an offset from current time, e.g. -86400 = last day)
                 or after date as parsed by Time::ParseDate (e.g. '-6 months')
   --before=N    same as --after, except received times are before time_t N
   --scanprob=N  probability of scanning a message, range 0.0 - 1.0 (default: 1.0)

   message selection options, can be specified for each target class
   --head=N      only check first N ham and N spam (N messages if -n used)
   --tail=N      only check last N ham and N spam (N messages if -n used)

   simple target options (implies -o and no ham/spam classification)
   --dir         subsequent targets are directories
   --file        subsequent targets are files in RFC 822 format
   --mbox        subsequent targets are mbox files
   --mbx         subsequent targets are mbx files

   Just left over functions we should remove at some point:
   --bayes       report score from Bayesian classifier

   options used during score generation process
   --learn=N     learn N% of messages as spam or ham
   --reuse       reuse network checks if X-Spam-Status: is present in messages

   non-option arguments are used as target names (mail files and folders),
   the target format is: <class>:<format>:<location>
   <class>       is "spam" or "ham"
   <format>      is "dir", "file", "mbx", "mbox", or "detect"
                 (see 'perldoc Mail::SpamAssassin::ArchiveIterator)
   <location>    is a file or directory name.  globbing of ~ and * is supported

 EOF
   exit($status);
 }

 ###########################################################################

 use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes
 	    $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits
 	    $opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots
 	    $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris
 	    $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap
 	    $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $opt_cf
 	    $total_messages $statusevery $opt_cachedir $opt_scanprob
 	    $opt_client $opt_cs_max $opt_cs_timeout $opt_cs_paths_only
 	    $opt_server %postdata %real $svn_revision
 	    $tmpfd %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p);

 use FindBin;
 use lib "$FindBin::Bin/../lib";
 eval "use bytes";
 use IO::Select;
 use IO::Socket;
 use Mail::SpamAssassin::ArchiveIterator;
 use Mail::SpamAssassin;
 use Mail::SpamAssassin::Logger;
 use File::Copy;
 use File::Spec;
 use Getopt::Long;
 use POSIX qw(strftime);
 use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
 use constant HAS_IO_ZLIB => eval { require IO::Zlib; };
 use Config;

 # default settings
 $opt_c = "$FindBin::Bin/../rules";
 $opt_p = "$FindBin::Bin/spamassassin";
 $opt_j = 1;
 $opt_head = 0;
 $opt_tail = 0;
 $opt_net = 0;
 $opt_hamlog = "ham.log";
 $opt_spamlog = "spam.log";
 $opt_learn = 0;
 $reuse_rules_loaded_p = 0;
 $opt_cf = [];

 my @ORIG_ARGV = @ARGV;
 GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s",
 	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
 	   "progress!", "rewrite:s", "showdots", "spamlog=s", "tail=i",
 	   "rules=s", "restart=i", "loguris",
 	   "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache",
            "cachedir=s", "noisy", "scanprob=f",
 	   "server=s", "cs_max=i", "cs_timeout=i", "cs_paths_only",
 	   "client=s",
 	   "before=s" => \&deal_with_before_after,
 	   "after=s" => \&deal_with_before_after,
            'cf=s' => \@{$opt_cf},
 	   "dir" => sub { $opt_format = "dir"; },
 	   "file" => sub { $opt_format = "file"; },
 	   "mbox" => sub { $opt_format = "mbox"; },
 	   "mbx" => sub { $opt_format = "mbx"; },
 	   "help" => sub { usage(0); },
 	   '<>' => \&target) or usage(1);

 # We need IO::Zlib for client-server mode!
 if ( ($opt_client || $opt_server) && ! HAS_IO_ZLIB ) {
   die "IO::Zlib required for client/server mode!\n";
 }

 # rules.pl is for the --reuse option, score set doesn't matter
 if ($opt_reuse) {
   my $rules_path = "$FindBin::Bin/tmp/rules.pl";
   if (! -f $rules_path) {
     # some people specify paths relatively, whereas this needs an absolute path,
     # so "do the right thing"(tm).
     my $abs_opt_c = File::Spec->rel2abs($opt_c);
     system("cd $FindBin::Bin; perl ../build/parse-rules-for-masses -d $abs_opt_c");
   }

   require $rules_path;
 }

 if ($opt_noisy) {
   $opt_progress = 1;        # implies --progress
 }

 $opt_debug ||= 'all' if defined $opt_debug;

 my $user_prefs = "$opt_p/user_prefs";

 # --lint
 # In theory we could probably use the same spamtest object as below,
 # but since it's probably not expecting that, and we don't want
 # strange things happening, create a local object.
 if ($opt_lint) {
   my $spamlint = new Mail::SpamAssassin ({
     'debug'              			=> $opt_debug,
     'rules_filename'     			=> $opt_c,
     'userprefs_filename' 			=> $user_prefs,
     'site_rules_filename'			=> "$opt_p/local.cf",
     'userstate_dir'     			=> "$opt_p",
     'save_pattern_hits'  			=> $opt_loghits,
     'dont_copy_prefs'   			=> 1,
     'local_tests_only'   			=> $opt_net ? 0 : 1,
     'only_these_rules'   			=> $opt_rules,
     'ignore_safety_expire_timeout'		=> 1,
     'post_config_text'                          => join("\n", @{$opt_cf})."\n",
     PREFIX					=> '',
     DEF_RULES_DIR        			=> $opt_c,
     LOCAL_RULES_DIR      			=> '',
   });

   $spamlint->debug_diagnostics();
   my $res = $spamlint->lint_rules();
   $spamlint->finish();
   warn "lint: $res issues detected, please rerun with debug enabled for more information\n" if ($res);
   exit 1 if $res;
 }

 # test messages for the mass-check
 my @targets;
 if (!$opt_client) {
   if ($opt_f) {
     open(F, $opt_f) || die "cannot read target $opt_f: $!";
     push(@targets, map { chomp; $_ } <F>);
     close(F);
   }
   usage(1) if !@targets;
 }

 my $spamtest = new Mail::SpamAssassin ({
   'debug'              			=> $opt_debug,
   'rules_filename'     			=> $opt_c,
   'userprefs_filename' 			=> $user_prefs,
   'site_rules_filename'			=> "$opt_p/local.cf",
   'userstate_dir'     			=> "$opt_p",
   'save_pattern_hits'  			=> $opt_loghits,
   'dont_copy_prefs'   			=> 1,
   'local_tests_only'   			=> $opt_net ? 0 : 1,
   'only_these_rules'   			=> $opt_rules,
   'ignore_safety_expire_timeout'	=> 1,
   'post_config_text'                    => join("\n", @{$opt_cf})."\n",
   PREFIX				=> '',
   DEF_RULES_DIR        			=> $opt_c,
   LOCAL_RULES_DIR      			=> '',
 });

 $spamtest->compile_now(1);
 $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");

 # generated user_prefs
 if ($opt_reuse) {
   # copy current prefs if it exists
   $spamtest->copy_config(undef, \%orig_conf);

   # zeroed scores to mass_prefs
   my @zero = sort grep { defined $reuse{$_}->{skip} } keys %reuse;
   open(PREFS, ">> $opt_p/mass_prefs") || die "Unable to open $opt_p/mass_prefs: $!\nNeeded for --reuse to work properly";
   for my $zero (@zero) {
     print PREFS "score $zero 0\n";
   }
   close(PREFS);

   $spamtest->read_scoreonly_config("$opt_p/mass_prefs");

   $spamtest->copy_config(undef, \%reuse_conf);
   $reuse_rules_loaded_p = 1;
 }

 my $who = `id -un 2>/dev/null`;
 my $where = `uname -n 2>/dev/null`;
 my $when = `date -u`;
 my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
 chomp $who;
 chomp $where;
 chomp $when;
 chomp $host;
 $svn_revision = get_current_svn_revision();

 # when displaying the commandline, quote any arguments which have
 # "questionable" characters such as spaces, pipes, etc.
 my $cmdline = join(' ',map { m@[^A-Za-z0-9_/\\.-]@ ? qq/"$_"/ : $_ } @ORIG_ARGV); $cmdline =~ s/\s+/ /gs;
 my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better

 my $log_header = "# mass-check results from $who\@$where, on $when\n" .
 		 "# M:SA version ".$spamtest->Version()."\n" .
 		 "# SVN revision: $svn_revision\n" .
                  "# Date: $isowhen\n" .
 		 "# Perl version: $] on $Config{archname}\n" .
                  "# Switches: '$cmdline'\n";

 my $updates = ($opt_noisy ? 100 : 10);
 my $total_count = 0;
 my $spam_count = 0;
 my $ham_count = 0;
 my $init_results = 0;

 my $showdots_active = ($opt_showdots || $opt_noisy);
 my $showdots_counter = 0;
 my $showdots_every = ($opt_showdots ? 1 : 20);

 my $AIopts = {
 	'opt_all' => $opt_all,
 	};

 if (!$opt_client) {
   # Deal with --rewrite
   if (defined $opt_rewrite) {
     my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out");
     open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!";
   }

   # ArchiveIterator options for non-client mode
   $AIopts->{'opt_scanprob'} = $opt_scanprob;
   $AIopts->{'opt_cache'} = $opt_cache;
   $AIopts->{'opt_cachedir'} = $opt_cachedir;
   $AIopts->{'opt_after'} = $opt_after;
   $AIopts->{'opt_before'} = $opt_before;
   $AIopts->{'scan_progress_sub'} = \&showdots_blip;
   $AIopts->{'opt_want_date'} = ! $opt_n;

   # ensure that scanprob stuff is predictable and reproducable
   if (defined $opt_scanprob && $opt_scanprob < 1.0) {
     srand(1);
   }
 }
 else {
   # ArchiveIterator options for client mode -- tends to be simple
   $opt_n = 1;
   $AIopts->{'opt_want_date'} = 0;
 }

 ###########################################################################
 ## SCAN MODE

 my $iter = new Mail::SpamAssassin::ArchiveIterator($AIopts);

 # setup the AI functions
 if ($opt_client) {
   $iter->set_functions(\&wanted, \&result_client);
 }
 elsif ($opt_server) {
   $iter->set_functions(\&wanted_server, \&result);
 }
 else {
   $iter->set_functions(\&wanted, \&result);
 }

 my $messages;

 # normal mode as well as a server do scan mode and get a temp file
 if (!$opt_client) {
   status('starting scan stage') if ($opt_progress);

   # Make a temp file and delete it
   my $tmpf;
   ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile();
   die 'mass-check: failed to create temp file' unless $tmpf;
   unlink $tmpf or die "mass-check: unlink '$tmpf': $!";

   # having opt_j or server mode means do scan in a separate process
   if ($opt_server || $opt_j) {
     if ($tmpf = fork()) {
       # parent
       waitpid($tmpf, 0);
     }
     elsif (defined $tmpf) {
       # child -- process using message_array
       generate_queue(\@targets, $tmpfd);
       exit;
     }
     else {
       die "mass-check: cannot fork: $!";
     }
   }
   else {
     # we get here if opt_j == 0, so scan in this process
     generate_queue(\@targets, $tmpfd);
   }

   # we now have a temporary file with the messages to process
   seek($tmpfd, 0, 0);
   # the first line is the number of messages
   $total_messages = read_line($tmpfd);

   if (!$total_messages) {
     die "mass-check: no messages to process\n";
   }

   status("completed scan stage, $total_messages messages") if ($opt_progress);
 }

 ###########################################################################
 ## RUN MODE

 if ($opt_client) {
   client_mode();
 }
 else {
   status('starting run stage') if ($opt_progress);

   if ($opt_server) {
     server_mode();
   }
   else {
     run_through_messages();
   }

   status('completed run stage') if ($opt_progress);
 }

 # Even though we're about to exit, let's clean up after ourselves
 close($tmpfd) if ($tmpfd);
 showdots_finish();

 if (defined $opt_rewrite) {
   close(REWRITE);
 }

 $spamtest->finish();

 # exit status: did we check at least one message correctly?
 exit(!($ham_count || $spam_count));

 ###########################################################################

 sub target  {
   my ($target) = @_;

   # message-selection options; these can now be specified separately
   # for each target
   my %selopts = (
     opt_scanprob => $opt_scanprob,
     opt_after => $opt_after,
     opt_before => $opt_before
   );

   if (!defined($opt_format)) {
     push(@targets, { %selopts, target => $target });
   }
   else {
     $opt_o = 1;
     push(@targets, { %selopts, target => "spam:$opt_format:$target" });
   }
 }

 ###########################################################################

 sub init_results {
   $init_results = 1;

   showdots_finish();

   # now, showdots only happens if --showdots was used
   $showdots_active = $opt_showdots;

   if ($opt_progress) {
     # round up since 100% will be caught at end already
     $statusevery = int($total_messages / $updates + 1);

     # if $messages < $updates, just give a status line per msg.
     $statusevery ||= 1;
   }

   return if $opt_client;

   if ($opt_o) {
     autoflush STDOUT 1;
     print STDOUT $log_header;
   }
   else {
     open(HAM, "> $opt_hamlog") || die "open of $opt_hamlog failed: $!";
     open(SPAM, "> $opt_spamlog") || die "open of $opt_spamlog failed: $!";
     autoflush HAM 1;
     autoflush SPAM 1;
     print HAM $log_header;
     print SPAM $log_header;
   }
 }

 sub result {
   my ($class, $result, $time) = @_;

   # don't open results files until we get here to avoid overwriting files
   init_results() if !$init_results;

   if ($class eq "s") {
     if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
     $spam_count++;
   }
   elsif ($class eq "h") {
     if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
     $ham_count++;
   }

   $total_count++;

   if ($opt_progress) {
     progress($time);
   }
 }

 sub wanted {
   my ($class, $id, $time, $dataref, $format) = @_;
   my $out = '';

   # if origid is defined, it'll be the message number from server mode
   my $origid;

   # client mode is a little crazy because we need to kluge around the fact
   # that the information needed to do the run is different than the
   # information that goes into the results.
   if ($opt_client) {
     if ($opt_cs_paths_only) {
       # the server message number
       $origid = $real{$id};
     }
     else {
       # if we're a non-paths_only client, change the format and id to the real
       # version, make sure to remember the server's message number
       $origid=$id;
       $origid =~ s/^.+?(\d+)$/$1/;
       $format = $real{$id}->[2];
       $id = $real{$id}->[3];
     }
   }

   memory_track_start() if ($opt_logmem);

   # parse the message, and force it to complete
   my $ma = $spamtest->parse($dataref, 1);

   # remove SpamAssassin markup, if present and the mail was spam
   my $header = $ma->get_header("Received");
   my $x_spam_status;
   if ($opt_reuse) {
     # get X-Spam-Status: header for rule hit resue
     $x_spam_status = $ma->get_header("X-Spam-Status");
   }
   # previous hits
   my @previous;
   if ($x_spam_status) {
     $x_spam_status =~ s/,\s+/,/gs;
     if ($x_spam_status =~ m/tests=(\S*)/
         && $x_spam_status !~ /\bshortcircuit=(?:ham|spam|default)\b/)
     {
       push @previous, split(/,/, $1);

       # we found previous tests, so move the reuse config into place
       unless ($reuse_rules_loaded_p) {
 	$spamtest->copy_config(\%reuse_conf, undef);
 	$reuse_rules_loaded_p = 1;
       }
     }
   }
   elsif ($opt_reuse) {
     if ($reuse_rules_loaded_p) {
       $spamtest->copy_config(\%orig_conf, undef);
       $reuse_rules_loaded_p = 0;
     }
   }

   if ($header && $header =~ /\bwith SpamAssassin\b/) {
     if (!$opt_deencap || message_should_be_deencapped($ma)) {
       my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
       $ma->finish();
       $ma = $new_ma;
     }
   }

   # plugin hook to cause us to skip messages
   my $skip = $spamtest->call_plugins("mass_check_skip_message", {
         class => $class,
         'time' => $time,
         'id' => $id,
         msg => $ma
       });
   return if $skip;

   # log-uris support
   my $status;
   my @uris;
   my $before;
   my $after;
   if ($opt_loguris) {
     my $pms = Mail::SpamAssassin::PerMsgStatus->new($spamtest, $ma);
     @uris = $pms->get_uri_list();
     $pms->finish();

   } else {
     $before = time;
     $status = $spamtest->check($ma);
     $after = time;
   }

   my @extra;

   # sample-based learning
   if ($opt_learn > 0) {
     my $spam;
     # spam learned as ham = 0.05%
     if ($class eq 's' && rand(100) < 0.05) {
       $spam = 0;
     }
     # ham learned as spam = 0.01%
     elsif ($class eq 'h' && rand(100) < 0.01) {
       $spam = 1;
     }
     # spam/ham learned correctly
     elsif (rand(100) < $opt_learn) {
       if ($class eq 's') {
 	$spam = 1;
       }
       elsif ($class eq 'h') {
 	$spam = 0;
       }
       else {
 	die "unknown class, learning failed";
       }
     }
     if (defined $spam) {
       my $result = ($spam ? "spam" : "ham");
       my $status = $spamtest->learn($ma, undef, $spam, 0);
       my $learned = $status->did_learn();
       $result = "undef" if !defined $learned;
       push(@extra, "learn=".$result);
     }
   }

   if (defined($time)) {
     push(@extra, "time=".$time);
   }
   if ($status && defined $status->{bayes_score}) {
     push(@extra, "bayes=".sprintf("%06f", $status->{bayes_score}));
   }
   if ($opt_mid) {
     my $mid = $ma->get_header("Message-Id");
     if ($mid) {     # message contains a Message-Id:
       while($mid =~ s/\([^\(\)]*\)//s) {};   # remove comments and
       $mid =~ s/^\s+|\s+$//sg;               # leading and trailing spaces
       $mid =~ s/\s.*$//s;                    # keep only the first token
     }
     else {          # it doesn't have a Message-Id:
       $mid = $id;             # so build one from the id
       $mid =~ s,^.*/,,;       # remove the path
       $mid = "<$mid\@$host.masses.spamassassin.org>";  # and put it together
     }
     $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c;     # replace dangerous chars with . (so regexp search just works)
     push(@extra, "mid=$mid");
   }
   push(@extra, "scantime=" . ($after - $before));
   push(@extra, "format=$format");

   if ($opt_logmem) {
     my $mem = memory_track_finish();
     if ($mem) {
       push(@extra, $mem);
     }
   }

   if ($reuse_rules_loaded_p) {
     push(@extra, "reuse=yes");
   }
   else {
     push(@extra, "reuse=no");
   }

   if ($opt_client) {
     push(@extra, "host=$where");
   }

   my $yorn;
   my $score;
   my $tests;
   my $extra;

   if ($opt_loguris) {
     $yorn = '.';
     $score = 0;
     $tests = join(" ", sort @uris);
     $extra = '';
   } else {
     $yorn = $status->is_spam() ? 'Y' : '.';
     # don't bother adjusting scores for reuse
     $score = $status->get_score();
     # list of tests hit
     my @tests;
     push @tests, split(/,/, $status->get_names_of_tests_hit());
     push @tests, split(/,/, $status->get_names_of_subtests_hit());
     # hit reuse
     if ($x_spam_status) {
       # generate mapping of hits to remove hits that are marked as skip
       @tests = grep { !$reuse{$_}->{skip} } @tests;
       # add hits from previous
       for (@previous) {
 	push(@tests, $reuse{$_}->{reuse}) if $reuse{$_}->{reuse};
       }
     }
     $tests = join(",", sort(@tests));
     $extra = join(",", @extra);
   }

   if (defined $opt_rewrite) {
     print REWRITE $status->rewrite_mail();
   }

   $id =~ s/\s/_/g;

   # if we have an origid set, it'll be the server mode's message number, so
   # attach it to our result appropriately.
   if (defined $origid) {
     $out = "$origid ";
   }

   $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);

   if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
     $out .= logkilled($ma, $id, "possible virus");
   }

   if ($opt_loghits) {
     my $log = '';
     foreach my $t (sort keys %{$status->{pattern_hits}}) {
       $_ = $status->{pattern_hits}->{$t};
       $_ ||= '';
       s/\r/\\r/gs;      # fix unprintables
       s/\n/\\n/gs;
       $log .= "$t=\"$_\" ";
     }
     if ($log) {
       chomp $log;
       $out .= "# $log\n";
     }
   }

   if (defined $status) { $status->finish(); }
   $ma->finish();
   undef $ma;		# clean 'em up
   undef $status;

   showdots_blip();
 #  print ">>>> out = $out\n";
   return $out;
 }

 sub showdots_blip {
   return unless ($showdots_active);

   $showdots_counter++;
   if ($showdots_counter % $showdots_every == 0) {
     print STDERR '.';
     if ($showdots_counter % (60 * $showdots_every) == 0) {
       print STDERR "\n";
     }
   }
 }

 sub showdots_finish {
   print STDERR "\n" if ($showdots_active);
   $showdots_counter = 0;
 }

 # ick.  We have to go grovelling through the body parts to see if a message
 # is a report_safe-marked-up message, because a local scanner will overwrite
 # any remote scanner's X-Spam-Checker-Version header.
 #
 sub message_should_be_deencapped {
   my ($ma) = @_;

   # not sure why this is undefined, but it is sometimes
   if (defined $ma->{body_parts} && scalar @{$ma->{body_parts}} > 0) {
     my $firstpart = $ma->{body_parts}->[0];
     if (!$firstpart->{headers}->{'content-type'}
         || $firstpart->{headers}->{'content-type'} ne 'text/plain')
     {
       return 0;     # not a 'report_safe' encapsulation
     }

     if (scalar @{$firstpart->{raw}} < 3) { return 0; } # too short to be a report

     # grab first 2 lines
     my $text = $firstpart->{raw}->[0] . $firstpart->{raw}->[1];
     $text =~ s/\s+/ /gs;
     if ($text =~ /^Spam detection software, running on the system \"(\S+)\"/) {
       my $hname = $1;
       if ($hname =~ /$opt_deencap/io) {
         return 1;
       }
     }
   }

   return 0;     # a different host marked it up.  pass it through!
 }

 sub logkilled {
   my ($ma, $id, $reason) = @_;

   my $from = $ma->get_header("From")       || 'undef';
   my $to   = $ma->get_header("To")         || 'undef';
   my $subj = $ma->get_header("Subject")    || 'undef';
   my $mid  = $ma->get_header("Message-Id") || 'undef';
   chomp ($from);
   chomp ($to);
   chomp ($subj);
   chomp ($mid);
   return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n";
 }

 sub progress {
   my ($time) = @_;
   $time ||= 0;

   # Are we at the end or otherwise at a point we should print status?  Then do it.
   if ($total_messages == $total_count || $total_count % $statusevery == 0) {
     my $time = strftime("%Y-%m-%d", localtime($time));
     status(sprintf("%3d%% ham: %-6d spam: %-6d date: %s",
 	int(($total_count / $total_messages) * 100), $ham_count, $spam_count, $time));
   }
 }

 sub status {
   my($str) = @_;
   my $now = strftime("%Y-%m-%d %X", localtime(time));
   printf STDERR "status: %-48s now: %s\n", $str, $now;
 }

 ###########################################################################

 our ($mem_size, $mem_rss, $mem_shared);

 sub memory_track_start {
   if ($^O =~ /linux/i) {
     if (open (IN, "</proc/$$/statm")) {
       my $statm = <IN>;
       close IN;
       if ($statm =~ /^(\d+) (\d+) (\d+) /) {
         $mem_size = $1;
         $mem_rss = $2;
         $mem_shared = $3;
       }
     }
   }
 }

 sub memory_track_finish {
   my $str = '';

   if ($^O =~ /linux/i) {
     if (open (IN, "</proc/$$/statm")) {
       my $statm = <IN>;
       close IN;
       if ($statm =~ /^(\d+) (\d+) (\d+) /) {
         my $size = $1;
         my $rss = $2;
         my $shared = $3;

         $str = sprintf ("memsz=%d,memrss=%d,memshr=%d",
                 ($size - $mem_size),
                 ($rss - $mem_rss),
                 ($shared - $mem_shared));
       }
     }
   }
   return $str;
 }

 sub get_current_svn_revision {
   my $revision;

   # this is usually "${TOPDIR}/masses"
   my $dir = $FindBin::Bin || ".";

   if (-d "$dir/.svn" || -f "$dir/svninfo.tmp") {
     if (-f "$dir/svninfo.tmp") {
       # created by build/automc/buildbot_ready for chrooted mass-checks
       open (SVNINFO, "< $dir/svninfo.tmp");
     }
     else {
       # note, ".." since we want to pick up changes outside 'masses'
       # too!
       open (SVNINFO, "( svn info --non-interactive $dir/.. || svn info $dir/.. ) 2>&1 |");
     }

     while (<SVNINFO>) {
       # Revision: 383822
       next unless /^Revision: (\d+)/;
       $revision = $1;
       last;
     }
     close SVNINFO;
     return $revision if $revision;
   }

   # this probably will never work due to Rules Project changes TODO
   if (open(TESTING, "$opt_c/70_testing.cf")) {
     chomp($revision = <TESTING>);
     $revision =~ s/.*\$Rev:\s*(\S+).*/$1/;
     close(TESTING);
     return $revision if $revision;
   }

   return $revision || "unknown";
 }

 ############################################################################

 ## children processors, start and process, used when opt_j > 1

 sub start_children {
   my ($count, $child, $pid, $socket) = @_;

   my $io = IO::Socket->new();
   my $parent;

   # create children
   for (my $i = 0; $i < $count; $i++) {
     ($child->[$i],$parent) = $io->socketpair(AF_UNIX,SOCK_STREAM,PF_UNSPEC)
 	or die "mass-check: socketpair failed: $!";
     if ($pid->[$i] = fork) {
       close $parent;

       # disable caching for parent<->child relations
       my ($old) = select($child->[$i]);
       $|++;
       select($old);

       $socket->add($child->[$i]);
       aidbg "mass-check: starting new child $i (pid ".$pid->[$i].")\n";
       next;
     }
     elsif (defined $pid->[$i]) {
       my $result;
       my $line;

       close $tmpfd if defined $tmpfd;

       close $child->[$i];
       select($parent);
       $| = 1;	# print to parent by default, turn off buffering
       send_line($parent,"START");
       while ($line = read_line($parent)) {
 	if ($line eq "exit") {
 	  close $parent;
 	  exit;
 	}

 	my($class, $format, $date, $where, $result) = $iter->_run_message($line);
 	$result ||= '';

 	# If determine_receive_date is not set, the original input date
 	# wasn't calculated, but run_message would have done so, so reset
 	# the packed version if possible ...  use defined for date since
 	# it could == 0.
         if (!$iter->{determine_receive_date} && $class && $format && defined $date && $where) {
 	  $line = Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $where);
         }

 	send_line($parent,"$result\0RESULT $line");
       }
       exit;
     }
     else {
       die "mass-check: cannot fork: $!";
     }
   }
 }

 ## handling killing off the children

 sub reap_children {
   my ($count, $socket, $pid) = @_;

   # If the child died, sending it the exit will generate a SIGPIPE, but we
   # don't really care since the readline will go undef (which is fine),
   # then we do the waitpid which will finish it off.  So we end up in the
   # right state, in theory.
   local $SIG{'PIPE'} = 'IGNORE';

   for (my $i = 0; $i < $count; $i++) {
     aidbg "mass-check: killing child $i (pid ",$pid->[$i],")\n";
     send_line($socket->[$i],"exit"); # tell the child to die.
     close $socket->[$i];
     waitpid($pid->[$i], 0); # wait for the signal ...
   }
 }

 # in server mode, this gets called to read in the HTTP request from a given
 # socket, then return the information the client sent to us.
 sub handle_http_request {
   my $socket = shift;

   my $headers = {};
   my $postdata = {};

   # read in the request (POST / HTTP/1.0)
   my $line = $socket->getline();
   $line ||= '';
   $line =~ s/\r\n$//;

   my ($type, $URI, $VERS) = $line =~ /^([a-zA-Z]+)\s+(\S+)(?:\s*(\S+))/;
   unless ($type && $URI && $VERS) {
     $type ||= '';
     $URI  ||= '';

     return ($type, $URI, $headers, $postdata);
   }

   $type = uc $type;

   # read in headers, "key: value" up to a blank line
   do {
     $line = $socket->getline();
     last unless defined $line;
     $line =~ s/\r\n$//;

     if ($line) {
       my ($k,$v) = split(/:\s*/, $line, 2);
       $headers->{lc $k} = $v;
     }
   } while ($line !~ /^$/);

   # if this is a POST request w/ content-length, there'll be a payload, deal
   # with it.
   if ($type eq 'POST' && $headers->{'content-length'}) {
     my $pd;
     $socket->read($pd, $headers->{'content-length'});
     $pd =~ s/[\r\n]+$//; # a hack for manual requests/telnet/etc

     # key1=value1&key2=value2...
     %{$postdata} = map {
       my($k,$v) = split(/=/, $_, 2);

       # we need to decode the key and value
       $k =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg;
       $v =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg;

       $k => $v;
     } split(/\&/, $pd);
   }

   return($type, $URI, $headers, $postdata);
 }

 # in server mode, generate a gzip compressed data stream with the messages and
 # return the path to the compressed file which the server will read and pass
 # to the client.
 #
 # Input:
 #  - Number of messages to generate (scalar)
 #  - Hash of Arrays of outstanding requests (reference to hash of array refs)
 #     timestamp# -> [ num1, num2, ... ]
 #     Used to quickly find outstanding/timed out messages to send to client.
 #  - Hash of outstanding messages and associated data (ref to hash of hash refs)
 #     num1 -> { data => 'binary data from scan mode', timestamp => timestamp# }
 #     Used later on to specify the timestamp entry to remove the entry from.
 #  - Paths only?  If true, just include the original message data in the gzip
 #    file.  Otherwise, include the message data.  Useful if the client has the
 #    corpus available via the same paths as originally specified.
 #
 # Returns: scalar path to gzip file
 #
 sub generate_messages {
   my($msgs, $timestamps, $msgsout, $paths_only) = @_;

   # Hold the message numbers we'll be sending out
   my @tosend = ();

   # Find out if any of the messages we sent out before need to be sent out
   # again because we haven't seen a response within the timeout.
   my $tooold = time - $opt_cs_timeout;
   foreach (sort { $a <=> $b } keys %{$timestamps}) {
     # since we're going in numeric order, if the current entry is newer than
     # the timeout value, the rest will be too, so stop looking.
     last if ($_ > $tooold);

     # how many messages do we still need to fulfill the request?
     my $wanted = $msgs - @tosend;

     if (@{$timestamps->{$_}} > $wanted) {
       # there are more entries in the timestamp list than we want, so just
       # grab that many off the list.
       push(@tosend, splice @{$timestamps->{$_}}, 0, $wanted);
     }
     else {
       # there are just enough, or not enough entries on the timestamp list to
       # satisfy our request, so take them all and we'll loop around.
       push(@tosend, @{$timestamps->{$_}});
       delete $timestamps->{$_};
     }

     # Ok, we have enough messages so we can stop now.
     last if (@tosend == $msgs);
   }

   # if we still have the temp file with the input messages open, we'll fillup
   # out message output queue with messages from there.
   if ($tmpfd) {
     while (@tosend < $msgs) {
       my $msg = read_line($tmpfd);

       # no more messages from the temp file, close it out
       unless ($msg) {
         delete $msgsout->{'curnum'};
 	close $tmpfd;
 	undef $tmpfd;
 	last;
       }

       # we got a result, so assign it a number (curnum) and store the data
       # appropriately, then add the new number to the queue.
       my $num = $msgsout->{'curnum'}++;
       $msgsout->{$num}->{'data'} = $msg;
       push(@tosend, $num);
     }
   }

   # ok, at this point, @tosend ought to have a list of numbers, pointers into
   # %{$msgsout}.  turn that into a tar file.
   return '' unless @tosend;

   my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
   die "Can't make tempfile, exiting" unless $gzpath;
   close($gzfd);

   $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!";

   # first line is the number of messages included in the file
   send_line($gzfd, scalar @tosend) || die "mass-check: error when writing to gz temp file\n";

   # Generate an archive in the temp file
   foreach my $num (@tosend) {
     # Archive format, gzip compressed file w/ 3 parts per message:
     # 1- server message number in text format
     # 2- server index string, binary packed format
     # 3- message content -- unless paths_only
     send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n";

     my $data = $msgsout->{$num}->{'data'};
     send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n";

     if (!$paths_only) {
       my $msg = ($iter->_run_message($data))[4];
       send_line($gzfd, join('', @{$msg})) ||
         die "mass-check: error when writing to gz temp file\n";
     }
   }

   $gzfd->close;

   # update timestamp entries
   my $ts = time;
   foreach (@tosend) {
     $msgsout->{$_}->{'timestamp'} = $ts;
   }

   # conveniently, this list should be the only thing sent out w/ this
   # timestamp, so just set the reference appropriately. :)
   $timestamps->{$ts} = \@tosend;

   if ($opt_noisy) {
     print "generated ".scalar(@tosend)." messages\n";
   }

   return $gzpath;
 }

 # we've gotten results posted, so clean up msgsout and timestamp hashes and
 # process result...
 sub handle_post_results {
   my($postdata, $timestamps, $msgsout) = @_;

   # local version to batch the removals
   my %timestamps = ();

   # $msgsout->{num}->{data|timestamp}
   # $timestamp{num} = [ msgout_nums ... ]
   # $postdata{num} = result_string

   while( my($k,$v) = each %{$postdata} ) {
     # message run results will be \d+ => log entry
     next if ($k !~ /^\d+$/);

     # if we've been waiting for this result, process it, otherwise throw it on
     # the ground.  multiple clients could have been given the same messages to
     # process, and we take whatever the first responder sends us.
     if (exists $msgsout->{$k}) {
       # the result_sub will need parts of the message data, so get it ready
       my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$k}->{'data'});

       # go ahead and do the result
       &{$iter->{result_sub}}($d[1], $v, $d[0]);

       # prep to get rid of the cached entries
       $timestamps{$msgsout->{$k}->{'timestamp'}}->{$k} = 1;
       delete $msgsout->{$k};
     }
   }

   # if we got any results, clean out the results from the timestamp arrays
   while ( my($k,$v) = each %timestamps ) {
     # trim out the result list from the timestamp sent list
     my @temp = grep(!exists $v->{$_}, @{$timestamps->{$k}});

     # if there are results left for a specific timestamp, update the array
     # pointer.  otherwise, delete the timestamp entry since it's empty.
     if (@temp) {
       $timestamps->{$k} = \@temp;
     }
     else {
       delete $timestamps->{$k};
     }
   }
 }

 # This function reads from $tmpfd and processes the message as appropriate wrt
 # $opt_j, $opt_restart, etc.
 #
 sub run_through_messages {
   # do everything in one process
   if ($opt_j <= 1 && !defined $opt_restart) {
     my $message;
     my $messages;
     my $total_count = 0;

     while (($total_messages > $total_count) && ($message = read_line($tmpfd))) {
       my($class, undef, $date, undef, $result) = $iter->_run_message($message);
       if ($result) {
         &{$iter->{result_sub}}($class, $result, $date);
       }
       $total_count++;
     }
   }
   # more than one process or one process with restarts
   else {
     my $select = IO::Select->new();

     my $total_count = 0;
     my $needs_restart = 0;
     my @child = ();
     my @pid = ();
     my $messages;

     # start children processes
     start_children($opt_j, \@child, \@pid, $select);

     # feed childen, make them work for it, repeat
     while ($select->count()) {
       foreach my $socket ($select->can_read()) {
         my $line = read_line($socket);

         # some error happened during the read!
         if (!defined $line) {
           $needs_restart = 1;
           warn "mass-check: readline failed, attempting to recover\n";
           $select->remove($socket);
         }
         elsif ($line =~ /^([^\0]*)\0RESULT (.+)$/s) {
 	  my $result = $1;
 	  my ($date,$class,$type) = Mail::SpamAssassin::ArchiveIterator::_index_unpack($2);
 	  aidbg "mass-check: $class, $type, $date\n";

 	  if (defined $opt_restart && ($total_count % $opt_restart) == 0) {
 	    $needs_restart = 1;
 	  }

 	  # if messages remain, and we don't need to restart, send message
 	  if (($total_messages > $total_count) && !$needs_restart) {
 	    send_line($socket, read_line($tmpfd));
 	    $total_count++;
 	    aidbg "mass-check: $total_messages $total_count\n";
 	  }
 	  else {
 	    # stop listening on this child since we're done with it
 	    aidbg "mass-check: $needs_restart $total_messages $total_count\n";
 	    $select->remove($socket);
 	  }

 	  # deal with the result we received
 	  if ($result) {
 	    &{$iter->{result_sub}}($class, $result, $date);
 	  }
         }
         elsif ($line eq "START") {
 	  if ($total_messages > $total_count) {
 	    # we still have messages, send one to child
 	    send_line($socket, read_line($tmpfd));
 	    $total_count++;
 	    aidbg "mass-check: $total_messages $total_count\n";
 	  }
 	  else {
 	    # no more messages, so stop listening on this child
 	    aidbg "mass-check: $needs_restart $total_messages $total_count\n";
 	    $select->remove($socket);
 	  }
         }
         else {
           $needs_restart = 1;
           warn "mass-check: bad line from readline: $line\n";
           $select->remove($socket);
         }
       }

       aidbg "mass-check: out of loop, $total_messages $total_count $needs_restart ".$select->count()."\n";

       # If there are still messages to process, and we need to restart
       # the children, and all of the children are idle, let's go ahead.
       if ($needs_restart && $select->count == 0 && $total_messages > $total_count) {
         $needs_restart = 0;

         aidbg "mass-check: needs restart, $total_messages total, $total_count done\n";
         reap_children($opt_j, \@child, \@pid);
         @child=();
         @pid=();
         start_children($opt_j, \@child, \@pid, $select);
       }
     }

     # reap children
     reap_children($opt_j, \@child, \@pid);
   }
 }

 # send an HTTP response to a socket based on the input result, headers, and
 # data values.
 sub http_response {
   my($socket, $result, $headers, $data) = @_;

   print $socket
     "HTTP/1.0 $result\r\n",
     "Pragma: no-cache\r\n",
     "Server: mass-check/$svn_revision\r\n",
     map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers};
   print $socket "\r\n";
   print $socket $data;
 }

 # the client needs to make a request to the server on a given socket.
 sub http_make_request {
   my($socket, $type, $uri, $headers, $data) = @_;

   print $socket
     "$type $uri HTTP/1.0\r\n",
     "User-Agent: mass-check/$svn_revision\r\n",
     map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers};
   print $socket "\r\n";
   print $socket $data;

   # parse the response that the server sends us
   my $line = $socket->getline() || '';
   my(undef, $code, $string) = split(/\s+/, $line, 3);
   return unless $code == 200;

   my %headers = ();
   do {
     $line = $socket->getline();
     last unless defined $line;
     $line =~ s/\r\n$//;

     if ($line) {
       my ($k,$v) = split(/:\s*/, $line, 2);
       $headers{lc $k} = $v;
     }
   } while ($line !~ /^$/);

   # the server has sent us notification that it's going to exit, so let's
   # follow suit.
   return 'finished' if ($headers{'finished'});

   my $gzpath = '';
   if ($headers{'content-length'}) {
     my $gzfd;
     ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile();
     die "Can't make tempfile, exiting" unless $gzpath;

     my $rd;
     $socket->read($rd, $headers{'content-length'}) || die "mass-check: error reading in data from server\n";
     print $gzfd $rd;
     close $gzfd;
   }

   $socket->close();
   return $gzpath;
 }

 # Be conservative -- encode most things.
 # we could encode spaces to plusses, then decode that later, but...
 sub post_encode {
   my $string = shift;
   $string =~  s/([^a-zA-Z0-9_,.\/\\-])/sprintf "%%%02x",unpack("C",$1)/egx;
   return $string;
 }

 # remove all of the files in a given directory, non-recursive
 sub clean_dir {
   my $dir = shift;

   unless (opendir(DIR, $dir)) {
     warn "error: can't opendir $dir: $!\n";
     return;
   }
   while(my $file = readdir(DIR)) {
     $file =~ /^(.+)$/;       # untaint
     $file = $1;

     my $path = File::Spec->catfile($dir, $file);
     next unless (-f $path);

     if (!unlink $path) {
       warn "error: can't remove file $path: $!\n";
       closedir(DIR);
       return;
     }
   }
   closedir(DIR);
   return 1;
 }

 ############################################################################

 # four bytes in network/vax format (little endian) as length of message
 # the rest is the actual message

 sub read_line {
   my $fd = shift;
   my($length,$msg);

   # read in the 4 byte length and unpack
   $fd->read($length, 4) || return;

   $length = unpack("V", $length);
   return unless $length;

   # read in the rest of the single message
   $fd->read($msg, $length) || return;

   return $msg;
 }

 sub send_line {
   my $fd = shift;
   foreach ( @_ ) {
     my $length = pack("V", length $_);
     $fd->print($length.$_) || return 0;
   }

   return 1;
 }

 ############################################################################

 # this is the function that implemented server mode.  basically, sit and wait
 # for connections to come in.  when a client sends in a request, deal with any
 # results that the client sent, then generate a response and send it back,
 # and then go back to waiting.  lather, rinse, repeat.
 sub server_mode {
   $opt_cs_max ||= 1000;
   $opt_cs_timeout ||= 60 * 5;

   my $serv_socket = IO::Socket::INET->new(
     LocalAddr => $opt_server,
     Proto => 'tcp',
     Listen => 5,
     ReuseAddr => 1,
   );

   die "Could not create socket: $!\n" unless $serv_socket;

   if ($opt_progress) {
     status('server ready for connections');
   }

   # Setup out "what messages have been sent out" hashes
   my $timestamps = {};
   my $msgsout = { 'curnum' => 0 };

   # Generate an IO::Select object and put the server socket on the queue
   my $select = IO::Select->new( $serv_socket );

   # We'll keep looping while there's something to pay attention to
   while ($select->count()) {
     # Sit and block until there's something for us to read from
     foreach my $socket ($select->can_read()) {
       if ($socket == $serv_socket) {
         # it's the server socket, go ahead and accept the connection and add
 	# it to the queue.
         $select->add($serv_socket->accept);
       }
       else {
 	# it's some client, so deal with the request
 	my($type, $URI, $headers, $postdata) = handle_http_request($socket);

 	# we don't do GET, so just send something back
 	if ($type eq 'GET') {
 	  if ($opt_noisy) {
 	    print "GET request from ".$socket->peerhost."\n";
 	  }

 	  http_response($socket, "200 OK", {
 	      'Content-type' => 'text/plain',
 	    },
 	    "Your GET request came from IP Address: ".$socket->peerhost."\n");
 	}
         elsif ($type eq 'POST') {
 	  # ooh, POST.  deal with any results that the client sent
 	  handle_post_results($postdata, $timestamps, $msgsout);

 	  if ($opt_noisy) {
 	    print "POST request from ".$socket->peerhost."\n";
 	  }

           # based on the number of messages that the client requested,
 	  # generate a gzip file with the appropriate data in it
 	  my $messages = '';
 	  if ($postdata->{'max_messages'}) {
 	    my $msgnum = $postdata->{'max_messages'};
 	    if ($msgnum > $opt_cs_max || $msgnum < 1) {
 	      $msgnum = $opt_cs_max;
 	    }

 	    if ($opt_noisy) {
 	      print "client requested ".$postdata->{'max_messages'}." messages\n";
 	    }

 	    $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'});
 	  }

           # $messages will contain the path to the gzip file if there are
 	  # messages to send out.
           if ($messages && open(MSG, $messages)) {
 	    binmode(MSG);
 	    local $/ = undef;  # go go slurp mode

 	    # send the response
 	    http_response($socket, "200 OK", {
 	      'Content-Type' => 'application/x-gzip',
 	      'Content-Encoding' => 'x-gzip',
 	      "Content-Length" => (-s $messages),
 	      },
 	      scalar <MSG>);

 	    close(MSG);

 	    # we don't need the file anymore, so get rid of it
 	    unlink $messages;
           }
 	  elsif (!keys %{$msgsout} && !defined $tmpfd) {
 	    # we have no more outstanding messages and our original queue of
 	    # messages to process is empty, so tell the client to exit.
 	    http_response($socket, "200 OK", {
               "Content-type" => "text/plain",
 	      "Finished" => 1,
 	      },
 	      'We are all done');
 	  }
 	  else {
 	    # when in doubt, treat this like a GET
 	    http_response($socket, "200 OK", {
               "Content-type" => "text/plain",
 	      },
               "Your POST request (sans max_messages) came from IP Address: ".$socket->peerhost."\n");
 	  }
 	}
 	else {
           # for error, "501 Not Implemented"
 	  http_response($socket, '501 Not Implemented', {}, '');
 	}

 	# ok, we don't do keepalive, so get rid of the socket
         $select->remove($socket);
 	$socket->close;
       }
     }

     if ($opt_noisy) {
       print scalar(keys %{$msgsout})." messages outstanding\n";
     }


 #print "msgs waiting: ".join(" ", keys %{$msgsout})."\n";
 #print "tmpfd defined? ".(defined $tmpfd ? "yes" : "no")."\n";

     # we're not awaiting responses and we've exhausted the input file, so
     # drop the server socket. :)
     $select->remove($serv_socket) if (!keys %{$msgsout} && !defined $tmpfd);
   }
 }

 # this is the function that implements client mode.  generally, in a loop:
 #  make a request of the server for some max number of messages, and send our
 #  results back at the same time.  based on the results of that request, put
 #  messages into a temp dir and process them.  prep the results and loop.
 #  lather, rinse, repeat.
 sub client_mode {
   $opt_cs_max ||= 1000;
   $opt_cs_timeout ||= 60 * 2;

   my($host, $uri);

   if ($opt_client =~ /^http:\/\/([^\/]+)(\/.*)?/) {
     ($host, $uri) = ($1,$2);
   }
   else {
     $host = $opt_client;
     if ($host =~ /^:/) {
       $host = 'localhost'.$host;
     }
   }
   my($http_host) = split(/:/, $host);

   die "No host found in opt_client" unless $host;
   $uri ||= "/";

   # use this to track how many messages we ought to be requesting
   # start at 100 to get warmed up
   my $msgnum = $opt_cs_max > 100 ? 100 : $opt_cs_max;

   my $tmpdir;

   # if we're not doing paths_only, create a temp dir where we'll put the
   # incoming messages to process.
   if (!$opt_cs_paths_only) {
     $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir();
     die "Can't create tempdir" unless $tmpdir;
   }

   # keep going until something stops us.
   while (1) {
     # if the number of messages to request is too much, bring it down
     $msgnum = $opt_cs_max if ($msgnum > $opt_cs_max);

     # prep the POST request
     $postdata{'max_messages'} = $msgnum;
     $postdata{'paths_only'} = 1 if ($opt_cs_paths_only);

     # the actual POST data string
     my $POSTDATA = join('&', map { post_encode($_) . '=' . post_encode($postdata{$_}) } keys %postdata);

     # connect to server
     my $socket = IO::Socket::INET->new($host);

     # last if connection fails
     last unless ($socket);

     print "Requesting $msgnum messages from server\n" if ($opt_noisy);

     # make request, include and then drop results if there are any
     my $result = http_make_request($socket, 'POST', $uri, {
       'Host'		=> $http_host,
       'Content-Type'	=> 'application/x-www-form-urlencoded',
       'Content-Length'	=> length($POSTDATA),
       },
       $POSTDATA
     );
     %postdata = ();
     undef $POSTDATA;

     # If we received messages to run through, go ahead and do it.
     # otherwise, just sleep for the timeout length and try again
     if (!defined $result) {
       # we got an error?!?  abort!
       last;
     }
     elsif ($result eq 'finished') {
       # the server said that we're done
       print "Server states that there is no more work, exiting.\n" if ($opt_noisy);
       last;
     }
     elsif ($result eq '') {
       # no messages means the server may give us more work down the road.
       # sleep for client_timeout seconds and try the request again
       print "Received no messages from server, waiting $opt_cs_timeout seconds\n" if ($opt_noisy);
       sleep $opt_cs_timeout;
     }
     else {
       # we got messages, so deal with them.
       my $time_start = time;

       # postdata will hold our results, real will hold the original message
       # data from the server's scan mode.
       %postdata = ();
       %real = ();
       $init_results = $total_count = $spam_count = $ham_count = 0;

       # we got a result, so do things with it!
       my $gzfd = IO::Zlib->new($result, "rb");
       die "Can't open temp result file: $!" unless $gzfd;

       # used for the temp queue file
       my $tmppath;
       ($tmppath, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile();
       die "Can't make tempfile, exiting" unless $tmppath;
       unlink $tmppath;

       # if we have a temp directory, clean it out for this run
       clean_dir($tmpdir) if ($tmpdir);

       # Archive format, gzip compressed file w/ 3 parts per message:
       # 1- server message number in text format
       # 2- server index string, binary packed format
       # 3- message content, if not doing paths_only

       # number of messages
       $msgnum = $total_messages = read_line($gzfd) || die "mass-check: error reading from gzip message file\n";

       status("server gave us $total_messages messages") if ($opt_progress);

       # loop through and prep all of the messages the server sent
       for(my $i = 0 ; $i < $total_messages; $i++ ) {
         my $num = read_line($gzfd);
 	last unless defined $num;

         my $index = read_line($gzfd);
 	last unless defined $index;

 	# if we're doing paths_only, there'll be no message content
 	if (!$opt_cs_paths_only) {
 	  my $msg = read_line($gzfd);
 	  last unless defined $msg;

 	  # it's going to be a dir of file formatted messages
 	  if (open(OUT, ">$tmpdir/$num")) {
 	    print OUT $msg;
 	    close(OUT);

 	    # this is a little tricky -- we need to process the files in the
 	    # path and format we've created, but the original data is needed
 	    # to create a proper result later, so deal with that here.
 	    my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index);
 	    $real{"$tmpdir/$num"} = \@d;
             send_line($tmpfd,
 	      Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$tmpdir/$num")) ||
 	        die "mass-check: error writing out temp file in client mode\n";
 	  }
 	  else {
 	    warn "Can't create/write $tmpdir/$num: $!";
 	  }
 	}
 	else {
 	  # in paths_only mode, there's no kluging between formats since we're
 	  # reading the same corpus, however we do still need to track server
 	  # message number to message data so our results will be useable.
 	  my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index);
 	  $real{$d[3]} = $num;
 	  send_line($tmpfd, $index) ||
 	        die "mass-check: error writing out temp file in client mode\n";
 	}
       }

       $gzfd->close;
       unlink $result;

       if ($opt_progress) {
         status('starting run stage');
       }

       # we're about to start running, so go back to the start of the file
       seek $tmpfd, 0, 0;

       run_through_messages();

       # we're done with the temp file -- bye bye
       close($tmpfd);

       # figure out new max messages, try keeping ~cs_timeout between runs
       my $time_end = time;

       # if we only requested a small number of messages, it may take <1s to
       # run through them, so fake it and say it took 1s.
       if ($time_end == $time_start) {
         $time_end++;
       }

       if ($opt_progress) {
         status('completed run stage');
       }

       print "Completed run in ".($time_end-$time_start)." seconds\n" if ($opt_noisy);
       $msgnum = int($msgnum * $opt_cs_timeout / ($time_end-$time_start)) || 1;
     }
   }

   # if we were using a temp dir, clean it out and then remove it
   if ($tmpdir) {
     clean_dir($tmpdir);
     rmdir $tmpdir;
   }
 }

 ############################################################################

 # in server mode, just return the ref to the message data
 sub wanted_server {
   my ($class, $id, $time, $dataref, $format) = @_;
   return $dataref;
 }

 # very similar to result() except the result has the message number at the
 # front, so strip it off and then set the POST data appropriately.
 sub result_client {
   my ($class, $result, $time) = @_;

   # don't open results files until we get here to avoid overwriting files
   init_results() if !$init_results;

   if ($class eq "s") {
     $spam_count++;
   }
   elsif ($class eq "h") {
     $ham_count++;
   }

   $total_count++;

   if ($opt_progress) {
     progress($time);
   }

   if ($result =~ s/^(\d+)\s+//m) {
     $postdata{$1} = $result;
   }
   else {
     warn ">> WTH!?  result is not in the correct format: $result\n";
   }
 }

 sub aidbg {
   if (would_log("dbg", "mass-check") == 2) {
     dbg (@_);
   }
 }

 sub deal_with_before_after {
   my($which, $time) = @_;

   if ($time && $time =~ /^-\d+$/) {
     $time = time + $time;
   }
   elsif ($time && $time !~ /^-?\d+$/) {
     if (HAS_TIME_PARSEDATE) {
       $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1);
     }
     else {
       die "You need Time::ParseDate if you use either the --before or --after option.";
     }
   }

   if ($which eq 'before') {
     $opt_before = $time;
   }
   else {
     $opt_after = $time;
   }

   if ($opt_before && $opt_after && $opt_after >= $opt_before) {
     die "--before ($opt_before) <= --after ($opt_after) -- conflict!";
   }
 }

 sub generate_queue {
   my ($targets, $tmpfd) = @_;

   # scan the targets and get the number and list of messages
   $iter->_scan_targets($targets,
     sub {
       my($self, $date, $class, $format, $mail) = @_;
       push(@{$self->{$class}}, Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $mail));
     }
   );

   # deal with opt_head and opt_tail
   top_and_tail_messages($iter->{h});
   top_and_tail_messages($iter->{s});

   my $messages;
   if ($opt_n) {
     # OPT_N == 1 means don't bother sorting on message receive date

     # for ease of memory, we'll play with pointers
     $messages = $iter->{s};
     undef $iter->{s};
     push(@{$messages}, @{$iter->{h}});
     undef $iter->{h};
   }
   else {
     # OPT_N == 0 means sort on message receive date

     # Sort the spam and ham groups by date
     my @s = @{$iter->{s}};
     undef $iter->{s};
     my @h = @{$iter->{h}};
     undef $iter->{h};

     # interleave ordered spam and ham
     if (@s && @h) {
       my $ratio = @s / @h;
       while (@s && @h) {
 	push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h);
       }
     }
     # push the rest onto the end
     push @{$messages}, @s, @h;
   }

   # head or tail < 0 means crop the total list, negate the value appropriately
   if ($opt_tail < 0) {
     splice(@{$messages}, 0, $opt_tail);
   }
   if ($opt_head < 0) {
     splice(@{$messages}, -$opt_head);
   }

   my $num = $Mail::SpamAssassin::ArchiveIterator::MESSAGES = scalar(@{$messages});

   # Dump out the number of messages and the message index info to
   # the temp file
   send_line($tmpfd, $num, @{$messages});
 }

 sub top_and_tail_messages {
   my ($ary) = @_;

   if ($opt_n) {
     # OPT_N == 1 means don't bother sorting on message receive date

     # head or tail > 0 means crop each list
     if ($opt_tail > 0) {
       splice(@{$ary}, 0, -$opt_tail);
     }
     if ($opt_head > 0) {
       splice(@{$ary}, min ($opt_head, scalar @{$ary}));
     }
   }
   else {
     # OPT_N == 0 means sort on message receive date

     # Sort the spam and ham groups by date
     my @s = sort { $a cmp $b } @{$ary};

     # head or tail > 0 means crop each list
     if ($opt_tail > 0) {
       splice(@s, 0, -$opt_tail);
     }
     if ($opt_head > 0) {
       splice(@s, min ($opt_head, scalar @s));
     }

     @{$ary} = @s;
   }
 }

 sub min {
   return ($_[0] < $_[1] ? $_[0] : $_[1]);
 }