| #!/usr/bin/perl -w |
| use strict; |
| |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| sub aidbg; |
| |
| sub usage { |
| my $status = shift; |
| |
| my $out = $status ? \*STDERR : \*STDOUT; |
| print $out <<EOF; |
| usage: mass-check [options] target ... |
| |
| -c=file set configuration/rules directory |
| -p=dir set user-prefs directory |
| -f=file read list of targets from <file> |
| -j=jobs specify the number of processes to run simultaneously |
| --net turn on network checks! |
| --mid report Message-ID from each message |
| --debug=LIST report debugging information (default is all facilities, LIST |
| is a comma-separated list of facilities) |
| --progress show progress updates during check |
| --noisy show noisier progress updates during check |
| --showdots print a dot for each scanned message |
| --rewrite=OUT save rewritten message to OUT (default is /tmp/out) |
| --rules=RE Only test rules matching the given regexp RE |
| --restart=N restart all of the children after processing N messages |
| --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they |
| were encapsulated by servers matching the regexp RE |
| (default = extract all SpamAssassin-encapsulated mails) |
| --lint check rules for syntax before running |
| --cf='config line' Additional line of configuration |
| |
| client/server mode options |
| --server host:port |
| use server mode, running on the given hostname and port |
| --client host:port |
| use client mode, connecting to the given hostname and port |
| --cs_max N |
| at most, only ever request (client)/give out (server) a |
| maximum of N messages (defaults to 1000) |
| --cs_timeout N |
| in client mode, try to connect to the server every N seconds |
| defaults to 120 |
| in server mode, timeout messages after N seconds |
| defaults to 300 |
| --cs_paths_only |
| only used in client mode. when making requests of the |
| server, only ask for paths to the messages and not the |
| messages themselves. useful when the client and server |
| have the same paths to the corpus data. |
| |
| log options |
| -o write all logs to stdout |
| --loghits log the text hit for patterns (useful for debugging) |
| --loguris log the URIs found |
| --logmem log the memory delta (only on Linux) |
| --hamlog=log use <log> as ham log ('ham.log' is default) |
| --spamlog=log use <log> as spam log ('spam.log' is default) |
| |
| message selection options |
| -n no date sorting or spam/ham interleaving |
| --cache use cache information when selecting messages |
| --cachedir=dir write cache info for --cache in this directory tree |
| --all don't skip big messages |
| |
| message selection options, can be specified for each target |
| --after=N only test mails received after time_t N (negative values |
| are an offset from current time, e.g. -86400 = last day) |
| or after date as parsed by Time::ParseDate (e.g. '-6 months') |
| --before=N same as --after, except received times are before time_t N |
| --scanprob=N probability of scanning a message, range 0.0 - 1.0 (default: 1.0) |
| |
| message selection options, can be specified for each target class |
| --head=N only check first N ham and N spam (N messages if -n used) |
| --tail=N only check last N ham and N spam (N messages if -n used) |
| |
| simple target options (implies -o and no ham/spam classification) |
| --dir subsequent targets are directories |
| --file subsequent targets are files in RFC 822 format |
| --mbox subsequent targets are mbox files |
| --mbx subsequent targets are mbx files |
| |
| Just left over functions we should remove at some point: |
| --bayes report score from Bayesian classifier |
| |
| options used during score generation process |
| --learn=N learn N% of messages as spam or ham |
| --reuse reuse network checks if X-Spam-Status: is present in messages |
| |
| non-option arguments are used as target names (mail files and folders), |
| the target format is: <class>:<format>:<location> |
| <class> is "spam" or "ham" |
| <format> is "dir", "file", "mbx", "mbox", or "detect" |
| (see 'perldoc Mail::SpamAssassin::ArchiveIterator) |
| <location> is a file or directory name. globbing of ~ and * is supported |
| |
| EOF |
| exit($status); |
| } |
| |
| ########################################################################### |
| |
| use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes |
| $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits |
| $opt_mid $opt_net $opt_nosort $opt_progress $opt_showdots |
| $opt_spamlog $opt_tail $opt_rules $opt_restart $opt_loguris |
| $opt_logmem $opt_after $opt_before $opt_rewrite $opt_deencap |
| $opt_learn $opt_reuse $opt_lint $opt_cache $opt_noisy $opt_cf |
| $total_messages $statusevery $opt_cachedir $opt_scanprob |
| $opt_client $opt_cs_max $opt_cs_timeout $opt_cs_paths_only |
| $opt_server %postdata %real $svn_revision |
| $tmpfd %reuse %orig_conf %reuse_conf $reuse_rules_loaded_p); |
| |
| use FindBin; |
| use lib "$FindBin::Bin/../lib"; |
| eval "use bytes"; |
| use IO::Select; |
| use IO::Socket; |
| use Mail::SpamAssassin::ArchiveIterator; |
| use Mail::SpamAssassin; |
| use Mail::SpamAssassin::Logger; |
| use File::Copy; |
| use File::Spec; |
| use Getopt::Long; |
| use POSIX qw(strftime); |
| use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; }; |
| use constant HAS_IO_ZLIB => eval { require IO::Zlib; }; |
| use Config; |
| |
| # default settings |
| $opt_c = "$FindBin::Bin/../rules"; |
| $opt_p = "$FindBin::Bin/spamassassin"; |
| $opt_j = 1; |
| $opt_head = 0; |
| $opt_tail = 0; |
| $opt_net = 0; |
| $opt_hamlog = "ham.log"; |
| $opt_spamlog = "spam.log"; |
| $opt_learn = 0; |
| $reuse_rules_loaded_p = 0; |
| $opt_cf = []; |
| |
| my @ORIG_ARGV = @ARGV; |
| GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug:s", |
| "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", |
| "progress!", "rewrite:s", "showdots", "spamlog=s", "tail=i", |
| "rules=s", "restart=i", "loguris", |
| "deencap=s", "logmem", "learn=i", "reuse", "lint", "cache", |
| "cachedir=s", "noisy", "scanprob=f", |
| "server=s", "cs_max=i", "cs_timeout=i", "cs_paths_only", |
| "client=s", |
| "before=s" => \&deal_with_before_after, |
| "after=s" => \&deal_with_before_after, |
| 'cf=s' => \@{$opt_cf}, |
| "dir" => sub { $opt_format = "dir"; }, |
| "file" => sub { $opt_format = "file"; }, |
| "mbox" => sub { $opt_format = "mbox"; }, |
| "mbx" => sub { $opt_format = "mbx"; }, |
| "help" => sub { usage(0); }, |
| '<>' => \&target) or usage(1); |
| |
| # We need IO::Zlib for client-server mode! |
| if ( ($opt_client || $opt_server) && ! HAS_IO_ZLIB ) { |
| die "IO::Zlib required for client/server mode!\n"; |
| } |
| |
| # rules.pl is for the --reuse option, score set doesn't matter |
| if ($opt_reuse) { |
| my $rules_path = "$FindBin::Bin/tmp/rules.pl"; |
| if (! -f $rules_path) { |
| # some people specify paths relatively, whereas this needs an absolute path, |
| # so "do the right thing"(tm). |
| my $abs_opt_c = File::Spec->rel2abs($opt_c); |
| system("cd $FindBin::Bin; perl ../build/parse-rules-for-masses -d $abs_opt_c"); |
| } |
| |
| require $rules_path; |
| } |
| |
| if ($opt_noisy) { |
| $opt_progress = 1; # implies --progress |
| } |
| |
| $opt_debug ||= 'all' if defined $opt_debug; |
| |
| my $user_prefs = "$opt_p/user_prefs"; |
| |
| # --lint |
| # In theory we could probably use the same spamtest object as below, |
| # but since it's probably not expecting that, and we don't want |
| # strange things happening, create a local object. |
| if ($opt_lint) { |
| my $spamlint = new Mail::SpamAssassin ({ |
| 'debug' => $opt_debug, |
| 'rules_filename' => $opt_c, |
| 'userprefs_filename' => $user_prefs, |
| 'site_rules_filename' => "$opt_p/local.cf", |
| 'userstate_dir' => "$opt_p", |
| 'save_pattern_hits' => $opt_loghits, |
| 'dont_copy_prefs' => 1, |
| 'local_tests_only' => $opt_net ? 0 : 1, |
| 'only_these_rules' => $opt_rules, |
| 'ignore_safety_expire_timeout' => 1, |
| 'post_config_text' => join("\n", @{$opt_cf})."\n", |
| PREFIX => '', |
| DEF_RULES_DIR => $opt_c, |
| LOCAL_RULES_DIR => '', |
| }); |
| |
| $spamlint->debug_diagnostics(); |
| my $res = $spamlint->lint_rules(); |
| $spamlint->finish(); |
| warn "lint: $res issues detected, please rerun with debug enabled for more information\n" if ($res); |
| exit 1 if $res; |
| } |
| |
| # test messages for the mass-check |
| my @targets; |
| if (!$opt_client) { |
| if ($opt_f) { |
| open(F, $opt_f) || die "cannot read target $opt_f: $!"; |
| push(@targets, map { chomp; $_ } <F>); |
| close(F); |
| } |
| usage(1) if !@targets; |
| } |
| |
| my $spamtest = new Mail::SpamAssassin ({ |
| 'debug' => $opt_debug, |
| 'rules_filename' => $opt_c, |
| 'userprefs_filename' => $user_prefs, |
| 'site_rules_filename' => "$opt_p/local.cf", |
| 'userstate_dir' => "$opt_p", |
| 'save_pattern_hits' => $opt_loghits, |
| 'dont_copy_prefs' => 1, |
| 'local_tests_only' => $opt_net ? 0 : 1, |
| 'only_these_rules' => $opt_rules, |
| 'ignore_safety_expire_timeout' => 1, |
| 'post_config_text' => join("\n", @{$opt_cf})."\n", |
| PREFIX => '', |
| DEF_RULES_DIR => $opt_c, |
| LOCAL_RULES_DIR => '', |
| }); |
| |
| $spamtest->compile_now(1); |
| $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); |
| |
| # generated user_prefs |
| if ($opt_reuse) { |
| # copy current prefs if it exists |
| $spamtest->copy_config(undef, \%orig_conf); |
| |
| # zeroed scores to mass_prefs |
| my @zero = sort grep { defined $reuse{$_}->{skip} } keys %reuse; |
| open(PREFS, ">> $opt_p/mass_prefs") || die "Unable to open $opt_p/mass_prefs: $!\nNeeded for --reuse to work properly"; |
| for my $zero (@zero) { |
| print PREFS "score $zero 0\n"; |
| } |
| close(PREFS); |
| |
| $spamtest->read_scoreonly_config("$opt_p/mass_prefs"); |
| |
| $spamtest->copy_config(undef, \%reuse_conf); |
| $reuse_rules_loaded_p = 1; |
| } |
| |
| my $who = `id -un 2>/dev/null`; |
| my $where = `uname -n 2>/dev/null`; |
| my $when = `date -u`; |
| my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost'; |
| chomp $who; |
| chomp $where; |
| chomp $when; |
| chomp $host; |
| $svn_revision = get_current_svn_revision(); |
| |
| # when displaying the commandline, quote any arguments which have |
| # "questionable" characters such as spaces, pipes, etc. |
| my $cmdline = join(' ',map { m@[^A-Za-z0-9_/\\.-]@ ? qq/"$_"/ : $_ } @ORIG_ARGV); $cmdline =~ s/\s+/ /gs; |
| my $isowhen = strftime("%Y%m%dT%H%M%SZ", gmtime(time)); # better |
| |
| my $log_header = "# mass-check results from $who\@$where, on $when\n" . |
| "# M:SA version ".$spamtest->Version()."\n" . |
| "# SVN revision: $svn_revision\n" . |
| "# Date: $isowhen\n" . |
| "# Perl version: $] on $Config{archname}\n" . |
| "# Switches: '$cmdline'\n"; |
| |
| my $updates = ($opt_noisy ? 100 : 10); |
| my $total_count = 0; |
| my $spam_count = 0; |
| my $ham_count = 0; |
| my $init_results = 0; |
| |
| my $showdots_active = ($opt_showdots || $opt_noisy); |
| my $showdots_counter = 0; |
| my $showdots_every = ($opt_showdots ? 1 : 20); |
| |
| my $AIopts = { |
| 'opt_all' => $opt_all, |
| }; |
| |
| if (!$opt_client) { |
| # Deal with --rewrite |
| if (defined $opt_rewrite) { |
| my $rewrite = ($opt_rewrite ? $opt_rewrite : "/tmp/out"); |
| open(REWRITE, "> $rewrite") || die "open of $rewrite failed: $!"; |
| } |
| |
| # ArchiveIterator options for non-client mode |
| $AIopts->{'opt_scanprob'} = $opt_scanprob; |
| $AIopts->{'opt_cache'} = $opt_cache; |
| $AIopts->{'opt_cachedir'} = $opt_cachedir; |
| $AIopts->{'opt_after'} = $opt_after; |
| $AIopts->{'opt_before'} = $opt_before; |
| $AIopts->{'scan_progress_sub'} = \&showdots_blip; |
| $AIopts->{'opt_want_date'} = ! $opt_n; |
| |
| # ensure that scanprob stuff is predictable and reproducable |
| if (defined $opt_scanprob && $opt_scanprob < 1.0) { |
| srand(1); |
| } |
| } |
| else { |
| # ArchiveIterator options for client mode -- tends to be simple |
| $opt_n = 1; |
| $AIopts->{'opt_want_date'} = 0; |
| } |
| |
| ########################################################################### |
| ## SCAN MODE |
| |
| my $iter = new Mail::SpamAssassin::ArchiveIterator($AIopts); |
| |
| # setup the AI functions |
| if ($opt_client) { |
| $iter->set_functions(\&wanted, \&result_client); |
| } |
| elsif ($opt_server) { |
| $iter->set_functions(\&wanted_server, \&result); |
| } |
| else { |
| $iter->set_functions(\&wanted, \&result); |
| } |
| |
| my $messages; |
| |
| # normal mode as well as a server do scan mode and get a temp file |
| if (!$opt_client) { |
| status('starting scan stage') if ($opt_progress); |
| |
| # Make a temp file and delete it |
| my $tmpf; |
| ($tmpf, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); |
| die 'mass-check: failed to create temp file' unless $tmpf; |
| unlink $tmpf or die "mass-check: unlink '$tmpf': $!"; |
| |
| # having opt_j or server mode means do scan in a separate process |
| if ($opt_server || $opt_j) { |
| if ($tmpf = fork()) { |
| # parent |
| waitpid($tmpf, 0); |
| } |
| elsif (defined $tmpf) { |
| # child -- process using message_array |
| generate_queue(\@targets, $tmpfd); |
| exit; |
| } |
| else { |
| die "mass-check: cannot fork: $!"; |
| } |
| } |
| else { |
| # we get here if opt_j == 0, so scan in this process |
| generate_queue(\@targets, $tmpfd); |
| } |
| |
| # we now have a temporary file with the messages to process |
| seek($tmpfd, 0, 0); |
| # the first line is the number of messages |
| $total_messages = read_line($tmpfd); |
| |
| if (!$total_messages) { |
| die "mass-check: no messages to process\n"; |
| } |
| |
| status("completed scan stage, $total_messages messages") if ($opt_progress); |
| } |
| |
| ########################################################################### |
| ## RUN MODE |
| |
| if ($opt_client) { |
| client_mode(); |
| } |
| else { |
| status('starting run stage') if ($opt_progress); |
| |
| if ($opt_server) { |
| server_mode(); |
| } |
| else { |
| run_through_messages(); |
| } |
| |
| status('completed run stage') if ($opt_progress); |
| } |
| |
| # Even though we're about to exit, let's clean up after ourselves |
| close($tmpfd) if ($tmpfd); |
| showdots_finish(); |
| |
| if (defined $opt_rewrite) { |
| close(REWRITE); |
| } |
| |
| $spamtest->finish(); |
| |
| # exit status: did we check at least one message correctly? |
| exit(!($ham_count || $spam_count)); |
| |
| ########################################################################### |
| |
| sub target { |
| my ($target) = @_; |
| |
| # message-selection options; these can now be specified separately |
| # for each target |
| my %selopts = ( |
| opt_scanprob => $opt_scanprob, |
| opt_after => $opt_after, |
| opt_before => $opt_before |
| ); |
| |
| if (!defined($opt_format)) { |
| push(@targets, { %selopts, target => $target }); |
| } |
| else { |
| $opt_o = 1; |
| push(@targets, { %selopts, target => "spam:$opt_format:$target" }); |
| } |
| } |
| |
| ########################################################################### |
| |
| sub init_results { |
| $init_results = 1; |
| |
| showdots_finish(); |
| |
| # now, showdots only happens if --showdots was used |
| $showdots_active = $opt_showdots; |
| |
| if ($opt_progress) { |
| # round up since 100% will be caught at end already |
| $statusevery = int($total_messages / $updates + 1); |
| |
| # if $messages < $updates, just give a status line per msg. |
| $statusevery ||= 1; |
| } |
| |
| return if $opt_client; |
| |
| if ($opt_o) { |
| autoflush STDOUT 1; |
| print STDOUT $log_header; |
| } |
| else { |
| open(HAM, "> $opt_hamlog") || die "open of $opt_hamlog failed: $!"; |
| open(SPAM, "> $opt_spamlog") || die "open of $opt_spamlog failed: $!"; |
| autoflush HAM 1; |
| autoflush SPAM 1; |
| print HAM $log_header; |
| print SPAM $log_header; |
| } |
| } |
| |
| sub result { |
| my ($class, $result, $time) = @_; |
| |
| # don't open results files until we get here to avoid overwriting files |
| init_results() if !$init_results; |
| |
| if ($class eq "s") { |
| if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } |
| $spam_count++; |
| } |
| elsif ($class eq "h") { |
| if ($opt_o) { print STDOUT $result; } else { print HAM $result; } |
| $ham_count++; |
| } |
| |
| $total_count++; |
| |
| if ($opt_progress) { |
| progress($time); |
| } |
| } |
| |
| sub wanted { |
| my ($class, $id, $time, $dataref, $format) = @_; |
| my $out = ''; |
| |
| # if origid is defined, it'll be the message number from server mode |
| my $origid; |
| |
| # client mode is a little crazy because we need to kluge around the fact |
| # that the information needed to do the run is different than the |
| # information that goes into the results. |
| if ($opt_client) { |
| if ($opt_cs_paths_only) { |
| # the server message number |
| $origid = $real{$id}; |
| } |
| else { |
| # if we're a non-paths_only client, change the format and id to the real |
| # version, make sure to remember the server's message number |
| $origid=$id; |
| $origid =~ s/^.+?(\d+)$/$1/; |
| $format = $real{$id}->[2]; |
| $id = $real{$id}->[3]; |
| } |
| } |
| |
| memory_track_start() if ($opt_logmem); |
| |
| # parse the message, and force it to complete |
| my $ma = $spamtest->parse($dataref, 1); |
| |
| # remove SpamAssassin markup, if present and the mail was spam |
| my $header = $ma->get_header("Received"); |
| my $x_spam_status; |
| if ($opt_reuse) { |
| # get X-Spam-Status: header for rule hit resue |
| $x_spam_status = $ma->get_header("X-Spam-Status"); |
| } |
| # previous hits |
| my @previous; |
| if ($x_spam_status) { |
| $x_spam_status =~ s/,\s+/,/gs; |
| if ($x_spam_status =~ m/tests=(\S*)/ |
| && $x_spam_status !~ /\bshortcircuit=(?:ham|spam|default)\b/) |
| { |
| push @previous, split(/,/, $1); |
| |
| # we found previous tests, so move the reuse config into place |
| unless ($reuse_rules_loaded_p) { |
| $spamtest->copy_config(\%reuse_conf, undef); |
| $reuse_rules_loaded_p = 1; |
| } |
| } |
| } |
| elsif ($opt_reuse) { |
| if ($reuse_rules_loaded_p) { |
| $spamtest->copy_config(\%orig_conf, undef); |
| $reuse_rules_loaded_p = 0; |
| } |
| } |
| |
| if ($header && $header =~ /\bwith SpamAssassin\b/) { |
| if (!$opt_deencap || message_should_be_deencapped($ma)) { |
| my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1); |
| $ma->finish(); |
| $ma = $new_ma; |
| } |
| } |
| |
| # plugin hook to cause us to skip messages |
| my $skip = $spamtest->call_plugins("mass_check_skip_message", { |
| class => $class, |
| 'time' => $time, |
| 'id' => $id, |
| msg => $ma |
| }); |
| return if $skip; |
| |
| # log-uris support |
| my $status; |
| my @uris; |
| my $before; |
| my $after; |
| if ($opt_loguris) { |
| my $pms = Mail::SpamAssassin::PerMsgStatus->new($spamtest, $ma); |
| @uris = $pms->get_uri_list(); |
| $pms->finish(); |
| |
| } else { |
| $before = time; |
| $status = $spamtest->check($ma); |
| $after = time; |
| } |
| |
| my @extra; |
| |
| # sample-based learning |
| if ($opt_learn > 0) { |
| my $spam; |
| # spam learned as ham = 0.05% |
| if ($class eq 's' && rand(100) < 0.05) { |
| $spam = 0; |
| } |
| # ham learned as spam = 0.01% |
| elsif ($class eq 'h' && rand(100) < 0.01) { |
| $spam = 1; |
| } |
| # spam/ham learned correctly |
| elsif (rand(100) < $opt_learn) { |
| if ($class eq 's') { |
| $spam = 1; |
| } |
| elsif ($class eq 'h') { |
| $spam = 0; |
| } |
| else { |
| die "unknown class, learning failed"; |
| } |
| } |
| if (defined $spam) { |
| my $result = ($spam ? "spam" : "ham"); |
| my $status = $spamtest->learn($ma, undef, $spam, 0); |
| my $learned = $status->did_learn(); |
| $result = "undef" if !defined $learned; |
| push(@extra, "learn=".$result); |
| } |
| } |
| |
| if (defined($time)) { |
| push(@extra, "time=".$time); |
| } |
| if ($status && defined $status->{bayes_score}) { |
| push(@extra, "bayes=".sprintf("%06f", $status->{bayes_score})); |
| } |
| if ($opt_mid) { |
| my $mid = $ma->get_header("Message-Id"); |
| if ($mid) { # message contains a Message-Id: |
| while($mid =~ s/\([^\(\)]*\)//s) {}; # remove comments and |
| $mid =~ s/^\s+|\s+$//sg; # leading and trailing spaces |
| $mid =~ s/\s.*$//s; # keep only the first token |
| } |
| else { # it doesn't have a Message-Id: |
| $mid = $id; # so build one from the id |
| $mid =~ s,^.*/,,; # remove the path |
| $mid = "<$mid\@$host.masses.spamassassin.org>"; # and put it together |
| } |
| $mid =~ tr/-A-Za-z0-9_!#%&=~<@>/./c; # replace dangerous chars with . (so regexp search just works) |
| push(@extra, "mid=$mid"); |
| } |
| push(@extra, "scantime=" . ($after - $before)); |
| push(@extra, "format=$format"); |
| |
| if ($opt_logmem) { |
| my $mem = memory_track_finish(); |
| if ($mem) { |
| push(@extra, $mem); |
| } |
| } |
| |
| if ($reuse_rules_loaded_p) { |
| push(@extra, "reuse=yes"); |
| } |
| else { |
| push(@extra, "reuse=no"); |
| } |
| |
| if ($opt_client) { |
| push(@extra, "host=$where"); |
| } |
| |
| my $yorn; |
| my $score; |
| my $tests; |
| my $extra; |
| |
| if ($opt_loguris) { |
| $yorn = '.'; |
| $score = 0; |
| $tests = join(" ", sort @uris); |
| $extra = ''; |
| } else { |
| $yorn = $status->is_spam() ? 'Y' : '.'; |
| # don't bother adjusting scores for reuse |
| $score = $status->get_score(); |
| # list of tests hit |
| my @tests; |
| push @tests, split(/,/, $status->get_names_of_tests_hit()); |
| push @tests, split(/,/, $status->get_names_of_subtests_hit()); |
| # hit reuse |
| if ($x_spam_status) { |
| # generate mapping of hits to remove hits that are marked as skip |
| @tests = grep { !$reuse{$_}->{skip} } @tests; |
| # add hits from previous |
| for (@previous) { |
| push(@tests, $reuse{$_}->{reuse}) if $reuse{$_}->{reuse}; |
| } |
| } |
| $tests = join(",", sort(@tests)); |
| $extra = join(",", @extra); |
| } |
| |
| if (defined $opt_rewrite) { |
| print REWRITE $status->rewrite_mail(); |
| } |
| |
| $id =~ s/\s/_/g; |
| |
| # if we have an origid set, it'll be the server mode's message number, so |
| # attach it to our result appropriately. |
| if (defined $origid) { |
| $out = "$origid "; |
| } |
| |
| $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra); |
| |
| if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { |
| $out .= logkilled($ma, $id, "possible virus"); |
| } |
| |
| if ($opt_loghits) { |
| my $log = ''; |
| foreach my $t (sort keys %{$status->{pattern_hits}}) { |
| $_ = $status->{pattern_hits}->{$t}; |
| $_ ||= ''; |
| s/\r/\\r/gs; # fix unprintables |
| s/\n/\\n/gs; |
| $log .= "$t=\"$_\" "; |
| } |
| if ($log) { |
| chomp $log; |
| $out .= "# $log\n"; |
| } |
| } |
| |
| if (defined $status) { $status->finish(); } |
| $ma->finish(); |
| undef $ma; # clean 'em up |
| undef $status; |
| |
| showdots_blip(); |
| # print ">>>> out = $out\n"; |
| return $out; |
| } |
| |
| sub showdots_blip { |
| return unless ($showdots_active); |
| |
| $showdots_counter++; |
| if ($showdots_counter % $showdots_every == 0) { |
| print STDERR '.'; |
| if ($showdots_counter % (60 * $showdots_every) == 0) { |
| print STDERR "\n"; |
| } |
| } |
| } |
| |
| sub showdots_finish { |
| print STDERR "\n" if ($showdots_active); |
| $showdots_counter = 0; |
| } |
| |
| # ick. We have to go grovelling through the body parts to see if a message |
| # is a report_safe-marked-up message, because a local scanner will overwrite |
| # any remote scanner's X-Spam-Checker-Version header. |
| # |
| sub message_should_be_deencapped { |
| my ($ma) = @_; |
| |
| # not sure why this is undefined, but it is sometimes |
| if (defined $ma->{body_parts} && scalar @{$ma->{body_parts}} > 0) { |
| my $firstpart = $ma->{body_parts}->[0]; |
| if (!$firstpart->{headers}->{'content-type'} |
| || $firstpart->{headers}->{'content-type'} ne 'text/plain') |
| { |
| return 0; # not a 'report_safe' encapsulation |
| } |
| |
| if (scalar @{$firstpart->{raw}} < 3) { return 0; } # too short to be a report |
| |
| # grab first 2 lines |
| my $text = $firstpart->{raw}->[0] . $firstpart->{raw}->[1]; |
| $text =~ s/\s+/ /gs; |
| if ($text =~ /^Spam detection software, running on the system \"(\S+)\"/) { |
| my $hname = $1; |
| if ($hname =~ /$opt_deencap/io) { |
| return 1; |
| } |
| } |
| } |
| |
| return 0; # a different host marked it up. pass it through! |
| } |
| |
| sub logkilled { |
| my ($ma, $id, $reason) = @_; |
| |
| my $from = $ma->get_header("From") || 'undef'; |
| my $to = $ma->get_header("To") || 'undef'; |
| my $subj = $ma->get_header("Subject") || 'undef'; |
| my $mid = $ma->get_header("Message-Id") || 'undef'; |
| chomp ($from); |
| chomp ($to); |
| chomp ($subj); |
| chomp ($mid); |
| return "# skipped killfiled message ($reason): from=$from to=$to subj=$subj mid=$mid id=$id\n"; |
| } |
| |
| sub progress { |
| my ($time) = @_; |
| $time ||= 0; |
| |
| # Are we at the end or otherwise at a point we should print status? Then do it. |
| if ($total_messages == $total_count || $total_count % $statusevery == 0) { |
| my $time = strftime("%Y-%m-%d", localtime($time)); |
| status(sprintf("%3d%% ham: %-6d spam: %-6d date: %s", |
| int(($total_count / $total_messages) * 100), $ham_count, $spam_count, $time)); |
| } |
| } |
| |
| sub status { |
| my($str) = @_; |
| my $now = strftime("%Y-%m-%d %X", localtime(time)); |
| printf STDERR "status: %-48s now: %s\n", $str, $now; |
| } |
| |
| ########################################################################### |
| |
| our ($mem_size, $mem_rss, $mem_shared); |
| |
| sub memory_track_start { |
| if ($^O =~ /linux/i) { |
| if (open (IN, "</proc/$$/statm")) { |
| my $statm = <IN>; |
| close IN; |
| if ($statm =~ /^(\d+) (\d+) (\d+) /) { |
| $mem_size = $1; |
| $mem_rss = $2; |
| $mem_shared = $3; |
| } |
| } |
| } |
| } |
| |
| sub memory_track_finish { |
| my $str = ''; |
| |
| if ($^O =~ /linux/i) { |
| if (open (IN, "</proc/$$/statm")) { |
| my $statm = <IN>; |
| close IN; |
| if ($statm =~ /^(\d+) (\d+) (\d+) /) { |
| my $size = $1; |
| my $rss = $2; |
| my $shared = $3; |
| |
| $str = sprintf ("memsz=%d,memrss=%d,memshr=%d", |
| ($size - $mem_size), |
| ($rss - $mem_rss), |
| ($shared - $mem_shared)); |
| } |
| } |
| } |
| return $str; |
| } |
| |
| sub get_current_svn_revision { |
| my $revision; |
| |
| # this is usually "${TOPDIR}/masses" |
| my $dir = $FindBin::Bin || "."; |
| |
| if (-d "$dir/.svn" || -f "$dir/svninfo.tmp") { |
| if (-f "$dir/svninfo.tmp") { |
| # created by build/automc/buildbot_ready for chrooted mass-checks |
| open (SVNINFO, "< $dir/svninfo.tmp"); |
| } |
| else { |
| # note, ".." since we want to pick up changes outside 'masses' |
| # too! |
| open (SVNINFO, "( svn info --non-interactive $dir/.. || svn info $dir/.. ) 2>&1 |"); |
| } |
| |
| while (<SVNINFO>) { |
| # Revision: 383822 |
| next unless /^Revision: (\d+)/; |
| $revision = $1; |
| last; |
| } |
| close SVNINFO; |
| return $revision if $revision; |
| } |
| |
| # this probably will never work due to Rules Project changes TODO |
| if (open(TESTING, "$opt_c/70_testing.cf")) { |
| chomp($revision = <TESTING>); |
| $revision =~ s/.*\$Rev:\s*(\S+).*/$1/; |
| close(TESTING); |
| return $revision if $revision; |
| } |
| |
| return $revision || "unknown"; |
| } |
| |
| ############################################################################ |
| |
| ## children processors, start and process, used when opt_j > 1 |
| |
| sub start_children { |
| my ($count, $child, $pid, $socket) = @_; |
| |
| my $io = IO::Socket->new(); |
| my $parent; |
| |
| # create children |
| for (my $i = 0; $i < $count; $i++) { |
| ($child->[$i],$parent) = $io->socketpair(AF_UNIX,SOCK_STREAM,PF_UNSPEC) |
| or die "mass-check: socketpair failed: $!"; |
| if ($pid->[$i] = fork) { |
| close $parent; |
| |
| # disable caching for parent<->child relations |
| my ($old) = select($child->[$i]); |
| $|++; |
| select($old); |
| |
| $socket->add($child->[$i]); |
| aidbg "mass-check: starting new child $i (pid ".$pid->[$i].")\n"; |
| next; |
| } |
| elsif (defined $pid->[$i]) { |
| my $result; |
| my $line; |
| |
| close $tmpfd if defined $tmpfd; |
| |
| close $child->[$i]; |
| select($parent); |
| $| = 1; # print to parent by default, turn off buffering |
| send_line($parent,"START"); |
| while ($line = read_line($parent)) { |
| if ($line eq "exit") { |
| close $parent; |
| exit; |
| } |
| |
| my($class, $format, $date, $where, $result) = $iter->_run_message($line); |
| $result ||= ''; |
| |
| # If determine_receive_date is not set, the original input date |
| # wasn't calculated, but run_message would have done so, so reset |
| # the packed version if possible ... use defined for date since |
| # it could == 0. |
| if (!$iter->{determine_receive_date} && $class && $format && defined $date && $where) { |
| $line = Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $where); |
| } |
| |
| send_line($parent,"$result\0RESULT $line"); |
| } |
| exit; |
| } |
| else { |
| die "mass-check: cannot fork: $!"; |
| } |
| } |
| } |
| |
| ## handling killing off the children |
| |
| sub reap_children { |
| my ($count, $socket, $pid) = @_; |
| |
| # If the child died, sending it the exit will generate a SIGPIPE, but we |
| # don't really care since the readline will go undef (which is fine), |
| # then we do the waitpid which will finish it off. So we end up in the |
| # right state, in theory. |
| local $SIG{'PIPE'} = 'IGNORE'; |
| |
| for (my $i = 0; $i < $count; $i++) { |
| aidbg "mass-check: killing child $i (pid ",$pid->[$i],")\n"; |
| send_line($socket->[$i],"exit"); # tell the child to die. |
| close $socket->[$i]; |
| waitpid($pid->[$i], 0); # wait for the signal ... |
| } |
| } |
| |
| # in server mode, this gets called to read in the HTTP request from a given |
| # socket, then return the information the client sent to us. |
| sub handle_http_request { |
| my $socket = shift; |
| |
| my $headers = {}; |
| my $postdata = {}; |
| |
| # read in the request (POST / HTTP/1.0) |
| my $line = $socket->getline(); |
| $line ||= ''; |
| $line =~ s/\r\n$//; |
| |
| my ($type, $URI, $VERS) = $line =~ /^([a-zA-Z]+)\s+(\S+)(?:\s*(\S+))/; |
| unless ($type && $URI && $VERS) { |
| $type ||= ''; |
| $URI ||= ''; |
| |
| return ($type, $URI, $headers, $postdata); |
| } |
| |
| $type = uc $type; |
| |
| # read in headers, "key: value" up to a blank line |
| do { |
| $line = $socket->getline(); |
| last unless defined $line; |
| $line =~ s/\r\n$//; |
| |
| if ($line) { |
| my ($k,$v) = split(/:\s*/, $line, 2); |
| $headers->{lc $k} = $v; |
| } |
| } while ($line !~ /^$/); |
| |
| # if this is a POST request w/ content-length, there'll be a payload, deal |
| # with it. |
| if ($type eq 'POST' && $headers->{'content-length'}) { |
| my $pd; |
| $socket->read($pd, $headers->{'content-length'}); |
| $pd =~ s/[\r\n]+$//; # a hack for manual requests/telnet/etc |
| |
| # key1=value1&key2=value2... |
| %{$postdata} = map { |
| my($k,$v) = split(/=/, $_, 2); |
| |
| # we need to decode the key and value |
| $k =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; |
| $v =~ s/\%([0-9a-fA-F]{2})/sprintf "%c", hex($1)/eg; |
| |
| $k => $v; |
| } split(/\&/, $pd); |
| } |
| |
| return($type, $URI, $headers, $postdata); |
| } |
| |
| # in server mode, generate a gzip compressed data stream with the messages and |
| # return the path to the compressed file which the server will read and pass |
| # to the client. |
| # |
| # Input: |
| # - Number of messages to generate (scalar) |
| # - Hash of Arrays of outstanding requests (reference to hash of array refs) |
| # timestamp# -> [ num1, num2, ... ] |
| # Used to quickly find outstanding/timed out messages to send to client. |
| # - Hash of outstanding messages and associated data (ref to hash of hash refs) |
| # num1 -> { data => 'binary data from scan mode', timestamp => timestamp# } |
| # Used later on to specify the timestamp entry to remove the entry from. |
| # - Paths only? If true, just include the original message data in the gzip |
| # file. Otherwise, include the message data. Useful if the client has the |
| # corpus available via the same paths as originally specified. |
| # |
| # Returns: scalar path to gzip file |
| # |
| sub generate_messages { |
| my($msgs, $timestamps, $msgsout, $paths_only) = @_; |
| |
| # Hold the message numbers we'll be sending out |
| my @tosend = (); |
| |
| # Find out if any of the messages we sent out before need to be sent out |
| # again because we haven't seen a response within the timeout. |
| my $tooold = time - $opt_cs_timeout; |
| foreach (sort { $a <=> $b } keys %{$timestamps}) { |
| # since we're going in numeric order, if the current entry is newer than |
| # the timeout value, the rest will be too, so stop looking. |
| last if ($_ > $tooold); |
| |
| # how many messages do we still need to fulfill the request? |
| my $wanted = $msgs - @tosend; |
| |
| if (@{$timestamps->{$_}} > $wanted) { |
| # there are more entries in the timestamp list than we want, so just |
| # grab that many off the list. |
| push(@tosend, splice @{$timestamps->{$_}}, 0, $wanted); |
| } |
| else { |
| # there are just enough, or not enough entries on the timestamp list to |
| # satisfy our request, so take them all and we'll loop around. |
| push(@tosend, @{$timestamps->{$_}}); |
| delete $timestamps->{$_}; |
| } |
| |
| # Ok, we have enough messages so we can stop now. |
| last if (@tosend == $msgs); |
| } |
| |
| # if we still have the temp file with the input messages open, we'll fillup |
| # out message output queue with messages from there. |
| if ($tmpfd) { |
| while (@tosend < $msgs) { |
| my $msg = read_line($tmpfd); |
| |
| # no more messages from the temp file, close it out |
| unless ($msg) { |
| delete $msgsout->{'curnum'}; |
| close $tmpfd; |
| undef $tmpfd; |
| last; |
| } |
| |
| # we got a result, so assign it a number (curnum) and store the data |
| # appropriately, then add the new number to the queue. |
| my $num = $msgsout->{'curnum'}++; |
| $msgsout->{$num}->{'data'} = $msg; |
| push(@tosend, $num); |
| } |
| } |
| |
| # ok, at this point, @tosend ought to have a list of numbers, pointers into |
| # %{$msgsout}. turn that into a tar file. |
| return '' unless @tosend; |
| |
| my($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); |
| die "Can't make tempfile, exiting" unless $gzpath; |
| close($gzfd); |
| |
| $gzfd = IO::Zlib->new($gzpath, 'wb') || die "Can't create temp gzip file: $!"; |
| |
| # first line is the number of messages included in the file |
| send_line($gzfd, scalar @tosend) || die "mass-check: error when writing to gz temp file\n"; |
| |
| # Generate an archive in the temp file |
| foreach my $num (@tosend) { |
| # Archive format, gzip compressed file w/ 3 parts per message: |
| # 1- server message number in text format |
| # 2- server index string, binary packed format |
| # 3- message content -- unless paths_only |
| send_line($gzfd, $num) || die "mass-check: error when writing to gz temp file\n"; |
| |
| my $data = $msgsout->{$num}->{'data'}; |
| send_line($gzfd, $data) || die "mass-check: error when writing to gz temp file\n"; |
| |
| if (!$paths_only) { |
| my $msg = ($iter->_run_message($data))[4]; |
| send_line($gzfd, join('', @{$msg})) || |
| die "mass-check: error when writing to gz temp file\n"; |
| } |
| } |
| |
| $gzfd->close; |
| |
| # update timestamp entries |
| my $ts = time; |
| foreach (@tosend) { |
| $msgsout->{$_}->{'timestamp'} = $ts; |
| } |
| |
| # conveniently, this list should be the only thing sent out w/ this |
| # timestamp, so just set the reference appropriately. :) |
| $timestamps->{$ts} = \@tosend; |
| |
| if ($opt_noisy) { |
| print "generated ".scalar(@tosend)." messages\n"; |
| } |
| |
| return $gzpath; |
| } |
| |
| # we've gotten results posted, so clean up msgsout and timestamp hashes and |
| # process result... |
| sub handle_post_results { |
| my($postdata, $timestamps, $msgsout) = @_; |
| |
| # local version to batch the removals |
| my %timestamps = (); |
| |
| # $msgsout->{num}->{data|timestamp} |
| # $timestamp{num} = [ msgout_nums ... ] |
| # $postdata{num} = result_string |
| |
| while( my($k,$v) = each %{$postdata} ) { |
| # message run results will be \d+ => log entry |
| next if ($k !~ /^\d+$/); |
| |
| # if we've been waiting for this result, process it, otherwise throw it on |
| # the ground. multiple clients could have been given the same messages to |
| # process, and we take whatever the first responder sends us. |
| if (exists $msgsout->{$k}) { |
| # the result_sub will need parts of the message data, so get it ready |
| my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($msgsout->{$k}->{'data'}); |
| |
| # go ahead and do the result |
| &{$iter->{result_sub}}($d[1], $v, $d[0]); |
| |
| # prep to get rid of the cached entries |
| $timestamps{$msgsout->{$k}->{'timestamp'}}->{$k} = 1; |
| delete $msgsout->{$k}; |
| } |
| } |
| |
| # if we got any results, clean out the results from the timestamp arrays |
| while ( my($k,$v) = each %timestamps ) { |
| # trim out the result list from the timestamp sent list |
| my @temp = grep(!exists $v->{$_}, @{$timestamps->{$k}}); |
| |
| # if there are results left for a specific timestamp, update the array |
| # pointer. otherwise, delete the timestamp entry since it's empty. |
| if (@temp) { |
| $timestamps->{$k} = \@temp; |
| } |
| else { |
| delete $timestamps->{$k}; |
| } |
| } |
| } |
| |
| # This function reads from $tmpfd and processes the message as appropriate wrt |
| # $opt_j, $opt_restart, etc. |
| # |
| sub run_through_messages { |
| # do everything in one process |
| if ($opt_j <= 1 && !defined $opt_restart) { |
| my $message; |
| my $messages; |
| my $total_count = 0; |
| |
| while (($total_messages > $total_count) && ($message = read_line($tmpfd))) { |
| my($class, undef, $date, undef, $result) = $iter->_run_message($message); |
| if ($result) { |
| &{$iter->{result_sub}}($class, $result, $date); |
| } |
| $total_count++; |
| } |
| } |
| # more than one process or one process with restarts |
| else { |
| my $select = IO::Select->new(); |
| |
| my $total_count = 0; |
| my $needs_restart = 0; |
| my @child = (); |
| my @pid = (); |
| my $messages; |
| |
| # start children processes |
| start_children($opt_j, \@child, \@pid, $select); |
| |
| # feed childen, make them work for it, repeat |
| while ($select->count()) { |
| foreach my $socket ($select->can_read()) { |
| my $line = read_line($socket); |
| |
| # some error happened during the read! |
| if (!defined $line) { |
| $needs_restart = 1; |
| warn "mass-check: readline failed, attempting to recover\n"; |
| $select->remove($socket); |
| } |
| elsif ($line =~ /^([^\0]*)\0RESULT (.+)$/s) { |
| my $result = $1; |
| my ($date,$class,$type) = Mail::SpamAssassin::ArchiveIterator::_index_unpack($2); |
| aidbg "mass-check: $class, $type, $date\n"; |
| |
| if (defined $opt_restart && ($total_count % $opt_restart) == 0) { |
| $needs_restart = 1; |
| } |
| |
| # if messages remain, and we don't need to restart, send message |
| if (($total_messages > $total_count) && !$needs_restart) { |
| send_line($socket, read_line($tmpfd)); |
| $total_count++; |
| aidbg "mass-check: $total_messages $total_count\n"; |
| } |
| else { |
| # stop listening on this child since we're done with it |
| aidbg "mass-check: $needs_restart $total_messages $total_count\n"; |
| $select->remove($socket); |
| } |
| |
| # deal with the result we received |
| if ($result) { |
| &{$iter->{result_sub}}($class, $result, $date); |
| } |
| } |
| elsif ($line eq "START") { |
| if ($total_messages > $total_count) { |
| # we still have messages, send one to child |
| send_line($socket, read_line($tmpfd)); |
| $total_count++; |
| aidbg "mass-check: $total_messages $total_count\n"; |
| } |
| else { |
| # no more messages, so stop listening on this child |
| aidbg "mass-check: $needs_restart $total_messages $total_count\n"; |
| $select->remove($socket); |
| } |
| } |
| else { |
| $needs_restart = 1; |
| warn "mass-check: bad line from readline: $line\n"; |
| $select->remove($socket); |
| } |
| } |
| |
| aidbg "mass-check: out of loop, $total_messages $total_count $needs_restart ".$select->count()."\n"; |
| |
| # If there are still messages to process, and we need to restart |
| # the children, and all of the children are idle, let's go ahead. |
| if ($needs_restart && $select->count == 0 && $total_messages > $total_count) { |
| $needs_restart = 0; |
| |
| aidbg "mass-check: needs restart, $total_messages total, $total_count done\n"; |
| reap_children($opt_j, \@child, \@pid); |
| @child=(); |
| @pid=(); |
| start_children($opt_j, \@child, \@pid, $select); |
| } |
| } |
| |
| # reap children |
| reap_children($opt_j, \@child, \@pid); |
| } |
| } |
| |
| # send an HTTP response to a socket based on the input result, headers, and |
| # data values. |
| sub http_response { |
| my($socket, $result, $headers, $data) = @_; |
| |
| print $socket |
| "HTTP/1.0 $result\r\n", |
| "Pragma: no-cache\r\n", |
| "Server: mass-check/$svn_revision\r\n", |
| map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}; |
| print $socket "\r\n"; |
| print $socket $data; |
| } |
| |
| # the client needs to make a request to the server on a given socket. |
| sub http_make_request { |
| my($socket, $type, $uri, $headers, $data) = @_; |
| |
| print $socket |
| "$type $uri HTTP/1.0\r\n", |
| "User-Agent: mass-check/$svn_revision\r\n", |
| map { "$_: ".$headers->{$_}."\r\n" } keys %{$headers}; |
| print $socket "\r\n"; |
| print $socket $data; |
| |
| # parse the response that the server sends us |
| my $line = $socket->getline() || ''; |
| my(undef, $code, $string) = split(/\s+/, $line, 3); |
| return unless $code == 200; |
| |
| my %headers = (); |
| do { |
| $line = $socket->getline(); |
| last unless defined $line; |
| $line =~ s/\r\n$//; |
| |
| if ($line) { |
| my ($k,$v) = split(/:\s*/, $line, 2); |
| $headers{lc $k} = $v; |
| } |
| } while ($line !~ /^$/); |
| |
| # the server has sent us notification that it's going to exit, so let's |
| # follow suit. |
| return 'finished' if ($headers{'finished'}); |
| |
| my $gzpath = ''; |
| if ($headers{'content-length'}) { |
| my $gzfd; |
| ($gzpath, $gzfd) = Mail::SpamAssassin::Util::secure_tmpfile(); |
| die "Can't make tempfile, exiting" unless $gzpath; |
| |
| my $rd; |
| $socket->read($rd, $headers{'content-length'}) || die "mass-check: error reading in data from server\n"; |
| print $gzfd $rd; |
| close $gzfd; |
| } |
| |
| $socket->close(); |
| return $gzpath; |
| } |
| |
| # Be conservative -- encode most things. |
| # we could encode spaces to plusses, then decode that later, but... |
| sub post_encode { |
| my $string = shift; |
| $string =~ s/([^a-zA-Z0-9_,.\/\\-])/sprintf "%%%02x",unpack("C",$1)/egx; |
| return $string; |
| } |
| |
| # remove all of the files in a given directory, non-recursive |
| sub clean_dir { |
| my $dir = shift; |
| |
| unless (opendir(DIR, $dir)) { |
| warn "error: can't opendir $dir: $!\n"; |
| return; |
| } |
| while(my $file = readdir(DIR)) { |
| $file =~ /^(.+)$/; # untaint |
| $file = $1; |
| |
| my $path = File::Spec->catfile($dir, $file); |
| next unless (-f $path); |
| |
| if (!unlink $path) { |
| warn "error: can't remove file $path: $!\n"; |
| closedir(DIR); |
| return; |
| } |
| } |
| closedir(DIR); |
| return 1; |
| } |
| |
| ############################################################################ |
| |
| # four bytes in network/vax format (little endian) as length of message |
| # the rest is the actual message |
| |
| sub read_line { |
| my $fd = shift; |
| my($length,$msg); |
| |
| # read in the 4 byte length and unpack |
| $fd->read($length, 4) || return; |
| |
| $length = unpack("V", $length); |
| return unless $length; |
| |
| # read in the rest of the single message |
| $fd->read($msg, $length) || return; |
| |
| return $msg; |
| } |
| |
| sub send_line { |
| my $fd = shift; |
| foreach ( @_ ) { |
| my $length = pack("V", length $_); |
| $fd->print($length.$_) || return 0; |
| } |
| |
| return 1; |
| } |
| |
| ############################################################################ |
| |
| # this is the function that implemented server mode. basically, sit and wait |
| # for connections to come in. when a client sends in a request, deal with any |
| # results that the client sent, then generate a response and send it back, |
| # and then go back to waiting. lather, rinse, repeat. |
| sub server_mode { |
| $opt_cs_max ||= 1000; |
| $opt_cs_timeout ||= 60 * 5; |
| |
| my $serv_socket = IO::Socket::INET->new( |
| LocalAddr => $opt_server, |
| Proto => 'tcp', |
| Listen => 5, |
| ReuseAddr => 1, |
| ); |
| |
| die "Could not create socket: $!\n" unless $serv_socket; |
| |
| if ($opt_progress) { |
| status('server ready for connections'); |
| } |
| |
| # Setup out "what messages have been sent out" hashes |
| my $timestamps = {}; |
| my $msgsout = { 'curnum' => 0 }; |
| |
| # Generate an IO::Select object and put the server socket on the queue |
| my $select = IO::Select->new( $serv_socket ); |
| |
| # We'll keep looping while there's something to pay attention to |
| while ($select->count()) { |
| # Sit and block until there's something for us to read from |
| foreach my $socket ($select->can_read()) { |
| if ($socket == $serv_socket) { |
| # it's the server socket, go ahead and accept the connection and add |
| # it to the queue. |
| $select->add($serv_socket->accept); |
| } |
| else { |
| # it's some client, so deal with the request |
| my($type, $URI, $headers, $postdata) = handle_http_request($socket); |
| |
| # we don't do GET, so just send something back |
| if ($type eq 'GET') { |
| if ($opt_noisy) { |
| print "GET request from ".$socket->peerhost."\n"; |
| } |
| |
| http_response($socket, "200 OK", { |
| 'Content-type' => 'text/plain', |
| }, |
| "Your GET request came from IP Address: ".$socket->peerhost."\n"); |
| } |
| elsif ($type eq 'POST') { |
| # ooh, POST. deal with any results that the client sent |
| handle_post_results($postdata, $timestamps, $msgsout); |
| |
| if ($opt_noisy) { |
| print "POST request from ".$socket->peerhost."\n"; |
| } |
| |
| # based on the number of messages that the client requested, |
| # generate a gzip file with the appropriate data in it |
| my $messages = ''; |
| if ($postdata->{'max_messages'}) { |
| my $msgnum = $postdata->{'max_messages'}; |
| if ($msgnum > $opt_cs_max || $msgnum < 1) { |
| $msgnum = $opt_cs_max; |
| } |
| |
| if ($opt_noisy) { |
| print "client requested ".$postdata->{'max_messages'}." messages\n"; |
| } |
| |
| $messages = generate_messages($msgnum, $timestamps, $msgsout, $postdata->{'paths_only'}); |
| } |
| |
| # $messages will contain the path to the gzip file if there are |
| # messages to send out. |
| if ($messages && open(MSG, $messages)) { |
| binmode(MSG); |
| local $/ = undef; # go go slurp mode |
| |
| # send the response |
| http_response($socket, "200 OK", { |
| 'Content-Type' => 'application/x-gzip', |
| 'Content-Encoding' => 'x-gzip', |
| "Content-Length" => (-s $messages), |
| }, |
| scalar <MSG>); |
| |
| close(MSG); |
| |
| # we don't need the file anymore, so get rid of it |
| unlink $messages; |
| } |
| elsif (!keys %{$msgsout} && !defined $tmpfd) { |
| # we have no more outstanding messages and our original queue of |
| # messages to process is empty, so tell the client to exit. |
| http_response($socket, "200 OK", { |
| "Content-type" => "text/plain", |
| "Finished" => 1, |
| }, |
| 'We are all done'); |
| } |
| else { |
| # when in doubt, treat this like a GET |
| http_response($socket, "200 OK", { |
| "Content-type" => "text/plain", |
| }, |
| "Your POST request (sans max_messages) came from IP Address: ".$socket->peerhost."\n"); |
| } |
| } |
| else { |
| # for error, "501 Not Implemented" |
| http_response($socket, '501 Not Implemented', {}, ''); |
| } |
| |
| # ok, we don't do keepalive, so get rid of the socket |
| $select->remove($socket); |
| $socket->close; |
| } |
| } |
| |
| if ($opt_noisy) { |
| print scalar(keys %{$msgsout})." messages outstanding\n"; |
| } |
| |
| |
| #print "msgs waiting: ".join(" ", keys %{$msgsout})."\n"; |
| #print "tmpfd defined? ".(defined $tmpfd ? "yes" : "no")."\n"; |
| |
| # we're not awaiting responses and we've exhausted the input file, so |
| # drop the server socket. :) |
| $select->remove($serv_socket) if (!keys %{$msgsout} && !defined $tmpfd); |
| } |
| } |
| |
| # this is the function that implements client mode. generally, in a loop: |
| # make a request of the server for some max number of messages, and send our |
| # results back at the same time. based on the results of that request, put |
| # messages into a temp dir and process them. prep the results and loop. |
| # lather, rinse, repeat. |
| sub client_mode { |
| $opt_cs_max ||= 1000; |
| $opt_cs_timeout ||= 60 * 2; |
| |
| my($host, $uri); |
| |
| if ($opt_client =~ /^http:\/\/([^\/]+)(\/.*)?/) { |
| ($host, $uri) = ($1,$2); |
| } |
| else { |
| $host = $opt_client; |
| if ($host =~ /^:/) { |
| $host = 'localhost'.$host; |
| } |
| } |
| my($http_host) = split(/:/, $host); |
| |
| die "No host found in opt_client" unless $host; |
| $uri ||= "/"; |
| |
| # use this to track how many messages we ought to be requesting |
| # start at 100 to get warmed up |
| my $msgnum = $opt_cs_max > 100 ? 100 : $opt_cs_max; |
| |
| my $tmpdir; |
| |
| # if we're not doing paths_only, create a temp dir where we'll put the |
| # incoming messages to process. |
| if (!$opt_cs_paths_only) { |
| $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir(); |
| die "Can't create tempdir" unless $tmpdir; |
| } |
| |
| # keep going until something stops us. |
| while (1) { |
| # if the number of messages to request is too much, bring it down |
| $msgnum = $opt_cs_max if ($msgnum > $opt_cs_max); |
| |
| # prep the POST request |
| $postdata{'max_messages'} = $msgnum; |
| $postdata{'paths_only'} = 1 if ($opt_cs_paths_only); |
| |
| # the actual POST data string |
| my $POSTDATA = join('&', map { post_encode($_) . '=' . post_encode($postdata{$_}) } keys %postdata); |
| |
| # connect to server |
| my $socket = IO::Socket::INET->new($host); |
| |
| # last if connection fails |
| last unless ($socket); |
| |
| print "Requesting $msgnum messages from server\n" if ($opt_noisy); |
| |
| # make request, include and then drop results if there are any |
| my $result = http_make_request($socket, 'POST', $uri, { |
| 'Host' => $http_host, |
| 'Content-Type' => 'application/x-www-form-urlencoded', |
| 'Content-Length' => length($POSTDATA), |
| }, |
| $POSTDATA |
| ); |
| %postdata = (); |
| undef $POSTDATA; |
| |
| # If we received messages to run through, go ahead and do it. |
| # otherwise, just sleep for the timeout length and try again |
| if (!defined $result) { |
| # we got an error?!? abort! |
| last; |
| } |
| elsif ($result eq 'finished') { |
| # the server said that we're done |
| print "Server states that there is no more work, exiting.\n" if ($opt_noisy); |
| last; |
| } |
| elsif ($result eq '') { |
| # no messages means the server may give us more work down the road. |
| # sleep for client_timeout seconds and try the request again |
| print "Received no messages from server, waiting $opt_cs_timeout seconds\n" if ($opt_noisy); |
| sleep $opt_cs_timeout; |
| } |
| else { |
| # we got messages, so deal with them. |
| my $time_start = time; |
| |
| # postdata will hold our results, real will hold the original message |
| # data from the server's scan mode. |
| %postdata = (); |
| %real = (); |
| $init_results = $total_count = $spam_count = $ham_count = 0; |
| |
| # we got a result, so do things with it! |
| my $gzfd = IO::Zlib->new($result, "rb"); |
| die "Can't open temp result file: $!" unless $gzfd; |
| |
| # used for the temp queue file |
| my $tmppath; |
| ($tmppath, $tmpfd) = Mail::SpamAssassin::Util::secure_tmpfile(); |
| die "Can't make tempfile, exiting" unless $tmppath; |
| unlink $tmppath; |
| |
| # if we have a temp directory, clean it out for this run |
| clean_dir($tmpdir) if ($tmpdir); |
| |
| # Archive format, gzip compressed file w/ 3 parts per message: |
| # 1- server message number in text format |
| # 2- server index string, binary packed format |
| # 3- message content, if not doing paths_only |
| |
| # number of messages |
| $msgnum = $total_messages = read_line($gzfd) || die "mass-check: error reading from gzip message file\n"; |
| |
| status("server gave us $total_messages messages") if ($opt_progress); |
| |
| # loop through and prep all of the messages the server sent |
| for(my $i = 0 ; $i < $total_messages; $i++ ) { |
| my $num = read_line($gzfd); |
| last unless defined $num; |
| |
| my $index = read_line($gzfd); |
| last unless defined $index; |
| |
| # if we're doing paths_only, there'll be no message content |
| if (!$opt_cs_paths_only) { |
| my $msg = read_line($gzfd); |
| last unless defined $msg; |
| |
| # it's going to be a dir of file formatted messages |
| if (open(OUT, ">$tmpdir/$num")) { |
| print OUT $msg; |
| close(OUT); |
| |
| # this is a little tricky -- we need to process the files in the |
| # path and format we've created, but the original data is needed |
| # to create a proper result later, so deal with that here. |
| my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index); |
| $real{"$tmpdir/$num"} = \@d; |
| send_line($tmpfd, |
| Mail::SpamAssassin::ArchiveIterator::_index_pack($d[0], $d[1], 'f', "$tmpdir/$num")) || |
| die "mass-check: error writing out temp file in client mode\n"; |
| } |
| else { |
| warn "Can't create/write $tmpdir/$num: $!"; |
| } |
| } |
| else { |
| # in paths_only mode, there's no kluging between formats since we're |
| # reading the same corpus, however we do still need to track server |
| # message number to message data so our results will be useable. |
| my @d = Mail::SpamAssassin::ArchiveIterator::_index_unpack($index); |
| $real{$d[3]} = $num; |
| send_line($tmpfd, $index) || |
| die "mass-check: error writing out temp file in client mode\n"; |
| } |
| } |
| |
| $gzfd->close; |
| unlink $result; |
| |
| if ($opt_progress) { |
| status('starting run stage'); |
| } |
| |
| # we're about to start running, so go back to the start of the file |
| seek $tmpfd, 0, 0; |
| |
| run_through_messages(); |
| |
| # we're done with the temp file -- bye bye |
| close($tmpfd); |
| |
| # figure out new max messages, try keeping ~cs_timeout between runs |
| my $time_end = time; |
| |
| # if we only requested a small number of messages, it may take <1s to |
| # run through them, so fake it and say it took 1s. |
| if ($time_end == $time_start) { |
| $time_end++; |
| } |
| |
| if ($opt_progress) { |
| status('completed run stage'); |
| } |
| |
| print "Completed run in ".($time_end-$time_start)." seconds\n" if ($opt_noisy); |
| $msgnum = int($msgnum * $opt_cs_timeout / ($time_end-$time_start)) || 1; |
| } |
| } |
| |
| # if we were using a temp dir, clean it out and then remove it |
| if ($tmpdir) { |
| clean_dir($tmpdir); |
| rmdir $tmpdir; |
| } |
| } |
| |
| ############################################################################ |
| |
| # in server mode, just return the ref to the message data |
| sub wanted_server { |
| my ($class, $id, $time, $dataref, $format) = @_; |
| return $dataref; |
| } |
| |
| # very similar to result() except the result has the message number at the |
| # front, so strip it off and then set the POST data appropriately. |
| sub result_client { |
| my ($class, $result, $time) = @_; |
| |
| # don't open results files until we get here to avoid overwriting files |
| init_results() if !$init_results; |
| |
| if ($class eq "s") { |
| $spam_count++; |
| } |
| elsif ($class eq "h") { |
| $ham_count++; |
| } |
| |
| $total_count++; |
| |
| if ($opt_progress) { |
| progress($time); |
| } |
| |
| if ($result =~ s/^(\d+)\s+//m) { |
| $postdata{$1} = $result; |
| } |
| else { |
| warn ">> WTH!? result is not in the correct format: $result\n"; |
| } |
| } |
| |
| sub aidbg { |
| if (would_log("dbg", "mass-check") == 2) { |
| dbg (@_); |
| } |
| } |
| |
| sub deal_with_before_after { |
| my($which, $time) = @_; |
| |
| if ($time && $time =~ /^-\d+$/) { |
| $time = time + $time; |
| } |
| elsif ($time && $time !~ /^-?\d+$/) { |
| if (HAS_TIME_PARSEDATE) { |
| $time = Time::ParseDate::parsedate($time, GMT => 1, PREFER_PAST => 1); |
| } |
| else { |
| die "You need Time::ParseDate if you use either the --before or --after option."; |
| } |
| } |
| |
| if ($which eq 'before') { |
| $opt_before = $time; |
| } |
| else { |
| $opt_after = $time; |
| } |
| |
| if ($opt_before && $opt_after && $opt_after >= $opt_before) { |
| die "--before ($opt_before) <= --after ($opt_after) -- conflict!"; |
| } |
| } |
| |
| sub generate_queue { |
| my ($targets, $tmpfd) = @_; |
| |
| # scan the targets and get the number and list of messages |
| $iter->_scan_targets($targets, |
| sub { |
| my($self, $date, $class, $format, $mail) = @_; |
| push(@{$self->{$class}}, Mail::SpamAssassin::ArchiveIterator::_index_pack($date, $class, $format, $mail)); |
| } |
| ); |
| |
| # deal with opt_head and opt_tail |
| top_and_tail_messages($iter->{h}); |
| top_and_tail_messages($iter->{s}); |
| |
| my $messages; |
| if ($opt_n) { |
| # OPT_N == 1 means don't bother sorting on message receive date |
| |
| # for ease of memory, we'll play with pointers |
| $messages = $iter->{s}; |
| undef $iter->{s}; |
| push(@{$messages}, @{$iter->{h}}); |
| undef $iter->{h}; |
| } |
| else { |
| # OPT_N == 0 means sort on message receive date |
| |
| # Sort the spam and ham groups by date |
| my @s = @{$iter->{s}}; |
| undef $iter->{s}; |
| my @h = @{$iter->{h}}; |
| undef $iter->{h}; |
| |
| # interleave ordered spam and ham |
| if (@s && @h) { |
| my $ratio = @s / @h; |
| while (@s && @h) { |
| push @{$messages}, (@s / @h > $ratio) ? (shift @s) : (shift @h); |
| } |
| } |
| # push the rest onto the end |
| push @{$messages}, @s, @h; |
| } |
| |
| # head or tail < 0 means crop the total list, negate the value appropriately |
| if ($opt_tail < 0) { |
| splice(@{$messages}, 0, $opt_tail); |
| } |
| if ($opt_head < 0) { |
| splice(@{$messages}, -$opt_head); |
| } |
| |
| my $num = $Mail::SpamAssassin::ArchiveIterator::MESSAGES = scalar(@{$messages}); |
| |
| # Dump out the number of messages and the message index info to |
| # the temp file |
| send_line($tmpfd, $num, @{$messages}); |
| } |
| |
| sub top_and_tail_messages { |
| my ($ary) = @_; |
| |
| if ($opt_n) { |
| # OPT_N == 1 means don't bother sorting on message receive date |
| |
| # head or tail > 0 means crop each list |
| if ($opt_tail > 0) { |
| splice(@{$ary}, 0, -$opt_tail); |
| } |
| if ($opt_head > 0) { |
| splice(@{$ary}, min ($opt_head, scalar @{$ary})); |
| } |
| } |
| else { |
| # OPT_N == 0 means sort on message receive date |
| |
| # Sort the spam and ham groups by date |
| my @s = sort { $a cmp $b } @{$ary}; |
| |
| # head or tail > 0 means crop each list |
| if ($opt_tail > 0) { |
| splice(@s, 0, -$opt_tail); |
| } |
| if ($opt_head > 0) { |
| splice(@s, min ($opt_head, scalar @s)); |
| } |
| |
| @{$ary} = @s; |
| } |
| } |
| |
| sub min { |
| return ($_[0] < $_[1] ? $_[0] : $_[1]); |
| } |