#!/usr/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use Getopt::Long;

# command-line options:
#   --tag=X       single-letter mass-check tag (default 'n' = nightly)
#   --override=S  pipe-separated "key=value" pairs overriding ~/.corpus
our ( $opt_override, $opt_tag );
GetOptions(
    "tag=s" => \$opt_tag,
    "override=s" => \$opt_override,
);

$opt_override ||= '';
$opt_tag ||= 'n';       # nightly is the default

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd qw(abs_path);
use POSIX qw(nice strftime);

use constant WEEK => 7*60*60*24;    # one week in seconds
nice(15);       # lower our scheduling priority; this is batch work

# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
# the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);

# what's the max age of mail we will accept data from? (in weeks)
# TODO: maybe this should be in ~/.corpus
my $OLDEST_HAM_WEEKS    = 72 * 4;       # 72 months = 6 years
my $OLDEST_SPAM_WEEKS    = 2 * 4;       # 2 months
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# forward declaration so runcmd can be used without parentheses below
sub runcmd;
my $configuration = "$ENV{HOME}/.corpus";
my %cf;                       # config key/value pairs from ~/.corpus (+ --override)
my %revision = ();            # per-logfile: SVN revision it was mass-checked against
my %filesize = ();            # per-logfile: size in bytes
my %dateline = ();            # per-logfile: raw "# Date:" header value
my %mtime = ();               # per-logfile: mtime from stat()
my %logs_by_daterev = ();     # daterev string -> [ log filenames ]
my %is_net_daterev = ();      # daterev string -> 1 if any --net logs present
my %time = ();                # per-logfile: parsed mass-check start time_t
my @tmps = ();                # temp files to remove in clean_up()
my $time_start = time;        # reference point for all age filtering
my $output_revpath;           # set by create_outputdir(); used in the final URL
my $perl_path = $^X;          # run child perl scripts with the same interpreter

configure();
init();

my $logsdir = "$cf{html}/logs";
print "reading logs from '$logsdir'\n";

# main flow: find and parse logs, build reports, remove temp files
locate_input();
generate_logs();
clean_up();
exit;
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub configure { | 
 |   # does rough equivalent of source | 
 |   open(C, $configuration) || die "open failed: $configuration: $!\n"; | 
 |   my $pwd = Cwd::getcwd; | 
 |  | 
 |   # add 'override' options | 
 |   my @lines = (<C>, split(/\|/, $opt_override)); | 
 |  | 
 |   foreach (@lines) { | 
 |     chomp; | 
 |     s/#.*//; | 
 |     if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) { | 
 |       my ($key, $val) = ($1, $2); | 
 |       $val =~ s/\$PWD/$pwd/gs; | 
 |       $cf{$key} = $val; | 
 |     } | 
 |   } | 
 |   close(C); | 
 |  | 
 |   $cf{output_classes} ||= | 
 |             "DETAILS.new DETAILS.all DETAILS.age NET.new NET.all NET.age"; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# Remove our per-process temp files; also installed as the INT/TERM signal
# handler in init().  NOTE(review): as a signal handler this returns and
# lets the interrupted code continue rather than exiting -- confirm that
# is intentional.
sub clean_up {
  # leave any directory we may be about to delete from under ourselves
  chdir "/";
  runcmd "rm -rf $cf{tmp}/*.$$ ".join(' ', @tmps);
}
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub init { | 
 |   $SIG{INT} = \&clean_up; | 
 |   $SIG{TERM} = \&clean_up; | 
 |  | 
 |   $ENV{RSYNC_PASSWORD} = $cf{password}; | 
 |   $ENV{TIME} = '%e,%U,%S'; | 
 |   $ENV{TZ} = 'UTC'; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# Scan $logsdir for recent ham/spam mass-check logs, parse their "#" header
# block for the start time and SVN revision, and group them by daterev in
# %logs_by_daterev.  Also fills the per-file metadata hashes (%time,
# %revision, %dateline, %filesize, %mtime), marks daterevs that contain
# --net logs, and triggers rule-metadata extraction for each revision seen.
sub locate_input {
  opendir(CORPUS, $logsdir);
  my @files = sort readdir(CORPUS);
  closedir(CORPUS);

  # only plain ham/spam log files modified within the last 10 days
  @files = grep {
    /^(?:spam|ham)-(?:net-)?\S+\.log$/ && -f "$logsdir/$_" && -M _ < 10 
  } @files;

  foreach my $file (@files) {
    my $tag = 0;
    my $headers = '';

    open(FILE, "$logsdir/$file") or warn "cannot read $logsdir/$file";
    while (my $line = <FILE>) {
      # the header comment block ends at the first non-"#" line
      last if $line !~ /^#/;
      $headers .= $line;
      # ctime-style date, e.g. "..., on Mon May 18 08:00:00 UTC 2009"
      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
        my ($datepre, $hh, $datepost) = ($1,$2,$3);
        
        my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                    GMT => 1, PREFER_PAST => 1);

        $time{$file} = $timet;
      }
      elsif ($line =~ m/^# Date:\s*(\S+)/) {
        # a better way to do the above.  TODO: parse it instead
        $dateline{$file} = $1;
        if (!defined $time{$file}) {
          # if time line unparseable (localized?) use this instead
          my ($yyyy, $mm, $dd, $h, $m, $s) = $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
          
          my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                    GMT => 1, PREFER_PAST => 1);

          $time{$file} = $timet;
        }
      }
      elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $revision{$file} = $1;
      }
    }
    close(FILE);

    my @s = stat("$logsdir/$file");
    $filesize{$file} = $s[7];     # size in bytes
    $mtime{$file} = $s[9];        # last-modified time

    # skip logs we cannot place in time or attribute to a revision
    if (!defined $time{$file}) {
      warn "$logsdir/$file: no time found, ignored\n"; next;
    }
    if (!defined $revision{$file}) {
      warn "$logsdir/$file: no revision found, ignored\n"; next;
    }
    if ($revision{$file} eq 'unknown') {
      warn "$logsdir/$file: not tagged with a revision, ignored\n"; next;
    }

    my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);

    $logs_by_daterev{$daterev} ||= [ ];
    push (@{$logs_by_daterev{$daterev}}, $file);

    # "-net-" in the filename marks a --net mass-check (scoreset 1)
    if ($file =~ /-net-/) {
      $is_net_daterev{$daterev} = 1;
      print "$logsdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
    }
    else {
      print "$logsdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
    }

    get_rulemetadata_for_revision($daterev, $revision{$file});
  }
}
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub sort_all { | 
 |   my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); | 
 |   my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); | 
 |   $a1 =~ s/^[\+\-]//; | 
 |   $b1 =~ s/^[\+\-]//; | 
 |  | 
 |   my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || '')); | 
 |   if ($a1 =~ /^OVERALL/)			{ $n -= 1000; } | 
 |   elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; } | 
 |   elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; } | 
 |   if ($b1 =~ /^OVERALL/)			{ $n += 1000; } | 
 |   elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; } | 
 |   elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; } | 
 |   return $n; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub time_filter_fileset { | 
 |   my ($fileary, $outname, $after, $before) = @_; | 
 |  | 
 |   my $timet_before = (defined $before ?  | 
 |             ($time_start - ($before * WEEK)) : $time_start+1); | 
 |   my $timet_after  = (defined $after ?  | 
 |             ($time_start - ($after * WEEK))  : 0); | 
 |  | 
 |   open(TMP, ">$outname") or warn "cannot write $outname"; | 
 |   for my $file (@{$fileary}) { | 
 |     open(IN, $file) or warn "cannot read $file"; | 
 |     while (<IN>) { | 
 |       next unless /\btime=(\d+)/; | 
 |       next if ($1 < $timet_after || $1 > $timet_before); | 
 |       print TMP; | 
 |     } | 
 |     close IN; | 
 |   } | 
 |   close TMP or warn "failed to close $outname"; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub generate_logs { | 
 |   foreach my $entry (split(' ', $cf{output_classes})) { | 
 |     $entry =~ /^(\S+)\.(\S+)$/; | 
 |     my $class = $1; | 
 |     my $rtype = $2; | 
 |     if (!$rtype) { warn "no rtype in $entry"; next; } | 
 |     if ($class eq 'HTML') { warn "class HTML in $entry obsolete, ignored"; next; } | 
 |  | 
 |     foreach my $daterev (reverse sort keys %logs_by_daterev) { | 
 |       my $rev; | 
 |       if ($daterev !~ /\/r(\d+)/) { | 
 |         warn "bad daterev: $daterev"; next; | 
 |       } | 
 |       $rev = $1; | 
 |  | 
 |       if ($class eq "NET") { | 
 |         next unless $is_net_daterev{$daterev}; | 
 |       } | 
 |  | 
 |       gen_class ($daterev, $rev, $class, $rtype); | 
 |     } | 
 |   } | 
 | }  | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# State shared between gen_class() and the gen_report_freqs_* helpers:
#   $tmp_h/$tmp_s          -- per-process temp ham/spam log paths
#   $no_messages_in_freqs  -- set if hit-frequencies saw no log lines at all
#   $hf_flags              -- extra flags passed to hit-frequencies
my ($tmp_h, $tmp_s, $no_messages_in_freqs, $hf_flags);

# Generate one report file "$cf{html}/$daterev/$class.$rtype" from the logs
# grouped under $daterev.  Skips the work, make-style, if the target is
# already fresher than all of its source logs.
sub gen_class {
  my ($daterev, $rev, $class, $rtype) = @_;
  # NET reports are only produced for this small set of report types
  return if ($class eq "NET" && $rtype !~ /^(?:new|all|age|7day)$/);

  chdir $logsdir;
  print STDERR "\ngenerating: $cf{html}/$daterev/$class.$rtype\n";

  my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
  print STDERR "input h: " . join(' ', @ham) . "\n";

  my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};
  print STDERR "input s: " . join(' ', @spam) . "\n";

  # net vs. local
  if ($class eq "NET") {
    @ham = grep { /-net-/ } @ham;
    @spam = grep { /-net-/ } @spam;
  }
  # age: "Nday" report types only use logs modified in the last N days
  if ($rtype =~ /(\d+)day/) {
    my $mtime = $1;
    @ham = grep { -M $_ < $mtime } @ham;
    @spam = grep { -M $_ < $mtime } @spam;
  }

  print STDERR "selected h: " . join(' ', @ham) . "\n";
  print STDERR "selected s: " . join(' ', @spam) . "\n";
  
  # we cannot continue if we have no files that match the criteria...
  # demand at least 1 ham and 1 spam file
  if (scalar @spam <= 0 || scalar @ham <= 0) {
    warn "not enough files found matching criteria ($daterev $class $rtype)\n";
    return;
  }

  my $dir = create_outputdir($daterev);
  my $fname = "$dir/$class.$rtype";

  # now, if the target file already exists, check to see if it's newer
  # than all the sources, make-style; if not, don't re-create it
  if (-f $fname) {
    my $targetfreshness = (-M $fname);
    my $needsrebuild = 0;

    foreach my $srcfile (@spam, @ham) {
      my $srcfreshness = (-M $srcfile);
      if ($targetfreshness > $srcfreshness) {     # src is fresher
        print "need rebuild, $fname is older than $srcfile: $targetfreshness > $srcfreshness\n";
        $needsrebuild = 1;
        last;
      }
    }

    if (!$needsrebuild) {
      print "existing: $fname, fresher than sources\n";
      return;
    }
  }

  my $when = scalar localtime time;
  print qq{creating: $fname ($class)
  started $when...
};
  my $bytes = 0;

  # build into a PID-suffixed temp file, then rename into place
  my $tmpfname = "$fname.$$";
  if ($class eq 'LOGS') {
    $bytes = gen_report_logs($fname, \@ham, \@spam);
  }
  elsif ($class eq 'CORPUS') {
    push (@tmps, abs_path($tmpfname));

    my $cmd = "$perl_path $cf{tree}/masses/logs-to-corpus-report ".
          join(" ", @ham)." ".join(" ", @spam)." > $tmpfname";
    runcmd $cmd;
    ($? >> 8 == 0) or warn "failed to run logs-to-corpus-report";

    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
    $bytes = (-s $fname);
  }
  else {
    # freqs-style reports (DETAILS, NET, OVERLAP, SCOREMAP, ...)
    push (@tmps, abs_path($tmpfname));

    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
    print OUT "# ham results used for $daterev $class $rtype: " . join(" ", @ham) . "\n";
    print OUT "# spam results used for $daterev $class $rtype: " . join(" ", @spam) . "\n";
    print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";

    # per-class hit-frequencies flags
    $hf_flags = "";
    $hf_flags = "-t net -s 1" if $class eq "NET";
    $hf_flags = "-o" if $class eq "OVERLAP";
    $hf_flags = "-S" if $class eq "SCOREMAP";

    if ($cf{rules_dir}) {
      $hf_flags .= " -c '$cf{rules_dir}'";
    }

    # are we analyzing --net mass-check logs?  if so, use scoreset 1
    if (join(" ", @ham) =~ /-net-/) {
      $hf_flags .= " -s 1" if $class eq "NET";
    }

    # catch an odd error condition, where hit-frequencies creates output
    # with no log lines included at all
    $no_messages_in_freqs = 0;

    $tmp_h = "$cf{tmp}/ham.log.$$";
    $tmp_s = "$cf{tmp}/spam.log.$$";

    if ($rtype eq "all") {
      gen_report_freqs_all($tmpfname, \@ham, \@spam, $rev);
    }
    elsif ($rtype eq "age") {
      gen_report_freqs_age($tmpfname, \@ham, \@spam, $rev);
    }
    elsif (@ham && @spam) {
      gen_report_freqs_basic($tmpfname, \@ham, \@spam, $rev);
    }

    # NOTE(review): -s is taken on OUT before close(), so still-buffered
    # output may not be counted yet -- confirm the byte count only needs
    # to be approximate here
    $bytes = (-s OUT);
    close(OUT);

    unlink $tmp_h, $tmp_s;

    if ($no_messages_in_freqs) {
      warn "ERROR: no data in freqs!  aborting, leaving tmp file as $tmpfname";
      return;
    }

    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";

    # compress for certain classes
    if ($class eq "OVERLAP") {
      $fname =~ s/'//gs;    # strip quotes so the shell command below stays sane
      runcmd ("rm '$fname.gz'; pigz '$fname'"); 
      # takes care of keeping the original around so we don't have to
      if ($? >> 8 != 0) { warn "pigz '$fname' failed"; }
    }
  }

  $when = scalar localtime time;
  print qq{created: $bytes bytes, finished at $when
URL:

  $cf{ruleqa_url}$output_revpath

};

}
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub mk_daterev { | 
 |   my ($timet, $rev, $tag) = @_; | 
 |   return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag"; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub create_outputdir { | 
 |   my ($revpath) = @_; | 
 |   my $dir = $cf{html} .'/'. $revpath; | 
 |  | 
 |   # print "output dir: $dir\n"; | 
 |   if (!-d $dir) { | 
 |     my $prevu = umask 0; | 
 |     mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir"; | 
 |     umask $prevu; | 
 |   } | 
 |  | 
 |   $output_revpath = $revpath;       # set the global | 
 |   $output_revpath =~ s/\//-/;       # looks nicer | 
 |  | 
 |   return $dir; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub log_metadata_xml { | 
 |   my ($daterev, @files) = @_; | 
 |   my $str = ''; | 
 |  | 
 |   # this is extracted into the info.xml file later by the gen_info_xml script | 
 |   foreach my $f (@files) { | 
 |     $str .= qq{ | 
 |       <mclogmd file='$f'> | 
 |         <daterev>$daterev</daterev> | 
 |         <rev>$revision{$f}</rev> | 
 |         <fsize>$filesize{$f}</fsize> | 
 |         <mcstartdate>$dateline{$f}</mcstartdate> | 
 |         <mtime>$mtime{$f}</mtime> | 
 |       </mclogmd> | 
 |     }; | 
 |   } | 
 |  | 
 |   $str =~ s/\s+/ /gs;  # on a single line please | 
 |   return '<mclogmds>'.$str.'</mclogmds>'; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub create_rulemetadata_dir { | 
 |   my $rev = shift; | 
 |   my $dir = "$cf{html}/rulemetadata/$rev"; | 
 |   if (!-d $dir) { | 
 |     my $prevu = umask 0; | 
 |     mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir"; | 
 |     umask $prevu; | 
 |   } | 
 |   return $dir; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# Run the get-rulemetadata-for-revision helper for $rev, writing its output
# into the per-revision rulemetadata directory.  Temporarily chdirs into
# the masses/ tree (see comment below) and returns to $logsdir afterwards.
sub get_rulemetadata_for_revision {
  my ($daterev, $rev) = @_;

  my $dir = create_rulemetadata_dir($rev);

  # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
  # "tree" path, so we have to ensure we're in the 'masses' dir
  # for this to work!
  chdir "$cf{tree}/masses" or die "cannot chdir $cf{tree}/masses";

  my $cmd = "$cf{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
                    "--rev=$rev --outputdir='$dir'";

  runcmd($cmd);
  # runcmd uses the shell; $? holds the child's wait status
  if ($? >> 8 != 0) {
    warn "'$cmd' failed";
  }

  chdir $logsdir;
}
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# Check out (or "svn up") the tree at r$rev under $cf{tmp}/hfdir/r$rev,
# rebuild the rules, and start "hit-frequencies $args" from its masses/
# directory.  Output is left on the global FREQS filehandle; the caller
# must read from and close FREQS.  Dies if a full checkout fails.
sub start_hit_frequencies_at_rev {
  my ($rev, $args) = @_;

  $rev ||= 'HEAD';
  (-d "$cf{tmp}/hfdir") or runcmd("mkdir -p $cf{tmp}/hfdir");
  my $hfdir = "$cf{tmp}/hfdir/r$rev";
  # presence of this file is our sanity marker for a usable working copy
  my $expected_svn_file = "$hfdir/Makefile.PL";

  print "setting up hit-frequencies for r$rev in $hfdir\n";

  my $needs_checkout = 0;
  if (-d $hfdir && chdir $hfdir) {
    eval {
      # "svn up" has been observed to wedge on the ruleqa zone VM, put a timeout so we can recover
      local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
      alarm 60*60;	# an hour should be generous enough
      runcmd("svn up -r$rev");
      alarm 0;
    };

    # eval died (timeout), svn failed, or the tree looks broken
    if ($@ || $?>>8 != 0 || !-f $expected_svn_file) {
      print "simple 'svn update' failed. performing full checkout instead...\n";
      $needs_checkout = 1;
    }
  } else {
    $needs_checkout = 1;
  }

  if ($needs_checkout) {
    my $svnurl = get_svn_url();
    runcmd("rm -rf $hfdir");    # start from scratch
    eval {
      local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
      alarm 60*60;	# an hour should be generous enough
      runcmd("svn co $svnurl\@$rev $hfdir");
      alarm 0;
    };
    if ($@ || $?>>8 != 0 || !-f $expected_svn_file) { die "svn co failed"; }
  }
  chdir "$hfdir" or die "cannot chdir $hfdir";

  # ensure these are rebuilt
  runcmd "rm -f rules/70_sandbox.cf rules/72_active.cf";

  # do this twice in case Makefile.PL is rebuilt
  runcmd "( make build_rules || $perl_path Makefile.PL;make build_rules )</dev/null";

  chdir "$hfdir/masses" or die "cannot chdir $hfdir/masses";
  open (FREQS, "$perl_path hit-frequencies -TxpagP $args |")
            or die "cannot run ./hit-frequencies $args |";

  chdir $logsdir;
}
 |  | 
 | # --------------------------------------------------------------------------- | 
 |    | 
 | sub get_svn_url { | 
 |   open (SVNINFO, "svn info $cf{tree}|") or die "cannot run svn info"; | 
 |   my $svnurl; | 
 |   while (<SVNINFO>) { | 
 |     /URL: (.*)$/ and $svnurl = $1; | 
 |   } | 
 |   close SVNINFO or die "cannot close svn info"; | 
 |   return $svnurl; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# For the LOGS class: compress each source log to "$fname-<log>.gz" and
# touch $fname itself as a freshness marker so re-runs skip recompression.
# Returns the total compressed size in bytes.
# NOTE(review): the s/// below writes through the foreach alias, so it also
# rewrites the caller's @ham/@spam entries, and pigz then reads from the
# *sanitized* name -- confirm log filenames never contain characters that
# would be rewritten (the names matched in locate_input() appear safe).
sub gen_report_logs {
  my ($fname, $hamref, $spamref) = @_;

  my $bytes = 0;
  foreach my $f (@$hamref, @$spamref) {
    $f =~ s/[^-\._A-Za-z0-9]+/_/gs;    # sanitize!
    my $zf = "$fname-$f.gz";

    # compress to a PID-suffixed temp file, then rename into place
    runcmd("pigz -c < $f > $zf.$$");
    if ($? >> 8 != 0) {
      warn "pigz -c < $f > $zf.$$ failed";
    }

    rename("$zf.$$", $zf) or
                  warn "cannot rename $zf.$$ to $zf";
    $bytes += (-s $zf);
  }

  # this is just so we won't recompress these logs if re-run
  open TOUCH, ">$fname" or warn "cannot write to $fname";
  close TOUCH;

  return $bytes;
}
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
# "all" report: run hit-frequencies once per submitter (each output line
# suffixed ":username"), then once over the combined logs, and write the
# whole lot sorted through sort_all to the global OUT handle.  Reads from
# the global FREQS handle opened by start_hit_frequencies_at_rev().
sub gen_report_freqs_all {
  my ($tmpfname, $hamref, $spamref, $rev) = @_;
  my %spam;     # username -> spam log filename
  my %ham;      # username -> ham log filename
  my @output;
  
  for my $file (@$spamref) {
    my $u = extract_username_from_log_filename($file);
    $spam{$u} = $file; print "username in spam log: $u\n";
  }
  for my $file (@$hamref) {
    my $u = extract_username_from_log_filename($file);
    $ham{$u} = $file; print "username in ham log: $u\n";
  }

  if (scalar keys %spam <= 0 && scalar keys %ham <= 0) {
    warn "no files found";
    return;
  }

  # accumulated (all-user) copies of the age-filtered logs
  my $tmp_h_all = "$cf{tmp}/hamall.log.$$";
  my $tmp_s_all = "$cf{tmp}/spamall.log.$$";
  unlink $tmp_h_all, $tmp_s_all;

  # union of usernames seen in either ham or spam logs
  my %alluserkeys;
  for my $k (keys %spam, keys %ham) {
    next if exists $alluserkeys{$k}; undef $alluserkeys{$k};
  }
  for my $user (sort keys %alluserkeys) {
    my $files_h = [];
    if ($ham{$user}) { $files_h = [ "$logsdir/$ham{$user}" ]; }
    my $files_s = [];
    if ($spam{$user}) { $files_s = [ "$logsdir/$spam{$user}" ]; }

    # age-filter this user's logs into $tmp_h/$tmp_s
    time_filter_fileset($files_h, $tmp_h, $OLDEST_HAM_WEEKS,  undef);
    time_filter_fileset($files_s, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

    # per-user freqs, each line tagged with the username
    start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
    while(<FREQS>) {
      chomp; push @output, "$_:$user\n";
    }
    close(FREQS);

    runcmd("cat $tmp_h >> $tmp_h_all");
    runcmd("cat $tmp_s >> $tmp_s_all");
  }

  if (-z $tmp_h_all && -z $tmp_s_all) {
    warn "time_filter_fileset() returned empty logs. not creating freqs!";
    unlink $tmp_h_all, $tmp_s_all;
    return;     # we'll try again later
  }

  # overall freqs across every user's logs combined
  start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s_all $tmp_h_all");
  while(<FREQS>) {
    # an all-zero "(all messages)" row means no log lines made it through
    /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1;
    push @output, $_;
  }
  close(FREQS);
  for (sort sort_all @output) { print OUT; }

  unlink $tmp_h_all, $tmp_s_all;
}
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub extract_username_from_log_filename { | 
 |   my $u = shift; | 
 |   # spam-someuser.log, spam-net-bb-jm.20090518-r775863-n.log | 
 |   $u =~ s/\.log$//; $u =~ s/.*\///; $u =~ s/^(h|sp)am-(?:net-)?//; | 
 |   $u =~ s/\.\d{8}-r\d+-[a-z]//;       # daterev | 
 |   return $u; | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub gen_report_freqs_age { | 
 |   my ($tmpfname, $hamref, $spamref, $rev) = @_; | 
 |   my @output; | 
 |  | 
 |   for my $which (("0-1", "1-2", "2-3", "3-6")) { | 
 |     my ($before, $after) = split(/-/, $which); | 
 |     time_filter_fileset($hamref, $tmp_h, $after, $before); | 
 |     time_filter_fileset($spamref, $tmp_s, $after, $before); | 
 |  | 
 |     # print out by age | 
 |     start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h"); | 
 |     while(<FREQS>) { | 
 |       chomp; push @output, "$_:$which\n"; | 
 |     } | 
 |     close(FREQS); | 
 |   } | 
 |   for (sort sort_all @output) { print OUT; } | 
 | } | 
 |  | 
 | # --------------------------------------------------------------------------- | 
 |  | 
 | sub gen_report_freqs_basic { | 
 |   my ($tmpfname, $hamref, $spamref, $rev) = @_; | 
 |  | 
 |   time_filter_fileset($hamref, $tmp_h, $OLDEST_HAM_WEEKS, undef); | 
 |   time_filter_fileset($spamref, $tmp_s, $OLDEST_SPAM_WEEKS, undef); | 
 |  | 
 |   if (-z $tmp_h && -z $tmp_s) { | 
 |     warn "time_filter_fileset() returned empty logs. not creating freqs!"; | 
 |     return;     # we'll try again later | 
 |   } | 
 |  | 
 |   start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h"); | 
 |   while(<FREQS>) { | 
 |     /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1; | 
 |     print(OUT); | 
 |   } | 
 |   close(FREQS); | 
 | } | 
 |  | 
 | sub runcmd { | 
 |   my ($cmd) = @_; | 
 |   print "[$cmd]\n"; | 
 |   system $cmd; | 
 | } |