masses/rule-qa/reports-from-logs - spamassassin - Git at Google

 #!/usr/bin/perl -w

 # settings are located in $HOME/.corpus

 use strict;
 use Getopt::Long;

 our ( $opt_override, $opt_tag );
 GetOptions(
     "tag=s" => \$opt_tag,
     "override=s" => \$opt_override,
 );

 $opt_override ||= '';
 $opt_tag ||= 'n';       # nightly is the default

 use File::Path;
 use File::Copy;
 use Time::ParseDate;
 use Cwd qw(abs_path);
 use POSIX qw(nice strftime);

 use constant WEEK => 7*60*60*24;
 nice(15);

 # daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
 # the time of day when the mass-check tagging occurs; see
 # http://wiki.apache.org/spamassassin/DateRev for more details.
 use constant DATEREV_ADJ => - (8 * 60 * 60);

 # what's the max age of mail we will accept data from? (in weeks)
 # TODO: maybe this should be in ~/.corpus
 my $OLDEST_HAM_WEEKS    = 72 * 4;       # 72 months = 6 years
 my $OLDEST_SPAM_WEEKS    = 2 * 4;       # 2 months

 # ---------------------------------------------------------------------------

 sub runcmd;
 my $configuration = "$ENV{HOME}/.corpus";
 my %cf;
 my %revision = ();
 my %filesize = ();
 my %dateline = ();
 my %mtime = ();
 my %logs_by_daterev = ();
 my %is_net_daterev = ();
 my %time = ();
 my @tmps = ();
 my $time_start = time;
 my $output_revpath;
 my $perl_path = $^X;

 configure();
 init();

 my $logsdir = "$cf{html}/logs";
 print "reading logs from '$logsdir'\n";

 locate_input();
 generate_logs();
 clean_up();
 exit;

 # ---------------------------------------------------------------------------

 sub configure {
   # does rough equivalent of source
   open(C, $configuration) || die "open failed: $configuration: $!\n";
   my $pwd = Cwd::getcwd;

   # add 'override' options
   my @lines = (<C>, split(/\|/, $opt_override));

   foreach (@lines) {
     chomp;
     s/#.*//;
     if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
       my ($key, $val) = ($1, $2);
       $val =~ s/\$PWD/$pwd/gs;
       $cf{$key} = $val;
     }
   }
   close(C);

   $cf{output_classes} ||=
             "DETAILS.new DETAILS.all DETAILS.age NET.new NET.all NET.age";
 }

 # ---------------------------------------------------------------------------

 sub clean_up {
   chdir "/";
   runcmd "rm -rf $cf{tmp}/*.$$ ".join(' ', @tmps);
 }

 # ---------------------------------------------------------------------------

 sub init {
   $SIG{INT} = \&clean_up;
   $SIG{TERM} = \&clean_up;

   $ENV{RSYNC_PASSWORD} = $cf{password};
   $ENV{TIME} = '%e,%U,%S';
   $ENV{TZ} = 'UTC';
 }

 # ---------------------------------------------------------------------------

 sub locate_input {
   opendir(CORPUS, $logsdir);
   my @files = sort readdir(CORPUS);
   closedir(CORPUS);

   warn "found ", $#files + 1, " files in $logsdir";

   @files = grep {
     /^(?:spam|ham)-(?:net-)?\S+\.log$/ && -f "$logsdir/$_" && -M _ < 10
   } @files;

   warn "kept ", $#files + 1, " files";

   foreach my $file (@files) {
     my $tag = 0;
     my $headers = '';

     warn "processing $logsdir/$file";

     open(FILE, "$logsdir/$file") or warn "cannot read $logsdir/$file";
     while (my $line = <FILE>) {
       last if $line !~ /^#/;
       $headers .= $line;
       if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
         my ($datepre, $hh, $datepost) = ($1,$2,$3);

         my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                     GMT => 1, PREFER_PAST => 1);

         $time{$file} = $timet;
       }
       elsif ($line =~ m/^# Date:\s*(\S+)/) {
         # a better way to do the above.  TODO: parse it instead
         $dateline{$file} = $1;
         if (!defined $time{$file}) {
           # if time line unparseable (localized?) use this instead
           my ($yyyy, $mm, $dd, $h, $m, $s) = $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;

           my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                     GMT => 1, PREFER_PAST => 1);

           $time{$file} = $timet;
         }
       }
       elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
         $revision{$file} = $1;
       }
     }
     close(FILE);

     my @s = stat("$logsdir/$file");
     $filesize{$file} = $s[7];
     $mtime{$file} = $s[9];

     if (!defined $time{$file}) {
       warn "$logsdir/$file: no time found, ignored\n"; next;
     }
     if (!defined $revision{$file}) {
       warn "$logsdir/$file: no revision found, ignored\n"; next;
     }
     if ($revision{$file} eq 'unknown') {
       warn "$logsdir/$file: not tagged with a revision, ignored\n"; next;
     }

     my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);

     $logs_by_daterev{$daterev} ||= [ ];
     push (@{$logs_by_daterev{$daterev}}, $file);

     if ($file =~ /-net-/) {
       $is_net_daterev{$daterev} = 1;
       print "$logsdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
     }
     else {
       print "$logsdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
     }

     get_rulemetadata_for_revision($daterev, $revision{$file});
   }
 }

 # ---------------------------------------------------------------------------

 sub sort_all {
   my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
   my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
   $a1 =~ s/^[\+\-]//;
   $b1 =~ s/^[\+\-]//;

   my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
   if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
   elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
   elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
   if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
   elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
   elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
   return $n;
 }

 # ---------------------------------------------------------------------------

 sub time_filter_fileset {
   my ($fileary, $outname, $after, $before) = @_;

   my $timet_before = (defined $before ?
             ($time_start - ($before * WEEK)) : $time_start+1);
   my $timet_after  = (defined $after ?
             ($time_start - ($after * WEEK))  : 0);

   open(TMP, ">$outname") or warn "cannot write $outname";
   for my $file (@{$fileary}) {
     open(IN, $file) or warn "cannot read $file";
     while (<IN>) {
       next unless /\btime=(\d+)/;
       next if ($1 < $timet_after || $1 > $timet_before);
       print TMP;
     }
     close IN;
   }
   close TMP or warn "failed to close $outname";
 }

 # ---------------------------------------------------------------------------

 sub generate_logs {
   foreach my $entry (split(' ', $cf{output_classes})) {
     $entry =~ /^(\S+)\.(\S+)$/;
     my $class = $1;
     my $rtype = $2;
     if (!$rtype) { warn "no rtype in $entry"; next; }
     if ($class eq 'HTML') { warn "class HTML in $entry obsolete, ignored"; next; }

     foreach my $daterev (reverse sort keys %logs_by_daterev) {
       my $rev;
       if ($daterev !~ /\/r(\d+)/) {
         warn "bad daterev: $daterev"; next;
       }
       $rev = $1;

       if ($class eq "NET") {
         next unless $is_net_daterev{$daterev};
       }

       gen_class ($daterev, $rev, $class, $rtype);
     }
   }
 }

 # ---------------------------------------------------------------------------

 my ($tmp_h, $tmp_s, $no_messages_in_freqs, $hf_flags);

 sub gen_class {
   my ($daterev, $rev, $class, $rtype) = @_;
   return if ($class eq "NET" && $rtype !~ /^(?:new|all|age|7day)$/);

   chdir $logsdir;
   print STDERR "\ngenerating: $cf{html}/$daterev/$class.$rtype\n";

   my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
   print STDERR "input h: " . join(' ', @ham) . "\n";

   my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};
   print STDERR "input s: " . join(' ', @spam) . "\n";

   # net vs. local
   if ($class eq "NET") {
     @ham = grep { /-net-/ } @ham;
     @spam = grep { /-net-/ } @spam;
   }
   # age
   if ($rtype =~ /(\d+)day/) {
     my $mtime = $1;
     @ham = grep { -M $_ < $mtime } @ham;
     @spam = grep { -M $_ < $mtime } @spam;
   }

   print STDERR "selected h: " . join(' ', @ham) . "\n";
   print STDERR "selected s: " . join(' ', @spam) . "\n";

   # we cannot continue if we have no files that match the criteria...
   # demand at least 1 ham and 1 spam file
   if (scalar @spam <= 0 || scalar @ham <= 0) {
     warn "not enough files found matching criteria ($daterev $class $rtype)\n";
     return;
   }

   my $dir = create_outputdir($daterev);
   my $fname = "$dir/$class.$rtype";

   # now, if the target file already exists, check to see if it's newer
   # than all the sources, make-style; if not, don't re-create it
   if (-f $fname) {
     my $targetfreshness = (-M $fname);
     my $needsrebuild = 0;

     foreach my $srcfile (@spam, @ham) {
       my $srcfreshness = (-M $srcfile);
       if ($targetfreshness > $srcfreshness) {     # src is fresher
         print "need rebuild, $fname is older than $srcfile: $targetfreshness > $srcfreshness\n";
         $needsrebuild = 1;
         last;
       }
     }

     if (!$needsrebuild) {
       print "existing: $fname, fresher than sources\n";
       return;
     }
   }

   my $when = scalar localtime time;
   print qq{creating: $fname ($class)
   started $when...
 };
   my $bytes = 0;

   my $tmpfname = "$fname.$$";
   if ($class eq 'LOGS') {
     $bytes = gen_report_logs($fname, \@ham, \@spam);
   }
   elsif ($class eq 'CORPUS') {
     push (@tmps, abs_path($tmpfname));

     my $cmd = "$perl_path $cf{tree}/masses/logs-to-corpus-report ".
           join(" ", @ham)." ".join(" ", @spam)." > $tmpfname";
     runcmd $cmd;
     ($? >> 8 == 0) or warn "failed to run logs-to-corpus-report";

     rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
     $bytes = (-s $fname);
   }
   else {
     push (@tmps, abs_path($tmpfname));

     open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
     print OUT "# ham results used for $daterev $class $rtype: " . join(" ", @ham) . "\n";
     print OUT "# spam results used for $daterev $class $rtype: " . join(" ", @spam) . "\n";
     print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";

     $hf_flags = "";
     $hf_flags = "-t net -s 1" if $class eq "NET";
     $hf_flags = "-o" if $class eq "OVERLAP";
     $hf_flags = "-S" if $class eq "SCOREMAP";

     if ($cf{rules_dir}) {
       $hf_flags .= " -c '$cf{rules_dir}'";
     }

     # are we analyzing --net mass-check logs?  if so, use scoreset 1
     if (join(" ", @ham) =~ /-net-/) {
       $hf_flags .= " -s 1" if $class eq "NET";
     }

     # catch an odd error condition, where hit-frequencies creates output
     # with no log lines included at all
     $no_messages_in_freqs = 0;

     $tmp_h = "$cf{tmp}/ham.log.$$";
     $tmp_s = "$cf{tmp}/spam.log.$$";

     if ($rtype eq "all") {
       gen_report_freqs_all($tmpfname, \@ham, \@spam, $rev);
     }
     elsif ($rtype eq "age") {
       gen_report_freqs_age($tmpfname, \@ham, \@spam, $rev);
     }
     elsif (@ham && @spam) {
       gen_report_freqs_basic($tmpfname, \@ham, \@spam, $rev);
     }

     $bytes = (-s OUT);
     close(OUT);

     if ($no_messages_in_freqs) {
       warn "ERROR: no data in freqs!  aborting, leaving tmp file as $tmpfname";
       return;
     }

     rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";

     # compress for certain classes
     if ($class eq "OVERLAP") {
       $fname =~ s/'//gs;
       runcmd ("rm '$fname.gz'; pigz '$fname'");
       # takes care of keeping the original around so we don't have to
       if ($? >> 8 != 0) { warn "pigz '$fname' failed"; }
     }
   }

   $when = scalar localtime time;
   print qq{created: $bytes bytes, finished at $when
 URL:

   $cf{ruleqa_url}$output_revpath

 };

 }

 # ---------------------------------------------------------------------------

 sub mk_daterev {
   my ($timet, $rev, $tag) = @_;
   return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
 }

 # ---------------------------------------------------------------------------

 sub create_outputdir {
   my ($revpath) = @_;
   my $dir = $cf{html} .'/'. $revpath;

   # print "output dir: $dir\n";
   if (!-d $dir) {
     my $prevu = umask 0;
     mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
     umask $prevu;
   }

   $output_revpath = $revpath;       # set the global
   $output_revpath =~ s/\//-/;       # looks nicer

   return $dir;
 }

 # ---------------------------------------------------------------------------

 sub log_metadata_xml {
   my ($daterev, @files) = @_;
   my $str = '';

   # this is extracted into the info.xml file later by the gen_info_xml script
   foreach my $f (@files) {
     $str .= qq{
       <mclogmd file='$f'>
         <daterev>$daterev</daterev>
         <rev>$revision{$f}</rev>
         <fsize>$filesize{$f}</fsize>
         <mcstartdate>$dateline{$f}</mcstartdate>
         <mtime>$mtime{$f}</mtime>
       </mclogmd>
     };
   }

   $str =~ s/\s+/ /gs;  # on a single line please
   return '<mclogmds>'.$str.'</mclogmds>';
 }

 # ---------------------------------------------------------------------------

 sub create_rulemetadata_dir {
   my $rev = shift;
   my $dir = "$cf{html}/rulemetadata/$rev";
   if (!-d $dir) {
     my $prevu = umask 0;
     mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
     umask $prevu;
   }
   return $dir;
 }

 # ---------------------------------------------------------------------------

 sub get_rulemetadata_for_revision {
   my ($daterev, $rev) = @_;

   my $dir = create_rulemetadata_dir($rev);

   # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
   # "tree" path, so we have to ensure we're in the 'masses' dir
   # for this to work!
   chdir "$cf{tree}/masses" or die "cannot chdir $cf{tree}/masses";

   my $cmd = "$cf{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
                     "--rev=$rev --outputdir='$dir'";

   runcmd($cmd);
   if ($? >> 8 != 0) {
     warn "'$cmd' failed";
   }

   chdir $logsdir;
 }

 # ---------------------------------------------------------------------------

 sub start_hit_frequencies_at_rev {
   my ($rev, $args) = @_;

   $rev ||= 'HEAD';
   (-d "$cf{tmp}/hfdir") or runcmd("mkdir -p $cf{tmp}/hfdir");
   my $hfdir = "$cf{tmp}/hfdir/r$rev";
   my $expected_svn_file = "$hfdir/Makefile.PL";

   print "setting up hit-frequencies for r$rev in $hfdir\n";

   my $needs_checkout = 0;
   if (-d $hfdir && chdir $hfdir) {
     eval {
       # "svn up" has been observed to wedge on the ruleqa zone VM, put a timeout so we can recover
       local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
       alarm 60*60;	# an hour should be generous enough
       runcmd("svn up -r$rev");
       alarm 0;
     };

     if ($@ || $?>>8 != 0 || !-f $expected_svn_file) {
       print "simple 'svn update' failed. performing full checkout instead...\n";
       $needs_checkout = 1;
     }
   } else {
     $needs_checkout = 1;
   }

   if ($needs_checkout) {
     my $svnurl = get_svn_url();
     runcmd("rm -rf $hfdir");
     eval {
       local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
       alarm 60*60;	# an hour should be generous enough
       runcmd("svn co $svnurl\@$rev $hfdir");
       alarm 0;
     };
     if ($@ || $?>>8 != 0 || !-f $expected_svn_file) { die "svn co failed"; }
   }
   chdir "$hfdir" or die "cannot chdir $hfdir";

   # ensure these are rebuilt
   runcmd "rm -f rules/70_sandbox.cf rules/72_active.cf";

   # do this twice in case Makefile.PL is rebuilt
   runcmd "( make build_rules || $perl_path Makefile.PL;make build_rules )</dev/null";

   chdir "$hfdir/masses" or die "cannot chdir $hfdir/masses";
   open (FREQS, "$perl_path hit-frequencies -TxpagP $args |")
             or die "cannot run ./hit-frequencies $args |";

   chdir $logsdir;
 }

 # ---------------------------------------------------------------------------

 sub get_svn_url {
   open (SVNINFO, "svn info $cf{tree}|") or die "cannot run svn info";
   my $svnurl;
   while (<SVNINFO>) {
     /URL: (.*)$/ and $svnurl = $1;
   }
   close SVNINFO or die "cannot close svn info";
   return $svnurl;
 }

 # ---------------------------------------------------------------------------

 sub gen_report_logs {
   my ($fname, $hamref, $spamref) = @_;

   my $bytes = 0;
   foreach my $f (@$hamref, @$spamref) {
     $f =~ s/[^-\._A-Za-z0-9]+/_/gs;    # sanitize!
     my $zf = "$fname-$f.gz";

     runcmd("pigz -c < $f > $zf.$$");
     if ($? >> 8 != 0) {
       warn "pigz -c < $f > $zf.$$ failed";
     }

     rename("$zf.$$", $zf) or
                   warn "cannot rename $zf.$$ to $zf";
     $bytes += (-s $zf);
   }

   # this is just so we won't recompress these logs if re-run
   open TOUCH, ">$fname" or warn "cannot write to $fname";
   close TOUCH;

   return $bytes;
 }

 # ---------------------------------------------------------------------------

 sub gen_report_freqs_all {
   my ($tmpfname, $hamref, $spamref, $rev) = @_;
   my %spam;
   my %ham;
   my @output;

   for my $file (@$spamref) {
     my $u = extract_username_from_log_filename($file);
     $spam{$u} = $file; print "username in spam log: $u\n";
   }
   for my $file (@$hamref) {
     my $u = extract_username_from_log_filename($file);
     $ham{$u} = $file; print "username in ham log: $u\n";
   }

   if (scalar keys %spam <= 0 && scalar keys %ham <= 0) {
     warn "no files found";
     return;
   }

   my $tmp_h_all = "$cf{tmp}/hamall.log.$$";
   my $tmp_s_all = "$cf{tmp}/spamall.log.$$";
   unlink $tmp_h_all, $tmp_s_all;

   my %alluserkeys;
   for my $k (keys %spam, keys %ham) {
     next if exists $alluserkeys{$k}; undef $alluserkeys{$k};
   }
   for my $user (sort keys %alluserkeys) {
     my $files_h = [];
     if ($ham{$user}) { $files_h = [ "$logsdir/$ham{$user}" ]; }
     my $files_s = [];
     if ($spam{$user}) { $files_s = [ "$logsdir/$spam{$user}" ]; }

     time_filter_fileset($files_h, $tmp_h, $OLDEST_HAM_WEEKS,  undef);
     time_filter_fileset($files_s, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

     start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
     while(<FREQS>) {
       chomp; push @output, "$_:$user\n";
     }
     close(FREQS);

     runcmd("cat $tmp_h >> $tmp_h_all");
     runcmd("cat $tmp_s >> $tmp_s_all");
   }

   if (-z $tmp_h_all && -z $tmp_s_all) {
     warn "time_filter_fileset() returned empty logs. not creating freqs!";
     return;     # we'll try again later
   }

   start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s_all $tmp_h_all");
   while(<FREQS>) {
     /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1;
     push @output, $_;
   }
   close(FREQS);
   for (sort sort_all @output) { print OUT; }
 }

 # ---------------------------------------------------------------------------

 sub extract_username_from_log_filename {
   my $u = shift;
   # spam-someuser.log, spam-net-bb-jm.20090518-r775863-n.log
   $u =~ s/\.log$//; $u =~ s/.*\///; $u =~ s/^(h|sp)am-(?:net-)?//;
   $u =~ s/\.\d{8}-r\d+-[a-z]//;       # daterev
   return $u;
 }

 # ---------------------------------------------------------------------------

 sub gen_report_freqs_age {
   my ($tmpfname, $hamref, $spamref, $rev) = @_;
   my @output;

   for my $which (("0-1", "1-2", "2-3", "3-6")) {
     my ($before, $after) = split(/-/, $which);
     time_filter_fileset($hamref, $tmp_h, $after, $before);
     time_filter_fileset($spamref, $tmp_s, $after, $before);

     # print out by age
     start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
     while(<FREQS>) {
       chomp; push @output, "$_:$which\n";
     }
     close(FREQS);
   }
   for (sort sort_all @output) { print OUT; }
 }

 # ---------------------------------------------------------------------------

 sub gen_report_freqs_basic {
   my ($tmpfname, $hamref, $spamref, $rev) = @_;

   time_filter_fileset($hamref, $tmp_h, $OLDEST_HAM_WEEKS, undef);
   time_filter_fileset($spamref, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

   if (-z $tmp_h && -z $tmp_s) {
     warn "time_filter_fileset() returned empty logs. not creating freqs!";
     return;     # we'll try again later
   }

   start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
   while(<FREQS>) {
     /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1;
     print(OUT);
   }
   close(FREQS);
 }

 sub runcmd {
   my ($cmd) = @_;
   print "[$cmd]\n";
   system $cmd;
 }
	#!/usr/bin/perl -w

	# settings are located in $HOME/.corpus

	use strict;
	use Getopt::Long;

	our ( $opt_override, $opt_tag );
	GetOptions(
	"tag=s" => \$opt_tag,
	"override=s" => \$opt_override,
	);

	$opt_override \|\|= '';
	$opt_tag \|\|= 'n'; # nightly is the default

	use File::Path;
	use File::Copy;
	use Time::ParseDate;
	use Cwd qw(abs_path);
	use POSIX qw(nice strftime);

	use constant WEEK => 76060*24;
	nice(15);

	# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
	# the time of day when the mass-check tagging occurs; see
	# http://wiki.apache.org/spamassassin/DateRev for more details.
	use constant DATEREV_ADJ => - (8 * 60 * 60);

	# what's the max age of mail we will accept data from? (in weeks)
	# TODO: maybe this should be in ~/.corpus
	my $OLDEST_HAM_WEEKS = 72 * 4; # 72 months = 6 years
	my $OLDEST_SPAM_WEEKS = 2 * 4; # 2 months

	# ---------------------------------------------------------------------------

	sub runcmd;
	my $configuration = "$ENV{HOME}/.corpus";
	my %cf;
	my %revision = ();
	my %filesize = ();
	my %dateline = ();
	my %mtime = ();
	my %logs_by_daterev = ();
	my %is_net_daterev = ();
	my %time = ();
	my @tmps = ();
	my $time_start = time;
	my $output_revpath;
	my $perl_path = $^X;

	configure();
	init();

	my $logsdir = "$cf{html}/logs";
	print "reading logs from '$logsdir'\n";

	locate_input();
	generate_logs();
	clean_up();
	exit;

	# ---------------------------------------------------------------------------

	sub configure {
	# does rough equivalent of source
	open(C, $configuration) \|\| die "open failed: $configuration: $!\n";
	my $pwd = Cwd::getcwd;

	# add 'override' options
	my @lines = (<C>, split(/\\|/, $opt_override));

	foreach (@lines) {
	chomp;
	s/#.*//;
	if (/^\s(.?)\s=\s(.?)\s$/) {
	my ($key, $val) = ($1, $2);
	$val =~ s/\$PWD/$pwd/gs;
	$cf{$key} = $val;
	}
	}
	close(C);

	$cf{output_classes} \|\|=
	"DETAILS.new DETAILS.all DETAILS.age NET.new NET.all NET.age";
	}

	# ---------------------------------------------------------------------------

	sub clean_up {
	chdir "/";
	runcmd "rm -rf $cf{tmp}/*.$$ ".join(' ', @tmps);
	}

	# ---------------------------------------------------------------------------

	sub init {
	$SIG{INT} = \&clean_up;
	$SIG{TERM} = \&clean_up;

	$ENV{RSYNC_PASSWORD} = $cf{password};
	$ENV{TIME} = '%e,%U,%S';
	$ENV{TZ} = 'UTC';
	}

	# ---------------------------------------------------------------------------

	sub locate_input {
	opendir(CORPUS, $logsdir);
	my @files = sort readdir(CORPUS);
	closedir(CORPUS);

	warn "found ", $#files + 1, " files in $logsdir";

	@files = grep {
	/^(?:spam\|ham)-(?:net-)?\S+\.log$/ && -f "$logsdir/$_" && -M _ < 10
	} @files;

	warn "kept ", $#files + 1, " files";

	foreach my $file (@files) {
	my $tag = 0;
	my $headers = '';

	warn "processing $logsdir/$file";

	open(FILE, "$logsdir/$file") or warn "cannot read $logsdir/$file";
	while (my $line = <FILE>) {
	last if $line !~ /^#/;
	$headers .= $line;
	if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
	my ($datepre, $hh, $datepost) = ($1,$2,$3);

	my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
	GMT => 1, PREFER_PAST => 1);

	$time{$file} = $timet;
	}
	elsif ($line =~ m/^# Date:\s*(\S+)/) {
	# a better way to do the above. TODO: parse it instead
	$dateline{$file} = $1;
	if (!defined $time{$file}) {
	# if time line unparseable (localized?) use this instead
	my ($yyyy, $mm, $dd, $h, $m, $s) = $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;

	my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
	GMT => 1, PREFER_PAST => 1);

	$time{$file} = $timet;
	}
	}
	elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
	$revision{$file} = $1;
	}
	}
	close(FILE);

	my @s = stat("$logsdir/$file");
	$filesize{$file} = $s[7];
	$mtime{$file} = $s[9];

	if (!defined $time{$file}) {
	warn "$logsdir/$file: no time found, ignored\n"; next;
	}
	if (!defined $revision{$file}) {
	warn "$logsdir/$file: no revision found, ignored\n"; next;
	}
	if ($revision{$file} eq 'unknown') {
	warn "$logsdir/$file: not tagged with a revision, ignored\n"; next;
	}

	my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);

	$logs_by_daterev{$daterev} \|\|= [ ];
	push (@{$logs_by_daterev{$daterev}}, $file);

	if ($file =~ /-net-/) {
	$is_net_daterev{$daterev} = 1;
	print "$logsdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
	}
	else {
	print "$logsdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
	}

	get_rulemetadata_for_revision($daterev, $revision{$file});
	}
	}

	# ---------------------------------------------------------------------------

	sub sort_all {
	my ($a1, $a2) = ($a =~ m/(\(.*?\)\|\S+)(?::(\S+))?$/);
	my ($b1, $b2) = ($b =~ m/(\(.*?\)\|\S+)(?::(\S+))?$/);
	$a1 =~ s/^[\+\-]//;
	$b1 =~ s/^[\+\-]//;

	my $n = ($a1 cmp $b1) \|\| (($a2 \|\| '') cmp ($b2 \|\| ''));
	if ($a1 =~ /^OVERALL/) { $n -= 1000; }
	elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; }
	elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; }
	if ($b1 =~ /^OVERALL/) { $n += 1000; }
	elsif ($b1 =~ /^\(all messages\)/) { $n += 100; }
	elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; }
	return $n;
	}

	# ---------------------------------------------------------------------------

	sub time_filter_fileset {
	my ($fileary, $outname, $after, $before) = @_;

	my $timet_before = (defined $before ?
	($time_start - ($before * WEEK)) : $time_start+1);
	my $timet_after = (defined $after ?
	($time_start - ($after * WEEK)) : 0);

	open(TMP, ">$outname") or warn "cannot write $outname";
	for my $file (@{$fileary}) {
	open(IN, $file) or warn "cannot read $file";
	while (<IN>) {
	next unless /\btime=(\d+)/;
	next if ($1 < $timet_after \|\| $1 > $timet_before);
	print TMP;
	}
	close IN;
	}
	close TMP or warn "failed to close $outname";
	}

	# ---------------------------------------------------------------------------

	sub generate_logs {
	foreach my $entry (split(' ', $cf{output_classes})) {
	$entry =~ /^(\S+)\.(\S+)$/;
	my $class = $1;
	my $rtype = $2;
	if (!$rtype) { warn "no rtype in $entry"; next; }
	if ($class eq 'HTML') { warn "class HTML in $entry obsolete, ignored"; next; }

	foreach my $daterev (reverse sort keys %logs_by_daterev) {
	my $rev;
	if ($daterev !~ /\/r(\d+)/) {
	warn "bad daterev: $daterev"; next;
	}
	$rev = $1;

	if ($class eq "NET") {
	next unless $is_net_daterev{$daterev};
	}

	gen_class ($daterev, $rev, $class, $rtype);
	}
	}
	}

	# ---------------------------------------------------------------------------

	my ($tmp_h, $tmp_s, $no_messages_in_freqs, $hf_flags);

	sub gen_class {
	my ($daterev, $rev, $class, $rtype) = @_;
	return if ($class eq "NET" && $rtype !~ /^(?:new\|all\|age\|7day)$/);

	chdir $logsdir;
	print STDERR "\ngenerating: $cf{html}/$daterev/$class.$rtype\n";

	my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
	print STDERR "input h: " . join(' ', @ham) . "\n";

	my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};
	print STDERR "input s: " . join(' ', @spam) . "\n";

	# net vs. local
	if ($class eq "NET") {
	@ham = grep { /-net-/ } @ham;
	@spam = grep { /-net-/ } @spam;
	}
	# age
	if ($rtype =~ /(\d+)day/) {
	my $mtime = $1;
	@ham = grep { -M $_ < $mtime } @ham;
	@spam = grep { -M $_ < $mtime } @spam;
	}

	print STDERR "selected h: " . join(' ', @ham) . "\n";
	print STDERR "selected s: " . join(' ', @spam) . "\n";

	# we cannot continue if we have no files that match the criteria...
	# demand at least 1 ham and 1 spam file
	if (scalar @spam <= 0 \|\| scalar @ham <= 0) {
	warn "not enough files found matching criteria ($daterev $class $rtype)\n";
	return;
	}

	my $dir = create_outputdir($daterev);
	my $fname = "$dir/$class.$rtype";

	# now, if the target file already exists, check to see if it's newer
	# than all the sources, make-style; if not, don't re-create it
	if (-f $fname) {
	my $targetfreshness = (-M $fname);
	my $needsrebuild = 0;

	foreach my $srcfile (@spam, @ham) {
	my $srcfreshness = (-M $srcfile);
	if ($targetfreshness > $srcfreshness) { # src is fresher
	print "need rebuild, $fname is older than $srcfile: $targetfreshness > $srcfreshness\n";
	$needsrebuild = 1;
	last;
	}
	}

	if (!$needsrebuild) {
	print "existing: $fname, fresher than sources\n";
	return;
	}
	}

	my $when = scalar localtime time;
	print qq{creating: $fname ($class)
	started $when...
	};
	my $bytes = 0;

	my $tmpfname = "$fname.$$";
	if ($class eq 'LOGS') {
	$bytes = gen_report_logs($fname, \@ham, \@spam);
	}
	elsif ($class eq 'CORPUS') {
	push (@tmps, abs_path($tmpfname));

	my $cmd = "$perl_path $cf{tree}/masses/logs-to-corpus-report ".
	join(" ", @ham)." ".join(" ", @spam)." > $tmpfname";
	runcmd $cmd;
	($? >> 8 == 0) or warn "failed to run logs-to-corpus-report";

	rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
	$bytes = (-s $fname);
	}
	else {
	push (@tmps, abs_path($tmpfname));

	open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
	print OUT "# ham results used for $daterev $class $rtype: " . join(" ", @ham) . "\n";
	print OUT "# spam results used for $daterev $class $rtype: " . join(" ", @spam) . "\n";
	print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";

	$hf_flags = "";
	$hf_flags = "-t net -s 1" if $class eq "NET";
	$hf_flags = "-o" if $class eq "OVERLAP";
	$hf_flags = "-S" if $class eq "SCOREMAP";

	if ($cf{rules_dir}) {
	$hf_flags .= " -c '$cf{rules_dir}'";
	}

	# are we analyzing --net mass-check logs? if so, use scoreset 1
	if (join(" ", @ham) =~ /-net-/) {
	$hf_flags .= " -s 1" if $class eq "NET";
	}

	# catch an odd error condition, where hit-frequencies creates output
	# with no log lines included at all
	$no_messages_in_freqs = 0;

	$tmp_h = "$cf{tmp}/ham.log.$$";
	$tmp_s = "$cf{tmp}/spam.log.$$";

	if ($rtype eq "all") {
	gen_report_freqs_all($tmpfname, \@ham, \@spam, $rev);
	}
	elsif ($rtype eq "age") {
	gen_report_freqs_age($tmpfname, \@ham, \@spam, $rev);
	}
	elsif (@ham && @spam) {
	gen_report_freqs_basic($tmpfname, \@ham, \@spam, $rev);
	}

	$bytes = (-s OUT);
	close(OUT);

	if ($no_messages_in_freqs) {
	warn "ERROR: no data in freqs! aborting, leaving tmp file as $tmpfname";
	return;
	}

	rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";

	# compress for certain classes
	if ($class eq "OVERLAP") {
	$fname =~ s/'//gs;
	runcmd ("rm '$fname.gz'; pigz '$fname'");
	# takes care of keeping the original around so we don't have to
	if ($? >> 8 != 0) { warn "pigz '$fname' failed"; }
	}
	}

	$when = scalar localtime time;
	print qq{created: $bytes bytes, finished at $when
	URL:

	$cf{ruleqa_url}$output_revpath

	};

	}

	# ---------------------------------------------------------------------------

	sub mk_daterev {
	my ($timet, $rev, $tag) = @_;
	return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
	}

	# ---------------------------------------------------------------------------

	sub create_outputdir {
	my ($revpath) = @_;
	my $dir = $cf{html} .'/'. $revpath;

	# print "output dir: $dir\n";
	if (!-d $dir) {
	my $prevu = umask 0;
	mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
	umask $prevu;
	}

	$output_revpath = $revpath; # set the global
	$output_revpath =~ s/\//-/; # looks nicer

	return $dir;
	}

	# ---------------------------------------------------------------------------

	sub log_metadata_xml {
	my ($daterev, @files) = @_;
	my $str = '';

	# this is extracted into the info.xml file later by the gen_info_xml script
	foreach my $f (@files) {
	$str .= qq{
	<mclogmd file='$f'>
	<daterev>$daterev</daterev>
	<rev>$revision{$f}</rev>
	<fsize>$filesize{$f}</fsize>
	<mcstartdate>$dateline{$f}</mcstartdate>
	<mtime>$mtime{$f}</mtime>
	</mclogmd>
	};
	}

	$str =~ s/\s+/ /gs; # on a single line please
	return '<mclogmds>'.$str.'</mclogmds>';
	}

	# ---------------------------------------------------------------------------

	sub create_rulemetadata_dir {
	my $rev = shift;
	my $dir = "$cf{html}/rulemetadata/$rev";
	if (!-d $dir) {
	my $prevu = umask 0;
	mkpath([$dir], 0, oct($cf{html_mode})) or warn "failed to mkdir $dir";
	umask $prevu;
	}
	return $dir;
	}

	# ---------------------------------------------------------------------------

	sub get_rulemetadata_for_revision {
	my ($daterev, $rev) = @_;

	my $dir = create_rulemetadata_dir($rev);

	# argh. this is silly; ~bbmass/.corpus specifies "$PWD" in its
	# "tree" path, so we have to ensure we're in the 'masses' dir
	# for this to work!
	chdir "$cf{tree}/masses" or die "cannot chdir $cf{tree}/masses";

	my $cmd = "$cf{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
	"--rev=$rev --outputdir='$dir'";

	runcmd($cmd);
	if ($? >> 8 != 0) {
	warn "'$cmd' failed";
	}

	chdir $logsdir;
	}

	# ---------------------------------------------------------------------------

	sub start_hit_frequencies_at_rev {
	my ($rev, $args) = @_;

	$rev \|\|= 'HEAD';
	(-d "$cf{tmp}/hfdir") or runcmd("mkdir -p $cf{tmp}/hfdir");
	my $hfdir = "$cf{tmp}/hfdir/r$rev";
	my $expected_svn_file = "$hfdir/Makefile.PL";

	print "setting up hit-frequencies for r$rev in $hfdir\n";

	my $needs_checkout = 0;
	if (-d $hfdir && chdir $hfdir) {
	eval {
	# "svn up" has been observed to wedge on the ruleqa zone VM, put a timeout so we can recover
	local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
	alarm 60*60; # an hour should be generous enough
	runcmd("svn up -r$rev");
	alarm 0;
	};

	if ($@ \|\| $?>>8 != 0 \|\| !-f $expected_svn_file) {
	print "simple 'svn update' failed. performing full checkout instead...\n";
	$needs_checkout = 1;
	}
	} else {
	$needs_checkout = 1;
	}

	if ($needs_checkout) {
	my $svnurl = get_svn_url();
	runcmd("rm -rf $hfdir");
	eval {
	local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
	alarm 60*60; # an hour should be generous enough
	runcmd("svn co $svnurl\@$rev $hfdir");
	alarm 0;
	};
	if ($@ \|\| $?>>8 != 0 \|\| !-f $expected_svn_file) { die "svn co failed"; }
	}
	chdir "$hfdir" or die "cannot chdir $hfdir";

	# ensure these are rebuilt
	runcmd "rm -f rules/70_sandbox.cf rules/72_active.cf";

	# do this twice in case Makefile.PL is rebuilt
	runcmd "( make build_rules \|\| $perl_path Makefile.PL;make build_rules )</dev/null";

	chdir "$hfdir/masses" or die "cannot chdir $hfdir/masses";
	open (FREQS, "$perl_path hit-frequencies -TxpagP $args \|")
	or die "cannot run ./hit-frequencies $args \|";

	chdir $logsdir;
	}

	# ---------------------------------------------------------------------------

	sub get_svn_url {
	open (SVNINFO, "svn info $cf{tree}\|") or die "cannot run svn info";
	my $svnurl;
	while (<SVNINFO>) {
	/URL: (.*)$/ and $svnurl = $1;
	}
	close SVNINFO or die "cannot close svn info";
	return $svnurl;
	}

	# ---------------------------------------------------------------------------

	sub gen_report_logs {
	my ($fname, $hamref, $spamref) = @_;

	my $bytes = 0;
	foreach my $f (@$hamref, @$spamref) {
	$f =~ s/[^-\._A-Za-z0-9]+/_/gs; # sanitize!
	my $zf = "$fname-$f.gz";

	runcmd("pigz -c < $f > $zf.$$");
	if ($? >> 8 != 0) {
	warn "pigz -c < $f > $zf.$$ failed";
	}

	rename("$zf.$$", $zf) or
	warn "cannot rename $zf.$$ to $zf";
	$bytes += (-s $zf);
	}

	# this is just so we won't recompress these logs if re-run
	open TOUCH, ">$fname" or warn "cannot write to $fname";
	close TOUCH;

	return $bytes;
	}

	# ---------------------------------------------------------------------------

	sub gen_report_freqs_all {
	my ($tmpfname, $hamref, $spamref, $rev) = @_;
	my %spam;
	my %ham;
	my @output;

	for my $file (@$spamref) {
	my $u = extract_username_from_log_filename($file);
	$spam{$u} = $file; print "username in spam log: $u\n";
	}
	for my $file (@$hamref) {
	my $u = extract_username_from_log_filename($file);
	$ham{$u} = $file; print "username in ham log: $u\n";
	}

	if (scalar keys %spam <= 0 && scalar keys %ham <= 0) {
	warn "no files found";
	return;
	}

	my $tmp_h_all = "$cf{tmp}/hamall.log.$$";
	my $tmp_s_all = "$cf{tmp}/spamall.log.$$";
	unlink $tmp_h_all, $tmp_s_all;

	my %alluserkeys;
	for my $k (keys %spam, keys %ham) {
	next if exists $alluserkeys{$k}; undef $alluserkeys{$k};
	}
	for my $user (sort keys %alluserkeys) {
	my $files_h = [];
	if ($ham{$user}) { $files_h = [ "$logsdir/$ham{$user}" ]; }
	my $files_s = [];
	if ($spam{$user}) { $files_s = [ "$logsdir/$spam{$user}" ]; }

	time_filter_fileset($files_h, $tmp_h, $OLDEST_HAM_WEEKS, undef);
	time_filter_fileset($files_s, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

	start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
	while(<FREQS>) {
	chomp; push @output, "$_:$user\n";
	}
	close(FREQS);

	runcmd("cat $tmp_h >> $tmp_h_all");
	runcmd("cat $tmp_s >> $tmp_s_all");
	}

	if (-z $tmp_h_all && -z $tmp_s_all) {
	warn "time_filter_fileset() returned empty logs. not creating freqs!";
	return; # we'll try again later
	}

	start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s_all $tmp_h_all");
	while(<FREQS>) {
	/\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1;
	push @output, $_;
	}
	close(FREQS);
	for (sort sort_all @output) { print OUT; }
	}

	# ---------------------------------------------------------------------------

	sub extract_username_from_log_filename {
	my $u = shift;
	# spam-someuser.log, spam-net-bb-jm.20090518-r775863-n.log
	$u =~ s/\.log$//; $u =~ s/.*\///; $u =~ s/^(h\|sp)am-(?:net-)?//;
	$u =~ s/\.\d{8}-r\d+-[a-z]//; # daterev
	return $u;
	}

	# ---------------------------------------------------------------------------

	sub gen_report_freqs_age {
	my ($tmpfname, $hamref, $spamref, $rev) = @_;
	my @output;

	for my $which (("0-1", "1-2", "2-3", "3-6")) {
	my ($before, $after) = split(/-/, $which);
	time_filter_fileset($hamref, $tmp_h, $after, $before);
	time_filter_fileset($spamref, $tmp_s, $after, $before);

	# print out by age
	start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
	while(<FREQS>) {
	chomp; push @output, "$_:$which\n";
	}
	close(FREQS);
	}
	for (sort sort_all @output) { print OUT; }
	}

	# ---------------------------------------------------------------------------

	sub gen_report_freqs_basic {
	my ($tmpfname, $hamref, $spamref, $rev) = @_;

	time_filter_fileset($hamref, $tmp_h, $OLDEST_HAM_WEEKS, undef);
	time_filter_fileset($spamref, $tmp_s, $OLDEST_SPAM_WEEKS, undef);

	if (-z $tmp_h && -z $tmp_s) {
	warn "time_filter_fileset() returned empty logs. not creating freqs!";
	return; # we'll try again later
	}

	start_hit_frequencies_at_rev($rev, "$hf_flags $tmp_s $tmp_h");
	while(<FREQS>) {
	/\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_messages_in_freqs = 1;
	print(OUT);
	}
	close(FREQS);
	}

	sub runcmd {
	my ($cmd) = @_;
	print "[$cmd]\n";
	system $cmd;
	}