masses/rule-qa/corpus-hourly - spamassassin - Git at Google

 #!/usr/bin/perl -w

 # settings are located in $HOME/.corpus

 use strict;
 use Getopt::Long;

 our ( $corpusdir, $opt_override, $opt_tag );
 GetOptions(
     "tag=s" => \$opt_tag,
     "dir=s" => \$corpusdir,
     "override=s" => \$opt_override,
 );

 $opt_override ||= '';
 $opt_tag ||= 'n';       # nightly is the default


 use File::Path;
 use File::Copy;
 use Time::ParseDate;
 use Cwd;
 use POSIX qw(nice strftime);

 use constant WEEK => 60*60*24;
 nice(15);

 my $configuration = "$ENV{HOME}/.corpus";
 my %opt;
 my %revision = ();
 my %logs_by_rev = ();
 my %is_net_revision = ();
 my %time = ();
 my %revision_date = ();
 my @files;
 my @tmps = ();
 my $skip = '';
 my $time_start = time;
 $time_start -= ($time_start % 3600);
 my $output_revpath;

 &configure;
 &init;

 if ($corpusdir) {
   print "reading logs from '$corpusdir'\n";
 }
 else {
   $corpusdir = $opt{corpus};
   &update_rsync;
 }

 &locate;
 &current;
 &clean_up;

 sub configure {
   # does rough equivalent of source
   open(C, $configuration) || die "open failed: $configuration: $!\n";
   my $pwd = getcwd;

   # add 'override' options
   my @lines = (<C>, split(/\|/, $opt_override));

   foreach $_ (@lines) {
 	chomp;
 	s/#.*//;
 	if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
           my ($key, $val) = ($1, $2);
           $val =~ s/\$PWD/$pwd/gs;
 	  $opt{$key} = $val;
 	}
   }
   close(C);
 }

 sub clean_up {
   system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
 }

 sub init {
   $SIG{INT} = \&clean_up;
   $SIG{TERM} = \&clean_up;

   $ENV{RSYNC_PASSWORD} = $opt{password};
   $ENV{TIME} = '%e,%U,%S';
   $ENV{TZ} = 'UTC';
 }

 sub update_rsync {
   chdir $corpusdir;

   # allow non-running of rsync under some circumstances
   if ($opt{rsync_command}) {
     system $opt{rsync_command};
   } else {
     system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
   }

   # this block is no longer required -- we do sensible things with modtime
   # comparisons to work it out!
   if (0 && !$opt{always_update_html}) {
     if (-f "rsync.last") {
       open(FIND, "find . -type f -newer rsync.last |");
       my $files = "";
       while(<FIND>) {
         $files .= $_;
       }
       close(FIND);
       if (! $files) {
         print STDERR "no new corpus files\n";
         if (rand(24) > 1) {
           exit 0;
         }
         else {
           print STDERR "updating anyway\n";
         }
       }
     }
   }

   open(RSYNC, "> rsync.last");
   close(RSYNC);
   system "chmod +r *.log";
 }

 sub locate {
   # chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";

   opendir(CORPUS, $corpusdir);
   @files = sort readdir(CORPUS);
   closedir(CORPUS);

   @files = grep {
     /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
   } @files;

   foreach my $file (@files) {
     my $tag = 0;
     my $revtime;
     open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
     while (my $line = <FILE>) {
       last if $line !~ /^#/;
       if ($line =~ m/^# Date:\s*(\S+)/) {
         my $date_line = $1;
         my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;

         my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                   GMT => 1, PREFER_PAST => 1);

         $revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
                   GMT => 1, PREFER_PAST => 1);

         $time{$file} = $timet;
         print "$corpusdir/$file: time=$timet\n";

       }
       if ($line =~ m/^# SVN revision:\s*(\S+)/) {
         my $rev = $1;
         $revision{$file} = $rev;

         $logs_by_rev{$rev} ||= [ ];
         push (@{$logs_by_rev{$rev}}, $file);

         if ($file =~ /-net-/) {
           $is_net_revision{$rev} = 1;
           print "$corpusdir/$file: rev=$rev (net)\n";
         }
         else {
           print "$corpusdir/$file: rev=$rev (non-net)\n";
         }
       }
     }
     close(FILE);
     if ($revtime) {
       my $rev = $revision{$file};
       $revision_date{$rev} = $revtime unless defined $revision_date{$rev};

       if ($revtime < $revision_date{$rev}) {
         $revision_date{$rev} = $revtime;
       }
     }
   }
 }

 sub sort_all {
   my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
   my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
   $a1 =~ s/^[\+\-]//;
   $b1 =~ s/^[\+\-]//;

   my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
   if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
   elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
   elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
   if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
   elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
   elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
   return $n;
 }

 sub time_filter {
   my ($target, $after, $before) = @_;
   if (/time=(\d+)/) {
 	return (($target - $1 >= WEEK * $after) &&
 		($target - $1 < WEEK * $before));
   }
   return 0;
 }

 sub current {
   my $classes = $opt{output_classes};
   $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";

   foreach my $entry (split(' ', $classes)) {
     $entry =~ /^(\S+)\.(\S+)$/;
     my $class = $1;
     my $age = $2;
     if (!$age) { warn "no age in $entry"; next; }

     foreach my $rev (sort keys %logs_by_rev) {
       next if ($rev eq 'unknown');

       if ($class =~ /NET/) {
         next unless $is_net_revision{$rev};
       }

       gen_class ($rev, $class, $age);
     }
   }
 }

 sub gen_class {
   my ($rev, $class, $age) = @_;

   print STDERR "\ngenerating r$rev $class.$age:\n";

   next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);

   my @ham = grep { /^ham/ } @{$logs_by_rev{$rev}};
   my @spam = grep { /^spam/ } @{$logs_by_rev{$rev}};

   print STDERR "input h: " . join(' ', @ham) . "\n";
   print STDERR "input s: " . join(' ', @spam) . "\n";

   chdir $corpusdir;

   # net vs. local
   if ($class eq "NET") {
     @ham = grep { /-net-/ } @ham;
     @spam = grep { /-net-/ } @spam;
   }
   else {
     # if both net and local exist, use newer
     my %spam;
     my %ham;

     for my $file (@spam) {
       $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
     }
     for my $file (@ham) {
       $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
     }
     while (my ($user, $count) = each %ham) {
       if ($count > 1) {
         my $nightly = "ham-$user.log";
         my $weekly = "ham-net-$user.log";
         if ($revision{$nightly} >= $revision{$weekly}) {
           @ham = grep { $_ ne $weekly } @ham;
         }
         else {
           @ham = grep { $_ ne $nightly } @ham;
         }
       }
     }
     while (my ($user, $count) = each %spam) {
       if ($count > 1) {
         my $nightly = "spam-$user.log";
         my $weekly = "spam-net-$user.log";
         if ($revision{$nightly} >= $revision{$weekly}) {
           @spam = grep { $_ ne $weekly } @spam;
         }
         else {
           @spam = grep { $_ ne $nightly } @spam;
         }
       }
     }
   }

   # age
   if ($age =~ /(\d+)day/) {
     my $mtime = $1;
     @ham = grep { -M "$_" < $mtime } @ham;
     @spam = grep { -M "$_" < $mtime } @spam;
   }
   elsif ($class ne 'NET' && $age =~ /^(?:new|all|age)$/)
   {
     # just ignore the tagtime stuff; since we now may be
     # dealing with multiple mass-checks per day, just use svn rev data
     # my $tt = (-M $opt{tagtime});
     # @ham = grep { !defined($tt) || ((-M "$_") < $tt) } @ham;
     # @spam = grep { !defined($tt) || ((-M "$_") < $tt) } @spam;
   }

   print STDERR "selected h: " . join(' ', @ham) . "\n";
   print STDERR "selected s: " . join(' ', @spam) . "\n";

   # we cannot continue if we have no files that match the criteria...
   # demand at least 1 ham and 1 spam file
   if (scalar @spam <= 0 || scalar @ham <= 0) {
     warn "not enough files found matching criteria ($rev $class $age)\n";
     return;
   }

   my $crev_time = $revision_date{$rev};
   my $dir = create_outputdir($rev, $crev_time);

   my $fname = "$dir/$class.$age";

   # Look through corpus for files that have been added since last full build
   # Update all class files on this

   my $buildfile = "$dir/.buildtime";
   my $last_build = 0;
   my $needs_rebuild = 0;

   if (-f $buildfile) {
     open(BFILE, "$buildfile") or warn "cannot read $buildfile";
     while (my $line = <BFILE>) {
       last if $line !~ /^#/;
       if ($line =~ m/^# BuildTime:\s*(\S+)/) {
         $last_build = $1;
       }
     }
     close(BFILE);
   }

   if ( !(-f $fname) || !$last_build || $last_build == $time_start) {
     # No last build or we've already done the loop below
     $needs_rebuild = 1;
   } else {
     foreach my $srcfile (@spam, @ham) {
       my $file_time = (stat($srcfile))[9];
       if ($file_time >= $last_build) {
         $needs_rebuild = 1;
         last;
       }
     }
   }

   if (!$needs_rebuild) {
     print "last buildtime is fresher than sources\n";
     return;
   }

   if ($last_build != $time_start) {
     open(BFILE, "> $buildfile") or warn "cannot write to $buildfile";
     print BFILE "# BuildTime: $time_start\n";
     close(BFILE);
   }

   my $when = scalar localtime time;
   print qq{creating: $fname
   started $when...
   };
   my $bytes = 0;

   if ($class eq 'LOGS') {
     foreach my $f (@ham, @spam) {
       $f =~ s/[^-\._A-Za-z0-9]+/_/gs;    # sanitize!
       my $zf = "$fname-$f.gz";

       system("pigz -c < $f > $zf.$$");
       if ($? >> 8 != 0) {
         warn "pigz -c < $f > $zf.$$ failed";
       }

       rename("$zf.$$", $zf) or
                     warn "cannot rename $zf.$$ to $zf";
       $bytes += (-s $zf);
     }
     my $tmpfname = "$fname.$$";
     open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
     print OUT "# $$ \n";
     close(OUT);
     rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
   }
   else {
     my $tmpfname = "$fname.$$";

     open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
     print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n";
     print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n";
     for (@ham) {
       print OUT "# $_ was at r$revision{$_}\n";
     }
     for (@spam) {
       print OUT "# $_ was at r$revision{$_}\n";
     }

     push (@tmps, $tmpfname);

     my $flags = "";
     $flags = "-t net -s 1" if $class eq "NET";
     $flags = "-M HTML_MESSAGE" if $class eq "HTML";
     $flags = "-o" if $class eq "OVERLAP";
     $flags = "-S" if $class eq "SCOREMAP";
     if ($opt{rules_dir}) {
       $flags .= " -c '$opt{rules_dir}'";
     }

     if ($age eq "all") {
       my %spam;
       my %ham;
       my @output;

       for my $file (@spam) {
         $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
       }
       for my $file (@ham) {
         $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
       }
       unlink "$opt{tmp}/ham.log.$$";
       unlink "$opt{tmp}/spam.log.$$";

       if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
         warn "no files found for $class.$age";
         return;
       }

       chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
       for my $user (sort keys %spam) {
         next unless $ham{$user};
         system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
         system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
         open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
         while(<IN>) {
           chomp;
           push @output, "$_:$user\n";
         }
         close(IN);
       }
       open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
       while(<IN>) {
         push @output, $_;
       }
       close(IN);
       for (sort sort_all @output) { print OUT; }
       unlink "$opt{tmp}/ham.log.$$";
       unlink "$opt{tmp}/spam.log.$$";
     }
     elsif ($age eq "age") {
       my @output;

       for my $which (("0-1", "1-2", "2-3", "3-6")) {
         my ($after, $before) = split(/-/, $which);
         # get and filter logs
         chdir $corpusdir;
         for my $type (("ham", "spam")) {
           open(TMP, "> $opt{tmp}/$type.log.$$");
           my @array = ($type eq "ham") ? @ham : @spam;
           for my $file (@array) {
             open(IN, $file) or warn "cannot read $file";
             while (<IN>) {
               print TMP $_ if time_filter($crev_time, $after, $before);
             }
             close(IN);
           }
           close (TMP);
         }
         # print out by age
         chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
         open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
         while(<IN>) {
           chomp;
           push @output, "$_:$which\n";
         }
         close(IN);
       }
       for (sort sort_all @output) { print OUT; }
       unlink "$opt{tmp}/ham.log.$$";
       unlink "$opt{tmp}/spam.log.$$";
     }
     elsif (@ham && @spam) {
       # get logs
       system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
       system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");

       chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
       open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
       while(<IN>) { print(OUT); }
       close(IN);
       unlink "$opt{tmp}/ham.log.$$";
       unlink "$opt{tmp}/spam.log.$$";
     }

     $bytes = (-s OUT);
     close(OUT);
     rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
   }

   $when = scalar localtime time;
   print qq{created: $bytes bytes, finished at $when
 URL:

   http://buildbot.spamassassin.org/ruleqa?daterev=$output_revpath

 };

 }

 sub create_outputdir {
   my ($rev, $time) = @_;
   my $revpath = strftime("%Y%m%d", gmtime($time)) . "/r$rev-$opt_tag";
   my $dir = $opt{html} .'/'. $revpath;

   # print "output dir: $dir\n";
   if (!-d $dir) {
     my $prevu = umask 0;
     mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
     umask $prevu;
   }

   $output_revpath = $revpath;       # set the global
   $output_revpath =~ s/\//-/;       # looks nicer

   return $dir;
 }
	#!/usr/bin/perl -w

	# settings are located in $HOME/.corpus

	use strict;
	use Getopt::Long;

	our ( $corpusdir, $opt_override, $opt_tag );
	GetOptions(
	"tag=s" => \$opt_tag,
	"dir=s" => \$corpusdir,
	"override=s" => \$opt_override,
	);

	$opt_override \|\|= '';
	$opt_tag \|\|= 'n'; # nightly is the default


	use File::Path;
	use File::Copy;
	use Time::ParseDate;
	use Cwd;
	use POSIX qw(nice strftime);

	use constant WEEK => 606024;
	nice(15);

	my $configuration = "$ENV{HOME}/.corpus";
	my %opt;
	my %revision = ();
	my %logs_by_rev = ();
	my %is_net_revision = ();
	my %time = ();
	my %revision_date = ();
	my @files;
	my @tmps = ();
	my $skip = '';
	my $time_start = time;
	$time_start -= ($time_start % 3600);
	my $output_revpath;

	&configure;
	&init;

	if ($corpusdir) {
	print "reading logs from '$corpusdir'\n";
	}
	else {
	$corpusdir = $opt{corpus};
	&update_rsync;
	}

	&locate;
	&current;
	&clean_up;

	sub configure {
	# does rough equivalent of source
	open(C, $configuration) \|\| die "open failed: $configuration: $!\n";
	my $pwd = getcwd;

	# add 'override' options
	my @lines = (<C>, split(/\\|/, $opt_override));

	foreach $_ (@lines) {
	chomp;
	s/#.*//;
	if (/^\s(.?)\s=\s(.?)\s$/) {
	my ($key, $val) = ($1, $2);
	$val =~ s/\$PWD/$pwd/gs;
	$opt{$key} = $val;
	}
	}
	close(C);
	}

	sub clean_up {
	system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
	}

	sub init {
	$SIG{INT} = \&clean_up;
	$SIG{TERM} = \&clean_up;

	$ENV{RSYNC_PASSWORD} = $opt{password};
	$ENV{TIME} = '%e,%U,%S';
	$ENV{TZ} = 'UTC';
	}

	sub update_rsync {
	chdir $corpusdir;

	# allow non-running of rsync under some circumstances
	if ($opt{rsync_command}) {
	system $opt{rsync_command};
	} else {
	system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
	}

	# this block is no longer required -- we do sensible things with modtime
	# comparisons to work it out!
	if (0 && !$opt{always_update_html}) {
	if (-f "rsync.last") {
	open(FIND, "find . -type f -newer rsync.last \|");
	my $files = "";
	while(<FIND>) {
	$files .= $_;
	}
	close(FIND);
	if (! $files) {
	print STDERR "no new corpus files\n";
	if (rand(24) > 1) {
	exit 0;
	}
	else {
	print STDERR "updating anyway\n";
	}
	}
	}
	}

	open(RSYNC, "> rsync.last");
	close(RSYNC);
	system "chmod +r *.log";
	}

	sub locate {
	# chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";

	opendir(CORPUS, $corpusdir);
	@files = sort readdir(CORPUS);
	closedir(CORPUS);

	@files = grep {
	/^(?:spam\|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
	} @files;

	foreach my $file (@files) {
	my $tag = 0;
	my $revtime;
	open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
	while (my $line = <FILE>) {
	last if $line !~ /^#/;
	if ($line =~ m/^# Date:\s*(\S+)/) {
	my $date_line = $1;
	my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;

	my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
	GMT => 1, PREFER_PAST => 1);

	$revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
	GMT => 1, PREFER_PAST => 1);

	$time{$file} = $timet;
	print "$corpusdir/$file: time=$timet\n";

	}
	if ($line =~ m/^# SVN revision:\s*(\S+)/) {
	my $rev = $1;
	$revision{$file} = $rev;

	$logs_by_rev{$rev} \|\|= [ ];
	push (@{$logs_by_rev{$rev}}, $file);

	if ($file =~ /-net-/) {
	$is_net_revision{$rev} = 1;
	print "$corpusdir/$file: rev=$rev (net)\n";
	}
	else {
	print "$corpusdir/$file: rev=$rev (non-net)\n";
	}
	}
	}
	close(FILE);
	if ($revtime) {
	my $rev = $revision{$file};
	$revision_date{$rev} = $revtime unless defined $revision_date{$rev};

	if ($revtime < $revision_date{$rev}) {
	$revision_date{$rev} = $revtime;
	}
	}
	}
	}

	sub sort_all {
	my ($a1, $a2) = ($a =~ m/(\(.*?\)\|\S+)(?::(\S+))?$/);
	my ($b1, $b2) = ($b =~ m/(\(.*?\)\|\S+)(?::(\S+))?$/);
	$a1 =~ s/^[\+\-]//;
	$b1 =~ s/^[\+\-]//;

	my $n = ($a1 cmp $b1) \|\| (($a2 \|\| '') cmp ($b2 \|\| ''));
	if ($a1 =~ /^OVERALL/) { $n -= 1000; }
	elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; }
	elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; }
	if ($b1 =~ /^OVERALL/) { $n += 1000; }
	elsif ($b1 =~ /^\(all messages\)/) { $n += 100; }
	elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; }
	return $n;
	}

	sub time_filter {
	my ($target, $after, $before) = @_;
	if (/time=(\d+)/) {
	return (($target - $1 >= WEEK * $after) &&
	($target - $1 < WEEK * $before));
	}
	return 0;
	}

	sub current {
	my $classes = $opt{output_classes};
	$classes \|\|= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";

	foreach my $entry (split(' ', $classes)) {
	$entry =~ /^(\S+)\.(\S+)$/;
	my $class = $1;
	my $age = $2;
	if (!$age) { warn "no age in $entry"; next; }

	foreach my $rev (sort keys %logs_by_rev) {
	next if ($rev eq 'unknown');

	if ($class =~ /NET/) {
	next unless $is_net_revision{$rev};
	}

	gen_class ($rev, $class, $age);
	}
	}
	}

	sub gen_class {
	my ($rev, $class, $age) = @_;

	print STDERR "\ngenerating r$rev $class.$age:\n";

	next if ($class eq "NET" && $age !~ /^(?:new\|all\|age\|7day)$/);

	my @ham = grep { /^ham/ } @{$logs_by_rev{$rev}};
	my @spam = grep { /^spam/ } @{$logs_by_rev{$rev}};

	print STDERR "input h: " . join(' ', @ham) . "\n";
	print STDERR "input s: " . join(' ', @spam) . "\n";

	chdir $corpusdir;

	# net vs. local
	if ($class eq "NET") {
	@ham = grep { /-net-/ } @ham;
	@spam = grep { /-net-/ } @spam;
	}
	else {
	# if both net and local exist, use newer
	my %spam;
	my %ham;

	for my $file (@spam) {
	$spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
	}
	for my $file (@ham) {
	$ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
	}
	while (my ($user, $count) = each %ham) {
	if ($count > 1) {
	my $nightly = "ham-$user.log";
	my $weekly = "ham-net-$user.log";
	if ($revision{$nightly} >= $revision{$weekly}) {
	@ham = grep { $_ ne $weekly } @ham;
	}
	else {
	@ham = grep { $_ ne $nightly } @ham;
	}
	}
	}
	while (my ($user, $count) = each %spam) {
	if ($count > 1) {
	my $nightly = "spam-$user.log";
	my $weekly = "spam-net-$user.log";
	if ($revision{$nightly} >= $revision{$weekly}) {
	@spam = grep { $_ ne $weekly } @spam;
	}
	else {
	@spam = grep { $_ ne $nightly } @spam;
	}
	}
	}
	}

	# age
	if ($age =~ /(\d+)day/) {
	my $mtime = $1;
	@ham = grep { -M "$_" < $mtime } @ham;
	@spam = grep { -M "$_" < $mtime } @spam;
	}
	elsif ($class ne 'NET' && $age =~ /^(?:new\|all\|age)$/)
	{
	# just ignore the tagtime stuff; since we now may be
	# dealing with multiple mass-checks per day, just use svn rev data
	# my $tt = (-M $opt{tagtime});
	# @ham = grep { !defined($tt) \|\| ((-M "$_") < $tt) } @ham;
	# @spam = grep { !defined($tt) \|\| ((-M "$_") < $tt) } @spam;
	}

	print STDERR "selected h: " . join(' ', @ham) . "\n";
	print STDERR "selected s: " . join(' ', @spam) . "\n";

	# we cannot continue if we have no files that match the criteria...
	# demand at least 1 ham and 1 spam file
	if (scalar @spam <= 0 \|\| scalar @ham <= 0) {
	warn "not enough files found matching criteria ($rev $class $age)\n";
	return;
	}

	my $crev_time = $revision_date{$rev};
	my $dir = create_outputdir($rev, $crev_time);

	my $fname = "$dir/$class.$age";

	# Look through corpus for files that have been added since last full build
	# Update all class files on this

	my $buildfile = "$dir/.buildtime";
	my $last_build = 0;
	my $needs_rebuild = 0;

	if (-f $buildfile) {
	open(BFILE, "$buildfile") or warn "cannot read $buildfile";
	while (my $line = <BFILE>) {
	last if $line !~ /^#/;
	if ($line =~ m/^# BuildTime:\s*(\S+)/) {
	$last_build = $1;
	}
	}
	close(BFILE);
	}

	if ( !(-f $fname) \|\| !$last_build \|\| $last_build == $time_start) {
	# No last build or we've already done the loop below
	$needs_rebuild = 1;
	} else {
	foreach my $srcfile (@spam, @ham) {
	my $file_time = (stat($srcfile))[9];
	if ($file_time >= $last_build) {
	$needs_rebuild = 1;
	last;
	}
	}
	}

	if (!$needs_rebuild) {
	print "last buildtime is fresher than sources\n";
	return;
	}

	if ($last_build != $time_start) {
	open(BFILE, "> $buildfile") or warn "cannot write to $buildfile";
	print BFILE "# BuildTime: $time_start\n";
	close(BFILE);
	}

	my $when = scalar localtime time;
	print qq{creating: $fname
	started $when...
	};
	my $bytes = 0;

	if ($class eq 'LOGS') {
	foreach my $f (@ham, @spam) {
	$f =~ s/[^-\._A-Za-z0-9]+/_/gs; # sanitize!
	my $zf = "$fname-$f.gz";

	system("pigz -c < $f > $zf.$$");
	if ($? >> 8 != 0) {
	warn "pigz -c < $f > $zf.$$ failed";
	}

	rename("$zf.$$", $zf) or
	warn "cannot rename $zf.$$ to $zf";
	$bytes += (-s $zf);
	}
	my $tmpfname = "$fname.$$";
	open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
	print OUT "# $$ \n";
	close(OUT);
	rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
	}
	else {
	my $tmpfname = "$fname.$$";

	open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
	print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n";
	print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n";
	for (@ham) {
	print OUT "# $_ was at r$revision{$_}\n";
	}
	for (@spam) {
	print OUT "# $_ was at r$revision{$_}\n";
	}

	push (@tmps, $tmpfname);

	my $flags = "";
	$flags = "-t net -s 1" if $class eq "NET";
	$flags = "-M HTML_MESSAGE" if $class eq "HTML";
	$flags = "-o" if $class eq "OVERLAP";
	$flags = "-S" if $class eq "SCOREMAP";
	if ($opt{rules_dir}) {
	$flags .= " -c '$opt{rules_dir}'";
	}

	if ($age eq "all") {
	my %spam;
	my %ham;
	my @output;

	for my $file (@spam) {
	$spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
	}
	for my $file (@ham) {
	$ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
	}
	unlink "$opt{tmp}/ham.log.$$";
	unlink "$opt{tmp}/spam.log.$$";

	if (scalar keys %spam <= 0 \|\| scalar keys %ham <= 0) {
	warn "no files found for $class.$age";
	return;
	}

	chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
	for my $user (sort keys %spam) {
	next unless $ham{$user};
	system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
	system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
	open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} \|");
	while(<IN>) {
	chomp;
	push @output, "$_:$user\n";
	}
	close(IN);
	}
	open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ \|");
	while(<IN>) {
	push @output, $_;
	}
	close(IN);
	for (sort sort_all @output) { print OUT; }
	unlink "$opt{tmp}/ham.log.$$";
	unlink "$opt{tmp}/spam.log.$$";
	}
	elsif ($age eq "age") {
	my @output;

	for my $which (("0-1", "1-2", "2-3", "3-6")) {
	my ($after, $before) = split(/-/, $which);
	# get and filter logs
	chdir $corpusdir;
	for my $type (("ham", "spam")) {
	open(TMP, "> $opt{tmp}/$type.log.$$");
	my @array = ($type eq "ham") ? @ham : @spam;
	for my $file (@array) {
	open(IN, $file) or warn "cannot read $file";
	while (<IN>) {
	print TMP $_ if time_filter($crev_time, $after, $before);
	}
	close(IN);
	}
	close (TMP);
	}
	# print out by age
	chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
	open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ \|");
	while(<IN>) {
	chomp;
	push @output, "$_:$which\n";
	}
	close(IN);
	}
	for (sort sort_all @output) { print OUT; }
	unlink "$opt{tmp}/ham.log.$$";
	unlink "$opt{tmp}/spam.log.$$";
	}
	elsif (@ham && @spam) {
	# get logs
	system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
	system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");

	chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
	open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ \|");
	while(<IN>) { print(OUT); }
	close(IN);
	unlink "$opt{tmp}/ham.log.$$";
	unlink "$opt{tmp}/spam.log.$$";
	}

	$bytes = (-s OUT);
	close(OUT);
	rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
	}

	$when = scalar localtime time;
	print qq{created: $bytes bytes, finished at $when
	URL:

	http://buildbot.spamassassin.org/ruleqa?daterev=$output_revpath

	};

	}

	sub create_outputdir {
	my ($rev, $time) = @_;
	my $revpath = strftime("%Y%m%d", gmtime($time)) . "/r$rev-$opt_tag";
	my $dir = $opt{html} .'/'. $revpath;

	# print "output dir: $dir\n";
	if (!-d $dir) {
	my $prevu = umask 0;
	mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
	umask $prevu;
	}

	$output_revpath = $revpath; # set the global
	$output_revpath =~ s/\//-/; # looks nicer

	return $dir;
	}