blob: a8ed6f096c9398276c9e256f45642664795ca28e [file] [log] [blame]
#!/usr/bin/perl -w
# settings are located in $HOME/.corpus
use strict;
use Getopt::Long;
use File::Path;
use File::Copy;
use Time::ParseDate;
use POSIX qw(nice strftime);
use Cwd;
our ( $opt_dir, $opt_override, $opt_tag );
GetOptions(
"tag=s" => \$opt_tag,
"dir=s" => \$opt_dir,
"override=s" => \$opt_override,
);
$opt_override ||= '';
$opt_tag ||= 'n'; # nightly is the default
nice(15);
# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
# the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);
# ---------------------------------------------------------------------------
my $configuration = "$ENV{HOME}/.corpus";
my %cf;
configure();
init();
if (!$opt_dir) {
$opt_dir = $cf{corpus};
update_rsync();
}
chdir $opt_dir;
print "reading logs from '$opt_dir'\n";
my $linkdir = "$cf{html}/logs";
(-d $linkdir) or mkdir $linkdir;
locate_and_link();
exit;
sub configure {
# does rough equivalent of source
open(C, $configuration) || die "open failed: $configuration: $!\n";
my $pwd = Cwd::getcwd;
# add 'override' options
my @lines = (<C>, split(/\|/, $opt_override));
foreach $_ (@lines) {
chomp;
s/#.*//;
if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
my ($key, $val) = ($1, $2);
$val =~ s/\$PWD/$pwd/gs;
$cf{$key} = $val;
}
}
close(C);
}
sub init {
$ENV{RSYNC_PASSWORD} = $cf{password};
$ENV{TIME} = '%e,%U,%S';
$ENV{TZ} = 'UTC';
}
sub update_rsync {
chdir $opt_dir;
if (!$cf{rsync_command}) { die "no 'rsync_command' set"; }
system $cf{rsync_command};
system "chmod +r *.log > /dev/null 2>&1";
}
sub locate_and_link {
opendir(CORPUS, $opt_dir);
my @files = sort readdir(CORPUS);
closedir(CORPUS);
print "Found ", $#files + 1, " files in $opt_dir\n";
@files = grep {
/^(?:spam|ham)-(?:net-)?[-\w.]+\.log$/ && -f "$opt_dir/$_" && -M _ < 30
} @files;
print "Kept ", $#files + 1, " files\n";
foreach my $file (@files) {
my $ftime;
my $frevision;
print "Processing $opt_dir/$file\n";
open(FILE, "$opt_dir/$file") or warn "cannot read $opt_dir/$file";
while (my $line = <FILE>) {
last if $line !~ /^#/;
if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
my ($datepre, $hh, $datepost) = ($1,$2,$3);
$ftime = Time::ParseDate::parsedate($datepre.$hh.$datepost,
GMT => 1, PREFER_PAST => 1);
}
elsif ($line =~ m/^# Date:\s*(\S+)/) {
# a better way to do the above. TODO: parse it instead
}
elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
$frevision = $1;
}
}
close(FILE);
if (!defined $ftime) {
warn "$opt_dir/$file: no time found, ignored\n"; next;
}
if (!defined $frevision) {
warn "$opt_dir/$file: no revision found, ignored\n"; next;
}
if ($frevision eq 'unknown') {
warn "$opt_dir/$file: not tagged with a revision, ignored\n"; next;
}
my $daterev = mk_daterev($ftime, $frevision, $opt_tag);
link_file($file, $daterev);
}
}
sub mk_daterev {
my ($timet, $rev, $tag) = @_;
return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
}
sub link_file {
my ($file, $daterev) = @_;
my $f = "$opt_dir/$file";
# /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/
my $linkfile = $file;
my $dr = $daterev; $dr =~ s/\//-/gs; $linkfile =~ s/\.log$/.$dr.log/i;
my $t = "$linkdir/$linkfile";
print "ln $f $t\n";
(-f $t) and unlink $t;
# cannot hardlink unless we have ownership or RW perms on the file
symlink $f, $t or die "cannot ln";
system "/usr/bin/touch -h -r '$f' '$t'"; # preserve modtimes
}