blob: cc661a936eab7616a32321aee9209955b3862409 [file] [log] [blame]
#!/usr/bin/perl
#
# Corpus housekeeping for the directory passed with --dir.  In order, the
# script renames new logs so their name embeds the SVN revision from the log
# header, indexes the renamed logs, deletes the logs of revisions that are
# past their retention window, and mails a report about logs whose "# Date:"
# header is earlier than 09:00 GMT on that day.

#my $email_to = 'pds@apache.org';
# Recipient of the early-runner report.
my $email_to = 'ruleqa@spamassassin.apache.org';
use strict;
use Getopt::Long;
our ( $corpusdir );
GetOptions(
    "dir=s" => \$corpusdir,
) or die "usage: $0 --dir=<corpus directory>\n";

# Without a usable directory every later opendir() fails quietly and the run
# silently does nothing, so refuse to start instead.
die "usage: $0 --dir=<corpus directory>\n"
    unless defined $corpusdir && length $corpusdir;
die "$0: '$corpusdir' is not a directory\n"
    unless -d $corpusdir;

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd;
use POSIX qw(nice strftime);

# Lower our scheduling priority before touching the corpus.
nice(15);

# State shared by the subs below.
my %revision = ();          # log file name => SVN revision from its header
my %logs_by_rev = ();       # revision => [ log file names ]
my %is_net_revision = ();   # revision => 1 if one of its logs is a -net- log
my %dateline = ();          # log file name => raw "# Date:" token
my %time = ();              # log file name => epoch of 09:00 GMT on its date
my @files;                  # revision-tagged logs found by read_files
my $time_start = time;      # reference "now" for the retention checks
my %revision_date = ();     # revision => earliest %time value of its logs
my %before_nine = ();       # original file name => epoch of an early start

# Retention windows in seconds: 9 days for revisions that have a -net- log,
# 3 days for all other revisions.
my $delete_weekly = 60*60*24*9;
my $delete_nightly = 60*60*24*3;

# Called with explicit parentheses: the "&name;" form would hand the
# caller's current @_ to the sub.
rename_corpus();
read_files();
cleanup_old();
email_beforenine();
# Rename freshly submitted logs so the file name carries their SVN revision.
#
# Candidates are plain files in $corpusdir named (spam|ham)-[net-]<name>.log
# that do not already end in .r<digits>.log and were modified less than 10
# days ago.  Only the leading run of '#' header lines of each log is read:
#   - "# Date: YYYYMMDDThhmmssZ": when that time is earlier than 09:00 GMT on
#     the same day, the file is recorded in %before_nine (original file name
#     => epoch time) for the early-runner mail.
#   - "# SVN revision: <rev>": the file is renamed from <base>.log to
#     <base>.r<rev>.log.
# Takes no arguments.  Problems with individual files are reported with
# warn() and the file is skipped.
sub rename_corpus {
    my $dh;
    unless (opendir($dh, $corpusdir)) {
        warn "cannot open directory $corpusdir: $!";
        return;
    }
    my @rfiles = grep {
        /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && !(/\.r[0-9]+\.log$/) && -f "$corpusdir/$_" && -M _ < 10
    } sort readdir($dh);
    closedir($dh);

    foreach my $file (@rfiles) {
        my $path = "$corpusdir/$file";
        my $fh;
        unless (open($fh, '<', $path)) {
            warn "cannot read $path: $!";
            next;    # nothing to inspect; do not read from a dead handle
        }

        my $rev;
        while (my $line = <$fh>) {
            # The header is the leading block of comment lines.
            last if $line !~ /^#/;

            if ($line =~ m/^# Date:\s*(\S+)/) {
                my $date_line = $1;
                # Only compare times when the stamp has the expected shape;
                # otherwise there is nothing meaningful to hand to parsedate.
                if (my ($yyyy, $mm, $dd, $h, $m, $s) =
                        $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/) {
                    my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                        GMT => 1, PREFER_PAST => 1);
                    my $timetgt = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
                        GMT => 1, PREFER_PAST => 1);
                    if (defined $timet && defined $timetgt && $timet < $timetgt) {
                        $before_nine{$file} = $timet;
                    }
                }
            }
            if ($line =~ m/^# SVN revision:\s*(\S+)/) {
                $rev = $1;
            }
        }
        close($fh);

        if ($rev) {
            (my $newfile = $file) =~ s/\.log$/.r$rev.log/;
            rename($path, "$corpusdir/$newfile")
                or warn "cannot rename $path to $corpusdir/$newfile: $!";
        }
    }
}
# Index the revision-tagged logs in $corpusdir.
#
# Considers plain files named (spam|ham)-[net-]<name>.r<digits>.log that were
# modified less than 10 days ago and stores their sorted names in @files.
# From the leading '#' header lines of each log it fills:
#   %dateline        file => raw token following "# Date:"
#   %time            file => epoch of 09:00 GMT on the header's date
#   %revision        file => token following "# SVN revision:"
#   %logs_by_rev     rev  => [ files carrying that revision ]
#   %is_net_revision rev  => 1 when one of its files has "-net-" in its name
#   %revision_date   rev  => smallest %time value among its files
# Takes no arguments.  Unreadable files are reported with warn() and skipped.
sub read_files {
    my $dh;
    unless (opendir($dh, $corpusdir)) {
        warn "cannot open directory $corpusdir: $!";
        @files = ();
        return;
    }
    @files = grep {
        /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
    } sort readdir($dh);
    closedir($dh);

    foreach my $file (@files) {
        my $path = "$corpusdir/$file";
        my $fh;
        unless (open($fh, '<', $path)) {
            warn "cannot read $path: $!";
            next;    # nothing to index; do not read from a dead handle
        }

        while (my $line = <$fh>) {
            # The header is the leading block of comment lines.
            last if $line !~ /^#/;

            if ($line =~ m/^# Date:\s*(\S+)/) {
                $dateline{$file} = $1;
                # A log is dated to 09:00 GMT of its header date, not to the
                # exact time of day.  Stamps that do not have the expected
                # shape leave %time unset, so the file is not aged below.
                if (my ($yyyy, $mm, $dd) =
                        $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T\d\d\d\d\d\dZ/) {
                    $time{$file} = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
                        GMT => 1, PREFER_PAST => 1);
                }
            }
            if ($line =~ m/^# SVN revision:\s*(\S+)/) {
                my $rev = $1;
                $revision{$file} = $rev;
                push @{ $logs_by_rev{$rev} }, $file;
                $is_net_revision{$rev} = 1 if $file =~ /-net-/;
            }
        }
        close($fh);

        # Track the earliest date seen for each revision.
        my $stamp = $time{$file};
        my $rev   = $revision{$file};
        if ($stamp && $rev) {
            if (!defined $revision_date{$rev} || $stamp < $revision_date{$rev}) {
                $revision_date{$rev} = $stamp;
            }
        }
    }
}
# Delete the logs of every revision that is past its retention window.
#
# A revision expires when its date in %revision_date is older than
# $time_start minus $delete_weekly (revisions flagged in %is_net_revision) or
# minus $delete_nightly (all other revisions).  Every file listed for an
# expired revision in %logs_by_rev is removed from $corpusdir.
# Takes no arguments.  Files that cannot be removed are reported with warn().
sub cleanup_old {
    my @expired;
    foreach my $rev (keys %revision_date) {
        # Pick the retention window based on whether this is a net revision.
        my $keep_for = $is_net_revision{$rev} ? $delete_weekly : $delete_nightly;
        next unless $revision_date{$rev} < $time_start - $keep_for;
        push @expired, @{ $logs_by_rev{$rev} };
    }

    foreach my $path (map { "$corpusdir/$_" } @expired) {
        unlink($path) or warn "cannot delete $path: $!";
    }
}
# Mail a report listing the logs recorded in %before_nine.
#
# Does nothing when %before_nine is empty.  Otherwise one line per file (in
# sorted file-name order) with its recorded time is piped to
# "/usr/sbin/sendmail -t", addressed to $email_to.
# Takes no arguments.  Failure to start or finish sendmail is reported with
# warn().
sub email_beforenine {
    return unless %before_nine;

    my $from    = 'automc@sa-vm.apache.org';
    my $subject = '[corpus-cleanup] Early runners';

    my $message = "The following files were submitted by early runners:\n\n";
    foreach my $file (sort keys %before_nine) {
        # The fields come from gmtime(), so the zone is written literally;
        # strftime's %z would print the host's local offset instead.
        my $started = strftime("%F %R:%S +0000", gmtime($before_nine{$file}));
        $message .= "$file - Started at $started\n";
    }
    $message .= "\nPlease run automasscheck after 0900 UTC";

    # List-form pipe open: sendmail is started directly, without a shell.
    my $mail;
    unless (open($mail, '|-', '/usr/sbin/sendmail', '-t')) {
        warn "cannot run /usr/sbin/sendmail: $!";
        return;
    }

    # Email Header
    print {$mail} "To: $email_to\n";
    print {$mail} "From: $from\n";
    print {$mail} "Subject: $subject\n";
    print {$mail} "MIME-Version: 1.0\n";
    print {$mail} "Content-Type: text/plain; charset=UTF-8\n";
    print {$mail} "Content-Transfer-Encoding: 8bit\n";
    print {$mail} "\n";
    # Email Body
    print {$mail} $message;

    # close() on a pipe also reports a non-zero exit status of the command.
    close($mail)
        or warn "sendmail failed: " . ($! ? $! : "exit status $?");
}