| #!/usr/bin/perl |
| |
| #my $email_to = 'pds@apache.org'; |
| my $email_to = 'ruleqa@spamassassin.apache.org'; |
| |
| use strict; |
| use Getopt::Long; |
# --dir=PATH selects the corpus directory that the subs below scan,
# rename files in, and delete expired logs from.
our $corpusdir;
GetOptions('dir=s' => \$corpusdir);
| |
| use File::Path; |
| use File::Copy; |
| use Time::ParseDate; |
| use Cwd; |
| use POSIX qw(nice strftime); |
| |
# Raise our nice value by 15 so the cleanup yields to other work on the host.
nice(15);

# Bookkeeping shared by the subs below:
#   %revision        log file name => SVN revision from its header
#   %logs_by_rev     SVN revision  => array ref of log file names
#   %is_net_revision SVN revision  => 1 if any of its logs has "-net-" in its name
#   %dateline        log file name => raw value of its "# Date:" header
#   %time            log file name => epoch of 09:00 GMT on the header date
#   %revision_date   SVN revision  => earliest %time value among its logs
#   %before_nine     log file name => epoch start time, for logs started
#                                     before 09:00 GMT
my (
    %revision, %logs_by_rev, %is_net_revision, %dateline,
    %time, %revision_date, %before_nine,
);
my @files;                 # revision-stamped logs found by read_files
my $time_start = time;     # reference "now" for the expiry check

# Maximum age in seconds before a revision's logs are deleted:
# nine days for net revisions, three days for all others.
my $delete_weekly  = 9 * 24 * 60 * 60;
my $delete_nightly = 3 * 24 * 60 * 60;

# The passes depend on each other's results, so the order matters.
rename_corpus();
read_files();
cleanup_old();
email_beforenine();
| |
# Stamp freshly submitted logs with their SVN revision.
#
# Looks in $corpusdir for plain files named "spam-<name>.log" or
# "ham-<name>.log" (optionally with a "net-" prefix on <name>) that do not
# yet carry a ".r<rev>" suffix and were modified less than 10 days before
# this script started.  Only the leading "#" header lines of each file are
# read:
#
#   "# Date: YYYYMMDDTHHMMSSZ"  if that time is earlier than 09:00 GMT on
#                               the same day, the file is recorded in
#                               %before_nine (file name => epoch time).
#   "# SVN revision: <rev>"     if present (and true), the file is renamed
#                               from "<base>.log" to "<base>.r<rev>.log".
#
# Takes no arguments.  Problems (unreadable directory or file, failed
# rename) are reported with warn() and the affected item is skipped.
sub rename_corpus {
    my $dh;
    unless (opendir($dh, $corpusdir)) {
        warn "cannot open directory $corpusdir: $!";
        return;
    }
    my @rfiles = grep {
        /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/
            && !/\.r[0-9]+\.log$/
            && -f "$corpusdir/$_"
            && (-M _) < 10
    } sort readdir($dh);
    closedir($dh);

    foreach my $file (@rfiles) {
        my $path = "$corpusdir/$file";

        my $fh;
        unless (open($fh, '<', $path)) {
            # Nothing can be learned from an unreadable file; leave it alone.
            warn "cannot read $path: $!";
            next;
        }

        my $rev;
        while (my $line = <$fh>) {
            # The header is the leading run of "#" lines; stop at the data.
            last if $line !~ /^#/;

            if ($line =~ m/^# Date:\s*(\S+)/) {
                my $date_line = $1;

                # Only compare times when the stamp has the expected shape,
                # so undefined fields are never handed to parsedate().
                if (my ($yyyy, $mm, $dd, $h, $m, $s) =
                        $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/) {

                    my $timet = Time::ParseDate::parsedate(
                        "${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                        GMT => 1, PREFER_PAST => 1);

                    my $timetgt = Time::ParseDate::parsedate(
                        "${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
                        GMT => 1, PREFER_PAST => 1);

                    # parsedate() yields undef for values it cannot parse;
                    # never record an early runner with an undefined time.
                    if (defined $timet && defined $timetgt && $timet < $timetgt) {
                        $before_nine{$file} = $timet;
                    }
                }
            }

            if ($line =~ m/^# SVN revision:\s*(\S+)/) {
                $rev = $1;
            }
        }
        close($fh);

        if ($rev) {
            (my $newfile = $file) =~ s/\.log$/.r$rev.log/;
            rename($path, "$corpusdir/$newfile")
                or warn "cannot rename $path to $corpusdir/$newfile: $!";
        }
    }

    return;
}
| |
# Index the revision-stamped logs in $corpusdir.
#
# Considers plain files named "spam-<name>.r<digits>.log" or
# "ham-<name>.r<digits>.log" (optionally with a "net-" prefix on <name>)
# that were modified less than 10 days before this script started, and
# stores the list in @files.  From the leading "#" header lines of each
# file it fills:
#
#   %dateline        file => raw value of the "# Date:" header
#   %time            file => epoch of 09:00 GMT on the header date
#                    (the time of day in the header is deliberately
#                    replaced by 09:00)
#   %revision        file => value of the "# SVN revision:" header
#   %logs_by_rev     revision => array ref of files carrying it
#   %is_net_revision revision => 1 if a file carrying it has "-net-" in
#                    its name
#   %revision_date   revision => smallest %time value among its files
#
# Takes no arguments.  Unreadable directories or files are reported with
# warn() and skipped.
sub read_files {
    my $dh;
    unless (opendir($dh, $corpusdir)) {
        warn "cannot open directory $corpusdir: $!";
        @files = ();
        return;
    }
    @files = grep {
        /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/
            && -f "$corpusdir/$_"
            && (-M _) < 10
    } sort readdir($dh);
    closedir($dh);

    foreach my $file (@files) {
        my $path = "$corpusdir/$file";

        my $fh;
        unless (open($fh, '<', $path)) {
            warn "cannot read $path: $!";
            next;
        }

        while (my $line = <$fh>) {
            # The header is the leading run of "#" lines; stop at the data.
            last if $line !~ /^#/;

            if ($line =~ m/^# Date:\s*(\S+)/) {
                $dateline{$file} = $1;

                # Only the calendar day is used; skip stamps that do not
                # have the expected shape instead of parsing undef fields.
                if (my ($yyyy, $mm, $dd) =
                        $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T\d\d\d\d\d\dZ/) {
                    $time{$file} = Time::ParseDate::parsedate(
                        "${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
                        GMT => 1, PREFER_PAST => 1);
                }
            }

            if ($line =~ m/^# SVN revision:\s*(\S+)/) {
                my $rev = $1;
                $revision{$file} = $rev;

                # push autovivifies the per-revision list on first use.
                push @{ $logs_by_rev{$rev} }, $file;

                $is_net_revision{$rev} = 1 if $file =~ /-net-/;
            }
        }
        close($fh);

        if ($time{$file} && $revision{$file}) {
            my $rev = $revision{$file};

            # Remember the earliest date seen for this revision.
            if (!defined $revision_date{$rev}
                    || $time{$file} < $revision_date{$rev}) {
                $revision_date{$rev} = $time{$file};
            }
        }
    }

    return;
}
| |
# Delete the logs of every revision that has expired.
#
# A revision expires when its date in %revision_date is more than
# $delete_weekly seconds (revisions flagged in %is_net_revision) or
# $delete_nightly seconds (all other revisions) before $time_start.  All
# files listed for an expired revision in %logs_by_rev are removed from
# $corpusdir.
#
# Takes no arguments.  A file that cannot be deleted is reported with
# warn(); the remaining files are still processed.
sub cleanup_old {
    my @expired;

    foreach my $rev (keys %revision_date) {
        # Net revisions are kept longer than ordinary ones.
        my $max_age = $is_net_revision{$rev} ? $delete_weekly : $delete_nightly;
        next unless $revision_date{$rev} < $time_start - $max_age;

        push @expired, map { "$corpusdir/$_" } @{ $logs_by_rev{$rev} || [] };
    }

    foreach my $path (@expired) {
        unlink($path) or warn "cannot delete $path: $!";
    }

    return;
}
| |
# Mail a report of the logs recorded in %before_nine.
#
# Does nothing when %before_nine is empty.  Otherwise builds a plain-text
# message with one line per file (sorted by name so the report is stable
# between runs), giving the recorded start time in UTC, and pipes it to
# "/usr/sbin/sendmail -t" addressed to $email_to.
#
# Takes no arguments.  Failure to start sendmail, or a failing close of
# the pipe, is reported with warn().
sub email_beforenine {
    return unless keys %before_nine;

    my $from    = 'automc@sa-vm.apache.org';
    my $subject = '[corpus-cleanup] Early runners';

    my $message = "The following files were submitted by early runners:\n\n";
    foreach my $file (sort keys %before_nine) {
        # The fields come from gmtime(), so the offset is always +0000.
        # "%z" would print the host's local offset instead.
        my $started = strftime("%F %R:%S +0000", gmtime($before_nine{$file}));
        $message .= "$file - Started at $started\n";
    }
    $message .= "\nPlease run automasscheck after 0900 UTC";

    # List-form pipe open: no shell is involved and a failed launch is
    # visible here rather than as prints to an unopened handle.
    my $mail;
    unless (open($mail, '|-', '/usr/sbin/sendmail', '-t')) {
        warn "cannot run /usr/sbin/sendmail: $!";
        return;
    }

    # Email header, terminated by the empty line, followed by the body.
    print {$mail} join("\n",
        "To: $email_to",
        "From: $from",
        "Subject: $subject",
        "MIME-Version: 1.0",
        "Content-Type: text/plain; charset=UTF-8",
        "Content-Transfer-Encoding: 8bit",
        "",
        $message);

    # close() on a pipe waits for the child and reflects its exit status.
    close($mail)
        or warn "sendmail failed: " . ($! ? $! : "exit status $?");

    return;
}
| |