| #!/usr/bin/perl -w |
| |
| # settings are located in $HOME/.corpus |
| |
| use strict; |
| use Getopt::Long; |
| use File::Path; |
| use File::Copy; |
| use Time::ParseDate; |
| use POSIX qw(nice strftime); |
| use Cwd; |
| |
| our ( $opt_dir, $opt_override, $opt_tag ); |
| GetOptions( |
| "tag=s" => \$opt_tag, |
| "dir=s" => \$opt_dir, |
| "override=s" => \$opt_override, |
| ); |
| |
| $opt_override ||= ''; |
| $opt_tag ||= 'n'; # nightly is the default |
| |
| nice(15); |
| |
| # daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before |
| # the time of day when the mass-check tagging occurs; see |
| # http://wiki.apache.org/spamassassin/DateRev for more details. |
| use constant DATEREV_ADJ => - (8 * 60 * 60); |
| |
| # --------------------------------------------------------------------------- |
| |
| my $configuration = "$ENV{HOME}/.corpus"; |
| my %cf; |
| |
| configure(); |
| init(); |
| |
| if (!$opt_dir) { |
| $opt_dir = $cf{corpus}; |
| update_rsync(); |
| } |
| |
| chdir $opt_dir; |
| print "reading logs from '$opt_dir'\n"; |
| |
| my $linkdir = "$cf{html}/logs"; |
| (-d $linkdir) or mkdir $linkdir; |
| |
| locate_and_link(); |
| exit; |
| |
| sub configure { |
| # does rough equivalent of source |
| open(C, $configuration) || die "open failed: $configuration: $!\n"; |
| my $pwd = Cwd::getcwd; |
| |
| # add 'override' options |
| my @lines = (<C>, split(/\|/, $opt_override)); |
| |
| foreach $_ (@lines) { |
| chomp; |
| s/#.*//; |
| if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) { |
| my ($key, $val) = ($1, $2); |
| $val =~ s/\$PWD/$pwd/gs; |
| $cf{$key} = $val; |
| } |
| } |
| close(C); |
| } |
| |
| sub init { |
| $ENV{RSYNC_PASSWORD} = $cf{password}; |
| $ENV{TIME} = '%e,%U,%S'; |
| $ENV{TZ} = 'UTC'; |
| } |
| |
| sub update_rsync { |
| chdir $opt_dir; |
| if (!$cf{rsync_command}) { die "no 'rsync_command' set"; } |
| system $cf{rsync_command}; |
| system "chmod +r *.log > /dev/null 2>&1"; |
| } |
| |
| sub locate_and_link { |
| opendir(CORPUS, $opt_dir); |
| my @files = sort readdir(CORPUS); |
| closedir(CORPUS); |
| |
| print "Found ", $#files + 1, " files in $opt_dir\n"; |
| |
| @files = grep { |
| /^(?:spam|ham)-(?:net-)?[-\w.]+\.log$/ && -f "$opt_dir/$_" && -M _ < 30 |
| } @files; |
| |
| print "Kept ", $#files + 1, " files\n"; |
| |
| foreach my $file (@files) { |
| my $ftime; |
| my $frevision; |
| |
| print "Processing $opt_dir/$file\n"; |
| open(FILE, "$opt_dir/$file") or warn "cannot read $opt_dir/$file"; |
| while (my $line = <FILE>) { |
| last if $line !~ /^#/; |
| if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) { |
| my ($datepre, $hh, $datepost) = ($1,$2,$3); |
| |
| $ftime = Time::ParseDate::parsedate($datepre.$hh.$datepost, |
| GMT => 1, PREFER_PAST => 1); |
| } |
| elsif ($line =~ m/^# Date:\s*(\S+)/) { |
| # a better way to do the above. TODO: parse it instead |
| } |
| elsif ($line =~ m/^# SVN revision:\s*(\S+)/) { |
| $frevision = $1; |
| } |
| } |
| close(FILE); |
| |
| if (!defined $ftime) { |
| warn "$opt_dir/$file: no time found, ignored\n"; next; |
| } |
| if (!defined $frevision) { |
| warn "$opt_dir/$file: no revision found, ignored\n"; next; |
| } |
| if ($frevision eq 'unknown') { |
| warn "$opt_dir/$file: not tagged with a revision, ignored\n"; next; |
| } |
| |
| my $daterev = mk_daterev($ftime, $frevision, $opt_tag); |
| link_file($file, $daterev); |
| } |
| } |
| |
| sub mk_daterev { |
| my ($timet, $rev, $tag) = @_; |
| return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag"; |
| } |
| |
| sub link_file { |
| my ($file, $daterev) = @_; |
| |
| my $f = "$opt_dir/$file"; |
| |
| # /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ |
| my $linkfile = $file; |
| my $dr = $daterev; $dr =~ s/\//-/gs; $linkfile =~ s/\.log$/.$dr.log/i; |
| my $t = "$linkdir/$linkfile"; |
| |
| print "ln $f $t\n"; |
| (-f $t) and unlink $t; |
| # cannot hardlink unless we have ownership or RW perms on the file |
| symlink $f, $t or die "cannot ln"; |
| system "/usr/bin/touch -h -r '$f' '$t'"; # preserve modtimes |
| } |
| |