blob: 6eb114a4e935c655c2df81d86ef172b57a1ae629 [file] [log] [blame]
#!/usr/bin/perl -w
# settings are located in $HOME/.corpus
use strict;
use Getopt::Long;
our ( $corpusdir, $opt_override, $opt_tag );
GetOptions(
"tag=s" => \$opt_tag,
"dir=s" => \$corpusdir,
"override=s" => \$opt_override,
);
$opt_override ||= '';
$opt_tag ||= 'n'; # nightly is the default
use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd;
use POSIX qw(nice strftime);
use constant WEEK => 60*60*24;
nice(15);
my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my %revision = ();
my %logs_by_rev = ();
my %is_net_revision = ();
my %time = ();
my %revision_date = ();
my @files;
my @tmps = ();
my $skip = '';
my $time_start = time;
$time_start -= ($time_start % 3600);
my $output_revpath;
&configure;
&init;
if ($corpusdir) {
print "reading logs from '$corpusdir'\n";
}
else {
$corpusdir = $opt{corpus};
&update_rsync;
}
&locate;
&current;
&clean_up;
sub configure {
# does rough equivalent of source
open(C, $configuration) || die "open failed: $configuration: $!\n";
my $pwd = getcwd;
# add 'override' options
my @lines = (<C>, split(/\|/, $opt_override));
foreach $_ (@lines) {
chomp;
s/#.*//;
if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
my ($key, $val) = ($1, $2);
$val =~ s/\$PWD/$pwd/gs;
$opt{$key} = $val;
}
}
close(C);
}
sub clean_up {
system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
}
sub init {
$SIG{INT} = \&clean_up;
$SIG{TERM} = \&clean_up;
$ENV{RSYNC_PASSWORD} = $opt{password};
$ENV{TIME} = '%e,%U,%S';
$ENV{TZ} = 'UTC';
}
sub update_rsync {
chdir $corpusdir;
# allow non-running of rsync under some circumstances
if ($opt{rsync_command}) {
system $opt{rsync_command};
} else {
system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
}
# this block is no longer required -- we do sensible things with modtime
# comparisons to work it out!
if (0 && !$opt{always_update_html}) {
if (-f "rsync.last") {
open(FIND, "find . -type f -newer rsync.last |");
my $files = "";
while(<FIND>) {
$files .= $_;
}
close(FIND);
if (! $files) {
print STDERR "no new corpus files\n";
if (rand(24) > 1) {
exit 0;
}
else {
print STDERR "updating anyway\n";
}
}
}
}
open(RSYNC, "> rsync.last");
close(RSYNC);
system "chmod +r *.log";
}
sub locate {
# chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
opendir(CORPUS, $corpusdir);
@files = sort readdir(CORPUS);
closedir(CORPUS);
@files = grep {
/^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
} @files;
foreach my $file (@files) {
my $tag = 0;
my $revtime;
open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
while (my $line = <FILE>) {
last if $line !~ /^#/;
if ($line =~ m/^# Date:\s*(\S+)/) {
my $date_line = $1;
my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/;
my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
GMT => 1, PREFER_PAST => 1);
$revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
GMT => 1, PREFER_PAST => 1);
$time{$file} = $timet;
print "$corpusdir/$file: time=$timet\n";
}
if ($line =~ m/^# SVN revision:\s*(\S+)/) {
my $rev = $1;
$revision{$file} = $rev;
$logs_by_rev{$rev} ||= [ ];
push (@{$logs_by_rev{$rev}}, $file);
if ($file =~ /-net-/) {
$is_net_revision{$rev} = 1;
print "$corpusdir/$file: rev=$rev (net)\n";
}
else {
print "$corpusdir/$file: rev=$rev (non-net)\n";
}
}
}
close(FILE);
if ($revtime) {
my $rev = $revision{$file};
$revision_date{$rev} = $revtime unless defined $revision_date{$rev};
if ($revtime < $revision_date{$rev}) {
$revision_date{$rev} = $revtime;
}
}
}
}
sub sort_all {
my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
$a1 =~ s/^[\+\-]//;
$b1 =~ s/^[\+\-]//;
my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
if ($a1 =~ /^OVERALL/) { $n -= 1000; }
elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; }
elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; }
if ($b1 =~ /^OVERALL/) { $n += 1000; }
elsif ($b1 =~ /^\(all messages\)/) { $n += 100; }
elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; }
return $n;
}
sub time_filter {
my ($target, $after, $before) = @_;
if (/time=(\d+)/) {
return (($target - $1 >= WEEK * $after) &&
($target - $1 < WEEK * $before));
}
return 0;
}
sub current {
my $classes = $opt{output_classes};
$classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";
foreach my $entry (split(' ', $classes)) {
$entry =~ /^(\S+)\.(\S+)$/;
my $class = $1;
my $age = $2;
if (!$age) { warn "no age in $entry"; next; }
foreach my $rev (sort keys %logs_by_rev) {
next if ($rev eq 'unknown');
if ($class =~ /NET/) {
next unless $is_net_revision{$rev};
}
gen_class ($rev, $class, $age);
}
}
}
sub gen_class {
my ($rev, $class, $age) = @_;
print STDERR "\ngenerating r$rev $class.$age:\n";
next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
my @ham = grep { /^ham/ } @{$logs_by_rev{$rev}};
my @spam = grep { /^spam/ } @{$logs_by_rev{$rev}};
print STDERR "input h: " . join(' ', @ham) . "\n";
print STDERR "input s: " . join(' ', @spam) . "\n";
chdir $corpusdir;
# net vs. local
if ($class eq "NET") {
@ham = grep { /-net-/ } @ham;
@spam = grep { /-net-/ } @spam;
}
else {
# if both net and local exist, use newer
my %spam;
my %ham;
for my $file (@spam) {
$spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
for my $file (@ham) {
$ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
while (my ($user, $count) = each %ham) {
if ($count > 1) {
my $nightly = "ham-$user.log";
my $weekly = "ham-net-$user.log";
if ($revision{$nightly} >= $revision{$weekly}) {
@ham = grep { $_ ne $weekly } @ham;
}
else {
@ham = grep { $_ ne $nightly } @ham;
}
}
}
while (my ($user, $count) = each %spam) {
if ($count > 1) {
my $nightly = "spam-$user.log";
my $weekly = "spam-net-$user.log";
if ($revision{$nightly} >= $revision{$weekly}) {
@spam = grep { $_ ne $weekly } @spam;
}
else {
@spam = grep { $_ ne $nightly } @spam;
}
}
}
}
# age
if ($age =~ /(\d+)day/) {
my $mtime = $1;
@ham = grep { -M "$_" < $mtime } @ham;
@spam = grep { -M "$_" < $mtime } @spam;
}
elsif ($class ne 'NET' && $age =~ /^(?:new|all|age)$/)
{
# just ignore the tagtime stuff; since we now may be
# dealing with multiple mass-checks per day, just use svn rev data
# my $tt = (-M $opt{tagtime});
# @ham = grep { !defined($tt) || ((-M "$_") < $tt) } @ham;
# @spam = grep { !defined($tt) || ((-M "$_") < $tt) } @spam;
}
print STDERR "selected h: " . join(' ', @ham) . "\n";
print STDERR "selected s: " . join(' ', @spam) . "\n";
# we cannot continue if we have no files that match the criteria...
# demand at least 1 ham and 1 spam file
if (scalar @spam <= 0 || scalar @ham <= 0) {
warn "not enough files found matching criteria ($rev $class $age)\n";
return;
}
my $crev_time = $revision_date{$rev};
my $dir = create_outputdir($rev, $crev_time);
my $fname = "$dir/$class.$age";
# Look through corpus for files that have been added since last full build
# Update all class files on this
my $buildfile = "$dir/.buildtime";
my $last_build = 0;
my $needs_rebuild = 0;
if (-f $buildfile) {
open(BFILE, "$buildfile") or warn "cannot read $buildfile";
while (my $line = <BFILE>) {
last if $line !~ /^#/;
if ($line =~ m/^# BuildTime:\s*(\S+)/) {
$last_build = $1;
}
}
close(BFILE);
}
if ( !(-f $fname) || !$last_build || $last_build == $time_start) {
# No last build or we've already done the loop below
$needs_rebuild = 1;
} else {
foreach my $srcfile (@spam, @ham) {
my $file_time = (stat($srcfile))[9];
if ($file_time >= $last_build) {
$needs_rebuild = 1;
last;
}
}
}
if (!$needs_rebuild) {
print "last buildtime is fresher than sources\n";
return;
}
if ($last_build != $time_start) {
open(BFILE, "> $buildfile") or warn "cannot write to $buildfile";
print BFILE "# BuildTime: $time_start\n";
close(BFILE);
}
my $when = scalar localtime time;
print qq{creating: $fname
started $when...
};
my $bytes = 0;
if ($class eq 'LOGS') {
foreach my $f (@ham, @spam) {
$f =~ s/[^-\._A-Za-z0-9]+/_/gs; # sanitize!
my $zf = "$fname-$f.gz";
system("pigz -c < $f > $zf.$$");
if ($? >> 8 != 0) {
warn "pigz -c < $f > $zf.$$ failed";
}
rename("$zf.$$", $zf) or
warn "cannot rename $zf.$$ to $zf";
$bytes += (-s $zf);
}
my $tmpfname = "$fname.$$";
open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
print OUT "# $$ \n";
close(OUT);
rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
}
else {
my $tmpfname = "$fname.$$";
open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n";
print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n";
for (@ham) {
print OUT "# $_ was at r$revision{$_}\n";
}
for (@spam) {
print OUT "# $_ was at r$revision{$_}\n";
}
push (@tmps, $tmpfname);
my $flags = "";
$flags = "-t net -s 1" if $class eq "NET";
$flags = "-M HTML_MESSAGE" if $class eq "HTML";
$flags = "-o" if $class eq "OVERLAP";
$flags = "-S" if $class eq "SCOREMAP";
if ($opt{rules_dir}) {
$flags .= " -c '$opt{rules_dir}'";
}
if ($age eq "all") {
my %spam;
my %ham;
my @output;
for my $file (@spam) {
$spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
for my $file (@ham) {
$ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/);
}
unlink "$opt{tmp}/ham.log.$$";
unlink "$opt{tmp}/spam.log.$$";
if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
warn "no files found for $class.$age";
return;
}
chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
for my $user (sort keys %spam) {
next unless $ham{$user};
system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
while(<IN>) {
chomp;
push @output, "$_:$user\n";
}
close(IN);
}
open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) {
push @output, $_;
}
close(IN);
for (sort sort_all @output) { print OUT; }
unlink "$opt{tmp}/ham.log.$$";
unlink "$opt{tmp}/spam.log.$$";
}
elsif ($age eq "age") {
my @output;
for my $which (("0-1", "1-2", "2-3", "3-6")) {
my ($after, $before) = split(/-/, $which);
# get and filter logs
chdir $corpusdir;
for my $type (("ham", "spam")) {
open(TMP, "> $opt{tmp}/$type.log.$$");
my @array = ($type eq "ham") ? @ham : @spam;
for my $file (@array) {
open(IN, $file) or warn "cannot read $file";
while (<IN>) {
print TMP $_ if time_filter($crev_time, $after, $before);
}
close(IN);
}
close (TMP);
}
# print out by age
chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) {
chomp;
push @output, "$_:$which\n";
}
close(IN);
}
for (sort sort_all @output) { print OUT; }
unlink "$opt{tmp}/ham.log.$$";
unlink "$opt{tmp}/spam.log.$$";
}
elsif (@ham && @spam) {
# get logs
system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) { print(OUT); }
close(IN);
unlink "$opt{tmp}/ham.log.$$";
unlink "$opt{tmp}/spam.log.$$";
}
$bytes = (-s OUT);
close(OUT);
rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
}
$when = scalar localtime time;
print qq{created: $bytes bytes, finished at $when
URL:
http://buildbot.spamassassin.org/ruleqa?daterev=$output_revpath
};
}
sub create_outputdir {
my ($rev, $time) = @_;
my $revpath = strftime("%Y%m%d", gmtime($time)) . "/r$rev-$opt_tag";
my $dir = $opt{html} .'/'. $revpath;
# print "output dir: $dir\n";
if (!-d $dir) {
my $prevu = umask 0;
mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
umask $prevu;
}
$output_revpath = $revpath; # set the global
$output_revpath =~ s/\//-/; # looks nicer
return $dir;
}