| #!/usr/bin/perl -w |
| |
| # settings are located in $HOME/.corpus |
| |
| use strict; |
| use Getopt::Long; |
| |
| our ( $corpusdir, $opt_override, $opt_tag ); |
| GetOptions( |
| "tag=s" => \$opt_tag, |
| "dir=s" => \$corpusdir, |
| "override=s" => \$opt_override, |
| ); |
| |
| $opt_override ||= ''; |
| $opt_tag ||= 'n'; # nightly is the default |
| |
| |
| use File::Path; |
| use File::Copy; |
| use Time::ParseDate; |
| use Cwd; |
| use POSIX qw(nice strftime); |
| |
| use constant WEEK => 60*60*24; |
| nice(15); |
| |
| my $configuration = "$ENV{HOME}/.corpus"; |
| my %opt; |
| my %revision = (); |
| my %logs_by_rev = (); |
| my %is_net_revision = (); |
| my %time = (); |
| my %revision_date = (); |
| my @files; |
| my @tmps = (); |
| my $skip = ''; |
| my $time_start = time; |
| $time_start -= ($time_start % 3600); |
| my $output_revpath; |
| |
| &configure; |
| &init; |
| |
| if ($corpusdir) { |
| print "reading logs from '$corpusdir'\n"; |
| } |
| else { |
| $corpusdir = $opt{corpus}; |
| &update_rsync; |
| } |
| |
| &locate; |
| ¤t; |
| &clean_up; |
| |
| sub configure { |
| # does rough equivalent of source |
| open(C, $configuration) || die "open failed: $configuration: $!\n"; |
| my $pwd = getcwd; |
| |
| # add 'override' options |
| my @lines = (<C>, split(/\|/, $opt_override)); |
| |
| foreach $_ (@lines) { |
| chomp; |
| s/#.*//; |
| if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) { |
| my ($key, $val) = ($1, $2); |
| $val =~ s/\$PWD/$pwd/gs; |
| $opt{$key} = $val; |
| } |
| } |
| close(C); |
| } |
| |
| sub clean_up { |
| system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps); |
| } |
| |
| sub init { |
| $SIG{INT} = \&clean_up; |
| $SIG{TERM} = \&clean_up; |
| |
| $ENV{RSYNC_PASSWORD} = $opt{password}; |
| $ENV{TIME} = '%e,%U,%S'; |
| $ENV{TZ} = 'UTC'; |
| } |
| |
| sub update_rsync { |
| chdir $corpusdir; |
| |
| # allow non-running of rsync under some circumstances |
| if ($opt{rsync_command}) { |
| system $opt{rsync_command}; |
| } else { |
| system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .'; |
| } |
| |
| # this block is no longer required -- we do sensible things with modtime |
| # comparisons to work it out! |
| if (0 && !$opt{always_update_html}) { |
| if (-f "rsync.last") { |
| open(FIND, "find . -type f -newer rsync.last |"); |
| my $files = ""; |
| while(<FIND>) { |
| $files .= $_; |
| } |
| close(FIND); |
| if (! $files) { |
| print STDERR "no new corpus files\n"; |
| if (rand(24) > 1) { |
| exit 0; |
| } |
| else { |
| print STDERR "updating anyway\n"; |
| } |
| } |
| } |
| } |
| |
| open(RSYNC, "> rsync.last"); |
| close(RSYNC); |
| system "chmod +r *.log"; |
| } |
| |
| sub locate { |
| # chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; |
| |
| opendir(CORPUS, $corpusdir); |
| @files = sort readdir(CORPUS); |
| closedir(CORPUS); |
| |
| @files = grep { |
| /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10 |
| } @files; |
| |
| foreach my $file (@files) { |
| my $tag = 0; |
| my $revtime; |
| open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file"; |
| while (my $line = <FILE>) { |
| last if $line !~ /^#/; |
| if ($line =~ m/^# Date:\s*(\S+)/) { |
| my $date_line = $1; |
| my ($yyyy, $mm, $dd, $h, $m, $s) = $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/; |
| |
| my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0", |
| GMT => 1, PREFER_PAST => 1); |
| |
| $revtime = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0", |
| GMT => 1, PREFER_PAST => 1); |
| |
| $time{$file} = $timet; |
| print "$corpusdir/$file: time=$timet\n"; |
| |
| } |
| if ($line =~ m/^# SVN revision:\s*(\S+)/) { |
| my $rev = $1; |
| $revision{$file} = $rev; |
| |
| $logs_by_rev{$rev} ||= [ ]; |
| push (@{$logs_by_rev{$rev}}, $file); |
| |
| if ($file =~ /-net-/) { |
| $is_net_revision{$rev} = 1; |
| print "$corpusdir/$file: rev=$rev (net)\n"; |
| } |
| else { |
| print "$corpusdir/$file: rev=$rev (non-net)\n"; |
| } |
| } |
| } |
| close(FILE); |
| if ($revtime) { |
| my $rev = $revision{$file}; |
| $revision_date{$rev} = $revtime unless defined $revision_date{$rev}; |
| |
| if ($revtime < $revision_date{$rev}) { |
| $revision_date{$rev} = $revtime; |
| } |
| } |
| } |
| } |
| |
| sub sort_all { |
| my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); |
| my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/); |
| $a1 =~ s/^[\+\-]//; |
| $b1 =~ s/^[\+\-]//; |
| |
| my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || '')); |
| if ($a1 =~ /^OVERALL/) { $n -= 1000; } |
| elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; } |
| elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; } |
| if ($b1 =~ /^OVERALL/) { $n += 1000; } |
| elsif ($b1 =~ /^\(all messages\)/) { $n += 100; } |
| elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; } |
| return $n; |
| } |
| |
| sub time_filter { |
| my ($target, $after, $before) = @_; |
| if (/time=(\d+)/) { |
| return (($target - $1 >= WEEK * $after) && |
| ($target - $1 < WEEK * $before)); |
| } |
| return 0; |
| } |
| |
| sub current { |
| my $classes = $opt{output_classes}; |
| $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age"; |
| |
| foreach my $entry (split(' ', $classes)) { |
| $entry =~ /^(\S+)\.(\S+)$/; |
| my $class = $1; |
| my $age = $2; |
| if (!$age) { warn "no age in $entry"; next; } |
| |
| foreach my $rev (sort keys %logs_by_rev) { |
| next if ($rev eq 'unknown'); |
| |
| if ($class =~ /NET/) { |
| next unless $is_net_revision{$rev}; |
| } |
| |
| gen_class ($rev, $class, $age); |
| } |
| } |
| } |
| |
| sub gen_class { |
| my ($rev, $class, $age) = @_; |
| |
| print STDERR "\ngenerating r$rev $class.$age:\n"; |
| |
| next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/); |
| |
| my @ham = grep { /^ham/ } @{$logs_by_rev{$rev}}; |
| my @spam = grep { /^spam/ } @{$logs_by_rev{$rev}}; |
| |
| print STDERR "input h: " . join(' ', @ham) . "\n"; |
| print STDERR "input s: " . join(' ', @spam) . "\n"; |
| |
| chdir $corpusdir; |
| |
| # net vs. local |
| if ($class eq "NET") { |
| @ham = grep { /-net-/ } @ham; |
| @spam = grep { /-net-/ } @spam; |
| } |
| else { |
| # if both net and local exist, use newer |
| my %spam; |
| my %ham; |
| |
| for my $file (@spam) { |
| $spam{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); |
| } |
| for my $file (@ham) { |
| $ham{$1}++ if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); |
| } |
| while (my ($user, $count) = each %ham) { |
| if ($count > 1) { |
| my $nightly = "ham-$user.log"; |
| my $weekly = "ham-net-$user.log"; |
| if ($revision{$nightly} >= $revision{$weekly}) { |
| @ham = grep { $_ ne $weekly } @ham; |
| } |
| else { |
| @ham = grep { $_ ne $nightly } @ham; |
| } |
| } |
| } |
| while (my ($user, $count) = each %spam) { |
| if ($count > 1) { |
| my $nightly = "spam-$user.log"; |
| my $weekly = "spam-net-$user.log"; |
| if ($revision{$nightly} >= $revision{$weekly}) { |
| @spam = grep { $_ ne $weekly } @spam; |
| } |
| else { |
| @spam = grep { $_ ne $nightly } @spam; |
| } |
| } |
| } |
| } |
| |
| # age |
| if ($age =~ /(\d+)day/) { |
| my $mtime = $1; |
| @ham = grep { -M "$_" < $mtime } @ham; |
| @spam = grep { -M "$_" < $mtime } @spam; |
| } |
| elsif ($class ne 'NET' && $age =~ /^(?:new|all|age)$/) |
| { |
| # just ignore the tagtime stuff; since we now may be |
| # dealing with multiple mass-checks per day, just use svn rev data |
| # my $tt = (-M $opt{tagtime}); |
| # @ham = grep { !defined($tt) || ((-M "$_") < $tt) } @ham; |
| # @spam = grep { !defined($tt) || ((-M "$_") < $tt) } @spam; |
| } |
| |
| print STDERR "selected h: " . join(' ', @ham) . "\n"; |
| print STDERR "selected s: " . join(' ', @spam) . "\n"; |
| |
| # we cannot continue if we have no files that match the criteria... |
| # demand at least 1 ham and 1 spam file |
| if (scalar @spam <= 0 || scalar @ham <= 0) { |
| warn "not enough files found matching criteria ($rev $class $age)\n"; |
| return; |
| } |
| |
| my $crev_time = $revision_date{$rev}; |
| my $dir = create_outputdir($rev, $crev_time); |
| |
| my $fname = "$dir/$class.$age"; |
| |
| # Look through corpus for files that have been added since last full build |
| # Update all class files on this |
| |
| my $buildfile = "$dir/.buildtime"; |
| my $last_build = 0; |
| my $needs_rebuild = 0; |
| |
| if (-f $buildfile) { |
| open(BFILE, "$buildfile") or warn "cannot read $buildfile"; |
| while (my $line = <BFILE>) { |
| last if $line !~ /^#/; |
| if ($line =~ m/^# BuildTime:\s*(\S+)/) { |
| $last_build = $1; |
| } |
| } |
| close(BFILE); |
| } |
| |
| if ( !(-f $fname) || !$last_build || $last_build == $time_start) { |
| # No last build or we've already done the loop below |
| $needs_rebuild = 1; |
| } else { |
| foreach my $srcfile (@spam, @ham) { |
| my $file_time = (stat($srcfile))[9]; |
| if ($file_time >= $last_build) { |
| $needs_rebuild = 1; |
| last; |
| } |
| } |
| } |
| |
| if (!$needs_rebuild) { |
| print "last buildtime is fresher than sources\n"; |
| return; |
| } |
| |
| if ($last_build != $time_start) { |
| open(BFILE, "> $buildfile") or warn "cannot write to $buildfile"; |
| print BFILE "# BuildTime: $time_start\n"; |
| close(BFILE); |
| } |
| |
| my $when = scalar localtime time; |
| print qq{creating: $fname |
| started $when... |
| }; |
| my $bytes = 0; |
| |
| if ($class eq 'LOGS') { |
| foreach my $f (@ham, @spam) { |
| $f =~ s/[^-\._A-Za-z0-9]+/_/gs; # sanitize! |
| my $zf = "$fname-$f.gz"; |
| |
| system("pigz -c < $f > $zf.$$"); |
| if ($? >> 8 != 0) { |
| warn "pigz -c < $f > $zf.$$ failed"; |
| } |
| |
| rename("$zf.$$", $zf) or |
| warn "cannot rename $zf.$$ to $zf"; |
| $bytes += (-s $zf); |
| } |
| my $tmpfname = "$fname.$$"; |
| open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname"; |
| print OUT "# $$ \n"; |
| close(OUT); |
| rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname"; |
| } |
| else { |
| my $tmpfname = "$fname.$$"; |
| |
| open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname"; |
| print OUT "# ham results used for $rev $class $age: " . join(" ", @ham) . "\n"; |
| print OUT "# spam results used for $rev $class $age: " . join(" ", @spam) . "\n"; |
| for (@ham) { |
| print OUT "# $_ was at r$revision{$_}\n"; |
| } |
| for (@spam) { |
| print OUT "# $_ was at r$revision{$_}\n"; |
| } |
| |
| push (@tmps, $tmpfname); |
| |
| my $flags = ""; |
| $flags = "-t net -s 1" if $class eq "NET"; |
| $flags = "-M HTML_MESSAGE" if $class eq "HTML"; |
| $flags = "-o" if $class eq "OVERLAP"; |
| $flags = "-S" if $class eq "SCOREMAP"; |
| if ($opt{rules_dir}) { |
| $flags .= " -c '$opt{rules_dir}'"; |
| } |
| |
| if ($age eq "all") { |
| my %spam; |
| my %ham; |
| my @output; |
| |
| for my $file (@spam) { |
| $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); |
| } |
| for my $file (@ham) { |
| $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.r[0-9]+\.log$/); |
| } |
| unlink "$opt{tmp}/ham.log.$$"; |
| unlink "$opt{tmp}/spam.log.$$"; |
| |
| if (scalar keys %spam <= 0 || scalar keys %ham <= 0) { |
| warn "no files found for $class.$age"; |
| return; |
| } |
| |
| chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; |
| for my $user (sort keys %spam) { |
| next unless $ham{$user}; |
| system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$"); |
| system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$"); |
| open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |"); |
| while(<IN>) { |
| chomp; |
| push @output, "$_:$user\n"; |
| } |
| close(IN); |
| } |
| open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); |
| while(<IN>) { |
| push @output, $_; |
| } |
| close(IN); |
| for (sort sort_all @output) { print OUT; } |
| } |
| elsif ($age eq "age") { |
| my @output; |
| |
| for my $which (("0-1", "1-2", "2-3", "3-6")) { |
| my ($after, $before) = split(/-/, $which); |
| # get and filter logs |
| chdir $corpusdir; |
| for my $type (("ham", "spam")) { |
| open(TMP, "> $opt{tmp}/$type.log.$$"); |
| my @array = ($type eq "ham") ? @ham : @spam; |
| for my $file (@array) { |
| open(IN, $file) or warn "cannot read $file"; |
| while (<IN>) { |
| print TMP $_ if time_filter($crev_time, $after, $before); |
| } |
| close(IN); |
| } |
| close (TMP); |
| } |
| # print out by age |
| chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; |
| open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); |
| while(<IN>) { |
| chomp; |
| push @output, "$_:$which\n"; |
| } |
| close(IN); |
| } |
| for (sort sort_all @output) { print OUT; } |
| } |
| elsif (@ham && @spam) { |
| # get logs |
| system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$"); |
| system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$"); |
| |
| chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses"; |
| open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); |
| while(<IN>) { print(OUT); } |
| close(IN); |
| } |
| |
| $bytes = (-s OUT); |
| close(OUT); |
| rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname"; |
| } |
| |
| $when = scalar localtime time; |
| print qq{created: $bytes bytes, finished at $when |
| URL: |
| |
| http://buildbot.spamassassin.org/ruleqa?daterev=$output_revpath |
| |
| }; |
| |
| } |
| |
| sub create_outputdir { |
| my ($rev, $time) = @_; |
| my $revpath = strftime("%Y%m%d", gmtime($time)) . "/r$rev-$opt_tag"; |
| my $dir = $opt{html} .'/'. $revpath; |
| |
| # print "output dir: $dir\n"; |
| if (!-d $dir) { |
| my $prevu = umask 0; |
| mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir"; |
| umask $prevu; |
| } |
| |
| $output_revpath = $revpath; # set the global |
| $output_revpath =~ s/\//-/; # looks nicer |
| |
| return $dir; |
| } |
| |