blob: 3f742e4ebcec653a20f1abebd099a16b9e3b7496 [file] [log] [blame]
#!/usr/bin/perl -w
#
# rule-hits-over-time - produce graphs of rule hits over time, using gnuplot
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
use GD;
use Statistics::DEA;
use strict;
use warnings;
use Fcntl;
use Getopt::Long;
use SDBM_File;
sub usage {
die q{
usage: rule-hits-over-time [options] --rule rulename log1 [log2 ...]
--rule=rulename specify rule to map
--period=secs specify period (default: 1 day)
--ignore_older=days ignore hits older than N days (default: 0 = none)
--scale_period=n scale period up to N items of data, 0=no scaling
(default: 0)
--size_x=pixels width of output graphs, in pixels (def: 800)
--size_y=pixels height of ONE of the output graphs, in pixels
(default: 400)
--cgi CGI output, to stdout with HTTP headers
--text text output only
};
}
our ( $opt_rule, $opt_size_x, $opt_size_y, $opt_text, $opt_cgi,
$opt_period, $opt_scale_period, $opt_ignore_older, $opt_debug );
GetOptions(
'rule=s',
'size_x=i',
'size_y=i',
'text',
'cgi',
'scale_period=i',
'ignore_older=i',
'period=i',
'debug'
) or usage();
usage() unless $opt_rule;
my $DEBUG_TMPDIR = $opt_debug; # keep the tmpdir around after exiting, for debug
# $DEBUG_TMPDIR = 1;
# fix PATHs for sucky Solaris compatibility.
$ENV{PATH} = "/local/gnuplot-4.0.0/bin:/opt/sfw/bin:".$ENV{PATH};
$ENV{LD_LIBRARY_PATH} .= ":/local/gd-2.0.33/lib";
my $rule_re = qr/[, ]${opt_rule}[, ]/;
# my $period = $opt_period || (24 * 60 * 60 * 1);
my $period = $opt_period || 3600;
my $graph_x = $opt_size_x || 800;
my $graph_y = $opt_size_y || 400;
my $fname_counter = 1;
my %graph_png_data = ();
my %allbuckets = ();
my %allresults = ();
my @allfiles = ();
my $graph_times = [];
my $graph_data = [];
my $this_file_results;
my $lastbucket;
my $nextbucket;
my $seen_y;
my $seen_n;
my $tmpdir = "/tmp/rulehits.$$";
if ($DEBUG_TMPDIR) { $tmpdir = "/tmp/rulehits.tmp"; system("rm -rf $tmpdir"); }
mkdir ($tmpdir) or die "collided on $tmpdir";
my $outdir = ".";
if ($opt_cgi) {
$outdir = $tmpdir;
}
my $file_sets = [ ]; # split into ham and spam
$file_sets = [ [ 'TITLE:hits in spam' ], [ 'TITLE:hits in ham' ] ];
foreach my $file (@ARGV) {
if ($file =~ /\bham\b/) {
push @{$file_sets->[1]}, $file;
} else {
push @{$file_sets->[0]}, $file;
}
}
foreach my $set (@{$file_sets}) {
@allfiles = ();
%allbuckets = ();
%allresults = ();
my $settitle = '';
if ($set->[0] =~ /^TITLE:(.*)$/) {
$settitle = $1; shift(@{$set});
}
create_gp("$opt_rule $settitle");
foreach my $file (@{$set}) {
if (!$opt_text) {
my $title = $file;
$title =~ s/^.*\///;
}
push (@allfiles, $file);
if (1) {
# use an on-disk file to avoid massive VM usage for this hash
# on huge datasets
unlink("$tmpdir/graph.tmp.dir");
unlink("$tmpdir/graph.tmp.pag");
tie (%{$allresults{$file}}, 'SDBM_File', "$tmpdir/graph.tmp",
O_RDWR|O_CREAT, 0600) or die "tie failed: $!";
}
else {
%{$allresults{$file}} = ();
}
$this_file_results = $allresults{$file};
read_logs($file);
$graph_times = [];
$graph_data = [];
summarise();
}
$opt_scale_period and collapse_periods();
plot_gp();
}
my $format = "gif";
{
my $both = GD::Image->new($graph_x, 15 + ($graph_y * 2));
my $file01 = GD::Image->newFromPngData($graph_png_data{"file01"}, 1);
my $file02 = GD::Image->newFromPngData($graph_png_data{"file02"}, 1);
if (!$file01 || !$file02) {
warn "bad input. leaving graph blank";
}
else {
$both->copy($file01, 0, 5, 0, 0, $graph_x-1, $graph_y-1);
$both->copy($file02, 0, 10 + $graph_y, 0, 0, $graph_x-1, $graph_y-1);
}
if ($opt_cgi) {
use CGI qw(:standard);
print header("image/$format"); binmode STDOUT;
print STDOUT $both->$format();
}
else {
open(IMG, ">both.$format") or die $!; binmode IMG;
print IMG $both->$format();
close IMG;
}
$both->gif();
}
if (!$DEBUG_TMPDIR) {
unlink(<$tmpdir/*.*>); rmdir $tmpdir;
} else {
system ("ls -l $tmpdir/*.* 1>&2");
}
exit;
sub summarise {
foreach my $bucket (sort keys %allbuckets) {
my @cols = ();
foreach my $file (@allfiles) {
my $res = $allresults{$file}->{$bucket};
my $sy;
my $sn;
if (!$res) {
$sn = $sy = -1;
}
elsif ($res !~ /^y(\d+)n(\d+)$/) {
warn "bad results: $res for $file $bucket";
next;
}
else {
$sy = $1;
$sn = $2;
}
if (!defined $sy && !defined $sn) {
$sn = $sy = -1;
} elsif (!defined $sy || !defined $sn) {
# assert: enforce both < 0, if either is
warn "oops? sy=$sy sn=$sn, should be both < 0";
$sn = $sy = -1;
}
if (($sy+$sn) > 0) {
push @cols, ($sy / ($sy + $sn)) * 100.0;
}
else {
push @cols, -1;
}
}
if ($opt_text) {
print $bucket," ".join(' ',@cols)."\n";
}
else {
push (@{$graph_times}, $bucket);
push (@{$graph_data}, \@cols);
}
}
}
sub collapse_periods {
while (scalar @{$graph_data} > $opt_scale_period) {
my $num_files = (scalar @allfiles - 1);
my $newtimes = [ ];
my $newdata = [ ];
my $i;
for ($i = 0; $i < (scalar @{$graph_data}); $i += 2) {
$newtimes->[$i >> 1] = $graph_times->[$i];
foreach my $j (0 .. $num_files)
{
my $v1 = $graph_data->[$i]->[$j];
my $v2 = $graph_data->[$i+1]->[$j];
if (!defined $v2) { $v2 = -1; }
if ($v1 >= 0.0 && $v2 >= 0.0) {
# both are valid. take their mean
$v1 = ($v1 + $v2) / 2.0;
}
elsif ($v2 >= 0.0) {
# only one is valid; use it and ignore the invalid one
$v1 = $v2;
}
else {
# we're good, v1 is the valid one anyway
}
$newdata->[$i >> 1]->[$j] = $v1;
}
}
@{$graph_times} = @{$newtimes};
@{$graph_data} = @{$newdata};
$period *= 2;
}
}
sub read_logs {
my $file = shift;
# limit to a range from [4 years ago, today] to avoid OOM craziness
# from corrupt input
#
if ($opt_ignore_older <= 0) {
$opt_ignore_older = 365 * 4;
}
my $limit_hi = time;
my $limit_lo = $limit_hi - (24*60*60*$opt_ignore_older);
$lastbucket = undef;
$nextbucket = undef;
$seen_y = 0;
$seen_n = 0;
if ($file =~ /\.gz$/) {
open (IN, "pigz -cd '$file'|") or die "cannot gunzip $file";
} else {
open (IN, "<$file") or die "cannot read $file";
}
while (<IN>) {
next if /^#/;
my $t;
/\btime=(\d+),/ and $t = $1;
next unless $t;
if ($t < $limit_lo || $t > $limit_hi) {
warn "ignoring out-of-range time $t (limit: $limit_lo < t < $limit_hi)";
next;
}
my $found = ($_ =~ $rule_re);
if (!defined $lastbucket) {
$lastbucket = $t - ($t % $period);
$nextbucket = $lastbucket + $period;
}
if ($t < $nextbucket) {
if ($found) {
$seen_y++;
} else {
$seen_n++;
}
}
else {
while ($t >= $nextbucket) {
completeline();
$lastbucket = $nextbucket;
$nextbucket += $period;
}
}
}
close IN;
completeline();
}
sub completeline {
return unless ($lastbucket);
$allbuckets{$lastbucket} = undef;
$this_file_results->{$lastbucket} = "y".$seen_y."n".$seen_n;
$seen_y = 0;
$seen_n = 0;
}
sub create_gp {
my $title = shift;
my $mailtype = 'mail';
if ($title =~ /\b(ham|spam)\b/) { $mailtype = $1; }
my $y_label = "\%age of $mailtype in period";
$SIG{PIPE} = sub {
die "unexpected SIGPIPE received!";
};
open (GP, "| gnuplot - > $tmpdir/gp.log 2>&1") or die "cannot run gnuplot";
# eye-candy
my $niceperiod = "$period secs";
if ($period % (24*60*60) == 0) {
$niceperiod = ($period / (24*60*60))." days";
}
# (NOTE: -1% hitrate means no data for that time period)'
print GP qq{
set terminal png small \\
interlace size $graph_x,$graph_y \\
xffffff x444444 x33cc00 \\
xff3300 x0000cc x99cc00 xff9900 \\
xcccc00 x333333 x999999 x9500d3
set out '$tmpdir/out.png'
set grid back xtics ytics
set xlabel 'Time, in blocks of $niceperiod'
set xdata time
set timefmt "%Y-%m-%d-%H"
set format x "%04Y%02m%02d"
set ylabel '$y_label'
set yrange [0:*]
set title "$title"
set key left top Left nobox
};
}
sub fmt_time_t {
my $tt = shift;
use POSIX qw(strftime);
return strftime "%Y-%m-%d-%H", gmtime($tt);
}
sub plot_gp {
my $num_files = (scalar @allfiles - 1);
my $num_datapoints = (scalar @{$graph_data} - 1);
# specify a number of alphas for Statistics::DEA. Right now,
# the graph is pretty unreadable with more than one.
my $dea_alphas = [ 0.9 ];
my $num_alphas = (scalar @{$dea_alphas} - 1);
my $times = [ ];
my $avgs = [ ];
my $graphname = sprintf("file%02d", $fname_counter++);
if (!$opt_text)
{
if (@{$graph_data}) {
my $deas = ();
foreach my $i (0 .. $num_files) {
foreach my $a (0 .. $num_alphas) {
$deas->[$a]->[$i] =
Statistics::DEA->new($dea_alphas->[$a], $period * 3);
}
}
foreach my $j (0 .. $num_datapoints) {
my (@datas) = @{$graph_data->[$j]};
$times->[$j] = fmt_time_t($graph_times->[$j]);
foreach my $i (0 .. $num_files) {
my $d = $datas[$i];
foreach my $a (0 .. $num_alphas) {
if ($d >= 0) {
$deas->[$a]->[$i]->update($d, $j);
}
my $avg;
eval {
# this can die if it hasn't received enough data!
# so trap with an eval.
$avg = $deas->[$a]->[$i]->average();
};
$avgs->[$a]->[$j]->[$i] = (defined $avg) ? $avg : -1;
}
}
}
}
# write the data plotfile
open (DATA, ">$tmpdir/plot.$graphname.data") or die;
if (@{$graph_data})
{
foreach my $j (0 .. $num_datapoints) {
print DATA $times->[$j]," ",join(' ', @{$graph_data->[$j]}),"\n";
}
} else {
# a fake datapoint so gnuplot doesn't puke on us
print DATA fmt_time_t(0)," 0 0\n";
}
close DATA or die;
# write the avgs plotfiles
foreach my $a (0 .. $num_alphas) {
open (DATA, ">$tmpdir/avgs$a.$graphname.data") or die;
if (@{$graph_data}) {
foreach my $j (0 .. $num_datapoints) {
print DATA $times->[$j]," ",
defined $avgs->[$a]->[$j] ? join ' ', @{$avgs->[$a]->[$j]} : '0',
"\n";
}
} else {
# a fake datapoint so gnuplot doesn't puke on us
print DATA fmt_time_t(0)," 0 0\n";
}
close DATA or die;
}
# and the commands file
my @plot = ();
foreach my $i (0 .. $num_files) {
my $legend = filename_to_legend ($allfiles[$i]);
my $style = $i+1;
my $col = $i+2;
push @plot,
qq{ '$tmpdir/plot.$graphname.data' using }.
qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
# note: using "lt $style" gives us points in the same
# colour as the lines in the smoothed graph below
qq{ with points lt $style pt $style ps 1 }.
qq{ title '$legend' };
foreach my $a (0 .. $num_alphas) {
push @plot,
qq{ '$tmpdir/avgs$a.$graphname.data' using }.
qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }.
qq{ with lines lt $style lw 3 }.
qq{ title ' (DEA a=$dea_alphas->[$a])' };
}
}
print GP "plot ",join(", ", @plot), "\n";
close GP
or warn "gnuplot command exited: $?";
$graph_png_data{$graphname} = readfile("$tmpdir/out.png");
}
}
sub readfile {
open (IN, "<$_[0]") or die "cannot read $_[0]";
binmode IN;
my $str = join('',<IN>);
close IN;
return $str;
}
sub filename_to_legend {
my $f = shift;
$f =~ s/^.*\///;
$f =~ s/LOGS\.all-//;
$f =~ s/\.log\.\S+$//;
return $f;
}