| #!/usr/bin/perl -w |
| # |
| # rule-hits-over-time - produce graphs of rule hits over time, using gnuplot |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| use GD; |
| use Statistics::DEA; |
| |
| use strict; |
| use warnings; |
| use Fcntl; |
| use Getopt::Long; |
| use SDBM_File; |
| |
| sub usage { |
| die q{ |
| usage: rule-hits-over-time [options] --rule rulename log1 [log2 ...] |
| |
| --rule=rulename specify rule to map |
| --period=secs specify period (default: 1 day) |
| --ignore_older=days ignore hits older than N days (default: 0 = none) |
| --scale_period=n scale period up to N items of data, 0=no scaling |
| (default: 0) |
| --size_x=pixels width of output graphs, in pixels (def: 800) |
| --size_y=pixels height of ONE of the output graphs, in pixels |
| (default: 400) |
| --cgi CGI output, to stdout with HTTP headers |
| --text text output only |
| }; |
| } |
| |
| our ( $opt_rule, $opt_size_x, $opt_size_y, $opt_text, $opt_cgi, |
| $opt_period, $opt_scale_period, $opt_ignore_older, $opt_debug ); |
| |
| GetOptions( |
| 'rule=s', |
| 'size_x=i', |
| 'size_y=i', |
| 'text', |
| 'cgi', |
| 'scale_period=i', |
| 'ignore_older=i', |
| 'period=i', |
| 'debug' |
| ) or usage(); |
| |
| usage() unless $opt_rule; |
| |
| my $DEBUG_TMPDIR = $opt_debug; # keep the tmpdir around after exiting, for debug |
| # $DEBUG_TMPDIR = 1; |
| |
| # fix PATHs for sucky Solaris compatibility. |
| $ENV{PATH} = "/local/gnuplot-4.0.0/bin:/opt/sfw/bin:".$ENV{PATH}; |
| $ENV{LD_LIBRARY_PATH} .= ":/local/gd-2.0.33/lib"; |
| |
| my $rule_re = qr/[, ]${opt_rule}[, ]/; |
| |
| # my $period = $opt_period || (24 * 60 * 60 * 1); |
| my $period = $opt_period || 3600; |
| |
| my $graph_x = $opt_size_x || 800; |
| my $graph_y = $opt_size_y || 400; |
| |
| my $fname_counter = 1; |
| my %graph_png_data = (); |
| |
| my %allbuckets = (); |
| my %allresults = (); |
| my @allfiles = (); |
| |
| my $graph_times = []; |
| my $graph_data = []; |
| |
| my $this_file_results; |
| my $lastbucket; |
| my $nextbucket; |
| my $seen_y; |
| my $seen_n; |
| |
| my $tmpdir = "/tmp/rulehits.$$"; |
| if ($DEBUG_TMPDIR) { $tmpdir = "/tmp/rulehits.tmp"; system("rm -rf $tmpdir"); } |
| |
| mkdir ($tmpdir) or die "collided on $tmpdir"; |
| |
| my $outdir = "."; |
| if ($opt_cgi) { |
| $outdir = $tmpdir; |
| } |
| |
| my $file_sets = [ ]; # split into ham and spam |
| $file_sets = [ [ 'TITLE:hits in spam' ], [ 'TITLE:hits in ham' ] ]; |
| |
| foreach my $file (@ARGV) { |
| if ($file =~ /\bham\b/) { |
| push @{$file_sets->[1]}, $file; |
| } else { |
| push @{$file_sets->[0]}, $file; |
| } |
| } |
| |
| foreach my $set (@{$file_sets}) { |
| @allfiles = (); |
| %allbuckets = (); |
| %allresults = (); |
| |
| my $settitle = ''; |
| if ($set->[0] =~ /^TITLE:(.*)$/) { |
| $settitle = $1; shift(@{$set}); |
| } |
| |
| create_gp("$opt_rule $settitle"); |
| |
| foreach my $file (@{$set}) { |
| if (!$opt_text) { |
| my $title = $file; |
| $title =~ s/^.*\///; |
| } |
| push (@allfiles, $file); |
| |
| if (1) { |
| # use an on-disk file to avoid massive VM usage for this hash |
| # on huge datasets |
| unlink("$tmpdir/graph.tmp.dir"); |
| unlink("$tmpdir/graph.tmp.pag"); |
| tie (%{$allresults{$file}}, 'SDBM_File', "$tmpdir/graph.tmp", |
| O_RDWR|O_CREAT, 0600) or die "tie failed: $!"; |
| } |
| else { |
| %{$allresults{$file}} = (); |
| } |
| |
| $this_file_results = $allresults{$file}; |
| read_logs($file); |
| |
| $graph_times = []; |
| $graph_data = []; |
| summarise(); |
| } |
| |
| $opt_scale_period and collapse_periods(); |
| |
| plot_gp(); |
| } |
| |
| my $format = "gif"; |
| |
| { |
| my $both = GD::Image->new($graph_x, 15 + ($graph_y * 2)); |
| my $file01 = GD::Image->newFromPngData($graph_png_data{"file01"}, 1); |
| my $file02 = GD::Image->newFromPngData($graph_png_data{"file02"}, 1); |
| |
| if (!$file01 || !$file02) { |
| warn "bad input. leaving graph blank"; |
| } |
| else { |
| $both->copy($file01, 0, 5, 0, 0, $graph_x-1, $graph_y-1); |
| $both->copy($file02, 0, 10 + $graph_y, 0, 0, $graph_x-1, $graph_y-1); |
| } |
| |
| if ($opt_cgi) { |
| use CGI qw(:standard); |
| print header("image/$format"); binmode STDOUT; |
| print STDOUT $both->$format(); |
| } |
| else { |
| open(IMG, ">both.$format") or die $!; binmode IMG; |
| print IMG $both->$format(); |
| close IMG; |
| } |
| |
| $both->gif(); |
| } |
| |
| if (!$DEBUG_TMPDIR) { |
| unlink(<$tmpdir/*.*>); rmdir $tmpdir; |
| } else { |
| system ("ls -l $tmpdir/*.* 1>&2"); |
| } |
| |
| exit; |
| |
| sub summarise { |
| foreach my $bucket (sort keys %allbuckets) { |
| my @cols = (); |
| foreach my $file (@allfiles) { |
| my $res = $allresults{$file}->{$bucket}; |
| my $sy; |
| my $sn; |
| |
| if (!$res) { |
| $sn = $sy = -1; |
| } |
| elsif ($res !~ /^y(\d+)n(\d+)$/) { |
| warn "bad results: $res for $file $bucket"; |
| next; |
| } |
| else { |
| $sy = $1; |
| $sn = $2; |
| } |
| |
| if (!defined $sy && !defined $sn) { |
| $sn = $sy = -1; |
| } elsif (!defined $sy || !defined $sn) { |
| # assert: enforce both < 0, if either is |
| warn "oops? sy=$sy sn=$sn, should be both < 0"; |
| $sn = $sy = -1; |
| } |
| |
| if (($sy+$sn) > 0) { |
| push @cols, ($sy / ($sy + $sn)) * 100.0; |
| } |
| else { |
| push @cols, -1; |
| } |
| } |
| |
| if ($opt_text) { |
| print $bucket," ".join(' ',@cols)."\n"; |
| } |
| else { |
| push (@{$graph_times}, $bucket); |
| push (@{$graph_data}, \@cols); |
| } |
| } |
| } |
| |
| |
| sub collapse_periods { |
| while (scalar @{$graph_data} > $opt_scale_period) { |
| my $num_files = (scalar @allfiles - 1); |
| my $newtimes = [ ]; |
| my $newdata = [ ]; |
| my $i; |
| for ($i = 0; $i < (scalar @{$graph_data}); $i += 2) { |
| $newtimes->[$i >> 1] = $graph_times->[$i]; |
| foreach my $j (0 .. $num_files) |
| { |
| my $v1 = $graph_data->[$i]->[$j]; |
| my $v2 = $graph_data->[$i+1]->[$j]; |
| if (!defined $v2) { $v2 = -1; } |
| |
| if ($v1 >= 0.0 && $v2 >= 0.0) { |
| # both are valid. take their mean |
| $v1 = ($v1 + $v2) / 2.0; |
| } |
| elsif ($v2 >= 0.0) { |
| # only one is valid; use it and ignore the invalid one |
| $v1 = $v2; |
| } |
| else { |
| # we're good, v1 is the valid one anyway |
| } |
| |
| $newdata->[$i >> 1]->[$j] = $v1; |
| } |
| } |
| @{$graph_times} = @{$newtimes}; |
| @{$graph_data} = @{$newdata}; |
| $period *= 2; |
| } |
| } |
| |
| |
| sub read_logs { |
| my $file = shift; |
| |
| # limit to a range from [4 years ago, today] to avoid OOM craziness |
| # from corrupt input |
| # |
| if ($opt_ignore_older <= 0) { |
| $opt_ignore_older = 365 * 4; |
| } |
| my $limit_hi = time; |
| my $limit_lo = $limit_hi - (24*60*60*$opt_ignore_older); |
| |
| $lastbucket = undef; |
| $nextbucket = undef; |
| $seen_y = 0; |
| $seen_n = 0; |
| |
| if ($file =~ /\.gz$/) { |
| open (IN, "pigz -cd '$file'|") or die "cannot gunzip $file"; |
| } else { |
| open (IN, "<$file") or die "cannot read $file"; |
| } |
| |
| while (<IN>) { |
| next if /^#/; |
| |
| my $t; |
| /\btime=(\d+),/ and $t = $1; |
| next unless $t; |
| |
| if ($t < $limit_lo || $t > $limit_hi) { |
| warn "ignoring out-of-range time $t (limit: $limit_lo < t < $limit_hi)"; |
| next; |
| } |
| |
| my $found = ($_ =~ $rule_re); |
| |
| if (!defined $lastbucket) { |
| $lastbucket = $t - ($t % $period); |
| $nextbucket = $lastbucket + $period; |
| } |
| |
| if ($t < $nextbucket) { |
| if ($found) { |
| $seen_y++; |
| } else { |
| $seen_n++; |
| } |
| } |
| else { |
| while ($t >= $nextbucket) { |
| completeline(); |
| $lastbucket = $nextbucket; |
| $nextbucket += $period; |
| } |
| } |
| } |
| close IN; |
| completeline(); |
| } |
| |
| sub completeline { |
| return unless ($lastbucket); |
| $allbuckets{$lastbucket} = undef; |
| $this_file_results->{$lastbucket} = "y".$seen_y."n".$seen_n; |
| $seen_y = 0; |
| $seen_n = 0; |
| } |
| |
| sub create_gp { |
| my $title = shift; |
| |
| my $mailtype = 'mail'; |
| if ($title =~ /\b(ham|spam)\b/) { $mailtype = $1; } |
| my $y_label = "\%age of $mailtype in period"; |
| |
| $SIG{PIPE} = sub { |
| die "unexpected SIGPIPE received!"; |
| }; |
| |
| open (GP, "| gnuplot - > $tmpdir/gp.log 2>&1") or die "cannot run gnuplot"; |
| |
| # eye-candy |
| my $niceperiod = "$period secs"; |
| if ($period % (24*60*60) == 0) { |
| $niceperiod = ($period / (24*60*60))." days"; |
| } |
| |
| # (NOTE: -1% hitrate means no data for that time period)' |
| print GP qq{ |
| |
| set terminal png small \\ |
| interlace size $graph_x,$graph_y \\ |
| xffffff x444444 x33cc00 \\ |
| xff3300 x0000cc x99cc00 xff9900 \\ |
| xcccc00 x333333 x999999 x9500d3 |
| |
| set out '$tmpdir/out.png' |
| |
| set grid back xtics ytics |
| |
| set xlabel 'Time, in blocks of $niceperiod' |
| set xdata time |
| set timefmt "%Y-%m-%d-%H" |
| set format x "%04Y%02m%02d" |
| |
| set ylabel '$y_label' |
| set yrange [0:*] |
| |
| set title "$title" |
| set key left top Left nobox |
| |
| }; |
| } |
| |
| sub fmt_time_t { |
| my $tt = shift; |
| use POSIX qw(strftime); |
| return strftime "%Y-%m-%d-%H", gmtime($tt); |
| } |
| |
| sub plot_gp { |
| my $num_files = (scalar @allfiles - 1); |
| my $num_datapoints = (scalar @{$graph_data} - 1); |
| |
| # specify a number of alphas for Statistics::DEA. Right now, |
| # the graph is pretty unreadable with more than one. |
| my $dea_alphas = [ 0.9 ]; |
| my $num_alphas = (scalar @{$dea_alphas} - 1); |
| |
| my $times = [ ]; |
| my $avgs = [ ]; |
| |
| my $graphname = sprintf("file%02d", $fname_counter++); |
| |
| if (!$opt_text) |
| { |
| if (@{$graph_data}) { |
| my $deas = (); |
| foreach my $i (0 .. $num_files) { |
| foreach my $a (0 .. $num_alphas) { |
| $deas->[$a]->[$i] = |
| Statistics::DEA->new($dea_alphas->[$a], $period * 3); |
| } |
| } |
| |
| foreach my $j (0 .. $num_datapoints) { |
| my (@datas) = @{$graph_data->[$j]}; |
| $times->[$j] = fmt_time_t($graph_times->[$j]); |
| |
| foreach my $i (0 .. $num_files) { |
| my $d = $datas[$i]; |
| |
| foreach my $a (0 .. $num_alphas) { |
| if ($d >= 0) { |
| $deas->[$a]->[$i]->update($d, $j); |
| } |
| |
| my $avg; |
| eval { |
| # this can die if it hasn't received enough data! |
| # so trap with an eval. |
| $avg = $deas->[$a]->[$i]->average(); |
| }; |
| $avgs->[$a]->[$j]->[$i] = (defined $avg) ? $avg : -1; |
| } |
| } |
| } |
| } |
| |
| # write the data plotfile |
| open (DATA, ">$tmpdir/plot.$graphname.data") or die; |
| if (@{$graph_data}) |
| { |
| foreach my $j (0 .. $num_datapoints) { |
| print DATA $times->[$j]," ",join(' ', @{$graph_data->[$j]}),"\n"; |
| } |
| } else { |
| # a fake datapoint so gnuplot doesn't puke on us |
| print DATA fmt_time_t(0)," 0 0\n"; |
| } |
| close DATA or die; |
| |
| |
| # write the avgs plotfiles |
| foreach my $a (0 .. $num_alphas) { |
| open (DATA, ">$tmpdir/avgs$a.$graphname.data") or die; |
| if (@{$graph_data}) { |
| foreach my $j (0 .. $num_datapoints) { |
| print DATA $times->[$j]," ", |
| defined $avgs->[$a]->[$j] ? join ' ', @{$avgs->[$a]->[$j]} : '0', |
| "\n"; |
| } |
| } else { |
| # a fake datapoint so gnuplot doesn't puke on us |
| print DATA fmt_time_t(0)," 0 0\n"; |
| } |
| close DATA or die; |
| } |
| |
| |
| # and the commands file |
| my @plot = (); |
| foreach my $i (0 .. $num_files) { |
| my $legend = filename_to_legend ($allfiles[$i]); |
| my $style = $i+1; |
| my $col = $i+2; |
| |
| push @plot, |
| qq{ '$tmpdir/plot.$graphname.data' using }. |
| qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }. |
| # note: using "lt $style" gives us points in the same |
| # colour as the lines in the smoothed graph below |
| qq{ with points lt $style pt $style ps 1 }. |
| qq{ title '$legend' }; |
| |
| foreach my $a (0 .. $num_alphas) { |
| push @plot, |
| qq{ '$tmpdir/avgs$a.$graphname.data' using }. |
| qq{ 1:(\$$col >= 0 ? \$$col : 1/0) }. |
| qq{ with lines lt $style lw 3 }. |
| qq{ title ' (DEA a=$dea_alphas->[$a])' }; |
| } |
| } |
| |
| print GP "plot ",join(", ", @plot), "\n"; |
| close GP |
| or warn "gnuplot command exited: $?"; |
| |
| $graph_png_data{$graphname} = readfile("$tmpdir/out.png"); |
| } |
| } |
| |
| sub readfile { |
| open (IN, "<$_[0]") or die "cannot read $_[0]"; |
| binmode IN; |
| my $str = join('',<IN>); |
| close IN; |
| return $str; |
| } |
| |
| sub filename_to_legend { |
| my $f = shift; |
| |
| $f =~ s/^.*\///; |
| $f =~ s/LOGS\.all-//; |
| $f =~ s/\.log\.\S+$//; |
| return $f; |
| } |