| #!/usr/bin/perl |
| |
| use warnings; |
| use strict; |
| use File::Basename; |
| |
| my %opt = (); |
| $opt{percents} = 1; |
| |
| # --------------------------------------------------------------------------- |
| |
| my $date_lut = create_date_lookup_table(); |
| |
| my $pairs = { }; |
| foreach my $f (@ARGV) { |
| my ($class, $who, $daterev); |
| if ($f =~ m,LOGS\.\S+?-(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) { |
| # LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz |
| ($class, $who, $daterev) = ($1, $2, $3); |
| } |
| elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) { |
| # LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz |
| ($class, $who, $daterev) = ($1, $2, $3); |
| } |
| elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.log,) { # ham-jm.log |
| ($class, $who) = ($1, $2); |
| } |
| elsif ($f =~ m,(ham|nonspam|spam),) { |
| ($class) = ($1); |
| $who = 'unknown'; |
| } |
| else { |
| die "cannot parse filename: $f\n"; |
| } |
| |
| $class = 'ham' if $class eq 'nonspam'; |
| push @{$pairs->{$who}}, [ $f, $class ]; |
| } |
| |
| my $byuser = {}; |
| my $total_counts = {}; |
| foreach my $who (keys %{$pairs}) { |
| my $buckets = {}; |
| foreach my $file (@{$pairs->{$who}}) { |
| my ($f, $class) = @{$file}; |
| load_log($buckets, $total_counts, $f, $class, $who); |
| } |
| $byuser->{$who}->{buckets} = $buckets; |
| } |
| |
| my $all_tspam = 0; |
| my $all_tham = 0; |
| |
| foreach my $who (sort keys %{$byuser}) { |
| report($byuser->{$who}->{buckets}, $total_counts, $who); |
| } |
| |
| final_report($total_counts); |
| exit; |
| |
| # --------------------------------------------------------------------------- |
| |
| sub load_log { |
| my ($buckets, $total_counts, $f, $class, $who) = @_; |
| |
| my ($caught, $score, $restofline); |
| |
| if ($f =~ /\.gz$/) { |
| open (IN, "gunzip -cd $f|") or die "cannot read $f"; |
| } else { |
| open (IN, "<$f") or die "cannot read $f"; |
| } |
| while (<IN>) { |
| ($caught, $score, $restofline) = split(' ', $_, 3); |
| next unless ($caught =~ /^[Y\.]$/ && $restofline); |
| next unless ($restofline =~ /(?: |,)time=(\d+)(?:\D|$)/); |
| my $t = $1; |
| |
| my $tbucket = time_to_bucket($t); |
| if (!exists $buckets->{$tbucket}) { |
| $buckets->{$tbucket} ||= { }; |
| } |
| |
| if (!exists $buckets->{$tbucket}->{$class}) { |
| $buckets->{$tbucket}->{$class} = { |
| count => 0, |
| range_lo => undef, |
| range_hi => undef, |
| }; |
| } |
| |
| $total_counts->{$class}++; |
| my $b = $buckets->{$tbucket}->{$class}; |
| $b->{count}++; |
| update_range_lo(\$b->{range_lo}, $score); |
| update_range_hi(\$b->{range_hi}, $score); |
| } |
| close IN; |
| } |
| |
| # --------------------------------------------------------------------------- |
| |
| # bb-jhardin Spam messages Score range Ham messages Score range |
| # in 2009-06 39 (0%) [0,29] 0 |
| # in 2009-07 8 (0%) [1,24] 2 (0%) [1,4] |
| # TOTAL: 73 (0%) [0,29] 2 (0%) [1,4] |
| |
| sub report { |
| my ($buckets, $total_counts, $who) = @_; |
| |
| |
| printf "%-16s %-15s %-14s %-15s %-14s\n", |
| $who, "Spam messages", "Score range", "Ham messages", "Score range"; |
| |
| my $tspam = 0; |
| my $tham = 0; |
| my ($trslo, $trshi, $trhlo, $trhhi); |
| |
| foreach my $tbucket (sort keys %{$buckets}) { |
| my $buck = $buckets->{$tbucket}; |
| my $nspam = $buck->{spam}->{count} || 0; |
| my $nham = $buck->{ham}->{count} || 0; |
| printf "%-16s %7s %6s %-14s %7s %6s %-14s\n", |
| " in $tbucket", |
| $nspam, as_percent($nspam, $total_counts->{spam}), |
| format_score_range($buck->{spam}->{range_lo}, $buck->{spam}->{range_hi}), |
| $nham, as_percent($nham, $total_counts->{ham}), |
| format_score_range($buck->{ham}->{range_lo}, $buck->{ham}->{range_hi}); |
| |
| $tspam += $nspam; |
| $tham += $nham; |
| update_range_lo(\$trslo, $buck->{spam}->{range_lo}); |
| update_range_hi(\$trshi, $buck->{spam}->{range_hi}); |
| update_range_lo(\$trhlo, $buck->{ham}->{range_lo}); |
| update_range_hi(\$trhhi, $buck->{ham}->{range_hi}); |
| } |
| |
| printf "%-16s %7s %6s %-14s %7s %6s %-14s\n", |
| " TOTAL:", |
| $tspam, as_percent($tspam, $total_counts->{spam}), |
| format_score_range($trslo, $trshi), |
| $tham, as_percent($tham, $total_counts->{ham}), |
| format_score_range($trhlo, $trhhi); |
| |
| $all_tspam += $tspam; |
| $all_tham += $tham; |
| print "\n"; |
| } |
| |
| # --------------------------------------------------------------------------- |
| |
| sub final_report { |
| my ($total_counts) = @_; |
| |
| printf "%-16s %7s %6s %-14s %7s %6s %-14s\n", |
| "OVERALL TOTAL:", |
| $all_tspam, '', '', |
| $all_tham, '', ''; |
| } |
| |
| # --------------------------------------------------------------------------- |
| |
| use Time::Local; |
| sub create_date_lookup_table { |
| |
| my ($sec,$min,$hour,$mday,$cmon,$cyear,$x) = gmtime time; |
| |
| my @month_starts = (); |
| my ($year, $mon); |
| for ($year = $cyear; $year >= 70; $year--) { |
| for ($mon = 11; $mon >= 0; $mon--) { |
| next if ($year == $cyear && $mon > $cmon); # in the future |
| |
| if ($year < $cyear-1 || ($year==$cyear-1 && $mon < $cmon)) { |
| # just record January 1 for times over a year ago |
| next unless ($mon == 0); |
| push @month_starts, [ |
| timegm(0,0,0,1,$mon,$year), $year+1900 |
| ]; |
| } |
| else { |
| push @month_starts, [ |
| timegm(0,0,0,1,$mon,$year), sprintf("%04d-%02d", $year+1900, $mon+1) |
| ]; |
| } |
| } |
| } |
| return \@month_starts; |
| } |
| |
| sub time_to_bucket { |
| my ($t) = @_; |
| # could binary-search here, but the win is probably not worth it |
| foreach my $pair (@{$date_lut}) { |
| if ($pair->[0] < $t) { |
| return $pair->[1]; |
| } |
| } |
| return "1970"; |
| } |
| |
| # --------------------------------------------------------------------------- |
| |
| sub as_percent { |
| my ($num, $total) = @_; |
| if (!$opt{percents} || !$num) { return ''; } |
| if (!$total) { return '(100%)'; } |
| return sprintf("(%d%%)", (($num||0) *100.0) / $total); |
| } |
| |
| # --------------------------------------------------------------------------- |
| |
| sub format_score_range { |
| my ($rlo, $rhi) = @_; |
| if (!defined $rlo && !defined $rhi) { return ''; } |
| if (!defined $rlo) { $rlo = ''; } |
| if (!defined $rhi) { $rhi = ''; } |
| return "[$rlo,$rhi]"; |
| } |
| |
| # --------------------------------------------------------------------------- |
| |
| sub update_range_lo { |
| my ($rloref, $score) = @_; |
| return unless defined $score; |
| if (!defined $$rloref || $score < $$rloref) { $$rloref = $score; } |
| } |
| |
| # --------------------------------------------------------------------------- |
| |
| sub update_range_hi { |
| my ($rhiref, $score) = @_; |
| return unless defined $score; |
| if (!defined $$rhiref || $score > $$rhiref) { $$rhiref = $score; } |
| } |
| |