blob: 032929e38ef254276214d7e644918f7adf3f02a2 [file] [log] [blame]
#!/usr/bin/perl
use warnings;
use strict;
use File::Basename;
my %opt = ();
$opt{percents} = 1;
# ---------------------------------------------------------------------------
my $date_lut = create_date_lookup_table();
my $pairs = { };
foreach my $f (@ARGV) {
my ($class, $who, $daterev);
if ($f =~ m,LOGS\.\S+?-(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) {
# LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz
($class, $who, $daterev) = ($1, $2, $3);
}
elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.([^\.]+)\.log,) {
# LOGS.all-spam-bb-jhardin.20090714-r793817-n.log.gz
($class, $who, $daterev) = ($1, $2, $3);
}
elsif ($f =~ m,(ham|nonspam|spam)-([^\.]+)\.log,) { # ham-jm.log
($class, $who) = ($1, $2);
}
elsif ($f =~ m,(ham|nonspam|spam),) {
($class) = ($1);
$who = 'unknown';
}
else {
die "cannot parse filename: $f\n";
}
$class = 'ham' if $class eq 'nonspam';
push @{$pairs->{$who}}, [ $f, $class ];
}
my $byuser = {};
my $total_counts = {};
foreach my $who (keys %{$pairs}) {
my $buckets = {};
foreach my $file (@{$pairs->{$who}}) {
my ($f, $class) = @{$file};
load_log($buckets, $total_counts, $f, $class, $who);
}
$byuser->{$who}->{buckets} = $buckets;
}
my $all_tspam = 0;
my $all_tham = 0;
foreach my $who (sort keys %{$byuser}) {
report($byuser->{$who}->{buckets}, $total_counts, $who);
}
final_report($total_counts);
exit;
# ---------------------------------------------------------------------------
sub load_log {
my ($buckets, $total_counts, $f, $class, $who) = @_;
my ($caught, $score, $restofline);
if ($f =~ /\.gz$/) {
open (IN, "gunzip -cd $f|") or die "cannot read $f";
} else {
open (IN, "<$f") or die "cannot read $f";
}
while (<IN>) {
($caught, $score, $restofline) = split(' ', $_, 3);
next unless ($caught =~ /^[Y\.]$/ && $restofline);
next unless ($restofline =~ /(?: |,)time=(\d+)(?:\D|$)/);
my $t = $1;
my $tbucket = time_to_bucket($t);
if (!exists $buckets->{$tbucket}) {
$buckets->{$tbucket} ||= { };
}
if (!exists $buckets->{$tbucket}->{$class}) {
$buckets->{$tbucket}->{$class} = {
count => 0,
range_lo => undef,
range_hi => undef,
};
}
$total_counts->{$class}++;
my $b = $buckets->{$tbucket}->{$class};
$b->{count}++;
update_range_lo(\$b->{range_lo}, $score);
update_range_hi(\$b->{range_hi}, $score);
}
close IN;
}
# ---------------------------------------------------------------------------
# bb-jhardin Spam messages Score range Ham messages Score range
# in 2009-06 39 (0%) [0,29] 0
# in 2009-07 8 (0%) [1,24] 2 (0%) [1,4]
# TOTAL: 73 (0%) [0,29] 2 (0%) [1,4]
sub report {
my ($buckets, $total_counts, $who) = @_;
printf "%-16s %-15s %-14s %-15s %-14s\n",
$who, "Spam messages", "Score range", "Ham messages", "Score range";
my $tspam = 0;
my $tham = 0;
my ($trslo, $trshi, $trhlo, $trhhi);
foreach my $tbucket (sort keys %{$buckets}) {
my $buck = $buckets->{$tbucket};
my $nspam = $buck->{spam}->{count} || 0;
my $nham = $buck->{ham}->{count} || 0;
printf "%-16s %7s %6s %-14s %7s %6s %-14s\n",
" in $tbucket",
$nspam, as_percent($nspam, $total_counts->{spam}),
format_score_range($buck->{spam}->{range_lo}, $buck->{spam}->{range_hi}),
$nham, as_percent($nham, $total_counts->{ham}),
format_score_range($buck->{ham}->{range_lo}, $buck->{ham}->{range_hi});
$tspam += $nspam;
$tham += $nham;
update_range_lo(\$trslo, $buck->{spam}->{range_lo});
update_range_hi(\$trshi, $buck->{spam}->{range_hi});
update_range_lo(\$trhlo, $buck->{ham}->{range_lo});
update_range_hi(\$trhhi, $buck->{ham}->{range_hi});
}
printf "%-16s %7s %6s %-14s %7s %6s %-14s\n",
" TOTAL:",
$tspam, as_percent($tspam, $total_counts->{spam}),
format_score_range($trslo, $trshi),
$tham, as_percent($tham, $total_counts->{ham}),
format_score_range($trhlo, $trhhi);
$all_tspam += $tspam;
$all_tham += $tham;
print "\n";
}
# ---------------------------------------------------------------------------
sub final_report {
my ($total_counts) = @_;
printf "%-16s %7s %6s %-14s %7s %6s %-14s\n",
"OVERALL TOTAL:",
$all_tspam, '', '',
$all_tham, '', '';
}
# ---------------------------------------------------------------------------
use Time::Local;
sub create_date_lookup_table {
my ($sec,$min,$hour,$mday,$cmon,$cyear,$x) = gmtime time;
my @month_starts = ();
my ($year, $mon);
for ($year = $cyear; $year >= 70; $year--) {
for ($mon = 11; $mon >= 0; $mon--) {
next if ($year == $cyear && $mon > $cmon); # in the future
if ($year < $cyear-1 || ($year==$cyear-1 && $mon < $cmon)) {
# just record January 1 for times over a year ago
next unless ($mon == 0);
push @month_starts, [
timegm(0,0,0,1,$mon,$year), $year+1900
];
}
else {
push @month_starts, [
timegm(0,0,0,1,$mon,$year), sprintf("%04d-%02d", $year+1900, $mon+1)
];
}
}
}
return \@month_starts;
}
sub time_to_bucket {
my ($t) = @_;
# could binary-search here, but the win is probably not worth it
foreach my $pair (@{$date_lut}) {
if ($pair->[0] < $t) {
return $pair->[1];
}
}
return "1970";
}
# ---------------------------------------------------------------------------
sub as_percent {
my ($num, $total) = @_;
if (!$opt{percents} || !$num) { return ''; }
if (!$total) { return '(100%)'; }
return sprintf("(%d%%)", (($num||0) *100.0) / $total);
}
# ---------------------------------------------------------------------------
sub format_score_range {
my ($rlo, $rhi) = @_;
if (!defined $rlo && !defined $rhi) { return ''; }
if (!defined $rlo) { $rlo = ''; }
if (!defined $rhi) { $rhi = ''; }
return "[$rlo,$rhi]";
}
# ---------------------------------------------------------------------------
sub update_range_lo {
my ($rloref, $score) = @_;
return unless defined $score;
if (!defined $$rloref || $score < $$rloref) { $$rloref = $score; }
}
# ---------------------------------------------------------------------------
sub update_range_hi {
my ($rhiref, $score) = @_;
return unless defined $score;
if (!defined $$rhiref || $score > $$rhiref) { $$rhiref = $score; }
}