#!/usr/bin/perl
#
# so-display spamfile hamfile
# combineddatasource | so-display
#
# Compute "S/O ratios" for data. S/O stands for Spam/Overall, and denotes the
# probability that a hit for that datum is spam (in the Bayesian style).
#
# With no arguments, the combined data source is read from standard input; it
# should contain lines in the format "X data", where "X" is either "h" or "s"
# for ham or spam, and "data" is what will be collated and reported.
#
# Otherwise "spamfile" and "hamfile" each contain data entries, one per line.
#
# Feb 11 2003 jm
use strict;
use warnings;

# Stand-in for a zero denominator so that the divisions below never fail
# when one class (or a datum) has no hits at all.
use constant EPSILON => 0.000001;

# read_combined($fh, \%spam_count, \%ham_count)
#
# Reads "X data" lines from $fh.  Lines tagged "s" increment the spam count
# for "data"; any other tag increments the ham count.  Lines that do not
# have the "X data" shape are skipped (with a warning unless they are blank)
# instead of being classified by whatever tag an earlier line happened to
# carry.
sub read_combined {
    my ($fh, $spam_count, $ham_count) = @_;

    while (my $line = <$fh>) {
        chomp $line;
        my ($tag, $datum) = $line =~ /\A(\S+)\s+(.*)\z/s;
        if (!defined $tag) {
            warn "so-display: skipping malformed line $.: $line\n"
                if $line =~ /\S/;
            next;
        }
        if ($tag eq 's') {
            $spam_count->{$datum}++;
        }
        else {
            $ham_count->{$datum}++;
        }
    }
    return;
}

# read_counts($path, \%count)
#
# Reads $path one entry per line and increments $count->{entry} for each.
# Dies if the file cannot be opened.
sub read_counts {
    my ($path, $count) = @_;

    open my $fh, '<', $path
        or die "so-display: cannot open '$path': $!\n";
    while (my $line = <$fh>) {
        chomp $line;
        $count->{$line}++;
    }
    close $fh;
    return;
}

# compute_so_ratios(\%spam_count, \%ham_count)
#
# Returns (\%so, $spam_total, $ham_total).  For every datum seen in either
# hash, $so{datum} is spam_fraction / (spam_fraction + ham_fraction), where
# each fraction is the datum's count divided by that class's total.  Both
# input hashes are filled in with explicit 0 entries for data seen only in
# the other class, so callers can index them without undef checks.
sub compute_so_ratios {
    my ($spam_count, $ham_count) = @_;

    my %seen = map { $_ => 1 } keys %$spam_count, keys %$ham_count;

    my $spam_total = 0;
    my $ham_total  = 0;
    foreach my $id (keys %seen) {
        $spam_count->{$id} ||= 0;
        $ham_count->{$id}  ||= 0;
        $spam_total += $spam_count->{$id};
        $ham_total  += $ham_count->{$id};
    }
    $spam_total ||= EPSILON;
    $ham_total  ||= EPSILON;

    my %so;
    foreach my $id (keys %seen) {
        my $ham_frac  = $ham_count->{$id} / $ham_total;
        my $spam_frac = $spam_count->{$id} / $spam_total;
        my $overall   = ($ham_frac + $spam_frac) || EPSILON;
        $so{$id} = $spam_frac / $overall;
    }
    return (\%so, $spam_total, $ham_total);
}

# ordered_ids(\%so, \%spam_count, \%ham_count)
#
# Returns the data keys in report order: ascending S/O ratio, then ascending
# spam count, then descending ham count.  The datum string is the final
# tie-breaker so the output is the same from run to run.
sub ordered_ids {
    my ($so, $spam_count, $ham_count) = @_;

    my @ids = sort {
        $so->{$a} <=> $so->{$b}
            || $spam_count->{$a} <=> $spam_count->{$b}
            || $ham_count->{$b} <=> $ham_count->{$a}
            || $a cmp $b
    } keys %$so;
    return @ids;
}

# print_report(\%so, \%spam_count, \%ham_count, $spam_total, $ham_total)
#
# Prints the header and one row per datum: S/O ratio, percentage of all spam
# hits, percentage of all ham hits, and the datum itself.
sub print_report {
    my ($so, $spam_count, $ham_count, $spam_total, $ham_total) = @_;

    printf ("%6s %6s %6s %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
    foreach my $id (ordered_ids($so, $spam_count, $ham_count)) {
        printf ("%6.3f %6.3f %6.3f %s\n",
            $so->{$id},
            ($spam_count->{$id} * 100) / $spam_total,
            ($ham_count->{$id} * 100) / $ham_total,
            $id);
    }
    return;
}

# main(@argv)
#
# With no arguments, reads the combined format from standard input.  With
# two arguments, reads the spam file then the ham file.  A single argument
# is a usage error (previously the missing ham file was silently ignored).
# Returns the process exit status.
sub main {
    my ($spam_path, $ham_path) = @_;

    my (%spam_count, %ham_count);
    if (!defined $spam_path) {
        read_combined(\*STDIN, \%spam_count, \%ham_count);
    }
    elsif (!defined $ham_path) {
        die "usage: so-display spamfile hamfile\n"
          . "       combineddatasource | so-display\n";
    }
    else {
        read_counts($spam_path, \%spam_count);
        read_counts($ham_path,  \%ham_count);
    }

    my ($so, $spam_total, $ham_total)
        = compute_so_ratios(\%spam_count, \%ham_count);
    print_report($so, \%spam_count, \%ham_count, $spam_total, $ham_total);
    return 0;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised on their own.
exit(main(@ARGV)) unless caller;

1;