blob: 22c807c74350ad68a60e92bfab477abb8ce8aa78 [file] [log] [blame]
#!/usr/bin/perl -w
# Remove tokens from the bayes_probs db. Don't remove tokens from the
# bayes_toks db, since feeding the learner more messages might change
# the state of the token so it wouldn't have been removed; just re-run
# this tool after each learning sessions
use strict;
use Fcntl;
BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); }
use AnyDBM_File;
# for the DB format...
use constant FORMAT_FLAG => 0xc0; # 11000000
use constant ONE_BYTE_FORMAT => 0xc0; # 11000000
use constant TWO_LONGS_FORMAT => 0x00; # 00000000
use constant ONE_BYTE_SSS => 0x38; # 00111000
use constant ONE_BYTE_HHH => 0x07; # 00000111
use vars qw{
%prob_db %toks_db
$opt_dbpath $opt_regexp $opt_min_hits
$opt_min_prob_strength
$opt_help
};
sub usage {
print "
Usage: trim_bayes_db [--dbpath=path] [--regexp=regexp] [--min-hits=int]
[--min-prob-strength==float]\n";
exit(1);
} # usage()
use Getopt::Long;
GetOptions("dbpath=s", "regexp=s", "min-hits=i", "min-prob-strength=f",
"help");
usage() if ($opt_help);
# At least one of the filtering options must be set
if (!$opt_regexp && !$opt_min_hits && !$opt_min_prob_strength) {
print "At least one of the filtering options must be set\n";
usage();
}
my ($MPS1, $MPS2);
if ($opt_min_prob_strength) {
$MPS1 = 0.5 - $opt_min_prob_strength;
$MPS2 = 0.5 + $opt_min_prob_strength;
}
my $path = $opt_dbpath;
$path ||= $ENV{HOME}."/.spamassassin/bayes";
my $toks_name = "${path}_toks";
tie %toks_db, "AnyDBM_File", $toks_name, O_RDONLY, 0600
or die "Cannot open file $toks_name: $!\n";
my $prob_name = "${path}_probs";
tie %prob_db, "AnyDBM_File", $prob_name, O_RDWR, 0666
or die "Cannot open file $prob_name: $!\n";
foreach my $key ( keys(%prob_db) ) {
if ($opt_regexp && ($key =~ m/$opt_regexp/o)) {
delete $prob_db{$key};
next;
}
if ($opt_min_hits) {
my ($ts, $th) = tok_unpack ($toks_db{$key});
my $hits = ($ts || 0) + ($th || 0);
if ($hits < $opt_min_hits) {
delete $prob_db{$key};
next;
}
} # if ($opt_min_hits)
if ($opt_min_prob_strength) {
my $prob = unpack ('f', $prob_db{$key});
if (($MPS1 < $prob) && ($prob < $MPS2)) {
delete $prob_db{$key};
next;
}
}
} # foreach my $key ( keys(%prob_db) )
untie %prob_db;
untie %toks_db;
if ($AnyDBM_File::ISA[0] eq "GDBM_File") {
# GDBM_File::reorganize() can't be perfomed on a AnyDBM_File
# tied hash, even if the underlying implementation is GDBM,
# so we have to tie it again; bleh.
print "Re-tieing db as GDBM_File to reduce db size\n";
tie %prob_db, "GDBM_File", $prob_name, O_RDWR, 0666
or die "Cannot open file $prob_name: $!\n";
GDBM_File::reorganize(tied(%prob_db));
untie %prob_db;
}
##################################################
sub tok_unpack {
my ($packed, $ts, $th) = unpack("CLL", $_[0] || 0);
if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
$ts = ($packed & ONE_BYTE_SSS) >> 3;
$th = ($packed & ONE_BYTE_HHH);
}
# else use $ts and $th we just unpacked.
return ($ts, $th);
} # tok_unpack()