| #!/usr/bin/perl -w |
| |
| # Remove tokens from the bayes_probs db. Don't remove tokens from the |
| # bayes_toks db, since feeding the learner more messages might change |
| # the state of the token so it wouldn't have been removed; just re-run |
| # this tool after each learning sessions |
| |
| use strict; |
| use Fcntl; |
| |
| BEGIN { @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File); } |
| use AnyDBM_File; |
| |
| # for the DB format... |
| use constant FORMAT_FLAG => 0xc0; # 11000000 |
| use constant ONE_BYTE_FORMAT => 0xc0; # 11000000 |
| use constant TWO_LONGS_FORMAT => 0x00; # 00000000 |
| |
| use constant ONE_BYTE_SSS => 0x38; # 00111000 |
| use constant ONE_BYTE_HHH => 0x07; # 00000111 |
| |
| use vars qw{ |
| %prob_db %toks_db |
| $opt_dbpath $opt_regexp $opt_min_hits |
| $opt_min_prob_strength |
| $opt_help |
| }; |
| |
| sub usage { |
| print " |
| Usage: trim_bayes_db [--dbpath=path] [--regexp=regexp] [--min-hits=int] |
| [--min-prob-strength==float]\n"; |
| exit(1); |
| } # usage() |
| |
| use Getopt::Long; |
| GetOptions("dbpath=s", "regexp=s", "min-hits=i", "min-prob-strength=f", |
| "help"); |
| |
| usage() if ($opt_help); |
| # At least one of the filtering options must be set |
| if (!$opt_regexp && !$opt_min_hits && !$opt_min_prob_strength) { |
| print "At least one of the filtering options must be set\n"; |
| usage(); |
| } |
| |
| my ($MPS1, $MPS2); |
| |
| if ($opt_min_prob_strength) { |
| $MPS1 = 0.5 - $opt_min_prob_strength; |
| $MPS2 = 0.5 + $opt_min_prob_strength; |
| } |
| |
| my $path = $opt_dbpath; |
| $path ||= $ENV{HOME}."/.spamassassin/bayes"; |
| |
| |
| my $toks_name = "${path}_toks"; |
| tie %toks_db, "AnyDBM_File", $toks_name, O_RDONLY, 0600 |
| or die "Cannot open file $toks_name: $!\n"; |
| |
| my $prob_name = "${path}_probs"; |
| tie %prob_db, "AnyDBM_File", $prob_name, O_RDWR, 0666 |
| or die "Cannot open file $prob_name: $!\n"; |
| |
| foreach my $key ( keys(%prob_db) ) { |
| if ($opt_regexp && ($key =~ m/$opt_regexp/o)) { |
| delete $prob_db{$key}; |
| next; |
| } |
| |
| if ($opt_min_hits) { |
| my ($ts, $th) = tok_unpack ($toks_db{$key}); |
| |
| my $hits = ($ts || 0) + ($th || 0); |
| |
| if ($hits < $opt_min_hits) { |
| delete $prob_db{$key}; |
| next; |
| } |
| } # if ($opt_min_hits) |
| |
| if ($opt_min_prob_strength) { |
| my $prob = unpack ('f', $prob_db{$key}); |
| |
| if (($MPS1 < $prob) && ($prob < $MPS2)) { |
| delete $prob_db{$key}; |
| next; |
| } |
| } |
| } # foreach my $key ( keys(%prob_db) ) |
| |
| untie %prob_db; |
| untie %toks_db; |
| |
| if ($AnyDBM_File::ISA[0] eq "GDBM_File") { |
| # GDBM_File::reorganize() can't be perfomed on a AnyDBM_File |
| # tied hash, even if the underlying implementation is GDBM, |
| # so we have to tie it again; bleh. |
| print "Re-tieing db as GDBM_File to reduce db size\n"; |
| tie %prob_db, "GDBM_File", $prob_name, O_RDWR, 0666 |
| or die "Cannot open file $prob_name: $!\n"; |
| |
| GDBM_File::reorganize(tied(%prob_db)); |
| untie %prob_db; |
| } |
| |
| ################################################## |
| |
| sub tok_unpack { |
| my ($packed, $ts, $th) = unpack("CLL", $_[0] || 0); |
| |
| if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) { |
| $ts = ($packed & ONE_BYTE_SSS) >> 3; |
| $th = ($packed & ONE_BYTE_HHH); |
| } |
| # else use $ts and $th we just unpacked. |
| |
| return ($ts, $th); |
| } # tok_unpack() |