| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| package Mail::SpamAssassin::BayesStore::DBM; |
| |
| use strict; |
| use warnings; |
| # use bytes; |
| use re 'taint'; |
| |
| use Fcntl; |
| use Errno qw(EBADF); |
| use File::Basename; |
| use File::Spec; |
| use File::Path; |
| use Digest::SHA qw(sha1); |
| |
| use Mail::SpamAssassin; |
| use Mail::SpamAssassin::Util qw(untaint_var am_running_on_windows compile_regexp); |
| use Mail::SpamAssassin::BayesStore; |
| use Mail::SpamAssassin::Logger; |
| |
| use constant MAGIC_RE => qr/^\015\001\007\011\003/; |
| |
| our ( @DBNAMES, |
| $NSPAM_MAGIC_TOKEN, $NHAM_MAGIC_TOKEN, $LAST_EXPIRE_MAGIC_TOKEN, $LAST_JOURNAL_SYNC_MAGIC_TOKEN, |
| $NTOKENS_MAGIC_TOKEN, $OLDEST_TOKEN_AGE_MAGIC_TOKEN, $LAST_EXPIRE_REDUCE_MAGIC_TOKEN, |
| $RUNNING_EXPIRE_MAGIC_TOKEN, $DB_VERSION_MAGIC_TOKEN, $LAST_ATIME_DELTA_MAGIC_TOKEN, |
| $NEWEST_TOKEN_AGE_MAGIC_TOKEN |
| ); |
| |
| our @ISA = qw( Mail::SpamAssassin::BayesStore ); |
| |
| # db layout (quoting Matt): |
| # |
| # > need five db files though to make it real fast: |
| # [probs] 1. ngood and nbad (two entries, so could be a flat file rather |
| # than a db file). (now 2 entries in db_toks) |
| # [toks] 2. good token -> number seen |
| # [toks] 3. bad token -> number seen (both are packed into 1 entry in 1 db) |
| # [probs] 4. Consolidated good token -> probability |
| # [probs] 5. Consolidated bad token -> probability |
| # > As you add new mails, you update the entry in 2 or 3, then regenerate |
| # > the entry for that token in 4 or 5. |
| # > Then as you test a new mail, you just need to pull the probability |
| # > direct from 4 and 5, and generate the overall probability. A simple and |
| # > very fast operation. |
| # |
| # jm: we use probs as overall probability. <0.5 = ham, >0.5 = spam |
| # |
| # update: probs is no longer maintained as a db, to keep on-disk and in-core |
| # usage down. |
| # |
| # also, added a new one to support forgetting, auto-learning, and |
| # auto-forgetting for refiled mails: |
| # [seen] 6. a list of Message-IDs of messages already learnt from. values |
| # are 's' for learnt-as-spam, 'h' for learnt-as-ham. |
| # |
| # and another, called [scancount] to model the scan-count for expiry. |
| # This is not a database. Instead it increases by one byte for each |
| # message scanned (note: scanned, not learned). |
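#
# On disk, with a bayes_path of e.g. ~/.spamassassin/bayes, this module
# therefore works with files along these lines (a sketch; the exact names
# depend on the DB_File build, see DB_EXTENSIONS below):
#   bayes_toks      - token counts plus the magic bookkeeping entries
#   bayes_seen      - Message-ID -> 's'/'h' for already-learned mail
#   bayes_journal   - plain-text deferred updates, synced periodically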
| |
| @DBNAMES = qw(toks seen); |
| |
| # These are the magic tokens we use to track stuff in the DB. |
| # The format is '^M^A^G^I^C' followed by any string you want. |
| # None of the control chars will be in a real token. |
| $DB_VERSION_MAGIC_TOKEN = "\015\001\007\011\003DBVERSION"; |
| $LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA"; |
| $LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; |
| $LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE"; |
| $LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC"; |
| $NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE"; |
| $NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; |
| $NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; |
| $NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; |
| $OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; |
| $RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE"; |
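
# For illustration only: each of the keys above begins with the five control
# characters matched by MAGIC_RE, so code walking db_toks can skip them with
#   next if ($tok =~ MAGIC_RE);   # as done throughout this module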
| |
| sub HAS_DBM_MODULE { |
| my ($self) = @_; |
| if (exists($self->{has_dbm_module})) { |
| return $self->{has_dbm_module}; |
| } |
| $self->{has_dbm_module} = eval { require DB_File; }; |
| } |
| |
| sub DBM_MODULE { |
| return "DB_File"; |
| } |
| |
| # Possible file extensions used by the kinds of database files DB_File |
| # might create. We need these so we can create a new file and rename |
| # it into place. |
| sub DB_EXTENSIONS { |
| return ('', '.db'); |
| } |
| |
| ########################################################################### |
| |
| sub new { |
| my $class = shift; |
| $class = ref($class) || $class; |
| |
| my $self = $class->SUPER::new(@_); |
| |
| $self->{supported_db_version} = 3; |
| |
| $self->{already_tied} = 0; |
| $self->{is_locked} = 0; |
| $self->{string_to_journal} = ''; |
| |
| $self; |
| } |
| |
| ########################################################################### |
| |
| sub tie_db_readonly { |
| my ($self) = @_; |
| |
| if (!$self->HAS_DBM_MODULE) { |
| dbg("bayes: %s module not installed, cannot use bayes", $self->DBM_MODULE); |
| return 0; |
| } |
| |
| # return if we've already tied to the db's, using the same mode |
| # (locked/unlocked) as before. |
| return 1 if ($self->{already_tied} && $self->{is_locked} == 0); |
| |
| my $main = $self->{bayes}->{main}; |
| if (!defined($main->{conf}->{bayes_path})) { |
| dbg("bayes: bayes_path not defined"); |
| return 0; |
| } |
| |
| $self->read_db_configs(); |
| |
| my $path = $main->sed_path($main->{conf}->{bayes_path}); |
| |
| my $found = 0; |
| for my $ext ($self->DB_EXTENSIONS) { |
| if (-f $path.'_toks'.$ext) { |
| $found = 1; |
| last; |
| } |
| } |
| |
| if (!$found) { |
| dbg("bayes: no dbs present, cannot tie DB R/O: %s", $path.'_toks'); |
| return 0; |
| } |
| |
| foreach my $dbname (@DBNAMES) { |
| my $name = $path.'_'.$dbname; |
| my $db_var = 'db_'.$dbname; |
| dbg("bayes: tie-ing to DB file R/O $name"); |
| |
| # Bug 6901, [rt.cpan.org #83060] |
| # DB_File: Repeated tie to the same hash with no untie causes corruption |
| untie %{$self->{$db_var}}; # has no effect if the variable is not tied |
| |
| if (!tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDONLY, |
| (oct($main->{conf}->{bayes_file_mode}) & 0666)) |
| { |
| # bug 2975: it's acceptable for the db_seen to not be present, |
| # to allow it to be recycled. if that's the case, just create |
| # a new, empty one. we don't need to lock it, since we won't |
| # be writing to it; let the R/W api deal with that case. |
| |
| if ($dbname eq 'seen') { |
| # Bug 6901, [rt.cpan.org #83060] |
| untie %{$self->{$db_var}}; # has no effect if the variable is not tied |
| tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT, |
| (oct($main->{conf}->{bayes_file_mode}) & 0666) |
| or goto failed_to_tie; |
| } |
| else { |
| goto failed_to_tie; |
| } |
| } |
| } |
| |
| $self->{db_version} = ($self->get_storage_variables())[6]; |
| dbg("bayes: found bayes db version %s", $self->{db_version}); |
| |
| # If the DB version is one we don't understand, abort! |
| if ($self->_check_db_version() != 0) { |
| warn("bayes: bayes db version ".$self->{db_version}." is not able to be used, aborting!"); |
| $self->untie_db(); |
| return 0; |
| } |
| |
| $self->{already_tied} = 1; |
| return 1; |
| |
| failed_to_tie: |
| warn "bayes: cannot open bayes databases ${path}_* R/O: tie failed: $!\n"; |
| foreach my $dbname (@DBNAMES) { |
| my $db_var = 'db_'.$dbname; |
| next unless exists $self->{$db_var}; |
| dbg("bayes: untie-ing DB file $dbname"); |
| untie %{$self->{$db_var}}; |
| } |
| |
| return 0; |
| } |
| |
| # tie() to the databases, read-write and locked. Any callers of |
| # this should ensure they call untie_db() afterwards! |
| # |
| sub tie_db_writable { |
| my ($self) = @_; |
| |
| if (!$self->HAS_DBM_MODULE) { |
| dbg("bayes: %s module not installed, cannot use bayes", $self->DBM_MODULE); |
| return 0; |
| } |
| |
| # Useful shortcut ... |
| my $main = $self->{bayes}->{main}; |
| |
| # if we've already tied the db's using the same mode |
| # (locked/unlocked) as we want now, freshen the lock and return. |
| if ($self->{already_tied} && $self->{is_locked} == 1) { |
| $main->{locker}->refresh_lock($self->{locked_file}); |
| return 1; |
| } |
| |
| if (!defined($main->{conf}->{bayes_path})) { |
| dbg("bayes: bayes_path not defined"); |
| return 0; |
| } |
| |
| $self->read_db_configs(); |
| |
| my $path = $main->sed_path($main->{conf}->{bayes_path}); |
| |
| my $found = 0; |
| for my $ext ($self->DB_EXTENSIONS) { |
| if (-f $path.'_toks'.$ext) { |
| $found = 1; |
| last; |
| } |
| } |
| |
| my $parentdir = dirname($path); |
| if (!-d $parentdir) { |
| # run in an eval(); if mkpath has no perms, it calls die() |
| eval { |
| mkpath($parentdir, 0, (oct($main->{conf}->{bayes_file_mode}) & 0777)); |
| }; |
| } |
| |
| my $tout; |
| if ($main->{learn_wait_for_lock}) { |
| $tout = 300; # TODO: Dan to write better lock code |
| } else { |
| $tout = 10; |
| } |
| if ($main->{locker}->safe_lock($path, $tout, $main->{conf}->{bayes_file_mode})) |
| { |
| $self->{locked_file} = $path; |
| $self->{is_locked} = 1; |
| } else { |
| warn "bayes: cannot open bayes databases ${path}_* R/W: lock failed: $!\n"; |
| return 0; |
| } |
| |
| my $umask = umask 0; |
| foreach my $dbname (@DBNAMES) { |
| my $name = $path.'_'.$dbname; |
| my $db_var = 'db_'.$dbname; |
| dbg("bayes: tie-ing to DB file R/W $name"); |
| |
| ($self->DBM_MODULE eq 'DB_File') and |
| Mail::SpamAssassin::Util::avoid_db_file_locking_bug ($name); |
| |
| # Bug 6901, [rt.cpan.org #83060] |
| untie %{$self->{$db_var}}; # has no effect if the variable is not tied |
| tie %{$self->{$db_var}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT, |
| (oct($main->{conf}->{bayes_file_mode}) & 0666) |
| or goto failed_to_tie; |
| } |
| umask $umask; |
| |
| # set our cache to what version DB we're using |
| $self->{db_version} = ($self->get_storage_variables())[6]; |
| # don't bother printing this unless found since it would be bogus anyway |
| dbg("bayes: found bayes db version %s", $self->{db_version}) if $found; |
| |
  # Figure out if we can read the current DB and whether we need to do a
  # DB version upgrade, and do the upgrade if necessary.  If either step
  # has a problem, fail immediately.
| # |
| if ($found && !$self->_upgrade_db()) { |
| $self->untie_db(); |
| return 0; |
| } |
| elsif (!$found) { # new DB, make sure we know that ... |
| $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION; |
| $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ... |
| dbg("bayes: new db, set db version %s and 0 tokens", $self->{db_version}); |
| } |
| |
| $self->{already_tied} = 1; |
| return 1; |
| |
| failed_to_tie: |
| my $err = $!; |
| umask $umask; |
| |
| foreach my $dbname (@DBNAMES) { |
| my $db_var = 'db_'.$dbname; |
| next unless exists $self->{$db_var}; |
| dbg("bayes: untie-ing DB file $dbname"); |
| untie %{$self->{$db_var}}; |
| } |
| |
| if ($self->{is_locked}) { |
| $self->{bayes}->{main}->{locker}->safe_unlock($self->{locked_file}); |
| $self->{is_locked} = 0; |
| } |
| warn "bayes: cannot open bayes databases ${path}_* R/W: tie failed: $err\n"; |
| return 0; |
| } |
| |
| # Do we understand how to deal with this DB version? |
| sub _check_db_version { |
| my ($self) = @_; |
| |
| # return -1 if older, 0 if current, 1 if newer |
| return $self->{db_version} <=> $self->DB_VERSION; |
| } |
| |
| # Check to see if we need to upgrade the DB, and do so if necessary |
| sub _upgrade_db { |
| my ($self) = @_; |
| |
| my $verschk = $self->_check_db_version(); |
| my $res = 0; # used later on for tie() checks |
| my $umask; # used later for umask modifications |
| |
| # If the DB is the latest version, no problem. |
| return 1 if ($verschk == 0); |
| |
  # If the DB is a newer version than we know what to do with ... abort!
| if ($verschk == 1) { |
| warn("bayes: bayes db version ".$self->{db_version}." is newer than we understand, aborting!"); |
| return 0; |
| } |
| |
| # If the current DB version is lower than the new version, upgrade! |
| # Do conversions in order so we can go 1 -> 3, make sure to update |
| # $self->{db_version} along the way |
| |
| dbg("bayes: detected bayes db format %s, upgrading", $self->{db_version}); |
| |
| # since DB_File will not shrink a database (!!), we need to *create* |
| # a new one instead. |
| my $main = $self->{bayes}->{main}; |
| my $path = $main->sed_path($main->{conf}->{bayes_path}); |
| my $name = $path.'_toks'; |
| |
  # an older version's journal file is likely not in the same format as the new one, so remove it.
| my $jpath = $self->_get_journal_filename(); |
| if (-f $jpath) { |
| dbg("bayes: old journal file found, removing"); |
| warn "bayes: couldn't remove $jpath: $!" if (!unlink $jpath); |
| } |
| |
| if ($self->{db_version} < 2) { |
| dbg("bayes: upgrading database format from v%s to v2", $self->{db_version}); |
| $self->set_running_expire_tok(); |
| |
| my ($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN); |
| my ($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN); |
| |
| # Magic tokens for version 0, defined as '**[A-Z]+' |
| if ($self->{db_version} == 0) { |
| $DB_NSPAM_MAGIC_TOKEN = '**NSPAM'; |
| $DB_NHAM_MAGIC_TOKEN = '**NHAM'; |
| $DB_NTOKENS_MAGIC_TOKEN = '**NTOKENS'; |
| #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE'; |
| #$DB_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE'; |
| #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE'; |
| #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = '**RUNNINGEXPIRE'; |
| } |
| else { |
| $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; |
| $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; |
| $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; |
| #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; |
| #$DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; |
| #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE"; |
| #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE"; |
| } |
| |
| # remember when we started ... |
| my $started = time; |
| my $newatime = $started; |
| |
| # use O_EXCL to avoid races (bonus paranoia, since we should be locked |
| # anyway) |
| my %new_toks; |
| $umask = umask 0; |
| |
| $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", |
| O_RDWR|O_CREAT|O_EXCL, |
| (oct($main->{conf}->{bayes_file_mode}) & 0666); |
| umask $umask; |
| return 0 unless $res; |
| undef $res; |
| |
| # add the magic tokens to the new db. |
| $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN}; |
| $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN}; |
| $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN}; |
| $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file |
| $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime; |
| $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime; |
| $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime; |
| $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime; |
| $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0; |
| $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0; |
| |
| # deal with the data tokens |
| my ($tok, $packed); |
| my $count = 0; |
| while (($tok, $packed) = each %{$self->{db_toks}}) { |
| next if ($tok =~ /^(?:\*\*[A-Z]+$|\015\001\007\011\003)/); # skip magic tokens |
| |
| my ($ts, $th, $atime) = $self->tok_unpack($packed); |
| $new_toks{$tok} = $self->tok_pack($ts, $th, $newatime); |
| |
| # Refresh the lock every so often... |
| if (($count++ % 1000) == 0) { |
| $self->set_running_expire_tok(); |
| } |
| } |
| |
| |
| # now untie so we can do renames |
| untie %{$self->{db_toks}}; |
| untie %new_toks; |
| |
| # This is the critical phase (moving files around), so don't allow |
| # it to be interrupted. |
| local $SIG{'INT'} = 'IGNORE'; |
| local $SIG{'TERM'} = 'IGNORE'; |
| local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows(); |
| |
| # older versions used scancount, so kill the stupid little file ... |
| my $msgc = $path.'_msgcount'; |
| if (-f $msgc) { |
| dbg("bayes: old msgcount file found, removing"); |
| if (!unlink $msgc) { |
| warn "bayes: couldn't remove $msgc: $!"; |
| } |
| } |
| |
| # now rename in the new one. Try several extensions |
| for my $ext ($self->DB_EXTENSIONS) { |
| my $newf = $name.'.new'.$ext; |
| my $oldf = $name.$ext; |
| next unless (-f $newf); |
| if (!rename ($newf, $oldf)) { |
| warn "bayes: rename $newf to $oldf failed: $!\n"; |
| return 0; |
| } |
| } |
| |
| # re-tie to the new db in read-write mode ... |
| $umask = umask 0; |
| # Bug 6901, [rt.cpan.org #83060] |
| untie %{$self->{db_toks}}; # has no effect if the variable is not tied |
| $res = tie %{$self->{db_toks}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT, |
| (oct($main->{conf}->{bayes_file_mode}) & 0666); |
| umask $umask; |
| return 0 unless $res; |
| undef $res; |
| |
| dbg("bayes: upgraded database format from v%s to v2 in %d seconds", |
| $self->{db_version}, time - $started); |
| $self->{db_version} = 2; # need this for other functions which check |
| } |
| |
| # Version 3 of the database converts all existing tokens to SHA1 hashes |
| if ($self->{db_version} == 2) { |
| dbg("bayes: upgrading database format from v%s to v3", $self->{db_version}); |
| $self->set_running_expire_tok(); |
| |
| my $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; |
| my $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; |
| my $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; |
| my $DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; |
| my $DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; |
| my $DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE"; |
| my $DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC"; |
| my $DB_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA"; |
| my $DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE"; |
| |
| # remember when we started ... |
| my $started = time; |
| |
| # use O_EXCL to avoid races (bonus paranoia, since we should be locked |
| # anyway) |
| my %new_toks; |
| $umask = umask 0; |
| $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", O_RDWR|O_CREAT|O_EXCL, |
| (oct($main->{conf}->{bayes_file_mode}) & 0666); |
| umask $umask; |
| return 0 unless $res; |
| undef $res; |
| |
| # add the magic tokens to the new db. |
| $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN}; |
| $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN}; |
| $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN}; |
| $new_toks{$DB_VERSION_MAGIC_TOKEN} = 3; # we're now a DB version 3 file |
| $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN}; |
| $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_EXPIRE_MAGIC_TOKEN}; |
| $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN}; |
| $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN}; |
| $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_ATIME_DELTA_MAGIC_TOKEN}; |
| $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} =$self->{db_toks}->{$DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN}; |
| |
| # deal with the data tokens |
| my $count = 0; |
| while (my ($tok, $packed) = each %{$self->{db_toks}}) { |
| next if ($tok =~ /^\015\001\007\011\003/); # skip magic tokens |
| my $tok_hash = substr(sha1($tok), -5); |
| $new_toks{$tok_hash} = $packed; |
| |
| # Refresh the lock every so often... |
| if (($count++ % 1000) == 0) { |
| $self->set_running_expire_tok(); |
| } |
| } |
| |
| # now untie so we can do renames |
| untie %{$self->{db_toks}}; |
| untie %new_toks; |
| |
| # This is the critical phase (moving files around), so don't allow |
| # it to be interrupted. |
| local $SIG{'INT'} = 'IGNORE'; |
| local $SIG{'TERM'} = 'IGNORE'; |
| local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows(); |
| |
| # now rename in the new one. Try several extensions |
| for my $ext ($self->DB_EXTENSIONS) { |
| my $newf = $name.'.new'.$ext; |
| my $oldf = $name.$ext; |
| next unless (-f $newf); |
| if (!rename($newf, $oldf)) { |
| warn "bayes: rename $newf to $oldf failed: $!\n"; |
| return 0; |
| } |
| } |
| |
| # re-tie to the new db in read-write mode ... |
| $umask = umask 0; |
| # Bug 6901, [rt.cpan.org #83060] |
| untie %{$self->{db_toks}}; # has no effect if the variable is not tied |
| $res = tie %{$self->{db_toks}}, $self->DBM_MODULE, $name, O_RDWR|O_CREAT, |
| (oct ($main->{conf}->{bayes_file_mode}) & 0666); |
| umask $umask; |
| return 0 unless $res; |
| undef $res; |
| |
| dbg("bayes: upgraded database format from v%s to v3 in %d seconds", |
| $self->{db_version}, time - $started); |
| |
| $self->{db_version} = 3; # need this for other functions which check |
| } |
| |
| # if ($self->{db_version} == 3) { |
| # ... |
| # $self->{db_version} = 4; # need this for other functions which check |
| # } |
| # ... and so on. |
| |
| return 1; |
| } |
| |
| ########################################################################### |
| |
| sub untie_db { |
| my $self = shift; |
| |
| return if (!$self->{already_tied}); |
| |
| dbg("bayes: untie-ing"); |
| |
| foreach my $dbname (@DBNAMES) { |
| my $db_var = 'db_'.$dbname; |
| |
| if (exists $self->{$db_var}) { |
| # dbg("bayes: untie-ing $db_var"); |
| untie %{$self->{$db_var}}; |
| delete $self->{$db_var}; |
| } |
| } |
| |
| if ($self->{is_locked}) { |
| dbg("bayes: files locked, now unlocking lock"); |
| $self->{bayes}->{main}->{locker}->safe_unlock ($self->{locked_file}); |
| $self->{is_locked} = 0; |
| } |
| |
| $self->{already_tied} = 0; |
| $self->{db_version} = undef; |
| } |
| |
| ########################################################################### |
| |
| sub calculate_expire_delta { |
| my ($self, $newest_atime, $start, $max_expire_mult) = @_; |
| |
| my %delta; # use a hash since an array is going to be very sparse |
| |
| # do the first pass, figure out atime delta |
| my ($tok, $packed); |
| while (($tok, $packed) = each %{$self->{db_toks}}) { |
| next if ($tok =~ MAGIC_RE); # skip magic tokens |
| |
| my ($ts, $th, $atime) = $self->tok_unpack ($packed); |
| |
| # Go through from $start * 1 to $start * 512, mark how many tokens |
| # we would expire |
| my $token_age = $newest_atime - $atime; |
| for (my $i = 1; $i <= $max_expire_mult; $i<<=1) { |
| if ($token_age >= $start * $i) { |
| $delta{$i}++; |
| } |
| else { |
| # If the token age is less than the expire delta, it'll be |
| # less for all upcoming checks too, so abort early. |
| last; |
| } |
| } |
| } |
| return %delta; |
| } |
| |
| ########################################################################### |
| |
| sub token_expiration { |
| my ($self, $opts, $newdelta, @vars) = @_; |
| |
| my $deleted = 0; |
| my $kept = 0; |
| my $num_hapaxes = 0; |
| my $num_lowfreq = 0; |
| |
| # since DB_File will not shrink a database (!!), we need to *create* |
| # a new one instead. |
| my $main = $self->{bayes}->{main}; |
| my $path = $main->sed_path($main->{conf}->{bayes_path}); |
| |
| # use a temporary PID-based suffix just in case another one was |
| # created previously by an interrupted expire |
| my $tmpsuffix = "expire$$"; |
| my $tmpdbname = $path.'_toks.'.$tmpsuffix; |
| |
| # clean out any leftover db copies from previous runs |
| for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); } |
| |
| # use O_EXCL to avoid races (bonus paranoia, since we should be locked |
| # anyway) |
| my %new_toks; |
| my $umask = umask 0; |
| tie %new_toks, $self->DBM_MODULE, $tmpdbname, O_RDWR|O_CREAT|O_EXCL, |
| (oct ($main->{conf}->{bayes_file_mode}) & 0666); |
| umask $umask; |
| my $oldest; |
| |
| my $showdots = $opts->{showdots}; |
| if ($showdots) { print STDERR "\n"; } |
| |
| # We've chosen a new atime delta if we've gotten here, so record it |
| # for posterity. |
| $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta; |
| |
| # Figure out how old is too old... |
| my $too_old = $vars[10] - $newdelta; # tooold = newest - delta |
| |
| # Go ahead and do the move to new db/expire run now ... |
| my ($tok, $packed); |
| while (($tok, $packed) = each %{$self->{db_toks}}) { |
| next if ($tok =~ MAGIC_RE); # skip magic tokens |
| |
| my ($ts, $th, $atime) = $self->tok_unpack ($packed); |
| |
| if ($atime < $too_old) { |
| $deleted++; |
| } |
| else { |
| # if token atime > newest, reset to newest ... |
| if ($atime > $vars[10]) { |
| $atime = $vars[10]; |
| } |
| |
| $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++; |
| if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; } |
| if ($ts + $th == 1) { |
| $num_hapaxes++; |
| } elsif ($ts < 8 && $th < 8) { |
| $num_lowfreq++; |
| } |
| } |
| |
| if ((($kept + $deleted) % 1000) == 0) { |
| if ($showdots) { print STDERR "."; } |
| $self->set_running_expire_tok(); |
| } |
| } |
| |
| # and add the magic tokens. don't add the expire_running token. |
| $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION; |
| |
| # We haven't changed messages of each type seen, so just copy over. |
| $new_toks{$NSPAM_MAGIC_TOKEN} = $vars[1]; |
| $new_toks{$NHAM_MAGIC_TOKEN} = $vars[2]; |
| |
| # We magically haven't removed the newest token, so just copy that value over. |
| $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $vars[10]; |
| |
| # The rest of these have been modified, so replace as necessary. |
| $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept; |
| $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time(); |
| $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest; |
| $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted; |
| |
| # Sanity check: if we expired too many tokens, abort! |
| if ($kept < 100000) { |
| dbg("bayes: token expiration would expire too many tokens, aborting"); |
| # set the magic tokens appropriately |
| # make sure the next expire run does a first pass |
| $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time(); |
| $self->{db_toks}->{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0; |
| $self->{db_toks}->{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0; |
| |
| # remove the new DB |
| untie %new_toks; |
| for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); } |
| |
| # reset the results for the return |
| $kept = $vars[3]; |
| $deleted = 0; |
| $num_hapaxes = 0; |
| $num_lowfreq = 0; |
| } |
| else { |
| # now untie so we can do renames |
| untie %{$self->{db_toks}}; |
| untie %new_toks; |
| |
| # This is the critical phase (moving files around), so don't allow |
| # it to be interrupted. Scope the signal changes. |
| { |
| local $SIG{'INT'} = 'IGNORE'; |
| local $SIG{'TERM'} = 'IGNORE'; |
| local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows(); |
| |
| # now rename in the new one. Try several extensions |
| for my $ext ($self->DB_EXTENSIONS) { |
| my $newf = $tmpdbname.$ext; |
| my $oldf = $path.'_toks'.$ext; |
| next unless (-f $newf); |
| if (!rename ($newf, $oldf)) { |
| warn "bayes: rename $newf to $oldf failed: $!\n"; |
| } |
| } |
| } |
| } |
| |
| # Call untie_db() so we unlock correctly. |
| $self->untie_db(); |
| |
| return ($kept, $deleted, $num_hapaxes, $num_lowfreq); |
| } |
| |
| ########################################################################### |
| |
| # Is a sync due? |
| sub sync_due { |
| my ($self) = @_; |
| |
| # don't bother doing old db versions |
| return 0 if ($self->{db_version} < $self->DB_VERSION); |
| |
| my $conf = $self->{bayes}->{main}->{conf}; |
| return 0 if ($conf->{bayes_journal_max_size} == 0); |
| |
| my @vars = $self->get_storage_variables(); |
| dbg("bayes: DB journal sync: last sync: %s", $vars[7]); |
| |
| ## Ok, should we do a sync? |
| |
  # Not if the journal file doesn't exist, isn't a regular file, or is
  # 0 bytes long.
| return 0 unless (stat($self->_get_journal_filename()) && -f _); |
| |
| # Yes if the file size is larger than the specified maximum size. |
| return 1 if (-s _ > $conf->{bayes_journal_max_size}); |
| |
  # Yes if there has been a sync before, and it's been at least a day
  # since that sync.
| return 1 if (($vars[7] > 0) && (time - $vars[7] > 86400)); |
| |
| # No, I guess not. |
| return 0; |
| } |
| |
| ########################################################################### |
| # db_seen reading APIs |
| |
| sub seen_get { |
| my ($self, $msgid) = @_; |
| $self->{db_seen}->{$msgid}; |
| } |
| |
| sub seen_put { |
| my ($self, $msgid, $seen) = @_; |
| |
| if ($self->{bayes}->{main}->{learn_to_journal}) { |
| $self->defer_update ("m $seen $msgid"); |
| } |
| else { |
| $self->_seen_put_direct($msgid, $seen); |
| } |
| } |
| sub _seen_put_direct { |
| my ($self, $msgid, $seen) = @_; |
| $self->{db_seen}->{$msgid} = $seen; |
| } |
| |
| sub seen_delete { |
| my ($self, $msgid) = @_; |
| |
| if ($self->{bayes}->{main}->{learn_to_journal}) { |
| $self->defer_update ("m f $msgid"); |
| } |
| else { |
| $self->_seen_delete_direct($msgid); |
| } |
| } |
| sub _seen_delete_direct { |
| my ($self, $msgid) = @_; |
| delete $self->{db_seen}->{$msgid}; |
| } |
| |
| ########################################################################### |
| # db reading APIs |
| |
| sub tok_get { |
| my ($self, $tok) = @_; |
| $self->tok_unpack ($self->{db_toks}->{$tok}); |
| } |
| |
| sub tok_get_all { |
| my ($self, @tokens) = @_; |
| |
| my @tokensdata; |
| foreach my $token (@tokens) { |
| my ($tok_spam, $tok_ham, $atime) = $self->tok_unpack($self->{db_toks}->{$token}); |
| push(@tokensdata, [$token, $tok_spam, $tok_ham, $atime]); |
| } |
| return \@tokensdata; |
| } |
| |
| # return the magic tokens in a specific order: |
| # 0: scan count base |
| # 1: number of spam |
| # 2: number of ham |
| # 3: number of tokens in db |
| # 4: last expire atime |
| # 5: oldest token in db atime |
| # 6: db version value |
| # 7: last journal sync |
| # 8: last atime delta |
| # 9: last expire reduction count |
| # 10: newest token in db atime |
| # |
| sub get_storage_variables { |
| my ($self) = @_; |
| my @values; |
| |
| my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN}; |
| |
| if (!$db_ver || $db_ver =~ /\D/) { $db_ver = 0; } |
| |
| if ($db_ver >= 2) { |
| my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA"; |
| my $DB2_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; |
| my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE"; |
| my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC"; |
| my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE"; |
| my $DB2_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; |
| my $DB2_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; |
| my $DB2_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; |
| my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; |
| my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE"; |
| |
| @values = ( |
| 0, |
| $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN}, |
| $db_ver, |
| $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN}, |
| ); |
| } |
| elsif ($db_ver == 0) { |
| my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM'; |
| my $DB0_NHAM_MAGIC_TOKEN = '**NHAM'; |
| my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE'; |
| my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE'; |
| my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS'; |
| my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE'; |
| |
| @values = ( |
| $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN}, |
| 0, |
| 0, |
| 0, |
| 0, |
| 0, |
| ); |
| } |
| elsif ($db_ver == 1) { |
| my $DB1_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; |
| my $DB1_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; |
| my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; |
| my $DB1_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; |
| my $DB1_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; |
| my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE"; |
| |
| @values = ( |
| $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN}, |
| $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN}, |
| 1, |
| 0, |
| 0, |
| 0, |
| 0, |
| ); |
| } |
| |
| foreach (@values) { |
| if (!$_ || $_ =~ /\D/) { |
| $_ = 0; |
| } |
| } |
| |
| return @values; |
| } |
| |
| sub dump_db_toks { |
| my ($self, $template, $regex, @vars) = @_; |
| |
| if (defined $regex) { |
| my ($rec, $err) = compile_regexp($regex, 2); |
| if (!$rec) { |
| die "Invalid dump_tokens regex '$regex': $err\n"; |
| } |
| $regex = $rec; |
| } |
| |
| while (my ($tok, $tokvalue) = each %{$self->{db_toks}}) { |
| next if ($tok =~ MAGIC_RE); # skip magic tokens |
| next if (defined $regex && $tok !~ /$regex/o); |
| |
| # We have the value already, so just unpack it. |
| my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue); |
| |
| my $prob = $self->{bayes}->_compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th); |
| $prob ||= 0.5; |
| |
| my $encoded_tok = unpack("H*",$tok); |
| printf $template,$prob,$ts,$th,$atime,$encoded_tok; |
| } |
| } |
| |
| sub set_last_expire { |
| my ($self, $time) = @_; |
| $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time(); |
| } |
| |
| ## Don't bother using get_magic_tokens here. This token should only |
| ## ever exist when we're running expire, so we don't want to convert it if |
| ## it's there and we're not expiring ... |
| sub get_running_expire_tok { |
| my ($self) = @_; |
| my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN}; |
| if (!$running || $running =~ /\D/) { return; } |
| return $running; |
| } |
| |
| sub set_running_expire_tok { |
| my ($self) = @_; |
| |
| # update the lock and running expire magic token |
| $self->{bayes}->{main}->{locker}->refresh_lock ($self->{locked_file}); |
| $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time(); |
| } |
| |
| sub remove_running_expire_tok { |
| my ($self) = @_; |
| delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN}; |
| } |
| |
| ########################################################################### |
| |
| # db abstraction: allow deferred writes, since we will be frequently |
| # writing while checking. |
| |
| sub tok_count_change { |
| my ($self, $ds, $dh, $tok, $atime) = @_; |
| |
| $atime = 0 unless defined $atime; |
| |
| if ($self->{bayes}->{main}->{learn_to_journal}) { |
| # we can't store the SHA1 binary value in the journal, so convert it |
| # to a printable value that can be converted back later |
| my $encoded_tok = unpack("H*",$tok); |
| $self->defer_update ("c $ds $dh $atime $encoded_tok"); |
| } else { |
| $self->tok_sync_counters ($ds, $dh, $atime, $tok); |
| } |
| } |
| |
| sub multi_tok_count_change { |
| my ($self, $ds, $dh, $tokens, $atime) = @_; |
| |
| $atime = 0 unless defined $atime; |
| |
| foreach my $tok (keys %{$tokens}) { |
| if ($self->{bayes}->{main}->{learn_to_journal}) { |
| # we can't store the SHA1 binary value in the journal, so convert it |
| # to a printable value that can be converted back later |
| my $encoded_tok = unpack("H*",$tok); |
| $self->defer_update ("c $ds $dh $atime $encoded_tok"); |
| } else { |
| $self->tok_sync_counters ($ds, $dh, $atime, $tok); |
| } |
| } |
| } |
| |
| sub nspam_nham_get { |
| my ($self) = @_; |
| my @vars = $self->get_storage_variables(); |
| ($vars[1], $vars[2]); |
| } |
| |
| sub nspam_nham_change { |
| my ($self, $ds, $dh) = @_; |
| |
| if ($self->{bayes}->{main}->{learn_to_journal}) { |
| $self->defer_update ("n $ds $dh"); |
| } else { |
| $self->tok_sync_nspam_nham ($ds, $dh); |
| } |
| } |
| |
| sub tok_touch { |
| my ($self, $tok, $atime) = @_; |
| # we can't store the SHA1 binary value in the journal, so convert it |
| # to a printable value that can be converted back later |
| my $encoded_tok = unpack("H*", $tok); |
| $self->defer_update ("t $atime $encoded_tok"); |
| } |
| |
| sub tok_touch_all { |
| my ($self, $tokens, $atime) = @_; |
| |
| foreach my $token (@{$tokens}) { |
| # we can't store the SHA1 binary value in the journal, so convert it |
| # to a printable value that can be converted back later |
| my $encoded_tok = unpack("H*", $token); |
| $self->defer_update ("t $atime $encoded_tok"); |
| } |
| } |
| |
| sub defer_update { |
| my ($self, $str) = @_; |
| $self->{string_to_journal} .= "$str\n"; |
| } |
| |
| ########################################################################### |
| |
| sub cleanup { |
| my ($self) = @_; |
| |
| my $nbytes = length ($self->{string_to_journal}); |
| return if ($nbytes == 0); |
| |
| my $path = $self->_get_journal_filename(); |
| |
| # use append mode, write atomically, then close, so simultaneous updates are |
| # not lost |
| my $conf = $self->{bayes}->{main}->{conf}; |
| |
| # set the umask to the inverse of what we want ... |
| my $umask = umask(0777 - (oct ($conf->{bayes_file_mode}) & 0666)); |
| |
| if (!open (OUT, ">>".$path)) { |
| warn "bayes: cannot write to $path, bayes db update ignored: $!\n"; |
| umask $umask; # reset umask |
| return; |
| } |
| umask $umask; # reset umask |
| |
| # do not use print() here, it will break up the buffer if it's >8192 bytes, |
| # which could result in two sets of tokens getting mixed up and their |
| # touches missed. |
| my $write_failure = 0; |
| my $original_point = tell OUT; |
| $original_point >= 0 or die "Can't obtain file position: $!"; |
| my $len; |
| do { |
| $len = syswrite (OUT, $self->{string_to_journal}, $nbytes); |
| |
| # argh, write failure, give up |
| if (!defined $len || $len < 0) { |
| my $err = ''; |
| if (!defined $len) { |
| $len = 0; |
| $err = " ($!)"; |
| } |
| warn "bayes: write failed to Bayes journal $path ($len of $nbytes)!$err\n"; |
| last; |
| } |
| |
| # This shouldn't happen, but could if the fs is full... |
| if ($len != $nbytes) { |
| warn "bayes: partial write to bayes journal $path ($len of $nbytes), recovering\n"; |
| |
| # we want to be atomic, so revert the journal file back to where |
| # we know it's "good". if we can't truncate the journal, or we've |
| # tried 5 times to do the write, abort! |
| if (!truncate(OUT, $original_point) || ($write_failure++ > 4)) { |
| warn "bayes: cannot write to bayes journal $path, aborting!\n"; |
| last; |
| } |
| |
| # if the fs is full, let's give the system a break |
| sleep 1; |
| } |
| } while ($len != $nbytes); |
| |
| if (!close OUT) { |
| warn "bayes: cannot write to $path, bayes db update ignored\n"; |
| } |
| |
| $self->{string_to_journal} = ''; |
| } |
| |
| # Return a qr'd RE to match a token with the correct format's magic token |
| sub get_magic_re { |
| my ($self) = @_; |
| |
| if (!defined $self->{db_version} || $self->{db_version} >= 1) { |
| return MAGIC_RE; |
| } |
| |
| # When in doubt, assume v0 |
| return qr/^\*\*[A-Z]+$/; |
| } |
| |
| # provide a more generalized public interface into the journal sync |
| |
| sub sync { |
| my ($self, $opts) = @_; |
| |
| return $self->_sync_journal($opts); |
| } |
| |
| ########################################################################### |
| # And this method reads the journal and applies the changes in one |
| # (locked) transaction. |
| |
| sub _sync_journal { |
| my ($self, $opts) = @_; |
| my $ret = 0; |
| |
| my $path = $self->_get_journal_filename(); |
| |
| # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return |
| if (!stat($path) || !-f _ || -z _) { |
| return 0; |
| } |
| |
| my $eval_stat; |
| eval { |
| local $SIG{'__DIE__'}; # do not run user die() traps in here |
| if ($self->tie_db_writable()) { |
| $ret = $self->_sync_journal_trapped($opts, $path); |
| } |
| 1; |
| } or do { |
| $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; |
| }; |
| |
| # ok, untie from write-mode if we can |
| if (!$self->{bayes}->{main}->{learn_caller_will_untie}) { |
| $self->untie_db(); |
| } |
| |
| # handle any errors that may have occurred |
| if (defined $eval_stat) { |
| warn "bayes: $eval_stat\n"; |
| return 0; |
| } |
| |
| $ret; |
| } |
| |
| sub _sync_journal_trapped { |
| my ($self, $opts, $path) = @_; |
| |
| # Flag that we're doing work |
| $self->set_running_expire_tok(); |
| |
| my $started = time(); |
| my $count = 0; |
| my $total_count = 0; |
| my %tokens; |
| my $showdots = $opts->{showdots}; |
| my $retirepath = $path.".old"; |
| |
  # if $path doesn't exist, or it's not a file, or is 0 bytes in length,
  # return.  We have to check again since the file may have been removed
  # by a recent bayes db upgrade ...
| if (!stat($path) || !-f _ || -z _) { |
| return 0; |
| } |
| |
| if (!-r $path) { # will we be able to read the file? |
| warn "bayes: bad permissions on journal, can't read: $path\n"; |
| return 0; |
| } |
| |
| # This is the critical phase (moving files around), so don't allow |
| # it to be interrupted. |
| { |
| local $SIG{'INT'} = 'IGNORE'; |
| local $SIG{'TERM'} = 'IGNORE'; |
| local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows(); |
| |
| # retire the journal, so we can update the db files from it in peace. |
| # TODO: use locking here |
| if (!rename ($path, $retirepath)) { |
| warn "bayes: failed rename $path to $retirepath\n"; |
| return 0; |
| } |
| |
| # now read the retired journal |
| local *JOURNAL; |
| if (!open (JOURNAL, "<$retirepath")) { |
| warn "bayes: cannot open read $retirepath\n"; |
| return 0; |
| } |
| |
| |
| # Read the journal |
| for ($!=0; defined($_=<JOURNAL>); $!=0) { |
| $total_count++; |
| |
| if (/^t (\d+) (.+)$/) { # Token timestamp update, cache resultant entries |
| my $tok = pack("H*",$2); |
| $tokens{$tok} = $1+0 if (!exists $tokens{$tok} || $1+0 > $tokens{$tok}); |
| } elsif (/^c (-?\d+) (-?\d+) (\d+) (.+)$/) { # Add/full token update |
| my $tok = pack("H*",$4); |
| $self->tok_sync_counters ($1+0, $2+0, $3+0, $tok); |
| $count++; |
| } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count |
| $self->tok_sync_nspam_nham ($1+0, $2+0); |
| $count++; |
| } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database |
| if ($1 eq "f") { |
| $self->_seen_delete_direct($2); |
| } |
| else { |
| $self->_seen_put_direct($2,$1); |
| } |
| $count++; |
| } else { |
| warn "bayes: gibberish entry found in journal: $_"; |
| } |
| } |
| defined $_ || $!==0 or |
| $!==EBADF ? dbg("bayes: error reading journal file: $!") |
| : die "error reading journal file: $!"; |
| close(JOURNAL) or die "Can't close journal file: $!"; |
| |
| # Now that we've determined what tokens we need to update and their |
| # final values, update the DB. Should be much smaller than the full |
| # journal entries. |
| while (my ($k,$v) = each %tokens) { |
| $self->tok_touch_token ($v, $k); |
| |
| if ((++$count % 1000) == 0) { |
| if ($showdots) { print STDERR "."; } |
| $self->set_running_expire_tok(); |
| } |
| } |
| |
| if ($showdots) { print STDERR "\n"; } |
| |
| # we're all done, so unlink the old journal file |
| unlink ($retirepath) || warn "bayes: can't unlink $retirepath: $!\n"; |
| |
| $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started; |
| |
| my $done = time(); |
| my $msg = ("bayes: synced databases from journal in " . |
| ($done - $started) . |
| " seconds: $count unique entries ($total_count total entries)"); |
| |
| if ($opts->{verbose}) { |
| print $msg,"\n"; |
| } else { |
| dbg($msg); |
| } |
| } |
| |
| # else, that's the lot, we're synced. return |
| return 1; |
| } |
| |
| sub tok_touch_token { |
| my ($self, $atime, $tok) = @_; |
| my ($ts, $th, $oldatime) = $self->tok_get ($tok); |
| |
| # If the new atime is < the old atime, ignore the update |
| # We figure that we'll never want to lower a token atime, so abort if |
| # we try. (journal out of sync, etc.) |
| return if ($oldatime >= $atime); |
| |
| $self->tok_put ($tok, $ts, $th, $atime); |
| } |
| |
| sub tok_sync_counters { |
| my ($self, $ds, $dh, $atime, $tok) = @_; |
| my ($ts, $th, $oldatime) = $self->tok_get ($tok); |
| $ts += $ds; if ($ts < 0) { $ts = 0; } |
| $th += $dh; if ($th < 0) { $th = 0; } |
| |
| # Don't roll the atime of tokens backwards ... |
| $atime = $oldatime if ($oldatime > $atime); |
| |
| $self->tok_put ($tok, $ts, $th, $atime); |
| } |
| |
| sub tok_put { |
| my ($self, $tok, $ts, $th, $atime) = @_; |
| $ts ||= 0; |
| $th ||= 0; |
| |
  # Ignore magic tokens, they don't go in this way ...
| return if ($tok =~ MAGIC_RE); |
| |
| # use defined() rather than exists(); the latter is not supported |
| # by NDBM_File, believe it or not. Using defined() did not |
| # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm) |
| my $exists_already = defined $self->{db_toks}->{$tok}; |
| |
| if ($ts == 0 && $th == 0) { |
| return if (!$exists_already); # If the token doesn't exist, just return |
| $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--; |
| delete $self->{db_toks}->{$tok}; |
| } else { |
| if (!$exists_already) { # If the token doesn't exist, raise the token count |
| $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++; |
| } |
| |
| $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime); |
| |
| my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN}; |
| if (!defined ($newmagic) || $atime > $newmagic) { |
| $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime; |
| } |
| |
| # Make sure to check for either !defined or "" ... Apparently |
| # sometimes the DB module doesn't return the value correctly. :( |
| my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN}; |
| if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) { |
| $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime; |
| } |
| } |
| } |
| |
| sub tok_sync_nspam_nham { |
| my ($self, $ds, $dh) = @_; |
| my ($ns, $nh) = ($self->get_storage_variables())[1,2]; |
| if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; } |
| if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; } |
| $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns; |
| $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh; |
| } |
| |
| ########################################################################### |
| |
| sub _get_journal_filename { |
| my ($self) = @_; |
| |
| my $main = $self->{bayes}->{main}; |
| return $main->sed_path($main->{conf}->{bayes_path}."_journal"); |
| } |
| |
| ########################################################################### |
| |
| # this is called directly from sa-learn(1). |
| sub perform_upgrade { |
| my ($self, $opts) = @_; |
| my $ret = 0; |
| |
| my $eval_stat; |
| eval { |
| local $SIG{'__DIE__'}; # do not run user die() traps in here |
| |
| use File::Basename; |
| |
| # bayes directory |
| my $main = $self->{bayes}->{main}; |
| my $path = $main->sed_path($main->{conf}->{bayes_path}); |
| |
| # prevent dirname() from tainting the result, it assumes $1 is not tainted |
| local($1,$2,$3); # Bug 6310; perl #67962 (fixed in perl 5.12/5.13) |
| my $dir = dirname($path); |
| |
| # make temporary copy since old dbm and new dbm may have same name |
| opendir(DIR, $dir) or die "bayes: can't opendir $dir: $!"; |
| my @files = grep { /^bayes_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR); |
| closedir(DIR) or die "bayes: can't close directory $dir: $!"; |
| if (@files < 2 || !grep(/bayes_seen/,@files) || !grep(/bayes_toks/,@files)) |
| { |
| die "bayes: unable to find bayes_toks and bayes_seen, stopping\n"; |
| } |
| # untaint @files (already safe after grep) |
| untaint_var(\@files); |
| |
| for (@files) { |
| my $src = "$dir/$_"; |
| my $dst = "$dir/old_$_"; |
| eval q{ |
| use File::Copy; |
| copy($src, $dst); |
| } || die "bayes: can't copy $src to $dst: $!\n"; |
| } |
| |
| # delete previous to make way for import |
| for (@files) { unlink("$dir/$_"); } |
| |
| # import |
| if ($self->tie_db_writable()) { |
| $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_seen", |
| $self->{db_seen}); |
| $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_bayes_toks", |
| $self->{db_toks}); |
| } |
| |
| if ($ret == 2) { |
| print "import successful, original files saved with \"old\" prefix\n"; |
| } |
| else { |
| print "import failed, original files saved with \"old\" prefix\n"; |
| } |
| 1; |
| } or do { |
| $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; |
| }; |
| |
| $self->untie_db(); |
| |
| # if we died, untie the dbm files |
| if (defined $eval_stat) { |
| warn "bayes: perform_upgrade: $eval_stat\n"; |
| return 0; |
| } |
| $ret; |
| } |
| |
| sub upgrade_old_dbm_files_trapped { |
| my ($self, $filename, $output) = @_; |
| |
| my $count; |
| my %in; |
| |
| print "upgrading to DB_File, please be patient: $filename\n"; |
| |
| # try each type of file until we find one with > 0 entries |
| for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') { |
| $count = 0; |
| # wrap in eval so it doesn't run in general use. This accesses db |
| # modules directly. |
| # Note: (bug 2390), the 'use' needs to be on the same line as the eval |
| # for RPM dependency checks to work properly. It's lame, but... |
| my $eval_stat; |
| eval 'use ' . $dbm . '; |
| tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600; |
| %{ $output } = %in; |
| $count = scalar keys %{ $output }; |
| untie %in; |
| 1; |
| ' or do { |
| $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; |
| }; |
| if (defined $eval_stat) { |
| print "$dbm: $dbm module not installed(?), nothing copied: $eval_stat\n"; |
| dbg("bayes: error was: $eval_stat"); |
| } |
| elsif ($count == 0) { |
| print "$dbm: no database of that kind found, nothing copied\n"; |
| } |
| else { |
| print "$dbm: copied $count entries\n"; |
| return 1; |
| } |
| } |
| |
| return 0; |
| } |
| |
| sub clear_database { |
| my ($self) = @_; |
| |
| return 0 unless ($self->tie_db_writable()); |
| |
| dbg("bayes: untie-ing in preparation for removal."); |
| |
| foreach my $dbname (@DBNAMES) { |
| my $db_var = 'db_'.$dbname; |
| |
| if (exists $self->{$db_var}) { |
| # dbg("bayes: untie-ing $db_var"); |
| untie %{$self->{$db_var}}; |
| delete $self->{$db_var}; |
| } |
| } |
| |
| my $path = $self->{bayes}->{main}->sed_path($self->{bayes}->{main}->{conf}->{bayes_path}); |
| |
| foreach my $dbname (@DBNAMES, 'journal') { |
| foreach my $ext ($self->DB_EXTENSIONS) { |
| my $name = $path.'_'.$dbname.$ext; |
| my $ret = unlink $name; |
| dbg("bayes: clear_database: %s %s", |
| $ret ? 'removed' : 'tried to remove', $name); |
| } |
| } |
| |
| $self->untie_db(); |
| |
| return 1; |
| } |
| |
| sub backup_database { |
| my ($self) = @_; |
| |
| # we tie writable because we want the upgrade code to kick in if needed |
| return 0 unless ($self->tie_db_writable()); |
| |
| my @vars = $self->get_storage_variables(); |
| |
| print "v\t$vars[6]\tdb_version # this must be the first line!!!\n"; |
| print "v\t$vars[1]\tnum_spam\n"; |
| print "v\t$vars[2]\tnum_nonspam\n"; |
| |
| while (my ($tok, $packed) = each %{$self->{db_toks}}) { |
| next if ($tok =~ MAGIC_RE); # skip magic tokens |
| |
| my ($ts, $th, $atime) = $self->tok_unpack($packed); |
| my $encoded_token = unpack("H*",$tok); |
| print "t\t$ts\t$th\t$atime\t$encoded_token\n"; |
| } |
| |
| while (my ($msgid, $flag) = each %{$self->{db_seen}}) { |
| print "s\t$flag\t$msgid\n"; |
| } |
| |
| $self->untie_db(); |
| |
| return 1; |
| } |
| |
| sub restore_database { |
| my ($self, $filename, $showdots) = @_; |
| |
| local *DUMPFILE; |
| if (!open(DUMPFILE, '<', $filename)) { |
| dbg("bayes: unable to open backup file $filename: $!"); |
| return 0; |
| } |
| |
| if (!$self->tie_db_writable()) { |
| dbg("bayes: failed to tie db writable"); |
| return 0; |
| } |
| |
| my $main = $self->{bayes}->{main}; |
| my $path = $main->sed_path($main->{conf}->{bayes_path}); |
| |
| # use a temporary PID-based suffix just in case another one was |
| # created previously by an interrupted expire |
| my $tmpsuffix = "convert$$"; |
| my $tmptoksdbname = $path.'_toks.'.$tmpsuffix; |
| my $tmpseendbname = $path.'_seen.'.$tmpsuffix; |
| my $toksdbname = $path.'_toks'; |
| my $seendbname = $path.'_seen'; |
| |
| my %new_toks; |
| my %new_seen; |
| my $umask = umask 0; |
| unless (tie %new_toks, $self->DBM_MODULE, $tmptoksdbname, O_RDWR|O_CREAT|O_EXCL, |
| (oct ($main->{conf}->{bayes_file_mode}) & 0666)) { |
| dbg("bayes: failed to tie temp toks db: $!"); |
| $self->untie_db(); |
| umask $umask; |
| return 0; |
| } |
| unless (tie %new_seen, $self->DBM_MODULE, $tmpseendbname, O_RDWR|O_CREAT|O_EXCL, |
| (oct ($main->{conf}->{bayes_file_mode}) & 0666)) { |
| dbg("bayes: failed to tie temp seen db: $!"); |
| untie %new_toks; |
| $self->_unlink_file($tmptoksdbname); |
| $self->untie_db(); |
| umask $umask; |
| return 0; |
| } |
| umask $umask; |
| |
| my $line_count = 0; |
| my $db_version; |
| my $token_count = 0; |
| my $num_spam; |
| my $num_ham; |
| my $error_p = 0; |
| my $newest_token_age = 0; |
| # Kinda weird I know, but we need a nice big value and we know there will be |
| # no tokens > time() since we reset atime if > time(), so use that with a |
| # little buffer just in case. |
| my $oldest_token_age = time() + 100000; |
| |
| my $line = <DUMPFILE>; |
| defined $line or die "Error reading dump file: $!"; |
| $line_count++; |
| |
| # We require the database version line to be the first in the file so we can |
| # figure out how to properly deal with the file. If it is not the first |
| # line then fail |
| if ($line =~ m/^v\s+(\d+)\s+db_version/) { |
| $db_version = $1; |
| } |
| else { |
| dbg("bayes: database version must be the first line in the backup file, correct and re-run"); |
| untie %new_toks; |
| untie %new_seen; |
| $self->_unlink_file($tmptoksdbname); |
| $self->_unlink_file($tmpseendbname); |
| $self->untie_db(); |
| return 0; |
| } |
| |
| unless ($db_version == 2 || $db_version == 3) { |
| warn("bayes: database version $db_version is unsupported, must be version 2 or 3"); |
| untie %new_toks; |
| untie %new_seen; |
| $self->_unlink_file($tmptoksdbname); |
| $self->_unlink_file($tmpseendbname); |
| $self->untie_db(); |
| return 0; |
| } |
| |
| for ($!=0; defined($line=<DUMPFILE>); $!=0) { |
| chomp($line); |
| $line_count++; |
| |
| if ($line_count % 1000 == 0) { |
| print STDERR "." if ($showdots); |
| } |
| |
| if ($line =~ /^v\s+/) { # variable line |
| my @parsed_line = split(/\s+/, $line, 3); |
| my $value = $parsed_line[1] + 0; |
| if ($parsed_line[2] eq 'num_spam') { |
| $num_spam = $value; |
| } |
| elsif ($parsed_line[2] eq 'num_nonspam') { |
| $num_ham = $value; |
| } |
| else { |
| dbg("bayes: restore_database: skipping unknown line: $line"); |
| } |
| } |
| elsif ($line =~ /^t\s+/) { # token line |
| my @parsed_line = split(/\s+/, $line, 5); |
| my $spam_count = $parsed_line[1] + 0; |
| my $ham_count = $parsed_line[2] + 0; |
| my $atime = $parsed_line[3] + 0; |
| my $token = $parsed_line[4]; |
| |
| my $token_warn_p = 0; |
| my @warnings; |
| |
| if ($spam_count < 0) { |
| $spam_count = 0; |
| push(@warnings, 'spam count < 0, resetting'); |
| $token_warn_p = 1; |
| } |
| if ($ham_count < 0) { |
| $ham_count = 0; |
| push(@warnings, 'ham count < 0, resetting'); |
| $token_warn_p = 1; |
| } |
| |
| if ($spam_count == 0 && $ham_count == 0) { |
| dbg("bayes: token has zero spam and ham count, skipping"); |
| next; |
| } |
| |
| if ($atime > time()) { |
| $atime = time(); |
| push(@warnings, 'atime > current time, resetting'); |
| $token_warn_p = 1; |
| } |
| |
| if ($token_warn_p) { |
| dbg("bayes: token (%s) has the following warnings:\n%s", |
| $token, join("\n",@warnings)); |
| } |
| |
| # database versions < 3 did not encode their token values |
| if ($db_version < 3) { |
| $token = substr(sha1($token), -5); |
| } |
| else { |
| # turn unpacked binary token back into binary value |
| $token = pack("H*",$token); |
| } |
| |
| $new_toks{$token} = $self->tok_pack($spam_count, $ham_count, $atime); |
| if ($atime < $oldest_token_age) { |
| $oldest_token_age = $atime; |
| } |
| if ($atime > $newest_token_age) { |
| $newest_token_age = $atime; |
| } |
| $token_count++; |
| } |
| elsif ($line =~ /^s\s+/) { # seen line |
| my @parsed_line = split(/\s+/, $line, 3); |
| my $flag = $parsed_line[1]; |
| my $msgid = $parsed_line[2]; |
| |
| unless ($flag eq 'h' || $flag eq 's') { |
| dbg("bayes: unknown seen flag ($flag) for line: $line, skipping"); |
| next; |
| } |
| |
| unless ($msgid) { |
| dbg("bayes: blank msgid for line: $line, skipping"); |
| next; |
| } |
| |
| $new_seen{$msgid} = $flag; |
| } |
| else { |
| dbg("bayes: skipping unknown line: $line"); |
| next; |
| } |
| } |
| defined $line || $!==0 or die "Error reading dump file: $!"; |
| close(DUMPFILE) or die "Can't close dump file: $!"; |
| |
| print STDERR "\n" if ($showdots); |
| |
| unless (defined($num_spam)) { |
| dbg("bayes: unable to find num spam, please check file"); |
| $error_p = 1; |
| } |
| |
| unless (defined($num_ham)) { |
| dbg("bayes: unable to find num ham, please check file"); |
| $error_p = 1; |
| } |
| |
| if ($error_p) { |
| dbg("bayes: error(s) while attempting to load $filename, correct and re-run"); |
| |
| untie %new_toks; |
| untie %new_seen; |
| $self->_unlink_file($tmptoksdbname); |
| $self->_unlink_file($tmpseendbname); |
| $self->untie_db(); |
| return 0; |
| } |
| |
| # set the calculated magic tokens |
| $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION(); |
| $new_toks{$NTOKENS_MAGIC_TOKEN} = $token_count; |
| $new_toks{$NSPAM_MAGIC_TOKEN} = $num_spam; |
| $new_toks{$NHAM_MAGIC_TOKEN} = $num_ham; |
| $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newest_token_age; |
| $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest_token_age; |
| |
| # go ahead and zero out these, chances are good that they are bogus anyway. |
| $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = 0; |
| $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = 0; |
| $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0; |
| $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0; |
| |
| local $SIG{'INT'} = 'IGNORE'; |
| local $SIG{'TERM'} = 'IGNORE'; |
| local $SIG{'HUP'} = 'IGNORE' if !am_running_on_windows(); |
| |
| untie %new_toks; |
| untie %new_seen; |
| $self->untie_db(); |
| |
| # Here is where something can go horribly wrong and screw up the bayes |
| # database files. If we are able to copy one and not the other then it |
| # will leave the database in an inconsistent state. Since this is an |
| # edge case, and they're trying to replace the DB anyway we should be ok. |
| unless ($self->_rename_file($tmptoksdbname, $toksdbname)) { |
| dbg("bayes: error while renaming $tmptoksdbname to $toksdbname: $!"); |
| return 0; |
| } |
| unless ($self->_rename_file($tmpseendbname, $seendbname)) { |
| dbg("bayes: error while renaming $tmpseendbname to $seendbname: $!"); |
| dbg("bayes: database now in inconsistent state"); |
| return 0; |
| } |
| |
| dbg("bayes: parsed $line_count lines"); |
| dbg("bayes: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages"); |
| |
| return 1; |
| } |
| |
| ########################################################################### |
| |
| # token marshalling format for db_toks. |
| |
| # Since we may have many entries with few hits, especially thousands of hapaxes |
| # (1-occurrence entries), use a flexible entry format, instead of simply "2 |
| # packed ints", to keep the memory and disk space usage down. In my |
| # 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we |
| # can use a 1-byte representation for the other 91% of low-hitting entries |
| # and save masses of space. |
| |
# The low-count form looks like: XXSSSHHH (XX = format bits, SSS = 3
# spam-count bits, HHH = 3 ham-count bits), followed by the 32-bit atime;
# ie. "CV" in perl pack format.  If XX in the first byte is 11, it's this
# 1-byte representation; if XX is 00, it's instead packed as "CVVV", ie. the
# flag byte followed by two 32-bit counts and the 32-bit atime (legacy v0
# databases stored a 16-bit atime instead, see tok_unpack below).
| |
| # Savings: roughly halves size of toks db, at the cost of a ~10% slowdown. |
| |
| use constant FORMAT_FLAG => 0xc0; # 11000000 |
| use constant ONE_BYTE_FORMAT => 0xc0; # 11000000 |
| use constant TWO_LONGS_FORMAT => 0x00; # 00000000 |
| |
| use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000 |
| use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111 |
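
# Worked example of the one-byte form (illustration only): a token seen in
# 3 spams and 2 hams gets the flag byte
#   ONE_BYTE_FORMAT | (3 << 3) | 2  ==  0xc0 | 0x18 | 0x02  ==  0xda
# and is stored as pack("CV", 0xda, $atime); tok_unpack() then recovers
# (0xda & ONE_BYTE_SSS_BITS) >> 3 == 3 and (0xda & ONE_BYTE_HHH_BITS) == 2.
# If either count reaches 8, tok_pack() falls back to the "CVVV" form.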
| |
| sub tok_unpack { |
| my ($self, $value) = @_; |
| $value ||= 0; |
| |
| my ($packed, $atime); |
| if ($self->{db_version} >= 1) { |
| ($packed, $atime) = unpack("CV", $value); |
| } |
| elsif ($self->{db_version} == 0) { |
| ($packed, $atime) = unpack("CS", $value); |
| } |
| |
| if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) { |
| return (($packed & ONE_BYTE_SSS_BITS) >> 3, |
| $packed & ONE_BYTE_HHH_BITS, |
| $atime || 0); |
| } |
| elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) { |
| my ($packed, $ts, $th, $atime); |
| if ($self->{db_version} >= 1) { |
| ($packed, $ts, $th, $atime) = unpack("CVVV", $value); |
| } |
| elsif ($self->{db_version} == 0) { |
| ($packed, $ts, $th, $atime) = unpack("CLLS", $value); |
| } |
| return ($ts || 0, $th || 0, $atime || 0); |
| } |
| # other formats would go here... |
| else { |
| warn "bayes: unknown packing format for bayes db, please re-learn: $packed"; |
| return (0, 0, 0); |
| } |
| } |
| |
| sub tok_pack { |
| my ($self, $ts, $th, $atime) = @_; |
| $ts ||= 0; $th ||= 0; $atime ||= 0; |
| if ($ts < 8 && $th < 8) { |
| return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime); |
| } else { |
| return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime); |
| } |
| } |
| |
| ########################################################################### |
| |
| sub db_readable { |
| my ($self) = @_; |
| return $self->{already_tied}; |
| } |
| |
| sub db_writable { |
| my ($self) = @_; |
| return $self->{already_tied} && $self->{is_locked}; |
| } |
| |
| ########################################################################### |
| |
| sub _unlink_file { |
| my ($self, $filename) = @_; |
| |
| unlink $filename; |
| } |
| |
| sub _rename_file { |
| my ($self, $sourcefilename, $targetfilename) = @_; |
| |
| return 0 unless (rename($sourcefilename, $targetfilename)); |
| |
| return 1; |
| } |
| |
| sub sa_die { Mail::SpamAssassin::sa_die(@_); } |
| |
| 1; |