blob: afeae84dea8d0ce5727089a66a1ed926a2206f05 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
=head1 NAME
Mail::SpamAssassin::BayesStore - Storage Module for default Bayes classifier
This is the public API for the Bayesian store methods. Any implementation of
the storage module for the default Bayes classifier must implement these methods.
package Mail::SpamAssassin::BayesStore;
use strict;
use warnings;
# use bytes;
use re 'taint';
use Mail::SpamAssassin::Logger;
# TODO: if we ever get tuits, it'd be good to make these POD
# method docs more perlish... hardly a biggie.
=head1 METHODS
=over 4
=item new
public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Plugin::Bayes $bayes)
This method creates a new instance of the Mail::SpamAssassin::BayesStore
object. You must pass in an instance of the Mail::SpamAssassin::Plugin::Bayes
object, which is stashed for use throughout the module.
sub new {
  # Constructor.  $bayes is the Mail::SpamAssassin::Plugin::Bayes instance
  # that owns this store; it is kept under the 'bayes' key for later use.
  my ($class, $bayes) = @_;

  # Allow both Class->new(...) and $object->new(...).
  $class = ref($class) || $class;

  my $self = {
    'bayes'                => $bayes,
    'supported_db_version' => 0,      # value reported by DB_VERSION()
    'db_version'           => undef,  # not known until a database is opened
  };

  return bless($self, $class);
}
=item DB_VERSION
public instance (Integer) DB_VERSION ()
This method returns the currently supported database version for the
implementation.
sub DB_VERSION {
  # Accessor for the database format version this storage class supports
  # (the 'supported_db_version' value set in the constructor).
  my ($self) = @_;
  return $self->{supported_db_version};
}
=item read_db_configs
public instance () read_db_configs ()
This method reads any needed config variables from the configuration object
and then calls the Mail::SpamAssassin::Plugin::Bayes read_db_configs method.
sub read_db_configs {
  # Copy the expiry-related settings from the configuration object onto
  # this store, then let the owning Bayes plugin read its own settings.
  my ($self) = @_;

  # TODO: at some stage, this may be useful to read config items which
  # control database bloat, like
  #
  # - use of hapaxes
  # - use of case-sensitivity
  # - more midrange-hapax-avoidance tactics when parsing headers (future)
  #
  # for now, we just set these settings statically.
  my $conf = $self->{bayes}->{main}->{conf};

  # Minimum desired database size?  Expiry will not shrink the
  # database below this number of entries.  100k entries is roughly
  # equivalent to a 5Mb database file.
  $self->{expiry_max_db_size}  = $conf->{bayes_expiry_max_db_size};
  $self->{expiry_pct}          = $conf->{bayes_expiry_pct};
  $self->{expiry_period}       = $conf->{bayes_expiry_period};
  $self->{expiry_max_exponent} = $conf->{bayes_expiry_max_exponent};

  # Documented contract: finish by calling the Bayes plugin's own
  # read_db_configs method.
  $self->{bayes}->read_db_configs();
}
=item prefork_init
public instance (Boolean) prefork_init ()
This optional method is called in the parent process shortly before
forking off child processes.
# sub prefork_init {
# my ($self) = @_;
# }
=item spamd_child_init
public instance (Boolean) spamd_child_init ()
This optional method is called in a child process shortly after being spawned.
# sub spamd_child_init {
# my ($self) = @_;
# }
=item tie_db_readonly
public instance (Boolean) tie_db_readonly ()
This method opens up the database in readonly mode.
sub tie_db_readonly {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to open the database read-only.
  my ($self) = @_;
  die "bayes: tie_db_readonly: not implemented\n";
}
=item tie_db_writable
public instance (Boolean) tie_db_writable ()
This method opens up the database in writable mode.
Any callers of this method should ensure that they call untie_db() afterwards.
sub tie_db_writable {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to open the database for writing.
  my ($self) = @_;
  die "bayes: tie_db_writable: not implemented\n";
}
=item untie_db
public instance () untie_db ()
This method unties the database.
sub untie_db {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to release the database.
  my ($self) = @_;
  die "bayes: untie_db: not implemented\n";
}
=item calculate_expire_delta
public instance (%) calculate_expire_delta (Integer $newest_atime,
Integer $start,
Integer $max_expire_mult)
This method performs a calculation on the data to determine the optimum
atime for token expiration.
sub calculate_expire_delta {
  # Interface stub: always dies.  A backend's override is expected to return
  # a hash (the caller assigns the result to %delta and tests it for
  # emptiness).
  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
  die "bayes: calculate_expire_delta: not implemented\n";
}
=item token_expiration
public instance (Integer, Integer,
Integer, Integer) token_expiration (\% $opts,
Integer $newdelta,
@ @vars)
This method performs the database specific expiration of tokens based on
the passed in C<$newdelta> and C<@vars> (the list returned by get_storage_variables).
sub token_expiration {
  # Interface stub: always dies.  Arguments are named to match the call made
  # by expire_old_tokens_trapped: the options hash ref, the atime delta to
  # expire by, and the storage-variables list.  The caller expects a
  # four-element list back: kept, deleted, hapax count, low-frequency count.
  my ($self, $opts, $newdelta, @vars) = @_;
  die "bayes: token_expiration: not implemented\n";
}
=item expire_old_tokens
public instance (Boolean) expire_old_tokens (\% hashref)
This method expires old tokens from the database.
sub expire_old_tokens {
  # Run a token expiry with exception trapping.
  #
  # Opens the database for writing, delegates the real work to
  # expire_old_tokens_trapped(), and unties the database afterwards unless
  # the caller has declared (learn_caller_will_untie) that it will do so.
  # Returns the trapped routine's result, or 0 (after a warning) if
  # anything inside died.
  my ($self, $opts) = @_;
  my $ret;
  my $eval_stat;

  eval {
    local $SIG{'__DIE__'};   # do not run user die() traps in here
    if ($self->tie_db_writable()) {
      $ret = $self->expire_old_tokens_trapped($opts);
    }
    1;  # make the eval true on success so "or do" only runs on failure
  } or do {
    $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
  };

  # Release the database whether or not the expiry succeeded.
  if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
    $self->untie_db();
  }

  if (defined $eval_stat) {
    warn "bayes: expire_old_tokens: $eval_stat\n";
    return 0;
  }

  return $ret;
}
=item expire_old_tokens_trapped
public instance (Boolean) expire_old_tokens_trapped (\% $opts)
This method does the actual token expiration.
XXX More docs here about the methodology and what not
sub expire_old_tokens_trapped {
# Decide how far back (in atime) tokens should be expired, then ask the
# backend's token_expiration() to do it.  $opts is a hash ref; only its
# 'verbose' key is read here.  Returns 0 when no expiry was due or no delta
# table could be computed, 1 otherwise.
#
# @vars indexes used below (see get_storage_variables): 3 = token count,
# 4 = last expire atime, 8 = last atime delta, 9 = last expire reduction
# count, 10 = newest token atime.
#
# NOTE(review): this listing has no closing braces and some comments are not
# followed by the statement they describe; confirm against the canonical
# source before relying on the exact control flow.
my ($self, $opts) = @_;
# Flag that we're doing work
# NOTE(review): no statement follows the comment above in this listing;
# presumably a "running expire" marker is set here -- confirm.
# We don't need to do an expire, so why were we called? Oh well.
if (!$self->expiry_due()) {
return 0;
my $started = time();
my @vars = $self->get_storage_variables();
# Clamp a newest-atime that lies in the future to "now".
if ( $vars[10] > time ) {
dbg("bayes: expiry found newest atime in the future, resetting to current time");
$vars[10] = time;
# How many tokens do we want to keep?
my $goal_reduction = int($self->{expiry_max_db_size} * $self->{expiry_pct});
dbg("bayes: expiry check keep size, ".$self->{expiry_pct}." * max: $goal_reduction");
# Make sure we keep at least 100000 tokens in the DB
if ( $goal_reduction < 100000 ) {
$goal_reduction = 100000;
dbg("bayes: expiry keep size too small, resetting to 100,000 tokens");
# Now turn goal_reduction into how many to expire.
$goal_reduction = $vars[3] - $goal_reduction;
dbg("bayes: token count: ".$vars[3].", final goal reduction size: $goal_reduction");
if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort.
dbg("bayes: reduction goal of $goal_reduction is under 1,000 tokens, skipping expire");
$self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
return 1; # we want to indicate things ran as expected
# Estimate new atime delta based on the last atime delta
my $newdelta = 0;
if ( $vars[9] > 0 ) {
# newdelta = olddelta * old / goal;
# this may seem backwards, but since we're talking delta here,
# not actual atime, we want smaller atimes to expire more tokens,
# and vice versa.
$newdelta = int($vars[8] * $vars[9] / $goal_reduction);
# Calculate size difference between last expiration token removal
# count and the current goal removal count.
# The ratio is always taken as larger/smaller, so it is >= 1 unless the
# last reduction count is 0 (then it is 0).  $goal_reduction is >= 1000
# here because of the early return above.
my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction) ? $vars[9]/$goal_reduction : $goal_reduction/$vars[9];
dbg("bayes: first pass? current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio, period: ".$self->{expiry_period});
# Do this for the first expire or "odd" looking results cause a first pass to determine atime:
# - last expire was more than 30 days ago
# assume mail flow stays roughly the same month to month, recompute if it's > 1 month
# - last atime delta was under expiry period
# if we're expiring often max_db_size should go up, but let's recompute just to check
# - last reduction count was < 1000 tokens
# ditto
# - new estimated atime delta is under expiry period
# ditto
# - difference of last reduction to current goal reduction is > 50%
# if the two values are out of balance, estimating atime is going to be funky, recompute
if ( (time() - $vars[4] > 86400*30) || ($vars[8] < $self->{expiry_period}) || ($vars[9] < 1000)
|| ($newdelta < $self->{expiry_period}) || ($ratio > 1.5) ) {
dbg("bayes: can't use estimation method for expiry, unexpected result, calculating optimal atime delta (first pass)");
my $start = $self->{expiry_period}; # exponential search starting at ...? 1/2 day, 1, 2, 4, 8, 16, ...
my $max_expire_mult = 2**$self->{expiry_max_exponent}; # $max_expire_mult * $start = max expire time (256 days), power of 2.
dbg("bayes: expiry max exponent: ".$self->{expiry_max_exponent});
# %delta maps a power-of-two multiplier of $start to the number of tokens
# that would be expired at that atime delta (as used by the loops below).
my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult);
return 0 unless (%delta);
# This will skip the for loop if debugging isn't enabled ...
if (would_log('dbg', 'bayes')) {
dbg("bayes: atime\ttoken reduction");
dbg("bayes: ========\t===============");
for(my $i = 1; $i<=$max_expire_mult; $i <<= 1) {
dbg("bayes: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0));
# Now figure out which max_expire_mult value gives the closest results to goal_reduction, without
# going over ... Go from the largest delta backwards so the reduction size increases
# (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...)
# NOTE(review): as listed, nothing leaves this loop after the multiplier is
# doubled; the comment implies the search should stop there -- confirm.
for( ; $max_expire_mult > 0; $max_expire_mult>>=1 ) {
next unless exists $delta{$max_expire_mult};
if ($delta{$max_expire_mult} > $goal_reduction) {
$max_expire_mult<<=1; # the max expire is actually the next power of 2 out
# if max_expire_mult gets to 0, either we can't expire anything, or 1 is <= $goal_reduction
$max_expire_mult ||= 1;
# $max_expire_mult is now equal to the value we should use ...
# Check to see if the atime value we found is really good.
# It's not good if:
# - $max_expire_mult would not expire any tokens. This means that the majority of
# tokens are old or new, and more activity is required before an expiry can occur.
# - reduction count < 1000, not enough tokens to be worth doing an expire.
if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) {
dbg("bayes: couldn't find a good delta atime, need more token difference, skipping expire");
$self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
return 1; # we want to indicate things ran as expected
$newdelta = $start * $max_expire_mult;
dbg("bayes: first pass decided on $newdelta for atime delta");
else { # use the estimation method
dbg("bayes: can do estimation method for expiry, skipping first pass");
# Hand the chosen delta and the storage variables to the backend.
my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars);
my $done = time();
my $msg = "expired old bayes database entries in ".($done - $started)." seconds";
my $msg2 = "$kept entries kept, $deleted deleted";
if ($opts->{verbose}) {
# NOTE(review): both percentages divide by $kept; if a backend ever reports
# 0 kept entries this dies with a division-by-zero error -- confirm whether
# that can happen and guard if so.
my $hapax_pc = ($num_hapaxes * 100) / $kept;
my $lowfreq_pc = ($num_lowfreq * 100) / $kept;
print "$msg\n$msg2\n" or die "Error writing: $!";
printf "token frequency: 1-occurrence tokens: %3.2f%%\n", $hapax_pc
or die "Error writing: $!";
printf "token frequency: less than 8 occurrences: %3.2f%%\n", $lowfreq_pc
or die "Error writing: $!";
else {
dbg("bayes: $msg: $msg2");
return 1;
=item sync_due
public instance (Boolean) sync_due ()
This method determines if a sync is due.
sub sync_due {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to report whether a sync is due.
  my ($self) = @_;
  die "bayes: sync_due: not implemented\n";
}
=item expiry_due
public instance (Boolean) expiry_due ()
This method determines if an expire is due.
sub expiry_due {
  # Decide whether a token expiry should run now.  Returns 1 or 0.
  #
  # Storage-variable indexes used (see get_storage_variables):
  # 3 = token count, 4 = last expire atime, 5 = oldest token atime,
  # 10 = newest token atime.
  my ($self) = @_;

  $self->read_db_configs();	# make sure this has happened here
  my $main = $self->{bayes}->{main};

  # If force expire was called, do the expire no matter what.
  return 1 if ($main->{learn_force_expire});

  # if config says not to auto expire then no need to continue
  return 0 if ($main->{conf}->{bayes_auto_expire} == 0);

  # is the database too small for expiry?  (Do *not* use "scalar keys",
  # as this will iterate through the entire db counting them!)
  my @vars = $self->get_storage_variables();
  my $ntoks = $vars[3];

  my $now = time();
  my $last_expire = $now - $vars[4];

  # Normally don't run an expire more than once every 12 hours.  When the
  # safety timeout is being ignored (e.g.: mass-check), still limit the
  # expiry to only one every 5 minutes.
  my $min_interval = $main->{ignore_safety_expire_timeout} ? 300 : 43200;
  return 0 if ($last_expire < $min_interval);

  dbg("bayes: DB expiry: tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".$now);

  # db_version starts out undef in the constructor; compare it as 0 so the
  # numeric test below does not emit an "uninitialized value" warning.
  my $db_version = defined $self->{db_version} ? $self->{db_version} : 0;

  if ($ntoks <= 100000 ||                       # keep at least 100k tokens
      $self->{expiry_max_db_size} > $ntoks ||   # not enough tokens to cause an expire
      $vars[10]-$vars[5] < 43200 ||             # delta between oldest and newest < 12h
      $db_version < $self->DB_VERSION           # ignore old db formats
     ) {
    return 0;
  }

  return 1;
}
=item seen_get
public instance (Char) seen_get (String $msgid)
This method retrieves the stored value, if any, for C<$msgid>. The return
value is the stored string ('s' for spam and 'h' for ham) or undef if
C<$msgid> is not found.
sub seen_get {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to look up the stored flag for $msgid.
  my ($self, $msgid) = @_;
  die "bayes: seen_get: not implemented\n";
}
=item seen_put
public instance (Boolean) seen_put (String $msgid, Char $flag)
This method records C<$msgid> as the type given by C<$flag>. C<$flag> is
one of two values 's' for spam and 'h' for ham.
sub seen_put {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to record $flag for $msgid.
  my ($self, $msgid, $flag) = @_;
  die "bayes: seen_put: not implemented\n";
}
=item seen_delete
public instance (Boolean) seen_delete (String $msgid)
This method removes C<$msgid> from storage.
sub seen_delete {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to remove $msgid from storage.
  my ($self, $msgid) = @_;
  die "bayes: seen_delete: not implemented\n";
}
=item get_storage_variables
public instance (@) get_storage_variables ()
This method retrieves the various administrative variables used by
the Bayes storage implementation.
The values returned in the array are in the following order:
0: scan count base
1: number of spam
2: number of ham
3: number of tokens in db
4: last expire atime
5: oldest token in db atime
6: db version value
7: last journal sync
8: last atime delta
9: last expire reduction count
10: newest token in db atime
sub get_storage_variables {
  # Interface stub: always dies.  A backend's override returns the
  # administrative variables as a list in the documented index order
  # (0 = scan count base ... 10 = newest token atime).
  my ($self) = @_;
  die "bayes: get_storage_variables: not implemented\n";
}
=item dump_db_toks
public instance () dump_db_toks (String $template, String $regex, @ @vars)
This method loops over all tokens, computing the probability for the token
and then printing it out according to the passed in template.
sub dump_db_toks {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to print every token according to $template.
  my ($self, $template, $regex, @vars) = @_;
  die "bayes: dump_db_toks: not implemented\n";
}
=item set_last_expire
public instance (Boolean) set_last_expire (Integer $time)
This method sets the last expire time.
sub set_last_expire {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to store $time as the last expire time.
  my ($self, $time) = @_;
  die "bayes: set_last_expire: not implemented\n";
}
=item get_running_expire_tok
public instance (Time) get_running_expire_tok ()
This method determines if an expire is currently running and returns the time
the expire started.
sub get_running_expire_tok {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to return the start time of a running expire, if any.
  my ($self) = @_;
  die "bayes: get_running_expire_tok: not implemented\n";
}
=item set_running_expire_tok
public instance (Time) set_running_expire_tok ()
This method sets the running expire time to the current time.
sub set_running_expire_tok {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to mark an expire as running from the current time.
  my ($self) = @_;
  die "bayes: set_running_expire_tok: not implemented\n";
}
=item remove_running_expire_tok
public instance (Boolean) remove_running_expire_tok ()
This method removes a currently set running expire time.
sub remove_running_expire_tok {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to clear the running-expire marker.
  my ($self) = @_;
  die "bayes: remove_running_expire_tok: not implemented\n";
}
=item tok_get
public instance (Integer, Integer, Time) tok_get (String $token)
This method retrieves the specified token (C<$token>) from storage and returns
its spam count, ham count and last access time.
sub tok_get {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to fetch the counts and access time for $token.
  my ($self, $token) = @_;
  die "bayes: tok_get: not implemented\n";
}
=item tok_get_all
public instance (\@) tok_get_all (@ @tokens)
This method retrieves the specified tokens (C<@tokens>) from storage and
returns an array ref of arrays, each holding the spam count, ham count and last access time.
sub tok_get_all {
  # Interface stub: always dies.  The tokens arrive as a flat list, matching
  # the documented signature "tok_get_all (@ @tokens)".
  my ($self, @tokens) = @_;
  die "bayes: tok_get_all: not implemented\n";
}
=item tok_count_change
public instance (Boolean) tok_count_change (Integer $spam_count,
Integer $ham_count,
String $token,
Time $atime)
This method takes a C<$spam_count> and C<$ham_count> and adds them to
C<$token>, along with updating C<$token>'s atime with C<$atime>.
sub tok_count_change {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to adjust the counts and atime of a single token.
  my ($self, $spam_count, $ham_count, $token, $atime) = @_;
  die "bayes: tok_count_change: not implemented\n";
}
=item multi_tok_count_change
public instance (Boolean) multi_tok_count_change (Integer $spam_count,
Integer $ham_count,
\% $tokens,
String $atime)
This method takes a C<$spam_count> and C<$ham_count> and adds them to all
of the tokens in the C<$tokens> hash ref, along with updating each token's
atime with C<$atime>.
sub multi_tok_count_change {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to adjust the counts and atime of every token in $tokens.
  my ($self, $spam_count, $ham_count, $tokens, $atime) = @_;
  die "bayes: multi_tok_count_change: not implemented\n";
}
=item nspam_nham_get
public instance (Integer, Integer) nspam_nham_get ()
This method retrieves the total number of spam and the total number of ham
currently under storage.
sub nspam_nham_get {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to return the stored spam and ham totals.
  my ($self) = @_;
  die "bayes: nspam_nham_get: not implemented\n";
}
=item nspam_nham_change
public instance (Boolean) nspam_nham_change (Integer $num_spam,
Integer $num_ham)
This method updates the number of spam and the number of ham in the database.
sub nspam_nham_change {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to update the stored spam and ham totals.
  my ($self, $num_spam, $num_ham) = @_;
  die "bayes: nspam_nham_change: not implemented\n";
}
=item tok_touch
public instance (Boolean) tok_touch (String $token,
Time $atime)
This method updates the given token's (C<$token>) access time.
sub tok_touch {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to set the access time of $token to $atime.
  my ($self, $token, $atime) = @_;
  die "bayes: tok_touch: not implemented\n";
}
=item tok_touch_all
public instance (Boolean) tok_touch_all (\@ $tokens,
Time $atime)
This method does a mass update of the given list of tokens C<$tokens>, if the existing token
atime is < C<$atime>.
sub tok_touch_all {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to bulk-update the access time of the tokens in the $tokens
  # array ref.
  my ($self, $tokens, $atime) = @_;
  die "bayes: tok_touch_all: not implemented\n";
}
=item cleanup
public instance (Boolean) cleanup ()
This method performs any cleanup necessary before moving on to the next operation.
sub cleanup {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to perform its own cleanup.
  my ($self) = @_;
  die "bayes: cleanup: not implemented\n";
}
=item get_magic_re
public instance (String) get_magic_re ()
This method returns a regexp which indicates a magic token.
sub get_magic_re {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to return the regexp that identifies a magic token.
  my ($self) = @_;
  die "bayes: get_magic_re: not implemented\n";
}
=item sync
public instance (Boolean) sync (\% $opts)
This method performs a sync of the database.
sub sync {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to sync the database.
  my ($self, $opts) = @_;
  die "bayes: sync: not implemented\n";
}
=item perform_upgrade
public instance (Boolean) perform_upgrade (\% $opts)
This method is a utility method that performs any necessary upgrades
between versions. It should know how to handle previous versions and
what needs to happen to upgrade them.
A true return value indicates success.
sub perform_upgrade {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to upgrade older database versions; a true return means success.
  my ($self, $opts) = @_;
  die "bayes: perform_upgrade: not implemented\n";
}
=item clear_database
public instance (Boolean) clear_database ()
This method deletes all records for a particular user.
Callers should be aware that any errors returned by this method
could cause the database to be inconsistent for the given user.
sub clear_database {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to delete all records for a user.
  my ($self) = @_;
  die "bayes: clear_database: not implemented\n";
}
=item backup_database
public instance (Boolean) backup_database ()
This method will dump the user's database in a machine readable format.
sub backup_database {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to dump the database in a machine readable format.
  my ($self) = @_;
  die "bayes: backup_database: not implemented\n";
}
=item restore_database
public instance (Boolean) restore_database (String $filename, Boolean $showdots)
This method restores a database from the given filename, C<$filename>.
Callers should be aware that any errors returned by this method
could cause the database to be inconsistent for the given user.
sub restore_database {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to restore the database from $filename.
  my ($self, $filename, $showdots) = @_;
  die "bayes: restore_database: not implemented\n";
}
=item db_readable
public instance (Boolean) db_readable ()
This method returns whether or not the Bayes DB is available in a
readable state.
sub db_readable {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to report whether the database is readable.
  my ($self) = @_;
  die "bayes: db_readable: not implemented\n";
}
=item db_writable
public instance (Boolean) db_writable ()
This method returns whether or not the Bayes DB is available in a
writable state.
sub db_writable {
  # Interface stub: always dies.  A concrete storage backend must override
  # this to report whether the database is writable.
  my ($self) = @_;
  die "bayes: db_writable: not implemented\n";
}
sub sa_die {
  # Thin wrapper: hand every argument straight to
  # Mail::SpamAssassin::sa_die and pass its result back.
  return Mail::SpamAssassin::sa_die(@_);
}