#!/bin/sh
#
# bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
#
# Since Bayesish probability analysis requires training on a corpus, the
# traditional SpamAssassin 10-pass cross-validation suite can't be used. Also,
# Bayes requires its own ten-pass testing, separately, to judge the effects of
# tweaks. So that's what this is.
#
# Before running, you need to create a test corpus, as "cor/spam" and
# "cor/ham". Here's how to do this:
# cd TEST
# SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
# SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
#
# SADIR = top-level directory of SpamAssassin distro
# TEST = the directory where the corpus and results are to be written
# spfN = mail folders full of spam
# hamN = mail folders full of ham
# It will produce a directory of results called "results". The most important
# are "hist_all": a histogram of scores and frequencies, and "thresholds_all":
# the output of analysis of all scores and frequencies from the
# bayes-thresholds script.
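# NOTE: by default you will need *AT LEAST* 2000 messages of each type (ham
# and spam) to use this, since bayes will not be activated without 200
# messages in the db, and each fold is run using 10% of the corpus -- and
# 2000/10 = 200.
# (The config file this script writes sets bayes_min_ham_num and
# bayes_min_spam_num to 10, so this requirement may be stale -- TODO confirm.)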
###########################################################################
# Remember where we were started: the corpus ("cor/") is read from here and
# the "results" directory is created here.
testdir=$(pwd)

# Any command-line arguments are handed on to the learning/sync "sa-learn"
# invocations further down.  They are kept as a single whitespace-separated
# string and are expanded unquoted at the call sites on purpose.
learnargs="$*"

# SADIR must name the top-level SpamAssassin directory.  Refuse to go on
# without it: otherwise every relative path below ("../rules", "./mass-check",
# "tools/...") would be resolved against the wrong directory.
: "${SADIR:?SADIR must be set to the top-level SpamAssassin directory}"
cd "$SADIR/masses" || exit 1
PATH=$SADIR:$SADIR/masses:$PATH

results=$testdir/results
tmpdir=$results/config

# Start from a clean slate ($tmpdir lives inside $results).
rm -rf "$results" "$tmpdir"

# now, just copy in the Bayes ruleset
mkdir -p "$results" "$tmpdir/rules"
cp ../rules/23_*.cf "$tmpdir/rules"
cp ../rules/50*.cf "$tmpdir/rules"
cp ../rules/*.pre "$tmpdir/rules"   # ensure we have plugins
# tell SpamAssassin to use this path for DBs
# TODO: for tests of these settings, read from a test-specific file
#
# A here-document is used so the text is written verbatim (only $tmpdir is
# expanded); the file starts and ends with a blank line.
cat > "$tmpdir/rules/30bayes_path.cf" <<EOF

bayes_path $tmpdir/dbs/bayes
bayes_auto_learn 0
bayes_min_ham_num 10
bayes_min_spam_num 10
bayes_store_module Mail::SpamAssassin::BayesStore::SDBM

EOF
mkdir "$tmpdir/dbs"

# Test-mode switches; each is compared against the literal 1 (on) / 0 (off).
#
# INTERLEAVE_TESTS=1: split each fold's ham and spam test mboxes into 10
#   pieces and run mass-check alternately on a ham piece and a spam piece,
#   instead of one mass-check run per mbox.
INTERLEAVE_TESTS=0
# TEST_AGAINST_10PC=1: for fold N, test on bucket N and learn from the other
#   nine.  0: learn from bucket N only and test on the other nine.
TEST_AGAINST_10PC=0
# LEARN_ALL_THEN_FORGET_TEST_SET=1: learn every bucket once up front, then for
#   each fold "sa-learn --forget" that fold's test messages.  0: learn each
#   fold's learn set separately.
LEARN_ALL_THEN_FORGET_TEST_SET=0
# backup_dbs - snapshot the Bayes DB directory.
#
# Globals:  tmpdir (read) - directory that contains the "dbs" subdirectory
# Outputs:  a progress line plus tar's verbose file listing on stdout
# Effects:  (re)creates $tmpdir/learned-all.tar holding the "dbs" directory
# Returns:  non-zero if $tmpdir cannot be entered or tar fails
backup_dbs () {
  echo "Backing up full learned DBs..."
  # "&&" rather than ";": if the cd fails we must not archive whatever "dbs"
  # directory happens to exist in the caller's working directory.
  ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
}
# restore_dbs - put the Bayes DB directory back to the saved snapshot.
#
# Globals:  tmpdir (read) - directory holding "dbs" and "learned-all.tar"
# Outputs:  a progress line on stdout
# Effects:  removes $tmpdir/dbs and re-extracts it from
#           $tmpdir/learned-all.tar
# Returns:  non-zero if $tmpdir cannot be entered, or rm/tar fails
restore_dbs () {
  echo "Restoring full learned DBs..."
  # Chain with "&&": a failed cd must never lead to "rm -rf dbs" being run in
  # some other directory.
  ( cd "$tmpdir" && rm -rf dbs && tar xf learned-all.tar )
}
# runcmd - log a command line, then run it under "time".
#
# Arguments: $1 - command to run; remaining arguments are passed to it
# Outputs:   the command line (arguments joined by spaces) on stdout,
#            followed by whatever the command itself prints
# Returns:   the status returned by "time" for the command
runcmd () {
  echo "$*"
  # "$@" keeps every argument exactly as the caller passed it; an unquoted $*
  # would split arguments on whitespace again and re-expand glob characters.
  time "$@"
}
# Optional up-front learning phase, followed by the DB snapshot that the
# per-fold loop restores from.
if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
  # learn the lot, then forget the ones we're testing on each time.
  # faster than learning from scratch for each fold
  # note: we use randseed=1 so that every run will always pick the
  # same messages if --learnprob is used.
  #
  # $learnargs is intentionally left unquoted so that it splits into the
  # individual options given on the command line.
  (
    printf '%s' "Learning from all ham buckets..." ; date
    runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
      --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/ham/*
    printf '%s' "Learning from all spam buckets..." ; date
    runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
      --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/spam/*
    runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"
    printf '%s' "Done learning. " ; date
  ) 2>&1 | tee "$results/learn.log"

  echo "Dumping bayes DB..."
  # NOTE(review): the per-fold code dumps the DB with "sa-learn --dump", while
  # this path calls tools/check_bayes_db -- confirm that tool still exists in
  # the tree, or switch this call to "sa-learn --dump" as well.
  ( cd .. && tools/check_bayes_db --dbpath="$tmpdir/dbs/bayes" ) \
    > "$results/bayes_db.dump"
fi

# Snapshot the DB directory in its current state; the fold loop restores this
# snapshot before every fold after the first.
backup_dbs
# Main cross-validation loop: one pass per fold ("bucket"), with all output
# (stdout and stderr) shown on the terminal and copied to $results/test.log.
#
# For every fold this
#   1. resets the DBs to the saved snapshot (folds 2..10),
#   2. concatenates the corpus buckets into learn/test mboxes under
#      $results/bucket<N>/,
#   3. trains (or, in learn-all mode, forgets the test set),
#   4. runs mass-check on the test mboxes into spam.log / nonspam.log,
#   5. writes the per-fold reports "hist", "thresholds", "thresholds.static".
(
  printf '%s' "Starting test..." ; date
  for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
    printf '%s' "Bucket $bucket..." ; date

    # The first fold runs on the DBs as they currently stand; later folds
    # start again from the snapshot.
    if [ "$bucket" != 1 ] ; then restore_dbs ; fi

    rdir=$results/bucket$bucket
    mkdir "$rdir"
    : > "$rdir/hbucketlearn"
    : > "$rdir/sbucketlearn"
    : > "$rdir/hbuckettest"
    : > "$rdir/sbuckettest"

    # Assign every corpus bucket to either the learn set or the test set.
    for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
      role=learn
      # TEST_AGAINST_10PC=1: only the bucket matching this fold is tested.
      [ "$TEST_AGAINST_10PC" = 1 ] && [ "$subbucket" = "$bucket" ] && role=test
      # TEST_AGAINST_10PC=0: every bucket except the matching one is tested.
      [ "$TEST_AGAINST_10PC" = 0 ] && [ "$subbucket" != "$bucket" ] && role=test

      if [ "$role" = learn ] ; then
        echo "Using bucket for learn: $subbucket ..."
        cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbucketlearn"
        cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbucketlearn"
      else
        echo "Using bucket for test: $subbucket ..."
        cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbuckettest"
        cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbuckettest"
      fi
    done

    if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
      echo "Forgetting contents of test ham bucket..."
      runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
        --mbox "$rdir/hbuckettest"
      echo "Forgetting contents of test spam bucket..."
      runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
        --mbox "$rdir/sbuckettest"
    else
      # $learnargs is intentionally unquoted so it splits into separate
      # options.
      echo "Learning contents of learn ham bucket..."
      runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
        --showdots --mbox --config-file="$tmpdir/rules" "$rdir/hbucketlearn"
      echo "Learning contents of learn spam bucket..."
      runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
        --showdots --mbox --config-file="$tmpdir/rules" "$rdir/sbucketlearn"
      runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"
      echo "Dumping bayes DB..."
      ( cd .. && sa-learn --dump --dbpath="$tmpdir/dbs/bayes" ) \
        > "$rdir/bayes_db.dump"
    fi
    runcmd sa-learn --sync --config-file="$tmpdir/rules"

    # take a copy of the trained Bayes DBs, gzipped
    ( cd "$tmpdir" && tar cf - dbs | gzip -c > "$rdir/dbs.tgz" )

    if [ "$INTERLEAVE_TESTS" = 1 ] ; then
      # now split the ham and spam test bucket into 10 sub-buckets,
      # so we interleave ham and spam while testing. important for
      # judging expiry effects
      : > "$rdir/nonspam.log"
      : > "$rdir/spam.log"
      mkdir "$rdir/testbuckets"
      (
        cd .. || exit 1
        tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
          "$rdir/hbuckettest"
        tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
          "$rdir/sbuckettest"
      )
      for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
        echo "Running mass-check on ham test-bucket $subbucket..."
        time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
          --bayes --mbox "$rdir/testbuckets/ham.$subbucket" \
          >> "$rdir/nonspam.log"
        echo "Running mass-check on spam test-bucket $subbucket..."
        time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
          --bayes --mbox "$rdir/testbuckets/spam.$subbucket" \
          >> "$rdir/spam.log"
      done
    else
      # NOTE(review): the redirection applies to the whole runcmd call, so
      # the command line that runcmd echoes is also written to the log file.
      # The interleaved branch calls "time" directly and does not do this --
      # confirm the report scripts tolerate the extra line.
      echo "Running mass-check on ham bucket..."
      runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
        --bayes --mbox "$rdir/hbuckettest" \
        > "$rdir/nonspam.log"
      echo "Running mass-check on spam bucket..."
      runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
        --bayes --mbox "$rdir/sbuckettest" \
        > "$rdir/spam.log"
    fi

    echo "Reporting..."
    ./bayes-testing/draw-bayes-histogram \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/hist"
    ./bayes-testing/bayes-thresholds \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/thresholds"
    ./bayes-testing/bayes-static-thresholds \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/thresholds.static"

    # remove these, they're too big.
    rm -f "$rdir/hbucketlearn" "$rdir/sbucketlearn"
    # but keep these to find FPs/FNs later
    gzip "$rdir/hbuckettest" "$rdir/sbuckettest"
  done
  printf '%s' "Done test..." ; date
) 2>&1 | tee "$results/test.log"
# Combine the mass-check logs of all folds and produce the overall reports:
# hist_all, thresholds_all and thresholds_all.static under $results.
# All paths are quoted so a test directory containing whitespace still works;
# the bucket* glob is kept outside the quotes so it is still expanded.
cat "$results"/bucket*/spam.log > "$results/spam_all.log"
cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

./bayes-testing/draw-bayes-histogram \
  "$results/spam_all.log" "$results/nonspam_all.log" \
  > "$results/hist_all"
./bayes-testing/bayes-thresholds \
  "$results/spam_all.log" "$results/nonspam_all.log" \
  > "$results/thresholds_all"
./bayes-testing/bayes-static-thresholds \
  "$results/spam_all.log" "$results/nonspam_all.log" \
  > "$results/thresholds_all.static"

echo "Done."
ls -l "$results"