| #!/bin/sh |
| # |
| # bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes |
| # |
| # Since Bayesish probability analysis requires training on a corpus, the |
| # traditional SpamAssassin 10-pass cross-validation suite can't be used. Also, |
| # Bayes requires its own ten-pass testing, separately, to judge the effects of |
| # tweaks. So that's what this is. |
| # |
| # Before running, you need to create a test corpus, as "cor/spam" and |
| # "cor/ham". Here's how to do this: |
| |
| # cd TEST |
| # SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ... |
| # SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ... |
| # |
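# SADIR = top-level directory of SpamAssassin distro; it must also be set in
#         the environment, because this script changes into $SADIR/masses
# TEST = the directory where the corpus and results are to be written; run
#        this script from there, since it works on the current directory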
| # spfN = mail folders full of spam |
| # hamN = mail folders full of ham |
| |
| # It will produce a directory of results called "results". The most important |
| # are "hist_all": a histogram of scores and frequencies, and "thresholds_all": |
| # the output of analysis of all scores and frequencies from the |
| # bayes-thresholds script. |
| |
# NOTE: by default you will need *AT LEAST* 2000 messages of each type (ham
# and spam) to use this, since bayes will not be activated without 200
# messages in the db, and each fold is run using 10% of the corpus -- and
# 2000/10 = 200.
| |
| ########################################################################### |
| |
# Directory the script was started from: the corpus is read from here and
# the results directory is created below it.
testdir=$(pwd)

# Number of parallel mass-check jobs (passed to mass-check as -j).  Try the
# "cpucount" helper first, then count the "processor" entries in
# /proc/cpuinfo, and finally fall back to 1.  Each probe is a separate
# assignment so that the output of a failed probe (for example the "0" that
# a non-matching "grep -c" prints before exiting non-zero) is never glued
# onto the fallback value.
numcpus=$(cpucount 2>/dev/null) ||
  numcpus=$(grep -Ec '^processor([^[:alnum:]_]|$)' /proc/cpuinfo 2>/dev/null) ||
  numcpus=1
# Anything that is not a positive decimal number is replaced by 1.
case $numcpus in
  '' | *[!0-9]* | 0) numcpus=1 ;;
esac

# All command-line arguments are handed on to the learning "sa-learn" runs.
# "$*" is already the empty string when no arguments were given.
learnargs="$*"
| |
# Everything below is invoked relative to the "masses" directory of the
# SpamAssassin tree.  Stop if SADIR is unset or the directory cannot be
# entered: carrying on would run the copies and every later relative
# command from the wrong place.
if [ -z "$SADIR" ] ; then
  echo "SADIR must be set to the top-level SpamAssassin directory" >&2
  exit 1
fi
cd "$SADIR/masses" || exit 1

results=$testdir/results
tmpdir=$results/config

# start from a clean slate ($tmpdir lives inside $results)
rm -rf "$results" "$tmpdir"

# now, just copy in the Bayes ruleset
mkdir -p "$results" "$tmpdir/rules" || exit 1
cp ../rules/20_aux_tlds.cf "$tmpdir/rules"
cp ../rules/23_bayes.cf "$tmpdir/rules"
cp ../rules/50_scores.cf "$tmpdir/rules"
cp ../rules/*.pre "$tmpdir/rules" # ensure we have plugins
# any .cf files found in the test directory are copied too; errors are
# discarded, so having none is not a problem
cp "$testdir"/*.cf "$tmpdir/rules" 2>/dev/null
| |
# tell SpamAssassin to use this path for DBs
# TODO: for tests of these settings, read from a test-specific file
#
# A here-document is used instead of a multi-line "echo" so that the text
# is written exactly as shown (no echo-specific backslash handling); only
# $tmpdir is expanded.  The blank lines reproduce the previous file layout.
cat > "$tmpdir/rules/30bayes_path.cf" <<EOF


bayes_path $tmpdir/dbs/bayes
bayes_auto_learn 0
bayes_min_ham_num 10
bayes_min_spam_num 10
bayes_store_module Mail::SpamAssassin::BayesStore::SDBM
lock_method flock


EOF

# directory that holds the Bayes DB files named by bayes_path above
mkdir "$tmpdir/dbs"
| |
# Mode switches: "1" turns a mode on, any other value leaves it off.
# Each one may be preset in the environment; all of them default to 0.

# 1: split each fold's ham and spam test mboxes into ten pieces and
#    mass-check them alternately instead of all ham followed by all spam.
INTERLEAVE_TESTS=${INTERLEAVE_TESTS:-0}

# 1: test each fold on its own bucket and learn from the other nine.
# 0: learn from the fold's own bucket and test on the other nine.
TEST_AGAINST_10PC=${TEST_AGAINST_10PC:-0}

# 1: learn the whole corpus once up front, then make each fold forget its
#    test messages, instead of learning from scratch in every fold.
LEARN_ALL_THEN_FORGET_TEST_SET=${LEARN_ALL_THEN_FORGET_TEST_SET:-0}
| |
# backup_dbs - archive the current Bayes DB directory.
# Globals:  tmpdir (read) - directory that contains the "dbs" directory
# Outputs:  a progress line and tar's verbose listing on stdout;
#           writes $tmpdir/learned-all.tar
# Returns:  non-zero if $tmpdir cannot be entered or tar fails
backup_dbs () {
  echo "Backing up full learned DBs..."
  # "&&", not ";": tar must never run from some other directory when the
  # cd fails.  The subshell keeps the caller's working directory unchanged.
  ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
}
# restore_dbs - replace the Bayes DB directory with the archived copy.
# Globals:  tmpdir (read) - directory holding "dbs" and learned-all.tar
# Outputs:  a progress line on stdout
# Returns:  non-zero if $tmpdir cannot be entered or a step fails
restore_dbs () {
  echo "Restoring full learned DBs..."
  # Chain with "&&": if the cd fails, "rm -rf dbs" must not be run in
  # whatever directory the caller happens to be in.
  ( cd "$tmpdir" && rm -rf dbs && tar xf learned-all.tar )
}
# runcmd - print a command line, then run it under "time".
# Arguments: the command and its arguments
# Outputs:   the command line (arguments joined by spaces) on stdout,
#            followed by whatever the command itself prints
# Returns:   the exit status of the timed command
runcmd () {
  printf '%s\n' "$*"
  # "$@" passes every argument through unchanged; an unquoted $* would
  # split arguments containing whitespace and expand glob characters.
  time "$@"
}
| |
# Optional up-front training pass, followed by a snapshot of the Bayes DBs
# that later folds are restored from.
if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then

  # learn the lot, then forget the ones we're testing on each time.
  # faster than learning from scratch for each fold

  # note: we use randseed=1 so that every run will always pick the
  # same messages if --learnprob is used.

  # $learnargs stays unquoted on purpose: it may hold several options.
  # printf is used for the prompts because "echo -n" is not portable sh.
  (
    printf '%s' "Learning from all ham buckets..." ; date
    runcmd ../sa-learn --ham --randseed=1 --no-sync $learnargs \
      --showdots --mbox "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules" "$testdir"/cor/ham/*

    printf '%s' "Learning from all spam buckets..." ; date
    runcmd ../sa-learn --spam --randseed=1 --no-sync $learnargs \
      --showdots --mbox "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules" "$testdir"/cor/spam/*

    runcmd ../sa-learn --sync $learnargs "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules"

    printf '%s' "Done learning. " ; date
  ) 2>&1 | tee "$results/learn.log"

  echo "Dumping bayes DB..."
  ../sa-learn --dump "--dbpath=$tmpdir/dbs/bayes" > "$results/bayes_db.dump"

fi

# Snapshot the DB directory as it is now (fully trained when the pass above
# ran, otherwise still untrained).
backup_dbs
| |
# Run the ten folds.  For each fold: build the learn and test mboxes from
# the corpus buckets, train (or un-train) the Bayes DB, mass-check the test
# mboxes and write the per-fold reports into $results/bucketN.  Everything
# the subshell prints is also captured in $results/test.log.
(

  printf '%s' "Starting test..." ; date
  for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
    printf '%s' "Bucket $bucket..." ; date

    # every fold after the first starts again from the saved DB snapshot
    if [ "$bucket" != 1 ] ; then restore_dbs ; fi

    rdir=$results/bucket$bucket
    mkdir "$rdir"

    : > "$rdir/hbucketlearn"
    : > "$rdir/sbucketlearn"
    : > "$rdir/hbuckettest"
    : > "$rdir/sbuckettest"
    for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
      # Decide whether this corpus bucket is learnt from or tested on:
      #   TEST_AGAINST_10PC=1: test on the fold's own bucket only
      #   TEST_AGAINST_10PC=0: test on every bucket except the fold's own
      role=learn
      [ "$TEST_AGAINST_10PC" = 1 ] && [ "$subbucket" = "$bucket" ] && role=test
      [ "$TEST_AGAINST_10PC" = 0 ] && [ "$subbucket" != "$bucket" ] && role=test

      if [ "$role" = learn ] ; then
        echo "Using bucket for learn: $subbucket ..."
        cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbucketlearn"
        cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbucketlearn"
      else
        echo "Using bucket for test: $subbucket ..."
        cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbuckettest"
        cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbuckettest"
      fi
    done

    if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
      echo "Forgetting contents of test ham bucket..."
      runcmd ../sa-learn --forget "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules" --showdots \
        --mbox "$rdir/hbuckettest"

      echo "Forgetting contents of test spam bucket..."
      runcmd ../sa-learn --forget "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules" --showdots \
        --mbox "$rdir/sbuckettest"

    else
      # $learnargs stays unquoted on purpose: it may hold several options
      echo "Learning contents of learn ham bucket..."
      runcmd ../sa-learn --ham --randseed=1 --no-sync $learnargs \
        --showdots --mbox "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules" "$rdir/hbucketlearn"

      echo "Learning contents of learn spam bucket..."
      runcmd ../sa-learn --spam --randseed=1 --no-sync $learnargs \
        --showdots --mbox "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules" "$rdir/sbucketlearn"

      runcmd ../sa-learn --sync $learnargs "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules"

      echo "Dumping bayes DB..."
      ../sa-learn --dump "--dbpath=$tmpdir/dbs/bayes" > "$rdir/bayes_db.dump"
    fi

    runcmd ../sa-learn --sync "--siteconfigpath=$tmpdir/rules" "--config-file=$tmpdir/rules"

    # take a copy of the trained Bayes DBs ("&&" so that tar never runs
    # from another directory if the cd fails)
    ( cd "$tmpdir" && tar cf dbs.tar dbs )

    if [ "$INTERLEAVE_TESTS" = 1 ] ; then
      # now split the ham and spam test bucket into 10 sub-buckets,
      # so we interleave ham and spam while testing. important for
      # judging expiry effects
      : > "$rdir/nonspam.log"
      : > "$rdir/spam.log"

      mkdir "$rdir/testbuckets"
      ../tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
        "$rdir/hbuckettest"
      ../tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
        "$rdir/sbuckettest"

      for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
        echo "Running mass-check on ham test-bucket $subbucket..."
        time ./mass-check "-j=$numcpus" "-c=$tmpdir/rules" "-p=$tmpdir/rules" --showdots \
          --bayes --mbox "$rdir/testbuckets/ham.$subbucket" \
          >> "$rdir/nonspam.log"

        echo "Running mass-check on spam test-bucket $subbucket..."
        time ./mass-check "-j=$numcpus" "-c=$tmpdir/rules" "-p=$tmpdir/rules" --showdots \
          --bayes --mbox "$rdir/testbuckets/spam.$subbucket" \
          >> "$rdir/spam.log"
      done

    else
      # NOTE(review): runcmd prints the command line on stdout, and stdout
      # is redirected here, so that line becomes the first line of each
      # log file -- confirm the report scripts are happy with that.
      echo "Running mass-check on ham bucket..."
      runcmd ./mass-check "-j=$numcpus" "-c=$tmpdir/rules" "-p=$tmpdir/rules" --showdots \
        --bayes --mbox "$rdir/hbuckettest" \
        > "$rdir/nonspam.log"

      echo "Running mass-check on spam bucket..."
      runcmd ./mass-check "-j=$numcpus" "-c=$tmpdir/rules" "-p=$tmpdir/rules" --showdots \
        --bayes --mbox "$rdir/sbuckettest" \
        > "$rdir/spam.log"
    fi

    echo "Reporting..."
    ./bayes-testing/draw-bayes-histogram \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/hist"

    ./bayes-testing/bayes-thresholds \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/thresholds"

    ./bayes-testing/bayes-static-thresholds \
      "$rdir/spam.log" "$rdir/nonspam.log" \
      > "$rdir/thresholds.static"

    # remove these, they're too big.
    rm -f "$rdir/hbucketlearn" "$rdir/sbucketlearn"

    # but keep these to find FPs/FNs later (compressed with lz4, or with
    # gzip when the lz4 command fails)
    lz4 --rm -m "$rdir/hbuckettest" "$rdir/sbuckettest" || \
      gzip "$rdir/hbuckettest" "$rdir/sbuckettest"

  done
  printf '%s' "Done test..." ; date

) 2>&1 | tee "$results/test.log"
| |
# Merge the per-fold mass-check logs, then run the three report scripts
# over the combined logs.  The variable part of each path is quoted so a
# results directory containing whitespace still works; the bucket* glob is
# left outside the quotes so that it still expands.
cat "$results"/bucket*/spam.log > "$results/spam_all.log"
cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

./bayes-testing/draw-bayes-histogram \
  "$results/spam_all.log" "$results/nonspam_all.log" \
  > "$results/hist_all"
./bayes-testing/bayes-thresholds \
  "$results/spam_all.log" "$results/nonspam_all.log" \
  > "$results/thresholds_all"
./bayes-testing/bayes-static-thresholds \
  "$results/spam_all.log" "$results/nonspam_all.log" \
  > "$results/thresholds_all.static"

echo "Done."
ls -l "$results"
| |