masses/bayes-testing/bayes-10pcv-driver - spamassassin - Git at Google

 #!/bin/sh
 #
 # bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
 #
 # Since Bayesish probability analysis requires training on a corpus, the
 # traditional SpamAssassin 10-pass cross-validation suite can't be used.  Also,
 # Bayes requires its own ten-pass testing, separately, to judge the effects of
 # tweaks.  So that's what this is.
 #
 # Before running, you need to create a test corpus, as "cor/spam" and
 # "cor/ham".  Here's how to do this:

 #   cd TEST
 #   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
 #   SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
 #
 # SADIR = top-level directory of SpamAssassin distro
 # TEST  = the directory where the corpus and results are to be written
 # spfN   = mail folders full of spam
 # hamN   = mail folders full of ham

 # It will produce a directory of results called "results".  The most important
 # are "hist_all": a histogram of scores and frequencies, and "thresholds_all":
 # the output of analysis of all scores and frequencies from the
 # bayes-thresholds script.

 # NOTE: by default you will need *AT LEAST* 2000 of either type to use
 # this, since bayes will not be activated without 200 messages in the db,
 # and each fold is run using 10% of the corpus -- and 2000/10 = 200.

 ###########################################################################

 # Directory the script was started from: the corpus (cor/ham, cor/spam) is
 # read from here and the "results" directory is created here.
 testdir=$(pwd)

 # Any command-line arguments are handed on to the learning "sa-learn" runs.
 # They are kept as one string and expanded unquoted later on, so that they
 # split back into separate words.
 learnargs=
 if [ "$#" -gt 0 ] ; then
   learnargs="$*"
 fi

 # SADIR must point at the top of the SpamAssassin tree.  Stop right away if
 # it is unset or the directory cannot be entered; carrying on would run every
 # relative path below (../rules, ./mass-check, ...) from the wrong place.
 : "${SADIR:?SADIR must be set to the top-level SpamAssassin directory}"
 cd "$SADIR/masses" || exit 1
 PATH=$SADIR:$SADIR/masses:$PATH

 results=$testdir/results
 tmpdir=$results/config

 # start from a clean slate; quoted so a path containing whitespace can never
 # make "rm -rf" act on some other directory
 rm -rf "$results" "$tmpdir"

 # now, just copy in the Bayes ruleset
 mkdir -p "$results" "$tmpdir/rules"
 cp ../rules/23_*.cf "$tmpdir/rules"
 cp ../rules/50*.cf "$tmpdir/rules"
 cp ../rules/*.pre "$tmpdir/rules"         # ensure we have plugins

 # tell SpamAssassin to use this path for DBs
 # TODO: for tests of these settings, read from a test-specific file
 echo "

 bayes_path                $tmpdir/dbs/bayes
 bayes_auto_learn          0
 bayes_min_ham_num         10
 bayes_min_spam_num        10
 bayes_store_module Mail::SpamAssassin::BayesStore::SDBM

 " > "$tmpdir/rules/30bayes_path.cf"
 mkdir "$tmpdir/dbs"

 # Test-mode switches (0 = off, 1 = on), read by the code further down:
 #   INTERLEAVE_TESTS               - split each test set into 10 parts and
 #                                    alternate ham/spam mass-check runs
 #   TEST_AGAINST_10PC              - 1: test on the fold's own bucket and learn
 #                                    from the others; 0: the other way round
 #   LEARN_ALL_THEN_FORGET_TEST_SET - learn the whole corpus once, then
 #                                    "forget" each fold's test messages
 INTERLEAVE_TESTS=0
 TEST_AGAINST_10PC=0
 LEARN_ALL_THEN_FORGET_TEST_SET=0

 # Archive the Bayes DB directory ("dbs" under $tmpdir) as
 # $tmpdir/learned-all.tar, so that restore_dbs can bring it back later.
 # Globals: tmpdir (read)
 # Outputs: a progress line plus tar's verbose file listing on stdout
 # Returns: non-zero if $tmpdir cannot be entered or tar fails
 backup_dbs () {
   echo "Backing up full learned DBs..."
   # "&&" rather than ";": never run tar from the wrong directory
   ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
 }
 # Replace the Bayes DB directory ("dbs" under $tmpdir) with the copy stored
 # in $tmpdir/learned-all.tar.
 # Globals: tmpdir (read)
 # Outputs: a progress line on stdout
 # Returns: non-zero if $tmpdir cannot be entered or any step fails
 restore_dbs () {
   echo "Restoring full learned DBs..."
   # Chain with "&&": if the cd fails, "rm -rf dbs" must not run against
   # whatever directory we happen to be in.
   ( cd "$tmpdir" && rm -rf dbs && tar xf learned-all.tar )
 }
 # Print a command line on stdout, then run that command under "time".
 # Arguments: the command followed by its arguments
 # Returns:   the exit status of the command
 runcmd () {
   printf '%s\n' "$*"
   # "$@" hands every argument on exactly as the caller passed it; an unquoted
   # $* would split on whitespace a second time and re-expand glob characters.
   time "$@"
 }

 if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then

   # learn the lot, then forget the ones we're testing on each time.
   # faster than learning from scratch for each fold

   # note: we use randseed=1 so that every run will always pick the
   # same messages if --learnprob is used.

   # Everything in this subshell (stdout and stderr) is shown on the terminal
   # and also saved to learn.log.
   (
   printf '%s' "Learning from all ham buckets..." ; date
   # $learnargs is left unquoted on purpose: it holds several options in one
   # string and has to split into separate words.
   runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
       --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/ham/*

   printf '%s' "Learning from all spam buckets..." ; date
   runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
       --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/spam/*

   runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"

   printf '%s' "Done learning. " ; date
   ) 2>&1 | tee "$results/learn.log"

   echo "Dumping bayes DB..."
   # check_bayes_db is started from the top of the tree (one level above
   # masses); skip it if that directory cannot be entered
   ( cd .. && tools/check_bayes_db --dbpath="$tmpdir/dbs/bayes" ) \
       > "$results/bayes_db.dump"

 fi

 # Snapshot the DB directory as it stands now; every fold after the first is
 # reset to this snapshot by restore_dbs.
 backup_dbs

 (

 # Ten folds.  Everything printed in this subshell (stdout and stderr) is
 # shown on the terminal and also saved to test.log.
 printf '%s' "Starting test..." ; date
 for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
   printf '%s' "Bucket $bucket..." ; date

   # every fold after the first starts again from the saved DB snapshot
   if [ "$bucket" != 1 ] ; then restore_dbs ; fi

   rdir=$results/bucket$bucket
   mkdir "$rdir"

   # start with empty learn/test mboxes for ham (h...) and spam (s...)
   : > "$rdir/hbucketlearn"
   : > "$rdir/sbucketlearn"
   : > "$rdir/hbuckettest"
   : > "$rdir/sbuckettest"
   for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
     # Decide whether this corpus bucket is learnt from (l) or tested on (t):
     #   TEST_AGAINST_10PC=1: test on the fold's own bucket, learn the rest
     #   TEST_AGAINST_10PC=0: learn the fold's own bucket, test on the rest
     type=l
     [ "$TEST_AGAINST_10PC" = 1 ] && [ "$subbucket" = "$bucket" ] && type=t
     [ "$TEST_AGAINST_10PC" = 0 ] && [ "$subbucket" != "$bucket" ] && type=t

     if [ "$type" = l ] ; then
       echo "Using bucket for learn: $subbucket ..."
       cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbucketlearn"
       cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbucketlearn"
     else
       echo "Using bucket for test: $subbucket ..."
       cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbuckettest"
       cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbuckettest"
     fi
   done

   if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
     # the DB already holds the whole corpus: remove this fold's test messages
     echo "Forgetting contents of test ham bucket..."
     runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
         --mbox "$rdir/hbuckettest"

     echo "Forgetting contents of test spam bucket..."
     runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
         --mbox "$rdir/sbuckettest"

   else
     # $learnargs is left unquoted on purpose so it splits into separate words
     echo "Learning contents of learn ham bucket..."
     runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
         --showdots --mbox --config-file="$tmpdir/rules" "$rdir/hbucketlearn"

     echo "Learning contents of learn spam bucket..."
     runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
         --showdots --mbox --config-file="$tmpdir/rules" "$rdir/sbucketlearn"

     runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"

     echo "Dumping bayes DB..."
     ( cd .. && sa-learn --dump --dbpath="$tmpdir/dbs/bayes" ) \
         > "$rdir/bayes_db.dump"
   fi

   runcmd sa-learn --sync --config-file="$tmpdir/rules"

   # take a copy of the trained Bayes DBs, gzipped
   ( cd "$tmpdir" && tar cf - dbs | gzip -c > "$rdir/dbs.tgz" )

   if [ "$INTERLEAVE_TESTS" = 1 ] ; then
     # now split the ham and spam test bucket into 10 sub-buckets,
     # so we interleave ham and spam while testing. important for
     # judging expiry effects
     : > "$rdir/nonspam.log"
     : > "$rdir/spam.log"

     mkdir "$rdir/testbuckets"
     (
       # split_corpora is started from the top of the tree
       cd .. || exit 1
       tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
           "$rdir/hbuckettest"
       tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
           "$rdir/sbuckettest"
     )

     for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
       echo "Running mass-check on ham test-bucket $subbucket..."
       time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
           --bayes --mbox "$rdir/testbuckets/ham.$subbucket" \
           >> "$rdir/nonspam.log"

       echo "Running mass-check on spam test-bucket $subbucket..."
       time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
           --bayes --mbox "$rdir/testbuckets/spam.$subbucket" \
           >> "$rdir/spam.log"
     done

   else
     # NOTE(review): runcmd prints the command line on stdout, and stdout is
     # redirected here, so that line becomes the first line of the log file.
     # Left as is -- confirm the report scripts are happy with it.
     echo "Running mass-check on ham bucket..."
     runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
         --bayes --mbox "$rdir/hbuckettest" \
         > "$rdir/nonspam.log"

     echo "Running mass-check on spam bucket..."
     runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
         --bayes --mbox "$rdir/sbuckettest" \
         > "$rdir/spam.log"
   fi

   # per-fold reports, each built from this fold's spam and nonspam logs
   echo "Reporting..."
   ./bayes-testing/draw-bayes-histogram \
       "$rdir/spam.log" "$rdir/nonspam.log" \
       > "$rdir/hist"

   ./bayes-testing/bayes-thresholds \
       "$rdir/spam.log" "$rdir/nonspam.log" \
       > "$rdir/thresholds"

   ./bayes-testing/bayes-static-thresholds \
       "$rdir/spam.log" "$rdir/nonspam.log" \
       > "$rdir/thresholds.static"

   # remove these, they're too big.
   rm -f "$rdir/hbucketlearn" "$rdir/sbucketlearn"

   # but keep these to find FPs/FNs later
   gzip "$rdir/hbuckettest" "$rdir/sbuckettest"

 done
 printf '%s' "Done test..." ; date

 ) 2>&1 | tee "$results/test.log"

 # Join the per-fold mass-check logs and build the overall reports from them.
 # $results is quoted so a path with whitespace stays one word; the bucket*
 # glob is kept outside the quotes so it still expands.
 cat "$results"/bucket*/spam.log > "$results/spam_all.log"
 cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

 ./bayes-testing/draw-bayes-histogram \
     "$results/spam_all.log" "$results/nonspam_all.log" \
     > "$results/hist_all"
 ./bayes-testing/bayes-thresholds \
     "$results/spam_all.log" "$results/nonspam_all.log" \
     > "$results/thresholds_all"
 ./bayes-testing/bayes-static-thresholds \
     "$results/spam_all.log" "$results/nonspam_all.log" \
     > "$results/thresholds_all.static"

 echo "Done."
 ls -l "$results"
	#!/bin/sh
	#
	# bayes-10pcv-driver - run 10-fold cross-validation test on SpamAssassin Bayes
	#
	# Since Bayesish probability analysis requires training on a corpus, the
	# traditional SpamAssassin 10-pass cross-validation suite can't be used. Also,
	# Bayes requires its own ten-pass testing, separately, to judge the effects of
	# tweaks. So that's what this is.
	#
	# Before running, you need to create a test corpus, as "cor/spam" and
	# "cor/ham". Here's how to do this:

	# cd TEST
	# SADIR/tools/split_corpora -n 10 -l 2000 -p cor/spam/bucket spf1 spf2 spf3 ...
	# SADIR/tools/split_corpora -n 10 -l 2000 -p cor/ham/bucket ham1 ham2 ham3 ...
	#
	# SADIR = top-level directory of SpamAssassin distro
	# TEST = the directory where the corpus and results are to be written
	# spfN = mail folders full of spam
	# hamN = mail folders full of ham

	# It will produce a directory of results called "results". The most important
	# are "hist_all": a histogram of scores and frequencies, and "thresholds_all":
	# the output of analysis of all scores and frequencies from the
	# bayes-thresholds script.

	# NOTE: by default you will need AT LEAST 2000 of either type to use
	# this, since bayes will not be activated without 200 messages in the db,
	# and each fold is run using 10% of the corpus -- and 2000/10 = 200.

	###########################################################################

	# Directory the script was started from: the corpus (cor/ham, cor/spam) is
	# read from here and the "results" directory is created here.
	testdir=$(pwd)

	# Any command-line arguments are handed on to the learning "sa-learn" runs.
	# They are kept as one string and expanded unquoted later on, so that they
	# split back into separate words.
	learnargs=
	if [ "$#" -gt 0 ] ; then
	  learnargs="$*"
	fi

	# SADIR must point at the top of the SpamAssassin tree.  Stop right away if
	# it is unset or the directory cannot be entered; carrying on would run every
	# relative path below (../rules, ./mass-check, ...) from the wrong place.
	: "${SADIR:?SADIR must be set to the top-level SpamAssassin directory}"
	cd "$SADIR/masses" || exit 1
	PATH=$SADIR:$SADIR/masses:$PATH

	results=$testdir/results
	tmpdir=$results/config

	# start from a clean slate; quoted so a path containing whitespace can never
	# make "rm -rf" act on some other directory
	rm -rf "$results" "$tmpdir"

	# now, just copy in the Bayes ruleset
	mkdir -p "$results" "$tmpdir/rules"
	cp ../rules/23_*.cf "$tmpdir/rules"
	cp ../rules/50*.cf "$tmpdir/rules"
	cp ../rules/*.pre "$tmpdir/rules" # ensure we have plugins

	# tell SpamAssassin to use this path for DBs
	# TODO: for tests of these settings, read from a test-specific file
	echo "

	bayes_path $tmpdir/dbs/bayes
	bayes_auto_learn 0
	bayes_min_ham_num 10
	bayes_min_spam_num 10
	bayes_store_module Mail::SpamAssassin::BayesStore::SDBM

	" > "$tmpdir/rules/30bayes_path.cf"
	mkdir "$tmpdir/dbs"

	# Test-mode switches (0 = off, 1 = on), read by the code further down:
	#   INTERLEAVE_TESTS               - split each test set into 10 parts and
	#                                    alternate ham/spam mass-check runs
	#   TEST_AGAINST_10PC              - 1: test on the fold's own bucket and learn
	#                                    from the others; 0: the other way round
	#   LEARN_ALL_THEN_FORGET_TEST_SET - learn the whole corpus once, then
	#                                    "forget" each fold's test messages
	INTERLEAVE_TESTS=0
	TEST_AGAINST_10PC=0
	LEARN_ALL_THEN_FORGET_TEST_SET=0

	# Archive the Bayes DB directory ("dbs" under $tmpdir) as
	# $tmpdir/learned-all.tar, so that restore_dbs can bring it back later.
	# Globals: tmpdir (read)
	# Outputs: a progress line plus tar's verbose file listing on stdout
	# Returns: non-zero if $tmpdir cannot be entered or tar fails
	backup_dbs () {
	  echo "Backing up full learned DBs..."
	  # "&&" rather than ";": never run tar from the wrong directory
	  ( cd "$tmpdir" && tar cvf learned-all.tar dbs )
	}
	# Replace the Bayes DB directory ("dbs" under $tmpdir) with the copy stored
	# in $tmpdir/learned-all.tar.
	# Globals: tmpdir (read)
	# Outputs: a progress line on stdout
	# Returns: non-zero if $tmpdir cannot be entered or any step fails
	restore_dbs () {
	  echo "Restoring full learned DBs..."
	  # Chain with "&&": if the cd fails, "rm -rf dbs" must not run against
	  # whatever directory we happen to be in.
	  ( cd "$tmpdir" && rm -rf dbs && tar xf learned-all.tar )
	}
	# Print a command line on stdout, then run that command under "time".
	# Arguments: the command followed by its arguments
	# Returns:   the exit status of the command
	runcmd () {
	  printf '%s\n' "$*"
	  # "$@" hands every argument on exactly as the caller passed it; an unquoted
	  # $* would split on whitespace a second time and re-expand glob characters.
	  time "$@"
	}

	if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then

	  # learn the lot, then forget the ones we're testing on each time.
	  # faster than learning from scratch for each fold

	  # note: we use randseed=1 so that every run will always pick the
	  # same messages if --learnprob is used.

	  # Everything in this subshell (stdout and stderr) is shown on the terminal
	  # and also saved to learn.log.  The pipe must be a plain "|": a
	  # backslash-escaped one is an ordinary word and a syntax error here.
	  (
	  printf '%s' "Learning from all ham buckets..." ; date
	  # $learnargs is left unquoted on purpose: it holds several options in one
	  # string and has to split into separate words.
	  runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
	      --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/ham/*

	  printf '%s' "Learning from all spam buckets..." ; date
	  runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
	      --showdots --mbox --config-file="$tmpdir/rules" "$testdir"/cor/spam/*

	  runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"

	  printf '%s' "Done learning. " ; date
	  ) 2>&1 | tee "$results/learn.log"

	  echo "Dumping bayes DB..."
	  # check_bayes_db is started from the top of the tree (one level above
	  # masses); skip it if that directory cannot be entered
	  ( cd .. && tools/check_bayes_db --dbpath="$tmpdir/dbs/bayes" ) \
	      > "$results/bayes_db.dump"

	fi

	# Snapshot the DB directory as it stands now; every fold after the first is
	# reset to this snapshot by restore_dbs.
	backup_dbs

	(

	# Ten folds.  Everything printed in this subshell (stdout and stderr) is
	# shown on the terminal and also saved to test.log.
	printf '%s' "Starting test..." ; date
	for bucket in 1 2 3 4 5 6 7 8 9 10 ; do
	  printf '%s' "Bucket $bucket..." ; date

	  # every fold after the first starts again from the saved DB snapshot
	  if [ "$bucket" != 1 ] ; then restore_dbs ; fi

	  rdir=$results/bucket$bucket
	  mkdir "$rdir"

	  # start with empty learn/test mboxes for ham (h...) and spam (s...)
	  : > "$rdir/hbucketlearn"
	  : > "$rdir/sbucketlearn"
	  : > "$rdir/hbuckettest"
	  : > "$rdir/sbuckettest"
	  for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
	    # Decide whether this corpus bucket is learnt from (l) or tested on (t):
	    #   TEST_AGAINST_10PC=1: test on the fold's own bucket, learn the rest
	    #   TEST_AGAINST_10PC=0: learn the fold's own bucket, test on the rest
	    type=l
	    [ "$TEST_AGAINST_10PC" = 1 ] && [ "$subbucket" = "$bucket" ] && type=t
	    [ "$TEST_AGAINST_10PC" = 0 ] && [ "$subbucket" != "$bucket" ] && type=t

	    if [ "$type" = l ] ; then
	      echo "Using bucket for learn: $subbucket ..."
	      cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbucketlearn"
	      cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbucketlearn"
	    else
	      echo "Using bucket for test: $subbucket ..."
	      cat "$testdir/cor/ham/bucket.$subbucket" >> "$rdir/hbuckettest"
	      cat "$testdir/cor/spam/bucket.$subbucket" >> "$rdir/sbuckettest"
	    fi
	  done

	  if [ "$LEARN_ALL_THEN_FORGET_TEST_SET" = 1 ] ; then
	    # the DB already holds the whole corpus: remove this fold's test messages
	    echo "Forgetting contents of test ham bucket..."
	    runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
	        --mbox "$rdir/hbuckettest"

	    echo "Forgetting contents of test spam bucket..."
	    runcmd sa-learn --forget --config-file="$tmpdir/rules" --showdots \
	        --mbox "$rdir/sbuckettest"

	  else
	    # $learnargs is left unquoted on purpose so it splits into separate words
	    echo "Learning contents of learn ham bucket..."
	    runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
	        --showdots --mbox --config-file="$tmpdir/rules" "$rdir/hbucketlearn"

	    echo "Learning contents of learn spam bucket..."
	    runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
	        --showdots --mbox --config-file="$tmpdir/rules" "$rdir/sbucketlearn"

	    runcmd sa-learn --sync $learnargs --config-file="$tmpdir/rules"

	    echo "Dumping bayes DB..."
	    ( cd .. && sa-learn --dump --dbpath="$tmpdir/dbs/bayes" ) \
	        > "$rdir/bayes_db.dump"
	  fi

	  runcmd sa-learn --sync --config-file="$tmpdir/rules"

	  # take a copy of the trained Bayes DBs, gzipped (a real pipe: a
	  # backslash-escaped "|" would be passed to tar as a file name)
	  ( cd "$tmpdir" && tar cf - dbs | gzip -c > "$rdir/dbs.tgz" )

	  if [ "$INTERLEAVE_TESTS" = 1 ] ; then
	    # now split the ham and spam test bucket into 10 sub-buckets,
	    # so we interleave ham and spam while testing. important for
	    # judging expiry effects
	    : > "$rdir/nonspam.log"
	    : > "$rdir/spam.log"

	    mkdir "$rdir/testbuckets"
	    (
	      # split_corpora is started from the top of the tree
	      cd .. || exit 1
	      tools/split_corpora -n 10 -p "$rdir/testbuckets/ham" \
	          "$rdir/hbuckettest"
	      tools/split_corpora -n 10 -p "$rdir/testbuckets/spam" \
	          "$rdir/sbuckettest"
	    )

	    for subbucket in 1 2 3 4 5 6 7 8 9 10 ; do
	      echo "Running mass-check on ham test-bucket $subbucket..."
	      time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
	          --bayes --mbox "$rdir/testbuckets/ham.$subbucket" \
	          >> "$rdir/nonspam.log"

	      echo "Running mass-check on spam test-bucket $subbucket..."
	      time ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
	          --bayes --mbox "$rdir/testbuckets/spam.$subbucket" \
	          >> "$rdir/spam.log"
	    done

	  else
	    # NOTE(review): runcmd prints the command line on stdout, and stdout is
	    # redirected here, so that line becomes the first line of the log file.
	    # Left as is -- confirm the report scripts are happy with it.
	    echo "Running mass-check on ham bucket..."
	    runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
	        --bayes --mbox "$rdir/hbuckettest" \
	        > "$rdir/nonspam.log"

	    echo "Running mass-check on spam bucket..."
	    runcmd ./mass-check -c="$tmpdir/rules" -p="$tmpdir/rules" --showdots \
	        --bayes --mbox "$rdir/sbuckettest" \
	        > "$rdir/spam.log"
	  fi

	  # per-fold reports, each built from this fold's spam and nonspam logs
	  echo "Reporting..."
	  ./bayes-testing/draw-bayes-histogram \
	      "$rdir/spam.log" "$rdir/nonspam.log" \
	      > "$rdir/hist"

	  ./bayes-testing/bayes-thresholds \
	      "$rdir/spam.log" "$rdir/nonspam.log" \
	      > "$rdir/thresholds"

	  ./bayes-testing/bayes-static-thresholds \
	      "$rdir/spam.log" "$rdir/nonspam.log" \
	      > "$rdir/thresholds.static"

	  # remove these, they're too big.
	  rm -f "$rdir/hbucketlearn" "$rdir/sbucketlearn"

	  # but keep these to find FPs/FNs later
	  gzip "$rdir/hbuckettest" "$rdir/sbuckettest"

	done
	printf '%s' "Done test..." ; date

	) 2>&1 | tee "$results/test.log"

	# Join the per-fold mass-check logs and build the overall reports from them.
	# $results is quoted so a path with whitespace stays one word; the bucket*
	# glob is kept outside the quotes so it still expands.
	cat "$results"/bucket*/spam.log > "$results/spam_all.log"
	cat "$results"/bucket*/nonspam.log > "$results/nonspam_all.log"

	./bayes-testing/draw-bayes-histogram \
	    "$results/spam_all.log" "$results/nonspam_all.log" \
	    > "$results/hist_all"
	./bayes-testing/bayes-thresholds \
	    "$results/spam_all.log" "$results/nonspam_all.log" \
	    > "$results/thresholds_all"
	./bayes-testing/bayes-static-thresholds \
	    "$results/spam_all.log" "$results/nonspam_all.log" \
	    > "$results/thresholds_all.static"

	echo "Done."
	ls -l "$results"