#!/bin/sh
#
# Driver for a score-generation ("gen") run of the GA rescorer.
#
# Usage (run from the directory containing ./config):
#   <this script>            initial rescoring run
#   <this script> <any-arg>  statistics generation, reusing the logs saved
#                            by an earlier rescoring run
#
# Variables read by this script (SCORESET must be set; the others are
# presumably provided by ./config or the environment -- verify there):
#   SCORESET, HAM_PREFERENCE, THRESHOLD, EPOCHS   used to name the log dir
#   NOTE         optional suffix appended to the log directory name
#   LEARN_RATE   defaulted to 2.0 below; no later use is visible in this script
#
# TODO: add FPRATE instead of HAM_PREFERENCE

# set SCORESET
# must use a / in the arg to a 'source' command to avoid searching the PATH
. ./config

LEARN_RATE="${LEARN_RATE:-2.0}"
NAME="set$SCORESET"

# All output of a run is collected under this directory.
# TODO: add $FPRATE instead of HAM_PREFERENCE
LOGDIR="gen-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS-ga"

# ensure sandbox T_ rules aren't used in the GA and don't appear in output
KILL_SANDBOX_RULES=y

###########################################################################

[ -d gen-cache ] || mkdir gen-cache # a cache, woo

# An optional NOTE distinguishes runs that share all other settings.
if [ -n "$NOTE" ]; then
  LOGDIR="$LOGDIR-$NOTE"
fi
# Mode selection: with no (or an empty) first argument do the initial
# rescoring run; with any other first argument generate the statistics
# from the logs saved by a previous rescoring run.
if [ -z "$1" ]; then
  # -------------------------------------------------------------------------
  # Initial rescoring

  if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
    echo "Couldn't find logs for $NAME" >&2
    exit 1
  fi

  # Create a directory to organize the logs with this group of settings.
  # This has to happen before the pipeline below is started: tee opens
  # "$LOGDIR/log" as soon as the pipeline starts, possibly before the
  # left-hand side would have had a chance to create the directory.
  mkdir -p "$LOGDIR" "$LOGDIR/NSBASE" "$LOGDIR/SPBASE"
  if ! [ -d "$LOGDIR" ]; then
    echo "Failed to mkdir $LOGDIR, dying" 1>&2
    exit 1
  fi

  # The exit status of a pipeline is that of its last command (tee), so
  # the status of the run itself is handed over through this file.
  status_file="$LOGDIR/.run-status"
  rm -f "$status_file"

  {
    ( # log this (stdout only; stderr is not part of the pipe)
      # Trace commands.  NOTE: the trace is written to stderr, so it is
      # shown on the terminal but does not end up in $LOGDIR/log.
      set -x

      # This should be in here instead. Prevents testing.
      # svn revert ../rules/50_scores.cf

      # Work on a private copy of the rules.
      rm -rf tmprules
      cp -r ../rules tmprules
      cp tmprules/50_scores.cf orig_scores.cf

      # fix all scores to non-zero (avoid a possible bug, not quite sure)
      ./enable-all-evolved-rules < tmprules/50_scores.cf \
        > tmprules/50_scores.cf.new || exit 1
      mv tmprules/50_scores.cf.new tmprules/50_scores.cf

      # -f: a rules tree without a sandbox file is not an error.
      if [ "$KILL_SANDBOX_RULES" = y ]; then
        rm -f tmprules/70_sandbox.cf
      fi

      echo "[Doing a scoreset $SCORESET score-generation run]"

      # Clean out old runs
      echo "[Cleaning up]"
      rm -rf spam-test.log ham-test.log spam.log ham.log \
        NSBASE SPBASE tmp freqs perceptron.scores \
        garescorer.scores
      make clean

      # Generate 90/10 split logs
      # keep the *-split*.logs in cwd so it's cacheable
      echo "[Generating 90/10 split ham]"
      perl tenpass/split-log-into-buckets-cached \
        9:gen-cache/ham-split9.log 1:gen-cache/ham-split1.log \
        "ORIG/ham-$NAME.log"
      ln -f gen-cache/ham-split9.log "$LOGDIR/NSBASE/ham.log"
      ln -f gen-cache/ham-split1.log "$LOGDIR/NSBASE/ham-test.log"

      echo "[Generating 90/10 split spam]"
      perl tenpass/split-log-into-buckets-cached \
        9:gen-cache/spam-split9.log 1:gen-cache/spam-split1.log \
        "ORIG/spam-$NAME.log"
      ln -f gen-cache/spam-split9.log "$LOGDIR/SPBASE/spam.log"
      ln -f gen-cache/spam-split1.log "$LOGDIR/SPBASE/spam-test.log"

      echo "[Setting up for gen run]"
      # Ok, setup for a run: expose the split logs in the cwd
      ln -s "$LOGDIR/SPBASE/spam.log" .
      ln -s "$LOGDIR/NSBASE/ham.log" .
      ln -s "$LOGDIR/SPBASE/spam-test.log" .
      ln -s "$LOGDIR/NSBASE/ham-test.log" .

      # try to find number of processors
      ostype=$(uname)
      case "$ostype" in
        FreeBSD)
          numcpus=$(/sbin/sysctl -n kern.smp.cpus)
          ;;
        SunOS)
          numcpus=$(/usr/sbin/psrinfo | wc -l)
          ;;
        *)
          numcpus=$(cpucount 2>/dev/null \
            || grep -E -c '^processor\b' /proc/cpuinfo 2>/dev/null \
            || echo 1)
          ;;
      esac
      # Reduce the result to the first word of the first line: wc may pad
      # its output, and "grep -c" prints 0 *and* fails when nothing
      # matched, in which case the fallback echo appends a second line.
      numcpus=$(printf '%s\n' "$numcpus" | awk 'NR == 1 { print $1 }')
      case "$numcpus" in
        '' | *[!0-9]*) numcpus=1 ;;
      esac
      if [ "$numcpus" -le 0 ]; then numcpus=1; fi

      echo "[Generating GA]"
      # Generate GA with full logs
      if ! make -j "$numcpus" SCORESET="$SCORESET" garescorer \
        > "$LOGDIR/make.output" 2>&1; then
        echo "make garescorer failed, see $LOGDIR/make.output" >&2
        exit 1
      fi
      cp freqs "$LOGDIR/freqs"

      echo "[config]"
      cat config
      echo "[gen run start]"
      pwd
      date
      # TODO: use -f $FPRATE instead of -b $HAM_PREFERENCE
      time ./garescorer -b "$HAM_PREFERENCE" -e "$EPOCHS" -t "$THRESHOLD" \
        || exit $?
      date

      # POST-GA COMMANDS:
      mv garescorer.scores "$LOGDIR/scores"
      echo "[gen run end]"

      # Start again from the original scores and rewrite them with the
      # scores produced by the run.
      cp orig_scores.cf tmprules/50_scores.cf
      perl ./rewrite-cf-with-new-scores --scoreset "$SCORESET" \
        --old-scores tmprules/50_scores.cf \
        --new-scores "$LOGDIR/scores" \
        --cffile tmprules \
        > tmprules/50_newscores.cf
      mv tmprules/50_newscores.cf tmprules/50_scores.cf
      cp tmprules/50_scores.cf "$LOGDIR/50_scores.cf"

      perl ./fp-fn-statistics --ham ham-test.log --spam spam-test.log \
        --scoreset "$SCORESET" --cffile=tmprules \
        --fnlog "$LOGDIR/false_negatives" --fplog "$LOGDIR/false_positives" \
        > "$LOGDIR/test"
      # END OF POST-GA COMMANDS
    )
    echo "$?" > "$status_file"
  } | tee "$LOGDIR/log"

  # Propagate a failure of the run instead of always reporting success.
  run_status=$(cat "$status_file" 2>/dev/null)
  rm -f "$status_file"
  case "$run_status" in
    '' | *[!0-9]*) run_status=1 ;; # no usable status recorded: treat as failure
  esac
  if [ "$run_status" -ne 0 ]; then
    exit "$run_status"
  fi
else
  # -------------------------------------------------------------------------
  # Statistics generation, once everyone likes the scores

  # use the logs we saved
  fulllogh="$LOGDIR/NSBASE/ham.log"
  fulllogs="$LOGDIR/SPBASE/spam.log"
  testlogh="$LOGDIR/NSBASE/ham-test.log"
  testlogs="$LOGDIR/SPBASE/spam-test.log"

  if [ ! -f "$testlogh" ] || [ ! -f "$testlogs" ]; then
    echo "Couldn't find logs for $NAME: $testlogh $testlogs" >&2
    exit 1
  fi

  # Re-point the log symlinks in the cwd at the saved logs.
  rm -f ham-test.log spam-test.log
  ln -s "$testlogh" ham-test.log
  ln -s "$testlogs" spam-test.log

  rm -f ham.log spam.log
  ln -s "$fulllogh" ham.log
  ln -s "$fulllogs" spam.log

  # NOTE(review): unlike the rescoring branch, which works on the tmprules
  # copy, this removes the sandbox file from ../rules itself.
  # -f: a rules tree without a sandbox file is not an error.
  if [ "$KILL_SANDBOX_RULES" = y ]; then
    rm -f ../rules/70_sandbox.cf
  fi

  # This needs to have ../rules/50_scores.cf in place first ...
  echo "[gen test results for set $SCORESET]"
  perl ./fp-fn-statistics --ham "$testlogh" --spam "$testlogs" \
    --scoreset "$SCORESET" --cffile=../rules | tee "$LOGDIR/test"

  echo "[STATISTICS file generation for set $SCORESET]"
  bash ./mk-baseline-results "$SCORESET" | tee "$LOGDIR/statistics"
  cp "$LOGDIR/statistics" "../rules/STATISTICS-set${SCORESET}.txt"
  ls -l "../rules/STATISTICS-set${SCORESET}.txt"
fi

exit 0