blob: 82bb33222a442a37520a55e4641a29ad5e6f1282 [file] [log] [blame]
#!/bin/sh
#
# Driver for a ten-pass score-generation / validation run.
#
# Must be started from the directory that holds `config`, the ORIG/ log
# directory and the helper programs, because every path below is relative.
#
# `config` is expected to set SCORESET; judging by how they are used below it
# presumably also provides HAM_PREFERENCE, THRESHOLD and EPOCHS, and may set
# NOTE and LEARN_RATE -- confirm against the config file itself.
#
# Source it with an explicit "./": a bare `. config` is looked up via $PATH
# by POSIX shells and is not guaranteed to find a file in the current
# directory.
. ./config

# Learning rate handed to the perceptron; defaults to 2.0 when not configured.
LEARN_RATE="${LEARN_RATE:-2.0}"
RUNS=10
# One entry per cross-validation pass (and per bucket the logs are split into).
PASSES="1 2 3 4 5 6 7 8 9 10"

NAME="set$SCORESET"
# Log directory name encodes the settings of this run.
LOGDIR="vm-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
CACHEDIR="vm-cache/$NAME"

# An optional NOTE is appended so differently annotated runs stay separate.
if [ -n "$NOTE" ]; then
  LOGDIR="$LOGDIR-$NOTE"
fi

# mkdir -p is a no-op when the cache directory already exists.
mkdir -p "$CACHEDIR"

# Both the ham and the spam log for this scoreset are required.
if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
  echo "Couldn't find logs for $NAME" >&2
  exit 1
fi

echo "[Doing a scoreset $SCORESET score-generation run]"

# clear out the old logs
rm -rf "$LOGDIR"

# Create a directory to organize the logs with this group of settings
mkdir "$LOGDIR"

# Record the configuration used at the top of the run log (and echo it).
(
  echo "[config]"
  cat config
) | tee -a "$LOGDIR/log"
# Number of parallel make jobs. Worked out once, before the loop, so it is
# also set on passes whose split logs are taken from the cache; an empty value
# would turn `make -j $numcpus` into an unlimited `make -j`.
numcpus=$(cpucount 2>/dev/null) ||
  numcpus=$(grep -Ec '^processor([^[:alnum:]_]|$)' /proc/cpuinfo 2>/dev/null) ||
  numcpus=1
# Fall back to a single job if the probe printed anything but a positive count.
case $numcpus in
  '' | *[!0-9]* | 0) numcpus=1 ;;
esac

# split_corpus DIR KIND
#
# Inside DIR, split ../ORIG/KIND-$NAME.log into 10 buckets, concatenate every
# bucket except number $PASS into KIND.log, and keep bucket $PASS as
# KIND-validate.log.
#
# Globals read: NAME, PASS, PASSES.
# Returns non-zero if DIR cannot be entered. Runs in a subshell so the
# caller's working directory is never changed.
split_corpus() {
  (
    cd "$1" || exit 1
    perl ../tenpass/split-log-into-buckets 10 < "../ORIG/$2-$NAME.log" > /dev/null
    for p in $PASSES; do
      if [ "$p" != "$PASS" ]; then
        cat "split-$p.log" >> "$2.log"
      else
        mv "split-$p.log" "$2-validate.log"
      fi
    done
    rm -f split-*.log
  )
}

# One iteration per cross-validation pass: bucket $PASS is held out for
# validation, the remaining buckets are used for training.
for PASS in $PASSES; do
  # Clean out old runs
  echo "[Cleaning up for pass $PASS]"
  rm -rf spam-validate.log ham-validate.log spam.log ham.log \
    NSBASE SPBASE tmp freqs perceptron.scores
  make clean >/dev/null

  # revert to the previous scoring
  svn revert ../rules/50_scores.cf

  if [ ! -d "$CACHEDIR/$PASS" ]; then
    # Generate 90/10 split logs
    mkdir NSBASE SPBASE

    echo "[Generating 90/10 split ham]"
    split_corpus NSBASE ham || exit 1

    echo "[Generating 90/10 split spam]"
    split_corpus SPBASE spam || exit 1

    echo "[Setting up for pass $PASS]"
  else
    # Reuse the directories saved by an earlier run of this pass.
    echo "[Retrieving from $CACHEDIR/$PASS]"
    ln -s "$CACHEDIR/$PASS/SPBASE" .
    ln -s "$CACHEDIR/$PASS/NSBASE" .
    ln -s "$CACHEDIR/$PASS/tmp" .
    ln -s "$CACHEDIR/$PASS/freqs" .
  fi

  # Ok, setup for a run: expose the training and validation logs in the
  # working directory (same links whether freshly generated or cached).
  ln -s SPBASE/spam.log .
  ln -s NSBASE/ham.log .
  ln -s SPBASE/spam-validate.log .
  ln -s NSBASE/ham-validate.log .

  echo "[Generating perceptron]"
  # Generate perceptron with full logs; make.output is overwritten each pass.
  make -j "$numcpus" SCORESET="$SCORESET" > "$LOGDIR/make.output" 2>&1

  # Run the perceptron, appending everything it prints to the run log.
  (
    echo "[pass $PASS start]"
    pwd
    date
    ./perceptron -p "$HAM_PREFERENCE" -t "$THRESHOLD" -e "$EPOCHS" -l "$LEARN_RATE"
    mv perceptron.scores "$LOGDIR/scores.$PASS"
    echo "[pass $PASS end]"
  ) | tee -a "$LOGDIR/log"

  # Rewrite the rules file with the new scores. The rewriter reads the file it
  # is replacing, so go through a temporary file; mktemp avoids a predictable
  # name in a shared directory. The result is copied with `cat` so the rules
  # file does not inherit mktemp's restrictive permissions.
  scores_tmp=$(mktemp "${TMPDIR:-/tmp}/runGA.XXXXXX") || exit 1
  perl ./rewrite-cf-with-new-scores "$SCORESET" ../rules/50_scores.cf \
    "$LOGDIR/scores.$PASS" > "$scores_tmp"
  cat "$scores_tmp" > ../rules/50_scores.cf
  rm -f "$scores_tmp"

  echo "[evaluating performance]" | tee -a "$LOGDIR/log"
  # Score the held-out logs; stderr of the statistics script is discarded.
  perl ./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log \
    --scoreset "$SCORESET" \
    --fnlog "$LOGDIR/false_negatives.$PASS" \
    --fplog "$LOGDIR/false_positives.$PASS" \
    > "$LOGDIR/validate.$PASS" 2> /dev/null

  # First time this pass has been run: move the generated directories into
  # the cache so later runs can link to them instead of regenerating.
  if [ ! -d "$CACHEDIR/$PASS" ]; then
    echo "[Saving object files in $CACHEDIR/$PASS for faster runs]"
    mkdir -p "$CACHEDIR/$PASS"
    mv tmp freqs SPBASE NSBASE "$CACHEDIR/$PASS"
  fi
done
# Feed every per-pass validate.N file to extract-results and store its output
# as $LOGDIR/validate, then run model-statistics on that combined file.
# $LOGDIR is quoted (the glob stays outside the quotes) so a NOTE containing
# whitespace cannot split the arguments or the redirect target.
perl ./extract-results "$LOGDIR"/validate.* > "$LOGDIR/validate"
perl ./model-statistics "$LOGDIR/validate"

# The script always reports success, whatever the commands above returned.
exit 0