blob: 4a2ad27771fb752842b41e9e07e5f81aec86bde7 [file] [log] [blame]
#!/bin/sh
# Load the run settings from ./config in the current directory.
# The rest of this script reads SCORESET, HAM_PREFERENCE, THRESHOLD and EPOCHS
# (and optionally NOTE and LEARN_RATE) without assigning them itself, so they
# are presumably defined by that file.
#
# The explicit "./" matters: for a name without a slash, the POSIX "." builtin
# searches $PATH, not the current directory, so a bare ". config" only works
# in shells that add a cwd fallback (bash outside POSIX mode).
if [ ! -r ./config ]; then
  echo "Couldn't read ./config" >&2
  exit 1
fi
. ./config
# Learning rate passed to the perceptron: keep a value supplied by the
# environment or config, otherwise default to 2.0.
LEARN_RATE=${LEARN_RATE:-2.0}

# Name of the scoreset; selects the ORIG/{ham,spam}-$NAME.log input logs.
NAME="set${SCORESET}"

# Per-run output directory, named after the tunables. A non-empty NOTE is
# appended as an extra "-NOTE" suffix.
LOGDIR="gen-${NAME}-${HAM_PREFERENCE}-${THRESHOLD}-${EPOCHS}-${LEARN_RATE}${NOTE:+-$NOTE}"

# Directory that caches the split logs between runs.
if [ ! -d gen-cache ]; then
  mkdir gen-cache
fi
# Both input logs for this scoreset must exist before anything is touched.
# Two separate tests joined with || replace the obsolescent "-o" operator,
# whose behaviour inside "[" is unspecified with this many arguments.
if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
  echo "Couldn't find logs for $NAME" >&2
  exit 1
fi
# Main dispatch:
#   no argument  -> full score-generation run (split logs, build and run the
#                   perceptron, install the new scores, measure FP/FN rates)
#   any argument -> only re-test the scores already in ../rules and produce
#                   the statistics file
if [ -z "${1:-}" ]; then
  # Start from the checked-in scores file.
  # Original author's note, kept verbatim: "This should be in here instead.
  # Prevents testing."  NOTE(review): intent unclear - confirm.
  svn revert ../rules/50_scores.cf

  # fix all scores to non-zero (avoid bug)
  # Each space/tab becomes a plain space, then every standalone " 0" field
  # becomes " 3". The " 0 " substitution runs twice because adjacent zero
  # fields share their separating space, so one /g pass skips every other one.
  perl -pi.bak \
    -e 's/[ \t]/ /gs; s/ 0$/ 3/; s/ 0 / 3 /g; s/ 0 / 3 /g' \
    ../rules/50_scores.cf

  echo "[Doing a scoreset $SCORESET score-generation run]"

  # Clean out old runs
  echo "[Cleaning up]"
  rm -rf spam-test.log ham-test.log spam.log ham.log \
    NSBASE SPBASE tmp make.output freqs perceptron.scores \
    "$LOGDIR"
  make clean >/dev/null

  # Create a directory to organize the logs with this group of settings
  mkdir "$LOGDIR"
  mkdir "$LOGDIR/NSBASE" "$LOGDIR/SPBASE"

  # Generate 90/10 split logs.
  # The *-split*.log files are written to gen-cache/ so they can be reused,
  # and are hard-linked into the run directory.
  echo "[Generating 90/10 split ham]"
  perl tenpass/split-log-into-buckets-cached \
    9:gen-cache/ham-split9.log 1:gen-cache/ham-split1.log "ORIG/ham-$NAME.log"
  ln gen-cache/ham-split9.log "$LOGDIR/NSBASE/ham.log"
  ln gen-cache/ham-split1.log "$LOGDIR/NSBASE/ham-test.log"

  echo "[Generating 90/10 split spam]"
  perl tenpass/split-log-into-buckets-cached \
    9:gen-cache/spam-split9.log 1:gen-cache/spam-split1.log "ORIG/spam-$NAME.log"
  ln gen-cache/spam-split9.log "$LOGDIR/SPBASE/spam.log"
  ln gen-cache/spam-split1.log "$LOGDIR/SPBASE/spam-test.log"

  echo "[Setting up for gen run]"
  # Ok, setup for a run: symlink the four logs into the working directory
  ln -s "$LOGDIR/SPBASE/spam.log" .
  ln -s "$LOGDIR/NSBASE/ham.log" .
  ln -s "$LOGDIR/SPBASE/spam-test.log" .
  ln -s "$LOGDIR/NSBASE/ham-test.log" .

  # try to find number of processors
  # Each probe is its own assignment so that a failing probe's output is
  # discarded. In a single command substitution, "grep -c" finding nothing
  # prints 0 AND fails, so the fallback's "1" would be appended to it.
  numcpus=$(cpucount 2>/dev/null) ||
    numcpus=$(grep -Ec '^processor\b' /proc/cpuinfo 2>/dev/null) ||
    numcpus=1
  # make -j needs a positive integer; fall back to 1 for anything else.
  case "$numcpus" in
    '' | *[!0-9]* | 0) numcpus=1 ;;
  esac

  echo "[Generating perceptron]"
  # Generate perceptron with full logs
  make -j "$numcpus" "SCORESET=$SCORESET" > "$LOGDIR/make.output" 2>&1
  cp freqs "$LOGDIR/freqs"

  # Run the perceptron; everything printed here is also saved to $LOGDIR/log.
  (
    echo "[config]"
    cat config
    echo "[gen run start]"
    pwd
    date
    ./perceptron -p "$HAM_PREFERENCE" -t "$THRESHOLD" -e "$EPOCHS" -l "$LEARN_RATE"
    mv perceptron.scores "$LOGDIR/scores"
    echo "[gen run end]"
  ) | tee "$LOGDIR/log"

  # Install the generated scores into a pristine copy of the scores file.
  # The rewritten file is produced inside the freshly created run directory
  # (which keeps a copy anyway) instead of under a predictable /tmp name, and
  # is only installed if the rewrite succeeded.
  svn revert ../rules/50_scores.cf
  if ./rewrite-cf-with-new-scores "$SCORESET" ../rules/50_scores.cf \
      "$LOGDIR/scores" > "$LOGDIR/50_scores.cf"; then
    cp "$LOGDIR/50_scores.cf" ../rules/50_scores.cf
  else
    echo "rewrite-cf-with-new-scores failed; ../rules/50_scores.cf left as reverted" >&2
    rm -f "$LOGDIR/50_scores.cf"
    exit 1
  fi

  ./fp-fn-statistics --ham ham-test.log --spam spam-test.log \
    --scoreset "$SCORESET" \
    --fnlog "$LOGDIR/false_negatives" \
    --fplog "$LOGDIR/false_positives" > "$LOGDIR/test"
else
  # This needs to have 50_scores.cf in place first ...
  echo "[gen test results]"
  ./fp-fn-statistics --spam=spam-test.log \
    --ham=ham-test.log \
    --cffile=../rules "--scoreset=$SCORESET" | tee "$LOGDIR/test"

  echo "[STATISTICS file generation]"
  ./mk-baseline-results "$SCORESET" | tee "$LOGDIR/statistics"
fi
exit 0