#!/bin/sh
#
# Ten-pass cross-validation driver for perceptron score generation.
#
# For each pass listed in $PASSES the ham and spam logs under ORIG/ are split
# into ten buckets; bucket $PASS becomes the validation log and the other
# buckets are concatenated into the training log.  The perceptron is then
# built with make and run, its scores are written into ../rules/50_scores.cf,
# and fp-fn-statistics is run against the validation logs.  The per-pass
# results are combined at the end by extract-results and model-statistics.
#
# All paths are relative: run this from the directory that holds ./config,
# ORIG/ and tenpass/.
#
# Required settings (normally provided by ./config):
#   SCORESET, HAM_PREFERENCE, THRESHOLD, EPOCHS
# Optional settings:
#   LEARN_RATE  value passed to perceptron -l (default 2.0)
#   NOTE        suffix appended to the log directory name

# Print a message on stderr and abort the run.
die() {
  echo "$*" >&2
  exit 1
}

# split_logs DIR KIND
#   DIR   directory to work in (NSBASE or SPBASE)
#   KIND  "ham" or "spam"; selects ../ORIG/KIND-$NAME.log as input
# Splits the input log into ten buckets inside DIR, keeps bucket $PASS as
# KIND-validate.log and appends every other bucket to KIND.log.
# Runs in a subshell so the caller's working directory is untouched and a
# failed cd can never make the split (or the rm) run in the wrong directory.
split_logs() {
  (
    cd "$1" || exit 1
    perl ../tenpass/split-log-into-buckets 10 \
      < "../ORIG/$2-$NAME.log" > /dev/null
    for p in $PASSES; do
      if [ "$p" != "$PASS" ]; then
        cat "split-$p.log" >> "$2.log"
      else
        mv "split-$p.log" "$2-validate.log"
      fi
    done
    rm -f split-*.log
  )
}

# set SCORESET
# Use an explicit ./ path: POSIX "." looks up a slash-less name via PATH, so
# ". config" only finds the local file in shells that also fall back to the
# current directory.
[ -r ./config ] || die "Couldn't read ./config"
. ./config

# These values end up in the log directory name and in the perceptron command
# line; stop early rather than run ten passes with a missing argument.
: "${SCORESET:?must be set (normally in ./config)}"
: "${HAM_PREFERENCE:?must be set (normally in ./config)}"
: "${THRESHOLD:?must be set (normally in ./config)}"
: "${EPOCHS:?must be set (normally in ./config)}"

LEARN_RATE="${LEARN_RATE:-2.0}"

# Not referenced anywhere below; retained from the original script.
RUNS=10
PASSES="1 2 3 4 5 6 7 8 9 10"

NAME="set$SCORESET"
LOGDIR="vm-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS"
CACHEDIR="vm-cache/$NAME"
SCORES_CF="../rules/50_scores.cf"

if [ -n "$NOTE" ]; then
  LOGDIR="$LOGDIR-$NOTE"
fi

mkdir -p "$CACHEDIR" || die "Couldn't create $CACHEDIR"

if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
  die "Couldn't find logs for $NAME"
fi

# try to find number of processors
# Done once, before the loop, so that passes served from the cache also get a
# job count for make.  Each probe is tried only if the previous one failed.
numcpus=$(cpucount 2>/dev/null) \
  || numcpus=$(grep -Ec '^processor\b' /proc/cpuinfo 2>/dev/null) \
  || numcpus=$(getconf _NPROCESSORS_ONLN 2>/dev/null) \
  || numcpus=1
# Anything that is not a positive integer would derail "make -j".
case "$numcpus" in
  '' | *[!0-9]* | 0) numcpus=1 ;;
esac

echo "[Doing a scoreset $SCORESET score-generation run]"

# clear out the old logs
rm -rf "$LOGDIR"
# Create a directory to organize the logs with this group of settings
mkdir "$LOGDIR" || die "Couldn't create $LOGDIR"

{
  echo "[config]"
  cat ./config
} | tee -a "$LOGDIR/log"

for PASS in $PASSES; do
  pass_cache="$CACHEDIR/$PASS"

  # Clean out old runs
  echo "[Cleaning up for pass $PASS]"
  rm -rf spam-validate.log ham-validate.log spam.log ham.log \
    NSBASE SPBASE tmp freqs perceptron.scores
  make clean >/dev/null

  # revert to the previous scoring
  svn revert "$SCORES_CF"

  if [ -d "$pass_cache" ]; then
    cached=yes
    echo "[Retrieving from $CACHEDIR/$PASS]"
    ln -s "$pass_cache/SPBASE" .
    ln -s "$pass_cache/NSBASE" .
    ln -s "$pass_cache/tmp" .
    ln -s "$pass_cache/freqs" .
  else
    cached=no
    # Generate 90/10 split logs
    echo "[Generating 90/10 split ham]"
    mkdir NSBASE SPBASE || die "Couldn't create NSBASE and SPBASE"
    split_logs NSBASE ham || die "Couldn't split ham logs for pass $PASS"

    echo "[Generating 90/10 split spam]"
    split_logs SPBASE spam || die "Couldn't split spam logs for pass $PASS"

    echo "[Setting up for pass $PASS]"
  fi

  # Ok, setup for a run
  # Same four links whether the split was just generated or came from cache.
  ln -s SPBASE/spam.log .
  ln -s NSBASE/ham.log .
  ln -s SPBASE/spam-validate.log .
  ln -s NSBASE/ham-validate.log .

  echo "[Generating perceptron]"
  # Generate perceptron with full logs
  make -j "$numcpus" SCORESET="$SCORESET" > "$LOGDIR/make.output" 2>&1

  (
    echo "[pass $PASS start]"
    pwd
    date
    ./perceptron -p "$HAM_PREFERENCE" -t "$THRESHOLD" -e "$EPOCHS" \
      -l "$LEARN_RATE"
    mv perceptron.scores "$LOGDIR/scores.$PASS"
    echo "[pass $PASS end]"
  ) | tee -a "$LOGDIR/log"

  # A failed build or perceptron run leaves no scores file; everything after
  # this point would only produce misleading statistics.
  [ -f "$LOGDIR/scores.$PASS" ] \
    || die "Pass $PASS produced no scores; see $LOGDIR/make.output and $LOGDIR/log"

  # Write the rewritten scores next to the target instead of under /tmp: no
  # predictable name in a world-writable directory, and the rename stays on
  # one filesystem.  The target is replaced only if the rewrite succeeded.
  new_scores="$SCORES_CF.new.$$"
  if perl ./rewrite-cf-with-new-scores "$SCORESET" "$SCORES_CF" \
    "$LOGDIR/scores.$PASS" > "$new_scores"; then
    mv "$new_scores" "$SCORES_CF"
  else
    rm -f "$new_scores"
    die "rewrite-cf-with-new-scores failed for pass $PASS; $SCORES_CF left unchanged"
  fi

  echo "[evaluating performance]" | tee -a "$LOGDIR/log"
  perl ./fp-fn-statistics --ham ham-validate.log --spam spam-validate.log \
    --scoreset "$SCORESET" \
    --fnlog "$LOGDIR/false_negatives.$PASS" \
    --fplog "$LOGDIR/false_positives.$PASS" \
    > "$LOGDIR/validate.$PASS" 2> /dev/null

  if [ "$cached" = no ]; then
    echo "[Saving object files in $CACHEDIR/$PASS for faster runs]"
    mkdir -p "$pass_cache"
    mv tmp freqs SPBASE NSBASE "$pass_cache"
  fi

done

perl ./extract-results "$LOGDIR"/validate.* > "$LOGDIR/validate"

perl ./model-statistics "$LOGDIR/validate"

exit 0