#!/bin/sh
#
# Score-generation driver.
#
# Usage (run from the directory that holds ./config):
#   <this-script>            run the GA ("garescorer") over
#                            ORIG/ham-set$SCORESET.log and
#                            ORIG/spam-set$SCORESET.log
#   <this-script> <any-arg>  generate the test results and the STATISTICS
#                            file from the logs saved by an earlier GA run
#
# Uses SCORESET, HAM_PREFERENCE, THRESHOLD, EPOCHS and (optionally) NOTE.
# NOTE(review): these are presumably set by ./config; confirm, since the
# config file is not visible from here.
#
# Exit status: non-zero if the input logs are missing or if a checked step
# of the GA run fails; 0 otherwise.
#
# TODO: add FPRATE instead of HAM_PREFERENCE
# set SCORESET

# must use a / in the arg to a 'source' command to avoid searching the PATH
. ./config

# NOTE(review): LEARN_RATE is only given a default here; nothing else in this
# script reads or exports it. Confirm who consumes it.
LEARN_RATE="${LEARN_RATE:-2.0}"

NAME="set$SCORESET"
# TODO: add $FPRATE instead of HAM_PREFERENCE
LOGDIR="gen-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS-ga"

# ensure sandbox T_ rules aren't used in the GA and don't appear in output
KILL_SANDBOX_RULES=y

###########################################################################

mkdir -p gen-cache      # a cache, woo

if [ -n "$NOTE" ]; then
  LOGDIR="$LOGDIR-$NOTE"
fi

# -------------------------------------------------------------------------
# Print the number of CPUs to pass to "make -j".
# Always prints a single positive integer; falls back to 1 when nothing
# usable can be detected.  Call it via $(...) so its scratch variable stays
# out of the caller's environment.
detect_numcpus() {
  case "$(uname)" in
    FreeBSD)
      n=$(/sbin/sysctl -n kern.smp.cpus)
      ;;
    SunOS)
      n=$(/usr/sbin/psrinfo | wc -l)
      ;;
    *)
      # Try each probe in turn.  "grep -c" prints 0 and fails when nothing
      # matches, so its output is discarded in favour of the next probe.
      # The bracket expression is a portable stand-in for GNU's "\b".
      n=$(cpucount 2>/dev/null) ||
        n=$(grep -Ec '^processor([^[:alnum:]_]|$)' /proc/cpuinfo 2>/dev/null) ||
        n=$(getconf _NPROCESSORS_ONLN 2>/dev/null) ||
        n=1
      ;;
  esac

  # wc pads its count with blanks on some systems; drop all whitespace
  n=$(printf '%s' "$n" | tr -d '[:space:]')

  # anything that is not a plain positive integer becomes 1
  case "$n" in
    '' | *[!0-9]*) n=1 ;;
  esac
  [ "$n" -ge 1 ] || n=1

  echo "$n"
}

# -------------------------------------------------------------------------
# Initial rescoring: build the garescorer, run it, and write the new scores
# plus test statistics into $LOGDIR.
#
# Must be invoked in a subshell: it turns on xtrace and uses "exit" to abort.
# Expects $LOGDIR, $LOGDIR/NSBASE and $LOGDIR/SPBASE to exist already.
run_ga() {
  # trace commands (xtrace is written to stderr, so it shows up on the
  # terminal but is not captured by the stdout-only tee into $LOGDIR/log)
  set -x

  # This should be in here instead.  Prevents testing.
  # svn revert ../rules/50_scores.cf

  rm -rf tmprules
  cp -r ../rules tmprules

  cp tmprules/50_scores.cf orig_scores.cf

  # fix all scores to non-zero (avoid a possible bug, not quite sure)
  ./enable-all-evolved-rules < tmprules/50_scores.cf \
    > tmprules/50_scores.cf.new || exit 1
  mv tmprules/50_scores.cf.new tmprules/50_scores.cf

  if [ "$KILL_SANDBOX_RULES" = y ]; then
    rm -f tmprules/70_sandbox.cf
  fi

  echo "[Doing a scoreset $SCORESET score-generation run]"

  # Clean out old runs
  echo "[Cleaning up]"
  rm -rf spam-test.log ham-test.log spam.log ham.log \
    NSBASE SPBASE tmp freqs perceptron.scores \
    garescorer.scores
  make clean

  # Generate 90/10 split logs
  # keep the *-split*.logs in cwd so it's cacheable
  echo "[Generating 90/10 split ham]"
  perl tenpass/split-log-into-buckets-cached \
    9:gen-cache/ham-split9.log 1:gen-cache/ham-split1.log "ORIG/ham-$NAME.log"
  ln -f gen-cache/ham-split9.log "$LOGDIR/NSBASE/ham.log"
  ln -f gen-cache/ham-split1.log "$LOGDIR/NSBASE/ham-test.log"

  echo "[Generating 90/10 split spam]"
  perl tenpass/split-log-into-buckets-cached \
    9:gen-cache/spam-split9.log 1:gen-cache/spam-split1.log "ORIG/spam-$NAME.log"
  ln -f gen-cache/spam-split9.log "$LOGDIR/SPBASE/spam.log"
  ln -f gen-cache/spam-split1.log "$LOGDIR/SPBASE/spam-test.log"

  echo "[Setting up for gen run]"
  # Ok, setup for a run
  ln -s "$LOGDIR/SPBASE/spam.log" .
  ln -s "$LOGDIR/NSBASE/ham.log" .
  ln -s "$LOGDIR/SPBASE/spam-test.log" .
  ln -s "$LOGDIR/NSBASE/ham-test.log" .

  # try to find number of processors
  numcpus=$(detect_numcpus)

  echo "[Generating GA]"
  # Generate GA with full logs; stop here if the build fails rather than
  # going on to run a missing or stale binary
  if ! make -j "$numcpus" SCORESET="$SCORESET" garescorer \
    > "$LOGDIR/make.output" 2>&1; then
    echo "make garescorer failed, see $LOGDIR/make.output" >&2
    exit 1
  fi
  cp freqs "$LOGDIR/freqs"

  echo "[config]"
  cat config
  echo "[gen run start]"
  pwd
  date

  # TODO: use -f $FPRATE instead of -b $HAM_PREFERENCE
  time ./garescorer -b "$HAM_PREFERENCE" -e "$EPOCHS" -t "$THRESHOLD" || exit $?
  date

  # POST-GA COMMANDS:

  mv garescorer.scores "$LOGDIR/scores"
  echo "[gen run end]"

  cp orig_scores.cf tmprules/50_scores.cf
  # don't replace 50_scores.cf with the output of a failed rewrite
  perl ./rewrite-cf-with-new-scores --scoreset "$SCORESET" \
    --old-scores tmprules/50_scores.cf \
    --new-scores "$LOGDIR/scores" \
    --cffile tmprules \
    > tmprules/50_newscores.cf || exit 1

  mv tmprules/50_newscores.cf tmprules/50_scores.cf
  cp tmprules/50_scores.cf "$LOGDIR/50_scores.cf"

  # last command: its status becomes the status of the whole GA run
  perl ./fp-fn-statistics --ham ham-test.log --spam spam-test.log \
    --scoreset "$SCORESET" --cffile=tmprules \
    --fnlog "$LOGDIR/false_negatives" --fplog "$LOGDIR/false_positives" \
    > "$LOGDIR/test"

  # END OF POST-GA COMMANDS
}

# -------------------------------------------------------------------------
# Statistics generation, once everyone likes the scores.
# Reads the logs a previous GA run saved under $LOGDIR and writes
# $LOGDIR/test, $LOGDIR/statistics and ../rules/STATISTICS-set$SCORESET.txt.
gen_statistics() {
  # use the logs we saved
  fulllogh="$LOGDIR/NSBASE/ham.log"
  fulllogs="$LOGDIR/SPBASE/spam.log"
  testlogh="$LOGDIR/NSBASE/ham-test.log"
  testlogs="$LOGDIR/SPBASE/spam-test.log"

  if [ ! -f "$testlogh" ] || [ ! -f "$testlogs" ]; then
    echo "Couldn't find logs for $NAME: $testlogh $testlogs" >&2
    exit 1
  fi

  rm -f ham-test.log spam-test.log
  ln -s "$testlogh" ham-test.log
  ln -s "$testlogs" spam-test.log

  rm -f ham.log spam.log
  ln -s "$fulllogh" ham.log
  ln -s "$fulllogs" spam.log

  # Note: this removes the sandbox rules from the real ../rules tree, not
  # from a scratch copy.  -f keeps a second run quiet once it is gone.
  if [ "$KILL_SANDBOX_RULES" = y ]; then
    rm -f ../rules/70_sandbox.cf
  fi

  # This needs to have ../rules/50_scores.cf in place first ...
  echo "[gen test results for set $SCORESET]"
  perl ./fp-fn-statistics --ham "$testlogh" --spam "$testlogs" \
    --scoreset "$SCORESET" --cffile=../rules | tee "$LOGDIR/test"

  echo "[STATISTICS file generation for set $SCORESET]"
  bash ./mk-baseline-results "$SCORESET" | tee "$LOGDIR/statistics"

  cp "$LOGDIR/statistics" "../rules/STATISTICS-set${SCORESET}.txt"
  ls -l "../rules/STATISTICS-set${SCORESET}.txt"
}

###########################################################################

if [ -z "${1:-}" ]; then

  if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
    echo "Couldn't find logs for $NAME" >&2
    exit 1
  fi

  # Create a directory to organize the logs with this group of settings.
  # This happens before the pipeline below is started so that tee can always
  # open $LOGDIR/log, even on the very first run.
  mkdir -p "$LOGDIR" "$LOGDIR/NSBASE" "$LOGDIR/SPBASE"

  if ! [ -d "$LOGDIR" ]; then
    echo "Failed to mkdir $LOGDIR, dying" 1>&2
    exit 1
  fi

  # A pipeline reports the status of its last command (tee), and an "exit"
  # inside the logged subshell cannot end this script.  Record the GA run's
  # status in a file so a failure is not reported as success.
  status_file="$LOGDIR/ga-exit-status"
  rm -f "$status_file"

  {
    ( run_ga )      # log this
    echo "$?" > "$status_file"
  } | tee "$LOGDIR/log"

  ga_status=$(cat "$status_file" 2>/dev/null)
  rm -f "$status_file"

  case "$ga_status" in
    0) ;;
    '' | *[!0-9]*) exit 1 ;;     # status never recorded: treat as failure
    *) exit "$ga_status" ;;
  esac

else

  gen_statistics

fi

exit 0