#!/bin/bash
# generate-new-scores - generate scores for rules promoted after initial
# release mass-check scoring run
#
# usage: generate-new-scores (0|1|2|3)
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# Score set to generate scores for, taken from the first command-line
# argument (the usage line above lists 0, 1, 2 and 3).
SCORESET=$1
# Directory holding the mass-check logs when this script runs on the host
# that stores the corpus; its presence selects the local rsync path later on.
CORPUS_SRC_DIR="/export/home/corpus-rsync/corpus"
# Refuse to run without a score set.  Exit with a non-zero status so that a
# calling script can tell this usage error apart from a successful run.
if [ -z "$SCORESET" ]; then
  echo "Missing scoreset number parameter"
  exit 2
fi
# Fetch the current nightly mass-check logs into ./corpus, creating the
# directory on first use.
[ -e corpus ] || {
  echo "[ creating corpus directory ]"
  mkdir corpus || exit $?
}
date
# When the corpus source directory exists on this machine, copy from it
# directly; otherwise pull the logs from the rsync server.  Logs matching
# *am-rescore-* are skipped either way.
if [ -e "$CORPUS_SRC_DIR" ]; then
  echo "[ rsyncing logs locally ]"
  rsync -artv --delete --exclude="*am-rescore-*" \
    "$CORPUS_SRC_DIR"/*.log corpus/. \
    || exit $?
else
  echo "[ rsyncing logs remotely ]"
  # The RSYNC-CREDS file is expected to define the credentials, e.g.:
  #   RSYNC_USERNAME="username"
  #   RSYNC_PASSWORD="password"
  . RSYNC-CREDS
  # rsync picks the password up from the environment.
  export RSYNC_PASSWORD
  rsync -artvz --delete --exclude="*am-rescore-*" \
    "$RSYNC_USERNAME"@rsync.spamassassin.org::corpus/*.log corpus/. \
    || exit $?
fi
date
echo "[ selecting log files to use for scoreset $SCORESET ]"

# link_usable_logs GLOB [EXCLUDE...]
#
# Hard-link every regular file below corpus/ whose name matches GLOB into
# corpus/usable-corpus-set$SCORESET, skipping any file whose path contains one
# of the EXCLUDE substrings.  Aborts the script with ln's status if a link
# cannot be created.
#
# Globals:   SCORESET (read)
# Arguments: $1 - find(1) -name pattern; $2... - substrings to exclude
# Outputs:   one "Linked ..." line per file on stdout
link_usable_logs() {
  local glob=$1 path rel skip
  shift
  # Splitting find's output on whitespace is intentional here: it avoids
  # find/bash features that may be missing on older hosts.
  for path in $(find corpus -type f -name "$glob"); do
    for skip in "$@"; do
      case "$path" in
        *"$skip"*) continue 2 ;;
      esac
    done
    # Strip the leading "corpus/" component.
    rel=${path#*/}
    ln "corpus/$rel" "corpus/usable-corpus-set${SCORESET}/$rel" || exit $?
    echo "Linked $rel to corpus/usable-corpus-set${SCORESET}/$rel"
  done
}

# Validate the score set BEFORE touching the filesystem: the value is used to
# build the path handed to "rm -rf" below, and an unknown score set must end
# the run with a failure status rather than 0.
case "$SCORESET" in
  0|1|2|3) ;;
  *)
    echo "Unknown score set: $SCORESET"
    exit 2
    ;;
esac

# Start from an empty selection directory.  The usable corpus consists of all
# available logs for the wanted score set; a later step narrows them down to
# the most recent revision found among them.
rm -rf "corpus/usable-corpus-set$SCORESET"
mkdir "corpus/usable-corpus-set$SCORESET" || exit $?

# Log file names encode which mass-check variant produced them:
#   set 3: *am-bayes-net-*
#   set 2: *am-bayes-*  (without "net-")
#   set 1: *am-net-*
#   set 0: *am-*        (without "net-" or "bayes-")
case "$SCORESET" in
  3) link_usable_logs "*am-bayes-net-*" ;;
  2) link_usable_logs "*am-bayes-*" "net-" ;;
  1) link_usable_logs "*am-net-*" ;;
  0) link_usable_logs "*am-*" "net-" "bayes-" ;;
esac
# Drop every *cthielen.log from the selection: the ham submitted there appears
# to contain a large amount of spam.
rm -f corpus/usable-corpus-set${SCORESET}/*cthielen.log
# Scan the leading lines of every selected log for its "SVN revision" line and
# keep the numerically highest revision found.
REVISION=$(head corpus/usable-corpus-set${SCORESET}/* \
  | grep "SVN revision" \
  | cut -d" " -f4 \
  | sort -rn \
  | head -1)
if [ -z "$REVISION" ]; then
  echo "No logs for scoreset"
  exit 1
fi
# DEBUG
#echo "test"
#exit 1
# Remove every log that was not generated at that revision.  grep's matching
# line is left on stdout so the run log shows what was accepted.
for FILE in $(find corpus/usable-corpus-set$SCORESET -type f); do
  echo "Checking $FILE for SVN $REVISION..."
  if ! head "$FILE" | grep "SVN revision: $REVISION"; then
    rm "$FILE"
    echo "$FILE does not meet the requirements"
  fi
done
# Print how many log files of the given class (ham or spam) remain in the
# selection directory; each log file counts as one contributor.
count_contributor_logs() {
  ls -l "corpus/usable-corpus-set$SCORESET/$1"-*.log | wc -l | tr -d ' \t'
}

# Abort unless at least 10 contributors remain for both ham and spam.
HAMCONTRIBS=$(count_contributor_logs ham)
SPAMCONTRIBS=$(count_contributor_logs spam)
echo " HAM CONTRIBUTORS FOUND: $HAMCONTRIBS (required 10)"
echo "SPAM CONTRIBUTORS FOUND: $SPAMCONTRIBS (required 10)"
if [ "$HAMCONTRIBS" -lt 10 ]; then
  echo "Insufficient ham corpus contributors; aborting."
  exit 6
fi
if [ "$SPAMCONTRIBS" -lt 10 ]; then
  echo "Insufficient spam corpus contributors; aborting."
  exit 7
fi
date
echo "[ checking out code from svn repository ]"
# Start the generated scores file with a header naming the score set, the
# revision and the logs that go into this run.
echo "# Using score set $SCORESET logs for revision $REVISION from:" > "scores-set$SCORESET"
# The ls output is deliberately left unquoted so that all log names are
# joined onto a single comment line.
echo "#" $(ls "corpus/usable-corpus-set$SCORESET") >> "scores-set$SCORESET"
echo >> "scores-set$SCORESET"
# Prepare a fresh checkout: trunk at the logs' revision, plus the 3.3.0
# release rules (rules-base) and the current trunk rules (rules-current)
# alongside it.
rm -rf "trunk-new-rules-set$SCORESET"
svn co -r "$REVISION" http://svn.apache.org/repos/asf/spamassassin/trunk "trunk-new-rules-set$SCORESET" || exit $?
svn co http://svn.apache.org/repos/asf/spamassassin/tags/spamassassin_release_3_3_0/rules "trunk-new-rules-set$SCORESET/rules-base" || exit $?
svn co http://svn.apache.org/repos/asf/spamassassin/trunk/rules "trunk-new-rules-set$SCORESET/rules-current" || exit $?
svn up -r "$REVISION" "trunk-new-rules-set${SCORESET}/rulesrc/" || exit $?
# Use the newest masses/ directory so that bugs in the masses/ tooling can be
# fixed without leaving the net-enabled scores broken all week.
# NOTE(review): a failure here is not treated as fatal (as in the original);
# the run then continues with masses/ at $REVISION -- confirm this is intended.
svn up "trunk-new-rules-set$SCORESET/masses/"
# The Makefile has to be patched to get it to mangle some data for us.  The
# subshell keeps the working directory unchanged and makes a failed cd abort
# the run instead of letting patch execute in the wrong place.
(
  cd "trunk-new-rules-set${SCORESET}/masses" &&
    patch < ../../masses-Makefile.patch
) || exit $?
# Copy the support scripts into masses/ of this score set's checkout; this
# keeps all the score generation scripts in their own directory and avoids
# passing the checkout path to each of them.
# NOTE: lock-scores now uses existing scores (even commented out) in
# 72_active.cf as absolute maximum values to be inserted in tmp/ranges.data
# A failed copy aborts here rather than surfacing only after the GA run.
for SCRIPT in lock-scores extract-new-scores add-hitless-active-to-freqs; do
  cp "$SCRIPT" "trunk-new-rules-set$SCORESET/masses/$SCRIPT" || exit $?
done
date
echo "[ generating active ruleset via make ]"
# Everything from here on runs inside the checkout; stop if it is missing so
# that perl/make never run in the wrong directory.
cd "trunk-new-rules-set$SCORESET" || exit $?
perl Makefile.PL < /dev/null || exit $?
make || exit $?
# Strip the score lines from the new rules so that the rescorer can set them.
grep -v '^score' rules/72_active.cf > rules/72_active.cf-scoreless
mv -f rules/72_active.cf-scoreless rules/72_active.cf
date
echo "[ running log-grep-recent ]"
# Only use recent spam to generate scores, but a lot of ham history to avoid
# FPs.  Per the original note, the ham window was increased to 84 months on
# 8/8/2012 to try and get a masscheck out the door.
masses/log-grep-recent -m 84 "../corpus/usable-corpus-set$SCORESET"/ham-*.log > masses/ham-full.log
masses/log-grep-recent -m 2 "../corpus/usable-corpus-set$SCORESET"/spam-*.log > masses/spam-full.log
# Make sure there are enough mass-check results to generate reasonable scores.
# NOTE: currently only a minimum number of messages (log lines) is checked.
HAMCOUNT=$(wc -l < masses/ham-full.log | tr -d ' \t')
SPAMCOUNT=$(wc -l < masses/spam-full.log | tr -d ' \t')
# Treat an unreadable log as zero messages; an empty value would make the
# numeric tests below error out and silently skip the abort.
HAMCOUNT=${HAMCOUNT:-0}
SPAMCOUNT=${SPAMCOUNT:-0}
echo " HAM: $HAMCOUNT (150000 required)"
echo "SPAM: $SPAMCOUNT (150000 required)"
if [ "$HAMCOUNT" -lt 150000 ]; then
  echo "Insufficient ham corpus to generate scores; aborting."
  exit 8
fi
if [ "$SPAMCOUNT" -lt 150000 ]; then
  echo "Insufficient spam corpus to generate scores; aborting."
  exit 9
fi
# Activate the configuration of the chosen score set and load it into this
# shell; LOGDIR below is built from HAM_PREFERENCE, THRESHOLD and EPOCHS,
# which are expected to come from that file.
cp "masses/config.set$SCORESET" masses/config || exit $?
. masses/config
NAME="set$SCORESET"
LOGDIR="gen-$NAME-$HAM_PREFERENCE-$THRESHOLD-$EPOCHS-ga"
date
echo "[ running make freqs ]"
# Work inside masses/ from here on.  Abort if the directory is missing:
# otherwise "make clean" and the rm -rf below would run in the checkout root.
cd masses || exit $?
make clean || exit $?
# Point ham.log/spam.log at the full logs for the freqs and build steps.
rm -rf ORIG NSBASE SPBASE ham-validate.log spam-validate.log ham.log spam.log
ln -s ham-full.log ham.log
ln -s spam-full.log spam.log
make freqs SCORESET=$SCORESET || exit $?
cp freqs freqs.full # probably not needed for anything
make > make.out 2>&1 || exit $?
# Replace the temporary links with an ORIG/ directory: a hard link to each
# full log plus one per-score-set symlink (set0..set3) pointing at it.
# NOTE(review): presumably this is the layout runGA expects -- confirm.
rm -rf ORIG NSBASE SPBASE ham-validate.log spam-validate.log ham.log spam.log
mkdir ORIG
for CLASS in ham spam; do
  ln "$CLASS-full.log" "ORIG/$CLASS.log"
  for I in 0 1 2 3; do
    ln -s "$CLASS.log" "ORIG/$CLASS-set$I.log"
  done
done
date
echo "[ starting runGA ]"
# Produce the new scores; nothing below is useful if this step fails.
./runGA || exit $?
date
echo "[ generating fp-fn-statistics ]"
# Measure the base rules (../rules-base) against the test logs and against the
# full logs, so the new rules and scores have something to be compared with.
# NOTE(review): both runs are given the same --fnlog/--fplog paths, so the
# second run presumably overwrites the first one's logs -- confirm intended.
./fp-fn-statistics \
  --ham ham-test.log --spam spam-test.log \
  --scoreset "$SCORESET" \
  --cffile=../rules-base \
  --fnlog "$LOGDIR/false_negatives_original" \
  --fplog "$LOGDIR/false_positives_original" \
  > "$LOGDIR/stats-set$SCORESET-original-test"
./fp-fn-statistics \
  --ham ham.log --spam spam.log \
  --scoreset "$SCORESET" \
  --cffile=../rules-base \
  --fnlog "$LOGDIR/false_negatives_original" \
  --fplog "$LOGDIR/false_positives_original" \
  > "$LOGDIR/stats-set$SCORESET-original-full"
date
echo "[ extracting new scores ]"
# Append the freshly generated scores to the scores file two levels up.
./extract-new-scores
cat "$LOGDIR/scores-new" >> "../../scores-set$SCORESET"
# New active.list rules that didn't hit enough get zeroed.  Add those zero
# scores explicitly, otherwise SA will assign 1.0 defaults (or use whatever
# was in the sandbox).
if [ -s scores-active-zeroed ]; then
  {
    echo "# in active.list but have no hits in recent corpus"
    cat scores-active-zeroed
  } >> "../../scores-set$SCORESET"
fi
# Back to the directory the script was started from; show the final scores.
cd ../..
cat "scores-set$SCORESET"
# Assemble the statistics summary: results with the new rules and scores
# first, followed by the base-rules statistics gathered above.
RESULT_DIR="trunk-new-rules-set$SCORESET/masses/$LOGDIR"
{
  echo "##### WITH NEW RULES AND SCORES #####"
  head -10 "$RESULT_DIR/scores"
  cat "$RESULT_DIR/test"
  echo
  echo "##### WITHOUT NEW RULES AND SCORES #####"
  cat "$RESULT_DIR/stats-set$SCORESET-original-full"
  cat "$RESULT_DIR/stats-set$SCORESET-original-test"
} > "stats-set$SCORESET"
date
echo "[ completed ]"