#!/bin/sh
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# Written by Theo Van Dinter <felicity@apache.org>
# Please feel free to mail with any questions. :)
# This is a small script used to interact with run-masses to do a full
# corpus mass-check run, including the rsync to the SA server.
#
# NOTE: you MUST set RSYNC_USER and RSYNC_PASSWORD outside of this script so
# that your results can be sent up for review.
#
# By default, it'll do a nightly (set0) run. If you want to do a weekly
# (set1) run, add "--net" to the commandline.
#
# Defaults for mass-check parameters:
#   no commandline parameters:
#     - uses DEF_AFTER for the --after option
#     - adds "-n"
#     - uses nightly-versions.txt for updates
#   --net:
#     - switches to NET_AFTER for the --after option
#     - adds "-j 4", "--net", "--reuse"
#     - uses weekly-versions.txt for updates
#
# i.e. "run-corpora" equates to:       "mass-check -n"
#      "run-corpora --net" equates to: "mass-check -n -j 4 --net --reuse"
# Start from a fixed search path; /sw/bin is appended only when that
# directory exists on this machine.
PATH="/bin:/usr/bin:/usr/local/bin"
test -d /sw/bin && PATH="${PATH}:/sw/bin"
export PATH
# ---- Locations --------------------------------------------------------
# CORPUS: the directory holding your mail corpus and these scripts.
CORPUS="${HOME}/SA/corpus"
# SA_VER: the directory that gets updated and is then used for the
# nightly/weekly run.
SA_VER="${HOME}/SA/spamassassin-corpora"
# OUTDIR: the directory used for temporary log files during the run.
# Leave it empty if unsure; "SA_VER/masses" is used in that case.
OUTDIR=""
# FINALDIR: optional. When set, the files are moved there once processing
# has completed.
FINALDIR=""
# ---- Programs ---------------------------------------------------------
# Bare names are enough when PATH already finds them; otherwise give a
# full path.
SVN="svn"
SVNVERS="svnversion"
WGET="wget"
RSYNC="rsync"
# ---- --after values ---------------------------------------------------
# DEF_AFTER feeds --after on a set0 run, NET_AFTER on a set1 run.
# They are kept in variables and handled specially because shell
# parameter parsing makes these values not work otherwise.
DEF_AFTER='-120 days'
NET_AFTER='-60 days'
# Pre-flight checks. The script exits with status 2 when the rsync
# credentials are missing, the working directory cannot be entered, or log
# files from an earlier run are still present.
if [ -z "$RSYNC_USER" ] || [ -z "$RSYNC_PASSWORD" ]; then
  echo "You need to specify RSYNC_USER and RSYNC_PASSWORD via the environment!" >&2
  exit 2
fi
# An empty OUTDIR means "use the masses directory inside SA_VER".
if [ -z "$OUTDIR" ]; then
  OUTDIR="$SA_VER/masses"
fi
# The rest of the script refers to ham.log, spam.log, etc. by relative
# name, so a failed cd must stop the run rather than let it continue in
# whatever directory the script was started from.
if ! cd -- "$OUTDIR"; then
  echo "Couldn't change directory to $OUTDIR, aborting!" >&2
  exit 2
fi
if [ -f ham.log ] || [ -f spam.log ]; then
  echo "A previous run still has log files, exiting." >&2
  exit 2
fi
# Parse the command line and derive the settings for this run:
#   NET      - 1 when "--net" was given, otherwise 0
#   OPTS     - space-separated option string for the mass-check run
#   VERS     - "nightly" or "weekly"
#   FILENAME - RSYNC_USER, prefixed with "net-" for a --net run
#   AFTER    - DEF_AFTER for a default run, NET_AFTER for a --net run
NET=0
OPTS="-n"
VERS=nightly
FILENAME=$RSYNC_USER
# Files created from here on are never world-writable.
umask 002
# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty argument does not end the scan before a later "--net" is seen.
# Arguments other than "--net" are ignored.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --net) NET=1 ;;
  esac
  shift
done
if [ "$NET" -eq 1 ]; then
  FILENAME="net-$FILENAME"
  OPTS="$OPTS --net --reuse"
  AFTER="$NET_AFTER"
  VERS=weekly
  # We want to do this with more parallelization...
  OPTS="$OPTS -j 4"
else
  AFTER="$DEF_AFTER"
fi
# Verify appropriate version to run with, and bring $SA_VER up to it.
#   NREV - expected revision: second field of the last line of
#          $VERS-versions.txt, which is downloaded into the current directory
#   CREV - revision present locally: the "Revision:" line of
#          $SA_VER/masses/svninfo.tmp
# Every failure in this section exits with status 2.
# $WGET and $RSYNC are expanded unquoted, as before, so a value that carries
# extra options keeps working.
echo "[Updating $SA_VER]"
COUNT=0
while ! $WGET -q -nd -m "http://rsync.spamassassin.org/$VERS-versions.txt"; do
  COUNT=$((COUNT + 1))
  # Give up after the sixth failed attempt; only sleep when another attempt
  # will actually follow.
  if [ "$COUNT" -gt 5 ]; then
    echo "Couldn't get the $VERS revision version, aborting!" >&2
    exit 2
  fi
  sleep 60
done
NREV=$(tail -n 1 "$VERS-versions.txt" | awk '{print $2}')
# An empty or non-numeric value makes "[ ... -eq ... ]" fail with a shell
# error, which "if" treats as false -- the revision checks below would then
# be skipped silently. Validate explicitly instead.
case "$NREV" in
  '' | *[!0-9]*)
    echo "Couldn't read a revision number from $VERS-versions.txt, aborting!" >&2
    exit 2
    ;;
esac
if [ -f "$SA_VER/masses/svninfo.tmp" ]; then
  CREV=$(awk '/^Revision:/ { print $2; exit }' "$SA_VER/masses/svninfo.tmp")
  # An unreadable local revision is not fatal at this point: the file is read
  # again, and checked strictly, after the rsync below.
  case "$CREV" in
    '' | *[!0-9]*) ;;
    *)
      if [ "$CREV" -eq "$NREV" ]; then
        echo "Looks like a problem with the $VERS-versions.txt update, same rev ($CREV)" >&2
        exit 2
      fi
      ;;
  esac
fi
if ! $RSYNC -uaqrC --exclude masses/spamassassin/ --delete "rsync.spamassassin.org::tagged_builds/${VERS}_mass_check/" "$SA_VER"; then
  echo "Couldn't rsync update the $VERS spamassassin corpora code" >&2
  exit 2
fi
CREV=$(awk '/^Revision:/ { print $2; exit }' "$SA_VER/masses/svninfo.tmp")
case "$CREV" in
  '' | *[!0-9]*)
    echo "Couldn't read a revision number from $SA_VER/masses/svninfo.tmp, aborting!" >&2
    exit 2
    ;;
esac
if [ "$CREV" -ne "$NREV" ]; then
  echo "Looks like a problem with the rsync area, found rev $CREV but expected rev $NREV" >&2
  exit 2
fi
# Do the run. $AFTER is passed as a single quoted argument because its value
# contains a space and begins with "-" (e.g. "-120 days").
# $OPTS is deliberately left unquoted: it is a space-separated option list
# that must be split into separate arguments.
# Paths are quoted so directories containing whitespace work.
echo "[Running mass-check '$OPTS' in $CORPUS]"
if [ -z "$AFTER" ]; then
  "$CORPUS/run-masses" "$SA_VER" $OPTS > /dev/null
else
  echo "[Using '$AFTER' for --after setting]"
  "$CORPUS/run-masses" "$SA_VER" $OPTS --after "$AFTER" > /dev/null
fi
# The exit status of run-masses is not consulted; the run counts as usable
# only when both log files exist and are non-empty.
if [ ! -s ham.log ] || [ ! -s spam.log ]; then
  echo "There seems to be a problem with either ham.log or spam.log, aborting!" >&2
  exit 1
fi
# now we have our ham.log and spam.log files...
# Rename the logs with the per-user suffix, upload ham/spam to the rsync
# server, optionally move everything to $FINALDIR, then print the results.
# All expansions are quoted so user names and directories containing
# whitespace or glob characters are handled as single arguments.
echo "[Uploading daily corpus logs]"
mv ham.log "ham-$FILENAME.log"
mv spam.log "spam-$FILENAME.log"
mv results.log "results-$FILENAME.log"
# An upload failure is reported but does not stop the script.
if ! $RSYNC -qPcvzb "ham-$FILENAME.log" "spam-$FILENAME.log" "$RSYNC_USER@rsync.spamassassin.org::corpus/"; then
  echo "There was an error during rsync!" >&2
fi
RESULTDIR=.
if [ -n "$FINALDIR" ]; then
  mv -f -- "ham-$FILENAME.log" "spam-$FILENAME.log" "$FINALDIR"
  # If either log is still here, the move did not work.
  if [ -f "ham-$FILENAME.log" ] || [ -f "spam-$FILENAME.log" ]; then
    echo "There was an error moving files around, aborting!" >&2
    exit 1
  fi
  RESULTDIR=$FINALDIR
  # The results file goes into the "hf" subdirectory when one exists.
  if [ -d "$FINALDIR/hf" ]; then
    RESULTDIR="$FINALDIR/hf"
  fi
  mv -f -- "results-$FILENAME.log" "$RESULTDIR"
fi
echo "[Our results]"
cat -- "$RESULTDIR/results-$FILENAME.log"