blob: 962ef1f581947526d10feb90e46757afdd3759e6 [file] [log] [blame]
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Strings together the preprocessing scripts
# This script takes
# - a source and target language
# - a language pack directory (which should contain 'prepare.sh', 'joshua', and 'joshua.config')
# - a path prefix pointing to a test set (e.g., a line from ~mpost/hybrid16/parallel/$lang/test.txt)
#
# It then decodes the test set with the specified model. It assumes the
# model pointed to in the tuned config is the unfiltered, fully packed
# model, which is nice, because you don't have to filter and re-pack.
#
# e.g.,
# qsub test.sh es en apache-joshua-es-en ~mpost/hybrid16/parallel/ar/test/globalvoices-test
#
# The results will be written to a directory evaluation/TESTSET within the language pack.
# The decoder output will be stored in a file named "out" (with the decoder's stderr in "log")
# and the BLEU score in a file named "bleu".
# QSUB PARAMETERS
#$ -S /bin/bash -V
#$ -cwd
#$ -j y -o logs
#$ -l mem_free=32G,h_rt=4:00:00,num_proc=8
#$ -m aes
# Load the user's shell settings before doing anything else.
. ~/.bashrc

# Default TMP to /scratch, but only when it is not set at all.
: "${TMP=/scratch}"

# Positional arguments. ${N:-} yields an empty string for a missing argument,
# so the usage check below is the single place that handles a short argument
# list.
sourcelang=${1:-}
targetlang=${2:-}
lpdir=${3:-}
testset=${4:-}

# All four arguments are required. Print the usage on stderr and exit with a
# non-zero status so the caller can tell that nothing was run.
if [[ -z $testset ]]; then
  {
    echo "Usage: test-lp SOURCE TARGET LANGUAGE_PACK TEST_PREFIX"
    echo "where"
    echo " SOURCE is the source language extension (e.g., es)"
    echo " TARGET is the target language extension (e.g., en)"
    echo " LANGUAGE_PACK points to a language pack directory"
    echo " TEST_PREFIX is the path prefix to a test set"
  } >&2
  exit 1
fi
# From here on, referencing an unset variable is an error.
set -u

# Ensure the source side of the test set exists and is non-empty.
if [[ ! -s "$testset.$sourcelang" ]]; then
  echo "* FATAL: can't find test set '$testset.$sourcelang'" >&2
  exit 1
fi

# Results are written to evaluation/<test set name> inside the language pack.
test_name=$(basename -- "$testset")
rundir=$lpdir/evaluation/$test_name
if ! mkdir -p -- "$rundir"; then
  echo "* FATAL: can't create output directory '$rundir'" >&2
  exit 1
fi

# Make sure the language pack provides the decoder, its config file and the
# preprocessing script (each must exist and be non-empty).
binary=$lpdir/joshua
config=$lpdir/joshua.config
prepare=$lpdir/prepare.sh
for file in "$binary" "$config" "$prepare"; do
  if [[ ! -s "$file" ]]; then
    echo "* FATAL: '$lpdir' doesn't look like a language pack (can't find $file)." >&2
    exit 1
  fi
done
# Decode
echo "Decoding $test_name with $lpdir..."

# Stage the source and target sides of the test set in the run directory,
# leaving any copy that is already there untouched.
for file in "$testset.$sourcelang" "$testset.$targetlang"; do
  staged=$rundir/$(basename -- "$file")
  [[ -e "$staged" ]] || cp -- "$file" "$staged"
done

# Preprocess the source side and feed it to the decoder. The decoder's stdout
# goes to "out" and its stderr to "log".
# NOTE(review): unlike tokenize(), no lang= is passed to prepare.sh here --
# confirm that prepare.sh picks the right language on its own.
"$lpdir/prepare.sh" < "$rundir/$test_name.$sourcelang" \
  | "$lpdir/joshua" > "$rundir/out" 2> "$rundir/log"
# Run a file through the language pack's prepare script and then through
# scripts/lowercase.pl, writing the result to a new file.
#
# Globals:   prepare (read) - path of the prepare script
#            lpdir   (read) - language pack directory
# Arguments: $1 - input file
#            $2 - language code, passed to the prepare script as the
#                 environment variable `lang` (for that command only)
#            $3 - output file (overwritten)
#
# All paths are quoted so that names containing whitespace work.
tokenize() {
  local infile=$1
  local lang_code=$2
  local outfile=$3

  lang=$lang_code "$prepare" < "$infile" \
    | "$lpdir/scripts/lowercase.pl" > "$outfile"
}
# Score the decoder output with BLEU when the Joshua scorer is available.
# JOSHUA is read through ${JOSHUA:-} because the script runs under `set -u`:
# a bare $JOSHUA would abort with "unbound variable" when it is not set,
# whereas the intent here is simply to skip scoring.
joshua_home=${JOSHUA:-}
if [[ -n $joshua_home && -x $joshua_home/bin/bleu ]]; then
  echo -n "Scoring with BLEU..."
  # Put hypothesis and reference through the same preprocessing.
  tokenize "$rundir/out" "$targetlang" "$rundir/out.tok"
  tokenize "$rundir/$test_name.$targetlang" "$targetlang" "$rundir/ref.tok"
  "$joshua_home/bin/bleu" "$rundir/out.tok" "$rundir/ref.tok" > "$rundir/bleu"
  # Print only the score: the last field of the line containing "BLEU =".
  awk '/BLEU =/ {print $NF}' "$rundir/bleu"
fi