| #!/bin/bash |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
# Strings together a language pack's preprocessing, decoding, and BLEU scoring steps for one test set.
| |
# This script takes
# - a source and target language
# - a language pack directory (which should contain the scripts 'prepare.sh' and 'joshua',
#   plus the config file 'joshua.config')
# - a prefix pointing to a test set (e.g., a line from ~mpost/hybrid16/parallel/$lang/test.txt)
#
# It then decodes the test set with the specified model. It assumes the
# model pointed to in the tuned config is the unfiltered, fully packed
# model, which is nice, because you don't have to filter and re-pack.
| # |
| # e.g., |
| # qsub test.sh es en apache-joshua-es-en ~mpost/hybrid16/parallel/ar/test/globalvoices-test |
| # |
# The results will be written to a directory evaluation/TESTSET within the language pack.
# The decoder output will be stored in a file named "out", the decoder's stderr in a file
# named "log", and the BLEU scorer's output in a file named "bleu".
| |
| # QSUB PARAMETERS |
| #$ -S /bin/bash -V |
| #$ -cwd |
| #$ -j y -o logs |
| #$ -l mem_free=32G,h_rt=4:00:00,num_proc=8 |
| #$ -m aes |
| |
# Load the user's shell configuration.
. ~/.bashrc

# Default TMP to /scratch unless the environment already defines it.
: "${TMP=/scratch}"

# Positional arguments: source language, target language, language pack
# directory, and test set path prefix (see the usage text below).
sourcelang=${1:-}
targetlang=${2:-}
lpdir=${3:-}
testset=${4:-}

# All four arguments are required. On a usage error, print the help text to
# stderr and exit non-zero so that callers can detect the failure.
if [[ -z $testset ]]; then
  {
    echo "Usage: test-lp SOURCE TARGET LANGUAGE_PACK TEST_PREFIX"
    echo "where"
    echo " SOURCE is the source language extension (e.g., es)"
    echo " TARGET is the target language extension (e.g., en)"
    echo " LANGUAGE_PACK points to a language pack directory"
    echo " TEST_PREFIX is the path prefix to a test set"
  } >&2
  exit 1
fi
| |
# From here on, expanding an unset variable is an error.
set -u

# Ensure the source side of the test set exists and is non-empty.
if [[ ! -s "$testset.$sourcelang" ]]; then
  echo "* FATAL: can't find test set '$testset.$sourcelang'" >&2
  exit 1
fi

# Results are written to evaluation/<test set name> inside the language pack.
test_name=$(basename -- "$testset")
rundir=$lpdir/evaluation/$test_name
if ! mkdir -p -- "$rundir"; then
  echo "* FATAL: can't create run directory '$rundir'" >&2
  exit 1
fi

# Make sure the language pack has the files this script relies on: the
# 'joshua' script, its config file, and 'prepare.sh' (each must be non-empty).
binary=$lpdir/joshua
config=$lpdir/joshua.config
prepare=$lpdir/prepare.sh
for file in "$binary" "$config" "$prepare"; do
  if [[ ! -s "$file" ]]; then
    echo "* FATAL: '$lpdir' doesn't look like a language pack (can't find $file)." >&2
    exit 1
  fi
done
| |
# Decode
echo "Decoding $test_name with $lpdir..."

# Copy both sides of the test set into the run directory, leaving any copy
# that is already there untouched.
for file in "$testset.$sourcelang" "$testset.$targetlang"; do
  dest=$rundir/$(basename -- "$file")
  [[ -e "$dest" ]] || cp -- "$file" "$dest"
done

# Preprocess the source side and feed it to the decoder. Translations go to
# 'out'; the decoder's stderr goes to 'log'.
"$prepare" < "$rundir/$test_name.$sourcelang" \
  | "$binary" > "$rundir/out" 2> "$rundir/log"
decode_status=$?

# Without pipefail the pipeline status is the decoder's own exit status.
# Report a failure instead of silently dropping it; the script still continues.
if (( decode_status != 0 )); then
  echo "* WARNING: decoder exited with status $decode_status (see '$rundir/log')" >&2
fi
| |
#######################################
# Preprocess and lowercase a file.
# Runs the input through the language pack's prepare.sh with the environment
# variable 'lang' set for that command, then through scripts/lowercase.pl.
# Globals:   prepare (read), lpdir (read)
# Arguments: $1 - input file
#            $2 - language code passed to prepare.sh via 'lang'
#            $3 - output file (overwritten)
# Returns:   exit status of the lowercase.pl stage
#######################################
tokenize() {
  local input=$1 tok_lang=$2 output=$3
  lang="$tok_lang" "$prepare" < "$input" \
    | "$lpdir/scripts/lowercase.pl" > "$output"
}
| |
# Score only when a BLEU scorer is available under $JOSHUA. The script runs
# with 'set -u', so JOSHUA is read through ${JOSHUA:-}; an unset JOSHUA then
# skips scoring instead of aborting with an unbound-variable error.
if [[ -n "${JOSHUA:-}" && -x "$JOSHUA/bin/bleu" ]]; then
  echo -n "Scoring with BLEU..."
  # Put the system output and the reference through the same
  # preprocessing and lowercasing before comparing them.
  tokenize "$rundir/out" "$targetlang" "$rundir/out.tok"
  tokenize "$rundir/$test_name.$targetlang" "$targetlang" "$rundir/ref.tok"
  "$JOSHUA/bin/bleu" "$rundir/out.tok" "$rundir/ref.tok" > "$rundir/bleu"
  # Print the last field of the scorer's "BLEU =" line.
  awk '/BLEU =/ {print $NF}' "$rundir/bleu"
fi
| |