#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
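# Takes a grammar and a corpus and filters the grammar to each
# sentence in that corpus. Filtered files are placed in a directory
# filtered/ under $rundir (default: the current directory), which is
# created if it does not already exist. Sentences are numbered from 1
# but output files are numbered from 0: the grammar for sentence N is
# written to grammar.filtered.(N-1).gz. We assume the input grammar
# is compressed with gzip.
#
# Usage: grammar=GRAMMAR.GZ corpus=CORPUS $JOSHUA/scripts/filter_grammar_to_sentences.sh
#
# Environment: sources $CACHEPIPE/bashrc, and uses $THRAX and $JOSHUA
# to locate the scripts it runs. Jobs are submitted with qsub.
#
# Works by calling itself recursively. When $sentno is not defined,
# it makes the filtered/ subdir and submits itself (via qsub) once
# for each sentence of the corpus, passing $sentno to each job.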
# Load the user's shell startup file and the bashrc shipped under $CACHEPIPE
# (presumably what provides the cachecmd command used later -- confirm).
# Both are sourced before `set -u`, so references to unset variables inside
# them are not treated as errors.
. ~/.bashrc
. "$CACHEPIPE/bashrc"

# From here on, expanding an unset variable aborts the script.
set -u

# Defaults for the parameters callers pass in through the environment.
# ${name=value} assigns only when the variable is unset; a variable that is
# set (even to the empty string) is left untouched. The expansions are
# quoted so the resulting values are never word-split or globbed.
: "${rundir=$(pwd)}"                   # directory to run in
: "${sentno=-1}"                       # sentence number; -1 means "not given"
: "${corpus=tune.de.tok.lc}"           # corpus file
: "${grammar=../grammar.filtered.gz}"  # gzip-compressed grammar
# make $corpus and $grammar into complete path names (if not already)
startdir=$(pwd)

# Print the path given as $1 in absolute form: unchanged when it already
# starts with "/", otherwise prefixed with "$startdir/".
#
# A glob pattern in `case` is used instead of the former
#   [[ ! $path =~ "^/" ]]
# because bash (3.2 and later) treats a quoted right-hand side of =~ as a
# literal string rather than a regular expression. That test was therefore
# true for every absolute path, and $startdir was prepended to paths that
# were already complete (as they are when this script resubmits itself).
to_absolute_path() {
  case $1 in
    /*) printf '%s\n' "$1" ;;
    *) printf '%s\n' "$startdir/$1" ;;
  esac
}

corpus=$(to_absolute_path "$corpus")
grammar=$(to_absolute_path "$grammar")
# chdir to $rundir. The filtered/ directory used further down is a relative
# path, so it is only meaningful if this cd succeeded; stop otherwise.
cd "$rundir" || {
  echo "* FATAL: can't chdir to '$rundir'" >&2
  exit 1
}

# Nothing can be done without the corpus. The message goes to stderr and the
# exit status is non-zero so that callers can tell the run failed (a bare
# `exit` here would return the status of the preceding echo, i.e. 0).
if [[ ! -e "$corpus" ]]; then
  echo "* FATAL: can't find corpus '$corpus'" >&2
  exit 1
fi
# Print how many per-sentence grammar files currently exist in filtered/.
# The body is a subshell (parentheses) so that nullglob -- which makes an
# unmatched pattern expand to nothing instead of to itself -- does not leak
# into the rest of the script. Only files matching the name written by the
# worker branch are counted, so unrelated entries cannot skew the total.
count_filtered_grammars() (
  shopt -s nullglob
  found=(filtered/grammar.filtered.*.gz)
  echo "${#found[@]}"
)

# if sentno is defined, then run the cachecmd to build the
# sentence-level grammar file
if [ "$sentno" -gt -1 ]; then
  # Sentences are numbered from 1, output files from 0.
  minus=$((sentno - 1))

  # All outputs are written relative to filtered/; refuse to run elsewhere.
  cd filtered || {
    echo "* FATAL: can't chdir to 'filtered'" >&2
    exit 1
  }

  # cache the filtering step
  tmpfile=.tmp.$sentno
  # NOTE(review): `mid` is an external tool at a hard-coded path; it is
  # presumably printing sentence $sentno of the corpus -- confirm.
  /home/hltcoe/mpost/bin/mid "$sentno" "$corpus" > "$tmpfile"
  # The second argument is a command string handed to cachecmd as-is; the
  # trailing arguments name the grammar and the output file (presumably the
  # dependencies cachecmd tracks -- confirm against cachepipe).
  cachecmd "filter-$sentno" "gzip -cd $grammar | $THRAX/scripts/filter_rules.sh $tmpfile | gzip -9 > grammar.filtered.$minus.gz" "$grammar" "grammar.filtered.$minus.gz"
  rm -f "$tmpfile"
else
  # if sentno is not defined, create the filtered directory and
  # start all the qsub jobs
  mkdir -p filtered || exit 1

  # Number of newline-terminated lines in the corpus. The arithmetic
  # expansion strips any padding that some wc implementations emit.
  numlines=$(($(wc -l < "$corpus")))

  # One job per sentence. A C-style loop runs zero times for an empty
  # corpus, whereas `seq 1 0` counts downwards on some platforms.
  for ((sentno = 1; sentno <= numlines; sentno++)); do
    qsub -cwd -l num_proc=2 -q cpu.q -v "sentno=$sentno,corpus=$corpus,grammar=$grammar" "$JOSHUA/scripts/filter_grammar_to_sentences.sh"
  done

  # wait until one grammar file exists per sentence (note: presents a
  # slight but unlikely race condition, since a file existing does not
  # guarantee that the job producing it has finished writing it)
  numfound=$(count_filtered_grammars)
  while [ "$numfound" -ne "$numlines" ]; do
    echo "waiting for all subprocesses to finish (have $numfound / $numlines)..."
    sleep 60
    numfound=$(count_filtered_grammars)
  done
fi