scripts/filter_grammar_to_sentences.sh - joshua - Git at Google

 #!/bin/bash
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Takes a grammar and a corpus and filters the grammar to each
 # sentence in that corpus.  Filtered files are placed in a newly
 # created directory filtered/ in the current directory.  Each file
 # has the name grammar.filtered.SENTNO.gz.  We assume the input
 # grammar is compressed with gzip.

 # Usage: grammar=GRAMMAR.GZ corpus=CORPUS ./filter.sh

 # Works by calling itself recursively.  When $sentno is not defined,
 # it makes the filtered/ subdir and calls itself for each sentence
 # of the corpus.

 . ~/.bashrc
 . $CACHEPIPE/bashrc

 set -u

 : ${rundir=$(pwd)}
 : ${sentno=-1}
 : ${corpus=tune.de.tok.lc}
 : ${grammar=../grammar.filtered.gz}

 # make $corpus and $grammar into complete path names (if not already)
 startdir=$(pwd)
 if [[ ! $corpus =~ "^/" ]]; then
 	corpus="$startdir/$corpus"
 fi
 if [[ ! $grammar =~ "^/" ]]; then
 	grammar="$startdir/$grammar"
 fi

 # chdir to $rundir
 cd $rundir

 if ! test -e "$corpus"; then
 	echo "* FATAL: can't find corpus '$corpus'"
 	exit
 fi

 # if sentno is defined, then run the cachecmd to build the
 # sentence-level grammar file
 if test $sentno -gt -1; then

 	let minus=sentno-1

 	cd filtered

 	# cache the filtering step
 	tmpfile=.tmp.$sentno
 	/home/hltcoe/mpost/bin/mid $sentno $corpus > $tmpfile
 	cachecmd filter-$sentno "gzip -cd $grammar | $THRAX/scripts/filter_rules.sh $tmpfile | gzip -9 > grammar.filtered.$minus.gz" $grammar grammar.filtered.$minus.gz
 	rm -f $tmpfile

 else

 	# if sentno is not defined, create the filtered directory and
 	# start all the qsub jobs
 	[[ ! -d "filtered" ]] && mkdir filtered
 	numlines=$(cat $corpus | wc -l)
 	for sentno in $(seq 1 $numlines); do
 		qsub -cwd -l num_proc=2 -q cpu.q -v sentno=$sentno,corpus=$corpus,grammar=$grammar $JOSHUA/scripts/filter_grammar_to_sentences.sh
 	done

 	# wait for the last grammar to be finished (note: presents a
 	# slight but unlikely race condition, since the last sentence
 	# won't necessarily be the last grammar to be finished writing,
 	# and it might not be done)
 	numfound=$(ls filtered/ | wc -l)
 	while test $numfound -ne $numlines; do
 		echo "waiting for all subprocesses to finish (have $numfound / $numlines)..."
 		sleep 60
 		numfound=$(ls filtered/ | wc -l)
 	done
 fi
	#!/bin/bash
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	# Takes a grammar and a corpus and filters the grammar to each
	# sentence in that corpus. Filtered files are placed in a newly
	# created directory filtered/ in the current directory. Each file
	# has the name grammar.filtered.SENTNO.gz. We assume the input
	# grammar is compressed with gzip.

	# Usage: grammar=GRAMMAR.GZ corpus=CORPUS ./filter.sh

	# Works by calling itself recursively. When $sentno is not defined,
	# it makes the filtered/ subdir and calls itself for each sentence
	# of the corpus.

	. ~/.bashrc
	. $CACHEPIPE/bashrc

	set -u

	: ${rundir=$(pwd)}
	: ${sentno=-1}
	: ${corpus=tune.de.tok.lc}
	: ${grammar=../grammar.filtered.gz}

	# make $corpus and $grammar into complete path names (if not already)
	startdir=$(pwd)
	if [[ ! $corpus =~ "^/" ]]; then
	corpus="$startdir/$corpus"
	fi
	if [[ ! $grammar =~ "^/" ]]; then
	grammar="$startdir/$grammar"
	fi

	# chdir to $rundir
	cd $rundir

	if ! test -e "$corpus"; then
	echo "* FATAL: can't find corpus '$corpus'"
	exit
	fi

	# if sentno is defined, then run the cachecmd to build the
	# sentence-level grammar file
	if test $sentno -gt -1; then

	let minus=sentno-1

	cd filtered

	# cache the filtering step
	tmpfile=.tmp.$sentno
	/home/hltcoe/mpost/bin/mid $sentno $corpus > $tmpfile
	cachecmd filter-$sentno "gzip -cd $grammar \| $THRAX/scripts/filter_rules.sh $tmpfile \| gzip -9 > grammar.filtered.$minus.gz" $grammar grammar.filtered.$minus.gz
	rm -f $tmpfile

	else

	# if sentno is not defined, create the filtered directory and
	# start all the qsub jobs
	[[ ! -d "filtered" ]] && mkdir filtered
	numlines=$(cat $corpus \| wc -l)
	for sentno in $(seq 1 $numlines); do
	qsub -cwd -l num_proc=2 -q cpu.q -v sentno=$sentno,corpus=$corpus,grammar=$grammar $JOSHUA/scripts/filter_grammar_to_sentences.sh
	done

	# wait for the last grammar to be finished (note: presents a
	# slight but unlikely race condition, since the last sentence
	# won't necessarily be the last grammar to be finished writing,
	# and it might not be done)
	numfound=$(ls filtered/ \| wc -l)
	while test $numfound -ne $numlines; do
	echo "waiting for all subprocesses to finish (have $numfound / $numlines)..."
	sleep 60
	numfound=$(ls filtered/ \| wc -l)
	done
	fi