#!/bin/bash -e
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
if [ $# -ne 5 ]
then
    echo "Usage: pipeline.sh source target alignments target_parsed filter_set"
    exit 2
fi
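# Example invocation (file names are hypothetical):
#   ./pipeline.sh corpus.fr corpus.en corpus.align corpus.en.parsed filter-set.fr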
MOSES="/home/hltcoe/ccallison/Moses/trunk/"
SAMT="/home/hltcoe/ccallison/SAMT/"
TMP="/tmp"
export MALLOC_CHECK_=0
#
# This is a terrible and hackish script. Use with care.
# Several assumptions and requirements exist.
#
# (1) Set the above variables to the installation directories of
# the respective systems.
#
# (2) Add $JOSHUA/scripts/{samt,toolkit} to your $PATH, where
# $JOSHUA is your Joshua install directory
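# e.g. (adjust to your layout):
#   export PATH="$PATH:$JOSHUA/scripts/samt:$JOSHUA/scripts/toolkit"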
#
# (3) The script expects you to have created the lexprob files as
# follows:
# - create a data.josh for the corpus (see PDF documentation)
#
# - create a Joshua lexprob file for the corpus (see Joshua
# dev list, messages 707 and 708). Take care to run from the
# Joshua installation dir for correct classpath. The output
# should be a file named data.lexprobs (set name in XML file).
#
# - run lexprob2samt.py data.lexprobs to split into two
# SAMT-format lexprob files, data.lexprobs.samt.{sgt,tgs}
#
# (4) You'll need to change gzcat to zcat for this to run on Linux.
#
# (5) For the chunking to be of maximal use, it would be best if the
# sentence length distribution were uniform over the whole corpus.
# I'll whip up a script for that sometime soon, if you deem it
# useful.
# TODO: dynamic chunking
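# chunki.py splits the four inputs into parallel chunk_* directories;
# the loop below expects each chunk directory to contain its own copy
# of $1..$4 (source, target, alignments, target parse).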
chunki.py 200 $1 $2 $3 $4
HOLD_FOR=""
for C in chunk_*; do
RUN_PHRASE_EXTRACT="${TMP}/samt.phrase_extract.${C}.sh"
echo "($MOSES/scripts/training/phrase-extract/extract \
$C/$2 $C/$1 \
$C/$3 extract 8 --OnlyOutputSpanInfo > \
$C/phrases.log )" \
> $RUN_PHRASE_EXTRACT
chmod u+x $RUN_PHRASE_EXTRACT
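# Submit phrase extraction; the sed strips the numeric job ID out of
# qsub's "Your job N (...) has been submitted" message so the
# rule-extraction job for this chunk can hold on it.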
JOB_ID=`qsub -V -cwd -N samt.phrase_extract.${C} \
$RUN_PHRASE_EXTRACT | \
sed -e "s/Your job \([0-9]*\).* has been submitted/\1/g"`;
RUN_RULE_EXTRACT="${TMP}/samt.rule_extract.${C}.sh"
echo "(($SAMT/scripts/extractrules.pl \
--PhrasePairFeedFile $C/phrases.log \
--TargetParseTreeFile $C/$4 \
-r $5 \
--MaxSourceLength 12 \
--LexicalWeightFile data.lexprobs.samt.sgt \
--LexicalWeightFileReversed data.lexprobs.samt.tgs | \
gzip > $C/extractrules.gz ) >& $C/extractrules.log )" \
> $RUN_RULE_EXTRACT
chmod u+x $RUN_RULE_EXTRACT
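# Submit rule extraction, held until this chunk's phrase extraction
# finishes, and append its job ID to HOLD_FOR so the merge step can
# wait on every chunk.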
HOLD_FOR="${HOLD_FOR},"`qsub -V -cwd \
-N samt.rule_extract.${C} \
-hold_jid ${JOB_ID} \
$RUN_RULE_EXTRACT | \
sed -e "s/Your job \([0-9]*\).* has been submitted/\1/g"`;
done
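# Drop the leading comma so HOLD_FOR is a valid -hold_jid list.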
HOLD_FOR=`echo ${HOLD_FOR} | sed -e "s/.\(.*\)/\1/g"`
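# Merge the per-chunk rule files once all rule-extraction jobs finish.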
RUN_RULE_MERGE="${TMP}/samt.rule_merge.sh"
echo "zcat chunk_*/extractrules.gz | $SAMT/scripts/sortsafe.sh -T $TMP | \
$SAMT/myoptions.coe/MergeRules 0 0 8 8 0 | gzip > mergedrules.gz" > \
$RUN_RULE_MERGE
chmod u+x $RUN_RULE_MERGE
JOB_ID=`qsub -V -cwd -N samt.rule_merge \
-hold_jid ${HOLD_FOR} \
$RUN_RULE_MERGE | \
sed -e "s/Your job \([0-9]*\).* has been submitted/\1/g"`;
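# Filter and score the merged rules after the merge job completes.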
RUN_RULE_FILTER="${TMP}/samt.rule_filter.sh"
echo "((zcat mergedrules.gz | \
$SAMT/scripts/filterrules.pl --cachesize 4000 \
--PhrasalFeatureCount 0 \
--LexicalWeightFile data.lexprobs.samt.sgt \
--LexicalWeightFileReversed data.lexprobs.samt.tgs \
--MinOccurrenceCountLexicalrules 0 --MinOccurrenceCountNonlexicalrules 0 \
--noUsePerlHashForRules | \
gzip > filteredrules.gz ) >& filteredrules.log)" > \
$RUN_RULE_FILTER
chmod u+x $RUN_RULE_FILTER
JOB_ID=`qsub -V -cwd -N samt.rule_filter \
-hold_jid ${JOB_ID} \
$RUN_RULE_FILTER | \
sed -e "s/Your job \([0-9]*\).* has been submitted/\1/g"`;
# throw away rules that do not have target side terminals
#zgrep -v "^\([^#]* \)*[^ @#][^ @#]*[^#]*#\(@[0-9][ ]*\)*#" filteredrules.gz | \
# grep -v "#1 [0-9]" | gzip > filteredrules.clean.gz
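# (also commented out) derive the original SAMT glue grammar (@GOAL
# rules) from the filtered rules: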
#zgrep "#1 [0-9]" filteredrules.gz | grep -v COUNT | \
# sed -e "s/@_S/@GOAL/g" | \
# awk '{ print $0; gsub(/ @2/, ""); gsub(/^@GOAL /, ""); print; }' | \
# gzip > samt.original.glue.gz