| #!/usr/bin/perl |
| |
# This script implements the Joshua pipeline. It can run a complete
# pipeline --- from raw training corpora to BLEU scores on a test set
# --- and it allows jumping into arbitrary points of the pipeline. It
# is modeled on train-factored-phrase-model.perl from the Moses
# decoder, but it is built for hierarchical decoding, and handles
# parameter tuning (via MERT) and test-set decoding as well.
| # |
| # Currently implemented: |
| # |
# - decoding with Hiero grammars and SAMT grammars
# - jumping to the SUBSAMPLE, ALIGN, PARSE, THRAX, MERT, and TEST steps
#   (using --first-step and (optionally) --last-step)
# - built on top of CachePipe, so that intermediate results are cached
#   and only re-run if necessary
# - uses Thrax for grammar extraction
| |
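# An example end-to-end invocation, assuming this script is saved as
# pipeline.pl (all paths here are hypothetical):
#
#   pipeline.pl --corpus corpus/europarl --source es --target en \
#     --tune corpus/dev --test corpus/devtest --rundir runs/1
#
# where corpus/europarl.es and corpus/europarl.en (and likewise the
# --tune and --test prefixes) are parallel, line-aligned files.
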
| my $JOSHUA; |
| |
BEGIN {
  # not_defined() (defined at the bottom of this file) isn't compiled
  # yet when this BEGIN block runs, so die with the message directly
  $JOSHUA = $ENV{JOSHUA} or die "* FATAL: environment variable \$JOSHUA is not defined.\n";
  unshift(@INC, "$JOSHUA/scripts/training/cachepipe");
}
| |
| use strict; |
| use warnings; |
| use Getopt::Long; |
| use File::Basename; |
| use Cwd; |
| use POSIX qw[ceil]; |
| use List::Util qw[max min]; |
| use CachePipe; |
| |
| my $HADOOP = undef; |
| my $MOSES_SCRIPTS = $ENV{SCRIPTS_ROOTDIR} or not_defined("SCRIPTS_ROOTDIR"); |
not_defined("JAVA_HOME") unless exists $ENV{JAVA_HOME};
| |
| my (@CORPORA,$TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,$LMFILE,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$THRAX_CONF_FILE); |
| my $FIRST_STEP = "FIRST"; |
| my $LAST_STEP = "LAST"; |
| my $LMFILTER = "$ENV{HOME}/code/filter/filter"; |
| my $MAXLEN = 50; |
| my $DO_FILTER_LM = 1; |
| my $DO_SUBSAMPLE = 0; |
| my $SCRIPTDIR = "$JOSHUA/scripts"; |
| my $TOKENIZER = "$SCRIPTDIR/training/penn-treebank-tokenizer.perl"; |
| my $MOSES_TRAINER = "$MOSES_SCRIPTS/training/train-model.perl"; |
| my $MERTCONFDIR = "$JOSHUA/scripts/training/templates/mert"; |
| my $SRILM = "$ENV{SRILM}/bin/i686-m64/ngram-count"; |
| my $STARTDIR; |
| my $RUNDIR = $STARTDIR = getcwd; |
| my $GRAMMAR_TYPE = "hiero"; |
| |
# this file should exist in the Joshua MERT templates directory; it
# contains the Joshua command invoked by MERT
| my $JOSHUA_CONFIG_ORIG = "$MERTCONFDIR/joshua.config"; |
| my %MERTFILES = ( |
| 'decoder_command' => "$MERTCONFDIR/decoder_command.qsub", |
| 'joshua.config' => $JOSHUA_CONFIG_ORIG, |
| 'mert.config' => "$MERTCONFDIR/mert.config", |
| 'params.txt' => "$MERTCONFDIR/params.txt", |
| ); |
| |
# whether to filter the grammar down to each sentence (sentence-specific TMs)
| my $DO_SENT_SPECIFIC_TM = 0; |
| |
| my $DO_MBR = 1; |
| |
| my $ALIGNER = "giza"; # or "berkeley" |
| |
# Java heap sizes for the various subprocesses (hadoop, the decoder,
# the aligner); you really just have to play around to find out how
# much is enough
| my $HADOOP_MEM = "4g"; |
| my $JOSHUA_MEM = "3100m"; |
| my $ALIGNER_MEM = "10g"; |
| my $QSUB_ARGS = "-l num_proc=2"; |
| my $ALIGNER_BLOCKSIZE = 1000000; |
| my $NUM_JOBS = 1; |
| my $NUM_THREADS = 1; |
| |
| my @STEPS = qw[FIRST SUBSAMPLE ALIGN PARSE THRAX MERT TEST LAST]; |
| my %STEPS = map { $STEPS[$_] => $_ + 1 } (0..$#STEPS); |
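# i.e., %STEPS = (FIRST => 1, SUBSAMPLE => 2, ALIGN => 3, PARSE => 4,
#                 THRAX => 5, MERT => 6, TEST => 7, LAST => 8), so that
# pipeline positions can be compared numerically below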
| |
| my $retval = GetOptions( |
| "corpus=s" => \@CORPORA, |
| "tune=s" => \$TUNE, |
| "test=s" => \$TEST, |
| "aligner=s" => \$ALIGNER, |
| "alignment=s" => \$ALIGNMENT, |
| "aligner-mem=s" => \$ALIGNER_MEM, |
| "source=s" => \$SOURCE, |
| "target=s" => \$TARGET, |
| "rundir=s" => \$RUNDIR, |
| "filter-tm!" => \$DO_SENT_SPECIFIC_TM, |
| "filter-lm!" => \$DO_FILTER_LM, |
| "lmfile=s" => \$LMFILE, |
| "grammar=s" => \$GRAMMAR_FILE, |
| "glue-grammar=s" => \$GLUE_GRAMMAR_FILE, |
| "mbr!" => \$DO_MBR, |
| "type=s" => \$GRAMMAR_TYPE, |
| "maxlen=i" => \$MAXLEN, |
| "tokenizer=s" => \$TOKENIZER, |
| "joshua-config=s" => \$MERTFILES{'joshua.config'}, |
| "joshua-mem=s" => \$JOSHUA_MEM, |
| "hadoop-mem=s" => \$HADOOP_MEM, |
| "decoder-command=s" => \$MERTFILES{'decoder_command'}, |
| "thrax-conf=s" => \$THRAX_CONF_FILE, |
| "jobs=i" => \$NUM_JOBS, |
| "threads=i" => \$NUM_THREADS, |
| "subsample!" => \$DO_SUBSAMPLE, |
| "qsub-args=s" => \$QSUB_ARGS, |
| "first-step=s" => \$FIRST_STEP, |
| "last-step=s" => \$LAST_STEP, |
| "aligner-chunk-size=s" => \$ALIGNER_BLOCKSIZE, |
| "hadoop=s" => \$HADOOP, |
| ); |
| |
| if (! $retval) { |
| print "Invalid usage, quitting\n"; |
| exit 1; |
| } |
| |
# uppercase these to guard against a common error:
| $FIRST_STEP = uc($FIRST_STEP); |
| $LAST_STEP = uc($LAST_STEP); |
| |
| $| = 1; |
| |
| my $cachepipe = new CachePipe(); |
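# CachePipe's cmd() method, as used throughout this script, takes a
# symbolic step name, a shell command, and a list of the files the
# command depends on or produces; the command is re-run only if it or
# one of those files has changed since the last cached run.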
| |
| $SIG{INT} = sub { |
| print "* Got C-c, quitting\n"; |
| $cachepipe->cleanup(); |
| exit 1; |
| }; |
| |
| ## Sanity Checking ################################################### |
| |
| if (defined $ENV{HADOOP} and ! defined $HADOOP) { |
| print "* FATAL: \$HADOOP defined (suggesting an existing hadoop\n"; |
| print "* FATAL: installation). If you want to use this, pass the\n"; |
| print "* FATAL: directory using the --hadoop flag; if you instead want to\n"; |
| print "* FATAL: roll out a new cluster automatically, then unset \$HADOOP\n"; |
| print "* FATAL: and re-run the script.\n"; |
  exit 1;
| } |
| |
| # make sure a corpus was provided if we're doing any step before MERT |
| if (@CORPORA == 0 and $STEPS{$FIRST_STEP} < $STEPS{MERT}) { |
| print "* FATAL: need at least one training corpus (--corpus)\n"; |
| exit 1; |
| } |
| |
| # make sure a tuning corpus was provided if we're doing MERT |
| if (! defined $TUNE and ($STEPS{$FIRST_STEP} <= $STEPS{MERT} |
| and $STEPS{$LAST_STEP} >= $STEPS{MERT})) { |
| print "* FATAL: need a tuning set (--tune)\n"; |
| exit 1; |
| } |
| |
| # make sure a test corpus was provided if we're decoding a test set |
| if (! defined $TEST and ($STEPS{$FIRST_STEP} <= $STEPS{TEST} |
| and $STEPS{$LAST_STEP} >= $STEPS{TEST})) { |
| print "* FATAL: need a test set (--test)\n"; |
| exit 1; |
| } |
| |
| # make sure a grammar file was given if we're skipping training |
| if (! defined $GRAMMAR_FILE and ($STEPS{$FIRST_STEP} >= $STEPS{MERT})) { |
| print "* FATAL: need a grammar (--grammar) if you're skipping that step\n"; |
| exit 1; |
| } |
| |
# if a corpus was given as a relative path, prepend the starting
# directory (under the assumption it was relative to there)
foreach my $i (0..$#CORPORA) {
  $CORPORA[$i] = "$STARTDIR/$CORPORA[$i]" unless $CORPORA[$i] =~ /^\//;
}
| |
| foreach my $corpus (@CORPORA) { |
| foreach my $ext ($TARGET,$SOURCE) { |
| if (! -e "$corpus.$ext") { |
| print "* FATAL: can't find '$corpus.$ext'"; |
| exit 1; |
| } |
| } |
| } |
| |
| if ($ALIGNER ne "giza" and $ALIGNER ne "berkeley") { |
| print "* FATAL: aligner must be one of 'giza' or 'berkeley'\n"; |
| exit 1; |
| } |
| |
| |
| ## Dependent variable setting ######################################## |
| |
| # if parallelization is turned off, then use the sequential version of |
| # the decoder command |
| if ($NUM_JOBS == 1) { |
| $MERTFILES{'decoder_command'} = "$MERTCONFDIR/decoder_command.sequential"; |
| } |
| |
| my $OOV = ($GRAMMAR_TYPE eq "samt") ? "OOV" : "X"; |
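# default nonterminal label: SAMT runs mark unknown words with a
# dedicated OOV nonterminal, while Hiero grammars use the generic X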
| |
| # use this default unless it's already been defined by a command-line argument |
| $THRAX_CONF_FILE = "$JOSHUA/scripts/training/templates/thrax-$GRAMMAR_TYPE.conf" unless defined $THRAX_CONF_FILE; |
| |
| mkdir $RUNDIR unless -d $RUNDIR; |
| chdir($RUNDIR); |
| |
| # default values -- these are overridden if the full script is run |
| # (after tokenization and normalization) |
| my (%TRAIN,%TUNE,%TEST); |
| if (@CORPORA) { |
| $TRAIN{prefix} = $CORPORA[0]; |
| $TRAIN{source} = "$CORPORA[0].$SOURCE"; |
| $TRAIN{target} = "$CORPORA[0].$TARGET"; |
| } |
| |
| if ($TUNE) { |
| $TUNE{source} = "$TUNE.$SOURCE"; |
| $TUNE{target} = "$TUNE.$TARGET"; |
| } |
| |
| if ($TEST) { |
| $TEST{source} = "$TEST.$SOURCE"; |
| $TEST{target} = "$TEST.$TARGET"; |
| } |
| |
| if ($FIRST_STEP ne "FIRST") { |
| if (@CORPORA > 1) { |
| print "* FATAL: you can't skip steps if you specify more than one --corpus\n"; |
| exit(1); |
| } |
| |
  if (! exists $STEPS{$FIRST_STEP}) {
    print "* No such step $FIRST_STEP\n";
    exit 1;
  }

  print "* Skipping to step $FIRST_STEP\n";
  goto $FIRST_STEP;
| } |
| |
| ## STEP 1: filter and preprocess corpora ############################# |
| FIRST: |
| |
| if (defined $ALIGNMENT) { |
| print "* FATAL: it doesn't make sense to provide an alignment and then do\n"; |
| print " tokenization. Either remove --alignment or specify a first step\n"; |
| print " of Thrax (--first-step THRAX)\n"; |
| exit 1; |
| } |
| |
| if (@CORPORA == 0) { |
| print "* FATAL: need at least one training corpus (--corpus)\n"; |
| exit 1; |
| } |
| |
| # prepare the training data |
| my $prefix = prepare_data("train",\@CORPORA,$MAXLEN); |
| $TRAIN{prefix} = "train/corpus"; |
| foreach my $lang ($SOURCE,$TARGET) { |
| system("ln -sf $prefix.$lang train/corpus.$lang"); |
| } |
| $TRAIN{source} = "train/corpus.$SOURCE"; |
| $TRAIN{target} = "train/corpus.$TARGET"; |
| |
| # prepare the tuning and development data |
| if (defined $TUNE) { |
| my $prefix = prepare_data("tune",[$TUNE]); |
| $TUNE{source} = "tune/$prefix.$SOURCE"; |
| $TUNE{target} = "tune/$prefix.$TARGET"; |
| } |
| |
| if (defined $TEST) { |
| my $prefix = prepare_data("test",[$TEST]); |
| $TEST{source} = "test/$prefix.$SOURCE"; |
| $TEST{target} = "test/$prefix.$TARGET"; |
| } |
| |
| maybe_quit("FIRST"); |
| |
| ## SUBSAMPLE ######################################################### |
| |
| SUBSAMPLE: |
| |
| # subsample |
| if ($DO_SUBSAMPLE) { |
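  # With --subsample, the training data is reduced to sentence pairs
  # deemed relevant to the tuning and test sets (as selected by
  # joshua.subsample.Subsampler), which can considerably speed up
  # alignment and grammar extraction.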
| mkdir("train/subsampled") unless -d "train/subsampled"; |
| |
| $cachepipe->cmd("subsample-manifest", |
| "echo corpus > train/subsampled/manifest", |
| "train/subsampled/manifest"); |
| |
| $cachepipe->cmd("subsample-testdata", |
| "cat $TUNE{source} $TEST{source} > train/subsampled/test-data", |
| $TUNE{source}, |
| $TEST{source}, |
| "train/subsampled/test-data"); |
| |
| $cachepipe->cmd("subsample", |
| "java -Xmx4g -Dfile.encoding=utf8 -cp $JOSHUA/bin:$JOSHUA/lib/commons-cli-2.0-SNAPSHOT.jar joshua.subsample.Subsampler -e $TARGET -f $SOURCE -epath train/ -fpath train/ -output train/subsampled/subsampled.$MAXLEN -ratio 1.04 -test train/subsampled/test-data -training train/subsampled/manifest", |
| "train/subsampled/manifest", |
| "train/subsampled/test-data", |
| $TRAIN{source}, |
| $TRAIN{target}, |
| "train/subsampled/subsampled.$MAXLEN.$TARGET", |
| "train/subsampled/subsampled.$MAXLEN.$SOURCE"); |
| |
| # rewrite the symlinks to point to the subsampled corpus |
| foreach my $lang ($TARGET,$SOURCE) { |
| system("ln -sf subsampled/subsampled.$MAXLEN.$lang train/corpus.$lang"); |
| } |
| } |
| |
| maybe_quit("SUBSAMPLE"); |
| |
| |
| ## ALIGN ############################################################# |
| |
| ALIGN: |
| |
| # This basically means that we've skipped tokenization, in which case |
| # we still want to move the input files into the canonical place |
| if ($FIRST_STEP eq "ALIGN") { |
| if (defined $ALIGNMENT) { |
| print "* FATAL: It doesn't make sense to provide an alignment\n"; |
| print " but not to skip the tokenization and subsampling steps\n"; |
| exit 1; |
| } |
| |
| # TODO: copy the files into the canonical place |
| |
| # Jumping straight to alignment is probably the same thing as |
| # skipping tokenization, and might also be implemented by a |
| # --no-tokenization flag |
| } |
| |
| # skip this step if an alignment was provided |
| if (! defined $ALIGNMENT) { |
| |
| # split up the data |
| system("mkdir","-p","train/splits") unless -d "train/splits"; |
| |
| $cachepipe->cmd("source-numlines", |
| "cat $TRAIN{source} | wc -l", |
| $TRAIN{source}); |
| my $numlines = $cachepipe->stdout(); |
| my $numchunks = ceil($numlines / $ALIGNER_BLOCKSIZE); |
| |
| open TARGET, $TRAIN{target} or die "can't read $TRAIN{target}"; |
| open SOURCE, $TRAIN{source} or die "can't read $TRAIN{source}"; |
| |
| my $lastchunk = -1; |
| while (my $target = <TARGET>) { |
| my $source = <SOURCE>; |
| |
| # We want to prevent a very small last chunk, which we accomplish |
| # by folding the last chunk into the penultimate chunk. |
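    # e.g., with the default blocksize of 1,000,000 and 2.5M sentence
    # pairs, $numchunks is 3: lines 1 through 1M go to chunk 0, and the
    # remaining 1.5M lines all go to chunk 1 (the small tail is folded in)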
    my $chunk = ($numchunks <= 2)
        ? 0
        : min($numchunks - 2,
              int( ($. - 1) / $ALIGNER_BLOCKSIZE ));
| |
    if ($chunk != $lastchunk) {
      # close the previous chunk's files (skipped on the first
      # iteration, when nothing has been opened yet)
      if ($lastchunk != -1) {
        close CHUNK_SOURCE;
        close CHUNK_TARGET;
      }
      open CHUNK_SOURCE, ">", "train/splits/corpus.$SOURCE.$chunk" or die "can't write to 'train/splits/corpus.$SOURCE.$chunk'";
      open CHUNK_TARGET, ">", "train/splits/corpus.$TARGET.$chunk" or die "can't write to 'train/splits/corpus.$TARGET.$chunk'";

      $lastchunk = $chunk;
    }
| |
| print CHUNK_SOURCE $source; |
| print CHUNK_TARGET $target; |
| } |
| close CHUNK_SOURCE; |
| close CHUNK_TARGET; |
| |
| close SOURCE; |
| close TARGET; |
| |
| for (my $chunkno = 0; $chunkno <= $lastchunk; $chunkno++) { |
| |
| # create the alignment subdirectory |
| my $chunkdir = "alignments/$chunkno"; |
| system("mkdir","-p", $chunkdir); |
| |
| if ($ALIGNER eq "giza") { |
| |
| # run the alignments commands |
| $cachepipe->cmd("giza-$chunkno", |
| "rm -f $chunkdir/corpus.0-0.*; $MOSES_TRAINER -root-dir $chunkdir -e $TARGET.$chunkno -f $SOURCE.$chunkno -corpus train/splits/corpus -first-step 1 -last-step 3 > $chunkdir/giza.log 2>&1", |
| "train/splits/corpus.$SOURCE.$chunkno", |
| "train/splits/corpus.$TARGET.$chunkno", |
| "$chunkdir/model/aligned.grow-diag-final"); |
| |
| } elsif ($ALIGNER eq "berkeley") { |
| |
| # copy and modify the config file |
| open FROM, "$JOSHUA/scripts/training/templates/alignment/word-align.conf" or die "can't read berkeley alignment template"; |
| open TO, ">", "alignments/$chunkno/word-align.conf" or die "can't write to 'alignments/$chunkno/word-align.conf'"; |
| while (<FROM>) { |
| s/<SOURCE>/$SOURCE.$chunkno/g; |
| s/<TARGET>/$TARGET.$chunkno/g; |
| s/<CHUNK>/$chunkno/g; |
| |
| print TO; |
| } |
| close(TO); |
| close(FROM); |
| |
| # run the job |
| $cachepipe->cmd("berkeley-aligner-chunk-$chunkno", |
| "java -d64 -Xmx${ALIGNER_MEM} -jar $JOSHUA/lib/berkeleyaligner.jar ++alignments/$chunkno/word-align.conf", |
| "alignments/$chunkno/word-align.conf", |
| "train/splits/corpus.$SOURCE.$chunkno", |
| "train/splits/corpus.$TARGET.$chunkno", |
| "$chunkdir/training.align"); |
| |
| } |
| } |
| |
| if ($ALIGNER eq "giza") { |
| # combine the alignments |
| $cachepipe->cmd("giza-aligner-combine", |
| "cat alignments/*/model/aligned.grow-diag-final > alignments/training.align", |
| "alignments/$lastchunk/model/aligned.grow-diag-final", |
| "alignments/training.align"); |
| } elsif ($ALIGNER eq "berkeley") { |
| |
| # combine the alignments |
| $cachepipe->cmd("berkeley-aligner-combine", |
| "cat alignments/*/training.align > alignments/training.align", |
| "alignments/$lastchunk/training.align", |
| "alignments/training.align"); |
| } |
| |
| $ALIGNMENT = "alignments/training.align"; |
| } |
| |
| maybe_quit("ALIGN"); |
| |
| |
| ## PARSE ############################################################# |
| |
| PARSE: |
| |
| mkdir("train") unless -d "train"; |
| |
| if ($GRAMMAR_TYPE eq "samt") { |
| |
| $cachepipe->cmd("build-vocab", |
| "cat $TRAIN{target} | $SCRIPTDIR/training/build-vocab.pl > train/vocab.$TARGET", |
| $TRAIN{target}, |
| "train/vocab.$TARGET"); |
| |
| $cachepipe->cmd("parse", |
| "cat $TRAIN{target} | java -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr | sed 's/^\(/\(TOP/' | tee train/corpus.$TARGET.parsed.mc | perl -pi -e 's/(\\S+)\\)/lc(\$1).\")\"/ge' | tee train/corpus.$TARGET.parsed | perl $SCRIPTDIR/training/add-OOVs.pl train/vocab.$TARGET > train/corpus.parsed.$TARGET", |
| "$TRAIN{target}", |
| "train/corpus.parsed.$TARGET"); |
| |
| $TRAIN{parsed} = "train/corpus.parsed.$TARGET"; |
| } |
| |
| |
| ## THRAX ############################################################# |
| |
| THRAX: |
| |
| if ($GRAMMAR_TYPE eq "samt") { |
| |
| # if we jumped right here, $TRAIN{target} should be parsed |
| if (exists $TRAIN{parsed}) { |
| # parsing step happened in-script, all is well |
| |
| } elsif (already_parsed($TRAIN{target})) { |
| # skipped straight to this step, passing a parsed corpus |
| |
| mkdir("train") unless -d "train"; |
| |
| $TRAIN{parsed} = "train/corpus.parsed.$TARGET"; |
| |
| $cachepipe->cmd("cp-train-$TARGET", |
| "cp $TRAIN{target} $TRAIN{parsed}", |
| $TRAIN{target}, |
| $TRAIN{parsed}); |
| |
| $TRAIN{target} = "train/corpus.$TARGET"; |
| |
| # now extract the leaves of the parsed corpus |
| $cachepipe->cmd("extract-leaves", |
| "cat $TRAIN{parsed} | perl -pe 's/\\(.*?(\\S\+)\\)\+?/\$1/g' | perl -pe 's/\\)//g' > $TRAIN{target}", |
| $TRAIN{parsed}, |
| $TRAIN{target}); |
| |
| if ($TRAIN{source} ne "train/corpus.$SOURCE") { |
| $cachepipe->cmd("cp-train-$SOURCE", |
| "cp $TRAIN{source} train/corpus.$SOURCE", |
| $TRAIN{source}, "train/corpus.$SOURCE"); |
| $TRAIN{source} = "train/corpus.$SOURCE"; |
| } |
| |
| } else { |
| |
| print "* FATAL: You requested to build an SAMT grammar, but provided an\n"; |
| print " unparsed corpus. Please re-run the pipeline and begin no later\n"; |
| print " than the PARSE step (--first-step PARSE)\n"; |
| exit 1; |
| } |
| |
| } |
| |
| # we may have skipped directly to this step, in which case we need to |
| # ensure an alignment was provided |
| if (! defined $ALIGNMENT) { |
| print "* FATAL: no alignment file specified\n"; |
| exit(1); |
| } |
| |
| if (! defined $GRAMMAR_FILE) { |
| mkdir("train") unless -d "train"; |
| |
| # create the input file |
| my $target_file = ($GRAMMAR_TYPE eq "hiero") |
| ? $TRAIN{target} : $TRAIN{parsed}; |
| $cachepipe->cmd("thrax-input-file", |
| "paste $TRAIN{source} $target_file $ALIGNMENT | perl -pe 's/\\t/ ||| /g' | grep -v '(())' > train/thrax-input-file", |
| $TRAIN{source}, $target_file, $ALIGNMENT, |
| "train/thrax-input-file"); |
| |
| |
| # rollout the hadoop cluster if needed |
| start_hadoop_cluster() unless defined $HADOOP; |
| |
| # put the hadoop files in place |
| my $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR"; |
| $THRAXDIR =~ s#/#_#g; |
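  # e.g., with source "es", target "en", and a run directory of
  # /home/user/runs/1 (all hypothetical), $THRAXDIR becomes
  # pipeline-es-en-hiero-_home_user_runs_1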
| |
| $cachepipe->cmd("thrax-prep", |
| "$HADOOP/bin/hadoop fs -rmr $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put train/thrax-input-file $THRAXDIR/input-file", |
| "train/thrax-input-file", |
| "grammar.gz"); |
| |
| # copy the thrax config file |
| system("grep -v input-file $THRAX_CONF_FILE > thrax-$GRAMMAR_TYPE.conf"); |
| system("echo input-file $THRAXDIR/input-file >> thrax-$GRAMMAR_TYPE.conf"); |
| |
| $cachepipe->cmd("thrax-run", |
| "$HADOOP/bin/hadoop jar $JOSHUA/lib/thrax.jar -D mapred.child.java.opts='-Xmx$HADOOP_MEM' thrax-$GRAMMAR_TYPE.conf $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; $HADOOP/bin/hadoop fs -getmerge $THRAXDIR/final/ grammar; gzip -9f grammar", |
| "train/thrax-input-file", |
| "thrax-$GRAMMAR_TYPE.conf", |
| "grammar.gz"); |
| |
| stop_hadoop_cluster() if $HADOOP eq "hadoop"; |
| |
| # cache the thrax-prep step, which depends on grammar.gz |
| $cachepipe->cmd("thrax-prep", "--cache-only"); |
| |
| # set the grammar file |
| $GRAMMAR_FILE = "grammar.gz"; |
| } |
| |
| maybe_quit("THRAX"); |
| |
| ## MERT ############################################################## |
| MERT: |
| |
# If the language model file wasn't provided, build it from the target
# side of the training data. Otherwise, copy it into place.
if (! defined $LMFILE) {
  if (exists $TRAIN{target}) {
    $LMFILE = "lm.gz";
    $cachepipe->cmd("srilm",
		    "$SRILM -interpolate -kndiscount -order 5 -text $TRAIN{target} -lm lm.gz",
		    $TRAIN{target},
		    $LMFILE);
  } else {
    print "* FATAL: you skipped training and didn't specify a language model\n";
    exit(1);
  }
| } else { |
| if (! -e $LMFILE) { |
| print STDERR "* FATAL: can't find lmfile '$LMFILE'\n"; |
| exit(1); |
| } |
| |
| if ($LMFILE ne "lm.gz") { |
| $cachepipe->cmd("cp-lmfile", |
| "cp $LMFILE lm.gz", |
| $LMFILE, "lm.gz"); |
| $LMFILE = "lm.gz"; |
| } |
| } |
| |
| # filter the tuning LM to the training side of the data (if possible) |
| if (-e $LMFILTER and $DO_FILTER_LM and exists $TRAIN{target}) { |
| |
| $cachepipe->cmd("filter-lmfile", |
| "cat $TRAIN{target} | $LMFILTER union arpa model:$LMFILE lm-filtered; gzip -9f lm-filtered", |
| $LMFILE, "lm-filtered.gz"); |
| $LMFILE = "lm-filtered.gz"; |
| } |
| |
| mkdir("tune") unless -d "tune"; |
| |
| # filter the tuning grammar |
| $cachepipe->cmd("filter-tune", |
| "$SCRIPTDIR/training/scat $GRAMMAR_FILE | java -Dfile.encoding=utf8 -cp $JOSHUA/lib/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TUNE{source} | gzip -9 > tune/grammar.filtered.gz", |
| $GRAMMAR_FILE, |
| $TUNE{source}, |
| "tune/grammar.filtered.gz"); |
| |
| # copy the thrax config file if it's not already there |
| if (! defined $GLUE_GRAMMAR_FILE) { |
| system("grep -v input-file $THRAX_CONF_FILE > thrax-$GRAMMAR_TYPE.conf") |
| unless -e "thrax-$GRAMMAR_TYPE.conf"; |
| $cachepipe->cmd("glue-tune", |
| "$SCRIPTDIR/training/scat tune/grammar.filtered.gz | java -cp $JOSHUA/lib/thrax.jar:$HADOOP/hadoop-core-0.20.203.0.jar:$HADOOP/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar thrax-$GRAMMAR_TYPE.conf > tune/grammar.glue", |
| "tune/grammar.filtered.gz", |
| "tune/grammar.glue"); |
| $GLUE_GRAMMAR_FILE = "tune/grammar.glue"; |
| } else { |
| $cachepipe->cmd("glue-tune-copy", |
| "cp $GLUE_GRAMMAR_FILE tune/grammar.glue", |
| $GLUE_GRAMMAR_FILE, |
| "tune/grammar.glue"); |
| } |
| |
| |
| # figure out how many references there are |
| my $numrefs = get_numrefs($TUNE{target}); |
| |
| mkdir("mert") unless -d "mert"; |
| foreach my $key (keys %MERTFILES) { |
| my $file = $MERTFILES{$key}; |
| open FROM, $file or die "can't find file '$file'"; |
| open TO, ">mert/$key" or die "can't write to file 'mert/$key'"; |
| while (<FROM>) { |
| s/<INPUT>/$TUNE{source}/g; |
| s/<SOURCE>/$SOURCE/g; |
| s/<RUNDIR>/$RUNDIR/g; |
| s/<TARGET>/$TARGET/g; |
| s/<LMFILE>/$LMFILE/g; |
| s/<MEM>/$JOSHUA_MEM/g; |
| s/<GRAMMAR>/$GRAMMAR_TYPE/g; |
| s/<OOV>/$OOV/g; |
| s/<NUMJOBS>/$NUM_JOBS/g; |
| s/<NUMTHREADS>/$NUM_THREADS/g; |
| s/<QSUB_ARGS>/$QSUB_ARGS/g; |
| s/<OUTPUT>/mert\/tune.output.nbest/g; |
| s/<REF>/$TUNE{target}/g; |
| s/<JOSHUA>/$JOSHUA/g; |
| s/<NUMREFS>/$numrefs/g; |
| s/<CONFIG>/mert\/joshua.config/g; |
| s/<LOG>/mert\/joshua.log/g; |
| s/use_sent_specific_tm=.*/use_sent_specific_tm=$DO_SENT_SPECIFIC_TM/; |
| |
| print TO; |
| } |
| close(FROM); |
| close(TO); |
| } |
| chmod(0755,"mert/decoder_command"); |
| |
| # run MERT |
| $cachepipe->cmd("mert", |
| "java -d64 -cp $JOSHUA/bin joshua.zmert.ZMERT -maxMem 4500 mert/mert.config > mert/mert.log 2>&1", |
| "tune/grammar.filtered.gz", |
| "mert/joshua.config.ZMERT.final", |
| "mert/decoder_command", |
| "mert/mert.config", |
| "mert/params.txt"); |
| |
| # remove sentence-level Joshua files |
| #system("rm -rf tune/filtered/"); |
| |
| maybe_quit("MERT"); |
| |
| # set joshua config file location for testing |
| # $JOSHUA_CONFIG = "mert/joshua.config.ZMERT.final"; |
| |
| # If we're not quitting at this step, then copy the final Joshua |
| # config file to the test directory. |
| if ($LAST_STEP ne "MERT") { |
| mkdir("test") unless -d "test"; |
| |
| # for testing, mark OOVs, don't keep sentence-specific grammars |
| $cachepipe->cmd("test-joshua-config-from-mert", |
| "cat mert/joshua.config.ZMERT.final | perl -pe 's#tune/#test/#; s/mark_oovs=false/mark_oovs=true/; s/use_sent_specific_tm=.*/use_sent_specific_tm=$DO_SENT_SPECIFIC_TM/; s/keep_sent_specific_tm=true/keep_sent_specific_tm=false/' > test/joshua.config", |
| "mert/joshua.config.ZMERT.final", |
| "test/joshua.config"); |
| } |
| |
## TEST ##############################################################
| TEST: |
| |
| mkdir("test") unless -d "test"; |
| |
| # If we jumped directly to this step, then the caller is required to |
| # have specified a Joshua config file (fully instantiated, not a |
| # template), which we'll copy in place |
| if ($FIRST_STEP eq "TEST") { |
| if ($MERTFILES{'joshua.config'} eq $JOSHUA_CONFIG_ORIG) { |
| print "* FATAL: you need to explicitly specify a joshua.config (--joshua-config)\n"; |
| print " when starting at the TEST step\n"; |
| exit 1; |
| } |
| |
| $cachepipe->cmd("test-joshua-config", |
| "cp $MERTFILES{'joshua.config'} test/joshua.config", |
| $MERTFILES{'joshua.config'}, |
| "test/joshua.config"); |
| } |
| |
| # filter the test grammar |
| $cachepipe->cmd("filter-test", |
| "$SCRIPTDIR/training/scat $GRAMMAR_FILE | java -Dfile.encoding=utf8 -cp $JOSHUA/lib/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TEST{source} | gzip -9 > test/grammar.filtered.gz", |
| $GRAMMAR_FILE, |
| $TEST{source}, |
| "test/grammar.filtered.gz"); |
| |
| # copy the thrax config file if it's not already there |
| if (! defined $GLUE_GRAMMAR_FILE) { |
| system("grep -v input-file $THRAX_CONF_FILE > thrax-$GRAMMAR_TYPE.conf") |
| unless -e "thrax-$GRAMMAR_TYPE.conf"; |
| |
| $cachepipe->cmd("glue-test", |
| "$SCRIPTDIR/training/scat test/grammar.filtered.gz | java -cp $JOSHUA/lib/thrax.jar:$HADOOP/hadoop-core-20.203.0.jar:$HADOOP/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar thrax-$GRAMMAR_TYPE.conf > test/grammar.glue", |
| "test/grammar.filtered.gz", |
| "test/grammar.glue"); |
| $GLUE_GRAMMAR_FILE = "test/grammar.glue"; |
| } else { |
| $cachepipe->cmd("glue-test-copy", |
| "cp $GLUE_GRAMMAR_FILE test/grammar.glue", |
| $GLUE_GRAMMAR_FILE, |
| "test/grammar.glue"); |
| } |
| |
# the number of test-set references is needed by the decoder template
# below, so compute it before instantiating the templates
$numrefs = get_numrefs($TEST{target});

# decode test set
foreach my $key (qw(decoder_command)) {
| my $file = $MERTFILES{$key}; |
| open FROM, $file or die "can't find file '$file'"; |
| open TO, ">test/$key" or die "can't write to 'test/$key'"; |
| while (<FROM>) { |
| s/<INPUT>/$TEST{source}/g; |
| s/<NUMJOBS>/$NUM_JOBS/g; |
| s/<NUMTHREADS>/$NUM_THREADS/g; |
| s/<QSUB_ARGS>/$QSUB_ARGS/g; |
| s/<OUTPUT>/test\/test.output.nbest/g; |
| s/<JOSHUA>/$JOSHUA/g; |
| s/<NUMREFS>/$numrefs/g; |
| s/<SOURCE>/$SOURCE/g; |
| s/<TARGET>/$TARGET/g; |
    s/<RUNDIR>/$RUNDIR/g;
| s/<LMFILE>/$LMFILE/g; |
| s/<MEM>/$JOSHUA_MEM/g; |
| s/<GRAMMAR>/$GRAMMAR_TYPE/g; |
| s/<OOV>/$OOV/g; |
| s/<CONFIG>/test\/joshua.config/g; |
| s/<LOG>/test\/joshua.log/g; |
| |
| print TO; |
| } |
| close(FROM); |
| close(TO); |
| } |
| chmod(0755,"test/decoder_command"); |
| |
| $cachepipe->cmd("test-decode", |
| "./test/decoder_command", |
| "test/decoder_command", |
| "test/grammar.glue", |
| "test/grammar.filtered.gz", |
| "test/test.output.nbest"); |
| |
| $cachepipe->cmd("remove-oov", |
| "cat test/test.output.nbest | perl -pe 's/_OOV//g' > test/test.output.nbest.noOOV", |
| "test/test.output.nbest", |
| "test/test.output.nbest.noOOV"); |
| |
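# With --mbr (the default), the 1-best output is chosen by Minimum
# Bayes-Risk rescoring of the n-best list; otherwise the top candidate
# is extracted directly.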
if ($DO_MBR) {
| |
| $cachepipe->cmd("test-onebest-parmbr", |
| "cat test/test.output.nbest.noOOV | java -Xmx1700m -cp $JOSHUA/bin joshua.decoder.NbestMinRiskReranker false 1 > test/test.output.1best", |
| "test/test.output.nbest.noOOV", |
| "test/test.output.1best"); |
| } else { |
| $cachepipe->cmd("test-extract-onebest", |
| "java -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.util.ExtractTopCand test/test.output.nbest test/test.output.1best", |
| "test/test.output.nbest.noOOV", |
| "test/test.output.1best"); |
| } |
| |
| $cachepipe->cmd("test-bleu", |
| "java -cp $JOSHUA/bin -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand test/test.output.1best -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > test/test.output.1best.bleu", |
| "test/test.output.1best", "test/test.output.1best.bleu"); |
| |
| system("cat test/test.output.1best.bleu"); |
| |
| ###################################################################### |
| ## SUBROUTINES ####################################################### |
| ###################################################################### |
| LAST: |
| 1; |
| |
| # Does tokenization and normalization of training, tuning, and test data. |
| # $label: one of train, tune, or test |
| # $corpora: arrayref of files (multiple allowed for training data) |
| # $maxlen: maximum length (only applicable to training) |
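# Returns the final file prefix (relative to the $label directory);
# e.g., for the training data with a maxlen of 50, the output files are
# train/train.tok.50.lc.* and the returned prefix is "train.tok.50.lc".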
| sub prepare_data { |
| my ($label,$corpora,$maxlen) = @_; |
| |
| mkdir $label unless -d $label; |
| |
| # copy the data from its original location to our location |
| foreach my $ext ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") { |
| # append each extension to the corpora prefixes |
| my @files = map { "$_.$ext" } @$corpora; |
| # a list of all the files (in case of multiple corpora prefixes) |
| my $files = join(" ",@files); |
| if (-e $files[0]) { |
| $cachepipe->cmd("$label-copy-$ext", |
| "cat $files | gzip -9 > $label/$label.$ext.gz", |
| @files, "$label/$label.$ext.gz"); |
| } |
| } |
| |
| my $prefix = "$label"; |
| |
| # tokenize the data |
| foreach my $lang ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") { |
| if (-e "$label/$prefix.$lang.gz") { |
| $cachepipe->cmd("$label-tokenize-$lang", |
| "$SCRIPTDIR/training/scat $label/$prefix.$lang.gz | $TOKENIZER -l $lang 2> /dev/null | gzip -9 > $label/$prefix.tok.$lang.gz", |
| "$label/$prefix.$lang.gz", "$label/$prefix.tok.$lang.gz" |
| ); |
    }
  }
  # extend the prefix to record the tokenization step
  $prefix .= ".tok";
| |
| if ($label eq "train") { |
| if ($maxlen) { |
| # trim training data |
| $cachepipe->cmd("train-trim", |
| "paste <(gzip -cd $label/$prefix.$TARGET.gz) <(gzip -cd $label/$prefix.$SOURCE.gz) | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/training/split2files.pl $label/$prefix.$maxlen.$TARGET.gz $label/$prefix.$maxlen.$SOURCE.gz", |
| "$label/$prefix.$TARGET.gz", |
| "$label/$prefix.$SOURCE.gz", |
| "$label/$prefix.$maxlen.$TARGET.gz", |
| "$label/$prefix.$maxlen.$SOURCE.gz", |
| ); |
| } |
| $prefix .= ".$maxlen"; |
| } |
| |
| # lowercase |
| foreach my $lang ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") { |
| if (-e "$label/$prefix.$lang.gz") { |
| $cachepipe->cmd("$label-lowercase-$lang", |
| "gzip -cd $label/$prefix.$lang.gz | $SCRIPTDIR/lowercase.perl > $label/$prefix.lc.$lang", |
| "$label/$prefix.$lang.gz", |
| "$label/$prefix.lc.$lang"); |
| } |
| } |
| $prefix .= ".lc"; |
| |
| return $prefix; |
| } |
| |
| sub maybe_quit { |
| my ($current_step) = @_; |
| |
| if (defined $LAST_STEP and $current_step eq $LAST_STEP) { |
| print "* Quitting at this step\n"; |
| exit(0); |
| } |
| } |
| |
# Returns 1 if every sentence in the corpus begins with an open paren,
# 0 otherwise.
| sub already_parsed { |
| my ($corpus) = @_; |
| |
| open(CORPUS, $corpus) or die "can't read corpus file '$corpus'\n"; |
| while (<CORPUS>) { |
| # if we see a line not beginning with an open paren, we consider |
| # the file not to be parsed |
| return 0 unless /^\(/; |
| } |
| close(CORPUS); |
| |
| return 1; |
| } |
| |
| sub not_defined { |
| my ($var) = @_; |
| |
| print "* FATAL: environment variable \$$var is not defined.\n"; |
  exit 1;
| } |
| |
| # Takes a prefix. If that prefix exists, then all the references are |
| # assumed to be in that file. Otherwise, we successively append an |
| # index, looking for parallel references. |
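# E.g., if tune.en does not exist but tune.en.0 through tune.en.3 do,
# get_numrefs("tune.en") returns 4.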
| sub get_numrefs { |
| my ($prefix) = @_; |
| |
| my $numrefs = 1; |
| if (! -e $prefix) { |
| my $index = 0; |
| while (-e "$prefix.$index") { |
| $index++; |
| } |
| $numrefs = $index; |
| } |
| |
| return $numrefs; |
| } |
| |
| sub start_hadoop_cluster { |
| rollout_hadoop_cluster(); |
| |
| # start the cluster |
| system("./hadoop/bin/start-all.sh"); |
| sleep(120); |
| } |
| |
| sub rollout_hadoop_cluster { |
| # if it's not already unpacked, unpack it |
| if (! -d "hadoop") { |
| system("tar xzf $JOSHUA/lib/hadoop-0.20.203.0rc1.tar.gz"); |
| system("ln -sf hadoop-0.20.203.0 hadoop"); |
| |
| chomp(my $hostname = `hostname -f`); |
| |
| # copy configuration files |
| foreach my $file (qw/core-site.xml mapred-site.xml hdfs-site.xml/) { |
| open READ, "$JOSHUA/scripts/training/templates/hadoop/$file" or die $file; |
| open WRITE, ">", "hadoop/conf/$file" or die "write $file"; |
| while (<READ>) { |
| s/<HADOOP-TMP-DIR>/$RUNDIR\/hadoop\/tmp/g; |
| s/<HOST>/$hostname/g; |
| s/<PORT1>/9000/g; |
| s/<PORT2>/9001/g; |
| s/<MAX-MAP-TASKS>/2/g; |
| s/<MAX-REDUCE-TASKS>/2/g; |
| |
| print WRITE; |
| } |
| close WRITE; |
| close READ; |
| } |
| |
| system("echo $hostname > hadoop/conf/masters"); |
| system("echo $hostname > hadoop/conf/slaves"); |
| |
| } else { |
| |
| # if it exists, shut things down, just in case |
| system("./hadoop/bin/stop-all.sh"); |
| |
| } |
| |
| # make sure hadoop isn't running already |
| my $running = `ps ax | grep hadoop | grep -v grep`; |
| if ($running) { |
| print "* WARNING: it looks like some Hadoop processes are already running\n"; |
| $running =~ s/^/\t/gm; |
| print $running; |
| } |
| |
| # format the name node |
| system("./hadoop/bin/hadoop namenode -format"); |
| sleep(120); |
| |
| $ENV{HADOOP} = $HADOOP = "hadoop"; |
| } |
| |
| sub stop_hadoop_cluster { |
| system("hadoop/bin/stop-all.sh"); |
| } |
| |
| sub teardown_hadoop_cluster { |
| stop_hadoop_cluster(); |
| system("rm -rf hadoop-0.20.203.0 hadoop"); |
| } |