| #!/usr/bin/perl |
| |
# This script implements the Joshua pipeline. It can run a complete
# pipeline --- from raw training corpora to BLEU scores on a test set
# --- and it allows jumping in at arbitrary points of the pipeline.
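#
# A typical end-to-end invocation looks something like this (hypothetical
# paths; all of these flags are defined in the GetOptions block below):
#
#   $JOSHUA/scripts/training/pipeline.pl \
#     --rundir runs/1 \
#     --corpus input/train \
#     --tune input/tune \
#     --test input/test \
#     --source fr --target en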
| |
| my $JOSHUA; |
| |
| BEGIN { |
| if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq "" || |
| ! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "") { |
| print "Several environment variables must be set before running the pipeline. Please set:\n"; |
| print "* \$JOSHUA to the root of the Joshua source code.\n" |
| if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq ""); |
| print "* \$JAVA_HOME to the directory of your local java installation. \n" |
| if (! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq ""); |
| exit; |
| } |
| $JOSHUA = $ENV{JOSHUA}; |
| unshift(@INC,"$JOSHUA/scripts/training/cachepipe"); |
| unshift(@INC,"$JOSHUA/lib"); |
| } |
| |
| use strict; |
| use warnings; |
| use Getopt::Long; |
| use File::Basename; |
| use Cwd qw[abs_path getcwd]; |
| use POSIX qw[ceil]; |
| use List::Util qw[max min sum]; |
| use File::Temp qw[:mktemp]; |
| use CachePipe; |
| # use Thread::Pool; |
| |
| my $HADOOP = $ENV{HADOOP}; |
| my $MOSES = $ENV{MOSES}; |
| delete $ENV{GREP_OPTIONS}; |
| |
| my $THRAX = "$JOSHUA/thrax"; |
| |
| die not_defined("JAVA_HOME") unless exists $ENV{JAVA_HOME}; |
| |
| my (@CORPORA,$TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,@LMFILES,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$TUNE_GRAMMAR_FILE,$TEST_GRAMMAR_FILE,$THRAX_CONF_FILE); |
| my $FIRST_STEP = "FIRST"; |
| my $LAST_STEP = "LAST"; |
| my $LMFILTER = "$ENV{HOME}/code/filter/filter"; |
| |
| # The maximum length of training sentences (--maxlen). The threshold is applied to both sides. |
| my $MAXLEN = 50; |
| |
# The maximum input span to which rules from the main grammar can be applied (--maxspan).
| my $MAXSPAN = 20; |
| |
| # The maximum length of tuning and testing sentences (--maxlen-tune and --maxlen-test). |
| my $MAXLEN_TUNE = 0; |
| my $MAXLEN_TEST = 0; |
| |
| my $DO_FILTER_TM = 1; |
| my $DO_SUBSAMPLE = 0; |
| my $DO_PACK_GRAMMARS = 1; |
| my $SCRIPTDIR = "$JOSHUA/scripts"; |
| my $TOKENIZER = "$SCRIPTDIR/training/penn-treebank-tokenizer.perl"; |
| my $NORMALIZER = "$SCRIPTDIR/training/normalize-punctuation.pl"; |
| my $GIZA_TRAINER = "$SCRIPTDIR/training/run-giza.pl"; |
| my $TUNECONFDIR = "$SCRIPTDIR/training/templates/tune"; |
| my $SRILM = ($ENV{SRILM}||"")."/bin/i686-m64/ngram-count"; |
| my $COPY_CONFIG = "$SCRIPTDIR/copy-config.pl"; |
| my $STARTDIR; |
| my $RUNDIR = $STARTDIR = getcwd(); |
| my $GRAMMAR_TYPE = "hiero"; # or "phrasal" or "samt" or "ghkm" |
| my $WITTEN_BELL = 0; |
| |
| my $JOSHUA_ARGS = ""; |
| |
| # Run description. |
| my $README = undef; |
| |
| # gzip-aware cat |
| my $CAT = "$SCRIPTDIR/training/scat"; |
| |
| # where processed data files are stored |
| my $DATA_DIR = "data"; |
| |
# this file should exist in the Joshua tune templates directory; it
# contains the Joshua command invoked by the tuner
| my $JOSHUA_CONFIG_ORIG = "$TUNECONFDIR/joshua.config"; |
| my %TUNEFILES = ( |
| 'decoder_command' => "$TUNECONFDIR/decoder_command.qsub", |
| 'joshua.config' => $JOSHUA_CONFIG_ORIG, |
| 'weights' => "$TUNECONFDIR/weights", |
| 'mert.config' => "$TUNECONFDIR/mert.config", |
| 'pro.config' => "$TUNECONFDIR/pro.config", |
| 'params.txt' => "$TUNECONFDIR/params.txt", |
| ); |
| |
| # Whether to do MBR decoding on the n-best list (for test data). |
| my $DO_MBR = 0; |
| |
# Which aligner to use. The options are "giza" or "berkeley".
my $ALIGNER = "giza";
| |
| # Filter rules to the following maximum scope (Hopkins & Langmead, 2011). |
| my $SCOPE = 3; |
| |
| # What kind of filtering to use ("fast" or "exact"). |
| my $FILTERING = "fast"; |
| |
# This is the amount of memory made available to Joshua. You'll need a
# lot more than this for SAMT decoding (though really it depends mostly
# on your grammar size).
| my $JOSHUA_MEM = "3100m"; |
| |
# the amount of memory available for hadoop processes (passed to
# Hadoop via -Dmapred.child.java.opts)
| my $HADOOP_MEM = "2g"; |
| |
| # The location of a custom core-site.xml file, if desired (optional). |
| my $HADOOP_CONF = undef; |
| |
| # memory available to the parser |
| my $PARSER_MEM = "2g"; |
| |
| # memory available for building the language model |
| my $BUILDLM_MEM = "2g"; |
| |
| # Memory available for packing the grammar. |
| my $PACKER_MEM = "2g"; |
| |
| # When qsub is called for decoding, these arguments should be passed to it. |
| my $QSUB_ARGS = ""; |
| |
| # When qsub is called for aligning, these arguments should be passed to it. |
| my $QSUB_ALIGN_ARGS = "-l h_rt=168:00:00,h_vmem=15g,mem_free=10g,num_proc=1"; |
| |
| # Amount of memory for the Berkeley aligner. |
| my $ALIGNER_MEM = "10g"; |
| |
| # Align corpus files a million lines at a time. |
| my $ALIGNER_BLOCKSIZE = 1000000; |
| |
| # The number of machines to decode on. If you set this higher than 1, |
| # you need to have qsub configured for your environment. |
| my $NUM_JOBS = 1; |
| |
| # The number of threads to use at different pieces in the pipeline |
| # (giza, decoding) |
| my $NUM_THREADS = 1; |
| |
| # which LM to use (kenlm or berkeleylm) |
| my $LM_TYPE = "kenlm"; |
| |
| # n-gram order |
| my $LM_ORDER = 5; |
| |
# Whether to build and include an LM from the target side of the corpus,
# even when manually specified LM files are passed with --lmfile.
| my $DO_BUILD_LM_FROM_CORPUS = 1; |
| |
| # whether to tokenize and lowercase training, tuning, and test data |
| my $DO_PREPARE_CORPORA = 1; |
| |
| # how many optimizer runs to perform |
| my $OPTIMIZER_RUNS = 1; |
| |
| # what to use to create language models ("berkeleylm" or "srilm") |
| my $LM_GEN = "berkeleylm"; |
| |
| my @STEPS = qw[FIRST SUBSAMPLE ALIGN PARSE THRAX GRAMMAR TUNE MERT PRO TEST LAST]; |
| my %STEPS = map { $STEPS[$_] => $_ + 1 } (0..$#STEPS); |
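# %STEPS maps each step name to its 1-based position (e.g. $STEPS{FIRST} == 1,
# $STEPS{TUNE} == 7), so spans of the pipeline can be compared numerically,
# as in the sanity checks below ($STEPS{$FIRST_STEP} < $STEPS{TUNE}).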
| |
| my $NAME = undef; |
| |
| # Methods to use for merging alignments (see Koehn et al., 2003). |
| # Options are union, {intersect, grow, srctotgt, tgttosrc}-{diag,final,final-and,diag-final,diag-final-and} |
| my $GIZA_MERGE = "grow-diag-final"; |
| |
| # Whether to merge all the --lmfile LMs into a single LM using weights based on the development corpus |
| my $MERGE_LMS = 0; |
| |
| # Which tuner to use by default |
| my $TUNER = "mert"; # or "pro" or "mira" |
| |
# The number of MIRA iterations to run
| my $MIRA_ITERATIONS = 8; |
| |
| # location of already-parsed corpus |
| my $PARSED_CORPUS = undef; |
| |
| my $retval = GetOptions( |
| "readme=s" => \$README, |
| "corpus=s" => \@CORPORA, |
| "parsed-corpus=s" => \$PARSED_CORPUS, |
| "tune=s" => \$TUNE, |
| "test=s" => \$TEST, |
| "prepare!" => \$DO_PREPARE_CORPORA, |
| "name=s" => \$NAME, |
| "aligner=s" => \$ALIGNER, |
| "alignment=s" => \$ALIGNMENT, |
| "aligner-mem=s" => \$ALIGNER_MEM, |
| "giza-merge=s" => \$GIZA_MERGE, |
| "source=s" => \$SOURCE, |
| "target=s" => \$TARGET, |
| "rundir=s" => \$RUNDIR, |
| "filter-tm!" => \$DO_FILTER_TM, |
| "scope=i" => \$SCOPE, |
| "filtering=s" => \$FILTERING, |
| "lm=s" => \$LM_TYPE, |
| "lmfile=s" => \@LMFILES, |
| "merge-lms!" => \$MERGE_LMS, |
| "lm-gen=s" => \$LM_GEN, |
| "lm-order=i" => \$LM_ORDER, |
| "corpus-lm!" => \$DO_BUILD_LM_FROM_CORPUS, |
| "witten-bell!" => \$WITTEN_BELL, |
| "tune-grammar=s" => \$TUNE_GRAMMAR_FILE, |
| "test-grammar=s" => \$TEST_GRAMMAR_FILE, |
| "grammar=s" => \$GRAMMAR_FILE, |
| "glue-grammar=s" => \$GLUE_GRAMMAR_FILE, |
| "maxspan=i" => \$MAXSPAN, |
| "mbr!" => \$DO_MBR, |
| "type=s" => \$GRAMMAR_TYPE, |
| "maxlen=i" => \$MAXLEN, |
| "maxlen-tune=i" => \$MAXLEN_TUNE, |
| "maxlen-test=i" => \$MAXLEN_TEST, |
| "tokenizer=s" => \$TOKENIZER, |
| "joshua-config=s" => \$TUNEFILES{'joshua.config'}, |
| "joshua-args=s" => \$JOSHUA_ARGS, |
| "joshua-mem=s" => \$JOSHUA_MEM, |
| "hadoop-mem=s" => \$HADOOP_MEM, |
| "parser-mem=s" => \$PARSER_MEM, |
| "buildlm-mem=s" => \$BUILDLM_MEM, |
| "packer-mem=s" => \$PACKER_MEM, |
| "decoder-command=s" => \$TUNEFILES{'decoder_command'}, |
| "tuner=s" => \$TUNER, |
| "mira-iterations=i" => \$MIRA_ITERATIONS, |
| "thrax=s" => \$THRAX, |
| "thrax-conf=s" => \$THRAX_CONF_FILE, |
| "jobs=i" => \$NUM_JOBS, |
| "threads=i" => \$NUM_THREADS, |
| "subsample!" => \$DO_SUBSAMPLE, |
| "qsub-args=s" => \$QSUB_ARGS, |
| "qsub-align-args=s" => \$QSUB_ALIGN_ARGS, |
| "first-step=s" => \$FIRST_STEP, |
| "last-step=s" => \$LAST_STEP, |
| "aligner-chunk-size=s" => \$ALIGNER_BLOCKSIZE, |
| "hadoop=s" => \$HADOOP, |
| "hadoop-conf=s" => \$HADOOP_CONF, |
| "optimizer-runs=i" => \$OPTIMIZER_RUNS, |
| ); |
| |
| if (! $retval) { |
| print "Invalid usage, quitting\n"; |
| exit 1; |
| } |
| |
| $RUNDIR = get_absolute_path($RUNDIR); |
| |
| $TUNER = lc $TUNER; |
| |
| my $DOING_LATTICES = 0; |
| |
| # Prepend a space to the arguments list if it's non-empty and doesn't already have the space. |
| if ($JOSHUA_ARGS ne "" and $JOSHUA_ARGS !~ /^\s/) { |
| $JOSHUA_ARGS = " $JOSHUA_ARGS"; |
| } |
| |
| $TUNEFILES{'joshua.config'} = get_absolute_path($TUNEFILES{'joshua.config'}); |
| $TUNEFILES{'decoder_command'} = get_absolute_path($TUNEFILES{'decoder_command'}); |
| |
| my %DATA_DIRS = ( |
| train => get_absolute_path("$RUNDIR/$DATA_DIR/train"), |
| tune => get_absolute_path("$RUNDIR/$DATA_DIR/tune"), |
| test => get_absolute_path("$RUNDIR/$DATA_DIR/test"), |
| ); |
| |
| if (defined $NAME) { |
| map { $DATA_DIRS{$_} .= "/$NAME" } (keys %DATA_DIRS); |
| } |
| |
# uppercase these to guard against a common error:
| $FIRST_STEP = uc($FIRST_STEP); |
| $LAST_STEP = uc($LAST_STEP); |
| |
| $| = 1; |
| |
| my $cachepipe = new CachePipe(); |
| |
# This tells cachepipe not to include the command signature when determining whether to re-run a
# command. Note that this is not backwards compatible!
| $cachepipe->omit_cmd(); |
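
# Throughout this script, steps are run as
#
#   $cachepipe->cmd($name, $command, @files);
#
# which (roughly) re-runs $command only if one of the dependency files in
# @files has changed since the last successful run recorded under $name.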
| |
| $SIG{INT} = sub { |
| print "* Got C-c, quitting\n"; |
| $cachepipe->cleanup(); |
| exit 1; |
| }; |
| |
| # if no LMs were specified, we need to build one from the target side of the corpus |
| if (scalar @LMFILES == 0) { |
| $DO_BUILD_LM_FROM_CORPUS = 1; |
| } |
| |
| ## Sanity Checking ################################################### |
| |
# If a language model was specified and no corpus was given to build another one from the target
# side of the training data (which could happen, for example, when starting at the tuning step with
# an existing LM), turn off building an LM from the corpus. The user could have done this
# explicitly with --no-corpus-lm, but might have forgotten to, and we don't want to pester them with
# an error about easily inferable intentions.
| if (scalar @LMFILES && ! scalar(@CORPORA)) { |
| $DO_BUILD_LM_FROM_CORPUS = 0; |
| } |
| |
| |
| # if merging LMs, make sure there are at least 2 LMs to merge. |
| # first, pin $DO_BUILD_LM_FROM_CORPUS to 0 or 1 so that the subsequent check works. |
| if ($MERGE_LMS) { |
if ($DO_BUILD_LM_FROM_CORPUS != 0) {
$DO_BUILD_LM_FROM_CORPUS = 1;
}
| |
| if (@LMFILES + $DO_BUILD_LM_FROM_CORPUS < 2) { |
| print "* FATAL: I need 2 or more language models to merge (including the corpus target-side LM)."; |
| exit 2; |
| } |
| } |
| |
| # absolutize LM file paths |
| map { |
| $LMFILES[$_] = get_absolute_path($LMFILES[$_]); |
| } 0..$#LMFILES; |
| |
| # make sure the LMs exist |
| foreach my $lmfile (@LMFILES) { |
| if (! -e $lmfile) { |
| print "* FATAL: couldn't find language model file '$lmfile'\n"; |
| exit 1; |
| } |
| } |
| |
| # case-normalize this |
| $GRAMMAR_TYPE = lc $GRAMMAR_TYPE; |
| |
| # make sure source and target were specified |
| if (! defined $SOURCE or $SOURCE eq "") { |
| print "* FATAL: I need a source language extension (--source)\n"; |
| exit 1; |
| } |
| if (! defined $TARGET or $TARGET eq "") { |
| print "* FATAL: I need a target language extension (--target)\n"; |
| exit 1; |
| } |
| |
| # make sure a corpus was provided if we're doing any step before tuning |
| if (@CORPORA == 0 and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) { |
| print "* FATAL: need at least one training corpus (--corpus)\n"; |
| exit 1; |
| } |
| |
| # make sure a tuning corpus was provided if we're doing tuning |
| if (! defined $TUNE and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE} |
| and $STEPS{$LAST_STEP} >= $STEPS{TUNE})) { |
| print "* FATAL: need a tuning set (--tune)\n"; |
| exit 1; |
| } |
| |
| # make sure a test corpus was provided if we're decoding a test set |
| if (! defined $TEST and ($STEPS{$FIRST_STEP} <= $STEPS{TEST} |
| and $STEPS{$LAST_STEP} >= $STEPS{TEST})) { |
| print "* FATAL: need a test set (--test)\n"; |
| exit 1; |
| } |
| |
| # make sure a grammar file was given if we're skipping training |
| if (! defined $GRAMMAR_FILE) { |
| if ($STEPS{$FIRST_STEP} >= $STEPS{TEST}) { |
| if (! defined $TEST_GRAMMAR_FILE) { |
| print "* FATAL: need a grammar (--grammar or --test-grammar) if you're skipping to testing\n"; |
| exit 1; |
| } |
| } elsif ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) { |
| if (! defined $TUNE_GRAMMAR_FILE) { |
| print "* FATAL: need a grammar (--grammar or --tune-grammar) if you're skipping grammar learning\n"; |
| exit 1; |
| } |
| } |
| } |
| |
| # make sure SRILM is defined if we're building a language model |
| if ($LM_GEN eq "srilm" && (scalar @LMFILES == 0) && $STEPS{$FIRST_STEP} <= $STEPS{TUNE} && $STEPS{$LAST_STEP} >= $STEPS{TUNE}) { |
| not_defined("SRILM") unless exists $ENV{SRILM} and -d $ENV{SRILM}; |
| } |
| |
| # check for file presence |
| if (defined $GRAMMAR_FILE and ! -e $GRAMMAR_FILE) { |
| print "* FATAL: couldn't find grammar file '$GRAMMAR_FILE'\n"; |
| exit 1; |
| } |
| if (defined $TUNE_GRAMMAR_FILE and ! -e $TUNE_GRAMMAR_FILE) { |
| print "* FATAL: couldn't find tuning grammar file '$TUNE_GRAMMAR_FILE'\n"; |
| exit 1; |
| } |
| if (defined $TEST_GRAMMAR_FILE and ! -e $TEST_GRAMMAR_FILE) { |
| print "* FATAL: couldn't find test grammar file '$TEST_GRAMMAR_FILE'\n"; |
| exit 1; |
| } |
| if (defined $ALIGNMENT and ! -e $ALIGNMENT) { |
| print "* FATAL: couldn't find alignment file '$ALIGNMENT'\n"; |
| exit 1; |
| } |
| |
# If the corpus paths were relative, prepend the starting directory (under the assumption they
# were relative to it). This makes sure that everything will still work if we change the run
# directory.
| map { |
| $CORPORA[$_] = get_absolute_path("$CORPORA[$_]"); |
| } (0..$#CORPORA); |
| |
| # Do the same for tuning and test data, and other files |
| $TUNE = get_absolute_path($TUNE); |
| $TEST = get_absolute_path($TEST); |
| |
| $GRAMMAR_FILE = get_absolute_path($GRAMMAR_FILE); |
| $GLUE_GRAMMAR_FILE = get_absolute_path($GLUE_GRAMMAR_FILE); |
| $TUNE_GRAMMAR_FILE = get_absolute_path($TUNE_GRAMMAR_FILE); |
| $TEST_GRAMMAR_FILE = get_absolute_path($TEST_GRAMMAR_FILE); |
| $THRAX_CONF_FILE = get_absolute_path($THRAX_CONF_FILE); |
| $ALIGNMENT = get_absolute_path($ALIGNMENT); |
| $HADOOP_CONF = get_absolute_path($HADOOP_CONF); |
| |
| foreach my $corpus (@CORPORA) { |
| foreach my $ext ($TARGET,$SOURCE) { |
| if (! -e "$corpus.$ext") { |
| print "* FATAL: can't find '$corpus.$ext'"; |
| exit 1; |
| } |
| } |
| } |
| |
| if ($ALIGNER ne "giza" and $ALIGNER ne "berkeley") { |
| print "* FATAL: aligner must be one of 'giza', or 'berkeley'\n"; |
| exit 1; |
| } |
| |
| if ($LM_TYPE ne "kenlm" and $LM_TYPE ne "berkeleylm") { |
| print "* FATAL: lm type (--lm) must be one of 'kenlm' or 'berkeleylm'\n"; |
| exit 1; |
| } |
| |
| if ($LM_GEN ne "berkeleylm" and $LM_GEN ne "srilm") { |
| print "* FATAL: lm generating code (--lm-gen) must be one of 'berkeleylm' (default) or 'srilm'\n"; |
| exit 1; |
| } |
| |
| if ($TUNER eq "mira") { |
| if (! defined $MOSES) { |
| print "* FATAL: using MIRA for tuning requires setting the MOSES environment variable\n"; |
| exit 1; |
| } |
| } |
| |
| if ($TUNER ne "mert" and $TUNER ne "mira" and $TUNER ne "pro") { |
| print "* FATAL: --tuner must be one of 'mert', 'pro', or 'mira'.\n"; |
| exit 1; |
| } |
| |
| $FILTERING = lc $FILTERING; |
| if ($FILTERING eq "fast") { |
| $FILTERING = "-f" |
| } elsif ($FILTERING eq "exact") { |
| $FILTERING = ""; |
| } else { |
| print "* FATAL: --filtering must be one of 'fast' (default) or 'exact'\n"; |
| exit 1; |
| } |
| |
| if (defined $HADOOP_CONF && ! -e $HADOOP_CONF) { |
| print STDERR "* FATAL: Couldn't find \$HADOOP_CONF file '$HADOOP_CONF'\n"; |
| exit 1; |
| } |
| |
| ## END SANITY CHECKS |
| |
| #################################################################################################### |
| ## Dependent variable setting ###################################################################### |
| #################################################################################################### |
| |
| # if parallelization is turned off, then use the sequential version of |
| # the decoder command |
| if ($NUM_JOBS == 1) { |
| $TUNEFILES{'decoder_command'} = "$TUNECONFDIR/decoder_command.sequential"; |
| } |
| |
| my $OOV = ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrasal") ? "X" : "OOV"; |
| |
| # The phrasal system should use the ITG grammar, allowing for limited distortion |
| if ($GRAMMAR_TYPE eq "phrasal") { |
| $GLUE_GRAMMAR_FILE = get_absolute_path("$JOSHUA/data/glue-grammar.itg"); |
| } |
| |
| # use this default unless it's already been defined by a command-line argument |
| $THRAX_CONF_FILE = "$JOSHUA/scripts/training/templates/thrax-$GRAMMAR_TYPE.conf" unless defined $THRAX_CONF_FILE; |
| |
| mkdir $RUNDIR unless -d $RUNDIR; |
| chdir($RUNDIR); |
| |
| if (defined $README) { |
| open DESC, ">README" or die "can't write README file"; |
| print DESC $README; |
| print DESC $/; |
| close DESC; |
| } |
| |
| # default values -- these are overridden if the full script is run |
| # (after tokenization and normalization) |
| my (%TRAIN,%TUNE,%TEST); |
| if (@CORPORA) { |
| $TRAIN{prefix} = $CORPORA[0]; |
| $TRAIN{source} = "$CORPORA[0].$SOURCE"; |
| $TRAIN{target} = "$CORPORA[0].$TARGET"; |
| } |
| |
| # set the location of the parsed corpus if that was defined |
| if (defined $PARSED_CORPUS) { |
| $TRAIN{parsed} = get_absolute_path($PARSED_CORPUS); |
| } |
| |
| if ($TUNE) { |
| $TUNE{source} = "$TUNE.$SOURCE"; |
| $TUNE{target} = "$TUNE.$TARGET"; |
| |
| if (! -e "$TUNE{source}") { |
| print "* FATAL: couldn't find tune source file at '$TUNE{source}'\n"; |
exit 1;
| } |
| } |
| |
| if ($TEST) { |
| $TEST{source} = "$TEST.$SOURCE"; |
| $TEST{target} = "$TEST.$TARGET"; |
| |
| if (! -e "$TEST{source}") { |
| print "* FATAL: couldn't find test source file at '$TEST{source}'\n"; |
exit 1;
| } |
| } |
| |
| if ($FIRST_STEP ne "FIRST") { |
| if (@CORPORA > 1) { |
| print "* FATAL: you can't skip steps if you specify more than one --corpus\n"; |
| exit(1); |
| } |
| |
print "* Skipping to step $FIRST_STEP\n";

# A successful goto transfers control immediately and never returns, so the
# eval exists only to catch the runtime error thrown when $FIRST_STEP is not
# a valid label.
if (! eval { goto $FIRST_STEP }) {
print "* No such step $FIRST_STEP\n";
exit 1;
}
| } |
| |
| ## STEP 1: filter and preprocess corpora ############################# |
| FIRST: |
| ; |
| |
| if (defined $ALIGNMENT) { |
| print "* FATAL: it doesn't make sense to provide an alignment and then do\n"; |
| print " tokenization. Either remove --alignment or specify a first step\n"; |
| print " of Thrax (--first-step THRAX)\n"; |
| exit 1; |
| } |
| |
| if (@CORPORA == 0) { |
| print "* FATAL: need at least one training corpus (--corpus)\n"; |
| exit 1; |
| } |
| |
| # prepare the training data |
| my %PREPPED = ( |
| TRAIN => 0, |
| TUNE => 0, |
| TEST => 0 |
| ); |
| |
| |
| if ($DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("train",\@CORPORA,$MAXLEN); |
| |
| # used for parsing |
| $TRAIN{mixedcase} = "$DATA_DIRS{train}/$prefixes->{shortened}.$TARGET.gz"; |
| |
| $TRAIN{prefix} = "$DATA_DIRS{train}/corpus"; |
| $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE"; |
| $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET"; |
| $PREPPED{TRAIN} = 1; |
| } |
| |
| # prepare the tuning and development data |
| if (defined $TUNE and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE); |
| $TUNE{source} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$SOURCE"; |
| $TUNE{target} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$TARGET"; |
| $PREPPED{TUNE} = 1; |
| } |
| |
| if (defined $TEST and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("test",[$TEST],$MAXLEN_TEST); |
| $TEST{source} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$SOURCE"; |
| $TEST{target} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$TARGET"; |
| $PREPPED{TEST} = 1; |
| } |
| |
| maybe_quit("FIRST"); |
| |
| ## SUBSAMPLE ######################################################### |
| |
| SUBSAMPLE: |
| ; |
| |
| # subsample |
| if ($DO_SUBSAMPLE) { |
| mkdir("$DATA_DIRS{train}/subsampled") unless -d "$DATA_DIRS{train}/subsampled"; |
| |
| $cachepipe->cmd("subsample-manifest", |
| "echo corpus > $DATA_DIRS{train}/subsampled/manifest", |
| "$DATA_DIRS{train}/subsampled/manifest"); |
| |
| $cachepipe->cmd("subsample-testdata", |
| "cat $TUNE{source} $TEST{source} > $DATA_DIRS{train}/subsampled/test-data", |
| $TUNE{source}, |
| $TEST{source}, |
| "$DATA_DIRS{train}/subsampled/test-data"); |
| |
| $cachepipe->cmd("subsample", |
| "java -Xmx4g -Dfile.encoding=utf8 -cp $JOSHUA/bin:$JOSHUA/lib/commons-cli-2.0-SNAPSHOT.jar joshua.subsample.Subsampler -e $TARGET -f $SOURCE -epath $DATA_DIRS{train}/ -fpath $DATA_DIRS{train}/ -output $DATA_DIRS{train}/subsampled/subsampled.$MAXLEN -ratio 1.04 -test $DATA_DIRS{train}/subsampled/test-data -training $DATA_DIRS{train}/subsampled/manifest", |
| "$DATA_DIRS{train}/subsampled/manifest", |
| "$DATA_DIRS{train}/subsampled/test-data", |
| $TRAIN{source}, |
| $TRAIN{target}, |
| "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$TARGET", |
| "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$SOURCE"); |
| |
| # rewrite the symlinks to point to the subsampled corpus |
| foreach my $lang ($TARGET,$SOURCE) { |
| system("ln -sf subsampled/subsampled.$MAXLEN.$lang $DATA_DIRS{train}/corpus.$lang"); |
| } |
| } |
| |
| maybe_quit("SUBSAMPLE"); |
| |
| |
| ## ALIGN ############################################################# |
| |
| ALIGN: |
| ; |
| |
| # This basically means that we've skipped tokenization, in which case |
| # we still want to move the input files into the canonical place |
| if ($FIRST_STEP eq "ALIGN") { |
| if (defined $ALIGNMENT) { |
| print "* FATAL: It doesn't make sense to provide an alignment\n"; |
| print " but not to skip the tokenization and subsampling steps\n"; |
| exit 1; |
| } |
| |
| # TODO: copy the files into the canonical place |
| |
| # Jumping straight to alignment is probably the same thing as |
| # skipping tokenization, and might also be implemented by a |
| # --no-tokenization flag |
| } |
| |
| # skip this step if an alignment was provided |
| if (! defined $ALIGNMENT) { |
| |
| # We process the data in chunks which by default are 1,000,000 sentence pairs. So first split up |
| # the data into those chunks. |
| system("mkdir","-p","$DATA_DIRS{train}/splits") unless -d "$DATA_DIRS{train}/splits"; |
| |
| $cachepipe->cmd("source-numlines", |
| "cat $TRAIN{source} | wc -l", |
| $TRAIN{source}); |
| my $numlines = $cachepipe->stdout(); |
| my $numchunks = ceil($numlines / $ALIGNER_BLOCKSIZE); |
| |
| open TARGET, $TRAIN{target} or die "can't read $TRAIN{target}"; |
| open SOURCE, $TRAIN{source} or die "can't read $TRAIN{source}"; |
| |
| my $lastchunk = -1; |
| while (my $target = <TARGET>) { |
| my $source = <SOURCE>; |
| |
| # We want to prevent a very small last chunk, which we accomplish |
| # by folding the last chunk into the penultimate chunk. |
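# For example, with the default blocksize of 1,000,000 and a corpus of 2.3M
# sentence pairs, ceil() would give 3 chunks, but capping the chunk index at
# $numchunks - 2 yields two chunks of 1.0M and 1.3M lines instead of leaving
# a 0.3M-line tail.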
| my $chunk = ($numchunks <= 2) |
| ? 0 |
| : min($numchunks - 2, |
| int( (${.} - 1) / $ALIGNER_BLOCKSIZE )); |
| |
if ($chunk != $lastchunk) {
# Don't close the chunk handles on the first pass, before they've been opened.
if ($lastchunk != -1) {
close CHUNK_SOURCE;
close CHUNK_TARGET;
}
| open CHUNK_SOURCE, ">", "$DATA_DIRS{train}/splits/corpus.$SOURCE.$chunk" or die; |
| open CHUNK_TARGET, ">", "$DATA_DIRS{train}/splits/corpus.$TARGET.$chunk" or die; |
| |
| $lastchunk = $chunk; |
| } |
| |
| print CHUNK_SOURCE $source; |
| print CHUNK_TARGET $target; |
| } |
| close CHUNK_SOURCE; |
| close CHUNK_TARGET; |
| |
| close SOURCE; |
| close TARGET; |
| |
| # my $max_aligner_threads = $NUM_THREADS; |
| # if ($ALIGNER eq "giza" and $max_aligner_threads > 1) { |
| # $max_aligner_threads /= 2; |
| # } |
| |
| # # With multi-threading, we can use a pool to set up concurrent GIZA jobs on the chunks. |
| # |
| # TODO: implement this. There appears to be a problem with calling system() in threads. |
| # |
| # my $pool = new Thread::Pool(Min => 1, Max => $max_aligner_threads); |
| |
| system("mkdir alignments") unless -d "alignments"; |
| |
| if ($lastchunk == 0 || $NUM_JOBS == 1) { |
| system("seq 0 $lastchunk | $SCRIPTDIR/training/paralign.pl -aligner $ALIGNER -num_threads $NUM_THREADS -giza_merge $GIZA_MERGE -aligner_mem $ALIGNER_MEM -source $SOURCE -target $TARGET -giza_trainer \"$GIZA_TRAINER\" -train_dir \"$DATA_DIRS{train}\" > alignments/run.log"); |
| } else { |
| system("seq 0 $lastchunk | $JOSHUA/scripts/training/parallelize/parallelize.pl --err err --jobs $NUM_JOBS --qsub-args \"$QSUB_ALIGN_ARGS\" -p $ALIGNER_MEM -- $SCRIPTDIR/training/paralign.pl -aligner $ALIGNER -num_threads $NUM_THREADS -giza_merge $GIZA_MERGE -aligner_mem $ALIGNER_MEM -source $SOURCE -target $TARGET -giza_trainer \"$GIZA_TRAINER\" -train_dir \"$DATA_DIRS{train}\" > alignments/run.log"); |
| } |
| |
| my @aligned_files; |
| if ($ALIGNER eq "giza") { |
| @aligned_files = map { "alignments/$_/model/aligned.$GIZA_MERGE" } (0..$lastchunk); |
| } elsif ($ALIGNER eq "berkeley") { |
| @aligned_files = map { "alignments/$_/training.align" } (0..$lastchunk); |
| } |
| my $aligned_file_list = join(" ", @aligned_files); |
| |
| # wait for all the threads to finish |
| # $pool->join(); |
| |
| # combine the alignments |
| $cachepipe->cmd("aligner-combine", |
| "cat $aligned_file_list > alignments/training.align", |
| $aligned_files[-1], |
| "alignments/training.align"); |
| |
| # at the end, all the files are concatenated into a single alignment file parallel to the input |
| # corpora |
| $ALIGNMENT = "alignments/training.align"; |
| } |
| |
| maybe_quit("ALIGN"); |
| |
| |
| ## PARSE ############################################################# |
| |
| PARSE: |
| ; |
| |
| # Parsing only happens for SAMT grammars. |
| |
| if ($FIRST_STEP eq "PARSE" and ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrasal")) { |
| print STDERR "* FATAL: parsing doesn't apply to hiero grammars; You need to add '--type samt'\n"; |
| exit; |
| } |
| |
| if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") { |
| |
| # If the user passed in the already-parsed corpus, use that (after copying it into place) |
| if (defined $TRAIN{parsed} && -e $TRAIN{parsed}) { |
| # copy and adjust the location of the file to its canonical location |
| system("cp $TRAIN{parsed} $DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET"; |
| } else { |
| |
| system("mkdir -p $DATA_DIRS{train}") unless -e $DATA_DIRS{train}; |
| |
| $cachepipe->cmd("build-vocab", |
| "cat $TRAIN{target} | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{train}/vocab.$TARGET", |
| $TRAIN{target}, |
| "$DATA_DIRS{train}/vocab.$TARGET"); |
| |
| my $file_to_parse = (exists $TRAIN{mixedcase}) ? $TRAIN{mixedcase} : $TRAIN{target}; |
| |
| if ($NUM_JOBS > 1) { |
| # the black-box parallelizer model doesn't work with multiple |
| # threads, so we're always spawning single-threaded instances here |
| |
| # open PARSE, ">parse.sh" or die; |
| # print PARSE "cat $TRAIN{target} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed.mc | perl -pi -e 's/(\\S+)\\)/lc(\$1).\")\"/ge' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET > $DATA_DIRS{train}/corpus.parsed.$TARGET\n"; |
| # close PARSE; |
| # chmod 0755, "parse.sh"; |
| # $cachepipe->cmd("parse", |
| # "setsid ./parse.sh", |
| # "$TRAIN{target}", |
| # "$DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| |
| $cachepipe->cmd("parse", |
| "$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -p 8g -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET", |
| "$TRAIN{target}", |
| "$DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| } else { |
| # Multi-threading in the Berkeley parser is broken, so we use a black-box parallelizer on top |
| # of it. |
| $cachepipe->cmd("parse", |
| "$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_THREADS --use-fork -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET", |
| "$TRAIN{target}", |
| "$DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| } |
| |
| $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET"; |
| } |
| } |
| |
| maybe_quit("PARSE"); |
| |
| ## THRAX ############################################################# |
| |
| GRAMMAR: |
| ; |
| THRAX: |
| ; |
| |
| system("mkdir -p $DATA_DIRS{train}") unless -d $DATA_DIRS{train}; |
| |
| if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") { |
| |
| # if we jumped right here, $TRAIN{target} should be parsed |
| if (exists $TRAIN{parsed}) { |
| # parsing step happened in-script or a parsed corpus was passed in explicitly, all is well |
| |
| } elsif (already_parsed($TRAIN{target})) { |
| # skipped straight to this step, passing a parsed corpus |
| |
| $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET"; |
| |
| $cachepipe->cmd("cp-train-$TARGET", |
| "cp $TRAIN{target} $TRAIN{parsed}", |
| $TRAIN{target}, |
| $TRAIN{parsed}); |
| |
| $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET"; |
| |
| # now extract the leaves of the parsed corpus |
| $cachepipe->cmd("extract-leaves", |
| "cat $TRAIN{parsed} | perl -pe 's/\\(.*?(\\S\+)\\)\+?/\$1/g' | perl -pe 's/\\)//g' > $TRAIN{target}", |
| $TRAIN{parsed}, |
| $TRAIN{target}); |
| |
| if ($TRAIN{source} ne "$DATA_DIRS{train}/corpus.$SOURCE") { |
| $cachepipe->cmd("cp-train-$SOURCE", |
| "cp $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE", |
| $TRAIN{source}, "$DATA_DIRS{train}/corpus.$SOURCE"); |
| $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE"; |
| } |
| |
| } else { |
| print "* FATAL: You requested to build an SAMT grammar, but provided an\n"; |
| print " unparsed corpus. Please re-run the pipeline and begin no later\n"; |
| print " than the PARSE step (--first-step PARSE), or pass in a parsed corpus\n"; |
| print " using --parsed-corpus CORPUS.\n"; |
| exit 1; |
| } |
| |
| } |
| |
| # we may have skipped directly to this step, in which case we need to |
| # ensure an alignment was provided |
| if (! defined $ALIGNMENT) { |
| print "* FATAL: no alignment file specified\n"; |
| exit(1); |
| } |
| |
| # If the grammar file wasn't specified |
| if (! defined $GRAMMAR_FILE) { |
| |
| my $target_file = ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrasal") ? $TRAIN{target} : $TRAIN{parsed}; |
| |
| if ($GRAMMAR_TYPE eq "ghkm") { |
| $cachepipe->cmd("ghkm-extract", |
| "java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/splittabs.pl ghkm-mapping.gz grammar.gz", |
| $ALIGNMENT, |
| "grammar.gz"); |
| } elsif (! -e "grammar.gz" && ! -z "grammar.gz") { |
| |
| # Since this is an expensive step, we short-circuit it if the grammar file is present. I'm not |
| # sure that this is the right behavior. |
| |
| # create the input file |
| $cachepipe->cmd("thrax-input-file", |
| "paste $TRAIN{source} $target_file $ALIGNMENT | perl -pe 's/\\t/ ||| /g' | grep -v '()' | grep -v '||| \\+\$' > $DATA_DIRS{train}/thrax-input-file", |
| $TRAIN{source}, $target_file, $ALIGNMENT, |
| "$DATA_DIRS{train}/thrax-input-file"); |
| |
| |
# Roll out the hadoop cluster if needed. This causes $HADOOP to be defined (pointing to the
# unrolled directory).
| start_hadoop_cluster() unless defined $HADOOP; |
| |
| # put the hadoop files in place |
| my $THRAXDIR; |
| my $thrax_input; |
| if ($HADOOP eq "hadoop") { |
| $THRAXDIR = "thrax"; |
| |
| $thrax_input = "$DATA_DIRS{train}/thrax-input-file" |
| |
| } else { |
| $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR"; |
| $THRAXDIR =~ s#/#_#g; |
| |
| $cachepipe->cmd("thrax-prep", |
| "$HADOOP/bin/hadoop fs -rmr $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file", |
| "$DATA_DIRS{train}/thrax-input-file", |
| "grammar.gz"); |
| |
| $thrax_input = "$THRAXDIR/input-file"; |
| } |
| |
| # copy the thrax config file |
| my $thrax_file = "thrax-$GRAMMAR_TYPE.conf"; |
| system("grep -v ^input-file $THRAX_CONF_FILE > $thrax_file.tmp"); |
| system("echo input-file $thrax_input >> $thrax_file.tmp"); |
| system("mv $thrax_file.tmp $thrax_file"); |
| |
| $cachepipe->cmd("thrax-run", |
| "$HADOOP/bin/hadoop jar $THRAX/bin/thrax.jar -D mapred.child.java.opts='-Xmx$HADOOP_MEM' $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; $HADOOP/bin/hadoop fs -getmerge $THRAXDIR/final/ grammar.gz; $HADOOP/bin/hadoop fs -rmr $THRAXDIR", |
| "$DATA_DIRS{train}/thrax-input-file", |
| $thrax_file, |
| "grammar.gz"); |
| #perl -pi -e 's/\.?0+\b//g' grammar; |
| |
| stop_hadoop_cluster() if $HADOOP eq "hadoop"; |
| |
| # cache the thrax-prep step, which depends on grammar.gz |
| if ($HADOOP ne "hadoop") { |
| $cachepipe->cmd("thrax-prep", "--cache-only"); |
| } |
| |
| # clean up |
| # TODO: clean up real hadoop clusters too |
| if ($HADOOP eq "hadoop") { |
| system("rm -rf $THRAXDIR hadoop hadoop-0.20.2"); |
| } |
| } |
| |
| # set the grammar file |
| $GRAMMAR_FILE = "grammar.gz"; |
| } |
| |
| maybe_quit("THRAX"); |
| maybe_quit("GRAMMAR"); |
| |
| ## TUNING ############################################################## |
| TUNE: |
| ; |
| |
| # prep the tuning data, unless already prepped |
| if (! $PREPPED{TUNE} and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE); |
| $TUNE{source} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$SOURCE"; |
| $TUNE{target} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$TARGET"; |
| $PREPPED{TUNE} = 1; |
| } |
| |
| sub compile_lm($) { |
| my $lmfile = shift; |
| if ($LM_TYPE eq "kenlm") { |
| my $kenlm_file = basename($lmfile, ".gz") . ".kenlm"; |
| $cachepipe->cmd("compile-kenlm", |
| "$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary $lmfile $kenlm_file", |
| $lmfile, $kenlm_file); |
| return $kenlm_file; |
| |
| } elsif ($LM_TYPE eq "berkeleylm") { |
| my $berkeleylm_file = basename($lmfile, ".gz") . ".berkeleylm"; |
| $cachepipe->cmd("compile-berkeleylm", |
| "java -cp $JOSHUA/lib/berkeleylm.jar -server -mx$BUILDLM_MEM edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa $lmfile $berkeleylm_file", |
| $lmfile, $berkeleylm_file); |
| return $berkeleylm_file; |
| |
| } else { |
| print "* FATAL: trying to compile an LM to neither kenlm nor berkeleylm."; |
| exit 2; |
| } |
| } |
| |
| # Build the language model if needed |
| if ($DO_BUILD_LM_FROM_CORPUS) { |
| |
| # make sure the training data is prepped |
| if (! $PREPPED{TRAIN} and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("train",\@CORPORA,$MAXLEN); |
| |
| $TRAIN{prefix} = "$DATA_DIRS{train}/corpus"; |
| foreach my $lang ($SOURCE,$TARGET) { |
| system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang"); |
| } |
| $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE"; |
| $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET"; |
| $PREPPED{TRAIN} = 1; |
| } |
| |
| if (! -e $TRAIN{target}) { |
| print "* FATAL: I need a training corpus to build the language model from (--corpus)\n"; |
| exit(1); |
| } |
| |
| my $lmfile = "lm.gz"; |
| if ($LM_GEN eq "srilm") { |
| my $smoothing = ($WITTEN_BELL) ? "-wbdiscount" : "-kndiscount"; |
| $cachepipe->cmd("srilm", |
| "$SRILM -order $LM_ORDER -interpolate $smoothing -unk -gt3min 1 -gt4min 1 -gt5min 1 -text $TRAIN{target} -lm lm.gz", |
| $lmfile); |
| } else { |
| $cachepipe->cmd("berkeleylm", |
| "java -ea -mx$BUILDLM_MEM -server -cp $JOSHUA/lib/berkeleylm.jar edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText $LM_ORDER lm.gz $TRAIN{target}", |
| $lmfile); |
| } |
| |
| if ((! $MERGE_LMS) && ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm")) { |
push (@LMFILES, get_absolute_path(compile_lm($lmfile), $RUNDIR));
| } else { |
| push (@LMFILES, get_absolute_path($lmfile, $RUNDIR)); |
| } |
| } |
| |
| if ($MERGE_LMS) { |
| # Merge @LMFILES. |
| my $merged_lm = "lm-merged.gz"; |
| print "@LMFILES"; |
| $cachepipe->cmd("merge-lms", |
| "$JOSHUA/scripts/support/merge_lms.py " |
| . "@LMFILES " |
| . "$TUNE{target} " |
| . "lm-merged.gz " |
| . "--temp-dir data/merge_lms ", |
| @LMFILES, |
| $merged_lm); |
| |
| # Empty out @LMFILES. |
| @LMFILES = (); |
| |
| # Compile merged LM |
| if ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm") { |
push (@LMFILES, get_absolute_path(compile_lm($merged_lm), $RUNDIR));
| |
| } else { |
| push (@LMFILES, get_absolute_path($merged_lm, $RUNDIR)); |
| } |
| } |
| |
| system("mkdir -p $DATA_DIRS{tune}") unless -d $DATA_DIRS{tune}; |
| |
| # figure out how many references there are |
| my $numrefs = get_numrefs($TUNE{target}); |
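
# Multiple references are expected as numbered files sharing the target prefix
# (e.g., hypothetically, tune.en.0, tune.en.1, ...); a single reference is just
# the target file itself. The checks below enforce that convention.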
| |
# make sure the dev source exists
| if (! -e $TUNE{source}) { |
| print STDERR "* FATAL: couldn't fine tuning source file '$TUNE{source}'\n"; |
| exit 1; |
| } |
| if ($numrefs > 1) { |
| for my $i (0..$numrefs-1) { |
| if (! -e "$TUNE{target}.$i") { |
| print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}.$i'\n"; |
| exit 1; |
| } |
| } |
| } else { |
| if (! -e $TUNE{target}) { |
| print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}'\n"; |
| exit 1; |
| } |
| } |
| |
| |
# Filter the tuning grammar if it was requested (yes by default) and a tuning grammar was not
# passed in explicitly.
| my $TUNE_GRAMMAR = (defined $TUNE_GRAMMAR_FILE) |
| ? $TUNE_GRAMMAR_FILE |
| : $GRAMMAR_FILE; |
| |
| if ($DO_FILTER_TM and ! defined $TUNE_GRAMMAR_FILE) { |
| $TUNE_GRAMMAR = "$DATA_DIRS{tune}/grammar.filtered.gz"; |
| |
| $cachepipe->cmd("filter-tune", |
| "$CAT $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter $FILTERING -v $TUNE{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TUNE_GRAMMAR", |
| $GRAMMAR_FILE, |
| $TUNE{source}, |
| $TUNE_GRAMMAR); |
| } |
| |
# Pack the grammar, if requested (yes by default). This must be done after the glue grammar is
# created, since we don't have a script (yet) to dump the rules from a packed grammar, and we
# need that information to create the glue grammar.
| if ($DO_PACK_GRAMMARS && !($TUNE_GRAMMAR =~ m/packed$/)) { |
| my $packed_dir = "$DATA_DIRS{tune}/grammar.packed"; |
| |
| $cachepipe->cmd("pack-tune", |
| "$SCRIPTDIR/support/grammar-packer.pl -m $PACKER_MEM $TUNE_GRAMMAR $packed_dir", |
| $TUNE_GRAMMAR, |
| "$packed_dir/vocabulary", |
| "$packed_dir/slice_00000.source"); |
| |
# $TUNE_GRAMMAR_FILE, which previously held an optional command-line argument of a pre-filtered
# tuning grammar, is now used to record the text-based grammar, which is needed later (for
# example, as a dependency of the tuning steps below).
| $TUNE_GRAMMAR_FILE = $TUNE_GRAMMAR; |
| |
| # The actual grammar used for decoding is the packed directory. |
| $TUNE_GRAMMAR = $packed_dir; |
| } |
| |
| # Create the glue grammars. This is done by looking at all the symbols in the grammar file and |
| # creating all the needed rules. |
| if (! defined $GLUE_GRAMMAR_FILE) { |
| $cachepipe->cmd("glue-tune", |
| "java -Xmx2g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue", |
| $TUNE_GRAMMAR, |
| "$DATA_DIRS{tune}/grammar.glue"); |
| $GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue"; |
| } else { |
| # just create a symlink to it |
| my $filename = $DATA_DIRS{tune} . "/" . basename($GLUE_GRAMMAR_FILE); |
| system("ln -sf $GLUE_GRAMMAR_FILE $filename"); |
| } |
| |
# For each language model, we need to create an entry in the Joshua
# config file and in ZMERT's params.txt file. The arrays below collect
# the corresponding string substitutions.
| my (@configstrings, @weightstrings, @lmparamstrings); |
| for my $i (0..$#LMFILES) { |
| my $lmfile = $LMFILES[$i]; |
| |
| my $configstring = "lm = $LM_TYPE $LM_ORDER false false 100 $lmfile"; |
| push (@configstrings, $configstring); |
| |
| my $weightstring = "lm_$i 1.0"; |
| push (@weightstrings, $weightstring); |
| |
| my $lmparamstring = "lm_$i ||| 1.000000 Opt 0.1 +Inf +0.5 +1.5"; |
| push (@lmparamstrings, $lmparamstring); |
| } |
| |
| my $lmlines = join($/, @configstrings); |
| my $lmweights = join($/, @weightstrings); |
| my $lmparams = join($/, @lmparamstrings); |
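
# For example, a single 5-gram KenLM at /path/to/lm.kenlm (hypothetical)
# yields:
#
#   config line:  lm = kenlm 5 false false 100 /path/to/lm.kenlm
#   weight line:  lm_0 1.0
#   param line:   lm_0 ||| 1.000000 Opt 0.1 +Inf +0.5 +1.5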
| |
| my (@tmparamstrings, @tmweightstrings); |
| open CONFIG, $TUNEFILES{'joshua.config'} or die; |
| while (my $line = <CONFIG>) { |
| if ($line =~ /^tm\s*=/) { |
| $line =~ s/\s+$//; |
| my (undef,$grammarline) = split(/\s*=\s*/, $line); |
| my (undef,$owner,$span,$grammar) = split(' ', $grammarline); |
| |
| if ($grammar =~ /<GRAMMAR_FILE>/ or $grammar =~ /<GLUE_GRAMMAR>/) { |
| |
| my $grammar_file = ($grammar =~ /<GRAMMAR_FILE>/) ? $TUNE_GRAMMAR_FILE : $GLUE_GRAMMAR_FILE; |
| |
| # Add the weights for the tuning grammar. |
| my $num_tm_features = count_num_features($grammar_file); |
| for my $i (0..($num_tm_features-1)) { |
| push (@tmparamstrings, "tm_${owner}_$i ||| 1.0 Opt -Inf +Inf -1 +1"); |
| push (@tmweightstrings, "tm_${owner}_$i 1.0"); |
| } |
| |
| } else { |
| # Add weights for any pre-supplied grammars. |
| |
| my $num_tm_features = count_num_features($grammar); |
| for my $i (0..($num_tm_features-1)) { |
| push (@tmparamstrings, "tm_${owner}_${i} ||| 1.0 Opt -Inf +Inf -1 +1"); |
| push (@tmweightstrings, "tm_${owner}_${i} 1.0"); |
| } |
| } |
| } |
| } |
| close CONFIG; |
| |
| my $tmparams = join($/, @tmparamstrings); |
| my $tmweights = join($/, @tmweightstrings); |
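
# For example, a template line 'tm = thrax pt 12 <GRAMMAR_FILE>' parses to
# owner 'pt'; if the tuning grammar has three features, this contributes
# tm_pt_0, tm_pt_1, and tm_pt_2 to both the weights and params.txt.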
| |
| my $latticeparam = ($DOING_LATTICES == 1) |
| ? "SourcePath ||| 1.0 Opt -Inf +Inf -1 +1" |
| : ""; |
| my $latticeweight = ($DOING_LATTICES == 1) |
| ? "SourcePath 1.0" |
| : ""; |
| |
| my @feature_functions; |
| if ($DOING_LATTICES) { |
| push(@feature_functions, "feature_function = SourcePath"); |
| } |
| my $feature_functions = join("\n", @feature_functions); |
| |
| for my $run (1..$OPTIMIZER_RUNS) { |
| my $tunedir = (defined $NAME) ? "tune/$NAME/$run" : "tune/$run"; |
| system("mkdir -p $tunedir") unless -d $tunedir; |
| |
| my $weights_file = get_absolute_path("$tunedir/weights",$RUNDIR); |
| |
| foreach my $key (keys %TUNEFILES) { |
| my $file = $TUNEFILES{$key}; |
| open FROM, $file or die "can't find file '$file'"; |
| open TO, ">$tunedir/$key" or die "can't write to file '$tunedir/$key'"; |
| while (<FROM>) { |
| s/<INPUT>/$TUNE{source}/g; |
| s/<SOURCE>/$SOURCE/g; |
| s/<RUNDIR>/$RUNDIR/g; |
| s/<TARGET>/$TARGET/g; |
| s/<LMLINES>/$lmlines/g; |
| s/<LMWEIGHTS>/$lmweights/g; |
| s/<TMWEIGHTS>/$tmweights/g; |
| s/<LMPARAMS>/$lmparams/g; |
| s/<TMPARAMS>/$tmparams/g; |
| s/<WEIGHTS_FILE>/$weights_file/g; |
| s/<FEATURE_FUNCTIONS>/$feature_functions/g; |
| s/<LATTICEWEIGHT>/$latticeweight/g; |
| s/<LATTICEPARAM>/$latticeparam/g; |
| s/<LMFILE>/$LMFILES[0]/g; |
| s/<LMTYPE>/$LM_TYPE/g; |
| s/<MEM>/$JOSHUA_MEM/g; |
| s/<GRAMMAR_TYPE>/$GRAMMAR_TYPE/g; |
| s/<GRAMMAR_FILE>/$TUNE_GRAMMAR/g; |
| s/<GLUE_GRAMMAR>/$GLUE_GRAMMAR_FILE/g; |
| s/<MAXSPAN>/$MAXSPAN/g; |
| s/<OOV>/$OOV/g; |
| s/<NUMJOBS>/$NUM_JOBS/g; |
| s/<NUMTHREADS>/$NUM_THREADS/g; |
| s/<QSUB_ARGS>/$QSUB_ARGS/g; |
| s/<OUTPUT>/$tunedir\/tune.output.nbest/g; |
| s/<REF>/$TUNE{target}/g; |
| s/<JOSHUA>/$JOSHUA/g; |
| s/<JOSHUA_ARGS>/$JOSHUA_ARGS/g; |
| s/<NUMREFS>/$numrefs/g; |
| s/<CONFIG>/$tunedir\/joshua.config/g; |
| s/<LOG>/$tunedir\/joshua.log/g; |
| s/<TUNEDIR>/$tunedir/g; |
| s/<MERTDIR>/$tunedir/g; # for backwards compatibility |
| s/use_sent_specific_tm=.*/use_sent_specific_tm=0/g; |
| print TO; |
| } |
| close(FROM); |
| close(TO); |
| } |
| chmod(0755,"$tunedir/decoder_command"); |
| |
| # tune |
| if ($TUNER eq "mert") { |
| $cachepipe->cmd("mert-$run", |
| "java -d64 -Xmx2g -cp $JOSHUA/class joshua.zmert.ZMERT -maxMem 4500 $tunedir/mert.config > $tunedir/mert.log 2>&1", |
| $TUNE_GRAMMAR_FILE, |
| "$tunedir/weights.ZMERT.final", |
| "$tunedir/decoder_command", |
| "$tunedir/mert.config", |
| "$tunedir/params.txt"); |
| system("ln -sf weights.ZMERT.final $tunedir/weights.final"); |
| } elsif ($TUNER eq "pro") { |
| $cachepipe->cmd("pro-$run", |
| "java -d64 -Xmx2g -cp $JOSHUA/class joshua.pro.PRO -maxMem 4500 $tunedir/pro.config > $tunedir/pro.log 2>&1", |
| $TUNE_GRAMMAR_FILE, |
| "$tunedir/weights.PRO.final", |
| "$tunedir/decoder_command", |
| "$tunedir/pro.config", |
| "$tunedir/params.txt"); |
| system("ln -sf weights.PRO.final $tunedir/weights.final"); |
| } elsif ($TUNER eq "mira") { |
| my $refs_path = $TUNE{target}; |
| $refs_path .= "." if (get_numrefs($TUNE{target}) > 1); |
| |
| my $extra_args = $JOSHUA_ARGS; |
| $extra_args =~ s/"/\\"/g; |
| $cachepipe->cmd("mira-$run", |
| "$SCRIPTDIR/training/mira/run-mira.pl --input $TUNE{source} --refs $refs_path --config $tunedir/joshua.config --decoder $JOSHUA/bin/decoder --mertdir $MOSES/bin --rootdir $MOSES/scripts --batch-mira --working-dir $tunedir --maximum-iterations $MIRA_ITERATIONS --return-best-dev --nbest 300 --decoder-flags \"-m $JOSHUA_MEM -threads $NUM_THREADS $extra_args\" > $tunedir/mira.log 2>&1", |
| $TUNE_GRAMMAR_FILE, |
| $TUNE{source}, |
| "$tunedir/weights.final"); |
| } |
| |
| # Go to the next tuning run if tuning is the last step. |
| if ($LAST_STEP eq "TUNE") { |
| next; |
| } |
| |
| |
| # prepare the testing data |
| if (! $PREPPED{TEST} and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("test",[$TEST],$MAXLEN_TEST); |
| $TEST{source} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$SOURCE"; |
| $TEST{target} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$TARGET"; |
| $PREPPED{TEST} = 1; |
| } |
| |
| # filter the test grammar |
| system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test}; |
| my $TEST_GRAMMAR; |
| if ($TEST_GRAMMAR_FILE) { |
| # if a specific test grammar was specified, use that (no filtering) |
| $TEST_GRAMMAR = $TEST_GRAMMAR_FILE; |
| } else { |
| # otherwise, use the main grammar, and filter it if requested |
| $TEST_GRAMMAR = $GRAMMAR_FILE; |
| |
| if ($DO_FILTER_TM) { |
| $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz"; |
| |
| $cachepipe->cmd("filter-test", |
| "$SCRIPTDIR/training/scat $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter $FILTERING -v $TEST{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TEST_GRAMMAR", |
| $GRAMMAR_FILE, |
| $TEST{source}, |
| $TEST_GRAMMAR); |
| } |
| } |
| |
| # Pack the grammar. |
| if ($DO_PACK_GRAMMARS && !($TEST_GRAMMAR =~ m/packed$/)) { |
| my $packed_dir = "$DATA_DIRS{test}/grammar.packed"; |
| |
| $cachepipe->cmd("pack-test", |
| "$SCRIPTDIR/support/grammar-packer.pl -m $PACKER_MEM $TEST_GRAMMAR $packed_dir", |
| $TEST_GRAMMAR, |
| "$packed_dir/vocabulary", |
| "$packed_dir/encoding", |
| "$packed_dir/slice_00000.source"); |
| |
# $TEST_GRAMMAR_FILE, which previously held an optional command-line argument of a pre-filtered
# test grammar, is now used to record the text-based grammar, which is needed later (for
# example, as a dependency of the decoding step below).
| $TEST_GRAMMAR_FILE = $TEST_GRAMMAR; |
| |
| # The actual grammar used for decoding is the packed directory. |
| $TEST_GRAMMAR = $packed_dir; |
| } |
| |
| # Create the glue file. |
| if (! defined $GLUE_GRAMMAR_FILE) { |
| $cachepipe->cmd("glue-test", |
| "java -Xmx1g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue", |
| $TEST_GRAMMAR, |
| "$DATA_DIRS{test}/grammar.glue"); |
| $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue"; |
| |
| } else { |
| # just create a symlink to it |
| my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE); |
| |
| if ($GLUE_GRAMMAR_FILE =~ /^\//) { |
| system("ln -sf $GLUE_GRAMMAR_FILE $filename"); |
| } else { |
| system("ln -sf $STARTDIR/$GLUE_GRAMMAR_FILE $filename"); |
| } |
| } |
| |
| my $testrun = (defined $NAME) ? "test/$NAME/$run" : "test/$run"; |
| system("mkdir -p $testrun") unless -d $testrun; |
| $testrun = get_absolute_path($testrun, $RUNDIR); |
| |
| # If we're decoding a lattice, also output the source side path we chose |
| my $joshua_args = $JOSHUA_ARGS; |
| if ($DOING_LATTICES) { |
| $joshua_args .= " -output-format \"%i ||| %s ||| %e ||| %f ||| %c\""; |
| } |
| |
| foreach my $key (qw(decoder_command)) { |
| my $file = $TUNEFILES{$key}; |
| open FROM, $file or die "can't find file '$file'"; |
| open TO, ">$testrun/$key" or die "can't write to '$testrun/$key'"; |
| while (<FROM>) { |
| s/<INPUT>/$TEST{source}/g; |
| s/<NUMJOBS>/$NUM_JOBS/g; |
| s/<NUMTHREADS>/$NUM_THREADS/g; |
| s/<QSUB_ARGS>/$QSUB_ARGS/g; |
| s/<OUTPUT>/$testrun\/test.output.nbest/g; |
| s/<JOSHUA>/$JOSHUA/g; |
| s/<JOSHUA_ARGS>/$joshua_args/g; |
| s/<NUMREFS>/$numrefs/g; |
| s/<SOURCE>/$SOURCE/g; |
| s/<TARGET>/$TARGET/g; |
s/<RUNDIR>/$RUNDIR/g;
| s/<LMFILE>/$LMFILES[0]/g; |
| s/<MEM>/$JOSHUA_MEM/g; |
| s/<GRAMMAR_TYPE>/$GRAMMAR_TYPE/g; |
| s/<GRAMMAR_FILE>/$TEST_GRAMMAR/g; |
| s/<GLUE_GRAMMAR>/$GLUE_GRAMMAR_FILE/g; |
| s/<OOV>/$OOV/g; |
| s/<CONFIG>/$testrun\/joshua.config/g; |
| s/<LOG>/$testrun\/joshua.log/g; |
| |
| print TO; |
| } |
| close(FROM); |
| close(TO); |
| } |
| chmod(0755,"$testrun/decoder_command"); |
| |
| # Copy the config file over. |
| $cachepipe->cmd("test-joshua-config-from-tune-$run", |
| "cat $tunedir/joshua.config | $COPY_CONFIG -mark-oovs true -weights-file $testrun/weights -tm 'thrax pt $MAXSPAN $TEST_GRAMMAR' > $testrun/joshua.config", |
| "$tunedir/joshua.config", |
| "$testrun/joshua.config"); |
| |
| $cachepipe->cmd("test-joshua-weights-from-tune-$run", |
| "cp $tunedir/weights.final $testrun/weights", |
| "$tunedir/weights.final", |
| "$testrun/weights"); |
| |
| $cachepipe->cmd("test-decode-$run", |
| "$testrun/decoder_command", |
| "$testrun/decoder_command", |
| "$DATA_DIRS{test}/grammar.glue", |
| $TEST_GRAMMAR_FILE, |
| "$testrun/test.output.nbest"); |
| |
| $cachepipe->cmd("remove-oov-$run", |
| "cat $testrun/test.output.nbest | perl -pe 's/_OOV//g' > $testrun/test.output.nbest.noOOV", |
| "$testrun/test.output.nbest", |
| "$testrun/test.output.nbest.noOOV"); |
| |
| my $output = "$testrun/test.output.1best"; |
| $numrefs = get_numrefs($TEST{target}); |
| |
| # Always compute the BLEU score on the regular 1-best output, since it's easy to do |
| $cachepipe->cmd("test-extract-onebest-$run", |
| "java -Xmx500m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.util.ExtractTopCand $testrun/test.output.nbest.noOOV $output", |
| "$testrun/test.output.nbest.noOOV", |
| $output); |
| |
| $cachepipe->cmd("test-bleu-$run", |
| "java -cp $JOSHUA/class -Dfile.encoding=utf8 -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand $output -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > $testrun/test.output.1best.bleu", |
| $output, |
| "$output.bleu"); |
| |
| # We can also rescore the output lattice with MBR |
| if ($DO_MBR) { |
| my $numlines = `cat $TEST{source} | wc -l`; |
| $numlines--; |
| $output .= ".mbr"; |
| |
| $cachepipe->cmd("test-onebest-parmbr-$run", |
| "cat $testrun/test.output.nbest.noOOV | java -Xmx1700m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 $NUM_THREADS > $output", |
| "$testrun/test.output.nbest.noOOV", |
| $output); |
| |
| $cachepipe->cmd("test-bleu-mbr-$run", |
| "java -cp $JOSHUA/class -Dfile.encoding=utf8 -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand $output -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > $testrun/test.output.1best.mbr.bleu", |
| $output, |
| "$output.bleu"); |
| } |
| |
| # Update the BLEU summary. |
| my $dir = (defined $NAME) ? "test/$NAME" : "test"; |
| compute_bleu_summary("$dir/*/*.1best.bleu", "$dir/final-bleu"); |
| compute_bleu_summary("$dir/*/*.1best.mbr.bleu", "$dir/final-bleu-mbr"); |
| compute_time_summary("$dir/*/joshua.log", "$dir/final-times"); |
| |
| # Now do the analysis |
| if ($DOING_LATTICES) { |
| # extract the source |
| my $source = "$testrun/test.lattice-path.txt"; |
| $cachepipe->cmd("test-lattice-extract-source-$run", |
| "$JOSHUA/bin/extract-1best $testrun/test.output.nbest.noOOV 2 | perl -pe 's/<s> //' > $source", |
| $output, $source); |
| |
| analyze_testrun($output,$source,$TEST{target}); |
| } else { |
| analyze_testrun($output,$TEST{source},$TEST{target}); |
| } |
| } |
| |
| exit; |
| |
| # This target allows the pipeline to be used just for decoding new |
| # data sets |
| |
| TEST: |
| ; |
| |
| system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test}; |
| |
| if (! defined $NAME) { |
| print "* FATAL: for direct tests, you must specify a unique run name\n"; |
| exit 1; |
| } |
| |
| # if (-e "$DATA_DIRS{test}/$NAME") { |
| # print "* FATAL: you specified a run name, but it already exists\n"; |
| # exit 1; |
| # } |
| |
| if (! $PREPPED{TEST} and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("test",[$TEST],$MAXLEN_TEST); |
| $TEST{source} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$SOURCE"; |
| $TEST{target} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$TARGET"; |
| $PREPPED{TEST} = 1; |
| } |
| |
| my $testrun = "test/$NAME"; |
| system("mkdir -p $testrun") unless -d $testrun; |
| |
| # filter the test grammar |
| my $TEST_GRAMMAR; |
| if ($TEST_GRAMMAR_FILE) { |
| # if a specific test grammar was specified, use that (no filtering) |
| $TEST_GRAMMAR = $TEST_GRAMMAR_FILE; |
| } else { |
| # otherwise, use the main grammar, and filter it if requested |
| $TEST_GRAMMAR = $GRAMMAR_FILE; |
| |
| if ($DO_FILTER_TM) { |
| $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz"; |
| |
| $cachepipe->cmd("filter-test-$NAME", |
| "$CAT $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter $FILTERING -v $TEST{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TEST_GRAMMAR", |
| $GRAMMAR_FILE, |
| $TEST{source}, |
| $TEST_GRAMMAR); |
| } |
| } |
| |
| # build the glue grammar if needed |
| if (! defined $GLUE_GRAMMAR_FILE) { |
| $cachepipe->cmd("glue-test-$NAME", |
| "java -Xmx2g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue", |
| $TEST_GRAMMAR, |
| "$DATA_DIRS{test}/grammar.glue"); |
| $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue"; |
| } |
| |
| if ($TUNEFILES{'joshua.config'} eq $JOSHUA_CONFIG_ORIG) { |
| print "* FATAL: for direct tests, I need a (tuned) Joshua config file\n"; |
| exit 1; |
| } |
| |
| if ($DO_PACK_GRAMMARS) { |
| my $packed_dir = "$DATA_DIRS{test}/grammar.packed"; |
| |
| $cachepipe->cmd("pack-test", |
| "$SCRIPTDIR/support/grammar-packer.pl -m $PACKER_MEM $TEST_GRAMMAR $packed_dir", |
| $TEST_GRAMMAR, |
| "$packed_dir/vocabulary", |
| "$packed_dir/slice_00000.source"); |
| |
# $TEST_GRAMMAR_FILE, which previously held an optional command-line argument of a pre-filtered
# test grammar, is now used to record the text-based grammar.
| $TEST_GRAMMAR_FILE = $TEST_GRAMMAR; |
| |
| # The actual grammar used for decoding is the packed directory. |
| $TEST_GRAMMAR = $packed_dir; |
| } |
| |
# TODO: factor this out into a function, since it is done all over the place.
# The template file is opened only to verify that it exists; the actual
# command is written out directly below.
open FROM, $TUNEFILES{decoder_command} or die "can't find file '$TUNEFILES{decoder_command}'";
close(FROM);
open TO, ">$testrun/decoder_command" or die "can't write file '$testrun/decoder_command'";
print TO "cat $TEST{source} | \$JOSHUA/bin/joshua-decoder -m $JOSHUA_MEM -threads $NUM_THREADS -c $testrun/joshua.config > $testrun/test.output.nbest 2> $testrun/joshua.log\n";
close(TO);
chmod(0755,"$testrun/decoder_command");
| |
| my $weights_file = dirname($TUNEFILES{'joshua.config'}) . "/weights"; |
| $cachepipe->cmd("test-$NAME-copy-weights", |
| "cp $weights_file $testrun/weights", |
| $weights_file, |
| "$testrun/weights"); |
| |
# copy over the tuned config file, adapting it for this run: mark OOVs, point it at
# the copied weights file and the (possibly packed) test grammar, and set the default
# nonterminal
| $cachepipe->cmd("test-$NAME-copy-config", |
| "cat $TUNEFILES{'joshua.config'} | $COPY_CONFIG -mark-oovs true -weights-file $testrun/weights -tm/pt 'thrax pt $MAXSPAN $TEST_GRAMMAR' -default-non-terminal $OOV > $testrun/joshua.config", |
| $TUNEFILES{'joshua.config'}, |
| "$testrun/joshua.config"); |
| |
| # decode |
| $cachepipe->cmd("test-$NAME-decode-run", |
| "$testrun/decoder_command", |
| "$testrun/decoder_command", |
| $TEST_GRAMMAR, |
| $GLUE_GRAMMAR_FILE, |
| "$testrun/test.output.nbest"); |
| |
| $cachepipe->cmd("test-$NAME-remove-oov", |
| "cat $testrun/test.output.nbest | perl -pe 's/_OOV//g' > $testrun/test.output.nbest.noOOV", |
| "$testrun/test.output.nbest", |
| "$testrun/test.output.nbest.noOOV"); |
| |
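# Either rerank the n-best list with Minimum Bayes-Risk decoding, which picks
# the candidate with the lowest expected loss against the rest of the list
# (rather than the single highest-scoring derivation), or simply extract the
# top candidate.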
| if ($DO_MBR) { |
| $cachepipe->cmd("test-$NAME-onebest-parmbr", |
| "cat $testrun/test.output.nbest.noOOV | java -Xmx1700m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 > $testrun/test.output.1best", |
| "$testrun/test.output.nbest.noOOV", |
| "$testrun/test.output.1best"); |
| } else { |
| $cachepipe->cmd("test-$NAME-extract-onebest", |
| "java -Xmx500m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.util.ExtractTopCand $testrun/test.output.nbest $testrun/test.output.1best", |
| "$testrun/test.output.nbest.noOOV", |
| "$testrun/test.output.1best"); |
| } |
| |
| $numrefs = get_numrefs($TEST{target}); |
| $cachepipe->cmd("$NAME-test-bleu", |
| "java -cp $JOSHUA/class -Dfile.encoding=utf8 -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand $testrun/test.output.1best -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > $testrun/test.output.1best.bleu", |
| "$testrun/test.output.1best", |
| "$testrun/test.output.1best.bleu"); |
| |
| system("cat $testrun/test.output.1best.bleu"); |
| |
| |
| ###################################################################### |
| ## SUBROUTINES ####################################################### |
| ###################################################################### |
| LAST: |
| 1; |
| |
# Does normalization, tokenization, length trimming, and lowercasing of
# training, tuning, and test data.
# $label: one of train, tune, or test
# $corpora: arrayref of corpus file prefixes (multiple allowed for training data)
# $maxlen: maximum sentence length; 0 (the default) disables trimming
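#
# The data pass through up to four stages, each of which extends the file
# prefix; e.g., for test data with $maxlen = 50:
#   copy:      test.$lang.gz
#   tokenize:  test.tok.$lang.gz
#   trim:      test.tok.50.$lang.gz   (only when $maxlen > 0)
#   lowercase: test.tok.50.lc.$lang
# Returns a hashref mapping the stage names 'tokenized', 'shortened', and
# 'lowercased' to the corresponding prefixes.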
| sub prepare_data { |
| my ($label,$corpora,$maxlen) = @_; |
| $maxlen = 0 unless defined $maxlen; |
| |
| system("mkdir -p $DATA_DIR") unless -d $DATA_DIR; |
| system("mkdir -p $DATA_DIRS{$label}") unless -d $DATA_DIRS{$label}; |
| |
| # records the pieces that are produced |
| my %prefixes; |
| |
| # copy the data from its original location to our location |
| my $numlines = -1; |
| foreach my $ext ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") { |
| # append each extension to the corpora prefixes |
| my @files = map { "$_.$ext" } @$corpora; |
| |
    # make sure that none of the input files is empty
    foreach my $file (@files) {
      if (-z $file) {
        print STDERR "* FATAL: $label file '$file' is empty\n";
        exit 1;
      }
    }
| |
| # a list of all the files (in case of multiple corpora prefixes) |
| my $files = join(" ",@files); |
| if (-e $files[0]) { |
| $cachepipe->cmd("$label-copy-$ext", |
| "cat $files | gzip -9n > $DATA_DIRS{$label}/$label.$ext.gz", |
| @files, "$DATA_DIRS{$label}/$label.$ext.gz"); |
| |
| chomp(my $lines = `$CAT $DATA_DIRS{$label}/$label.$ext.gz | wc -l`); |
| $numlines = $lines if ($numlines == -1); |
| if ($lines != $numlines) { |
| print STDERR "* FATAL: $DATA_DIRS{$label}/$label.$ext.gz has a different number of lines ($lines) than a 'parallel' file that preceded it ($numlines)\n"; |
| exit(1); |
| } |
| } |
| } |
| |
| my $prefix = "$label"; |
| |
| # tokenize the data |
| foreach my $lang ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") { |
| if (-e "$DATA_DIRS{$label}/$prefix.$lang.gz") { |
| if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang.gz")) { |
| system("cp $DATA_DIRS{$label}/$prefix.$lang.gz $DATA_DIRS{$label}/$prefix.tok.$lang.gz"); |
| } else { |
| $cachepipe->cmd("$label-tokenize-$lang", |
| "$CAT $DATA_DIRS{$label}/$prefix.$lang.gz | $NORMALIZER $lang | $TOKENIZER -l $lang 2> /dev/null | gzip -9n > $DATA_DIRS{$label}/$prefix.tok.$lang.gz", |
| "$DATA_DIRS{$label}/$prefix.$lang.gz", "$DATA_DIRS{$label}/$prefix.tok.$lang.gz"); |
| } |
| |
| } |
| } |
| # extend the prefix |
| $prefix .= ".tok"; |
| $prefixes{tokenized} = $prefix; |
| |
| if ($maxlen > 0) { |
| my (@infiles, @outfiles); |
| foreach my $ext ($TARGET, $SOURCE, "$TARGET.0", "$TARGET.1", "$TARGET.2", "$TARGET.3") { |
| my $infile = "$DATA_DIRS{$label}/$prefix.$ext.gz"; |
| my $outfile = "$DATA_DIRS{$label}/$prefix.$maxlen.$ext.gz"; |
| if (-e $infile) { |
| push(@infiles, $infile); |
| push(@outfiles, $outfile); |
| } |
| } |
| |
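    # Note: the process substitutions (<(...)) below are a bashism; this
    # assumes cachepipe runs its commands under bash rather than plain sh.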
| my $infilelist = join(" ", map { "<(gzip -cd $_)" } @infiles); |
| my $outfilelist = join(" ", @outfiles); |
| |
| # trim training data |
| $cachepipe->cmd("$label-trim", |
| "paste $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/training/split2files.pl $outfilelist", |
| @infiles, |
| @outfiles); |
| $prefix .= ".$maxlen"; |
| } |
| # record this whether we shortened or not |
| $prefixes{shortened} = $prefix; |
| |
| # lowercase |
| foreach my $lang ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") { |
| if (-e "$DATA_DIRS{$label}/$prefix.$lang.gz") { |
| if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang.gz")) { |
| system("gzip -cd $DATA_DIRS{$label}/$prefix.$lang.gz > $DATA_DIRS{$label}/$prefix.lc.$lang"); |
| } else { |
| $cachepipe->cmd("$label-lowercase-$lang", |
| "gzip -cd $DATA_DIRS{$label}/$prefix.$lang.gz | $SCRIPTDIR/lowercase.perl > $DATA_DIRS{$label}/$prefix.lc.$lang", |
| "$DATA_DIRS{$label}/$prefix.$lang.gz", |
| "$DATA_DIRS{$label}/$prefix.lc.$lang"); |
| } |
| } |
| } |
| $prefix .= ".lc"; |
| $prefixes{lowercased} = $prefix; |
| |
| foreach my $lang ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") { |
| if (-e "$DATA_DIRS{$label}/$prefixes{lowercased}.$lang") { |
| system("ln -sf $prefixes{lowercased}.$lang $DATA_DIRS{$label}/corpus.$lang"); |
| } |
| } |
| |
| return \%prefixes; |
| } |
| |
| sub maybe_quit { |
| my ($current_step) = @_; |
| |
| if (defined $LAST_STEP and $current_step eq $LAST_STEP) { |
| print "* Quitting at this step\n"; |
| exit(0); |
| } |
| } |
| |
## returns 1 if every sentence in the corpus begins with an open paren,
## 0 otherwise
| sub already_parsed { |
| my ($corpus) = @_; |
| |
| open(CORPUS, $corpus) or die "can't read corpus file '$corpus'\n"; |
| while (<CORPUS>) { |
| # if we see a line not beginning with an open paren, we consider |
| # the file not to be parsed |
| return 0 unless /^\(/; |
| } |
| close(CORPUS); |
| |
| return 1; |
| } |
| |
sub not_defined {
  my ($var) = @_;

  print "* FATAL: environment variable \$$var is not defined.\n";
  exit 1;
}
| |
# Takes a reference prefix. If indexed files ($prefix.0, $prefix.1, ...)
# exist, counts those parallel reference files; otherwise assumes a single
# reference in $prefix itself.
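# For example, with references ref.0, ref.1, and ref.2 on disk,
# get_numrefs("ref") returns 3; with only a single file "ref", it returns 1.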
| sub get_numrefs { |
| my ($prefix) = @_; |
| |
| if (-e "$prefix.0") { |
| my $index = 0; |
| while (-e "$prefix.$index") { |
| $index++; |
| } |
| return $index; |
| } else { |
| return 1; |
| } |
| } |
| |
| sub start_hadoop_cluster { |
| rollout_hadoop_cluster(); |
| |
| # start the cluster |
| # system("./hadoop/bin/start-all.sh"); |
| # sleep(120); |
| } |
| |
| sub rollout_hadoop_cluster { |
| # if it's not already unpacked, unpack it |
| if (! -d "hadoop") { |
| |
| system("tar xzf $JOSHUA/lib/hadoop-0.20.2.tar.gz"); |
| system("ln -sf hadoop-0.20.2 hadoop"); |
| if (defined $HADOOP_CONF) { |
| print STDERR "Copying HADOOP_CONF($HADOOP_CONF) to hadoop/conf/core-site.xml\n"; |
| system("cp $HADOOP_CONF hadoop/conf/core-site.xml"); |
| } |
| } |
| |
| $ENV{HADOOP} = $HADOOP = "hadoop"; |
| $ENV{HADOOP_CONF_DIR} = ""; |
| } |
| |
sub stop_hadoop_cluster {
  # only stop a standalone cluster that we rolled out ourselves; never touch
  # a user-supplied cluster
  if (defined $HADOOP and $HADOOP eq "hadoop") {
    system("hadoop/bin/stop-all.sh");
  }
}
| |
| sub teardown_hadoop_cluster { |
| stop_hadoop_cluster(); |
| system("rm -rf hadoop-0.20.2 hadoop"); |
| } |
| |
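# Lattice input is assumed to be serialized PLF (Python Lattice Format),
# whose lines begin with "(((", e.g.:
#   ((('ein',1.0,1),),(('haus',1.0,1),),)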
| sub is_lattice { |
| my $file = shift; |
| open READ, "$CAT $file|" or die "can't read from potential lattice '$file'"; |
  my $line = <READ>;
  close(READ);
  if (defined $line and $line =~ /^\(\(\(/) {
| $DOING_LATTICES = 1; |
| return 1; |
| } else { |
| return 0; |
| } |
| } |
| |
# Counts the number of TM features in a grammar by inspecting the feature
# field of its first rule.
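# For example, given a (hypothetical) first rule
#   [X] ||| maison ||| house ||| 0.3 1.2 0.5
# this returns 3.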
| sub count_num_features { |
| my ($grammar) = @_; |
| |
| open GRAMMAR, "$CAT $grammar|" or die "FATAL: can't read $grammar"; |
| chomp(my $line = <GRAMMAR>); |
| close(GRAMMAR); |
| |
  my @tokens = split(/ \|\|\| /, $line);
  my @features = split(' ', $tokens[-1]);

  return scalar @features;
| } |
| |
# File names reflecting relative paths need to be absolute-ized for --rundir to work.
# Note that abs_path() returns undef for paths that do not exist; in that case the
# manually prefixed (but uncanonicalized) path is kept.
| sub get_absolute_path { |
| my ($file,$basedir) = @_; |
| $basedir = $STARTDIR unless defined $basedir; |
| |
| if (defined $file) { |
| $file = "$basedir/$file" unless $file =~ /^\//; |
| |
| # prepend startdir (which is absolute) unless the path is absolute. |
| my $abs_path = abs_path($file); |
| if (defined $abs_path) { |
| $file = $abs_path; |
| } |
| } |
| |
| return $file; |
| } |
| |
| sub analyze_testrun { |
| my ($output,$source,$reference) = @_; |
| my $dir = dirname($output); |
| |
| mkdir("$dir/analysis") unless -d "$dir/analysis"; |
| |
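  # Collect parallel reference files (reference.0, reference.1, ...) if they
  # exist, mirroring the logic of get_numrefs(); otherwise use the single
  # reference file.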
| my @references; |
| if (-e "$reference.0") { |
| my $num = 0; |
| while (-e "$reference.$num") { |
| push(@references, "$reference.$num"); |
| $num++; |
| } |
| } else { |
| push(@references, $reference); |
| } |
| |
| my $references = join(" -r ", @references); |
| |
| my $runname = "analyze-$dir"; |
| $runname =~ s/\//-/g; |
| $cachepipe->cmd($runname, |
| "$SCRIPTDIR/analysis/sentence-by-sentence.pl -s $source -r $references $output > $dir/analysis/sentence-by-sentence.html", |
| "$dir/test.output.1best", |
| "$dir/analysis/sentence-by-sentence.html"); |
| } |
| |
| sub compute_bleu_summary { |
| my ($filepattern, $outputfile) = @_; |
| |
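  # Each matching line is assumed to end in its BLEU score; the summary file
  # then contains a line like "0.4012 + 0.4034 / 2 = 0.4023".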
  # Now average the runs and report BLEU
  my @bleus;
  open CMD, "grep ' BLEU = ' $filepattern |" or die "can't grep BLEU scores from '$filepattern'";
| while (<CMD>) { |
| my @F = split; |
| push(@bleus, 1.0 * $F[-1]); |
| } |
| close(CMD); |
| |
| if (scalar @bleus) { |
| my $final_bleu = sum(@bleus) / (scalar @bleus); |
| |
| open BLEU, ">$outputfile" or die "Can't write to $outputfile"; |
| printf(BLEU "%s / %d = %.4f\n", join(" + ", @bleus), scalar @bleus, $final_bleu); |
| close(BLEU); |
| } |
| } |
| |
| sub compute_time_summary { |
| my ($filepattern, $outputfile) = @_; |
| |
  # Sum the per-sentence decoding times in each run's log, then average across runs
| my @times; |
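  # The match below assumes Joshua log lines of the form
  # "translation of <id> took <n> seconds", where the sixth
  # whitespace-separated token is the elapsed time.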
| foreach my $file (glob($filepattern)) { |
    open FILE, $file or die "can't read '$file'";
| my $time = 0.0; |
| while (<FILE>) { |
| next unless /translation of .* took/; |
| my @F = split; |
| $time += $F[5]; |
| } |
| close(FILE); |
| |
| push(@times, $time); |
| } |
| |
| if (scalar @times) { |
| open TIMES, ">$outputfile" or die "Can't write to $outputfile"; |
| printf(TIMES "%s / %d = %s\n", join(" + ", @times), scalar(@times), 1.0 * sum(@times) / scalar(@times)); |
| close(TIMES); |
| } |
| } |