| #!/usr/bin/perl |
| |
| # This script implements the Joshua pipeline. It can run a complete |
# pipeline --- from raw training corpora to BLEU scores on a test set
| # --- and it allows jumping into arbitrary points of the pipeline. |
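#
# A typical invocation looks like this (illustrative only; the paths and
# language codes are examples, not requirements):
#
#   $JOSHUA/scripts/training/pipeline.pl \
#     --rundir 1 --source es --target en \
#     --corpus input/train --tune input/tune --test input/devtest
#
# All options are documented below, where their defaults are set.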
| |
| my $JOSHUA; |
| |
| BEGIN { |
| if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq "" || |
| ! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "") { |
| print "Several environment variables must be set before running the pipeline. Please set:\n"; |
| print "* \$JOSHUA to the root of the Joshua source code.\n" |
| if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq ""); |
| print "* \$JAVA_HOME to the directory of your local java installation. \n" |
| if (! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq ""); |
| exit; |
| } |
| $JOSHUA = $ENV{JOSHUA}; |
| unshift(@INC,"$JOSHUA/scripts/training/cachepipe"); |
| unshift(@INC,"$JOSHUA/lib"); |
| } |
| |
| use strict; |
| use warnings; |
| use Getopt::Long; |
| use File::Basename; |
| use Cwd qw[abs_path getcwd]; |
| use POSIX qw[ceil]; |
| use List::Util qw[max min sum]; |
| use File::Temp qw[:mktemp tempdir]; |
| use CachePipe; |
| # use Thread::Pool; |
| |
| # Hadoop uses a stupid hacker trick to change directories, but (per Lane Schwartz) if CDPATH |
| # contains ".", it triggers the printing of the directory, which kills the stupid hacker trick. |
| # Thus we undefine CDPATH to ensure this doesn't happen. |
| delete $ENV{CDPATH}; |
| |
| my $HADOOP = $ENV{HADOOP}; |
| my $MOSES = $ENV{MOSES}; |
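# Unset GREP_OPTIONS, since user settings there (e.g., forced coloring) could
# corrupt the output of the greps used throughout the pipeline.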
| delete $ENV{GREP_OPTIONS}; |
| |
| my $THRAX = "$JOSHUA/thrax"; |
| |
| die not_defined("JAVA_HOME") unless exists $ENV{JAVA_HOME}; |
| |
| my (@CORPORA,$TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,@LMFILES,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$_TUNE_GRAMMAR_FILE,$_TEST_GRAMMAR_FILE,$THRAX_CONF_FILE, $_JOSHUA_CONFIG, $_JOSHUA_ARGS); |
| my $FIRST_STEP = "FIRST"; |
| my $LAST_STEP = "LAST"; |
| my $LMFILTER = "$ENV{HOME}/code/filter/filter"; |
| |
| # The maximum length of training sentences (--maxlen). The threshold is applied to both sides. |
| my $MAXLEN = 50; |
| |
# The maximum span of the input that rules from the main grammar can be applied to
| my $MAXSPAN = 20; |
| |
| # The maximum length of tuning and testing sentences (--maxlen-tune and --maxlen-test). |
| my $MAXLEN_TUNE = 0; |
| my $MAXLEN_TEST = 0; |
| |
| # when doing phrase-based decoding, the maximum length of a phrase (source side) |
| my $MAX_PHRASE_LEN = 5; |
| |
| my $DO_FILTER_TM = 1; |
| my $DO_SUBSAMPLE = 0; |
| my $DO_PACK_GRAMMARS = 1; |
| my $SCRIPTDIR = "$JOSHUA/scripts"; |
| my $TOKENIZER_SOURCE = "$SCRIPTDIR/training/penn-treebank-tokenizer.perl"; |
| my $TOKENIZER_TARGET = "$SCRIPTDIR/training/penn-treebank-tokenizer.perl"; |
| my $NORMALIZER = "$SCRIPTDIR/training/normalize-punctuation.pl"; |
| my $GIZA_TRAINER = "$SCRIPTDIR/training/run-giza.pl"; |
| my $TUNECONFDIR = "$SCRIPTDIR/training/templates/tune"; |
| my $SRILM = ($ENV{SRILM}||"")."/bin/i686-m64/ngram-count"; |
| my $COPY_CONFIG = "$SCRIPTDIR/copy-config.pl"; |
| my $BUNDLER = "$JOSHUA/scripts/support/run_bundler.py"; |
| my $STARTDIR; |
| my $RUNDIR = $STARTDIR = getcwd(); |
| my $GRAMMAR_TYPE = "hiero"; # or "itg" or "samt" or "ghkm" or "phrase" or "phrasal" |
| my $SEARCH_ALGORITHM = "cky"; # or "stack" (for phrase-based) |
| |
| # Which GHKM extractor to use ("galley" or "moses") |
| my $GHKM_EXTRACTOR = "moses"; |
| my $EXTRACT_OPTIONS = ""; |
| |
| my $WITTEN_BELL = 0; |
| |
| # Run description. |
| my $README = undef; |
| |
| # gzip-aware cat |
| my $CAT = "$SCRIPTDIR/training/scat"; |
| |
| # where processed data files are stored |
| my $DATA_DIR = "data"; |
| |
| # Whether to do MBR decoding on the n-best list (for test data). |
| my $DO_MBR = 0; |
| |
| # Which aligner to use. The options are "giza" or "berkeley". |
| my $ALIGNER = "giza"; # "berkeley" or "giza" or "jacana" |
| |
| # Filter rules to the following maximum scope (Hopkins & Langmead, 2011). |
| my $SCOPE = 3; |
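# (Roughly, a rule's scope is the number of positions where a nonterminal is
# adjacent to another nonterminal or to the rule boundary; a scope-3 grammar
# can be parsed in cubic time.)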
| |
| # What kind of filtering to use ("fast" or "exact"). |
| my $FILTERING = "fast"; |
| |
| # This is the amount of memory made available to Joshua. You'll need |
| # a lot more than this for SAMT decoding (though really it depends |
| # mostly on your grammar size) |
| my $JOSHUA_MEM = "3100m"; |
| |
# the amount of memory available for hadoop processes (passed to
# Hadoop via -Dmapred.child.java.opts)
| my $HADOOP_MEM = "2g"; |
| |
| # The location of a custom core-site.xml file, if desired (optional). |
| my $HADOOP_CONF = undef; |
| |
| # memory available to the parser |
| my $PARSER_MEM = "2g"; |
| |
| # memory available for building the language model |
| my $BUILDLM_MEM = "2G"; |
| |
| # Memory available for packing the grammar. |
| my $PACKER_MEM = "8g"; |
| |
| # Memory available for MERT/PRO. |
| my $TUNER_MEM = "8g"; |
| |
| # When qsub is called for decoding, these arguments should be passed to it. |
| my $QSUB_ARGS = ""; |
| |
| # When qsub is called for aligning, these arguments should be passed to it. |
| my $QSUB_ALIGN_ARGS = "-l h_rt=168:00:00,h_vmem=15g,mem_free=10g,num_proc=1"; |
| |
| # Amount of memory for the Berkeley aligner. |
| my $ALIGNER_MEM = "10g"; |
| |
| # Align corpus files a million lines at a time. |
| my $ALIGNER_BLOCKSIZE = 1000000; |
| |
| # The number of machines to decode on. If you set this higher than 1, |
| # you need to have qsub configured for your environment. |
| my $NUM_JOBS = 1; |
| |
# The number of threads to use at different points in the pipeline
# (GIZA, decoding)
| my $NUM_THREADS = 1; |
| |
| # which LM to use (kenlm or berkeleylm) |
| my $LM_TYPE = "kenlm"; |
| |
| # n-gram order |
| my $LM_ORDER = 5; |
| |
| # Whether to build and include an LM from the target-side of the |
| # corpus when manually-specified LM files are passed with --lmfile. |
| my $DO_BUILD_LM_FROM_CORPUS = 1; |
| |
# Whether to also build a class-based LM (--class-lm); this requires a class
# map (--class-map) and a class-annotated corpus (--class-lm-corpus).
| my $DO_BUILD_CLASS_LM = 0; |
| my $CLASS_LM_CORPUS = undef; |
| my $CLASS_MAP = undef; |
| my $CLASS_LM_ORDER = 9; |
| |
| # whether to tokenize and lowercase training, tuning, and test data |
| my $DO_PREPARE_CORPORA = 1; |
| |
| # how many optimizer runs to perform |
| my $OPTIMIZER_RUNS = 1; |
| |
# what to use to create language models ("kenlm", "berkeleylm", or "srilm")
| my $LM_GEN = "kenlm"; |
| my $LM_OPTIONS = ""; |
| |
| my @STEPS = qw[FIRST SUBSAMPLE ALIGN PARSE THRAX GRAMMAR PHRASE TUNE MERT PRO TEST LAST]; |
| my %STEPS = map { $STEPS[$_] => $_ + 1 } (0..$#STEPS); |
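# %STEPS maps each step name to its ordinal position (e.g., $STEPS{TUNE} is 8),
# so that step ranges can be compared numerically, as in
# "$STEPS{$FIRST_STEP} < $STEPS{TUNE}".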
| |
| # Methods to use for merging alignments (see Koehn et al., 2003). |
| # Options are union, {intersect, grow, srctotgt, tgttosrc}-{diag,final,final-and,diag-final,diag-final-and} |
| my $GIZA_MERGE = "grow-diag-final"; |
| |
| # Whether to merge all the --lmfile LMs into a single LM using weights based on the development corpus |
| my $MERGE_LMS = 0; |
| |
| # Which tuner to use by default |
| my $TUNER = "mert"; # or pro, mira, or kbmira (the latter calling out to Moses) |
| |
# The number of iterations of the tuner to run
| my $TUNER_ITERATIONS = 15; |
| |
| # location of already-parsed corpus |
| my $PARSED_CORPUS = undef; |
| |
# location of the NER tagger wrapper script for annotation
| my $NER_TAGGER = undef; |
| |
| # Allows the user to set a temp dir for various tasks |
| my $TMPDIR = "/tmp"; |
| |
# Whether to use LM state minimization (only supported by KenLM; disabled
# below for other LM types)
| my $LM_STATE_MINIMIZATION = 1; |
| |
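# Size of the n-best lists produced during tuning (and used for MBR rescoring
# of the test set)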
| my $NBEST = 300; |
| |
| my $REORDERING_LIMIT = 6; |
| my $NUM_TRANSLATION_OPTIONS = 20; |
| |
| my $retval = GetOptions( |
| "readme=s" => \$README, |
| "corpus=s" => \@CORPORA, |
| "parsed-corpus=s" => \$PARSED_CORPUS, |
| "tune=s" => \$TUNE, |
| "test=s" => \$TEST, |
| "prepare!" => \$DO_PREPARE_CORPORA, |
| "aligner=s" => \$ALIGNER, |
| "alignment=s" => \$ALIGNMENT, |
| "aligner-mem=s" => \$ALIGNER_MEM, |
| "giza-merge=s" => \$GIZA_MERGE, |
| "source=s" => \$SOURCE, |
| "target=s" => \$TARGET, |
| "rundir=s" => \$RUNDIR, |
| "filter-tm!" => \$DO_FILTER_TM, |
| "scope=i" => \$SCOPE, |
| "filtering=s" => \$FILTERING, |
| "lm=s" => \$LM_TYPE, |
| "lmfile=s" => \@LMFILES, |
| "merge-lms!" => \$MERGE_LMS, |
| "lm-gen=s" => \$LM_GEN, |
| "lm-gen-options=s" => \$LM_OPTIONS, |
| "lm-order=i" => \$LM_ORDER, |
| "corpus-lm!" => \$DO_BUILD_LM_FROM_CORPUS, |
| "witten-bell!" => \$WITTEN_BELL, |
| "tune-grammar=s" => \$_TUNE_GRAMMAR_FILE, |
| "test-grammar=s" => \$_TEST_GRAMMAR_FILE, |
| "grammar=s" => \$GRAMMAR_FILE, |
| "glue-grammar=s" => \$GLUE_GRAMMAR_FILE, |
| "maxspan=i" => \$MAXSPAN, |
| "mbr!" => \$DO_MBR, |
| "type=s" => \$GRAMMAR_TYPE, |
| "ghkm-extractor=s" => \$GHKM_EXTRACTOR, |
| "extract-options=s" => \$EXTRACT_OPTIONS, |
| "maxlen=i" => \$MAXLEN, |
| "maxlen-tune=i" => \$MAXLEN_TUNE, |
| "maxlen-test=i" => \$MAXLEN_TEST, |
| "tokenizer-source=s" => \$TOKENIZER_SOURCE, |
| "tokenizer-target=s" => \$TOKENIZER_TARGET, |
| "joshua-config=s" => \$_JOSHUA_CONFIG, |
| "joshua-args=s" => \$_JOSHUA_ARGS, |
| "joshua-mem=s" => \$JOSHUA_MEM, |
| "hadoop-mem=s" => \$HADOOP_MEM, |
| "parser-mem=s" => \$PARSER_MEM, |
| "buildlm-mem=s" => \$BUILDLM_MEM, |
| "packer-mem=s" => \$PACKER_MEM, |
| "pack!" => \$DO_PACK_GRAMMARS, |
| "tuner=s" => \$TUNER, |
| "tuner-mem=s" => \$TUNER_MEM, |
| "tuner-iterations=i" => \$TUNER_ITERATIONS, |
| "thrax=s" => \$THRAX, |
| "thrax-conf=s" => \$THRAX_CONF_FILE, |
| "jobs=i" => \$NUM_JOBS, |
| "threads=i" => \$NUM_THREADS, |
| "subsample!" => \$DO_SUBSAMPLE, |
| "qsub-args=s" => \$QSUB_ARGS, |
| "qsub-align-args=s" => \$QSUB_ALIGN_ARGS, |
| "first-step=s" => \$FIRST_STEP, |
| "last-step=s" => \$LAST_STEP, |
| "aligner-chunk-size=s" => \$ALIGNER_BLOCKSIZE, |
| "hadoop=s" => \$HADOOP, |
| "hadoop-conf=s" => \$HADOOP_CONF, |
| "tmp=s" => \$TMPDIR, |
| "nbest=i" => \$NBEST, |
| "reordering-limit=i" => \$REORDERING_LIMIT, |
| "num-translation-options=i" => \$NUM_TRANSLATION_OPTIONS, |
| "ner-tagger=s" => \$NER_TAGGER, |
| "class-lm!" => \$DO_BUILD_CLASS_LM, |
| "class-lm-corpus=s" => \$CLASS_LM_CORPUS, |
| "class-map=s" => \$CLASS_MAP, |
| ); |
| |
| if (! $retval) { |
| print "Invalid usage, quitting\n"; |
| exit 1; |
| } |
| |
| $RUNDIR = get_absolute_path($RUNDIR); |
| |
| $TUNER = lc $TUNER; |
| |
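# Whether the tuning/test inputs are lattices rather than plain sentences; this
# disables grammar filtering and changes the decoder output formats below.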
| my $DOING_LATTICES = 0; |
| |
# The arguments to pass through to the Joshua decoder (empty if none were given).
my $JOSHUA_ARGS = $_JOSHUA_ARGS || "";
| |
| my %DATA_DIRS = ( |
| train => get_absolute_path("$RUNDIR/$DATA_DIR/train"), |
| tune => get_absolute_path("$RUNDIR/$DATA_DIR/tune"), |
| test => get_absolute_path("$RUNDIR/$DATA_DIR/test"), |
| ); |
| |
# uppercase these to guard against a common error:
| $FIRST_STEP = uc($FIRST_STEP); |
| $LAST_STEP = uc($LAST_STEP); |
| |
| $| = 1; |
| |
| my $cachepipe = new CachePipe(); |
| |
# This tells cachepipe not to include the command signature when deciding whether a command
# needs to be re-run. Note that this is not backwards compatible!
| $cachepipe->omit_cmd(); |
| |
| $SIG{INT} = sub { |
| print "* Got C-c, quitting\n"; |
| $cachepipe->cleanup(); |
| exit 1; |
| }; |
| |
| # if no LMs were specified, we need to build one from the target side of the corpus |
| if (scalar @LMFILES == 0) { |
| $DO_BUILD_LM_FROM_CORPUS = 1; |
| } |
| |
| ## Sanity Checking ################################################### |
| |
| # If a language model was specified and no corpus was given to build another one from the target |
| # side of the training data (which could happen, for example, when starting at the tuning step with |
| # an existing LM), turn off building an LM from the corpus. The user could have done this |
# explicitly with --no-corpus-lm, but might have forgotten to, and we don't want to pester them with
# an error about easily-inferable intentions.
| if (scalar @LMFILES && ! scalar(@CORPORA)) { |
| $DO_BUILD_LM_FROM_CORPUS = 0; |
| } |
| |
| |
| # if merging LMs, make sure there are at least 2 LMs to merge. |
| # first, pin $DO_BUILD_LM_FROM_CORPUS to 0 or 1 so that the subsequent check works. |
| if ($MERGE_LMS) { |
| if ($DO_BUILD_LM_FROM_CORPUS != 0) { |
    $DO_BUILD_LM_FROM_CORPUS = 1;
| } |
| |
| if (@LMFILES + $DO_BUILD_LM_FROM_CORPUS < 2) { |
| print "* FATAL: I need 2 or more language models to merge (including the corpus target-side LM)."; |
| exit 2; |
| } |
| } |
| |
| # absolutize LM file paths |
| map { |
| $LMFILES[$_] = get_absolute_path($LMFILES[$_]); |
| } 0..$#LMFILES; |
| |
| # make sure the LMs exist |
| foreach my $lmfile (@LMFILES) { |
| if (! -e $lmfile) { |
| print "* FATAL: couldn't find language model file '$lmfile'\n"; |
| exit 1; |
| } |
| } |
| |
| # case-normalize this |
| $GRAMMAR_TYPE = lc $GRAMMAR_TYPE; |
| |
| if ($GRAMMAR_TYPE eq "phrase") { |
| $SEARCH_ALGORITHM = "stack"; |
| $MAXSPAN = 0; |
| } |
| |
| # make sure source and target were specified |
| if (! defined $SOURCE or $SOURCE eq "") { |
| print "* FATAL: I need a source language extension (--source)\n"; |
| exit 1; |
| } |
| if (! defined $TARGET or $TARGET eq "") { |
| print "* FATAL: I need a target language extension (--target)\n"; |
| exit 1; |
| } |
| |
| # make sure a corpus was provided if we're doing any step before tuning |
| if (@CORPORA == 0 and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) { |
| print "* FATAL: need at least one training corpus (--corpus)\n"; |
| exit 1; |
| } |
| |
| # make sure a tuning corpus was provided if we're doing tuning |
| if (! defined $TUNE and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE} |
| and $STEPS{$LAST_STEP} >= $STEPS{TUNE})) { |
| print "* FATAL: need a tuning set (--tune)\n"; |
| exit 1; |
| } |
| |
| # make sure a test corpus was provided if we're decoding a test set |
| if (! defined $TEST and ($STEPS{$FIRST_STEP} <= $STEPS{TEST} |
| and $STEPS{$LAST_STEP} >= $STEPS{TEST})) { |
| print "* FATAL: need a test set (--test)\n"; |
| exit 1; |
| } |
| |
| # Joshua config |
| my $JOSHUA_CONFIG = get_absolute_path($_JOSHUA_CONFIG || "$TUNECONFDIR/joshua.config", $STARTDIR); |
| |
| # make sure we have a tuned config file if we're skipping model building and tuning |
| if ($STEPS{$FIRST_STEP} >= $STEPS{TEST}) { |
| if (! defined $JOSHUA_CONFIG) { |
| print "* FATAL: You need to provide a tuned Joshua config file (--joshua-config)\n"; |
| print " if you're skipping straight to testing\n"; |
| exit 1; |
| } |
| } |
| |
| # make sure we have either a config file or a grammar and LM if we're skipping model building |
| if ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) { |
| if (! defined $JOSHUA_CONFIG and ((! defined $_TUNE_GRAMMAR_FILE and ! defined $GRAMMAR_FILE) or scalar(@LMFILES) == 0)) { |
| print "* FATAL: You must provide either a Joshua config file (--joshua-config) or\n"; |
| print " a grammar (--grammar or --tune-grammar) and at least one LM (--lmfile)\n"; |
| print " if you're skipping straight to tuning\n"; |
| exit 1; |
| } |
| } |
| |
| # make sure SRILM is defined if we're building a language model |
| if ($LM_GEN eq "srilm" && (scalar @LMFILES == 0) && $STEPS{$FIRST_STEP} <= $STEPS{TUNE} && $STEPS{$LAST_STEP} >= $STEPS{TUNE}) { |
| not_defined("SRILM") unless exists $ENV{SRILM} and -d $ENV{SRILM}; |
| } |
| |
| # check for file presence |
| if (defined $JOSHUA_CONFIG and ! -e $JOSHUA_CONFIG) { |
| print "* FATAL: couldn't find joshua config file '$JOSHUA_CONFIG'\n"; |
| exit 1; |
| } |
| if (defined $GRAMMAR_FILE and ! -e $GRAMMAR_FILE) { |
| print "* FATAL: couldn't find grammar file '$GRAMMAR_FILE'\n"; |
| exit 1; |
| } |
| if (defined $_TUNE_GRAMMAR_FILE and ! -e $_TUNE_GRAMMAR_FILE) { |
| print "* FATAL: couldn't find tuning grammar file '$_TUNE_GRAMMAR_FILE'\n"; |
| exit 1; |
| } |
| if (defined $_TEST_GRAMMAR_FILE and ! -e $_TEST_GRAMMAR_FILE) { |
| print "* FATAL: couldn't find test grammar file '$_TEST_GRAMMAR_FILE'\n"; |
| exit 1; |
| } |
| if (defined $ALIGNMENT and ! -e $ALIGNMENT) { |
| print "* FATAL: couldn't find alignment file '$ALIGNMENT'\n"; |
| exit 1; |
| } |
| |
# If a --corpus path was relative, prepend the starting directory (under the assumption it was
| # relative to there). This makes sure that everything will still work if we change the run |
| # directory. |
| map { |
| $CORPORA[$_] = get_absolute_path("$CORPORA[$_]"); |
| } (0..$#CORPORA); |
| |
| # Do the same for tuning and test data, and other files |
| $TUNE = get_absolute_path($TUNE); |
| $TEST = get_absolute_path($TEST); |
| |
| $GRAMMAR_FILE = get_absolute_path($GRAMMAR_FILE); |
| $GLUE_GRAMMAR_FILE = get_absolute_path($GLUE_GRAMMAR_FILE); |
| $_TUNE_GRAMMAR_FILE = get_absolute_path($_TUNE_GRAMMAR_FILE); |
| $_TEST_GRAMMAR_FILE = get_absolute_path($_TEST_GRAMMAR_FILE); |
| $THRAX_CONF_FILE = get_absolute_path($THRAX_CONF_FILE); |
| $ALIGNMENT = get_absolute_path($ALIGNMENT); |
| $HADOOP_CONF = get_absolute_path($HADOOP_CONF); |
| |
| foreach my $corpus (@CORPORA) { |
| foreach my $ext ($TARGET,$SOURCE) { |
| if (! -e "$corpus.$ext") { |
| print "* FATAL: can't find '$corpus.$ext'"; |
| exit 1; |
| } |
| } |
| } |
| |
| if ($ALIGNER ne "giza" and $ALIGNER ne "berkeley" and $ALIGNER ne "jacana") { |
| print "* FATAL: aligner must be one of 'giza', 'berkeley' or 'jacana' (only French-English)\n"; |
| exit 1; |
| } |
| |
| if ($LM_TYPE ne "kenlm" and $LM_TYPE ne "berkeleylm") { |
| print "* FATAL: lm type (--lm) must be one of 'kenlm' or 'berkeleylm'\n"; |
| exit 1; |
| } |
| |
| if ($LM_TYPE ne "kenlm") { |
| $LM_STATE_MINIMIZATION = 0; |
| } |
| |
| if ($LM_GEN ne "berkeleylm" and $LM_GEN ne "srilm" and $LM_GEN ne "kenlm") { |
| print "* FATAL: lm generating code (--lm-gen) must be one of 'kenlm' (default), 'berkeleylm', or 'srilm'\n"; |
| exit 1; |
| } |
| |
| if ($TUNER eq "kbmira" and ! defined $MOSES) { |
| print "* FATAL: using 'kbmira' for tuning requires setting the MOSES environment variable\n"; |
| exit 1; |
| } |
| |
| if ($GRAMMAR_TYPE eq "phrase" and ! defined $MOSES) { |
| print "* FATAL: building phrase-based models (--type phrase) requires setting the MOSES environment variable\n"; |
| exit 1; |
| } |
| |
| if ($TUNER ne "mert" and $TUNER ne "zmert" and $TUNER ne "mira" and $TUNER ne "local-mira" and $TUNER ne "pro" and $TUNER ne "kbmira") { |
| print "* FATAL: --tuner must be one of '[z]mert', 'pro', '[local]-mira', or 'kbmira'.\n"; |
| exit 1; |
| } |
| |
| $FILTERING = lc $FILTERING; |
| if ($FILTERING eq "fast") { |
| $FILTERING = "-f" |
| } elsif ($FILTERING eq "exact") { |
| $FILTERING = "-e"; |
| } elsif ($FILTERING eq "loose") { |
| $FILTERING = "-l"; |
| } else { |
| print "* FATAL: --filtering must be one of 'fast' (default) or 'exact' or 'loose'\n"; |
| exit 1; |
| } |
| |
| if (defined $HADOOP_CONF && ! -e $HADOOP_CONF) { |
| print STDERR "* FATAL: Couldn't find \$HADOOP_CONF file '$HADOOP_CONF'\n"; |
| exit 1; |
| } |
| |
| ## END SANITY CHECKS |
| |
| #################################################################################################### |
| ## Dependent variable setting ###################################################################### |
| #################################################################################################### |
| |
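# The nonterminal used to label out-of-vocabulary words: hierarchical and
# phrase-based grammars use the generic X, while syntax-based grammars (SAMT,
# GHKM) use a dedicated OOV label.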
| my $OOV = ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "itg" or $GRAMMAR_TYPE eq "phrase") ? "X" : "OOV"; |
| |
| # The phrasal system should use the ITG grammar, allowing for limited distortion |
| if ($GRAMMAR_TYPE eq "phrasal") { |
| $GLUE_GRAMMAR_FILE = get_absolute_path("$JOSHUA/scripts/training/templates/glue-grammar.itg"); |
| } |
| |
| # use this default unless it's already been defined by a command-line argument |
| $THRAX_CONF_FILE = "$JOSHUA/scripts/training/templates/thrax-$GRAMMAR_TYPE.conf" unless defined $THRAX_CONF_FILE; |
| |
| mkdir $RUNDIR unless -d $RUNDIR; |
| chdir($RUNDIR); |
| |
| if (defined $README) { |
| open DESC, ">README" or die "can't write README file"; |
| print DESC $README; |
| print DESC $/; |
| close DESC; |
| } |
| |
| # default values -- these are overridden if the full script is run |
| # (after tokenization and normalization) |
| my (%TRAIN,%TUNE,%TEST); |
| if (@CORPORA) { |
| $TRAIN{prefix} = $CORPORA[0]; |
| $TRAIN{source} = "$CORPORA[0].$SOURCE"; |
| $TRAIN{target} = "$CORPORA[0].$TARGET"; |
| } |
| |
| # set the location of the parsed corpus if that was defined |
| if (defined $PARSED_CORPUS) { |
| $TRAIN{parsed} = get_absolute_path($PARSED_CORPUS); |
| } |
| |
| if ($TUNE) { |
| $TUNE{source} = "$TUNE.$SOURCE"; |
| $TUNE{target} = "$TUNE.$TARGET"; |
| |
| if (! -e "$TUNE{source}") { |
| print "* FATAL: couldn't find tune source file at '$TUNE{source}'\n"; |
    exit 1;
| } |
| } |
| |
| if ($TEST) { |
| $TEST{source} = "$TEST.$SOURCE"; |
| $TEST{target} = "$TEST.$TARGET"; |
| |
| if (! -e "$TEST{source}") { |
| print "* FATAL: couldn't find test source file at '$TEST{source}'\n"; |
    exit 1;
| } |
| } |
| |
| if ($FIRST_STEP ne "FIRST") { |
| if (@CORPORA > 1) { |
| print "* FATAL: you can't skip steps if you specify more than one --corpus\n"; |
| exit(1); |
| } |
| |
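  # Computed-goto trick: if the label named by $FIRST_STEP exists, the goto
  # inside the eval transfers control to it immediately (so the "Skipping"
  # branch below is never actually reached); if it doesn't exist, the eval
  # traps the resulting error and we report the bad step name.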
| if (eval { goto $FIRST_STEP }) { |
| print "* Skipping to step $FIRST_STEP\n"; |
| goto $FIRST_STEP; |
| } else { |
| print "* No such step $FIRST_STEP\n"; |
| exit 1; |
| } |
| } |
| |
| ## STEP 1: filter and preprocess corpora ############################# |
| FIRST: |
| ; |
| |
| if (defined $ALIGNMENT) { |
| print "* FATAL: it doesn't make sense to provide an alignment and then do\n"; |
| print " tokenization. Either remove --alignment or specify a first step\n"; |
| print " of Thrax (--first-step THRAX)\n"; |
| exit 1; |
| } |
| |
| if (@CORPORA == 0) { |
| print "* FATAL: need at least one training corpus (--corpus)\n"; |
| exit 1; |
| } |
| |
| # prepare the training data |
| my %PREPPED = ( |
| TRAIN => 0, |
| TUNE => 0, |
| TEST => 0 |
| ); |
| |
| |
| if ($DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("train",\@CORPORA,$MAXLEN); |
| |
| # used for parsing |
| $TRAIN{mixedcase} = "$DATA_DIRS{train}/$prefixes->{shortened}.$TARGET.gz"; |
| |
| $TRAIN{prefix} = "$DATA_DIRS{train}/corpus"; |
| $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE"; |
| $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET"; |
| $PREPPED{TRAIN} = 1; |
| } |
| |
# prepare the tuning data
| if (defined $TUNE and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE); |
| $TUNE{source} = "$DATA_DIRS{tune}/corpus.$SOURCE"; |
| $TUNE{target} = "$DATA_DIRS{tune}/corpus.$TARGET"; |
| my $ner_return = ner_annotate("$TUNE{source}", "$TUNE{source}.ner", $SOURCE); |
| if ($ner_return == 2) { |
| $TUNE{source} = "$TUNE{source}.ner"; |
| } |
| $PREPPED{TUNE} = 1; |
| } |
| |
| if (defined $TEST and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("test",[$TEST],$MAXLEN_TEST); |
| $TEST{source} = "$DATA_DIRS{test}/corpus.$SOURCE"; |
| $TEST{target} = "$DATA_DIRS{test}/corpus.$TARGET"; |
| my $ner_return = ner_annotate("$TEST{source}", "$TEST{source}.ner", $SOURCE); |
| if ($ner_return == 2) { |
| $TEST{source} = "$TEST{source}.ner"; |
| } |
| $PREPPED{TEST} = 1; |
| } |
| |
| maybe_quit("FIRST"); |
| |
| ## SUBSAMPLE ######################################################### |
| |
| SUBSAMPLE: |
| ; |
| |
| # subsample |
| if ($DO_SUBSAMPLE) { |
| mkdir("$DATA_DIRS{train}/subsampled") unless -d "$DATA_DIRS{train}/subsampled"; |
| |
| $cachepipe->cmd("subsample-manifest", |
| "echo corpus > $DATA_DIRS{train}/subsampled/manifest", |
| "$DATA_DIRS{train}/subsampled/manifest"); |
| |
| $cachepipe->cmd("subsample-testdata", |
| "cat $TUNE{source} $TEST{source} > $DATA_DIRS{train}/subsampled/test-data", |
| $TUNE{source}, |
| $TEST{source}, |
| "$DATA_DIRS{train}/subsampled/test-data"); |
| |
| $cachepipe->cmd("subsample", |
| "java -Xmx4g -Dfile.encoding=utf8 -cp $JOSHUA/bin:$JOSHUA/lib/commons-cli-2.0-SNAPSHOT.jar joshua.subsample.Subsampler -e $TARGET -f $SOURCE -epath $DATA_DIRS{train}/ -fpath $DATA_DIRS{train}/ -output $DATA_DIRS{train}/subsampled/subsampled.$MAXLEN -ratio 1.04 -test $DATA_DIRS{train}/subsampled/test-data -training $DATA_DIRS{train}/subsampled/manifest", |
| "$DATA_DIRS{train}/subsampled/manifest", |
| "$DATA_DIRS{train}/subsampled/test-data", |
| $TRAIN{source}, |
| $TRAIN{target}, |
| "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$TARGET", |
| "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$SOURCE"); |
| |
| # rewrite the symlinks to point to the subsampled corpus |
| foreach my $lang ($TARGET,$SOURCE) { |
| system("ln -sf subsampled/subsampled.$MAXLEN.$lang $DATA_DIRS{train}/corpus.$lang"); |
| } |
| } |
| |
| maybe_quit("SUBSAMPLE"); |
| |
| |
| ## ALIGN ############################################################# |
| |
| ALIGN: |
| ; |
| |
| # This basically means that we've skipped tokenization, in which case |
| # we still want to move the input files into the canonical place |
| if ($FIRST_STEP eq "ALIGN") { |
| if (defined $ALIGNMENT) { |
| print "* FATAL: It doesn't make sense to provide an alignment\n"; |
| print " but not to skip the tokenization and subsampling steps\n"; |
| exit 1; |
| } |
| |
| # TODO: copy the files into the canonical place |
| |
| # Jumping straight to alignment is probably the same thing as |
| # skipping tokenization, and might also be implemented by a |
| # --no-tokenization flag |
| } |
| |
| # skip this step if an alignment was provided |
| if (! defined $ALIGNMENT) { |
| |
| # We process the data in chunks which by default are 1,000,000 sentence pairs. So first split up |
| # the data into those chunks. |
| system("mkdir","-p","$DATA_DIRS{train}/splits") unless -d "$DATA_DIRS{train}/splits"; |
| |
| $cachepipe->cmd("source-numlines", |
| "cat $TRAIN{source} | wc -l", |
| $TRAIN{source}); |
| my $numlines = $cachepipe->stdout(); |
| my $numchunks = ceil($numlines / $ALIGNER_BLOCKSIZE); |
| |
| open TARGET, $TRAIN{target} or die "can't read $TRAIN{target}"; |
| open SOURCE, $TRAIN{source} or die "can't read $TRAIN{source}"; |
| |
| my $lastchunk = -1; |
| while (my $target = <TARGET>) { |
| my $source = <SOURCE>; |
| |
| # We want to prevent a very small last chunk, which we accomplish |
| # by folding the last chunk into the penultimate chunk. |
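    # (${.} below is Perl's $. variable: the current input line number of the
    # most recently read filehandle, here TARGET.)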
| my $chunk = ($numchunks <= 2) |
| ? 0 |
| : min($numchunks - 2, |
| int( (${.} - 1) / $ALIGNER_BLOCKSIZE )); |
| |
    if ($chunk != $lastchunk) {
      # close the previous chunk's handles (none are open on the first chunk)
      if ($lastchunk != -1) {
        close CHUNK_SOURCE;
        close CHUNK_TARGET;
      }
      open CHUNK_SOURCE, ">", "$DATA_DIRS{train}/splits/corpus.$SOURCE.$chunk" or die;
      open CHUNK_TARGET, ">", "$DATA_DIRS{train}/splits/corpus.$TARGET.$chunk" or die;
| |
| $lastchunk = $chunk; |
| } |
| |
| print CHUNK_SOURCE $source; |
| print CHUNK_TARGET $target; |
| } |
| close CHUNK_SOURCE; |
| close CHUNK_TARGET; |
| |
| close SOURCE; |
| close TARGET; |
| |
| # my $max_aligner_threads = $NUM_THREADS; |
| # if ($ALIGNER eq "giza" and $max_aligner_threads > 1) { |
| # $max_aligner_threads /= 2; |
| # } |
| |
| # # With multi-threading, we can use a pool to set up concurrent GIZA jobs on the chunks. |
| # |
| # TODO: implement this. There appears to be a problem with calling system() in threads. |
| # |
| # my $pool = new Thread::Pool(Min => 1, Max => $max_aligner_threads); |
| |
| system("mkdir alignments") unless -d "alignments"; |
| |
| my $aligner_cmd = ( |
| "$SCRIPTDIR/training/paralign.pl " |
| . " -aligner $ALIGNER" |
| . " -num_threads 1" |
| . " -giza_merge $GIZA_MERGE" |
| . " -aligner_mem $ALIGNER_MEM" |
| . " -source $SOURCE" |
| . " -target $TARGET" |
| . " -giza_trainer \"$GIZA_TRAINER\"" |
| . " -train_dir \"$DATA_DIRS{train}\" " |
| . "> alignments/run.log" |
| ); |
| |
| # Start a parallel job on each core |
| my @children = (); |
| my $next_chunk = 0; |
| foreach my $core (1..$NUM_THREADS) { |
| if ($next_chunk < $lastchunk + 1) { |
| my $child = fork(); |
| if (! $child) { # I am child |
| exec("echo $next_chunk | $aligner_cmd"); |
| exit 0; |
| } |
| push @children, $child; |
| $next_chunk++; |
| next; |
| } |
| } |
| |
| # Start another concurrent job as each oldest job finishes |
| while (@children) { |
| my $old_child = shift @children; |
| waitpid( $old_child, 0 ); |
| print "child finished\n"; |
| |
| if ($next_chunk < $lastchunk + 1) { |
| my $new_child = fork(); |
| if (! $new_child) { # I am child |
| exec("echo $next_chunk | $aligner_cmd"); |
| exit 0; |
| } |
| $next_chunk++; |
| push @children, $new_child; |
| } |
| } |
| |
| my @aligned_files; |
| if ($ALIGNER eq "giza") { |
| @aligned_files = map { "alignments/$_/model/aligned.$GIZA_MERGE" } (0..$lastchunk); |
| } elsif ($ALIGNER eq "berkeley") { |
| @aligned_files = map { "alignments/$_/training.align" } (0..$lastchunk); |
| } elsif ($ALIGNER eq "jacana") { |
| @aligned_files = map { "alignments/$_/training.align" } (0..$lastchunk); |
| } |
| my $aligned_file_list = join(" ", @aligned_files); |
| |
| # wait for all the threads to finish |
| # $pool->join(); |
| |
| # combine the alignments |
| $cachepipe->cmd("aligner-combine", |
| "cat $aligned_file_list > alignments/training.align", |
| $aligned_files[-1], |
| "alignments/training.align"); |
| |
| # at the end, all the files are concatenated into a single alignment file parallel to the input |
| # corpora |
| $ALIGNMENT = "alignments/training.align"; |
| } |
| |
| maybe_quit("ALIGN"); |
| |
| |
| ## PARSE ############################################################# |
| |
| PARSE: |
| ; |
| |
| # Parsing only happens for SAMT grammars. |
| |
| if ($FIRST_STEP eq "PARSE" and ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrasal" or $GRAMMAR_TYPE eq "phrase")) { |
| print STDERR "* FATAL: parsing doesn't apply to hiero grammars; You need to add '--type samt|ghkm'\n"; |
| exit; |
| } |
| |
| if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") { |
| |
| # If the user passed in the already-parsed corpus, use that (after copying it into place) |
| if (defined $TRAIN{parsed} && -e $TRAIN{parsed}) { |
| # copy and adjust the location of the file to its canonical location |
| system("cp $TRAIN{parsed} $DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET"; |
| } else { |
| |
| system("mkdir -p $DATA_DIRS{train}") unless -e $DATA_DIRS{train}; |
| |
| $cachepipe->cmd("build-vocab", |
| "cat $TRAIN{target} | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{train}/vocab.$TARGET", |
| $TRAIN{target}, |
| "$DATA_DIRS{train}/vocab.$TARGET"); |
| |
| my $file_to_parse = (exists $TRAIN{mixedcase}) ? $TRAIN{mixedcase} : $TRAIN{target}; |
| |
| if ($NUM_JOBS > 1) { |
| # the black-box parallelizer model doesn't work with multiple |
| # threads, so we're always spawning single-threaded instances here |
| |
| # open PARSE, ">parse.sh" or die; |
| # print PARSE "cat $TRAIN{target} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed.mc | perl -pi -e 's/(\\S+)\\)/lc(\$1).\")\"/ge' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET > $DATA_DIRS{train}/corpus.parsed.$TARGET\n"; |
| # close PARSE; |
| # chmod 0755, "parse.sh"; |
| # $cachepipe->cmd("parse", |
| # "setsid ./parse.sh", |
| # "$TRAIN{target}", |
| # "$DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| |
| $cachepipe->cmd("parse", |
| "$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -p 8g -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET", |
| "$TRAIN{target}", |
| "$DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| } else { |
| # Multi-threading in the Berkeley parser is broken, so we use a black-box parallelizer on top |
| # of it. |
| $cachepipe->cmd("parse", |
| "$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_THREADS --use-fork -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET", |
| "$TRAIN{target}", |
| "$DATA_DIRS{train}/corpus.parsed.$TARGET"); |
| } |
| |
| $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET"; |
| } |
| } |
| |
| maybe_quit("PARSE"); |
| |
| ## THRAX ############################################################# |
| |
| GRAMMAR: |
| ; |
| THRAX: |
| ; |
| PHRASE: |
| ; |
| |
| system("mkdir -p $DATA_DIRS{train}") unless -d $DATA_DIRS{train}; |
| |
| if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") { |
| |
| # if we jumped right here, $TRAIN{target} should be parsed |
| if (exists $TRAIN{parsed}) { |
| # parsing step happened in-script or a parsed corpus was passed in explicitly, all is well |
| |
| } elsif (already_parsed($TRAIN{target})) { |
| # skipped straight to this step, passing a parsed corpus |
| |
| $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET"; |
| |
| $cachepipe->cmd("cp-train-$TARGET", |
| "cp $TRAIN{target} $TRAIN{parsed}", |
| $TRAIN{target}, |
| $TRAIN{parsed}); |
| |
| $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET"; |
| |
| # now extract the leaves of the parsed corpus |
| $cachepipe->cmd("extract-leaves", |
| "cat $TRAIN{parsed} | perl -pe 's/\\(.*?(\\S\+)\\)\+?/\$1/g' | perl -pe 's/\\)//g' > $TRAIN{target}", |
| $TRAIN{parsed}, |
| $TRAIN{target}); |
| |
| if ($TRAIN{source} ne "$DATA_DIRS{train}/corpus.$SOURCE") { |
| $cachepipe->cmd("cp-train-$SOURCE", |
| "cp $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE", |
| $TRAIN{source}, "$DATA_DIRS{train}/corpus.$SOURCE"); |
| $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE"; |
| } |
| |
| } else { |
| print "* FATAL: You requested to build an SAMT grammar, but provided an\n"; |
| print " unparsed corpus. Please re-run the pipeline and begin no later\n"; |
| print " than the PARSE step (--first-step PARSE), or pass in a parsed corpus\n"; |
| print " using --parsed-corpus CORPUS.\n"; |
| exit 1; |
| } |
| |
| } |
| |
| # we may have skipped directly to this step, in which case we need to |
| # ensure an alignment was provided |
| if (! defined $ALIGNMENT) { |
| print "* FATAL: no alignment file specified\n"; |
| exit(1); |
| } |
| |
| # Look for a pre-existing grammar, since building it is expensive, and something we want to |
| # avoid if this is a rerun |
| if (-e "grammar.gz" && ! -z "grammar.gz") { |
| chomp(my $is_empty = `gzip -cd grammar.gz | head | wc -l`); |
| $GRAMMAR_FILE = "grammar.gz" unless ($is_empty == 0); |
| } |
| |
| # If the grammar file wasn't specified |
| if (! defined $GRAMMAR_FILE) { |
| |
| my $target_file = ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrasal" or $GRAMMAR_TYPE eq "phrase") ? $TRAIN{target} : $TRAIN{parsed}; |
| |
| if ($GRAMMAR_TYPE eq "ghkm") { |
| if ($GHKM_EXTRACTOR eq "galley") { |
| $cachepipe->cmd("ghkm-extract", |
| "java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/splittabs.pl ghkm-mapping.gz grammar.gz", |
| $ALIGNMENT, |
| "grammar.gz"); |
| } elsif ($GHKM_EXTRACTOR eq "moses") { |
| # XML-ize, also replacing unary chains with OOV at the bottom by removing their unary parents |
| $cachepipe->cmd("ghkm-moses-xmlize", |
| "cat $target_file | perl -pe 's/\\(\\S+ \\(OOV (.*?)\\)\\)/(OOV \$1)/g' | $MOSES/scripts/training/wrappers/berkeleyparsed2mosesxml.perl > $DATA_DIRS{train}/corpus.xml", |
| # "cat $target_file | perl -pe 's/\\(\\S+ \\(OOV (.*?)\\)\\)/(OOV \$1)/g' > $DATA_DIRS{train}/corpus.ptb", |
| $target_file, |
| "$DATA_DIRS{train}/corpus.xml"); |
| |
| if (! -e "$DATA_DIRS{train}/corpus.$SOURCE") { |
| system("ln -sf $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE"); |
| } |
| |
| if ($ALIGNMENT ne "alignments/training.align") { |
| system("mkdir alignments") unless -d "alignments"; |
| system("ln -sf $ALIGNMENT alignments/training.align"); |
| $ALIGNMENT = "alignments/training.align"; |
| } |
| |
| system("mkdir model"); |
| $cachepipe->cmd("ghkm-moses-extract", |
| "$MOSES/scripts/training/train-model.perl --first-step 4 --last-step 6 --corpus $DATA_DIRS{train}/corpus --ghkm --f $SOURCE --e xml --alignment-file alignments/training --alignment align --target-syntax --cores $NUM_THREADS --pcfg --alt-direct-rule-score-1 --ghkm-tree-fragments --glue-grammar --glue-grammar-file glue-grammar.ghkm --extract-options \"$EXTRACT_OPTIONS --UnknownWordLabel oov-labels.txt\"", |
| "$DATA_DIRS{train}/corpus.xml", |
| "glue-grammar.ghkm", |
| "model/rule-table.gz"); |
| |
      open LABELS, "oov-labels.txt" or die "FATAL: can't read oov-labels.txt";
| chomp(my @labels = <LABELS>); |
| close LABELS; |
| my $oov_list = "\"" . join(" ", @labels) . "\""; |
| $JOSHUA_ARGS .= " -oov-list $oov_list"; |
| |
| $cachepipe->cmd("ghkm-moses-convert", |
| "gzip -cd model/rule-table.gz | /home/hltcoe/mpost/code/joshua/scripts/support/moses2joshua_grammar.pl -m rule-fragment-map.txt | gzip -9n > grammar.gz", |
| "model/rule-table.gz", |
| "grammar.gz"); |
| |
| } else { |
| print STDERR "* FATAL: no such GHKM extractor '$GHKM_EXTRACTOR'\n"; |
| exit(1); |
| } |
| |
| $GRAMMAR_FILE = "grammar.gz"; |
| |
| } elsif ($GRAMMAR_TYPE eq "phrase") { |
| |
| mkdir("model") unless -d "model"; |
| |
| if ($ALIGNMENT ne "alignments/training.align") { |
| system("mkdir alignments") unless -d "alignments"; |
| system("ln -sf $ALIGNMENT alignments/training.align"); |
| $ALIGNMENT = "alignments/training.align"; |
| } |
| |
| # Compute lexical probabilities |
| $cachepipe->cmd("build-lex-trans", |
| "$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 4 -last-step 4 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -lexical-file model/lex -alignment-file alignments/training -alignment align -corpus $TRAIN{prefix}", |
| $TRAIN{source}, |
| $TRAIN{target}, |
| $ALIGNMENT, |
| "model/lex.e2f", |
| "model/lex.f2e" |
| ); |
| |
| # Extract the phrases |
| $cachepipe->cmd("extract-phrases", |
| "$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 5 -last-step 5 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -alignment-file alignments/training -alignment align -extract-file model/extract -corpus $TRAIN{prefix}", |
| $TRAIN{source}, |
| $TRAIN{target}, |
| $ALIGNMENT, |
| "model/extract.sorted.gz", |
| "model/extract.inv.sorted.gz" |
| ); |
| |
| # Build the phrase table |
| $cachepipe->cmd("build-ttable", |
| "$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 6 -last-step 6 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -alignment grow-diag-final-and -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -extract-file model/extract -lexical-file model/lex -phrase-translation-table model/phrase-table", |
| "model/lex.e2f", |
| "model/extract.sorted.gz" |
| ); |
| |
| $GRAMMAR_FILE = "model/phrase-table.gz"; |
| |
| } elsif ($GRAMMAR_TYPE eq "samt" or $GRAMMAR_TYPE eq "hiero") { |
| |
| # Since this is an expensive step, we short-circuit it if the grammar file is present. I'm not |
| # sure that this is the right behavior. |
| |
| # create the input file |
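    # Each line of the Thrax input is "source ||| target ||| alignment"; the
    # greps drop lines containing empty parses ("()") or an empty final field.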
| $cachepipe->cmd("thrax-input-file", |
| "paste $TRAIN{source} $target_file $ALIGNMENT | perl -pe 's/\\t/ ||| /g' | grep -v '()' | grep -v '||| \\+\$' > $DATA_DIRS{train}/thrax-input-file", |
| $TRAIN{source}, $target_file, $ALIGNMENT, |
| "$DATA_DIRS{train}/thrax-input-file"); |
| |
| |
| # Rollout the hadoop cluster if needed. This causes $HADOOP to be defined (pointing to the |
| # unrolled directory). |
| start_hadoop_cluster() unless defined $HADOOP; |
| |
| # put the hadoop files in place |
| my $THRAXDIR; |
| my $thrax_input; |
| if ($HADOOP eq "hadoop") { |
| $THRAXDIR = "thrax"; |
| |
    $thrax_input = "$DATA_DIRS{train}/thrax-input-file";
| |
| } else { |
| $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR"; |
| $THRAXDIR =~ s#/#_#g; |
| |
| $cachepipe->cmd("thrax-prep", |
| "$HADOOP/bin/hadoop fs -rm -r $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file", |
| "$DATA_DIRS{train}/thrax-input-file", |
| "grammar.gz"); |
| |
| $thrax_input = "$THRAXDIR/input-file"; |
| } |
| |
| # copy the thrax config file |
| my $thrax_file = "thrax-$GRAMMAR_TYPE.conf"; |
| system("grep -v ^input-file $THRAX_CONF_FILE > $thrax_file.tmp"); |
| system("echo input-file $thrax_input >> $thrax_file.tmp"); |
| system("mv $thrax_file.tmp $thrax_file"); |
| |
| $cachepipe->cmd("thrax-run", |
| "$HADOOP/bin/hadoop jar $THRAX/bin/thrax.jar -D mapred.child.java.opts='-Xmx$HADOOP_MEM' $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; $HADOOP/bin/hadoop fs -getmerge $THRAXDIR/final/ grammar.gz", |
| # "$HADOOP/bin/hadoop jar $THRAX/bin/thrax.jar -D mapred.child.java.opts='-Xmx$HADOOP_MEM' $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; $HADOOP/bin/hadoop fs -getmerge $THRAXDIR/final/ grammar.gz; $HADOOP/bin/hadoop fs -rmr $THRAXDIR", |
| "$DATA_DIRS{train}/thrax-input-file", |
| $thrax_file, |
| "grammar.gz"); |
| #perl -pi -e 's/\.?0+\b//g' grammar; |
| |
| stop_hadoop_cluster() if $HADOOP eq "hadoop"; |
| |
| # cache the thrax-prep step, which depends on grammar.gz |
| if ($HADOOP ne "hadoop") { |
| $cachepipe->cmd("thrax-prep", "--cache-only"); |
| } |
| |
| # clean up |
| # TODO: clean up real hadoop clusters too |
| # if ($HADOOP eq "hadoop") { |
| # system("rm -rf $THRAXDIR hadoop hadoop-0.20.2"); |
| # } |
| |
| $GRAMMAR_FILE = "grammar.gz"; |
| } else { |
| |
| print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n"; |
| print STDERR "* Please try one of the following:\n"; |
| print STDERR "* - Specify a grammar with --grammar /path/to/grammar\n"; |
| print STDERR "* - Delete any existing grammar named 'grammar.gz'\n"; |
| |
| exit 1; |
| } |
| } |
| |
| maybe_quit("THRAX"); |
| maybe_quit("GRAMMAR"); |
| |
| ## TUNING ############################################################## |
| TUNE: |
| ; |
| |
| # prep the tuning data, unless already prepped |
| if (! $PREPPED{TUNE} and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE); |
| $TUNE{source} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$SOURCE"; |
| $TUNE{target} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$TARGET"; |
| $PREPPED{TUNE} = 1; |
| } |
| |
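# Compile an ARPA-format LM (possibly gzipped) into the binary format of the
# selected LM runtime (KenLM or BerkeleyLM), returning the compiled filename.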
| sub compile_lm($) { |
| my $lmfile = shift; |
| if ($LM_TYPE eq "kenlm") { |
| my $kenlm_file = basename($lmfile, ".gz") . ".kenlm"; |
| $cachepipe->cmd("compile-kenlm", |
| "$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary $lmfile $kenlm_file", |
| $lmfile, $kenlm_file); |
| return $kenlm_file; |
| |
| } elsif ($LM_TYPE eq "berkeleylm") { |
| my $berkeleylm_file = basename($lmfile, ".gz") . ".berkeleylm"; |
| $cachepipe->cmd("compile-berkeleylm", |
| "java -cp $JOSHUA/lib/berkeleylm.jar -server -mx$BUILDLM_MEM edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa $lmfile $berkeleylm_file", |
| $lmfile, $berkeleylm_file); |
| return $berkeleylm_file; |
| |
| } else { |
| print "* FATAL: trying to compile an LM to neither kenlm nor berkeleylm."; |
| exit 2; |
| } |
| } |
| |
| # Build the language model if needed |
| if (defined $TRAIN{target} and $DO_BUILD_LM_FROM_CORPUS) { |
| |
| # make sure the training data is prepped |
| if (! $PREPPED{TRAIN} and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("train",\@CORPORA,$MAXLEN); |
| |
| $TRAIN{prefix} = "$DATA_DIRS{train}/corpus"; |
| foreach my $lang ($SOURCE,$TARGET) { |
| system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang"); |
| } |
| $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE"; |
| $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET"; |
| $PREPPED{TRAIN} = 1; |
| } |
| |
| my $lmfile = "lm.gz"; |
| |
| # sort and uniq the training data |
| $cachepipe->cmd("lm-sort-uniq", |
| "$CAT $TRAIN{target} | sort -u -T $TMPDIR -S $BUILDLM_MEM | gzip -9n > $TRAIN{target}.uniq", |
| $TRAIN{target}, |
| "$TRAIN{target}.uniq"); |
| |
| # If an NER Tagger is specified, use that to annotate the corpus before |
| # sending it off to the LM |
| my $ner_return = ner_annotate("$TRAIN{target}.uniq", "$TRAIN{target}.uniq.ner", $TARGET); |
| if ($ner_return == 2) { |
| $TRAIN{ner_lm} = 1; |
| } |
| |
| my $lm_input = "$TRAIN{target}.uniq"; |
| # Choose LM input based on whether an annotated corpus was created |
| if (defined $TRAIN{ner_lm}) { |
| $lm_input = replace_tokens_with_types("$TRAIN{target}.uniq.ner"); |
| } |
| |
| if ($LM_GEN eq "srilm") { |
| my $smoothing = ($WITTEN_BELL) ? "-wbdiscount" : "-kndiscount"; |
| $cachepipe->cmd("srilm", |
| "$SRILM -order $LM_ORDER -interpolate $smoothing -unk -gt3min 1 -gt4min 1 -gt5min 1 -text $TRAIN{target}.uniq $LM_OPTIONS -lm lm.gz", |
| "$lm_input", |
| $lmfile); |
| } elsif ($LM_GEN eq "berkeleylm") { |
| $cachepipe->cmd("berkeleylm", |
| "java -ea -mx$BUILDLM_MEM -server -cp $JOSHUA/lib/berkeleylm.jar edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText $LM_ORDER lm.gz $TRAIN{target}.uniq", |
| "$lm_input", |
| $lmfile); |
| } else { |
| # Make sure it exists |
| if (! -e "$JOSHUA/bin/lmplz") { |
| print "* FATAL: $JOSHUA/bin/lmplz (for building LMs) does not exist.\n"; |
| print " This is often a problem with the boost libraries (particularly threaded\n"; |
| print " versus unthreaded).\n"; |
| exit 1; |
| } |
| |
    # lmplz requires an uppercase memory suffix (e.g., "2G")
    my $mem = uc $BUILDLM_MEM;
    $cachepipe->cmd("kenlm",
                    "$JOSHUA/bin/lmplz -o $LM_ORDER -T $TMPDIR -S $mem --verbose_header --text $lm_input $LM_OPTIONS | gzip -9n > lm.gz",
                    "$lm_input",
| $lmfile); |
| } |
| |
| if ((! $MERGE_LMS) && ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm")) { |
| push (@LMFILES, get_absolute_path(compile_lm $lmfile, $RUNDIR)); |
| } else { |
| push (@LMFILES, get_absolute_path($lmfile, $RUNDIR)); |
| } |
| } |
| |
| if ($DO_BUILD_CLASS_LM) { |
  # Build a class LM
  # First check to see if a class map and class corpus are defined
  if (! defined $CLASS_LM_CORPUS or ! defined $CLASS_MAP) {
    print "* FATAL: A class LM corpus (--class-lm-corpus) and a class map (--class-map) are required with the --class-lm switch\n";
    exit 1;
  }
  if (! -e $CLASS_LM_CORPUS or ! -e $CLASS_MAP) {
    print "* FATAL: Could not find the class LM corpus or map\n";
    exit 1;
  }
| if (! -e "$JOSHUA/bin/lmplz") { |
| print "* FATAL: $JOSHUA/bin/lmplz (for building LMs) does not exist.\n"; |
| print " This is often a problem with the boost libraries (particularly threaded\n"; |
| print " versus unthreaded).\n"; |
| exit 1; |
| } |
| |
  # lmplz requires an uppercase memory suffix (e.g., "2G")
  my $mem = uc $BUILDLM_MEM;
  my $class_lmfile = "class_lm.gz";
  $cachepipe->cmd("classlm",
                  "$JOSHUA/bin/lmplz -o $CLASS_LM_ORDER -T $TMPDIR -S $mem --discount_fallback=0.5 1 1.5 --verbose_header --text $CLASS_LM_CORPUS $LM_OPTIONS | gzip -9n > $class_lmfile",
| "$CLASS_LM_CORPUS", |
| $class_lmfile); |
| } |
| |
| if ($MERGE_LMS) { |
| # Merge @LMFILES. |
| my $merged_lm = "lm-merged.gz"; |
| |
  # Use the first target reference if there are multiple (numbered $TUNE{target}.0, .1, ...)
  my $target_ref = (-e $TUNE{target}) ? $TUNE{target} : "$TUNE{target}.0";
| |
| $cachepipe->cmd("merge-lms", |
| "$JOSHUA/scripts/support/merge_lms.py " |
| . "@LMFILES " |
| . "$target_ref " |
| . "lm-merged.gz " |
| . "--temp-dir data/merge_lms ", |
| @LMFILES, |
| $merged_lm); |
| |
| # Empty out @LMFILES. |
| @LMFILES = (); |
| |
| # Compile merged LM |
| if ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm") { |
| push (@LMFILES, get_absolute_path(compile_lm $merged_lm, $RUNDIR)); |
| |
| } else { |
| push (@LMFILES, get_absolute_path($merged_lm, $RUNDIR)); |
| } |
| } |
| |
| system("mkdir -p $DATA_DIRS{tune}") unless -d $DATA_DIRS{tune}; |
| |
| # figure out how many references there are |
| my $numrefs = get_numrefs($TUNE{target}); |
| |
# make sure the dev source exists
if (! -e $TUNE{source}) {
  print STDERR "* FATAL: couldn't find tuning source file '$TUNE{source}'\n";
| exit 1; |
| } |
| if ($numrefs > 1) { |
| for my $i (0..$numrefs-1) { |
| if (! -e "$TUNE{target}.$i") { |
| print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}.$i'\n"; |
| exit 1; |
| } |
| } |
| } else { |
| if (! -e $TUNE{target}) { |
| print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}'\n"; |
| exit 1; |
| } |
| } |
| |
| # Set $TUNE_GRAMMAR to a specifically-passed tuning grammar or the |
| # main default grammar. Then update it if filtering was requested and |
| # is possible. |
| my $TUNE_GRAMMAR = $_TUNE_GRAMMAR_FILE || $GRAMMAR_FILE; |
| if ($DO_FILTER_TM and defined $TUNE_GRAMMAR and ! $DOING_LATTICES and ! defined $_TUNE_GRAMMAR_FILE) { |
| $TUNE_GRAMMAR = "$DATA_DIRS{tune}/grammar.filtered.gz"; |
| |
| $cachepipe->cmd("filter-tune", |
| "$SCRIPTDIR/support/filter_grammar.sh -g $GRAMMAR_FILE $FILTERING -v $TUNE{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TUNE_GRAMMAR", |
| $GRAMMAR_FILE, |
| $TUNE{source}, |
| $TUNE_GRAMMAR); |
| } |
| |
| # Create the glue grammars. This is done by looking at all the symbols in the grammar file and |
| # creating all the needed rules. This is only done if there is a $TUNE_GRAMMAR defined (which |
| # can be skipped if we skip straight to the tuning step). |
| if (defined $TUNE_GRAMMAR and $GRAMMAR_TYPE ne "phrase") { |
| if (! defined $GLUE_GRAMMAR_FILE) { |
| $cachepipe->cmd("glue-tune", |
| "java -Xmx2g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue", |
| get_file_from_grammar($TUNE_GRAMMAR), |
| "$DATA_DIRS{tune}/grammar.glue"); |
| $GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue"; |
| } else { |
| # just create a symlink to it |
| my $filename = $DATA_DIRS{tune} . "/" . basename($GLUE_GRAMMAR_FILE); |
| system("ln -sf $GLUE_GRAMMAR_FILE $filename"); |
| } |
| } |
| |
| # Add in feature functions |
| my $weightstr = ""; |
| my @feature_functions; |
| my $lm_index = 0; |
| for my $i (0..$#LMFILES) { |
| if ($LM_STATE_MINIMIZATION) { |
| push(@feature_functions, "StateMinimizingLanguageModel -lm_order $LM_ORDER -lm_file $LMFILES[$i]"); |
| } else { |
| push(@feature_functions, "LanguageModel -lm_type $LM_TYPE -lm_order $LM_ORDER -lm_file $LMFILES[$i]"); |
| } |
| |
| $weightstr .= "lm_$i 1 "; |
| $lm_index += 1; |
| } |
| |
| if ($DO_BUILD_CLASS_LM) { |
| push(@feature_functions, "LanguageModel -lm_type kenlm -lm_order 9 -lm_file $RUNDIR/class_lm.gz -class_map $CLASS_MAP"); |
| $weightstr .= "lm_$lm_index 1 "; |
| } |
| |
| if ($DOING_LATTICES) { |
| push(@feature_functions, "SourcePath"); |
| } |
| if ($GRAMMAR_TYPE eq "phrase") { |
| push(@feature_functions, "Distortion"); |
| push(@feature_functions, "PhrasePenalty"); |
| |
| $weightstr .= "Distortion 1.0 PhrasePenalty 1.0 "; |
| } |
| my $feature_functions = join(" ", map { "-feature-function \"$_\"" } @feature_functions); |
| |
| # Build out the weight string |
| my $TM_OWNER = "pt"; |
| my $GLUE_OWNER = "glue"; |
| if (defined $TUNE_GRAMMAR) { |
| my @tm_features = get_features($TUNE_GRAMMAR); |
| foreach my $feature (@tm_features) { |
| # Only assign initial weights to dense features |
| $weightstr .= "tm_${TM_OWNER}_$feature 1 " if ($feature =~ /^\d+$/); |
| } |
| |
| # Glue grammars are only needed for hierarchical models |
| if ($GRAMMAR_TYPE ne "phrase") { |
| # Glue grammar |
| $weightstr .= "tm_${GLUE_OWNER}_0 1 "; |
| } |
| } |
| |
| my $tm_type = $GRAMMAR_TYPE; |
| if ($GRAMMAR_TYPE eq "phrase") { |
| $tm_type = "moses"; |
| } |
| |
| sub get_file_from_grammar { |
| # Cachepipe doesn't work on directories, so we need to make sure we |
| # have a representative file to use to cache grammars. Returns undef if file not found |
| my ($grammar_file) = @_; |
| return undef unless defined $grammar_file and -e $grammar_file; |
| my $file = (-d $grammar_file) ? "$grammar_file/slice_00000.source" : $grammar_file; |
| return $file; |
| } |
| |
| my $tunedir = "$RUNDIR/tune"; |
| system("mkdir -p $tunedir") unless -d $tunedir; |
| |
| # Build the filtered tuning model |
| my $tunemodeldir = "$tunedir/model"; |
| |
| # We build up this string with TMs to substitute in, if any are provided |
| my $tm_switch = ""; |
| my $tm_copy_config_args = ""; |
| if (defined $TUNE_GRAMMAR) { |
| $tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm"; |
| $tm_switch .= " $TUNE_GRAMMAR"; |
| $tm_copy_config_args = " -tm0/type $tm_type -tm0/owner ${TM_OWNER} -tm0/maxspan $MAXSPAN"; |
| } |
| # If we specified a new glue grammar, put that in |
| if (defined $GLUE_GRAMMAR_FILE) { |
| $tm_switch .= " --tm $GLUE_GRAMMAR_FILE"; |
| $tm_copy_config_args .= " -tm1/owner ${GLUE_OWNER}"; |
| } else { |
| # if there is no glue grammar, remove it from the config template |
| $tm_copy_config_args .= " -tm1 DELETE"; |
| } |
| |
| # Now build the bundle |
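# run_bundler.py assembles a self-contained decoder directory: it symlinks (or,
# with --pack-tm, packs) the grammars, rewrites joshua.config according to the
# --copy-config-options string, and includes the run-joshua.sh wrapper invoked below.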
| $cachepipe->cmd("tune-bundle", |
| "$BUNDLER --force --symlink --absolute --verbose $JOSHUA_CONFIG $tunemodeldir --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false -search $SEARCH_ALGORITHM -weights \"$weightstr\" $feature_functions ${tm_copy_config_args}' ${tm_switch}", |
| $JOSHUA_CONFIG, |
                get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG);
| |
| # Update the tune grammar to its new location in the bundle |
| if (defined $TUNE_GRAMMAR) { |
| # Now update the tuning grammar to its new path |
| my $basename = basename($TUNE_GRAMMAR); |
| if (-e "tune/model/$basename") { |
| $TUNE_GRAMMAR = "tune/model/$basename"; |
| } elsif (-e "tune/model/$basename.packed") { |
| $TUNE_GRAMMAR = "tune/model/$basename.packed"; |
| } else { |
| print STDERR "* FATAL: tune model bundling didn't produce a grammar?"; |
| exit 1; |
| } |
| } |
| |
| # Update the config file location |
| $JOSHUA_CONFIG = "$tunedir/model/joshua.config"; |
| |
| # Write the decoder run command. The decoder will use the config file in the bundled |
| # directory, continually updating it. |
| $JOSHUA_ARGS .= " -output-format \"%i ||| %s ||| %f ||| %c\""; |
| |
| open DEC_CMD, ">$tunedir/decoder_command"; |
| print DEC_CMD "cat $TUNE{source} | $tunedir/model/run-joshua.sh -m $JOSHUA_MEM -config $JOSHUA_CONFIG -threads $NUM_THREADS $JOSHUA_ARGS > $tunedir/output.nbest 2> $tunedir/joshua.log\n"; |
| close(DEC_CMD); |
| chmod(0755,"$tunedir/decoder_command"); |
| |
| # tune |
| if ($TUNER eq "mert" or $TUNER eq "zmert" or $TUNER eq "pro" or $TUNER eq "mira" or $TUNER eq "local-mira") { |
| $cachepipe->cmd($TUNER, |
| "$SCRIPTDIR/training/run_tuner.py $TUNE{source} $TUNE{target} --tunedir $tunedir --tuner $TUNER --decoder-config $JOSHUA_CONFIG --iterations $TUNER_ITERATIONS", |
| $TUNE{source}, |
| $JOSHUA_CONFIG, |
| get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG, |
| "$tunedir/joshua.config.final"); |
| |
| } elsif ($TUNER eq "kbmira") { # Moses' batch MIRA |
| my $refs_path = $TUNE{target}; |
| $refs_path .= "." if (get_numrefs($TUNE{target}) > 1); |
| |
| my $extra_args = $JOSHUA_ARGS; |
| $extra_args =~ s/"/\\"/g; |
| $cachepipe->cmd("mira", |
| "$SCRIPTDIR/training/mira/run-mira.pl --mertdir $MOSES/bin --rootdir $MOSES/scripts --batch-mira --working-dir $tunedir --maximum-iterations $TUNER_ITERATIONS --nbest $NBEST --no-filter-phrase-table --decoder-flags \"-m $JOSHUA_MEM -threads $NUM_THREADS -moses $extra_args\" $TUNE{source} $refs_path $tunedir/model/run-joshua.sh $tunedir/model/joshua.config > $tunedir/mira.log 2>&1", |
| get_file_from_grammar($TUNE_GRAMMAR), |
| $TUNE{source}, |
| "$tunedir/joshua.config.final"); |
| } |
| |
| $JOSHUA_CONFIG = "$tunedir/joshua.config.final"; |
| |
# Quit if tuning was the last step requested.
maybe_quit("TUNE");
| |
| |
| ################################################################# |
| ## TESTING ###################################################### |
| ################################################################# |
| |
| TEST: |
| ; |
| |
| # prepare the testing data |
| if (! $PREPPED{TEST} and $DO_PREPARE_CORPORA) { |
| my $prefixes = prepare_data("test",[$TEST],$MAXLEN_TEST); |
| $TEST{source} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$SOURCE"; |
| $TEST{target} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$TARGET"; |
| $PREPPED{TEST} = 1; |
| } |
| |
| system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test}; |
| |
| # Define the test grammar, if it was provided |
| my $TEST_GRAMMAR = $_TEST_GRAMMAR_FILE || $GRAMMAR_FILE; |
| |
  # Now filter the test grammar, if one is defined and filtering is enabled
| if ($DO_FILTER_TM and defined $TEST_GRAMMAR and ! $DOING_LATTICES and ! defined $_TEST_GRAMMAR_FILE) { |
| $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz"; |
| |
| $cachepipe->cmd("filter-test", |
| "$SCRIPTDIR/support/filter_grammar.sh -g $GRAMMAR_FILE $FILTERING -v $TEST{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TEST_GRAMMAR", |
| $GRAMMAR_FILE, |
| $TEST{source}, |
| $TEST_GRAMMAR); |
| } |
| |
| my $testdir = "$RUNDIR/test"; |
| |
  # Create and update the glue file if a test grammar was provided (if not, we assume the
  # grammars are already listed in the $JOSHUA_CONFIG)
| if (defined $TEST_GRAMMAR and $GRAMMAR_TYPE ne "phrase") { |
| if (! defined $GLUE_GRAMMAR_FILE) { |
| $cachepipe->cmd("glue-test", |
| "java -Xmx1g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue", |
| $TEST_GRAMMAR, |
| "$DATA_DIRS{test}/grammar.glue"); |
| $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue"; |
| |
| } else { |
| # just create a symlink to it |
| my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE); |
| if ($GLUE_GRAMMAR_FILE =~ /^\//) { |
| system("ln -sf $GLUE_GRAMMAR_FILE $filename"); |
| } else { |
| system("ln -sf $STARTDIR/$GLUE_GRAMMAR_FILE $filename"); |
| } |
| } |
| } |
| |
| $tm_switch = ""; |
| $tm_copy_config_args = ""; |
| if (defined $TEST_GRAMMAR) { |
| $tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm"; |
| $tm_switch .= " $TEST_GRAMMAR"; |
| } |
| if (defined $GLUE_GRAMMAR_FILE) { |
| $tm_switch .= " --tm $GLUE_GRAMMAR_FILE"; |
| } |
| |
| # Build the filtered testing model |
| $cachepipe->cmd("test-bundle", |
| "$BUNDLER --force --symlink --verbose $JOSHUA_CONFIG test/model --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false' ${tm_switch}", |
| $JOSHUA_CONFIG, |
| get_file_from_grammar($TEST_GRAMMAR) || $JOSHUA_CONFIG, |
| "$testdir/model/joshua.config"); |
| |
| if (defined $TEST_GRAMMAR) { |
| # Update the test grammar (if defined) to its new path |
| my $basename = basename($TEST_GRAMMAR); |
| if (-e "$testdir/model/$basename") { |
| $TEST_GRAMMAR = "$testdir/model/$basename"; |
| } elsif (-e "$testdir/model/$basename.packed") { |
| $TEST_GRAMMAR = "$testdir/model/$basename.packed"; |
| } else { |
      print STDERR "* FATAL: test model bundling didn't produce a grammar?\n";
| exit 1; |
| } |
| } |
| |
| my $testrun = get_absolute_path("test", $RUNDIR); |
| my $bestoutput = "$testrun/output"; |
| my $nbestoutput = "$testrun/output.nbest"; |
| my $output; |
| |
| # If we're decoding a lattice, also output the source side path we chose |
| $JOSHUA_ARGS = $_JOSHUA_ARGS; |
| if ($DOING_LATTICES) { |
| $JOSHUA_ARGS .= " -maxlen 0 -output-format \"%i ||| %s ||| %e ||| %f ||| %c\""; |
| } |
| |
| if ($DO_MBR) { |
| $JOSHUA_ARGS .= " -top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\""; |
| $output = $nbestoutput; |
| } else { |
| $JOSHUA_ARGS .= " -top-n 0 -output-format %s"; |
| $output = $bestoutput; |
| } |
| |
| # Write the decoder run command |
| open DEC_CMD, ">$testrun/decoder_command"; |
| print DEC_CMD "cat $TEST{source} | $testrun/model/run-joshua.sh -m $JOSHUA_MEM -threads $NUM_THREADS $JOSHUA_ARGS > $output 2> $testrun/joshua.log\n"; |
| close(DEC_CMD); |
| chmod(0755,"$testrun/decoder_command"); |
| |
| # Decode. $output here is either $nbestoutput (if doing MBR decoding, in which case we'll |
| # need the n-best output) or $bestoutput (which only outputs the hypothesis but is tons faster) |
| $cachepipe->cmd("test-decode", |
| "$testrun/decoder_command", |
| $TEST{source}, |
| "$testrun/decoder_command", |
| "$testrun/model/joshua.config", |
| get_file_from_grammar($TEST_GRAMMAR) || "$testrun/model/joshua.config", |
| $output); |
| |
| # $cachepipe->cmd("remove-oov", |
| # "cat $testoutput | perl -pe 's/_OOV//g' > $testoutput.noOOV", |
| # $testoutput, |
| # "$testoutput.noOOV"); |
| |
| # Extract the 1-best output from the n-best file if the n-best file alone was output |
| if ($DO_MBR) { |
| $cachepipe->cmd("test-extract-onebest", |
| "java -Xmx500m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.util.ExtractTopCand $nbestoutput $bestoutput", |
| $nbestoutput, |
| $bestoutput); |
| } |
| |
| # Now compute the BLEU score on the 1-best output |
  $cachepipe->cmd("test-bleu",
                  "$JOSHUA/bin/bleu $bestoutput $TEST{target} > $testrun/bleu",
                  $bestoutput,
                  "$testrun/bleu");
| |
| # Update the BLEU summary. |
| compute_bleu_summary("$testrun/bleu", "$testrun/final-bleu"); |
| |
| if ($DO_MBR) { |
| my $numlines = `cat $TEST{source} | wc -l`; |
| $numlines--; |
| my $mbr_output = "$testrun/output.mbr"; |
| |
| $cachepipe->cmd("test-onebest-parmbr", |
| "cat $nbestoutput | java -Xmx1700m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 $NUM_THREADS > $mbr_output", |
| $nbestoutput, |
| $mbr_output); |
| |
    $cachepipe->cmd("test-bleu-mbr",
                    "$JOSHUA/bin/bleu $mbr_output $TEST{target} $numrefs > $testrun/bleu.mbr",
                    $mbr_output,
                    "$testrun/bleu.mbr");
| |
| compute_bleu_summary("$testrun/bleu.mbr", "$testrun/final-bleu-mbr"); |
| } |
| |
| compute_time_summary("$testrun/joshua.log", "$testrun/final-times"); |
| |
| # Now do the analysis |
| if ($DOING_LATTICES) { |
| # extract the source |
| my $source = "$testrun/test.lattice-path.txt"; |
| $cachepipe->cmd("test-lattice-extract-source", |
| "$JOSHUA/bin/extract-1best $nbestoutput 2 | perl -pe 's/<s> //' > $source", |
| $nbestoutput, $source); |
| |
| analyze_testrun($bestoutput,$source,$TEST{target}); |
| } else { |
| analyze_testrun($bestoutput,$TEST{source},$TEST{target}); |
| } |
| |
| |
| ###################################################################### |
| ## SUBROUTINES ####################################################### |
| ###################################################################### |
| LAST: |
| 1; |
| |
# Does normalization, tokenization, length-trimming, and lowercasing of
# training, tuning, and test data.
| # $label: one of train, tune, or test |
| # $corpora: arrayref of files (multiple allowed for training data) |
| # $maxlen: maximum length (only applicable to training) |
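#
# A typical call and its return value (hypothetical, mirroring the test-set
# call above):
#
#   my $prefixes = prepare_data("test", [$TEST], $MAXLEN_TEST);
#   # with $MAXLEN_TEST == 0:
#   #   $prefixes->{tokenized}  => "test.tok"
#   #   $prefixes->{shortened}  => "test.tok"     (no trimming applied)
#   #   $prefixes->{lowercased} => "test.tok.lc"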
| sub prepare_data { |
| my ($label,$corpora,$maxlen) = @_; |
| $maxlen = 0 unless defined $maxlen; |
| |
| system("mkdir -p $DATA_DIR") unless -d $DATA_DIR; |
| system("mkdir -p $DATA_DIRS{$label}") unless -d $DATA_DIRS{$label}; |
| |
| # records the pieces that are produced |
| my %prefixes; |
| |
| # copy the data from its original location to our location |
| my $numlines = -1; |
| |
| # Build the list of extensions. For training data, there may be multiple corpora; for |
| # tuning and test data, there may be multiple references. |
| my @exts = ($SOURCE); |
| my $target_corpus = "$corpora->[0].$TARGET"; |
| push(@exts, $TARGET) if -e $target_corpus; |
| for (my $i = 0; ; $i++) { |
| my $file = "$target_corpus.$i"; |
| if (-e $file) { |
| push(@exts, "$TARGET.$i"); |
| } else { |
| last; |
| } |
| } |
| |
| # Read through all input files, concatenate them (if multiple were passed), and filter them |
| # First, assemble the file handles |
| my (@infiles, @indeps, @outfiles); |
| foreach my $ext (@exts) { |
| my @files = map { "$_.$ext" } @$corpora; |
| push(@indeps, @files); |
| if (@files > 1) { |
| push(@infiles, "<(cat " . join(" ", @files) . ")"); |
| } else { |
| push(@infiles, $files[0]); |
| } |
| push (@outfiles, "$DATA_DIRS{$label}/$label.$ext.gz"); |
| } |
| |
| my $infiles = join(" ", @infiles); |
| my $outfiles = join(" ", @outfiles); |
| $cachepipe->cmd("$label-copy-and-filter", |
| "paste $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/training/split2files.pl $outfiles", |
| @indeps, @outfiles); |
| # Done concatenating and filtering files |
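
  # As an example (hypothetical paths), two training corpora "a" and "b" with
  # extensions ("fr", "en") produce:
  #
  #   @infiles  = ("<(cat a.fr b.fr)", "<(cat a.en b.en)")
  #   @outfiles = ("$DATA_DIRS{train}/train.fr.gz", "$DATA_DIRS{train}/train.en.gz")
  #
  # paste(1) glues the parallel sides together line by line, so empty-line
  # filtering stays synchronized before split2files.pl separates them again.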
| |
| my $prefix = "$label"; |
| |
| # tokenize the data |
| foreach my $lang (@exts) { |
| if (-e "$DATA_DIRS{$label}/$prefix.$lang.gz") { |
| if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang.gz")) { |
| system("cp $DATA_DIRS{$label}/$prefix.$lang.gz $DATA_DIRS{$label}/$prefix.tok.$lang.gz"); |
| } else { |
| my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET; |
        my $ext = $lang; $ext =~ s/\.\d+$//;  # strip any reference index (e.g., "en.10" -> "en")
| $cachepipe->cmd("$label-tokenize-$lang", |
| "$CAT $DATA_DIRS{$label}/$prefix.$lang.gz | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null | gzip -9n > $DATA_DIRS{$label}/$prefix.tok.$lang.gz", |
| "$DATA_DIRS{$label}/$prefix.$lang.gz", "$DATA_DIRS{$label}/$prefix.tok.$lang.gz"); |
| } |
| |
| } |
| } |
| # extend the prefix |
| $prefix .= ".tok"; |
| $prefixes{tokenized} = $prefix; |
| |
| if ($maxlen > 0) { |
| my (@infiles, @outfiles); |
| foreach my $ext (@exts) { |
| my $infile = "$DATA_DIRS{$label}/$prefix.$ext.gz"; |
| my $outfile = "$DATA_DIRS{$label}/$prefix.$maxlen.$ext.gz"; |
| if (-e $infile) { |
| push(@infiles, $infile); |
| push(@outfiles, $outfile); |
| } |
| } |
| |
| my $infilelist = join(" ", map { "<(gzip -cd $_)" } @infiles); |
| my $outfilelist = join(" ", @outfiles); |
| |
| # trim training data |
| $cachepipe->cmd("$label-trim", |
| "paste $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/training/split2files.pl $outfilelist", |
| @infiles, |
| @outfiles); |
| $prefix .= ".$maxlen"; |
| } |
| # record this whether we shortened or not |
| $prefixes{shortened} = $prefix; |
| |
| # lowercase |
| foreach my $lang (@exts) { |
| if (-e "$DATA_DIRS{$label}/$prefix.$lang.gz") { |
| if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang.gz")) { |
| system("gzip -cd $DATA_DIRS{$label}/$prefix.$lang.gz > $DATA_DIRS{$label}/$prefix.lc.$lang"); |
| } else { |
| $cachepipe->cmd("$label-lowercase-$lang", |
| "gzip -cd $DATA_DIRS{$label}/$prefix.$lang.gz | $SCRIPTDIR/lowercase.perl > $DATA_DIRS{$label}/$prefix.lc.$lang", |
| "$DATA_DIRS{$label}/$prefix.$lang.gz", |
| "$DATA_DIRS{$label}/$prefix.lc.$lang"); |
| } |
| } |
| } |
| $prefix .= ".lc"; |
| $prefixes{lowercased} = $prefix; |
| |
| foreach my $lang (@exts) { |
| if (-e "$DATA_DIRS{$label}/$prefixes{lowercased}.$lang") { |
| system("ln -sf $prefixes{lowercased}.$lang $DATA_DIRS{$label}/corpus.$lang"); |
| } |
| } |
| |
| # Build a vocabulary |
| foreach my $ext (@exts) { |
| $cachepipe->cmd("$label-vocab-$ext", |
| "cat $DATA_DIRS{$label}/corpus.$ext | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{$label}/vocab.$ext", |
| "$DATA_DIRS{$label}/corpus.$ext", |
| "$DATA_DIRS{$label}/vocab.$ext"); |
| } |
| |
| return \%prefixes; |
| } |
| |
| sub maybe_quit { |
| my ($current_step) = @_; |
| |
| if (defined $LAST_STEP and $current_step eq $LAST_STEP) { |
| print "* Quitting at this step\n"; |
| exit(0); |
| } |
| } |
| |
| ## returns 1 if every sentence in the corpus begins with an open paren, |
| ## false otherwise |
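##
## e.g., a Penn Treebank-style parse line looks like "(S (NP ...) (VP ...))",
## so a leading open paren on every line is taken as evidence that the
## corpus is already parsed.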
| sub already_parsed { |
| my ($corpus) = @_; |
| |
| open(CORPUS, $corpus) or die "can't read corpus file '$corpus'\n"; |
| while (<CORPUS>) { |
| # if we see a line not beginning with an open paren, we consider |
| # the file not to be parsed |
| return 0 unless /^\(/; |
| } |
| close(CORPUS); |
| |
| return 1; |
| } |
| |
| sub not_defined { |
| my ($var) = @_; |
| |
| print "* FATAL: environment variable \$$var is not defined.\n"; |
| exit; |
| } |
| |
| # Takes a prefix. If that prefix exists, then all the references are |
| # assumed to be in that file. Otherwise, we successively append an |
| # index, looking for parallel references. |
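#
# e.g., with "refs.0" and "refs.1" on disk, get_numrefs("refs") returns 2;
# with only a single file "refs", it returns 1.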
| sub get_numrefs { |
| my ($prefix) = @_; |
| |
| if (-e "$prefix.0") { |
| my $index = 0; |
| while (-e "$prefix.$index") { |
| $index++; |
| } |
| return $index; |
| } else { |
| return 1; |
| } |
| } |
| |
| sub start_hadoop_cluster { |
| rollout_hadoop_cluster(); |
| |
| # start the cluster |
| # system("./hadoop/bin/start-all.sh"); |
| # sleep(120); |
| } |
| |
| sub rollout_hadoop_cluster { |
| # if it's not already unpacked, unpack it |
| if (! -d "hadoop") { |
| |
| my $hadoop_tmp_dir = tempdir("hadoop-0.20.2.XXXX", DIR => $TMPDIR, CLEANUP => 1); |
| system("tar xzf $JOSHUA/lib/hadoop-0.20.2.tar.gz -C $hadoop_tmp_dir"); |
| system("ln -sf $hadoop_tmp_dir/hadoop-0.20.2 hadoop"); |
| if (defined $HADOOP_CONF) { |
| print STDERR "Copying HADOOP_CONF($HADOOP_CONF) to hadoop/conf/core-site.xml\n"; |
| system("cp $HADOOP_CONF hadoop/conf/core-site.xml"); |
| } |
| } |
| |
| $ENV{HADOOP} = $HADOOP = "hadoop"; |
| $ENV{HADOOP_CONF_DIR} = ""; |
| } |
| |
| sub stop_hadoop_cluster { |
| if ($HADOOP ne "hadoop") { |
| system("hadoop/bin/stop-all.sh"); |
| } |
| } |
| |
| sub teardown_hadoop_cluster { |
| stop_hadoop_cluster(); |
| system("rm -f hadoop"); |
| } |
| |
| sub is_lattice { |
| my $file = shift; |
| open READ, "$CAT $file|" or die "can't read from potential lattice '$file'"; |
| my $line = <READ>; |
| close(READ); |
| if ($line =~ /^\(\(\(/) { |
| $DOING_LATTICES = 1; |
| $FILTERING = "-l"; |
| return 1; |
| } else { |
| return 0; |
| } |
| } |
| |
| # This function retrieves the names of all the features in the grammar. Dense features |
| # are named with consecutive integers starting at 0, while sparse features can have any name. |
| # To get the feature names from an unpacked grammar, we have to read through the whole grammar, |
| # since sparse features can be anywhere. For packed grammars, this can be read directly from |
| # the encoding. |
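#
# For example (hypothetical rule), the unpacked-grammar line
#
#   [X] ||| maison ||| house ||| 0.3 0.6 Rarity=1
#
# yields the dense features "0" and "1" plus the sparse feature "Rarity".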
| sub get_features { |
| my ($grammar) = @_; |
| |
| if (-d $grammar) { |
| chomp(my @features = `java -cp $JOSHUA/class joshua.util.encoding.EncoderConfiguration $grammar | grep ^feature: | awk '{print \$NF}'`); |
| return @features; |
| |
| } elsif (-e $grammar) { |
| my %features; |
| open GRAMMAR, "$CAT $grammar|" or die "FATAL: can't read $grammar"; |
| while (my $line = <GRAMMAR>) { |
| chomp($line); |
| my @tokens = split(/ \|\|\| /, $line); |
| # field 4 for regular grammars, field 3 for phrase tables |
| my $feature_str = ($line =~ /^\[/) ? $tokens[3] : $tokens[2]; |
| my @features = split(' ', $feature_str); |
| my $feature_no = 0; |
| foreach my $feature (@features) { |
| if ($feature =~ /=/) { |
| my ($name) = split(/=/, $feature); |
| $features{$name} = 1; |
| } else { |
| $features{$feature_no++} = 1; |
| } |
| } |
| } |
| close(GRAMMAR); |
| return keys(%features); |
| } |
| } |
| |
| # File names reflecting relative paths need to be absolute-ized for --rundir to work. |
| # Does not work with paths that do not exist! |
| sub get_absolute_path { |
| my ($file,$basedir) = @_; |
| $basedir = $STARTDIR unless defined $basedir; |
| |
| if (defined $file) { |
| $file = "$basedir/$file" unless $file =~ /^\//; |
| |
| # prepend startdir (which is absolute) unless the path is absolute. |
| my $abs_path = abs_path($file); |
| if (defined $abs_path) { |
| $file = $abs_path; |
| } |
| } |
| |
| return $file; |
| } |
| |
| sub analyze_testrun { |
| my ($output,$source,$reference) = @_; |
| my $dir = dirname($output); |
| |
| mkdir("$dir/analysis") unless -d "$dir/analysis"; |
| |
| my @references; |
| if (-e "$reference.0") { |
| my $num = 0; |
| while (-e "$reference.$num") { |
| push(@references, "$reference.$num"); |
| $num++; |
| } |
| } else { |
| push(@references, $reference); |
| } |
| |
| my $references = join(" -r ", @references); |
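
  # e.g., two references "refs.0" and "refs.1" become "-r refs.0 -r refs.1"
  # in the command below.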
| |
| $cachepipe->cmd("analyze-test", |
| "$SCRIPTDIR/analysis/sentence-by-sentence.pl -s $source -r $references $output > $dir/analysis/sentence-by-sentence.html", |
| $output, |
| "$dir/analysis/sentence-by-sentence.html"); |
| } |
| |
| sub compute_bleu_summary { |
| my ($filepattern, $outputfile) = @_; |
| |
| # Now average the runs, report BLEU |
| my @bleus; |
| open CMD, "grep ' BLEU = ' $filepattern |"; |
| while (<CMD>) { |
| my @F = split; |
| push(@bleus, 1.0 * $F[-1]); |
| } |
| close(CMD); |
| |
| if (scalar @bleus) { |
| my $final_bleu = sum(@bleus) / (scalar @bleus); |
| |
| open BLEU, ">$outputfile" or die "Can't write to $outputfile"; |
| printf(BLEU "%s / %d = %.4f\n", join(" + ", @bleus), scalar @bleus, $final_bleu); |
| close(BLEU); |
| } |
| } |
| |
| sub compute_time_summary { |
| my ($filepattern, $outputfile) = @_; |
| |
  # Now average the decoding times across the runs
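  # Each log is scanned for per-sentence lines of the form (hypothetical
  # values):
  #
  #   Input 3: Translation took 1.32 seconds
  #
  # whose fifth whitespace-separated field is the decoding time.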
| my @times; |
| foreach my $file (glob($filepattern)) { |
| open FILE, $file; |
| my $time = 0.0; |
| my $numrecs = 0; |
| while (<FILE>) { |
| next unless /^Input \d+: Translation took/; |
| my @F = split; |
| $time += $F[4]; |
| $numrecs++; |
| } |
| close(FILE); |
| |
| push(@times, $time); |
| } |
| |
| if (scalar @times) { |
| open TIMES, ">$outputfile" or die "Can't write to $outputfile"; |
| printf(TIMES "%s / %d = %s\n", join(" + ", @times), scalar(@times), 1.0 * sum(@times) / scalar(@times)); |
| close(TIMES); |
| } |
| } |
| |
| sub is_packed { |
| my ($grammar) = @_; |
| |
| if (-d $grammar && -e "$grammar/encoding") { |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| sub ner_annotate { |
| my ($inputfile, $outputfile, $lang) = @_; |
| if (defined $NER_TAGGER) { |
| # Check if NER tagger exists |
| if (! -e $NER_TAGGER) { |
      print "* FATAL: The specified NER tagger was not found\n";
| exit(1); |
| } |
| $cachepipe->cmd("ner-annotate", "$NER_TAGGER $inputfile $outputfile $lang"); |
| # Check if annotated file exists |
| if (! -e "$outputfile") { |
      print "* FATAL: The NER tagger did not create the required annotated file: $outputfile\n";
| exit(1); |
| } |
| return 2; |
| } |
| return 0; |
| } |
| |
| sub replace_tokens_with_types { |
| # Replace the tokens with types |
| my ($inputfile) = @_; |
  # NB: the flags must be "-r -i"; GNU sed would parse "-ir" as an in-place
  # edit with backup suffix "r" and leave extended regexps disabled.
  qx{sed -r -i 's:\$([A-Za-z0-9]+)_\([^)]+\):\1:g' $inputfile};
| } |