#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script implements the Joshua pipeline. It can run a complete
# pipeline --- from raw training corpora to BLEU scores on a test set
# --- and it allows jumping in at arbitrary points of the pipeline.
my $JOSHUA;
BEGIN {
if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq "" ||
! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "") {
print "Several environment variables must be set before running the pipeline. Please set:\n";
print "* \$JOSHUA to the root of the Joshua source code.\n"
if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq "");
print "* \$JAVA_HOME to the directory of your local java installation. \n"
if (! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "");
exit;
}
$JOSHUA = $ENV{JOSHUA};
unshift(@INC,"$JOSHUA/scripts/training/cachepipe");
unshift(@INC,"$JOSHUA/lib");
}
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Cwd qw[abs_path getcwd];
use POSIX qw[ceil];
use List::Util qw[max min sum];
use File::Temp qw[:mktemp tempdir];
use CachePipe;
# There are some Perl 5.10 Unicode bugs that cause problems, mostly in sub-scripts
use v5.12;
# use Thread::Pool;
# Hadoop uses a stupid hacker trick to change directories, but (per Lane Schwartz) if CDPATH
# contains ".", it triggers the printing of the directory, which kills the stupid hacker trick.
# Thus we undefine CDPATH to ensure this doesn't happen.
delete $ENV{CDPATH};
my $MOSES = $ENV{MOSES};
my $METEOR = $ENV{METEOR};
my $THRAX = "$JOSHUA/thrax";
delete $ENV{GREP_OPTIONS};
die not_defined("JAVA_HOME") unless exists $ENV{JAVA_HOME};
my (@CORPORA,@TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,@LMFILES,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$_TUNE_GRAMMAR_FILE,$_TEST_GRAMMAR_FILE,$THRAX_CONF_FILE, $_JOSHUA_CONFIG, $_JOSHUA_ARGS);
my $FIRST_STEP = "SUBSAMPLE";
my $LAST_STEP = "LAST";
my $LMFILTER = "$ENV{HOME}/code/filter/filter";
# The maximum length of training sentences (--maxlen). The threshold is applied to both sides.
my $MAXLEN = 50;
# The maximum span rules in the main grammar can be applied to
my $MAXSPAN = 20;
# The maximum length of tuning and testing sentences (--maxlen-tune and --maxlen-test).
my $MAXLEN_TUNE = 0;
my $MAXLEN_TEST = 0;
# Maximum number of lines from any single corpus
my $MAXLINES = 0;
# when doing phrase-based decoding, the maximum length of a phrase (source side)
my $MAX_PHRASE_LEN = 5;
my $DO_FILTER_TM = 0;
my $DO_SUBSAMPLE = 0;
my $DO_PACK_GRAMMARS = 1;
my $SCRIPTDIR = "$JOSHUA/scripts";
my $TOKENIZER_SOURCE = "$SCRIPTDIR/preparation/tokenize.pl";
my $TOKENIZER_TARGET = "$SCRIPTDIR/preparation/tokenize.pl";
my $NORMALIZER = "$SCRIPTDIR/preparation/normalize.pl";
my $LOWERCASER = "$SCRIPTDIR/preparation/lowercase.pl";
my $GIZA_TRAINER = "$SCRIPTDIR/training/run-giza.pl";
my $TUNECONFDIR = "$SCRIPTDIR/training/templates/tune";
my $SRILM = ($ENV{SRILM}||"")."/bin/i686-m64/ngram-count";
my $COPY_CONFIG = "$SCRIPTDIR/copy-config.pl";
my $BUNDLER = "$JOSHUA/scripts/support/run_bundler.py";
my $STARTDIR;
my $RUNDIR = $STARTDIR = getcwd();
my $GRAMMAR_TYPE = undef; # hiero, samt, ghkm, phrase, or moses
my $SEARCH_ALGORITHM = "cky"; # or "stack" (for phrase-based)
# Which GHKM extractor to use ("galley" or "moses")
my $GHKM_EXTRACTOR = "moses";
my $EXTRACT_OPTIONS = "";
my $WITTEN_BELL = 0;
# Run description.
my $README = undef;
# gzip-aware cat
my $CAT = "$SCRIPTDIR/training/scat";
# custom version of paste that dies on unequal file lengths
my $PASTE = "$SCRIPTDIR/training/paste";
# where processed data files are stored
my $DATA_DIR = "data";
# Whether to do MBR decoding on the n-best list (for test data).
my $DO_MBR = 0;
# Which aligner to use. The options are "giza", "berkeley", or "jacana".
my $ALIGNER = "giza";
my $ALIGNER_CONF = "$JOSHUA/scripts/training/templates/alignment/word-align.conf";
# Filter rules to the following maximum scope (Hopkins & Langmead, 2011).
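# (A rule's scope, per Hopkins & Langmead, is the number of adjacent-nonterminal pairs plus
# boundary nonterminals on its source side; keeping only scope-3 rules makes parsing cubic.)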
my $SCOPE = 3;
# What kind of filtering to use ("fast" or "exact").
my $FILTERING = "fast";
# This is the amount of memory made available to Joshua. You'll need
# a lot more than this for SAMT decoding (though really it depends
# mostly on your grammar size)
my $JOSHUA_MEM = "4g";
# the amount of memory available for hadoop processes (passed to
# Hadoop via -Dmapred.child.java.opts)
my $HADOOP_MEM = "4g";
# memory available to the parser
my $PARSER_MEM = "2g";
# memory available for building the language model
my $BUILDLM_MEM = "8G";
# Memory available for packing the grammar.
my $PACKER_MEM = "8g";
# Memory available for MERT/PRO.
my $TUNER_MEM = "8g";
# When qsub is called for decoding, these arguments should be passed to it.
my $QSUB_ARGS = "";
# When qsub is called for aligning, these arguments should be passed to it.
my $QSUB_ALIGN_ARGS = "-l h_rt=168:00:00,h_vmem=15g,mem_free=10g,num_proc=1";
# Amount of memory for the Berkeley aligner.
my $ALIGNER_MEM = "10g";
# Align corpus files a million lines at a time.
my $ALIGNER_BLOCKSIZE = 1000000;
# The number of machines to decode on. If you set this higher than 1,
# you need to have qsub configured for your environment.
my $NUM_JOBS = 1;
# The number of threads to use at different pieces in the pipeline
# (giza, decoding)
my $NUM_THREADS = 1;
# which LM to use (kenlm or berkeleylm)
my $LM_TYPE = "kenlm";
# n-gram order
my $LM_ORDER = 5;
# Whether to build and include an LM from the target-side of the
# corpus when manually-specified LM files are passed with --lmfile.
my $DO_BUILD_LM_FROM_CORPUS = 1;
# Whether to build and include a class-based LM (--class-lm); requires
# --class-lm-corpus and --class-map.
my $DO_BUILD_CLASS_LM = 0;
my $CLASS_LM_CORPUS = undef;
my $CLASS_MAP = undef;
my $CLASS_LM_ORDER = 9;
# whether to tokenize and lowercase training, tuning, and test data
my $DO_PREPARE_CORPORA = 1;
# compute the nth optimizer run
my $OPTIMIZER_RUN = 1;
# what to use to create language models ("kenlm", "berkeleylm", or "srilm")
my $LM_GEN = "kenlm";
my $LM_OPTIONS = "";
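# The ordered list of pipeline steps. %STEPS maps each name to its 1-based position, so the
# sanity checks below can compare steps numerically (e.g., $STEPS{$FIRST_STEP} < $STEPS{TUNE}).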
my @STEPS = qw[FIRST SUBSAMPLE ALIGN PARSE THRAX MODEL GRAMMAR PHRASE TUNE MERT PRO TEST LAST];
my %STEPS = map { $STEPS[$_] => $_ + 1 } (0..$#STEPS);
# Methods to use for merging alignments (see Koehn et al., 2003).
# Options are union, {intersect, grow, srctotgt, tgttosrc}-{diag,final,final-and,diag-final,diag-final-and}
my $GIZA_MERGE = "grow-diag-final";
# Whether to merge all the --lmfile LMs into a single LM using weights based on the development corpus
my $MERGE_LMS = 0;
# The available tuners, and which to use by default
my @TUNERS = ("mert", "pro", "mira", "adagrad", "kbmira");
my $TUNER = "mert";
# The metric the tuner optimizes toward
my $METRIC = "BLEU 4 closest";
# The number of iterations of the tuner to run
my $TUNER_ITERATIONS = 10;
# location of already-parsed corpus
my $PARSED_CORPUS = undef;
# location of the NER tagger wrapper script for annotation
my $NER_TAGGER = undef;
# Allows the user to set a temp dir for various tasks
my $TMPDIR = $ENV{TMP} || "/tmp";
# Enable LM state minimization (supported only by KenLM; disabled below for other LM types)
my $LM_STATE_MINIMIZATION = 1;
my $NBEST = 300;
my $REORDERING_LIMIT = 6;
my $NUM_TRANSLATION_OPTIONS = 20;
my $retval = GetOptions(
"readme=s" => \$README,
"corpus=s" => \@CORPORA,
"parsed-corpus=s" => \$PARSED_CORPUS,
"tune=s" => \@TUNE,
"test=s" => \$TEST,
"prepare!" => \$DO_PREPARE_CORPORA,
"aligner=s" => \$ALIGNER,
"alignment=s" => \$ALIGNMENT,
"aligner-mem=s" => \$ALIGNER_MEM,
"aligner-conf=s" => \$ALIGNER_CONF,
"giza-merge=s" => \$GIZA_MERGE,
"source=s" => \$SOURCE,
"target=s" => \$TARGET,
"rundir=s" => \$RUNDIR,
"filter-tm!" => \$DO_FILTER_TM,
"scope=i" => \$SCOPE,
"filtering=s" => \$FILTERING,
"lm=s" => \$LM_TYPE,
"lmfile=s" => \@LMFILES,
"merge-lms!" => \$MERGE_LMS,
"lm-gen=s" => \$LM_GEN,
"lm-gen-options=s" => \$LM_OPTIONS,
"lm-order=i" => \$LM_ORDER,
"corpus-lm!" => \$DO_BUILD_LM_FROM_CORPUS,
"witten-bell!" => \$WITTEN_BELL,
"tune-grammar=s" => \$_TUNE_GRAMMAR_FILE,
"test-grammar=s" => \$_TEST_GRAMMAR_FILE,
"grammar=s" => \$GRAMMAR_FILE,
"model=s" => \$GRAMMAR_FILE,
"maxspan=i" => \$MAXSPAN,
"mbr!" => \$DO_MBR,
"type=s" => \$GRAMMAR_TYPE,
"ghkm-extractor=s" => \$GHKM_EXTRACTOR,
"extract-options=s" => \$EXTRACT_OPTIONS,
"maxlen=i" => \$MAXLEN,
"maxlen-tune=i" => \$MAXLEN_TUNE,
"maxlen-test=i" => \$MAXLEN_TEST,
"maxlines=i" => \$MAXLINES,
"maxlen-phrase=i" => \$MAX_PHRASE_LEN,
"tokenizer-source=s" => \$TOKENIZER_SOURCE,
"tokenizer-target=s" => \$TOKENIZER_TARGET,
"normalizer=s" => \$NORMALIZER,
"lowercaser=s" => \$LOWERCASER,
"joshua-config=s" => \$_JOSHUA_CONFIG,
"joshua-args=s" => \$_JOSHUA_ARGS,
"joshua-mem=s" => \$JOSHUA_MEM,
"hadoop-mem=s" => \$HADOOP_MEM,
"parser-mem=s" => \$PARSER_MEM,
"buildlm-mem=s" => \$BUILDLM_MEM,
"packer-mem=s" => \$PACKER_MEM,
"pack!" => \$DO_PACK_GRAMMARS,
"tuner=s" => \$TUNER,
"tuner-mem=s" => \$TUNER_MEM,
"tuner-iterations=i" => \$TUNER_ITERATIONS,
"tuner-metric=s" => \$METRIC,
"thrax=s" => \$THRAX,
"thrax-conf=s" => \$THRAX_CONF_FILE,
"jobs=i" => \$NUM_JOBS,
"threads=i" => \$NUM_THREADS,
"subsample!" => \$DO_SUBSAMPLE,
"qsub-args=s" => \$QSUB_ARGS,
"qsub-align-args=s" => \$QSUB_ALIGN_ARGS,
"first-step=s" => \$FIRST_STEP,
"last-step=s" => \$LAST_STEP,
"aligner-chunk-size=s" => \$ALIGNER_BLOCKSIZE,
"tmp=s" => \$TMPDIR,
"nbest=i" => \$NBEST,
"reordering-limit=i" => \$REORDERING_LIMIT,
"num-translation-options=i" => \$NUM_TRANSLATION_OPTIONS,
"ner-tagger=s" => \$NER_TAGGER,
"class-lm!" => \$DO_BUILD_CLASS_LM,
"class-lm-corpus=s" => \$CLASS_LM_CORPUS,
"class-map=s" => \$CLASS_MAP,
"class-lm-order=s" => \$CLASS_LM_ORDER,
"optimizer-run=i" => \$OPTIMIZER_RUN,
);
if (! $retval) {
print "Invalid usage, quitting\n";
exit 1;
}
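# An example end-to-end invocation (paths and language extensions are illustrative):
#
#   $JOSHUA/scripts/training/pipeline.pl \
#     --rundir 1 --type hiero \
#     --source es --target en \
#     --corpus input/train --tune input/tune --test input/test
#
# where input/train.es and input/train.en (and likewise for tune and test) are the two sides
# of each parallel corpus.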
$RUNDIR = get_absolute_path($RUNDIR);
$TUNER = lc $TUNER;
my $DOING_LATTICES = 0;
my $JOSHUA_ARGS = (defined $_JOSHUA_ARGS) ? $_JOSHUA_ARGS : "";
my %DATA_DIRS = (
train => get_absolute_path("$RUNDIR/$DATA_DIR/train"),
tune => get_absolute_path("$RUNDIR/$DATA_DIR/tune"),
test => get_absolute_path("$RUNDIR/$DATA_DIR/test"),
);
if (! -x $NORMALIZER) {
print "* FATAL: couldn't find normalizer '$NORMALIZER'\n";
exit 1;
}
# Absolutize paths
$ALIGNER_CONF = get_absolute_path($ALIGNER_CONF);
$ALIGNMENT = get_absolute_path($ALIGNMENT);
# uppercase these to guard against a common user error:
$FIRST_STEP = uc($FIRST_STEP);
$LAST_STEP = uc($LAST_STEP);
$| = 1;
my $cachepipe = new CachePipe();
# This tells cachepipe not to include the command signature when deciding whether to re-run a
# command. Note that this is not backwards compatible!
$cachepipe->omit_cmd();
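# Throughout this script, $cachepipe->cmd(NAME, COMMAND, FILES...) runs COMMAND only if one of
# the listed input/output files has changed since the last successful run of step NAME, which
# is what lets a re-run of the pipeline skip completed steps.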
$SIG{INT} = sub {
print "* Got C-c, quitting\n";
$cachepipe->cleanup();
exit 1;
};
# if no LMs were specified, we need to build one from the target side of the corpus
if (scalar @LMFILES == 0) {
$DO_BUILD_LM_FROM_CORPUS = 1;
}
## Sanity Checking ###################################################
# If a language model was specified and no corpus was given to build another one from the target
# side of the training data (which could happen, for example, when starting at the tuning step with
# an existing LM), turn off building an LM from the corpus. The user could have done this
# explicitly with --no-corpus-lm, but might have forgotten to, and we don't want to pester them with
# an error about easily-inferred intentions.
if (scalar @LMFILES && ! scalar(@CORPORA)) {
$DO_BUILD_LM_FROM_CORPUS = 0;
}
# if merging LMs, make sure there are at least 2 LMs to merge.
# first, pin $DO_BUILD_LM_FROM_CORPUS to 0 or 1 so that the subsequent check works.
if ($MERGE_LMS) {
if ($DO_BUILD_LM_FROM_CORPUS != 0) {
$DO_BUILD_LM_FROM_CORPUS = 1
}
if (@LMFILES + $DO_BUILD_LM_FROM_CORPUS < 2) {
print "* FATAL: I need 2 or more language models to merge (including the corpus target-side LM).";
exit 2;
}
}
# absolutize LM file paths
map {
$LMFILES[$_] = get_absolute_path($LMFILES[$_]);
} 0..$#LMFILES;
# make sure the LMs exist
foreach my $lmfile (@LMFILES) {
if (! -e $lmfile) {
print "* FATAL: couldn't find language model file '$lmfile'\n";
exit 1;
}
}
my @GRAMMAR_TYPES = qw/hiero samt ghkm phrase moses/;
if (! defined $GRAMMAR_TYPE or ! in($GRAMMAR_TYPE,\@GRAMMAR_TYPES)) {
print "* FATAL: You must define --type (" . join("|", @GRAMMAR_TYPES) . ")\n";
exit 47;
}
# case-normalize this
$GRAMMAR_TYPE = lc $GRAMMAR_TYPE;
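# Phrase-based grammars (whether extracted here or with Moses) are decoded with the stack-based
# phrase decoder, for which the hierarchical span limit does not apply.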
if ($GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") {
$SEARCH_ALGORITHM = "stack";
$MAXSPAN = 0;
}
# make sure source and target were specified
if (! defined $SOURCE or $SOURCE eq "") {
print "* FATAL: I need a source language extension (--source)\n";
exit 1;
}
if (! defined $TARGET or $TARGET eq "") {
print "* FATAL: I need a target language extension (--target)\n";
exit 1;
}
# make sure a corpus was provided if we're doing any step before tuning
if (@CORPORA == 0 and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) {
print "* FATAL: need at least one training corpus (--corpus)\n";
exit 1;
}
# make sure a tuning corpus was provided if we're doing tuning
if (scalar(@TUNE) == 0 and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE}
and $STEPS{$LAST_STEP} >= $STEPS{TUNE})) {
print "* FATAL: need at least one tuning set (--tune)\n";
exit 1;
}
# make sure a test corpus was provided if we're decoding a test set
if (! defined $TEST and ($STEPS{$FIRST_STEP} <= $STEPS{TEST}
and $STEPS{$LAST_STEP} >= $STEPS{TEST})) {
print "* FATAL: need a test set (--test)\n";
exit 1;
}
# Joshua config
my $JOSHUA_CONFIG = get_absolute_path($_JOSHUA_CONFIG || "$TUNECONFDIR/joshua.config", $STARTDIR);
# make sure we have a tuned config file if we're skipping model building and tuning
if ($STEPS{$FIRST_STEP} >= $STEPS{TEST}) {
if (! defined $JOSHUA_CONFIG) {
print "* FATAL: You need to provide a tuned Joshua config file (--joshua-config)\n";
print " if you're skipping straight to testing\n";
exit 1;
}
}
# make sure we have either a config file or a grammar and LM if we're skipping model building
if ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) {
if (! defined $JOSHUA_CONFIG and ((! defined $_TUNE_GRAMMAR_FILE and ! defined $GRAMMAR_FILE) or scalar(@LMFILES) == 0)) {
print "* FATAL: You must provide either a Joshua config file (--joshua-config) or\n";
print " a grammar (--grammar or --tune-grammar) and at least one LM (--lmfile)\n";
print " if you're skipping straight to tuning\n";
exit 1;
}
}
# make sure SRILM is defined if we're building a language model
if ($LM_GEN eq "srilm" && (scalar @LMFILES == 0) && $STEPS{$FIRST_STEP} <= $STEPS{TUNE} && $STEPS{$LAST_STEP} >= $STEPS{TUNE}) {
not_defined("SRILM") unless exists $ENV{SRILM} and -d $ENV{SRILM};
}
# check for file presence
if (defined $JOSHUA_CONFIG and ! -e $JOSHUA_CONFIG) {
print "* FATAL: couldn't find joshua config file '$JOSHUA_CONFIG'\n";
exit 1;
}
if (defined $GRAMMAR_FILE and ! -e $GRAMMAR_FILE) {
print "* FATAL: couldn't find grammar file '$GRAMMAR_FILE'\n";
exit 1;
}
if (defined $_TUNE_GRAMMAR_FILE and ! -e $_TUNE_GRAMMAR_FILE) {
print "* FATAL: couldn't find tuning grammar file '$_TUNE_GRAMMAR_FILE'\n";
exit 1;
}
if (defined $_TEST_GRAMMAR_FILE and ! -e $_TEST_GRAMMAR_FILE) {
print "* FATAL: couldn't find test grammar file '$_TEST_GRAMMAR_FILE'\n";
exit 1;
}
if (defined $ALIGNMENT and ! -e $ALIGNMENT) {
print "* FATAL: couldn't find alignment file '$ALIGNMENT'\n";
exit 1;
}
# If a corpus was given as a relative path, prepend the starting directory (under the assumption
# it was relative to there). This makes sure that everything will still work if we change the run
# directory.
map {
$CORPORA[$_] = get_absolute_path("$CORPORA[$_]");
} (0..$#CORPORA);
# Do the same for tuning and test data, and other files
map {
$TUNE[$_] = get_absolute_path($TUNE[$_]);
} (0..$#TUNE);
$TEST = get_absolute_path($TEST);
$GRAMMAR_FILE = get_absolute_path($GRAMMAR_FILE);
$GLUE_GRAMMAR_FILE = get_absolute_path($GLUE_GRAMMAR_FILE);
$_TUNE_GRAMMAR_FILE = get_absolute_path($_TUNE_GRAMMAR_FILE);
$_TEST_GRAMMAR_FILE = get_absolute_path($_TEST_GRAMMAR_FILE);
$THRAX_CONF_FILE = get_absolute_path($THRAX_CONF_FILE);
$ALIGNMENT = get_absolute_path($ALIGNMENT);
foreach my $corpus (@CORPORA) {
foreach my $ext ($TARGET,$SOURCE) {
if (! -e "$corpus.$ext") {
print "* FATAL: can't find '$corpus.$ext'";
exit 1;
}
}
}
if ($ALIGNER ne "giza" and $ALIGNER ne "berkeley" and $ALIGNER ne "jacana") {
print "* FATAL: aligner must be one of 'giza', 'berkeley' or 'jacana' (only French-English)\n";
exit 1;
}
if ($LM_TYPE ne "kenlm" and $LM_TYPE ne "berkeleylm") {
print "* FATAL: lm type (--lm) must be one of 'kenlm' or 'berkeleylm'\n";
exit 1;
}
if ($LM_TYPE ne "kenlm") {
$LM_STATE_MINIMIZATION = 0;
}
if ($LM_GEN ne "berkeleylm" and $LM_GEN ne "srilm" and $LM_GEN ne "kenlm") {
print "* FATAL: lm generating code (--lm-gen) must be one of 'kenlm' (default), 'berkeleylm', or 'srilm'\n";
exit 1;
}
if ($TUNER eq "kbmira" and ! defined $MOSES) {
print "* FATAL: using 'kbmira' for tuning requires setting the MOSES environment variable\n";
exit 1;
}
if ($GRAMMAR_TYPE eq "moses" and ! defined $MOSES) {
print "* FATAL: building Moses phrase-based models (--type moses) requires setting the MOSES environment variable\n";
exit 1;
}
if (! in($TUNER, \@TUNERS)) {
print "* FATAL: --tuner must be one of " . join(", ", @TUNERS) . $/;
exit 1;
}
$FILTERING = lc $FILTERING;
if ($FILTERING eq "fast") {
$FILTERING = "-f"
} elsif ($FILTERING eq "exact") {
$FILTERING = "-e";
} elsif ($FILTERING eq "loose") {
$FILTERING = "-l";
} else {
print "* FATAL: --filtering must be one of 'fast' (default) or 'exact' or 'loose'\n";
exit 1;
}
## END SANITY CHECKS
####################################################################################################
## Dependent variable setting ######################################################################
####################################################################################################
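# The nonterminal used to label OOVs: hierarchical and phrase-based grammars use the generic X
# label, while syntax-based grammars (SAMT, GHKM) use a dedicated OOV label.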
my $OOV = ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "itg" or $GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") ? "X" : "OOV";
# The phrasal system should use the ITG grammar, allowing for limited distortion
if ($GRAMMAR_TYPE eq "phrasal") {
$GLUE_GRAMMAR_FILE = get_absolute_path("$JOSHUA/scripts/training/templates/glue-grammar.itg");
}
# use this default unless it's already been defined by a command-line argument
$THRAX_CONF_FILE = "$JOSHUA/scripts/training/templates/thrax-$GRAMMAR_TYPE.conf" unless defined $THRAX_CONF_FILE;
mkdir $RUNDIR unless -d $RUNDIR;
chdir($RUNDIR);
if (defined $README) {
open DESC, ">README" or die "can't write README file";
print DESC $README;
print DESC $/;
close DESC;
}
# default values -- these are overridden if the full script is run
# (after tokenization and normalization)
my (%TRAIN,%TUNE,%TEST);
if (@CORPORA) {
$TRAIN{prefix} = $CORPORA[0];
$TRAIN{source} = "$CORPORA[0].$SOURCE";
$TRAIN{target} = "$CORPORA[0].$TARGET";
}
# set the location of the parsed corpus if that was defined
if (defined $PARSED_CORPUS) {
$TRAIN{parsed} = get_absolute_path($PARSED_CORPUS);
}
if (scalar(@TUNE) > 0) {
$TUNE{source} = "$TUNE[0].$SOURCE";
$TUNE{target} = "$TUNE[0].$TARGET";
if (! -e "$TUNE{source}") {
print "* FATAL: couldn't find tune source file at '$TUNE{source}'\n";
exit;
}
}
if ($TEST) {
$TEST{source} = "$TEST.$SOURCE";
$TEST{target} = "$TEST.$TARGET";
if (! -e "$TEST{source}") {
print "* FATAL: couldn't find test source file at '$TEST{source}'\n";
exit;
}
}
# Record the preprocessing scripts that were used
mkdir("scripts") unless -e "scripts";
unlink "scripts/normalize.$SOURCE";
unlink "scripts/normalize.$TARGET";
symlink $NORMALIZER, "scripts/normalize.$SOURCE";
symlink $NORMALIZER, "scripts/normalize.$TARGET";
unlink "scripts/tokenize.$SOURCE";
unlink "scripts/tokenize.$TARGET";
symlink $TOKENIZER_SOURCE, "scripts/tokenize.$SOURCE";
symlink $TOKENIZER_TARGET, "scripts/tokenize.$TARGET";
## STEP 1: filter and preprocess corpora #############################
if (defined $ALIGNMENT and $STEPS{$FIRST_STEP} < $STEPS{ALIGN}) {
print "* FATAL: it doesn't make sense to provide an alignment and then do\n";
print " tokenization. Either remove --alignment or specify a first step\n";
print " of Thrax (--first-step THRAX)\n";
exit 1;
}
if (@CORPORA == 0 and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) {
print "* FATAL: need at least one training corpus (--corpus)\n";
exit 1;
}
# prepare the training data
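# %PREPPED records which data sets have been run through prepare_data(), so that steps entered
# later via --first-step can prepare their data on demand.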
my %PREPPED = (
TRAIN => 0,
TUNE => 0,
TEST => 0);
if (@CORPORA > 0) {
my $prefixes = prepare_data("train",\@CORPORA,$MAXLEN);
# used for parsing
if (exists $prefixes->{shortened}) {
$TRAIN{mixedcase} = "$DATA_DIRS{train}/$prefixes->{shortened}.$TARGET";
}
$TRAIN{prefix} = "$DATA_DIRS{train}/corpus";
$TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
$TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
$PREPPED{TRAIN} = 1;
}
# prepare the tuning and development data
if (@TUNE > 0) {
my $prefixes = prepare_data("tune",\@TUNE,$MAXLEN_TUNE);
$TUNE{source} = "$DATA_DIRS{tune}/corpus.$SOURCE";
$TUNE{target} = "$DATA_DIRS{tune}/corpus.$TARGET";
my $ner_return = ner_annotate("$TUNE{source}", "$TUNE{source}.ner", $SOURCE);
if ($ner_return == 2) {
$TUNE{source} = "$TUNE{source}.ner";
}
$PREPPED{TUNE} = 1;
}
if (defined $TEST) {
my $prefixes = prepare_data("test",[$TEST],$MAXLEN_TEST);
$TEST{source} = "$DATA_DIRS{test}/corpus.$SOURCE";
$TEST{target} = "$DATA_DIRS{test}/corpus.$TARGET";
my $ner_return = ner_annotate("$TEST{source}", "$TEST{source}.ner", $SOURCE);
if ($ner_return == 2) {
$TEST{source} = "$TEST{source}.ner";
}
$PREPPED{TEST} = 1;
}
## Use of GOTO considered very useful
if (eval { goto $FIRST_STEP }) {
print "* Skipping to step $FIRST_STEP\n";
goto $FIRST_STEP;
} else {
print "* No such step $FIRST_STEP\n";
exit 1;
}
## SUBSAMPLE #########################################################
SUBSAMPLE:
;
# Subsample the training data, selecting sentence pairs relevant to the tune and test sets
if ($DO_SUBSAMPLE) {
mkdir("$DATA_DIRS{train}/subsampled") unless -d "$DATA_DIRS{train}/subsampled";
$cachepipe->cmd("subsample-manifest",
"echo corpus > $DATA_DIRS{train}/subsampled/manifest",
"$DATA_DIRS{train}/subsampled/manifest");
$cachepipe->cmd("subsample-testdata",
"cat $TUNE{source} $TEST{source} > $DATA_DIRS{train}/subsampled/test-data",
$TUNE{source},
$TEST{source},
"$DATA_DIRS{train}/subsampled/test-data");
$cachepipe->cmd("subsample",
"java -Xmx4g -Dfile.encoding=utf8 -cp $JOSHUA/bin:$JOSHUA/lib/commons-cli-2.0-SNAPSHOT.jar joshua.subsample.Subsampler -e $TARGET -f $SOURCE -epath $DATA_DIRS{train}/ -fpath $DATA_DIRS{train}/ -output $DATA_DIRS{train}/subsampled/subsampled.$MAXLEN -ratio 1.04 -test $DATA_DIRS{train}/subsampled/test-data -training $DATA_DIRS{train}/subsampled/manifest",
"$DATA_DIRS{train}/subsampled/manifest",
"$DATA_DIRS{train}/subsampled/test-data",
$TRAIN{source},
$TRAIN{target},
"$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$TARGET",
"$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$SOURCE");
# rewrite the symlinks to point to the subsampled corpus
foreach my $lang ($TARGET,$SOURCE) {
system("ln -sf subsampled/subsampled.$MAXLEN.$lang $DATA_DIRS{train}/corpus.$lang");
}
}
maybe_quit("SUBSAMPLE");
## ALIGN #############################################################
ALIGN:
;
# This basically means that we've skipped tokenization, in which case
# we still want to move the input files into the canonical place
if ($FIRST_STEP eq "ALIGN") {
if (defined $ALIGNMENT) {
print "* FATAL: It doesn't make sense to provide an alignment\n";
print " but not to skip the tokenization and subsampling steps\n";
exit 1;
}
# TODO: copy the files into the canonical place
# Jumping straight to alignment is probably the same thing as
# skipping tokenization, and might also be implemented by a
# --no-tokenization flag
}
# Use an existing alignment file if it's present; this short-circuits
# rebuilding the alignments.
if (-s "alignments/training.align") {
$ALIGNMENT = "alignments/training.align";
}
# skip this step if an alignment was provided or it already exists
if (! defined $ALIGNMENT) {
# We process the data in chunks which by default are 1,000,000 sentence pairs. So first split up
# the data into those chunks.
system("mkdir","-p","$DATA_DIRS{train}/splits") unless -d "$DATA_DIRS{train}/splits";
$cachepipe->cmd("source-numlines",
"cat $TRAIN{source} | wc -l",
$TRAIN{source});
my $numlines = $cachepipe->stdout();
my $numchunks = ceil($numlines / $ALIGNER_BLOCKSIZE);
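# Example: 2,500,000 lines with the default 1,000,000-line block size gives $numchunks = 3, but
# the loop below folds the half-full final chunk into the penultimate one, writing two chunks.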
open TARGET, $TRAIN{target} or die "can't read $TRAIN{target}";
open SOURCE, $TRAIN{source} or die "can't read $TRAIN{source}";
my $lastchunk = -1;
while (my $target = <TARGET>) {
my $source = <SOURCE>;
# We want to prevent a very small last chunk, which we accomplish
# by folding the last chunk into the penultimate chunk.
my $chunk = ($numchunks <= 2)
? 0
: min($numchunks - 2,
int( ($. - 1) / $ALIGNER_BLOCKSIZE ));
if ($chunk != $lastchunk) {
close CHUNK_SOURCE;
close CHUNK_TARGET;
mkdir("$DATA_DIRS{train}/splits/$chunk");
open CHUNK_SOURCE, ">", "$DATA_DIRS{train}/splits/$chunk/corpus.$SOURCE" or die;
open CHUNK_TARGET, ">", "$DATA_DIRS{train}/splits/$chunk/corpus.$TARGET" or die;
$lastchunk = $chunk;
}
print CHUNK_SOURCE $source;
print CHUNK_TARGET $target;
}
close CHUNK_SOURCE;
close CHUNK_TARGET;
close SOURCE;
close TARGET;
# my $max_aligner_threads = $NUM_THREADS;
# if ($ALIGNER eq "giza" and $max_aligner_threads > 1) {
# $max_aligner_threads /= 2;
# }
mkdir("alignments") unless -d "alignments";
my $aligner_cmd = (
"$SCRIPTDIR/training/paralign.pl "
. " -aligner $ALIGNER"
. " -conf $ALIGNER_CONF"
. " -num_threads 2"
. " -giza_merge $GIZA_MERGE"
. " -aligner_mem $ALIGNER_MEM"
. " -source $SOURCE"
. " -target $TARGET"
. " -giza_trainer \"$GIZA_TRAINER\""
. " -train_dir \"$DATA_DIRS{train}\" "
. "> alignments/run.log"
);
# Start a parallel job on each core
my @children = ();
my $next_chunk = 0;
foreach my $core (1..$NUM_THREADS) {
if ($next_chunk < $lastchunk + 1) {
my $child = fork();
if (! $child) { # I am child
exec("echo $next_chunk | $aligner_cmd");
exit 0;
}
push @children, $child;
$next_chunk++;
next;
}
}
# Start another concurrent job as each oldest job finishes
while (@children) {
my $old_child = shift @children;
waitpid( $old_child, 0 );
if ($next_chunk < $lastchunk + 1) {
my $new_child = fork();
if (! $new_child) { # I am child
exec("echo $next_chunk | $aligner_cmd");
exit 0;
}
$next_chunk++;
push @children, $new_child;
}
}
my @aligned_files;
if ($ALIGNER eq "giza") {
@aligned_files = map { "alignments/$_/model/aligned.$GIZA_MERGE" } (0..$lastchunk);
} elsif ($ALIGNER eq "berkeley") {
@aligned_files = map { "alignments/$_/training.$TARGET-$SOURCE.align" } (0..$lastchunk);
} elsif ($ALIGNER eq "jacana") {
@aligned_files = map { "alignments/$_/training.align" } (0..$lastchunk);
}
my $aligned_file_list = join(" ", @aligned_files);
# wait for all the threads to finish
# $pool->join();
# combine the alignments
$cachepipe->cmd("aligner-combine",
"cat $aligned_file_list > alignments/training.align",
$aligned_files[-1],
"alignments/training.align");
# at the end, all the files are concatenated into a single alignment file parallel to the input
# corpora
$ALIGNMENT = "alignments/training.align";
}
maybe_quit("ALIGN");
## PARSE #############################################################
PARSE:
;
# Parsing only happens for SAMT and GHKM grammars.
if ($FIRST_STEP eq "PARSE" and ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrasal" or $GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses")) {
print STDERR "* FATAL: parsing only applies to GHKM and SAMT grammars; you need to add '--type samt|ghkm'\n";
exit;
}
if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") {
# If the user passed in the already-parsed corpus, use that (after copying it into place)
if (defined $TRAIN{parsed} && -e $TRAIN{parsed}) {
# copy and adjust the location of the file to its canonical location
system("cp $TRAIN{parsed} $DATA_DIRS{train}/corpus.parsed.$TARGET");
$TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
} else {
system("mkdir -p $DATA_DIRS{train}") unless -e $DATA_DIRS{train};
$cachepipe->cmd("build-vocab",
"cat $TRAIN{target} | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{train}/vocab.$TARGET",
$TRAIN{target},
"$DATA_DIRS{train}/vocab.$TARGET");
my $file_to_parse = (exists $TRAIN{mixedcase}) ? $TRAIN{mixedcase} : $TRAIN{target};
if ($NUM_JOBS > 1) {
# the black-box parallelizer model doesn't work with multiple
# threads, so we're always spawning single-threaded instances here
# open PARSE, ">parse.sh" or die;
# print PARSE "cat $TRAIN{target} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed.mc | perl -pi -e 's/(\\S+)\\)/lc(\$1).\")\"/ge' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET > $DATA_DIRS{train}/corpus.parsed.$TARGET\n";
# close PARSE;
# chmod 0755, "parse.sh";
# $cachepipe->cmd("parse",
# "setsid ./parse.sh",
# "$TRAIN{target}",
# "$DATA_DIRS{train}/corpus.parsed.$TARGET");
$cachepipe->cmd("parse",
"$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -p 8g -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
"$TRAIN{target}",
"$DATA_DIRS{train}/corpus.parsed.$TARGET");
} else {
# Multi-threading in the Berkeley parser is broken, so we use a black-box parallelizer on top
# of it.
$cachepipe->cmd("parse",
"$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_THREADS --use-fork -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
"$TRAIN{target}",
"$DATA_DIRS{train}/corpus.parsed.$TARGET");
}
$TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
}
}
maybe_quit("PARSE");
## THRAX #############################################################
MODEL:
;
GRAMMAR:
;
THRAX:
;
PHRASE:
;
system("mkdir -p $DATA_DIRS{train}") unless -d $DATA_DIRS{train};
if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") {
# if we jumped right here, $TRAIN{target} should be parsed
if (exists $TRAIN{parsed}) {
# parsing step happened in-script or a parsed corpus was passed in explicitly, all is well
} elsif (already_parsed($TRAIN{target})) {
# skipped straight to this step, passing a parsed corpus
$TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
$cachepipe->cmd("cp-train-$TARGET",
"cp $TRAIN{target} $TRAIN{parsed}",
$TRAIN{target},
$TRAIN{parsed});
$TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
# now extract the leaves of the parsed corpus
$cachepipe->cmd("extract-leaves",
"cat $TRAIN{parsed} | perl -pe 's/\\(.*?(\\S\+)\\)\+?/\$1/g' | perl -pe 's/\\)//g' > $TRAIN{target}",
$TRAIN{parsed},
$TRAIN{target});
if ($TRAIN{source} ne "$DATA_DIRS{train}/corpus.$SOURCE") {
$cachepipe->cmd("cp-train-$SOURCE",
"cp $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE",
$TRAIN{source}, "$DATA_DIRS{train}/corpus.$SOURCE");
$TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
}
} else {
print "* FATAL: You requested to build an SAMT grammar, but provided an\n";
print " unparsed corpus. Please re-run the pipeline and begin no later\n";
print " than the PARSE step (--first-step PARSE), or pass in a parsed corpus\n";
print " using --parsed-corpus CORPUS.\n";
exit 1;
}
}
# we may have skipped directly to this step, in which case we need to
# ensure an alignment was provided
if (! defined $ALIGNMENT) {
print "* FATAL: no alignment file specified\n";
exit(1);
}
# Since this is an expensive step, we short-circuit it if the grammar file is present. I'm not
# sure that this is the right behavior.
if (-e "grammar.gz" && ! -z "grammar.gz") {
chomp(my $is_empty = `gzip -cd grammar.gz | head | wc -l`);
$GRAMMAR_FILE = "grammar.gz" unless ($is_empty == 0);
}
# If the grammar file wasn't specified, or found, we need to build it!
if (! defined $GRAMMAR_FILE) {
my $target_file = ($GRAMMAR_TYPE eq "ghkm" or $GRAMMAR_TYPE eq "samt") ? $TRAIN{parsed} : $TRAIN{target};
if ($GRAMMAR_TYPE eq "ghkm") {
if ($GHKM_EXTRACTOR eq "galley") {
$cachepipe->cmd("ghkm-extract",
"java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/split2files ghkm-mapping.gz grammar.gz",
$ALIGNMENT,
"grammar.gz");
} elsif ($GHKM_EXTRACTOR eq "moses") {
# XML-ize, also replacing unary chains with OOV at the bottom by removing their unary parents
$cachepipe->cmd("ghkm-moses-xmlize",
"cat $target_file | perl -pe 's/\\(\\S+ \\(OOV (.*?)\\)\\)/(OOV \$1)/g' | $MOSES/scripts/training/wrappers/berkeleyparsed2mosesxml.perl > $DATA_DIRS{train}/corpus.xml",
# "cat $target_file | perl -pe 's/\\(\\S+ \\(OOV (.*?)\\)\\)/(OOV \$1)/g' > $DATA_DIRS{train}/corpus.ptb",
$target_file,
"$DATA_DIRS{train}/corpus.xml");
if (! -e "$DATA_DIRS{train}/corpus.$SOURCE") {
system("ln -sf $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE");
}
if ($ALIGNMENT ne "alignments/training.align") {
system("mkdir alignments") unless -d "alignments";
system("ln -sf $ALIGNMENT alignments/training.align");
$ALIGNMENT = "alignments/training.align";
}
system("mkdir model");
$cachepipe->cmd("ghkm-moses-extract",
"$MOSES/scripts/training/train-model.perl --first-step 4 --last-step 6 --corpus $DATA_DIRS{train}/corpus --ghkm --f $SOURCE --e xml --alignment-file alignments/training --alignment align --target-syntax --cores $NUM_THREADS --pcfg --alt-direct-rule-score-1 --ghkm-tree-fragments --glue-grammar --glue-grammar-file glue-grammar.ghkm --extract-options \"$EXTRACT_OPTIONS --UnknownWordLabel oov-labels.txt\"",
"$DATA_DIRS{train}/corpus.xml",
"glue-grammar.ghkm",
"model/rule-table.gz");
open LABELS, "oov-labels.txt";
chomp(my @labels = <LABELS>);
close LABELS;
my $oov_list = "\"" . join(" ", @labels) . "\"";
$JOSHUA_ARGS .= " -oov-list $oov_list";
$cachepipe->cmd("ghkm-moses-convert",
"gzip -cd model/rule-table.gz | /home/hltcoe/mpost/code/joshua/scripts/support/moses2joshua_grammar.pl -m rule-fragment-map.txt | gzip -9n > grammar.gz",
"model/rule-table.gz",
"grammar.gz");
} else {
print STDERR "* FATAL: no such GHKM extractor '$GHKM_EXTRACTOR'\n";
exit(1);
}
$GRAMMAR_FILE = "grammar.gz";
} elsif ($GRAMMAR_TYPE eq "moses") {
mkdir("model") unless -d "model";
if ($ALIGNMENT ne "alignments/training.align") {
system("mkdir alignments") unless -d "alignments";
system("ln -sf $ALIGNMENT alignments/training.align");
$ALIGNMENT = "alignments/training.align";
}
# Compute lexical probabilities
$cachepipe->cmd("build-lex-trans",
"$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 4 -last-step 4 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -lexical-file model/lex -alignment-file alignments/training -alignment align -corpus $TRAIN{prefix}",
$TRAIN{source},
$TRAIN{target},
$ALIGNMENT,
"model/lex.e2f",
"model/lex.f2e"
);
# Extract the phrases
$cachepipe->cmd("extract-phrases",
"$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 5 -last-step 5 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -alignment-file alignments/training -alignment align -extract-file model/extract -corpus $TRAIN{prefix}",
$TRAIN{source},
$TRAIN{target},
$ALIGNMENT,
"model/extract.sorted.gz",
"model/extract.inv.sorted.gz"
);
# Build the phrase table
$cachepipe->cmd("build-ttable",
"$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 6 -last-step 6 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -alignment grow-diag-final-and -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -extract-file model/extract -lexical-file model/lex -phrase-translation-table model/phrase-table",
"model/lex.e2f",
"model/extract.sorted.gz",
"model/phrase-table.gz",
);
# Convert the model to Joshua format
$cachepipe->cmd("convert-moses-to-joshua",
"$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py -moses | gzip -9n > grammar.gz",
"model/phrase-table.gz",
"grammar.gz",
);
$GRAMMAR_FILE = "grammar.gz";
} elsif ($GRAMMAR_TYPE eq "samt" or $GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrase") {
# Create the Thrax input file: one 'source ||| target ||| alignment' triple per line, dropping
# pairs with empty parses or empty fields
$cachepipe->cmd("thrax-input-file",
"$PASTE $TRAIN{source} $target_file $ALIGNMENT | perl -pe 's/\\t/ ||| /g' | grep -v '()' | grep -v '||| \\+\$' > $DATA_DIRS{train}/thrax-input-file",
$TRAIN{source}, $target_file, $ALIGNMENT,
"$DATA_DIRS{train}/thrax-input-file");
# put the hadoop files in place
my $thrax_input;
my $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR";
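# Flatten any slashes (from $RUNDIR) so the name is a single valid HDFS directory.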
$THRAXDIR =~ s#/#_#g;
$cachepipe->cmd("thrax-prep",
"hadoop fs -rm -r $THRAXDIR; hadoop fs -mkdir $THRAXDIR; hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
"$DATA_DIRS{train}/thrax-input-file",
"grammar.gz");
$thrax_input = "$THRAXDIR/input-file";
# copy the thrax config file
my $thrax_file = "thrax-$GRAMMAR_TYPE.conf";
system("grep -v ^input-file $THRAX_CONF_FILE | perl -pe 's/<MAXPHRLEN>/$MAX_PHRASE_LEN/g' > $thrax_file.tmp");
system("echo input-file $thrax_input >> $thrax_file.tmp");
system("mv $thrax_file.tmp $thrax_file");
$cachepipe->cmd("thrax-run",
"hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -cat $THRAXDIR/final/* | gzip -cd | $JOSHUA/scripts/training/filter-rules.pl -t 100 | gzip -9n > grammar.gz",
"$DATA_DIRS{train}/thrax-input-file",
$thrax_file,
"grammar.gz");
#perl -pi -e 's/\.?0+\b//g' grammar;
$GRAMMAR_FILE = "grammar.gz";
# cleanup if successful
if (-s $GRAMMAR_FILE) {
system("hadoop fs -rm -r $THRAXDIR");
}
} else {
print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n";
print STDERR "* Please try one of the following:\n";
print STDERR "* - Specify a grammar with --grammar /path/to/grammar\n";
print STDERR "* - Delete any existing grammar named 'grammar.gz'\n";
exit 1;
}
}
# Pack the entire model! This saves filtering and repacking it separately for the tuning and test sets
if ($DO_PACK_GRAMMARS and ! $DO_FILTER_TM and ! -e "grammar.packed") {
$cachepipe->cmd("pack-grammar",
"$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $GRAMMAR_FILE -o $RUNDIR/grammar.packed",
"$RUNDIR/grammar.packed/vocabulary",
"$RUNDIR/grammar.packed/encoding",
"$RUNDIR/grammar.packed/slice_00000.source");
$GRAMMAR_FILE = "$RUNDIR/grammar.packed";
}
maybe_quit("THRAX");
maybe_quit("GRAMMAR");
maybe_quit("MODEL");
## TUNING ##############################################################
TUNE:
;
# prep the tuning data, unless already prepped
if (! $PREPPED{TUNE}) {
my $prefixes = prepare_data("tune",\@TUNE,$MAXLEN_TUNE);
$TUNE{source} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$SOURCE";
$TUNE{target} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$TARGET";
$PREPPED{TUNE} = 1;
}
# figure out how many references there are
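# (references are either a single file, $TUNE{target}, or numbered files $TUNE{target}.0, .1, ...)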
my $numrefs = get_numrefs($TUNE{target});
# make sure the dev source exists
if (! -e $TUNE{source}) {
print STDERR "* FATAL: couldn't find tuning source file '$TUNE{source}'\n";
exit 1;
}
if ($numrefs > 1) {
for my $i (0..$numrefs-1) {
if (! -e "$TUNE{target}.$i") {
print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}.$i'\n";
exit 1;
}
}
} else {
if (! -e $TUNE{target}) {
print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}'\n";
exit 1;
}
}
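# Compile an ARPA-format LM into the binary format of the configured runtime (KenLM or
# BerkeleyLM), returning the name of the compiled file.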
sub compile_lm($) {
my $lmfile = shift;
if ($LM_TYPE eq "kenlm") {
my $kenlm_file = basename($lmfile, ".gz") . ".kenlm";
$cachepipe->cmd("compile-kenlm",
"$JOSHUA/bin/build_binary $lmfile $kenlm_file",
$lmfile, $kenlm_file);
return $kenlm_file;
} elsif ($LM_TYPE eq "berkeleylm") {
my $berkeleylm_file = basename($lmfile, ".gz") . ".berkeleylm";
$cachepipe->cmd("compile-berkeleylm",
"$JOSHUA/scripts/lm/compile_berkeley.py -m $BUILDLM_MEM $lmfile $berkeleylm_file",
$lmfile, $berkeleylm_file);
return $berkeleylm_file;
} else {
print "* FATAL: trying to compile an LM to neither kenlm nor berkeleylm.";
exit 2;
}
}
# Build the language model if needed
if (defined $TRAIN{target} and $DO_BUILD_LM_FROM_CORPUS) {
# make sure the training data is prepped
if (! $PREPPED{TRAIN}) {
my $prefixes = prepare_data("train", \@CORPORA, $MAXLEN);
$TRAIN{prefix} = "$DATA_DIRS{train}/corpus";
foreach my $lang ($SOURCE,$TARGET) {
system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang");
}
$TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
$TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
$PREPPED{TRAIN} = 1;
}
my $lmfile = "lm.gz";
# sort and uniq the training data
$cachepipe->cmd("lm-sort-uniq",
"$CAT $TRAIN{target} | sort -u -T $TMPDIR -S $BUILDLM_MEM | gzip -9n > $TRAIN{target}.uniq",
$TRAIN{target},
"$TRAIN{target}.uniq");
# If an NER Tagger is specified, use that to annotate the corpus before
# sending it off to the LM
my $ner_return = ner_annotate("$TRAIN{target}.uniq", "$TRAIN{target}.uniq.ner", $TARGET);
if ($ner_return == 2) {
$TRAIN{ner_lm} = 1;
}
my $lm_input = "$TRAIN{target}.uniq";
# Choose LM input based on whether an annotated corpus was created
if (defined $TRAIN{ner_lm}) {
$lm_input = replace_tokens_with_types("$TRAIN{target}.uniq.ner");
}
if ($LM_GEN eq "srilm") {
my $smoothing = ($WITTEN_BELL) ? "-wbdiscount" : "-kndiscount";
$cachepipe->cmd("srilm",
"$SRILM -order $LM_ORDER -interpolate $smoothing -unk -gt3min 1 -gt4min 1 -gt5min 1 -text $TRAIN{target}.uniq $LM_OPTIONS -lm lm.gz",
"$lm_input",
$lmfile);
} elsif ($LM_GEN eq "berkeleylm") {
$cachepipe->cmd("berkeleylm",
"java -ea -mx$BUILDLM_MEM -server -cp $JOSHUA/ext/berkeleylm/jar/berkeleylm.jar edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText $LM_ORDER lm.gz $TRAIN{target}.uniq",
"$lm_input",
$lmfile);
} else {
# Make sure it exists
if (! -e "$JOSHUA/bin/lmplz") {
print "* FATAL: $JOSHUA/bin/lmplz (for building LMs) does not exist.\n";
print " This is often a problem with the boost libraries (particularly threaded\n";
print " versus unthreaded).\n";
exit 1;
}
# The memory limit needs to be uppercased (e.g., 8g -> 8G) for lmplz
my $mem = uc $BUILDLM_MEM;
$cachepipe->cmd("kenlm",
"$JOSHUA/bin/lmplz -o $LM_ORDER -T $TMPDIR -S $mem --verbose_header --text $TRAIN{target}.uniq $LM_OPTIONS | gzip -9n > lm.gz",
"$TRAIN{target}.uniq",
$lmfile);
}
if ((! $MERGE_LMS) && ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm")) {
push (@LMFILES, get_absolute_path(compile_lm $lmfile, $RUNDIR));
} else {
push (@LMFILES, get_absolute_path($lmfile, $RUNDIR));
}
}
if ($DO_BUILD_CLASS_LM) {
# Build a Class LM
# First check to see if a class map and class corpus are defined
if (! defined $CLASS_LM_CORPUS or ! defined $CLASS_MAP) {
print "* FATAL: A class LM corpus (--class-lm-corpus) and a class map (--class-map) are required with the --class-lm switch";
exit 1;
}
if (! -e $CLASS_LM_CORPUS or ! -e $CLASS_MAP) {
print "* FATAL: Could not find the Class LM corpus or map";
exit 1;
}
if (! -e "$JOSHUA/bin/lmplz") {
print "* FATAL: $JOSHUA/bin/lmplz (for building LMs) does not exist.\n";
print " This is often a problem with the boost libraries (particularly threaded\n";
print " versus unthreaded).\n";
exit 1;
}
# The memory limit needs to be uppercased (e.g., 8g -> 8G) for lmplz
my $mem = uc $BUILDLM_MEM;
my $class_lmfile = "class_lm.gz";
$cachepipe->cmd("classlm",
"$JOSHUA/bin/lmplz -o $CLASS_LM_ORDER -T $TMPDIR -S $mem --discount_fallback=0.5 1 1.5 --verbose_header --text $CLASS_LM_CORPUS $LM_OPTIONS | gzip -9n > $class_lmfile",
"$CLASS_LM_CORPUS",
$class_lmfile);
}
if ($MERGE_LMS) {
# Merge @LMFILES.
my $merged_lm = "lm-merged.gz";
# Use the first target reference if there are multiple ones
my $target_ref = (-e $TUNE{target}) ? $TUNE{target} : "$TUNE{target}.0";
$cachepipe->cmd("merge-lms",
"$JOSHUA/scripts/support/merge_lms.py "
. "@LMFILES "
. "$target_ref "
. "lm-merged.gz "
. "--temp-dir data/merge_lms ",
@LMFILES,
$merged_lm);
# Empty out @LMFILES.
@LMFILES = ();
# Compile merged LM
if ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm") {
push (@LMFILES, get_absolute_path(compile_lm $merged_lm, $RUNDIR));
} else {
push (@LMFILES, get_absolute_path($merged_lm, $RUNDIR));
}
}
system("mkdir -p $DATA_DIRS{tune}") unless -d $DATA_DIRS{tune};
# Set $TUNE_GRAMMAR to a specifically-passed tuning grammar or the
# main default grammar. Then update it if filtering was requested and
# is possible.
my $TUNE_GRAMMAR = $_TUNE_GRAMMAR_FILE || $GRAMMAR_FILE;
if ($DO_FILTER_TM and defined $GRAMMAR_FILE and ! $DOING_LATTICES and ! defined $_TUNE_GRAMMAR_FILE) {
$TUNE_GRAMMAR = "$DATA_DIRS{tune}/grammar.filtered.gz";
if ($OPTIMIZER_RUN == 1 and ! is_packed($TUNE_GRAMMAR)) {
$cachepipe->cmd("filter-tune",
"$SCRIPTDIR/support/filter_grammar.sh -g $GRAMMAR_FILE $FILTERING -v $TUNE{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TUNE_GRAMMAR",
$GRAMMAR_FILE,
$TUNE{source},
"$DATA_DIRS{tune}/grammar.filtered.gz");
}
}
# Create the glue grammars. This is done by looking at all the symbols in the grammar file and
# creating all the needed rules. This is only done if there is a $TUNE_GRAMMAR defined (which
# can be skipped if we skip straight to the tuning step).
if ($OPTIMIZER_RUN == 1 and defined $TUNE_GRAMMAR and $GRAMMAR_TYPE ne "phrase" and $GRAMMAR_TYPE ne "moses") {
if (! defined $GLUE_GRAMMAR_FILE) {
$cachepipe->cmd("glue-tune",
"$JOSHUA/scripts/support/create_glue_grammar.sh $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
get_file_from_grammar($TUNE_GRAMMAR),
"$DATA_DIRS{tune}/grammar.glue");
$GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
} else {
# just create a symlink to it
my $filename = $DATA_DIRS{tune} . "/" . basename($GLUE_GRAMMAR_FILE);
system("ln -sf $GLUE_GRAMMAR_FILE $filename");
}
}
# Add in feature functions
my $weightstr = "";
my @feature_functions;
my $lm_index = 0;
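# Each LM becomes one feature function; its weight is named lm_0, lm_1, ..., initialized to 1.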
for my $i (0..$#LMFILES) {
if ($LM_STATE_MINIMIZATION) {
push(@feature_functions, "StateMinimizingLanguageModel -lm_order $LM_ORDER -lm_file $LMFILES[$i]");
} else {
push(@feature_functions, "LanguageModel -lm_type $LM_TYPE -lm_order $LM_ORDER -lm_file $LMFILES[$i]");
}
$weightstr .= "lm_$i 1 ";
$lm_index += 1;
}
if ($DO_BUILD_CLASS_LM) {
push(@feature_functions, "LanguageModel -lm_type kenlm -lm_order $CLASS_LM_ORDER -lm_file $RUNDIR/class_lm.gz -class_map $CLASS_MAP");
$weightstr .= "lm_$lm_index 1 ";
}
if ($DOING_LATTICES) {
push(@feature_functions, "SourcePath");
$weightstr .= "SourcePath 1.0 ";
}
if ($GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") {
push(@feature_functions, "Distortion");
push(@feature_functions, "PhrasePenalty");
$weightstr .= "Distortion 1.0 PhrasePenalty 1.0 ";
}
my $feature_functions = join(" ", map { "-feature-function \"$_\"" } @feature_functions);
# Build out the weight string
my $TM_OWNER = "pt";
my $GLUE_OWNER = "glue";
if (defined $TUNE_GRAMMAR) {
my @tm_features = get_features($TUNE_GRAMMAR);
foreach my $feature (@tm_features) {
# Only assign initial weights to dense features
$weightstr .= "tm_${TM_OWNER}_$feature 1 " if ($feature =~ /^\d+$/);
}
# Glue grammars are only needed for hierarchical models
if ($GRAMMAR_TYPE ne "phrase" and $GRAMMAR_TYPE ne "moses") {
# Glue grammar
$weightstr .= "tm_${GLUE_OWNER}_0 1 ";
}
}
my $tm_type = $GRAMMAR_TYPE;
if ($GRAMMAR_TYPE eq "moses") {
$tm_type = "moses";
}
sub get_file_from_grammar {
# Cachepipe doesn't work on directories, so we need to make sure we
# have a representative file to use to cache grammars. Returns undef if file not found
my ($grammar_file) = @_;
return undef unless defined $grammar_file and -e $grammar_file;
my $file = (-d $grammar_file) ? "$grammar_file/slice_00000.source" : $grammar_file;
return $file;
}
# The first tuning run is just a symlink to the tune/ directory (for backward compat.)
# Subsequent runs are under their run number
my $tunedir;
if ($OPTIMIZER_RUN == 1) {
$tunedir = "$RUNDIR/tune";
system("mkdir -p $tunedir") unless -d $tunedir;
symlink "$RUNDIR/tune", "$RUNDIR/tune/1";
} else {
$tunedir = "$RUNDIR/tune/$OPTIMIZER_RUN";
system("mkdir -p $tunedir") unless -d $tunedir;
}
system("mkdir -p $tunedir") unless -d $tunedir;
# Build the filtered tuning model
my $tunemodeldir = "$RUNDIR/tune/model";
# We build up this string with TMs to substitute in, if any are provided
my $tm_switch = "";
my $tm_copy_config_args = "";
if (defined $TUNE_GRAMMAR) {
$tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
$tm_switch .= " $TUNE_GRAMMAR";
$tm_copy_config_args = " -tm0/type $tm_type -tm0/owner ${TM_OWNER} -tm0/maxspan $MAXSPAN";
}
# Phrase-based models have no glue grammar; otherwise, add the glue grammar if one is defined
if ($GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") {
# if there is no glue grammar, remove it from the config template
$tm_copy_config_args .= " -tm1 DELETE";
} elsif (defined $GLUE_GRAMMAR_FILE) {
$tm_switch .= " --tm $GLUE_GRAMMAR_FILE";
$tm_copy_config_args .= " -tm1/owner ${GLUE_OWNER}";
}
# Now build the bundle
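# (run_bundler.py copies the Joshua config and models into a self-contained run directory with a
# run-joshua.sh wrapper; the --copy-config-options string applies edits to the copied
# joshua.config)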
if ($OPTIMIZER_RUN == 1) {
$cachepipe->cmd("tune-bundle",
"$BUNDLER --force --symlink --absolute --verbose -T $TMPDIR $JOSHUA_CONFIG $tunemodeldir --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false -search $SEARCH_ALGORITHM -weights \"$weightstr\" $feature_functions ${tm_copy_config_args}' ${tm_switch}",
$JOSHUA_CONFIG,
get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
"$tunemodeldir/run-joshua.sh");
}
# Update the tune grammar to its new location in the bundle
if (defined $TUNE_GRAMMAR) {
# Now update the tuning grammar to its new path
my $basename = basename($TUNE_GRAMMAR);
if (-e "tune/model/$basename") {
$TUNE_GRAMMAR = "tune/model/$basename";
} elsif (-e "tune/model/$basename.packed") {
$TUNE_GRAMMAR = "tune/model/$basename.packed";
} else {
print STDERR "* FATAL: tune model bundling didn't produce a grammar?\n";
exit 1;
}
}
# Copy the generated config to the tunedir, and update the config file location
system("cp $tunemodeldir/joshua.config $tunedir/joshua.config");
$JOSHUA_CONFIG = "$tunedir/joshua.config";
# Write the decoder run command. The decoder will use the config file in the bundled
# directory, continually updating it.
# If we're decoding a lattice, also output the source side path we chose
$JOSHUA_ARGS = "";
if ($DOING_LATTICES) {
$JOSHUA_ARGS .= " -maxlen 0 -lattice-decoding";
}
$JOSHUA_ARGS .= " -output-format \"%i ||| %s ||| %f ||| %c\"";
$JOSHUA_ARGS .= " $_JOSHUA_ARGS" if defined $_JOSHUA_ARGS;
open DEC_CMD, ">$tunedir/decoder_command";
print DEC_CMD "cat $TUNE{source} | $tunemodeldir/run-joshua.sh -m $JOSHUA_MEM -config $JOSHUA_CONFIG -threads $NUM_THREADS $JOSHUA_ARGS > $tunedir/output.nbest 2> $tunedir/joshua.log\n";
close(DEC_CMD);
chmod(0755,"$tunedir/decoder_command");
# tune
if ($TUNER ne "kbmira") {
$cachepipe->cmd("${TUNER}-${OPTIMIZER_RUN}",
"$SCRIPTDIR/training/run_tuner.py $TUNE{source} $TUNE{target} --tunedir $tunedir --tuner $TUNER --decoder $tunedir/decoder_command --decoder-config $JOSHUA_CONFIG --decoder-output-file $tunedir/output.nbest --decoder-log-file $tunedir/joshua.log --iterations $TUNER_ITERATIONS --metric '$METRIC'",
$TUNE{source},
$JOSHUA_CONFIG,
get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
"$tunedir/joshua.config.final");
} else { # Moses' batch kbmira
my $refs_path = $TUNE{target};
$refs_path .= "." if (get_numrefs($TUNE{target}) > 1);
my $extra_args = $JOSHUA_ARGS;
$extra_args =~ s/"/\\"/g;
$cachepipe->cmd("kbmira-${OPTIMIZER_RUN}",
"$SCRIPTDIR/training/mira/run-mira.pl --mertdir $MOSES/bin --rootdir $MOSES/scripts --batch-mira --working-dir $tunedir --maximum-iterations $TUNER_ITERATIONS --nbest $NBEST --no-filter-phrase-table --decoder-flags \"-m $JOSHUA_MEM -threads $NUM_THREADS -moses $extra_args\" $TUNE{source} $refs_path $tunemodeldir/run-joshua.sh $JOSHUA_CONFIG > $tunedir/mira.log 2>&1",
get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
$TUNE{source},
"$tunedir/joshua.config.final");
}
$JOSHUA_CONFIG = "$tunedir/joshua.config.final";
# Go to the next tuning run if tuning is the last step.
maybe_quit("TUNE");
#################################################################
## TESTING ######################################################
#################################################################
TEST:
;
# prepare the testing data
if (! $PREPPED{TEST} and $OPTIMIZER_RUN == 1) {
my $prefixes = prepare_data("test", [$TEST], $MAXLEN_TEST);
$TEST{source} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$SOURCE";
$TEST{target} = "$DATA_DIRS{test}/$prefixes->{lowercased}.$TARGET";
$PREPPED{TEST} = 1;
}
system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test};
# Define the test grammar, if it was provided
my $TEST_GRAMMAR = $_TEST_GRAMMAR_FILE || $GRAMMAR_FILE;
if ($DO_FILTER_TM and defined $GRAMMAR_FILE and ! $DOING_LATTICES and ! defined $_TEST_GRAMMAR_FILE) {
# On the first test run, we take some pains to prepare and pack the model, which won't have
# to be done for subsequent runs
if ($OPTIMIZER_RUN == 1 and ! is_packed($TEST_GRAMMAR)) {
$TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";
$cachepipe->cmd("filter-test",
"$SCRIPTDIR/support/filter_grammar.sh -g $GRAMMAR_FILE $FILTERING -v $TEST{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TEST_GRAMMAR",
$GRAMMAR_FILE,
$TEST{source},
"$DATA_DIRS{test}/grammar.filtered.gz");
}
}
# Create the glue grammar
if ($OPTIMIZER_RUN == 1 and defined $TEST_GRAMMAR and $GRAMMAR_TYPE ne "phrase" and $GRAMMAR_TYPE ne "moses") {
if (! defined $GLUE_GRAMMAR_FILE) {
$cachepipe->cmd("glue-test",
"$JOSHUA/scripts/support/create_glue_grammar.sh $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
get_file_from_grammar($TEST_GRAMMAR),
"$DATA_DIRS{test}/grammar.glue");
$GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
} else {
# just create a symlink to it
my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE);
if ($GLUE_GRAMMAR_FILE =~ /^\//) {
system("ln -sf $GLUE_GRAMMAR_FILE $filename");
} else {
system("ln -sf $STARTDIR/$GLUE_GRAMMAR_FILE $filename");
}
}
}
# Create the test directory
my $testdir;
if ($OPTIMIZER_RUN == 1) {
$testdir = "$RUNDIR/test";
system("mkdir -p $testdir") unless -d $testdir;
symlink("$RUNDIR/test", "$RUNDIR/test/1");
} else {
$testdir = "$RUNDIR/test/$OPTIMIZER_RUN";
system("mkdir -p $testdir") unless -d $testdir;
}
$tm_switch = "";
if (defined $TEST_GRAMMAR) {
$tm_copy_config_args = "";
$tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
$tm_switch .= " $TEST_GRAMMAR";
}
# Add in the glue grammar
if (defined $GLUE_GRAMMAR_FILE) {
$tm_switch .= " --tm $GLUE_GRAMMAR_FILE";
}
# Build the test model
my $testmodeldir = "$RUNDIR/test/$OPTIMIZER_RUN/model";
$cachepipe->cmd("test-bundle-${OPTIMIZER_RUN}",
"$BUNDLER --force --symlink --absolute --verbose -T $TMPDIR $JOSHUA_CONFIG $testmodeldir --copy-config-options '-top-n $NBEST -pop-limit 5000 -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false' ${tm_switch}",
$JOSHUA_CONFIG,
get_file_from_grammar($TEST_GRAMMAR) || $JOSHUA_CONFIG,
"$testmodeldir/joshua.config");
if (defined $TEST_GRAMMAR) {
# Update the test grammar (if defined) to its new path
my $basename = basename($TEST_GRAMMAR);
if (-e "$testmodeldir/$basename") {
$TEST_GRAMMAR = "$testmodeldir/$basename";
} elsif (-e "$testmodeldir/$basename.packed") {
$TEST_GRAMMAR = "$testmodeldir/$basename.packed";
} else {
print STDERR "* FATAL: test model bundling didn't produce a grammar?";
exit 1;
}
}
my $bestoutput = "$testdir/output";
my $nbestoutput = "$testdir/output.nbest";
my $output;
# If we're decoding a lattice, also output the source side path we chose
$JOSHUA_ARGS = "";
if ($DOING_LATTICES) {
$JOSHUA_ARGS .= " -maxlen 0 -lattice-decoding -output-format \"%i ||| %s ||| %e ||| %f ||| %c\"";
}
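# MBR (minimum Bayes risk) reranking needs the full n-best list: it picks the hypothesis
# with the lowest expected loss across the list instead of the single model-best string.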
if ($DO_MBR) {
$JOSHUA_ARGS .= " -top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\"";
$output = $nbestoutput;
} else {
$JOSHUA_ARGS .= " -top-n 0 -output-format %s";
$output = $bestoutput;
}
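# (Joshua output-format codes used here: %i = sentence number, %s = translation,
# %e = source side, %f = feature values, %c = model cost.)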
$JOSHUA_ARGS .= " $_JOSHUA_ARGS" if defined $_JOSHUA_ARGS;
# Write the decoder run command
open DEC_CMD, ">$testdir/decoder_command";
print DEC_CMD "cat $TEST{source} | $testmodeldir/run-joshua.sh -m $JOSHUA_MEM -threads $NUM_THREADS $JOSHUA_ARGS > $output 2> $testdir/joshua.log\n";
close(DEC_CMD);
chmod(0755,"$testdir/decoder_command");
# Decode. $output here is either $nbestoutput (if doing MBR decoding, in which case we'll
# need the n-best output) or $bestoutput (which only outputs the hypothesis but is tons faster)
$cachepipe->cmd("test-decode-${OPTIMIZER_RUN}",
"$testdir/decoder_command",
$TEST{source},
"$testdir/decoder_command",
"$testmodeldir/joshua.config",
get_file_from_grammar($TEST_GRAMMAR) || "$testmodeldir/joshua.config",
$output);
# $cachepipe->cmd("remove-oov",
# "cat $testoutput | perl -pe 's/_OOV//g' > $testoutput.noOOV",
# $testoutput,
# "$testoutput.noOOV");
# Extract the 1-best output from the n-best file if the n-best file alone was output
if ($DO_MBR) {
$cachepipe->cmd("test-extract-onebest-${OPTIMIZER_RUN}",
"java -Xmx500m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.util.ExtractTopCand $nbestoutput $bestoutput",
$nbestoutput,
$bestoutput);
}
# Now compute the BLEU score on the 1-best output
$cachepipe->cmd("test-bleu-${OPTIMIZER_RUN}",
"$JOSHUA/bin/bleu $output $TEST{target} > $testdir/bleu",
$bestoutput,
"$testdir/bleu");
# Update the BLEU summary.
# Sometimes the target side for test doesn't exist (e.g., WMT)
if (-e $TEST{target} || -e "$TEST{target}.0") {
compute_bleu_summary("test/*/bleu", "test/final-bleu");
if (defined $METEOR) {
$cachepipe->cmd("test-meteor-${OPTIMIZER_RUN}",
"$JOSHUA/bin/meteor $output $TEST{target} $TARGET > $testdir/meteor",
$bestoutput,
"$testdir/meteor");
compute_meteor_summary("test/*/meteor", "test/final-meteor");
}
}
if ($DO_MBR) {
my $mbr_output = "$testdir/output.mbr";
$cachepipe->cmd("test-onebest-parmbr-${OPTIMIZER_RUN}",
"cat $nbestoutput | java -Xmx1700m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 $NUM_THREADS > $mbr_output",
$nbestoutput,
$mbr_output);
if (-e $TEST{target}) {
$cachepipe->cmd("test-bleu-mbr-${OPTIMIZER_RUN}",
"$JOSHUA/bin/bleu output $TEST{target} $numrefs > $testdir/bleu.mbr",
$mbr_output,
"$testdir/bleu.mbr");
compute_bleu_summary("test/*/bleu.mbr", "test/final-bleu-mbr");
}
}
compute_time_summary("test/*/joshua.log", "test/final-times");
# Now do the analysis
if ($DOING_LATTICES) {
# extract the source
my $source = "$testdir/test.lattice-path.txt";
$cachepipe->cmd("test-lattice-extract-source-${OPTIMIZER_RUN}",
"$JOSHUA/bin/extract-1best $nbestoutput 2 | perl -pe 's/<s> //' > $source",
$nbestoutput, $source);
analyze_testrun($bestoutput,$source,$TEST{target});
} else {
analyze_testrun($bestoutput,$TEST{source},$TEST{target});
}
######################################################################
## SUBROUTINES #######################################################
######################################################################
LAST:
1;
# Does tokenization and normalization of training, tuning, and test data.
# $label: one of train, tune, or test
# $corpora: arrayref of files (multiple allowed for training data)
# $maxlen: maximum length (only applicable to training)
sub prepare_data {
my ($label,$corpora,$maxlen) = @_;
$maxlen = 0 unless defined $maxlen;
system("mkdir -p $DATA_DIR") unless -d $DATA_DIR;
system("mkdir -p $DATA_DIRS{$label}") unless -d $DATA_DIRS{$label};
# records the pieces that are produced
my %prefixes;
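# (keys: input, tokenized, shortened, lowercased, and last_step, each mapping to a file prefix)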
# copy the data from its original location to our location
# Build the list of extensions. For training data, there may be multiple corpora; for
# tuning and test data, there may be multiple references.
my @exts = ($SOURCE);
my $target_corpus = "$corpora->[0].$TARGET";
push(@exts, $TARGET) if -e $target_corpus;
for (my $i = 0; ; $i++) {
my $file = "$target_corpus.$i";
if (-e $file) {
push(@exts, "$TARGET.$i");
} else {
last;
}
}
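# For example (hypothetical layout): with $SOURCE=fr, $TARGET=en, and files tune.en.0 and
# tune.en.1 on disk but no tune.en, @exts ends up as ("fr", "en.0", "en.1").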
# Read through all input files, concatenate them (if multiple were passed), and filter them
# First, assemble the file handles
my (@infiles, @indeps, @outfiles);
foreach my $ext (@exts) {
my @files = map { "$_.$ext" } @$corpora;
push(@indeps, @files);
if ($MAXLINES != 0) {
push(@infiles, "<(head -qn $MAXLINES " . join(" ", @files) . ")");
} else {
push(@infiles, "<(cat " . join(" ", @files) . ")");
}
push (@outfiles, "$DATA_DIRS{$label}/$label.$ext");
}
my $infiles = join(" ", @infiles);
my $outfiles = join(" ", @outfiles);
# filter blank lines from training and tuning data; test data is left as-is so its line count matches the references
if ($label ne "test") {
$cachepipe->cmd("$label-copy-and-filter",
"$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
@indeps, @outfiles);
} else {
$cachepipe->cmd("$label-copy-and-filter",
"$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
@indeps, @outfiles);
}
# Done concatenating and filtering files
# record where the concatenated input files were
$prefixes{last_step} = $prefixes{input} = "$DATA_DIRS{$label}/$label";
if ($DO_PREPARE_CORPORA) {
my $prefix = $label;
# tokenize the data
foreach my $lang (@exts) {
if (-e "$DATA_DIRS{$label}/$prefix.$lang") {
if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang")) {
system("cp $DATA_DIRS{$label}/$prefix.$lang $DATA_DIRS{$label}/$prefix.tok.$lang");
} else {
my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET;
my $ext = $lang; $ext =~ s/\.\d+$//;  # strip any trailing reference index (e.g., "en.2" -> "en")
$cachepipe->cmd("$label-tokenize-$lang",
"$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null > $DATA_DIRS{$label}/$prefix.tok.$lang",
"$DATA_DIRS{$label}/$prefix.$lang", "$DATA_DIRS{$label}/$prefix.tok.$lang");
}
}
}
# extend the prefix
$prefix .= ".tok";
$prefixes{tokenized} = $prefix;
if ($maxlen > 0) {
my (@infiles, @outfiles);
foreach my $ext (@exts) {
my $infile = "$DATA_DIRS{$label}/$prefix.$ext";
my $outfile = "$DATA_DIRS{$label}/$prefix.$maxlen.$ext";
if (-e $infile) {
push(@infiles, $infile);
push(@outfiles, $outfile);
}
}
my $infilelist = join(" ", @infiles);
my $outfilelist = join(" ", @outfiles);
# trim training data
$cachepipe->cmd("$label-trim",
"$PASTE $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/support/split2files $outfilelist",
@infiles,
@outfiles);
$prefix .= ".$maxlen";
}
# record this whether we shortened or not
$prefixes{shortened} = $prefix;
# lowercase
foreach my $lang (@exts) {
if (-e "$DATA_DIRS{$label}/$prefix.$lang") {
if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang")) {
system("cat $DATA_DIRS{$label}/$prefix.$lang > $DATA_DIRS{$label}/$prefix.lc.$lang");
} else {
$cachepipe->cmd("$label-lowercase-$lang",
"cat $DATA_DIRS{$label}/$prefix.$lang | $LOWERCASER > $DATA_DIRS{$label}/$prefix.lc.$lang",
"$DATA_DIRS{$label}/$prefix.$lang",
"$DATA_DIRS{$label}/$prefix.lc.$lang");
}
}
}
$prefix .= ".lc";
$prefixes{last_step} = $prefixes{lowercased} = $prefix;
}
foreach my $lang (@exts) {
system("ln -sf $prefixes{last_step}.$lang $DATA_DIRS{$label}/corpus.$lang");
}
# Build a vocabulary
foreach my $ext (@exts) {
$cachepipe->cmd("$label-vocab-$ext",
"cat $DATA_DIRS{$label}/corpus.$ext | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{$label}/vocab.$ext",
"$DATA_DIRS{$label}/corpus.$ext",
"$DATA_DIRS{$label}/vocab.$ext");
}
return \%prefixes;
}
sub maybe_quit {
my ($current_step) = @_;
if (defined $LAST_STEP and $current_step eq $LAST_STEP) {
print "* Quitting at this step\n";
exit(0);
}
}
## returns 1 if every sentence in the corpus begins with an open paren,
## false otherwise
sub already_parsed {
my ($corpus) = @_;
open(CORPUS, $corpus) or die "can't read corpus file '$corpus'\n";
while (<CORPUS>) {
# if we see a line not beginning with an open paren, we consider
# the file not to be parsed
return 0 unless /^\(/;
}
close(CORPUS);
return 1;
}
sub not_defined {
my ($var) = @_;
print "* FATAL: environment variable \$$var is not defined.\n";
exit;
}
# Takes a reference prefix. If numbered files ("$prefix.0", "$prefix.1", ...) exist,
# the references are assumed to be parallel files and we count them; otherwise all
# references are assumed to live in the prefix file itself.
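# e.g., get_numrefs("tune.en") returns 3 if tune.en.0 through tune.en.2 exist,
# and 1 if the references are all in tune.en.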
sub get_numrefs {
my ($prefix) = @_;
if (-e "$prefix.0") {
my $index = 0;
while (-e "$prefix.$index") {
$index++;
}
return $index;
} else {
return 1;
}
}
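# Heuristically detects lattice input: serialized lattices (PLF format) begin with "(((",
# so we check the first line. As a side effect this enables lattice decoding and switches
# grammar filtering into lattice mode (-l).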
sub is_lattice {
my $file = shift;
open READ, "$CAT $file|" or die "can't read from potential lattice '$file'";
my $line = <READ>;
close(READ);
if ($line =~ /^\(\(\(/) {
$DOING_LATTICES = 1;
$FILTERING = "-l";
return 1;
} else {
return 0;
}
}
# Set membership: is value in array?
sub in {
my ($value, $array) = @_;
return grep { $_ eq $value } @$array;  # string equality, so regex metacharacters in $value are safe
}
# This function retrieves the names of all the features in the grammar. Dense features
# are named with consecutive integers starting at 0, while sparse features can have any name.
# To get the feature names from an unpacked grammar, we have to read through the whole grammar,
# since sparse features can be anywhere. For packed grammars, this can be read directly from
# the encoding.
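# For example, a (hypothetical) unpacked grammar line like
#   [X] ||| le chat ||| the cat ||| 0.5 1.2 Rarity=1
# contributes the dense features "0" and "1" plus the sparse feature "Rarity".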
sub get_features {
my ($grammar) = @_;
if (-d $grammar) {
chomp(my @features = `java -cp $JOSHUA/target/joshua-*-with-dependencies.jar org.apache.joshua.util.encoding.EncoderConfiguration $grammar | grep ^feature: | awk '{print \$NF}'`);
return @features;
} elsif (-e $grammar) {
my %features;
open GRAMMAR, "$CAT $grammar|" or die "FATAL: can't read $grammar";
while (my $line = <GRAMMAR>) {
chomp($line);
my @tokens = split(/ \|\|\| /, $line);
# field 4 for regular grammars, field 3 for phrase tables
my $feature_str = ($line =~ /^\[/) ? $tokens[3] : $tokens[2];
my @features = split(' ', $feature_str);
my $feature_no = 0;
foreach my $feature (@features) {
if ($feature =~ /=/) {
my ($name) = split(/=/, $feature);
$features{$name} = 1;
} else {
$features{$feature_no++} = 1;
}
}
}
close(GRAMMAR);
return keys(%features);
}
}
# File names reflecting relative paths need to be absolute-ized for --rundir to work.
# Does not work with paths that do not exist!
sub get_absolute_path {
my ($file,$basedir) = @_;
$basedir = $STARTDIR unless defined $basedir;
if (defined $file) {
$file = "$basedir/$file" unless $file =~ /^\//;
# prepend startdir (which is absolute) unless the path is absolute.
my $abs_path = abs_path($file);
if (defined $abs_path) {
$file = $abs_path;
}
}
return $file;
}
sub analyze_testrun {
my ($output,$source,$reference) = @_;
my $dir = dirname($output);
if (-e $reference || -e "$reference.0") {
mkdir("$dir/analysis") unless -d "$dir/analysis";
my @references;
if (-e "$reference.0") {
my $num = 0;
while (-e "$reference.$num") {
push(@references, "$reference.$num");
$num++;
}
} else {
push(@references, $reference);
}
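# multiple references are passed as repeated flags: -r ref.0 -r ref.1 ...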
my $references = join(" -r ", @references);
$cachepipe->cmd("analyze-test-${OPTIMIZER_RUN}",
"$SCRIPTDIR/analysis/sentence-by-sentence.pl -s $source -r $references $output > $dir/analysis/sentence-by-sentence.html",
$output,
"$dir/analysis/sentence-by-sentence.html");
}
}
sub compute_meteor_summary {
my ($filepattern, $outputfile) = @_;
# Average the runs, report result
my @scores;
my $numrecs = 0;
open CMD, "grep '^Final score' $filepattern |";
my @F = split(' ', <CMD>);
close(CMD);
push(@scores, 1.0 * $F[-1]);
if (scalar @scores) {
my $final_score = sum(@scores) / (scalar @scores);
open SUMMARY, ">$outputfile" or die "Can't write to $outputfile";
printf(SUMMARY "%s / %d = %.4f\n", join(" + ", @scores), scalar @scores, $final_score);
close(SUMMARY);
}
}
sub compute_bleu_summary {
my ($filepattern, $outputfile) = @_;
# Now average the runs, report BLEU
my @bleus;
my $numrecs = 0;
open CMD, "grep ' BLEU = ' $filepattern |";
while (<CMD>) {
my @F = split;
push(@bleus, 1.0 * $F[-1]);
}
close(CMD);
if (scalar @bleus) {
my $final_bleu = sum(@bleus) / (scalar @bleus);
open BLEU, ">$outputfile" or die "Can't write to $outputfile";
printf(BLEU "%s / %d = %.4f\n", join(" + ", @bleus), scalar @bleus, $final_bleu);
close(BLEU);
}
}
sub compute_time_summary {
my ($filepattern, $outputfile) = @_;
# Now average the per-run decoding times and report the mean
my @times;
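# Each matching joshua.log line looks like, e.g., "Input 42: Translation took 1.23 seconds";
# field 4 (here "1.23") is the per-sentence time accumulated below.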
foreach my $file (glob($filepattern)) {
open FILE, $file;
my $time = 0.0;
my $numrecs = 0;
while (<FILE>) {
next unless /^Input \d+: Translation took/;
my @F = split;
$time += $F[4];
$numrecs++;
}
close(FILE);
push(@times, $time);
}
if (scalar @times) {
open TIMES, ">$outputfile" or die "Can't write to $outputfile";
printf(TIMES "%s / %d = %s\n", join(" + ", @times), scalar(@times), 1.0 * sum(@times) / scalar(@times));
close(TIMES);
}
}
sub is_packed {
my ($grammar) = @_;
if (-d $grammar && -e "$grammar/encoding") {
return 1;
}
return 0;
}
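# Runs an externally supplied named-entity tagger over a file. The tagger is expected to be
# an executable invoked as: $NER_TAGGER <inputfile> <outputfile> <lang>.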
sub ner_annotate {
my ($inputfile, $outputfile, $lang) = @_;
if (defined $NER_TAGGER) {
# Check if NER tagger exists
if (! -e $NER_TAGGER) {
print "* FATAL: The specified NER tagger was not found";
exit(1);
}
$cachepipe->cmd("ner-annotate", "$NER_TAGGER $inputfile $outputfile $lang");
# Check if annotated file exists
if (! -e "$outputfile") {
print "* FATAL : The NER tagger did not create the required annotated file : $outputfile";
exit(1);
}
return 2;
}
return 0;
}
sub replace_tokens_with_types {
# Replace annotated tokens with their bare types, e.g., "$PERSON_(John Smith)" -> "PERSON".
# q{} keeps Perl from swallowing the backslashes before they reach sed (GNU sed assumed for -i/-r).
my ($inputfile) = @_;
system(q{sed -i -r 's:\$([A-Za-z0-9]+)_\([^)]+\):\1:g' } . $inputfile);
}