#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script implements the Joshua pipeline.  It can run a complete
# pipeline --- from raw training corpora to bleu scores on a test set
# --- and it allows jumping into arbitrary points of the pipeline. 

# Root of the Joshua installation, taken from the JOSHUA environment variable.
my $JOSHUA;

# Runs at compile time so that the Joshua library directories are already on @INC when the
# `use CachePipe` statement further down is compiled.  Both JOSHUA and JAVA_HOME must be set to
# non-empty values; otherwise the script explains what is missing and stops.
BEGIN {
  my $missing_joshua    = (! exists $ENV{JOSHUA}    || $ENV{JOSHUA} eq "");
  my $missing_java_home = (! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "");

  if ($missing_joshua || $missing_java_home) {
    print "Several environment variables must be set before running the pipeline.  Please set:\n";
    print "* \$JOSHUA to the root of the Joshua source code.\n"
        if $missing_joshua;
    print "* \$JAVA_HOME to the directory of your local java installation. \n"
        if $missing_java_home;

    # A missing prerequisite is a failure, so signal it to the caller with a non-zero status.
    exit 1;
  }

  $JOSHUA = $ENV{JOSHUA};
  unshift(@INC, "$JOSHUA/scripts/training/cachepipe");
  unshift(@INC, "$JOSHUA/lib");
}

use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Cwd qw[abs_path getcwd];
use POSIX qw[ceil];
use List::Util qw[max min sum];
use File::Temp qw[:mktemp tempdir];
use CachePipe;

# There are some Perl 5.10 Unicode bugs that cause problems, mostly in sub-scripts
use v5.12;
# use Thread::Pool;

# Sanitize the inherited environment before any sub-process is launched.
#
# Hadoop changes directories with a shell trick that breaks when CDPATH contains "." (per Lane
# Schwartz), because the shell then prints the directory name.  Removing CDPATH avoids that.
delete $ENV{CDPATH};

# NOTE(review): GREP_OPTIONS is presumably removed so that user-level grep defaults cannot change
# the behavior of grep calls made by sub-scripts -- confirm.
delete $ENV{GREP_OPTIONS};

if (! exists $ENV{JAVA_HOME}) {
  die not_defined("JAVA_HOME");
}

# Optional external toolkits, located through the environment (undef when the variable is unset).
my $MOSES  = $ENV{MOSES};
my $METEOR = $ENV{METEOR};

# Location of Thrax (--thrax).
my $THRAX = $JOSHUA . "/thrax";

# Inputs that have no default and are filled in from the command line.
my (@CORPORA, @TUNE, @LMFILES);
my ($TEST, $ALIGNMENT, $SOURCE, $TARGET);
my ($GRAMMAR_FILE, $GLUE_GRAMMAR_FILE, $_TUNE_GRAMMAR_FILE, $_TEST_GRAMMAR_FILE);
my ($THRAX_CONF_FILE, $_JOSHUA_CONFIG, $_JOSHUA_ARGS);

# First and last pipeline steps to run (--first-step, --last-step).
my $FIRST_STEP = "SUBSAMPLE";
my $LAST_STEP  = "LAST";

my $LMFILTER = $ENV{HOME} . "/code/filter/filter";

# Maximum length of training sentences (--maxlen); the threshold is applied to both sides.
my $MAXLEN = 50;

# Maximum span that rules in the main grammar can be applied to (--maxspan).
my $MAXSPAN = 20;

# Maximum length of tuning and testing sentences (--maxlen-tune and --maxlen-test).
# NOTE(review): 0 presumably means "no limit" -- confirm against prepare_data.
my $MAXLEN_TUNE = 0;
my $MAXLEN_TEST = 0;

# Maximum number of lines taken from any single corpus (--maxlines).
my $MAXLINES = 0;

# For phrase-based decoding: maximum length of a phrase on the source side (--maxlen-phrase).
my $MAX_PHRASE_LEN = 5;

# Feature switches (--filter-tm, --subsample, --pack).
my $DO_FILTER_TM     = 0;
my $DO_SUBSAMPLE     = 0;
my $DO_PACK_GRAMMARS = 1;

# Helper scripts shipped with Joshua.
my $SCRIPTDIR        = "$JOSHUA/scripts";
my $TOKENIZER_SOURCE = "$SCRIPTDIR/preparation/tokenize.pl";
my $TOKENIZER_TARGET = "$SCRIPTDIR/preparation/tokenize.pl";
my $NORMALIZER       = "$SCRIPTDIR/preparation/normalize.pl";
my $LOWERCASER       = "$SCRIPTDIR/preparation/lowercase.pl";
my $GIZA_TRAINER     = "$SCRIPTDIR/training/run-giza.pl";
my $TUNECONFDIR      = "$SCRIPTDIR/training/templates/tune";
my $SRILM            = ($ENV{SRILM} || "") . "/bin/i686-m64/ngram-count";
my $COPY_CONFIG      = "$SCRIPTDIR/copy-config.pl";
my $BUNDLER          = "$SCRIPTDIR/support/run_bundler.py";

# The directory the script was started from; the run directory defaults to it (--rundir).
my $STARTDIR = getcwd();
my $RUNDIR   = $STARTDIR;

# Grammar type (--type); it has no default and is validated after option parsing.
my $GRAMMAR_TYPE;

# Search algorithm: "cky", or "stack" for phrase-based decoding.
my $SEARCH_ALGORITHM = "cky";

# Which GHKM extractor to use ("galley" or "moses").
my $GHKM_EXTRACTOR  = "moses";
my $EXTRACT_OPTIONS = "";

# Set with --witten-bell.
my $WITTEN_BELL = 0;

# Free-text description of the run (--readme); written to a README file in the run directory.
my $README;

# gzip-aware cat
my $CAT = "$SCRIPTDIR/training/scat";

# custom version of paste that dies on unequal file lengths
my $PASTE = "$SCRIPTDIR/training/paste";

# Directory (relative to the run directory) where processed data files are stored.
my $DATA_DIR = 'data';

# Whether to do MBR decoding on the n-best list (for test data).
my $DO_MBR = 0;

# Which aligner to use: "giza", "berkeley", or "jacana".
my $ALIGNER      = 'giza';
my $ALIGNER_CONF = "$SCRIPTDIR/training/templates/alignment/word-align.conf";

# Filter rules to the following maximum scope (Hopkins & Langmead, 2011).
my $SCOPE = 3;

# What kind of filtering to use ("fast", "exact", or "loose").
my $FILTERING = 'fast';

# Memory made available to Joshua.  SAMT decoding needs a lot more than this (though it really
# depends mostly on the grammar size).
my $JOSHUA_MEM = '4g';

# Memory available for hadoop processes (passed to Hadoop via -Dmapred.child.java.opts).
my $HADOOP_MEM = '4g';

# Memory available to the parser.
my $PARSER_MEM = '2g';

# Memory available for building the language model.
my $BUILDLM_MEM = '8G';

# Memory available for packing the grammar.
my $PACKER_MEM = '8g';

# Memory available for MERT/PRO.
my $TUNER_MEM = '8g';

# Arguments passed to qsub when it is called for decoding.
my $QSUB_ARGS = '';

# Arguments passed to qsub when it is called for aligning.
my $QSUB_ALIGN_ARGS = '-l h_rt=168:00:00,h_vmem=15g,mem_free=10g,num_proc=1';

# Amount of memory for the Berkeley aligner.
my $ALIGNER_MEM = '10g';

# Align corpus files a million lines at a time.
my $ALIGNER_BLOCKSIZE = 1_000_000;

# Number of machines to decode on.  Values above 1 require qsub to be configured for your
# environment.
my $NUM_JOBS = 1;

# Number of threads to use at different points in the pipeline (giza, decoding).
my $NUM_THREADS = 1;

# Which LM implementation to decode with ("kenlm" or "berkeleylm").
my $LM_TYPE = 'kenlm';

# n-gram order
my $LM_ORDER = 5;

# Whether to build and include an LM from the target side of the corpus when manually-specified
# LM files are passed with --lmfile.
my $DO_BUILD_LM_FROM_CORPUS = 1;

# Class-based LM settings (--class-lm, --class-lm-corpus, --class-map, --class-lm-order).
my $DO_BUILD_CLASS_LM = 0;
my $CLASS_LM_CORPUS;
my $CLASS_MAP;
my $CLASS_LM_ORDER = 9;

# Whether to tokenize and lowercase training, tuning, and test data.
my $DO_PREPARE_CORPORA = 1;

# Compute the nth optimizer run.
my $OPTIMIZER_RUN = 1;

# What to use to create language models ("kenlm", "berkeleylm", or "srilm").
my $LM_GEN     = 'kenlm';
my $LM_OPTIONS = '';

# Pipeline steps in execution order.  %STEPS maps each step name to its 1-based position so that
# steps can be compared numerically.
my @STEPS = qw(FIRST SUBSAMPLE ALIGN PARSE THRAX MODEL GRAMMAR PHRASE TUNE MERT PRO TEST LAST);
my %STEPS;
@STEPS{@STEPS} = (1 .. scalar(@STEPS));

# Method used for merging alignments (see Koehn et al., 2003).  Options are union and
# {intersect, grow, srctotgt, tgttosrc}-{diag,final,final-and,diag-final,diag-final-and}.
my $GIZA_MERGE = 'grow-diag-final';

# Whether to merge all the --lmfile LMs into a single LM using weights based on the development
# corpus.
my $MERGE_LMS = 0;

# Supported tuners, and the one used by default.
my @TUNERS = qw(mert pro mira adagrad kbmira);
my $TUNER  = 'mert';

# The metric to tune towards.
my $METRIC = 'BLEU 4 closest';

# Number of tuner iterations to run.
my $TUNER_ITERATIONS = 10;

# Location of an already-parsed corpus.
my $PARSED_CORPUS;

# Location of the NER tagger wrapper script used for annotation.
my $NER_TAGGER;

# Lets the user choose a temp dir for various tasks.
my $TMPDIR = $ENV{TMP} || '/tmp';

# LM state minimization; switched off later unless the LM type is kenlm.
my $LM_STATE_MINIMIZATION = 1;

# Size of the n-best list (--nbest).
my $NBEST = 300;

# Set with --reordering-limit and --num-translation-options.
my $REORDERING_LIMIT        = 6;
my $NUM_TRANSLATION_OPTIONS = 20;

# Parse the command line.  Options ending in "!" are negatable switches (--foo / --no-foo), "=s"
# options take a string, and "=i" options take an integer.  Options bound to an array reference
# may be given more than once.  --grammar and --model are aliases for the same setting.
#
# --aligner-chunk-size and --class-lm-order are numeric and are therefore declared as integers,
# so that malformed values are rejected here instead of failing later in the pipeline.
my $retval = GetOptions(
  "readme=s"                  => \$README,
  "corpus=s"                  => \@CORPORA,
  "parsed-corpus=s"           => \$PARSED_CORPUS,
  "tune=s"                    => \@TUNE,
  "test=s"                    => \$TEST,
  "prepare!"                  => \$DO_PREPARE_CORPORA,
  "aligner=s"                 => \$ALIGNER,
  "alignment=s"               => \$ALIGNMENT,
  "aligner-mem=s"             => \$ALIGNER_MEM,
  "aligner-conf=s"            => \$ALIGNER_CONF,
  "giza-merge=s"              => \$GIZA_MERGE,
  "source=s"                  => \$SOURCE,
  "target=s"                  => \$TARGET,
  "rundir=s"                  => \$RUNDIR,
  "filter-tm!"                => \$DO_FILTER_TM,
  "scope=i"                   => \$SCOPE,
  "filtering=s"               => \$FILTERING,
  "lm=s"                      => \$LM_TYPE,
  "lmfile=s"                  => \@LMFILES,
  "merge-lms!"                => \$MERGE_LMS,
  "lm-gen=s"                  => \$LM_GEN,
  "lm-gen-options=s"          => \$LM_OPTIONS,
  "lm-order=i"                => \$LM_ORDER,
  "corpus-lm!"                => \$DO_BUILD_LM_FROM_CORPUS,
  "witten-bell!"              => \$WITTEN_BELL,
  "tune-grammar=s"            => \$_TUNE_GRAMMAR_FILE,
  "test-grammar=s"            => \$_TEST_GRAMMAR_FILE,
  "grammar=s"                 => \$GRAMMAR_FILE,
  "model=s"                   => \$GRAMMAR_FILE,
  "maxspan=i"                 => \$MAXSPAN,
  "mbr!"                      => \$DO_MBR,
  "type=s"                    => \$GRAMMAR_TYPE,
  "ghkm-extractor=s"          => \$GHKM_EXTRACTOR,
  "extract-options=s"         => \$EXTRACT_OPTIONS,
  "maxlen=i"                  => \$MAXLEN,
  "maxlen-tune=i"             => \$MAXLEN_TUNE,
  "maxlen-test=i"             => \$MAXLEN_TEST,
  "maxlines=i"                => \$MAXLINES,
  "maxlen-phrase=i"           => \$MAX_PHRASE_LEN,
  "tokenizer-source=s"        => \$TOKENIZER_SOURCE,
  "tokenizer-target=s"        => \$TOKENIZER_TARGET,
  "normalizer=s"              => \$NORMALIZER,
  "lowercaser=s"              => \$LOWERCASER,
  "joshua-config=s"           => \$_JOSHUA_CONFIG,
  "joshua-args=s"             => \$_JOSHUA_ARGS,
  "joshua-mem=s"              => \$JOSHUA_MEM,
  "hadoop-mem=s"              => \$HADOOP_MEM,
  "parser-mem=s"              => \$PARSER_MEM,
  "buildlm-mem=s"             => \$BUILDLM_MEM,
  "packer-mem=s"              => \$PACKER_MEM,
  "pack!"                     => \$DO_PACK_GRAMMARS,
  "tuner=s"                   => \$TUNER,
  "tuner-mem=s"               => \$TUNER_MEM,
  "tuner-iterations=i"        => \$TUNER_ITERATIONS,
  "tuner-metric=s"            => \$METRIC,
  "thrax=s"                   => \$THRAX,
  "thrax-conf=s"              => \$THRAX_CONF_FILE,
  "jobs=i"                    => \$NUM_JOBS,
  "threads=i"                 => \$NUM_THREADS,
  "subsample!"                => \$DO_SUBSAMPLE,
  "qsub-args=s"               => \$QSUB_ARGS,
  "qsub-align-args=s"         => \$QSUB_ALIGN_ARGS,
  "first-step=s"              => \$FIRST_STEP,
  "last-step=s"               => \$LAST_STEP,
  "aligner-chunk-size=i"      => \$ALIGNER_BLOCKSIZE,
  "tmp=s"                     => \$TMPDIR,
  "nbest=i"                   => \$NBEST,
  "reordering-limit=i"        => \$REORDERING_LIMIT,
  "num-translation-options=i" => \$NUM_TRANSLATION_OPTIONS,
  "ner-tagger=s"              => \$NER_TAGGER,
  "class-lm!"                 => \$DO_BUILD_CLASS_LM,
  "class-lm-corpus=s"         => \$CLASS_LM_CORPUS,
  "class-map=s"               => \$CLASS_MAP,
  "class-lm-order=i"          => \$CLASS_LM_ORDER,
  "optimizer-run=i"           => \$OPTIMIZER_RUN,
);

if (! $retval) {
  print "Invalid usage, quitting\n";
  exit 1;
}

# The corpus is later split into chunks of this many lines; the chunk count is computed by
# dividing by it, so a zero or negative value cannot work.
if ($ALIGNER_BLOCKSIZE < 1) {
  print "* FATAL: --aligner-chunk-size must be a positive integer\n";
  exit 1;
}

# Normalize a few option values now that the command line has been parsed.
$RUNDIR = get_absolute_path($RUNDIR);

$TUNER = lc $TUNER;

my $DOING_LATTICES = 0;

# Extra arguments for the decoder (--joshua-args); empty when none were given.
my $JOSHUA_ARGS = $_JOSHUA_ARGS // "";

# Locations of the prepared training, tuning and test data inside the run directory.
my %DATA_DIRS = (
  train => get_absolute_path("$RUNDIR/$DATA_DIR/train"),
  tune  => get_absolute_path("$RUNDIR/$DATA_DIR/tune"),
  test  => get_absolute_path("$RUNDIR/$DATA_DIR/test"),
);

if (! -x $NORMALIZER) {
  print "* FATAL: couldn't find normalizer '$NORMALIZER'\n";
  exit 1;
}

# Absolutize paths
$ALIGNER_CONF = get_absolute_path($ALIGNER_CONF);
$ALIGNMENT = get_absolute_path($ALIGNMENT);

# capitalize these to offset a common error:
$FIRST_STEP = uc($FIRST_STEP);
$LAST_STEP  = uc($LAST_STEP);

# Unbuffer STDOUT so that progress messages appear immediately.
$| = 1;

# Direct method call instead of the ambiguous indirect-object form "new CachePipe()".
my $cachepipe = CachePipe->new();

# This tells cachepipe not to include the command signature when determining to run a command.  Note
# that this is not backwards compatible!
$cachepipe->omit_cmd();

# On Ctrl-C, let cachepipe clean up before exiting with a failure status.
$SIG{INT} = sub {
  print "* Got C-c, quitting\n";
  $cachepipe->cleanup();
  exit 1;
};

# if no LMs were specified, we need to build one from the target side of the corpus
if (! @LMFILES) {
  $DO_BUILD_LM_FROM_CORPUS = 1;
}

## Sanity Checking ###################################################

# If a language model was specified and no corpus was given to build another one from the target
# side of the training data (which could happen, for example, when starting at the tuning step with
# an existing LM), turn off building an LM from the corpus.  The user could have done this
# explicitly with --no-corpus-lm, but might have forgotten to, and we don't want to pester them
# with an error about easily-inferrable intentions.
if (@LMFILES and ! @CORPORA) {
  $DO_BUILD_LM_FROM_CORPUS = 0;
}

# If merging LMs, make sure there are at least 2 LMs to merge.  $DO_BUILD_LM_FROM_CORPUS is first
# pinned to 0 or 1 so that it can be counted as one LM in the check below.
if ($MERGE_LMS) {
  $DO_BUILD_LM_FROM_CORPUS = ($DO_BUILD_LM_FROM_CORPUS != 0) ? 1 : 0;

  if (@LMFILES + $DO_BUILD_LM_FROM_CORPUS < 2) {
    print "* FATAL: I need 2 or more language models to merge (including the corpus target-side LM).\n";
    exit 2;
  }
}

# absolutize LM file paths ($_ aliases each array element, so this updates @LMFILES in place)
$_ = get_absolute_path($_) foreach @LMFILES;

# make sure the LMs exist
foreach my $lmfile (@LMFILES) {
  if (! -e $lmfile) {
    print "* FATAL: couldn't find language model file '$lmfile'\n";
    exit 1;
  }
}

# Grammar types accepted by --type.
my @GRAMMAR_TYPES = qw/hiero samt ghkm phrase moses/;

# Case-normalize the type *before* validating it, so that e.g. "--type Hiero" is accepted.
$GRAMMAR_TYPE = lc $GRAMMAR_TYPE if defined $GRAMMAR_TYPE;

if (! defined $GRAMMAR_TYPE or ! in($GRAMMAR_TYPE,\@GRAMMAR_TYPES)) {
  print "* FATAL: You must define --type (" . join("|", @GRAMMAR_TYPES) . ")\n";
  exit 47;
}

# Phrase-based models are decoded with the stack decoder and do not use a span limit.
if ($GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") {
  $SEARCH_ALGORITHM = "stack";
  $MAXSPAN = 0;
}

# Both language extensions are mandatory.
unless (defined $SOURCE and $SOURCE ne "") {
  print "* FATAL: I need a source language extension (--source)\n";
  exit 1;
}
unless (defined $TARGET and $TARGET ne "") {
  print "* FATAL: I need a target language extension (--target)\n";
  exit 1;
}

# Any step before tuning needs training data.
if (! @CORPORA and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) {
  print "* FATAL: need at least one training corpus (--corpus)\n";
  exit 1;
}

# A tuning set is needed whenever the TUNE step lies within the requested range of steps.
if (! @TUNE
    and $STEPS{$FIRST_STEP} <= $STEPS{TUNE}
    and $STEPS{$LAST_STEP} >= $STEPS{TUNE}) {
  print "* FATAL: need at least one tuning set (--tune)\n";
  exit 1;
}

# Likewise, a test set is needed whenever the TEST step lies within the requested range.
if (! defined $TEST
    and $STEPS{$FIRST_STEP} <= $STEPS{TEST}
    and $STEPS{$LAST_STEP} >= $STEPS{TEST}) {
  print "* FATAL: need a test set (--test)\n";
  exit 1;
}

# Joshua config: the user-supplied file (--joshua-config), or else the tuning template.
my $JOSHUA_CONFIG = get_absolute_path($_JOSHUA_CONFIG || "$TUNECONFDIR/joshua.config", $STARTDIR);

# Make sure we have a tuned config file if we're skipping model building and tuning.  This has to
# look at the user-supplied value: $JOSHUA_CONFIG always falls back to the template and so can
# never be used to detect a missing --joshua-config.
if ($STEPS{$FIRST_STEP} >= $STEPS{TEST}) {
  if (! defined $_JOSHUA_CONFIG) {
    print "* FATAL: You need to provide a tuned Joshua config file (--joshua-config)\n";
    print "         if you're skipping straight to testing\n";
    exit 1;
  }
}

# make sure we have either a config file or a grammar and LM if we're skipping model building
# NOTE(review): $JOSHUA_CONFIG always receives a default above, so this condition looks like it
# can never be true -- confirm whether it was meant to test the user-supplied --joshua-config.
if ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) {
  if (! defined $JOSHUA_CONFIG and ((! defined $_TUNE_GRAMMAR_FILE and ! defined $GRAMMAR_FILE) or scalar(@LMFILES) == 0)) {
    print "* FATAL: You must provide either a Joshua config file (--joshua-config) or\n";
    print "         a grammar (--grammar or --tune-grammar) and at least one LM (--lmfile)\n";
    print "         if you're skipping straight to tuning\n";
    exit 1;
  }
}

# make sure SRILM is defined if we're building a language model
if ($LM_GEN eq "srilm" && (scalar @LMFILES == 0) && $STEPS{$FIRST_STEP} <= $STEPS{TUNE} && $STEPS{$LAST_STEP} >= $STEPS{TUNE}) {
  not_defined("SRILM") unless exists $ENV{SRILM} and -d $ENV{SRILM};
}

# Check that every file named on the command line (or defaulted) is actually present.
foreach my $required (
  [ "joshua config",  $JOSHUA_CONFIG ],
  [ "grammar",        $GRAMMAR_FILE ],
  [ "tuning grammar", $_TUNE_GRAMMAR_FILE ],
  [ "test grammar",   $_TEST_GRAMMAR_FILE ],
  [ "alignment",      $ALIGNMENT ],
) {
  my ($description, $path) = @$required;
  if (defined $path and ! -e $path) {
    print "* FATAL: couldn't find $description file '$path'\n";
    exit 1;
  }
}

# If a corpus was given as a relative path, make it absolute (under the assumption that it was
# relative to the starting directory).  This makes sure that everything will still work if we
# change the run directory.  $_ aliases each array element, so the arrays are updated in place.
$_ = get_absolute_path($_) foreach @CORPORA;

# Do the same for tuning and test data, and other files
$_ = get_absolute_path($_) foreach @TUNE;
$TEST = get_absolute_path($TEST);

$GRAMMAR_FILE = get_absolute_path($GRAMMAR_FILE);
$GLUE_GRAMMAR_FILE = get_absolute_path($GLUE_GRAMMAR_FILE);
$_TUNE_GRAMMAR_FILE = get_absolute_path($_TUNE_GRAMMAR_FILE);
$_TEST_GRAMMAR_FILE = get_absolute_path($_TEST_GRAMMAR_FILE);
$THRAX_CONF_FILE = get_absolute_path($THRAX_CONF_FILE);
$ALIGNMENT = get_absolute_path($ALIGNMENT);

# Every training corpus must have both a target-side and a source-side file.
foreach my $corpus (@CORPORA) {
  foreach my $ext ($TARGET,$SOURCE) {
    if (! -e "$corpus.$ext") {
      print "* FATAL: can't find '$corpus.$ext'\n";
      exit 1;
    }
  }
}

# Validate the choice of aligner.
unless (grep { $ALIGNER eq $_ } qw(giza berkeley jacana)) {
  print "* FATAL: aligner must be one of 'giza', 'berkeley' or 'jacana' (only French-English)\n";
  exit 1;
}

# Validate the LM implementation used for decoding.
unless ($LM_TYPE eq "kenlm" or $LM_TYPE eq "berkeleylm") {
  print "* FATAL: lm type (--lm) must be one of 'kenlm' or 'berkeleylm'\n";
  exit 1;
}

# State minimization is only kept on for kenlm.
$LM_STATE_MINIMIZATION = 0 if $LM_TYPE ne "kenlm";

# Validate the tool used to build language models.
unless (grep { $LM_GEN eq $_ } qw(berkeleylm srilm kenlm)) {
  print "* FATAL: lm generating code (--lm-gen) must be one of 'kenlm' (default), 'berkeleylm', or 'srilm'\n";
  exit 1;
}

# Anything that relies on Moses needs the MOSES environment variable.
if (! defined $MOSES and $TUNER eq "kbmira") {
  print "* FATAL: using 'kbmira' for tuning requires setting the MOSES environment variable\n";
  exit 1;
}

if (! defined $MOSES and $GRAMMAR_TYPE eq "moses") {
  print "* FATAL: building Moses phrase-based models (--type moses) requires setting the MOSES environment variable\n";
  exit 1;
}

unless (in($TUNER, \@TUNERS)) {
  print "* FATAL: --tuner must be one of " . join(", ", @TUNERS) . $/;
  exit 1;
}

# Translate the filtering mode into the corresponding command-line flag.
$FILTERING = lc $FILTERING;
{
  my %flag_for_mode = (
    fast  => "-f",
    exact => "-e",
    loose => "-l",
  );

  if (! exists $flag_for_mode{$FILTERING}) {
    print "* FATAL: --filtering must be one of 'fast' (default) or 'exact' or 'loose'\n";
    exit 1;
  }

  $FILTERING = $flag_for_mode{$FILTERING};
}

## END SANITY CHECKS

####################################################################################################
## Dependent variable setting ######################################################################
####################################################################################################

# Nonterminal used for out-of-vocabulary words: "X" for the hiero, itg, phrase and moses grammar
# types, "OOV" for everything else.
my $OOV = ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "itg" or $GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") ? "X" : "OOV";

# The phrasal system should use the ITG grammar, allowing for limited distortion
if ($GRAMMAR_TYPE eq "phrasal") {
  $GLUE_GRAMMAR_FILE = get_absolute_path("$JOSHUA/scripts/training/templates/glue-grammar.itg");
}

# use this default unless it's already been defined by a command-line argument
$THRAX_CONF_FILE = "$JOSHUA/scripts/training/templates/thrax-$GRAMMAR_TYPE.conf" unless defined $THRAX_CONF_FILE;

# Everything from here on happens inside the run directory.  Stop if it cannot be created or
# entered; otherwise the pipeline would silently write its output into the starting directory.
if (! -d $RUNDIR) {
  mkdir $RUNDIR or die "can't create run directory '$RUNDIR': $!";
}
chdir($RUNDIR) or die "can't change to run directory '$RUNDIR': $!";

# Record the run description (--readme), followed by a newline.
if (defined $README) {
  open(my $readme_fh, ">", "README") or die "can't write README file: $!";
  print {$readme_fh} $README, $/;
  close($readme_fh) or die "can't write README file: $!";
}

# Default locations of the training, tuning and test data, taken directly from the command-line
# arguments.  These are overridden if the full script is run (after tokenization and
# normalization).
my (%TRAIN,%TUNE,%TEST);
if (@CORPORA) {
  $TRAIN{prefix} = $CORPORA[0];
  $TRAIN{source} = "$CORPORA[0].$SOURCE";
  $TRAIN{target} = "$CORPORA[0].$TARGET";
}

# set the location of the parsed corpus if that was defined
if (defined $PARSED_CORPUS) {
  $TRAIN{parsed} = get_absolute_path($PARSED_CORPUS);
}

if (scalar(@TUNE) > 0) {
  $TUNE{source} = "$TUNE[0].$SOURCE";
  $TUNE{target} = "$TUNE[0].$TARGET";

  if (! -e "$TUNE{source}") {
    print "* FATAL: couldn't find tune source file at '$TUNE{source}'\n";
    # a fatal error must not look like success to the caller
    exit 1;
  }
}

if ($TEST) {
  $TEST{source} = "$TEST.$SOURCE";
  $TEST{target} = "$TEST.$TARGET";

  if (! -e "$TEST{source}") {
    print "* FATAL: couldn't find test source file at '$TEST{source}'\n";
    # a fatal error must not look like success to the caller
    exit 1;
  }
}

# Record the preprocessing scripts that were used: scripts/ gets one symlink per language for the
# normalizer and one for the tokenizer.  Old links are removed first so that they are replaced.
mkdir("scripts") unless -e "scripts";
foreach my $entry ([ "normalize", $NORMALIZER,       $NORMALIZER ],
                   [ "tokenize",  $TOKENIZER_SOURCE, $TOKENIZER_TARGET ]) {
  my ($kind, $source_script, $target_script) = @$entry;

  unlink "scripts/$kind.$SOURCE";
  unlink "scripts/$kind.$TARGET";
  symlink $source_script, "scripts/$kind.$SOURCE";
  symlink $target_script, "scripts/$kind.$TARGET";
}

## STEP 1: filter and preprocess corpora #############################

# A ready-made alignment cannot be combined with steps that come before alignment.
if ($STEPS{$FIRST_STEP} < $STEPS{ALIGN} and defined $ALIGNMENT) {
  print "* FATAL: it doesn't make sense to provide an alignment and then do\n",
        "  tokenization.  Either remove --alignment or specify a first step\n",
        "  of Thrax (--first-step THRAX)\n";
  exit 1;
}

# Steps before tuning need training data.
if (! @CORPORA and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) {
  print "* FATAL: need at least one training corpus (--corpus)\n";
  exit 1;
}

# Tracks which data sets have been run through prepare_data.
my %PREPPED = (TRAIN => 0, TUNE => 0, TEST => 0);

# prepare the training data
if (@CORPORA > 0) {
  my $train_prefixes = prepare_data("train", \@CORPORA, $MAXLEN);

  # used for parsing
  $TRAIN{mixedcase} = "$DATA_DIRS{train}/$train_prefixes->{shortened}.$TARGET"
      if exists $train_prefixes->{shortened};

  my $train_corpus = "$DATA_DIRS{train}/corpus";
  $TRAIN{prefix} = $train_corpus;
  $TRAIN{source} = "$train_corpus.$SOURCE";
  $TRAIN{target} = "$train_corpus.$TARGET";
  $PREPPED{TRAIN} = 1;
}

# prepare the tuning and development data
if (@TUNE > 0) {
  my $tune_prefixes = prepare_data("tune", \@TUNE, $MAXLEN_TUNE);
  $TUNE{source} = "$DATA_DIRS{tune}/corpus.$SOURCE";
  $TUNE{target} = "$DATA_DIRS{tune}/corpus.$TARGET";

  # Switch to the NER-annotated source file when ner_annotate returns 2.
  my $tune_plain = $TUNE{source};
  my $tune_ner_status = ner_annotate($tune_plain, "$tune_plain.ner", $SOURCE);
  $TUNE{source} = "$tune_plain.ner" if $tune_ner_status == 2;

  $PREPPED{TUNE} = 1;
}

# prepare the test data
if (defined $TEST) {
  my $test_prefixes = prepare_data("test", [$TEST], $MAXLEN_TEST);
  $TEST{source} = "$DATA_DIRS{test}/corpus.$SOURCE";
  $TEST{target} = "$DATA_DIRS{test}/corpus.$TARGET";

  # Switch to the NER-annotated source file when ner_annotate returns 2.
  my $test_plain = $TEST{source};
  my $test_ner_status = ner_annotate($test_plain, "$test_plain.ner", $SOURCE);
  $TEST{source} = "$test_plain.ner" if $test_ner_status == 2;

  $PREPPED{TEST} = 1;
}

## Use of GOTO considered very useful
# Jump to the label named by --first-step (already upper-cased).  The goto is wrapped in an eval
# so that an unknown label does not kill the script with Perl's own error: the failed goto makes
# the eval return a false value, and the else branch reports the bad step name and exits.
# NOTE(review): when the label exists, the goto presumably transfers control straight out of the
# eval, in which case the "Skipping to step" message below is never printed -- confirm.
if (eval { goto $FIRST_STEP }) {
  print "* Skipping to step $FIRST_STEP\n";
  goto $FIRST_STEP;
} else {
  print "* No such step $FIRST_STEP\n";
  exit 1;
}

## SUBSAMPLE #########################################################

SUBSAMPLE:
    ;

# Optionally (--subsample) subsample the training data against the tuning and test source sides,
# then point the canonical corpus files at the subsampled data.
if ($DO_SUBSAMPLE) {
  my $train_dir     = $DATA_DIRS{train};
  my $subsample_dir = "$train_dir/subsampled";
  my $manifest      = "$subsample_dir/manifest";
  my $test_data     = "$subsample_dir/test-data";
  my $output_prefix = "$subsample_dir/subsampled.$MAXLEN";

  mkdir($subsample_dir) unless -d $subsample_dir;

  # the manifest lists the corpus to subsample from
  $cachepipe->cmd("subsample-manifest",
                  "echo corpus > $manifest",
                  $manifest);

  # the data to subsample against: tuning and test source sides
  $cachepipe->cmd("subsample-testdata",
                  "cat $TUNE{source} $TEST{source} > $test_data",
                  $TUNE{source},
                  $TEST{source},
                  $test_data);

  my $subsample_cmd = join(" ",
                           "java -Xmx4g -Dfile.encoding=utf8",
                           "-cp $JOSHUA/bin:$JOSHUA/lib/commons-cli-2.0-SNAPSHOT.jar",
                           "joshua.subsample.Subsampler",
                           "-e $TARGET -f $SOURCE",
                           "-epath $train_dir/ -fpath $train_dir/",
                           "-output $output_prefix",
                           "-ratio 1.04",
                           "-test $test_data",
                           "-training $manifest");

  $cachepipe->cmd("subsample",
                  $subsample_cmd,
                  $manifest,
                  $test_data,
                  $TRAIN{source},
                  $TRAIN{target},
                  "$output_prefix.$TARGET",
                  "$output_prefix.$SOURCE");

  # rewrite the symlinks to point to the subsampled corpus
  for my $lang ($TARGET, $SOURCE) {
    system("ln -sf subsampled/subsampled.$MAXLEN.$lang $DATA_DIRS{train}/corpus.$lang");
  }
}

maybe_quit("SUBSAMPLE");


## ALIGN #############################################################

ALIGN:
    ;

# Starting at ALIGN basically means that tokenization was skipped.  Supplying an alignment as well
# is contradictory, so refuse that combination.
if ($FIRST_STEP eq "ALIGN" and defined $ALIGNMENT) {
  print "* FATAL: It doesn't make sense to provide an alignment\n",
        "  but not to skip the tokenization and subsampling steps\n";
  exit 1;
}

# TODO: when starting at ALIGN, copy the input files into the canonical place.
#
# Jumping straight to alignment is probably the same thing as skipping tokenization, and might
# also be implemented by a --no-tokenization flag.

# Use an existing, non-empty alignment file if it's present; this short-circuits rebuilding the
# alignments.
$ALIGNMENT = "alignments/training.align" if -s "alignments/training.align";

# skip this step if an alignment was provided or it already exists
if (! defined $ALIGNMENT) {

  # We process the data in chunks which by default are 1,000,000 sentence pairs.  So first split up
  # the data into those chunks, one directory per chunk.
  my $splits_dir = "$DATA_DIRS{train}/splits";
  system("mkdir", "-p", $splits_dir) unless -d $splits_dir;

  $cachepipe->cmd("source-numlines",
                  "cat $TRAIN{source} | wc -l",
                  $TRAIN{source});
  my $numlines = $cachepipe->stdout();
  my $numchunks = ceil($numlines / $ALIGNER_BLOCKSIZE);

  open(my $target_in, "<", $TRAIN{target}) or die "can't read $TRAIN{target}";
  open(my $source_in, "<", $TRAIN{source}) or die "can't read $TRAIN{source}";

  # Output handles for the chunk currently being written (undefined before the first chunk).
  my ($chunk_source_out, $chunk_target_out);

  # Closes the current chunk, if there is one.  Buffered write errors surface at close time, so
  # they are checked here.
  my $close_chunk = sub {
    return unless defined $chunk_source_out;
    close($chunk_source_out) or die "can't finish writing source chunk: $!";
    close($chunk_target_out) or die "can't finish writing target chunk: $!";
    undef $chunk_source_out;
    undef $chunk_target_out;
  };

  my $lastchunk = -1;
  my $line_number = 0;
  while (my $target = <$target_in>) {
    my $source = <$source_in>;
    die "* FATAL: $TRAIN{source} has fewer lines than $TRAIN{target}\n"
        unless defined $source;
    $line_number++;

    # We want to prevent a very small last chunk, which we accomplish
    # by folding the last chunk into the penultimate chunk.
    my $chunk = ($numchunks <= 2)
        ? 0
        : min($numchunks - 2,
              int( ($line_number - 1) / $ALIGNER_BLOCKSIZE ));

    if ($chunk != $lastchunk) {
      $close_chunk->();

      my $chunk_dir = "$splits_dir/$chunk";
      mkdir($chunk_dir);

      open($chunk_source_out, ">", "$chunk_dir/corpus.$SOURCE")
          or die "can't write $chunk_dir/corpus.$SOURCE: $!";
      open($chunk_target_out, ">", "$chunk_dir/corpus.$TARGET")
          or die "can't write $chunk_dir/corpus.$TARGET: $!";

      $lastchunk = $chunk;
    }

    print {$chunk_source_out} $source;
    print {$chunk_target_out} $target;
  }
  $close_chunk->();

  # The two sides must be parallel; leftover source lines mean the corpus is corrupt.
  die "* FATAL: $TRAIN{source} has more lines than $TRAIN{target}\n"
      if defined(my $extra_source_line = <$source_in>);

  close($source_in);
  close($target_in);

  # Without any chunk there is nothing to align, and the combine step below would have no inputs.
  die "* FATAL: the training corpus $TRAIN{target} is empty\n" if $lastchunk < 0;

  mkdir("alignments") unless -d "alignments";

  # The aligner script reads the number of the chunk to process from its standard input.
  my $aligner_cmd = (
    "$SCRIPTDIR/training/paralign.pl "
    . " -aligner $ALIGNER"
    . " -conf $ALIGNER_CONF"
    . " -num_threads 2"
    . " -giza_merge $GIZA_MERGE"
    . " -aligner_mem $ALIGNER_MEM"
    . " -source $SOURCE"
    . " -target $TARGET"
    . " -giza_trainer \"$GIZA_TRAINER\""
    . " -train_dir \"$DATA_DIRS{train}\" "
    . "> alignments/run.log"
  );

  # Starts the aligner for one chunk in a child process and returns the child's pid.  A failed
  # fork() returns undef; that case must be caught explicitly, because otherwise the parent would
  # be mistaken for the child and replace itself with the aligner.
  my $start_aligner = sub {
    my ($chunk) = @_;

    my $pid = fork();
    die "* FATAL: couldn't fork an aligner process for chunk $chunk: $!\n"
        unless defined $pid;

    if ($pid == 0) { # I am child
      exec("echo $chunk | $aligner_cmd")
          or print STDERR "* FATAL: couldn't exec the aligner for chunk $chunk: $!\n";
      exit 1;
    }

    return $pid;
  };

  my $num_chunks_written = $lastchunk + 1;
  my @children = ();
  my $next_chunk = 0;

  # Start a parallel job on each core
  while ($next_chunk < $num_chunks_written and @children < $NUM_THREADS) {
    push @children, $start_aligner->($next_chunk);
    $next_chunk++;
  }

  # Start another concurrent job as each oldest job finishes
  while (@children) {
    my $old_child = shift @children;
    waitpid( $old_child, 0 );
    print STDERR "* WARNING: aligner process $old_child exited with status " . ($? >> 8) . "\n"
        if $? != 0;

    if ($next_chunk < $num_chunks_written) {
      push @children, $start_aligner->($next_chunk);
      $next_chunk++;
    }
  }

  # Each aligner leaves its per-chunk output in a different place.
  my @aligned_files;
  if ($ALIGNER eq "giza") {
    @aligned_files = map { "alignments/$_/model/aligned.$GIZA_MERGE" } (0..$lastchunk);
  } elsif ($ALIGNER eq "berkeley") {
    @aligned_files = map { "alignments/$_/training.$TARGET-$SOURCE.align" } (0..$lastchunk);
  } elsif ($ALIGNER eq "jacana") {
    @aligned_files = map { "alignments/$_/training.align" } (0..$lastchunk);
  }
  my $aligned_file_list = join(" ", @aligned_files);

  # combine the alignments
  $cachepipe->cmd("aligner-combine",
                  "cat $aligned_file_list > alignments/training.align",
                  $aligned_files[-1],
                  "alignments/training.align");

  # at the end, all the files are concatenated into a single alignment file parallel to the input
  # corpora
  $ALIGNMENT = "alignments/training.align";
}

maybe_quit("ALIGN");


## PARSE #############################################################

PARSE:
    ;

# Parsing only happens for SAMT grammars.

if ($FIRST_STEP eq "PARSE" and ($GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrasal" or $GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses")) {
  print STDERR "* FATAL: parsing only applies to GHKM and SAMT grammars; you need to add '--type samt|ghkm'\n";
  exit;
}

if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") {

  # If the user passed in the already-parsed corpus, use that (after copying it into place)
  if (defined $TRAIN{parsed} && -e $TRAIN{parsed}) {
    # copy and adjust the location of the file to its canonical location
    system("cp $TRAIN{parsed} $DATA_DIRS{train}/corpus.parsed.$TARGET");
    $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
  } else {

    system("mkdir -p $DATA_DIRS{train}") unless -e $DATA_DIRS{train};

    $cachepipe->cmd("build-vocab",
                    "cat $TRAIN{target} | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{train}/vocab.$TARGET",
                    $TRAIN{target},
                    "$DATA_DIRS{train}/vocab.$TARGET");

    my $file_to_parse = (exists $TRAIN{mixedcase}) ? $TRAIN{mixedcase} : $TRAIN{target};

    if ($NUM_JOBS > 1) {
      # the black-box parallelizer model doesn't work with multiple
      # threads, so we're always spawning single-threaded instances here

      # open PARSE, ">parse.sh" or die;
      # print PARSE "cat $TRAIN{target} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed.mc | perl -pi -e 's/(\\S+)\\)/lc(\$1).\")\"/ge' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET > $DATA_DIRS{train}/corpus.parsed.$TARGET\n";
      # close PARSE;
      # chmod 0755, "parse.sh";
      # $cachepipe->cmd("parse",
      #         "setsid ./parse.sh",
      #         "$TRAIN{target}",
      #         "$DATA_DIRS{train}/corpus.parsed.$TARGET");

      $cachepipe->cmd("parse",
                      "$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -p 8g -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
                      "$TRAIN{target}",
                      "$DATA_DIRS{train}/corpus.parsed.$TARGET");
    } else {
      # Multi-threading in the Berkeley parser is broken, so we use a black-box parallelizer on top
      # of it.
      $cachepipe->cmd("parse",
                      "$CAT $file_to_parse | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_THREADS --use-fork -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^(())\$//; s/^(/(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.Parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
                      "$TRAIN{target}",
                      "$DATA_DIRS{train}/corpus.parsed.$TARGET");
    }

    $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
  }
}

maybe_quit("PARSE");

## THRAX #############################################################

# Four labels mark this one point in the script; each is attached to an
# empty statement.  Presumably they are the jump targets used when the
# pipeline is started at a later step -- confirm against the option handling.
MODEL:   ;
GRAMMAR: ;
THRAX:   ;
PHRASE:  ;

# Everything below writes into the training data directory.
if (! -d $DATA_DIRS{train}) {
  system("mkdir -p $DATA_DIRS{train}");
}

# SAMT and GHKM grammars are extracted from a parsed target side.
if ($GRAMMAR_TYPE eq "samt" || $GRAMMAR_TYPE eq "ghkm") {

  # If $TRAIN{parsed} is already set (the parse step ran in-script, or a
  # parsed corpus was passed in explicitly), nothing needs to be done.
  if (! exists $TRAIN{parsed}) {

    # Otherwise the target corpus itself has to be a parsed corpus.
    if (! already_parsed($TRAIN{target})) {
      print "* FATAL: You requested to build an SAMT grammar, but provided an\n",
            "  unparsed corpus.  Please re-run the pipeline and begin no later\n",
            "  than the PARSE step (--first-step PARSE), or pass in a parsed corpus\n",
            "  using --parsed-corpus CORPUS.\n";
      exit 1;
    }

    # We jumped straight here with a parsed corpus as the target side:
    # keep a copy of it as the parsed corpus ...
    $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";

    $cachepipe->cmd("cp-train-$TARGET",
                    "cp $TRAIN{target} $TRAIN{parsed}",
                    $TRAIN{target},
                    $TRAIN{parsed});

    # ... and rebuild the plain target side from the leaves of the trees.
    $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";

    $cachepipe->cmd("extract-leaves",
                    "cat $TRAIN{parsed} | perl -pe 's/\\(.*?(\\S\+)\\)\+?/\$1/g' | perl -pe 's/\\)//g' > $TRAIN{target}",
                    $TRAIN{parsed},
                    $TRAIN{target});

    # The source side is copied next to it unless it already lives there.
    my $source_copy = "$DATA_DIRS{train}/corpus.$SOURCE";
    unless ($TRAIN{source} eq $source_copy) {
      $cachepipe->cmd("cp-train-$SOURCE",
                      "cp $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE",
                      $TRAIN{source},
                      $source_copy);
      $TRAIN{source} = $source_copy;
    }
  }
}

# We may have skipped directly to this step, in which case an alignment
# must have been provided.
unless (defined $ALIGNMENT) {
  print "* FATAL: no alignment file specified\n";
  exit(1);
}


# Grammar extraction is expensive, so an existing, non-empty grammar.gz in the
# current directory is reused instead of being rebuilt.  (The original author
# was not sure this is the right behavior.)
if (-s "grammar.gz") {
  # A gzip file can be non-empty on disk yet decompress to nothing, so count
  # the first few decompressed lines.
  my $line_count = `gzip -cd grammar.gz | head | wc -l`;
  chomp($line_count);
  if ($line_count != 0) {
    $GRAMMAR_FILE = "grammar.gz";
  }
}

# If the grammar file wasn't specified, or found, we need to build it!
#
# One of three builders is chosen by $GRAMMAR_TYPE:
#   ghkm                 - GHKM extraction, via the "galley" or "moses" extractor
#   moses                - Moses phrase extraction, converted to Joshua format
#   samt / hiero / phrase - Thrax on Hadoop
# Every builder leaves its result in grammar.gz and sets $GRAMMAR_FILE.
if (! defined $GRAMMAR_FILE) {

  # Syntax-based grammars are extracted from the parsed target side.
  my $target_file = ($GRAMMAR_TYPE eq "ghkm" or $GRAMMAR_TYPE eq "samt") ? $TRAIN{parsed} : $TRAIN{target};

  if ($GRAMMAR_TYPE eq "ghkm") {
    if ($GHKM_EXTRACTOR eq "galley") {
      # Run the RuleExtractor from ghkm-modified.jar and split its output
      # into the mapping file and the grammar.
      $cachepipe->cmd("ghkm-extract",
                      "java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/split2files ghkm-mapping.gz grammar.gz",
                      $ALIGNMENT,
                      "grammar.gz");
    } elsif ($GHKM_EXTRACTOR eq "moses") {
      # XML-ize, also replacing unary chains with OOV at the bottom by removing their unary parents
      $cachepipe->cmd("ghkm-moses-xmlize",
                      "cat $target_file | perl -pe 's/\\(\\S+ \\(OOV (.*?)\\)\\)/(OOV \$1)/g' | $MOSES/scripts/training/wrappers/berkeleyparsed2mosesxml.perl > $DATA_DIRS{train}/corpus.xml",
                      # "cat $target_file | perl -pe 's/\\(\\S+ \\(OOV (.*?)\\)\\)/(OOV \$1)/g' > $DATA_DIRS{train}/corpus.ptb",
                      $target_file,
                      "$DATA_DIRS{train}/corpus.xml");

      # train-model.perl is given the corpus prefix $DATA_DIRS{train}/corpus,
      # so the source side must be reachable under that prefix.
      if (! -e "$DATA_DIRS{train}/corpus.$SOURCE") {
        system("ln -sf $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE");
      }

      # Likewise the alignment is passed as alignments/training + "align".
      if ($ALIGNMENT ne "alignments/training.align") {
        system("mkdir alignments") unless -d "alignments";
        system("ln -sf $ALIGNMENT alignments/training.align");
        $ALIGNMENT = "alignments/training.align";
      }

      # Only create the directory when it is missing, so a re-run does not
      # print a spurious "File exists" error (same idiom as the moses branch).
      mkdir("model") unless -d "model";

      $cachepipe->cmd("ghkm-moses-extract",
                      "$MOSES/scripts/training/train-model.perl --first-step 4 --last-step 6 --corpus $DATA_DIRS{train}/corpus --ghkm --f $SOURCE --e xml --alignment-file alignments/training --alignment align --target-syntax --cores $NUM_THREADS --pcfg --alt-direct-rule-score-1 --ghkm-tree-fragments --glue-grammar --glue-grammar-file glue-grammar.ghkm --extract-options \"$EXTRACT_OPTIONS --UnknownWordLabel oov-labels.txt\"",
                      "$DATA_DIRS{train}/corpus.xml",
                      "glue-grammar.ghkm",
                      "model/rule-table.gz");

      # The extractor was asked to write the unknown-word labels to
      # oov-labels.txt; hand them to the decoder as its OOV list.  A missing
      # file means extraction did not do its job, so stop instead of
      # silently passing an empty list.
      open(my $labels_fh, '<', "oov-labels.txt")
          or die "* FATAL: couldn't read oov-labels.txt: $!\n";
      chomp(my @labels = <$labels_fh>);
      close($labels_fh);
      my $oov_list = "\"" . join(" ", @labels) . "\"";
      # NOTE(review): $JOSHUA_ARGS is reassigned to "" in the tuning and
      # testing sections further down, so confirm this option survives.
      $JOSHUA_ARGS .= " -oov-list $oov_list";

      # Convert the Moses rule table to Joshua's grammar format.  The
      # converter is taken from this Joshua installation rather than from a
      # developer's home directory.
      $cachepipe->cmd("ghkm-moses-convert",
                      "gzip -cd model/rule-table.gz | $JOSHUA/scripts/support/moses2joshua_grammar.pl -m rule-fragment-map.txt | gzip -9n > grammar.gz",
                      "model/rule-table.gz",
                      "grammar.gz");

    } else {
      print STDERR "* FATAL: no such GHKM extractor '$GHKM_EXTRACTOR'\n";
      exit(1);
    }

    $GRAMMAR_FILE = "grammar.gz";

  } elsif ($GRAMMAR_TYPE eq "moses") {

    mkdir("model") unless -d "model";

    # train-model.perl is given the alignment as alignments/training + "align".
    if ($ALIGNMENT ne "alignments/training.align") {
      system("mkdir alignments") unless -d "alignments";
      system("ln -sf $ALIGNMENT alignments/training.align");
      $ALIGNMENT = "alignments/training.align";
    }

    # Compute lexical probabilities
    $cachepipe->cmd("build-lex-trans",
                    "$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 4 -last-step 4 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -lexical-file model/lex -alignment-file alignments/training -alignment align -corpus $TRAIN{prefix}",
                    $TRAIN{source},
                    $TRAIN{target},
                    $ALIGNMENT,
                    "model/lex.e2f",
                    "model/lex.f2e"
        );

    # Extract the phrases
    $cachepipe->cmd("extract-phrases",
                    "$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 5 -last-step 5 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -alignment-file alignments/training -alignment align -extract-file model/extract -corpus $TRAIN{prefix}",
                    $TRAIN{source},
                    $TRAIN{target},
                    $ALIGNMENT,
                    "model/extract.sorted.gz",
                    "model/extract.inv.sorted.gz"
        );

    # Build the phrase table
    $cachepipe->cmd("build-ttable",
                    "$MOSES/scripts/training/train-model.perl -mgiza -mgiza-cpus $NUM_THREADS -dont-zip -first-step 6 -last-step 6 -external-bin-dir $MOSES/bin -f $SOURCE -e $TARGET -alignment grow-diag-final-and -max-phrase-length $MAX_PHRASE_LEN -score-options '--GoodTuring' -parallel -extract-file model/extract -lexical-file model/lex -phrase-translation-table model/phrase-table",
                    "model/lex.e2f",
                    "model/extract.sorted.gz",
                    "model/phrase-table.gz",
        );

    # Convert the model to Joshua format
    $cachepipe->cmd("convert-moses-to-joshua",
                    "$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py -moses | gzip -9n > grammar.gz",
                    "model/phrase-table.gz",
                    "grammar.gz",
        );

    $GRAMMAR_FILE = "grammar.gz";

  } elsif ($GRAMMAR_TYPE eq "samt" or $GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrase") {

    # create the input file: source ||| target ||| alignment, one sentence
    # per line, dropping lines that contain "()" or end in an empty field
    $cachepipe->cmd("thrax-input-file",
                    "$PASTE $TRAIN{source} $target_file $ALIGNMENT | perl -pe 's/\\t/ ||| /g' | grep -v '()' | grep -v '||| \\+\$' > $DATA_DIRS{train}/thrax-input-file",
                    $TRAIN{source}, $target_file, $ALIGNMENT,
                    "$DATA_DIRS{train}/thrax-input-file");

    # put the hadoop files in place; the HDFS directory name is derived from
    # the language pair, grammar type and run directory, with slashes
    # flattened to underscores
    my $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR";
    $THRAXDIR =~ s#/#_#g;

    $cachepipe->cmd("thrax-prep",
                    "hadoop fs -rm -r $THRAXDIR; hadoop fs -mkdir $THRAXDIR; hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
                    "$DATA_DIRS{train}/thrax-input-file", 
                    "grammar.gz");

    my $thrax_input = "$THRAXDIR/input-file";

    # copy the thrax config file, filling in the maximum phrase length and
    # replacing any input-file line with the HDFS location chosen above
    my $thrax_file = "thrax-$GRAMMAR_TYPE.conf";
    system("grep -v ^input-file $THRAX_CONF_FILE | perl -pe 's/<MAXPHRLEN>/$MAX_PHRASE_LEN/g' > $thrax_file.tmp");
    system("echo input-file $thrax_input >> $thrax_file.tmp");
    system("mv $thrax_file.tmp $thrax_file");

    # Run Thrax, then pull the result out of HDFS, filter it and compress it.
    $cachepipe->cmd("thrax-run",
                    "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -cat $THRAXDIR/final/* | gzip -cd | $JOSHUA/scripts/training/filter-rules.pl -t 100 | gzip -9n > grammar.gz",
                    "$DATA_DIRS{train}/thrax-input-file",
                    $thrax_file,
                    "grammar.gz");
#perl -pi -e 's/\.?0+\b//g' grammar; 

    $GRAMMAR_FILE = "grammar.gz";

    # cleanup if successful
    if (-s $GRAMMAR_FILE) {
      system("hadoop fs -rm -r $THRAXDIR");
    }

  } else {

    print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n";
    print STDERR "*        Please try one of the following:\n";
    print STDERR "*        - Specify a grammar with --grammar /path/to/grammar\n";
    print STDERR "*        - Delete any existing grammar named 'grammar.gz'\n";

    exit 1;
  }
}

# Pack the whole grammar once, up front.  This is only done when packing is
# requested and per-set filtering is off, which saves filtering and repacking
# for the tuning and test sets.  An existing "grammar.packed" is left alone.
if ($DO_PACK_GRAMMARS && !$DO_FILTER_TM && !-e "grammar.packed") {
  my $packed_dir = "$RUNDIR/grammar.packed";

  $cachepipe->cmd("pack-grammar",
                  "$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $GRAMMAR_FILE -o $packed_dir",
                  "$packed_dir/vocabulary",
                  "$packed_dir/encoding",
                  "$packed_dir/slice_00000.source");

  # From here on the packed directory is the grammar.
  $GRAMMAR_FILE = $packed_dir;
}


# Any of these three step names ends the grammar stage.
foreach my $stage ("THRAX", "GRAMMAR", "MODEL") {
  maybe_quit($stage);
}

## TUNING ##############################################################
TUNE:
    ;

# Prepare the tuning data, unless that was already done.  prepare_data()
# returns a hash ref; its {lowercased} entry is used as the file prefix for
# both sides of the tuning set.
if (! $PREPPED{TUNE}) {
  my $prefixes = prepare_data("tune",\@TUNE,$MAXLEN_TUNE);
  $TUNE{source} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$SOURCE";
  $TUNE{target} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$TARGET";
  $PREPPED{TUNE} = 1;
}


# figure out how many references there are
my $numrefs = get_numrefs($TUNE{target});

# make sure the tuning source exists
if (! -e $TUNE{source}) {
  print STDERR "* FATAL: couldn't find tuning source file '$TUNE{source}'\n";
  exit 1;
}

# With several references each one lives in its own numbered file
# (<target>.0, <target>.1, ...); with a single reference it is the target
# file itself.
if ($numrefs > 1) {
  for my $i (0..$numrefs-1) {
    if (! -e "$TUNE{target}.$i") {
      print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}.$i'\n";
      exit 1;
    }
  }
} else {
  if (! -e $TUNE{target}) {
    print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}'\n";
    exit 1;
  }
}

# compile_lm LMFILE
#
# Compiles a language model file into the binary format selected by $LM_TYPE
# ("kenlm" or "berkeleylm") and returns the name of the compiled file, which
# is the base name of LMFILE without a ".gz" suffix plus ".kenlm" or
# ".berkeleylm".  For any other $LM_TYPE it prints a fatal message and exits
# with status 2.
#
# The ($) prototype is load-bearing: callers write
# "compile_lm $file, $other_arg" without parentheses and rely on the sub
# taking exactly one argument.
sub compile_lm($) {
  my ($arpa_file) = @_;

  # Compiled models are written under the base name of the input.
  my $stem = basename($arpa_file, ".gz");

  if ($LM_TYPE eq "kenlm") {
    my $compiled = $stem . ".kenlm";
    $cachepipe->cmd("compile-kenlm",
                    "$JOSHUA/bin/build_binary $arpa_file $compiled",
                    $arpa_file, $compiled);
    return $compiled;
  }

  if ($LM_TYPE eq "berkeleylm") {
    my $compiled = $stem . ".berkeleylm";
    $cachepipe->cmd("compile-berkeleylm",
                    "$JOSHUA/scripts/lm/compile_berkeley.py -m $BUILDLM_MEM $arpa_file $compiled",
                    $arpa_file, $compiled);
    return $compiled;
  }

  print "* FATAL: trying to compile an LM to neither kenlm nor berkeleylm.";
  exit 2;
}

# Build the language model from the target side of the training corpus, if
# that was requested.  The result is always written to lm.gz; which tool
# builds it depends on $LM_GEN (srilm, berkeleylm, or lmplz otherwise).
if (defined $TRAIN{target} and $DO_BUILD_LM_FROM_CORPUS) {

  # make sure the training data is prepped
  if (! $PREPPED{TRAIN}) {
    my $prefixes = prepare_data("train", \@CORPORA, $MAXLEN);

    $TRAIN{prefix} = "$DATA_DIRS{train}/corpus";
    foreach my $lang ($SOURCE,$TARGET) {
      system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang");
    }
    $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
    $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
    $PREPPED{TRAIN} = 1;
  }

  my $lmfile = "lm.gz";

  # sort and uniq the training data (the .uniq file is gzip-compressed)
  $cachepipe->cmd("lm-sort-uniq",
                  "$CAT $TRAIN{target} | sort -u -T $TMPDIR -S $BUILDLM_MEM | gzip -9n > $TRAIN{target}.uniq",
                  $TRAIN{target},
                  "$TRAIN{target}.uniq");

  # If an NER Tagger is specified, use that to annotate the corpus before 
  # sending it off to the LM.  A return value of 2 is taken to mean that an
  # annotated corpus was produced.
  my $ner_return = ner_annotate("$TRAIN{target}.uniq", "$TRAIN{target}.uniq.ner", $TARGET);
  if ($ner_return == 2) {
    $TRAIN{ner_lm} = 1;
  }

  # Choose LM input based on whether an annotated corpus was created
  my $lm_input = "$TRAIN{target}.uniq";
  if (defined $TRAIN{ner_lm}) {
    $lm_input = replace_tokens_with_types("$TRAIN{target}.uniq.ner");
  }

  # Every builder below reads $lm_input, so the file that is declared as the
  # cached dependency is also the file the LM is actually trained on.
  if ($LM_GEN eq "srilm") {
    my $smoothing = ($WITTEN_BELL) ? "-wbdiscount" : "-kndiscount";
    $cachepipe->cmd("srilm",
                    "$SRILM -order $LM_ORDER -interpolate $smoothing -unk -gt3min 1 -gt4min 1 -gt5min 1 -text $lm_input $LM_OPTIONS -lm lm.gz",
                    $lm_input,
                    $lmfile);
  } elsif ($LM_GEN eq "berkeleylm") {
    $cachepipe->cmd("berkeleylm",
                    "java -ea -mx$BUILDLM_MEM -server -cp $JOSHUA/ext/berkeleylm/jar/berkeleylm.jar edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText $LM_ORDER lm.gz $lm_input",
                    $lm_input,
                    $lmfile);
  } else {
    # Make sure it exists
    if (! -e "$JOSHUA/bin/lmplz") {
      print "* FATAL: $JOSHUA/bin/lmplz (for building LMs) does not exist.\n";
      print "  This is often a problem with the boost libraries (particularly threaded\n";
      print "  versus unthreaded).\n";
      exit 1;
    }

    # Needs to be capitalized
    my $mem = uc $BUILDLM_MEM;
    $cachepipe->cmd("kenlm",
                    "$JOSHUA/bin/lmplz -o $LM_ORDER -T $TMPDIR -S $mem --verbose_header --text $lm_input $LM_OPTIONS | gzip -9n > lm.gz",
                    $lm_input,
                    $lmfile);
  }

  # When the LMs are going to be merged, compilation is deferred until after
  # the merge; otherwise compile now if the LM type has a compiled format.
  if ((! $MERGE_LMS) && ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm")) {
    push (@LMFILES, get_absolute_path(compile_lm($lmfile), $RUNDIR));
  } else {
    push (@LMFILES, get_absolute_path($lmfile, $RUNDIR));
  }
}

# Build a class-based language model with lmplz when requested.  Both a class
# LM corpus and a class map must have been given and must exist on disk.
if ($DO_BUILD_CLASS_LM) {

  # Both inputs have to be defined ...
  unless (defined $CLASS_LM_CORPUS and defined $CLASS_MAP) {
    print "* FATAL: A class LM corpus (--class-lm-corpus) and a class map (--class-map) are required with the --class-lm switch";
    exit 1;
  }

  # ... and present.
  unless (-e $CLASS_LM_CORPUS and -e $CLASS_MAP) {
    print "* FATAL: Could not find the Class LM corpus or map";
    exit 1;
  }

  # The LM is built with lmplz, so that has to be there too.
  unless (-e "$JOSHUA/bin/lmplz") {
    print "* FATAL: $JOSHUA/bin/lmplz (for building LMs) does not exist.\n",
          "  This is often a problem with the boost libraries (particularly threaded\n",
          "  versus unthreaded).\n";
    exit 1;
  }

  my $class_lmfile = "class_lm.gz";

  # The memory argument needs to be capitalized.
  my $mem = uc($BUILDLM_MEM);

  $cachepipe->cmd("classlm",
                  "$JOSHUA/bin/lmplz -o $CLASS_LM_ORDER -T $TMPDIR -S $mem --discount_fallback=0.5 1 1.5 --verbose_header --text $CLASS_LM_CORPUS $LM_OPTIONS | gzip -9n > $class_lmfile",
                  "$CLASS_LM_CORPUS",
                  $class_lmfile);
}

# Merge every language model collected in @LMFILES into a single one, which
# then replaces the list.
if ($MERGE_LMS) {
  my $merged_lm = "lm-merged.gz";

  # Use the first target reference if there are multiple ones, i.e. when the
  # plain target file does not exist.
  my $target_ref = "$TUNE{target}.0";
  $target_ref = $TUNE{target} if -e $TUNE{target};

  $cachepipe->cmd("merge-lms",
                  "$JOSHUA/scripts/support/merge_lms.py @LMFILES $target_ref lm-merged.gz --temp-dir data/merge_lms ",
                  @LMFILES,
                  $merged_lm);

  # The individual models are no longer needed.
  @LMFILES = ();

  # Compile the merged LM when the LM type has a compiled format.
  my $final_lm = $merged_lm;
  if ($LM_TYPE eq "kenlm" || $LM_TYPE eq "berkeleylm") {
    $final_lm = compile_lm($merged_lm);
  }
  push(@LMFILES, get_absolute_path($final_lm, $RUNDIR));
}

if (! -d $DATA_DIRS{tune}) {
  system("mkdir -p $DATA_DIRS{tune}");
}

# $TUNE_GRAMMAR is the tuning grammar that was passed in explicitly, or else
# the main grammar.  When filtering is requested and possible (a main grammar
# exists, we are not decoding lattices, and no tuning grammar was passed in),
# it becomes the grammar filtered to the tuning set.
my $TUNE_GRAMMAR = $_TUNE_GRAMMAR_FILE || $GRAMMAR_FILE;
if ($DO_FILTER_TM && defined($GRAMMAR_FILE) && !$DOING_LATTICES && !defined($_TUNE_GRAMMAR_FILE)) {
  $TUNE_GRAMMAR = "$DATA_DIRS{tune}/grammar.filtered.gz";

  # The filtering itself is only run on the first optimizer run.
  if ($OPTIMIZER_RUN == 1 && !is_packed($TUNE_GRAMMAR)) {
    $cachepipe->cmd("filter-tune",
                    "$SCRIPTDIR/support/filter_grammar.sh -g $GRAMMAR_FILE $FILTERING -v $TUNE{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TUNE_GRAMMAR",
                    $GRAMMAR_FILE,
                    $TUNE{source},
                    $TUNE_GRAMMAR);
  }
}

# Create the glue grammar from the symbols found in the tuning grammar.  This
# needs a $TUNE_GRAMMAR (there may be none if we skipped straight to tuning)
# and is not done for phrase-based grammar types.
if ($OPTIMIZER_RUN == 1 && defined($TUNE_GRAMMAR) && $GRAMMAR_TYPE ne "phrase" && $GRAMMAR_TYPE ne "moses") {
  if (defined $GLUE_GRAMMAR_FILE) {
    # A glue grammar was supplied: just symlink it into the tuning directory.
    my $link_name = join("/", $DATA_DIRS{tune}, basename($GLUE_GRAMMAR_FILE));
    system("ln -sf $GLUE_GRAMMAR_FILE $link_name");
  } else {
    my $glue_path = "$DATA_DIRS{tune}/grammar.glue";
    $cachepipe->cmd("glue-tune",
                    "$JOSHUA/scripts/support/create_glue_grammar.sh $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
                    get_file_from_grammar($TUNE_GRAMMAR),
                    $glue_path);
    $GLUE_GRAMMAR_FILE = $glue_path;
  }
}

# Collect the decoder's feature functions and, alongside them, the string of
# initial weights ("name value name value ...").
my $weightstr = "";
my @feature_functions;

# One language-model feature per LM file; weights are named lm_0, lm_1, ...
my $lm_index = 0;
foreach my $lm_path (@LMFILES) {
  my $lm_feature = $LM_STATE_MINIMIZATION
      ? "StateMinimizingLanguageModel -lm_order $LM_ORDER -lm_file $lm_path"
      : "LanguageModel -lm_type $LM_TYPE -lm_order $LM_ORDER -lm_file $lm_path";
  push(@feature_functions, $lm_feature);

  $weightstr .= "lm_$lm_index 1 ";
  $lm_index++;
}

# The class LM, if one was built, takes the next LM index.
if ($DO_BUILD_CLASS_LM) {
  push(@feature_functions, "LanguageModel -lm_type kenlm -lm_order $CLASS_LM_ORDER -lm_file $RUNDIR/class_lm.gz -class_map $CLASS_MAP");
  $weightstr .= "lm_$lm_index 1 ";
}

# Lattice decoding adds a source-path feature.
if ($DOING_LATTICES) {
  push(@feature_functions, "SourcePath");
  $weightstr .= "SourcePath 1.0 ";
}

# Phrase-based grammar types add distortion and phrase-penalty features.
if ($GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") {
  push(@feature_functions, "Distortion", "PhrasePenalty");
  $weightstr .= "Distortion 1.0 PhrasePenalty 1.0 ";
}

# Each feature becomes one quoted -feature-function option.
my $feature_functions = join(" ", map { qq{-feature-function "$_"} } @feature_functions);

# Build out the weight string with the translation-model features.
my $TM_OWNER = "pt";
my $GLUE_OWNER = "glue";
if (defined $TUNE_GRAMMAR) {
  # Only dense features (those with purely numeric names) get an initial weight.
  foreach my $dense (grep { /^\d+$/ } get_features($TUNE_GRAMMAR)) {
    $weightstr .= "tm_${TM_OWNER}_$dense 1 ";
  }

  # Glue grammars are only needed for hierarchical models.
  unless ($GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") {
    $weightstr .= "tm_${GLUE_OWNER}_0 1 ";
  }
}

# The translation-model type handed to the bundler is the grammar type itself.
my $tm_type = $GRAMMAR_TYPE;

# get_file_from_grammar(GRAMMAR)
#
# Cachepipe doesn't work on directories, so a grammar that is a directory is
# represented by the file "slice_00000.source" inside it; a grammar that is a
# plain file represents itself.
#
# Returns the representative path, or undef when GRAMMAR is undefined or does
# not exist.  The undef is returned explicitly (not as an empty list) because
# the sub is called inside argument lists, where it must always contribute
# exactly one element.
sub get_file_from_grammar {
  my $grammar_path = shift;

  if (!defined($grammar_path) || !-e $grammar_path) {
    return undef;
  }

  return "$grammar_path/slice_00000.source" if -d $grammar_path;
  return $grammar_path;
}

# The first tuning run lives directly in tune/ (for backward compatibility),
# with tune/1 symlinked back to it.  Later runs live in tune/<run number>.
my $tunedir = ($OPTIMIZER_RUN == 1) ? "$RUNDIR/tune" : "$RUNDIR/tune/$OPTIMIZER_RUN";
if (! -d $tunedir) {
  system("mkdir -p $tunedir");
}
if ($OPTIMIZER_RUN == 1) {
  symlink("$RUNDIR/tune", "$RUNDIR/tune/1");
}

# Second attempt, in case the directory is still missing.
if (! -d $tunedir) {
  system("mkdir -p $tunedir");
}

# Build the filtered tuning model
my $tunemodeldir = "$RUNDIR/tune/model";

# Bundler arguments: $tm_switch names the translation models to put into the
# bundle, $tm_copy_config_args adjusts their entries in the config.
my $tm_switch = "";
my $tm_copy_config_args = "";
if (defined $TUNE_GRAMMAR) {
  my $tm_flag = $DO_PACK_GRAMMARS ? "--pack-tm" : "--tm";
  $tm_switch = "$tm_flag $TUNE_GRAMMAR";
  $tm_copy_config_args = " -tm0/type $tm_type -tm0/owner ${TM_OWNER} -tm0/maxspan $MAXSPAN";
}

if ($GRAMMAR_TYPE eq "phrase" or $GRAMMAR_TYPE eq "moses") {
  # Phrase-based grammar types have no glue grammar: remove it from the
  # config template.
  $tm_copy_config_args .= " -tm1 DELETE";
}
elsif (defined $GLUE_GRAMMAR_FILE) {
  # Otherwise put the glue grammar in as the second translation model.
  $tm_switch .= " --tm $GLUE_GRAMMAR_FILE";
  $tm_copy_config_args .= " -tm1/owner ${GLUE_OWNER}";
}

# Build the tuning bundle.  This happens on the first optimizer run only.
if ($OPTIMIZER_RUN == 1) {
  # Cachepipe needs a file to watch for the grammar; fall back to the config.
  my $grammar_dep = get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG;

  $cachepipe->cmd("tune-bundle",
                  "$BUNDLER --force --symlink --absolute --verbose -T $TMPDIR $JOSHUA_CONFIG $tunemodeldir --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false -search $SEARCH_ALGORITHM -weights \"$weightstr\" $feature_functions ${tm_copy_config_args}' ${tm_switch}",
                  $JOSHUA_CONFIG,
                  $grammar_dep,
                  "$tunemodeldir/run-joshua.sh");
}

# Point $TUNE_GRAMMAR at its new location inside the bundle: either under its
# own base name or with ".packed" appended, whichever exists (checked in that
# order, relative to the current directory).
if (defined $TUNE_GRAMMAR) {
  my $bundled = "tune/model/" . basename($TUNE_GRAMMAR);
  my ($found) = grep { -e $_ } ($bundled, "$bundled.packed");

  if (! defined $found) {
    print STDERR "* FATAL: tune model bundling didn't produce a grammar?\n";
    exit 1;
  }
  $TUNE_GRAMMAR = $found;
}

# Copy the generated config to the tunedir, and update the config file location
system("cp $tunemodeldir/joshua.config $tunedir/joshua.config");
$JOSHUA_CONFIG = "$tunedir/joshua.config";

# Write the decoder run command. The decoder will use the config file in the bundled
# directory, continually updating it.

# If we're decoding a lattice, also output the source side path we chose
# NOTE(review): this reset discards anything appended to $JOSHUA_ARGS earlier
# in the script (the GHKM/moses grammar step appends an -oov-list option);
# confirm that is intended.
$JOSHUA_ARGS = "";
if ($DOING_LATTICES) {
  $JOSHUA_ARGS .= " -maxlen 0 -lattice-decoding";
}
$JOSHUA_ARGS .= " -output-format \"%i ||| %s ||| %f ||| %c\"";
$JOSHUA_ARGS .= " $_JOSHUA_ARGS" if defined $_JOSHUA_ARGS;

# The decoder command is a one-line executable script handed to the tuner.
# Failing to write it is fatal: the tuner would otherwise be started with a
# missing or stale command file.
{
  my $command_file = "$tunedir/decoder_command";
  open(my $dec_cmd_fh, '>', $command_file)
      or die "* FATAL: couldn't write '$command_file': $!\n";
  print $dec_cmd_fh "cat $TUNE{source} | $tunemodeldir/run-joshua.sh -m $JOSHUA_MEM -config $JOSHUA_CONFIG -threads $NUM_THREADS $JOSHUA_ARGS > $tunedir/output.nbest 2> $tunedir/joshua.log\n";
  close($dec_cmd_fh)
      or die "* FATAL: couldn't finish writing '$command_file': $!\n";
  chmod(0755, $command_file);
}

# tune
if ($TUNER ne "kbmira") {
  # Every tuner other than kbmira goes through run_tuner.py, which drives
  # the decoder command written above.
  $cachepipe->cmd("${TUNER}-${OPTIMIZER_RUN}",
                  "$SCRIPTDIR/training/run_tuner.py $TUNE{source} $TUNE{target} --tunedir $tunedir --tuner $TUNER --decoder $tunedir/decoder_command --decoder-config $JOSHUA_CONFIG --decoder-output-file $tunedir/output.nbest --decoder-log-file $tunedir/joshua.log --iterations $TUNER_ITERATIONS --metric '$METRIC'",
                  $TUNE{source},
                  $JOSHUA_CONFIG,
                  get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
                  "$tunedir/joshua.config.final");

} else { # Moses' batch kbmira
  # With several references, pass the common prefix "<target>." of the
  # numbered reference files.
  my $refs_path = $TUNE{target};
  $refs_path .= "." if (get_numrefs($TUNE{target}) > 1);

  # The decoder arguments are embedded in a double-quoted shell word, so
  # their own double quotes have to be escaped.
  my $extra_args = $JOSHUA_ARGS;
  $extra_args =~ s/"/\\"/g;
  $cachepipe->cmd("kbmira-${OPTIMIZER_RUN}",
                  "$SCRIPTDIR/training/mira/run-mira.pl --mertdir $MOSES/bin --rootdir $MOSES/scripts --batch-mira --working-dir $tunedir --maximum-iterations $TUNER_ITERATIONS --nbest $NBEST --no-filter-phrase-table --decoder-flags \"-m $JOSHUA_MEM -threads $NUM_THREADS -moses $extra_args\" $TUNE{source} $refs_path $tunemodeldir/run-joshua.sh $JOSHUA_CONFIG > $tunedir/mira.log 2>&1",
                  get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
                  $TUNE{source},
                  "$tunedir/joshua.config.final");
}

# From here on the tuned config is the one to use.
$JOSHUA_CONFIG = "$tunedir/joshua.config.final";

# Go to the next tuning run if tuning is the last step.
maybe_quit("TUNE");

#################################################################
## TESTING ######################################################
#################################################################

TEST:
    ;

# Prepare the testing data.  This is done once, on the first optimizer run.
unless ($PREPPED{TEST} or $OPTIMIZER_RUN != 1) {
  my $prefixes = prepare_data("test", [$TEST], $MAXLEN_TEST);
  my $test_prefix = "$DATA_DIRS{test}/$prefixes->{lowercased}";
  $TEST{source} = "$test_prefix.$SOURCE";
  $TEST{target} = "$test_prefix.$TARGET";
  $PREPPED{TEST} = 1;
}

if (! -d $DATA_DIRS{test}) {
  system("mkdir -p $DATA_DIRS{test}");
}

# The test grammar is the one that was passed in explicitly, or else the main
# grammar.
my $TEST_GRAMMAR = $_TEST_GRAMMAR_FILE || $GRAMMAR_FILE;

# On the first test run, we take some pains to prepare the model by filtering
# it to the test set, which won't have to be done for subsequent runs.
# Filtering needs a main grammar, is skipped for lattices, and is skipped
# when a test grammar was passed in or the grammar is packed.
if ($DO_FILTER_TM && defined($GRAMMAR_FILE) && !$DOING_LATTICES && !defined($_TEST_GRAMMAR_FILE)
    && $OPTIMIZER_RUN == 1 && !is_packed($TEST_GRAMMAR)) {
  $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";

  $cachepipe->cmd("filter-test",
                  "$SCRIPTDIR/support/filter_grammar.sh -g $GRAMMAR_FILE $FILTERING -v $TEST{source} | $SCRIPTDIR/training/filter-rules.pl -bus$SCOPE | gzip -9n > $TEST_GRAMMAR",
                  $GRAMMAR_FILE,
                  $TEST{source},
                  $TEST_GRAMMAR);
}

# Create the glue grammar (not for phrase-based grammar types).
if ($OPTIMIZER_RUN == 1 && defined($TEST_GRAMMAR) && $GRAMMAR_TYPE ne "phrase" && $GRAMMAR_TYPE ne "moses") {
  if (defined $GLUE_GRAMMAR_FILE) {
    # A glue grammar already exists: just symlink it into the test data
    # directory.  A relative path is taken to be relative to $STARTDIR.
    my $link_name = join("/", $DATA_DIRS{test}, basename($GLUE_GRAMMAR_FILE));
    my $link_target = ($GLUE_GRAMMAR_FILE =~ m{^/})
        ? $GLUE_GRAMMAR_FILE
        : "$STARTDIR/$GLUE_GRAMMAR_FILE";
    system("ln -sf $link_target $link_name");
  } else {
    my $glue_path = "$DATA_DIRS{test}/grammar.glue";
    $cachepipe->cmd("glue-test",
                    "$JOSHUA/scripts/support/create_glue_grammar.sh $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
                    get_file_from_grammar($TEST_GRAMMAR),
                    $glue_path);
    $GLUE_GRAMMAR_FILE = $glue_path;
  }
}

# Create the test directory.  As with tuning, the first run lives directly in
# test/ with test/1 symlinked back to it; later runs live in test/<run number>.
my $testdir = ($OPTIMIZER_RUN == 1) ? "$RUNDIR/test" : "$RUNDIR/test/$OPTIMIZER_RUN";
if (! -d $testdir) {
  system("mkdir -p $testdir");
}
if ($OPTIMIZER_RUN == 1) {
  symlink("$RUNDIR/test", "$RUNDIR/test/1");
}

# Translation models to put into the test bundle: the test grammar first ...
$tm_switch = "";
if (defined $TEST_GRAMMAR) {
  $tm_copy_config_args = "";
  my $tm_flag = $DO_PACK_GRAMMARS ? "--pack-tm" : "--tm";
  $tm_switch = "$tm_flag $TEST_GRAMMAR";
}

# ... then the glue grammar.
$tm_switch .= " --tm $GLUE_GRAMMAR_FILE" if defined $GLUE_GRAMMAR_FILE;

# Build the test model
my $testmodeldir = "$RUNDIR/test/$OPTIMIZER_RUN/model";
{
  # Cachepipe needs a file to watch for the grammar; fall back to the config.
  my $grammar_dep = get_file_from_grammar($TEST_GRAMMAR) || $JOSHUA_CONFIG;

  $cachepipe->cmd("test-bundle-${OPTIMIZER_RUN}",
                  "$BUNDLER --force --symlink --absolute --verbose -T $TMPDIR $JOSHUA_CONFIG $testmodeldir --copy-config-options '-top-n $NBEST -pop-limit 5000 -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false' ${tm_switch}",
                  $JOSHUA_CONFIG,
                  $grammar_dep,
                  "$testmodeldir/joshua.config");
}

# Point $TEST_GRAMMAR at its new location inside the bundle: either under its
# own base name or with ".packed" appended, whichever exists (checked in that
# order).
if (defined $TEST_GRAMMAR) {
  my $bundled = "$testmodeldir/" . basename($TEST_GRAMMAR);
  my ($found) = grep { -e $_ } ($bundled, "$bundled.packed");

  if (! defined $found) {
    print STDERR "* FATAL: test model bundling didn't produce a grammar?";
    exit 1;
  }
  $TEST_GRAMMAR = $found;
}

my $bestoutput = "$testdir/output";
my $nbestoutput = "$testdir/output.nbest";
my $output;

# If we're decoding a lattice, also output the source side path we chose
$JOSHUA_ARGS = "";
if ($DOING_LATTICES) {
  $JOSHUA_ARGS .= " -maxlen 0 -lattice-decoding -output-format \"%i ||| %s ||| %e ||| %f ||| %c\"";
}

if ($DO_MBR) {
  $JOSHUA_ARGS .= " -top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\"";
  $output = $nbestoutput;
} else {
  $JOSHUA_ARGS .= " -top-n 0 -output-format %s";
  $output = $bestoutput;
}
$JOSHUA_ARGS .= " $_JOSHUA_ARGS" if defined $_JOSHUA_ARGS;

# Write the decoder run command
open DEC_CMD, ">$testdir/decoder_command";
print DEC_CMD "cat $TEST{source} | $testmodeldir/run-joshua.sh -m $JOSHUA_MEM -threads $NUM_THREADS $JOSHUA_ARGS > $output 2> $testdir/joshua.log\n";
close(DEC_CMD);
chmod(0755,"$testdir/decoder_command");

# Decode. $output here is either $nbestoutput (if doing MBR decoding, in which case we'll
# need the n-best output) or $bestoutput (which only outputs the hypothesis but is tons faster)
$cachepipe->cmd("test-decode-${OPTIMIZER_RUN}",
                "$testdir/decoder_command",
                $TEST{source},
                "$testdir/decoder_command",
                "$testmodeldir/joshua.config",
                get_file_from_grammar($TEST_GRAMMAR) || "$testmodeldir/joshua.config",
                $output);

# $cachepipe->cmd("remove-oov",
#                 "cat $testoutput | perl -pe 's/_OOV//g' > $testoutput.noOOV",
#                 $testoutput,
#                 "$testoutput.noOOV");

# Extract the 1-best output from the n-best file if the n-best file alone was output
if ($DO_MBR) {
  $cachepipe->cmd("test-extract-onebest-${OPTIMIZER_RUN}",
                  "java -Xmx500m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.util.ExtractTopCand $nbestoutput $bestoutput",
                  $nbestoutput,
                  $bestoutput);
}

# Now compute the BLEU score on the 1-best output. The scorer is given $bestoutput rather
# than $output: under MBR, $output is the n-best file ("%i ||| %s ||| %f ||| %c" records),
# whereas $bestoutput always holds the 1-best hypotheses.
$cachepipe->cmd("test-bleu-${OPTIMIZER_RUN}",
                "$JOSHUA/bin/bleu $bestoutput $TEST{target} > $testdir/bleu",
                $bestoutput,
                "$testdir/bleu");

# Update the BLEU summary.
# Sometimes the target side for test doesn't exist (e.g., WMT)
if (-e $TEST{target} || -e "$TEST{target}.0") {
  compute_bleu_summary("test/*/bleu", "test/final-bleu");

  # METEOR is optional and, like BLEU, is computed on the 1-best output
  if (defined $METEOR) {
    $cachepipe->cmd("test-meteor-${OPTIMIZER_RUN}",
                    "$JOSHUA/bin/meteor $bestoutput $TEST{target} $TARGET > $testdir/meteor",
                    $bestoutput,
                    "$testdir/meteor");
    compute_meteor_summary("test/*/meteor", "test/final-meteor");
  }
}

# MBR: rerank the n-best list with the minimum-risk reranker, then score the reranked output.
if ($DO_MBR) {
  my $mbr_output = "$testdir/output.mbr";

  $cachepipe->cmd("test-onebest-parmbr-${OPTIMIZER_RUN}",
                  "cat $nbestoutput | java -Xmx1700m -cp $JOSHUA/class -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 $NUM_THREADS > $mbr_output",
                  $nbestoutput,
                  $mbr_output);

  if (-e $TEST{target}) {
    # Score the reranked file itself (the file this step declares as its dependency).
    # NOTE(review): $numrefs is not assigned in this part of the script; presumably it is
    # set earlier -- confirm.
    $cachepipe->cmd("test-bleu-mbr-${OPTIMIZER_RUN}",
                    "$JOSHUA/bin/bleu $mbr_output $TEST{target} $numrefs > $testdir/bleu.mbr",
                    $mbr_output,
                    "$testdir/bleu.mbr");

    compute_bleu_summary("test/*/bleu.mbr", "test/final-bleu-mbr");
  }
}

# Summarize the decoding times recorded in the decoder logs of all test runs
compute_time_summary("test/*/joshua.log", "test/final-times");

# Now do the analysis. The source side handed to the analysis is the test source itself,
# except for lattices, where it is the path through the lattice that the decoder chose.
{
  my $analysis_source = $TEST{source};

  if ($DOING_LATTICES) {
    # extract the source
    # NOTE(review): this reads $nbestoutput, which the decoder command only writes when
    # $DO_MBR is set -- confirm that lattice runs without MBR are handled.
    $analysis_source = "$testdir/test.lattice-path.txt";
    $cachepipe->cmd("test-lattice-extract-source-${OPTIMIZER_RUN}",
                    "$JOSHUA/bin/extract-1best $nbestoutput 2 | perl -pe 's/<s> //' > $analysis_source",
                    $nbestoutput, $analysis_source);
  }

  analyze_testrun($bestoutput,$analysis_source,$TEST{target});
}


######################################################################
## SUBROUTINES #######################################################
######################################################################
# Jump label attached to a no-op true statement. Presumably the target of a `goto LAST`
# elsewhere in the script (not visible in this section) -- TODO confirm.
LAST:
		1;

# Does tokenization and normalization of training, tuning, and test data.
# $label: one of train, tune, or test
# $corpora: arrayref of file prefixes (multiple allowed for training data); the language
#           extensions are appended to each prefix
# $maxlen: maximum length (only applicable to training); undef or 0 disables trimming
#
# Returns a hashref recording the file prefix produced by each stage. "input" and
# "last_step" are always set; "tokenized", "shortened", and "lowercased" are only set
# when $DO_PREPARE_CORPORA is on.
sub prepare_data {
  my ($label,$corpora,$maxlen) = @_;
  $maxlen = 0 unless defined $maxlen;

  system("mkdir -p $DATA_DIR") unless -d $DATA_DIR;
  system("mkdir -p $DATA_DIRS{$label}") unless -d $DATA_DIRS{$label};

  # records the pieces that are produced
  my %prefixes;

  # Build the list of extensions. For training data, there may be multiple corpora; for
  # tuning and test data, there may be multiple references. Only the first corpus is
  # inspected to decide which target-side files exist.
  my @exts = ($SOURCE);
  my $target_corpus = "$corpora->[0].$TARGET";
  push(@exts, $TARGET) if -e $target_corpus;
  for (my $i = 0; -e "$target_corpus.$i"; $i++) {
    push(@exts, "$TARGET.$i");
  }

  # Read through all input files, concatenate them (if multiple were passed), and filter them
  # First, assemble the inputs (as shell process substitutions) and the output file names
  my (@infiles, @indeps, @outfiles);
  foreach my $ext (@exts) {
    my @files =  map { "$_.$ext" } @$corpora;
    push(@indeps, @files);
    if ($MAXLINES != 0) {
      push(@infiles, "<(head -qn $MAXLINES " . join(" ", @files) . ")");
    } else {
      push(@infiles, "<(cat " . join(" ", @files) . ")");
    }
    push (@outfiles, "$DATA_DIRS{$label}/$label.$ext");
  }

  my $infiles =  join(" ", @infiles);
  my $outfiles = join(" ", @outfiles);
  # skip blank lines for everything except test data
  if ($label ne "test") {
    $cachepipe->cmd("$label-copy-and-filter",
                    "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
                    @indeps, @outfiles);
  } else {
    $cachepipe->cmd("$label-copy-and-filter",
                    "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
                    @indeps, @outfiles);
  }

  # Done concatenating and filtering files

  # record where the concatenated input files were
  $prefixes{last_step} = $prefixes{input} = "$DATA_DIRS{$label}/$label";

  if ($DO_PREPARE_CORPORA) {
    my $prefix = $label;

    # tokenize the data
    foreach my $lang (@exts) {
      if (-e "$DATA_DIRS{$label}/$prefix.$lang") {
        if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang")) {
          # lattices are passed through untouched
          system("cp $DATA_DIRS{$label}/$prefix.$lang $DATA_DIRS{$label}/$prefix.tok.$lang");
        } else {
          my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET;

          # Strip the reference index (e.g., "en.3" or "en.12" -> "en") so that the
          # normalizer and tokenizer receive the bare language code.
          (my $ext = $lang) =~ s/\.\d+$//;
          $cachepipe->cmd("$label-tokenize-$lang",
                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null > $DATA_DIRS{$label}/$prefix.tok.$lang",
                          "$DATA_DIRS{$label}/$prefix.$lang", "$DATA_DIRS{$label}/$prefix.tok.$lang");
        }

      }
    }
    # extend the prefix
    $prefix .= ".tok";
    $prefixes{tokenized} = $prefix;

    if ($maxlen > 0) {
      # Only the files that actually exist take part in the trimming
      my (@trim_infiles, @trim_outfiles);
      foreach my $ext (@exts) {
        my $infile = "$DATA_DIRS{$label}/$prefix.$ext";
        my $outfile = "$DATA_DIRS{$label}/$prefix.$maxlen.$ext";
        if (-e $infile) {
          push(@trim_infiles, $infile);
          push(@trim_outfiles, $outfile);
        }
      }

      my $infilelist = join(" ", @trim_infiles);
      my $outfilelist = join(" ", @trim_outfiles);

      # trim training data
      $cachepipe->cmd("$label-trim",
                      "$PASTE $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/support/split2files $outfilelist",
                      @trim_infiles,
                      @trim_outfiles);
      $prefix .= ".$maxlen";
    }
    # record this whether we shortened or not
    $prefixes{shortened} = $prefix;

    # lowercase
    foreach my $lang (@exts) {
      if (-e "$DATA_DIRS{$label}/$prefix.$lang") {
        if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang")) {
          # lattices are copied as they are
          system("cat $DATA_DIRS{$label}/$prefix.$lang > $DATA_DIRS{$label}/$prefix.lc.$lang");
        } else {
          $cachepipe->cmd("$label-lowercase-$lang",
                          "cat $DATA_DIRS{$label}/$prefix.$lang | $LOWERCASER > $DATA_DIRS{$label}/$prefix.lc.$lang",
                          "$DATA_DIRS{$label}/$prefix.$lang",
                          "$DATA_DIRS{$label}/$prefix.lc.$lang");
        }
      }
    }
    $prefix .= ".lc";
    $prefixes{last_step} = $prefixes{lowercased} = $prefix;
  }

  # Point corpus.<ext> at the output of the last step that was run
  foreach my $lang (@exts) {
    system("ln -sf $prefixes{last_step}.$lang $DATA_DIRS{$label}/corpus.$lang");
  }

  # Build a vocabulary
  foreach my $ext (@exts) {
    $cachepipe->cmd("$label-vocab-$ext",
                    "cat $DATA_DIRS{$label}/corpus.$ext | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{$label}/vocab.$ext",
                    "$DATA_DIRS{$label}/corpus.$ext",
                    "$DATA_DIRS{$label}/vocab.$ext");
  }

  return \%prefixes;
}

# Ends the pipeline (exit status 0) when $current_step is the step named by $LAST_STEP.
# Does nothing when $LAST_STEP is undefined or names a different step.
sub maybe_quit {
  my ($current_step) = @_;

  return unless defined $LAST_STEP;
  return unless $current_step eq $LAST_STEP;

  print "* Quitting at this step\n";
  exit(0);
}

## Returns 1 if every sentence in the corpus begins with an open paren,
## 0 otherwise. An empty corpus counts as parsed, since no line violates the rule.
## $corpus: path of the corpus file
## Dies if the corpus cannot be opened.
sub already_parsed {
  my ($corpus) = @_;

  # Three-argument open with a lexical handle: the name is always taken as a plain path,
  # and the handle is released on every return path, including the early one below.
  open(my $corpus_fh, "<", $corpus) or die "can't read corpus file '$corpus'\n";

  # Lines are read into a named variable so that the caller's $_ is left alone.
  while (my $line = <$corpus_fh>) {
    # if we see a line not beginning with an open paren, we consider
    # the file not to be parsed
    return 0 unless $line =~ /^\(/;
  }
  close($corpus_fh);

  return 1;
}

# Reports a missing environment variable and aborts.
# $var: name of the variable, without the leading '$'
# Does not return. The exit status is non-zero so that whatever launched the script can
# tell the fatal error apart from a successful run.
sub not_defined {
  my ($var) = @_;

  print "* FATAL: environment variable \$$var is not defined.\n";
  exit(1);
}

# Counts the references available for a prefix. Parallel references are looked up as
# "$prefix.0", "$prefix.1", and so on; counting stops at the first index that does not
# exist. When "$prefix.0" does not exist, a single reference is assumed and 1 is returned.
sub get_numrefs {
  my ($prefix) = @_;

  my $count = 0;
  $count++ while -e "$prefix.$count";

  # no indexed files at all means one (unindexed) reference
  return $count || 1;
}

# Tells whether a file holds lattices, judged by whether its first line starts with "(((".
# The file is read through the $CAT command.
# $file: path of the file to inspect
# Side effect: when a lattice is found, sets the globals $DOING_LATTICES (to 1) and
# $FILTERING (to "-l").
# Returns 1 for a lattice and 0 otherwise (including when the file has no lines at all).
# Dies if the reader command cannot be started.
sub is_lattice {
  my $file = shift;

  open(my $reader, "-|", "$CAT $file") or die "can't read from potential lattice '$file'";
  my $line = <$reader>;
  close($reader);

  # an empty file yields no first line, and so cannot be a lattice
  return 0 unless defined $line && $line =~ /^\(\(\(/;

  $DOING_LATTICES = 1;
  $FILTERING = "-l";
  return 1;
}

# Set membership: is value in array?
# $value: the string to look for
# $array: arrayref of candidate strings
# Returns the elements equal to $value in list context, and their number in scalar
# (and therefore boolean) context. The comparison is exact string equality, so values
# containing regular expression metacharacters (".", "+", "|", ...) are handled literally.
sub in {
  my ($value, $array) = @_;
  return grep { $_ eq $value } @$array;
}

# This function retrieves the names of all the features in the grammar. Dense features
# are named with consecutive integers starting at 0, while sparse features can have any name.
# To get the feature names from an unpacked grammar, we have to read through the whole grammar,
# since sparse features can be anywhere. For packed grammars (a directory), the names are
# obtained from the output of Joshua's EncoderConfiguration tool.
#
# $grammar: path to a grammar file (read through $CAT) or to a packed grammar directory
# Returns the list of feature names (in no particular order for unpacked grammars), or an
# empty list if $grammar does not exist.
sub get_features {
  my ($grammar) = @_;

  if (-d $grammar) {
    chomp(my @features = `java -cp $JOSHUA/target/joshua-*-with-dependencies.jar org.apache.joshua.util.encoding.EncoderConfiguration $grammar | grep ^feature: | awk '{print \$NF}'`);
    return @features;
  }

  # nothing to read
  return unless -e $grammar;

  my %features;
  open(my $grammar_fh, "-|", "$CAT $grammar") or die "FATAL: can't read $grammar";
  while (my $line = <$grammar_fh>) {
    chomp($line);
    my @tokens = split(/ \|\|\| /, $line);
    # field 4 for regular grammars, field 3 for phrase tables
    my $feature_str = ($line =~ /^\[/) ? $tokens[3] : $tokens[2];

    # lines with too few fields carry no features
    next unless defined $feature_str;

    # Unlabeled (dense) features are numbered by their position among the unlabeled
    # features of the rule; labeled (sparse) features are recorded under their name.
    my $feature_no = 0;
    foreach my $feature (split(' ', $feature_str)) {
      if ($feature =~ /=/) {
        my ($name) = split(/=/, $feature);
        $features{$name} = 1;
      } else {
        $features{$feature_no++} = 1;
      }
    }
  }
  close($grammar_fh);

  return keys(%features);
}

# Turns a possibly relative file name into an absolute one, which --rundir needs.
# $file: the file name; an undefined value is handed back unchanged
# $basedir: directory that relative names are resolved against (defaults to $STARTDIR)
# The name is then resolved with abs_path(). When abs_path() yields no result (e.g.,
# for paths that do not exist), the unresolved name is returned instead.
sub get_absolute_path {
  my ($file,$basedir) = @_;

  return $file unless defined $file;

  $basedir = $STARTDIR if !defined $basedir;

  # prepend the base directory unless the path is already absolute
  my $candidate = ($file =~ /^\//) ? $file : "$basedir/$file";

  my $resolved = abs_path($candidate);
  return defined($resolved) ? $resolved : $candidate;
}

# Runs the sentence-by-sentence analysis script on a test run, writing
# analysis/sentence-by-sentence.html in the directory that contains $output.
# $output: the decoder's output file
# $source: the source side of the test set
# $reference: the reference; either a single file, or a prefix with multiple references
#             stored as "$reference.0", "$reference.1", ...
# Does nothing when no reference file exists.
sub analyze_testrun {
  my ($output,$source,$reference) = @_;
  my $dir = dirname($output);

  # Collect the references. Indexed references win over the bare file; they are also
  # accepted when the bare file does not exist, which is the usual multi-reference layout.
  my @references;
  for (my $num = 0; -e "$reference.$num"; $num++) {
    push(@references, "$reference.$num");
  }
  push(@references, $reference) if !@references && -e $reference;

  return unless @references;

  mkdir("$dir/analysis") unless -d "$dir/analysis";

  # each reference gets its own -r option
  my $references = join(" -r ", @references);

  $cachepipe->cmd("analyze-test-${OPTIMIZER_RUN}",
                  "$SCRIPTDIR/analysis/sentence-by-sentence.pl -s $source -r $references $output > $dir/analysis/sentence-by-sentence.html",
                  $output,
                  "$dir/analysis/sentence-by-sentence.html");
}

# Averages the METEOR scores of all runs and writes a one-line summary of the form
# "s1 + s2 + ... / N = average".
# $filepattern: shell pattern matching the per-run METEOR output files
# $outputfile: where the summary is written
# The score of a run is the last field of its "Final score" line. No summary is written
# when no score is found. Dies if grep cannot be started or the summary cannot be written.
sub compute_meteor_summary {
  my ($filepattern, $outputfile) = @_;

  # Collect one score per matching line. With several files grep prefixes each line with
  # the file name, which is harmless because only the last field is used.
  my @scores;
  open(my $grep, "-|", "grep '^Final score' $filepattern")
      or die "Can't run grep on $filepattern: $!";
  while (my $line = <$grep>) {
    my @F = split(' ', $line);
    push(@scores, 1.0 * $F[-1]);
  }
  close($grep);

  if (scalar @scores) {
    my $final_score = sum(@scores) / (scalar @scores);

    open(my $summary, ">", $outputfile) or die "Can't write to $outputfile";
    printf {$summary} "%s / %d = %.4f\n", join(" + ", @scores), scalar @scores, $final_score;
    close($summary);
  }
}

# Averages the BLEU scores of all runs and writes a one-line summary of the form
# "b1 + b2 + ... / N = average".
# $filepattern: shell pattern matching the per-run BLEU output files
# $outputfile: where the summary is written
# Every line containing " BLEU = " contributes its last field as a score. Nothing is
# written when no such line is found.
sub compute_bleu_summary {
  my ($filepattern, $outputfile) = @_;

  # one score per line reported by grep
  open(my $grep, "-|", "grep ' BLEU = ' $filepattern");
  my @bleus = map { my @fields = split; 1.0 * $fields[-1] } <$grep>;
  close($grep);

  return unless @bleus;

  # Now average the runs, report BLEU
  my $average = sum(@bleus) / scalar(@bleus);

  open(my $summary, ">", $outputfile) or die "Can't write to $outputfile";
  printf {$summary} "%s / %d = %.4f\n", join(" + ", @bleus), scalar(@bleus), $average;
  close($summary);
}

# Sums the per-sentence translation times of each decoder log and writes a one-line
# summary of the form "t1 + t2 + ... / N = average", where each t is the total of one log.
# $filepattern: glob pattern matching the decoder logs
# $outputfile: where the summary is written
# A log contributes the fifth whitespace-separated field of each line starting with
# "Input <n>: Translation took". Logs that cannot be opened are reported and skipped, so
# that they do not enter the average as zero. Nothing is written when no log was read.
sub compute_time_summary {
  my ($filepattern, $outputfile) = @_;

  # Total the translation time of each run
  my @times;
  foreach my $file (glob($filepattern)) {
    my $log;
    unless (open($log, "<", $file)) {
      warn "* WARNING: can't read $file: $!\n";
      next;
    }

    my $time = 0.0;
    while (my $line = <$log>) {
      next unless $line =~ /^Input \d+: Translation took/;
      my @F = split(' ', $line);
      $time += $F[4];
    }
    close($log);

    push(@times, $time);
  }

  # Now average the runs, report the times
  if (scalar @times) {
    open(my $summary, ">", $outputfile) or die "Can't write to $outputfile";
    printf {$summary} "%s / %d = %s\n", join(" + ", @times), scalar(@times), 1.0 * sum(@times) / scalar(@times);
    close($summary);
  }
}

# A grammar counts as packed when it is a directory that contains an entry named
# "encoding". Returns 1 in that case and 0 otherwise.
sub is_packed {
  my ($grammar) = @_;

  return (-d $grammar && -e "$grammar/encoding") ? 1 : 0;
}

# Runs the external NER tagger ($NER_TAGGER) on a file, if one is configured.
# $inputfile, $outputfile, $lang: handed to the tagger, in this order, as its arguments
# Returns 0 when no tagger is configured, and 2 after the tagger has produced $outputfile.
# Exits with status 1 when the tagger is missing or did not create $outputfile.
sub ner_annotate {
  my ($inputfile, $outputfile, $lang) = @_;

  # nothing to do without a tagger
  return 0 unless defined $NER_TAGGER;

  # Check if NER tagger exists
  unless (-e $NER_TAGGER) {
    print "* FATAL: The specified NER tagger was not found";
    exit(1);
  }

  $cachepipe->cmd("ner-annotate", "$NER_TAGGER $inputfile $outputfile $lang");

  # Check if annotated file exists
  unless (-e $outputfile) {
    print "* FATAL : The NER tagger did not create the required annotated file : $outputfile";
    exit(1);
  }

  return 2;
}

# Replace the tokens with types: rewrites $inputfile in place, turning every annotation
# of the form "$TYPE_(tokens)" into "TYPE", where TYPE consists of letters and digits and
# the parenthesized part is any non-empty text without a closing paren.
#
# The substitution is done in Perl rather than by shelling out to sed, so that the
# backslashes of the pattern reach the regex engine intact and no backup copy of the
# file is left behind.
#
# $inputfile: path of the file to rewrite
# Dies if the file cannot be read or written.
sub replace_tokens_with_types {
  my ($inputfile) = @_;

  open(my $in, "<", $inputfile) or die "FATAL: can't read $inputfile: $!\n";
  my @lines = <$in>;
  close($in);

  s/\$([A-Za-z0-9]+)_\([^)]+\)/$1/g foreach @lines;

  open(my $out, ">", $inputfile) or die "FATAL: can't write $inputfile: $!\n";
  print {$out} @lines;
  close($out) or die "FATAL: can't write $inputfile: $!\n";

  return;
}
