Merge remote-tracking branch 'origin/pro-mira-fix'
diff --git a/scripts/copy-config.pl b/scripts/copy-config.pl
index e5e9d97..867b470 100755
--- a/scripts/copy-config.pl
+++ b/scripts/copy-config.pl
@@ -57,7 +57,7 @@
# Step 2. Now read through the config file.
my @weights_order;
-my $tm_index = 0;
+my $tm_index = -1;
while (my $line = <>) {
if ($line =~ /^\s*$/ or $line =~ /^#/) {
# Comments, empty lines
@@ -78,9 +78,17 @@
# TMs get special treatment. We parse the line (supporting old format and new keyword format),
# and then compare to command-line args to see what gets updated
if ($norm_key =~ /^tm/) {
+ $tm_index++;
+
# get the hash of tm values from the config file
my $tm_hash = parse_tm_line($value);
+ # Delete TM lines if they've been requested to be deleted
+ if (exists $params{"tm${tm_index}"} and $params{"tm${tm_index}"} eq "DELETE") {
+ delete $params{"tm${tm_index}"};
+ next;
+ }
+
# check if each one was passed as a command-line argument, and if so, retrieve its new value
foreach my $tmkey (keys %$tm_hash) {
my $concat = "tm${tm_index}/${tmkey}";
@@ -95,7 +103,6 @@
next if $tmkey eq "type";
$params{$norm_key} .= " -$tmkey $tm_hash->{$tmkey}";
}
- $tm_index++;
}
# if the parameter was found on the command line, print out its replaced value
diff --git a/scripts/support/run_bundler.py b/scripts/support/run_bundler.py
index aea641d..9675f27 100755
--- a/scripts/support/run_bundler.py
+++ b/scripts/support/run_bundler.py
@@ -22,7 +22,7 @@
$JOSHUA/scripts/support/run_bundler.py \
--force \
--verbose \
- /path/to/origin/directory/test/1/joshua.config \
+ /path/to/origin/directory/test/model/joshua.config \
--root /path/to/origin/directory \
new-bundle-directory \
--copy-config-options \
@@ -494,7 +494,7 @@
parser.add_argument(
'config', type=argparse.FileType('r'),
help='path to the origin configuration file. e.g. '
- '/path/to/test/1/joshua.config.final'
+ '/path/to/tune/dir/joshua.config.final'
)
parser.add_argument(
'dest_dir',
diff --git a/scripts/training/class-lm/replaceTokensWithClasses.py b/scripts/training/class-lm/replaceTokensWithClasses.py
new file mode 100644
index 0000000..c7691c2
--- /dev/null
+++ b/scripts/training/class-lm/replaceTokensWithClasses.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Converts the words in a tokenized corpus into classes using the provided map.
+
+Usage:
+
+    replaceTokensWithClasses.py MAP INPUT_FILE OUTPUT_FILE
+
+where each line of MAP has the format
+
+    WORD CLASS
+
+Corpus tokens are lowercased before lookup; tokens missing from the map are
+written as the class "-1".
+"""
+
+import sys
+
+UNKNOWN_CLASS = "-1"
+
+
+def read_class_map(path):
+    """Returns a dict mapping each WORD in the map file to its CLASS.
+
+    Lines with fewer than two fields (e.g. blank lines) are skipped rather than
+    raising an IndexError; fields after the second are ignored.
+    """
+    class_map = {}
+    with open(path) as class_file:
+        for line in class_file:
+            fields = line.split()
+            if len(fields) >= 2:
+                class_map[fields[0]] = fields[1]
+    return class_map
+
+
+def replace_tokens(class_map, in_path, out_path):
+    """Writes the corpus at in_path to out_path, one class per input token."""
+    with open(in_path) as corpus, open(out_path, 'w') as output:
+        for line in corpus:
+            # NOTE(review): lookup uses the lowercased token, so map keys containing
+            # uppercase characters can never match -- confirm the map is lowercase.
+            tokens = line.strip().lower().split()
+            classes = [class_map.get(token, UNKNOWN_CLASS) for token in tokens]
+            output.write(" ".join(classes) + "\n")
+
+
+def main(argv):
+    """Runs the conversion; returns the process exit status."""
+    if len(argv) < 4:
+        sys.stderr.write(__doc__)
+        return 1
+    replace_tokens(read_class_map(argv[1]), argv[2], argv[3])
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 7bff0ac..ae42532 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -77,7 +77,7 @@
my $BUNDLER = "$JOSHUA/scripts/support/run_bundler.py";
my $STARTDIR;
my $RUNDIR = $STARTDIR = getcwd();
-my $GRAMMAR_TYPE = "hiero"; # or "itg" or "samt" or "ghkm" or "phrase"
+my $GRAMMAR_TYPE = "hiero"; # or "itg" or "samt" or "ghkm" or "phrase" or "phrasal"
my $SEARCH_ALGORITHM = "cky"; # or "stack" (for phrase-based)
# Which GHKM extractor to use ("galley" or "moses")
@@ -280,7 +280,7 @@
"ner-tagger=s" => \$NER_TAGGER,
"class-lm!" => \$DO_BUILD_CLASS_LM,
"class-lm-corpus=s" => \$CLASS_LM_CORPUS,
- "class-map" => \$CLASS_MAP,
+ "class-map=s" => \$CLASS_MAP,
);
if (! $retval) {
@@ -288,9 +288,6 @@
exit 1;
}
-# Joshua config
-my $JOSHUA_CONFIG = $_JOSHUA_CONFIG || "$TUNECONFDIR/joshua.config";
-
$RUNDIR = get_absolute_path($RUNDIR);
$TUNER = lc $TUNER;
@@ -405,18 +402,25 @@
exit 1;
}
-# make sure a grammar file was given if we're skipping training
-if (! defined $GRAMMAR_FILE) {
- if ($STEPS{$FIRST_STEP} >= $STEPS{TEST}) {
- if (! defined $_TEST_GRAMMAR_FILE) {
- print "* FATAL: need a grammar (--grammar or --test-grammar) if you're skipping to testing\n";
- exit 1;
- }
- } elsif ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) {
- if (! defined $_TUNE_GRAMMAR_FILE) {
- print "* FATAL: need a grammar (--grammar or --tune-grammar) if you're skipping grammar learning\n";
- exit 1;
- }
+# Joshua config
+my $JOSHUA_CONFIG = get_absolute_path($_JOSHUA_CONFIG || "$TUNECONFDIR/joshua.config", $STARTDIR);
+
+# make sure we have a tuned config file if we're skipping model building and tuning. The line
+# above falls back to the template config, so test the user-supplied --joshua-config value.
+if ($STEPS{$FIRST_STEP} >= $STEPS{TEST} and ! $_JOSHUA_CONFIG) {
+  print "* FATAL: You need to provide a tuned Joshua config file (--joshua-config)\n";
+  print " if you're skipping straight to testing\n";
+  exit 1;
+}
+
+# make sure we have either a config file or a grammar and LM if we're skipping model building
+if ($STEPS{$FIRST_STEP} >= $STEPS{TUNE} and ! $_JOSHUA_CONFIG) {
+  my $have_grammar = (defined $_TUNE_GRAMMAR_FILE or defined $GRAMMAR_FILE);
+  if (! $have_grammar or scalar(@LMFILES) == 0) {
+    print "* FATAL: You must provide either a Joshua config file (--joshua-config) or\n";
+    print " a grammar (--grammar or --tune-grammar) and at least one LM (--lmfile)\n";
+    print " if you're skipping straight to tuning\n";
+    exit 1;
+  }
+}
@@ -426,6 +430,10 @@
}
# check for file presence
+if (defined $JOSHUA_CONFIG and ! -e $JOSHUA_CONFIG) {
+ print "* FATAL: couldn't find joshua config file '$JOSHUA_CONFIG'\n";
+ exit 1;
+}
if (defined $GRAMMAR_FILE and ! -e $GRAMMAR_FILE) {
print "* FATAL: couldn't find grammar file '$GRAMMAR_FILE'\n";
exit 1;
@@ -490,11 +498,14 @@
exit 1;
}
-if ($TUNER eq "kbmira") {
- if (! defined $MOSES) {
- print "* FATAL: using 'kbmira' for tuning requires setting the MOSES environment variable\n";
- exit 1;
- }
+if ($TUNER eq "kbmira" and ! defined $MOSES) {
+ print "* FATAL: using 'kbmira' for tuning requires setting the MOSES environment variable\n";
+ exit 1;
+}
+
+if ($GRAMMAR_TYPE eq "phrase" and ! defined $MOSES) {
+ print "* FATAL: building phrase-based models (--type phrase) requires setting the MOSES environment variable\n";
+ exit 1;
}
if ($TUNER ne "mert" and $TUNER ne "zmert" and $TUNER ne "mira" and $TUNER ne "local-mira" and $TUNER ne "pro" and $TUNER ne "kbmira") {
@@ -771,8 +782,51 @@
system("mkdir alignments") unless -d "alignments";
- # Run the parallel aligner
- system("seq 0 $lastchunk | $SCRIPTDIR/training/paralign.pl -aligner $ALIGNER -num_threads $NUM_THREADS -giza_merge $GIZA_MERGE -aligner_mem $ALIGNER_MEM -source $SOURCE -target $TARGET -giza_trainer \"$GIZA_TRAINER\" -train_dir \"$DATA_DIRS{train}\" > alignments/run.log");
+  # Each chunk is aligned by its own single-threaded paralign.pl process. The shared log is
+  # removed once here and appended to by every child, so no chunk truncates another's output.
+  unlink("alignments/run.log");
+  my $aligner_cmd = (
+    "$SCRIPTDIR/training/paralign.pl "
+    . " -aligner $ALIGNER"
+    . " -num_threads 1"
+    . " -giza_merge $GIZA_MERGE"
+    . " -aligner_mem $ALIGNER_MEM"
+    . " -source $SOURCE"
+    . " -target $TARGET"
+    . " -giza_trainer \"$GIZA_TRAINER\""
+    . " -train_dir \"$DATA_DIRS{train}\" "
+    . ">> alignments/run.log"
+  );
+
+  # Forks a child that aligns the given chunk; returns the child's PID to the parent.
+  my $spawn_aligner = sub {
+    my ($chunk) = @_;
+    my $pid = fork();
+    # fork returns undef on failure; unchecked, the parent itself would run the exec below
+    die "* FATAL: couldn't fork aligner for chunk $chunk: $!\n" unless defined $pid;
+    if ($pid == 0) { # I am child
+      exec("echo $chunk | $aligner_cmd");
+      # Only reached when exec itself failed, so do not report success
+      exit 1;
+    }
+    return $pid;
+  };
+
+  # Start a parallel job on each core
+  my @children = ();
+  my $next_chunk = 0;
+  foreach my $core (1..$NUM_THREADS) {
+    last if $next_chunk > $lastchunk;
+    push @children, $spawn_aligner->($next_chunk++);
+  }
+
+  # Start another concurrent job as each oldest job finishes
+  while (@children) {
+    my $old_child = shift @children;
+    waitpid( $old_child, 0 );
+    print "child finished\n";
+    push @children, $spawn_aligner->($next_chunk++) if $next_chunk <= $lastchunk;
+  }
my @aligned_files;
if ($ALIGNER eq "giza") {
@@ -1051,7 +1105,7 @@
$THRAXDIR =~ s#/#_#g;
$cachepipe->cmd("thrax-prep",
- "$HADOOP/bin/hadoop fs -rmr $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
+ "$HADOOP/bin/hadoop fs -rm -r $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
"$DATA_DIRS{train}/thrax-input-file",
"grammar.gz");
@@ -1135,7 +1189,7 @@
}
# Build the language model if needed
-if ($DO_BUILD_LM_FROM_CORPUS) {
+if (defined $TRAIN{target} and $DO_BUILD_LM_FROM_CORPUS) {
# make sure the training data is prepped
if (! $PREPPED{TRAIN} and $DO_PREPARE_CORPORA) {
@@ -1150,11 +1204,6 @@
$PREPPED{TRAIN} = 1;
}
- if (! -e $TRAIN{target}) {
- print "* FATAL: I need a training corpus to build the language model from (--corpus)\n";
- exit(1);
- }
-
my $lmfile = "lm.gz";
# sort and uniq the training data
@@ -1232,8 +1281,8 @@
# Needs to be capitalized
my $mem = uc $BUILDLM_MEM;
my $class_lmfile = "class_lm.gz";
- $cachepipe->cmd("kenlm",
- "$JOSHUA/bin/lmplz -o $LM_ORDER -T $TMPDIR -S $mem --discount_fallback=0.5 1 1.5 --verbose_header --text $CLASS_LM_CORPUS $LM_OPTIONS | gzip -9n > lm.gz",
+ $cachepipe->cmd("classlm",
+ "$JOSHUA/bin/lmplz -o 9 -T $TMPDIR -S $mem --discount_fallback=0.5 1 1.5 --verbose_header --text $CLASS_LM_CORPUS $LM_OPTIONS | gzip -9n > $class_lmfile",
"$CLASS_LM_CORPUS",
$class_lmfile);
}
@@ -1294,7 +1343,7 @@
# main default grammar. Then update it if filtering was requested and
# is possible.
my $TUNE_GRAMMAR = $_TUNE_GRAMMAR_FILE || $GRAMMAR_FILE;
-if ($DO_FILTER_TM and ! $DOING_LATTICES and ! defined $_TUNE_GRAMMAR_FILE) {
+if ($DO_FILTER_TM and defined $TUNE_GRAMMAR and ! $DOING_LATTICES and ! defined $_TUNE_GRAMMAR_FILE) {
$TUNE_GRAMMAR = "$DATA_DIRS{tune}/grammar.filtered.gz";
$cachepipe->cmd("filter-tune",
@@ -1305,22 +1354,26 @@
}
# Create the glue grammars. This is done by looking at all the symbols in the grammar file and
-# creating all the needed rules.
-if (! defined $GLUE_GRAMMAR_FILE) {
- $cachepipe->cmd("glue-tune",
- "java -Xmx2g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
- get_file_from_grammar($TUNE_GRAMMAR),
- "$DATA_DIRS{tune}/grammar.glue");
- $GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
-} else {
- # just create a symlink to it
- my $filename = $DATA_DIRS{tune} . "/" . basename($GLUE_GRAMMAR_FILE);
- system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+# creating all the needed rules. This is only done if $TUNE_GRAMMAR is defined (it may be
+# undefined when we skip straight to the tuning step with only a Joshua config).
+if (defined $TUNE_GRAMMAR and $GRAMMAR_TYPE ne "phrase") {
+ if (! defined $GLUE_GRAMMAR_FILE) {
+ $cachepipe->cmd("glue-tune",
+ "java -Xmx2g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
+ get_file_from_grammar($TUNE_GRAMMAR),
+ "$DATA_DIRS{tune}/grammar.glue");
+ $GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
+ } else {
+ # just create a symlink to it
+ my $filename = $DATA_DIRS{tune} . "/" . basename($GLUE_GRAMMAR_FILE);
+ system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+ }
}
# Add in feature functions
my $weightstr = "";
my @feature_functions;
+my $lm_index = 0;
for my $i (0..$#LMFILES) {
if ($LM_STATE_MINIMIZATION) {
push(@feature_functions, "StateMinimizingLanguageModel -lm_order $LM_ORDER -lm_file $LMFILES[$i]");
@@ -1329,6 +1382,12 @@
}
$weightstr .= "lm_$i 1 ";
+ $lm_index += 1;
+}
+
+if ($DO_BUILD_CLASS_LM) {
+ push(@feature_functions, "LanguageModel -lm_type kenlm -lm_order 9 -lm_file $RUNDIR/class_lm.gz -class_map $CLASS_MAP");
+ $weightstr .= "lm_$lm_index 1 ";
}
if ($DOING_LATTICES) {
@@ -1345,14 +1404,18 @@
# Build out the weight string
my $TM_OWNER = "pt";
my $GLUE_OWNER = "glue";
-{
+if (defined $TUNE_GRAMMAR) {
my @tm_features = get_features($TUNE_GRAMMAR);
foreach my $feature (@tm_features) {
# Only assign initial weights to dense features
$weightstr .= "tm_${TM_OWNER}_$feature 1 " if ($feature =~ /^\d+$/);
}
- # Glue grammar
- $weightstr .= "tm_${GLUE_OWNER}_0 1 ";
+
+ # Glue grammars are only needed for hierarchical models
+ if ($GRAMMAR_TYPE ne "phrase") {
+ # Glue grammar
+ $weightstr .= "tm_${GLUE_OWNER}_0 1 ";
+ }
}
my $tm_type = $GRAMMAR_TYPE;
@@ -1362,8 +1425,9 @@
sub get_file_from_grammar {
# Cachepipe doesn't work on directories, so we need to make sure we
- # have a representative file to use to cache grammars.
+ # have a representative file to use to cache grammars. Returns undef if file not found
my ($grammar_file) = @_;
+ return undef unless defined $grammar_file and -e $grammar_file;
my $file = (-d $grammar_file) ? "$grammar_file/slice_00000.source" : $grammar_file;
return $file;
}
@@ -1373,15 +1437,33 @@
# Build the filtered tuning model
my $tunemodeldir = "$tunedir/model";
-my $tm_switch = ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
-$cachepipe->cmd("tune-bundle",
- "$BUNDLER --force --symlink --absolute --verbose $JOSHUA_CONFIG $tunemodeldir --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false -tm0/type $tm_type -tm0/owner ${TM_OWNER} -tm0/maxspan $MAXSPAN -tm1/owner ${GLUE_OWNER} -search $SEARCH_ALGORITHM -weights \"$weightstr\" $feature_functions' ${tm_switch} $TUNE_GRAMMAR --tm $GLUE_GRAMMAR_FILE",
- $JOSHUA_CONFIG,
- get_file_from_grammar($TUNE_GRAMMAR), # in case it's packed
- "$tunemodeldir/joshua.config");
-{
- # Now update the tuning grammar
+# We build up this string with TMs to substitute in, if any are provided
+my $tm_switch = "";
+my $tm_copy_config_args = "";
+if (defined $TUNE_GRAMMAR) {
+ $tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
+ $tm_switch .= " $TUNE_GRAMMAR";
+ $tm_copy_config_args = " -tm0/type $tm_type -tm0/owner ${TM_OWNER} -tm0/maxspan $MAXSPAN";
+}
+# If we specified a new glue grammar, put that in
+if (defined $GLUE_GRAMMAR_FILE) {
+ $tm_switch .= " --tm $GLUE_GRAMMAR_FILE";
+ $tm_copy_config_args .= " -tm1/owner ${GLUE_OWNER}";
+} else {
+ # if there is no glue grammar, remove it from the config template
+ $tm_copy_config_args .= " -tm1 DELETE";
+}
+
+# Now build the bundle. As in the "test-bundle" call, the config the bundler writes is listed last.
+$cachepipe->cmd("tune-bundle",
+  "$BUNDLER --force --symlink --absolute --verbose $JOSHUA_CONFIG $tunemodeldir --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false -search $SEARCH_ALGORITHM -weights \"$weightstr\" $feature_functions ${tm_copy_config_args}' ${tm_switch}",
+  $JOSHUA_CONFIG,
+  get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
+  "$tunemodeldir/joshua.config");
+
+# Update the tune grammar to its new location in the bundle
+if (defined $TUNE_GRAMMAR) {
+ # Now update the tuning grammar to its new path
my $basename = basename($TUNE_GRAMMAR);
if (-e "tune/model/$basename") {
$TUNE_GRAMMAR = "tune/model/$basename";
@@ -1393,13 +1475,13 @@
}
}
+# Update the config file location
+$JOSHUA_CONFIG = "$tunedir/model/joshua.config";
+
# Write the decoder run command. The decoder will use the config file in the bundled
# directory, continually updating it.
$JOSHUA_ARGS .= " -output-format \"%i ||| %s ||| %f ||| %c\"";
-# Update the config file location
-$JOSHUA_CONFIG = "$tunedir/model/joshua.config";
-
open DEC_CMD, ">$tunedir/decoder_command";
print DEC_CMD "cat $TUNE{source} | $tunedir/model/run-joshua.sh -m $JOSHUA_MEM -config $JOSHUA_CONFIG -threads $NUM_THREADS $JOSHUA_ARGS > $tunedir/output.nbest 2> $tunedir/joshua.log\n";
close(DEC_CMD);
@@ -1411,7 +1493,7 @@
"$SCRIPTDIR/training/run_tuner.py $TUNE{source} $TUNE{target} --tunedir $tunedir --tuner $TUNER --decoder-config $JOSHUA_CONFIG --iterations $TUNER_ITERATIONS",
$TUNE{source},
$JOSHUA_CONFIG,
- get_file_from_grammar($TUNE_GRAMMAR),
+ get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
"$tunedir/joshua.config.final");
} elsif ($TUNER eq "kbmira") { # Moses' batch MIRA
@@ -1450,10 +1532,13 @@
$PREPPED{TEST} = 1;
}
-# filter the test grammar
system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test};
+
+# Define the test grammar, if it was provided
my $TEST_GRAMMAR = $_TEST_GRAMMAR_FILE || $GRAMMAR_FILE;
-if ($DO_FILTER_TM and ! $DOING_LATTICES and ! defined $_TEST_GRAMMAR_FILE) {
+
+# Now filter, if it's defined and should be done
+if ($DO_FILTER_TM and defined $TEST_GRAMMAR and ! $DOING_LATTICES and ! defined $_TEST_GRAMMAR_FILE) {
$TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";
$cachepipe->cmd("filter-test",
@@ -1465,36 +1550,46 @@
my $testdir = "$RUNDIR/test";
-# Create the glue file.
-if (! defined $GLUE_GRAMMAR_FILE) {
- $cachepipe->cmd("glue-test",
- "java -Xmx1g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
- $TEST_GRAMMAR,
- "$DATA_DIRS{test}/grammar.glue");
- $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
-
-} else {
- # just create a symlink to it
- my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE);
- if ($GLUE_GRAMMAR_FILE =~ /^\//) {
- system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+# Create and update the glue file, if the test grammar was provided (if not, we assume these
+# are in the $JOSHUA_CONFIG)
+if (defined $TEST_GRAMMAR and $GRAMMAR_TYPE ne "phrase") {
+ if (! defined $GLUE_GRAMMAR_FILE) {
+ $cachepipe->cmd("glue-test",
+ "java -Xmx1g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
+ $TEST_GRAMMAR,
+ "$DATA_DIRS{test}/grammar.glue");
+ $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
+
} else {
- system("ln -sf $STARTDIR/$GLUE_GRAMMAR_FILE $filename");
+ # just create a symlink to it
+ my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE);
+ if ($GLUE_GRAMMAR_FILE =~ /^\//) {
+ system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+ } else {
+ system("ln -sf $STARTDIR/$GLUE_GRAMMAR_FILE $filename");
+ }
}
}
+$tm_switch = "";
+$tm_copy_config_args = "";
+if (defined $TEST_GRAMMAR) {
+ $tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
+ $tm_switch .= " $TEST_GRAMMAR";
+}
+if (defined $GLUE_GRAMMAR_FILE) {
+ $tm_switch .= " --tm $GLUE_GRAMMAR_FILE";
+}
+
# Build the filtered testing model
$cachepipe->cmd("test-bundle",
- "$BUNDLER --force --symlink --verbose $JOSHUA_CONFIG test/model --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false' ${tm_switch} $TEST_GRAMMAR --tm $GLUE_GRAMMAR_FILE",
+ "$BUNDLER --force --symlink --verbose $JOSHUA_CONFIG test/model --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false' ${tm_switch}",
$JOSHUA_CONFIG,
- get_file_from_grammar($TEST_GRAMMAR),
- "$testdir/joshua.config");
+ get_file_from_grammar($TEST_GRAMMAR) || $JOSHUA_CONFIG,
+ "$testdir/model/joshua.config");
-{
- # Update some variables. $TEST_GRAMMAR_FILE, which previously held
- # an optional command-line argument of a pre-filtered tuning
- # grammar, is now used to record the text-based grammar, which is
- # needed later for different things.
+if (defined $TEST_GRAMMAR) {
+ # Update the test grammar (if defined) to its new path
my $basename = basename($TEST_GRAMMAR);
if (-e "$testdir/model/$basename") {
$TEST_GRAMMAR = "$testdir/model/$basename";
@@ -1535,10 +1630,10 @@
# need the n-best output) or $bestoutput (which only outputs the hypothesis but is tons faster)
$cachepipe->cmd("test-decode",
"$testrun/decoder_command",
- "$testrun/decoder_command",
$TEST{source},
+ "$testrun/decoder_command",
"$testrun/model/joshua.config",
- get_file_from_grammar($TEST_GRAMMAR),
+ get_file_from_grammar($TEST_GRAMMAR) || "$testrun/model/joshua.config",
$output);
# $cachepipe->cmd("remove-oov",
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 81572cc..cb01367 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -121,9 +121,9 @@
this.type = parsedArgs.get("lm_type");
this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order"));
this.path = parsedArgs.get("lm_file");
- this.isClassLM = parsedArgs.containsKey("lm_class");
- if (isClassLM && parsedArgs.containsKey("class_map"))
+ if (parsedArgs.containsKey("class_map"))
try {
+ this.isClassLM = true;
this.classMap = new ClassMap(parsedArgs.get("class_map"));
} catch (IOException e) {
// TODO Auto-generated catch block
@@ -252,8 +252,7 @@
/**
- * Gets the target side classes for the class LM
- *
+ * Replace each word in a rule with the target side classes.
*/
protected int[] getClasses(Rule rule) {
if (this.classMap == null) {
diff --git a/src/joshua/decoder/ff/lm/kenlm/Makefile b/src/joshua/decoder/ff/lm/kenlm/Makefile
index 483a941..ecc38c3 100644
--- a/src/joshua/decoder/ff/lm/kenlm/Makefile
+++ b/src/joshua/decoder/ff/lm/kenlm/Makefile
@@ -16,7 +16,7 @@
HEADERS= lm/bhiksha.hh lm/binary_format.hh lm/blank.hh lm/builder/adjust_counts.hh lm/builder/corpus_count.hh lm/builder/discount.hh lm/builder/header_info.hh lm/builder/initial_probabilities.hh lm/builder/interpolate.hh lm/builder/joint_order.hh lm/builder/multi_stream.hh lm/builder/ngram.hh lm/builder/ngram_stream.hh lm/builder/pipeline.hh lm/builder/print.hh lm/builder/sort.hh lm/config.hh lm/enumerate_vocab.hh lm/facade.hh lm/left.hh lm/lm_exception.hh lm/max_order.hh lm/model.hh lm/model_type.hh lm/ngram_query.hh lm/partial.hh lm/quantize.hh lm/read_arpa.hh lm/return.hh lm/search_hashed.hh lm/search_trie.hh lm/sizes.hh lm/state.hh lm/trie.hh lm/trie_sort.hh lm/value_build.hh lm/value.hh lm/virtual_interface.hh lm/vocab.hh lm/weights.hh lm/word_index.hh util/bit_packing.hh util/ersatz_progress.hh util/exception.hh util/fake_ofstream.hh util/file.hh util/file_piece.hh util/getopt.hh util/have.hh util/joint_sort.hh util/mmap.hh util/multi_intersection.hh util/murmur_hash.hh util/pcqueue.hh util/pool.hh util/probing_hash_table.hh util/proxy_iterator.hh util/read_compressed.hh util/scoped.hh util/sized_iterator.hh util/sorted_uniform.hh util/stream/block.hh util/stream/chain.hh util/stream/config.hh util/stream/io.hh util/stream/line_input.hh util/stream/multi_progress.hh util/stream/sort.hh util/stream/stream.hh util/stream/timer.hh util/string_piece_hash.hh util/string_piece.hh util/thread_pool.hh util/tokenize_piece.hh util/usage.hh util/parallel_read.hh
-CPPFLAGS = $(CXXFLAGS) -I. -O3 -DKENLM_MAX_ORDER=$(MAX_ORDER) -DHAVE_ZLIB -DNDEBUG
+CPPFLAGS = $(CXXFLAGS) -I. -I$(BOOST_ROOT)/include -O3 -DKENLM_MAX_ORDER=$(MAX_ORDER) -DHAVE_ZLIB -DNDEBUG
.cc.o: $(HEADERS)
$(CC) -c $(CPPFLAGS) -fPIC -o $@ $<
@@ -36,6 +36,9 @@
#lmplz
SHELL=bash
+ifeq (,$(wildcard $(BOOST_ROOT)/lib/libboost_thread.a))
+ $(error BOOST_ROOT ($(BOOST_ROOT)) does not point to a Boost installation, quitting...)
+endif
ifeq ($(shell $(CC) -L$(BOOST_ROOT)/lib -lboost_program_options$(BOOST_MT) -lboost_thread$(BOOST_MT) -x c++ - <<<'int main() {}' -o dummy && rm dummy && echo Boost),Boost)
$(info Detected Boost)
LMPLZ=lm/builder/adjust_counts.o lm/builder/corpus_count.o lm/builder/initial_probabilities.o lm/builder/interpolate.o lm/builder/lmplz_main.o lm/builder/pipeline.o lm/builder/print.o lm/builder/output.o util/stream/chain.o util/stream/io.o util/stream/line_input.o util/stream/multi_progress.o
diff --git a/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java b/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java
index bc3d8f4..a110aa9 100644
--- a/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java
+++ b/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java
@@ -16,7 +16,17 @@
public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
static {
- System.loadLibrary("ken");
+ try {
+ System.loadLibrary("ken");
+ } catch (UnsatisfiedLinkError e) {
+ System.err.println("* FATAL: Can't find libken.so (libken.dylib on OS X) in $JOSHUA/lib");
+ System.err.println("* This probably means that the KenLM library didn't compile.");
+ System.err.println("* Make sure that BOOST_ROOT is set to the root of your boost");
+ System.err.println("* installation (it's not /opt/local/, the default), change to");
+ System.err.println("* $JOSHUA, and type 'ant kenlm'. If problems persist, see the");
+ System.err.println("* website (joshua-decoder.org).");
+ System.exit(1);
+ }
}
private final long pointer;