Merge remote-tracking branch 'origin/pro-mira-fix'

commit: 89d5d0d52ac6382a86f56417d44c24c89ec8980f [log] [tgz]
author: Matt Post <post@cs.jhu.edu> Mon Jun 15 16:58:22 2015 -0400
committer: Matt Post <post@cs.jhu.edu> Mon Jun 15 16:58:22 2015 -0400
tree: 2fd57c1af60a668e1e8cc8307147128cc5e0a1b3
parent: dd0fb74d1317f8759152ce23c3b00a977906c22d [diff]
parent: fe83fbc324a8dfe1049adc96eac26349ddafbdf0 [diff]
diff --git a/scripts/copy-config.pl b/scripts/copy-config.pl
index e5e9d97..867b470 100755
--- a/scripts/copy-config.pl
+++ b/scripts/copy-config.pl

@@ -57,7 +57,7 @@
 # Step 2.  Now read through the config file.
 
 my @weights_order;
-my $tm_index = 0;
+my $tm_index = -1;
 while (my $line = <>) {
   if ($line =~ /^\s*$/ or $line =~ /^#/) {
     # Comments, empty lines
@@ -78,9 +78,17 @@
     # TMs get special treatment. We parse the line (supporting old format and new keyword format),
     # and then compare to command-line args to see what gets updated
     if ($norm_key =~ /^tm/) {
+      $tm_index++;
+
       # get the hash of tm values from the config file
       my $tm_hash = parse_tm_line($value);
 
+      # Delete TM lines if they've been requested to be deleted
+      if (exists $params{"tm${tm_index}"} and $params{"tm${tm_index}"} eq "DELETE") {
+        delete $params{"tm${tm_index}"};
+        next;
+      }
+
       # check if each one was passed as a command-line argument, and if so, retrieve its new value
       foreach my $tmkey (keys %$tm_hash) {
         my $concat = "tm${tm_index}/${tmkey}";
@@ -95,7 +103,6 @@
         next if $tmkey eq "type";
         $params{$norm_key} .= " -$tmkey $tm_hash->{$tmkey}";
       }
-      $tm_index++;
     }
 
     # if the parameter was found on the command line, print out its replaced value

diff --git a/scripts/support/run_bundler.py b/scripts/support/run_bundler.py
index aea641d..9675f27 100755
--- a/scripts/support/run_bundler.py
+++ b/scripts/support/run_bundler.py

@@ -22,7 +22,7 @@
 $JOSHUA/scripts/support/run_bundler.py \
   --force \
   --verbose \
-  /path/to/origin/directory/test/1/joshua.config \
+  /path/to/origin/directory/test/model/joshua.config \
   --root /path/to/origin/directory \
   new-bundle-directory \
   --copy-config-options \
@@ -494,7 +494,7 @@
     parser.add_argument(
         'config', type=argparse.FileType('r'),
         help='path to the origin configuration file. e.g. '
-             '/path/to/test/1/joshua.config.final'
+             '/path/to/tune/dir/joshua.config.final'
     )
     parser.add_argument(
         'dest_dir',

diff --git a/scripts/training/class-lm/replaceTokensWithClasses.py b/scripts/training/class-lm/replaceTokensWithClasses.py
new file mode 100644
index 0000000..c7691c2
--- /dev/null
+++ b/scripts/training/class-lm/replaceTokensWithClasses.py

@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+"""
+Converts the words in a tokenized corpus into classes using the provided map.
+
+Usage:
+
+  $0 MAP INPUT_FILE OUTPUT_FILE
+
+where the format of the map is 
+
+  WORD CLASS
+"""
+ 
+import sys
+ 
+classMap = {}
+ 
+classFile = open(sys.argv[1])
+input = open(sys.argv[2])
+output = open(sys.argv[3], 'w+')
+ 
+# First read classMap
+for line in classFile:
+  line = line.strip()
+  lineComp = line.split()
+  classMap[lineComp[0]] = lineComp[1]
+ 
+# Now read corpus
+for line in input:
+  line = line.strip().lower()
+  lineComp = line.split()
+  translation = []
+  for word in lineComp:
+    if word in classMap:
+      translation.append(classMap[word])
+    else:
+      translation.append("-1")
+  output.write(" ".join(translation) + "\n")
+ 
+classFile.close()
+input.close()
+output.close()

diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 7bff0ac..ae42532 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl

@@ -77,7 +77,7 @@
 my $BUNDLER = "$JOSHUA/scripts/support/run_bundler.py";
 my $STARTDIR;
 my $RUNDIR = $STARTDIR = getcwd();
-my $GRAMMAR_TYPE = "hiero";  # or "itg" or "samt" or "ghkm" or "phrase"
+my $GRAMMAR_TYPE = "hiero";  # or "itg" or "samt" or "ghkm" or "phrase" or "phrasal"
 my $SEARCH_ALGORITHM = "cky"; # or "stack" (for phrase-based)
 
 # Which GHKM extractor to use ("galley" or "moses")
@@ -280,7 +280,7 @@
   "ner-tagger=s"   => \$NER_TAGGER,
   "class-lm!"     => \$DO_BUILD_CLASS_LM,
   "class-lm-corpus=s"   => \$CLASS_LM_CORPUS,
-  "class-map"     => \$CLASS_MAP,
+  "class-map=s"     => \$CLASS_MAP,
 );
 
 if (! $retval) {
@@ -288,9 +288,6 @@
   exit 1;
 }
 
-# Joshua config
-my $JOSHUA_CONFIG = $_JOSHUA_CONFIG || "$TUNECONFDIR/joshua.config";
-
 $RUNDIR = get_absolute_path($RUNDIR);
 
 $TUNER = lc $TUNER;
@@ -405,18 +402,25 @@
   exit 1;
 }
 
-# make sure a grammar file was given if we're skipping training
-if (! defined $GRAMMAR_FILE) {
-  if ($STEPS{$FIRST_STEP} >= $STEPS{TEST}) {
-    if (! defined $_TEST_GRAMMAR_FILE) {
-      print "* FATAL: need a grammar (--grammar or --test-grammar) if you're skipping to testing\n";
-			exit 1;
-		}
-  } elsif ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) {
-		if (! defined $_TUNE_GRAMMAR_FILE) {
-			print "* FATAL: need a grammar (--grammar or --tune-grammar) if you're skipping grammar learning\n";
-			exit 1;
-		}
+# Joshua config. This falls back to the default template, so $JOSHUA_CONFIG is set even
+# when the user supplied none; test $_JOSHUA_CONFIG to see whether one was really given.
+my $JOSHUA_CONFIG = get_absolute_path($_JOSHUA_CONFIG || "$TUNECONFDIR/joshua.config", $STARTDIR);
+
+# make sure we have a tuned config file if we're skipping model building and tuning
+if ($STEPS{$FIRST_STEP} >= $STEPS{TEST} and ! defined $_JOSHUA_CONFIG) {
+  print "* FATAL: You need to provide a tuned Joshua config file (--joshua-config)\n";
+  print "         if you're skipping straight to testing\n";
+  exit 1;
+}
+
+# make sure we have either a config file or a grammar and LM if we're skipping model building
+# NOTE(review): $JOSHUA_CONFIG looks always defined here, so this check may never fire -- confirm
+if ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) {
+  if (! defined $JOSHUA_CONFIG and ((! defined $_TUNE_GRAMMAR_FILE and ! defined $GRAMMAR_FILE) or scalar(@LMFILES) == 0)) {
+    print "* FATAL: You must provide either a Joshua config file (--joshua-config) or\n";
+    print "         a grammar (--grammar or --tune-grammar) and at least one LM (--lmfile)\n";
+    print "         if you're skipping straight to tuning\n";
+    exit 1;
   }
 }
 
@@ -426,6 +430,10 @@
 }
 
 # check for file presence
+if (defined $JOSHUA_CONFIG and ! -e $JOSHUA_CONFIG) {
+  print "* FATAL: couldn't find joshua config file '$JOSHUA_CONFIG'\n";
+  exit 1;
+}
 if (defined $GRAMMAR_FILE and ! -e $GRAMMAR_FILE) {
   print "* FATAL: couldn't find grammar file '$GRAMMAR_FILE'\n";
   exit 1;
@@ -490,11 +498,14 @@
   exit 1;
 }
 
-if ($TUNER eq "kbmira") {
-  if (! defined $MOSES) {
-    print "* FATAL: using 'kbmira' for tuning requires setting the MOSES environment variable\n";
-    exit 1;
-  }
+if ($TUNER eq "kbmira" and ! defined $MOSES) {
+  print "* FATAL: using 'kbmira' for tuning requires setting the MOSES environment variable\n";
+  exit 1;
+}
+
+if ($GRAMMAR_TYPE eq "phrase" and ! defined $MOSES) {
+  print "* FATAL: building phrase-based models (--type phrase) requires setting the MOSES environment variable\n";
+  exit 1;
 }
 
 if ($TUNER ne "mert" and $TUNER ne "zmert" and $TUNER ne "mira" and $TUNER ne "local-mira" and $TUNER ne "pro" and $TUNER ne "kbmira") {
@@ -771,8 +782,51 @@
 
   system("mkdir alignments") unless -d "alignments";
 
-  # Run the parallel aligner
-  system("seq 0 $lastchunk | $SCRIPTDIR/training/paralign.pl -aligner $ALIGNER -num_threads $NUM_THREADS -giza_merge $GIZA_MERGE -aligner_mem $ALIGNER_MEM -source $SOURCE -target $TARGET -giza_trainer \"$GIZA_TRAINER\" -train_dir \"$DATA_DIRS{train}\" > alignments/run.log");
+  # Command that aligns one chunk (paralign.pl reads the chunk number from STDIN). The
+  # log is appended to (>>) because these jobs run concurrently: with ">" every child
+  # would truncate the log the others are still writing. Start from a fresh log.
+  unlink "alignments/run.log";
+  my $aligner_cmd = (
+    "$SCRIPTDIR/training/paralign.pl "
+    . " -aligner $ALIGNER"
+    . " -num_threads 1"
+    . " -giza_merge $GIZA_MERGE"
+    . " -aligner_mem $ALIGNER_MEM"
+    . " -source $SOURCE"
+    . " -target $TARGET"
+    . " -giza_trainer \"$GIZA_TRAINER\""
+    . " -train_dir \"$DATA_DIRS{train}\" "
+    . ">> alignments/run.log"
+  );
+
+  # Forks a child that aligns chunk number $chunk; returns the child's PID
+  my $start_chunk = sub {
+    my ($chunk) = @_;
+    my $pid = fork();
+    # fork() returns undef on failure; without this check the parent would take the
+    # child branch and exec() itself away
+    die "* FATAL: couldn't fork an aligner for chunk $chunk: $!\n" unless defined $pid;
+    if ($pid == 0) { # I am child
+      exec("echo $chunk | $aligner_cmd");
+      # Only reached if exec() itself failed, so do not report success
+      exit 1;
+    }
+    return $pid;
+  };
+
+  # Keep up to $NUM_THREADS jobs running, starting a new one as each oldest job finishes
+  my @children = ();
+  my $next_chunk = 0;
+  while ($next_chunk <= $lastchunk or @children) {
+    if ($next_chunk <= $lastchunk and @children < $NUM_THREADS) {
+      push @children, $start_chunk->($next_chunk++);
+    } else {
+      last unless @children; # $NUM_THREADS < 1: nothing can be started
+      waitpid( shift(@children), 0 );
+      warn "* WARNING: an aligner job ended with wait status $?\n" if $?;
+      print "child finished\n";
+    }
+  }
 
   my @aligned_files;
   if ($ALIGNER eq "giza") {
@@ -1051,7 +1105,7 @@
       $THRAXDIR =~ s#/#_#g;
 
       $cachepipe->cmd("thrax-prep",
-                      "$HADOOP/bin/hadoop fs -rmr $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
+                      "$HADOOP/bin/hadoop fs -rm -r $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
                       "$DATA_DIRS{train}/thrax-input-file", 
                       "grammar.gz");
 
@@ -1135,7 +1189,7 @@
 }
 
 # Build the language model if needed
-if ($DO_BUILD_LM_FROM_CORPUS) {
+if (defined $TRAIN{target} and $DO_BUILD_LM_FROM_CORPUS) {
 
   # make sure the training data is prepped
   if (! $PREPPED{TRAIN} and $DO_PREPARE_CORPORA) {
@@ -1150,11 +1204,6 @@
 		$PREPPED{TRAIN} = 1;
   }
 
-  if (! -e $TRAIN{target}) {
-		print "* FATAL: I need a training corpus to build the language model from (--corpus)\n";
-		exit(1);
-  }
-
   my $lmfile = "lm.gz";
 
   # sort and uniq the training data
@@ -1232,8 +1281,8 @@
   # Needs to be capitalized
   my $mem = uc $BUILDLM_MEM;
   my $class_lmfile = "class_lm.gz";
-  $cachepipe->cmd("kenlm",
-                  "$JOSHUA/bin/lmplz -o $LM_ORDER -T $TMPDIR -S $mem --discount_fallback=0.5 1 1.5 --verbose_header --text $CLASS_LM_CORPUS $LM_OPTIONS | gzip -9n > lm.gz",
+  $cachepipe->cmd("classlm",
+                  "$JOSHUA/bin/lmplz -o 9 -T $TMPDIR -S $mem --discount_fallback=0.5 1 1.5 --verbose_header --text $CLASS_LM_CORPUS $LM_OPTIONS | gzip -9n > $class_lmfile",
                   "$CLASS_LM_CORPUS",
                   $class_lmfile);
 }
@@ -1294,7 +1343,7 @@
 # main default grammar. Then update it if filtering was requested and
 # is possible.
 my $TUNE_GRAMMAR = $_TUNE_GRAMMAR_FILE || $GRAMMAR_FILE;
-if ($DO_FILTER_TM and ! $DOING_LATTICES and ! defined $_TUNE_GRAMMAR_FILE) {
+if ($DO_FILTER_TM and defined $TUNE_GRAMMAR and ! $DOING_LATTICES and ! defined $_TUNE_GRAMMAR_FILE) {
   $TUNE_GRAMMAR = "$DATA_DIRS{tune}/grammar.filtered.gz";
 
   $cachepipe->cmd("filter-tune",
@@ -1305,22 +1354,26 @@
 }
 
 # Create the glue grammars. This is done by looking at all the symbols in the grammar file and
-# creating all the needed rules.
-if (! defined $GLUE_GRAMMAR_FILE) {
-  $cachepipe->cmd("glue-tune",
-                  "java -Xmx2g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
-                  get_file_from_grammar($TUNE_GRAMMAR),
-                  "$DATA_DIRS{tune}/grammar.glue");
-  $GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
-} else {
-  # just create a symlink to it
-  my $filename = $DATA_DIRS{tune} . "/" . basename($GLUE_GRAMMAR_FILE);
-  system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+# creating all the needed rules. This is only done if there is a $TUNE_GRAMMAR defined (which
+# can be skipped if we skip straight to the tuning step).
+if (defined $TUNE_GRAMMAR and $GRAMMAR_TYPE ne "phrase") {
+  if (! defined $GLUE_GRAMMAR_FILE) {
+    $cachepipe->cmd("glue-tune",
+                    "java -Xmx2g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
+                    get_file_from_grammar($TUNE_GRAMMAR),
+                    "$DATA_DIRS{tune}/grammar.glue");
+    $GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
+  } else {
+    # just create a symlink to it
+    my $filename = $DATA_DIRS{tune} . "/" . basename($GLUE_GRAMMAR_FILE);
+    system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+  }
 }
 
 # Add in feature functions
 my $weightstr = "";
 my @feature_functions;
+my $lm_index = 0;
 for my $i (0..$#LMFILES) {
   if ($LM_STATE_MINIMIZATION) {
     push(@feature_functions, "StateMinimizingLanguageModel -lm_order $LM_ORDER -lm_file $LMFILES[$i]");
@@ -1329,6 +1382,12 @@
   }
 
   $weightstr .= "lm_$i 1 ";
+  $lm_index += 1;
+}
+
+if ($DO_BUILD_CLASS_LM) {
+  push(@feature_functions, "LanguageModel -lm_type kenlm -lm_order 9 -lm_file $RUNDIR/class_lm.gz -class_map $CLASS_MAP");
+  $weightstr .= "lm_$lm_index 1 ";
 }
 
 if ($DOING_LATTICES) {
@@ -1345,14 +1404,18 @@
 # Build out the weight string
 my $TM_OWNER = "pt";
 my $GLUE_OWNER = "glue";
-{
+if (defined $TUNE_GRAMMAR) {
   my @tm_features = get_features($TUNE_GRAMMAR);
   foreach my $feature (@tm_features) {
     # Only assign initial weights to dense features
     $weightstr .= "tm_${TM_OWNER}_$feature 1 " if ($feature =~ /^\d+$/);
   }
-  # Glue grammar
-  $weightstr .= "tm_${GLUE_OWNER}_0 1 ";
+
+  # Glue grammars are only needed for hierarchical models
+  if ($GRAMMAR_TYPE ne "phrase") {
+    # Glue grammar
+    $weightstr .= "tm_${GLUE_OWNER}_0 1 ";
+  }
 }
 
 my $tm_type = $GRAMMAR_TYPE;
@@ -1362,8 +1425,9 @@
 
 sub get_file_from_grammar {
   # Cachepipe doesn't work on directories, so we need to make sure we
-  # have a representative file to use to cache grammars.
+  # have a representative file to use to cache grammars. Returns undef if file not found
   my ($grammar_file) = @_;
+  return undef unless defined $grammar_file and -e $grammar_file;
   my $file = (-d $grammar_file) ? "$grammar_file/slice_00000.source" : $grammar_file;
   return $file;
 }
@@ -1373,15 +1437,33 @@
 
 # Build the filtered tuning model
 my $tunemodeldir = "$tunedir/model";
-my $tm_switch = ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
-$cachepipe->cmd("tune-bundle",
-                "$BUNDLER --force --symlink --absolute --verbose $JOSHUA_CONFIG $tunemodeldir --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false -tm0/type $tm_type -tm0/owner ${TM_OWNER} -tm0/maxspan $MAXSPAN -tm1/owner ${GLUE_OWNER} -search $SEARCH_ALGORITHM -weights \"$weightstr\" $feature_functions' ${tm_switch} $TUNE_GRAMMAR --tm $GLUE_GRAMMAR_FILE",
-                $JOSHUA_CONFIG,
-                get_file_from_grammar($TUNE_GRAMMAR),  # in case it's packed
-                "$tunemodeldir/joshua.config");
 
-{
-  # Now update the tuning grammar
+# We build up this string with TMs to substitute in, if any are provided
+my $tm_switch = "";
+my $tm_copy_config_args = "";
+if (defined $TUNE_GRAMMAR) {
+  $tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
+  $tm_switch .= " $TUNE_GRAMMAR";
+  $tm_copy_config_args = " -tm0/type $tm_type -tm0/owner ${TM_OWNER} -tm0/maxspan $MAXSPAN";
+}
+# If we specified a new glue grammar, put that in
+if (defined $GLUE_GRAMMAR_FILE) {
+  $tm_switch .= " --tm $GLUE_GRAMMAR_FILE";
+  $tm_copy_config_args .= " -tm1/owner ${GLUE_OWNER}";
+} else {
+  # if there is no glue grammar, remove it from the config template
+  $tm_copy_config_args .= " -tm1 DELETE";
+}
+
+# Now build the bundle. As with test-bundle, the bundled config is passed as the last tracked file.
+$cachepipe->cmd("tune-bundle",
+                "$BUNDLER --force --symlink --absolute --verbose $JOSHUA_CONFIG $tunemodeldir --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false -search $SEARCH_ALGORITHM -weights \"$weightstr\" $feature_functions ${tm_copy_config_args}' ${tm_switch}",
+                $JOSHUA_CONFIG, get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
+                "$tunemodeldir/joshua.config");
+
+# Update the tune grammar to its new location in the bundle
+if (defined $TUNE_GRAMMAR) {
+  # Now update the tuning grammar to its new path
   my $basename = basename($TUNE_GRAMMAR);
   if (-e "tune/model/$basename") {
     $TUNE_GRAMMAR = "tune/model/$basename";
@@ -1393,13 +1475,13 @@
   }
 }
 
+# Update the config file location
+$JOSHUA_CONFIG = "$tunedir/model/joshua.config";
+
 # Write the decoder run command. The decoder will use the config file in the bundled
 # directory, continually updating it.
 $JOSHUA_ARGS .= " -output-format \"%i ||| %s ||| %f ||| %c\"";
 
-# Update the config file location
-$JOSHUA_CONFIG = "$tunedir/model/joshua.config";
-
 open DEC_CMD, ">$tunedir/decoder_command";
 print DEC_CMD "cat $TUNE{source} | $tunedir/model/run-joshua.sh -m $JOSHUA_MEM -config $JOSHUA_CONFIG -threads $NUM_THREADS $JOSHUA_ARGS > $tunedir/output.nbest 2> $tunedir/joshua.log\n";
 close(DEC_CMD);
@@ -1411,7 +1493,7 @@
                   "$SCRIPTDIR/training/run_tuner.py $TUNE{source} $TUNE{target} --tunedir $tunedir --tuner $TUNER --decoder-config $JOSHUA_CONFIG --iterations $TUNER_ITERATIONS",
                   $TUNE{source},
                   $JOSHUA_CONFIG,
-                  get_file_from_grammar($TUNE_GRAMMAR),
+                  get_file_from_grammar($TUNE_GRAMMAR) || $JOSHUA_CONFIG,
                   "$tunedir/joshua.config.final");
 
 } elsif ($TUNER eq "kbmira") { # Moses' batch MIRA
@@ -1450,10 +1532,13 @@
   $PREPPED{TEST} = 1;
 }
 
-# filter the test grammar
 system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test};
+
+# Define the test grammar, if it was provided
 my $TEST_GRAMMAR = $_TEST_GRAMMAR_FILE || $GRAMMAR_FILE;
-if ($DO_FILTER_TM and ! $DOING_LATTICES and ! defined $_TEST_GRAMMAR_FILE) {
+
+# Now filter, if it's defined and should be done
+if ($DO_FILTER_TM and defined $TEST_GRAMMAR and ! $DOING_LATTICES and ! defined $_TEST_GRAMMAR_FILE) {
   $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";
   
   $cachepipe->cmd("filter-test",
@@ -1465,36 +1550,46 @@
 
 my $testdir = "$RUNDIR/test";
 
-# Create the glue file.
-if (! defined $GLUE_GRAMMAR_FILE) {
-  $cachepipe->cmd("glue-test",
-                  "java -Xmx1g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
-                  $TEST_GRAMMAR,
-                  "$DATA_DIRS{test}/grammar.glue");
-  $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
-  
-} else {
-  # just create a symlink to it
-  my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE);
-  if ($GLUE_GRAMMAR_FILE =~ /^\//) {
-    system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+# Create and update the glue file, if the test grammar was provided (if not, we assume these
+# are in the $JOSHUA_CONFIG)
+if (defined $TEST_GRAMMAR and $GRAMMAR_TYPE ne "phrase") {
+  if (! defined $GLUE_GRAMMAR_FILE) {
+    $cachepipe->cmd("glue-test",
+                    "java -Xmx1g -cp $JOSHUA/lib/*:$THRAX/bin/thrax.jar edu.jhu.thrax.util.CreateGlueGrammar $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
+                    $TEST_GRAMMAR,
+                    "$DATA_DIRS{test}/grammar.glue");
+    $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
+    
   } else {
-    system("ln -sf $STARTDIR/$GLUE_GRAMMAR_FILE $filename");
+    # just create a symlink to it
+    my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE);
+    if ($GLUE_GRAMMAR_FILE =~ /^\//) {
+      system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+    } else {
+      system("ln -sf $STARTDIR/$GLUE_GRAMMAR_FILE $filename");
+    }
   }
 }
 
+$tm_switch = "";
+$tm_copy_config_args = "";
+if (defined $TEST_GRAMMAR) {
+  $tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
+  $tm_switch .= " $TEST_GRAMMAR";
+}
+if (defined $GLUE_GRAMMAR_FILE) {
+  $tm_switch .= " --tm $GLUE_GRAMMAR_FILE";
+}
+
 # Build the filtered testing model
 $cachepipe->cmd("test-bundle",
-                "$BUNDLER --force --symlink --verbose $JOSHUA_CONFIG test/model --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false' ${tm_switch} $TEST_GRAMMAR --tm $GLUE_GRAMMAR_FILE",
+                "$BUNDLER --force --symlink --verbose $JOSHUA_CONFIG test/model --copy-config-options '-top-n $NBEST -output-format \"%i ||| %s ||| %f ||| %c\" -mark-oovs false' ${tm_switch}",
                 $JOSHUA_CONFIG,
-                get_file_from_grammar($TEST_GRAMMAR),
-                "$testdir/joshua.config");
+                get_file_from_grammar($TEST_GRAMMAR) || $JOSHUA_CONFIG,
+                "$testdir/model/joshua.config");
 
-{
-  # Update some variables. $TEST_GRAMMAR_FILE, which previously held
-  # an optional command-line argument of a pre-filtered tuning
-  # grammar, is now used to record the text-based grammar, which is
-  # needed later for different things.
+if (defined $TEST_GRAMMAR) {
+  # Update the test grammar (if defined) to its new path
   my $basename = basename($TEST_GRAMMAR);
   if (-e "$testdir/model/$basename") {
     $TEST_GRAMMAR = "$testdir/model/$basename";
@@ -1535,10 +1630,10 @@
 # need the n-best output) or $bestoutput (which only outputs the hypothesis but is tons faster)
 $cachepipe->cmd("test-decode",
                 "$testrun/decoder_command",
-                "$testrun/decoder_command",
                 $TEST{source},
+                "$testrun/decoder_command",
                 "$testrun/model/joshua.config",
-                get_file_from_grammar($TEST_GRAMMAR),
+                get_file_from_grammar($TEST_GRAMMAR) || "$testrun/model/joshua.config",
                 $output);
 
 # $cachepipe->cmd("remove-oov",

diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 81572cc..cb01367 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java

@@ -121,9 +121,9 @@
     this.type = parsedArgs.get("lm_type");
     this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order")); 
     this.path = parsedArgs.get("lm_file");
-    this.isClassLM = parsedArgs.containsKey("lm_class");
-    if (isClassLM && parsedArgs.containsKey("class_map"))
+    if (parsedArgs.containsKey("class_map"))
       try {
+        this.isClassLM = true;
         this.classMap = new ClassMap(parsedArgs.get("class_map"));
       } catch (IOException e) {
         // TODO Auto-generated catch block
@@ -252,8 +252,7 @@
   
   
   /**
-   * Gets the target side classes for the class LM
-   * 
+   * Replace each word in a rule with the target side classes.
    */
   protected int[] getClasses(Rule rule) {
     if (this.classMap == null) {

diff --git a/src/joshua/decoder/ff/lm/kenlm/Makefile b/src/joshua/decoder/ff/lm/kenlm/Makefile
index 483a941..ecc38c3 100644
--- a/src/joshua/decoder/ff/lm/kenlm/Makefile
+++ b/src/joshua/decoder/ff/lm/kenlm/Makefile

@@ -16,7 +16,7 @@
 
 HEADERS= lm/bhiksha.hh lm/binary_format.hh lm/blank.hh lm/builder/adjust_counts.hh lm/builder/corpus_count.hh lm/builder/discount.hh lm/builder/header_info.hh lm/builder/initial_probabilities.hh lm/builder/interpolate.hh lm/builder/joint_order.hh lm/builder/multi_stream.hh lm/builder/ngram.hh lm/builder/ngram_stream.hh lm/builder/pipeline.hh lm/builder/print.hh lm/builder/sort.hh lm/config.hh lm/enumerate_vocab.hh lm/facade.hh lm/left.hh lm/lm_exception.hh lm/max_order.hh lm/model.hh lm/model_type.hh lm/ngram_query.hh lm/partial.hh lm/quantize.hh lm/read_arpa.hh lm/return.hh lm/search_hashed.hh lm/search_trie.hh lm/sizes.hh lm/state.hh lm/trie.hh lm/trie_sort.hh lm/value_build.hh lm/value.hh lm/virtual_interface.hh lm/vocab.hh lm/weights.hh lm/word_index.hh util/bit_packing.hh util/ersatz_progress.hh util/exception.hh util/fake_ofstream.hh util/file.hh util/file_piece.hh util/getopt.hh util/have.hh util/joint_sort.hh util/mmap.hh util/multi_intersection.hh util/murmur_hash.hh util/pcqueue.hh util/pool.hh util/probing_hash_table.hh util/proxy_iterator.hh util/read_compressed.hh util/scoped.hh util/sized_iterator.hh util/sorted_uniform.hh util/stream/block.hh util/stream/chain.hh util/stream/config.hh util/stream/io.hh util/stream/line_input.hh util/stream/multi_progress.hh util/stream/sort.hh util/stream/stream.hh util/stream/timer.hh util/string_piece_hash.hh util/string_piece.hh util/thread_pool.hh util/tokenize_piece.hh util/usage.hh util/parallel_read.hh
 
-CPPFLAGS = $(CXXFLAGS) -I. -O3 -DKENLM_MAX_ORDER=$(MAX_ORDER) -DHAVE_ZLIB -DNDEBUG
+CPPFLAGS = $(CXXFLAGS) -I. -I$(BOOST_ROOT)/include -O3 -DKENLM_MAX_ORDER=$(MAX_ORDER) -DHAVE_ZLIB -DNDEBUG
 
 .cc.o: $(HEADERS)
 	$(CC) -c $(CPPFLAGS) -fPIC -o $@ $<
@@ -36,6 +36,9 @@
 
 #lmplz
 SHELL=bash
+ifeq (,$(wildcard $(BOOST_ROOT)/lib/libboost_thread.a))
+  $(error BOOST_ROOT ($(BOOST_ROOT)) does not point to a Boost installation, quitting...)
+endif
 ifeq ($(shell $(CC) -L$(BOOST_ROOT)/lib -lboost_program_options$(BOOST_MT) -lboost_thread$(BOOST_MT) -x c++ - <<<'int main() {}' -o dummy && rm dummy && echo Boost),Boost)
   $(info Detected Boost)
 LMPLZ=lm/builder/adjust_counts.o lm/builder/corpus_count.o lm/builder/initial_probabilities.o lm/builder/interpolate.o lm/builder/lmplz_main.o lm/builder/pipeline.o lm/builder/print.o lm/builder/output.o util/stream/chain.o util/stream/io.o util/stream/line_input.o util/stream/multi_progress.o

diff --git a/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java b/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java
index bc3d8f4..a110aa9 100644
--- a/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java
+++ b/src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java

@@ -16,7 +16,17 @@
 public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
 
   static {
-    System.loadLibrary("ken");
+    try {
+      System.loadLibrary("ken");
+    } catch (UnsatisfiedLinkError e) {
+      System.err.println("* FATAL: Can't find libken.so (libken.dylib on OS X) in $JOSHUA/lib");
+      System.err.println("*        This probably means that the KenLM library didn't compile.");
+      System.err.println("*        Make sure that BOOST_ROOT is set to the root of your boost");
+      System.err.println("*        installation (it's not /opt/local/, the default), change to");
+      System.err.println("*        $JOSHUA, and type 'ant kenlm'. If problems persist, see the");
+      System.err.println("*        website (joshua-decoder.org).");
+      System.exit(1);
+    }
   }
 
   private final long pointer;
commit	89d5d0d52ac6382a86f56417d44c24c89ec8980f	[log] [tgz]
author	Matt Post <post@cs.jhu.edu>	Mon Jun 15 16:58:22 2015 -0400
committer	Matt Post <post@cs.jhu.edu>	Mon Jun 15 16:58:22 2015 -0400
tree	2fd57c1af60a668e1e8cc8307147128cc5e0a1b3
parent	dd0fb74d1317f8759152ce23c3b00a977906c22d [diff]
parent	fe83fbc324a8dfe1049adc96eac26349ddafbdf0 [diff]