Merge branch 'master' into release

commit: 481996a5db4974bbe569341a124a6562f820db40 [log] [tgz]
author: Matt Post <post@cs.jhu.edu> Wed Nov 04 14:40:15 2015 -0500
committer: Matt Post <post@cs.jhu.edu> Wed Nov 04 14:40:15 2015 -0500
tree: 627eee78e925ae106ce044c83fed4742eba16f24
parent: a81f00b82ba36ca816722782e16ff2877e7151ff [diff]
parent: 71583d64b064fe141c07b02f011e0907c2ce278d [diff]
diff --git a/CHANGELOG b/CHANGELOG
index c90dc35..95cbaf4 100644
--- a/CHANGELOG
+++ b/CHANGELOG

@@ -1,9 +1,9 @@
 6.0.5 (October 23, 2015)
 ========================
 
-- KenLM updated, includes lastly improved cmake-based build
+- KenLM updated, includes vastly improved cmake-based build
 - Fix for grammar packing that previously limited the size of grammars (esp. Hiero)
-- Support for decoding with multiple packed grammars (if packed together)
+- Support for packing and decoding with multiple packed grammars (if packed together)
 - Feature functions now report dense features, for more efficient handling
 - Added AdaGrad and internal MIRA
 - Pipeline:

diff --git a/README.md b/README.md
index 0a127cf..4f610ce 100644
--- a/README.md
+++ b/README.md

@@ -59,6 +59,6 @@
 
 The basic method for invoking the decoder looks like this:
 
-    cat SOURCE | JOSHUA m MEM -c CONFIG OPTIONS > OUTPUT
+    cat SOURCE | $JOSHUA/bin/joshua -m MEM -c CONFIG OPTIONS > OUTPUT
 
 Some example usage scenarios and scripts can be found in the [examples/](https://github.com/joshua-decoder/joshua/tree/master/examples) directory.

diff --git a/scripts/support/grammar-packer.pl b/scripts/support/grammar-packer.pl
index 533bcae..f5e8481 100755
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl

@@ -68,13 +68,13 @@
   chomp(my $first_line = `$CAT $grammar | head -n1`);
   if ($first_line =~ /^\[/) {
     # regular grammar
-    if (system("$CAT $grammar | sed 's/ ||| /\t/g' | sort -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
+    if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
       print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
       exit 2;
     }
   } else {
     # Moses phrase-based grammar -- prepend nonterminal symbol and -log() the weights
-    if (system("$CAT $grammar | $JOSHUA/scripts/support/moses_phrase_to_joshua.pl | sort -k3,3 --buffer-size=$opts{m} -T $opts{T} | gzip -9n > $sorted_grammar")) {
+    if (system("$CAT $grammar | $JOSHUA/scripts/support/moses_phrase_to_joshua.pl | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
       print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
       exit 2;
     }

diff --git a/scripts/support/run_bundler.py b/scripts/support/run_bundler.py
index 9311e69..1c094f1 100755
--- a/scripts/support/run_bundler.py
+++ b/scripts/support/run_bundler.py

@@ -327,7 +327,7 @@
 def run_grammar_packer(src_path, dest_path):
     cmd = [os.path.join(JOSHUA_PATH, "scripts/support/grammar-packer.pl"),
            "-T", opts.tmpdir,
-           src_path, dest_path]
+           "-g", src_path, "-o", dest_path]
     logging.info(
         'Running the grammar-packer.pl script with the command: %s'
         % ' '.join(cmd)

diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 92ddbb5..a0a6458 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl

@@ -618,6 +618,8 @@
 unlink "scripts/normalize.$TARGET";
 symlink $NORMALIZER, "scripts/normalize.$SOURCE";
 symlink $NORMALIZER, "scripts/normalize.$TARGET";
+unlink "scripts/tokenize.$SOURCE";
+unlink "scripts/tokenize.$TARGET";
 symlink $TOKENIZER_SOURCE, "scripts/tokenize.$SOURCE";
 symlink $TOKENIZER_TARGET, "scripts/tokenize.$TARGET";
 
@@ -1212,7 +1214,7 @@
   if ($LM_TYPE eq "kenlm") {
     my $kenlm_file = basename($lmfile, ".gz") . ".kenlm";
     $cachepipe->cmd("compile-kenlm",
-                    "$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary $lmfile $kenlm_file",
+                    "$JOSHUA/bin/build_binary $lmfile $kenlm_file",
                     $lmfile, $kenlm_file);
     return $kenlm_file;
 

diff --git a/src/giza-pp/GIZA++-v2/Makefile b/src/giza-pp/GIZA++-v2/Makefile
index 17daae5..0148849 100644
--- a/src/giza-pp/GIZA++-v2/Makefile
+++ b/src/giza-pp/GIZA++-v2/Makefile

@@ -4,7 +4,7 @@
 
 #CXX = g++
 
-CFLAGS = $(CFLAGS_GLOBAL) -Wall -Wno-parentheses
+CFLAGS = $(CFLAGS_GLOBAL) -Wall -Wno-parentheses -std=c++11
 #CFLAGS_OPT = $(CFLAGS) -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -ffast-math
 CFLAGS_OPT = $(CFLAGS) -O3 -funroll-loops -DNDEBUG -DWORDINDEX_WITH_4_BYTE -DBINARY_SEARCH_FOR_TTABLE -DWORDINDEX_WITH_4_BYTE
 CFLAGS_PRF = $(CFLAGS) -O2 -pg -DNDEBUG -DWORDINDEX_WITH_4_BYTE

diff --git a/src/giza-pp/mkcls-v2/Makefile b/src/giza-pp/mkcls-v2/Makefile
index f773740..cec1673 100644
--- a/src/giza-pp/mkcls-v2/Makefile
+++ b/src/giza-pp/mkcls-v2/Makefile

@@ -5,7 +5,7 @@
             KategProblemWBC.o KategProblem.o StatVar.o general.o \
             mkcls.o
 
-CFLAGS = $(CFLAGS_GLOBAL) -Wall -W -DNDEBUG -O3 -funroll-loops
+CFLAGS = $(CFLAGS_GLOBAL) -Wall -W -DNDEBUG -O3 -funroll-loops -std=c++11
 
 .cpp.o:
 	$(CXX) $(CFLAGS) -c $< -o $@

diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index caa3258..2f603c4 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java

@@ -17,12 +17,15 @@
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.PhraseModel;
 import joshua.decoder.ff.tm.Grammar;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.ff.tm.format.HieroFormatReader;
 import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
 import joshua.decoder.ff.tm.packed.PackedGrammar;
 import joshua.decoder.io.TranslationRequest;
 import joshua.decoder.phrase.PhraseTable;
 import joshua.decoder.segment_file.Sentence;
 import joshua.util.FileUtility;
+import joshua.util.FormatUtils;
 import joshua.util.Regex;
 import joshua.util.io.LineReader;
 
@@ -532,6 +535,26 @@
       this.grammars.add(glueGrammar);
     }
     
+    /* Create an epsilon-deleting grammar */
+    if (joshuaConfiguration.lattice_decoding) {
+      Decoder.LOG(1, "Creating an epsilon-deleting grammar");
+      MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration);
+      latticeGrammar.setSpanLimit(-1);
+      HieroFormatReader reader = new HieroFormatReader();
+
+      String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+      String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+      String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, goalNT, defaultNT,
+          goalNT, defaultNT);
+
+      Rule rule = reader.parseLine(ruleString);
+      latticeGrammar.addRule(rule);
+      rule.estimateRuleCost(featureFunctions);
+
+      this.grammars.add(latticeGrammar);
+    }
+
     /* Now create a feature function for each owner */
     HashSet<String> ownersSeen = new HashSet<String>();
 

diff --git a/src/joshua/decoder/Translation.java b/src/joshua/decoder/Translation.java
index 427e3d9..9aa83e5 100644
--- a/src/joshua/decoder/Translation.java
+++ b/src/joshua/decoder/Translation.java

@@ -50,7 +50,7 @@
         Decoder.weights.increment("BLEU", 0);
 
         String best = ViterbiExtractor.extractViterbiString(hypergraph.goalNode).trim();
-        best = best.substring(best.indexOf(' ') + 1, best.lastIndexOf(' '));
+        best = best.substring(new String("<s>").length() + 1, best.lastIndexOf("</s>"));
         
         Decoder.LOG(1, String.format("Translation %d: %.3f %s", source.id(), hypergraph.goalNode.getScore(),
             best));
@@ -88,26 +88,20 @@
 
       } else {
         
-        if (source.isEmpty()) {
-          // Empty output just gets echoed back
-          out.write("");
-          out.newLine();
-        } else {
-          // Failed translations get empty formatted outputs
-          // @formatter:off
-          String outputString = joshuaConfiguration.outputFormat
-              .replace("%s", source.source())
-              .replace("%e", "")
-              .replace("%S", "")
-              .replace("%t", "()")
-              .replace("%i", Integer.toString(source.id()))
-              .replace("%f", "")
-              .replace("%c", "0.000");
-          // @formatter:on
+        // Failed translations and blank lines get empty formatted outputs
+        // @formatter:off
+        String outputString = joshuaConfiguration.outputFormat
+            .replace("%s", source.source())
+            .replace("%e", "")
+            .replace("%S", "")
+            .replace("%t", "()")
+            .replace("%i", Integer.toString(source.id()))
+            .replace("%f", "")
+            .replace("%c", "0.000");
+        // @formatter:on
 
-          out.write(outputString);
-          out.newLine();
-        }
+        out.write(outputString);
+        out.newLine();
       }
 
       out.flush();

diff --git a/src/joshua/decoder/ff/FeatureVector.java b/src/joshua/decoder/ff/FeatureVector.java
index 9ad3436..e36448b 100644
--- a/src/joshua/decoder/ff/FeatureVector.java
+++ b/src/joshua/decoder/ff/FeatureVector.java

@@ -158,7 +158,7 @@
     for (String key : this.sparseFeatures.keySet())
       newOne.set(key, this.sparseFeatures.get(key));
     for (int i = 0; i < denseFeatures.size(); i++)
-      newOne.set(i, denseFeatures.get(i));
+      newOne.set(i, getDense(i));
     return newOne;
   }
 
@@ -169,7 +169,7 @@
    */
   public void subtract(FeatureVector other) {
     for (int i = 0; i < denseFeatures.size(); i++)
-      denseFeatures.set(i, denseFeatures.get(i) - other.getDense(i));
+      denseFeatures.set(i, getDense(i) - other.getDense(i));
     
     for (String key : other.keySet()) {
       float oldValue = (sparseFeatures.containsKey(key)) ? sparseFeatures.get(key) : 0.0f;
@@ -213,7 +213,8 @@
   }
   
   /**
-   * Return the weight of a dense feature, indexed by its feature index.
+   * Return the weight of a dense feature, indexed by its feature index, or 0.0f if the feature
+   * is not found. In other words, this is a safe way to query the dense feature vector.
    * 
    * @param id
    * @return the dense feature's value, or 0 if not found.
@@ -278,7 +279,7 @@
     
     // First print all the dense feature names in order
     for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
-      outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), denseFeatures.get(i));
+      outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), getDense(i));
       printed_keys.add(DENSE_FEATURE_NAMES.get(i));
     }
     

diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java
index fb13ee4..8eba340 100644
--- a/src/joshua/tools/GrammarPacker.java
+++ b/src/joshua/tools/GrammarPacker.java

@@ -202,7 +202,7 @@
       if (line.startsWith("[")) {
         // hierarchical model
         if (fields.size() < 4) {
-          logger.warning("Incomplete grammar line at line " + counter);
+          logger.warning(String.format("Incomplete grammar line at line %d: '%s'", counter, line));
           continue;
         }
         lhs = fields.remove(0);

diff --git a/test/server/expected b/test/server/expected
index eb1146d..fe7f422 100644
--- a/test/server/expected
+++ b/test/server/expected

@@ -1,7 +1,7 @@
 0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV	0 ||| this_OOV
 1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV	1 ||| that_OOV
 2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV	2 ||| these_OOV
-									
+3 ||| 	3 ||| 	3 ||| 	3 ||| 	3 ||| 	3 ||| 	3 ||| 	3 ||| 	3 ||| 	3 ||| 
 4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV	4 ||| those_OOV
 5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV	5 ||| mine_OOV
 6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV	6 ||| his_OOV
commit	481996a5db4974bbe569341a124a6562f820db40	[log] [tgz]
author	Matt Post <post@cs.jhu.edu>	Wed Nov 04 14:40:15 2015 -0500
committer	Matt Post <post@cs.jhu.edu>	Wed Nov 04 14:40:15 2015 -0500
tree	627eee78e925ae106ce044c83fed4742eba16f24
parent	a81f00b82ba36ca816722782e16ff2877e7151ff [diff]
parent	71583d64b064fe141c07b02f011e0907c2ce278d [diff]