Merge branch 'master' into release
diff --git a/CHANGELOG b/CHANGELOG
index c90dc35..95cbaf4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,9 +1,9 @@
6.0.5 (October 23, 2015)
========================
-- KenLM updated, includes lastly improved cmake-based build
+- KenLM updated, includes vastly improved cmake-based build
- Fix for grammar packing that previously limited the size of grammars (esp. Hiero)
-- Support for decoding with multiple packed grammars (if packed together)
+- Support for packing and decoding with multiple packed grammars (if packed together)
- Feature functions now report dense features, for more efficient handling
- Added AdaGrad and internal MIRA
- Pipeline:
diff --git a/README.md b/README.md
index 0a127cf..4f610ce 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,6 @@
The basic method for invoking the decoder looks like this:
- cat SOURCE | JOSHUA m MEM -c CONFIG OPTIONS > OUTPUT
+ cat SOURCE | $JOSHUA/bin/joshua -m MEM -c CONFIG OPTIONS > OUTPUT
Some example usage scenarios and scripts can be found in the [examples/](https://github.com/joshua-decoder/joshua/tree/master/examples) directory.
diff --git a/scripts/support/grammar-packer.pl b/scripts/support/grammar-packer.pl
index 533bcae..f5e8481 100755
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl
@@ -68,13 +68,13 @@
chomp(my $first_line = `$CAT $grammar | head -n1`);
if ($first_line =~ /^\[/) {
# regular grammar
- if (system("$CAT $grammar | sed 's/ ||| /\t/g' | sort -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
+ if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
exit 2;
}
} else {
# Moses phrase-based grammar -- prepend nonterminal symbol and -log() the weights
- if (system("$CAT $grammar | $JOSHUA/scripts/support/moses_phrase_to_joshua.pl | sort -k3,3 --buffer-size=$opts{m} -T $opts{T} | gzip -9n > $sorted_grammar")) {
+ if (system("$CAT $grammar | $JOSHUA/scripts/support/moses_phrase_to_joshua.pl | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
exit 2;
}
diff --git a/scripts/support/run_bundler.py b/scripts/support/run_bundler.py
index 9311e69..1c094f1 100755
--- a/scripts/support/run_bundler.py
+++ b/scripts/support/run_bundler.py
@@ -327,7 +327,7 @@
def run_grammar_packer(src_path, dest_path):
cmd = [os.path.join(JOSHUA_PATH, "scripts/support/grammar-packer.pl"),
"-T", opts.tmpdir,
- src_path, dest_path]
+ "-g", src_path, "-o", dest_path]
logging.info(
'Running the grammar-packer.pl script with the command: %s'
% ' '.join(cmd)
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 92ddbb5..a0a6458 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -618,6 +618,8 @@
unlink "scripts/normalize.$TARGET";
symlink $NORMALIZER, "scripts/normalize.$SOURCE";
symlink $NORMALIZER, "scripts/normalize.$TARGET";
+unlink "scripts/tokenize.$SOURCE";
+unlink "scripts/tokenize.$TARGET";
symlink $TOKENIZER_SOURCE, "scripts/tokenize.$SOURCE";
symlink $TOKENIZER_TARGET, "scripts/tokenize.$TARGET";
@@ -1212,7 +1214,7 @@
if ($LM_TYPE eq "kenlm") {
my $kenlm_file = basename($lmfile, ".gz") . ".kenlm";
$cachepipe->cmd("compile-kenlm",
- "$JOSHUA/src/joshua/decoder/ff/lm/kenlm/build_binary $lmfile $kenlm_file",
+ "$JOSHUA/bin/build_binary $lmfile $kenlm_file",
$lmfile, $kenlm_file);
return $kenlm_file;
diff --git a/src/giza-pp/GIZA++-v2/Makefile b/src/giza-pp/GIZA++-v2/Makefile
index 17daae5..0148849 100644
--- a/src/giza-pp/GIZA++-v2/Makefile
+++ b/src/giza-pp/GIZA++-v2/Makefile
@@ -4,7 +4,7 @@
#CXX = g++
-CFLAGS = $(CFLAGS_GLOBAL) -Wall -Wno-parentheses
+CFLAGS = $(CFLAGS_GLOBAL) -Wall -Wno-parentheses -std=c++11
#CFLAGS_OPT = $(CFLAGS) -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -ffast-math
CFLAGS_OPT = $(CFLAGS) -O3 -funroll-loops -DNDEBUG -DWORDINDEX_WITH_4_BYTE -DBINARY_SEARCH_FOR_TTABLE -DWORDINDEX_WITH_4_BYTE
CFLAGS_PRF = $(CFLAGS) -O2 -pg -DNDEBUG -DWORDINDEX_WITH_4_BYTE
diff --git a/src/giza-pp/mkcls-v2/Makefile b/src/giza-pp/mkcls-v2/Makefile
index f773740..cec1673 100644
--- a/src/giza-pp/mkcls-v2/Makefile
+++ b/src/giza-pp/mkcls-v2/Makefile
@@ -5,7 +5,7 @@
KategProblemWBC.o KategProblem.o StatVar.o general.o \
mkcls.o
-CFLAGS = $(CFLAGS_GLOBAL) -Wall -W -DNDEBUG -O3 -funroll-loops
+CFLAGS = $(CFLAGS_GLOBAL) -Wall -W -DNDEBUG -O3 -funroll-loops -std=c++11
.cpp.o:
$(CXX) $(CFLAGS) -c $< -o $@
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index caa3258..2f603c4 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -17,12 +17,15 @@
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.PhraseModel;
import joshua.decoder.ff.tm.Grammar;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.ff.tm.format.HieroFormatReader;
import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
import joshua.decoder.ff.tm.packed.PackedGrammar;
import joshua.decoder.io.TranslationRequest;
import joshua.decoder.phrase.PhraseTable;
import joshua.decoder.segment_file.Sentence;
import joshua.util.FileUtility;
+import joshua.util.FormatUtils;
import joshua.util.Regex;
import joshua.util.io.LineReader;
@@ -532,6 +535,26 @@
this.grammars.add(glueGrammar);
}
+ /* Create an epsilon-deleting grammar: one rule with <eps> on the source side and no terminal on the target side */
+ if (joshuaConfiguration.lattice_decoding) {
+   Decoder.LOG(1, "Creating an epsilon-deleting grammar");
+   MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration);
+   latticeGrammar.setSpanLimit(-1); // NOTE(review): -1 presumably disables the span limit -- confirm
+   HieroFormatReader reader = new HieroFormatReader();
+
+   String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+   String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+   // One argument per %s. NOTE(review): source NT is the goal symbol, target NT the default one -- confirm intended.
+   String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, goalNT, defaultNT);
+
+   Rule rule = reader.parseLine(ruleString);
+   latticeGrammar.addRule(rule);
+   rule.estimateRuleCost(featureFunctions);
+
+   this.grammars.add(latticeGrammar);
+ }
+
/* Now create a feature function for each owner */
HashSet<String> ownersSeen = new HashSet<String>();
diff --git a/src/joshua/decoder/Translation.java b/src/joshua/decoder/Translation.java
index 427e3d9..9aa83e5 100644
--- a/src/joshua/decoder/Translation.java
+++ b/src/joshua/decoder/Translation.java
@@ -50,7 +50,7 @@
Decoder.weights.increment("BLEU", 0);
String best = ViterbiExtractor.extractViterbiString(hypergraph.goalNode).trim();
- best = best.substring(best.indexOf(' ') + 1, best.lastIndexOf(' '));
+ best = best.substring("<s>".length() + 1, best.lastIndexOf("</s>")); // drop leading "<s> " and the final "</s>"; assumes both markers are present
Decoder.LOG(1, String.format("Translation %d: %.3f %s", source.id(), hypergraph.goalNode.getScore(),
best));
@@ -88,26 +88,20 @@
} else {
- if (source.isEmpty()) {
- // Empty output just gets echoed back
- out.write("");
- out.newLine();
- } else {
- // Failed translations get empty formatted outputs
- // @formatter:off
- String outputString = joshuaConfiguration.outputFormat
- .replace("%s", source.source())
- .replace("%e", "")
- .replace("%S", "")
- .replace("%t", "()")
- .replace("%i", Integer.toString(source.id()))
- .replace("%f", "")
- .replace("%c", "0.000");
- // @formatter:on
+ // Failed translations and blank lines get empty formatted outputs
+ // @formatter:off
+ String outputString = joshuaConfiguration.outputFormat
+ .replace("%s", source.source())
+ .replace("%e", "")
+ .replace("%S", "")
+ .replace("%t", "()")
+ .replace("%i", Integer.toString(source.id()))
+ .replace("%f", "")
+ .replace("%c", "0.000");
+ // @formatter:on
- out.write(outputString);
- out.newLine();
- }
+ out.write(outputString);
+ out.newLine();
}
out.flush();
diff --git a/src/joshua/decoder/ff/FeatureVector.java b/src/joshua/decoder/ff/FeatureVector.java
index 9ad3436..e36448b 100644
--- a/src/joshua/decoder/ff/FeatureVector.java
+++ b/src/joshua/decoder/ff/FeatureVector.java
@@ -158,7 +158,7 @@
for (String key : this.sparseFeatures.keySet())
newOne.set(key, this.sparseFeatures.get(key));
for (int i = 0; i < denseFeatures.size(); i++)
- newOne.set(i, denseFeatures.get(i));
+ newOne.set(i, getDense(i));
return newOne;
}
@@ -169,7 +169,7 @@
*/
public void subtract(FeatureVector other) {
for (int i = 0; i < denseFeatures.size(); i++)
- denseFeatures.set(i, denseFeatures.get(i) - other.getDense(i));
+ denseFeatures.set(i, getDense(i) - other.getDense(i));
for (String key : other.keySet()) {
float oldValue = (sparseFeatures.containsKey(key)) ? sparseFeatures.get(key) : 0.0f;
@@ -213,7 +213,8 @@
}
/**
- * Return the weight of a dense feature, indexed by its feature index.
+ * Return the weight of a dense feature, indexed by its feature index, or 0.0f, if the feature
+ * is not found. In other words, this is a safe way to query the dense feature vector.
*
* @param id
* @return the dense feature's value, or 0 if not found.
@@ -278,7 +279,7 @@
// First print all the dense feature names in order
for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
- outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), denseFeatures.get(i));
+ outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), getDense(i));
printed_keys.add(DENSE_FEATURE_NAMES.get(i));
}
diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java
index fb13ee4..8eba340 100644
--- a/src/joshua/tools/GrammarPacker.java
+++ b/src/joshua/tools/GrammarPacker.java
@@ -202,7 +202,7 @@
if (line.startsWith("[")) {
// hierarchical model
if (fields.size() < 4) {
- logger.warning("Incomplete grammar line at line " + counter);
+ logger.warning(String.format("Incomplete grammar line at line %d: '%s'", counter, line));
continue;
}
lhs = fields.remove(0);
diff --git a/test/server/expected b/test/server/expected
index eb1146d..fe7f422 100644
--- a/test/server/expected
+++ b/test/server/expected
@@ -1,7 +1,7 @@
0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV 0 ||| this_OOV
1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV 1 ||| that_OOV
2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV 2 ||| these_OOV
-
+3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 ||| 3 |||
4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV 4 ||| those_OOV
5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV 5 ||| mine_OOV
6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV 6 ||| his_OOV