// blob: ef694a3f9b6a7cc76a8a289059ad64bde6fdf862 [file] [log] [blame]
package joshua.decoder.phrase;
import java.io.File;
import java.io.IOException;
import java.util.List;
import joshua.corpus.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.tm.Grammar;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.ff.tm.RuleCollection;
import joshua.decoder.ff.tm.Trie;
import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
import joshua.decoder.ff.tm.packed.PackedGrammar;
/**
 * Represents a phrase table, and is implemented as a wrapper around either a {@link PackedGrammar}
 * or a {@link MemoryBasedBatchGrammar}. Apart from phrase lookup and rule insertion, every
 * {@link Grammar} operation is forwarded to the wrapped backend.
 *
 * TODO: this should all be implemented as a two-level trie (source trie and target trie).
 */
public class PhraseTable implements Grammar {

  /** Label of the single nonterminal used when building and querying the phrase table. */
  private static final String NONTERMINAL = "[X]";

  private final JoshuaConfiguration config;
  private final Grammar backend;

  /**
   * Loads a phrase table from disk using a number of defaults: a single nonterminal
   * ({@code [X]}), the {@code "moses"} format, and a span limit of 0 (no span limit).
   *
   * If {@code grammarFile} is a directory it is loaded as a {@link PackedGrammar}; otherwise it is
   * read as a {@link MemoryBasedBatchGrammar}. A packed grammar that reports a maximum source
   * phrase length of -1 is rejected: an error is printed and the JVM exits.
   *
   * @param grammarFile path to the grammar (a directory for packed grammars)
   * @param owner the owner of the grammar
   * @param config the decoder configuration
   * @param maxSource not read by this constructor; kept for caller compatibility
   * @throws IOException if the backend fails to load the grammar
   */
  public PhraseTable(String grammarFile, String owner, JoshuaConfiguration config, int maxSource)
      throws IOException {
    this.config = config;
    final int spanLimit = 0;

    final Grammar grammar;
    if (new File(grammarFile).isDirectory()) {
      grammar = new PackedGrammar(grammarFile, spanLimit, owner, "moses", config);
      if (grammar.getMaxSourcePhraseLength() == -1) {
        System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you");
        System.err.println(" packed the grammar with Joshua 6.0.2 or greater");
        System.exit(-1);
      }
    } else {
      grammar = new MemoryBasedBatchGrammar("moses", grammarFile, owner, NONTERMINAL, spanLimit,
          config);
    }
    this.backend = grammar;
  }

  /**
   * Creates a phrase table backed by a {@link MemoryBasedBatchGrammar} that is not read from a
   * file; rules can be added to it with {@link #addRule(Rule)}.
   *
   * @param owner the owner of the grammar
   * @param config the decoder configuration
   */
  public PhraseTable(String owner, JoshuaConfiguration config) {
    this.config = config;
    this.backend = new MemoryBasedBatchGrammar(owner, config);
  }

  /**
   * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
   * since the grammar includes the nonterminal. For any other backend the backend's value is
   * returned unchanged.
   *
   * @return the maximum source phrase length, excluding the leading nonterminal
   */
  @Override
  public int getMaxSourcePhraseLength() {
    int length = backend.getMaxSourcePhraseLength();
    if (backend instanceof MemoryBasedBatchGrammar) {
      return length - 1;
    }
    return length;
  }

  /**
   * Collect the set of target-side phrases associated with a source phrase.
   *
   * @param sourceWords the sequence of source words
   * @return the rules, or {@code null} if {@code sourceWords} is {@code null} or empty, or if no
   *         rules are stored for that exact source phrase
   */
  public RuleCollection getPhrases(int[] sourceWords) {
    if (sourceWords == null || sourceWords.length == 0) {
      return null;
    }

    Trie pointer = getTrieRoot();

    // Backends other than PackedGrammar are entered through the nonterminal arc first.
    if (pointer != null && !(backend instanceof PackedGrammar)) {
      pointer = pointer.match(Vocabulary.id(NONTERMINAL));
    }

    // Walk the trie one source word at a time, stopping as soon as a word has no match.
    for (int i = 0; pointer != null && i < sourceWords.length; i++) {
      pointer = pointer.match(sourceWords[i]);
    }

    if (pointer != null && pointer.hasRules()) {
      return pointer.getRuleCollection();
    }
    return null;
  }

  /**
   * Adds a rule to the grammar. Only supported when the backend is a MemoryBasedBatchGrammar.
   *
   * @param rule the rule to add
   * @throws UnsupportedOperationException if the backend is not a {@link MemoryBasedBatchGrammar}
   */
  public void addRule(Rule rule) {
    if (!(backend instanceof MemoryBasedBatchGrammar)) {
      throw new UnsupportedOperationException(
          "Rules can only be added to a phrase table backed by a MemoryBasedBatchGrammar, not "
              + backend.getClass().getName());
    }
    ((MemoryBasedBatchGrammar) backend).addRule(rule);
  }

  /**
   * Adds a rule of the form {@code [X] -> [X] sourceWord ||| [X] targetWord} for an
   * out-of-vocabulary word and then estimates its cost. The target word is the source word
   * itself, or the source word suffixed with {@code _OOV} when {@code config.mark_oovs} is set.
   *
   * @param sourceWord the vocabulary id of the OOV source word
   * @param featureFunctions the feature functions used to estimate the rule cost
   */
  @Override
  public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
    // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
    // almost certainly is)
    int targetWord = config.mark_oovs
        ? Vocabulary.id(Vocabulary.word(sourceWord) + "_OOV")
        : sourceWord;

    int nonterminalId = Vocabulary.id(NONTERMINAL);
    // NOTE(review): -1 on the target side presumably refers back to the first source
    // nonterminal; confirm against Rule.
    Rule oovRule = new Rule(nonterminalId, new int[] { nonterminalId, sourceWord },
        new int[] { -1, targetWord }, "", 1, null);
    addRule(oovRule);
    oovRule.estimateRuleCost(featureFunctions);
  }

  /** Returns the root of the backend's trie. */
  @Override
  public Trie getTrieRoot() {
    return backend.getTrieRoot();
  }

  /** Asks the backend to sort its rules using the given models. */
  @Override
  public void sortGrammar(List<FeatureFunction> models) {
    backend.sortGrammar(models);
  }

  /** Reports whether the backend considers itself sorted. */
  @Override
  public boolean isSorted() {
    return backend.isSorted();
  }

  /**
   * This should never be called. It returns {@code true} unconditionally, ignoring its arguments.
   */
  @Override
  public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
    return true;
  }

  /** Returns the number of rules reported by the backend. */
  @Override
  public int getNumRules() {
    return backend.getNumRules();
  }

  /** Delegates manual rule construction to the backend. */
  @Override
  public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores,
      int arity) {
    return backend.constructManualRule(lhs, sourceWords, targetWords, scores, arity);
  }

  /** Asks the backend to write the grammar to the given file. */
  @Override
  public void writeGrammarOnDisk(String file) {
    backend.writeGrammarOnDisk(file);
  }

  /** Reports whether the backend is a regular-expression grammar. */
  @Override
  public boolean isRegexpGrammar() {
    return backend.isRegexpGrammar();
  }

  /** Returns the owner id reported by the backend. */
  @Override
  public int getOwner() {
    return backend.getOwner();
  }
}