src/joshua/decoder/phrase/PhraseTable.java - joshua - Git at Google

 package joshua.decoder.phrase;

 import java.io.File;
 import java.io.IOException;
 import java.util.List;

 import joshua.corpus.Vocabulary;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.tm.Grammar;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.RuleCollection;
 import joshua.decoder.ff.tm.Trie;
 import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
 import joshua.decoder.ff.tm.packed.PackedGrammar;

 /**
  * Represents a phrase table, implemented as a wrapper around either a {@link PackedGrammar}
  * or a {@link MemoryBasedBatchGrammar}. Most {@link Grammar} operations are forwarded unchanged to
  * that backend.
  *
  * TODO: this should all be implemented as a two-level trie (source trie and target trie).
  */
 public class PhraseTable implements Grammar {

   /** The single nonterminal symbol used by phrase tables. */
   private static final String NONTERMINAL = "[X]";

   private final JoshuaConfiguration config;
   private final Grammar backend;

   /**
    * Loads a phrase table from disk. If {@code grammarFile} is a directory it is opened as a
    * {@link PackedGrammar}; otherwise it is read into a {@link MemoryBasedBatchGrammar} that uses
    * a single nonterminal. In both cases the backend is created with a span limit of 0 (i.e.,
    * there is no span limit).
    *
    * <p>If a packed grammar reports a maximum source phrase length of -1, a fatal message is
    * printed to standard error and the JVM exits with status -1.
    *
    * @param grammarFile path to the grammar file, or to a packed grammar directory
    * @param owner the owner handed to the backend grammar
    * @param config the decoder configuration
    * @param maxSource not read by this constructor; kept so that existing callers still compile
    * @throws IOException if one is raised while constructing the backend grammar
    */
   public PhraseTable(String grammarFile, String owner, JoshuaConfiguration config, int maxSource)
       throws IOException {
     this.config = config;
     int spanLimit = 0;

     if (new File(grammarFile).isDirectory()) {
       this.backend = new PackedGrammar(grammarFile, spanLimit, owner, "moses", config);
       if (this.backend.getMaxSourcePhraseLength() == -1) {
         System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you");
         System.err.println("       packed the grammar with Joshua 6.0.2 or greater");
         System.exit(-1);
       }

     } else {
       this.backend = new MemoryBasedBatchGrammar("moses", grammarFile, owner, NONTERMINAL, spanLimit, config);
     }
   }

   /**
    * Creates a phrase table that is not loaded from a file; it is backed by a
    * {@link MemoryBasedBatchGrammar} built from the owner and configuration alone.
    *
    * @param owner the owner handed to the backend grammar
    * @param config the decoder configuration
    */
   public PhraseTable(String owner, JoshuaConfiguration config) {
     this.config = config;

     this.backend = new MemoryBasedBatchGrammar(owner, config);
   }

   /**
    * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
    * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value was either
    * in the packed config file (Joshua 6.0.2+) or was passed in via the TM config line.
    *
    * @return the maximum source phrase length, not counting the leading nonterminal
    */
   @Override
   public int getMaxSourcePhraseLength() {
     if (backend instanceof MemoryBasedBatchGrammar) {
       return this.backend.getMaxSourcePhraseLength() - 1;
     } else {
       return this.backend.getMaxSourcePhraseLength();
     }
   }

   /**
    * Collect the set of target-side phrases associated with a source phrase.
    *
    * @param sourceWords the sequence of source words
    * @return the rules, or {@code null} if {@code sourceWords} is empty, the phrase is not in the
    *         trie, or the matching trie node has no rules
    */
   public RuleCollection getPhrases(int[] sourceWords) {
     if (sourceWords.length == 0) {
       return null;
     }

     Trie pointer = getTrieRoot();

     // Backends other than PackedGrammar keep their phrases beneath a leading nonterminal arc,
     // so step over it first. The root is null-checked just like every later step.
     if (pointer != null && !(backend instanceof PackedGrammar)) {
       pointer = pointer.match(Vocabulary.id(NONTERMINAL));
     }

     // Walk one arc per source word, stopping as soon as a word fails to match.
     for (int i = 0; pointer != null && i < sourceWords.length; i++) {
       pointer = pointer.match(sourceWords[i]);
     }

     if (pointer != null && pointer.hasRules()) {
       return pointer.getRuleCollection();
     }
     return null;
   }

   /**
    * Adds a rule to the grammar. Only supported when the backend is a MemoryBasedBatchGrammar.
    *
    * @param rule the rule to add
    * @throws UnsupportedOperationException if the backend is not a {@link MemoryBasedBatchGrammar}
    */
   public void addRule(Rule rule) {
     // Fail with an explicit message instead of an opaque ClassCastException.
     if (!(backend instanceof MemoryBasedBatchGrammar)) {
       throw new UnsupportedOperationException(
           "Cannot add rules to a phrase table backed by " + backend.getClass().getName());
     }
     ((MemoryBasedBatchGrammar) backend).addRule(rule);
   }

   /**
    * Adds a rule that translates an out-of-vocabulary source word. The target side is the source
    * word itself, or the source word with "_OOV" appended when {@code config.mark_oovs} is set.
    * The rule is added via {@link #addRule(Rule)} and its cost is then estimated.
    *
    * @param sourceWord the vocabulary id of the source word
    * @param featureFunctions the feature functions used to estimate the rule's cost
    * @throws UnsupportedOperationException if the backend is not a {@link MemoryBasedBatchGrammar}
    */
   @Override
   public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
     // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now almost
     // certainly is)
     int targetWord = config.mark_oovs
         ? Vocabulary.id(Vocabulary.word(sourceWord) + "_OOV")
         : sourceWord;

     int nonterminalId = Vocabulary.id(NONTERMINAL);
     // NOTE(review): the -1 on the target side presumably refers back to the source-side
     // nonterminal -- confirm against Rule.
     Rule oovRule = new Rule(nonterminalId, new int[] { nonterminalId, sourceWord },
         new int[] { -1, targetWord }, "", 1, null);
     addRule(oovRule);
     oovRule.estimateRuleCost(featureFunctions);
   }

   /** Returns the backend grammar's trie root. */
   @Override
   public Trie getTrieRoot() {
     return backend.getTrieRoot();
   }

   /** Asks the backend grammar to sort itself using the given models. */
   @Override
   public void sortGrammar(List<FeatureFunction> models) {
     backend.sortGrammar(models);
   }

   /** Returns whether the backend grammar reports itself as sorted. */
   @Override
   public boolean isSorted() {
     return backend.isSorted();
   }

   /**
    * This should never be called. It ignores its arguments and always returns {@code true}.
    */
   @Override
   public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
     return true;
   }

   /** Returns the number of rules reported by the backend grammar. */
   @Override
   public int getNumRules() {
     return backend.getNumRules();
   }

   /** Forwards manual rule construction to the backend grammar. */
   @Override
   public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores,
       int arity) {
     return backend.constructManualRule(lhs, sourceWords, targetWords, scores, arity);
   }

   /** Asks the backend grammar to write itself to the given file. */
   @Override
   public void writeGrammarOnDisk(String file) {
     backend.writeGrammarOnDisk(file);
   }

   /** Returns whether the backend grammar is a regular-expression grammar. */
   @Override
   public boolean isRegexpGrammar() {
     return backend.isRegexpGrammar();
   }

   /** Returns the owner reported by the backend grammar. */
   @Override
   public int getOwner() {
     return backend.getOwner();
   }
 }
	package joshua.decoder.phrase;

	import java.io.File;
	import java.io.IOException;
	import java.util.List;

	import joshua.corpus.Vocabulary;
	import joshua.decoder.JoshuaConfiguration;
	import joshua.decoder.ff.FeatureFunction;
	import joshua.decoder.ff.tm.Grammar;
	import joshua.decoder.ff.tm.Rule;
	import joshua.decoder.ff.tm.RuleCollection;
	import joshua.decoder.ff.tm.Trie;
	import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
	import joshua.decoder.ff.tm.packed.PackedGrammar;

	/**
	 * Represents a phrase table, implemented as a wrapper around either a {@link PackedGrammar}
	 * or a {@link MemoryBasedBatchGrammar}. Most {@link Grammar} operations are forwarded unchanged to
	 * that backend.
	 *
	 * TODO: this should all be implemented as a two-level trie (source trie and target trie).
	 */
	public class PhraseTable implements Grammar {

	  /** The single nonterminal symbol used by phrase tables. */
	  private static final String NONTERMINAL = "[X]";

	  private final JoshuaConfiguration config;
	  private final Grammar backend;

	  /**
	   * Loads a phrase table from disk. If {@code grammarFile} is a directory it is opened as a
	   * {@link PackedGrammar}; otherwise it is read into a {@link MemoryBasedBatchGrammar} that uses
	   * a single nonterminal. In both cases the backend is created with a span limit of 0 (i.e.,
	   * there is no span limit).
	   *
	   * <p>If a packed grammar reports a maximum source phrase length of -1, a fatal message is
	   * printed to standard error and the JVM exits with status -1.
	   *
	   * @param grammarFile path to the grammar file, or to a packed grammar directory
	   * @param owner the owner handed to the backend grammar
	   * @param config the decoder configuration
	   * @param maxSource not read by this constructor; kept so that existing callers still compile
	   * @throws IOException if one is raised while constructing the backend grammar
	   */
	  public PhraseTable(String grammarFile, String owner, JoshuaConfiguration config, int maxSource)
	      throws IOException {
	    this.config = config;
	    int spanLimit = 0;

	    if (new File(grammarFile).isDirectory()) {
	      this.backend = new PackedGrammar(grammarFile, spanLimit, owner, "moses", config);
	      if (this.backend.getMaxSourcePhraseLength() == -1) {
	        System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you");
	        System.err.println(" packed the grammar with Joshua 6.0.2 or greater");
	        System.exit(-1);
	      }

	    } else {
	      this.backend = new MemoryBasedBatchGrammar("moses", grammarFile, owner, NONTERMINAL, spanLimit, config);
	    }
	  }

	  /**
	   * Creates a phrase table that is not loaded from a file; it is backed by a
	   * {@link MemoryBasedBatchGrammar} built from the owner and configuration alone.
	   *
	   * @param owner the owner handed to the backend grammar
	   * @param config the decoder configuration
	   */
	  public PhraseTable(String owner, JoshuaConfiguration config) {
	    this.config = config;

	    this.backend = new MemoryBasedBatchGrammar(owner, config);
	  }

	  /**
	   * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
	   * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value was either
	   * in the packed config file (Joshua 6.0.2+) or was passed in via the TM config line.
	   *
	   * @return the maximum source phrase length, not counting the leading nonterminal
	   */
	  @Override
	  public int getMaxSourcePhraseLength() {
	    if (backend instanceof MemoryBasedBatchGrammar) {
	      return this.backend.getMaxSourcePhraseLength() - 1;
	    } else {
	      return this.backend.getMaxSourcePhraseLength();
	    }
	  }

	  /**
	   * Collect the set of target-side phrases associated with a source phrase.
	   *
	   * @param sourceWords the sequence of source words
	   * @return the rules, or {@code null} if {@code sourceWords} is empty, the phrase is not in the
	   *         trie, or the matching trie node has no rules
	   */
	  public RuleCollection getPhrases(int[] sourceWords) {
	    if (sourceWords.length == 0) {
	      return null;
	    }

	    Trie pointer = getTrieRoot();

	    // Backends other than PackedGrammar keep their phrases beneath a leading nonterminal arc,
	    // so step over it first. The root is null-checked just like every later step.
	    if (pointer != null && !(backend instanceof PackedGrammar)) {
	      pointer = pointer.match(Vocabulary.id(NONTERMINAL));
	    }

	    // Walk one arc per source word, stopping as soon as a word fails to match.
	    for (int i = 0; pointer != null && i < sourceWords.length; i++) {
	      pointer = pointer.match(sourceWords[i]);
	    }

	    if (pointer != null && pointer.hasRules()) {
	      return pointer.getRuleCollection();
	    }
	    return null;
	  }

	  /**
	   * Adds a rule to the grammar. Only supported when the backend is a MemoryBasedBatchGrammar.
	   *
	   * @param rule the rule to add
	   * @throws UnsupportedOperationException if the backend is not a {@link MemoryBasedBatchGrammar}
	   */
	  public void addRule(Rule rule) {
	    // Fail with an explicit message instead of an opaque ClassCastException.
	    if (!(backend instanceof MemoryBasedBatchGrammar)) {
	      throw new UnsupportedOperationException(
	          "Cannot add rules to a phrase table backed by " + backend.getClass().getName());
	    }
	    ((MemoryBasedBatchGrammar) backend).addRule(rule);
	  }

	  /**
	   * Adds a rule that translates an out-of-vocabulary source word. The target side is the source
	   * word itself, or the source word with "_OOV" appended when {@code config.mark_oovs} is set.
	   * The rule is added via {@link #addRule(Rule)} and its cost is then estimated.
	   *
	   * @param sourceWord the vocabulary id of the source word
	   * @param featureFunctions the feature functions used to estimate the rule's cost
	   * @throws UnsupportedOperationException if the backend is not a {@link MemoryBasedBatchGrammar}
	   */
	  @Override
	  public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
	    // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now almost
	    // certainly is)
	    int targetWord = config.mark_oovs
	        ? Vocabulary.id(Vocabulary.word(sourceWord) + "_OOV")
	        : sourceWord;

	    int nonterminalId = Vocabulary.id(NONTERMINAL);
	    // NOTE(review): the -1 on the target side presumably refers back to the source-side
	    // nonterminal -- confirm against Rule.
	    Rule oovRule = new Rule(nonterminalId, new int[] { nonterminalId, sourceWord },
	        new int[] { -1, targetWord }, "", 1, null);
	    addRule(oovRule);
	    oovRule.estimateRuleCost(featureFunctions);
	  }

	  /** Returns the backend grammar's trie root. */
	  @Override
	  public Trie getTrieRoot() {
	    return backend.getTrieRoot();
	  }

	  /** Asks the backend grammar to sort itself using the given models. */
	  @Override
	  public void sortGrammar(List<FeatureFunction> models) {
	    backend.sortGrammar(models);
	  }

	  /** Returns whether the backend grammar reports itself as sorted. */
	  @Override
	  public boolean isSorted() {
	    return backend.isSorted();
	  }

	  /**
	   * This should never be called. It ignores its arguments and always returns {@code true}.
	   */
	  @Override
	  public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
	    return true;
	  }

	  /** Returns the number of rules reported by the backend grammar. */
	  @Override
	  public int getNumRules() {
	    return backend.getNumRules();
	  }

	  /** Forwards manual rule construction to the backend grammar. */
	  @Override
	  public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores,
	      int arity) {
	    return backend.constructManualRule(lhs, sourceWords, targetWords, scores, arity);
	  }

	  /** Asks the backend grammar to write itself to the given file. */
	  @Override
	  public void writeGrammarOnDisk(String file) {
	    backend.writeGrammarOnDisk(file);
	  }

	  /** Returns whether the backend grammar is a regular-expression grammar. */
	  @Override
	  public boolean isRegexpGrammar() {
	    return backend.isRegexpGrammar();
	  }

	  /** Returns the owner reported by the backend grammar. */
	  @Override
	  public int getOwner() {
	    return backend.getOwner();
	  }
	}