src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java - joshua - Git at Google

 package joshua.decoder.ff.tm.hash_based;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;

 import joshua.corpus.Vocabulary;
 import joshua.decoder.Decoder;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.JoshuaConfiguration.OOVItem;
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.tm.AbstractGrammar;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.GrammarReader;
 import joshua.decoder.ff.tm.Trie;
 import joshua.decoder.ff.tm.format.HieroFormatReader;
 import joshua.decoder.ff.tm.format.MosesFormatReader;
 import joshua.decoder.ff.tm.format.SamtFormatReader;
 import joshua.util.FormatUtils;

 /**
  * This class implements a memory-based bilingual BatchGrammar.
  * <p>
  * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
  * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
  * in HashMap
  *
  * @author Zhifei Li <zhifei.work@gmail.com>
  * @author Matt Post <post@cs.jhu.edu>
  */
 public class MemoryBasedBatchGrammar extends AbstractGrammar {

   // ===============================================================
   // Instance Fields
   // ===============================================================

   /* The number of rules added through addRule() (file rules, OOV rules and glue rules). */
   private int qtyRulesRead = 0;

   /* The number of distinct source sides, i.e. the number of rule bins created. */
   private int qtyRuleBins = 0;

   /* The trie root. Created once by the base constructor and never replaced. */
   private final MemoryBasedTrie root;

   /* The file containing the grammar (null when the grammar was not loaded from a file). */
   private String grammarFile;

   private GrammarReader<Rule> modelReader;

   /* Whether the grammar's rules contain regular expressions. */
   private boolean isRegexpGrammar = false;

   // ===============================================================
   // Static Fields
   // ===============================================================

   /* Alignment string attached to every OOV rule: source word 0 aligned to target word 0. */
   private static final String OOV_ALIGNMENT = "0-0";

   // ===============================================================
   // Constructors
   // ===============================================================

   /**
    * Creates an empty grammar: a fresh trie root and no rules.
    *
    * @param joshuaConfiguration the decoder configuration consulted when building OOV and glue rules
    */
   public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
     super(joshuaConfiguration);
     this.root = new MemoryBasedTrie();
     this.joshuaConfiguration = joshuaConfiguration;
   }

   /**
    * Creates an empty grammar with the given owner.
    *
    * @param owner the owner name; it is converted to an ID with {@code Vocabulary.id}
    * @param joshuaConfiguration the decoder configuration
    */
   public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
     this(joshuaConfiguration);
     this.owner = Vocabulary.id(owner);
   }

   /**
    * Creates an empty grammar that remembers the given reader. No rules are read from the reader
    * by this constructor.
    *
    * @param gr the grammar reader to store
    * @param joshuaConfiguration the decoder configuration
    */
   public MemoryBasedBatchGrammar(GrammarReader<Rule> gr, JoshuaConfiguration joshuaConfiguration) {
     this(joshuaConfiguration);
     modelReader = gr;
   }

   /**
    * Creates a grammar and loads every rule of {@code grammarFile} into the trie.
    * <p>
    * If no reader can be created (i.e. {@code grammarFile} is null) a message is logged and the
    * grammar stays empty. A summary line is logged in either case.
    *
    * @param formatKeyword the grammar format; see {@link #createReader(String, String)} for the
    *        accepted values. The value "regexp" additionally marks this grammar as a regexp grammar.
    * @param grammarFile the file to read the rules from; may be null
    * @param owner the owner name; it is converted to an ID with {@code Vocabulary.id}
    * @param defaultLHSSymbol the default left-hand-side symbol
    * @param spanLimit the maximum span this grammar applies to, or -1 (see
    *        {@link #hasRuleForSpan(int, int, int)})
    * @param joshuaConfiguration the decoder configuration
    * @throws IOException declared by the original signature; presumably raised while the grammar
    *         file is being read -- TODO confirm against the GrammarReader implementations
    * @throws RuntimeException if {@code grammarFile} is non-null and the format is unknown
    */
   public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
       String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
       throws IOException {

     this(joshuaConfiguration);
     this.owner = Vocabulary.id(owner);
     // The returned ID is deliberately unused; the call is kept from the original code
     // (presumably so that the symbol is registered in the vocabulary -- TODO confirm).
     Vocabulary.id(defaultLHSSymbol);
     this.spanLimit = spanLimit;
     this.grammarFile = grammarFile;
     // Constant-first comparison: a null keyword must not raise a NullPointerException here,
     // which matches the null-safe comparisons in createReader().
     this.setRegexpGrammar("regexp".equals(formatKeyword));

     // ==== loading grammar
     this.modelReader = createReader(formatKeyword, grammarFile);
     if (modelReader != null) {
       modelReader.initialize();
       for (Rule rule : modelReader) {
         // The reader may hand back null entries; those are skipped.
         if (rule != null) {
           addRule(rule);
         }
       }
     } else {
       Decoder.LOG(1, "Couldn't create a GrammarReader for file " + grammarFile + " with format "
           + formatKeyword);
     }

     this.printGrammar();
   }

   /**
    * Picks the reader implementation for a grammar format.
    *
    * @param format one of "hiero", "thrax", "regexp" (Hiero reader), "samt" (SAMT reader),
    *        "phrase" or "moses" (Moses reader)
    * @param grammarFile the grammar file to read; may be null
    * @return a reader for the file, or null when {@code grammarFile} is null
    * @throws RuntimeException if {@code grammarFile} is non-null and {@code format} is not one of
    *         the values listed above
    */
   protected GrammarReader<Rule> createReader(String format, String grammarFile) {

     if (grammarFile == null) {
       return null;
     }

     if ("hiero".equals(format) || "thrax".equals(format) || "regexp".equals(format)) {
       return new HieroFormatReader(grammarFile);
     } else if ("samt".equals(format)) {
       return new SamtFormatReader(grammarFile);
     } else if ("phrase".equals(format) || "moses".equals(format)) {
       return new MosesFormatReader(grammarFile);
     } else {
       throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
     }
   }


   // ===============================================================
   // Methods
   // ===============================================================

   /**
    * Sets the span limit checked by {@link #hasRuleForSpan(int, int, int)}.
    *
    * @param spanLimit the new limit; -1 selects the "start of sentence only" behaviour
    */
   public void setSpanLimit(int spanLimit) {
     this.spanLimit = spanLimit;
   }

   /**
    * @return the number of rules that have been passed to {@link #addRule(Rule)}
    */
   @Override
   public int getNumRules() {
     return this.qtyRulesRead;
   }

   /**
    * Manual rule construction is not implemented by this grammar.
    *
    * @return always null
    */
   @Override
   public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords,
       float[] denseScores, int arity) {
     return null;
   }

   /**
    * Tells whether this grammar may be applied to a span.
    * <p>
    * With a span limit of -1 (mono-glue grammar) only spans starting at position 0 qualify.
    * Otherwise the span qualifies when {@code pathLength} does not exceed the span limit.
    *
    * @param i the start of the span
    * @param j the end of the span (not consulted by this implementation)
    * @param pathLength the length of the path covered by the span
    * @return true if rules of this grammar may be applied to the span
    */
   public boolean hasRuleForSpan(int i, int j, int pathLength) {
     if (this.spanLimit == -1) { // mono-glue grammar
       return (i == 0);
     }
     return (pathLength <= this.spanLimit);
   }

   /**
    * @return the root node of the trie holding this grammar's rules
    */
   public Trie getTrieRoot() {
     return this.root;
   }

   /**
    * Adds a rule to the grammar.
    * <p>
    * The rule's owner is set to this grammar's owner, the trie path spelled by the rule's source
    * side is created where missing, and the rule is appended to the rule bin at the end of that
    * path.
    *
    * @param rule the rule to add; must not be null
    */
   public void addRule(Rule rule) {

     this.qtyRulesRead++;

     rule.setOwner(owner);

     // === identify the position, and insert the trie nodes as necessary
     MemoryBasedTrie pos = root;
     int[] french = rule.getFrench();

     maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);

     for (int k = 0; k < french.length; k++) {
       // The source symbol ID is used as the trie key exactly as it appears in the rule. An
       // earlier version stripped the nonterminal index here (so that [X,1] and [X,2] would both
       // be keyed as X); that step is no longer performed in this method.
       int curSymID = french[k];

       MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
       if (null == nextLayer) {
         nextLayer = new MemoryBasedTrie();
         // The child table is allocated lazily, on the first extension of a node.
         if (!pos.hasExtensions()) {
           pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
         }
         pos.childrenTbl.put(curSymID, nextLayer);
       }
       pos = nextLayer;
     }

     // === add the rule into the trie node
     if (!pos.hasRules()) {
       // First rule with this source side: open a new bin and count it.
       pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
       this.qtyRuleBins++;
     }
     pos.ruleBin.addRule(rule);
   }

   /**
    * Logs a one-line summary: rules read, distinct source sides, and the grammar file name.
    */
   protected void printGrammar() {
     Decoder.LOG(1, String.format("MemoryBasedBatchGrammar: Read %d rules with %d distinct source sides from '%s'",
         this.qtyRulesRead, this.qtyRuleBins, grammarFile));
   }

   /**
    * This returns true if the grammar contains rules that are regular expressions, possibly matching
    * many different inputs.
    *
    * @return true if the grammar's rules may contain regular expressions.
    */
   @Override
   public boolean isRegexpGrammar() {
     return this.isRegexpGrammar;
   }

   /**
    * Marks (or unmarks) this grammar as containing regular-expression rules.
    *
    * @param value the new value returned by {@link #isRegexpGrammar()}
    */
   public void setRegexpGrammar(boolean value) {
     this.isRegexpGrammar = value;
   }

   /**
    * Takes an input word and creates OOV rules in the current grammar for that word.
    * <p>
    * One rule is created per entry of the configured OOV list; when that list is null or empty a
    * single rule is created whose left-hand side is the configured default nonterminal. The target
    * side is the source word itself, or the source word with "_OOV" appended when
    * {@code mark_oovs} is set.
    *
    * @param sourceWord the vocabulary ID of the out-of-vocabulary source word
    * @param featureFunctions the feature functions used to estimate the cost of each new rule
    */
   @Override
   public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {

     // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now almost
     // certainly is)
     final int targetWord = this.joshuaConfiguration.mark_oovs
         ? Vocabulary.id(Vocabulary.word(sourceWord) + "_OOV")
         : sourceWord;

     int[] sourceWords = { sourceWord };
     int[] targetWords = { targetWord };

     if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
       for (OOVItem item : this.joshuaConfiguration.oovList) {
         insertOOVRule(Vocabulary.id(item.label), sourceWords, targetWords, featureFunctions);
       }
     } else {
       int defaultLhs = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
       insertOOVRule(defaultLhs, sourceWords, targetWords, featureFunctions);
     }
   }

   /* Builds one OOV rule, adds it to the trie, then estimates its cost (in that order). */
   private void insertOOVRule(int lhs, int[] sourceWords, int[] targetWords,
       List<FeatureFunction> featureFunctions) {
     Rule oovRule = new Rule(lhs, sourceWords, targetWords, "", 0, OOV_ALIGNMENT);
     addRule(oovRule);
     oovRule.estimateRuleCost(featureFunctions);
   }

   /**
    * Adds a default set of glue rules.
    * <p>
    * Three rules are generated in Hiero format from the configured goal symbol and default
    * nonterminal: goal over the sentence-start symbol, goal over goal followed by the default
    * nonterminal, and goal over goal followed by the sentence-stop symbol.
    *
    * @param featureFunctions the feature functions used to estimate the cost of each new rule
    */
   public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
     HieroFormatReader reader = new HieroFormatReader();

     String goalNT = FormatUtils.cleanNonterminal(joshuaConfiguration.goal_symbol);
     String defaultNT = FormatUtils.cleanNonterminal(joshuaConfiguration.default_non_terminal);

     String[] ruleStrings = new String[] {
         String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
             Vocabulary.START_SYM),
         String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1",
             goalNT, goalNT, defaultNT, goalNT, defaultNT),
         String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0",
             goalNT, goalNT, Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM)
     };

     for (String ruleString : ruleStrings) {
       Rule rule = reader.parseLine(ruleString);
       addRule(rule);
       rule.estimateRuleCost(featureFunctions);
     }
   }
 }
	package joshua.decoder.ff.tm.hash_based;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.List;

	import joshua.corpus.Vocabulary;
	import joshua.decoder.Decoder;
	import joshua.decoder.JoshuaConfiguration;
	import joshua.decoder.JoshuaConfiguration.OOVItem;
	import joshua.decoder.ff.FeatureFunction;
	import joshua.decoder.ff.tm.AbstractGrammar;
	import joshua.decoder.ff.tm.Rule;
	import joshua.decoder.ff.tm.GrammarReader;
	import joshua.decoder.ff.tm.Trie;
	import joshua.decoder.ff.tm.format.HieroFormatReader;
	import joshua.decoder.ff.tm.format.MosesFormatReader;
	import joshua.decoder.ff.tm.format.SamtFormatReader;
	import joshua.util.FormatUtils;

	/**
	* This class implements a memory-based bilingual BatchGrammar.
	* <p>
	* The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
	* french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
	* in HashMap
	*
	* @author Zhifei Li <zhifei.work@gmail.com>
	* @author Matt Post <post@cs.jhu.edu>
	*/
	public class MemoryBasedBatchGrammar extends AbstractGrammar {

	// ===============================================================
	// Instance Fields
	// ===============================================================

	/* The number of rules read. */
	private int qtyRulesRead = 0;

	/* The number of distinct source sides. */
	private int qtyRuleBins = 0;

	/* The trie root. */
	private MemoryBasedTrie root = null;

	/* The file containing the grammar. */
	private String grammarFile;

	private GrammarReader<Rule> modelReader;

	/* Whether the grammar's rules contain regular expressions. */
	private boolean isRegexpGrammar = false;

	// ===============================================================
	// Static Fields
	// ===============================================================

	// ===============================================================
	// Constructors
	// ===============================================================

	public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
	super(joshuaConfiguration);
	this.root = new MemoryBasedTrie();
	this.joshuaConfiguration = joshuaConfiguration;
	}

	public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
	this(joshuaConfiguration);
	this.owner = Vocabulary.id(owner);
	}

	public MemoryBasedBatchGrammar(GrammarReader<Rule> gr,JoshuaConfiguration joshuaConfiguration) {
	// this.defaultOwner = Vocabulary.id(defaultOwner);
	// this.defaultLHS = Vocabulary.id(defaultLHSSymbol);
	this(joshuaConfiguration);
	modelReader = gr;
	}

	public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
	String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration) throws IOException {

	this(joshuaConfiguration);
	this.owner = Vocabulary.id(owner);
	Vocabulary.id(defaultLHSSymbol);
	this.spanLimit = spanLimit;
	this.grammarFile = grammarFile;
	this.setRegexpGrammar(formatKeyword.equals("regexp"));

	// ==== loading grammar
	this.modelReader = createReader(formatKeyword, grammarFile);
	if (modelReader != null) {
	modelReader.initialize();
	for (Rule rule : modelReader)
	if (rule != null) {
	addRule(rule);
	}
	} else {
	Decoder.LOG(1, "Couldn't create a GrammarReader for file " + grammarFile + " with format "
	+ formatKeyword);
	}

	this.printGrammar();
	}

	protected GrammarReader<Rule> createReader(String format, String grammarFile) {

	if (grammarFile != null) {
	if ("hiero".equals(format) \|\| "thrax".equals(format) \|\| "regexp".equals(format)) {
	return new HieroFormatReader(grammarFile);
	} else if ("samt".equals(format)) {
	return new SamtFormatReader(grammarFile);
	} else if ("phrase".equals(format) \|\| "moses".equals(format)) {
	return new MosesFormatReader(grammarFile);
	} else {
	throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
	}
	}
	return null;
	}


	// ===============================================================
	// Methods
	// ===============================================================

	public void setSpanLimit(int spanLimit) {
	this.spanLimit = spanLimit;
	}

	@Override
	public int getNumRules() {
	return this.qtyRulesRead;
	}

	@Override
	public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords,
	float[] denseScores, int arity) {
	return null;
	}

	/**
	* if the span covered by the chart bin is greater than the limit, then return false
	*/
	public boolean hasRuleForSpan(int i, int j, int pathLength) {
	if (this.spanLimit == -1) { // mono-glue grammar
	return (i == 0);
	} else {
	// System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s", Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
	return (pathLength <= this.spanLimit);
	}
	}

	public Trie getTrieRoot() {
	return this.root;
	}

	/**
	* Adds a rule to the grammar.
	*/
	public void addRule(Rule rule) {

	// TODO: Why two increments?
	this.qtyRulesRead++;

	// if (owner == -1) {
	// System.err.println("* FATAL: MemoryBasedBatchGrammar::addRule(): owner not set for grammar");
	// System.exit(1);
	// }
	rule.setOwner(owner);

	// === identify the position, and insert the trie nodes as necessary
	MemoryBasedTrie pos = root;
	int[] french = rule.getFrench();

	maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);

	for (int k = 0; k < french.length; k++) {
	int curSymID = french[k];

	/*
	* Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
	* [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
	* the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
	* (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
	* (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
	*/

	MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
	if (null == nextLayer) {
	nextLayer = new MemoryBasedTrie();
	if (pos.hasExtensions() == false) {
	pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
	}
	pos.childrenTbl.put(curSymID, nextLayer);
	}
	pos = nextLayer;
	}

	// === add the rule into the trie node
	if (!pos.hasRules()) {
	pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
	this.qtyRuleBins++;
	}
	pos.ruleBin.addRule(rule);
	}

	protected void printGrammar() {
	Decoder.LOG(1, String.format("MemoryBasedBatchGrammar: Read %d rules with %d distinct source sides from '%s'",
	this.qtyRulesRead, this.qtyRuleBins, grammarFile));
	}

	/**
	* This returns true if the grammar contains rules that are regular expressions, possibly matching
	* many different inputs.
	*
	* @return true if the grammar's rules may contain regular expressions.
	*/
	@Override
	public boolean isRegexpGrammar() {
	return this.isRegexpGrammar;
	}

	public void setRegexpGrammar(boolean value) {
	this.isRegexpGrammar = value;
	}

	/***
	* Takes an input word and creates an OOV rule in the current grammar for that word.
	*
	* @param sourceWord
	* @param featureFunctions
	*/
	@Override
	public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {

	// TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now almost
	// certainly is)
	final int targetWord = this.joshuaConfiguration.mark_oovs
	? Vocabulary.id(Vocabulary.word(sourceWord) + "_OOV")
	: sourceWord;

	int[] sourceWords = { sourceWord };
	int[] targetWords = { targetWord };
	final String oovAlignment = "0-0";

	if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
	for (OOVItem item: this.joshuaConfiguration.oovList) {
	Rule oovRule = new Rule(
	Vocabulary.id(item.label), sourceWords, targetWords, "", 0,
	oovAlignment);
	addRule(oovRule);
	oovRule.estimateRuleCost(featureFunctions);
	}
	} else {
	int nt_i = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
	Rule oovRule = new Rule(nt_i, sourceWords, targetWords, "", 0,
	oovAlignment);
	addRule(oovRule);
	oovRule.estimateRuleCost(featureFunctions);
	}
	}

	/**
	* Adds a default set of glue rules.
	*
	* @param featureFunctions
	*/
	public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
	HieroFormatReader reader = new HieroFormatReader();

	String goalNT = FormatUtils.cleanNonterminal(joshuaConfiguration.goal_symbol);
	String defaultNT = FormatUtils.cleanNonterminal(joshuaConfiguration.default_non_terminal);

	String[] ruleStrings = new String[] {
	String.format("[%s] \|\|\| %s \|\|\| %s \|\|\| 0", goalNT, Vocabulary.START_SYM,
	Vocabulary.START_SYM),
	String.format("[%s] \|\|\| [%s,1] [%s,2] \|\|\| [%s,1] [%s,2] \|\|\| -1",
	goalNT, goalNT, defaultNT, goalNT, defaultNT),
	String.format("[%s] \|\|\| [%s,1] %s \|\|\| [%s,1] %s \|\|\| 0",
	goalNT, goalNT, Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM)
	};

	for (String ruleString: ruleStrings) {
	Rule rule = reader.parseLine(ruleString);
	addRule(rule);
	rule.estimateRuleCost(featureFunctions);
	}
	}
	}