src/joshua/decoder/ff/lm/NGramLanguageModel.java - joshua - Git at Google

 /*
  * This file is part of the Joshua Machine Translation System.
  *
  * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
  * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
  * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License along with this library;
  * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  * 02111-1307 USA
  */

 package joshua.decoder.ff.lm;

 // BUG: At best we should use List, but we use int[] everywhere to
 // represent phrases; therefore these additional methods are excessive.
 import java.util.List;

 /**
  * An interface for new language models to implement. An object of this type is passed to
  * LanguageModelFF, which will handle all the dynamic programming and state maintenance.
  *
  * All functions here should return LogP, not the cost.
  *
  * @author wren ng thornton <wren@users.sourceforge.net>
  * @author Zhifei Li, <zhifei.work@gmail.com>
  * @version $LastChangedDate$
  */
 public interface NGramLanguageModel {

   // ===============================================================
   // Attributes
   // ===============================================================

   /**
    * @return the order of N-grams for this language model
    */
   int getOrder();

   // ===============================================================
   // Methods
   // ===============================================================

   /**
    * Language models may have their own private vocabulary mapping strings to integers; for example,
    * if they make use of a compiled format (as KenLM and BerkeleyLM do). This mapping is likely
    * different from the global mapping contained in joshua.corpus.Vocabulary, which is used to
    * convert the input string and grammars. This function is used to tell the language model what
    * the global mapping is, so that the language model can convert it into its own private mapping.
    *
    * @param token the string form of the word in the global mapping
    * @param id the integer that the global mapping assigns to {@code token}
    * @return Whether any collisions were detected.
    */
   boolean registerWord(String token, int id);


   /**
    * Scores a sentence.
    *
    * @param sentence the sentence to be scored
    * @param order the order of N-grams for the LM
    * @param startIndex the index of first event-word we want to get its probability; if we want to
    *        get the prob for the whole sentence, then startIndex should be 1
    * @return the LogP of the whole sentence
    */
   float sentenceLogProbability(int[] sentence, int order, int startIndex);

   /**
    * Scores a single N-gram.
    *
    * @param ngram the N-gram to be scored
    * @param order the order of N-grams for the LM
    * @return the LogP of the N-gram (all functions of this interface return LogP, not the cost)
    */
   float ngramLogProbability(int[] ngram, int order);

   /**
    * Scores a single N-gram without an explicit order argument.
    * NOTE(review): presumably the model's own order (see getOrder) applies here -- confirm
    * against implementations.
    *
    * @param ngram the N-gram to be scored
    * @return the LogP of the N-gram
    */
   float ngramLogProbability(int[] ngram);


   // ===============================================================
   // Equivalent LM State (use DefaultNGramLanguageModel if you don't care)
   // ===============================================================

   /**
    * This returns the log probability of the special backoff symbol used to fill out contexts which
    * have been backed-off. The LanguageModelFF implementation is to call this unigram probability
    * for each such token, and then call ngramLogProbability for the remaining actual N-gram.
    *
    * NOTE(review): the parameters are not documented in this file; judging by its name,
    * qtyAdditionalBackoffWeight is presumably the number of additional backoff weights to apply --
    * confirm against implementations.
    *
    * @param ngram the N-gram, as a list of integers
    * @param order the order of N-grams for the LM
    * @param qtyAdditionalBackoffWeight see the review note above
    * @return the LogP of the backoff state
    */
   // TODO Is this really the best interface?
   float logProbOfBackoffState(List<Integer> ngram, int order, int qtyAdditionalBackoffWeight);

   /**
    * Array-based counterpart of logProbOfBackoffState: the signature is the same except that the
    * N-gram is passed as an int[] instead of a List of Integer.
    * NOTE(review): presumably the contract is identical -- confirm against implementations.
    */
   float logProbabilityOfBackoffState(int[] ngram, int order, int qtyAdditionalBackoffWeight);

   /**
    * NOTE(review): undocumented in this file. Judging by the name, this maps originalState to an
    * equivalent left LM state; cost is an array, so it may be used as an output parameter --
    * confirm against DefaultNGramLanguageModel or another implementation.
    */
   int[] leftEquivalentState(int[] originalState, int order, double[] cost);

   /**
    * NOTE(review): undocumented in this file. Judging by the name, this maps originalState to an
    * equivalent right LM state -- confirm against DefaultNGramLanguageModel or another
    * implementation.
    */
   int[] rightEquivalentState(int[] originalState, int order);

 }
	/*
	* This file is part of the Joshua Machine Translation System.
	*
	* Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
	* Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
	* the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
	* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public License along with this library;
	* if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
	* 02111-1307 USA
	*/

	package joshua.decoder.ff.lm;

	// BUG: At best we should use List, but we use int[] everywhere to
	// represent phrases; therefore these additional methods are excessive.
	import java.util.List;

	/**
	* An interface for new language models to implement. An object of this type is passed to
	* LanguageModelFF, which will handle all the dynamic programming and state maintenance.
	*
	* All functions here should return LogP, not the cost.
	*
	* @author wren ng thornton <wren@users.sourceforge.net>
	* @author Zhifei Li, <zhifei.work@gmail.com>
	* @version $LastChangedDate$
	*/
	public interface NGramLanguageModel {

	// ===============================================================
	// Attributes
	// ===============================================================

	/**
	* @return the order of N-grams for this language model
	*/
	int getOrder();

	// ===============================================================
	// Methods
	// ===============================================================

	/**
	* Language models may have their own private vocabulary mapping strings to integers; for example,
	* if they make use of a compiled format (as KenLM and BerkeleyLM do). This mapping is likely
	* different from the global mapping contained in joshua.corpus.Vocabulary, which is used to
	* convert the input string and grammars. This function is used to tell the language model what
	* the global mapping is, so that the language model can convert it into its own private mapping.
	*
	* @param token the string form of the word in the global mapping
	* @param id the integer that the global mapping assigns to {@code token}
	* @return Whether any collisions were detected.
	*/
	boolean registerWord(String token, int id);


	/**
	* Scores a sentence.
	*
	* @param sentence the sentence to be scored
	* @param order the order of N-grams for the LM
	* @param startIndex the index of first event-word we want to get its probability; if we want to
	* get the prob for the whole sentence, then startIndex should be 1
	* @return the LogP of the whole sentence
	*/
	float sentenceLogProbability(int[] sentence, int order, int startIndex);

	/**
	* Scores a single N-gram.
	*
	* @param ngram the N-gram to be scored
	* @param order the order of N-grams for the LM
	* @return the LogP of the N-gram (all functions of this interface return LogP, not the cost)
	*/
	float ngramLogProbability(int[] ngram, int order);

	/**
	* Scores a single N-gram without an explicit order argument.
	* NOTE(review): presumably the model's own order (see getOrder) applies here -- confirm
	* against implementations.
	*
	* @param ngram the N-gram to be scored
	* @return the LogP of the N-gram
	*/
	float ngramLogProbability(int[] ngram);


	// ===============================================================
	// Equivalent LM State (use DefaultNGramLanguageModel if you don't care)
	// ===============================================================

	/**
	* This returns the log probability of the special backoff symbol used to fill out contexts which
	* have been backed-off. The LanguageModelFF implementation is to call this unigram probability
	* for each such token, and then call ngramLogProbability for the remaining actual N-gram.
	*
	* NOTE(review): the parameters are not documented in this file; judging by its name,
	* qtyAdditionalBackoffWeight is presumably the number of additional backoff weights to apply --
	* confirm against implementations.
	*
	* @param ngram the N-gram, as a list of integers
	* @param order the order of N-grams for the LM
	* @param qtyAdditionalBackoffWeight see the review note above
	* @return the LogP of the backoff state
	*/
	// TODO Is this really the best interface?
	float logProbOfBackoffState(List<Integer> ngram, int order, int qtyAdditionalBackoffWeight);

	/**
	* Array-based counterpart of logProbOfBackoffState: the signature is the same except that the
	* N-gram is passed as an int[] instead of a List of Integer.
	* NOTE(review): presumably the contract is identical -- confirm against implementations.
	*/
	float logProbabilityOfBackoffState(int[] ngram, int order, int qtyAdditionalBackoffWeight);

	/**
	* NOTE(review): undocumented in this file. Judging by the name, this maps originalState to an
	* equivalent left LM state; cost is an array, so it may be used as an output parameter --
	* confirm against DefaultNGramLanguageModel or another implementation.
	*/
	int[] leftEquivalentState(int[] originalState, int order, double[] cost);

	/**
	* NOTE(review): undocumented in this file. Judging by the name, this maps originalState to an
	* equivalent right LM state -- confirm against DefaultNGramLanguageModel or another
	* implementation.
	*/
	int[] rightEquivalentState(int[] originalState, int order);

	}