// src/kenlm/lm/interpolate/merge_probabilities.hh - joshua - Git at Google

 #ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H
 #define LM_INTERPOLATE_MERGE_PROBABILITIES_H

 #include "lm/common/ngram.hh"
 #include "lm/interpolate/bounded_sequence_encoding.hh"
 #include "util/fixed_array.hh"
 #include "util/stream/multi_stream.hh"

 #include <stdint.h>

 namespace lm {
 namespace interpolate {

 struct InterpolateInfo;

 /**
  * Make the encoding of backoff values for a given order.  The encoded values
  * are stored in the trailing byte region of a PartialProbGamma record: it
  * starts at PartialProbGamma::FromBegin() and spans
  * BoundedSequenceEncoding::EncodedLength() bytes, which is the amount that
  * PartialProbGamma::TotalSize(info, order) reserves after the two floats.
  *
  * @param info Interpolation settings (InterpolateInfo is only
  *  forward-declared in this header; see its definition for the fields).
  * @param order The ngram order whose records the encoder is built for.
  * @return The encoder to use for records of that order.
  */
 BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order);

 /**
  * The first pass for the offline log-linear interpolation algorithm. For
  * each order, this reads K **suffix-ordered** streams (one per model) of
  * ngram records (ngram-id, prob, backoff). It further assumes that the
  * ngram-ids have been unified over all of the stream inputs.
  *
  * Its output is records of (ngram-id, prob-prod, backoff-level,
  * backoff-level, ...) where the backoff-levels (of which there are K) are
  * the context length (0 for unigrams) that the corresponding model had to
  * back off to in order to obtain a probability for that ngram-id. Each of
  * these streams is terminated with a record whose ngram-id is all
  * maximum-integers for simplicity in implementation here. The record
  * layout is described by the PartialProbGamma class declared in this
  * header.
  *
  * @param info Interpolation settings (InterpolateInfo is only
  *  forward-declared in this header; see its definition for the fields).
  * @param models_by_order An array of length N (max_i N_i; presumably N_i is
  *  the order of model i) whose element i holds the ChainPositions for the
  *  streams of order (i + 1).
  * @param output_chains The output chains for each order.
  *  NOTE(review): the earlier comment gave the length as K, but "for each
  *  order" suggests N; confirm against the implementation.
  */
 void MergeProbabilities(
     const InterpolateInfo &info,
     util::FixedArray<util::stream::ChainPositions> &models_by_order,
     util::stream::Chains &output_chains);

 /**
  * This class represents the output payload for this pass, which consists
  * of an ngram-id, a probability, and then a vector of orders from which
  * each of the component models backed off to for this ngram, encoded
  * using the BoundedSequenceEncoding class.
  */
 class PartialProbGamma : public lm::NGramHeader {
 public:
   PartialProbGamma(std::size_t order, std::size_t backoff_bytes)
       : lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) {
     // nothing
   }

   std::size_t TotalSize() const {
     return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_;
   }

   // TODO: cache bounded sequence encoding in the pipeline?
   static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) {
     return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength();
   }

   float &Prob() { return Pay().prob; }
   float Prob() const { return Pay().prob; }

   float &LowerProb() { return Pay().lower_prob; }
   float LowerProb() const { return Pay().lower_prob; }

   const uint8_t *FromBegin() const { return Pay().from; }
   uint8_t *FromBegin() { return Pay().from; }

 private:
   struct After {
     // Note that backoff_and_normalize assumes this comes first.
     float prob;
     float lower_prob;
     uint8_t from[];
   };
   const After &Pay() const { return *reinterpret_cast<const After *>(end()); }
   After &Pay() { return *reinterpret_cast<After*>(end()); }

   std::size_t backoff_bytes_;
 };

 }} // namespaces
 #endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H
	#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H
	#define LM_INTERPOLATE_MERGE_PROBABILITIES_H

	#include "lm/common/ngram.hh"
	#include "lm/interpolate/bounded_sequence_encoding.hh"
	#include "util/fixed_array.hh"
	#include "util/stream/multi_stream.hh"

	#include <stdint.h>

	namespace lm {
	namespace interpolate {

	struct InterpolateInfo;

	/**
	* Make the encoding of backoff values for a given order. The encoded values
	* are stored in the trailing byte region of a PartialProbGamma record: it
	* starts at PartialProbGamma::FromBegin() and spans
	* BoundedSequenceEncoding::EncodedLength() bytes, which is the amount that
	* PartialProbGamma::TotalSize(info, order) reserves after the two floats.
	*
	* @param info Interpolation settings (InterpolateInfo is only
	* forward-declared in this header; see its definition for the fields).
	* @param order The ngram order whose records the encoder is built for.
	* @return The encoder to use for records of that order.
	*/
	BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order);

	/**
	* The first pass for the offline log-linear interpolation algorithm. For
	* each order, this reads K suffix-ordered streams (one per model) of
	* ngram records (ngram-id, prob, backoff). It further assumes that the
	* ngram-ids have been unified over all of the stream inputs.
	*
	* Its output is records of (ngram-id, prob-prod, backoff-level,
	* backoff-level, ...) where the backoff-levels (of which there are K) are
	* the context length (0 for unigrams) that the corresponding model had to
	* back off to in order to obtain a probability for that ngram-id. Each of
	* these streams is terminated with a record whose ngram-id is all
	* maximum-integers for simplicity in implementation here. The record
	* layout is described by the PartialProbGamma class declared in this
	* header.
	*
	* @param info Interpolation settings (InterpolateInfo is only
	* forward-declared in this header; see its definition for the fields).
	* @param models_by_order An array of length N (max_i N_i; presumably N_i is
	* the order of model i) whose element i holds the ChainPositions for the
	* streams of order (i + 1).
	* @param output_chains The output chains for each order.
	* NOTE(review): the earlier comment gave the length as K, but "for each
	* order" suggests N; confirm against the implementation.
	*/
	void MergeProbabilities(
	const InterpolateInfo &info,
	util::FixedArray<util::stream::ChainPositions> &models_by_order,
	util::stream::Chains &output_chains);

	/**
	* This class represents the output payload for this pass, which consists
	* of an ngram-id, a probability, and then a vector of orders from which
	* each of the component models backed off to for this ngram, encoded
	* using the BoundedSequenceEncoding class.
	*/
	class PartialProbGamma : public lm::NGramHeader {
	public:
	PartialProbGamma(std::size_t order, std::size_t backoff_bytes)
	: lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) {
	// nothing
	}

	std::size_t TotalSize() const {
	return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_;
	}

	// TODO: cache bounded sequence encoding in the pipeline?
	static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) {
	return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength();
	}

	float &Prob() { return Pay().prob; }
	float Prob() const { return Pay().prob; }

	float &LowerProb() { return Pay().lower_prob; }
	float LowerProb() const { return Pay().lower_prob; }

	const uint8_t *FromBegin() const { return Pay().from; }
	uint8_t *FromBegin() { return Pay().from; }

	private:
	struct After {
	// Note that backoff_and_normalize assumes this comes first.
	float prob;
	float lower_prob;
	uint8_t from[];
	};
	const After &Pay() const { return reinterpret_cast<const After >(end()); }
	After &Pay() { return reinterpret_cast<After>(end()); }

	std::size_t backoff_bytes_;
	};

	}} // namespaces
	#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H