src/kenlm/lm/filter/phrase.cc - joshua - Git at Google

 #include "lm/filter/phrase.hh"

 #include "lm/filter/format.hh"

 #include <algorithm>
 #include <functional>
 #include <iostream>
 #include <queue>
 #include <string>
 #include <vector>

 #include <cctype>

 namespace lm {
 namespace phrase {

 unsigned int ReadMultiple(std::istream &in, Substrings &out) {
   bool sentence_content = false;
   unsigned int sentence_id = 0;
   std::vector<Hash> phrase;
   std::string word;
   while (in) {
     char c;
     // Gather a word.
     while (!isspace(c = in.get()) && in) word += c;
     // Treat EOF like a newline.
     if (!in) c = '\n';
     // Add the word to the phrase.
     if (!word.empty()) {
       phrase.push_back(util::MurmurHashNative(word.data(), word.size()));
       word.clear();
     }
     if (c == ' ') continue;
     // It's more than just a space.  Close out the phrase.
     if (!phrase.empty()) {
       sentence_content = true;
       out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
       phrase.clear();
     }
     if (c == '\t' || c == '\v') continue;
     // It's more than a space or tab: a newline.
     if (sentence_content) {
       ++sentence_id;
       sentence_content = false;
     }
   }
   if (!in.eof()) in.exceptions(std::istream::failbit | std::istream::badbit);
   return sentence_id + sentence_content;
 }

 namespace {
 typedef unsigned int Sentence;
 typedef std::vector<Sentence> Sentences;
 } // namespace

 namespace detail {

 const StringPiece kEndSentence("</s>");

 class Arc {
   public:
     Arc() {}

     // For arcs from one vertex to another.
     void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
       Set(to, intersect);
       from_ = &from;
     }

     /* For arcs from before the n-gram begins to somewhere in the n-gram (right
      * aligned).  These have no from_ vertex; it implictly matches every
      * sentence.  This also handles when the n-gram is a substring of a phrase.
      */
     void SetRight(detail::Vertex &to, const Sentences &complete) {
       Set(to, complete);
       from_ = NULL;
     }

     Sentence Current() const {
       return *current_;
     }

     bool Empty() const {
       return current_ == last_;
     }

     /* When this function returns:
      * If Empty() then there's nothing left from this intersection.
      *
      * If Current() == to then to is part of the intersection.
      *
      * Otherwise, Current() > to.  In this case, to is not part of the
      * intersection and neither is anything < Current().  To determine if
      * any value >= Current() is in the intersection, call LowerBound again
      * with the value.
      */
     void LowerBound(const Sentence to);

   private:
     void Set(detail::Vertex &to, const Sentences &sentences);

     const Sentence *current_;
     const Sentence *last_;
     detail::Vertex *from_;
 };

 struct ArcGreater : public std::binary_function<const Arc *, const Arc *, bool> {
   bool operator()(const Arc *first, const Arc *second) const {
     return first->Current() > second->Current();
   }
 };

 class Vertex {
   public:
     Vertex() : current_(0) {}

     Sentence Current() const {
       return current_;
     }

     bool Empty() const {
       return incoming_.empty();
     }

     void LowerBound(const Sentence to);

   private:
     friend class Arc;

     void AddIncoming(Arc *arc) {
       if (!arc->Empty()) incoming_.push(arc);
     }

     unsigned int current_;
     std::priority_queue<Arc*, std::vector<Arc*>, ArcGreater> incoming_;
 };

 void Arc::LowerBound(const Sentence to) {
   current_ = std::lower_bound(current_, last_, to);
   // If *current_ > to, don't advance from_.  The intervening values of
   // from_ may be useful for another one of its outgoing arcs.
   if (!from_ || Empty() || (Current() > to)) return;
   assert(Current() == to);
   from_->LowerBound(to);
   if (from_->Empty()) {
     current_ = last_;
     return;
   }
   assert(from_->Current() >= to);
   if (from_->Current() > to) {
     current_ = std::lower_bound(current_ + 1, last_, from_->Current());
   }
 }

 void Arc::Set(Vertex &to, const Sentences &sentences) {
   current_ = &*sentences.begin();
   last_ = &*sentences.end();
   to.AddIncoming(this);
 }

 void Vertex::LowerBound(const Sentence to) {
   if (Empty()) return;
   // Union lower bound.
   while (true) {
     Arc *top = incoming_.top();
     if (top->Current() > to) {
       current_ = top->Current();
       return;
     }
     // If top->Current() == to, we still need to verify that's an actual
     // element and not just a bound.
     incoming_.pop();
     top->LowerBound(to);
     if (!top->Empty()) {
       incoming_.push(top);
       if (top->Current() == to) {
         current_ = to;
         return;
       }
     } else if (Empty()) {
       return;
     }
   }
 }

 } // namespace detail

 namespace {

 void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detail::Vertex *const vertices, detail::Arc *free_arc) {
   using detail::Vertex;
   using detail::Arc;
   assert(!hashes.empty());

   const Hash *const first_word = &*hashes.begin();
   const Hash *const last_word = &*hashes.end() - 1;

   Hash hash = 0;
   const Sentences *found;
   // Phrases starting at or before the first word in the n-gram.
   {
     Vertex *vertex = vertices;
     for (const Hash *word = first_word; ; ++word, ++vertex) {
       hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word);
       // Now hash is [hashes.begin(), word].
       if (word == last_word) {
         if (phrase.FindSubstring(hash, found))
           (free_arc++)->SetRight(*vertex, *found);
         break;
       }
       if (!phrase.FindRight(hash, found)) break;
       (free_arc++)->SetRight(*vertex, *found);
     }
   }

   // Phrases starting at the second or later word in the n-gram.
   Vertex *vertex_from = vertices;
   for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) {
     hash = 0;
     Vertex *vertex_to = vertex_from + 1;
     for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
       // Notice that word_to and vertex_to have the same index.
       hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
       // Now hash covers [word_from, word_to].
       if (word_to == last_word) {
         if (phrase.FindLeft(hash, found))
           (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found);
         break;
       }
       if (!phrase.FindPhrase(hash, found)) break;
       (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found);
     }
   }
 }

 } // namespace

 namespace detail {

 // Here instead of header due to forward declaration.
 ConditionCommon::ConditionCommon(const Substrings &substrings) : substrings_(substrings) {}

 // Rest of the variables are temporaries anyway
 ConditionCommon::ConditionCommon(const ConditionCommon &from) : substrings_(from.substrings_) {}

 ConditionCommon::~ConditionCommon() {}

 detail::Vertex &ConditionCommon::MakeGraph() {
   assert(!hashes_.empty());
   vertices_.clear();
   vertices_.resize(hashes_.size());
   arcs_.clear();
   // One for every substring.
   arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2);
   BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin());
   return vertices_[hashes_.size() - 1];
 }

 } // namespace detail

 bool Union::Evaluate() {
   detail::Vertex &last_vertex = MakeGraph();
   unsigned int lower = 0;
   while (true) {
     last_vertex.LowerBound(lower);
     if (last_vertex.Empty()) return false;
     if (last_vertex.Current() == lower) return true;
     lower = last_vertex.Current();
   }
 }

 template <class Output> void Multiple::Evaluate(const StringPiece &line, Output &output) {
   detail::Vertex &last_vertex = MakeGraph();
   unsigned int lower = 0;
   while (true) {
     last_vertex.LowerBound(lower);
     if (last_vertex.Empty()) return;
     if (last_vertex.Current() == lower) {
       output.SingleAddNGram(lower, line);
       ++lower;
     } else {
       lower = last_vertex.Current();
     }
   }
 }

 template void Multiple::Evaluate<CountFormat::Multiple>(const StringPiece &line, CountFormat::Multiple &output);
 template void Multiple::Evaluate<ARPAFormat::Multiple>(const StringPiece &line, ARPAFormat::Multiple &output);
 template void Multiple::Evaluate<MultipleOutputBuffer>(const StringPiece &line, MultipleOutputBuffer &output);

 } // namespace phrase
 } // namespace lm
	#include "lm/filter/phrase.hh"

	#include "lm/filter/format.hh"

	#include <algorithm>
	#include <functional>
	#include <iostream>
	#include <queue>
	#include <string>
	#include <vector>

	#include <cctype>

	namespace lm {
	namespace phrase {

	unsigned int ReadMultiple(std::istream &in, Substrings &out) {
	bool sentence_content = false;
	unsigned int sentence_id = 0;
	std::vector<Hash> phrase;
	std::string word;
	while (in) {
	char c;
	// Gather a word.
	while (!isspace(c = in.get()) && in) word += c;
	// Treat EOF like a newline.
	if (!in) c = '\n';
	// Add the word to the phrase.
	if (!word.empty()) {
	phrase.push_back(util::MurmurHashNative(word.data(), word.size()));
	word.clear();
	}
	if (c == ' ') continue;
	// It's more than just a space. Close out the phrase.
	if (!phrase.empty()) {
	sentence_content = true;
	out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
	phrase.clear();
	}
	if (c == '\t' \|\| c == '\v') continue;
	// It's more than a space or tab: a newline.
	if (sentence_content) {
	++sentence_id;
	sentence_content = false;
	}
	}
	if (!in.eof()) in.exceptions(std::istream::failbit \| std::istream::badbit);
	return sentence_id + sentence_content;
	}

	namespace {
	typedef unsigned int Sentence;
	typedef std::vector<Sentence> Sentences;
	} // namespace

	namespace detail {

	const StringPiece kEndSentence("</s>");

	class Arc {
	public:
	Arc() {}

	// For arcs from one vertex to another.
	void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
	Set(to, intersect);
	from_ = &from;
	}

	/* For arcs from before the n-gram begins to somewhere in the n-gram (right
	* aligned). These have no from_ vertex; it implictly matches every
	* sentence. This also handles when the n-gram is a substring of a phrase.
	*/
	void SetRight(detail::Vertex &to, const Sentences &complete) {
	Set(to, complete);
	from_ = NULL;
	}

	Sentence Current() const {
	return *current_;
	}

	bool Empty() const {
	return current_ == last_;
	}

	/* When this function returns:
	* If Empty() then there's nothing left from this intersection.
	*
	* If Current() == to then to is part of the intersection.
	*
	* Otherwise, Current() > to. In this case, to is not part of the
	* intersection and neither is anything < Current(). To determine if
	* any value >= Current() is in the intersection, call LowerBound again
	* with the value.
	*/
	void LowerBound(const Sentence to);

	private:
	void Set(detail::Vertex &to, const Sentences &sentences);

	const Sentence *current_;
	const Sentence *last_;
	detail::Vertex *from_;
	};

	struct ArcGreater : public std::binary_function<const Arc , const Arc , bool> {
	bool operator()(const Arc first, const Arc second) const {
	return first->Current() > second->Current();
	}
	};

	class Vertex {
	public:
	Vertex() : current_(0) {}

	Sentence Current() const {
	return current_;
	}

	bool Empty() const {
	return incoming_.empty();
	}

	void LowerBound(const Sentence to);

	private:
	friend class Arc;

	void AddIncoming(Arc *arc) {
	if (!arc->Empty()) incoming_.push(arc);
	}

	unsigned int current_;
	std::priority_queue<Arc, std::vector<Arc>, ArcGreater> incoming_;
	};

	void Arc::LowerBound(const Sentence to) {
	current_ = std::lower_bound(current_, last_, to);
	// If *current_ > to, don't advance from_. The intervening values of
	// from_ may be useful for another one of its outgoing arcs.
	if (!from_ \|\| Empty() \|\| (Current() > to)) return;
	assert(Current() == to);
	from_->LowerBound(to);
	if (from_->Empty()) {
	current_ = last_;
	return;
	}
	assert(from_->Current() >= to);
	if (from_->Current() > to) {
	current_ = std::lower_bound(current_ + 1, last_, from_->Current());
	}
	}

	void Arc::Set(Vertex &to, const Sentences &sentences) {
	current_ = &*sentences.begin();
	last_ = &*sentences.end();
	to.AddIncoming(this);
	}

	void Vertex::LowerBound(const Sentence to) {
	if (Empty()) return;
	// Union lower bound.
	while (true) {
	Arc *top = incoming_.top();
	if (top->Current() > to) {
	current_ = top->Current();
	return;
	}
	// If top->Current() == to, we still need to verify that's an actual
	// element and not just a bound.
	incoming_.pop();
	top->LowerBound(to);
	if (!top->Empty()) {
	incoming_.push(top);
	if (top->Current() == to) {
	current_ = to;
	return;
	}
	} else if (Empty()) {
	return;
	}
	}
	}

	} // namespace detail

	namespace {

	void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detail::Vertex const vertices, detail::Arc free_arc) {
	using detail::Vertex;
	using detail::Arc;
	assert(!hashes.empty());

	const Hash const first_word = &hashes.begin();
	const Hash const last_word = &hashes.end() - 1;

	Hash hash = 0;
	const Sentences *found;
	// Phrases starting at or before the first word in the n-gram.
	{
	Vertex *vertex = vertices;
	for (const Hash *word = first_word; ; ++word, ++vertex) {
	hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word);
	// Now hash is [hashes.begin(), word].
	if (word == last_word) {
	if (phrase.FindSubstring(hash, found))
	(free_arc++)->SetRight(vertex, found);
	break;
	}
	if (!phrase.FindRight(hash, found)) break;
	(free_arc++)->SetRight(vertex, found);
	}
	}

	// Phrases starting at the second or later word in the n-gram.
	Vertex *vertex_from = vertices;
	for (const Hash word_from = first_word + 1; word_from != &hashes.end(); ++word_from, ++vertex_from) {
	hash = 0;
	Vertex *vertex_to = vertex_from + 1;
	for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
	// Notice that word_to and vertex_to have the same index.
	hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
	// Now hash covers [word_from, word_to].
	if (word_to == last_word) {
	if (phrase.FindLeft(hash, found))
	(free_arc++)->SetPhrase(vertex_from, vertex_to, *found);
	break;
	}
	if (!phrase.FindPhrase(hash, found)) break;
	(free_arc++)->SetPhrase(vertex_from, vertex_to, *found);
	}
	}
	}

	} // namespace

	namespace detail {

	// Here instead of header due to forward declaration.
	ConditionCommon::ConditionCommon(const Substrings &substrings) : substrings_(substrings) {}

	// Rest of the variables are temporaries anyway
	ConditionCommon::ConditionCommon(const ConditionCommon &from) : substrings_(from.substrings_) {}

	ConditionCommon::~ConditionCommon() {}

	detail::Vertex &ConditionCommon::MakeGraph() {
	assert(!hashes_.empty());
	vertices_.clear();
	vertices_.resize(hashes_.size());
	arcs_.clear();
	// One for every substring.
	arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2);
	BuildGraph(substrings_, hashes_, &vertices_.begin(), &arcs_.begin());
	return vertices_[hashes_.size() - 1];
	}

	} // namespace detail

	bool Union::Evaluate() {
	detail::Vertex &last_vertex = MakeGraph();
	unsigned int lower = 0;
	while (true) {
	last_vertex.LowerBound(lower);
	if (last_vertex.Empty()) return false;
	if (last_vertex.Current() == lower) return true;
	lower = last_vertex.Current();
	}
	}

	template <class Output> void Multiple::Evaluate(const StringPiece &line, Output &output) {
	detail::Vertex &last_vertex = MakeGraph();
	unsigned int lower = 0;
	while (true) {
	last_vertex.LowerBound(lower);
	if (last_vertex.Empty()) return;
	if (last_vertex.Current() == lower) {
	output.SingleAddNGram(lower, line);
	++lower;
	} else {
	lower = last_vertex.Current();
	}
	}
	}

	template void Multiple::Evaluate<CountFormat::Multiple>(const StringPiece &line, CountFormat::Multiple &output);
	template void Multiple::Evaluate<ARPAFormat::Multiple>(const StringPiece &line, ARPAFormat::Multiple &output);
	template void Multiple::Evaluate<MultipleOutputBuffer>(const StringPiece &line, MultipleOutputBuffer &output);

	} // namespace phrase
	} // namespace lm