| #include "lm/trie_sort.hh" |
| |
| #include "lm/config.hh" |
| #include "lm/lm_exception.hh" |
| #include "lm/read_arpa.hh" |
| #include "lm/vocab.hh" |
| #include "lm/weights.hh" |
| #include "lm/word_index.hh" |
| #include "util/file_piece.hh" |
| #include "util/mmap.hh" |
| #include "util/proxy_iterator.hh" |
| #include "util/sized_iterator.hh" |
| |
| #include <algorithm> |
| #include <cstring> |
| #include <cstdio> |
| #include <cstdlib> |
| #include <deque> |
| #include <iterator> |
| #include <limits> |
| #include <vector> |
| |
| namespace lm { |
| namespace ngram { |
| namespace trie { |
| namespace { |
| |
| typedef util::SizedIterator NGramIter; |
| |
| // Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams. |
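// Illustrative sketch (not in the original source): a non-highest-order trigram
// record is laid out as [w][w][w][prob][backoff].  WriteContextFile points this
// proxy sizeof(WordIndex) past the start of each record, so the attention-sized
// window covers only the (order - 1) context words; the skipped word and the
// trailing weights are the cruft that comparison, copying, and swapping ignore.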
class PartialViewProxy {
  public:
    PartialViewProxy() : attention_size_(0), inner_() {}

    PartialViewProxy(void *ptr, std::size_t block_size, std::size_t attention_size) : attention_size_(attention_size), inner_(ptr, block_size) {}

    operator std::string() const {
      return std::string(reinterpret_cast<const char*>(inner_.Data()), attention_size_);
    }

    PartialViewProxy &operator=(const PartialViewProxy &from) {
      memcpy(inner_.Data(), from.inner_.Data(), attention_size_);
      return *this;
    }

    PartialViewProxy &operator=(const std::string &from) {
      memcpy(inner_.Data(), from.data(), attention_size_);
      return *this;
    }

    const void *Data() const { return inner_.Data(); }
    void *Data() { return inner_.Data(); }

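    // std::sort exchanges elements through this swap.  Only the attention-sized
    // prefix of each record is exchanged; the trailing bytes stay put, which is
    // fine because the full records have already been flushed to disk before
    // the context sort runs (see ConvertToSorted).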
    friend void swap(PartialViewProxy first, PartialViewProxy second) {
      std::swap_ranges(reinterpret_cast<char*>(first.Data()), reinterpret_cast<char*>(first.Data()) + first.attention_size_, reinterpret_cast<char*>(second.Data()));
    }

  private:
    friend class util::ProxyIterator<PartialViewProxy>;

    typedef std::string value_type;

    const std::size_t attention_size_;

    typedef util::SizedInnerIterator InnerIterator;
    InnerIterator &Inner() { return inner_; }
    const InnerIterator &Inner() const { return inner_; }
    InnerIterator inner_;
};

typedef util::ProxyIterator<PartialViewProxy> PartialIter;

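// Write the byte range [mem_begin, mem_end) to a fresh temporary file and
// return it reopened as a stdio FILE*.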
FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) {
  util::scoped_fd file(util::MakeTemp(temp_prefix));
  util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin);
  return util::FDOpenOrThrow(file);
}

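// Sort the (order - 1)-word contexts in place over the same memory as the full
// records (which DiskFlush has already written out), then write the
// de-duplicated contexts to a temporary file.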
FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) {
  const size_t context_size = sizeof(WordIndex) * (order - 1);
  // Sort just the contexts using the same memory.
  PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
  PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size));

#if defined(_WIN32) || defined(_WIN64)
  std::stable_sort
#else
  std::sort
#endif
    (context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));

  util::scoped_FILE out(util::FMakeTemp(temp_prefix));

  // Write out to file and uniqueify at the same time.  Could have used unique_copy if there was an appropriate OutputIterator.
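  // A hypothetical version with such an iterator might look like
  //   std::unique_copy(context_begin, context_end,
  //                    FileOutputIterator(out.get(), context_size));
  // where FileOutputIterator would call util::WriteOrThrow on assignment.  No
  // such adapter exists in util, hence the manual loop below.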
  if (context_begin == context_end) return out.release();
  PartialIter i(context_begin);
  util::WriteOrThrow(out.get(), i->Data(), context_size);
  const void *previous = i->Data();
  ++i;
  for (; i != context_end; ++i) {
    if (memcmp(previous, i->Data(), context_size)) {
      util::WriteOrThrow(out.get(), i->Data(), context_size);
      previous = i->Data();
    }
  }
  return out.release();
}

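// Combine policies for MergeSortedFiles: invoked when the same key appears in
// both input files.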
struct ThrowCombine {
  void operator()(std::size_t /*entry_size*/, unsigned char order, const void *first, const void * /*second*/, FILE * /*out*/) const {
    const WordIndex *base = reinterpret_cast<const WordIndex*>(first);
    FormatLoadException e;
    e << "Duplicate n-gram detected with vocab ids";
    for (const WordIndex *i = base; i != base + order; ++i) {
      e << ' ' << *i;
    }
    throw e;
  }
};

// Useful for context files that just contain records with no value.
struct FirstCombine {
  void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const {
    util::WriteOrThrow(out, first, entry_size);
  }
};

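// Standard two-way merge of two sorted record files into a fresh temporary
// file; records with equal keys are resolved by the Combine policy.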
template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) {
  std::size_t entry_size = sizeof(WordIndex) * order + weights_size;
  RecordReader first, second;
  first.Init(first_file, entry_size);
  second.Init(second_file, entry_size);
  util::scoped_FILE out_file(util::FMakeTemp(temp_prefix));
  EntryCompare less(order);
  while (first && second) {
    if (less(first.Data(), second.Data())) {
      util::WriteOrThrow(out_file.get(), first.Data(), entry_size);
      ++first;
    } else if (less(second.Data(), first.Data())) {
      util::WriteOrThrow(out_file.get(), second.Data(), entry_size);
      ++second;
    } else {
      combine(entry_size, order, first.Data(), second.Data(), out_file.get());
      ++first; ++second;
    }
  }
  for (RecordReader &remains = (first ? first : second); remains; ++remains) {
    util::WriteOrThrow(out_file.get(), remains.Data(), entry_size);
  }
  return out_file.release();
}

} // namespace

void RecordReader::Init(FILE *file, std::size_t entry_size) {
  entry_size_ = entry_size;
  data_.reset(malloc(entry_size));
  UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer");
  file_ = file;
  if (file) {
    rewind(file);
    remains_ = true;
    ++*this;
  } else {
    remains_ = false;
  }
}

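// Patch bytes inside the record that was most recently read, then restore the
// file position to the end of that record.  On Windows the forward seek runs
// even when the distance is zero, presumably because stdio there requires a
// positioning call between a write and a subsequent read.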
void RecordReader::Overwrite(const void *start, std::size_t amount) {
  long internal = (uint8_t*)start - (uint8_t*)data_.get();
  UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
  util::WriteOrThrow(file_, start, amount);
  long forward = entry_size_ - internal - amount;
#if !defined(_WIN32) && !defined(_WIN64)
  if (forward)
#endif
    UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
}

void RecordReader::Rewind() {
  if (file_) {
    rewind(file_);
    remains_ = true;
    ++*this;
  } else {
    remains_ = false;
  }
}

SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
  PositiveProbWarn warn(config.positive_log_probability);
  unigram_.reset(util::MakeTemp(file_prefix));
  {
    // In case <unk> appears.
    size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
    util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out);
    Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
    CheckSpecials(config, vocab);
    if (!vocab.SawUnk()) ++counts[0];
  }

  // Only use as much buffer as we need.
  size_t buffer_use = 0;
  for (unsigned int order = 2; order < counts.size(); ++order) {
    buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]));
  }
  buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back()));
  buffer = std::min<size_t>(buffer, buffer_use);
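  // Illustrative arithmetic (assuming 4-byte WordIndex and float): for a
  // trigram model with counts = {5, 8, 6}, the middle order needs
  // (4*2 + 2*4) * 8 = 128 bytes and the top order (4*3 + 4) * 6 = 96 bytes,
  // so buffer is capped at 128 regardless of what the caller offered.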

  util::scoped_malloc mem;
  mem.reset(malloc(buffer));
  if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer);

  for (unsigned char order = 2; order <= counts.size(); ++order) {
    ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), buffer);
  }
  ReadEnd(f);
}

namespace {
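// RAII helper that closes any FILE*s still sitting in the deque when it goes
// out of scope, so an exception mid-merge does not leak file handles.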
class Closer {
  public:
    explicit Closer(std::deque<FILE*> &files) : files_(files) {}

    ~Closer() {
      for (std::deque<FILE*>::iterator i = files_.begin(); i != files_.end(); ++i) {
        util::scoped_FILE deleter(*i);
      }
    }

    void PopFront() {
      util::scoped_FILE deleter(files_.front());
      files_.pop_front();
    }
  private:
    std::deque<FILE*> &files_;
};
} // namespace

void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
  ReadNGramHeader(f, order);
  const size_t count = counts[order - 1];
  // Size of weights: a probability for every order, plus a backoff for all but the highest order.
  const size_t words_size = sizeof(WordIndex) * order;
  const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
  const size_t entry_size = words_size + weights_size;
  const size_t batch_size = std::min(count, mem_size / entry_size);
  uint8_t *const begin = reinterpret_cast<uint8_t*>(mem);

  std::deque<FILE*> files, contexts;
  Closer files_closer(files), contexts_closer(contexts);

  for (std::size_t batch = 0, done = 0; done < count; ++batch) {
    uint8_t *out = begin;
    uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
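    // ReadNGram fills the words through a reverse_iterator, so each record
    // stores its n-gram last-word-first; WriteContextFile exploits this by
    // skipping the first WordIndex to recover the (order - 1)-word context.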
    if (order == counts.size()) {
      for (; out != out_end; out += entry_size) {
        std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) + order);
        ReadNGram(f, order, vocab, it, *reinterpret_cast<Prob*>(out + words_size), warn);
      }
    } else {
      for (; out != out_end; out += entry_size) {
        std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) + order);
        ReadNGram(f, order, vocab, it, *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
      }
    }
    // Sort full records by full n-gram.
    util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
    // parallel_sort uses too much RAM.  TODO: figure out why Windows sort doesn't like my proxies.
#if defined(_WIN32) || defined(_WIN64)
    std::stable_sort
#else
    std::sort
#endif
      (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
    files.push_back(DiskFlush(begin, out_end, file_prefix));
    contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order));

    done += (out_end - begin) / entry_size;
  }

  // All individual files created.  Merge them.

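  // Repeatedly merge the two oldest files and push the result onto the back of
  // the queue; each round halves the number of files, so every record is
  // merged O(log batches) times.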
  while (files.size() > 1) {
    files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine()));
    files_closer.PopFront();
    files_closer.PopFront();
    contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine()));
    contexts_closer.PopFront();
    contexts_closer.PopFront();
  }

  if (!files.empty()) {
    // Steal from closers.
    full_[order - 2].reset(files.front());
    files.pop_front();
    context_[order - 2].reset(contexts.front());
    contexts.pop_front();
  }
}

} // namespace trie
} // namespace ngram
} // namespace lm