src/joshua/decoder/ff/lm/kenlm/lm/builder/print.cc - joshua - Git at Google

 #include "lm/builder/print.hh"

 #include "util/fake_ofstream.hh"
 #include "util/file.hh"
 #include "util/mmap.hh"
 #include "util/scoped.hh"
 #include "util/stream/timer.hh"

 #include <sstream>

 #include <string.h>

 namespace lm { namespace builder {

 // Builds the word-id -> string table from the vocabulary file behind fd.
 // The whole file is mapped into memory_ and walked as a run of back-to-back
 // NUL-terminated strings; map_ receives a pointer to the start of each one,
 // in file order.
 VocabReconstitute::VocabReconstitute(int fd) {
   const uint64_t bytes = util::SizeOrThrow(fd);
   util::MapRead(util::POPULATE_OR_READ, fd, 0, bytes, memory_);

   const char *const begin = static_cast<const char*>(memory_.get());
   const char *const end = begin + bytes;

   const char *cursor = begin;
   while (cursor != end) {
     map_.push_back(cursor);
     // Step over the string and its terminating NUL.
     cursor += strlen(cursor) + 1;
   }
   // One extra entry pointing just past the final string (for LookupPiece).
   map_.push_back(cursor);
 }

 // Writes the model to out_fd_ as ARPA text: an optional comment header, the
 // \data\ section with one count line per order, one \N-grams: section per
 // entry of positions, and the closing \end\ marker.
 void PrintARPA::Run(const util::stream::ChainPositions &positions) {
   VocabReconstitute vocab(GetVocabFD());

   // The header is assembled in a std::stringstream and written directly to
   // the descriptor.  TODO: integers in FakeOFStream.
   {
     std::stringstream header;
     if (verbose_header_) {
       header << "# Input file: " << GetHeader().input_file << '\n';
       header << "# Token count: " << GetHeader().token_count << '\n';
       header << "# Smoothing: Modified Kneser-Ney" << '\n';
     }
     header << "\\data\\\n";
     for (size_t idx = 0; idx != positions.size(); ++idx) {
       header << "ngram " << (idx + 1) << '=' << GetHeader().counts_pruned[idx] << '\n';
     }
     header << '\n';
     const std::string text(header.str());
     util::WriteOrThrow(out_fd_.get(), text.data(), text.size());
   }

   util::FakeOFStream out(out_fd_.get());
   for (unsigned order = 1; order <= positions.size(); ++order) {
     out << "\\" << order << "-grams:" << '\n';
     // Every order except the last one gets a trailing backoff column.
     const bool with_backoff = (order != positions.size());
     for (NGramStream gram(positions[order - 1]); gram; ++gram) {
       // NOTE(review): the comment that used to sit here mentioned correcting
       // for numerical precision, but no adjustment is visible in this
       // function; prob is handed to the stream as stored.
       const WordIndex *word = gram->begin();
       out << gram->Value().complete.prob << '\t' << vocab.Lookup(*word);
       for (++word; word != gram->end(); ++word) {
         out << ' ' << vocab.Lookup(*word);
       }
       if (with_backoff) {
         out << '\t' << gram->Value().complete.backoff;
       }
       out << '\n';
     }
     // Blank line between sections.
     out << '\n';
   }
   out << "\\end\\\n";
 }

 }} // namespaces
	#include "lm/builder/print.hh"

	#include "util/fake_ofstream.hh"
	#include "util/file.hh"
	#include "util/mmap.hh"
	#include "util/scoped.hh"
	#include "util/stream/timer.hh"

	#include <sstream>

	#include <string.h>

	namespace lm { namespace builder {

	// Builds the word-id -> string table from the vocabulary file behind fd.
	// The file is mapped into memory_ and treated as consecutive NUL-terminated
	// strings; map_ gets one pointer per string, in file order.
	VocabReconstitute::VocabReconstitute(int fd) {
	  uint64_t size = util::SizeOrThrow(fd);
	  util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
	  // start has to be a pointer into the mapping (not a single char) so the
	  // pointer arithmetic and the assignment to i below are well-formed.
	  const char *const start = static_cast<const char*>(memory_.get());
	  const char *i;
	  // Each step skips one string plus its terminating NUL.
	  for (i = start; i != start + size; i += strlen(i) + 1) {
	    map_.push_back(i);
	  }
	  // Last one for LookupPiece.
	  map_.push_back(i);
	}

	// Writes the model to out_fd_ as ARPA text: an optional comment header, the
	// \data\ section with one count line per order, one \N-grams: section per
	// entry of positions, and the closing \end\ marker.
	void PrintARPA::Run(const util::stream::ChainPositions &positions) {
	  VocabReconstitute vocab(GetVocabFD());

	  // The header is assembled in a std::stringstream and written directly to
	  // the descriptor. TODO: integers in FakeOFStream.
	  {
	    std::stringstream header;
	    if (verbose_header_) {
	      header << "# Input file: " << GetHeader().input_file << '\n';
	      header << "# Token count: " << GetHeader().token_count << '\n';
	      header << "# Smoothing: Modified Kneser-Ney" << '\n';
	    }
	    header << "\\data\\\n";
	    for (size_t idx = 0; idx != positions.size(); ++idx) {
	      header << "ngram " << (idx + 1) << '=' << GetHeader().counts_pruned[idx] << '\n';
	    }
	    header << '\n';
	    const std::string text(header.str());
	    util::WriteOrThrow(out_fd_.get(), text.data(), text.size());
	  }

	  util::FakeOFStream out(out_fd_.get());
	  for (unsigned order = 1; order <= positions.size(); ++order) {
	    out << "\\" << order << "-grams:" << '\n';
	    // Every order except the last one gets a trailing backoff column.
	    const bool with_backoff = (order != positions.size());
	    for (NGramStream gram(positions[order - 1]); gram; ++gram) {
	      // NOTE(review): the comment that used to sit here mentioned correcting
	      // for numerical precision, but no adjustment is visible in this
	      // function; prob is handed to the stream as stored.
	      const WordIndex *word = gram->begin();
	      out << gram->Value().complete.prob << '\t' << vocab.Lookup(*word);
	      for (++word; word != gram->end(); ++word) {
	        out << ' ' << vocab.Lookup(*word);
	      }
	      if (with_backoff) {
	        out << '\t' << gram->Value().complete.backoff;
	      }
	      out << '\n';
	    }
	    // Blank line between sections.
	    out << '\n';
	  }
	  out << "\\end\\\n";
	}

	}} // namespaces