src/joshua/decoder/ff/lm/kenlm/lm/builder/lmplz_main.cc - joshua - Git at Google

 #include "lm/builder/output.hh"
 #include "lm/builder/pipeline.hh"
 #include "lm/builder/print.hh"
 #include "lm/lm_exception.hh"
 #include "util/file.hh"
 #include "util/file_piece.hh"
 #include "util/usage.hh"

 #include <iostream>

 #include <boost/program_options.hpp>
 #include <boost/version.hpp>
 #include <vector>

 namespace {
 // Notifier functor for size-valued command line options.  When invoked with
 // the option's text it parses that text with util::ParseSize and stores the
 // result in the std::size_t it was bound to at construction.
 class SizeNotify {
   public:
     // Binds the functor to `out`, which is overwritten on every call.
     // explicit: a bare std::size_t must never silently convert into a notifier.
     explicit SizeNotify(std::size_t &out) : behind_(out) {}

     // Parses `from` as a size and writes it to the bound variable.
     void operator()(const std::string &from) {
       behind_ = util::ParseSize(from);
     }

   private:
     // Destination of the parsed size.  Held by reference, so the bound
     // variable has to outlive every copy of this functor.
     std::size_t &behind_;
 };

 // Creates a string-valued option description for a memory size.  The option
 // is given `default_value` as its default text and a SizeNotify notifier bound
 // to `to`, so the parsed size ends up in `to`.
 boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
   namespace opts = boost::program_options;
   opts::typed_value<std::string> *size_value = opts::value<std::string>();
   // Both setters return the same object, so calling them one after the other
   // is the same as chaining them.
   size_value->notifier(SizeNotify(to));
   size_value->default_value(default_value);
   return size_value;
 }

 // Turns the --prune arguments into one count threshold per n-gram order.
 //
 //   param: threshold strings as given on the command line, lowest order first.
 //   order: order of the model; the returned vector has exactly this many entries.
 //
 // With no arguments every order gets threshold 0.  Otherwise the values must
 // parse as uint64_t, there must not be more of them than `order`, and they
 // must be non-decreasing; the last value is repeated for any remaining orders.
 // Violations are reported by throwing util::Exception.
 std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::size_t order) {
   std::vector<uint64_t> thresholds;
   thresholds.reserve(order);

   // Convert every argument, naming the offending token on failure.
   for (std::size_t i = 0; i < param.size(); ++i) {
     const std::string &token = param[i];
     try {
       thresholds.push_back(boost::lexical_cast<uint64_t>(token));
     } catch(const boost::bad_lexical_cast &) {
       UTIL_THROW(util::Exception, "Bad pruning threshold " << token);
     }
   }

   // Nothing specified: zero for every order.
   if (thresholds.empty()) {
     thresholds.resize(order, 0);
     return thresholds;
   }

   // More thresholds than the model has orders is an error.
   UTIL_THROW_IF(thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << thresholds.size() << " but the model only has order " << order);

   // Each threshold must be at least as large as the one before it.  The first
   // entry needs no check because an unsigned value cannot be below zero.
   for (std::size_t i = 1; i < thresholds.size(); ++i) {
     UTIL_THROW_IF(thresholds[i - 1] > thresholds[i], util::Exception, "Pruning thresholds should be in non-decreasing order.  Otherwise substrings would be removed, which is bad for query-time data structures.");
   }

   // Extend to all orders by repeating the final threshold.
   const uint64_t last = thresholds.back();
   thresholds.resize(order, last);
   return thresholds;
 }

 // Builds the fallback discounts from the --discount_fallback arguments.
 //
 //   param: one to three discount strings, for adjusted counts 1, 2 and 3+.
 //          When fewer than three are given, the last one is reused for the
 //          remaining counts.
 //
 // Returns a Discount with amount[0] = 0 and amount[1..3] set to the parsed
 // values.  Throws util::Exception if `param` is empty, has more than three
 // entries, contains text that does not parse as a float, or contains a value
 // outside [0, count] (NaN included).
 lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
   lm::builder::Discount ret;
   UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
   UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
   ret.amount[0] = 0.0;
   for (unsigned i = 0; i < 3; ++i) {
     // Reuse the last supplied value once the arguments run out.
     const std::string &text = param[i < param.size() ? i : (param.size() - 1)];
     float discount = 0.0f;
     try {
       discount = boost::lexical_cast<float>(text);
     } catch (const boost::bad_lexical_cast &) {
       // Same reporting style as ParsePruning: name the token that failed.
       UTIL_THROW(util::Exception, "Bad fallback discount " << text);
     }
     const float limit = static_cast<float>(i + 1);
     // Phrased as a negated in-range test so that NaN, which compares false
     // against everything, is rejected instead of slipping through.
     UTIL_THROW_IF(!(discount >= 0.0 && discount <= limit), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
     ret.amount[i + 1] = discount;
   }
   return ret;
 }

 } // namespace

 // Command line driver: parses the options into a lm::builder::PipelineConfig,
 // opens the input and output descriptors, and runs lm::builder::Pipeline with
 // an ARPA printer attached.
 //
 // Returns 1 when help is shown (no arguments or --help), when an option check
 // fails, or when an exception is caught; returns 0 otherwise.
 int main(int argc, char *argv[]) {
   try {
     namespace po = boost::program_options;
     po::options_description options("Language model building options");
     lm::builder::PipelineConfig pipeline;

     std::string text, arpa;
     std::vector<std::string> pruning;
     std::vector<std::string> discount_fallback;
     // Values used when --discount_fallback is given without arguments.
     std::vector<std::string> discount_fallback_default;
     discount_fallback_default.push_back("0.5");
     discount_fallback_default.push_back("1");
     discount_fallback_default.push_back("1.5");
     // Initialised so it holds a defined value on every path.
     bool verbose_header = false;

     options.add_options()
       ("help,h", po::bool_switch(), "Show this help message")
       ("order,o", po::value<std::size_t>(&pipeline.order)
 #if BOOST_VERSION >= 104200
          ->required()
 #endif
          , "Order of the model")
       ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI.  If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
       ("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
       ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
       ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
       ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
       ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
       ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
       ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
       ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
       ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
       ("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
       ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
       ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
       ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities.  See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
       ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold.  Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above.  The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
       ("limit_vocab_file", po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
       ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons.  It can also fail if these values are out of range.  This option falls back to user-specified discounts when the closed-form estimate fails.  Note that this option is generally a bad idea: you should deduplicate your corpus instead.  However, class-based models need custom discounts because they lack singleton unigrams.  Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
     po::variables_map vm;
     po::store(po::parse_command_line(argc, argv, options), vm);

     // Help is handled before po::notify() so that it can be shown even when
     // the required --order option is missing.
     if (argc == 1 || vm["help"].as<bool>()) {
       std::cerr <<
         "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
         "Please cite:\n"
         "@inproceedings{Heafield-estimate,\n"
         "  author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
         "  title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
         "  year = {2013},\n"
         "  month = {8},\n"
         "  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
         "  address = {Sofia, Bulgaria},\n"
         "  url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
         "}\n\n"
         "Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of\n"
         "the model (-o) is the only mandatory option.  As this is an on-disk program,\n"
         "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
         "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
         // Plain '%': "\%" is not a standard escape sequence.
         "Valid units are % for percentage of memory (supported platforms only) and (in\n"
         "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y.  Default is K (*1024).\n";
       uint64_t mem = util::GuessPhysicalMemory();
       if (mem) {
         std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
       } else {
         std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
       }
       std::cerr << options << std::endl;
       return 1;
     }

     po::notify(vm);

     // required() appeared in Boost 1.42.0.
 #if BOOST_VERSION < 104200
     if (!vm.count("order")) {
       std::cerr << "the option '--order' is required but missing" << std::endl;
       return 1;
     }
 #endif

     if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
       std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
       return 1;
     }

     if (vm["skip_symbols"].as<bool>()) {
       pipeline.disallowed_symbol_action = lm::COMPLAIN;
     } else {
       pipeline.disallowed_symbol_action = lm::THROW_UP;
     }

     if (vm.count("discount_fallback")) {
       pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
       pipeline.discount.bad_action = lm::COMPLAIN;
     } else {
       // Unused, just here to prevent the compiler from complaining about uninitialized.
       pipeline.discount.fallback = lm::builder::Discount();
       pipeline.discount.bad_action = lm::THROW_UP;
     }

     // parse pruning thresholds.  These depend on order, so it is not done as a notifier.
     pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);

     // Vocabulary pruning is enabled exactly when a vocabulary file was named.
     pipeline.prune_vocab = !vm["limit_vocab_file"].as<std::string>().empty();

     util::NormalizeTempPrefix(pipeline.sort.temp_prefix);

     lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
     // TODO: evaluate options for these.
     initial.adder_in.total_memory = 32768;
     initial.adder_in.block_count = 2;
     initial.adder_out.total_memory = 32768;
     initial.adder_out.block_count = 2;
     pipeline.read_backoffs = initial.adder_out;

     // Default to stdin (fd 0) and stdout (fd 1); --text / --arpa replace them.
     util::scoped_fd in(0), out(1);
     if (vm.count("text")) {
       in.reset(util::OpenReadOrThrow(text.c_str()));
     }
     if (vm.count("arpa")) {
       out.reset(util::CreateOrThrow(arpa.c_str()));
     }

     // Run the pipeline.  Both descriptors are released from their scoped_fd
     // wrappers when they are handed to the printer and the pipeline.
     try {
       lm::builder::Output output;
       output.Add(new lm::builder::PrintARPA(out.release(), verbose_header));
       lm::builder::Pipeline(pipeline, in.release(), output);
     } catch (const util::MallocException &e) {
       std::cerr << e.what() << std::endl;
       std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
       return 1;
     }
     util::PrintUsage(std::cerr);
     return 0;
   } catch (const std::exception &e) {
     std::cerr << e.what() << std::endl;
     return 1;
   }
 }
	#include "lm/builder/output.hh"
	#include "lm/builder/pipeline.hh"
	#include "lm/builder/print.hh"
	#include "lm/lm_exception.hh"
	#include "util/file.hh"
	#include "util/file_piece.hh"
	#include "util/usage.hh"

	#include <iostream>

	#include <boost/program_options.hpp>
	#include <boost/version.hpp>
	#include <vector>

	namespace {
	// Notifier functor for size-valued command line options.  When invoked with
	// the option's text it parses that text with util::ParseSize and stores the
	// result in the std::size_t it was bound to at construction.
	class SizeNotify {
	  public:
	    // Binds the functor to `out`, which is overwritten on every call.
	    // explicit: a bare std::size_t must never silently convert into a notifier.
	    explicit SizeNotify(std::size_t &out) : behind_(out) {}

	    // Parses `from` as a size and writes it to the bound variable.
	    void operator()(const std::string &from) {
	      behind_ = util::ParseSize(from);
	    }

	  private:
	    // Destination of the parsed size.  Held by reference, so the bound
	    // variable has to outlive every copy of this functor.
	    std::size_t &behind_;
	};

	// Creates a string-valued option description for a memory size.  The option
	// is given `default_value` as its default text and a SizeNotify notifier bound
	// to `to`, so the parsed size ends up in `to`.
	//
	// The return type and `default_value` are pointers: the expression below
	// yields a typed_value pointer, and callers pass string literals such as "8K".
	boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
	  return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
	}

	// Turns the --prune arguments into one count threshold per n-gram order.
	//
	//   param: threshold strings as given on the command line, lowest order first.
	//   order: order of the model; the returned vector has exactly this many entries.
	//
	// With no arguments every order gets threshold 0.  Otherwise the values must
	// parse as uint64_t, there must not be more of them than `order`, and they
	// must be non-decreasing; the last value is repeated for any remaining orders.
	// Violations are reported by throwing util::Exception.
	std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::size_t order) {
	  std::vector<uint64_t> thresholds;
	  thresholds.reserve(order);

	  // Convert every argument, naming the offending token on failure.
	  for (std::size_t i = 0; i < param.size(); ++i) {
	    const std::string &token = param[i];
	    try {
	      thresholds.push_back(boost::lexical_cast<uint64_t>(token));
	    } catch(const boost::bad_lexical_cast &) {
	      UTIL_THROW(util::Exception, "Bad pruning threshold " << token);
	    }
	  }

	  // Nothing specified: zero for every order.
	  if (thresholds.empty()) {
	    thresholds.resize(order, 0);
	    return thresholds;
	  }

	  // More thresholds than the model has orders is an error.
	  UTIL_THROW_IF(thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << thresholds.size() << " but the model only has order " << order);

	  // Each threshold must be at least as large as the one before it.  The first
	  // entry needs no check because an unsigned value cannot be below zero.
	  for (std::size_t i = 1; i < thresholds.size(); ++i) {
	    UTIL_THROW_IF(thresholds[i - 1] > thresholds[i], util::Exception, "Pruning thresholds should be in non-decreasing order. Otherwise substrings would be removed, which is bad for query-time data structures.");
	  }

	  // Extend to all orders by repeating the final threshold.
	  const uint64_t last = thresholds.back();
	  thresholds.resize(order, last);
	  return thresholds;
	}

	// Builds the fallback discounts from the --discount_fallback arguments.
	//
	//   param: one to three discount strings, for adjusted counts 1, 2 and 3+.
	//          When fewer than three are given, the last one is reused for the
	//          remaining counts.
	//
	// Returns a Discount with amount[0] = 0 and amount[1..3] set to the parsed
	// values.  Throws util::Exception if `param` is empty, has more than three
	// entries, contains text that does not parse as a float, or contains a value
	// outside [0, count] (NaN included).
	lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
	  lm::builder::Discount ret;
	  UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
	  UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
	  ret.amount[0] = 0.0;
	  for (unsigned i = 0; i < 3; ++i) {
	    // Reuse the last supplied value once the arguments run out.
	    const std::string &text = param[i < param.size() ? i : (param.size() - 1)];
	    float discount = 0.0f;
	    try {
	      discount = boost::lexical_cast<float>(text);
	    } catch (const boost::bad_lexical_cast &) {
	      // Same reporting style as ParsePruning: name the token that failed.
	      UTIL_THROW(util::Exception, "Bad fallback discount " << text);
	    }
	    const float limit = static_cast<float>(i + 1);
	    // Phrased as a negated in-range test so that NaN, which compares false
	    // against everything, is rejected instead of slipping through.
	    UTIL_THROW_IF(!(discount >= 0.0 && discount <= limit), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
	    ret.amount[i + 1] = discount;
	  }
	  return ret;
	}

	} // namespace

	// Command line driver: parses the options into a lm::builder::PipelineConfig,
	// opens the input and output descriptors, and runs lm::builder::Pipeline with
	// an ARPA printer attached.
	//
	// Returns 1 when help is shown (no arguments or --help), when an option check
	// fails, or when an exception is caught; returns 0 otherwise.
	int main(int argc, char *argv[]) {
	  try {
	    namespace po = boost::program_options;
	    po::options_description options("Language model building options");
	    lm::builder::PipelineConfig pipeline;

	    std::string text, arpa;
	    std::vector<std::string> pruning;
	    std::vector<std::string> discount_fallback;
	    // Values used when --discount_fallback is given without arguments.
	    std::vector<std::string> discount_fallback_default;
	    discount_fallback_default.push_back("0.5");
	    discount_fallback_default.push_back("1");
	    discount_fallback_default.push_back("1.5");
	    // Initialised so it holds a defined value on every path.
	    bool verbose_header = false;

	    options.add_options()
	      ("help,h", po::bool_switch(), "Show this help message")
	      ("order,o", po::value<std::size_t>(&pipeline.order)
	#if BOOST_VERSION >= 104200
	         ->required()
	#endif
	         , "Order of the model")
	      ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
	      ("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
	      ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
	      ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
	      ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
	      ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
	      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
	      ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
	      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
	      ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
	      ("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
	      ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
	      ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
	      ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
	      ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
	      ("limit_vocab_file", po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
	      ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
	    po::variables_map vm;
	    po::store(po::parse_command_line(argc, argv, options), vm);

	    // Help is handled before po::notify() so that it can be shown even when
	    // the required --order option is missing.
	    if (argc == 1 || vm["help"].as<bool>()) {
	      std::cerr <<
	        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
	        "Please cite:\n"
	        "@inproceedings{Heafield-estimate,\n"
	        " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
	        " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
	        " year = {2013},\n"
	        " month = {8},\n"
	        " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
	        " address = {Sofia, Bulgaria},\n"
	        " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
	        "}\n\n"
	        "Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
	        "the model (-o) is the only mandatory option. As this is an on-disk program,\n"
	        "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
	        "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
	        // Plain '%': "\%" is not a standard escape sequence.
	        "Valid units are % for percentage of memory (supported platforms only) and (in\n"
	        "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y. Default is K (*1024).\n";
	      uint64_t mem = util::GuessPhysicalMemory();
	      if (mem) {
	        std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
	      } else {
	        std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
	      }
	      std::cerr << options << std::endl;
	      return 1;
	    }

	    po::notify(vm);

	    // required() appeared in Boost 1.42.0.
	#if BOOST_VERSION < 104200
	    if (!vm.count("order")) {
	      std::cerr << "the option '--order' is required but missing" << std::endl;
	      return 1;
	    }
	#endif

	    if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
	      std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
	      return 1;
	    }

	    if (vm["skip_symbols"].as<bool>()) {
	      pipeline.disallowed_symbol_action = lm::COMPLAIN;
	    } else {
	      pipeline.disallowed_symbol_action = lm::THROW_UP;
	    }

	    if (vm.count("discount_fallback")) {
	      pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
	      pipeline.discount.bad_action = lm::COMPLAIN;
	    } else {
	      // Unused, just here to prevent the compiler from complaining about uninitialized.
	      pipeline.discount.fallback = lm::builder::Discount();
	      pipeline.discount.bad_action = lm::THROW_UP;
	    }

	    // parse pruning thresholds. These depend on order, so it is not done as a notifier.
	    pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);

	    // Vocabulary pruning is enabled exactly when a vocabulary file was named.
	    pipeline.prune_vocab = !vm["limit_vocab_file"].as<std::string>().empty();

	    util::NormalizeTempPrefix(pipeline.sort.temp_prefix);

	    lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
	    // TODO: evaluate options for these.
	    initial.adder_in.total_memory = 32768;
	    initial.adder_in.block_count = 2;
	    initial.adder_out.total_memory = 32768;
	    initial.adder_out.block_count = 2;
	    pipeline.read_backoffs = initial.adder_out;

	    // Default to stdin (fd 0) and stdout (fd 1); --text / --arpa replace them.
	    util::scoped_fd in(0), out(1);
	    if (vm.count("text")) {
	      in.reset(util::OpenReadOrThrow(text.c_str()));
	    }
	    if (vm.count("arpa")) {
	      out.reset(util::CreateOrThrow(arpa.c_str()));
	    }

	    // Run the pipeline.  Both descriptors are released from their scoped_fd
	    // wrappers when they are handed to the printer and the pipeline.
	    try {
	      lm::builder::Output output;
	      output.Add(new lm::builder::PrintARPA(out.release(), verbose_header));
	      lm::builder::Pipeline(pipeline, in.release(), output);
	    } catch (const util::MallocException &e) {
	      std::cerr << e.what() << std::endl;
	      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
	      return 1;
	    }
	    util::PrintUsage(std::cerr);
	    return 0;
	  } catch (const std::exception &e) {
	    std::cerr << e.what() << std::endl;
	    return 1;
	  }
	}