blob: 66f1fd9a8f168d4b232645f98b88f58748112eba [file] [log] [blame]
#ifndef LM_BUILDER_PIPELINE_H
#define LM_BUILDER_PIPELINE_H
#include "lm/builder/adjust_counts.hh"
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/header_info.hh"
#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/stream/config.hh"
#include "util/file_piece.hh"
#include <string>
#include <cstddef>
namespace lm { namespace builder {
class Output;
struct PipelineConfig {
std::size_t order;
util::stream::SortConfig sort;
InitialProbabilitiesConfig initial_probs;
util::stream::ChainConfig read_backoffs;
// Estimated vocabulary size. Used for sizing CorpusCount memory and
// initial probing hash table sizing, also in CorpusCount.
lm::WordIndex vocab_estimate;
// Minimum block size to tolerate.
std::size_t minimum_block;
// Number of blocks to use. This will be overridden to 1 if everything fits.
std::size_t block_count;
// n-gram count thresholds for pruning. 0 values means no pruning for
// corresponding n-gram order
std::vector<uint64_t> prune_thresholds; //mjd
bool prune_vocab;
std::string prune_vocab_file;
/* Renumber the vocabulary the way the trie likes it? */
bool renumber_vocabulary;
// What to do with discount failures.
DiscountConfig discount;
// Compute collapsed q values instead of probability and backoff
bool output_q;
/* Computing the perplexity of LMs with different vocabularies is hard. For
* example, the lowest perplexity is attained by a unigram model that
* predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly
* interpolated models will sum to more than 1 because <unk> is duplicated
* (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
* 1 but comes with its own problems). This option will make the vocabulary
* a particular size by replicating <unk> multiple times for purposes of
* computing vocabulary size. It has no effect if the actual vocabulary is
* larger. This parameter serves the same purpose as IRSTLM's "dub".
*/
uint64_t vocab_size_for_unk;
/* What to do the first time <s>, </s>, or <unk> appears in the input. If
* this is anything but THROW_UP, then the symbol will always be treated as
* whitespace.
*/
WarningAction disallowed_symbol_action;
const std::string &TempPrefix() const { return sort.temp_prefix; }
std::size_t TotalMemory() const { return sort.total_memory; }
};
// Takes ownership of text_file and out_arpa.
void Pipeline(PipelineConfig &config, int text_file, Output &output);
}} // namespaces
#endif // LM_BUILDER_PIPELINE_H