| #ifndef LM_BUILDER_CORPUS_COUNT_H |
| #define LM_BUILDER_CORPUS_COUNT_H |
| |
| #include "lm/lm_exception.hh" |
| #include "lm/word_index.hh" |
| #include "util/scoped.hh" |
| |
| #include <cstddef> |
| #include <string> |
| #include <stdint.h> |
| #include <vector> |
| |
| namespace util { |
| class FilePiece; |
| namespace stream { |
| class ChainPosition; |
| } // namespace stream |
| } // namespace util |
| |
| namespace lm { |
| namespace builder { |
| |
| class CorpusCount { |
| public: |
| // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size |
| static float DedupeMultiplier(std::size_t order); |
| |
| // How much memory vocabulary will use based on estimated size of the vocab. |
| static std::size_t VocabUsage(std::size_t vocab_estimate); |
| |
| // token_count: out. |
| // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value. |
| CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); |
| |
| void Run(const util::stream::ChainPosition &position); |
| |
| private: |
| util::FilePiece &from_; |
| int vocab_write_; |
| uint64_t &token_count_; |
| WordIndex &type_count_; |
| std::vector<bool>& prune_words_; |
| const std::string& prune_vocab_filename_; |
| |
| std::size_t dedupe_mem_size_; |
| util::scoped_malloc dedupe_mem_; |
| |
| WarningAction disallowed_symbol_action_; |
| }; |
| |
| } // namespace builder |
| } // namespace lm |
| #endif // LM_BUILDER_CORPUS_COUNT_H |