#include "lm/ngram_query.hh"
#include "lm/model.hh"
#include "lm/word_index.hh"
#include "lm/interpolate/enumerate_global_vocab.hh"
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cmath>
#include <map>
#include <boost/program_options.hpp>
#include <boost/version.hpp>
#include <boost/foreach.hpp>
#include "util/fixed_array.hh"
#include <Eigen/Eigen>
typedef Eigen::MatrixXf FMatrix;
typedef Eigen::VectorXf FVector;
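// When true, feature 0 is a constant bias term and the per-model log
// probabilities occupy slots 1..N; otherwise the models occupy slots 0..N-1.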
bool HAS_BIAS = true;
using namespace lm::ngram;
using namespace lm;
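// Log probability (base 10, as KenLM reports it) of `word` given `ctx`, where
// `ctx` lists the context left-to-right; it is reversed below because KenLM
// expects the most recent word first.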
inline float logProb(Model * model, const std::vector<std::string>& ctx, const std::string& word) {
// Horribly inefficient
const Vocabulary &vocab = model->GetVocabulary();
State nextState; //throwaway
WordIndex word_idx = vocab.Index(word);
  // Reverse the context: KenLM expects the most recent word first.
  std::vector<WordIndex> context_idx(ctx.size());
  for (unsigned int i = 0; i < ctx.size(); i++) {
    context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]);
  }
  // The context range is half-open, so the end pointer is one past the oldest word.
  FullScoreReturn score = model->FullScoreForgotState(&context_idx[0], &context_idx[0] + ctx.size(), word_idx, nextState);
float ret = score.prob;
//std::cerr << "w: " << word << " p: " << ret << std::endl;
return ret;
}
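// Fill `v` with one feature per model: that model's log probability of `word`
// given `ctx`, preceded by a constant 1.0 bias feature when HAS_BIAS is set.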
void set_features(const std::vector<std::string>& ctx,
const std::string& word,
const std::vector<Model *>& models,
FVector& v) {
//std::cerr << "setting feats for " << word << std::endl;
if (HAS_BIAS) {
v(0) = 1;
for (unsigned i=0; i < models.size(); ++i)
v(i + 1) = logProb(models[i], ctx, word);
} else {
for (unsigned i=0; i < models.size(); ++i)
v(i) = logProb(models[i], ctx, word);
}
}
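// Map the global vocabulary and the tuning corpus into each model's own
// WordIndex space. Sentences are stored reversed (last word first) and padded
// with five <s> tokens so a fixed-length reversed context window can be read
// straight out of the translated sentence.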
void translate_input(
const std::vector<std::vector<std::string> >& corpus,
const std::vector<std::string>& gvocab,
const std::vector<Model *>& models,
    std::vector<std::vector<std::vector<WordIndex> > >& translated_corpus,
    std::vector<std::vector<WordIndex> >& translated_vocab
) {
translated_corpus.resize(models.size());
translated_vocab.resize(models.size());
for (unsigned mn=0; mn < models.size(); ++mn) { // models
const Vocabulary &vocab = models[mn]->GetVocabulary();
for (unsigned i = 0; i < gvocab.size(); ++i) {
translated_vocab[mn].push_back(vocab.Index(gvocab[i]));
}
translated_corpus[mn].resize(corpus.size());
for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
const std::vector<std::string>& sentence = corpus[ci];
for (int t = sentence.size() -1; t >= 0; --t) { // words in sentence
translated_corpus[mn][ci].push_back(vocab.Index(sentence[t]));
}
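      // Pad with five <s> so a full five-word reversed context exists even at
      // the start of the sentence.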
for (int i=0; i<5; ++i) {
translated_corpus[mn][ci].push_back(vocab.Index("<s>"));
}
}
}
}
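// Train interpolation weights on the tuning corpus. Each outer pass is one
// Newton iteration: at every position the per-model log probabilities of every
// vocabulary word are the features of a log-linear model; the gradient is the
// expected feature vector minus the observed one and the Hessian is the
// feature covariance under the model distribution. This "fast" variant reads
// contexts out of the pre-translated corpus instead of re-querying by string.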
void train_params_fast(
const std::vector<std::vector<std::string> >& corpus,
const std::vector<std::string>& vocab,
const std::vector<Model *>& models) {
using namespace std;
// model / sentence / words in sentence in reverse order with <s> padding
std::vector<std::vector<std::vector<WordIndex> > > t_corpus;
std::vector<std::vector<WordIndex> > t_vocab;
translate_input(corpus, vocab, models, t_corpus, t_vocab);
const int ITERATIONS = 10;
const int nlambdas = models.size() + (HAS_BIAS ? 1 : 0); // bias + #models
FVector params = FVector::Zero(nlambdas);
vector<FVector> feats(vocab.size(), params);
vector<float> us(vocab.size(), 0);
vector<float> ps(vocab.size(), 0);
FVector grad = FVector::Zero(nlambdas);
FMatrix H = FMatrix::Zero(nlambdas, nlambdas);
FVector ef = FVector::Zero(nlambdas);
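  // grad accumulates the gradient of the negative log-likelihood, H its
  // Hessian, and ef the expected feature vector at the current position.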
for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
grad.setZero();
H.setZero();
double loss = 0;
unsigned numchars = 0;
for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
const vector<string>& sentence = corpus[ci];
      for (int t = sentence.size() - 1; t >= 0; --t) { // words in sentence
        ++numchars;
        double z = 0;     // per-position normalizer; must be reset for every word
        int ref_word = 0; // TODO: index of the observed word in the global vocab
for (unsigned i = 0; i < vocab.size(); ++i) { // vocab
// set_features(context, vocab[i], models, feats[i]);
for (unsigned j=0; j < models.size(); ++j) {
            // The observed (reference) word at this position would be t_corpus[j][ci][t];
            // here every global-vocab word i is scored under model j instead.
            WordIndex word_idx = t_vocab[j][i];
State nextState; //throwaway
FullScoreReturn score = models[j]->FullScoreForgotState(&(t_corpus[j][ci][t]), &(t_corpus[j][ci][t+5]), word_idx, nextState);
feats[i](j) = score.prob;
// feats[i](j) = logProb(models[j], ctx, word);
}
us[i] = params.dot(feats[i]);
z += exp(double(us[i]));
}
//std::cerr << "there..." << std::endl;
const float logz = log(z);
// expected feature values
ef.setZero();
for (unsigned i = 0; i < vocab.size(); ++i) {
ps[i] = expf(us[i] - logz);
ef += ps[i] * feats[i];
}
loss -= log(ps[ref_word]);
const FVector& reffeats = feats[ref_word];
grad += ef - reffeats;
        // Hessian of the negative log-likelihood: summed over the vocabulary this is
        // E[f f^T] - E[f] E[f]^T, the covariance of the feature vector.
for (unsigned i = 0; i < vocab.size(); ++i)
H.noalias() += ps[i] * feats[i] * feats[i].transpose() -
ps[i] * feats[i] * ef.transpose();
// this should just be the state for each model
}
cerr << ".";
}
cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl;
    // Newton update: params <- params - H^{-1} * grad.
    params -= H.colPivHouseholderQr().solve(grad);
cerr << params << endl;
}
}
//const util::FixedArray<Model *>& models)
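// Slower reference implementation of the same objective: features are
// recomputed through set_features()/logProb() with per-word string lookups
// instead of the pre-translated corpus used by train_params_fast().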
void train_params(
const std::vector<std::vector<std::string> >& corpus,
const std::vector<std::string>& vocab,
const std::vector<Model *>& models) {
using namespace std;
vector<string> context(5, "<s>");
const int ITERATIONS = 10;
const int nlambdas = models.size() + (HAS_BIAS ? 1 : 0); // bias + #models
FVector params = FVector::Zero(nlambdas);
vector<FVector> feats(vocab.size(), params);
vector<float> us(vocab.size(), 0);
vector<float> ps(vocab.size(), 0);
FVector grad = FVector::Zero(nlambdas);
FMatrix H = FMatrix::Zero(nlambdas, nlambdas);
FVector ef = FVector::Zero(nlambdas);
for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
grad.setZero();
H.setZero();
double loss = 0;
unsigned numchars = 0;
for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
const vector<string>& sentence = corpus[ci];
      // Reset to the sentence-initial context; assign() also drops the words
      // pushed onto the context during the previous sentence.
      context.assign(5, "<s>");
for (unsigned t = 0; t < sentence.size(); ++t) { // words in sentence
++numchars;
const string& ref_word_string = sentence[t];
int ref_word = 0; // TODO
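        // Sketch (not part of the original): one way to resolve the TODO above
        // would be to build a lookup table over `vocab` once, before the loops,
        // and use it here; `vocab_index` below is a hypothetical helper.
        //   std::map<std::string, int> vocab_index;  // vocab_index[vocab[i]] = i
        //   ref_word = vocab_index[ref_word_string];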
double z = 0;
//std::cerr << "here..." << std::endl;
for (unsigned i = 0; i < vocab.size(); ++i) { // vocab
set_features(context, vocab[i], models, feats[i]);
us[i] = params.dot(feats[i]);
z += exp(double(us[i]));
}
//std::cerr << "there..." << std::endl;
context.push_back(ref_word_string);
const float logz = log(z);
// expected feature values
ef.setZero();
for (unsigned i = 0; i < vocab.size(); ++i) {
ps[i] = expf(us[i] - logz);
ef += ps[i] * feats[i];
}
loss -= log(ps[ref_word]);
const FVector& reffeats = feats[ref_word];
grad += ef - reffeats;
        // Hessian of the negative log-likelihood: summed over the vocabulary this is
        // E[f f^T] - E[f] E[f]^T, the covariance of the feature vector.
for (unsigned i = 0; i < vocab.size(); ++i)
H.noalias() += ps[i] * feats[i] * feats[i].transpose() -
ps[i] * feats[i] * ef.transpose();
// this should just be the state for each model
}
cerr << ".";
}
cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl;
    // Newton update: params <- params - H^{-1} * grad.
    params -= H.colPivHouseholderQr().solve(grad);
cerr << params << endl;
}
}
int main(int argc, char** argv) {
std::string tuning_data;
std::vector<std::string> lms;
try {
namespace po = boost::program_options;
po::options_description options("train-params");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature")
("tuning_data,t", po::value<std::string>(&tuning_data), "File to tune perplexity on")
("model,m", po::value<std::vector<std::string> >(&lms), "Language models in KenLM format to interpolate");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
// Display help
if(argc == 1 || vm["help"].as<bool>()) {
std::cerr << options << std::endl;
return 1;
}
if (vm["no_bias_term"].as<bool>())
HAS_BIAS = false;
    if (vm.count("model"))
      lms = vm["model"].as<std::vector<std::string> >();
    if (vm.count("tuning_data"))
      tuning_data = vm["tuning_data"].as<std::string>();
}
catch(const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
if (lms.size() < 2) {
std::cerr << "Please specify at least two language model files with -m LM.KLM\n";
return 1;
}
if (tuning_data.empty()) {
std::cerr << "Please specify tuning set with -t FILE.TXT\n";
return 1;
}
//Growable vocab here
//GrowableVocab gvoc(100000); //dummy default
std::map<std::string, int*> vmap;
util::FixedArray<WordIndex> vm(2);
  // The callback records, for every word seen in any model, that word's index in each model.
EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(&vmap, lms.size());
// EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(vm);
Config cfg;
cfg.enumerate_vocab = (EnumerateVocab *) globalVocabBuilder;
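  // Each model built with this config enumerates its vocabulary through the
  // callback above while it loads.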
//load models
//util::FixedArray<Model *> models(lms.size());
std::vector<Model *> models;
  for (size_t i = 0; i < lms.size(); i++) {
    std::cerr << "Loading LM file: " << lms[i] << std::endl;
    // Hack: tell the vocab callback which model is currently being loaded so it
    // can file that model's word indices in the right slot.
    globalVocabBuilder->SetCurModel(i);
//models[i] = new Model(lms[i].c_str());
Model * this_model = new Model(lms[i].c_str(), cfg);
models.push_back( this_model );
}
//assemble vocabulary vector
std::vector<std::string> vocab;
std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl;
for(std::map<std::string, int*>::iterator iter = vmap.begin(); iter != vmap.end(); ++iter) {
vocab.push_back(iter->first);
}
std::cerr << "Vocab vector has size: " << vocab.size() << std::endl;
//load context sorted ngrams into vector of vectors
std::vector<std::vector<std::string> > corpus;
std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl;
  std::ifstream infile(tuning_data.c_str());
for(std::string line; std::getline(infile, line); ) {
    std::vector<std::string> words;
    {
std::stringstream stream(line);
std::string word;
while(stream >> word) {
words.push_back(word);
}
}
corpus.push_back(words);
}
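  // train_params() above is the string-based reference implementation; the
  // translated (fast) variant is what actually runs.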
train_params_fast(corpus, vocab, models);
return 0;
}