#include "lm/ngram_query.hh"
#include "lm/model.hh"
#include "lm/word_index.hh"
#include "lm/interpolate/enumerate_global_vocab.hh"

#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <map>
#include <iomanip>
#include <algorithm>
#include <cmath>

#include <boost/program_options.hpp>
#include <boost/version.hpp>
#include <boost/foreach.hpp>

#include "util/fixed_array.hh"

#include <Eigen/Eigen>
#include <Eigen/Dense>

// typedef Eigen::MatrixXf FMatrix;
// typedef Eigen::VectorXf FVector;
typedef Eigen::MatrixXd DMatrix;
typedef Eigen::VectorXd DVector;

bool HAS_BIAS = true;

using namespace lm::ngram;
using namespace lm;

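// Log probability of word_idx following ctx under a single KenLM model.
// ctx is given in natural (left-to-right) order and is reversed here because
// KenLM expects the most recent context word first.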
inline double logProb(Model *model, const std::vector<std::string> &ctx,
                      WordIndex word_idx) {
  // Horribly inefficient
  const Vocabulary &vocab = model->GetVocabulary();

  State nextState;  // throwaway

  std::vector<WordIndex> context_idx(ctx.size());

  // reverse context
  for (std::size_t i = 0; i < ctx.size(); i++) {
    context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]);
  }

  FullScoreReturn score = model->FullScoreForgotState(
      &context_idx[0], &(context_idx[ctx.size() - 1]), word_idx, nextState);

  double ret = score.prob;
  // std::cerr << "w: " << word << " p: " << ret << std::endl;
  return ret;
}

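// As above, but looks the word up by its string. If the word is unknown to
// this model, the <unk> probability precomputed for the current context is
// returned instead of querying the model again (see the per-context caching
// in train_params).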
inline double logProb(Model *model, double unkprob,
                      const std::vector<std::string> &ctx,
                      const std::string &word) {
  // Horribly inefficient
  const Vocabulary &vocab = model->GetVocabulary();

  WordIndex word_idx = vocab.Index(word);
  if (word_idx == lm::kUNK) return unkprob;

  return logProb(model, ctx, word_idx);
}

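// Fill v with the feature vector for (ctx, word): one log probability per
// model, preceded by a constant 1 when a bias term is used. The interpolated
// model is log-linear in these features: log p(word | ctx) = v . params - log Z.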
void set_features(const std::vector<std::string> &ctx, const std::string &word,
                  const std::vector<Model *> &models,
                  const std::vector<double> &unkprobs, DVector &v) {
  if (HAS_BIAS) {
    v(0) = 1;
    for (std::size_t i = 0; i < models.size(); ++i)
      v(i + 1) = logProb(models[i], unkprobs[i], ctx, word);
  } else {
    for (std::size_t i = 0; i < models.size(); ++i)
      v(i) = logProb(models[i], unkprobs[i], ctx, word);
  }
}

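// Tune the interpolation weights on the tuning corpus with the safeguarded
// Newton method described in the comments below.
//
// The interpolated model is log-linear in the per-model log probabilities:
//   p(w | ctx) = exp(params . f(ctx, w)) / Z(ctx), with
//   Z(ctx) = sum over the global vocab of exp(params . f(ctx, w')).
// The objective is the average negative log likelihood of the tuning data
// (reported as log(PPL)). Each word contributes
//   gradient  f - E[f]   and   Hessian  -(E[f f^T] - E[f] E[f]^T)
// to the log likelihood, with expectations taken under the current
// interpolated model; these are accumulated in the inner loops and negated at
// the end of each iteration.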
void train_params(const std::vector<std::vector<std::string> > &corpus,
                  const std::vector<std::string> &vocab,
                  const std::vector<Model *> &models) {
  using namespace std;

  // A safeguarded Newton's method to find optimum parameters.
  // Reverts to steepest-descent linesearch if Newton step does not improve
  // objective.
  //
  // The two Boolean flags below, AllowExtrapolation and AllowNegativeParams,
  // control which constraints are imposed on the weights.

  bool AllowExtrapolation = true;   // if true, params need not sum to one
  bool AllowNegativeParams = true;  // if true, params can be negative
  const int ITERATIONS = 20;
  double minstepsize = 1.0e-9;      // convergence criterion
  int context_size = 5;             // (context_size+1)-grams considered in perplexity
  double stepdecreasefactor = 0.1;  // if step unsuccessful
  double initstepsize = 1.0;        // Initial step size
  std::size_t linesinstartercorpus = 12;  // The first few lines are tuned
                                          // first, to find basin of attraction
                                          // for Newton
  // bias + #models
  const std::size_t nlambdas = models.size() + (HAS_BIAS ? 1 : 0);

  // initialize to sum to 1
  DVector params = DVector::Constant(nlambdas, 1.0 / nlambdas);
  DMatrix N = DMatrix::Constant(
      nlambdas, nlambdas - 1,
      -1.0 / sqrt((nlambdas - 1) * (nlambdas - 1) + nlambdas - 1.0));
  for (unsigned i = 0; i < nlambdas - 1; ++i)
    N(i, i) = N(i, i) * (1.0 - nlambdas);
  // N is nullspace matrix, each column sums to zero
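  // For example, with nlambdas = 3 the two columns of N are approximately
  // (0.816, -0.408, -0.408)^T and (-0.408, 0.816, -0.408)^T, so any update of
  // the form N * y leaves the sum of the weights unchanged.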

  cerr << setprecision(16) << "++ Parameter training ++" << endl;
  if (AllowExtrapolation)
    cerr << " Allowing extrapolation (sharpening and flattening of individual "
            "LM distributions)" << endl;
  else
    cerr << " Interpolating only (not sharpening or flattening individual LM "
            "distributions)" << endl;
  if (AllowNegativeParams)
    cerr << " Allowing negative parameters\n"
         << " (more general but slow and rarely useful\n"
         << " -LM with negative weight has probability rankings reversed and "
            "is weighted more highly than all LMs with positive weights)"
         << endl;
  else
    cerr << "Not allowing negative parameters (mild assumption, and faster)"
         << endl;
  cerr << " Maximum number of iterations: " << ITERATIONS << endl;
  cerr << " Minimum step size: " << minstepsize << endl;
  cerr << " Perplexity computed with " << context_size + 1 << "-grams" << endl;

  if ((!AllowExtrapolation) && (nlambdas == 1)) {
    // Nothing to optimize. One parameter, and it sums to one.
    cerr << "Training complete. Best weights:" << endl;
    cerr << setprecision(16) << 1.0 << endl;
    return;
  }

  // Smart initialization of full tuning by tuning on smaller set first
  vector<std::size_t> linestotune;
  if (linesinstartercorpus < corpus.size())
    linestotune.push_back(linesinstartercorpus);
  linestotune.push_back(corpus.size());

  for (std::size_t setiter = 0; setiter < linestotune.size(); ++setiter) {
    cerr << " Now tuning the first " << linestotune[setiter] << " lines"
         << endl;

    vector<DVector> paramhistory;
    double bestppl = 0.0;                        // best recorded ppl
    DVector bestgrad = DVector::Zero(nlambdas);  // corresp. gradient,
                                                 // feasible direction
    DVector bestparams = DVector::Zero(nlambdas);  // corresp. weights
    double maxbestgradstep = 0.0;    // max feasible step in grad. direction
    double stepsize = initstepsize;  // Initial step size

    for (int iter = 0; iter < ITERATIONS; ++iter) {  // iterations
      cerr << "ITERATION " << iter + 1 << " (of max " << ITERATIONS
           << "), step size " << stepsize << " (of min " << minstepsize
           << "), weights: " << endl;
      cerr << params << endl;

      paramhistory.push_back(params);
      // (context_size+1)-gram context window, padded with <s>
      vector<string> context(context_size, "<s>");
      double ppl = 0.0;
      DVector grad = DVector::Zero(nlambdas);
      DMatrix H = DMatrix::Zero(nlambdas, nlambdas);
      cerr << "o";

      std::vector<double> unkprobs(models.size());
      // for each sentence in tuning corpus
      for (std::size_t ci = 0; ci < linestotune[setiter]; ++ci) {
        const vector<string> &sentence = corpus[ci];
        // pad our beginning context
        std::fill(context.begin(), context.end(), "<s>");

        // for each word in sentence
        for (std::size_t t = 0; t < sentence.size(); ++t) {
          // fill in unk probabilities for this context, to avoid having to
          // look them up redundantly later
          for (std::size_t mi = 0; mi < models.size(); ++mi) {
            unkprobs[mi] = logProb(models[mi], context, lm::kUNK);
          }

          DVector feats = DVector::Zero(nlambdas);
          // probs for actual n-gram
          set_features(context, sentence[t], models, unkprobs, feats);

          double z = 0.0;
          double maxlogprob = 0.0;  // Allows us to avoid overflow with
                                    // negative params
          DVector expectfeats = DVector::Zero(nlambdas);
          DMatrix expectfeatmatrix = DMatrix::Zero(nlambdas, nlambdas);
          // Logically, this should be in the loop's scope
          DVector iterfeats = DVector::Zero(nlambdas);

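          // Accumulate Z(ctx), E[f], and E[f f^T] over the whole vocabulary.
          // maxlogprob is a running shift applied inside exp() so the
          // accumulators stay in range even with large or negative weights;
          // whenever a larger log score is seen, the existing sums are
          // rescaled to the new shift.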
          // probs over possible n-grams, for normalization
          for (std::size_t i = 0; i < vocab.size(); ++i) {
            set_features(context, vocab[i], models, unkprobs, iterfeats);
            double logprob = params.dot(iterfeats);
            if (i == 0) {
              // maxlogprob=logprob;  // more precise, less underflow
              maxlogprob = 0.0;  // reduces number of updates
            } else if (logprob > maxlogprob) {
              // Adjust all old values to new scaling
              double adjust = exp(maxlogprob - logprob);
              z *= adjust;
              expectfeats *= adjust;
              expectfeatmatrix *= adjust;
              maxlogprob = logprob;
            }
            // Unnormalized probability mass of vocab[i] under current weights.
            double us = exp(logprob - maxlogprob);

            z += us;
            expectfeats += us * iterfeats;
            expectfeatmatrix += us * (iterfeats * iterfeats.transpose());
          }
          expectfeats /= z;       // Expectation
          expectfeatmatrix /= z;  // Expectation

          // Advance the context window: overwrite the oldest slot with
          // sentence[t], then rotate so the newest word ends up last.
          context[0] = sentence[t];
          std::rotate(context.begin(), context.begin() + 1, context.end());

          // Perplexity (actually log(perplexity))
          ppl += params.dot(feats) - log(z);
          // Gradient
          grad += feats - expectfeats;
          // Hessian
          H += -expectfeatmatrix + expectfeats * expectfeats.transpose();
        }
        cerr << ".";
      }
      ppl *= -1.0 / corpus.size();
      // The gradient and Hessian coefficients cancel out, so don't really need
      // to do this, but it's fast.
      grad *= -1.0 / corpus.size();
      H *= -1.0 / corpus.size();
      cerr << " log(PPL)=" << ppl << " PPL=" << exp(ppl) << endl;

      // Use results to determine next params to evaluate
      if ((ppl < bestppl) || (iter == 0)) {
        // Found a new best
        bestppl = ppl;
        bestparams = params;
        double beststepsize = stepsize;
        if (iter > 0)
          cerr << " New best point found, step size " << beststepsize << endl;
        else
          cerr << " New best point found" << endl;

        bestgrad = grad;
        DVector deltaparams = DVector::Zero(nlambdas);

        bool reverttograd = false;

        {
          double gradnorm = 0.0;
          double solvenorm = 0.0;
          double errnorm = 0.0;
          // Find Newton step
          if (AllowExtrapolation) {
            deltaparams = -H.colPivHouseholderQr().solve(grad);
            Eigen::SelfAdjointEigenSolver<DMatrix> eigensolver(H);
            cerr << "Eigenvalues (negative values should be negligible):\n"
                 << eigensolver.eigenvalues() << endl;
            gradnorm = grad.norm();
            solvenorm = (H * deltaparams).norm();
            errnorm = (grad + H * deltaparams).norm();
          } else {
            // Project gradient to interpolation space
            bestgrad = N * N.transpose() * bestgrad;

            // need to work in nullspace to maintain unit sum
            DMatrix Hnull = DMatrix::Zero(nlambdas - 1, nlambdas - 1);

            // Looks like we don't need the three lines below -- we can do it
            // in-line (if we don't want eigenvalues)
            Hnull = N.transpose() * H * N;
            Eigen::SelfAdjointEigenSolver<DMatrix> eigensolver(Hnull);
            cerr << "Eigenvalues (best if all positive):\n"
                 << eigensolver.eigenvalues() << endl;
            deltaparams =
                -N * Hnull.fullPivHouseholderQr().solve(N.transpose() * grad);
            gradnorm = (N.transpose() * grad).norm();
            solvenorm = (Hnull * deltaparams).norm();
            errnorm = (N.transpose() * grad + Hnull * deltaparams).norm();
          }
          // eventually, params = bestparams + deltaparams;
          cerr << " Error norm " << errnorm << ", gradient norm " << gradnorm
               << ", solution norm " << solvenorm << endl;
          // Check for numerical errors. Don't trust Newton step if they are too
          // big.
          if (errnorm < 1e-12 * std::max(1.0, std::min(gradnorm, solvenorm))) {
            stepsize = 0.0;
            for (std::size_t i = 0; i < nlambdas; i++)
              stepsize += deltaparams(i) * deltaparams(i);
            stepsize = sqrt(stepsize);  // holds length of Newton step
            cerr << "Newton step, length " << stepsize << ": " << endl;
            cerr << deltaparams << endl;

            // Don't let the Newton step get much bigger than last successful
            // step (likely would have to shrink later, anyway)
            if (stepsize > 2.0 * beststepsize) {
              stepsize = 1.5 * beststepsize;
              reverttograd = true;
              cerr << "Reverting to gradient, because Newton step is too large."
                   << endl;
            }
          } else {
            stepsize = 1.5 * beststepsize;
            reverttograd = true;
            cerr << "Reverting to gradient, because Newton step computation "
                    "unsuccessful." << endl;
          }
          // Make the gradient unit norm, in feasible search direction.
          if (!AllowNegativeParams) {
            // Project gradient to be a feasible search direction
            vector<bool> active(nlambdas, false);
            std::size_t numactive = 0;
            for (std::size_t i = 0; i < nlambdas; i++) {
              // Project gradient to inactive constraints
              if ((bestparams(i) == 0) && (bestgrad(i) > 0)) {
                active[i] = true;
                bestgrad(i) = 0.0;  // Do this now, in case extrapolation
                                    // allowed.
                ++numactive;
              }
            }
            if (numactive > 0) {
              if (!AllowExtrapolation) {
                // Project gradient, for activity concerns
                DMatrix tmpN = DMatrix::Constant(
                    nlambdas, nlambdas - 1,
                    -1.0 / sqrt((nlambdas - numactive - 1) *
                                    (nlambdas - numactive - 1) +
                                nlambdas - numactive - 1.0));

                for (std::size_t i = 0; i < nlambdas - 1; ++i)
                  tmpN(i, i) = tmpN(i, i) * (1.0 - (nlambdas - numactive));

                for (std::size_t i = 0; i < nlambdas; ++i) {
                  if (active[i]) {
                    for (std::size_t j = 0; j < nlambdas - 1; ++j) {
                      tmpN(i, j) = 0;
                    }
                  }
                }

                // projected gradient onto unit sum and active set constraints
                bestgrad = -tmpN * tmpN.transpose() * bestgrad;
              }
            }
          }
        }
        double norm = 0.0;
        for (std::size_t i = 0; i < nlambdas; i++)
          norm += bestgrad(i) * bestgrad(i);
        if (norm != 0) {
          bestgrad /= sqrt(norm);
        } else {
          cerr << " Gradient is zero. Exiting." << endl;
          break;
        }
        cerr << "Gradient, unit length: " << endl;
        cerr << bestgrad << endl;

        // Find max step in gradient direction that remains feasible.
        if (!AllowNegativeParams) {
          double limitfraction = 0.5;  // Not 1: If Newton step is bad, probably
                                       // will need to reduce later anyway
          for (std::size_t i = 0; i < nlambdas; i++) {
            if (bestparams(i) - maxbestgradstep * bestgrad(i) < 0) {
              double tmplimitfraction =
                  bestparams(i) / (bestgrad(i) * maxbestgradstep);
              if (tmplimitfraction < limitfraction)
                limitfraction = tmplimitfraction;
            }
          }
          maxbestgradstep = stepsize * limitfraction;
          cerr << " Max grad step: " << maxbestgradstep << endl;
        } else {
          maxbestgradstep = stepsize;
        }

        if (!reverttograd) {
          if (!AllowNegativeParams) {
            for (std::size_t i = 0; i < nlambdas; i++) {
              if (bestparams(i) + deltaparams(i) < 0) {
                // Can't do Newton step. Revert to descent.
                reverttograd = true;
              }
            }
          }
          if (reverttograd) {
            cerr << "Reverting to gradient, since Newton step infeasible:"
                 << endl;
          }
        }

        if (reverttograd) {
          stepsize = maxbestgradstep;
          deltaparams = -bestgrad * stepsize;
        }

        params = bestparams + deltaparams;
        cerr << "Change in weights from best, step size " << stepsize << ": "
             << endl;
        cerr << deltaparams << endl;
      } else {
        // Last attempt failed at being better, so move in gradient direction
        // with reduced step.
        // stepsize reduction factor is empirical
        stepsize = std::min(stepdecreasefactor * stepsize, maxbestgradstep);
        cerr << "Taking smaller step: " << stepsize << endl;
        params = bestparams - bestgrad * stepsize;
      }
      // Clean the parameters up.
      double sumparams = 0.0;
      for (std::size_t i = 0; i < nlambdas; i++) {
        if (!AllowNegativeParams) {
          if (params(i) < 1e-12) {
            // snap to zero, for active set and duplicate weights
            params(i) = 0;
          }
        }
        sumparams += params(i);
      }
      if (!AllowExtrapolation) params /= sumparams;

      bool duplicateentry = false;
      for (std::size_t i = 0; i < paramhistory.size(); ++i) {
        if (params == paramhistory[i]) duplicateentry = true;
      }
      while ((duplicateentry) && (stepsize >= minstepsize)) {
        cerr << "Duplicate weight found: " << endl;
        cerr << params << endl;
        stepsize *= 0.5;  // Step in this direction is duplicate, so try again
                          // with smaller step
        params = bestparams - stepsize * bestgrad;

        sumparams = 0.0;
        for (std::size_t i = 0; i < nlambdas; i++) {
          if (!AllowNegativeParams) {
            if (params(i) < 1e-12) params(i) = 0;
          }
          sumparams += params(i);
        }
        if (!AllowExtrapolation) params /= sumparams;

        duplicateentry = false;
        for (std::size_t i = 0; i < paramhistory.size(); ++i) {
          if (params == paramhistory[i]) duplicateentry = true;
        }
      }
      if (stepsize < minstepsize) break;  // No need to make another step
    }

    params = bestparams;  // So that next setiter is correct
    cerr << "Training complete. Best weights:" << endl;
    cerr << params << endl;
  }
}

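// Command-line driver. A typical invocation (the binary name here is only an
// example) looks like:
//   ./train_params -t tuning.txt -m first.klm -m second.klm [-m third.klm ...]
// Pass -B to drop the bias feature.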
int main(int argc, char **argv) {
  std::string tuning_data;
  std::vector<std::string> lms;

  try {
    namespace po = boost::program_options;
    po::options_description options("train-params");

    options.add_options()
        ("help,h", po::bool_switch(), "Show this help message")
        ("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature")
        ("tuning_data,t", po::value<std::string>(&tuning_data),
         "File to tune perplexity on")
        ("model,m", po::value<std::vector<std::string> >(&lms),
         "Language models in KenLM format to interpolate");
    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, options), vm);

    // Display help
    if (argc == 1 || vm["help"].as<bool>()) {
      std::cerr << options << std::endl;
      return 1;
    }
    if (vm["no_bias_term"].as<bool>()) HAS_BIAS = false;
    lms = vm["model"].as<std::vector<std::string> >();
    tuning_data = vm["tuning_data"].as<std::string>();
  } catch (const std::exception &e) {
    std::cerr << e.what() << std::endl;
    return 1;
  }
  if (lms.size() < 2) {
    std::cerr
        << "Please specify at least two language model files with -m LM.KLM\n";
    return 1;
  }
  if (tuning_data.empty()) {
    std::cerr << "Please specify tuning set with -t FILE.TXT\n";
    return 1;
  }

  // Growable vocab here
  // GrowableVocab gvoc(100000);  // dummy default

  // Global vocabulary map, filled in by EnumerateGlobalVocab as each model is
  // loaded below.
  std::map<std::string, int *> vmap;

  // Vocabulary enumeration callback shared by all of the models.
  EnumerateGlobalVocab *globalVocabBuilder =
      new EnumerateGlobalVocab(&vmap, lms.size());

  Config cfg;
  cfg.enumerate_vocab = (EnumerateVocab *)globalVocabBuilder;

  // load models
  std::vector<Model *> models;
  for (std::size_t i = 0; i < lms.size(); i++) {
    std::cerr << "Loading LM file: " << lms[i] << std::endl;

    // Tell the vocab enumerator which model is currently being loaded.
    globalVocabBuilder->SetCurModel(i);

    Model *this_model = new Model(lms[i].c_str(), cfg);
    models.push_back(this_model);
  }

  // assemble vocabulary vector
  std::vector<std::string> vocab;
  std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl;

  for (std::map<std::string, int *>::iterator iter = vmap.begin();
       iter != vmap.end(); ++iter) {
    vocab.push_back(iter->first);
  }
  std::cerr << "Vocab vector has size: " << vocab.size() << std::endl;

  // load context sorted ngrams into vector of vectors
  std::vector<std::vector<std::string> > corpus;

  std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl;
  std::ifstream infile(tuning_data.c_str());

  for (std::string line; std::getline(infile, line);) {
    std::vector<std::string> words;
    std::stringstream stream(line);
    std::string word;

    while (stream >> word)
      words.push_back(word);
    corpus.push_back(words);
  }

  train_params(corpus, vocab, models);

  return 0;
}