#include "lm/ngram_query.hh"
#include "lm/model.hh"
#include "lm/word_index.hh"
#include "lm/interpolate/enumerate_global_vocab.hh"
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cmath>
#include <map>
#include <boost/program_options.hpp>
#include <boost/version.hpp>
#include <boost/foreach.hpp>
#include "util/fixed_array.hh"
#include <Eigen/Eigen>
typedef Eigen::MatrixXf FMatrix;
typedef Eigen::VectorXf FVector;
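// When true, feature 0 is a constant bias term and the per-model log
// probabilities occupy slots 1..N; otherwise the models occupy slots 0..N-1.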
bool HAS_BIAS = true;
using namespace lm::ngram;
using namespace lm;
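// Log probability (base 10, as KenLM reports it) of `word` given `ctx`, where
// `ctx` lists the context left-to-right; it is reversed below because KenLM
// expects the most recent word first.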
inline float logProb(Model * model, const std::vector<std::string>& ctx, const std::string& word) {
// Horribly inefficient
const Vocabulary &vocab = model->GetVocabulary();
State nextState; //throwaway
WordIndex word_idx = vocab.Index(word);
  // Reverse the context: KenLM expects the most recent word first.
  std::vector<WordIndex> context_idx(ctx.size());
  for (unsigned int i = 0; i < ctx.size(); i++) {
    context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]);
  }
  // The context range is half-open, so the end pointer is one past the oldest word.
  FullScoreReturn score = model->FullScoreForgotState(&context_idx[0], &context_idx[0] + ctx.size(), word_idx, nextState);
float ret = score.prob;
//std::cerr << "w: " << word << " p: " << ret << std::endl;
return ret;
}
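// Fill `v` with one feature per model: that model's log probability of `word`
// given `ctx`, preceded by a constant 1.0 bias feature when HAS_BIAS is set.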
void set_features(const std::vector<std::string>& ctx,
const std::string& word,
const std::vector<Model *>& models,
FVector& v) {
//std::cerr << "setting feats for " << word << std::endl;
if (HAS_BIAS) {
v(0) = 1;
for (unsigned i=0; i < models.size(); ++i)
v(i + 1) = logProb(models[i], ctx, word);
} else {
for (unsigned i=0; i < models.size(); ++i)
v(i) = logProb(models[i], ctx, word);
}
}
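// Map the global vocabulary and the tuning corpus into each model's own
// WordIndex space. Sentences are stored reversed (last word first) and padded
// with five <s> tokens so a fixed-length reversed context window can be read
// straight out of the translated sentence.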
void translate_input(
const std::vector<std::vector<std::string> >& corpus,
const std::vector<std::string>& gvocab,
const std::vector<Model *>& models,
    std::vector<std::vector<std::vector<WordIndex> > >& translated_corpus,
    std::vector<std::vector<WordIndex> >& translated_vocab
) {
translated_corpus.resize(models.size());
translated_vocab.resize(models.size());
for (unsigned mn=0; mn < models.size(); ++mn) { // models
const Vocabulary &vocab = models[mn]->GetVocabulary();
for (unsigned i = 0; i < gvocab.size(); ++i) {
translated_vocab[mn].push_back(vocab.Index(gvocab[i]));
}
translated_corpus[mn].resize(corpus.size());
for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
const std::vector<std::string>& sentence = corpus[ci];
for (int t = sentence.size() -1; t >= 0; --t) { // words in sentence
translated_corpus[mn][ci].push_back(vocab.Index(sentence[t]));
}
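      // Pad with five <s> so a full five-word reversed context exists even at
      // the start of the sentence.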
for (int i=0; i<5; ++i) {
translated_corpus[mn][ci].push_back(vocab.Index("<s>"));
}
}
}
}
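// Train interpolation weights on the tuning corpus. Each outer pass is one
// Newton iteration: at every position the per-model log probabilities of every
// vocabulary word are the features of a log-linear model; the gradient is the
// expected feature vector minus the observed one and the Hessian is the
// feature covariance under the model distribution. This "fast" variant reads
// contexts out of the pre-translated corpus instead of re-querying by string.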
void train_params_fast(
const std::vector<std::vector<std::string> >& corpus,
const std::vector<std::string>& vocab,
const std::vector<Model *>& models) {
using namespace std;
// model / sentence / words in sentence in reverse order with <s> padding
std::vector<std::vector<std::vector<WordIndex> > > t_corpus;
std::vector<std::vector<WordIndex> > t_vocab;
translate_input(corpus, vocab, models, t_corpus, t_vocab);
const int ITERATIONS = 10;
const int nlambdas = models.size() + (HAS_BIAS ? 1 : 0); // bias + #models
FVector params = FVector::Zero(nlambdas);
vector<FVector> feats(vocab.size(), params);
vector<float> us(vocab.size(), 0);
vector<float> ps(vocab.size(), 0);
FVector grad = FVector::Zero(nlambdas);
FMatrix H = FMatrix::Zero(nlambdas, nlambdas);
FVector ef = FVector::Zero(nlambdas);
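  // grad accumulates the gradient of the negative log-likelihood, H its
  // Hessian, and ef the expected feature vector at the current position.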
for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
grad.setZero();
H.setZero();
double loss = 0;
unsigned numchars = 0;
for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
const vector<string>& sentence = corpus[ci];
      for (int t = sentence.size() - 1; t >= 0; --t) { // words in sentence
        ++numchars;
        double z = 0;     // per-position normalizer; must be reset for every word
        int ref_word = 0; // TODO: index of the observed word in the global vocab
for (unsigned i = 0; i < vocab.size(); ++i) { // vocab
// set_features(context, vocab[i], models, feats[i]);
for (unsigned j=0; j < models.size(); ++j) {
            // The observed (reference) word at this position would be t_corpus[j][ci][t];
            // here every global-vocab word i is scored under model j instead.
            WordIndex word_idx = t_vocab[j][i];
State nextState; //throwaway
FullScoreReturn score = models[j]->FullScoreForgotState(&(t_corpus[j][ci][t]), &(t_corpus[j][ci][t+5]), word_idx, nextState);
feats[i](j) = score.prob;
// feats[i](j) = logProb(models[j], ctx, word);
}
us[i] = params.dot(feats[i]);
z += exp(double(us[i]));
}
//std::cerr << "there..." << std::endl;
const float logz = log(z);
// expected feature values
ef.setZero();
for (unsigned i = 0; i < vocab.size(); ++i) {
ps[i] = expf(us[i] - logz);
ef += ps[i] * feats[i];
}
loss -= log(ps[ref_word]);
const FVector& reffeats = feats[ref_word];
grad += ef - reffeats;
        // Hessian of the negative log-likelihood: summed over the vocabulary this is
        // E[f f^T] - E[f] E[f]^T, the covariance of the feature vector.
for (unsigned i = 0; i < vocab.size(); ++i)
H.noalias() += ps[i] * feats[i] * feats[i].transpose() -
ps[i] * feats[i] * ef.transpose();
// this should just be the state for each model
}
cerr << ".";
}
cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl;
    // Newton update: params <- params - H^{-1} * grad.
    params -= H.colPivHouseholderQr().solve(grad);
cerr << params << endl;
}
}
//const util::FixedArray<Model *>& models)
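// Slower reference implementation of the same objective: features are
// recomputed through set_features()/logProb() with per-word string lookups
// instead of the pre-translated corpus used by train_params_fast().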
void train_params(
const std::vector<std::vector<std::string> >& corpus,
const std::vector<std::string>& vocab,
const std::vector<Model *>& models) {
using namespace std;
vector<string> context(5, "<s>");
const int ITERATIONS = 10;
const int nlambdas = models.size() + (HAS_BIAS ? 1 : 0); // bias + #models
FVector params = FVector::Zero(nlambdas);
vector<FVector> feats(vocab.size(), params);
vector<float> us(vocab.size(), 0);
vector<float> ps(vocab.size(), 0);
FVector grad = FVector::Zero(nlambdas);
FMatrix H = FMatrix::Zero(nlambdas, nlambdas);
FVector ef = FVector::Zero(nlambdas);
for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
grad.setZero();
H.setZero();
double loss = 0;
unsigned numchars = 0;
for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
const vector<string>& sentence = corpus[ci];
      // Reset to the sentence-initial context; assign() also drops the words
      // pushed onto the context during the previous sentence.
      context.assign(5, "<s>");
for (unsigned t = 0; t < sentence.size(); ++t) { // words in sentence
++numchars;
const string& ref_word_string = sentence[t];
int ref_word = 0; // TODO
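        // Sketch (not part of the original): one way to resolve the TODO above
        // would be to build a lookup table over `vocab` once, before the loops,
        // and use it here; `vocab_index` below is a hypothetical helper.
        //   std::map<std::string, int> vocab_index;  // vocab_index[vocab[i]] = i
        //   ref_word = vocab_index[ref_word_string];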
double z = 0;
//std::cerr << "here..." << std::endl;
for (unsigned i = 0; i < vocab.size(); ++i) { // vocab
set_features(context, vocab[i], models, feats[i]);
us[i] = params.dot(feats[i]);
z += exp(double(us[i]));
}
//std::cerr << "there..." << std::endl;
context.push_back(ref_word_string);
const float logz = log(z);
// expected feature values
ef.setZero();
for (unsigned i = 0; i < vocab.size(); ++i) {
ps[i] = expf(us[i] - logz);
ef += ps[i] * feats[i];
}
loss -= log(ps[ref_word]);
const FVector& reffeats = feats[ref_word];
grad += ef - reffeats;
        // Hessian of the negative log-likelihood: summed over the vocabulary this is
        // E[f f^T] - E[f] E[f]^T, the covariance of the feature vector.
for (unsigned i = 0; i < vocab.size(); ++i)
H.noalias() += ps[i] * feats[i] * feats[i].transpose() -
ps[i] * feats[i] * ef.transpose();
// this should just be the state for each model
}
cerr << ".";
}
cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl;
    // Newton update: params <- params - H^{-1} * grad.
    params -= H.colPivHouseholderQr().solve(grad);
cerr << params << endl;
}
}
int main(int argc, char** argv) {
std::string tuning_data;
std::vector<std::string> lms;
try {
namespace po = boost::program_options;
po::options_description options("train-params");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature")
("tuning_data,t", po::value<std::string>(&tuning_data), "File to tune perplexity on")
("model,m", po::value<std::vector<std::string> >(&lms), "Language models in KenLM format to interpolate");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
// Display help
if(argc == 1 || vm["help"].as<bool>()) {
std::cerr << options << std::endl;
return 1;
}
if (vm["no_bias_term"].as<bool>())
HAS_BIAS = false;
    if (vm.count("model"))
      lms = vm["model"].as<std::vector<std::string> >();
    if (vm.count("tuning_data"))
      tuning_data = vm["tuning_data"].as<std::string>();
}
catch(const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
if (lms.size() < 2) {
std::cerr << "Please specify at least two language model files with -m LM.KLM\n";
return 1;
}
if (tuning_data.empty()) {
std::cerr << "Please specify tuning set with -t FILE.TXT\n";
return 1;
}
//Growable vocab here
//GrowableVocab gvoc(100000); //dummy default
std::map<std::string, int*> vmap;
util::FixedArray<WordIndex> vm(2);
  // The callback records, for every word seen in any model, that word's index in each model.
EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(&vmap, lms.size());
// EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(vm);
Config cfg;
cfg.enumerate_vocab = (EnumerateVocab *) globalVocabBuilder;
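  // Each model built with this config enumerates its vocabulary through the
  // callback above while it loads.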
//load models
//util::FixedArray<Model *> models(lms.size());
std::vector<Model *> models;
  for (size_t i = 0; i < lms.size(); i++) {
    std::cerr << "Loading LM file: " << lms[i] << std::endl;
    // Hack: tell the vocab callback which model is currently being loaded so it
    // can file that model's word indices in the right slot.
    globalVocabBuilder->SetCurModel(i);
//models[i] = new Model(lms[i].c_str());
Model * this_model = new Model(lms[i].c_str(), cfg);
models.push_back( this_model );
}
//assemble vocabulary vector
std::vector<std::string> vocab;
std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl;
for(std::map<std::string, int*>::iterator iter = vmap.begin(); iter != vmap.end(); ++iter) {
vocab.push_back(iter->first);
}
std::cerr << "Vocab vector has size: " << vocab.size() << std::endl;
//load context sorted ngrams into vector of vectors
std::vector<std::vector<std::string> > corpus;
std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl;
  std::ifstream infile(tuning_data.c_str());
for(std::string line; std::getline(infile, line); ) {
    std::vector<std::string> words;
    {
std::stringstream stream(line);
std::string word;
while(stream >> word) {
words.push_back(word);
}
}
corpus.push_back(words);
}
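  // train_params() above is the string-based reference implementation; the
  // translated (fast) variant is what actually runs.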
train_params_fast(corpus, vocab, models);
return 0;
}