/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include "TTables.h"
#include "Parameter.h"
GLOBAL_PARAMETER(float,PROB_CUTOFF,"PROB CUTOFF","Probability cutoff threshold for lexicon probabilities",PARLEV_OPTHEUR,1e-7);
GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF,"COUNTINCREASE CUTOFF","countCutoff","Counts increment cutoff threshold",PARLEV_OPTHEUR,1e-6);
#ifdef BINARY_SEARCH_FOR_TTABLE
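// In the BINARY_SEARCH_FOR_TTABLE build the t table is kept in the
// per-English-word vectors of lexmat rather than in the ef hash map, so the
// count-table, inverse-table and read routines below are empty stubs.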
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printCountTable(const char *,
const Vector<WordEntry>&,
const Vector<WordEntry>&,
const bool) const
{
}
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printProbTable(const char *filename,
const Vector<WordEntry>& evlist,
const Vector<WordEntry>& fvlist,
const bool actual) const
{
ofstream of(filename);
/* for(unsigned int i=0;i<es.size()-1;++i)
for(unsigned int j=es[i];j<es[i+1];++j)
{
const CPPair&x=fs[j].second;
WordIndex e=i,f=fs[j].first;
if( actual )
of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n';
else
of << e << ' ' << f << ' ' << x.prob << '\n';
}*/
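  // Walk the per-English-word vectors of lexmat; only entries whose probability
  // exceeds the PROB_SMOOTH floor are written out.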
for(unsigned int i=0;i<lexmat.size();++i)
{
if( lexmat[i] )
for(unsigned int j=0;j<lexmat[i]->size();++j)
{
const CPPair&x=(*lexmat[i])[j].second;
WordIndex e=i,f=(*lexmat[i])[j].first;
	if( x.prob>PROB_SMOOTH )
	  {
	    if( actual )
	      of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n';
	    else
	      of << e << ' ' << f << ' ' << x.prob << '\n';
	  }
}
}
}
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printProbTableInverse(const char *,
const Vector<WordEntry>&,
const Vector<WordEntry>&,
const double,
const double,
const bool ) const
{
}
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::normalizeTable(const vcbList&, const vcbList&, int)
{
for(unsigned int i=0;i<lexmat.size();++i)
{
double c=0.0;
if( lexmat[i] )
{
unsigned int lSize=lexmat[i]->size();
for(unsigned int j=0;j<lSize;++j)
c+=(*lexmat[i])[j].second.count;
for(unsigned int j=0;j<lSize;++j)
{
if( c==0 )
(*lexmat[i])[j].second.prob=1.0/(lSize);
else
(*lexmat[i])[j].second.prob=(*lexmat[i])[j].second.count/c;
(*lexmat[i])[j].second.count=0;
}
}
}
}
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::readProbTable(const char *){
}
template class tmodel<COUNT,PROB> ;
#else
/* ------------------ Method Definitions for Class tmodel --------------------*/
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printCountTable(const char *filename,
const Vector<WordEntry>& evlist,
const Vector<WordEntry>& fvlist,
const bool actual) const
// This function dumps the counts of the t table. Each line has the following
// format:
//
// c(target_word|source_word) source_word target_word p(target_word|source_word)
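//
// For illustration (hypothetical entries, not from an actual run), a dumped
// line might look like
//
//   3.74219 house maison 0.623471      (actual == true)
//   3.74219 281 1043 0.623471          (actual == false)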
{
ofstream of(filename);
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
for(i = ef.begin(); i != ef.end();++i){
    if ( ((*i).second).count > COUNTINCREASE_CUTOFF)
      {
	if (actual)
	  of << ((*i).second).count << ' ' << evlist[ ((*i).first).first ].word << ' ' << fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n';
	else
	  of << ((*i).second).count << ' ' << ((*i).first).first << ' ' << ((*i).first).second << ' ' << (*i).second.prob << '\n';
      }
}
}
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printProbTable(const char *filename,
const Vector<WordEntry>& evlist,
const Vector<WordEntry>& fvlist,
const bool actual) const
// This function dumps the t table. Each line has the following format:
//
// source_word target_word p(target_word|source_word)
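//
// For illustration (hypothetical entries, not from an actual run): with
// actual == true a line might read "house maison 0.623471"; with
// actual == false the corresponding token-id form would be "281 1043 0.623471".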
{
ofstream of(filename);
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
for(i = ef.begin(); i != ef.end();++i)
if( actual )
of << evlist[((*i).first).first].word << ' ' <<
fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n';
else
of << ((*i).first).first << ' ' << ((*i).first).second << ' ' <<
(*i).second.prob << '\n';
}
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printProbTableInverse(const char *filename,
const Vector<WordEntry>& evlist,
const Vector<WordEntry>& fvlist,
const double,
const double,
const bool actual) const
// This function dumps the inverse t table. Each line has the following format:
//
// target_word_id source_word_id p(source_word|target_word)
//
// If the flag "actual" is true, actual word entries are printed instead of
// token ids.
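//
// The inverse probability is obtained below via Bayes' rule, using the word
// frequencies stored in evlist as the prior over e:
//
//   p(e|f) = p(f|e) * freq(e) / ( Sum over all e' of p(f|e') * freq(e') )
//
// where the denominator is accumulated into total[f] in the first loop.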
{
cerr << "Dumping the t table inverse to file: " << filename << '\n';
ofstream of(filename);
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
PROB p_inv = 0 ;
// static const PROB ratio(double(fTotal)/eTotal);
WordIndex e, f ;
int no_errors(0);
  vector<PROB> total(fvlist.size(),PROB(0)) ; // Sum over all e of P(f|e) * freq(e) - needed for normalization
for(i = ef.begin(); i != ef.end(); i++){
e = ((*i).first).first ;
f = ((*i).first).second ;
total[f] += (PROB) evlist[e].freq * ((*i).second.prob); //add P(f/ei) * F(ei)
}
for(i = ef.begin(); i != ef.end(); i++){
e = ((*i).first).first ;
f = ((*i).first).second ;
p_inv = ((*i).second.prob) * (PROB) evlist[e].freq / total[f] ;
if (p_inv > 1.0001 || p_inv < 0){
no_errors++;
if (no_errors <= 10){
cerr << "printProbTableInverse(): Error - P("<<evlist[e].word<<"("<<
e<<") / "<<fvlist[f].word << "("<<f<<")) = " << p_inv <<'\n';
cerr << "f(e) = "<<evlist[e].freq << " Sum(p(f/e).f(e)) = " << total[f] <<
" P(f/e) = " <<((*i).second.prob) <<'\n';
if (no_errors == 10)
cerr<<"printProbTableInverse(): Too many P inverse errors ..\n";
}
}
if (actual)
of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
else
of << f << ' ' << e << ' ' << p_inv << '\n';
}
}
/*
{
cerr << "Dumping the t table inverse to file: " << filename << '\n';
ofstream of(filename);
hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
PROB p_inv = 0 ;
static const PROB ratio(double(fTotal)/eTotal);
WordIndex e, f ;
for(i = ef.begin(); i != ef.end(); i++){
e = ((*i).first).first ;
f = ((*i).first).second ;
p_inv = ((*i).second.prob) * ratio * (PROB) evlist[e].freq /
(PROB) fvlist[f].freq ;
if (actual)
of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
else
of << f << ' ' << e << ' ' << p_inv << '\n';
}
}
*/
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::normalizeTable(const vcbList&engl, const vcbList&french, int iter)
// Normalize the conditional probability P(fj|ei),
// i.e. make sure that Sum over all j of P(fj|ei) = 1.
// This method reads the counts portion of the table and normalizes it into
// the probability portion; the counts are then cleared (i.e. zeroed).
// If the resulting probability of an entry falls below a threshold, the
// entry is removed.
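//
// In outline, for every English word e present in the table:
//
//   total(e) = Sum over all f of count(e,f)   (inflated for the PROB_SMOOTH
//                                              mass of unseen pairs)
//   p(f|e)   = count(e,f) / total(e)
//
// and entries whose probability does not exceed PROB_CUTOFF are erased.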
{
if( iter==2 )
{
      total2.resize(engl.uniqTokens());
      for(unsigned int i=0;i<total2.size();i++)
	total2[i]=0.0;
    }
  nFrench.resize(engl.uniqTokens());
  for(unsigned int i=0;i<nFrench.size();i++)
    nFrench[i]=0;
  nEng.resize(french.uniqTokens());
  for(unsigned int i=0;i<nEng.size();i++)
    nEng[i]=0;
Vector<double> total(engl.uniqTokens(),0.0);
//Vector<int> nFrench(engl.uniqTokens(), 0);
//Vector<int> nEng(french.uniqTokens(), 0);
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
for(i = ef.begin(); i != ef.end(); i++){ // for all possible source words e
if( iter==2 )
total2[((*i).first).first] += (*i).second.count;
total[((*i).first).first] += (*i).second.count;
nFrench[((*i).first).first]++;
nEng[((*i).first).second]++;
}
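  // Inflate each total so that the probability mass reserved by PROB_SMOOTH for
  // the (french.uniqTokensInCorpus() - nFrench[k]) unseen French words is
  // accounted for; seen entries then share only the remaining (1 - probMass).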
for(unsigned int k=0;k<engl.uniqTokens();++k)
if( nFrench[k] )
{
double probMass=(french.uniqTokensInCorpus()-nFrench[k])*PROB_SMOOTH;
if( probMass<0.0 )
cout << k << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus() << " nFrench[k]:"<< nFrench[k] << '\n';
total[k]+= total[k]*probMass/(1-probMass);
}
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator j, k;
PROB p ;
int nParams=0;
for(j = ef.begin(); j != ef.end(); ){
k = j;
k++ ;
if( (total[((*j).first).first])>0.0 )
p = ((((*j).second).count) /(total[((*j).first).first])) ;
else
p= 0.0;
if (p > PROB_CUTOFF)
{
if( iter>0 )
{
((*j).second).prob = 0 ;
((*j).second).count = p ;
}
else
{
((*j).second).prob = p ;
((*j).second).count = 0 ;
}
nParams++;
}
else {
erase(((*j).first).first, ((*j).first).second);
}
j = k ;
}
  if( iter>0 )
    return normalizeTable(engl, french, iter-1);
}
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::readProbTable(const char *filename){
/* This function reads the t table from a file.
   Each line is of the format: source_word_id target_word_id p(target_word|source_word)
   This is the inverse operation of the printProbTable function.
   NAS, 7/11/99
*/
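  // For illustration (hypothetical ids/values): a valid input line would be
  // "281 1043 0.623471", read as src_id trg_id prob and stored via insert().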
ifstream inf(filename);
cerr << "Reading t prob. table from " << filename << "\n";
if(!inf){
cerr << "\nERROR: Cannot open " << filename << "\n";
return;
}
WordIndex src_id, trg_id;
PROB prob;
int nEntry=0;
while( inf >> src_id >> trg_id >> prob){
insert(src_id, trg_id, 0.0, prob);
nEntry++;
}
cerr << "Read " << nEntry << " entries in prob. table.\n";
}
template class tmodel<COUNT,PROB> ;
/* ---------------- End of Method Definitions of class tmodel ---------------*/
#endif