blob: 85673ef0ff50248a5de3485218ac1211cbcfc651 [file] [log] [blame]
/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
/* --------------------------------------------------------------------------*
* *
* Module : TTables *
* *
* Prototypes File: TTables.h *
* *
* Objective: Defines clases and methods for handling I/O for Probability & *
* Count tables and also alignment tables *
*****************************************************************************/
#ifndef _ttables_h
#define _ttables_h 1
#include "defs.h"
#include "vocab.h"
#include <cassert>
#include <iostream>
#include <algorithm>
#include <functional>
#include <map>
#include <set>
#include "Vector.h"
#include <utility>
#include <fstream>
#include "Globals.h"
/* The tables defined in the following classes are defined as hash tables. For
example. the t-table is a hash function of a word pair; an alignment is
a hash function of a vector of integer numbers (sentence positions) and so
on */
/*----------- Defnition of Hash Function for class tmodel ------- -----------*/
typedef pair<WordIndex, WordIndex> wordPairIds;
class hashpair : public unary_function< pair<WordIndex, WordIndex>, size_t >
{
public:
size_t operator() (const pair<WordIndex, WordIndex>& key) const
{
return (size_t) MAX_W*key.first + key.second; /* hash function and it
is guarnteed to have
unique id for each
unique pair */
}
};
/* ------------------ Class Prototype Definitions ---------------------------*
Class Name: tmodel
Objective: This defines the underlying data structur for t Tables and t
Count Tables. They are defined as a hash table. Each entry in the hash table
is the probability (P(fj/ei) ) or count collected for ( C(fj/ei)). The
probability and the count are represented as log integer probability as
defined by the class LogProb .
This class is used to represents t Tables (probabiliity) and n (fertility
Tables and also their corresponding count tables .
*---------------------------------------------------------------------------*/
//typedef float COUNT ;
//typedef LogProb PROB ;
template <class COUNT, class PROB>
class LpPair {
public:
COUNT count ;
PROB prob ;
public: // constructor
LpPair():count(0), prob(0){} ;
LpPair(COUNT c, PROB p):count(c), prob(p){};
} ;
#ifdef BINARY_SEARCH_FOR_TTABLE
template<class T>
T*mbinary_search(T*x,T*y,unsigned int val)
{
if( y-x==0 )
return 0;
if( x->first==val)
return x;
if( y-x<2 )
return 0;
T*mid=x+(y-x)/2;
if( val < mid->first )
return mbinary_search(x,mid,val);
else
return mbinary_search(mid,y,val);
}
template<class T>
const T*mbinary_search(const T*x,const T*y,unsigned int val)
{
if( y-x==0 )
return 0;
if( x->first==val)
return x;
if( y-x<2 )
return 0;
const T*mid=x+(y-x)/2;
if( val < mid->first )
return mbinary_search(x,mid,val);
else
return mbinary_search(mid,y,val);
}
template <class COUNT, class PROB>
class tmodel{
typedef LpPair<COUNT, PROB> CPPair;
public:
int noEnglishWords; // total number of unique source words
int noFrenchWords; // total number of unique target words
//vector<pair<unsigned int,CPPair> > fs;
//vector<unsigned int> es;
vector< vector<pair<unsigned int,CPPair> >* > lexmat;
void erase(WordIndex e, WordIndex f)
{
CPPair *p=find(e,f);
if(p)
*p=CPPair(0,0);
};
CPPair*find(int e,int f)
{
//pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
//pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
if( x==0 )
{
//cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n';
//abort();
return 0;
}
return &(x->second);
}
const CPPair*find(int e,int f)const
{
const pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
const pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
//const pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
//const pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
const pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
if( x==0 )
{
//cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n';
//abort();
return 0;
}
return &(x->second);
}
public:
void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
*find(e,f)=CPPair(cval,pval);
}
CPPair*getPtr(int e,int f){return find(e,f);}
tmodel(const string&fn)
{
int count=0,count2=0;
ifstream infile2(fn.c_str());
int e,f,olde=-1,oldf=-1;
pair<unsigned int,CPPair> cp;
vector< pair<unsigned int,CPPair> > cps;
while(infile2>>e>>f)
{
cp.first=f;
assert(e>=olde);
assert(e>olde ||f>oldf);
if( e!=olde&&olde>=0 )
{
int oldsize=lexmat.size();
lexmat.resize(olde+1);
for(unsigned int i=oldsize;i<lexmat.size();++i)
lexmat[i]=0;
lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
cps.clear();
if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) )
cerr << "eRROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl;
count2+=lexmat[olde]->capacity();
}
cps.push_back(cp);
olde=e;
oldf=f;
count++;
}
lexmat.resize(olde+1);
lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
count2+=lexmat[olde]->capacity();
cout << "There are " << count << " " << count2 << " entries in table" << '\n';
}
/* tmodel(const string&fn)
{
size_t count=0;
{
ifstream infile1(fn.c_str());
if( !infile1 )
{
cerr << "ERROR: can't read coocurrence file " << fn << '\n';
abort();
}
int e,f;
while(infile1>>e>>f)
count++;
}
cout << "There are " << count << " entries in table" << '\n';
ifstream infile2(fn.c_str());
fs.resize(count);
int e,f,olde=-1,oldf=-1;
pair<unsigned int,CPPair> cp;
count=0;
while(infile2>>e>>f)
{
assert(e>=olde);
assert(e>olde ||f>oldf);
if( e!=olde )
{
es.resize(e+1);
for(unsigned int i=olde+1;int(i)<=e;++i)
es[i]=count;
}
cp.first=f;
assert(count<fs.size());
fs[count]=cp;
//fs.push_back(cp);
olde=e;
oldf=f;
count++;
}
assert(count==fs.size());
es.push_back(fs.size());
cout << fs.size() << " " << count << " coocurrences read" << '\n';
}*/
void incCount(WordIndex e, WordIndex f, COUNT inc)
{
if( inc )
{
CPPair *p=find(e,f);
if( p )
p->count += inc ;
}
}
PROB getProb(WordIndex e, WordIndex f) const
{
const CPPair *p=find(e,f);
if( p )
return max(p->prob, PROB_SMOOTH);
else
return PROB_SMOOTH;
}
COUNT getCount(WordIndex e, WordIndex f) const
{
const CPPair *p=find(e,f);
if( p )
return p->count;
else
return 0.0;
}
void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
void printProbTableInverse(const char *filename,
const Vector<WordEntry>& evlist,
const Vector<WordEntry>& fvlist,
const double eTotal,
const double fTotal,
const bool actual = false ) const;
void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
void readProbTable(const char *filename);
};
#else
template <class COUNT, class PROB>
class tmodel{
typedef LpPair<COUNT, PROB> CPPair;
public:
int noEnglishWords; // total number of unique source words
int noFrenchWords; // total number of unique target words
hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> > ef;
void erase(WordIndex e, WordIndex f)
// In: a source and a target token ids.
// removes the entry with that pair from table
{
ef.erase(wordPairIds(e, f));
};
public:
Vector<PROB> total2;
Vector<int> nFrench;
Vector<int> nEng;
// methods;
// insert: add entry P(fj/ei) to the hash function, Default value is 0.0
void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
ef[wordPairIds(e, f)].count = cval ;
ef[wordPairIds(e, f)].prob = pval ;
}
// returns a reference to the word pair, if does not exists, it creates it.
CPPair&getRe(WordIndex e, WordIndex f)
{return ef[wordPairIds(e, f)];}
// returns a pointer to an existing word pair. if pair does not exists,
// the method returns the zero pointer (NULL)
CPPair*getPtr(WordIndex e, WordIndex f)
{
// look up this pair and return its position
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator i = ef.find(wordPairIds(e, f));
if(i != ef.end()) // if it exists, return a pointer to it.
return(&((*i).second));
else return(0) ; // else return NULL pointer
}
void incCount(WordIndex e, WordIndex f, COUNT inc)
// increments the count of the given word pair. if the pair does not exist,
// it creates it with the given value.
{
if( inc )
ef[wordPairIds(e, f)].count += inc ;
}
PROB getProb(WordIndex e, WordIndex f) const
// read probability value for P(fj/ei) from the hash table
// if pair does not exist, return floor value PROB_SMOOTH
{
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i= ef.find(wordPairIds(e, f));
if(i == ef.end())
return PROB_SMOOTH;
else
return max(((*i).second).prob, PROB_SMOOTH);
}
COUNT getCount(WordIndex e, WordIndex f) const
/* read count value for entry pair (fj/ei) from the hash table */
{
typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i= ef.find(wordPairIds(e, f));
if(i == ef.end())
return 0;
else
return ((*i).second).count;
}
inline const hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >& getHash(void) const {return ef;};
/* get a refernece to the hash table */
//inline void resize(WordIndex n) {ef.resize(n);};
// to resize he hash table
void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
// print the t table to the given file but this time print actual source and
// target words instead of thier token ids
void printProbTableInverse(const char *filename,
const Vector<WordEntry>& evlist,
const Vector<WordEntry>& fvlist,
const double eTotal,
const double fTotal,
const bool actual = false ) const;
// dump inverse of t table (i.e P(ei/fj)) to the given file name,
// if the given flag is true then actual words are printed not token ids
void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
// to norlmalize the table i.e. make sure P(fj/ei) for all j is equal to 1
void readProbTable(const char *filename);
// void readAsFertilityTable(const char *filename);
};
/*--------------- End of Class Definition for tmodel -----------------------*/
#endif
#endif