/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
/* --------------------------------------------------------------------------*
* *
* Module : TTables *
* *
* Prototypes File: TTables.h *
* *
 * Objective: Defines classes and methods for handling I/O for Probability & *
 *            Count tables and also alignment tables                         *
*****************************************************************************/
#ifndef _ttables_h
#define _ttables_h 1
#include "defs.h"
#include "vocab.h"
#include <assert.h>
#include <iostream>
#include <algorithm>
#include <functional>
#include <map>
#include <set>
#include "Vector.h"
#include <utility>
#include "syncObj.h"
#if __GNUC__>2
#include <ext/hash_map>
using __gnu_cxx::hash_map;
#else
#include <hash_map>
#endif
#include <fstream>
#include "Globals.h"
/* The tables defined in the following classes are keyed on composite keys.
   For example, the t-table is keyed on a word pair, and an alignment table is
   keyed on a vector of integers (sentence positions).  The hash functor below
   allows such word-pair keys to be stored in a hash table. */
/*----------- Definition of Hash Function for class tmodel ------------------*/
typedef pair<WordIndex, WordIndex> wordPairIds;
class hashpair : public unary_function< pair<WordIndex, WordIndex>, size_t >
{
public:
size_t operator() (const pair<WordIndex, WordIndex>& key) const
{
    return (size_t) MAX_W*key.first + key.second; /* hash value; it is
                                                      unique for each pair
                                                      as long as key.second
                                                      is less than MAX_W */
}
#ifdef WIN32
inline bool operator() (const pair<WordIndex, WordIndex>& key, const pair<WordIndex, WordIndex>& key2){
return key.first==key2.first && key.second==key2.second;
}
enum
{ // parameters for hash table
bucket_size = 1 // 0 < bucket_size
};
#endif
};
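/* A minimal sketch of how the hash functor above could be used: with the GNU
   hash_map pulled in at the top of this file, hashpair can serve as the hash
   function for a table keyed on a word pair.  The typedef and the values
   below are assumptions for illustration only; the tmodel class further down
   actually stores its entries in sorted per-source-word vectors.

     typedef hash_map< wordPairIds, float, hashpair > wordPairTable;

     wordPairTable counts;
     WordIndex e = 17, f = 42;
     counts[wordPairIds(e, f)] += 1.0;           // keyed on the (e,f) pair
     size_t h = hashpair()(wordPairIds(e, f));   // equals MAX_W*e + f
*/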
/* ------------------ Class Prototype Definitions ---------------------------*
  Class Name: tmodel
  Objective: This defines the underlying data structure for t tables and t
  count tables. Each entry stores the probability P(fj|ei) and the count
  collected for C(fj|ei); the probability may be represented as a logarithmic
  (integer) probability as defined by the class LogProb. Entries are kept in
  per-source-word vectors sorted by target word index and looked up by binary
  search (see mbinary_search and tmodel::find below).
  This class is used to represent t tables (probability) and n (fertility)
  tables, as well as their corresponding count tables.
*---------------------------------------------------------------------------*/
//typedef float COUNT ;
//typedef LogProb PROB ;
template <class COUNT, class PROB>
class LpPair {
 public:
  COUNT count ;   // accumulated count, e.g. C(fj|ei)
  PROB prob ;     // current probability, e.g. P(fj|ei)
 public: // constructors
  LpPair() : count(0), prob(0) {}
  LpPair(COUNT c, PROB p) : count(c), prob(p) {}
} ;
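/* Sketch of a single table cell: one LpPair bundles the count and the
   probability for a given (e,f) entry.  The commented-out typedefs above
   suggest COUNT is a float and PROB a LogProb; a plain float/float
   instantiation is used below purely for illustration, since tmodel supplies
   COUNT and PROB as template parameters.

     LpPair<float, float> cell;    // count and prob both start at 0
     cell.count += 1.0;            // accumulate C(fj|ei)
     cell.prob   = 0.25;           // store the current P(fj|ei)
*/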
template<class T>
T*mbinary_search(T*x,T*y,unsigned int val)
{
if( y-x==0 )
return 0;
if( x->first==val)
return x;
if( y-x<2 )
return 0;
T*mid=x+(y-x)/2;
if( val < mid->first )
return mbinary_search(x,mid,val);
else
return mbinary_search(mid,y,val);
}
template<class T>
const T*mbinary_search(const T*x,const T*y,unsigned int val)
{
if( y-x==0 )
return 0;
if( x->first==val)
return x;
if( y-x<2 )
return 0;
const T*mid=x+(y-x)/2;
if( val < mid->first )
return mbinary_search(x,mid,val);
else
return mbinary_search(mid,y,val);
}
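/* mbinary_search does a recursive binary search over a range of pair-like
   elements that is sorted by ascending .first, returning a pointer to the
   element whose .first equals val, or 0 when no such element exists.
   tmodel::find() below applies it to each source word's sorted entry vector.
   A small sketch with made-up values:

     pair<unsigned int,int> row[] = { make_pair(3u,10), make_pair(7u,20),
                                      make_pair(9u,30) };
     pair<unsigned int,int>* hit = mbinary_search(row, row+3, 7);
     // hit->second == 20; mbinary_search(row, row+3, 8) would return 0
*/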
template <class COUNT, class PROB>
class tmodel{
typedef LpPair<COUNT, PROB> CPPair;
public:
bool recordDiff;
public:
int noEnglishWords; // total number of unique source words
int noFrenchWords; // total number of unique target words
//vector<pair<unsigned int,CPPair> > fs;
//vector<unsigned int> es;
vector< vector<pair<unsigned int,CPPair> >* > lexmat;
vector< Mutex* > mutex;
void erase(WordIndex e, WordIndex f){
CPPair *p=find(e,f);
if(p)
*p=CPPair(0,0);
};
CPPair*find(int e,int f){
//pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
//pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
if(e>=lexmat.size()||lexmat[e]==NULL){
return NULL;
}
pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
if( x==0 ){
//cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n';
//abort();
return 0;
}
return &(x->second);
}
const CPPair*find(int e,int f)const{
if(e>=lexmat.size()||lexmat[e]==NULL){
return NULL;
}
const pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
const pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
//const pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
//const pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
const pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
if( x==0 ){
//cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n';
//abort();
return 0;
}
return &(x->second);
}
public:
void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
CPPair* found = find(e,f);
if(found)
*found=CPPair(cval,pval);
}
CPPair*getPtr(int e,int f){return find(e,f);}
  tmodel() : recordDiff(false) {}
tmodel(const string&fn) {
recordDiff = false;
int count=0,count2=0;
ifstream infile2(fn.c_str());
cerr << "Inputfile in " << fn << endl;
int e,f,olde=-1,oldf=-1;
pair<unsigned int,CPPair> cp;
vector< pair<unsigned int,CPPair> > cps;
while(infile2>>e>>f){
cp.first=f;
assert(e>=olde);
assert(e>olde ||f>oldf);
if( e!=olde&&olde>=0 ){
int oldsize=lexmat.size();
lexmat.resize(olde+1);
for(unsigned int i=oldsize;i<lexmat.size();++i)
lexmat[i]=0;
lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
cps.clear();
if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) )
	  cerr << "ERROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl;
count2+=lexmat[olde]->capacity();
}
cps.push_back(cp);
olde=e;
oldf=f;
count++;
}
lexmat.resize(olde+1);
lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
count2+=lexmat[olde]->capacity();
cout << "There are " << count << " " << count2 << " entries in table" << '\n';
    /* Create mutex */
    mutex.resize(lexmat.size());
    for(unsigned int _i = 0; _i < lexmat.size(); _i++){
      mutex[_i] = new Mutex();
    }
  }
  ~tmodel(){
    for(unsigned int _i = 0; _i < lexmat.size(); _i++){
      delete lexmat[_i];   // free the per-source-word entry vectors
      delete mutex[_i];
    }
  }
/* tmodel(const string&fn)
{
size_t count=0;
{
ifstream infile1(fn.c_str());
if( !infile1 )
{
cerr << "ERROR: can't read coocurrence file " << fn << '\n';
abort();
}
int e,f;
while(infile1>>e>>f)
count++;
}
cout << "There are " << count << " entries in table" << '\n';
ifstream infile2(fn.c_str());
fs.resize(count);
int e,f,olde=-1,oldf=-1;
pair<unsigned int,CPPair> cp;
count=0;
while(infile2>>e>>f)
{
assert(e>=olde);
assert(e>olde ||f>oldf);
if( e!=olde )
{
es.resize(e+1);
for(unsigned int i=olde+1;int(i)<=e;++i)
es[i]=count;
}
cp.first=f;
assert(count<fs.size());
fs[count]=cp;
//fs.push_back(cp);
olde=e;
oldf=f;
count++;
}
assert(count==fs.size());
es.push_back(fs.size());
cout << fs.size() << " " << count << " coocurrences read" << '\n';
}*/
void incCount(WordIndex e, WordIndex f, COUNT inc) {
if( inc ){
CPPair *p=find(e,f);
if( p ){
mutex[e]->lock();
p->count += inc ;
mutex[e]->unlock();
}
}
}
PROB getProb(WordIndex e, WordIndex f) const{
const CPPair *p=find(e,f);
if( p )
return max(p->prob, PROB_SMOOTH);
else
return PROB_SMOOTH;
}
COUNT getCount(WordIndex e, WordIndex f) const
{
const CPPair *p=find(e,f);
if( p )
return p->count;
else
return 0.0;
}
void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
void printProbTableInverse(const char *filename,
const Vector<WordEntry>& evlist,
const Vector<WordEntry>& fvlist,
const double eTotal,
const double fTotal,
const bool actual = false ) const;
void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
bool readProbTable(const char *filename);
bool readSubSampledProbTable(const char* filename, std::set<WordIndex> &e, std::set<WordIndex> &f);
};
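/* A minimal usage sketch.  The float instantiation and the file name are
   assumptions for illustration; the coocurrence file must list "e f" index
   pairs sorted by e and then by f, as the constructor's asserts require.
   Note that insert() and incCount() only affect pairs already present in the
   coocurrence file, because find() returns 0 for unseen pairs.

     tmodel<float, float> tTable("corpus.cooc"); // builds sorted per-e vectors
     WordIndex e = 5, f = 9;
     tTable.incCount(e, f, 1.0);     // thread-safe count update for (e,f)
     float p = tTable.getProb(e, f); // never below PROB_SMOOTH
*/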
#endif