scripts/training/MGIZA/src/ttableDiff.hpp - joshua - Git at Google

 /* -*- Mode: C; indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4 -*- */
 /*
  * newgiza
  * Copyright (C) Qin Gao 2007 <qing@cs.cmu.edu>
  *
  * newgiza is free software.
  *
  * You may redistribute it and/or modify it under the terms of the
  * GNU General Public License, as published by the Free Software
  * Foundation; either version 2 of the License, or (at your option)
  * any later version.
  *
  * newgiza is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  * See the GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with newgiza.  If not, write to:
  * 	The Free Software Foundation, Inc.,
  * 	51 Franklin Street, Fifth Floor
  * 	Boston, MA  02110-1301, USA.
  */


 #ifndef _TTABLEDIFF_HPP_
 #define _TTABLEDIFF_HPP_
 #include "TTables.h"
 #include <sstream>
 #include <string>
 #include "types.h"

 using namespace std;
 #ifdef WIN32
 		typedef hash_map<wordPairIds, COUNT, hashpair> wordpair_hash;
 #else
 		typedef hash_map<wordPairIds, COUNT, hashpair, equal_to<wordPairIds> > wordpair_hash;
 #endif
 /*!
 This class is meant to create a difference file in order to make
 GIZA paralell.
 */
 template <class COUNT,class PROB>
 class CTTableDiff{
 private:
 		INT32 noEnglishWords;  // total number of unique source words
 		INT32 noFrenchWords;   // total number of unique target words
 		/*!
 		Store only the counting*/
 		wordpair_hash ef;

 public:
 		INT32 SaveToFile(const char* filename){
 			ofstream ofs(filename);
 			if(!ofs.is_open()){
 				return -1;
 			}else{
 				wordpair_hash::iterator it;
 				for( it = ef.begin() ; it != ef.end(); it++){
 					ofs << it->first.first << " " << it->first.second << " "
 						<< it->second << std::endl;
 				}
 			}
 			return SUCCESS;
 		}

 		INT32 LoadFromFile(const char* filename){
 			ef.clear();
 			ifstream ifs(filename);
 			if(!ifs.is_open()){
 				return -1;
 			}
 			string sline;
 			while(!ifs.eof()){
 				sline = "";
 				std::getline(ifs,sline);
 				if(sline.length()){
 					//cout << sline << endl;
 					stringstream ss(sline.c_str());
 					WordIndex we=-1,wf=-1;
 					COUNT ct=-1 ;
 					ss >> we >> wf >> ct;
 					if(we==-1||wf==-1||ct==-1)
 						continue;
 					ef[wordPairIds(we,wf)] = ct;
 				}
 			}
 			return SUCCESS;
 		}

 		COUNT * GetPtr(WordIndex e, WordIndex f){
 			// look up this pair and return its position
 			wordpair_hash::iterator i = ef.find(wordPairIds(e, f));
 			if(i != ef.end())  // if it exists, return a pointer to it.
 				return(&((*i).second));
 			else return(0) ; // else return NULL pointer
 		}

 		void incCount(WordIndex e, WordIndex f, COUNT inc)
 		// increments the count of the given word pair. if the pair does not exist,
 		// it creates it with the given value.
 		{
 			if( inc )
 				ef[wordPairIds(e, f)] += inc ;
 		}

 		INT32 AugmentTTable(tmodel<COUNT,PROB>& ttable){
 			wordpair_hash::iterator it;
 			for( it = ef.begin() ; it != ef.end(); it++){
 				ttable.incCount(it->first.first,it->first.second,it->second);
 			}
 			return SUCCESS;
 		}

 protected:

 };

 #endif // _TTABLEDIFF_HPP_
	/* -- Mode: C; indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4 -- */
	/*
	* newgiza
	* Copyright (C) Qin Gao 2007 <qing@cs.cmu.edu>
	*
	* newgiza is free software.
	*
	* You may redistribute it and/or modify it under the terms of the
	* GNU General Public License, as published by the Free Software
	* Foundation; either version 2 of the License, or (at your option)
	* any later version.
	*
	* newgiza is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	* See the GNU General Public License for more details.
	*
	* You should have received a copy of the GNU General Public License
	* along with newgiza. If not, write to:
	* The Free Software Foundation, Inc.,
	* 51 Franklin Street, Fifth Floor
	* Boston, MA 02110-1301, USA.
	*/


	#ifndef _TTABLEDIFF_HPP_
	#define _TTABLEDIFF_HPP_
	#include "TTables.h"
	#include <sstream>
	#include <string>
	#include "types.h"

	using namespace std;
	#ifdef WIN32
	typedef hash_map<wordPairIds, COUNT, hashpair> wordpair_hash;
	#else
	typedef hash_map<wordPairIds, COUNT, hashpair, equal_to<wordPairIds> > wordpair_hash;
	#endif
	/*!
	This class is meant to create a difference file in order to make
	GIZA paralell.
	*/
	template <class COUNT,class PROB>
	class CTTableDiff{
	private:
	INT32 noEnglishWords; // total number of unique source words
	INT32 noFrenchWords; // total number of unique target words
	/*!
	Store only the counting*/
	wordpair_hash ef;

	public:
	INT32 SaveToFile(const char* filename){
	ofstream ofs(filename);
	if(!ofs.is_open()){
	return -1;
	}else{
	wordpair_hash::iterator it;
	for( it = ef.begin() ; it != ef.end(); it++){
	ofs << it->first.first << " " << it->first.second << " "
	<< it->second << std::endl;
	}
	}
	return SUCCESS;
	}

	INT32 LoadFromFile(const char* filename){
	ef.clear();
	ifstream ifs(filename);
	if(!ifs.is_open()){
	return -1;
	}
	string sline;
	while(!ifs.eof()){
	sline = "";
	std::getline(ifs,sline);
	if(sline.length()){
	//cout << sline << endl;
	stringstream ss(sline.c_str());
	WordIndex we=-1,wf=-1;
	COUNT ct=-1 ;
	ss >> we >> wf >> ct;
	if(we==-1\|\|wf==-1\|\|ct==-1)
	continue;
	ef[wordPairIds(we,wf)] = ct;
	}
	}
	return SUCCESS;
	}

	COUNT * GetPtr(WordIndex e, WordIndex f){
	// look up this pair and return its position
	wordpair_hash::iterator i = ef.find(wordPairIds(e, f));
	if(i != ef.end()) // if it exists, return a pointer to it.
	return(&((*i).second));
	else return(0) ; // else return NULL pointer
	}

	void incCount(WordIndex e, WordIndex f, COUNT inc)
	// increments the count of the given word pair. if the pair does not exist,
	// it creates it with the given value.
	{
	if( inc )
	ef[wordPairIds(e, f)] += inc ;
	}

	INT32 AugmentTTable(tmodel<COUNT,PROB>& ttable){
	wordpair_hash::iterator it;
	for( it = ef.begin() ; it != ef.end(); it++){
	ttable.incCount(it->first.first,it->first.second,it->second);
	}
	return SUCCESS;
	}

	protected:

	};

	#endif // _TTABLEDIFF_HPP_