// scripts/training/giza-pp/GIZA++-v2/vocab.cpp - joshua - Git at Google

 /*

 EGYPT Toolkit for Statistical Machine Translation
 Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 USA.

 */
 #include "vocab.h"

 void vcbList::readVocabList()
      // reads a vocabulary file from fname. It expects the following format:
      //
      // token_id token_string frequency
 {

   int freq=0;
   WordIndex word_id ;
   WordEntry entry("NULL",0) ;

   string line, word ;
   cerr << "Reading vocabulary file from:" << fname << "\n";
   //  total = 0 ;
   ifstream vFile(fname);
   if(!vFile){
     cerr <<  "\nCannot open vocabulary file " << fname << "file";
     exit(1);
   }

   list.push_back(entry);
   s2i[entry.word]=list.size()-1;

   while(getline(vFile, line)){
     istringstream buffer(line);
     if(!(buffer >> word_id >> word >> freq))
       cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
     if (word_id == 0){
       cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ;
       exit(-1);
     }
     else if (word_id >= MAX_VOCAB_SIZE){
       cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
 	   << MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ;
       exit(-1);
     }
     else if (freq < 0){
       cerr << "ERROR: frequency must be a positive integer, in line :\n"
 	   << line <<"\n";
       exit(-1);
     }
     else if(word_id >= list.size()){
       list.resize(word_id+1);
       list[word_id].word = word ;
       s2i[word]=word_id;
       list[word_id].freq = 0 ;
       noUniqueTokens = word_id + 1 ;
       //      noUniqueTokens++ ;
       //      total += freq ;
     }
     else if(list[word_id].word != "\0"){
       cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
 	   << line <<"\n";
       cerr << "TOKEN ID " << word_id << " has already been assigned to: " <<
 	list[word_id].word << "\n";
       exit(-1);
     }
     else { // line  has valid information
       list[word_id].word = word ;
       s2i[word]=word_id;
       list[word_id].freq = 0 ;
       //      noUniqueTokens++ ;
       noUniqueTokens  = word_id + 1 ;
       //      total += freq ;
     }
   } // end of while
 }
	/*

	EGYPT Toolkit for Statistical Machine Translation
	Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	USA.

	*/
	#include "vocab.h"

	void vcbList::readVocabList()
	// reads a vocabulary file from fname. It expects the following format:
	//
	// token_id token_string frequency
	{

	int freq=0;
	WordIndex word_id ;
	WordEntry entry("NULL",0) ;

	string line, word ;
	cerr << "Reading vocabulary file from:" << fname << "\n";
	// total = 0 ;
	ifstream vFile(fname);
	if(!vFile){
	cerr << "\nCannot open vocabulary file " << fname << "file";
	exit(1);
	}

	list.push_back(entry);
	s2i[entry.word]=list.size()-1;

	while(getline(vFile, line)){
	istringstream buffer(line);
	if(!(buffer >> word_id >> word >> freq))
	cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
	if (word_id == 0){
	cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ;
	exit(-1);
	}
	else if (word_id >= MAX_VOCAB_SIZE){
	cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
	<< MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ;
	exit(-1);
	}
	else if (freq < 0){
	cerr << "ERROR: frequency must be a positive integer, in line :\n"
	<< line <<"\n";
	exit(-1);
	}
	else if(word_id >= list.size()){
	list.resize(word_id+1);
	list[word_id].word = word ;
	s2i[word]=word_id;
	list[word_id].freq = 0 ;
	noUniqueTokens = word_id + 1 ;
	// noUniqueTokens++ ;
	// total += freq ;
	}
	else if(list[word_id].word != "\0"){
	cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
	<< line <<"\n";
	cerr << "TOKEN ID " << word_id << " has already been assigned to: " <<
	list[word_id].word << "\n";
	exit(-1);
	}
	else { // line has valid information
	list[word_id].word = word ;
	s2i[word]=word_id;
	list[word_id].freq = 0 ;
	// noUniqueTokens++ ;
	noUniqueTokens = word_id + 1 ;
	// total += freq ;
	}
	} // end of while
	}