/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include "vocab.h"
void vcbList::readVocabList()
// Reads a vocabulary file from fname. It expects one entry per line, in the following format:
//
// token_id token_string frequency
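//
// For example (hypothetical entries, purely illustrative):
//
//   1 the 25113
//   2 house 932
//   3 blue 187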
{
  int freq = 0;
  WordIndex word_id;
  WordEntry entry("NULL", 0);
  string line, word;
  cerr << "Reading vocabulary file from: " << fname << "\n";
  // total = 0 ;
  // First pass: count the lines so that `list` can be reserved in one go.
  ifstream ifs(fname);
  if (!ifs) {
    cerr << "\nCannot open vocabulary file " << fname << "\n";
    exit(1);
  }
  size_t sline = 0;
  while (getline(ifs, line)) {
    sline++;
  }
  ifs.close();
  // Second pass: parse the entries.
  ifstream vFile(fname);
  if (!vFile) {
    cerr << "\nCannot open vocabulary file " << fname << "\n";
    exit(1);
  }
  list.reserve(sline + 100); // Reserve space to prevent re-allocating
  // Entry 0 is the special NULL token.
  list.push_back(entry);
  s2i[entry.word] = list.size() - 1;
  while (getline(vFile, line)) {
    istrstream buffer(line.c_str());
    if (!(buffer >> word_id >> word >> freq)) {
      cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
      continue; // skip the malformed line instead of reusing stale values
    }
    if (word_id == 0) {
      cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line:\n" << line << "\n";
      exit(-1);
    }
    else if (word_id >= MAX_VOCAB_SIZE) {
      cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
           << MAX_VOCAB_SIZE << " in line:\n" << line << "\n";
      exit(-1);
    }
    else if (freq < 0) {
      cerr << "ERROR: frequency must be a non-negative integer, in line:\n"
           << line << "\n";
      exit(-1);
    }
    else if (word_id >= list.size()) {
      // New, highest token id so far: grow the table and record the word.
      list.resize(word_id + 1);
      list[word_id].word = word;
      s2i[word] = word_id;
      list[word_id].freq = 0; // the frequency read from the file is not stored here
      noUniqueTokens = word_id + 1;
      // noUniqueTokens++ ;
      // total += freq ;
    }
    else if (list[word_id].word != "\0") { // slot already holds a word ("\0" compares equal to the empty string)
      cerr << "ERROR: TOKEN ID must be unique for each token, in line:\n"
           << line << "\n";
      cerr << "TOKEN ID " << word_id << " has already been assigned to: "
           << list[word_id].word << "\n";
      exit(-1);
    }
    else { // line has valid information
      list[word_id].word = word;
      s2i[word] = word_id;
      list[word_id].freq = 0;
      // noUniqueTokens++ ;
      noUniqueTokens = word_id + 1;
      // total += freq ;
    }
  } // end of while
}
// Removes vocabulary entries whose ids do not occur in the corpus
// (evoc is the set of word ids actually observed).
void vcbList::compact(const std::set<WordIndex>& evoc) {
  size_t del = 0;
  for (size_t i = 0; i < list.size(); i++) {
    if (evoc.find(i) == evoc.end()) { // id does not appear in the corpus
      s2i.erase(list[i].word);
      list[i].word = "";
      del++;
    }
  }
  cerr << "Compacted vocabulary, eliminated " << del << " entries, "
       << s2i.size() << " remain" << endl;
}
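/*
  Minimal usage sketch (not part of the toolkit; the file name and the set of
  observed ids below are made up, and it assumes vcbList can be constructed
  from a vocabulary file name as declared in vocab.h):

    vcbList vocab("corpus.src.vcb");   // hypothetical vocabulary file
    vocab.readVocabList();             // fills list[] and the s2i map

    std::set<WordIndex> seen;          // ids actually observed in the corpus
    seen.insert(1);
    seen.insert(2);
    vocab.compact(seen);               // drops every entry whose id is not in `seen`
*/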