// blob: 8bf5de750a3395a679e2262428848ea407dd076d [file] [log] [blame]
/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#ifndef _vocab_h
#define _vocab_h 1
#include "defs.h"
#include "Vector.h"
#include <fstream>
#include <strstream>
#include <map>
#include <set>
// A single vocabulary item: the surface string of a word together with the
// frequency accumulated for it.
class WordEntry {
 public:
  std::string word;  // surface form; empty for a default-constructed entry
  double freq;       // frequency; stored as double, zero by default

  // Builds an entry with an empty word and zero frequency.
  WordEntry() : word(), freq(0) {}

  // Builds an entry for `w`; the integral count `f` is widened to double.
  WordEntry(std::string w, int f) : word(w), freq(f) {}
};
// vcbList: vocabulary bookkeeping layered on a caller-owned Vector<WordEntry>.
//
// The object keeps a *reference* to the vector passed to the constructor, so
// that vector must outlive the vcbList, and every copy of a vcbList aliases
// the same underlying vector.  On top of the entries the class maintains:
//   - total                  : running sum of the frequencies added by incFreq()
//   - noUniqueTokensInCorpus : bumped each time incFreq() hits an entry whose
//                              frequency is currently zero
//   - s2i                    : word -> index lookup table
class vcbList {
 private:
  Vector<WordEntry>& list;           // borrowed storage, not owned
  std::map<std::string, int> s2i;    // word -> index; nothing in this header
                                     // fills it (presumably readVocabList()
                                     // does - confirm in the .cpp)
  double total;                      // sum of frequencies added via incFreq()
  WordIndex noUniqueTokens;          // only initialised/copied in this header;
                                     // presumably set by readVocabList()
  WordIndex noUniqueTokensInCorpus;  // see class comment
  const char* fname;                 // vocabulary file name, not owned; may be 0

 public:
  // Wraps `vcb` (kept by reference) and optionally remembers the file name `f`.
  vcbList(Vector<WordEntry>& vcb, const char* f = 0)
      : list(vcb),
        total(0),
        noUniqueTokens(0),
        noUniqueTokensInCorpus(0),
        fname(f) {}

  // Replaces the remembered file name; the pointer is stored, not copied.
  void setName(const char* f) { fname = f; }

  // Copy constructor.  The copy aliases the same entry vector as `a`.
  // The word -> index map is copied as well; the original code left it empty,
  // which made every string lookup on a copy fail.
  // NOTE(review): noUniqueTokensInCorpus is reset to 0 exactly as in the
  // original code even though `total` is carried over - confirm that this
  // asymmetry is intended before changing it.
  vcbList(const vcbList& a)
      : list(a.list),
        s2i(a.s2i),
        total(a.total),
        noUniqueTokens(a.noUniqueTokens),
        noUniqueTokensInCorpus(0),
        fname(a.fname) {}

  // Defined outside this header.
  void compact(const std::set<WordIndex>& evoc);

  // Number of entries in the underlying vector.
  WordIndex size() const { return (list.size()); }
  WordIndex uniqTokens() const { return noUniqueTokens; }
  WordIndex uniqTokensInCorpus() const { return noUniqueTokensInCorpus; }
  // Sum of all frequencies added through incFreq() since the last reset.
  double totalVocab() const { return total; }

  // Direct access to the shared entry vector.
  Vector<WordEntry>& getVocabList() { return (list); }
  const Vector<WordEntry>& getVocabList() const { return (list); }

  // Defined outside this header.
  void readVocabList();

  // Adds `f` to the frequency of entry `id` and to the running total.
  // Ids outside the vector are ignored silently.
  void incFreq(WordIndex id, double f) {
    if (id < list.size()) {
      // First time this entry receives a count: it is now "seen in corpus".
      if (list[id].freq == 0)
        noUniqueTokensInCorpus++;
      list[id].freq += f;
      total += f;
    }
  }

  // Zeroes every entry's frequency together with the derived counters.
  void clearAllFreq() {
    for (WordIndex id = 0; id < list.size(); id++)
      list[id].freq = 0;
    total = 0;
    noUniqueTokensInCorpus = 0;
  }

  // True when `x` is present in the word -> index map.
  const bool has_word(const std::string& x) const {
    return s2i.find(x) != s2i.end();
  }

  // Index of word `x`.  For an unknown word an error is written to cerr and
  // 0 is returned, so a 0 result alone does not distinguish "unknown" from
  // "index 0"; use has_word() first when that matters.
  int operator()(const std::string& x) const {
    std::map<std::string, int>::const_iterator i = s2i.find(x);
    if (i != s2i.end())
      return i->second;
    std::cerr << "ERROR: no word index for '" << x << "'\n";
    return 0;
  }

  // Word stored at index `id`, or an empty string when `id` is out of range.
  // The original returned the literal 0 here, which constructs a std::string
  // from a null pointer (undefined behaviour).
  const std::string operator()(WordIndex id) const {  // Yaser - 2000-12-13
    if (id < list.size())
      return list[id].word;
    return std::string();
  }

  // Same contract as operator()(WordIndex): empty string when out of range.
  const std::string operator[](WordIndex id) const {  // Yaser - 2000-12-13
    if (id < list.size())
      return list[id].word;
    return std::string();
  }

  // Writes "<index> <word> <freq>" lines to `of` for every entry from index 1
  // upwards that has a non-empty word and a positive frequency.  Index 0 is
  // never printed (presumably the reserved NULL word - confirm).
  void printVocabList(std::ostream& of) {
    for (WordIndex i = 1; i < list.size(); i++) {
      if (list[i].word != "" && list[i].freq > 0)
        of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
    }
  }
};
#endif