/*

EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.

*/
| #ifndef _vocab_h |
| #define _vocab_h 1 |
| |
#include "defs.h"
#include "Vector.h"

#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <strstream>
| |
// One vocabulary item: the surface form of a word together with the
// frequency stored for it.
class WordEntry {
 public:
  std::string word;  // surface form of the token
  double freq;       // frequency, kept as a double

  // Builds an entry with an empty word and a frequency of zero.
  WordEntry() : word(), freq(0) {}

  // Builds an entry for `w` with initial frequency `f`.
  // `f` is a double (it used to be an int) so that a fractional value is
  // stored as given instead of being truncated before reaching `freq`;
  // integer arguments still convert implicitly.
  WordEntry(const std::string& w, double f) : word(w), freq(f) {}
};
| |
| class vcbList{ |
| private: |
| Vector<WordEntry>& list ; |
| map<string,int> s2i; |
| double total; |
| WordIndex noUniqueTokens ; |
| WordIndex noUniqueTokensInCorpus ; |
| const char* fname ; |
| public: |
| vcbList(Vector<WordEntry>& vcb,const char* f=0):list(vcb), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f){}; |
| void setName(const char*f) |
| { fname=f; } |
| vcbList(const vcbList& a):list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname){}; |
| void compact(const std::set<WordIndex>& evoc); |
| inline WordIndex size()const {return (list.size());}; |
| inline WordIndex uniqTokens()const {return noUniqueTokens;}; |
| inline WordIndex uniqTokensInCorpus()const {return noUniqueTokensInCorpus;}; |
| inline double totalVocab() const {return total;}; |
| inline Vector<WordEntry>& getVocabList() { return(list);}; |
| inline const Vector<WordEntry>& getVocabList()const { return(list);}; |
| void readVocabList(); |
| void incFreq(WordIndex id , double f){ |
| if(id < list.size()){ |
| if (list[id].freq == 0) |
| noUniqueTokensInCorpus++; |
| list[id].freq += f ; |
| total += f ; |
| } |
| }; |
| void clearAllFreq(){ |
| for (WordIndex id = 0 ; id < list.size() ; id++) |
| list[id].freq = 0 ; |
| total = 0 ; |
| noUniqueTokensInCorpus = 0 ; |
| }; |
| |
| const bool has_word(const string& x) const{ |
| map<string,int>::const_iterator i=s2i.find(x); |
| return i!=s2i.end(); |
| } |
| int operator()(const string&x)const |
| { |
| map<string,int>::const_iterator i=s2i.find(x); |
| if( i!=s2i.end() ) |
| return i->second; |
| else |
| { |
| cerr << "ERROR: no word index for '"<<x<<"'\n"; |
| return 0; |
| } |
| } |
| const string operator()(WordIndex id) const { // Yaser - 2000-12-13 |
| if (id < list.size()) |
| return list[id].word ; |
| else return 0 ; |
| } |
| const string operator[](WordIndex id) const { // Yaser - 2000-12-13 |
| if (id < list.size()) |
| return list[id].word ; |
| else return 0 ; |
| } |
| void printVocabList(ostream& of){ |
| for (WordIndex i = 1 ; i < list.size() ; i++){ |
| if (list[i].word != "" && list[i].freq > 0) |
| of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n'; |
| } |
| } |
| |
| }; |
| #endif |