// blob: 1e4f3b6e3b534e1b9dd0e4fb05386e919a9b1a13 [file] [log] [blame]
/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#include "TTables.h"
#include "Parameter.h"
#include<iostream>
#include <fstream>
// Registers the float parameter PROB_CUTOFF under the user-visible name
// "PROB CUTOFF". The trailing 1e-7 is presumably the default value and
// PARLEV_OPTHEUR the parameter's level/category -- confirm against the
// GLOBAL_PARAMETER macro in Parameter.h.
GLOBAL_PARAMETER(float,PROB_CUTOFF,"PROB CUTOFF","Probability cutoff threshold for lexicon probabilities",PARLEV_OPTHEUR,1e-7);
// Registers the float parameter COUNTINCREASE_CUTOFF under two names
// ("COUNTINCREASE CUTOFF" and "countCutoff"); the trailing 1e-6 is presumably
// its default value -- confirm against GLOBAL_PARAMETER2 in Parameter.h.
GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF,"COUNTINCREASE CUTOFF","countCutoff","Counts increment cutoff threshold",PARLEV_OPTHEUR,1e-6);
/* ------------------ Method Definitions for Class tmodel --------------------*/
// To output to STDOUT, submit filename as NULL
/* Writes the count column of the lexicon table, one entry per line, in the
   form "<e> <f> <count>".

   filename : destination file; pass NULL to write to standard output.
   evlist   : vocabulary indexed by the row index of lexmat; its .word is
              printed for <e> when `actual` is true.
   fvlist   : vocabulary indexed by the id stored in each entry; its .word is
              printed for <f> when `actual` is true.
   actual   : true  -> print the word strings,
              false -> print the numeric ids.

   Only entries whose probability exceeds MINCOUNTINCREASE are written.
   If the file cannot be opened an error is reported on stderr and nothing
   is written.
*/
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printCountTable(const char *filename,
					   const Vector<WordEntry>& evlist,
					   const Vector<WordEntry>& fvlist,
					   const bool actual) const
{
  // The stream is a local object, so it is flushed and closed on every exit
  // path without a manual new/delete pair.
  ofstream file;
  if( filename ){
    file.open(filename);
    if( !file ){
      cerr << "\nERROR: Cannot open " << filename << "\n";
      return;
    }
  }
  ostream &of = filename ? static_cast<ostream&>(file) : cout;

  for(unsigned int i=0;i<lexmat.size();++i){
    if( !lexmat[i] )
      continue; // no entries were allocated for this row
    for(unsigned int j=0;j<lexmat[i]->size();++j){
      const CPPair&x=(*lexmat[i])[j].second;
      const WordIndex e=i;
      const WordIndex f=(*lexmat[i])[j].first;
      // NOTE(review): the filter tests x.prob although x.count is what gets
      // written -- kept as in the original; confirm this is intended.
      if( x.prob>MINCOUNTINCREASE ){
	if( actual )
	  of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.count << '\n';
	else
	  of << e << ' ' << f << ' ' << x.count << '\n';
      }
    }
  }
}
/* Writes the probability column of the lexicon table, one entry per line, in
   the form "<e> <f> <prob>".

   filename : destination file; pass NULL to write to standard output
              (same convention as printCountTable).
   evlist   : vocabulary indexed by the row index of lexmat; its .word is
              printed for <e> when `actual` is true.
   fvlist   : vocabulary indexed by the id stored in each entry; its .word is
              printed for <f> when `actual` is true.
   actual   : true  -> print the word strings,
              false -> print the numeric ids.

   Only entries whose probability exceeds PROB_SMOOTH are written.
   If the file cannot be opened an error is reported on stderr and nothing
   is written.
*/
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printProbTable(const char *filename,
					  const Vector<WordEntry>& evlist,
					  const Vector<WordEntry>& fvlist,
					  const bool actual) const
{
  // A NULL filename must never reach the ofstream: route it to stdout instead.
  ofstream file;
  if( filename ){
    file.open(filename);
    if( !file ){
      cerr << "\nERROR: Cannot open " << filename << "\n";
      return;
    }
  }
  ostream &of = filename ? static_cast<ostream&>(file) : cout;

  for(unsigned int i=0;i<lexmat.size();++i){
    if( !lexmat[i] )
      continue; // no entries were allocated for this row
    for(unsigned int j=0;j<lexmat[i]->size();++j){
      const CPPair&x=(*lexmat[i])[j].second;
      const WordIndex e=i;
      const WordIndex f=(*lexmat[i])[j].first;
      if( x.prob>PROB_SMOOTH ){
	if( actual )
	  of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n';
	else
	  of << e << ' ' << f << ' ' << x.prob << '\n';
      }
    }
  }
}
// Currently a no-op: the body is empty and every argument is ignored (the
// parameters are left unnamed), so no inverse table is written.
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::printProbTableInverse(const char *,
const Vector<WordEntry>&,
const Vector<WordEntry>&,
const double,
const double,
const bool ) const
{
}
/* Turns the accumulated counts of every allocated row of lexmat into
   probabilities and then clears the counts.

   For each row the counts are summed; every entry's prob becomes
   count/sum.  When the sum is zero the row is given a uniform
   distribution (1/number of entries) instead.  The three arguments are
   unused.
*/
template <class COUNT, class PROB>
void tmodel<COUNT, PROB>::normalizeTable(const vcbList&, const vcbList&, int)
{
  for(unsigned int row=0;row<lexmat.size();++row){
    double total=0.0;
    if( !lexmat[row] )
      continue; // nothing stored for this row

    const unsigned int nEntries=lexmat[row]->size();

    // First pass: total count mass of the row.
    for(unsigned int k=0;k<nEntries;++k)
      total+=(*lexmat[row])[k].second.count;

    // Second pass: normalise and reset the counters.
    const bool noMass=( total==0 );
    for(unsigned int k=0;k<nEntries;++k){
      if( noMass )
	(*lexmat[row])[k].second.prob=1.0/(nEntries);
      else
	(*lexmat[row])[k].second.prob=(*lexmat[row])[k].second.count/total;
      (*lexmat[row])[k].second.count=0;
    }
  }
}
/* Reads the t table from a file.
   Each line is of the format:  source_word_id target_word_id p(target_word|source_word)
   This is the inverse operation of the printTable function.
   NAS, 7/11/99

   Every record is stored with insert(src_id, trg_id, 0.0, prob).

   Returns true when the whole file was consumed, false when the file cannot
   be opened or when a malformed / incomplete record is met.  In the latter
   case the records read before the bad one have already been inserted.
*/
template <class COUNT, class PROB>
bool tmodel<COUNT, PROB>::readProbTable(const char *filename){
  ifstream inf(filename);
  cerr << "Reading t prob. table from " << filename << "\n";
  if (!inf) {
    cerr << "\nERROR: Cannot open " << filename << "\n";
    return false;
  }
  WordIndex src_id, trg_id;
  PROB prob;
  int nEntry=0;
  bool wellFormed=true;
  // Read the first field separately so that a clean end of file (no more
  // records) can be told apart from a record that stops half-way.
  while (inf >> src_id) {
    if (!(inf >> trg_id >> prob)) {
      wellFormed=false; // record started but could not be completed
      break;
    }
    insert(src_id, trg_id, 0.0, prob);
    nEntry++;
  }
  // Leaving the loop without reaching end of file means the next token was
  // not a valid source id.
  if (wellFormed && !inf.eof())
    wellFormed=false;
  cerr << "Read " << nEntry << " entries in prob. table.\n";
  if (!wellFormed) {
    cerr << "\nERROR: Malformed entry in " << filename
	 << " after " << nEntry << " entries\n";
    return false;
  }
  return true;
}
// Explicit instantiation: the member functions above are defined in this
// source file, so the tmodel<COUNT,PROB> specialisation is instantiated here.
// COUNT and PROB name concrete types declared outside this file (presumably
// project typedefs -- confirm in the included headers).
template class tmodel<COUNT,PROB> ;
/* ---------------- End of Method Definitions of class tmodel ---------------*/