| /* |
| |
| EGYPT Toolkit for Statistical Machine Translation |
| Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. |
| |
| This program is free software; you can redistribute it and/or |
| modify it under the terms of the GNU General Public License |
| as published by the Free Software Foundation; either version 2 |
| of the License, or (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; if not, write to the Free Software |
| Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| USA. |
| |
| */ |
| /* --------------------------------------------------------------------------* |
| * * |
| * Module : getSentence * |
| * * |
| * Method Definitions File: getSentence.cc * |
| * * |
| * Objective: Defines classes and methods for handling I/O for the parallel * |
| * corpus. * |
| *****************************************************************************/ |
| |
| |
| #include "getSentence.h" |
| #include <iostream> |
| #include <strstream> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <boost/algorithm/string.hpp> |
| #include <vector> |
| #include <set> |
| #include <pthread.h> |
| #include "Parameter.h" |
| #include "errno.h" |
| |
| int PrintedTooLong=0; |
| |
| /* -------------- Method Defnitions for Class sentenceHandler ---------------*/ |
| |
| GLOBAL_PARAMETER(double,ManlexMAX_MULTIPLICITY,"manlexMAX_MULTIPLICITY","",PARLEV_EM,20.0); |
| GLOBAL_PARAMETER(double,Manlexfactor1,"manlexfactor1","",PARLEV_EM,0.0); |
| GLOBAL_PARAMETER(double,Manlexfactor2,"manlexfactor2","",PARLEV_EM,0.0); |
| |
| sentenceHandler::sentenceHandler(const char* filename, vcbList* elist, |
| vcbList* flist) : realCount(0) |
| // This method is the constructor of the class, it also intitializes the |
| // sentence pair sequential number (count) to zero. |
| { |
| readsent_mutex=new boost::mutex(); |
| setprob_mutex = new boost::mutex(); |
| |
| position = 0; |
| readflag = false ; |
| allInMemory = false ; |
| inputFilename = filename ; |
| inputFile = new ifstream(filename); |
| pair_no = 0 ; |
| if(!(*inputFile)){ |
| cerr << "\nERROR:(a) Cannot open " << filename; |
| exit(1); |
| } |
| currentSentence = 0; |
| totalPairs1 = 0 ; |
| totalPairs2 =0; |
| pair_no = 0 ; |
| noSentInBuffer = 0 ; |
| Buffer.clear(); |
| bool isNegative=0; |
| std::set<WordIndex> evoc,fvoc; |
| evoc.insert(0); |
| fvoc.insert(0); |
| if (elist && flist){ |
| cout << "Calculating vocabulary frequencies from corpus " << filename << '\n'; |
| sentPair s ; |
| while (getNextSentence(s, elist, flist)) |
| { |
| for(int i = 0 ; i< s.eSent.size() ; i++){ |
| evoc.insert(s.eSent[i]); |
| } |
| for(int i = 0 ; i< s.fSent.size() ; i++){ |
| fvoc.insert(s.fSent[i]); |
| } |
| totalPairs1++; |
| totalPairs2+=s.realCount; |
| // NOTE: this value might change during training |
| // for words from the manual dictionary, yet this is ignored! |
| |
| if( s.noOcc<0 ) |
| isNegative=1; |
| } |
| } |
| if( isNegative==1 ) |
| { |
| cerr << "WARNING: corpus contains negative occurrency frequencies => these are interpreted as entries of a manual dictionary.\n"; |
| realCount=new Vector<double>(totalPairs1,1.0); |
| } |
| else |
| realCount=0; |
| elist->compact(evoc); |
| flist->compact(fvoc); |
| } |
| |
| sentenceHandler::sentenceHandler(const char* filename, vcbList* elist, |
| vcbList* flist,std::set<WordIndex>& eapp, std::set<WordIndex>& fapp) : realCount(0) |
| // This method is the constructor of the class, it also intitializes the |
| // sentence pair sequential number (count) to z |
| { |
| readsent_mutex=new boost::mutex(); |
| setprob_mutex=new boost::mutex(); |
| position = 0; |
| readflag = false ; |
| allInMemory = false ; |
| inputFilename = filename ; |
| inputFile = new ifstream(filename); |
| pair_no = 0 ; |
| if(!(*inputFile)){ |
| cerr << "\nERROR:(a) Cannot open " << filename; |
| exit(1); |
| } |
| currentSentence = 0; |
| totalPairs1 = 0 ; |
| totalPairs2 =0; |
| pair_no = 0 ; |
| noSentInBuffer = 0 ; |
| Buffer.clear(); |
| bool isNegative=0; |
| if (elist && flist){ |
| cout << "Calculating vocabulary frequencies from corpus " << filename << '\n'; |
| sentPair s ; |
| while (getNextSentence(s, elist, flist)) |
| { |
| int k; |
| for(k=0;k<s.eSent.size();k++){ |
| eapp.insert(s.eSent[k]); |
| } |
| for(k=0;k<s.fSent.size();k++){ |
| fapp.insert(s.fSent[k]); |
| } |
| totalPairs1++; |
| totalPairs2+=s.realCount; |
| // NOTE: this value might change during training |
| // for words from the manual dictionary, yet this is ignored! |
| |
| if( s.noOcc<0 ) |
| isNegative=1; |
| } |
| } |
| if( isNegative==1 ) |
| { |
| cerr << "WARNING: corpus contains negative occurrency frequencies => these are interpreted as entries of a manual dictionary.\n"; |
| realCount=new Vector<double>(totalPairs1,1.0); |
| } |
| else |
| realCount=0; |
| } |
| |
| void sentenceHandler::rewind() |
| { |
| readsent_mutex->lock(); |
| position = 0; |
| currentSentence = 0; |
| readflag = false ; |
| if (!allInMemory || |
| !(Buffer.size() >= 1 && Buffer[currentSentence].sentenceNo == 1)){ |
| // check if the buffer doe not already has the first chunk of pairs |
| if (Buffer.size() > 0) |
| cerr << ' ' << Buffer[currentSentence].sentenceNo << '\n'; |
| // totalPairs = 0 ; |
| pair_no = 0 ; |
| noSentInBuffer = 0 ; |
| Buffer.clear(); |
| } |
| if (!allInMemory){ |
| delete inputFile; |
| inputFile = new ifstream(inputFilename); |
| if(!(*inputFile)){ |
| cerr << "\nERROR:(b) Cannot open " << inputFilename << " " << (int)errno; |
| } |
| } |
| readsent_mutex->unlock(); |
| } |
| |
| |
| int sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist) |
| { |
| readsent_mutex->lock(); |
| |
| do{ |
| sentPair s ; |
| if (readflag){ |
| cerr << "Attempting to read from the end of corpus, rewinding\n"; |
| //rewind(); |
| break; |
| } |
| if (currentSentence >= noSentInBuffer){ |
| if (allInMemory) |
| break; |
| /* no more sentences in buffer */ |
| noSentInBuffer = 0 ; |
| currentSentence = 0 ; |
| Buffer.clear(); |
| cout << "Reading more sentence pairs into memory ... \n"; |
| while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){ |
| if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){ |
| cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<< |
| "the maximum allowed limit for a source word fertility\n"<< |
| " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 << |
| " ratio " << double(s.fSent.size()-1)/ (s.eSent.size()-1) << " ferility limit : " << |
| MAX_FERTILITY-1 << '\n'; |
| cerr << "Shortening sentence \n"; |
| cerr << s; |
| s.eSent.resize(min(s.eSent.size(),s.fSent.size())); |
| s.fSent.resize(min(s.eSent.size(),s.fSent.size())); |
| } |
| Buffer.push_back(s) ; |
| if (elist && flist){ |
| if ((*elist).size() > 0) |
| for (WordIndex i= 0 ; i < s.eSent.size() ; i++){ |
| if (s.eSent[i] >= (*elist).uniqTokens()){ |
| if( PrintedTooLong++<100) |
| cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n"; |
| exit(-1); |
| } |
| (*elist).incFreq(s.eSent[i], s.realCount); |
| } |
| if ((*flist).size() > 0) |
| for (WordIndex j= 1 ; j < s.fSent.size() ; j++){ |
| if (s.fSent[j] >= (*flist).uniqTokens()){ |
| cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n"; |
| exit(-1); |
| } |
| (*flist).incFreq(s.fSent[j], s.realCount); |
| } |
| } |
| noSentInBuffer++; |
| } |
| if (inputFile->eof()){ |
| allInMemory = (Buffer.size() >= 1 && |
| Buffer[currentSentence].sentenceNo == 1) ; |
| if (allInMemory) |
| cout << "Corpus fits in memory, corpus has: " << Buffer.size() << |
| " sentence pairs.\n"; |
| } |
| } |
| if(noSentInBuffer <= 0 ){ |
| //cerr << "# sent in buffer " << noSentInBuffer << '\n'; |
| readflag = true ; |
| break; |
| } |
| sent = Buffer[currentSentence++] ; |
| position ++; |
| if( sent.noOcc<0 && realCount ){ |
| if( Manlexfactor1 && sent.noOcc==-1.0 ) |
| sent.realCount=Manlexfactor1; |
| else if( Manlexfactor2 && sent.noOcc==-2.0 ) |
| sent.realCount=Manlexfactor2; |
| else |
| sent.realCount=(*realCount)[sent.getSentenceNo()-1]; |
| } |
| readsent_mutex->unlock(); |
| return position ; |
| }while(false); |
| readsent_mutex->unlock(); |
| return 0; |
| } |
| bool sentenceHandler::readNextSentence(sentPair& sent) |
| /* This method reads in a new pair of sentences, each pair is read from the |
| corpus file as line triples. The first line the no of times this line |
| pair occured in the corpus, the second line is the source sentence and |
| the third is the target sentence. The sentences are represented by a space |
| separated positive integer token ids. */ |
| { |
| |
| string line; |
| bool fail(false) ; |
| |
| sent.clear(); |
| vector<string> splits; |
| if (getline(*inputFile, line)){ |
| |
| boost::algorithm::split(splits,line,boost::algorithm::is_any_of("|#*")); |
| |
| if(splits.size() == 1 || splits.size() == 0){ |
| // continue, no problem |
| |
| }else if(splits.size()==3){ |
| line = splits[0]; |
| }else{ |
| fail = true; |
| return false; |
| } |
| |
| istrstream buffer(line.c_str()); |
| buffer >> sent.noOcc; |
| if( sent.noOcc<0 ) |
| { |
| if( realCount ) |
| { |
| if( Manlexfactor1 && sent.noOcc==-1.0 ) |
| sent.realCount=Manlexfactor1; |
| else if( Manlexfactor2 && sent.noOcc==-2.0 ) |
| sent.realCount=Manlexfactor2; |
| else |
| { |
| sent.realCount=(*realCount)[pair_no]; |
| } |
| } |
| else |
| sent.realCount=1.0; |
| } |
| else |
| sent.realCount=sent.noOcc; |
| } |
| else { |
| fail = true ;; |
| } |
| if (splits.size()==3 || getline(*inputFile, line)){ |
| if(splits.size()==3){ |
| line = splits[1]; |
| } |
| istrstream buffer(line.c_str()); |
| WordIndex w; // w is a local variabe for token id |
| sent.eSent.push_back(0); // each source word is assumed to have 0 == |
| // a null word (id 0) at the begining of the sentence. |
| while(buffer>>w){ // read source sentece , word by word . |
| if (sent.eSent.size() < MAX_SENTENCE_LENGTH) |
| sent.eSent.push_back(w); |
| else { |
| if( PrintedTooLong++<100) |
| cerr << "{WARNING:(a)truncated sentence "<<pair_no<<"}"; |
| //cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n'; |
| //cerr << "The following sentence will be truncated\n" << line; |
| break ; |
| } |
| } |
| } |
| else { |
| fail = true ; |
| } |
| if (splits.size()==3 ||getline(*inputFile, line)){ |
| if(splits.size()==3){ |
| line = splits[2]; |
| } |
| istrstream buffer(line.c_str()); |
| WordIndex w; // w is a local variabe for token id |
| sent.fSent.push_back(0); //0 is inserted for program uniformity |
| while(buffer>>w){ // read target sentece , word by word . |
| if (sent.fSent.size() < MAX_SENTENCE_LENGTH) |
| sent.fSent.push_back(w); |
| else { |
| if( PrintedTooLong++<100) |
| cerr << "{WARNING:(b)truncated sentence "<<pair_no<<"}"; |
| //cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n'; |
| //cerr << "The following sentence will be truncated\n" << line; |
| break ; |
| } |
| } |
| } |
| else { |
| fail = true ; |
| } |
| if (fail){ |
| sent.eSent.clear(); |
| sent.fSent.clear(); |
| sent.sentenceNo = 0 ; |
| sent.noOcc = 0 ; |
| sent.realCount=0; |
| return(false); |
| } |
| if( sent.eSent.size()==1||sent.fSent.size()==1 ) |
| cerr << "ERROR: Forbidden zero sentence length " << sent.sentenceNo << endl; |
| sent.sentenceNo = ++pair_no; |
| if(pair_no % 100000 == 0) |
| cout << "[sent:" << sent.sentenceNo << "]"<< '\n'; |
| return true; |
| } |
| |
| double optimize_lambda(Vector<double>&vd) |
| { |
| Vector<double> l; |
| for(double lambda=1.0;lambda<ManlexMAX_MULTIPLICITY;lambda+=0.33) |
| { |
| double prod=0.0; |
| for(unsigned int i=0;i<vd.size();++i) |
| { |
| prod += vd[i]*exp(lambda*vd[i])/(exp(lambda*vd[i])-1.0); |
| } |
| l.push_back(fabs(prod-1.0)); |
| } |
| double lam=double(min_element(l.begin(),l.end())-l.begin())*0.33+1.0; |
| if( lam<1.0 ) |
| { |
| cerr << "ERROR: lambda is smaller than one: " << lam << endl; |
| for(unsigned int i=0;i<vd.size();++i) |
| cerr << vd[i] << ' '; |
| cerr << endl; |
| } |
| return lam; |
| } |
| |
| void sentenceHandler::setProbOfSentence(const sentPair&s,double d) |
| { |
| |
| if( realCount==0 ) |
| return; |
| else{ |
| setprob_mutex->lock(); |
| if( s.noOcc<=0 ) |
| { |
| double ed=exp(d); |
| if( oldPairs.size()>0&&(oldPairs.back().get_eSent()!=s.get_eSent()||oldPairs.back().getSentenceNo()>=s.getSentenceNo()) ) |
| { |
| double lambda=optimize_lambda(oldProbs); |
| for(unsigned int i=0;i<oldPairs.size();++i) |
| { |
| if( oldProbs[i]<1e-5 ) |
| (*realCount)[oldPairs[i].getSentenceNo()-1]=1.0; |
| else |
| (*realCount)[oldPairs[i].getSentenceNo()-1]=lambda*oldProbs[i]/(1-exp(-lambda*oldProbs[i])); |
| } |
| oldPairs.clear(); |
| oldProbs.clear(); |
| } |
| oldPairs.push_back(s); |
| oldProbs.push_back(ed); |
| } |
| setprob_mutex->unlock(); |
| } |
| } |
| |
| /* ------------- End of Method Definition of Class sentenceHandler ----------*/ |
| |
| |
| |
| |
| |
| |