| /* |
| |
| EGYPT Toolkit for Statistical Machine Translation |
| Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. |
| |
| This program is free software; you can redistribute it and/or |
| modify it under the terms of the GNU General Public License |
| as published by the Free Software Foundation; either version 2 |
| of the License, or (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; if not, write to the Free Software |
| Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| USA. |
| |
| */ |
| #include "mystl.h" |
| #include "model3.h" |
| #include "collCounts.h" |
| #include "utility.h" |
| #include "Globals.h" |
| #include "D5Tables.h" |
| #include "transpair_model5.h" |
| #include "transpair_modelhmm.h" |
| #include "myassert.h" |
| #include "Parameter.h" |
| |
| GLOBAL_PARAMETER(float,PrintN,"nbestalignments","for printing the n best alignments",PARLEV_OUTPUT,0); |
| |
| const short LogHillClimb=0,LogPeg=0; |
| const short UseHMMViterbiAlignmentIfPossible=1; |
| short DoViterbiTraining=0; |
| |
| GLOBAL_PARAMETER(int,VerboseSentence,"VerboseSentence","number of sentence for which a lot of information should be printed (negative: no output)",PARLEV_OUTPUT,-10); |
| GLOBAL_PARAMETER(double,PEGGED_CUTOFF,"PEGGED_CUTOFF","relative cutoff probability for alignment-centers in pegging",PARLEV_OPTHEUR,3e-2); |
| GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF_AL,"COUNTINCREASE CUTOFF AL","countCutoffAl","Counts increment cutoff threshold for alignments in training of fertility models",PARLEV_OPTHEUR,1e-5); |
| |
| //int SentNr; |
| bool UseLinkCache=1; /// optimization for pegging |
| int NumberOfAlignmentsInSophisticatedCountCollection; |
| |
| extern bool ONLYALDUMPS; |
| |
| int PrintHillClimbWarning=0; |
| int PrintZeroScoreWarning=0; |
| |
| |
| LogProb model3::viterbi_model2(const transpair_modelhmm&ef, alignment&output, int |
| #ifdef STORE_HMM_ALIGNMENTS |
| pair_no |
| #endif |
| , int i_peg , int j_peg )const |
| { |
| static Vector<pair<alignment,LogProb> > viterbis; |
| Vector<int>vit; |
| int m=ef.get_m(); |
| int l=ef.get_l(); |
| double ret=0.0; |
| //#define STORE_HMM_ALIGNMENTS |
| #ifdef STORE_HMM_ALIGNMENTS |
| if( i_peg==-1 && j_peg==-1 && viterbis.size()>pair_no ){ |
| output=viterbis[pair_no].first; |
| ret=viterbis[pair_no].second; |
| massert( ret==HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply ); |
| } else{ |
| ret=HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply; |
| for(int j=1;j<=m;j++){ |
| if( vit[j-1]+1>l ) |
| output.set(j,0); |
| else |
| output.set(j,vit[j-1]+1); |
| massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j); |
| } |
| if( i_peg==-1 && j_peg==-1 ){ |
| iassert(viterbis.size()==pair_no); |
| viterbis.push_back(make_pair(output,ret)); |
| } |
| } |
| #else |
| ret=HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply; |
| for(int j=1;j<=m;j++){ |
| if( vit[j-1]+1>l ) |
| output.set(j,0); |
| else |
| output.set(j,vit[j-1]+1); |
| massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j); |
| } |
| #endif |
| massert( j_peg==-1 || int(output(j_peg))==i_peg ); |
| if( j_peg!=-1 ) |
| massert(int(output(j_peg))==i_peg); |
| if( output.valid() ) |
| return ret; |
| else{ |
| return _viterbi_model2(ef,output,i_peg,j_peg); |
| } |
| } |
| |
| LogProb model3::_viterbi_model2(const transpair_model2&ef, alignment&output, int i_peg, int j_peg)const{ |
| WordIndex best_i=0; |
| LogProb ss=1; |
| PositionIndex l = ef.get_l(), m=ef.get_m(); |
| Vector<WordIndex> Fert(l+1, (WordIndex)0); |
| if ((j_peg != -1) && (i_peg != -1)){ |
| output.set(j_peg, i_peg); |
| ss *= ef.get_t(i_peg, j_peg) * ef.get_a(i_peg, j_peg); |
| if( ss==0 ) |
| cerr << "WARNING: already starting is zero: " << ef.get_t(i_peg, j_peg) << " " << ef.get_a(i_peg, j_peg) << '\n'; |
| }else |
| ss=1; |
| for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg){ |
| LogProb score = 0 ; |
| for (PositionIndex i = 0 ; i <= l ; i++){ |
| if( Fert[i]+1<MAX_FERTILITY && (i != 0 || m>=(2 * (Fert[0] + 1)))){ |
| LogProb temp = ef.get_t(i, j) * ef.get_a(i, j); |
| if (temp > score ){ |
| best_i = i ; |
| score = temp ; |
| } |
| } |
| } |
| if (score == 0){ |
| cerr << "WARNING: In searching for model2 best alignment\n"; |
| cerr << "Nothing was set for target token at position j: " << j << "\n"; |
| for (PositionIndex i = 0 ; i <= l ; i++){ |
| cerr << "i: " << i << "ttable("<<i<<", "<<j<<") = " << |
| ef.get_t(i, j) << " atable(" << i<<", "<<j<<", "<< |
| l<<", "<<m<<") = "<< ef.get_a(i, j) << " product " << |
| ef.get_t(i, j) * ef.get_a(i, j) ; |
| if ((Fert[i]+1 < MAX_FERTILITY) && ((i == 0 && (m >= 2*(Fert[0]+1))) |
| || (i != 0))) |
| cerr <<"Passed fertility condition \n"; |
| else |
| cerr <<"Failed fertility condition \n"; |
| } |
| }else{ |
| output.set(j, best_i); |
| Fert[best_i]++; |
| } |
| ss *= score; |
| } |
| if (ss <= 0){ |
| //cerr << ef; |
| cerr << "WARNING: Model2 viterbi alignment has zero score.\n" ; |
| cerr << "Here are the different elements that made this alignment probability zero \n"; |
| cerr << "Source length " << l << " target length " << m << '\n'; |
| LogProb gg=1 ; // for debugging only ..... |
| for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg){ |
| LogProb score = 0 ; |
| LogProb a = 0, t =0 ; |
| for (PositionIndex i = 0 ; i <= l ; i++){ |
| // if( Debug_Fert[i]+1<MAX_FERTILITY && (i != 0 || m>=(2 * (Debug_Fert[0] + 1)))){ |
| LogProb temp = ef.get_t(i, j) * ef.get_a(i, j); |
| if (temp > score ){ |
| score = temp ; |
| best_i = i ; |
| a = ef.get_a(i, j); |
| t = ef.get_t(i, j) ; |
| } |
| // } |
| } |
| gg *= score ; |
| cerr << "best: fs[" << j << "] "<< j <<" : es[" << best_i << "] " << |
| best_i << " , a: " << ef.get_a(best_i, j) << " t: " << t << " score " << score << " product : " << gg << " ss " << |
| ss << '\n'; |
| } |
| for(PositionIndex i = 0 ; i <= l ; i++) |
| cerr << "Fert["<<i<<"] selected " << Fert[i] << '\n'; |
| } |
| massert(output.valid()); |
| return ss; |
| } |
| LogProb model3::viterbi_model2(const transpair_model3&ef, alignment&output, int pair_no,int i_peg , int j_peg )const |
| { |
| if( h&&UseHMMViterbiAlignmentIfPossible ){ |
| transpair_modelhmm efhmm(ef.E,ef.F,tTable,aTable,dTable,nTable,0.0,0.0,h); |
| LogProb ret=viterbi_model2(efhmm,output,pair_no,i_peg,j_peg); |
| massert(output.valid()); |
| return ret; |
| } |
| return _viterbi_model2(ef,output,i_peg,j_peg); |
| } |
| |
| //int HillClimbingSteps=0; |
| |
| template<class TRANSPAIR> |
| LogProb greedyClimb_WithIBM3Scoring(MoveSwapMatrix<TRANSPAIR>&msc2,int& HillClimbingSteps,int j_peg=-1) |
| { |
| PositionIndex l = msc2.get_l(), m=msc2.get_m(); |
| int changed=0; |
| int iter=0; |
| bool hereVERB=0; |
| do |
| { |
| MoveSwapMatrix<typename TRANSPAIR::simpler_transpair_model> msc_IBM3(msc2.get_ef(),alignment(msc2)); |
| vector<pair<double,OneMoveSwap> > msvec; |
| for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg) |
| { |
| WordIndex aj=msc2(j); |
| for (PositionIndex j1 = j + 1 ; j1 <= m; j1++) |
| if((aj != msc2(j1)) && (int(j1) != j_peg)) |
| msvec.push_back(pair<double,OneMoveSwap>(-msc_IBM3.cswap(j,j1),OneMoveSwap(1,j,j1))); |
| for (PositionIndex i = 0 ; i <= l ; i++) |
| if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+1<MAX_FERTILITY) |
| msvec.push_back(pair<double,OneMoveSwap>(-msc_IBM3.cmove(i,j),OneMoveSwap(2,i,j))); |
| } |
| sort(msvec.begin(),msvec.end()); |
| HillClimbingSteps++; |
| int iused=-1; |
| changed=0; |
| for(unsigned int i=0;i<msvec.size()&&changed==0;++i) |
| { |
| LogProb csts; |
| const OneMoveSwap &oms=msvec[i].second; |
| if( oms.type==1&&(csts=msc2.cswap(oms.a,oms.b))>1.0001 ) |
| { |
| if( hereVERB==1 ) |
| cerr << "SWAP: " << csts << '\n'; |
| msc2.doSwap(oms.a,oms.b); |
| changed=1; |
| iused=i; |
| break; |
| } |
| if( oms.type==2&&(csts=msc2.cmove(oms.a,oms.b))>1.0001 ) |
| { |
| if( hereVERB==1 ) |
| cerr << "MOVE: " << csts << '\n'; |
| msc2.doMove(oms.a,oms.b); |
| changed=1; |
| iused=i; |
| break; |
| } |
| } |
| if( ++iter>30 ) |
| { |
| //msc2.ef.verboseTP=1; |
| hereVERB=1; |
| cerr << "ERROR: more than 30 iterations in hill-climbing: " << iused |
| << " improvement: " << msvec[iused].first << " value:" << msvec[iused].second |
| << '\n' << msc2 << '\n'; |
| for(int a=0;a<20;++a) |
| cout << a << ' ' << msvec[a].first << ' ' << msvec[a].second << '\n'; |
| //cerr << msvec << '\n'; |
| } |
| if( iter>50 ) |
| break; |
| } while(changed); |
| return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); |
| } |
| |
| template<class TRANSPAIR> |
| LogProb greedyClimb(MoveSwapMatrix<TRANSPAIR>&msc2, int& HillClimbingSteps, int j_peg = -1) |
| { |
| if( msc2.get_ef().greedyHillClimbing()==1 ) |
| return greedyClimb_WithIBM3Scoring(msc2,HillClimbingSteps,j_peg); |
| PositionIndex l = msc2.get_l(), m=msc2.get_m(); |
| int changed=0; |
| do |
| { |
| HillClimbingSteps++; |
| changed=0; |
| for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg) |
| { |
| WordIndex aj=msc2(j); |
| for (PositionIndex j1 = j + 1 ; j1 <= m; j1++)if((aj != msc2(j1)) && (int(j1) != j_peg)&&msc2.cswap(j, j1) > 1.0) |
| msc2.doSwap(j, j1), changed=1; |
| for (PositionIndex i = 0 ; i <= l ; i++)if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+1<MAX_FERTILITY && msc2.cmove(i, j)>1.0) |
| msc2.doMove(i, j), changed=1; |
| } |
| } while (changed); |
| return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); |
| } |
| |
| template<class TRANSPAIR> |
| LogProb hillClimb_std(MoveSwapMatrix<TRANSPAIR>&msc2, int &HillClimbingSteps,int= -1,int j_peg = -1) |
| { |
| if( msc2.isLazy() ) |
| return greedyClimb_WithIBM3Scoring(msc2,HillClimbingSteps,j_peg); |
| if( LogHillClimb>1 ) |
| cout << msc2 << '\n'; |
| PositionIndex l = msc2.get_l(), m=msc2.get_m(); |
| int changes=0; |
| int best_change_type=-1, best_change_v1=-1, best_change_v2=-1; |
| do |
| { |
| HillClimbingSteps++; |
| LogProb best_change_so_far = 1.00001 ; |
| best_change_type=0; |
| for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg) |
| { |
| WordIndex aj=msc2(j); |
| for (PositionIndex j1 = j + 1 ; j1 <= m; j1++)if((aj != msc2(j1)) && (int(j1) != j_peg)) |
| { |
| LogProb change = msc2.cswap(j, j1); |
| if (change > best_change_so_far) |
| { |
| best_change_so_far = change ; |
| best_change_type=1; |
| best_change_v1=j; |
| best_change_v2=j1; |
| if( LogHillClimb ) |
| cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n'; |
| massert(msc2.get_ef().isSubOptimal()==1); |
| } |
| } |
| for (PositionIndex i = 0 ; i <= l ; i++)if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+1<MAX_FERTILITY) |
| { |
| LogProb change = msc2.cmove(i, j); |
| if (change > best_change_so_far) |
| { |
| best_change_so_far = change ; |
| best_change_type=2; |
| best_change_v1=j; |
| best_change_v2=i; |
| if( LogHillClimb ) |
| cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n'; |
| massert(msc2.get_ef().isSubOptimal()==1); |
| } |
| } |
| } |
| if (best_change_type==1) |
| { |
| msc2.doSwap(best_change_v1, best_change_v2); |
| if( LogHillClimb ) |
| cerr << "SW-CLIMB-DONE: " << j_peg << msc2 << '\n'; |
| } |
| if (best_change_type==2) |
| { |
| msc2.doMove(best_change_v2, best_change_v1); |
| if( LogHillClimb ) |
| cerr << "MO-CLIMB-DONE: " << j_peg << msc2 << '\n'; |
| } |
| changes++; |
| if( changes>40 ) |
| { |
| if( PrintHillClimbWarning++<1000 ) |
| cerr << "WARNING: already " << changes << " iterations in hillclimb: " << best_change_so_far << " " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << '\n'; |
| else if (PrintHillClimbWarning==1000) |
| cerr << "ERROR: too many hill climbing warnings => I do not print more.\n"; |
| } |
| if(changes>60 ) |
| { |
| cerr << msc2 << '\n'; |
| break; |
| } |
| } while (best_change_type); |
| return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); |
| } |
| |
| template<class MODEL_TYPE> |
| bool extendCenterList(Vector<pair<MoveSwapMatrix<MODEL_TYPE>*,LogProb> >&setOfGoodCenters,MoveSwapMatrix<MODEL_TYPE> *msc,double peggedAlignmentScore) |
| { |
| unsigned int l=msc->get_ef().get_l(); |
| set<OneMoveSwap> alreadyCovered; |
| for(unsigned int nr=0;nr<setOfGoodCenters.size();nr++) |
| makeOneMoveSwap(*setOfGoodCenters[nr].first,*msc,alreadyCovered); |
| for(set<OneMoveSwap>::const_iterator i=alreadyCovered.begin();i!=alreadyCovered.end();++i) |
| { |
| if( i->type==1||i->type==4) |
| msc->delCenter(); |
| if( i->type==1 ) |
| { |
| for(unsigned int ii=0;ii<=l;++ii) |
| if( (*msc)(i->a)!=ii ) |
| msc->delMove(ii,i->a); |
| } |
| else if( i->type==2||i->type==4 ) |
| msc->delSwap(i->a,i->b); |
| else if( i->type==3 ) |
| msc->delMove(i->b,i->a); |
| else abort(); |
| } |
| setOfGoodCenters.push_back(make_pair(msc,peggedAlignmentScore)); |
| return 1; |
| } |
| |
| bool OldLog=0; |
| short OldLogPeg=0,OldLogHillClimb=0; |
| class Als |
| { |
| public: |
| int s,a,b; |
| double v; |
| Als(int _s,int _a,int _b,double _v) |
| : s(_s),a(_a),b(_b),v(_v) {} |
| }; |
| |
| inline bool operator<(const Als&x,const Als&y) |
| {return x.v>y.v;} |
| |
| template<class MODEL_TYPE, class ADDITIONAL_MODEL_DATA_IN,class ADDITIONAL_MODEL_DATA_OUT> |
| void model3::viterbi_loop_with_tricks(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, |
| bool dump_files, const char* alignfile, |
| bool collect_counts, string model, bool final, |
| ADDITIONAL_MODEL_DATA_IN*dm_in, |
| ADDITIONAL_MODEL_DATA_OUT*dm_out){ |
| ofstream *writeNBestErrorsFile=0; |
| if( (dump_files||FEWDUMPS)&&PrintN&&ReferenceAlignment.size()>0 ) { |
| string x=alignfile+string("NBEST"); |
| writeNBestErrorsFile= new ofstream(x.c_str()); |
| } |
| ofstream *of3=0; |
| PositionIndex i, j, l, m ; |
| ofstream of2; |
| int pair_no; |
| int HillClimbingSteps=0; |
| NumberOfAlignmentsInSophisticatedCountCollection=0; |
| if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) ){ |
| of2.open(alignfile); |
| if(of2.is_open()){ |
| cout << "I will write alignment to " << alignfile << endl; |
| } |
| } |
| /* if(!of2.is_open()){ |
| cerr << "I don't know why you do not let me dump file " << alignfile << endl; |
| }*/ |
| if( dump_files&&PrintN&&final ){ |
| string x=alignfile+string("NBEST"); |
| of3= new ofstream(x.c_str()); |
| } |
| pair_no = 0 ; // sentence pair number |
| // for each sentence pair in the corpus |
| perp.clear() ; // clears cross_entrop & perplexity |
| viterbiPerp.clear() ; // clears cross_entrop & perplexity |
| sentPair sent ; |
| int NCenter=0,NHillClimbed=0,NAlignment=0,NTotal=0,NBetterByPegging=0; |
| while(sHandler1.getNextSentence(sent)){ |
| if( sent.eSent.size()==1||sent.fSent.size()==1 ) |
| continue; |
| // SentNr=sent.sentenceNo; |
| Vector<WordIndex>& es = sent.eSent; |
| Vector<WordIndex>& fs = sent.fSent; |
| const float count = sent.getCount(); |
| if ((sent.sentenceNo % 10000) == 0) |
| cerr <<sent.sentenceNo << '\n'; |
| time_t sent_s = time(NULL) ; |
| pair_no++ ; |
| l = es.size() - 1 ; |
| m = fs.size() - 1 ; |
| |
| LogProb align_total_count=0; |
| alignment viterbi2alignment(l,m); |
| MODEL_TYPE ef(es,fs,tTable,aTable,dTable,nTable,p1,p0,dm_in); |
| viterbi_model2(ef,viterbi2alignment,pair_no-1); |
| Vector<pair<MoveSwapMatrix<MODEL_TYPE>*,LogProb> >setOfGoodCenters(1); |
| set<alignment> alignments; |
| MoveSwapMatrix<MODEL_TYPE> *best = (setOfGoodCenters[0].first = new MoveSwapMatrix<MODEL_TYPE>(ef, viterbi2alignment)); |
| MoveSwapMatrix<MODEL_TYPE> _viterbi(*best), *viterbi=&_viterbi; // please, don't delete this line (FJO) |
| if( ef.isSubOptimal() ) |
| setOfGoodCenters[0].second = hillClimb_std(*best,HillClimbingSteps); |
| else{ |
| setOfGoodCenters[0].second = best->get_ef().prob_of_target_and_alignment_given_source(*best); |
| if( setOfGoodCenters[0].second==0 ){ |
| cerr << "PROBLEM: alignment is 0.\n"; |
| best->get_ef().prob_of_target_and_alignment_given_source(*best,1); |
| } |
| } |
| int bestAlignment=0; |
| |
| for(unsigned int i=0;i<setOfGoodCenters.size();++i) |
| setOfGoodCenters[i].first->check(); |
| alignments.insert(*best); |
| if (setOfGoodCenters[bestAlignment].second <= 0){ |
| if( PrintZeroScoreWarning++<100 ){ |
| cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; |
| cerr << alignment(*setOfGoodCenters[bestAlignment].first) ; |
| printSentencePair(es, fs, cerr); |
| } |
| else if(PrintZeroScoreWarning==100) { |
| cerr << "ERROR: too many zero score warnings => no additional one will be printed\n"; |
| } |
| setOfGoodCenters[bestAlignment].second=1e-300; |
| continue; |
| } |
| int nHillClimbed=1,nAlignment=1; |
| bool flagBetterByPegging=0; |
| if ( Peg ){ |
| const MoveSwapMatrix<MODEL_TYPE> *useMatrix=viterbi; // it is faster using 'best', ... (FJO) |
| Array2<short, vector<short> > linkCache(l+1, m+1, false); |
| if(UseLinkCache)for(unsigned int j=1;j<=m;j++)linkCache((*useMatrix)(j), j)=1; |
| for(PositionIndex j=1;j<=m;j++)for(PositionIndex i=0;i<=l;i++){ |
| nAlignment++; |
| if( i!=(*useMatrix)(j) && (UseLinkCache==0||linkCache(i,j)==0) && |
| ef.get_t(i,j)>ef.get_t((*useMatrix)(j),j)*PEGGED_CUTOFF && |
| (i != 0 || (m >= 2 * (useMatrix->fert(0)+1)))){ |
| MoveSwapMatrix<MODEL_TYPE> *BESTPEGGED=0; |
| LogProb peggedAlignmentScore; |
| nHillClimbed++; |
| if( ef.isSubOptimal() ){ |
| BESTPEGGED = new MoveSwapMatrix<MODEL_TYPE>(*useMatrix); |
| BESTPEGGED->doMove(i, j); |
| peggedAlignmentScore= hillClimb_std(*BESTPEGGED,HillClimbingSteps, i,j); |
| }else{ |
| alignment pegAlignment(l,m); |
| peggedAlignmentScore=viterbi_model2(ef,pegAlignment,pair_no-1,i,j); |
| BESTPEGGED = new MoveSwapMatrix<MODEL_TYPE>(ef,pegAlignment); |
| massert( pegAlignment(j)==i ); |
| } |
| if(UseLinkCache) |
| for(unsigned int j=1;j<=m;j++) |
| linkCache((*BESTPEGGED)(j), j)=1; |
| if( peggedAlignmentScore>setOfGoodCenters[bestAlignment].second*(LogProb)PEGGED_CUTOFF && alignments.count(*BESTPEGGED)==0 ){ |
| if(extendCenterList(setOfGoodCenters,BESTPEGGED,peggedAlignmentScore)){ |
| alignments.insert(*BESTPEGGED); |
| if( peggedAlignmentScore>1.00001*setOfGoodCenters[bestAlignment].second ){ |
| if( LogPeg ){ |
| cerr << "found better alignment by pegging " << pair_no << " " << peggedAlignmentScore/setOfGoodCenters[bestAlignment].second << '\n'; |
| cerr << "NEW BEST: " << alignment(*BESTPEGGED); |
| cerr << "OLD : " << alignment(*setOfGoodCenters[bestAlignment].first); |
| } |
| flagBetterByPegging=1; |
| bestAlignment=alignments.size()-1; |
| } |
| } |
| assert( differences(*BESTPEGGED, *best)!=0 ); |
| BESTPEGGED=0; |
| }else |
| delete BESTPEGGED; |
| } |
| } |
| } // end of if(Peg) |
| NBetterByPegging+=flagBetterByPegging; |
| for(unsigned int i=0;i<setOfGoodCenters.size();++i) |
| setOfGoodCenters[i].first->check(); |
| if( LogPeg>1 ) |
| cout << "PEGGED: " << setOfGoodCenters.size() << " HILLCLIMBED:" << nHillClimbed << " TOTAL:" << nAlignment << " alignments." << '\n'; |
| int alTotal=collectCountsOverNeighborhood(setOfGoodCenters,es, fs, tTable, aCountTable, |
| dCountTable, nCountTable, p1_count, p0_count, |
| align_total_count, count, collect_counts, dm_out); |
| if( LogPeg>1 ){ |
| cout << "ALL: " << alTotal << " from " << pow(float(l+1),float(m)) << '\n'; |
| massert(alTotal<=pow(double(l+1),double(m))); |
| } |
| NCenter+=setOfGoodCenters.size();NHillClimbed+=nHillClimbed;NAlignment+=nAlignment;NTotal+=alTotal; |
| perp.addFactor(log(double(align_total_count)), count, l, m,0); |
| viterbiPerp.addFactor(log(double(setOfGoodCenters[bestAlignment].second)), count, l, m,0); |
| massert(log(double(setOfGoodCenters[bestAlignment].second)) <= log(double(align_total_count))); |
| if (dump_files||(FEWDUMPS&&sent.sentenceNo<1000)||(final&&(ONLYALDUMPS)) ) |
| printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, (setOfGoodCenters[bestAlignment].first)->getAlignment(), sent.sentenceNo, |
| setOfGoodCenters[bestAlignment].second); |
| for(unsigned int i=0;i<setOfGoodCenters.size();++i) |
| setOfGoodCenters[i].first->check(); |
| if( of3||(writeNBestErrorsFile&&pair_no<int(ReferenceAlignment.size())) ){ |
| vector<Als> als; |
| for(unsigned int s=0;s<setOfGoodCenters.size();++s){ |
| const MoveSwapMatrix<MODEL_TYPE>&msc= *setOfGoodCenters[s].first; |
| msc.check(); |
| double normalized_ascore=setOfGoodCenters[s].second; |
| if( !msc.isCenterDeleted() ) |
| als.push_back( Als(s,0,0,normalized_ascore) ); |
| |
| for(WordIndex j=1;j<=m;j++) |
| for(WordIndex i=0;i<=l;i++) |
| if( i!=msc(j)&& !msc.isDelMove(i,j) ) |
| als.push_back( Als(s,i,j,msc.cmove(i,j)*normalized_ascore)); |
| for(PositionIndex j1=1;j1<=m;j1++) |
| for(PositionIndex j2=j1+1;j2<=m;j2++) |
| if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) ) |
| als.push_back( Als(s,-j1,-j2,msc.cswap(j1,j2)*normalized_ascore)); |
| } |
| sort(als.begin(),als.end()); |
| double sum=0,sum2=0; |
| for(unsigned int i=0;i<als.size();++i) |
| sum+=als[i].v; |
| for(unsigned int i=0;i<min((unsigned int)als.size(),(unsigned int)PrintN);++i){ |
| alignment x=*setOfGoodCenters[als[i].s].first; |
| if( !(als[i].a==0 && als[i].b==0) ){ |
| if( als[i].a<=0&&als[i].b<=0 ) |
| x.doSwap(-als[i].a,-als[i].b); |
| else |
| x.doMove(als[i].a,als[i].b); |
| } |
| if( of3&&i<PrintN ) |
| printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(),*of3,x.getAlignment(), sent.sentenceNo, |
| als[i].v/sum*count); |
| sum2+=als[i].v; |
| if( writeNBestErrorsFile ){ |
| if( pair_no<int(ReferenceAlignment.size()) ){ |
| int ALmissing=0,ALtoomuch=0,ALeventsMissing=0,ALeventsToomuch=0; |
| vector<double> scores; |
| ErrorsInAlignment(ReferenceAlignment[pair_no-1],x.getAlignment(),l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no); |
| ef.computeScores(x,scores); |
| *writeNBestErrorsFile << ALmissing+ALtoomuch << ' '; |
| for(unsigned int i=0;i<scores.size();++i) |
| *writeNBestErrorsFile << ((scores[i]>0.0)?(-log(scores[i])):1.0e6) << ' '; |
| *writeNBestErrorsFile << '\n'; |
| } |
| } |
| } |
| if( writeNBestErrorsFile ) |
| *writeNBestErrorsFile << '\n'; |
| } |
| addAL((setOfGoodCenters[bestAlignment].first)->getAlignment(),sent.sentenceNo,l); |
| for(unsigned int i=0;i<setOfGoodCenters.size();i++) |
| delete setOfGoodCenters[i].first; |
| double period = difftime(time(NULL), sent_s); |
| if (Verbose) |
| cerr << "processing this sentence pair took : " << period |
| << " seconds\n"; |
| |
| } /* of sentence pair E, F */ |
| //sHandler1.rewind(); |
| if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) ) |
| of2.close(); |
| delete of3; |
| delete writeNBestErrorsFile; |
| double FSent=pair_no; |
| cout << "#centers(pre/hillclimbed/real): " << NAlignment/FSent << " " << NHillClimbed/FSent << " " << NCenter/FSent << " #al: " << NTotal/FSent << " #alsophisticatedcountcollection: " << NumberOfAlignmentsInSophisticatedCountCollection/FSent << " #hcsteps: " << HillClimbingSteps/FSent << '\n'; |
| cout << "#peggingImprovements: " << NBetterByPegging/FSent << '\n'; |
| } |
| |
| /*Perform only one step of viterbi alignment*/ |
| #if 0 |
| template<class MODEL_TYPE, class ADDITIONAL_MODEL_DATA_IN,class ADDITIONAL_MODEL_DATA_OUT> |
| void model3::viterbi_loop_with_tricks_1(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, |
| bool dump_files, const char* alignfile, |
| bool collect_counts, string model, bool final, |
| ADDITIONAL_MODEL_DATA_IN*dm_in, |
| ADDITIONAL_MODEL_DATA_OUT*dm_out){ |
| ofstream *writeNBestErrorsFile=0; |
| if( (dump_files||FEWDUMPS)&&PrintN&&ReferenceAlignment.size()>0 ) { |
| string x=alignfile+string("NBEST"); |
| writeNBestErrorsFile= new ofstream(x.c_str()); |
| } |
| ofstream *of3=0; |
| ofstream of2; |
| int pair_no; |
| HillClimbingSteps=0; |
| NumberOfAlignmentsInSophisticatedCountCollection=0; |
| if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) ) |
| of2.open(alignfile); |
| if( dump_files&&PrintN&&final ){ |
| string x=alignfile+string("NBEST"); |
| of3= new ofstream(x.c_str()); |
| } |
| pair_no = 0 ; // sentence pair number |
| // for each sentence pair in the corpus |
| perp.clear() ; // clears cross_entrop & perplexity |
| viterbiPerp.clear() ; // clears cross_entrop & perplexity |
| sentPair sent ; |
| int NCenter=0,NHillClimbed=0,NAlignment=0,NTotal=0,NBetterByPegging=0; |
| while(sHandler1.getNextSentence(sent)){ |
| if( sent.eSent.size()==1||sent.fSent.size()==1 ) |
| continue; |
| SentNr=sent.sentenceNo; |
| Vector<WordIndex>& es = sent.eSent; |
| Vector<WordIndex>& fs = sent.fSent; |
| const float count = sent.getCount(); |
| if ((sent.sentenceNo % 10000) == 0) |
| cerr <<sent.sentenceNo << '\n'; |
| time_t sent_s = time(NULL) ; |
| pair_no++ ; |
| l = es.size() - 1 ; |
| m = fs.size() - 1 ; |
| if (Log){ |
| logmsg << "Processing sentence pair:\n\t"; |
| printSentencePair(es, fs, logmsg); |
| for (i = 0 ; i <= l ; i++) |
| logmsg << Elist.getVocabList()[es[i]].word << " "; |
| logmsg << "\n\t"; |
| for (j = 1 ; j <= m ; j++) |
| logmsg << Flist.getVocabList()[fs[j]].word << " "; |
| logmsg << "\n"; |
| } |
| |
| LogProb align_total_count=0; |
| alignment viterbi2alignment(l,m); |
| MODEL_TYPE ef(es,fs,tTable,aTable,dTable,nTable,p1,p0,dm_in); |
| viterbi_model2(ef,viterbi2alignment,pair_no-1); |
| Vector<pair<MoveSwapMatrix<MODEL_TYPE>*,LogProb> >setOfGoodCenters(1); |
| set<alignment> alignments; |
| MoveSwapMatrix<MODEL_TYPE> *best = (setOfGoodCenters[0].first = new MoveSwapMatrix<MODEL_TYPE>(ef, viterbi2alignment)); |
| MoveSwapMatrix<MODEL_TYPE> _viterbi(*best), *viterbi=&_viterbi; // please, don't delete this line (FJO) |
| if (Log) |
| logmsg << "VITERBI: " << alignment(_viterbi); |
| if( ef.isSubOptimal() ) |
| setOfGoodCenters[0].second = hillClimb_std(*best); |
| else{ |
| setOfGoodCenters[0].second = best->get_ef().prob_of_target_and_alignment_given_source(*best); |
| if( setOfGoodCenters[0].second==0 ){ |
| cerr << "PROBLEM: alignment is 0.\n"; |
| best->get_ef().prob_of_target_and_alignment_given_source(*best,1); |
| } |
| } |
| int bestAlignment=0; |
| |
| for(unsigned int i=0;i<setOfGoodCenters.size();++i) |
| setOfGoodCenters[i].first->check(); |
| alignments.insert(*best); |
| if (setOfGoodCenters[bestAlignment].second <= 0){ |
| if( PrintZeroScoreWarning++<100 ){ |
| cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; |
| cerr << alignment(*setOfGoodCenters[bestAlignment].first) ; |
| printSentencePair(es, fs, cerr); |
| if(Log){ |
| logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; |
| printSentencePair(es, fs, logmsg); |
| } |
| } |
| else if(PrintZeroScoreWarning==100) { |
| cerr << "ERROR: too many zero score warnings => no additional one will be printed\n"; |
| } |
| setOfGoodCenters[bestAlignment].second=1e-300; |
| continue; |
| } |
| int nHillClimbed=1,nAlignment=1; |
| bool flagBetterByPegging=0; |
| if ( Peg ){ |
| const MoveSwapMatrix<MODEL_TYPE> *useMatrix=viterbi; // it is faster using 'best', ... (FJO) |
| Array2<short, vector<short> > linkCache(l+1, m+1, false); |
| if(UseLinkCache)for(unsigned int j=1;j<=m;j++)linkCache((*useMatrix)(j), j)=1; |
| for(PositionIndex j=1;j<=m;j++)for(PositionIndex i=0;i<=l;i++){ |
| nAlignment++; |
| if( i!=(*useMatrix)(j) && (UseLinkCache==0||linkCache(i,j)==0) && |
| ef.get_t(i,j)>ef.get_t((*useMatrix)(j),j)*PEGGED_CUTOFF && |
| (i != 0 || (m >= 2 * (useMatrix->fert(0)+1)))){ |
| MoveSwapMatrix<MODEL_TYPE> *BESTPEGGED=0; |
| LogProb peggedAlignmentScore; |
| nHillClimbed++; |
| if( ef.isSubOptimal() ){ |
| BESTPEGGED = new MoveSwapMatrix<MODEL_TYPE>(*useMatrix); |
| BESTPEGGED->doMove(i, j); |
| peggedAlignmentScore= hillClimb_std(*BESTPEGGED, i,j); |
| }else{ |
| alignment pegAlignment(l,m); |
| peggedAlignmentScore=viterbi_model2(ef,pegAlignment,pair_no-1,i,j); |
| BESTPEGGED = new MoveSwapMatrix<MODEL_TYPE>(ef,pegAlignment); |
| massert( pegAlignment(j)==i ); |
| } |
| if(UseLinkCache) |
| for(unsigned int j=1;j<=m;j++) |
| linkCache((*BESTPEGGED)(j), j)=1; |
| if( peggedAlignmentScore>setOfGoodCenters[bestAlignment].second*(LogProb)PEGGED_CUTOFF && alignments.count(*BESTPEGGED)==0 ){ |
| if(extendCenterList(setOfGoodCenters,BESTPEGGED,peggedAlignmentScore)){ |
| alignments.insert(*BESTPEGGED); |
| if( peggedAlignmentScore>1.00001*setOfGoodCenters[bestAlignment].second ){ |
| if( LogPeg ){ |
| cerr << "found better alignment by pegging " << pair_no << " " << peggedAlignmentScore/setOfGoodCenters[bestAlignment].second << '\n'; |
| cerr << "NEW BEST: " << alignment(*BESTPEGGED); |
| cerr << "OLD : " << alignment(*setOfGoodCenters[bestAlignment].first); |
| } |
| flagBetterByPegging=1; |
| bestAlignment=alignments.size()-1; |
| } |
| } |
| assert( differences(*BESTPEGGED, *best)!=0 ); |
| BESTPEGGED=0; |
| }else |
| delete BESTPEGGED; |
| } |
| } |
| } // end of if(Peg) |
| NBetterByPegging+=flagBetterByPegging; |
| for(unsigned int i=0;i<setOfGoodCenters.size();++i) |
| setOfGoodCenters[i].first->check(); |
| if( LogPeg>1 ) |
| cout << "PEGGED: " << setOfGoodCenters.size() << " HILLCLIMBED:" << nHillClimbed << " TOTAL:" << nAlignment << " alignments." << '\n'; |
| int alTotal=collectCountsOverNeighborhood(setOfGoodCenters,es, fs, tTable, aCountTable, |
| dCountTable, nCountTable, p1_count, p0_count, |
| align_total_count, count, collect_counts, dm_out); |
| if( LogPeg>1 ){ |
| cout << "ALL: " << alTotal << " from " << pow(float(l+1),float(m)) << '\n'; |
| massert(alTotal<=pow(double(l+1),double(m))); |
| } |
| NCenter+=setOfGoodCenters.size();NHillClimbed+=nHillClimbed;NAlignment+=nAlignment;NTotal+=alTotal; |
| perp.addFactor(log(double(align_total_count)), count, l, m,0); |
| viterbiPerp.addFactor(log(double(setOfGoodCenters[bestAlignment].second)), count, l, m,0); |
| massert(log(double(setOfGoodCenters[bestAlignment].second)) <= log(double(align_total_count))); |
| if (dump_files||(FEWDUMPS&&sent.sentenceNo<1000)||(final&&(ONLYALDUMPS)) ) |
| printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, (setOfGoodCenters[bestAlignment].first)->getAlignment(), sent.sentenceNo, |
| setOfGoodCenters[bestAlignment].second); |
| for(unsigned int i=0;i<setOfGoodCenters.size();++i) |
| setOfGoodCenters[i].first->check(); |
| if( of3||(writeNBestErrorsFile&&pair_no<int(ReferenceAlignment.size())) ){ |
| vector<Als> als; |
| for(unsigned int s=0;s<setOfGoodCenters.size();++s){ |
| const MoveSwapMatrix<MODEL_TYPE>&msc= *setOfGoodCenters[s].first; |
| msc.check(); |
| double normalized_ascore=setOfGoodCenters[s].second; |
| if( !msc.isCenterDeleted() ) |
| als.push_back( Als(s,0,0,normalized_ascore) ); |
| |
| for(WordIndex j=1;j<=m;j++) |
| for(WordIndex i=0;i<=l;i++) |
| if( i!=msc(j)&& !msc.isDelMove(i,j) ) |
| als.push_back( Als(s,i,j,msc.cmove(i,j)*normalized_ascore)); |
| for(PositionIndex j1=1;j1<=m;j1++) |
| for(PositionIndex j2=j1+1;j2<=m;j2++) |
| if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) ) |
| als.push_back( Als(s,-j1,-j2,msc.cswap(j1,j2)*normalized_ascore)); |
| } |
| sort(als.begin(),als.end()); |
| double sum=0,sum2=0; |
| for(unsigned int i=0;i<als.size();++i) |
| sum+=als[i].v; |
| for(unsigned int i=0;i<min((unsigned int)als.size(),(unsigned int)PrintN);++i){ |
| alignment x=*setOfGoodCenters[als[i].s].first; |
| if( !(als[i].a==0 && als[i].b==0) ){ |
| if( als[i].a<=0&&als[i].b<=0 ) |
| x.doSwap(-als[i].a,-als[i].b); |
| else |
| x.doMove(als[i].a,als[i].b); |
| } |
| if( of3&&i<PrintN ) |
| printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(),*of3,x.getAlignment(), sent.sentenceNo, |
| als[i].v/sum*count); |
| sum2+=als[i].v; |
| if( writeNBestErrorsFile ){ |
| if( pair_no<int(ReferenceAlignment.size()) ){ |
| int ALmissing=0,ALtoomuch=0,ALeventsMissing=0,ALeventsToomuch=0; |
| vector<double> scores; |
| ErrorsInAlignment(ReferenceAlignment[pair_no-1],x.getAlignment(),l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no); |
| ef.computeScores(x,scores); |
| *writeNBestErrorsFile << ALmissing+ALtoomuch << ' '; |
| for(unsigned int i=0;i<scores.size();++i) |
| *writeNBestErrorsFile << ((scores[i]>0.0)?(-log(scores[i])):1.0e6) << ' '; |
| *writeNBestErrorsFile << '\n'; |
| } |
| } |
| } |
| if( writeNBestErrorsFile ) |
| *writeNBestErrorsFile << '\n'; |
| } |
| addAL((setOfGoodCenters[bestAlignment].first)->getAlignment(),sent.sentenceNo,l); |
| for(unsigned int i=0;i<setOfGoodCenters.size();i++) |
| delete setOfGoodCenters[i].first; |
| double period = difftime(time(NULL), sent_s); |
| if (Verbose) |
| cerr << "processing this sentence pair took : " << period |
| << " seconds\n"; |
| |
| } /* of sentence pair E, F */ |
| //sHandler1.rewind(); |
| if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) ) |
| of2.close(); |
| delete of3; |
| delete writeNBestErrorsFile; |
| double FSent=pair_no; |
| cout << "#centers(pre/hillclimbed/real): " << NAlignment/FSent << " " << NHillClimbed/FSent << " " << NCenter/FSent << " #al: " << NTotal/FSent << " #alsophisticatedcountcollection: " << NumberOfAlignmentsInSophisticatedCountCollection/FSent << " #hcsteps: " << HillClimbingSteps/FSent << '\n'; |
| cout << "#peggingImprovements: " << NBetterByPegging/FSent << '\n'; |
| } |
| #endif |
| |
| |
| |
| #include "collCounts.cpp" |
| #define INSTANTIATE(A,B,C) template \ |
| void model3::viterbi_loop_with_tricks<A,B,C>(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, \ |
| bool dump_files, const char* alignfile,bool collect_counts, string, bool final,\ |
| B*d4m,C*d5m); |
| |
| INSTANTIATE(transpair_model3, void, void); |
| INSTANTIATE(transpair_modelhmm, const hmm, void); |
| INSTANTIATE(transpair_modelhmm, const hmm, d4model); |
| INSTANTIATE(transpair_modelhmm, const hmm, d5model); |
| INSTANTIATE(transpair_model3, void,d4model); |
| INSTANTIATE(transpair_model3, void,d5model); |
| INSTANTIATE(transpair_model4, d4model,d4model); |
| INSTANTIATE(transpair_model4, d4model,d5model); |
| INSTANTIATE(transpair_model5, d5model,d5model); |