/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.joshua.adagrad;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import org.apache.joshua.corpus.Vocabulary;
import org.apache.joshua.metrics.EvaluationMetric;
/** Implements the AdaGrad algorithm (Duchi et al. 2011) for tuning feature weights. */
public class Optimizer {
public Optimizer(Vector<String> _output, boolean[] _isOptimizable, double[] _initialLambda,
HashMap<String, String>[] _feat_hash, HashMap<String, String>[] _stats_hash) {
output = _output; // (not used for now)
isOptimizable = _isOptimizable;
initialLambda = _initialLambda; // initial weights array (index 0 is unused)
paramDim = initialLambda.length - 1;
feat_hash = _feat_hash; // feature hash table
stats_hash = _stats_hash; // suff. stats hash table
finalLambda = new double[initialLambda.length];
System.arraycopy(initialLambda, 0, finalLambda, 0, finalLambda.length);
}
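/*
 * Sketch of the per-coordinate AdaGrad update performed in runOptimizer
 * (cf. Duchi et al. 2011, Eq. 23, referenced below). With gradient g and
 * accumulated magnitude H_ii = sqrt(sum of g^2 over past updates), the
 * unregularized step for weight w is
 *   w <- w - eta * g / H_ii
 * L1 regularization (regularization == 1) soft-thresholds the stepped weight
 * via clip(); L2 regularization (regularization == 2) shrinks it:
 *   w <- (H_ii * w - eta * g) / (lam + H_ii)
 * Both are applied lazily, so a coordinate whose gradient is zero is only
 * brought up to date when it next fires or when the epoch ends.
 */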
//run AdaGrad for adagradIter epochs over the tuning sentences
public double[] runOptimizer() {
List<Integer> sents = new ArrayList<>();
for( int i = 0; i < sentNum; ++i )
sents.add(i);
double[] avgLambda = new double[initialLambda.length]; //only needed if averaging is required; Java zero-initializes
for ( int iter = 0; iter < adagradIter; ++iter ) {
System.arraycopy(finalLambda, 1, initialLambda, 1, paramDim); //carry over the last epoch's weights (used as lazy-update defaults)
if(needShuffle)
Collections.shuffle(sents);
double oraMetric, oraScore, predMetric, predScore;
double[] oraPredScore = new double[4];
double loss = 0;
double diff = 0;
double sumMetricScore = 0;
double sumModelScore = 0;
String oraFeat = "";
String predFeat = "";
String[] oraPredFeat = new String[2];
String[] vecOraFeat;
String[] vecPredFeat;
String[] featInfo;
int numBatch = 0;
int numUpdate = 0;
Iterator<Integer> it;
Integer diffFeatId;
//update weights
Integer s;
int sentCount = 0;
double prevLambda = 0;
double diffFeatVal = 0;
double oldVal = 0;
double gdStep = 0;
double Hii = 0;
double gradiiSquare = 0;
int lastUpdateTime = 0;
HashMap<Integer, Integer> lastUpdate = new HashMap<>();
HashMap<Integer, Double> lastVal = new HashMap<>();
HashMap<Integer, Double> H = new HashMap<>();
while( sentCount < sentNum ) {
loss = 0;
++numBatch;
HashMap<Integer, Double> featDiff = new HashMap<>();
for(int b = 0; b < batchSize; ++b ) {
//find out oracle and prediction
s = sents.get(sentCount);
findOraPred(s, oraPredScore, oraPredFeat, finalLambda, featScale);
//the model scores here are already scaled in findOraPred
oraMetric = oraPredScore[0];
oraScore = oraPredScore[1];
predMetric = oraPredScore[2];
predScore = oraPredScore[3];
oraFeat = oraPredFeat[0];
predFeat = oraPredFeat[1];
//update the scale
if(needScale) { //otherwise featScale remains 1.0
sumMetricScore += Math.abs(oraMetric + predMetric);
//restore the original model score
sumModelScore += Math.abs(oraScore + predScore) / featScale;
if(sumModelScore/sumMetricScore > scoreRatio)
featScale = sumMetricScore/sumModelScore;
}
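//rationale: this caps the ratio of total |model score| to total |metric score|
//at roughly scoreRatio, keeping the two hinge-loss terms on comparable scales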
vecOraFeat = oraFeat.split("\\s+");
vecPredFeat = predFeat.split("\\s+");
//accumulate difference feature vector
if ( b == 0 ) {
for (String aVecOraFeat : vecOraFeat) {
featInfo = aVecOraFeat.split("=");
diffFeatId = Integer.parseInt(featInfo[0]);
featDiff.put(diffFeatId, Double.parseDouble(featInfo[1]));
}
for (String aVecPredFeat : vecPredFeat) {
featInfo = aVecPredFeat.split("=");
diffFeatId = Integer.parseInt(featInfo[0]);
if (featDiff.containsKey(diffFeatId)) { //overlapping features
diff = featDiff.get(diffFeatId) - Double.parseDouble(featInfo[1]);
if (Math.abs(diff) > 1e-20)
featDiff.put(diffFeatId, diff);
else
featDiff.remove(diffFeatId);
} else //features only firing in the 2nd feature vector
featDiff.put(diffFeatId, -1.0 * Double.parseDouble(featInfo[1]));
}
} else {
for (String aVecOraFeat : vecOraFeat) {
featInfo = aVecOraFeat.split("=");
diffFeatId = Integer.parseInt(featInfo[0]);
if (featDiff.containsKey(diffFeatId)) { //overlapping features
diff = featDiff.get(diffFeatId) + Double.parseDouble(featInfo[1]);
if (Math.abs(diff) > 1e-20)
featDiff.put(diffFeatId, diff);
else
featDiff.remove(diffFeatId);
} else //features only firing in the new oracle feature vector
featDiff.put(diffFeatId, Double.parseDouble(featInfo[1]));
}
for (String aVecPredFeat : vecPredFeat) {
featInfo = aVecPredFeat.split("=");
diffFeatId = Integer.parseInt(featInfo[0]);
if (featDiff.containsKey(diffFeatId)) { //overlapping features
diff = featDiff.get(diffFeatId) - Double.parseDouble(featInfo[1]);
if (Math.abs(diff) > 1e-20)
featDiff.put(diffFeatId, diff);
else
featDiff.remove(diffFeatId);
} else //features only firing in the new prediction feature vector
featDiff.put(diffFeatId, -1.0 * Double.parseDouble(featInfo[1]));
}
}
//remember the model scores here are already scaled
double singleLoss = evalMetric.getToBeMinimized() ?
(predMetric-oraMetric) - (oraScore-predScore)/featScale:
(oraMetric-predMetric) - (oraScore-predScore)/featScale;
if(singleLoss > 0)
loss += singleLoss;
++sentCount;
if( sentCount >= sentNum ) {
break;
}
} //for(int b : batchSize)
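//the accumulated batch loss is a structured hinge: for each sentence,
//max(0, metric gap - model-score gap / featScale) between the hope (oracle)
//and fear (prediction) derivations; only a positive loss triggers an update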
if( loss > 0 ) {
++numUpdate;
//update weights (see Duchi'11, Eq.23. For l1-reg, use lazy update)
Set<Integer> diffFeatSet = featDiff.keySet();
it = diffFeatSet.iterator();
while(it.hasNext()) { //note these are all non-zero gradients!
diffFeatId = it.next();
diffFeatVal = -1.0 * featDiff.get(diffFeatId); //gradient
if( regularization > 0 ) {
lastUpdateTime = lastUpdate.getOrDefault(diffFeatId, 0);
if( lastUpdateTime < numUpdate - 1 ) {
//hasn't been updated (gradient=0) for at least 2 steps
//lazily compute prevLambda now
oldVal = lastVal.getOrDefault(diffFeatId, initialLambda[diffFeatId]);
Hii = H.getOrDefault(diffFeatId, 0.0);
if(Math.abs(Hii) > 1e-20) {
if( regularization == 1 )
prevLambda =
Math.signum(oldVal) * clip( Math.abs(oldVal) - lam * eta * (numUpdate - 1 - lastUpdateTime) / Hii );
else if( regularization == 2 ) {
prevLambda =
Math.pow( Hii/(lam+Hii), (numUpdate - 1 - lastUpdateTime) ) * oldVal;
if(needAvg) { //fill the gap due to lazy update
double prevLambdaCopy = prevLambda;
double scale = Hii/(lam+Hii);
for( int t = 0; t < numUpdate - 1 - lastUpdateTime; ++t ) {
avgLambda[diffFeatId] += prevLambdaCopy;
prevLambdaCopy /= scale;
}
}
}
} else {
if( regularization == 1 )
prevLambda = 0;
else if( regularization == 2 )
prevLambda = oldVal;
}
} else //just updated at last time step or just started
prevLambda = finalLambda[diffFeatId];
if(H.get(diffFeatId) != null) {
gradiiSquare = H.get(diffFeatId);
gradiiSquare *= gradiiSquare;
gradiiSquare += diffFeatVal * diffFeatVal;
Hii = Math.sqrt(gradiiSquare);
} else
Hii = Math.abs(diffFeatVal);
H.put(diffFeatId, Hii);
//update the weight
if( regularization == 1 ) {
gdStep = prevLambda - eta * diffFeatVal / Hii;
finalLambda[diffFeatId] = Math.signum(gdStep) * clip( Math.abs(gdStep) - lam * eta / Hii );
} else if(regularization == 2 ) {
finalLambda[diffFeatId] = (Hii * prevLambda - eta * diffFeatVal) / (lam + Hii);
if(needAvg)
avgLambda[diffFeatId] += finalLambda[diffFeatId];
}
lastUpdate.put(diffFeatId, numUpdate);
lastVal.put(diffFeatId, finalLambda[diffFeatId]);
} else { //if no regularization
if(H.get(diffFeatId) != null) {
gradiiSquare = H.get(diffFeatId);
gradiiSquare *= gradiiSquare;
gradiiSquare += diffFeatVal * diffFeatVal;
Hii = Math.sqrt(gradiiSquare);
} else
Hii = Math.abs(diffFeatVal);
H.put(diffFeatId, Hii);
finalLambda[diffFeatId] = finalLambda[diffFeatId] - eta * diffFeatVal / Hii;
if(needAvg)
avgLambda[diffFeatId] += finalLambda[diffFeatId];
}
} //while(it.hasNext())
} //if(loss > 0)
else { //no loss, therefore the weight update is skipped
//however, the avg weights still need to be accumulated
if( regularization == 0 ) {
for( int i = 1; i < finalLambda.length; ++i )
avgLambda[i] += finalLambda[i];
} else if( regularization == 2 ) {
if(needAvg) {
//due to lazy update, we need to figure out the actual
//weight vector at this point first...
for( int i = 1; i < finalLambda.length; ++i ) {
if( lastUpdate.get(i) != null && lastUpdate.get(i) < numUpdate ) {
oldVal = lastVal.get(i);
Hii = H.get(i);
//lazily compute the decayed weight
avgLambda[i] +=
Math.pow( Hii/(lam+Hii), (numUpdate - lastUpdate.get(i)) ) * oldVal;
} else //weight is current (or never updated): accumulate it once
avgLambda[i] += finalLambda[i];
}
}
}
}
} //while( sentCount < sentNum )
if( regularization > 0 ) {
for( int i = 1; i < finalLambda.length; ++i ) {
//now lazy compute those weights that haven't been taken care of
if( lastUpdate.get(i) == null )
finalLambda[i] = 0;
else if( lastUpdate.get(i) < numUpdate ) {
oldVal = lastVal.get(i);
Hii = H.get(i);
if( regularization == 1 )
finalLambda[i] =
Math.signum(oldVal) * clip( Math.abs(oldVal) - lam * eta * (numUpdate - lastUpdate.get(i)) / Hii );
else if( regularization == 2 ) {
finalLambda[i] =
Math.pow( Hii/(lam+Hii), (numUpdate - lastUpdate.get(i)) ) * oldVal;
if(needAvg) { //fill the gap due to lazy update
double prevLambdaCopy = finalLambda[i];
double scale = Hii/(lam+Hii);
for( int t = 0; t < numUpdate - lastUpdate.get(i); ++t ) {
avgLambda[i] += prevLambdaCopy;
prevLambdaCopy /= scale;
}
}
}
}
if( regularization == 2 && needAvg ) {
if( iter == adagradIter - 1 )
finalLambda[i] = avgLambda[i] / ( numBatch * adagradIter );
}
}
} else { //if no regularization
if( iter == adagradIter - 1 && needAvg ) {
for( int i = 1; i < finalLambda.length; ++i )
finalLambda[i] = avgLambda[i] / ( numBatch * adagradIter );
}
}
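//note: with averaging enabled, the weights returned after the last epoch are
//averaged over all numBatch * adagradIter batch updates, which typically
//stabilizes the online updates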
double initMetricScore;
if (iter == 0) {
initMetricScore = computeCorpusMetricScore(initialLambda);
finalMetricScore = computeCorpusMetricScore(finalLambda);
} else {
initMetricScore = finalMetricScore;
finalMetricScore = computeCorpusMetricScore(finalLambda);
}
// prepare the printing info
String result = " Initial "
+ evalMetric.get_metricName() + "=" + String.format("%.4f", initMetricScore) + " Final "
+ evalMetric.get_metricName() + "=" + String.format("%.4f", finalMetricScore);
output.add(result);
} //for ( int iter = 0; iter < adagradIter; ++iter ) {
//non-optimizable weights should remain unchanged
ArrayList<Double> cpFixWt = new ArrayList<>();
for ( int i = 1; i < isOptimizable.length; ++i ) {
if ( ! isOptimizable[i] )
cpFixWt.add(finalLambda[i]);
}
normalizeLambda(finalLambda);
int countNonOpt = 0;
for ( int i = 1; i < isOptimizable.length; ++i ) {
if ( ! isOptimizable[i] ) {
finalLambda[i] = cpFixWt.get(countNonOpt);
++countNonOpt;
}
}
return finalLambda;
}
//truncate at zero: the soft-threshold used by the lazy L1 updates
private double clip(double x) {
return x > 0 ? x : 0;
}
public double computeCorpusMetricScore(double[] finalLambda) {
int suffStatsCount = evalMetric.get_suffStatsCount();
double modelScore;
double maxModelScore;
Set<String> candSet;
String candStr;
String[] feat_str;
String[] tmpStatsVal = new String[suffStatsCount];
int[] corpusStatsVal = new int[suffStatsCount]; //Java zero-initializes int arrays
for (int i = 0; i < sentNum; i++) {
candSet = feat_hash[i].keySet();
// find out the 1-best candidate for each sentence
// this depends on the training mode
maxModelScore = NegInf;
for (String aCandSet : candSet) {
modelScore = 0.0;
candStr = aCandSet;
feat_str = feat_hash[i].get(candStr).split("\\s+");
String[] feat_info;
for (String aFeat_str : feat_str) {
feat_info = aFeat_str.split("=");
modelScore += Double.parseDouble(feat_info[1]) * finalLambda[Vocabulary.id(feat_info[0])];
}
if (maxModelScore < modelScore) {
maxModelScore = modelScore;
tmpStatsVal = stats_hash[i].get(candStr).split("\\s+"); // save the suff stats
}
}
for (int j = 0; j < suffStatsCount; j++)
corpusStatsVal[j] += Integer.parseInt(tmpStatsVal[j]); // accumulate corpus-level suff stats
} // for( int i=0; i<sentNum; i++ )
return evalMetric.score(corpusStatsVal);
}
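/*
 * Selects the oracle ("hope") and prediction ("fear") candidates for one sentence.
 * For a metric that is maximized: oraSelectMode == 1 picks the hope derivation,
 * argmax(model + metric), while any other mode picks the best-metric candidate;
 * predSelectMode == 1 picks the fear derivation, argmax(model - metric),
 * predSelectMode == 2 picks the max-model-score candidate, and any other mode
 * picks the worst-metric candidate. The sign on the metric term flips when the
 * metric is minimized.
 */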
private void findOraPred(int sentId, double[] oraPredScore, String[] oraPredFeat, double[] lambda, double featScale)
{
double oraMetric=0, oraScore=0, predMetric=0, predScore=0;
String oraFeat="", predFeat="";
double candMetric = 0, candScore = 0; //metric and model scores for each cand
Set<String> candSet = stats_hash[sentId].keySet();
String cand = "";
String feats = "";
String oraCand = ""; //only used when BLEU/TER-BLEU is used as metric
String[] featStr;
String[] featInfo;
int actualFeatId;
double bestOraScore;
double worstPredScore;
if(oraSelectMode==1)
bestOraScore = NegInf; //larger score will be selected
else {
if(evalMetric.getToBeMinimized())
bestOraScore = PosInf; //smaller score will be selected
else
bestOraScore = NegInf;
}
if(predSelectMode==1 || predSelectMode==2)
worstPredScore = NegInf; //larger score will be selected
else {
if(evalMetric.getToBeMinimized())
worstPredScore = NegInf; //larger score will be selected
else
worstPredScore = PosInf;
}
for (String aCandSet : candSet) {
cand = aCandSet;
candMetric = computeSentMetric(sentId, cand); //compute metric score
//start to compute model score
candScore = 0;
featStr = feat_hash[sentId].get(cand).split("\\s+");
feats = "";
for (String aFeatStr : featStr) {
featInfo = aFeatStr.split("=");
actualFeatId = Vocabulary.id(featInfo[0]);
candScore += Double.parseDouble(featInfo[1]) * lambda[actualFeatId];
if ((actualFeatId < isOptimizable.length && isOptimizable[actualFeatId])
|| actualFeatId >= isOptimizable.length)
feats += actualFeatId + "=" + Double.parseDouble(featInfo[1]) + " ";
}
candScore *= featScale; //scale the model score
//is this cand oracle?
if (oraSelectMode == 1) {//"hope", b=1, r=1
if (evalMetric.getToBeMinimized()) {//if the smaller the metric score, the better
if (bestOraScore <= (candScore - candMetric)) {
bestOraScore = candScore - candMetric;
oraMetric = candMetric;
oraScore = candScore;
oraFeat = feats;
oraCand = cand;
}
} else {
if (bestOraScore <= (candScore + candMetric)) {
bestOraScore = candScore + candMetric;
oraMetric = candMetric;
oraScore = candScore;
oraFeat = feats;
oraCand = cand;
}
}
} else {//best metric score(ex: max BLEU), b=1, r=0
if (evalMetric.getToBeMinimized()) {//if the smaller the metric score, the better
if (bestOraScore >= candMetric) {
bestOraScore = candMetric;
oraMetric = candMetric;
oraScore = candScore;
oraFeat = feats;
oraCand = cand;
}
} else {
if (bestOraScore <= candMetric) {
bestOraScore = candMetric;
oraMetric = candMetric;
oraScore = candScore;
oraFeat = feats;
oraCand = cand;
}
}
}
//is this cand prediction?
if (predSelectMode == 1) {//"fear"
if (evalMetric.getToBeMinimized()) {//if the smaller the metric score, the better
if (worstPredScore <= (candScore + candMetric)) {
worstPredScore = candScore + candMetric;
predMetric = candMetric;
predScore = candScore;
predFeat = feats;
}
} else {
if (worstPredScore <= (candScore - candMetric)) {
worstPredScore = candScore - candMetric;
predMetric = candMetric;
predScore = candScore;
predFeat = feats;
}
}
} else if (predSelectMode == 2) {//model prediction(max model score)
if (worstPredScore <= candScore) {
worstPredScore = candScore;
predMetric = candMetric;
predScore = candScore;
predFeat = feats;
}
} else {//worst metric score(ex: min BLEU)
if (evalMetric.getToBeMinimized()) {//if the smaller the metric score, the better
if (worstPredScore <= candMetric) {
worstPredScore = candMetric;
predMetric = candMetric;
predScore = candScore;
predFeat = feats;
}
} else {
if (worstPredScore >= candMetric) {
worstPredScore = candMetric;
predMetric = candMetric;
predScore = candScore;
predFeat = feats;
}
}
}
}
oraPredScore[0] = oraMetric;
oraPredScore[1] = oraScore;
oraPredScore[2] = predMetric;
oraPredScore[3] = predScore;
oraPredFeat[0] = oraFeat;
oraPredFeat[1] = predFeat;
//update the BLEU metric statistics if pseudo corpus is used to compute BLEU/TER-BLEU
if(evalMetric.get_metricName().equals("BLEU") && usePseudoBleu ) {
String statString;
String[] statVal_str;
statString = stats_hash[sentId].get(oraCand);
statVal_str = statString.split("\\s+");
for (int j = 0; j < evalMetric.get_suffStatsCount(); j++)
bleuHistory[sentId][j] = R*bleuHistory[sentId][j]+Integer.parseInt(statVal_str[j]);
}
if(evalMetric.get_metricName().equals("TER-BLEU") && usePseudoBleu ) {
String statString;
String[] statVal_str;
statString = stats_hash[sentId].get(oraCand);
statVal_str = statString.split("\\s+");
for (int j = 0; j < evalMetric.get_suffStatsCount()-2; j++)
bleuHistory[sentId][j] = R*bleuHistory[sentId][j]+Integer.parseInt(statVal_str[j+2]); //the first 2 stats are TER stats
}
}
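//note: with usePseudoBleu, bleuHistory keeps a per-sentence, exponentially
//decayed history of sufficient statistics (decay factor R, updated with the
//oracle's stats above); computeSentMetric adds it to each candidate's stats
//to smooth the sentence-level score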
// compute *sentence-level* metric score for cand
private double computeSentMetric(int sentId, String cand) {
String statString;
String[] statVal_str;
int[] statVal = new int[evalMetric.get_suffStatsCount()];
statString = stats_hash[sentId].get(cand);
statVal_str = statString.split("\\s+");
if(evalMetric.get_metricName().equals("BLEU") && usePseudoBleu) {
for (int j = 0; j < evalMetric.get_suffStatsCount(); j++)
statVal[j] = (int) (Integer.parseInt(statVal_str[j]) + bleuHistory[sentId][j]);
} else if(evalMetric.get_metricName().equals("TER-BLEU") && usePseudoBleu) {
for (int j = 0; j < evalMetric.get_suffStatsCount()-2; j++)
statVal[j+2] = (int)(Integer.parseInt(statVal_str[j+2]) + bleuHistory[sentId][j]); //only modify the BLEU stats part (TER has 2 stats)
} else { //in all other situations, use normal stats
for (int j = 0; j < evalMetric.get_suffStatsCount(); j++)
statVal[j] = Integer.parseInt(statVal_str[j]);
}
return evalMetric.score(statVal);
}
// from ZMERT
private void normalizeLambda(double[] origLambda) {
// private String[] normalizationOptions;
// How should a lambda[] vector be normalized (before decoding)?
// nO[0] = 0: no normalization
// nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
// nO[0] = 2: scale so that the maximum absolute value is nO[1]
// nO[0] = 3: scale so that the minimum absolute value is nO[1]
// nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
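// e.g. normalizationOptions = {4, 1, 1} rescales the weights so their L1 norm equals 1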
int normalizationMethod = (int) normalizationOptions[0];
double scalingFactor = 1.0;
if (normalizationMethod == 0) {
scalingFactor = 1.0;
} else if (normalizationMethod == 1) {
int c = (int) normalizationOptions[2];
scalingFactor = normalizationOptions[1] / Math.abs(origLambda[c]);
} else if (normalizationMethod == 2) {
double maxAbsVal = -1;
int maxAbsVal_c = 0;
for (int c = 1; c <= paramDim; ++c) {
if (Math.abs(origLambda[c]) > maxAbsVal) {
maxAbsVal = Math.abs(origLambda[c]);
maxAbsVal_c = c;
}
}
scalingFactor = normalizationOptions[1] / Math.abs(origLambda[maxAbsVal_c]);
} else if (normalizationMethod == 3) {
double minAbsVal = PosInf;
int minAbsVal_c = 0;
for (int c = 1; c <= paramDim; ++c) {
if (Math.abs(origLambda[c]) < minAbsVal) {
minAbsVal = Math.abs(origLambda[c]);
minAbsVal_c = c;
}
}
scalingFactor = normalizationOptions[1] / Math.abs(origLambda[minAbsVal_c]);
} else if (normalizationMethod == 4) {
double pow = normalizationOptions[1];
double norm = L_norm(origLambda, pow);
scalingFactor = normalizationOptions[2] / norm;
}
for (int c = 1; c <= paramDim; ++c) {
origLambda[c] *= scalingFactor;
}
}
// from ZMERT
private double L_norm(double[] A, double pow) {
// calculates the L-pow norm of A[]
// NOTE: this calculation ignores A[0]
double sum = 0.0;
for (int i = 1; i < A.length; ++i)
sum += Math.pow(Math.abs(A[i]), pow);
return Math.pow(sum, 1 / pow);
}
public static double getScale()
{
return featScale;
}
public static void initBleuHistory(int sentNum, int statCount)
{
bleuHistory = new double[sentNum][statCount]; //Java zero-initializes the entries
}
public double getMetricScore()
{
return finalMetricScore;
}
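/*
 * Hypothetical driver-side usage sketch (variable names here are illustrative
 * assumptions, not part of this class):
 *
 *   Optimizer.sentNum = featHash.length; // one entry per tuning sentence
 *   Optimizer.evalMetric = metric;
 *   Optimizer.adagradIter = 1;
 *   Optimizer.batchSize = 1;
 *   Optimizer.eta = 0.1;
 *   // ... set the remaining public static knobs declared below ...
 *   Optimizer opt = new Optimizer(output, isOptimizable, initLambda, featHash, statsHash);
 *   double[] tuned = opt.runOptimizer();
 */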
private final Vector<String> output;
private double[] initialLambda;
private final double[] finalLambda;
private double finalMetricScore;
private final HashMap<String, String>[] feat_hash;
private final HashMap<String, String>[] stats_hash;
private final int paramDim;
private final boolean[] isOptimizable;
public static int sentNum;
public static int adagradIter; //AdaGrad internal iterations
public static int oraSelectMode;
public static int predSelectMode;
public static int batchSize;
public static int regularization;
public static boolean needShuffle;
public static boolean needScale;
public static double scoreRatio;
public static boolean needAvg;
public static boolean usePseudoBleu;
public static double featScale = 1.0; //scale the features so the model score is comparable with the metric score; updated in each epoch if necessary
public static double eta;
public static double lam;
public static double R; //corpus decay (used only when pseudo corpus is used to compute BLEU)
public static EvaluationMetric evalMetric;
public static double[] normalizationOptions;
public static double[][] bleuHistory;
private final static double NegInf = Double.NEGATIVE_INFINITY;
private final static double PosInf = Double.POSITIVE_INFINITY;
}