opennlp-ml/src/main/java/org/apache/opennlp/ml/maxent/GISTrainer.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.opennlp.ml.maxent;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;

 import org.apache.opennlp.ml.model.DataIndexer;
 import org.apache.opennlp.ml.model.EvalParameters;
 import org.apache.opennlp.ml.model.EventStream;
 import org.apache.opennlp.ml.model.MutableContext;
 import org.apache.opennlp.ml.model.OnePassDataIndexer;
 import org.apache.opennlp.ml.model.Prior;
 import org.apache.opennlp.ml.model.UniformPrior;


 /**
  * An implementation of Generalized Iterative Scaling.  The reference paper
  * for this implementation was Adwait Ratnaparkhi's tech report at the
  * University of Pennsylvania's Institute for Research in Cognitive Science,
  * and is available at <a href ="ftp://ftp.cis.upenn.edu/pub/ircs/tr/97-08.ps.Z"><code>ftp://ftp.cis.upenn.edu/pub/ircs/tr/97-08.ps.Z</code></a>.
  *
  * The slack parameter used in the above implementation has been removed by default
  * from the computation and a method for updating with Gaussian smoothing has been
  * added per Investigating GIS and Smoothing for Maximum Entropy Taggers, Clark and Curran (2002).
  * <a href="http://acl.ldc.upenn.edu/E/E03/E03-1071.pdf"><code>http://acl.ldc.upenn.edu/E/E03/E03-1071.pdf</code></a>
  * The slack parameter can be used by setting <code>useSlackParameter</code> to true.
  * Gaussian smoothing can be used by setting <code>useGaussianSmoothing</code> to true.
  *
  * A prior can be used to train models which converge to the distribution which minimizes the
  * relative entropy between the distribution specified by the empirical constraints of the training
  * data and the specified prior.  By default, the uniform distribution is used as the prior.
  */
 class GISTrainer {

   /**
    * Specifies whether unseen context/outcome pairs should be estimated as occur very infrequently.
    */
   private boolean useSimpleSmoothing = false;

   /**
    * Specified whether parameter updates should prefer a distribution of parameters which
    * is gaussian.
    */
   private boolean useGaussianSmoothing = false;

   private double sigma = 2.0;

   // If we are using smoothing, this is used as the "number" of
   // times we want the trainer to imagine that it saw a feature that it
   // actually didn't see.  Defaulted to 0.1.
   private double _smoothingObservation = 0.1;

   private final boolean printMessages;

   /**
    * Number of unique events which occured in the event set.
    */
   private int numUniqueEvents;

   /**
    * Number of predicates.
    */
   private int numPreds;

   /**
    * Number of outcomes.
    */
   private int numOutcomes;

   /**
    * Records the array of predicates seen in each event.
    */
   private int[][] contexts;

   /**
    * The value associated with each context. If null then context values are assumes to be 1.
    */
   private float[][] values;

   /**
    * List of outcomes for each event i, in context[i].
    */
   private int[] outcomeList;

   /**
    * Records the num of times an event has been seen for each event i, in context[i].
    */
   private int[] numTimesEventsSeen;

   /**
    * The number of times a predicate occured in the training data.
    */
   private int[] predicateCounts;

   private int cutoff;

   /**
    * Stores the String names of the outcomes. The GIS only tracks outcomes as
    * ints, and so this array is needed to save the model to disk and thereby
    * allow users to know what the outcome was in human understandable terms.
    */
   private String[] outcomeLabels;

   /**
    * Stores the String names of the predicates. The GIS only tracks predicates
    * as ints, and so this array is needed to save the model to disk and thereby
    * allow users to know what the outcome was in human understandable terms.
    */
   private String[] predLabels;

   /**
    * Stores the observed expected values of the features based on training data.
    */
   private MutableContext[] observedExpects;

   /**
    * Stores the estimated parameter value of each predicate during iteration
    */
   private MutableContext[] params;

   /**
    * Stores the expected values of the features based on the current models
    */
   private MutableContext[][] modelExpects;

   /**
    * This is the prior distribution that the model uses for training.
    */
   private Prior prior;

   private static final double LLThreshold = 0.0001;

   /**
    * Initial probability for all outcomes.
    */
   private EvalParameters evalParams;

   /**
    * Creates a new <code>GISTrainer</code> instance which does not print
    * progress messages about training to STDOUT.
    *
    */
   GISTrainer() {
     printMessages = false;
   }

   /**
    * Creates a new <code>GISTrainer</code> instance.
    *
    * @param printMessages sends progress messages about training to
    *                      STDOUT when true; trains silently otherwise.
    */
   GISTrainer(boolean printMessages) {
     this.printMessages = printMessages;
   }

   /**
    * Sets whether this trainer will use smoothing while training the model.
    * This can improve model accuracy, though training will potentially take
    * longer and use more memory.  Model size will also be larger.
    *
    * @param smooth true if smoothing is desired, false if not
    */
   public void setSmoothing(boolean smooth) {
     useSimpleSmoothing = smooth;
   }

   /**
    * Sets whether this trainer will use smoothing while training the model.
    * This can improve model accuracy, though training will potentially take
    * longer and use more memory.  Model size will also be larger.
    *
    * @param timesSeen the "number" of times we want the trainer to imagine
    *                  it saw a feature that it actually didn't see
    */
   public void setSmoothingObservation(double timesSeen) {
     _smoothingObservation = timesSeen;
   }

   /**
    * Sets whether this trainer will use smoothing while training the model.
    * This can improve model accuracy, though training will potentially take
    * longer and use more memory.  Model size will also be larger.
    *
    */
   public void setGaussianSigma(double sigmaValue) {
     useGaussianSmoothing = true;
     sigma = sigmaValue;
   }

   /**
    * Trains a GIS model on the event in the specified event stream, using the specified number
    * of iterations and the specified count cutoff.
    * @param eventStream A stream of all events.
    * @param iterations The number of iterations to use for GIS.
    * @param cutoff The number of times a feature must occur to be included.
    * @return A GIS model trained with specified
    */
   public GISModel trainModel(EventStream eventStream, int iterations, int cutoff) throws IOException {
     return trainModel(iterations, new OnePassDataIndexer(eventStream,cutoff),cutoff);
   }

   /**
    * Train a model using the GIS algorithm.
    *
    * @param iterations  The number of GIS iterations to perform.
    * @param di The data indexer used to compress events in memory.
    * @return The newly trained model, which can be used immediately or saved
    *         to disk using an opennlp.maxent.io.GISModelWriter object.
    */
   public GISModel trainModel(int iterations, DataIndexer di, int cutoff) {
     return trainModel(iterations,di,new UniformPrior(),cutoff,1);
   }

   /**
    * Train a model using the GIS algorithm.
    *
    * @param iterations  The number of GIS iterations to perform.
    * @param di The data indexer used to compress events in memory.
    * @param modelPrior The prior distribution used to train this model.
    * @return The newly trained model, which can be used immediately or saved
    *         to disk using an opennlp.maxent.io.GISModelWriter object.
    */
   public GISModel trainModel(int iterations, DataIndexer di, Prior modelPrior, int cutoff, int threads) {

     if (threads <= 0)
       throw new IllegalArgumentException("threads must be at leat one or greater!");

     modelExpects = new MutableContext[threads][];

     /************** Incorporate all of the needed info ******************/
     display("Incorporating indexed data for training...  \n");
     contexts = di.getContexts();
     values = di.getValues();
     this.cutoff = cutoff;
     predicateCounts = di.getPredCounts();
     numTimesEventsSeen = di.getNumTimesEventsSeen();
     numUniqueEvents = contexts.length;
     this.prior = modelPrior;
     //printTable(contexts);

     // determine the correction constant and its inverse
     double correctionConstant = 0;
     for (int ci = 0; ci < contexts.length; ci++) {
       if (values == null || values[ci] == null) {
         if (contexts[ci].length > correctionConstant) {
           correctionConstant = contexts[ci].length;
         }
       }
       else {
         float cl = values[ci][0];
         for (int vi=1;vi<values[ci].length;vi++) {
           cl+=values[ci][vi];
         }

         if (cl > correctionConstant) {
           correctionConstant = cl;
         }
       }
     }
     display("done.\n");

     outcomeLabels = di.getOutcomeLabels();
     outcomeList = di.getOutcomeList();
     numOutcomes = outcomeLabels.length;

     predLabels = di.getPredLabels();
     prior.setLabels(outcomeLabels,predLabels);
     numPreds = predLabels.length;

     display("\tNumber of Event Tokens: " + numUniqueEvents + "\n");
     display("\t    Number of Outcomes: " + numOutcomes + "\n");
     display("\t  Number of Predicates: " + numPreds + "\n");

     // set up feature arrays
     float[][] predCount = new float[numPreds][numOutcomes];
     for (int ti = 0; ti < numUniqueEvents; ti++) {
       for (int j = 0; j < contexts[ti].length; j++) {
         if (values != null && values[ti] != null) {
           predCount[contexts[ti][j]][outcomeList[ti]] += numTimesEventsSeen[ti]*values[ti][j];
         }
         else {
           predCount[contexts[ti][j]][outcomeList[ti]] += numTimesEventsSeen[ti];
         }
       }
     }

     //printTable(predCount);
     di = null; // don't need it anymore

     // A fake "observation" to cover features which are not detected in
     // the data.  The default is to assume that we observed "1/10th" of a
     // feature during training.
     final double smoothingObservation = _smoothingObservation;

     // Get the observed expectations of the features. Strictly speaking,
     // we should divide the counts by the number of Tokens, but because of
     // the way the model's expectations are approximated in the
     // implementation, this is cancelled out when we compute the next
     // iteration of a parameter, making the extra divisions wasteful.
     params = new MutableContext[numPreds];
     for (int i = 0; i< modelExpects.length; i++)
       modelExpects[i] = new MutableContext[numPreds];
     observedExpects = new MutableContext[numPreds];

     // The model does need the correction constant and the correction feature. The correction constant
     // is only needed during training, and the correction feature is not necessary.
     // For compatibility reasons the model contains form now on a correction constant of 1,
     // and a correction param 0.
     evalParams = new EvalParameters(params,0,1,numOutcomes);
     int[] activeOutcomes = new int[numOutcomes];
     int[] outcomePattern;
     int[] allOutcomesPattern= new int[numOutcomes];
     for (int oi = 0; oi < numOutcomes; oi++) {
       allOutcomesPattern[oi] = oi;
     }
     int numActiveOutcomes = 0;
     for (int pi = 0; pi < numPreds; pi++) {
       numActiveOutcomes = 0;
       if (useSimpleSmoothing) {
         numActiveOutcomes = numOutcomes;
         outcomePattern = allOutcomesPattern;
       }
       else { //determine active outcomes
         for (int oi = 0; oi < numOutcomes; oi++) {
           if (predCount[pi][oi] > 0 && predicateCounts[pi] >= cutoff) {
             activeOutcomes[numActiveOutcomes] = oi;
             numActiveOutcomes++;
           }
         }
         if (numActiveOutcomes == numOutcomes) {
           outcomePattern = allOutcomesPattern;
         }
         else {
           outcomePattern = new int[numActiveOutcomes];
           for (int aoi=0;aoi<numActiveOutcomes;aoi++) {
             outcomePattern[aoi] = activeOutcomes[aoi];
           }
         }
       }
       params[pi] = new MutableContext(outcomePattern,new double[numActiveOutcomes]);
       for (int i = 0; i< modelExpects.length; i++)
         modelExpects[i][pi] = new MutableContext(outcomePattern,new double[numActiveOutcomes]);
       observedExpects[pi] = new MutableContext(outcomePattern,new double[numActiveOutcomes]);
       for (int aoi=0;aoi<numActiveOutcomes;aoi++) {
         int oi = outcomePattern[aoi];
         params[pi].setParameter(aoi, 0.0);
         for (MutableContext[] modelExpect : modelExpects) {
           modelExpect[pi].setParameter(aoi, 0.0);
         }
         if (predCount[pi][oi] > 0) {
             observedExpects[pi].setParameter(aoi, predCount[pi][oi]);
         }
         else if (useSimpleSmoothing) {
           observedExpects[pi].setParameter(aoi,smoothingObservation);
         }
       }
     }

     predCount = null; // don't need it anymore

     display("...done.\n");

     /***************** Find the parameters ************************/
     if (threads == 1)
       display("Computing model parameters ...\n");
     else
       display("Computing model parameters in " + threads +" threads...\n");

     findParameters(iterations, correctionConstant);

     /*************** Create and return the model ******************/
     // To be compatible with old models the correction constant is always 1
     return new GISModel(params, predLabels, outcomeLabels, 1, evalParams.getCorrectionParam());

   }

   /* Estimate and return the model parameters. */
   private void findParameters(int iterations, double correctionConstant) {
     double prevLL = 0.0;
     double currLL = 0.0;
     display("Performing " + iterations + " iterations.\n");
     for (int i = 1; i <= iterations; i++) {
       if (i < 10)
         display("  " + i + ":  ");
       else if (i < 100)
         display(" " + i + ":  ");
       else
         display(i + ":  ");
       currLL = nextIteration(correctionConstant);
       if (i > 1) {
         if (prevLL > currLL) {
           System.err.println("Model Diverging: loglikelihood decreased");
           break;
         }
         if (currLL - prevLL < LLThreshold) {
           break;
         }
       }
       prevLL = currLL;
     }

     // kill a bunch of these big objects now that we don't need them
     observedExpects = null;
     modelExpects = null;
     numTimesEventsSeen = null;
     contexts = null;
   }

   //modeled on implementation in  Zhang Le's maxent kit
   private double gaussianUpdate(int predicate, int oid, int n, double correctionConstant) {
     double param = params[predicate].getParameters()[oid];
     double x0 = 0.0;
     double modelValue = modelExpects[0][predicate].getParameters()[oid];
     double observedValue = observedExpects[predicate].getParameters()[oid];
     for (int i = 0; i < 50; i++) {
       double tmp = modelValue * Math.exp(correctionConstant * x0);
       double f = tmp + (param + x0) / sigma - observedValue;
       double fp = tmp * correctionConstant + 1 / sigma;
       if (fp == 0) {
         break;
       }
       double x = x0 - f / fp;
       if (Math.abs(x - x0) < 0.000001) {
         x0 = x;
         break;
       }
       x0 = x;
     }
     return x0;
   }

   private class ModelExpactationComputeTask implements Callable<ModelExpactationComputeTask> {

     private final int startIndex;
     private final int length;

     private double loglikelihood = 0;

     private int numEvents = 0;
     private int numCorrect = 0;

     final private int threadIndex;

     // startIndex to compute, number of events to compute
     ModelExpactationComputeTask(int threadIndex, int startIndex, int length) {
       this.startIndex = startIndex;
       this.length = length;
       this.threadIndex = threadIndex;
     }

     public ModelExpactationComputeTask call() {

       final double[] modelDistribution = new double[numOutcomes];


       for (int ei = startIndex; ei < startIndex + length; ei++) {

         // TODO: check interruption status here, if interrupted set a poisoned flag and return

         if (values != null) {
           prior.logPrior(modelDistribution, contexts[ei], values[ei]);
           GISModel.eval(contexts[ei], values[ei], modelDistribution, evalParams);
         }
         else {
           prior.logPrior(modelDistribution,contexts[ei]);
           GISModel.eval(contexts[ei], modelDistribution, evalParams);
         }
         for (int j = 0; j < contexts[ei].length; j++) {
           int pi = contexts[ei][j];
           if (predicateCounts[pi] >= cutoff) {
             int[] activeOutcomes = modelExpects[threadIndex][pi].getOutcomes();
             for (int aoi=0;aoi<activeOutcomes.length;aoi++) {
               int oi = activeOutcomes[aoi];

               // numTimesEventsSeen must also be thread safe
               if (values != null && values[ei] != null) {
                 modelExpects[threadIndex][pi].updateParameter(aoi,modelDistribution[oi] * values[ei][j] * numTimesEventsSeen[ei]);
               }
               else {
                 modelExpects[threadIndex][pi].updateParameter(aoi,modelDistribution[oi] * numTimesEventsSeen[ei]);
               }
             }
           }
         }

         loglikelihood += Math.log(modelDistribution[outcomeList[ei]]) * numTimesEventsSeen[ei];

         numEvents += numTimesEventsSeen[ei];
         if (printMessages) {
           int max = 0;
           for (int oi = 1; oi < numOutcomes; oi++) {
             if (modelDistribution[oi] > modelDistribution[max]) {
               max = oi;
             }
           }
           if (max == outcomeList[ei]) {
             numCorrect += numTimesEventsSeen[ei];
           }
         }

       }

       return this;
     }

     synchronized int getNumEvents() {
       return numEvents;
     }

     synchronized int getNumCorrect() {
       return numCorrect;
     }

     synchronized double getLoglikelihood() {
       return loglikelihood;
     }
   }

   /* Compute one iteration of GIS and retutn log-likelihood.*/
   private double nextIteration(double correctionConstant) {
     // compute contribution of p(a|b_i) for each feature and the new
     // correction parameter
     double loglikelihood = 0.0;
     int numEvents = 0;
     int numCorrect = 0;

     int numberOfThreads = modelExpects.length;

     ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);

     int taskSize = numUniqueEvents / numberOfThreads;

     int leftOver = numUniqueEvents % numberOfThreads;

     List<Future<?>> futures = new ArrayList<Future<?>>();

     for (int i = 0; i < numberOfThreads; i++) {
       if (i != numberOfThreads - 1)
         futures.add(executor.submit(new ModelExpactationComputeTask(i, i*taskSize, taskSize)));
       else
         futures.add(executor.submit(new ModelExpactationComputeTask(i, i*taskSize, taskSize + leftOver)));
     }

     for (Future<?> future : futures) {
       ModelExpactationComputeTask finishedTask = null;
       try {
         finishedTask = (ModelExpactationComputeTask) future.get();
       } catch (InterruptedException e) {
         // TODO: We got interrupted, but that is currently not really supported!
         // For now we just print the exception and fail hard. We hopefully soon
         // handle this case properly!
         e.printStackTrace();
         throw new IllegalStateException("Interruption is not supported!", e);
       } catch (ExecutionException e) {
         // Only runtime exception can be thrown during training, if one was thrown
         // it should be re-thrown. That could for example be a NullPointerException
         // which is caused through a bug in our implementation.
         throw new RuntimeException(e.getCause());
       }

       // When they are done, retrieve the results ...
       numEvents += finishedTask.getNumEvents();
       numCorrect += finishedTask.getNumCorrect();
       loglikelihood += finishedTask.getLoglikelihood();
     }

     executor.shutdown();

     display(".");

     // merge the results of the two computations
     for (int pi = 0; pi < numPreds; pi++) {
       int[] activeOutcomes = params[pi].getOutcomes();

       for (int aoi=0;aoi<activeOutcomes.length;aoi++) {
         for (int i = 1; i < modelExpects.length; i++) {
           modelExpects[0][pi].updateParameter(aoi, modelExpects[i][pi].getParameters()[aoi]);
         }
       }
     }

     display(".");

     // compute the new parameter values
     for (int pi = 0; pi < numPreds; pi++) {
       double[] observed = observedExpects[pi].getParameters();
       double[] model = modelExpects[0][pi].getParameters();
       int[] activeOutcomes = params[pi].getOutcomes();
       for (int aoi=0;aoi<activeOutcomes.length;aoi++) {
         if (useGaussianSmoothing) {
           params[pi].updateParameter(aoi,gaussianUpdate(pi,aoi,numEvents,correctionConstant));
         }
         else {
           if (model[aoi] == 0) {
             System.err.println("Model expects == 0 for "+predLabels[pi]+" "+outcomeLabels[aoi]);
           }
           //params[pi].updateParameter(aoi,(Math.log(observed[aoi]) - Math.log(model[aoi])));
           params[pi].updateParameter(aoi,((Math.log(observed[aoi]) - Math.log(model[aoi]))/correctionConstant));
         }

         for (MutableContext[] modelExpect : modelExpects) {
           modelExpect[pi].setParameter(aoi, 0.0); // re-initialize to 0.0's
         }

       }
     }

     display(". loglikelihood=" + loglikelihood + "\t" + ((double) numCorrect / numEvents) + "\n");

     return loglikelihood;
   }

   private void display(String s) {
     if (printMessages)
       System.out.print(s);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.opennlp.ml.maxent;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.concurrent.Callable;
	import java.util.concurrent.ExecutionException;
	import java.util.concurrent.ExecutorService;
	import java.util.concurrent.Executors;
	import java.util.concurrent.Future;

	import org.apache.opennlp.ml.model.DataIndexer;
	import org.apache.opennlp.ml.model.EvalParameters;
	import org.apache.opennlp.ml.model.EventStream;
	import org.apache.opennlp.ml.model.MutableContext;
	import org.apache.opennlp.ml.model.OnePassDataIndexer;
	import org.apache.opennlp.ml.model.Prior;
	import org.apache.opennlp.ml.model.UniformPrior;


	/**
	* An implementation of Generalized Iterative Scaling. The reference paper
	* for this implementation was Adwait Ratnaparkhi's tech report at the
	* University of Pennsylvania's Institute for Research in Cognitive Science,
	* and is available at <a href ="ftp://ftp.cis.upenn.edu/pub/ircs/tr/97-08.ps.Z"><code>ftp://ftp.cis.upenn.edu/pub/ircs/tr/97-08.ps.Z</code></a>.
	*
	* The slack parameter used in the above implementation has been removed by default
	* from the computation and a method for updating with Gaussian smoothing has been
	* added per Investigating GIS and Smoothing for Maximum Entropy Taggers, Clark and Curran (2002).
	* <a href="http://acl.ldc.upenn.edu/E/E03/E03-1071.pdf"><code>http://acl.ldc.upenn.edu/E/E03/E03-1071.pdf</code></a>
	* The slack parameter can be used by setting <code>useSlackParameter</code> to true.
	* Gaussian smoothing can be used by setting <code>useGaussianSmoothing</code> to true.
	*
	* A prior can be used to train models which converge to the distribution which minimizes the
	* relative entropy between the distribution specified by the empirical constraints of the training
	* data and the specified prior. By default, the uniform distribution is used as the prior.
	*/
	class GISTrainer {

	/**
	* Specifies whether unseen context/outcome pairs should be estimated as occur very infrequently.
	*/
	private boolean useSimpleSmoothing = false;

	/**
	* Specified whether parameter updates should prefer a distribution of parameters which
	* is gaussian.
	*/
	private boolean useGaussianSmoothing = false;

	private double sigma = 2.0;

	// If we are using smoothing, this is used as the "number" of
	// times we want the trainer to imagine that it saw a feature that it
	// actually didn't see. Defaulted to 0.1.
	private double _smoothingObservation = 0.1;

	private final boolean printMessages;

	/**
	* Number of unique events which occured in the event set.
	*/
	private int numUniqueEvents;

	/**
	* Number of predicates.
	*/
	private int numPreds;

	/**
	* Number of outcomes.
	*/
	private int numOutcomes;

	/**
	* Records the array of predicates seen in each event.
	*/
	private int[][] contexts;

	/**
	* The value associated with each context. If null then context values are assumes to be 1.
	*/
	private float[][] values;

	/**
	* List of outcomes for each event i, in context[i].
	*/
	private int[] outcomeList;

	/**
	* Records the num of times an event has been seen for each event i, in context[i].
	*/
	private int[] numTimesEventsSeen;

	/**
	* The number of times a predicate occured in the training data.
	*/
	private int[] predicateCounts;

	private int cutoff;

	/**
	* Stores the String names of the outcomes. The GIS only tracks outcomes as
	* ints, and so this array is needed to save the model to disk and thereby
	* allow users to know what the outcome was in human understandable terms.
	*/
	private String[] outcomeLabels;

	/**
	* Stores the String names of the predicates. The GIS only tracks predicates
	* as ints, and so this array is needed to save the model to disk and thereby
	* allow users to know what the outcome was in human understandable terms.
	*/
	private String[] predLabels;

	/**
	* Stores the observed expected values of the features based on training data.
	*/
	private MutableContext[] observedExpects;

	/**
	* Stores the estimated parameter value of each predicate during iteration
	*/
	private MutableContext[] params;

	/**
	* Stores the expected values of the features based on the current models
	*/
	private MutableContext[][] modelExpects;

	/**
	* This is the prior distribution that the model uses for training.
	*/
	private Prior prior;

	private static final double LLThreshold = 0.0001;

	/**
	* Initial probability for all outcomes.
	*/
	private EvalParameters evalParams;

	/**
	* Creates a new <code>GISTrainer</code> instance which does not print
	* progress messages about training to STDOUT.
	*
	*/
	GISTrainer() {
	printMessages = false;
	}

	/**
	* Creates a new <code>GISTrainer</code> instance.
	*
	* @param printMessages sends progress messages about training to
	* STDOUT when true; trains silently otherwise.
	*/
	GISTrainer(boolean printMessages) {
	this.printMessages = printMessages;
	}

	/**
	* Sets whether this trainer will use smoothing while training the model.
	* This can improve model accuracy, though training will potentially take
	* longer and use more memory. Model size will also be larger.
	*
	* @param smooth true if smoothing is desired, false if not
	*/
	public void setSmoothing(boolean smooth) {
	useSimpleSmoothing = smooth;
	}

	/**
	* Sets whether this trainer will use smoothing while training the model.
	* This can improve model accuracy, though training will potentially take
	* longer and use more memory. Model size will also be larger.
	*
	* @param timesSeen the "number" of times we want the trainer to imagine
	* it saw a feature that it actually didn't see
	*/
	public void setSmoothingObservation(double timesSeen) {
	_smoothingObservation = timesSeen;
	}

	/**
	* Sets whether this trainer will use smoothing while training the model.
	* This can improve model accuracy, though training will potentially take
	* longer and use more memory. Model size will also be larger.
	*
	*/
	public void setGaussianSigma(double sigmaValue) {
	useGaussianSmoothing = true;
	sigma = sigmaValue;
	}

	/**
	* Trains a GIS model on the event in the specified event stream, using the specified number
	* of iterations and the specified count cutoff.
	* @param eventStream A stream of all events.
	* @param iterations The number of iterations to use for GIS.
	* @param cutoff The number of times a feature must occur to be included.
	* @return A GIS model trained with specified
	*/
	public GISModel trainModel(EventStream eventStream, int iterations, int cutoff) throws IOException {
	return trainModel(iterations, new OnePassDataIndexer(eventStream,cutoff),cutoff);
	}

	/**
	* Train a model using the GIS algorithm.
	*
	* @param iterations The number of GIS iterations to perform.
	* @param di The data indexer used to compress events in memory.
	* @return The newly trained model, which can be used immediately or saved
	* to disk using an opennlp.maxent.io.GISModelWriter object.
	*/
	public GISModel trainModel(int iterations, DataIndexer di, int cutoff) {
	return trainModel(iterations,di,new UniformPrior(),cutoff,1);
	}

	/**
	* Train a model using the GIS algorithm.
	*
	* @param iterations The number of GIS iterations to perform.
	* @param di The data indexer used to compress events in memory.
	* @param modelPrior The prior distribution used to train this model.
	* @return The newly trained model, which can be used immediately or saved
	* to disk using an opennlp.maxent.io.GISModelWriter object.
	*/
	public GISModel trainModel(int iterations, DataIndexer di, Prior modelPrior, int cutoff, int threads) {

	if (threads <= 0)
	throw new IllegalArgumentException("threads must be at leat one or greater!");

	modelExpects = new MutableContext[threads][];

	/************ Incorporate all of the needed info ****************/
	display("Incorporating indexed data for training... \n");
	contexts = di.getContexts();
	values = di.getValues();
	this.cutoff = cutoff;
	predicateCounts = di.getPredCounts();
	numTimesEventsSeen = di.getNumTimesEventsSeen();
	numUniqueEvents = contexts.length;
	this.prior = modelPrior;
	//printTable(contexts);

	// determine the correction constant and its inverse
	double correctionConstant = 0;
	for (int ci = 0; ci < contexts.length; ci++) {
	if (values == null \|\| values[ci] == null) {
	if (contexts[ci].length > correctionConstant) {
	correctionConstant = contexts[ci].length;
	}
	}
	else {
	float cl = values[ci][0];
	for (int vi=1;vi<values[ci].length;vi++) {
	cl+=values[ci][vi];
	}

	if (cl > correctionConstant) {
	correctionConstant = cl;
	}
	}
	}
	display("done.\n");

	outcomeLabels = di.getOutcomeLabels();
	outcomeList = di.getOutcomeList();
	numOutcomes = outcomeLabels.length;

	predLabels = di.getPredLabels();
	prior.setLabels(outcomeLabels,predLabels);
	numPreds = predLabels.length;

	display("\tNumber of Event Tokens: " + numUniqueEvents + "\n");
	display("\t Number of Outcomes: " + numOutcomes + "\n");
	display("\t Number of Predicates: " + numPreds + "\n");

	// set up feature arrays
	float[][] predCount = new float[numPreds][numOutcomes];
	for (int ti = 0; ti < numUniqueEvents; ti++) {
	for (int j = 0; j < contexts[ti].length; j++) {
	if (values != null && values[ti] != null) {
	predCount[contexts[ti][j]][outcomeList[ti]] += numTimesEventsSeen[ti]*values[ti][j];
	}
	else {
	predCount[contexts[ti][j]][outcomeList[ti]] += numTimesEventsSeen[ti];
	}
	}
	}

	//printTable(predCount);
	di = null; // don't need it anymore

	// A fake "observation" to cover features which are not detected in
	// the data. The default is to assume that we observed "1/10th" of a
	// feature during training.
	final double smoothingObservation = _smoothingObservation;

	// Get the observed expectations of the features. Strictly speaking,
	// we should divide the counts by the number of Tokens, but because of
	// the way the model's expectations are approximated in the
	// implementation, this is cancelled out when we compute the next
	// iteration of a parameter, making the extra divisions wasteful.
	params = new MutableContext[numPreds];
	for (int i = 0; i< modelExpects.length; i++)
	modelExpects[i] = new MutableContext[numPreds];
	observedExpects = new MutableContext[numPreds];

	// The model does need the correction constant and the correction feature. The correction constant
	// is only needed during training, and the correction feature is not necessary.
	// For compatibility reasons the model contains form now on a correction constant of 1,
	// and a correction param 0.
	evalParams = new EvalParameters(params,0,1,numOutcomes);
	int[] activeOutcomes = new int[numOutcomes];
	int[] outcomePattern;
	int[] allOutcomesPattern= new int[numOutcomes];
	for (int oi = 0; oi < numOutcomes; oi++) {
	allOutcomesPattern[oi] = oi;
	}
	int numActiveOutcomes = 0;
	for (int pi = 0; pi < numPreds; pi++) {
	numActiveOutcomes = 0;
	if (useSimpleSmoothing) {
	numActiveOutcomes = numOutcomes;
	outcomePattern = allOutcomesPattern;
	}
	else { //determine active outcomes
	for (int oi = 0; oi < numOutcomes; oi++) {
	if (predCount[pi][oi] > 0 && predicateCounts[pi] >= cutoff) {
	activeOutcomes[numActiveOutcomes] = oi;
	numActiveOutcomes++;
	}
	}
	if (numActiveOutcomes == numOutcomes) {
	outcomePattern = allOutcomesPattern;
	}
	else {
	outcomePattern = new int[numActiveOutcomes];
	for (int aoi=0;aoi<numActiveOutcomes;aoi++) {
	outcomePattern[aoi] = activeOutcomes[aoi];
	}
	}
	}
	params[pi] = new MutableContext(outcomePattern,new double[numActiveOutcomes]);
	for (int i = 0; i< modelExpects.length; i++)
	modelExpects[i][pi] = new MutableContext(outcomePattern,new double[numActiveOutcomes]);
	observedExpects[pi] = new MutableContext(outcomePattern,new double[numActiveOutcomes]);
	for (int aoi=0;aoi<numActiveOutcomes;aoi++) {
	int oi = outcomePattern[aoi];
	params[pi].setParameter(aoi, 0.0);
	for (MutableContext[] modelExpect : modelExpects) {
	modelExpect[pi].setParameter(aoi, 0.0);
	}
	if (predCount[pi][oi] > 0) {
	observedExpects[pi].setParameter(aoi, predCount[pi][oi]);
	}
	else if (useSimpleSmoothing) {
	observedExpects[pi].setParameter(aoi,smoothingObservation);
	}
	}
	}

	predCount = null; // don't need it anymore

	display("...done.\n");

	/*************** Find the parameters **********************/
	if (threads == 1)
	display("Computing model parameters ...\n");
	else
	display("Computing model parameters in " + threads +" threads...\n");

	findParameters(iterations, correctionConstant);

	/************* Create and return the model ****************/
	// To be compatible with old models the correction constant is always 1
	return new GISModel(params, predLabels, outcomeLabels, 1, evalParams.getCorrectionParam());

	}

	/* Estimate and return the model parameters. */
	private void findParameters(int iterations, double correctionConstant) {
	double prevLL = 0.0;
	double currLL = 0.0;
	display("Performing " + iterations + " iterations.\n");
	for (int i = 1; i <= iterations; i++) {
	if (i < 10)
	display(" " + i + ": ");
	else if (i < 100)
	display(" " + i + ": ");
	else
	display(i + ": ");
	currLL = nextIteration(correctionConstant);
	if (i > 1) {
	if (prevLL > currLL) {
	System.err.println("Model Diverging: loglikelihood decreased");
	break;
	}
	if (currLL - prevLL < LLThreshold) {
	break;
	}
	}
	prevLL = currLL;
	}

	// kill a bunch of these big objects now that we don't need them
	observedExpects = null;
	modelExpects = null;
	numTimesEventsSeen = null;
	contexts = null;
	}

	//modeled on implementation in Zhang Le's maxent kit
	private double gaussianUpdate(int predicate, int oid, int n, double correctionConstant) {
	double param = params[predicate].getParameters()[oid];
	double x0 = 0.0;
	double modelValue = modelExpects[0][predicate].getParameters()[oid];
	double observedValue = observedExpects[predicate].getParameters()[oid];
	for (int i = 0; i < 50; i++) {
	double tmp = modelValue * Math.exp(correctionConstant * x0);
	double f = tmp + (param + x0) / sigma - observedValue;
	double fp = tmp * correctionConstant + 1 / sigma;
	if (fp == 0) {
	break;
	}
	double x = x0 - f / fp;
	if (Math.abs(x - x0) < 0.000001) {
	x0 = x;
	break;
	}
	x0 = x;
	}
	return x0;
	}

	private class ModelExpactationComputeTask implements Callable<ModelExpactationComputeTask> {

	private final int startIndex;
	private final int length;

	private double loglikelihood = 0;

	private int numEvents = 0;
	private int numCorrect = 0;

	final private int threadIndex;

	// startIndex to compute, number of events to compute
	ModelExpactationComputeTask(int threadIndex, int startIndex, int length) {
	this.startIndex = startIndex;
	this.length = length;
	this.threadIndex = threadIndex;
	}

	public ModelExpactationComputeTask call() {

	final double[] modelDistribution = new double[numOutcomes];


	for (int ei = startIndex; ei < startIndex + length; ei++) {

	// TODO: check interruption status here, if interrupted set a poisoned flag and return

	if (values != null) {
	prior.logPrior(modelDistribution, contexts[ei], values[ei]);
	GISModel.eval(contexts[ei], values[ei], modelDistribution, evalParams);
	}
	else {
	prior.logPrior(modelDistribution,contexts[ei]);
	GISModel.eval(contexts[ei], modelDistribution, evalParams);
	}
	for (int j = 0; j < contexts[ei].length; j++) {
	int pi = contexts[ei][j];
	if (predicateCounts[pi] >= cutoff) {
	int[] activeOutcomes = modelExpects[threadIndex][pi].getOutcomes();
	for (int aoi=0;aoi<activeOutcomes.length;aoi++) {
	int oi = activeOutcomes[aoi];

	// numTimesEventsSeen must also be thread safe
	if (values != null && values[ei] != null) {
	modelExpects[threadIndex][pi].updateParameter(aoi,modelDistribution[oi] * values[ei][j] * numTimesEventsSeen[ei]);
	}
	else {
	modelExpects[threadIndex][pi].updateParameter(aoi,modelDistribution[oi] * numTimesEventsSeen[ei]);
	}
	}
	}
	}

	loglikelihood += Math.log(modelDistribution[outcomeList[ei]]) * numTimesEventsSeen[ei];

	numEvents += numTimesEventsSeen[ei];
	if (printMessages) {
	int max = 0;
	for (int oi = 1; oi < numOutcomes; oi++) {
	if (modelDistribution[oi] > modelDistribution[max]) {
	max = oi;
	}
	}
	if (max == outcomeList[ei]) {
	numCorrect += numTimesEventsSeen[ei];
	}
	}

	}

	return this;
	}

	synchronized int getNumEvents() {
	return numEvents;
	}

	synchronized int getNumCorrect() {
	return numCorrect;
	}

	synchronized double getLoglikelihood() {
	return loglikelihood;
	}
	}

	/* Compute one iteration of GIS and retutn log-likelihood.*/
	private double nextIteration(double correctionConstant) {
	// compute contribution of p(a\|b_i) for each feature and the new
	// correction parameter
	double loglikelihood = 0.0;
	int numEvents = 0;
	int numCorrect = 0;

	int numberOfThreads = modelExpects.length;

	ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);

	int taskSize = numUniqueEvents / numberOfThreads;

	int leftOver = numUniqueEvents % numberOfThreads;

	List<Future<?>> futures = new ArrayList<Future<?>>();

	for (int i = 0; i < numberOfThreads; i++) {
	if (i != numberOfThreads - 1)
	futures.add(executor.submit(new ModelExpactationComputeTask(i, i*taskSize, taskSize)));
	else
	futures.add(executor.submit(new ModelExpactationComputeTask(i, i*taskSize, taskSize + leftOver)));
	}

	for (Future<?> future : futures) {
	ModelExpactationComputeTask finishedTask = null;
	try {
	finishedTask = (ModelExpactationComputeTask) future.get();
	} catch (InterruptedException e) {
	// TODO: We got interrupted, but that is currently not really supported!
	// For now we just print the exception and fail hard. We hopefully soon
	// handle this case properly!
	e.printStackTrace();
	throw new IllegalStateException("Interruption is not supported!", e);
	} catch (ExecutionException e) {
	// Only runtime exception can be thrown during training, if one was thrown
	// it should be re-thrown. That could for example be a NullPointerException
	// which is caused through a bug in our implementation.
	throw new RuntimeException(e.getCause());
	}

	// When they are done, retrieve the results ...
	numEvents += finishedTask.getNumEvents();
	numCorrect += finishedTask.getNumCorrect();
	loglikelihood += finishedTask.getLoglikelihood();
	}

	executor.shutdown();

	display(".");

	// merge the results of the two computations
	for (int pi = 0; pi < numPreds; pi++) {
	int[] activeOutcomes = params[pi].getOutcomes();

	for (int aoi=0;aoi<activeOutcomes.length;aoi++) {
	for (int i = 1; i < modelExpects.length; i++) {
	modelExpects[0][pi].updateParameter(aoi, modelExpects[i][pi].getParameters()[aoi]);
	}
	}
	}

	display(".");

	// compute the new parameter values
	for (int pi = 0; pi < numPreds; pi++) {
	double[] observed = observedExpects[pi].getParameters();
	double[] model = modelExpects[0][pi].getParameters();
	int[] activeOutcomes = params[pi].getOutcomes();
	for (int aoi=0;aoi<activeOutcomes.length;aoi++) {
	if (useGaussianSmoothing) {
	params[pi].updateParameter(aoi,gaussianUpdate(pi,aoi,numEvents,correctionConstant));
	}
	else {
	if (model[aoi] == 0) {
	System.err.println("Model expects == 0 for "+predLabels[pi]+" "+outcomeLabels[aoi]);
	}
	//params[pi].updateParameter(aoi,(Math.log(observed[aoi]) - Math.log(model[aoi])));
	params[pi].updateParameter(aoi,((Math.log(observed[aoi]) - Math.log(model[aoi]))/correctionConstant));
	}

	for (MutableContext[] modelExpect : modelExpects) {
	modelExpect[pi].setParameter(aoi, 0.0); // re-initialize to 0.0's
	}

	}
	}

	display(". loglikelihood=" + loglikelihood + "\t" + ((double) numCorrect / numEvents) + "\n");

	return loglikelihood;
	}

	private void display(String s) {
	if (printMessages)
	System.out.print(s);
	}
	}