/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import opennlp.tools.chunker.Chunker;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ngram.NGramModel;
import opennlp.tools.parser.chunking.ParserEventStream;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
import opennlp.tools.util.TrainingParameters;
/**
 * Abstract class which contains code to tag and chunk parses for bottom-up parsing and
 * leaves the implementation of advancing parses and completing parses to extending classes.
 * <p>
 * <b>Note:</b> <br> The nodes within the returned parses are shared with other parses
 * and therefore their parent node references will not be consistent with their child
 * node references. {@link #setParents setParents} can be used to make the parents consistent
 * with a particular parse, but subsequent calls to <code>setParents</code> can invalidate
 * the results of earlier calls.
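 * <p>
 * A minimal usage sketch (the model loading and the whitespace tokenization are
 * illustrative only):
 * <pre>{@code
 * Parser parser = ParserFactory.create(parserModel); // assumes a trained ParserModel
 * String text = "The quick brown fox jumps .";
 * Parse root = new Parse(text, new Span(0, text.length()),
 *     AbstractBottomUpParser.INC_NODE, 1, 0);
 * int start = 0;
 * for (String tok : text.split(" ")) {
 *   root.insert(new Parse(text, new Span(start, start + tok.length()),
 *       AbstractBottomUpParser.TOK_NODE, 0, 0));
 *   start += tok.length() + 1;
 * }
 * Parse[] top = parser.parse(root, 3);
 * AbstractBottomUpParser.setParents(top[0]); // fix parent links for the parse in use
 * }</pre>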
*/
public abstract class AbstractBottomUpParser implements Parser {
/**
* The maximum number of parses advanced from all preceding
* parses at each derivation step.
*/
protected int M;
/**
* The maximum number of parses to advance from a single preceding parse.
*/
protected int K;
/**
* The minimum total probability mass of advanced outcomes.
*/
protected double Q;
/**
* The default beam size used if no beam size is given.
*/
public static final int defaultBeamSize = 20;
/**
* The default amount of probability mass required of advanced outcomes.
*/
public static final double defaultAdvancePercentage = 0.95;
/**
* Completed parses.
*/
private SortedSet<Parse> completeParses;
/**
* Incomplete parses which will be advanced.
*/
private SortedSet<Parse> odh;
/**
* Incomplete parses which have been advanced.
*/
private SortedSet<Parse> ndh;
/**
* The head rules for the parser.
*/
protected HeadRules headRules;
/**
   * The set of strings which are considered punctuation for the parser.
* Punctuation is not attached, but floats to the top of the parse as attachment
* decisions are made about its non-punctuation sister nodes.
*/
protected Set<String> punctSet;
/**
* The label for the top node.
*/
public static final String TOP_NODE = "TOP";
/**
   * The label for the top node of an incomplete parse.
*/
public static final String INC_NODE = "INC";
/**
* The label for a token node.
*/
public static final String TOK_NODE = "TK";
/**
* The integer 0.
*/
public static final Integer ZERO = 0;
/**
* Prefix for outcomes starting a constituent.
*/
public static final String START = "S-";
/**
* Prefix for outcomes continuing a constituent.
*/
public static final String CONT = "C-";
/**
   * Outcome for a token which is not contained in a basal constituent.
*/
public static final String OTHER = "O";
/**
* Outcome used when a constituent is complete.
*/
public static final String COMPLETE = "c";
/**
* Outcome used when a constituent is incomplete.
*/
public static final String INCOMPLETE = "i";
/**
* The pos-tagger that the parser uses.
*/
protected POSTagger tagger;
/**
* The chunker that the parser uses to chunk non-recursive structures.
*/
protected Chunker chunker;
/**
* Specifies whether failed parses should be reported to standard error.
*/
protected boolean reportFailedParse;
/**
* Specifies whether a derivation string should be created during parsing.
* This is useful for debugging.
*/
protected boolean createDerivationString = false;
/**
* Turns debug print on or off.
*/
protected boolean debugOn = false;
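  /**
   * Instantiates a parser with the specified components and search parameters.
   *
   * @param tagger The pos-tagger used to assign initial pos-tags.
   * @param chunker The chunker used to identify flat, non-recursive constituents.
   * @param headRules The head rules used to determine constituent heads and punctuation tags.
   * @param beamSize The number of parses kept at each derivation step.
   * @param advancePercentage The minimum total probability mass of advanced outcomes.
   */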
public AbstractBottomUpParser(POSTagger tagger, Chunker chunker, HeadRules headRules,
int beamSize, double advancePercentage) {
this.tagger = tagger;
this.chunker = chunker;
this.M = beamSize;
this.K = beamSize;
this.Q = advancePercentage;
reportFailedParse = true;
this.headRules = headRules;
this.punctSet = headRules.getPunctuationTags();
odh = new TreeSet<>();
ndh = new TreeSet<>();
completeParses = new TreeSet<>();
}
/**
* Specifies whether the parser should report when it was unable to find a parse for
* a particular sentence.
   * @param errorReporting If true, un-parsed sentences are reported to standard error; otherwise they are not.
*/
public void setErrorReporting(boolean errorReporting) {
this.reportFailedParse = errorReporting;
}
/**
   * Assigns parent references for the specified parse so that they
   * are consistent with the child references.
* @param p The parse whose parent references need to be assigned.
*/
public static void setParents(Parse p) {
Parse[] children = p.getChildren();
for (int ci = 0; ci < children.length; ci++) {
children[ci].setParent(p);
setParents(children[ci]);
}
}
/**
   * Removes the punctuation from the specified set of chunks, attaches it to the
   * parses adjacent to the punctuation, and returns a new array of parses with
   * the punctuation removed.
*
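   * <p>
   * A minimal sketch of the effect (the chunk types are illustrative):
   * <pre>{@code
   * // chunks typed [NP, ",", VP] with "," in punctSet
   * Parse[] collapsed = AbstractBottomUpParser.collapsePunctuation(chunks, punctSet);
   * // collapsed is [NP, VP]; the comma is linked to its neighbors via
   * // addNextPunctuation/addPreviousPunctuation instead of staying in the array
   * }</pre>
   *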
* @param chunks A set of parses.
* @param punctSet The set of punctuation which is to be removed.
* @return An array of parses which is a subset of chunks with punctuation removed.
*/
public static Parse[] collapsePunctuation(Parse[] chunks, Set<String> punctSet) {
List<Parse> collapsedParses = new ArrayList<>(chunks.length);
int lastNonPunct = -1;
int nextNonPunct;
for (int ci = 0, cn = chunks.length; ci < cn; ci++) {
if (punctSet.contains(chunks[ci].getType())) {
if (lastNonPunct >= 0) {
chunks[lastNonPunct].addNextPunctuation(chunks[ci]);
}
for (nextNonPunct = ci + 1; nextNonPunct < cn; nextNonPunct++) {
if (!punctSet.contains(chunks[nextNonPunct].getType())) {
break;
}
}
if (nextNonPunct < cn) {
chunks[nextNonPunct].addPreviousPunctuation(chunks[ci]);
}
}
else {
collapsedParses.add(chunks[ci]);
lastNonPunct = ci;
}
}
if (collapsedParses.size() == chunks.length) {
return chunks;
}
//System.err.println("collapsedPunctuation: collapsedParses"+collapsedParses);
return collapsedParses.toArray(new Parse[collapsedParses.size()]);
}
/**
   * Advances the specified parse and returns an array of advanced parses whose
   * probability accounts for more than the specified amount of probability mass.
   *
   * @param p The parse to advance.
   * @param probMass The amount of probability mass that should be accounted for
   * by the advanced parses.
   * @return The advanced parses.
*/
protected abstract Parse[] advanceParses(final Parse p, double probMass);
/**
* Adds the "TOP" node to the specified parse.
* @param p The complete parse.
*/
protected abstract void advanceTop(Parse p);
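  /**
   * Returns up to the specified number of parses for the specified token parse.
   *
   * @param tokens A parse containing the tokens with a single parent node.
   * @param numParses The number of parses desired.
   * @return The top ranked parses, or the current best guess if no complete
   * parse could be found.
   */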
public Parse[] parse(Parse tokens, int numParses) {
if (createDerivationString) tokens.setDerivation(new StringBuffer(100));
odh.clear();
ndh.clear();
completeParses.clear();
int derivationStage = 0; //derivation length
int maxDerivationLength = 2 * tokens.getChildCount() + 3;
odh.add(tokens);
Parse guess = null;
double minComplete = 2;
double bestComplete = -100000; //approximating -infinity/0 in ln domain
while (odh.size() > 0 && (completeParses.size() < M || (odh.first()).getProb() < minComplete)
&& derivationStage < maxDerivationLength) {
ndh = new TreeSet<>();
int derivationRank = 0;
for (Iterator<Parse> pi = odh.iterator(); pi.hasNext()
&& derivationRank < K; derivationRank++) { // foreach derivation
Parse tp = pi.next();
//TODO: Need to look at this for K-best parsing cases
/*
//this parse and the ones which follow will never win, stop advancing.
if (tp.getProb() < bestComplete) {
break;
}
*/
if (guess == null && derivationStage == 2) {
guess = tp;
}
if (debugOn) {
System.out.print(derivationStage + " " + derivationRank + " " + tp.getProb());
tp.show();
System.out.println();
}
Parse[] nd;
if (0 == derivationStage) {
nd = advanceTags(tp);
}
else if (1 == derivationStage) {
if (ndh.size() < K) {
//System.err.println("advancing ts "+j+" "+ndh.size()+" < "+K);
            nd = advanceChunks(tp, bestComplete);
}
else {
//System.err.println("advancing ts "+j+" prob="+((Parse) ndh.last()).getProb());
            nd = advanceChunks(tp, (ndh.last()).getProb());
}
}
else { // i > 1
nd = advanceParses(tp, Q);
}
if (nd != null) {
for (int k = 0, kl = nd.length; k < kl; k++) {
if (nd[k].complete()) {
advanceTop(nd[k]);
if (nd[k].getProb() > bestComplete) {
bestComplete = nd[k].getProb();
}
if (nd[k].getProb() < minComplete) {
minComplete = nd[k].getProb();
}
completeParses.add(nd[k]);
}
else {
ndh.add(nd[k]);
}
}
}
else {
//if (reportFailedParse) {
// System.err.println("Couldn't advance parse " + derivationStage
// + " stage " + derivationRank + "!\n");
//}
advanceTop(tp);
completeParses.add(tp);
}
}
derivationStage++;
odh = ndh;
}
if (completeParses.size() == 0) {
// if (reportFailedParse) System.err.println("Couldn't find parse for: " + tokens);
//Parse r = (Parse) odh.first();
//r.show();
//System.out.println();
return new Parse[] {guess};
}
else if (numParses == 1) {
return new Parse[] {completeParses.first()};
}
else {
List<Parse> topParses = new ArrayList<>(numParses);
while (!completeParses.isEmpty() && topParses.size() < numParses) {
Parse tp = completeParses.last();
completeParses.remove(tp);
topParses.add(tp);
//parses.remove(tp);
}
return topParses.toArray(new Parse[topParses.size()]);
}
}
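  /**
   * Returns the single best parse for the specified parse of tokens and makes its
   * parent references consistent via {@link #setParents setParents}. If the
   * specified parse has no children it is returned unchanged.
   *
   * @param tokens A parse containing the tokens with a single parent node.
   * @return The top ranked parse.
   */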
public Parse parse(Parse tokens) {
if (tokens.getChildCount() > 0) {
Parse p = parse(tokens,1)[0];
setParents(p);
return p;
}
else {
return tokens;
}
}
/**
* Returns the top chunk sequences for the specified parse.
* @param p A pos-tag assigned parse.
* @param minChunkScore A minimum score below which chunks should not be advanced.
* @return The top chunk assignments to the specified parse.
*/
protected Parse[] advanceChunks(final Parse p, double minChunkScore) {
// chunk
Parse[] children = p.getChildren();
String[] words = new String[children.length];
String[] ptags = new String[words.length];
double[] probs = new double[words.length];
for (int i = 0, il = children.length; i < il; i++) {
Parse sp = children[i];
words[i] = sp.getHead().getCoveredText();
ptags[i] = sp.getType();
}
//System.err.println("adjusted mcs = "+(minChunkScore-p.getProb()));
    Sequence[] cs = chunker.topKSequences(words, ptags, minChunkScore - p.getProb());
Parse[] newParses = new Parse[cs.length];
for (int si = 0, sl = cs.length; si < sl; si++) {
newParses[si] = (Parse) p.clone(); //copies top level
if (createDerivationString) newParses[si].getDerivation().append(si).append(".");
String[] tags = cs[si].getOutcomes().toArray(new String[words.length]);
cs[si].getProbs(probs);
int start = -1;
int end = 0;
String type = null;
//System.err.print("sequence "+si+" ");
for (int j = 0; j <= tags.length; j++) {
// if (j != tags.length) {System.err.println(words[j]+" "
// +ptags[j]+" "+tags[j]+" "+probs.get(j));}
if (j != tags.length) {
newParses[si].addProb(StrictMath.log(probs[j]));
}
// if continue just update end chunking tag don't use contTypeMap
if (j != tags.length && tags[j].startsWith(CONT)) {
end = j;
}
else { //make previous constituent if it exists
if (type != null) {
//System.err.println("inserting tag "+tags[j]);
Parse p1 = p.getChildren()[start];
Parse p2 = p.getChildren()[end];
// System.err.println("Putting "+type+" at "+start+","+end+" for "
// +j+" "+newParses[si].getProb());
Parse[] cons = new Parse[end - start + 1];
cons[0] = p1;
//cons[0].label="Start-"+type;
if (end - start != 0) {
cons[end - start] = p2;
//cons[end-start].label="Cont-"+type;
for (int ci = 1; ci < end - start; ci++) {
cons[ci] = p.getChildren()[ci + start];
//cons[ci].label="Cont-"+type;
}
}
Parse chunk = new Parse(p1.getText(), new Span(p1.getSpan().getStart(),
p2.getSpan().getEnd()), type, 1, headRules.getHead(cons, type));
chunk.isChunk(true);
newParses[si].insert(chunk);
}
if (j != tags.length) { //update for new constituent
if (tags[j].startsWith(START)) { // don't use startTypeMap these are chunk tags
type = tags[j].substring(START.length());
start = j;
end = j;
}
else { // other
type = null;
}
}
}
}
//newParses[si].show();System.out.println();
}
return newParses;
}
/**
* Advances the parse by assigning it POS tags and returns multiple tag sequences.
* @param p The parse to be tagged.
* @return Parses with different POS-tag sequence assignments.
*/
protected Parse[] advanceTags(final Parse p) {
Parse[] children = p.getChildren();
String[] words = new String[children.length];
double[] probs = new double[words.length];
    for (int i = 0, il = children.length; i < il; i++) {
words[i] = children[i].getCoveredText();
}
Sequence[] ts = tagger.topKSequences(words);
Parse[] newParses = new Parse[ts.length];
for (int i = 0; i < ts.length; i++) {
String[] tags = ts[i].getOutcomes().toArray(new String[words.length]);
ts[i].getProbs(probs);
newParses[i] = (Parse) p.clone(); //copies top level
if (createDerivationString) newParses[i].getDerivation().append(i).append(".");
for (int j = 0; j < words.length; j++) {
Parse word = children[j];
//System.err.println("inserting tag "+tags[j]);
double prob = probs[j];
      newParses[i].insert(new Parse(word.getText(), word.getSpan(), tags[j], prob, j));
newParses[i].addProb(StrictMath.log(prob));
}
}
return newParses;
}
/**
   * Determines the mapping from the specified index into the parses without punctuation
   * to the corresponding index into the specified parses with punctuation.
   * @param index An index into the parses without punctuation.
   * @param nonPunctParses The parses without punctuation.
   * @param parses The parses with punctuation.
   * @return An index into the specified parses which corresponds to the same node as
   * the specified index into the parses without punctuation.
*/
protected int mapParseIndex(int index, Parse[] nonPunctParses, Parse[] parses) {
int parseIndex = index;
while (parses[parseIndex] != nonPunctParses[index]) {
parseIndex++;
}
return parseIndex;
}
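  /**
   * Tests whether the specified child is the last non-punctuation child of the
   * specified parent.
   */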
private static boolean lastChild(Parse child, Parse parent, Set<String> punctSet) {
if (parent == null) {
return false;
}
Parse[] kids = collapsePunctuation(parent.getChildren(), punctSet);
return (kids[kids.length - 1] == child);
}
/**
   * Creates an n-gram dictionary from the specified data stream using the specified
   * head rules and the specified cut-off.
*
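   * <p>
   * A minimal sketch of supplying the cutoff (this mirrors what
   * {@link #buildDictionary(ObjectStream, HeadRules, int)} does internally):
   * <pre>{@code
   * TrainingParameters params = new TrainingParameters();
   * params.put("dict", TrainingParameters.CUTOFF_PARAM, 5);
   * Dictionary dict = AbstractBottomUpParser.buildDictionary(parseSamples, rules, params);
   * }</pre>
   *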
* @param data The data stream of parses.
* @param rules The head rules for the parses.
   * @param params Training parameters which may contain a cutoff in the "dict"
   * namespace: the minimum number of occurrences required for an n-gram to be
   * saved as part of the dictionary (defaults to 5).
* @return A dictionary object.
*/
public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules,
TrainingParameters params) throws IOException {
int cutoff = params.getIntParameter("dict", TrainingParameters.CUTOFF_PARAM, 5);
NGramModel mdict = new NGramModel();
Parse p;
while ((p = data.read()) != null) {
p.updateHeads(rules);
Parse[] pwords = p.getTagNodes();
String[] words = new String[pwords.length];
//add all uni-grams
      for (int wi = 0; wi < words.length; wi++) {
words[wi] = pwords[wi].getCoveredText();
}
mdict.add(new StringList(words), 1, 1);
      //add tri-grams and bi-grams for initial sequence
Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p),
rules.getPunctuationTags());
String[] cwords = new String[chunks.length];
for (int wi = 0; wi < cwords.length; wi++) {
cwords[wi] = chunks[wi].getHead().getCoveredText();
}
mdict.add(new StringList(cwords), 2, 3);
//emulate reductions to produce additional n-grams
int ci = 0;
while (ci < chunks.length) {
// System.err.println("chunks["+ci+"]="+chunks[ci].getHead().getCoveredText()
// +" chunks.length="+chunks.length + " " + chunks[ci].getParent());
if (chunks[ci].getParent() == null) {
chunks[ci].show();
}
      if (lastChild(chunks[ci], chunks[ci].getParent(), rules.getPunctuationTags())) {
//perform reduce
int reduceStart = ci;
while (reduceStart >= 0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
reduceStart--;
}
reduceStart++;
        chunks = ParserEventStream.reduceChunks(chunks, ci, chunks[ci].getParent());
ci = reduceStart;
if (chunks.length != 0) {
String[] window = new String[5];
int wi = 0;
if (ci - 2 >= 0) window[wi++] = chunks[ci - 2].getHead().getCoveredText();
if (ci - 1 >= 0) window[wi++] = chunks[ci - 1].getHead().getCoveredText();
window[wi++] = chunks[ci].getHead().getCoveredText();
if (ci + 1 < chunks.length) window[wi++] = chunks[ci + 1].getHead().getCoveredText();
if (ci + 2 < chunks.length) window[wi++] = chunks[ci + 2].getHead().getCoveredText();
if (wi < 5) {
String[] subWindow = new String[wi];
System.arraycopy(window, 0, subWindow, 0, wi);
window = subWindow;
}
if (window.length >= 3) {
mdict.add(new StringList(window), 2, 3);
}
else if (window.length == 2) {
mdict.add(new StringList(window), 2, 2);
}
}
ci = reduceStart - 1; //ci will be incremented at end of loop
}
ci++;
}
}
//System.err.println("gas,and="+mdict.getCount((new TokenList(new String[] {"gas","and"}))));
mdict.cutoff(cutoff, Integer.MAX_VALUE);
return mdict.toDictionary(true);
}
/**
   * Creates an n-gram dictionary from the specified data stream using the specified
   * head rules and the specified cut-off.
*
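   * <p>
   * A minimal usage sketch ({@code samples} and {@code headRules} are assumed to
   * be set up elsewhere, e.g. a {@code ParseSampleStream} over treebank lines):
   * <pre>{@code
   * ObjectStream<Parse> samples = ...;
   * Dictionary dict = AbstractBottomUpParser.buildDictionary(samples, headRules, 5);
   * }</pre>
   *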
* @param data The data stream of parses.
* @param rules The head rules for the parses.
* @param cutoff The minimum number of entries required for the n-gram to be
* saved as part of the dictionary.
* @return A dictionary object.
*/
public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, int cutoff)
throws IOException {
TrainingParameters params = new TrainingParameters();
params.put("dict", TrainingParameters.CUTOFF_PARAM, cutoff);
return buildDictionary(data, rules, params);
}
}