| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.textsimilarity.chunker2matcher; |
| |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.logging.Level; |
| import java.util.logging.Logger; |
| |
| import opennlp.tools.chunker.ChunkerME; |
| import opennlp.tools.chunker.ChunkerModel; |
| import opennlp.tools.cmdline.parser.ParserTool; |
| import opennlp.tools.parser.AbstractBottomUpParser; |
| import opennlp.tools.parser.Parse; |
| import opennlp.tools.parser.Parser; |
| import opennlp.tools.parser.ParserFactory; |
| import opennlp.tools.parser.ParserModel; |
| import opennlp.tools.postag.POSModel; |
| import opennlp.tools.postag.POSTagger; |
| import opennlp.tools.postag.POSTaggerME; |
| import opennlp.tools.sentdetect.SentenceDetector; |
| import opennlp.tools.sentdetect.SentenceDetectorME; |
| import opennlp.tools.sentdetect.SentenceModel; |
| import opennlp.tools.textsimilarity.LemmaPair; |
| import opennlp.tools.textsimilarity.ParseTreeChunk; |
| import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic; |
| import opennlp.tools.textsimilarity.SentencePairMatchResult; |
| import opennlp.tools.textsimilarity.TextProcessor; |
| import opennlp.tools.tokenize.Tokenizer; |
| import opennlp.tools.tokenize.TokenizerME; |
| import opennlp.tools.tokenize.TokenizerModel; |
| import opennlp.tools.util.Span; |
| |
/**
 * Combines an OpenNLP sentence detector, tokenizer, POS tagger, parser and
 * chunker into one processor that produces grouped phrase chunks and matches
 * parse trees between two portions of text. Used as a singleton via
 * {@link #getInstance()}; per-sentence parse results are cached in
 * sentence_parseObject and persisted through ParserCacheSerializer so runs
 * can succeed even when the model files are absent.
 */
public class ParserChunker2MatcherProcessor {
  // sentences shorter than this many characters are skipped (likely headings)
  protected static final int MIN_SENTENCE_LENGTH = 10;
  private static final String MODEL_DIR_KEY = "nlp.models.dir";
  // TODO config
  // this is where resources should live
  private static String MODEL_DIR=null, MODEL_DIR_REL = "src/test/resources/models";
  protected static ParserChunker2MatcherProcessor instance;

  private SentenceDetector sentenceDetector;
  private Tokenizer tokenizer;
  private POSTagger posTagger;
  private Parser parser;
  private ChunkerME chunker;
  // grouped-chunk sections per sentence: NP, VP, PP, ADJP, whole sentence
  private final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5;
  private static Logger LOG = Logger
      .getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor");
  // cache: raw input sentence -> { chunk tags, POS tags, tokens }
  private Map<String, String[][]> sentence_parseObject = new HashMap<String, String[][]>();
| |
  /** @return the sentence detector, or null if its model failed to load */
  public SentenceDetector getSentenceDetector() {
    return sentenceDetector;
  }

  /** @param sentenceDetector replacement sentence detector */
  public void setSentenceDetector(SentenceDetector sentenceDetector) {
    this.sentenceDetector = sentenceDetector;
  }

  /** @return the tokenizer, or null if its model failed to load */
  public Tokenizer getTokenizer() {
    return tokenizer;
  }

  /** @param tokenizer replacement tokenizer */
  public void setTokenizer(Tokenizer tokenizer) {
    this.tokenizer = tokenizer;
  }

  /** @return the chunker, or null if its model failed to load */
  public ChunkerME getChunker() {
    return chunker;
  }

  /** @param chunker replacement chunker */
  public void setChunker(ChunkerME chunker) {
    this.chunker = chunker;
  }
| |
  /**
   * Loads the serialized sentence-parse cache, resolves the model directory,
   * then loads all OpenNLP models. If models are missing the processor falls
   * back to cache-only operation rather than failing.
   */
  @SuppressWarnings("unchecked")
  protected ParserChunker2MatcherProcessor() {
    try {
      sentence_parseObject = (Map<String, String[][]>) ParserCacheSerializer
          .readObject();
    } catch (Exception e) {
      // this file might not exist initially
      LOG.fine("parsing cache file does not exist (but should be created)");
      sentence_parseObject = new HashMap<String, String[][]>();
    }
    // readObject may also return null without throwing; normalize to empty map
    if (sentence_parseObject == null)
      sentence_parseObject = new HashMap<String, String[][]>();

    try {
      // resolve the default model dir relative to the current working directory
      if (MODEL_DIR==null || MODEL_DIR.equals("/models")) {
        String absPath = new File(".").getAbsolutePath();
        // strip the trailing '.' so MODEL_DIR_REL appends cleanly
        absPath = absPath.substring(0, absPath.length()-1);
        MODEL_DIR = absPath + MODEL_DIR_REL;
      }
      //get full path from constructor

      initializeSentenceDetector();
      initializeTokenizer();
      initializePosTagger();
      initializeParser();
      initializeChunker();
    } catch (Exception e) { // a typical error when 'model' is not installed
      System.err.println("Please install OpenNLP model files in 'src/test/resources' (folder 'model'");
      LOG.fine("The model can't be read and we rely on cache");
    }
  }
| |
  /**
   * Closes the processor: releases the singleton reference (loaded models
   * become collectable) and serializes the parsing cache for future runs.
   */
  public void close() {
    instance = null;
    ParserCacheSerializer.writeObject(sentence_parseObject);
  }
| |
  /**
   * singleton method of instantiating the processor; synchronized so
   * concurrent callers cannot create two instances
   *
   * @return the instance
   */
  public synchronized static ParserChunker2MatcherProcessor getInstance() {
    if (instance == null)
      instance = new ParserChunker2MatcherProcessor();

    return instance;
  }
| |
  /**
   * Singleton accessor that also points the processor at a resource root.
   *
   * @param fullPathToResources resource root; "/models" is appended
   * @return the instance
   */
  // NOTE(review): when the singleton already exists, the new path only updates
  // MODEL_DIR — no models are reloaded; confirm this is the intended behavior
  public synchronized static ParserChunker2MatcherProcessor getInstance(String fullPathToResources) {
    MODEL_DIR = fullPathToResources+"/models";
    if (instance == null)
      instance = new ParserChunker2MatcherProcessor();

    return instance;
  }
| |
| /** |
| * General parsing function, which returns lists of parses for a portion of |
| * text |
| * |
| * @param text |
| * to be parsed |
| * @return lists of parses |
| */ |
| public List<List<Parse>> parseTextNlp(String text) { |
| if (text == null || text.trim().length() == 0) |
| return null; |
| |
| List<List<Parse>> textParses = new ArrayList<List<Parse>>(1); |
| |
| // parse paragraph by paragraph |
| String[] paragraphList = splitParagraph(text); |
| for (String paragraph : paragraphList) { |
| if (paragraph.length() == 0) |
| continue; |
| |
| List<Parse> paragraphParses = parseParagraphNlp(paragraph); |
| if (paragraphParses != null) |
| textParses.add(paragraphParses); |
| } |
| |
| return textParses; |
| } |
| |
| public List<Parse> parseParagraphNlp(String paragraph) { |
| if (paragraph == null || paragraph.trim().length() == 0) |
| return null; |
| |
| // normalize the text before parsing, otherwise, the sentences may not |
| // be |
| // separated correctly |
| |
| // parse sentence by sentence |
| String[] sentences = splitSentences(paragraph); |
| List<Parse> parseList = new ArrayList<Parse>(sentences.length); |
| for (String sentence : sentences) { |
| sentence = sentence.trim(); |
| if (sentence.length() == 0) |
| continue; |
| |
| Parse sentenceParse = parseSentenceNlp(sentence, false); |
| if (sentenceParse != null) |
| parseList.add(sentenceParse); |
| } |
| |
| return parseList; |
| } |
| |
  /**
   * Parses a single sentence, normalizing its text first.
   *
   * @param sentence the sentence to parse
   * @return the best parse, or null (see the two-argument overload)
   */
  public Parse parseSentenceNlp(String sentence) {
    // if we parse an individual sentence, we want to normalize the text
    // before parsing
    return parseSentenceNlp(sentence, true);
  }
| |
| public synchronized Parse parseSentenceNlp(String sentence, |
| boolean normalizeText) { |
| // don't try to parse very short sentence, not much info in it anyway, |
| // most likely a heading |
| if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH) |
| return null; |
| |
| Parse[] parseArray = null; |
| try { |
| parseArray = ParserTool.parseLine(sentence, parser, 1); |
| } catch (Throwable t) { |
| LOG.log(Level.WARNING, "failed to parse the sentence : '" + sentence); //, t); |
| return null; |
| } |
| // there should be only one result parse |
| if (parseArray != null && parseArray.length > 0) |
| return parseArray[0]; |
| else |
| return null; |
| } |
| |
| /** |
| * |
| * @param para |
| * input text string which is assumed to be a paragraph and is split |
| * into sentences |
| * @return a list of lists of phrases with their POS tags for each phrase type |
| * (noun, verb etc.) |
| */ |
| |
| public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara( |
| String para) { |
| List<List<ParseTreeChunk>> listOfChunksAccum = new ArrayList<List<ParseTreeChunk>>(); |
| String[] sentences = splitSentences(para); |
| for (String sent : sentences) { |
| List<List<ParseTreeChunk>> singleSentChunks = formGroupedPhrasesFromChunksForSentence(sent); |
| if (singleSentChunks == null) |
| continue; |
| if (listOfChunksAccum.size() < 1) { |
| listOfChunksAccum = new ArrayList<List<ParseTreeChunk>>( |
| singleSentChunks); |
| } else |
| for (int i = 0; i < NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS; i++) { |
| // make sure not null |
| if (singleSentChunks == null |
| || singleSentChunks.size() != NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS) |
| break; |
| List<ParseTreeChunk> phraseI = singleSentChunks.get(i); |
| List<ParseTreeChunk> phraseIaccum = listOfChunksAccum.get(i); |
| phraseIaccum.addAll(phraseI); |
| listOfChunksAccum.set(i, phraseIaccum); |
| } |
| } |
| return listOfChunksAccum; |
| } |
| |
  /**
   * Tokenizes, tags and chunks one sentence, consulting the persistent cache
   * (keyed by the raw input) first and updating it on success.
   *
   * @param sentenceInp the raw sentence, used unmodified as the cache key
   * @return array of { chunk tags, POS tags, tokens }, or null when no
   *         tokenizer is loaded or the sentence cannot be parsed
   */
  String[][] parseChunkSentence(String sentenceInp) {
    // cache hit: reuse a previous run's result (works without model files)
    String[][] resToksTags = sentence_parseObject.get(sentenceInp);
    if (resToksTags != null)
      return resToksTags;
    // no cache entry and no tokenizer model: cannot proceed
    if (tokenizer == null)
      return null;

    String sentence = TextProcessor.removePunctuation(sentenceInp);

    String[] toks = tokenizer.tokenize(sentence);
    String[] tags = new String[toks.length]; // posTagger.tag(toks);
    // POS tags are taken from the full parse tree, not the POS tagger
    SentenceNode node = parseSentenceNode(sentence);
    if (node == null) {
      LOG.info("Problem parsing sentence '" + sentence);
      return null;
    }
    List<String> POSlist = node.getOrderedPOSList();

    tags = POSlist.toArray(new String[0]);
    if (toks.length != tags.length) {
      LOG.finest("disagreement between toks and tags; sent = '" + sentence
          + "'\n tags = " + tags
          + "\n will now try this sentence in lower case");
      // retry lower-cased: capitalization sometimes changes the parse
      node = parseSentenceNode(sentence.toLowerCase());
      if (node == null) {
        LOG.finest("Problem parsing sentence '" + sentence);
        return null;
      }
      POSlist = node.getOrderedPOSList();
      tags = POSlist.toArray(new String[0]);
      if (toks.length != tags.length) {
        LOG.finest("AGAIN: disagreement between toks and tags for lower case! ");
        if (toks.length > tags.length) {
          // truncate the token array so both arrays align for the chunker
          String[] newToks = new String[tags.length];
          for (int i = 0; i < tags.length; i++) {
            newToks[i] = toks[i];
          }
          toks = newToks;

        } else
          return null;
      }
    }

    String[] res = chunker.chunk(toks, tags);
    // store for future calls; persisted to disk by close()
    String[][] resTagToks = new String[][] { res, tags, toks };
    sentence_parseObject.put(sentenceInp, resTagToks);
    return resTagToks;
  }
| |
| /** |
| * |
| * @param para |
| * input text string which is assumed to be a sentence |
| * @return a list of lists of phrases with their POS tags for each phrase type |
| * (noun, verb etc.) |
| */ |
| public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence( |
| String sentence) { |
| if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH) |
| return null; |
| /* |
| * sentence = TextProcessor.removePunctuation(sentence); |
| * |
| * String[] toks = tokenizer.tokenize(sentence); String[] tags = new |
| * String[toks.length]; //posTagger.tag(toks); SentenceNode node = |
| * parseSentenceNode(sentence); if (node==null){ |
| * LOG.info("Problem parsing sentence '"+sentence); return null; } |
| * List<String> POSlist = node.getOrderedPOSList(); |
| * |
| * tags = POSlist.toArray(new String[0]); if (toks.length != tags.length){ |
| * LOG.info("disagreement between toks and tags; sent = '"+sentence + |
| * "'\n tags = "+tags + "\n will now try this sentence in lower case" ); |
| * node = parseSentenceNode(sentence.toLowerCase()); if (node==null){ |
| * LOG.info("Problem parsing sentence '"+sentence); return null; } POSlist = |
| * node.getOrderedPOSList(); tags = POSlist.toArray(new String[0]); if |
| * (toks.length != tags.length){ |
| * LOG.info("AGAIN: disagreement between toks and tags for lower case! "); |
| * if (toks.length>tags.length){ String[] newToks = new String[tags.length]; |
| * for(int i = 0; i<tags.length; i++ ){ newToks[i] = toks[i]; } toks = |
| * newToks; |
| * |
| * } else return null; } } |
| */ |
| String[][] resTagToks = parseChunkSentence(sentence); |
| if (resTagToks == null) |
| return null; |
| String[] res = resTagToks[0]; |
| String[] tags = resTagToks[1]; |
| String[] toks = resTagToks[2]; |
| |
| // String[] res = chunker.chunk(toks, tags); |
| |
| List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>(); |
| List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(), prepPhr = new ArrayList<ParseTreeChunk>(), verbPhr = new ArrayList<ParseTreeChunk>(), adjPhr = new ArrayList<ParseTreeChunk>(), |
| // to store the whole sentence |
| wholeSentence = new ArrayList<ParseTreeChunk>(); |
| List<String> pOSsAll = new ArrayList<String>(), lemmasAll = new ArrayList<String>(); |
| |
| for (int i = 0; i < toks.length; i++) { |
| pOSsAll.add(tags[i]); |
| lemmasAll.add(toks[i]); |
| } |
| wholeSentence.add(new ParseTreeChunk("SENTENCE", lemmasAll, pOSsAll)); |
| |
| boolean currPhraseClosed = false; |
| for (int i = 0; i < res.length; i++) { |
| String bi_POS = res[i]; |
| currPhraseClosed = false; |
| if (bi_POS.startsWith("B-NP")) {// beginning of a phrase |
| |
| List<String> pOSs = new ArrayList<String>(), lemmas = new ArrayList<String>(); |
| pOSs.add(tags[i]); |
| lemmas.add(toks[i]); |
| for (int j = i + 1; j < res.length; j++) { |
| if (res[j].startsWith("B-VP")) { |
| nounPhr.add(new ParseTreeChunk("NP", lemmas, pOSs)); |
| // LOG.info(i + " => " +lemmas); |
| currPhraseClosed = true; |
| break; |
| } else { |
| pOSs.add(tags[j]); |
| lemmas.add(toks[j]); |
| } |
| } |
| if (!currPhraseClosed) { |
| nounPhr.add(new ParseTreeChunk("NP", lemmas, pOSs)); |
| // LOG.fine(i + " => " + lemmas); |
| } |
| |
| } else if (bi_POS.startsWith("B-PP")) {// beginning of a phrase |
| List<String> pOSs = new ArrayList<String>(), lemmas = new ArrayList<String>(); |
| pOSs.add(tags[i]); |
| lemmas.add(toks[i]); |
| |
| for (int j = i + 1; j < res.length; j++) { |
| if (res[j].startsWith("B-VP")) { |
| prepPhr.add(new ParseTreeChunk("PP", lemmas, pOSs)); |
| // LOG.fine(i + " => " + lemmas); |
| currPhraseClosed = true; |
| break; |
| } else { |
| pOSs.add(tags[j]); |
| lemmas.add(toks[j]); |
| } |
| } |
| if (!currPhraseClosed) { |
| prepPhr.add(new ParseTreeChunk("PP", lemmas, pOSs)); |
| // LOG.fine(i + " => " + lemmas); |
| } |
| } else if (bi_POS.startsWith("B-VP")) {// beginning of a phrase |
| List<String> pOSs = new ArrayList<String>(), lemmas = new ArrayList<String>(); |
| pOSs.add(tags[i]); |
| lemmas.add(toks[i]); |
| |
| for (int j = i + 1; j < res.length; j++) { |
| if (res[j].startsWith("B-VP")) { |
| verbPhr.add(new ParseTreeChunk("VP", lemmas, pOSs)); |
| // LOG.fine(i + " => " +lemmas); |
| currPhraseClosed = true; |
| break; |
| } else { |
| pOSs.add(tags[j]); |
| lemmas.add(toks[j]); |
| } |
| } |
| if (!currPhraseClosed) { |
| verbPhr.add(new ParseTreeChunk("VP", lemmas, pOSs)); |
| // LOG.fine(i + " => " + lemmas); |
| } |
| } else if (bi_POS.startsWith("B-ADJP")) {// beginning of a phrase |
| List<String> pOSs = new ArrayList<String>(), lemmas = new ArrayList<String>(); |
| pOSs.add(tags[i]); |
| lemmas.add(toks[i]); |
| |
| for (int j = i + 1; j < res.length; j++) { |
| if (res[j].startsWith("B-VP")) { |
| adjPhr.add(new ParseTreeChunk("ADJP", lemmas, pOSs)); |
| // LOG.fine(i + " => " +lemmas); |
| currPhraseClosed = true; |
| break; |
| } else { |
| pOSs.add(tags[j]); |
| lemmas.add(toks[j]); |
| } |
| } |
| if (!currPhraseClosed) { |
| adjPhr.add(new ParseTreeChunk("ADJP", lemmas, pOSs)); |
| // LOG.fine(i + " => " + lemmas); |
| } |
| } |
| } |
| listOfChunks.add(nounPhr); |
| listOfChunks.add(verbPhr); |
| listOfChunks.add(prepPhr); |
| listOfChunks.add(adjPhr); |
| listOfChunks.add(wholeSentence); |
| |
| return listOfChunks; |
| } |
| |
| public static List<List<SentenceNode>> textToSentenceNodes( |
| List<List<Parse>> textParses) { |
| if (textParses == null || textParses.size() == 0) |
| return null; |
| |
| List<List<SentenceNode>> textNodes = new ArrayList<List<SentenceNode>>( |
| textParses.size()); |
| for (List<Parse> paragraphParses : textParses) { |
| List<SentenceNode> paragraphNodes = paragraphToSentenceNodes(paragraphParses); |
| |
| // append paragraph node if any |
| if (paragraphNodes != null && paragraphNodes.size() > 0) |
| textNodes.add(paragraphNodes); |
| } |
| |
| if (textNodes.size() > 0) |
| return textNodes; |
| else |
| return null; |
| } |
| |
| public static List<SentenceNode> paragraphToSentenceNodes( |
| List<Parse> paragraphParses) { |
| if (paragraphParses == null || paragraphParses.size() == 0) |
| return null; |
| |
| List<SentenceNode> paragraphNodes = new ArrayList<SentenceNode>( |
| paragraphParses.size()); |
| for (Parse sentenceParse : paragraphParses) { |
| SentenceNode sentenceNode = null; |
| try { |
| sentenceNode = sentenceToSentenceNode(sentenceParse); |
| } catch (Exception e) { |
| // don't fail the whole paragraph when a single sentence fails |
| LOG.severe("Failed to convert sentence to node. error: " + e); |
| sentenceNode = null; |
| } |
| |
| if (sentenceNode != null) |
| paragraphNodes.add(sentenceNode); |
| } |
| |
| if (paragraphNodes.size() > 0) |
| return paragraphNodes; |
| else |
| return null; |
| } |
| |
| public static SentenceNode sentenceToSentenceNode(Parse sentenceParse) { |
| if (sentenceParse == null) |
| return null; |
| |
| // convert the OpenNLP Parse to our own tree nodes |
| SyntacticTreeNode node = toSyntacticTreeNode(sentenceParse); |
| if ((node == null)) |
| return null; |
| if (node instanceof SentenceNode) |
| return (SentenceNode) node; |
| else if (node instanceof PhraseNode) { |
| SentenceNode sn = new SentenceNode("sentence", node.getChildren()); |
| return sn; |
| } else |
| return null; |
| } |
| |
  /** Parses text and converts the parses to SentenceNode trees per paragraph. */
  public List<List<SentenceNode>> parseTextNode(String text) {
    List<List<Parse>> textParseList = parseTextNlp(text);
    return textToSentenceNodes(textParseList);
  }

  /** Parses one paragraph and converts the parses to SentenceNode trees. */
  public List<SentenceNode> parseParagraphNode(String paragraph) {
    List<Parse> paragraphParseList = parseParagraphNlp(paragraph);
    return paragraphToSentenceNodes(paragraphParseList);
  }
| |
  /** Parses one sentence (normalizing its text first) into a SentenceNode. */
  public SentenceNode parseSentenceNode(String sentence) {
    return parseSentenceNode(sentence, true);
  }

  /**
   * Parses one sentence into a SentenceNode tree.
   *
   * @param sentence the sentence text
   * @param normalizeText whether to normalize before parsing
   * @return the node, or null when parsing or conversion fails
   */
  public synchronized SentenceNode parseSentenceNode(String sentence,
      boolean normalizeText) {
    Parse sentenceParse = parseSentenceNlp(sentence, normalizeText);
    return sentenceToSentenceNode(sentenceParse);
  }
| |
| public String[] splitParagraph(String text) { |
| String[] res = text.split("\n"); |
| if (res == null || res.length <= 1) |
| return new String[] { text }; |
| else |
| return res; |
| |
| } |
| |
| public String[] splitSentences(String text) { |
| if (text == null) |
| return null; |
| // if (sentenceDetector!=null) |
| // return sentenceDetector.sentDetect(text); |
| else { |
| List<String> sents = TextProcessor.splitToSentences(text); |
| return sents.toArray(new String[0]); |
| } |
| } |
| |
| public String[] tokenizeSentence(String sentence) { |
| if (sentence == null) |
| return null; |
| |
| return tokenizer.tokenize(sentence); |
| } |
| |
| protected void initializeSentenceDetector() { |
| InputStream is = null; |
| try { |
| is = new FileInputStream(MODEL_DIR + "/en-sent.bin" |
| |
| ); |
| SentenceModel model = new SentenceModel(is); |
| sentenceDetector = new SentenceDetectorME(model); |
| } catch (IOException e) { |
| e.printStackTrace(); |
| } finally { |
| if (is != null) { |
| try { |
| is.close(); |
| } catch (IOException e) { |
| // we swallow exception to support the cached run |
| e.printStackTrace(); |
| } |
| } |
| } |
| } |
| |
| protected void initializeTokenizer() { |
| InputStream is = null; |
| try { |
| is = new FileInputStream(MODEL_DIR + "/en-token.bin"); |
| TokenizerModel model = new TokenizerModel(is); |
| tokenizer = new TokenizerME(model); |
| } catch (IOException e) { |
| // we swallow exception to support the cached run |
| } finally { |
| if (is != null) { |
| try { |
| is.close(); |
| } catch (IOException e) { // we swallow exception to support the cached run |
| } |
| } |
| } |
| } |
| |
| protected void initializePosTagger() { |
| InputStream is = null; |
| try { |
| is = new FileInputStream(MODEL_DIR + "/en-pos-maxent.bin"); |
| POSModel model = new POSModel(is); |
| posTagger = new POSTaggerME(model); |
| } catch (IOException e) { |
| // we swallow exception to support the cached run |
| } finally { |
| if (is != null) { |
| try { |
| is.close(); |
| } catch (IOException e) { |
| } |
| } |
| } |
| } |
| |
| protected void initializeParser() { |
| InputStream is = null; |
| try { |
| is = new FileInputStream(MODEL_DIR + "/en-parser-chunking.bin"); |
| ParserModel model = new ParserModel(is); |
| parser = ParserFactory.create(model); |
| } catch (IOException e) { |
| //e.printStackTrace(); |
| } finally { |
| if (is != null) { |
| try { |
| is.close(); |
| } catch (IOException e) { // we swallow exception to support the cached run |
| } |
| } |
| } |
| } |
| |
| private void initializeChunker() { |
| InputStream is = null; |
| try { |
| is = new FileInputStream(MODEL_DIR + "/en-chunker.bin"); |
| ChunkerModel model = new ChunkerModel(is); |
| chunker = new ChunkerME(model); |
| } catch (IOException e) { |
| //e.printStackTrace(); |
| } finally { |
| if (is != null) { |
| try { |
| is.close(); |
| } catch (IOException e) { // we swallow exception to support the cached run |
| } |
| } |
| } |
| } |
| |
| /** |
| * convert an instance of Parse to SyntacticTreeNode, by filtering out the |
| * unnecessary data and assigning the word for each node |
| * |
| * @param parse |
| */ |
| private static SyntacticTreeNode toSyntacticTreeNode(Parse parse) { |
| if (parse == null) |
| return null; |
| |
| // check for junk types |
| String type = parse.getType(); |
| if (SyntacticTreeNode.isJunkType(type, parse)) |
| return null; |
| |
| String text = parse.getText(); |
| ArrayList<SyntacticTreeNode> childrenNodeList = convertChildrenNodes(parse); |
| |
| // check sentence node, the node contained in the top node |
| if (type.equals(AbstractBottomUpParser.TOP_NODE) |
| && childrenNodeList != null && childrenNodeList.size() > 0) { |
| PhraseNode rootNode; |
| try { |
| rootNode = (PhraseNode) childrenNodeList.get(0); |
| } catch (Exception e) { |
| return null; |
| } |
| return new SentenceNode(text, rootNode.getChildren()); |
| } |
| |
| // if this node contains children nodes, then it is a phrase node |
| if (childrenNodeList != null && childrenNodeList.size() > 0) { |
| // System.out.println("Found "+ type + " phrase = "+ childrenNodeList); |
| return new PhraseNode(type, childrenNodeList); |
| |
| } |
| |
| // otherwise, it is a word node |
| Span span = parse.getSpan(); |
| String word = text.substring(span.getStart(), span.getEnd()).trim(); |
| |
| return new WordNode(type, word); |
| } |
| |
| private static ArrayList<SyntacticTreeNode> convertChildrenNodes(Parse parse) { |
| if (parse == null) |
| return null; |
| |
| Parse[] children = parse.getChildren(); |
| if (children == null || children.length == 0) |
| return null; |
| |
| ArrayList<SyntacticTreeNode> childrenNodeList = new ArrayList<SyntacticTreeNode>(); |
| for (Parse child : children) { |
| SyntacticTreeNode childNode = toSyntacticTreeNode(child); |
| if (childNode != null) |
| childrenNodeList.add(childNode); |
| } |
| |
| return childrenNodeList; |
| } |
| |
| /** |
| * The key function of similarity component which takes two portions of text |
| * and does similarity assessment by finding the set of all maximum common |
| * subtrees of the set of parse trees for each portion of text |
| * |
| * @param input |
| * text 1 |
| * @param input |
| * text 2 |
| * @return the matching results structure, which includes the similarity score |
| */ |
| public SentencePairMatchResult assessRelevance(String para1, String para2) { |
| List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1), sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2); |
| |
| List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); |
| |
| ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic(); |
| List<List<ParseTreeChunk>> res = md |
| .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst); |
| return new SentencePairMatchResult(res, origChunks1); |
| |
| } |
| |
| protected List<LemmaPair> listListParseTreeChunk2ListLemmaPairs( |
| List<List<ParseTreeChunk>> sent1GrpLst) { |
| List<LemmaPair> results = new ArrayList<LemmaPair>(); |
| if (sent1GrpLst == null || sent1GrpLst.size() < 1) |
| return results; |
| List<ParseTreeChunk> wholeSentence = sent1GrpLst |
| .get(sent1GrpLst.size() - 1); // whole sentence is last list in the list |
| // of lists |
| |
| List<String> pOSs = wholeSentence.get(0).getPOSs(); |
| List<String> lemmas = wholeSentence.get(0).getLemmas(); |
| for (int i = 0; i < lemmas.size(); i++) { |
| results.add(new LemmaPair(pOSs.get(i), lemmas.get(i), i)); |
| } |
| |
| return results; |
| } |
| |
| public void printParseTree(String phrase1) { |
| ParserChunker2MatcherProcessor p = ParserChunker2MatcherProcessor |
| .getInstance(); |
| List<List<SentenceNode>> nodeListList = p.parseTextNode(phrase1); |
| for (List<SentenceNode> nodeList : nodeListList) { |
| for (SentenceNode node : nodeList) { |
| System.out.println(node); |
| } |
| } |
| } |
| } |
| |
/*
 * Historical stack trace, kept for reference: a ClassCastException used to be
 * thrown in toSyntacticTreeNode when the TOP node's first child was a WordNode
 * rather than a PhraseNode (now guarded).
 *
 * java.lang.ClassCastException: opennlp.tools.textsimilarity.chunker2matcher.WordNode cannot be cast to opennlp.tools.textsimilarity.chunker2matcher.PhraseNode
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.toSyntacticTreeNode(ParserChunker2MatcherProcessor.java:699)
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.sentenceToSentenceNode(ParserChunker2MatcherProcessor.java:525)
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.parseSentenceNode(ParserChunker2MatcherProcessor.java:554)
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.parseSentenceNode(ParserChunker2MatcherProcessor.java:548)
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.parseChunkSentence(ParserChunker2MatcherProcessor.java:282)
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.formGroupedPhrasesFromChunksForSentence(ParserChunker2MatcherProcessor.java:355)
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.formGroupedPhrasesFromChunksForPara(ParserChunker2MatcherProcessor.java:250)
 at opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor.assessRelevance(ParserChunker2MatcherProcessor.java:747)
 at opennlp.tools.similarity.apps.RelatedSentenceFinder.augmentWithMinedSentencesAndVerifyRelevance(RelatedSentenceFinder.java:458)
 at opennlp.tools.similarity.apps.RelatedSentenceFinder.generateContentAbout(RelatedSentenceFinder.java:156)
 */