/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.textsimilarity.chunker2matcher;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.AbstractBottomUpParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.textsimilarity.LemmaPair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.TextProcessor;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
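
/**
 * Processor that matches two portions of text by detecting sentences,
 * tokenizing, parsing and chunking them into grouped phrases, and then
 * finding the maximum common subtrees between the two phrase sets.
 * <p>
 * A minimal usage sketch (the OpenNLP model files are assumed to be present
 * under MODEL_DIR; see the initialize* methods below):
 *
 * <pre>
 * ParserChunker2MatcherProcessor proc = ParserChunker2MatcherProcessor.getInstance();
 * SentencePairMatchResult result = proc.assessRelevance(
 *     "I want to find a place to eat pizza", "Where can I eat some pizza");
 * </pre>
 */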
public class ParserChunker2MatcherProcessor {
protected static final int MIN_SENTENCE_LENGTH = 10;
private static final String MODEL_DIR_KEY = "nlp.models.dir";
// TODO: make the model location configurable; this is where the model
// resources should live
private static String MODEL_DIR = "resources/models";
protected static ParserChunker2MatcherProcessor instance;
private SentenceDetector sentenceDetector;
private Tokenizer tokenizer;
private POSTagger posTagger;
private Parser parser;
private ChunkerME chunker;
private static final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5;
private static final Logger LOG = Logger
    .getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor");
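
// note: constructing a processor loads five OpenNLP models from disk, which
// is expensive; the class is therefore used as a singleton via getInstance()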
protected ParserChunker2MatcherProcessor() {
// resolve the relative model directory against the current working directory
MODEL_DIR = new File(MODEL_DIR).getAbsolutePath();
initializeSentenceDetector();
initializeTokenizer();
initializePosTagger();
initializeParser();
initializeChunker();
}
/**
 * Singleton accessor which lazily instantiates the processor on first use.
 * @return the shared processor instance
 */
public static synchronized ParserChunker2MatcherProcessor getInstance() {
if (instance == null)
instance = new ParserChunker2MatcherProcessor();
return instance;
}
/**
 * General parsing function which returns lists of parses for a portion of text.
 * @param text the text to be parsed
 * @return a list of parse lists, one list per paragraph
 */
public List<List<Parse>> parseTextNlp(String text) {
if (text == null || text.trim().length() == 0)
return null;
List<List<Parse>> textParses = new ArrayList<List<Parse>>(1);
// parse paragraph by paragraph
String[] paragraphList = splitParagraph(text);
for (String paragraph : paragraphList) {
if (paragraph.length() == 0)
continue;
List<Parse> paragraphParses = parseParagraphNlp(paragraph);
if (paragraphParses != null)
textParses.add(paragraphParses);
}
return textParses;
}
public List<Parse> parseParagraphNlp(String paragraph) {
if (paragraph == null || paragraph.trim().length() == 0)
return null;
// normalize the text before parsing; otherwise the sentences may not be
// separated correctly
// paragraph = TextNormalizer.normalizeText(paragraph);
// parse sentence by sentence
String[] sentences = splitSentences(paragraph);
List<Parse> parseList = new ArrayList<Parse>(sentences.length);
for (String sentence : sentences) {
sentence = sentence.trim();
if (sentence.length() == 0)
continue;
Parse sentenceParse = parseSentenceNlp(sentence, false);
if (sentenceParse != null)
parseList.add(sentenceParse);
}
return parseList;
}
public Parse parseSentenceNlp(String sentence) {
// if we parse an individual sentence, we want to normalize the text
// before parsing
return parseSentenceNlp(sentence, true);
}
public synchronized Parse parseSentenceNlp(String sentence,
boolean normalizeText) {
// don't try to parse very short sentences: they carry little information
// and are most likely headings
if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH)
return null;
//if (normalizeText)
// sentence = TextNormalizer.normalizeText(sentence);
Parse[] parseArray = null;
try {
parseArray = ParserTool.parseLine(sentence, parser, 1);
} catch (Throwable t) {
LOG.log(Level.WARNING, "failed to parse the sentence: '" + sentence + "'", t);
return null;
}
// Parse[] chunks = ChunkSampleStream.getInitialChunks(parseArray[0]) ;
// there should be only one result parse
if (parseArray != null && parseArray.length > 0)
return parseArray[0];
else
return null;
}
/**
 * Splits a paragraph into sentences, chunks each sentence and groups the
 * resulting phrases by type.
 * @param para input text string which is assumed to be a paragraph and is split into sentences
 * @return a list of lists of phrases with their POS tags, one list per phrase type (noun, verb etc.)
 */
public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(String para){
List<List<ParseTreeChunk>> listOfChunksAccum = new ArrayList<List<ParseTreeChunk>>();
String[] sentences = splitSentences(para);
for (String sent : sentences) {
  List<List<ParseTreeChunk>> singleSentChunks = formGroupedPhrasesFromChunksForSentence(sent);
  if (singleSentChunks == null)
    continue;
  if (listOfChunksAccum.isEmpty()) {
    listOfChunksAccum = new ArrayList<List<ParseTreeChunk>>(singleSentChunks);
  } else if (singleSentChunks.size() == NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS) {
    // merge this sentence's phrase groups into the accumulated groups, type by type
    for (int i = 0; i < NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS; i++) {
      listOfChunksAccum.get(i).addAll(singleSentChunks.get(i));
    }
  }
}
return listOfChunksAccum;
}
/**
 * Chunks a single sentence and groups the resulting phrases by type.
 * @param sentence input text string which is assumed to be a sentence
 * @return a list of lists of phrases with their POS tags, one list per phrase type (noun, verb etc.)
 */
public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence(String sentence) {
if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH)
return null;
sentence = TextProcessor.removePunctuation(sentence);
String[] toks = tokenizer.tokenize(sentence);
// the POS tags are taken from the parse tree below rather than from the
// POS tagger (cf. posTagger.tag(toks))
String[] tags;
SentenceNode node = parseSentenceNode(sentence);
if (node == null) {
  LOG.info("Problem parsing sentence '" + sentence + "'");
  return null;
}
List<String> POSlist = node.getOrderedPOSList();
tags = POSlist.toArray(new String[0]);
if (toks.length != tags.length) {
  LOG.info("disagreement between toks and tags; sent = '" + sentence + "'\n tags = "
      + Arrays.toString(tags) + "\n will now try this sentence in lower case");
  node = parseSentenceNode(sentence.toLowerCase());
  if (node == null) {
    LOG.info("Problem parsing sentence '" + sentence + "'");
    return null;
  }
  POSlist = node.getOrderedPOSList();
  tags = POSlist.toArray(new String[0]);
  if (toks.length != tags.length) {
    LOG.info("AGAIN: disagreement between toks and tags for lower case!");
    if (toks.length > tags.length) {
      // truncate the token array so it matches the tag count
      toks = Arrays.copyOf(toks, tags.length);
    } else {
      return null;
    }
  }
}
String[] res = chunker.chunk(toks, tags);
List<List<ParseTreeChunk>> listOfChunks = new ArrayList<List<ParseTreeChunk>>();
List<ParseTreeChunk> nounPhr = new ArrayList<ParseTreeChunk>(),
prepPhr = new ArrayList<ParseTreeChunk>(), verbPhr = new ArrayList<ParseTreeChunk>(),
adjPhr = new ArrayList<ParseTreeChunk>(),
// to store the whole sentence
wholeSentence = new ArrayList<ParseTreeChunk>();
List<String> pOSsAll = new ArrayList<String>(), lemmasAll = new ArrayList<String>();
for(int i = 0; i< toks.length; i++){
pOSsAll.add(tags[i]);
lemmasAll.add(toks[i]);
}
wholeSentence.add(new ParseTreeChunk("SENTENCE", lemmasAll, pOSsAll));
// the chunker emits BIO-style tags: B-XX opens a phrase of type XX,
// I-XX continues it, and O marks tokens outside any phrase
for (int i = 0; i < res.length; i++) {
  String bi_POS = res[i];
  String phraseType;
  List<ParseTreeChunk> phraseAccum;
  if (bi_POS.startsWith("B-NP")) {
    phraseType = "NP";
    phraseAccum = nounPhr;
  } else if (bi_POS.startsWith("B-PP")) {
    phraseType = "PP";
    phraseAccum = prepPhr;
  } else if (bi_POS.startsWith("B-VP")) {
    phraseType = "VP";
    phraseAccum = verbPhr;
  } else if (bi_POS.startsWith("B-ADJP")) {
    phraseType = "ADJP";
    phraseAccum = adjPhr;
  } else {
    continue;
  }
  // collect the phrase: start at token i and extend it until the next B-VP tag
  List<String> pOSs = new ArrayList<String>(), lemmas = new ArrayList<String>();
  pOSs.add(tags[i]);
  lemmas.add(toks[i]);
  for (int j = i + 1; j < res.length; j++) {
    if (res[j].startsWith("B-VP"))
      break;
    pOSs.add(tags[j]);
    lemmas.add(toks[j]);
  }
  phraseAccum.add(new ParseTreeChunk(phraseType, lemmas, pOSs));
}
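// fixed section order: NP, VP, PP, ADJP, whole sentence; downstream code
// (e.g. listListParseTreeChunk2ListLemmaPairs) relies on the whole sentence
// being the last section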
listOfChunks.add(nounPhr);
listOfChunks.add(verbPhr);
listOfChunks.add(prepPhr);
listOfChunks.add(adjPhr);
listOfChunks.add(wholeSentence);
return listOfChunks;
}
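// Illustration of formGroupedPhrasesFromChunksForSentence: for a sentence
// like "I go to the library", the groups would be roughly
//   NP:  [I], [the library]
//   VP:  [go to the library]
//   PP:  [to the library]
//   ADJP: []
//   SENTENCE: [I go to the library]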
public static List<List<SentenceNode>> textToSentenceNodes(
List<List<Parse>> textParses) {
if (textParses == null || textParses.size() == 0)
return null;
List<List<SentenceNode>> textNodes = new ArrayList<List<SentenceNode>>(
textParses.size());
for (List<Parse> paragraphParses : textParses) {
List<SentenceNode> paragraphNodes = paragraphToSentenceNodes(paragraphParses);
// append paragraph node if any
if (paragraphNodes != null && paragraphNodes.size() > 0)
textNodes.add(paragraphNodes);
}
if (textNodes.size() > 0)
return textNodes;
else
return null;
}
public static List<SentenceNode> paragraphToSentenceNodes(
List<Parse> paragraphParses) {
if (paragraphParses == null || paragraphParses.size() == 0)
return null;
List<SentenceNode> paragraphNodes = new ArrayList<SentenceNode>(
paragraphParses.size());
for (Parse sentenceParse : paragraphParses) {
SentenceNode sentenceNode = null;
try {
sentenceNode = sentenceToSentenceNode(sentenceParse);
} catch (Exception e) {
// don't fail the whole paragraph when a single sentence fails
LOG.log(Level.SEVERE, "Failed to convert sentence to node", e);
sentenceNode = null;
}
if (sentenceNode != null)
paragraphNodes.add(sentenceNode);
}
if (paragraphNodes.size() > 0)
return paragraphNodes;
else
return null;
}
public static SentenceNode sentenceToSentenceNode(Parse sentenceParse) {
if (sentenceParse == null)
return null;
// convert the OpenNLP Parse to our own tree nodes
SyntacticTreeNode node = toSyntacticTreeNode(sentenceParse);
if (node == null)
  return null;
if (node instanceof SentenceNode)
  return (SentenceNode) node;
else if (node instanceof PhraseNode)
  return new SentenceNode("sentence", node.getChildren());
else
  return null;
}
public List<List<SentenceNode>> parseTextNode(String text) {
List<List<Parse>> textParseList = parseTextNlp(text);
return textToSentenceNodes(textParseList);
}
public List<SentenceNode> parseParagraphNode(String paragraph) {
List<Parse> paragraphParseList = parseParagraphNlp(paragraph);
return paragraphToSentenceNodes(paragraphParseList);
}
public SentenceNode parseSentenceNode(String sentence) {
return parseSentenceNode(sentence, true);
}
public synchronized SentenceNode parseSentenceNode(String sentence,
boolean normalizeText) {
Parse sentenceParse = parseSentenceNlp(sentence, normalizeText);
return sentenceToSentenceNode(sentenceParse);
}
public String[] splitParagraph(String text) {
  // String.split never returns null; a single chunk means one paragraph
  String[] res = text.split("\n");
  if (res.length <= 1)
    return new String[] { text };
  else
    return res;
}
public String[] splitSentences(String text) {
if (text == null)
return null;
return sentenceDetector.sentDetect(text);
}
public String[] tokenizeSentence(String sentence) {
if (sentence == null)
return null;
return tokenizer.tokenize(sentence);
}
protected void initializeSentenceDetector() {
InputStream is = null;
try {
is = new FileInputStream(
MODEL_DIR + "/en-sent.bin"
);
SentenceModel model = new SentenceModel(is);
sentenceDetector = new SentenceDetectorME(model);
} catch (IOException e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
protected void initializeTokenizer() {
InputStream is = null;
try {
is = new FileInputStream(
MODEL_DIR+ "/en-token.bin"
);
TokenizerModel model = new TokenizerModel(is);
tokenizer = new TokenizerME(model);
} catch (IOException e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
  // ignore failure to close the model stream
}
}
}
}
protected void initializePosTagger() {
InputStream is = null;
try {
is = new FileInputStream(MODEL_DIR
+ "/en-pos-maxent.bin");
POSModel model = new POSModel(is);
posTagger = new POSTaggerME(model);
} catch (IOException e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
  // ignore failure to close the model stream
}
}
}
}
protected void initializeParser() {
InputStream is = null;
try {
is = new FileInputStream(MODEL_DIR
+ "/en-parser-chunking.bin");
ParserModel model = new ParserModel(is);
parser = ParserFactory.create(model);
} catch (IOException e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
  // ignore failure to close the model stream
}
}
}
}
private void initializeChunker() {
InputStream is = null;
try {
is = new FileInputStream(MODEL_DIR
+ "/en-chunker.bin");
ChunkerModel model = new ChunkerModel(is);
chunker = new ChunkerME(model);
} catch (IOException e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
  // ignore failure to close the model stream
}
}
}
}
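// note: if a model fails to load, the corresponding component is left null
// and the error is only printed; subsequent calls that need that component
// will fail with a NullPointerException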
/**
 * Converts an instance of Parse to a SyntacticTreeNode by filtering out the
 * unnecessary data and assigning the covered word to each leaf node.
 *
 * @param parse the parse to convert
 * @return the converted node, or null for an empty or junk parse
 */
private static SyntacticTreeNode toSyntacticTreeNode(Parse parse) {
if (parse == null)
return null;
// check for junk types
String type = parse.getType();
if (SyntacticTreeNode.isJunkType(type, parse))
return null;
String text = parse.getText();
ArrayList<SyntacticTreeNode> childrenNodeList = convertChildrenNodes(parse);
// the sentence node is the node contained directly under the TOP node
if (type.equals(AbstractBottomUpParser.TOP_NODE)
&& childrenNodeList != null && childrenNodeList.size() > 0) {
PhraseNode rootNode = (PhraseNode) childrenNodeList.get(0);
return new SentenceNode(text, rootNode.getChildren());
}
// if this node contains children nodes, then it is a phrase node
if (childrenNodeList != null && childrenNodeList.size() > 0) {
//System.out.println("Found "+ type + " phrase = "+ childrenNodeList);
return new PhraseNode(type, childrenNodeList);
}
// otherwise, it is a word node
Span span = parse.getSpan();
String word = text.substring(span.getStart(), span.getEnd()).trim();
return new WordNode(type, word);
}
private static ArrayList<SyntacticTreeNode> convertChildrenNodes(Parse parse) {
if (parse == null)
return null;
Parse[] children = parse.getChildren();
if (children == null || children.length == 0)
return null;
ArrayList<SyntacticTreeNode> childrenNodeList = new ArrayList<SyntacticTreeNode>();
for (Parse child : children) {
SyntacticTreeNode childNode = toSyntacticTreeNode(child);
if (childNode != null)
childrenNodeList.add(childNode);
}
return childrenNodeList;
}
/**
 * The key function of the similarity component. It takes two portions of text
 * and assesses their similarity by finding the set of all maximum common
 * subtrees of the sets of parse trees for each portion of text.
 * @param para1 the first text to be matched
 * @param para2 the second text to be matched
 * @return the matching result structure, which includes the similarity score
 */
public SentencePairMatchResult assessRelevance(String para1, String para2) {
List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1),
sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2);
List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst);
ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
List<List<ParseTreeChunk>> res = md.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
return new SentencePairMatchResult(res, origChunks1);
}
protected List<LemmaPair> listListParseTreeChunk2ListLemmaPairs(
List<List<ParseTreeChunk>> sent1GrpLst) {
List<LemmaPair> results = new ArrayList<LemmaPair>();
if (sent1GrpLst == null || sent1GrpLst.size() < 1)
  return results;
// the whole sentence is the last list in the list of lists
List<ParseTreeChunk> wholeSentence = sent1GrpLst.get(sent1GrpLst.size() - 1);
List<String> pOSs = wholeSentence.get(0).getPOSs();
List<String> lemmas = wholeSentence.get(0).getLemmas();
for (int i = 0; i < lemmas.size(); i++) {
  results.add(new LemmaPair(pOSs.get(i), lemmas.get(i), i));
}
return results;
}
public void printParseTree(String phrase1) {
  // operate on this processor instance directly instead of calling
  // getInstance(), which could otherwise create a second instance
  List<List<SentenceNode>> nodeListList = parseTextNode(phrase1);
for (List<SentenceNode> nodeList : nodeListList) {
for (SentenceNode node : nodeList) {
System.out.println(node);
}
}
}
}